QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tmc_dslash_g80_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// Number of output-spinor floats each thread stages in shared memory:
// the o00_re .. o30_re macros below index 19 shared slots; the remaining
// output components are presumably kept in registers (their declaration
// lines are not visible in this dump -- NOTE(review): confirm against the
// original tmc_dslash_g80_core.h).
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
4 
5 
// Qualifier for the shared-memory output-spinor pointer: empty under the
// NVVM back end, 'volatile' under the older Open64 back end (presumably to
// inhibit unsafe optimization of the shared accesses there -- TODO confirm).
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Naming: iSC_re / iSC_im = real/imag part of spin S (0..3), color C (0..2)
// of the input spinor; accSC_* is the same layout for the accumulator spinor.
// Double precision packs one complex number per double2 register (I0..I11,
// accum0..accum11); single precision packs two per float4 (I0..I5, accum0..accum5).
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
// accumulator spinor (same spin-color layout as the input spinor)
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
// single precision: two complex numbers per float4 register
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Naming: gRC_re / gRC_im = real/imag part of row R, column C (0..2) of the
// 3x3 SU(3) link matrix.  GAUGE_FLOAT2 packs one complex entry per register
// (G0..G8); otherwise two complex entries per float4 (G0..G4, with only
// G4.x/G4.y used).
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// gT is the Hermitian conjugate (conjugate transpose) of g: entry (R,C) of
// gT is the complex conjugate of entry (C,R) of g.  Used when multiplying by
// U^dagger for the negative-direction hops.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of clover term
// Naming: cSC_TD = element (spin S, color C ; spin T, color D) of the 6x6
// Hermitian chiral block.  Only the real diagonal (c00_00 .. c12_12) and the
// lower triangle are stored in the C registers; the upper triangle is derived
// below by Hermitian conjugation.  CLOVER_DOUBLE packs one complex entry per
// double2 (C0..C17); single precision packs two per float4 (C0..C8).
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// upper triangle from Hermiticity: c(a,b) = conj(c(b,a))
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of clover term: aliases spins 2/3 onto the spin-0/1
// macros above, reusing the same C registers (C0..C8 in single precision,
// C0..C17 in double) after they are reloaded with the second-block data
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 
354 // first chiral block of inverted clover term
// Identical register layout and naming scheme to the plain clover block
// above, with the "cinv" prefix; the same C registers are reused, holding
// the inverse-clover data when these macros are active.
355 #ifdef CLOVER_DOUBLE
356 #define cinv00_00_re C0.x
357 #define cinv01_01_re C0.y
358 #define cinv02_02_re C1.x
359 #define cinv10_10_re C1.y
360 #define cinv11_11_re C2.x
361 #define cinv12_12_re C2.y
362 #define cinv01_00_re C3.x
363 #define cinv01_00_im C3.y
364 #define cinv02_00_re C4.x
365 #define cinv02_00_im C4.y
366 #define cinv10_00_re C5.x
367 #define cinv10_00_im C5.y
368 #define cinv11_00_re C6.x
369 #define cinv11_00_im C6.y
370 #define cinv12_00_re C7.x
371 #define cinv12_00_im C7.y
372 #define cinv02_01_re C8.x
373 #define cinv02_01_im C8.y
374 #define cinv10_01_re C9.x
375 #define cinv10_01_im C9.y
376 #define cinv11_01_re C10.x
377 #define cinv11_01_im C10.y
378 #define cinv12_01_re C11.x
379 #define cinv12_01_im C11.y
380 #define cinv10_02_re C12.x
381 #define cinv10_02_im C12.y
382 #define cinv11_02_re C13.x
383 #define cinv11_02_im C13.y
384 #define cinv12_02_re C14.x
385 #define cinv12_02_im C14.y
386 #define cinv11_10_re C15.x
387 #define cinv11_10_im C15.y
388 #define cinv12_10_re C16.x
389 #define cinv12_10_im C16.y
390 #define cinv12_11_re C17.x
391 #define cinv12_11_im C17.y
392 #else
393 #define cinv00_00_re C0.x
394 #define cinv01_01_re C0.y
395 #define cinv02_02_re C0.z
396 #define cinv10_10_re C0.w
397 #define cinv11_11_re C1.x
398 #define cinv12_12_re C1.y
399 #define cinv01_00_re C1.z
400 #define cinv01_00_im C1.w
401 #define cinv02_00_re C2.x
402 #define cinv02_00_im C2.y
403 #define cinv10_00_re C2.z
404 #define cinv10_00_im C2.w
405 #define cinv11_00_re C3.x
406 #define cinv11_00_im C3.y
407 #define cinv12_00_re C3.z
408 #define cinv12_00_im C3.w
409 #define cinv02_01_re C4.x
410 #define cinv02_01_im C4.y
411 #define cinv10_01_re C4.z
412 #define cinv10_01_im C4.w
413 #define cinv11_01_re C5.x
414 #define cinv11_01_im C5.y
415 #define cinv12_01_re C5.z
416 #define cinv12_01_im C5.w
417 #define cinv10_02_re C6.x
418 #define cinv10_02_im C6.y
419 #define cinv11_02_re C6.z
420 #define cinv11_02_im C6.w
421 #define cinv12_02_re C7.x
422 #define cinv12_02_im C7.y
423 #define cinv11_10_re C7.z
424 #define cinv11_10_im C7.w
425 #define cinv12_10_re C8.x
426 #define cinv12_10_im C8.y
427 #define cinv12_11_re C8.z
428 #define cinv12_11_im C8.w
429 #endif // CLOVER_DOUBLE
430 
// upper triangle from Hermiticity: cinv(a,b) = conj(cinv(b,a))
431 #define cinv00_01_re (+cinv01_00_re)
432 #define cinv00_01_im (-cinv01_00_im)
433 #define cinv00_02_re (+cinv02_00_re)
434 #define cinv00_02_im (-cinv02_00_im)
435 #define cinv01_02_re (+cinv02_01_re)
436 #define cinv01_02_im (-cinv02_01_im)
437 #define cinv00_10_re (+cinv10_00_re)
438 #define cinv00_10_im (-cinv10_00_im)
439 #define cinv01_10_re (+cinv10_01_re)
440 #define cinv01_10_im (-cinv10_01_im)
441 #define cinv02_10_re (+cinv10_02_re)
442 #define cinv02_10_im (-cinv10_02_im)
443 #define cinv00_11_re (+cinv11_00_re)
444 #define cinv00_11_im (-cinv11_00_im)
445 #define cinv01_11_re (+cinv11_01_re)
446 #define cinv01_11_im (-cinv11_01_im)
447 #define cinv02_11_re (+cinv11_02_re)
448 #define cinv02_11_im (-cinv11_02_im)
449 #define cinv10_11_re (+cinv11_10_re)
450 #define cinv10_11_im (-cinv11_10_im)
451 #define cinv00_12_re (+cinv12_00_re)
452 #define cinv00_12_im (-cinv12_00_im)
453 #define cinv01_12_re (+cinv12_01_re)
454 #define cinv01_12_im (-cinv12_01_im)
455 #define cinv02_12_re (+cinv12_02_re)
456 #define cinv02_12_im (-cinv12_02_im)
457 #define cinv10_12_re (+cinv12_10_re)
458 #define cinv10_12_im (-cinv12_10_im)
459 #define cinv11_12_re (+cinv12_11_re)
460 #define cinv11_12_im (-cinv12_11_im)
461 
462 // second chiral block of inverted clover term: aliases spins 2/3 onto the
// spin-0/1 macros above, reusing the same C registers (C0..C8 in single
// precision, C0..C17 in double) after they are reloaded
463 #define cinv20_20_re cinv00_00_re
464 #define cinv21_20_re cinv01_00_re
465 #define cinv21_20_im cinv01_00_im
466 #define cinv22_20_re cinv02_00_re
467 #define cinv22_20_im cinv02_00_im
468 #define cinv30_20_re cinv10_00_re
469 #define cinv30_20_im cinv10_00_im
470 #define cinv31_20_re cinv11_00_re
471 #define cinv31_20_im cinv11_00_im
472 #define cinv32_20_re cinv12_00_re
473 #define cinv32_20_im cinv12_00_im
474 #define cinv20_21_re cinv00_01_re
475 #define cinv20_21_im cinv00_01_im
476 #define cinv21_21_re cinv01_01_re
477 #define cinv22_21_re cinv02_01_re
478 #define cinv22_21_im cinv02_01_im
479 #define cinv30_21_re cinv10_01_re
480 #define cinv30_21_im cinv10_01_im
481 #define cinv31_21_re cinv11_01_re
482 #define cinv31_21_im cinv11_01_im
483 #define cinv32_21_re cinv12_01_re
484 #define cinv32_21_im cinv12_01_im
485 #define cinv20_22_re cinv00_02_re
486 #define cinv20_22_im cinv00_02_im
487 #define cinv21_22_re cinv01_02_re
488 #define cinv21_22_im cinv01_02_im
489 #define cinv22_22_re cinv02_02_re
490 #define cinv30_22_re cinv10_02_re
491 #define cinv30_22_im cinv10_02_im
492 #define cinv31_22_re cinv11_02_re
493 #define cinv31_22_im cinv11_02_im
494 #define cinv32_22_re cinv12_02_re
495 #define cinv32_22_im cinv12_02_im
496 #define cinv20_30_re cinv00_10_re
497 #define cinv20_30_im cinv00_10_im
498 #define cinv21_30_re cinv01_10_re
499 #define cinv21_30_im cinv01_10_im
500 #define cinv22_30_re cinv02_10_re
501 #define cinv22_30_im cinv02_10_im
502 #define cinv30_30_re cinv10_10_re
503 #define cinv31_30_re cinv11_10_re
504 #define cinv31_30_im cinv11_10_im
505 #define cinv32_30_re cinv12_10_re
506 #define cinv32_30_im cinv12_10_im
507 #define cinv20_31_re cinv00_11_re
508 #define cinv20_31_im cinv00_11_im
509 #define cinv21_31_re cinv01_11_re
510 #define cinv21_31_im cinv01_11_im
511 #define cinv22_31_re cinv02_11_re
512 #define cinv22_31_im cinv02_11_im
513 #define cinv30_31_re cinv10_11_re
514 #define cinv30_31_im cinv10_11_im
515 #define cinv31_31_re cinv11_11_re
516 #define cinv32_31_re cinv12_11_re
517 #define cinv32_31_im cinv12_11_im
518 #define cinv20_32_re cinv00_12_re
519 #define cinv20_32_im cinv00_12_im
520 #define cinv21_32_re cinv01_12_re
521 #define cinv21_32_im cinv01_12_im
522 #define cinv22_32_re cinv02_12_re
523 #define cinv22_32_im cinv02_12_im
524 #define cinv30_32_re cinv10_12_re
525 #define cinv30_32_im cinv10_12_im
526 #define cinv31_32_re cinv11_12_re
527 #define cinv31_32_im cinv11_12_im
528 #define cinv32_32_re cinv12_12_re
529 
530 
531 
532 // declare C## here and use ASSN below instead of READ
// Registers backing the clover-matrix macros above: 18 double2 in double
// precision, 9 float4 otherwise (36 scalars either way -- one 6x6 Hermitian
// chiral block's worth of storage).
533 #ifdef CLOVER_DOUBLE
534 double2 C0;
535 double2 C1;
536 double2 C2;
537 double2 C3;
538 double2 C4;
539 double2 C5;
540 double2 C6;
541 double2 C7;
542 double2 C8;
543 double2 C9;
544 double2 C10;
545 double2 C11;
546 double2 C12;
547 double2 C13;
548 double2 C14;
549 double2 C15;
550 double2 C16;
551 double2 C17;
552 #else
553 float4 C0;
554 float4 C1;
555 float4 C2;
556 float4 C3;
557 float4 C4;
558 float4 C5;
559 float4 C6;
560 float4 C7;
561 float4 C8;
562 
// half precision only (DD_PREC==2): presumably the per-site normalization
// constant consumed by the clover ASSN/READ macros -- confirm in read_clover.h
563 #if (DD_PREC==2)
564 float K;
565 #endif
566 
567 #endif // CLOVER_DOUBLE
568 // output spinor
// The first 19 output components (matching DSLASH_SHARED_FLOATS_PER_THREAD)
// live in shared memory, strided by SHARED_STRIDE so threads in a warp hit
// distinct banks.  NOTE(review): the definitions for the remaining components
// (o30_im .. o32_im) are missing from this dump (source lines 588-592);
// presumably they are register declarations -- confirm against the original.
569 #define o00_re s[0*SHARED_STRIDE]
570 #define o00_im s[1*SHARED_STRIDE]
571 #define o01_re s[2*SHARED_STRIDE]
572 #define o01_im s[3*SHARED_STRIDE]
573 #define o02_re s[4*SHARED_STRIDE]
574 #define o02_im s[5*SHARED_STRIDE]
575 #define o10_re s[6*SHARED_STRIDE]
576 #define o10_im s[7*SHARED_STRIDE]
577 #define o11_re s[8*SHARED_STRIDE]
578 #define o11_im s[9*SHARED_STRIDE]
579 #define o12_re s[10*SHARED_STRIDE]
580 #define o12_im s[11*SHARED_STRIDE]
581 #define o20_re s[12*SHARED_STRIDE]
582 #define o20_im s[13*SHARED_STRIDE]
583 #define o21_re s[14*SHARED_STRIDE]
584 #define o21_im s[15*SHARED_STRIDE]
585 #define o22_re s[16*SHARED_STRIDE]
586 #define o22_im s[17*SHARED_STRIDE]
587 #define o30_re s[18*SHARED_STRIDE]
593 
// Shared-memory stride between successive components of one thread's output
// spinor: 8 doubles or 16 floats (64 bytes either way).
594 #ifdef SPINOR_DOUBLE
595 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
596 #else
597 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
598 #endif
599 
// Dynamically-sized shared memory block; the per-thread pointer 's' is
// derived from it (its declaration, source lines 601-602, is truncated in
// this dump -- only the '+ (threadIdx.x % SHARED_STRIDE)' tail is visible).
600 extern __shared__ char s_data[];
601 
603  + (threadIdx.x % SHARED_STRIDE);
604 
605 #include "read_gauge.h"
606 #include "io_spinor.h"
607 #include "read_clover.h"
608 #include "tmc_core.h"
609 
// Per-thread lattice state: x1..x4 are the 4-d site coordinates, X the full
// lattice index, sid the checkerboard (single-parity) site/thread index.
610 int x1, x2, x3, x4;
611 int X;
612 
// half precision: index into the ghost-zone norm array for boundary reads
613 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
614 int sp_norm_idx;
615 #endif // MULTI_GPU half precision
616 
617 int sid;
618 
// NOTE(review): this dump omits source line 621; given the matching
// '} else {' at line 645, it is presumably the opening
// 'if (kernel_type == INTERIOR_KERNEL) {' of the interior/exterior split --
// confirm against the original file.
619 #ifdef MULTI_GPU
620 int face_idx;
622 #endif
623 
// Interior-kernel setup: one thread per even/odd site; decode the 4-d
// coordinates from the linear thread index and clear the output spinor
// accumulators before summing the eight hop contributions.
624  sid = blockIdx.x*blockDim.x + threadIdx.x;
625  if (sid >= param.threads) return;
626 
627  // Inline by hand for the moment and assume even dimensions
628  const int dims[] = {X1, X2, X3, X4};
629  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
630 
// zero all 24 real components of the output spinor
631  o00_re = 0; o00_im = 0;
632  o01_re = 0; o01_im = 0;
633  o02_re = 0; o02_im = 0;
634  o10_re = 0; o10_im = 0;
635  o11_re = 0; o11_im = 0;
636  o12_re = 0; o12_im = 0;
637  o20_re = 0; o20_im = 0;
638  o21_re = 0; o21_im = 0;
639  o22_re = 0; o22_im = 0;
640  o30_re = 0; o30_im = 0;
641  o31_re = 0; o31_im = 0;
642  o32_re = 0; o32_im = 0;
643 
644 #ifdef MULTI_GPU
// Exterior (boundary-update) kernel setup: each thread updates one site on
// one of the two faces of this dimension, seeding the output spinor with the
// partial result already computed by the interior kernel.
645 } else { // exterior kernel
646 
647  sid = blockIdx.x*blockDim.x + threadIdx.x;
648  if (sid >= param.threads) return;
649 
650  const int dim = static_cast<int>(kernel_type);
651  const int face_volume = (param.threads >> 1); // volume of one face
652  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
653  face_idx = sid - face_num*face_volume; // index into the respective face
654 
655  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
656  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
657  //sp_idx = face_idx + param.ghostOffset[dim];
658 
659 #if (DD_PREC==2) // half precision
660  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
661 #endif
662 
663  const int dims[] = {X1, X2, X3, X4};
664  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
665 
// load the interior kernel's partial output for this site
666  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
667 
// seed the accumulators with the partial result instead of zero
668  o00_re = i00_re; o00_im = i00_im;
669  o01_re = i01_re; o01_im = i01_im;
670  o02_re = i02_re; o02_im = i02_im;
671  o10_re = i10_re; o10_im = i10_im;
672  o11_re = i11_re; o11_im = i11_im;
673  o12_re = i12_re; o12_im = i12_im;
674  o20_re = i20_re; o20_im = i20_im;
675  o21_re = i21_re; o21_im = i21_im;
676  o22_re = i22_re; o22_im = i22_im;
677  o30_re = i30_re; o30_im = i30_im;
678  o31_re = i31_re; o31_im = i31_im;
679  o32_re = i32_re; o32_im = i32_im;
680 }
681 #endif // MULTI_GPU
682 
683 
// Forward hop in the X direction.  The 4-spinor is projected with the P0-
// matrix below onto two color-triplet half spinors a (spin 0) and b (spin 1);
// the spin-2/3 rows are +/- i times these, so only a and b are multiplied by
// the link matrix and the lower components are reconstructed during
// accumulation.  NOTE(review): this dump drops several source lines here
// (704-709, 754, 757, 764, ...): the a*/b* half-spinor declarations, the
// RECONSTRUCT_GAUGE_MATRIX call, and the 'spinorFloat A0_re = 0;'-style
// accumulator declarations (cf. lines 952ff in the X- block below) -- confirm
// against the original file.
684 #ifdef MULTI_GPU
685 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
686  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
687 #endif
688 {
689  // Projector P0-
690  // 1 0 0 -i
691  // 0 1 -i 0
692  // 0 i 1 0
693  // i 0 0 1
694 
// neighbor site in +X (with wraparound), halved for checkerboard indexing;
// exterior kernels index into the ghost zone instead
695 #ifdef MULTI_GPU
696  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
697  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
698 #else
699  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
700 #endif
701 
702  const int ga_idx = sid;
703 
710 
711 #ifdef MULTI_GPU
712  if (kernel_type == INTERIOR_KERNEL) {
713 #endif
714 
715  // read spinor from device memory
716  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
717 
718  // project spinor into half spinors
719  a0_re = +i00_re+i30_im;
720  a0_im = +i00_im-i30_re;
721  a1_re = +i01_re+i31_im;
722  a1_im = +i01_im-i31_re;
723  a2_re = +i02_re+i32_im;
724  a2_im = +i02_im-i32_re;
725  b0_re = +i10_re+i20_im;
726  b0_im = +i10_im-i20_re;
727  b1_re = +i11_re+i21_im;
728  b1_im = +i11_im-i21_re;
729  b2_re = +i12_re+i22_im;
730  b2_im = +i12_im-i22_re;
731 
// exterior: the ghost-zone spinor is already projected, so copy it directly
732 #ifdef MULTI_GPU
733  } else {
734 
735  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
736 
737  // read half spinor from device memory
738  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
739 
740  a0_re = i00_re; a0_im = i00_im;
741  a1_re = i01_re; a1_im = i01_im;
742  a2_re = i02_re; a2_im = i02_im;
743  b0_re = i10_re; b0_im = i10_im;
744  b1_re = i11_re; b1_im = i11_im;
745  b2_re = i12_re; b2_im = i12_im;
746 
747  }
748 #endif // MULTI_GPU
749 
750  // read gauge matrix from device memory
751  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
752 
753  // reconstruct gauge matrix
755 
// complex 3x3 link times half spinors: A = U*a, B = U*b, row by row
756  // multiply row 0
758  A0_re += g00_re * a0_re;
759  A0_re -= g00_im * a0_im;
760  A0_re += g01_re * a1_re;
761  A0_re -= g01_im * a1_im;
762  A0_re += g02_re * a2_re;
763  A0_re -= g02_im * a2_im;
765  A0_im += g00_re * a0_im;
766  A0_im += g00_im * a0_re;
767  A0_im += g01_re * a1_im;
768  A0_im += g01_im * a1_re;
769  A0_im += g02_re * a2_im;
770  A0_im += g02_im * a2_re;
772  B0_re += g00_re * b0_re;
773  B0_re -= g00_im * b0_im;
774  B0_re += g01_re * b1_re;
775  B0_re -= g01_im * b1_im;
776  B0_re += g02_re * b2_re;
777  B0_re -= g02_im * b2_im;
779  B0_im += g00_re * b0_im;
780  B0_im += g00_im * b0_re;
781  B0_im += g01_re * b1_im;
782  B0_im += g01_im * b1_re;
783  B0_im += g02_re * b2_im;
784  B0_im += g02_im * b2_re;
785 
786  // multiply row 1
788  A1_re += g10_re * a0_re;
789  A1_re -= g10_im * a0_im;
790  A1_re += g11_re * a1_re;
791  A1_re -= g11_im * a1_im;
792  A1_re += g12_re * a2_re;
793  A1_re -= g12_im * a2_im;
795  A1_im += g10_re * a0_im;
796  A1_im += g10_im * a0_re;
797  A1_im += g11_re * a1_im;
798  A1_im += g11_im * a1_re;
799  A1_im += g12_re * a2_im;
800  A1_im += g12_im * a2_re;
802  B1_re += g10_re * b0_re;
803  B1_re -= g10_im * b0_im;
804  B1_re += g11_re * b1_re;
805  B1_re -= g11_im * b1_im;
806  B1_re += g12_re * b2_re;
807  B1_re -= g12_im * b2_im;
809  B1_im += g10_re * b0_im;
810  B1_im += g10_im * b0_re;
811  B1_im += g11_re * b1_im;
812  B1_im += g11_im * b1_re;
813  B1_im += g12_re * b2_im;
814  B1_im += g12_im * b2_re;
815 
816  // multiply row 2
818  A2_re += g20_re * a0_re;
819  A2_re -= g20_im * a0_im;
820  A2_re += g21_re * a1_re;
821  A2_re -= g21_im * a1_im;
822  A2_re += g22_re * a2_re;
823  A2_re -= g22_im * a2_im;
825  A2_im += g20_re * a0_im;
826  A2_im += g20_im * a0_re;
827  A2_im += g21_re * a1_im;
828  A2_im += g21_im * a1_re;
829  A2_im += g22_re * a2_im;
830  A2_im += g22_im * a2_re;
832  B2_re += g20_re * b0_re;
833  B2_re -= g20_im * b0_im;
834  B2_re += g21_re * b1_re;
835  B2_re -= g21_im * b1_im;
836  B2_re += g22_re * b2_re;
837  B2_re -= g22_im * b2_im;
839  B2_im += g20_re * b0_im;
840  B2_im += g20_im * b0_re;
841  B2_im += g21_re * b1_im;
842  B2_im += g21_im * b1_re;
843  B2_im += g22_re * b2_im;
844  B2_im += g22_im * b2_re;
845 
// accumulate into the output spinor; spin-2/3 components are reconstructed
// from B and A (multiplication by +/- i per the P0- projector structure)
846  o00_re += A0_re;
847  o00_im += A0_im;
848  o10_re += B0_re;
849  o10_im += B0_im;
850  o20_re -= B0_im;
851  o20_im += B0_re;
852  o30_re -= A0_im;
853  o30_im += A0_re;
854 
855  o01_re += A1_re;
856  o01_im += A1_im;
857  o11_re += B1_re;
858  o11_im += B1_im;
859  o21_re -= B1_im;
860  o21_im += B1_re;
861  o31_re -= A1_im;
862  o31_im += A1_re;
863 
864  o02_re += A2_re;
865  o02_im += A2_im;
866  o12_re += B2_re;
867  o12_im += B2_im;
868  o22_re -= B2_im;
869  o22_im += B2_re;
870  o32_re -= A2_im;
871  o32_im += A2_re;
872 
873 }
874 
// Backward hop in the X direction: same structure as the forward hop, but
// projecting with P0+ and multiplying by the conjugated link (gT = U^dagger).
// NOTE(review): this dump drops source lines 899-904 (the a*/b* half-spinor
// declarations) and 949 (the RECONSTRUCT_GAUGE_MATRIX call) -- confirm
// against the original file.
875 #ifdef MULTI_GPU
876 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
877  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
878 #endif
879 {
880  // Projector P0+
881  // 1 0 0 i
882  // 0 1 i 0
883  // 0 -i 1 0
884  // -i 0 0 1
885 
// neighbor site in -X (with wraparound), halved for checkerboard indexing;
// exterior kernels index into the ghost zone instead
886 #ifdef MULTI_GPU
887  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
888  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
889 #else
890  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
891 #endif
892 
// backward hop uses the link stored at the neighbor (or ghost) site
893 #ifdef MULTI_GPU
894  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
895 #else
896  const int ga_idx = sp_idx;
897 #endif
898 
905 
906 #ifdef MULTI_GPU
907  if (kernel_type == INTERIOR_KERNEL) {
908 #endif
909 
910  // read spinor from device memory
911  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
912 
913  // project spinor into half spinors
914  a0_re = +i00_re-i30_im;
915  a0_im = +i00_im+i30_re;
916  a1_re = +i01_re-i31_im;
917  a1_im = +i01_im+i31_re;
918  a2_re = +i02_re-i32_im;
919  a2_im = +i02_im+i32_re;
920  b0_re = +i10_re-i20_im;
921  b0_im = +i10_im+i20_re;
922  b1_re = +i11_re-i21_im;
923  b1_im = +i11_im+i21_re;
924  b2_re = +i12_re-i22_im;
925  b2_im = +i12_im+i22_re;
926 
// exterior: the ghost-zone spinor is already projected, so copy it directly
927 #ifdef MULTI_GPU
928  } else {
929 
930  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
931 
932  // read half spinor from device memory
933  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
934 
935  a0_re = i00_re; a0_im = i00_im;
936  a1_re = i01_re; a1_im = i01_im;
937  a2_re = i02_re; a2_im = i02_im;
938  b0_re = i10_re; b0_im = i10_im;
939  b1_re = i11_re; b1_im = i11_im;
940  b2_re = i12_re; b2_im = i12_im;
941 
942  }
943 #endif // MULTI_GPU
944 
945  // read gauge matrix from device memory
946  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
947 
948  // reconstruct gauge matrix
950 
// complex 3x3 U^dagger times half spinors: A = U^dagger*a, B = U^dagger*b
951  // multiply row 0
952  spinorFloat A0_re = 0;
953  A0_re += gT00_re * a0_re;
954  A0_re -= gT00_im * a0_im;
955  A0_re += gT01_re * a1_re;
956  A0_re -= gT01_im * a1_im;
957  A0_re += gT02_re * a2_re;
958  A0_re -= gT02_im * a2_im;
959  spinorFloat A0_im = 0;
960  A0_im += gT00_re * a0_im;
961  A0_im += gT00_im * a0_re;
962  A0_im += gT01_re * a1_im;
963  A0_im += gT01_im * a1_re;
964  A0_im += gT02_re * a2_im;
965  A0_im += gT02_im * a2_re;
966  spinorFloat B0_re = 0;
967  B0_re += gT00_re * b0_re;
968  B0_re -= gT00_im * b0_im;
969  B0_re += gT01_re * b1_re;
970  B0_re -= gT01_im * b1_im;
971  B0_re += gT02_re * b2_re;
972  B0_re -= gT02_im * b2_im;
973  spinorFloat B0_im = 0;
974  B0_im += gT00_re * b0_im;
975  B0_im += gT00_im * b0_re;
976  B0_im += gT01_re * b1_im;
977  B0_im += gT01_im * b1_re;
978  B0_im += gT02_re * b2_im;
979  B0_im += gT02_im * b2_re;
980 
981  // multiply row 1
982  spinorFloat A1_re = 0;
983  A1_re += gT10_re * a0_re;
984  A1_re -= gT10_im * a0_im;
985  A1_re += gT11_re * a1_re;
986  A1_re -= gT11_im * a1_im;
987  A1_re += gT12_re * a2_re;
988  A1_re -= gT12_im * a2_im;
989  spinorFloat A1_im = 0;
990  A1_im += gT10_re * a0_im;
991  A1_im += gT10_im * a0_re;
992  A1_im += gT11_re * a1_im;
993  A1_im += gT11_im * a1_re;
994  A1_im += gT12_re * a2_im;
995  A1_im += gT12_im * a2_re;
996  spinorFloat B1_re = 0;
997  B1_re += gT10_re * b0_re;
998  B1_re -= gT10_im * b0_im;
999  B1_re += gT11_re * b1_re;
1000  B1_re -= gT11_im * b1_im;
1001  B1_re += gT12_re * b2_re;
1002  B1_re -= gT12_im * b2_im;
1003  spinorFloat B1_im = 0;
1004  B1_im += gT10_re * b0_im;
1005  B1_im += gT10_im * b0_re;
1006  B1_im += gT11_re * b1_im;
1007  B1_im += gT11_im * b1_re;
1008  B1_im += gT12_re * b2_im;
1009  B1_im += gT12_im * b2_re;
1010 
1011  // multiply row 2
1012  spinorFloat A2_re = 0;
1013  A2_re += gT20_re * a0_re;
1014  A2_re -= gT20_im * a0_im;
1015  A2_re += gT21_re * a1_re;
1016  A2_re -= gT21_im * a1_im;
1017  A2_re += gT22_re * a2_re;
1018  A2_re -= gT22_im * a2_im;
1019  spinorFloat A2_im = 0;
1020  A2_im += gT20_re * a0_im;
1021  A2_im += gT20_im * a0_re;
1022  A2_im += gT21_re * a1_im;
1023  A2_im += gT21_im * a1_re;
1024  A2_im += gT22_re * a2_im;
1025  A2_im += gT22_im * a2_re;
1026  spinorFloat B2_re = 0;
1027  B2_re += gT20_re * b0_re;
1028  B2_re -= gT20_im * b0_im;
1029  B2_re += gT21_re * b1_re;
1030  B2_re -= gT21_im * b1_im;
1031  B2_re += gT22_re * b2_re;
1032  B2_re -= gT22_im * b2_im;
1033  spinorFloat B2_im = 0;
1034  B2_im += gT20_re * b0_im;
1035  B2_im += gT20_im * b0_re;
1036  B2_im += gT21_re * b1_im;
1037  B2_im += gT21_im * b1_re;
1038  B2_im += gT22_re * b2_im;
1039  B2_im += gT22_im * b2_re;
1040 
// accumulate into the output spinor; spin-2/3 components are reconstructed
// with the opposite signs to the forward hop (P0+ projector structure)
1041  o00_re += A0_re;
1042  o00_im += A0_im;
1043  o10_re += B0_re;
1044  o10_im += B0_im;
1045  o20_re += B0_im;
1046  o20_im -= B0_re;
1047  o30_re += A0_im;
1048  o30_im -= A0_re;
1049 
1050  o01_re += A1_re;
1051  o01_im += A1_im;
1052  o11_re += B1_re;
1053  o11_im += B1_im;
1054  o21_re += B1_im;
1055  o21_im -= B1_re;
1056  o31_re += A1_im;
1057  o31_im -= A1_re;
1058 
1059  o02_re += A2_re;
1060  o02_im += A2_im;
1061  o12_re += B2_re;
1062  o12_im += B2_im;
1063  o22_re += B2_im;
1064  o22_im -= B2_re;
1065  o32_re += A2_im;
1066  o32_im -= A2_re;
1067 
1068 }
1069 
1070 #ifdef MULTI_GPU
1071 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1072  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1073 #endif
1074 {
1075  // Projector P1-
1076  // 1 0 0 -1
1077  // 0 1 1 0
1078  // 0 1 1 0
1079  // -1 0 0 1
1080 
1081 #ifdef MULTI_GPU
1082  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1083  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1084 #else
1085  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1086 #endif
1087 
1088  const int ga_idx = sid;
1089 
1096 
1097 #ifdef MULTI_GPU
1098  if (kernel_type == INTERIOR_KERNEL) {
1099 #endif
1100 
1101  // read spinor from device memory
1102  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1103 
1104  // project spinor into half spinors
1105  a0_re = +i00_re-i30_re;
1106  a0_im = +i00_im-i30_im;
1107  a1_re = +i01_re-i31_re;
1108  a1_im = +i01_im-i31_im;
1109  a2_re = +i02_re-i32_re;
1110  a2_im = +i02_im-i32_im;
1111  b0_re = +i10_re+i20_re;
1112  b0_im = +i10_im+i20_im;
1113  b1_re = +i11_re+i21_re;
1114  b1_im = +i11_im+i21_im;
1115  b2_re = +i12_re+i22_re;
1116  b2_im = +i12_im+i22_im;
1117 
1118 #ifdef MULTI_GPU
1119  } else {
1120 
1121  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1122 
1123  // read half spinor from device memory
1124  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1125 
1126  a0_re = i00_re; a0_im = i00_im;
1127  a1_re = i01_re; a1_im = i01_im;
1128  a2_re = i02_re; a2_im = i02_im;
1129  b0_re = i10_re; b0_im = i10_im;
1130  b1_re = i11_re; b1_im = i11_im;
1131  b2_re = i12_re; b2_im = i12_im;
1132 
1133  }
1134 #endif // MULTI_GPU
1135 
1136  // read gauge matrix from device memory
1137  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1138 
1139  // reconstruct gauge matrix
1141 
1142  // multiply row 0
1143  spinorFloat A0_re = 0;
1144  A0_re += g00_re * a0_re;
1145  A0_re -= g00_im * a0_im;
1146  A0_re += g01_re * a1_re;
1147  A0_re -= g01_im * a1_im;
1148  A0_re += g02_re * a2_re;
1149  A0_re -= g02_im * a2_im;
1150  spinorFloat A0_im = 0;
1151  A0_im += g00_re * a0_im;
1152  A0_im += g00_im * a0_re;
1153  A0_im += g01_re * a1_im;
1154  A0_im += g01_im * a1_re;
1155  A0_im += g02_re * a2_im;
1156  A0_im += g02_im * a2_re;
1157  spinorFloat B0_re = 0;
1158  B0_re += g00_re * b0_re;
1159  B0_re -= g00_im * b0_im;
1160  B0_re += g01_re * b1_re;
1161  B0_re -= g01_im * b1_im;
1162  B0_re += g02_re * b2_re;
1163  B0_re -= g02_im * b2_im;
1164  spinorFloat B0_im = 0;
1165  B0_im += g00_re * b0_im;
1166  B0_im += g00_im * b0_re;
1167  B0_im += g01_re * b1_im;
1168  B0_im += g01_im * b1_re;
1169  B0_im += g02_re * b2_im;
1170  B0_im += g02_im * b2_re;
1171 
1172  // multiply row 1
1173  spinorFloat A1_re = 0;
1174  A1_re += g10_re * a0_re;
1175  A1_re -= g10_im * a0_im;
1176  A1_re += g11_re * a1_re;
1177  A1_re -= g11_im * a1_im;
1178  A1_re += g12_re * a2_re;
1179  A1_re -= g12_im * a2_im;
1180  spinorFloat A1_im = 0;
1181  A1_im += g10_re * a0_im;
1182  A1_im += g10_im * a0_re;
1183  A1_im += g11_re * a1_im;
1184  A1_im += g11_im * a1_re;
1185  A1_im += g12_re * a2_im;
1186  A1_im += g12_im * a2_re;
1187  spinorFloat B1_re = 0;
1188  B1_re += g10_re * b0_re;
1189  B1_re -= g10_im * b0_im;
1190  B1_re += g11_re * b1_re;
1191  B1_re -= g11_im * b1_im;
1192  B1_re += g12_re * b2_re;
1193  B1_re -= g12_im * b2_im;
1194  spinorFloat B1_im = 0;
1195  B1_im += g10_re * b0_im;
1196  B1_im += g10_im * b0_re;
1197  B1_im += g11_re * b1_im;
1198  B1_im += g11_im * b1_re;
1199  B1_im += g12_re * b2_im;
1200  B1_im += g12_im * b2_re;
1201 
1202  // multiply row 2
1203  spinorFloat A2_re = 0;
1204  A2_re += g20_re * a0_re;
1205  A2_re -= g20_im * a0_im;
1206  A2_re += g21_re * a1_re;
1207  A2_re -= g21_im * a1_im;
1208  A2_re += g22_re * a2_re;
1209  A2_re -= g22_im * a2_im;
1210  spinorFloat A2_im = 0;
1211  A2_im += g20_re * a0_im;
1212  A2_im += g20_im * a0_re;
1213  A2_im += g21_re * a1_im;
1214  A2_im += g21_im * a1_re;
1215  A2_im += g22_re * a2_im;
1216  A2_im += g22_im * a2_re;
1217  spinorFloat B2_re = 0;
1218  B2_re += g20_re * b0_re;
1219  B2_re -= g20_im * b0_im;
1220  B2_re += g21_re * b1_re;
1221  B2_re -= g21_im * b1_im;
1222  B2_re += g22_re * b2_re;
1223  B2_re -= g22_im * b2_im;
1224  spinorFloat B2_im = 0;
1225  B2_im += g20_re * b0_im;
1226  B2_im += g20_im * b0_re;
1227  B2_im += g21_re * b1_im;
1228  B2_im += g21_im * b1_re;
1229  B2_im += g22_re * b2_im;
1230  B2_im += g22_im * b2_re;
1231 
1232  o00_re += A0_re;
1233  o00_im += A0_im;
1234  o10_re += B0_re;
1235  o10_im += B0_im;
1236  o20_re += B0_re;
1237  o20_im += B0_im;
1238  o30_re -= A0_re;
1239  o30_im -= A0_im;
1240 
1241  o01_re += A1_re;
1242  o01_im += A1_im;
1243  o11_re += B1_re;
1244  o11_im += B1_im;
1245  o21_re += B1_re;
1246  o21_im += B1_im;
1247  o31_re -= A1_re;
1248  o31_im -= A1_im;
1249 
1250  o02_re += A2_re;
1251  o02_im += A2_im;
1252  o12_re += B2_re;
1253  o12_im += B2_im;
1254  o22_re += B2_re;
1255  o22_im += B2_im;
1256  o32_re -= A2_re;
1257  o32_im -= A2_im;
1258 
1259 }
1260 
1261 #ifdef MULTI_GPU
1262 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1263  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1264 #endif
1265 {
1266  // Projector P1+
1267  // 1 0 0 1
1268  // 0 1 -1 0
1269  // 0 -1 1 0
1270  // 1 0 0 1
1271 
1272 #ifdef MULTI_GPU
1273  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1274  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1275 #else
1276  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1277 #endif
1278 
1279 #ifdef MULTI_GPU
1280  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1281 #else
1282  const int ga_idx = sp_idx;
1283 #endif
1284 
1291 
1292 #ifdef MULTI_GPU
1293  if (kernel_type == INTERIOR_KERNEL) {
1294 #endif
1295 
1296  // read spinor from device memory
1297  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1298 
1299  // project spinor into half spinors
1300  a0_re = +i00_re+i30_re;
1301  a0_im = +i00_im+i30_im;
1302  a1_re = +i01_re+i31_re;
1303  a1_im = +i01_im+i31_im;
1304  a2_re = +i02_re+i32_re;
1305  a2_im = +i02_im+i32_im;
1306  b0_re = +i10_re-i20_re;
1307  b0_im = +i10_im-i20_im;
1308  b1_re = +i11_re-i21_re;
1309  b1_im = +i11_im-i21_im;
1310  b2_re = +i12_re-i22_re;
1311  b2_im = +i12_im-i22_im;
1312 
1313 #ifdef MULTI_GPU
1314  } else {
1315 
1316  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1317 
1318  // read half spinor from device memory
1319  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1320 
1321  a0_re = i00_re; a0_im = i00_im;
1322  a1_re = i01_re; a1_im = i01_im;
1323  a2_re = i02_re; a2_im = i02_im;
1324  b0_re = i10_re; b0_im = i10_im;
1325  b1_re = i11_re; b1_im = i11_im;
1326  b2_re = i12_re; b2_im = i12_im;
1327 
1328  }
1329 #endif // MULTI_GPU
1330 
1331  // read gauge matrix from device memory
1332  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1333 
1334  // reconstruct gauge matrix
1336 
1337  // multiply row 0
1338  spinorFloat A0_re = 0;
1339  A0_re += gT00_re * a0_re;
1340  A0_re -= gT00_im * a0_im;
1341  A0_re += gT01_re * a1_re;
1342  A0_re -= gT01_im * a1_im;
1343  A0_re += gT02_re * a2_re;
1344  A0_re -= gT02_im * a2_im;
1345  spinorFloat A0_im = 0;
1346  A0_im += gT00_re * a0_im;
1347  A0_im += gT00_im * a0_re;
1348  A0_im += gT01_re * a1_im;
1349  A0_im += gT01_im * a1_re;
1350  A0_im += gT02_re * a2_im;
1351  A0_im += gT02_im * a2_re;
1352  spinorFloat B0_re = 0;
1353  B0_re += gT00_re * b0_re;
1354  B0_re -= gT00_im * b0_im;
1355  B0_re += gT01_re * b1_re;
1356  B0_re -= gT01_im * b1_im;
1357  B0_re += gT02_re * b2_re;
1358  B0_re -= gT02_im * b2_im;
1359  spinorFloat B0_im = 0;
1360  B0_im += gT00_re * b0_im;
1361  B0_im += gT00_im * b0_re;
1362  B0_im += gT01_re * b1_im;
1363  B0_im += gT01_im * b1_re;
1364  B0_im += gT02_re * b2_im;
1365  B0_im += gT02_im * b2_re;
1366 
1367  // multiply row 1
1368  spinorFloat A1_re = 0;
1369  A1_re += gT10_re * a0_re;
1370  A1_re -= gT10_im * a0_im;
1371  A1_re += gT11_re * a1_re;
1372  A1_re -= gT11_im * a1_im;
1373  A1_re += gT12_re * a2_re;
1374  A1_re -= gT12_im * a2_im;
1375  spinorFloat A1_im = 0;
1376  A1_im += gT10_re * a0_im;
1377  A1_im += gT10_im * a0_re;
1378  A1_im += gT11_re * a1_im;
1379  A1_im += gT11_im * a1_re;
1380  A1_im += gT12_re * a2_im;
1381  A1_im += gT12_im * a2_re;
1382  spinorFloat B1_re = 0;
1383  B1_re += gT10_re * b0_re;
1384  B1_re -= gT10_im * b0_im;
1385  B1_re += gT11_re * b1_re;
1386  B1_re -= gT11_im * b1_im;
1387  B1_re += gT12_re * b2_re;
1388  B1_re -= gT12_im * b2_im;
1389  spinorFloat B1_im = 0;
1390  B1_im += gT10_re * b0_im;
1391  B1_im += gT10_im * b0_re;
1392  B1_im += gT11_re * b1_im;
1393  B1_im += gT11_im * b1_re;
1394  B1_im += gT12_re * b2_im;
1395  B1_im += gT12_im * b2_re;
1396 
1397  // multiply row 2
1398  spinorFloat A2_re = 0;
1399  A2_re += gT20_re * a0_re;
1400  A2_re -= gT20_im * a0_im;
1401  A2_re += gT21_re * a1_re;
1402  A2_re -= gT21_im * a1_im;
1403  A2_re += gT22_re * a2_re;
1404  A2_re -= gT22_im * a2_im;
1405  spinorFloat A2_im = 0;
1406  A2_im += gT20_re * a0_im;
1407  A2_im += gT20_im * a0_re;
1408  A2_im += gT21_re * a1_im;
1409  A2_im += gT21_im * a1_re;
1410  A2_im += gT22_re * a2_im;
1411  A2_im += gT22_im * a2_re;
1412  spinorFloat B2_re = 0;
1413  B2_re += gT20_re * b0_re;
1414  B2_re -= gT20_im * b0_im;
1415  B2_re += gT21_re * b1_re;
1416  B2_re -= gT21_im * b1_im;
1417  B2_re += gT22_re * b2_re;
1418  B2_re -= gT22_im * b2_im;
1419  spinorFloat B2_im = 0;
1420  B2_im += gT20_re * b0_im;
1421  B2_im += gT20_im * b0_re;
1422  B2_im += gT21_re * b1_im;
1423  B2_im += gT21_im * b1_re;
1424  B2_im += gT22_re * b2_im;
1425  B2_im += gT22_im * b2_re;
1426 
1427  o00_re += A0_re;
1428  o00_im += A0_im;
1429  o10_re += B0_re;
1430  o10_im += B0_im;
1431  o20_re -= B0_re;
1432  o20_im -= B0_im;
1433  o30_re += A0_re;
1434  o30_im += A0_im;
1435 
1436  o01_re += A1_re;
1437  o01_im += A1_im;
1438  o11_re += B1_re;
1439  o11_im += B1_im;
1440  o21_re -= B1_re;
1441  o21_im -= B1_im;
1442  o31_re += A1_re;
1443  o31_im += A1_im;
1444 
1445  o02_re += A2_re;
1446  o02_im += A2_im;
1447  o12_re += B2_re;
1448  o12_im += B2_im;
1449  o22_re -= B2_re;
1450  o22_im -= B2_im;
1451  o32_re += A2_re;
1452  o32_im += A2_im;
1453 
1454 }
1455 
1456 #ifdef MULTI_GPU
1457 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1458  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1459 #endif
1460 {
1461  // Projector P2-
1462  // 1 0 -i 0
1463  // 0 1 0 i
1464  // i 0 1 0
1465  // 0 -i 0 1
1466 
1467 #ifdef MULTI_GPU
1468  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1469  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1470 #else
1471  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1472 #endif
1473 
1474  const int ga_idx = sid;
1475 
1482 
1483 #ifdef MULTI_GPU
1484  if (kernel_type == INTERIOR_KERNEL) {
1485 #endif
1486 
1487  // read spinor from device memory
1488  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1489 
1490  // project spinor into half spinors
1491  a0_re = +i00_re+i20_im;
1492  a0_im = +i00_im-i20_re;
1493  a1_re = +i01_re+i21_im;
1494  a1_im = +i01_im-i21_re;
1495  a2_re = +i02_re+i22_im;
1496  a2_im = +i02_im-i22_re;
1497  b0_re = +i10_re-i30_im;
1498  b0_im = +i10_im+i30_re;
1499  b1_re = +i11_re-i31_im;
1500  b1_im = +i11_im+i31_re;
1501  b2_re = +i12_re-i32_im;
1502  b2_im = +i12_im+i32_re;
1503 
1504 #ifdef MULTI_GPU
1505  } else {
1506 
1507  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1508 
1509  // read half spinor from device memory
1510  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1511 
1512  a0_re = i00_re; a0_im = i00_im;
1513  a1_re = i01_re; a1_im = i01_im;
1514  a2_re = i02_re; a2_im = i02_im;
1515  b0_re = i10_re; b0_im = i10_im;
1516  b1_re = i11_re; b1_im = i11_im;
1517  b2_re = i12_re; b2_im = i12_im;
1518 
1519  }
1520 #endif // MULTI_GPU
1521 
1522  // read gauge matrix from device memory
1523  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1524 
1525  // reconstruct gauge matrix
1527 
1528  // multiply row 0
1529  spinorFloat A0_re = 0;
1530  A0_re += g00_re * a0_re;
1531  A0_re -= g00_im * a0_im;
1532  A0_re += g01_re * a1_re;
1533  A0_re -= g01_im * a1_im;
1534  A0_re += g02_re * a2_re;
1535  A0_re -= g02_im * a2_im;
1536  spinorFloat A0_im = 0;
1537  A0_im += g00_re * a0_im;
1538  A0_im += g00_im * a0_re;
1539  A0_im += g01_re * a1_im;
1540  A0_im += g01_im * a1_re;
1541  A0_im += g02_re * a2_im;
1542  A0_im += g02_im * a2_re;
1543  spinorFloat B0_re = 0;
1544  B0_re += g00_re * b0_re;
1545  B0_re -= g00_im * b0_im;
1546  B0_re += g01_re * b1_re;
1547  B0_re -= g01_im * b1_im;
1548  B0_re += g02_re * b2_re;
1549  B0_re -= g02_im * b2_im;
1550  spinorFloat B0_im = 0;
1551  B0_im += g00_re * b0_im;
1552  B0_im += g00_im * b0_re;
1553  B0_im += g01_re * b1_im;
1554  B0_im += g01_im * b1_re;
1555  B0_im += g02_re * b2_im;
1556  B0_im += g02_im * b2_re;
1557 
1558  // multiply row 1
1559  spinorFloat A1_re = 0;
1560  A1_re += g10_re * a0_re;
1561  A1_re -= g10_im * a0_im;
1562  A1_re += g11_re * a1_re;
1563  A1_re -= g11_im * a1_im;
1564  A1_re += g12_re * a2_re;
1565  A1_re -= g12_im * a2_im;
1566  spinorFloat A1_im = 0;
1567  A1_im += g10_re * a0_im;
1568  A1_im += g10_im * a0_re;
1569  A1_im += g11_re * a1_im;
1570  A1_im += g11_im * a1_re;
1571  A1_im += g12_re * a2_im;
1572  A1_im += g12_im * a2_re;
1573  spinorFloat B1_re = 0;
1574  B1_re += g10_re * b0_re;
1575  B1_re -= g10_im * b0_im;
1576  B1_re += g11_re * b1_re;
1577  B1_re -= g11_im * b1_im;
1578  B1_re += g12_re * b2_re;
1579  B1_re -= g12_im * b2_im;
1580  spinorFloat B1_im = 0;
1581  B1_im += g10_re * b0_im;
1582  B1_im += g10_im * b0_re;
1583  B1_im += g11_re * b1_im;
1584  B1_im += g11_im * b1_re;
1585  B1_im += g12_re * b2_im;
1586  B1_im += g12_im * b2_re;
1587 
1588  // multiply row 2
1589  spinorFloat A2_re = 0;
1590  A2_re += g20_re * a0_re;
1591  A2_re -= g20_im * a0_im;
1592  A2_re += g21_re * a1_re;
1593  A2_re -= g21_im * a1_im;
1594  A2_re += g22_re * a2_re;
1595  A2_re -= g22_im * a2_im;
1596  spinorFloat A2_im = 0;
1597  A2_im += g20_re * a0_im;
1598  A2_im += g20_im * a0_re;
1599  A2_im += g21_re * a1_im;
1600  A2_im += g21_im * a1_re;
1601  A2_im += g22_re * a2_im;
1602  A2_im += g22_im * a2_re;
1603  spinorFloat B2_re = 0;
1604  B2_re += g20_re * b0_re;
1605  B2_re -= g20_im * b0_im;
1606  B2_re += g21_re * b1_re;
1607  B2_re -= g21_im * b1_im;
1608  B2_re += g22_re * b2_re;
1609  B2_re -= g22_im * b2_im;
1610  spinorFloat B2_im = 0;
1611  B2_im += g20_re * b0_im;
1612  B2_im += g20_im * b0_re;
1613  B2_im += g21_re * b1_im;
1614  B2_im += g21_im * b1_re;
1615  B2_im += g22_re * b2_im;
1616  B2_im += g22_im * b2_re;
1617 
1618  o00_re += A0_re;
1619  o00_im += A0_im;
1620  o10_re += B0_re;
1621  o10_im += B0_im;
1622  o20_re -= A0_im;
1623  o20_im += A0_re;
1624  o30_re += B0_im;
1625  o30_im -= B0_re;
1626 
1627  o01_re += A1_re;
1628  o01_im += A1_im;
1629  o11_re += B1_re;
1630  o11_im += B1_im;
1631  o21_re -= A1_im;
1632  o21_im += A1_re;
1633  o31_re += B1_im;
1634  o31_im -= B1_re;
1635 
1636  o02_re += A2_re;
1637  o02_im += A2_im;
1638  o12_re += B2_re;
1639  o12_im += B2_im;
1640  o22_re -= A2_im;
1641  o22_im += A2_re;
1642  o32_re += B2_im;
1643  o32_im -= B2_re;
1644 
1645 }
1646 
1647 #ifdef MULTI_GPU
1648 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1649  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1650 #endif
1651 {
1652  // Projector P2+
1653  // 1 0 i 0
1654  // 0 1 0 -i
1655  // -i 0 1 0
1656  // 0 i 0 1
1657 
1658 #ifdef MULTI_GPU
1659  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1660  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1661 #else
1662  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1663 #endif
1664 
1665 #ifdef MULTI_GPU
1666  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1667 #else
1668  const int ga_idx = sp_idx;
1669 #endif
1670 
1677 
1678 #ifdef MULTI_GPU
1679  if (kernel_type == INTERIOR_KERNEL) {
1680 #endif
1681 
1682  // read spinor from device memory
1683  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1684 
1685  // project spinor into half spinors
1686  a0_re = +i00_re-i20_im;
1687  a0_im = +i00_im+i20_re;
1688  a1_re = +i01_re-i21_im;
1689  a1_im = +i01_im+i21_re;
1690  a2_re = +i02_re-i22_im;
1691  a2_im = +i02_im+i22_re;
1692  b0_re = +i10_re+i30_im;
1693  b0_im = +i10_im-i30_re;
1694  b1_re = +i11_re+i31_im;
1695  b1_im = +i11_im-i31_re;
1696  b2_re = +i12_re+i32_im;
1697  b2_im = +i12_im-i32_re;
1698 
1699 #ifdef MULTI_GPU
1700  } else {
1701 
1702  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1703 
1704  // read half spinor from device memory
1705  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1706 
1707  a0_re = i00_re; a0_im = i00_im;
1708  a1_re = i01_re; a1_im = i01_im;
1709  a2_re = i02_re; a2_im = i02_im;
1710  b0_re = i10_re; b0_im = i10_im;
1711  b1_re = i11_re; b1_im = i11_im;
1712  b2_re = i12_re; b2_im = i12_im;
1713 
1714  }
1715 #endif // MULTI_GPU
1716 
1717  // read gauge matrix from device memory
1718  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1719 
1720  // reconstruct gauge matrix
1722 
1723  // multiply row 0
1724  spinorFloat A0_re = 0;
1725  A0_re += gT00_re * a0_re;
1726  A0_re -= gT00_im * a0_im;
1727  A0_re += gT01_re * a1_re;
1728  A0_re -= gT01_im * a1_im;
1729  A0_re += gT02_re * a2_re;
1730  A0_re -= gT02_im * a2_im;
1731  spinorFloat A0_im = 0;
1732  A0_im += gT00_re * a0_im;
1733  A0_im += gT00_im * a0_re;
1734  A0_im += gT01_re * a1_im;
1735  A0_im += gT01_im * a1_re;
1736  A0_im += gT02_re * a2_im;
1737  A0_im += gT02_im * a2_re;
1738  spinorFloat B0_re = 0;
1739  B0_re += gT00_re * b0_re;
1740  B0_re -= gT00_im * b0_im;
1741  B0_re += gT01_re * b1_re;
1742  B0_re -= gT01_im * b1_im;
1743  B0_re += gT02_re * b2_re;
1744  B0_re -= gT02_im * b2_im;
1745  spinorFloat B0_im = 0;
1746  B0_im += gT00_re * b0_im;
1747  B0_im += gT00_im * b0_re;
1748  B0_im += gT01_re * b1_im;
1749  B0_im += gT01_im * b1_re;
1750  B0_im += gT02_re * b2_im;
1751  B0_im += gT02_im * b2_re;
1752 
1753  // multiply row 1
1754  spinorFloat A1_re = 0;
1755  A1_re += gT10_re * a0_re;
1756  A1_re -= gT10_im * a0_im;
1757  A1_re += gT11_re * a1_re;
1758  A1_re -= gT11_im * a1_im;
1759  A1_re += gT12_re * a2_re;
1760  A1_re -= gT12_im * a2_im;
1761  spinorFloat A1_im = 0;
1762  A1_im += gT10_re * a0_im;
1763  A1_im += gT10_im * a0_re;
1764  A1_im += gT11_re * a1_im;
1765  A1_im += gT11_im * a1_re;
1766  A1_im += gT12_re * a2_im;
1767  A1_im += gT12_im * a2_re;
1768  spinorFloat B1_re = 0;
1769  B1_re += gT10_re * b0_re;
1770  B1_re -= gT10_im * b0_im;
1771  B1_re += gT11_re * b1_re;
1772  B1_re -= gT11_im * b1_im;
1773  B1_re += gT12_re * b2_re;
1774  B1_re -= gT12_im * b2_im;
1775  spinorFloat B1_im = 0;
1776  B1_im += gT10_re * b0_im;
1777  B1_im += gT10_im * b0_re;
1778  B1_im += gT11_re * b1_im;
1779  B1_im += gT11_im * b1_re;
1780  B1_im += gT12_re * b2_im;
1781  B1_im += gT12_im * b2_re;
1782 
1783  // multiply row 2
1784  spinorFloat A2_re = 0;
1785  A2_re += gT20_re * a0_re;
1786  A2_re -= gT20_im * a0_im;
1787  A2_re += gT21_re * a1_re;
1788  A2_re -= gT21_im * a1_im;
1789  A2_re += gT22_re * a2_re;
1790  A2_re -= gT22_im * a2_im;
1791  spinorFloat A2_im = 0;
1792  A2_im += gT20_re * a0_im;
1793  A2_im += gT20_im * a0_re;
1794  A2_im += gT21_re * a1_im;
1795  A2_im += gT21_im * a1_re;
1796  A2_im += gT22_re * a2_im;
1797  A2_im += gT22_im * a2_re;
1798  spinorFloat B2_re = 0;
1799  B2_re += gT20_re * b0_re;
1800  B2_re -= gT20_im * b0_im;
1801  B2_re += gT21_re * b1_re;
1802  B2_re -= gT21_im * b1_im;
1803  B2_re += gT22_re * b2_re;
1804  B2_re -= gT22_im * b2_im;
1805  spinorFloat B2_im = 0;
1806  B2_im += gT20_re * b0_im;
1807  B2_im += gT20_im * b0_re;
1808  B2_im += gT21_re * b1_im;
1809  B2_im += gT21_im * b1_re;
1810  B2_im += gT22_re * b2_im;
1811  B2_im += gT22_im * b2_re;
1812 
1813  o00_re += A0_re;
1814  o00_im += A0_im;
1815  o10_re += B0_re;
1816  o10_im += B0_im;
1817  o20_re += A0_im;
1818  o20_im -= A0_re;
1819  o30_re -= B0_im;
1820  o30_im += B0_re;
1821 
1822  o01_re += A1_re;
1823  o01_im += A1_im;
1824  o11_re += B1_re;
1825  o11_im += B1_im;
1826  o21_re += A1_im;
1827  o21_im -= A1_re;
1828  o31_re -= B1_im;
1829  o31_im += B1_re;
1830 
1831  o02_re += A2_re;
1832  o02_im += A2_im;
1833  o12_re += B2_re;
1834  o12_im += B2_im;
1835  o22_re += A2_im;
1836  o22_im -= A2_re;
1837  o32_re -= B2_im;
1838  o32_im += B2_re;
1839 
1840 }
1841 
1842 #ifdef MULTI_GPU
1843 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1844  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1845 #endif
1846 {
1847  // Projector P3-
1848  // 0 0 0 0
1849  // 0 0 0 0
1850  // 0 0 2 0
1851  // 0 0 0 2
1852 
1853 #ifdef MULTI_GPU
1854  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1855  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1856 #else
1857  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1858 #endif
1859 
1860  const int ga_idx = sid;
1861 
1863  {
1870 
1871 #ifdef MULTI_GPU
1872  if (kernel_type == INTERIOR_KERNEL) {
1873 #endif
1874 
1875  // read spinor from device memory
1876  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1877 
1878  // project spinor into half spinors
1879  a0_re = +2*i20_re;
1880  a0_im = +2*i20_im;
1881  a1_re = +2*i21_re;
1882  a1_im = +2*i21_im;
1883  a2_re = +2*i22_re;
1884  a2_im = +2*i22_im;
1885  b0_re = +2*i30_re;
1886  b0_im = +2*i30_im;
1887  b1_re = +2*i31_re;
1888  b1_im = +2*i31_im;
1889  b2_re = +2*i32_re;
1890  b2_im = +2*i32_im;
1891 
1892 #ifdef MULTI_GPU
1893  } else {
1894 
1895  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1896  const int t_proj_scale = TPROJSCALE;
1897 
1898  // read half spinor from device memory
1899  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1900 
1901  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1902  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1903  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1904  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1905  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1906  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1907 
1908  }
1909 #endif // MULTI_GPU
1910 
1911  // identity gauge matrix
1918 
1919  o20_re += A0_re;
1920  o20_im += A0_im;
1921  o30_re += B0_re;
1922  o30_im += B0_im;
1923 
1924  o21_re += A1_re;
1925  o21_im += A1_im;
1926  o31_re += B1_re;
1927  o31_im += B1_im;
1928 
1929  o22_re += A2_re;
1930  o22_im += A2_im;
1931  o32_re += B2_re;
1932  o32_im += B2_im;
1933 
1934  } else {
1941 
1942 #ifdef MULTI_GPU
1943  if (kernel_type == INTERIOR_KERNEL) {
1944 #endif
1945 
1946  // read spinor from device memory
1947  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1948 
1949  // project spinor into half spinors
1950  a0_re = +2*i20_re;
1951  a0_im = +2*i20_im;
1952  a1_re = +2*i21_re;
1953  a1_im = +2*i21_im;
1954  a2_re = +2*i22_re;
1955  a2_im = +2*i22_im;
1956  b0_re = +2*i30_re;
1957  b0_im = +2*i30_im;
1958  b1_re = +2*i31_re;
1959  b1_im = +2*i31_im;
1960  b2_re = +2*i32_re;
1961  b2_im = +2*i32_im;
1962 
1963 #ifdef MULTI_GPU
1964  } else {
1965 
1966  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1967  const int t_proj_scale = TPROJSCALE;
1968 
1969  // read half spinor from device memory
1970  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1971 
1972  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1973  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1974  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1975  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1976  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1977  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1978 
1979  }
1980 #endif // MULTI_GPU
1981 
1982  // read gauge matrix from device memory
1983  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1984 
1985  // reconstruct gauge matrix
1987 
1988  // multiply row 0
1989  spinorFloat A0_re = 0;
1990  A0_re += g00_re * a0_re;
1991  A0_re -= g00_im * a0_im;
1992  A0_re += g01_re * a1_re;
1993  A0_re -= g01_im * a1_im;
1994  A0_re += g02_re * a2_re;
1995  A0_re -= g02_im * a2_im;
1996  spinorFloat A0_im = 0;
1997  A0_im += g00_re * a0_im;
1998  A0_im += g00_im * a0_re;
1999  A0_im += g01_re * a1_im;
2000  A0_im += g01_im * a1_re;
2001  A0_im += g02_re * a2_im;
2002  A0_im += g02_im * a2_re;
2003  spinorFloat B0_re = 0;
2004  B0_re += g00_re * b0_re;
2005  B0_re -= g00_im * b0_im;
2006  B0_re += g01_re * b1_re;
2007  B0_re -= g01_im * b1_im;
2008  B0_re += g02_re * b2_re;
2009  B0_re -= g02_im * b2_im;
2010  spinorFloat B0_im = 0;
2011  B0_im += g00_re * b0_im;
2012  B0_im += g00_im * b0_re;
2013  B0_im += g01_re * b1_im;
2014  B0_im += g01_im * b1_re;
2015  B0_im += g02_re * b2_im;
2016  B0_im += g02_im * b2_re;
2017 
2018  // multiply row 1
2019  spinorFloat A1_re = 0;
2020  A1_re += g10_re * a0_re;
2021  A1_re -= g10_im * a0_im;
2022  A1_re += g11_re * a1_re;
2023  A1_re -= g11_im * a1_im;
2024  A1_re += g12_re * a2_re;
2025  A1_re -= g12_im * a2_im;
2026  spinorFloat A1_im = 0;
2027  A1_im += g10_re * a0_im;
2028  A1_im += g10_im * a0_re;
2029  A1_im += g11_re * a1_im;
2030  A1_im += g11_im * a1_re;
2031  A1_im += g12_re * a2_im;
2032  A1_im += g12_im * a2_re;
2033  spinorFloat B1_re = 0;
2034  B1_re += g10_re * b0_re;
2035  B1_re -= g10_im * b0_im;
2036  B1_re += g11_re * b1_re;
2037  B1_re -= g11_im * b1_im;
2038  B1_re += g12_re * b2_re;
2039  B1_re -= g12_im * b2_im;
2040  spinorFloat B1_im = 0;
2041  B1_im += g10_re * b0_im;
2042  B1_im += g10_im * b0_re;
2043  B1_im += g11_re * b1_im;
2044  B1_im += g11_im * b1_re;
2045  B1_im += g12_re * b2_im;
2046  B1_im += g12_im * b2_re;
2047 
2048  // multiply row 2
2049  spinorFloat A2_re = 0;
2050  A2_re += g20_re * a0_re;
2051  A2_re -= g20_im * a0_im;
2052  A2_re += g21_re * a1_re;
2053  A2_re -= g21_im * a1_im;
2054  A2_re += g22_re * a2_re;
2055  A2_re -= g22_im * a2_im;
2056  spinorFloat A2_im = 0;
2057  A2_im += g20_re * a0_im;
2058  A2_im += g20_im * a0_re;
2059  A2_im += g21_re * a1_im;
2060  A2_im += g21_im * a1_re;
2061  A2_im += g22_re * a2_im;
2062  A2_im += g22_im * a2_re;
2063  spinorFloat B2_re = 0;
2064  B2_re += g20_re * b0_re;
2065  B2_re -= g20_im * b0_im;
2066  B2_re += g21_re * b1_re;
2067  B2_re -= g21_im * b1_im;
2068  B2_re += g22_re * b2_re;
2069  B2_re -= g22_im * b2_im;
2070  spinorFloat B2_im = 0;
2071  B2_im += g20_re * b0_im;
2072  B2_im += g20_im * b0_re;
2073  B2_im += g21_re * b1_im;
2074  B2_im += g21_im * b1_re;
2075  B2_im += g22_re * b2_im;
2076  B2_im += g22_im * b2_re;
2077 
2078  o20_re += A0_re;
2079  o20_im += A0_im;
2080  o30_re += B0_re;
2081  o30_im += B0_im;
2082 
2083  o21_re += A1_re;
2084  o21_im += A1_im;
2085  o31_re += B1_re;
2086  o31_im += B1_im;
2087 
2088  o22_re += A2_re;
2089  o22_im += A2_im;
2090  o32_re += B2_re;
2091  o32_im += B2_im;
2092 
2093  }
2094 }
2095 
2096 #ifdef MULTI_GPU
2097 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2098  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2099 #endif
2100 {
2101  // Projector P3+
2102  // 2 0 0 0
2103  // 0 2 0 0
2104  // 0 0 0 0
2105  // 0 0 0 0
2106 
2107 #ifdef MULTI_GPU
2108  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2109  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2110 #else
2111  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2112 #endif
2113 
2114 #ifdef MULTI_GPU
2115  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2116 #else
2117  const int ga_idx = sp_idx;
2118 #endif
2119 
2120  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2121  {
2128 
2129 #ifdef MULTI_GPU
2130  if (kernel_type == INTERIOR_KERNEL) {
2131 #endif
2132 
2133  // read spinor from device memory
2134  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2135 
2136  // project spinor into half spinors
2137  a0_re = +2*i00_re;
2138  a0_im = +2*i00_im;
2139  a1_re = +2*i01_re;
2140  a1_im = +2*i01_im;
2141  a2_re = +2*i02_re;
2142  a2_im = +2*i02_im;
2143  b0_re = +2*i10_re;
2144  b0_im = +2*i10_im;
2145  b1_re = +2*i11_re;
2146  b1_im = +2*i11_im;
2147  b2_re = +2*i12_re;
2148  b2_im = +2*i12_im;
2149 
2150 #ifdef MULTI_GPU
2151  } else {
2152 
2153  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2154  const int t_proj_scale = TPROJSCALE;
2155 
2156  // read half spinor from device memory
2157  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2158 
2159  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2160  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2161  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2162  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2163  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2164  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2165 
2166  }
2167 #endif // MULTI_GPU
2168 
2169  // identity gauge matrix
2176 
2177  o00_re += A0_re;
2178  o00_im += A0_im;
2179  o10_re += B0_re;
2180  o10_im += B0_im;
2181 
2182  o01_re += A1_re;
2183  o01_im += A1_im;
2184  o11_re += B1_re;
2185  o11_im += B1_im;
2186 
2187  o02_re += A2_re;
2188  o02_im += A2_im;
2189  o12_re += B2_re;
2190  o12_im += B2_im;
2191 
2192  } else {
2199 
2200 #ifdef MULTI_GPU
2201  if (kernel_type == INTERIOR_KERNEL) {
2202 #endif
2203 
2204  // read spinor from device memory
2205  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2206 
2207  // project spinor into half spinors
2208  a0_re = +2*i00_re;
2209  a0_im = +2*i00_im;
2210  a1_re = +2*i01_re;
2211  a1_im = +2*i01_im;
2212  a2_re = +2*i02_re;
2213  a2_im = +2*i02_im;
2214  b0_re = +2*i10_re;
2215  b0_im = +2*i10_im;
2216  b1_re = +2*i11_re;
2217  b1_im = +2*i11_im;
2218  b2_re = +2*i12_re;
2219  b2_im = +2*i12_im;
2220 
2221 #ifdef MULTI_GPU
2222  } else {
2223 
2224  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2225  const int t_proj_scale = TPROJSCALE;
2226 
2227  // read half spinor from device memory
2228  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2229 
2230  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2231  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2232  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2233  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2234  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2235  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2236 
2237  }
2238 #endif // MULTI_GPU
2239 
2240  // read gauge matrix from device memory
2241  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2242 
2243  // reconstruct gauge matrix
2245 
2246  // multiply row 0
2247  spinorFloat A0_re = 0;
2248  A0_re += gT00_re * a0_re;
2249  A0_re -= gT00_im * a0_im;
2250  A0_re += gT01_re * a1_re;
2251  A0_re -= gT01_im * a1_im;
2252  A0_re += gT02_re * a2_re;
2253  A0_re -= gT02_im * a2_im;
2254  spinorFloat A0_im = 0;
2255  A0_im += gT00_re * a0_im;
2256  A0_im += gT00_im * a0_re;
2257  A0_im += gT01_re * a1_im;
2258  A0_im += gT01_im * a1_re;
2259  A0_im += gT02_re * a2_im;
2260  A0_im += gT02_im * a2_re;
2261  spinorFloat B0_re = 0;
2262  B0_re += gT00_re * b0_re;
2263  B0_re -= gT00_im * b0_im;
2264  B0_re += gT01_re * b1_re;
2265  B0_re -= gT01_im * b1_im;
2266  B0_re += gT02_re * b2_re;
2267  B0_re -= gT02_im * b2_im;
2268  spinorFloat B0_im = 0;
2269  B0_im += gT00_re * b0_im;
2270  B0_im += gT00_im * b0_re;
2271  B0_im += gT01_re * b1_im;
2272  B0_im += gT01_im * b1_re;
2273  B0_im += gT02_re * b2_im;
2274  B0_im += gT02_im * b2_re;
2275 
2276  // multiply row 1
2277  spinorFloat A1_re = 0;
2278  A1_re += gT10_re * a0_re;
2279  A1_re -= gT10_im * a0_im;
2280  A1_re += gT11_re * a1_re;
2281  A1_re -= gT11_im * a1_im;
2282  A1_re += gT12_re * a2_re;
2283  A1_re -= gT12_im * a2_im;
2284  spinorFloat A1_im = 0;
2285  A1_im += gT10_re * a0_im;
2286  A1_im += gT10_im * a0_re;
2287  A1_im += gT11_re * a1_im;
2288  A1_im += gT11_im * a1_re;
2289  A1_im += gT12_re * a2_im;
2290  A1_im += gT12_im * a2_re;
2291  spinorFloat B1_re = 0;
2292  B1_re += gT10_re * b0_re;
2293  B1_re -= gT10_im * b0_im;
2294  B1_re += gT11_re * b1_re;
2295  B1_re -= gT11_im * b1_im;
2296  B1_re += gT12_re * b2_re;
2297  B1_re -= gT12_im * b2_im;
2298  spinorFloat B1_im = 0;
2299  B1_im += gT10_re * b0_im;
2300  B1_im += gT10_im * b0_re;
2301  B1_im += gT11_re * b1_im;
2302  B1_im += gT11_im * b1_re;
2303  B1_im += gT12_re * b2_im;
2304  B1_im += gT12_im * b2_re;
2305 
2306  // multiply row 2
2307  spinorFloat A2_re = 0;
2308  A2_re += gT20_re * a0_re;
2309  A2_re -= gT20_im * a0_im;
2310  A2_re += gT21_re * a1_re;
2311  A2_re -= gT21_im * a1_im;
2312  A2_re += gT22_re * a2_re;
2313  A2_re -= gT22_im * a2_im;
2314  spinorFloat A2_im = 0;
2315  A2_im += gT20_re * a0_im;
2316  A2_im += gT20_im * a0_re;
2317  A2_im += gT21_re * a1_im;
2318  A2_im += gT21_im * a1_re;
2319  A2_im += gT22_re * a2_im;
2320  A2_im += gT22_im * a2_re;
2321  spinorFloat B2_re = 0;
2322  B2_re += gT20_re * b0_re;
2323  B2_re -= gT20_im * b0_im;
2324  B2_re += gT21_re * b1_re;
2325  B2_re -= gT21_im * b1_im;
2326  B2_re += gT22_re * b2_re;
2327  B2_re -= gT22_im * b2_im;
2328  spinorFloat B2_im = 0;
2329  B2_im += gT20_re * b0_im;
2330  B2_im += gT20_im * b0_re;
2331  B2_im += gT21_re * b1_im;
2332  B2_im += gT21_im * b1_re;
2333  B2_im += gT22_re * b2_im;
2334  B2_im += gT22_im * b2_re;
2335 
2336  o00_re += A0_re;
2337  o00_im += A0_im;
2338  o10_re += B0_re;
2339  o10_im += B0_im;
2340 
2341  o01_re += A1_re;
2342  o01_im += A1_im;
2343  o11_re += B1_re;
2344  o11_im += B1_im;
2345 
2346  o02_re += A2_re;
2347  o02_im += A2_im;
2348  o12_re += B2_re;
2349  o12_im += B2_im;
2350 
2351  }
2352 }
2353 
2354 #ifdef MULTI_GPU
2355 
2356 int incomplete = 0; // Have all 8 contributions been computed for this site?
2357 
2358 switch(kernel_type) { // intentional fall-through
2359 case INTERIOR_KERNEL:
2360  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2361 case EXTERIOR_KERNEL_T:
2362  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2363 case EXTERIOR_KERNEL_Z:
2364  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2365 case EXTERIOR_KERNEL_Y:
2366  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2367 }
2368 
2369 if (!incomplete)
2370 #endif // MULTI_GPU
2371 {
2372 #ifdef DSLASH_XPAY
2373  READ_ACCUM(ACCUMTEX, param.sp_stride)
2374 
2375 #ifndef CLOVER_TWIST_XPAY
2376  //perform invert twist first:
2377  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2378  o00_re = b*o00_re + acc00_re;
2379  o00_im = b*o00_im + acc00_im;
2380  o01_re = b*o01_re + acc01_re;
2381  o01_im = b*o01_im + acc01_im;
2382  o02_re = b*o02_re + acc02_re;
2383  o02_im = b*o02_im + acc02_im;
2384  o10_re = b*o10_re + acc10_re;
2385  o10_im = b*o10_im + acc10_im;
2386  o11_re = b*o11_re + acc11_re;
2387  o11_im = b*o11_im + acc11_im;
2388  o12_re = b*o12_re + acc12_re;
2389  o12_im = b*o12_im + acc12_im;
2390  o20_re = b*o20_re + acc20_re;
2391  o20_im = b*o20_im + acc20_im;
2392  o21_re = b*o21_re + acc21_re;
2393  o21_im = b*o21_im + acc21_im;
2394  o22_re = b*o22_re + acc22_re;
2395  o22_im = b*o22_im + acc22_im;
2396  o30_re = b*o30_re + acc30_re;
2397  o30_im = b*o30_im + acc30_im;
2398  o31_re = b*o31_re + acc31_re;
2399  o31_im = b*o31_im + acc31_im;
2400  o32_re = b*o32_re + acc32_re;
2401  o32_im = b*o32_im + acc32_im;
2402 #else
2403  APPLY_CLOVER_TWIST(c, a, acc);
2404  o00_re = b*o00_re + acc00_re;
2405  o00_im = b*o00_im + acc00_im;
2406  o01_re = b*o01_re + acc01_re;
2407  o01_im = b*o01_im + acc01_im;
2408  o02_re = b*o02_re + acc02_re;
2409  o02_im = b*o02_im + acc02_im;
2410  o10_re = b*o10_re + acc10_re;
2411  o10_im = b*o10_im + acc10_im;
2412  o11_re = b*o11_re + acc11_re;
2413  o11_im = b*o11_im + acc11_im;
2414  o12_re = b*o12_re + acc12_re;
2415  o12_im = b*o12_im + acc12_im;
2416  o20_re = b*o20_re + acc20_re;
2417  o20_im = b*o20_im + acc20_im;
2418  o21_re = b*o21_re + acc21_re;
2419  o21_im = b*o21_im + acc21_im;
2420  o22_re = b*o22_re + acc22_re;
2421  o22_im = b*o22_im + acc22_im;
2422  o30_re = b*o30_re + acc30_re;
2423  o30_im = b*o30_im + acc30_im;
2424  o31_re = b*o31_re + acc31_re;
2425  o31_im = b*o31_im + acc31_im;
2426  o32_re = b*o32_re + acc32_re;
2427  o32_im = b*o32_im + acc32_im;
2428 #endif//CLOVER_TWIST_XPAY
2429 #else //no XPAY
2430  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2431 #endif
2432 }
2433 
2434 // write spinor field back to device memory
2435 WRITE_SPINOR(param.sp_stride);
2436 
2437 // undefine to prevent warning when precision is changed
2438 #undef spinorFloat
2439 #undef SHARED_STRIDE
2440 
2441 #undef g00_re
2442 #undef g00_im
2443 #undef g01_re
2444 #undef g01_im
2445 #undef g02_re
2446 #undef g02_im
2447 #undef g10_re
2448 #undef g10_im
2449 #undef g11_re
2450 #undef g11_im
2451 #undef g12_re
2452 #undef g12_im
2453 #undef g20_re
2454 #undef g20_im
2455 #undef g21_re
2456 #undef g21_im
2457 #undef g22_re
2458 #undef g22_im
2459 
2460 #undef i00_re
2461 #undef i00_im
2462 #undef i01_re
2463 #undef i01_im
2464 #undef i02_re
2465 #undef i02_im
2466 #undef i10_re
2467 #undef i10_im
2468 #undef i11_re
2469 #undef i11_im
2470 #undef i12_re
2471 #undef i12_im
2472 #undef i20_re
2473 #undef i20_im
2474 #undef i21_re
2475 #undef i21_im
2476 #undef i22_re
2477 #undef i22_im
2478 #undef i30_re
2479 #undef i30_im
2480 #undef i31_re
2481 #undef i31_im
2482 #undef i32_re
2483 #undef i32_im
2484 
2485 #undef c00_00_re
2486 #undef c01_01_re
2487 #undef c02_02_re
2488 #undef c10_10_re
2489 #undef c11_11_re
2490 #undef c12_12_re
2491 #undef c01_00_re
2492 #undef c01_00_im
2493 #undef c02_00_re
2494 #undef c02_00_im
2495 #undef c10_00_re
2496 #undef c10_00_im
2497 #undef c11_00_re
2498 #undef c11_00_im
2499 #undef c12_00_re
2500 #undef c12_00_im
2501 #undef c02_01_re
2502 #undef c02_01_im
2503 #undef c10_01_re
2504 #undef c10_01_im
2505 #undef c11_01_re
2506 #undef c11_01_im
2507 #undef c12_01_re
2508 #undef c12_01_im
2509 #undef c10_02_re
2510 #undef c10_02_im
2511 #undef c11_02_re
2512 #undef c11_02_im
2513 #undef c12_02_re
2514 #undef c12_02_im
2515 #undef c11_10_re
2516 #undef c11_10_im
2517 #undef c12_10_re
2518 #undef c12_10_im
2519 #undef c12_11_re
2520 #undef c12_11_im
2521 
2522 #undef cinv00_00_re
2523 #undef cinv01_01_re
2524 #undef cinv02_02_re
2525 #undef cinv10_10_re
2526 #undef cinv11_11_re
2527 #undef cinv12_12_re
2528 #undef cinv01_00_re
2529 #undef cinv01_00_im
2530 #undef cinv02_00_re
2531 #undef cinv02_00_im
2532 #undef cinv10_00_re
2533 #undef cinv10_00_im
2534 #undef cinv11_00_re
2535 #undef cinv11_00_im
2536 #undef cinv12_00_re
2537 #undef cinv12_00_im
2538 #undef cinv02_01_re
2539 #undef cinv02_01_im
2540 #undef cinv10_01_re
2541 #undef cinv10_01_im
2542 #undef cinv11_01_re
2543 #undef cinv11_01_im
2544 #undef cinv12_01_re
2545 #undef cinv12_01_im
2546 #undef cinv10_02_re
2547 #undef cinv10_02_im
2548 #undef cinv11_02_re
2549 #undef cinv11_02_im
2550 #undef cinv12_02_re
2551 #undef cinv12_02_im
2552 #undef cinv11_10_re
2553 #undef cinv11_10_im
2554 #undef cinv12_10_re
2555 #undef cinv12_10_im
2556 #undef cinv12_11_re
2557 #undef cinv12_11_im
2558 
2559 #undef acc00_re
2560 #undef acc00_im
2561 #undef acc01_re
2562 #undef acc01_im
2563 #undef acc02_re
2564 #undef acc02_im
2565 #undef acc10_re
2566 #undef acc10_im
2567 #undef acc11_re
2568 #undef acc11_im
2569 #undef acc12_re
2570 #undef acc12_im
2571 #undef acc20_re
2572 #undef acc20_im
2573 #undef acc21_re
2574 #undef acc21_im
2575 #undef acc22_re
2576 #undef acc22_im
2577 #undef acc30_re
2578 #undef acc30_im
2579 #undef acc31_re
2580 #undef acc31_im
2581 #undef acc32_re
2582 #undef acc32_im
2583 
2584 
2585 #undef o00_re
2586 #undef o00_im
2587 #undef o01_re
2588 #undef o01_im
2589 #undef o02_re
2590 #undef o02_im
2591 #undef o10_re
2592 #undef o10_im
2593 #undef o11_re
2594 #undef o11_im
2595 #undef o12_re
2596 #undef o12_im
2597 #undef o20_re
2598 #undef o20_im
2599 #undef o21_re
2600 #undef o21_im
2601 #undef o22_re
2602 #undef o22_im
2603 #undef o30_re
2604 
2605 #undef VOLATILE
float4 C0
spinorFloat B0_re
#define acc02_re
#define g12_im
#define i20_im
#define SHARED_STRIDE
#define i12_re
spinorFloat A0_re
__constant__ int Vh
#define i01_re
#define o22_re
VOLATILE spinorFloat o32_im
#define o00_re
int x2
#define i21_re
#define acc20_re
#define acc11_re
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:1
__constant__ int X2
#define acc00_re
#define g01_im
__constant__ int X2X1mX1
spinorFloat B1_re
#define acc31_re
#define acc32_re
const int ga_idx
__constant__ int X3X2X1mX2X1
#define gT21_im
#define i10_im
#define o02_re
spinorFloat b2_im
__constant__ int X1
#define o21_im
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
#define o21_re
int sp_idx
#define spinorFloat
#define o30_re
#define g22_re
#define i32_re
__constant__ int X3X2X1
#define o01_re
spinorFloat B1_im
#define gT01_im
#define acc32_im
int x3
#define DSLASH_SHARED_FLOATS_PER_THREAD
spinorFloat a1_im
#define acc02_im
#define o10_im
#define g02_im
#define acc21_im
#define gT21_re
#define acc31_im
#define gT00_re
#define acc20_im
int sid
#define o12_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat b0_re
#define o00_im
#define o11_im
#define i30_re
QudaGaugeParam param
Definition: pack_test.cpp:17
spinorFloat b1_im
VOLATILE spinorFloat o32_re
spinorFloat a0_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
spinorFloat a2_im
spinorFloat b1_re
#define acc01_im
#define i31_re
float4 C3
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define acc10_im
#define g10_re
float4 C5
#define i11_im
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
Definition: tmc_core.h:432
float4 C6
#define acc00_im
#define g11_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define g22_im
#define g01_re
VOLATILE spinorFloat * s
float4 C2
#define g00_re
#define i21_im
spinorFloat B2_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define GAUGE0TEX
Definition: covDev.h:112
#define acc12_im
#define acc21_re
WRITE_SPINOR(param.sp_stride)
#define gT11_im
#define g20_im
const int dims[]
#define o20_re
#define acc30_im
VOLATILE spinorFloat o30_im
float4 C4
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
__constant__ int gauge_fixed
spinorFloat B0_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define VOLATILE
#define gT02_re
__constant__ int X4X3X2X1mX3X2X1
#define gT20_im
#define SPINOR_HOP
Definition: covDev.h:158
#define i02_re
#define o02_im
#define g11_re
#define o22_im
spinorFloat a1_re
__constant__ int ga_stride
#define g20_re
spinorFloat A2_re
#define i00_im
spinorFloat a2_re
#define g10_im
float4 C7
#define g02_re
#define o12_im
VOLATILE spinorFloat o31_re
spinorFloat A1_im
#define i11_re
#define gT01_re
#define g21_re
#define acc10_re
__constant__ int X1m1
#define g00_im
__constant__ int X3
#define gT12_re
#define i00_re
#define i01_im
spinorFloat A1_re
spinorFloat A2_im
#define gT02_im
#define gT11_re
int x1
#define acc11_im
#define gT12_im
#define gT10_im
#define i30_im
spinorFloat B2_im
spinorFloat b0_im
spinorFloat A0_im
#define GAUGE1TEX
Definition: covDev.h:113
#define o11_re
#define acc30_re
#define g12_re
int x4
#define gT20_re
#define o10_re
#define gT00_im
spinorFloat a0_im
#define i20_re
#define i31_im
__constant__ int X4m1
#define i32_im
#define i22_re
spinorFloat b2_re
#define gT22_im
#define i12_im
#define acc01_re
RECONSTRUCT_GAUGE_MATRIX(0)
__shared__ char s_data[]
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define i22_im
#define gT22_re
__constant__ int X4X3X2X1hmX3X2X1h
#define i10_re
float4 C8
#define o20_im
#define acc22_im
KernelType kernel_type
#define o01_im
float4 C1
VOLATILE spinorFloat o31_im
#define g21_im
__constant__ int X4
__constant__ int X3m1
#define acc12_re
#define TPROJSCALE
Definition: covDev.h:101
#define gT10_re
#define acc22_re
__constant__ int X2X1
#define i02_im