QUDA  v0.7.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tmc_dslash_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// Number of spinor floats staged in shared memory per thread: matches the
// 12 complex input-spinor components (i00..i32 below) = 24 real values.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// VOLATILE expands to nothing for the NVVM back end (CUDA >= 4.1 targeting
// sm_20+) and to 'volatile' for the older Open64 back end.
// NOTE(review): presumably a codegen workaround needed only under Open64 —
// confirm against the sites where VOLATILE is applied.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link: names the entries of the complex 3x3 link matrix U_mu(x).
// Two register packings: GAUGE_FLOAT2 stores one complex number per
// float2/double2 register (G0..G8), otherwise two complex numbers are packed
// per float4 register (G0..G4).
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link: gT = G^dagger (Hermitian conjugate — indices
// transposed and imaginary parts negated), used for hops in the negative
// direction.
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // first chiral block of clover term
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
289 // second chiral block of clover term (defined as aliases of the first-chiral-block macros, reusing the same C registers)
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
357 
358 // first chiral block of inverted clover term
359 #ifdef CLOVER_DOUBLE
360 #define cinv00_00_re C0.x
361 #define cinv01_01_re C0.y
362 #define cinv02_02_re C1.x
363 #define cinv10_10_re C1.y
364 #define cinv11_11_re C2.x
365 #define cinv12_12_re C2.y
366 #define cinv01_00_re C3.x
367 #define cinv01_00_im C3.y
368 #define cinv02_00_re C4.x
369 #define cinv02_00_im C4.y
370 #define cinv10_00_re C5.x
371 #define cinv10_00_im C5.y
372 #define cinv11_00_re C6.x
373 #define cinv11_00_im C6.y
374 #define cinv12_00_re C7.x
375 #define cinv12_00_im C7.y
376 #define cinv02_01_re C8.x
377 #define cinv02_01_im C8.y
378 #define cinv10_01_re C9.x
379 #define cinv10_01_im C9.y
380 #define cinv11_01_re C10.x
381 #define cinv11_01_im C10.y
382 #define cinv12_01_re C11.x
383 #define cinv12_01_im C11.y
384 #define cinv10_02_re C12.x
385 #define cinv10_02_im C12.y
386 #define cinv11_02_re C13.x
387 #define cinv11_02_im C13.y
388 #define cinv12_02_re C14.x
389 #define cinv12_02_im C14.y
390 #define cinv11_10_re C15.x
391 #define cinv11_10_im C15.y
392 #define cinv12_10_re C16.x
393 #define cinv12_10_im C16.y
394 #define cinv12_11_re C17.x
395 #define cinv12_11_im C17.y
396 #else
397 #define cinv00_00_re C0.x
398 #define cinv01_01_re C0.y
399 #define cinv02_02_re C0.z
400 #define cinv10_10_re C0.w
401 #define cinv11_11_re C1.x
402 #define cinv12_12_re C1.y
403 #define cinv01_00_re C1.z
404 #define cinv01_00_im C1.w
405 #define cinv02_00_re C2.x
406 #define cinv02_00_im C2.y
407 #define cinv10_00_re C2.z
408 #define cinv10_00_im C2.w
409 #define cinv11_00_re C3.x
410 #define cinv11_00_im C3.y
411 #define cinv12_00_re C3.z
412 #define cinv12_00_im C3.w
413 #define cinv02_01_re C4.x
414 #define cinv02_01_im C4.y
415 #define cinv10_01_re C4.z
416 #define cinv10_01_im C4.w
417 #define cinv11_01_re C5.x
418 #define cinv11_01_im C5.y
419 #define cinv12_01_re C5.z
420 #define cinv12_01_im C5.w
421 #define cinv10_02_re C6.x
422 #define cinv10_02_im C6.y
423 #define cinv11_02_re C6.z
424 #define cinv11_02_im C6.w
425 #define cinv12_02_re C7.x
426 #define cinv12_02_im C7.y
427 #define cinv11_10_re C7.z
428 #define cinv11_10_im C7.w
429 #define cinv12_10_re C8.x
430 #define cinv12_10_im C8.y
431 #define cinv12_11_re C8.z
432 #define cinv12_11_im C8.w
433 #endif // CLOVER_DOUBLE
434 
435 #define cinv00_01_re (+cinv01_00_re)
436 #define cinv00_01_im (-cinv01_00_im)
437 #define cinv00_02_re (+cinv02_00_re)
438 #define cinv00_02_im (-cinv02_00_im)
439 #define cinv01_02_re (+cinv02_01_re)
440 #define cinv01_02_im (-cinv02_01_im)
441 #define cinv00_10_re (+cinv10_00_re)
442 #define cinv00_10_im (-cinv10_00_im)
443 #define cinv01_10_re (+cinv10_01_re)
444 #define cinv01_10_im (-cinv10_01_im)
445 #define cinv02_10_re (+cinv10_02_re)
446 #define cinv02_10_im (-cinv10_02_im)
447 #define cinv00_11_re (+cinv11_00_re)
448 #define cinv00_11_im (-cinv11_00_im)
449 #define cinv01_11_re (+cinv11_01_re)
450 #define cinv01_11_im (-cinv11_01_im)
451 #define cinv02_11_re (+cinv11_02_re)
452 #define cinv02_11_im (-cinv11_02_im)
453 #define cinv10_11_re (+cinv11_10_re)
454 #define cinv10_11_im (-cinv11_10_im)
455 #define cinv00_12_re (+cinv12_00_re)
456 #define cinv00_12_im (-cinv12_00_im)
457 #define cinv01_12_re (+cinv12_01_re)
458 #define cinv01_12_im (-cinv12_01_im)
459 #define cinv02_12_re (+cinv12_02_re)
460 #define cinv02_12_im (-cinv12_02_im)
461 #define cinv10_12_re (+cinv12_10_re)
462 #define cinv10_12_im (-cinv12_10_im)
463 #define cinv11_12_re (+cinv12_11_re)
464 #define cinv11_12_im (-cinv12_11_im)
465 
466 // second chiral block of inverted clover term (defined as aliases of the first-chiral-block macros, reusing the same C registers)
467 #define cinv20_20_re cinv00_00_re
468 #define cinv21_20_re cinv01_00_re
469 #define cinv21_20_im cinv01_00_im
470 #define cinv22_20_re cinv02_00_re
471 #define cinv22_20_im cinv02_00_im
472 #define cinv30_20_re cinv10_00_re
473 #define cinv30_20_im cinv10_00_im
474 #define cinv31_20_re cinv11_00_re
475 #define cinv31_20_im cinv11_00_im
476 #define cinv32_20_re cinv12_00_re
477 #define cinv32_20_im cinv12_00_im
478 #define cinv20_21_re cinv00_01_re
479 #define cinv20_21_im cinv00_01_im
480 #define cinv21_21_re cinv01_01_re
481 #define cinv22_21_re cinv02_01_re
482 #define cinv22_21_im cinv02_01_im
483 #define cinv30_21_re cinv10_01_re
484 #define cinv30_21_im cinv10_01_im
485 #define cinv31_21_re cinv11_01_re
486 #define cinv31_21_im cinv11_01_im
487 #define cinv32_21_re cinv12_01_re
488 #define cinv32_21_im cinv12_01_im
489 #define cinv20_22_re cinv00_02_re
490 #define cinv20_22_im cinv00_02_im
491 #define cinv21_22_re cinv01_02_re
492 #define cinv21_22_im cinv01_02_im
493 #define cinv22_22_re cinv02_02_re
494 #define cinv30_22_re cinv10_02_re
495 #define cinv30_22_im cinv10_02_im
496 #define cinv31_22_re cinv11_02_re
497 #define cinv31_22_im cinv11_02_im
498 #define cinv32_22_re cinv12_02_re
499 #define cinv32_22_im cinv12_02_im
500 #define cinv20_30_re cinv00_10_re
501 #define cinv20_30_im cinv00_10_im
502 #define cinv21_30_re cinv01_10_re
503 #define cinv21_30_im cinv01_10_im
504 #define cinv22_30_re cinv02_10_re
505 #define cinv22_30_im cinv02_10_im
506 #define cinv30_30_re cinv10_10_re
507 #define cinv31_30_re cinv11_10_re
508 #define cinv31_30_im cinv11_10_im
509 #define cinv32_30_re cinv12_10_re
510 #define cinv32_30_im cinv12_10_im
511 #define cinv20_31_re cinv00_11_re
512 #define cinv20_31_im cinv00_11_im
513 #define cinv21_31_re cinv01_11_re
514 #define cinv21_31_im cinv01_11_im
515 #define cinv22_31_re cinv02_11_re
516 #define cinv22_31_im cinv02_11_im
517 #define cinv30_31_re cinv10_11_re
518 #define cinv30_31_im cinv10_11_im
519 #define cinv31_31_re cinv11_11_re
520 #define cinv32_31_re cinv12_11_re
521 #define cinv32_31_im cinv12_11_im
522 #define cinv20_32_re cinv00_12_re
523 #define cinv20_32_im cinv00_12_im
524 #define cinv21_32_re cinv01_12_re
525 #define cinv21_32_im cinv01_12_im
526 #define cinv22_32_re cinv02_12_re
527 #define cinv22_32_im cinv02_12_im
528 #define cinv30_32_re cinv10_12_re
529 #define cinv30_32_im cinv10_12_im
530 #define cinv31_32_re cinv11_12_re
531 #define cinv31_32_im cinv11_12_im
532 #define cinv32_32_re cinv12_12_re
533 
534 
535 
536 // declare C## here and use ASSN below instead of READ
// Registers holding one chiral block of the (Hermitian) clover term:
// 36 reals per block — 6 real diagonal entries plus 15 complex
// lower-triangular entries (see the c*_*_re/_im macros above) —
// i.e. 18 double2 registers or 9 float4 registers.
537 #ifdef CLOVER_DOUBLE
538 double2 C0;
539 double2 C1;
540 double2 C2;
541 double2 C3;
542 double2 C4;
543 double2 C5;
544 double2 C6;
545 double2 C7;
546 double2 C8;
547 double2 C9;
548 double2 C10;
549 double2 C11;
550 double2 C12;
551 double2 C13;
552 double2 C14;
553 double2 C15;
554 double2 C16;
555 double2 C17;
556 #else
557 float4 C0;
558 float4 C1;
559 float4 C2;
560 float4 C3;
561 float4 C4;
562 float4 C5;
563 float4 C6;
564 float4 C7;
565 float4 C8;
566 
567 #if (DD_PREC==2)
// NOTE(review): presumably the half-precision norm/rescale factor applied
// when unpacking the clover field — confirm against read_clover.h.
568 float K;
569 #endif
570 
571 #endif // CLOVER_DOUBLE
572 // output spinor
597 
// Stride between spinor elements staged in shared memory; halved for double
// precision (8-byte words vs 4-byte). NOTE(review): the halving presumably
// keeps a warp's accesses spread across distinct banks on Fermi — confirm.
598 #ifdef SPINOR_DOUBLE
599 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
600 #else
601 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
602 #endif
603 
// Device-code helpers: gauge/spinor/clover load-store macros and the
// twisted-mass-clover core, all parameterized by the macros defined above.
604 #include "read_gauge.h"
605 #include "io_spinor.h"
606 #include "read_clover.h"
607 #include "tmc_core.h"
608 
// Lattice coordinates of this thread's site (filled in by coordsFromIndex3D /
// coordsFromFaceIndex below) and X, the reconstructed full lattice index.
609 int x1, x2, x3, x4;
610 int X;
611 
612 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
// Index into the ghost-zone norm array (set in the exterior-kernel path).
613 int sp_norm_idx;
614 #endif // MULTI_GPU half precision
615 
// Linear thread/site index (in the exterior path it is assigned from
// blockIdx.x*blockDim.x + threadIdx.x further down).
616 int sid;
617 
618 #ifdef MULTI_GPU
// Index within the face being exchanged; derived from sid in the exterior
// kernel (face_idx = sid - face_num*face_volume).
619 int face_idx;
621 #endif
622 
623  // Inline by hand for the moment and assume even dimensions
624  const int dims[] = {X1, X2, X3, X4};
625  coordsFromIndex3D<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
626 
627  // only need to check Y and Z dims currently since X and T set to match exactly
628  if (x2 >= X2) return;
629  if (x3 >= X3) return;
630 
631  o00_re = 0; o00_im = 0;
632  o01_re = 0; o01_im = 0;
633  o02_re = 0; o02_im = 0;
634  o10_re = 0; o10_im = 0;
635  o11_re = 0; o11_im = 0;
636  o12_re = 0; o12_im = 0;
637  o20_re = 0; o20_im = 0;
638  o21_re = 0; o21_im = 0;
639  o22_re = 0; o22_im = 0;
640  o30_re = 0; o30_im = 0;
641  o31_re = 0; o31_im = 0;
642  o32_re = 0; o32_im = 0;
643 
644 #ifdef MULTI_GPU
645 } else { // exterior kernel
646 
647  sid = blockIdx.x*blockDim.x + threadIdx.x;
648  if (sid >= param.threads) return;
649 
650  const int dim = static_cast<int>(kernel_type);
651  const int face_volume = (param.threads >> 1); // volume of one face
652  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
653  face_idx = sid - face_num*face_volume; // index into the respective face
654 
655  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
656  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
657  //sp_idx = face_idx + param.ghostOffset[dim];
658 
659 #if (DD_PREC==2) // half precision
660  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
661 #endif
662 
663  const int dims[] = {X1, X2, X3, X4};
664  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
665 
666  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
667 
668  o00_re = i00_re; o00_im = i00_im;
669  o01_re = i01_re; o01_im = i01_im;
670  o02_re = i02_re; o02_im = i02_im;
671  o10_re = i10_re; o10_im = i10_im;
672  o11_re = i11_re; o11_im = i11_im;
673  o12_re = i12_re; o12_im = i12_im;
674  o20_re = i20_re; o20_im = i20_im;
675  o21_re = i21_re; o21_im = i21_im;
676  o22_re = i22_re; o22_im = i22_im;
677  o30_re = i30_re; o30_im = i30_im;
678  o31_re = i31_re; o31_im = i31_im;
679  o32_re = i32_re; o32_im = i32_im;
680 }
681 #endif // MULTI_GPU
682 
683 
684 #ifdef MULTI_GPU
685 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
686  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
687 #endif
688 {
689  // Projector P0-
690  // 1 0 0 -i
691  // 0 1 -i 0
692  // 0 i 1 0
693  // i 0 0 1
694 
695 #ifdef MULTI_GPU
696  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
697  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
698 #else
699  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
700 #endif
701 
702  const int ga_idx = sid;
703 
710 
711 #ifdef MULTI_GPU
712  if (kernel_type == INTERIOR_KERNEL) {
713 #endif
714 
715  // read spinor from device memory
716  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
717 
718  // store spinor into shared memory
719  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
720 
721  // project spinor into half spinors
722  a0_re = +i00_re+i30_im;
723  a0_im = +i00_im-i30_re;
724  a1_re = +i01_re+i31_im;
725  a1_im = +i01_im-i31_re;
726  a2_re = +i02_re+i32_im;
727  a2_im = +i02_im-i32_re;
728  b0_re = +i10_re+i20_im;
729  b0_im = +i10_im-i20_re;
730  b1_re = +i11_re+i21_im;
731  b1_im = +i11_im-i21_re;
732  b2_re = +i12_re+i22_im;
733  b2_im = +i12_im-i22_re;
734 
735 #ifdef MULTI_GPU
736  } else {
737 
738  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
739 
740  // read half spinor from device memory
741  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
742 
743  a0_re = i00_re; a0_im = i00_im;
744  a1_re = i01_re; a1_im = i01_im;
745  a2_re = i02_re; a2_im = i02_im;
746  b0_re = i10_re; b0_im = i10_im;
747  b1_re = i11_re; b1_im = i11_im;
748  b2_re = i12_re; b2_im = i12_im;
749 
750  }
751 #endif // MULTI_GPU
752 
753  // read gauge matrix from device memory
754  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
755 
756  // reconstruct gauge matrix
758 
759  // multiply row 0
761  A0_re += g00_re * a0_re;
762  A0_re -= g00_im * a0_im;
763  A0_re += g01_re * a1_re;
764  A0_re -= g01_im * a1_im;
765  A0_re += g02_re * a2_re;
766  A0_re -= g02_im * a2_im;
768  A0_im += g00_re * a0_im;
769  A0_im += g00_im * a0_re;
770  A0_im += g01_re * a1_im;
771  A0_im += g01_im * a1_re;
772  A0_im += g02_re * a2_im;
773  A0_im += g02_im * a2_re;
775  B0_re += g00_re * b0_re;
776  B0_re -= g00_im * b0_im;
777  B0_re += g01_re * b1_re;
778  B0_re -= g01_im * b1_im;
779  B0_re += g02_re * b2_re;
780  B0_re -= g02_im * b2_im;
782  B0_im += g00_re * b0_im;
783  B0_im += g00_im * b0_re;
784  B0_im += g01_re * b1_im;
785  B0_im += g01_im * b1_re;
786  B0_im += g02_re * b2_im;
787  B0_im += g02_im * b2_re;
788 
789  // multiply row 1
791  A1_re += g10_re * a0_re;
792  A1_re -= g10_im * a0_im;
793  A1_re += g11_re * a1_re;
794  A1_re -= g11_im * a1_im;
795  A1_re += g12_re * a2_re;
796  A1_re -= g12_im * a2_im;
798  A1_im += g10_re * a0_im;
799  A1_im += g10_im * a0_re;
800  A1_im += g11_re * a1_im;
801  A1_im += g11_im * a1_re;
802  A1_im += g12_re * a2_im;
803  A1_im += g12_im * a2_re;
805  B1_re += g10_re * b0_re;
806  B1_re -= g10_im * b0_im;
807  B1_re += g11_re * b1_re;
808  B1_re -= g11_im * b1_im;
809  B1_re += g12_re * b2_re;
810  B1_re -= g12_im * b2_im;
812  B1_im += g10_re * b0_im;
813  B1_im += g10_im * b0_re;
814  B1_im += g11_re * b1_im;
815  B1_im += g11_im * b1_re;
816  B1_im += g12_re * b2_im;
817  B1_im += g12_im * b2_re;
818 
819  // multiply row 2
821  A2_re += g20_re * a0_re;
822  A2_re -= g20_im * a0_im;
823  A2_re += g21_re * a1_re;
824  A2_re -= g21_im * a1_im;
825  A2_re += g22_re * a2_re;
826  A2_re -= g22_im * a2_im;
828  A2_im += g20_re * a0_im;
829  A2_im += g20_im * a0_re;
830  A2_im += g21_re * a1_im;
831  A2_im += g21_im * a1_re;
832  A2_im += g22_re * a2_im;
833  A2_im += g22_im * a2_re;
835  B2_re += g20_re * b0_re;
836  B2_re -= g20_im * b0_im;
837  B2_re += g21_re * b1_re;
838  B2_re -= g21_im * b1_im;
839  B2_re += g22_re * b2_re;
840  B2_re -= g22_im * b2_im;
842  B2_im += g20_re * b0_im;
843  B2_im += g20_im * b0_re;
844  B2_im += g21_re * b1_im;
845  B2_im += g21_im * b1_re;
846  B2_im += g22_re * b2_im;
847  B2_im += g22_im * b2_re;
848 
849  o00_re += A0_re;
850  o00_im += A0_im;
851  o10_re += B0_re;
852  o10_im += B0_im;
853  o20_re -= B0_im;
854  o20_im += B0_re;
855  o30_re -= A0_im;
856  o30_im += A0_re;
857 
858  o01_re += A1_re;
859  o01_im += A1_im;
860  o11_re += B1_re;
861  o11_im += B1_im;
862  o21_re -= B1_im;
863  o21_im += B1_re;
864  o31_re -= A1_im;
865  o31_im += A1_re;
866 
867  o02_re += A2_re;
868  o02_im += A2_im;
869  o12_re += B2_re;
870  o12_im += B2_im;
871  o22_re -= B2_im;
872  o22_im += B2_re;
873  o32_re -= A2_im;
874  o32_im += A2_re;
875 
876 }
877 
878 #ifdef MULTI_GPU
879 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
880  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
881 #endif
882 {
883  // Projector P0+
884  // 1 0 0 i
885  // 0 1 i 0
886  // 0 -i 1 0
887  // -i 0 0 1
888 
889 #ifdef MULTI_GPU
890  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
891  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
892 #else
893  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
894 #endif
895 
896 #ifdef MULTI_GPU
897  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
898 #else
899  const int ga_idx = sp_idx;
900 #endif
901 
908 
909 #ifdef MULTI_GPU
910  if (kernel_type == INTERIOR_KERNEL) {
911 #endif
912 
913  // load spinor from shared memory
914  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
915  __syncthreads();
916  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
917 
918  // project spinor into half spinors
919  a0_re = +i00_re-i30_im;
920  a0_im = +i00_im+i30_re;
921  a1_re = +i01_re-i31_im;
922  a1_im = +i01_im+i31_re;
923  a2_re = +i02_re-i32_im;
924  a2_im = +i02_im+i32_re;
925  b0_re = +i10_re-i20_im;
926  b0_im = +i10_im+i20_re;
927  b1_re = +i11_re-i21_im;
928  b1_im = +i11_im+i21_re;
929  b2_re = +i12_re-i22_im;
930  b2_im = +i12_im+i22_re;
931 
932 #ifdef MULTI_GPU
933  } else {
934 
935  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
936 
937  // read half spinor from device memory
938  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
939 
940  a0_re = i00_re; a0_im = i00_im;
941  a1_re = i01_re; a1_im = i01_im;
942  a2_re = i02_re; a2_im = i02_im;
943  b0_re = i10_re; b0_im = i10_im;
944  b1_re = i11_re; b1_im = i11_im;
945  b2_re = i12_re; b2_im = i12_im;
946 
947  }
948 #endif // MULTI_GPU
949 
950  // read gauge matrix from device memory
951  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
952 
953  // reconstruct gauge matrix
955 
956  // multiply row 0
957  spinorFloat A0_re = 0;
958  A0_re += gT00_re * a0_re;
959  A0_re -= gT00_im * a0_im;
960  A0_re += gT01_re * a1_re;
961  A0_re -= gT01_im * a1_im;
962  A0_re += gT02_re * a2_re;
963  A0_re -= gT02_im * a2_im;
964  spinorFloat A0_im = 0;
965  A0_im += gT00_re * a0_im;
966  A0_im += gT00_im * a0_re;
967  A0_im += gT01_re * a1_im;
968  A0_im += gT01_im * a1_re;
969  A0_im += gT02_re * a2_im;
970  A0_im += gT02_im * a2_re;
971  spinorFloat B0_re = 0;
972  B0_re += gT00_re * b0_re;
973  B0_re -= gT00_im * b0_im;
974  B0_re += gT01_re * b1_re;
975  B0_re -= gT01_im * b1_im;
976  B0_re += gT02_re * b2_re;
977  B0_re -= gT02_im * b2_im;
978  spinorFloat B0_im = 0;
979  B0_im += gT00_re * b0_im;
980  B0_im += gT00_im * b0_re;
981  B0_im += gT01_re * b1_im;
982  B0_im += gT01_im * b1_re;
983  B0_im += gT02_re * b2_im;
984  B0_im += gT02_im * b2_re;
985 
986  // multiply row 1
987  spinorFloat A1_re = 0;
988  A1_re += gT10_re * a0_re;
989  A1_re -= gT10_im * a0_im;
990  A1_re += gT11_re * a1_re;
991  A1_re -= gT11_im * a1_im;
992  A1_re += gT12_re * a2_re;
993  A1_re -= gT12_im * a2_im;
994  spinorFloat A1_im = 0;
995  A1_im += gT10_re * a0_im;
996  A1_im += gT10_im * a0_re;
997  A1_im += gT11_re * a1_im;
998  A1_im += gT11_im * a1_re;
999  A1_im += gT12_re * a2_im;
1000  A1_im += gT12_im * a2_re;
1001  spinorFloat B1_re = 0;
1002  B1_re += gT10_re * b0_re;
1003  B1_re -= gT10_im * b0_im;
1004  B1_re += gT11_re * b1_re;
1005  B1_re -= gT11_im * b1_im;
1006  B1_re += gT12_re * b2_re;
1007  B1_re -= gT12_im * b2_im;
1008  spinorFloat B1_im = 0;
1009  B1_im += gT10_re * b0_im;
1010  B1_im += gT10_im * b0_re;
1011  B1_im += gT11_re * b1_im;
1012  B1_im += gT11_im * b1_re;
1013  B1_im += gT12_re * b2_im;
1014  B1_im += gT12_im * b2_re;
1015 
1016  // multiply row 2
1017  spinorFloat A2_re = 0;
1018  A2_re += gT20_re * a0_re;
1019  A2_re -= gT20_im * a0_im;
1020  A2_re += gT21_re * a1_re;
1021  A2_re -= gT21_im * a1_im;
1022  A2_re += gT22_re * a2_re;
1023  A2_re -= gT22_im * a2_im;
1024  spinorFloat A2_im = 0;
1025  A2_im += gT20_re * a0_im;
1026  A2_im += gT20_im * a0_re;
1027  A2_im += gT21_re * a1_im;
1028  A2_im += gT21_im * a1_re;
1029  A2_im += gT22_re * a2_im;
1030  A2_im += gT22_im * a2_re;
1031  spinorFloat B2_re = 0;
1032  B2_re += gT20_re * b0_re;
1033  B2_re -= gT20_im * b0_im;
1034  B2_re += gT21_re * b1_re;
1035  B2_re -= gT21_im * b1_im;
1036  B2_re += gT22_re * b2_re;
1037  B2_re -= gT22_im * b2_im;
1038  spinorFloat B2_im = 0;
1039  B2_im += gT20_re * b0_im;
1040  B2_im += gT20_im * b0_re;
1041  B2_im += gT21_re * b1_im;
1042  B2_im += gT21_im * b1_re;
1043  B2_im += gT22_re * b2_im;
1044  B2_im += gT22_im * b2_re;
1045 
1046  o00_re += A0_re;
1047  o00_im += A0_im;
1048  o10_re += B0_re;
1049  o10_im += B0_im;
1050  o20_re += B0_im;
1051  o20_im -= B0_re;
1052  o30_re += A0_im;
1053  o30_im -= A0_re;
1054 
1055  o01_re += A1_re;
1056  o01_im += A1_im;
1057  o11_re += B1_re;
1058  o11_im += B1_im;
1059  o21_re += B1_im;
1060  o21_im -= B1_re;
1061  o31_re += A1_im;
1062  o31_im -= A1_re;
1063 
1064  o02_re += A2_re;
1065  o02_im += A2_im;
1066  o12_re += B2_re;
1067  o12_im += B2_im;
1068  o22_re += B2_im;
1069  o22_im -= B2_re;
1070  o32_re += A2_im;
1071  o32_im -= A2_re;
1072 
1073 }
1074 
1075 #ifdef MULTI_GPU
1076 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1077  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1078 #endif
1079 {
1080  // Projector P1-
1081  // 1 0 0 -1
1082  // 0 1 1 0
1083  // 0 1 1 0
1084  // -1 0 0 1
1085 
1086 #ifdef MULTI_GPU
1087  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1088  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1089 #else
1090  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1091 #endif
1092 
1093  const int ga_idx = sid;
1094 
1101 
1102 #ifdef MULTI_GPU
1103  if (kernel_type == INTERIOR_KERNEL) {
1104 #endif
1105 
1106  if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
1107  // read spinor from device memory
1108  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1109 
1110  // project spinor into half spinors
1111  a0_re = +i00_re-i30_re;
1112  a0_im = +i00_im-i30_im;
1113  a1_re = +i01_re-i31_re;
1114  a1_im = +i01_im-i31_im;
1115  a2_re = +i02_re-i32_re;
1116  a2_im = +i02_im-i32_im;
1117  b0_re = +i10_re+i20_re;
1118  b0_im = +i10_im+i20_im;
1119  b1_re = +i11_re+i21_re;
1120  b1_im = +i11_im+i21_im;
1121  b2_re = +i12_re+i22_re;
1122  b2_im = +i12_im+i22_im;
1123  } else {
1124  // load spinor from shared memory
1125  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1126  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
1127  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1128 
1129  // project spinor into half spinors
1130  a0_re = +i00_re-i30_re;
1131  a0_im = +i00_im-i30_im;
1132  a1_re = +i01_re-i31_re;
1133  a1_im = +i01_im-i31_im;
1134  a2_re = +i02_re-i32_re;
1135  a2_im = +i02_im-i32_im;
1136  b0_re = +i10_re+i20_re;
1137  b0_im = +i10_im+i20_im;
1138  b1_re = +i11_re+i21_re;
1139  b1_im = +i11_im+i21_im;
1140  b2_re = +i12_re+i22_re;
1141  b2_im = +i12_im+i22_im;
1142  }
1143 
1144 #ifdef MULTI_GPU
1145  } else {
1146 
1147  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1148 
1149  // read half spinor from device memory
1150  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1151 
1152  a0_re = i00_re; a0_im = i00_im;
1153  a1_re = i01_re; a1_im = i01_im;
1154  a2_re = i02_re; a2_im = i02_im;
1155  b0_re = i10_re; b0_im = i10_im;
1156  b1_re = i11_re; b1_im = i11_im;
1157  b2_re = i12_re; b2_im = i12_im;
1158 
1159  }
1160 #endif // MULTI_GPU
1161 
1162  // read gauge matrix from device memory
1163  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1164 
1165  // reconstruct gauge matrix
1167 
1168  // multiply row 0
1169  spinorFloat A0_re = 0;
1170  A0_re += g00_re * a0_re;
1171  A0_re -= g00_im * a0_im;
1172  A0_re += g01_re * a1_re;
1173  A0_re -= g01_im * a1_im;
1174  A0_re += g02_re * a2_re;
1175  A0_re -= g02_im * a2_im;
1176  spinorFloat A0_im = 0;
1177  A0_im += g00_re * a0_im;
1178  A0_im += g00_im * a0_re;
1179  A0_im += g01_re * a1_im;
1180  A0_im += g01_im * a1_re;
1181  A0_im += g02_re * a2_im;
1182  A0_im += g02_im * a2_re;
1183  spinorFloat B0_re = 0;
1184  B0_re += g00_re * b0_re;
1185  B0_re -= g00_im * b0_im;
1186  B0_re += g01_re * b1_re;
1187  B0_re -= g01_im * b1_im;
1188  B0_re += g02_re * b2_re;
1189  B0_re -= g02_im * b2_im;
1190  spinorFloat B0_im = 0;
1191  B0_im += g00_re * b0_im;
1192  B0_im += g00_im * b0_re;
1193  B0_im += g01_re * b1_im;
1194  B0_im += g01_im * b1_re;
1195  B0_im += g02_re * b2_im;
1196  B0_im += g02_im * b2_re;
1197 
1198  // multiply row 1
1199  spinorFloat A1_re = 0;
1200  A1_re += g10_re * a0_re;
1201  A1_re -= g10_im * a0_im;
1202  A1_re += g11_re * a1_re;
1203  A1_re -= g11_im * a1_im;
1204  A1_re += g12_re * a2_re;
1205  A1_re -= g12_im * a2_im;
1206  spinorFloat A1_im = 0;
1207  A1_im += g10_re * a0_im;
1208  A1_im += g10_im * a0_re;
1209  A1_im += g11_re * a1_im;
1210  A1_im += g11_im * a1_re;
1211  A1_im += g12_re * a2_im;
1212  A1_im += g12_im * a2_re;
1213  spinorFloat B1_re = 0;
1214  B1_re += g10_re * b0_re;
1215  B1_re -= g10_im * b0_im;
1216  B1_re += g11_re * b1_re;
1217  B1_re -= g11_im * b1_im;
1218  B1_re += g12_re * b2_re;
1219  B1_re -= g12_im * b2_im;
1220  spinorFloat B1_im = 0;
1221  B1_im += g10_re * b0_im;
1222  B1_im += g10_im * b0_re;
1223  B1_im += g11_re * b1_im;
1224  B1_im += g11_im * b1_re;
1225  B1_im += g12_re * b2_im;
1226  B1_im += g12_im * b2_re;
1227 
1228  // multiply row 2
1229  spinorFloat A2_re = 0;
1230  A2_re += g20_re * a0_re;
1231  A2_re -= g20_im * a0_im;
1232  A2_re += g21_re * a1_re;
1233  A2_re -= g21_im * a1_im;
1234  A2_re += g22_re * a2_re;
1235  A2_re -= g22_im * a2_im;
1236  spinorFloat A2_im = 0;
1237  A2_im += g20_re * a0_im;
1238  A2_im += g20_im * a0_re;
1239  A2_im += g21_re * a1_im;
1240  A2_im += g21_im * a1_re;
1241  A2_im += g22_re * a2_im;
1242  A2_im += g22_im * a2_re;
1243  spinorFloat B2_re = 0;
1244  B2_re += g20_re * b0_re;
1245  B2_re -= g20_im * b0_im;
1246  B2_re += g21_re * b1_re;
1247  B2_re -= g21_im * b1_im;
1248  B2_re += g22_re * b2_re;
1249  B2_re -= g22_im * b2_im;
1250  spinorFloat B2_im = 0;
1251  B2_im += g20_re * b0_im;
1252  B2_im += g20_im * b0_re;
1253  B2_im += g21_re * b1_im;
1254  B2_im += g21_im * b1_re;
1255  B2_im += g22_re * b2_im;
1256  B2_im += g22_im * b2_re;
1257 
1258  o00_re += A0_re;
1259  o00_im += A0_im;
1260  o10_re += B0_re;
1261  o10_im += B0_im;
1262  o20_re += B0_re;
1263  o20_im += B0_im;
1264  o30_re -= A0_re;
1265  o30_im -= A0_im;
1266 
1267  o01_re += A1_re;
1268  o01_im += A1_im;
1269  o11_re += B1_re;
1270  o11_im += B1_im;
1271  o21_re += B1_re;
1272  o21_im += B1_im;
1273  o31_re -= A1_re;
1274  o31_im -= A1_im;
1275 
1276  o02_re += A2_re;
1277  o02_im += A2_im;
1278  o12_re += B2_re;
1279  o12_im += B2_im;
1280  o22_re += B2_re;
1281  o22_im += B2_im;
1282  o32_re -= A2_re;
1283  o32_im -= A2_im;
1284 
1285 }
1286 
#ifdef MULTI_GPU
// Negative-Y hop: taken by interior sites unless the site sits on the
// x2==0 face of a partitioned Y dimension; the Y-exterior kernel then
// updates exactly those boundary sites from ghost-zone data.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
#endif
{
 // Projector P1+
 // 1 0 0 1
 // 0 1 -1 0
 // 0 -1 1 0
 // 1 0 0 1

#ifdef MULTI_GPU
 // Neighbour checkerboard index: wrap through +X2X1mX1 at the x2==0
 // boundary for interior sites; exterior sites index the packed ghost
 // spinor buffer via face_idx + ghostOffset.
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
#endif

#ifdef MULTI_GPU
 // Backward hop multiplies by the link attached to the *neighbour*
 // site; Vh+face_idx selects the ghost copy of that link in the
 // exterior kernel.
 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
#else
 const int ga_idx = sp_idx;
#endif


#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // Threads on the low-y edge of the thread block have no in-block
 // neighbour cached in shared memory, so they load from device memory.
 if (threadIdx.y == 0 && blockDim.y < X2) {
 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

 // project spinor into half spinors: a = psi_0 + psi_3, b = psi_1 - psi_2
 a0_re = +i00_re+i30_re;
 a0_im = +i00_im+i30_im;
 a1_re = +i01_re+i31_re;
 a1_im = +i01_im+i31_im;
 a2_re = +i02_re+i32_re;
 a2_im = +i02_im+i32_im;
 b0_re = +i10_re-i20_re;
 b0_im = +i10_im-i20_im;
 b1_re = +i11_re-i21_re;
 b1_im = +i11_im-i21_im;
 b2_re = +i12_re-i22_re;
 b2_im = +i12_im-i22_im;
 } else {
 // load spinor from shared memory (neighbour thread's site); the tx
 // shift accounts for the checkerboard x-offset of the neighbour
 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
 int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
 READ_SPINOR_SHARED(tx, ty, threadIdx.z);

 // project spinor into half spinors
 a0_re = +i00_re+i30_re;
 a0_im = +i00_im+i30_im;
 a1_re = +i01_re+i31_re;
 a1_im = +i01_im+i31_im;
 a2_re = +i02_re+i32_re;
 a2_im = +i02_im+i32_im;
 b0_re = +i10_re-i20_re;
 b0_im = +i10_im-i20_im;
 b1_re = +i11_re-i21_re;
 b1_im = +i11_im-i21_im;
 b2_re = +i12_re-i22_re;
 b2_im = +i12_im-i22_im;
 }

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory; the ghost payload is copied
 // straight into the half-spinor registers with no projection, so it
 // was evidently spin-projected on the sending side
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory (direction slot 3 = Y backward)
 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);

 // reconstruct gauge matrix

 // multiply row 0
 // NOTE(review): the gT** registers presumably hold the hermitian
 // conjugate of the link (backward hop applies U^dagger); they are
 // produced by the READ_GAUGE_MATRIX / reconstruction macros, which
 // are defined outside this listing — confirm against the generator.
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // accumulate into the output spinor with the P1+ reconstruction
 // signs: rows 0/1 get +A/+B, row 2 gets -B, row 3 gets +A (matches
 // the projector matrix printed above)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= B0_re;
 o20_im -= B0_im;
 o30_re += A0_re;
 o30_im += A0_im;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= B1_re;
 o21_im -= B1_im;
 o31_re += A1_re;
 o31_im += A1_im;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= B2_re;
 o22_im -= B2_im;
 o32_re += A2_re;
 o32_im += A2_im;

}
1502 
#ifdef MULTI_GPU
// Positive-Z hop: taken by interior sites unless the site sits on the
// x3==X3m1 face of a partitioned Z dimension; the Z-exterior kernel
// then updates exactly those boundary sites from ghost-zone data.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
 (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
#endif
{
 // Projector P2-
 // 1 0 -i 0
 // 0 1 0 i
 // i 0 1 0
 // 0 -i 0 1

#ifdef MULTI_GPU
 // Neighbour checkerboard index: wrap through -X3X2X1mX2X1 at the
 // x3==X3m1 boundary for interior sites; exterior sites index the
 // packed ghost spinor buffer.
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
#endif

 // Forward hop uses the link attached to this site.
 const int ga_idx = sid;


#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // Threads on the high-z edge of the thread block have no in-block
 // neighbour cached in shared memory, so they load from device memory.
 if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

 // project spinor into half spinors: a = psi_0 - i*psi_2, b = psi_1 + i*psi_3
 a0_re = +i00_re+i20_im;
 a0_im = +i00_im-i20_re;
 a1_re = +i01_re+i21_im;
 a1_im = +i01_im-i21_re;
 a2_re = +i02_re+i22_im;
 a2_im = +i02_im-i22_re;
 b0_re = +i10_re-i30_im;
 b0_im = +i10_im+i30_re;
 b1_re = +i11_re-i31_im;
 b1_im = +i11_im+i31_re;
 b2_re = +i12_re-i32_im;
 b2_im = +i12_im+i32_re;
 } else {
 // load spinor from shared memory (neighbour thread's site); the tx
 // shift accounts for the checkerboard x-offset of the neighbour
 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
 int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
 READ_SPINOR_SHARED(tx, threadIdx.y, tz);

 // project spinor into half spinors
 a0_re = +i00_re+i20_im;
 a0_im = +i00_im-i20_re;
 a1_re = +i01_re+i21_im;
 a1_im = +i01_im-i21_re;
 a2_re = +i02_re+i22_im;
 a2_im = +i02_im-i22_re;
 b0_re = +i10_re-i30_im;
 b0_im = +i10_im+i30_re;
 b1_re = +i11_re-i31_im;
 b1_im = +i11_im+i31_re;
 b2_re = +i12_re-i32_im;
 b2_im = +i12_im+i32_re;
 }

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory; (SPINOR_HOP/2)*sp_stride_pad
 // selects the forward-face half of the ghost buffer, and the payload
 // is already spin-projected (copied without further projection)
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory (direction slot 4 = Z forward)
 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);

 // reconstruct gauge matrix

 // multiply row 0
 // NOTE(review): g** registers hold the (forward) link components as
 // produced by the READ_GAUGE_MATRIX / reconstruction macros defined
 // outside this listing.
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // accumulate into the output spinor with the P2- reconstruction
 // signs: row 2 gets +i*A, row 3 gets -i*B (matches the projector
 // matrix printed above)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= A0_im;
 o20_im += A0_re;
 o30_re += B0_im;
 o30_im -= B0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= A1_im;
 o21_im += A1_re;
 o31_re += B1_im;
 o31_im -= B1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= A2_im;
 o22_im += A2_re;
 o32_re += B2_im;
 o32_im -= B2_re;

}
1714 
#ifdef MULTI_GPU
// Negative-Z hop: taken by interior sites unless the site sits on the
// x3==0 face of a partitioned Z dimension; the Z-exterior kernel then
// updates exactly those boundary sites from ghost-zone data.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
 (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
#endif
{
 // Projector P2+
 // 1 0 i 0
 // 0 1 0 -i
 // -i 0 1 0
 // 0 i 0 1

#ifdef MULTI_GPU
 // Neighbour checkerboard index: wrap through +X3X2X1mX2X1 at the
 // x3==0 boundary for interior sites; exterior sites index the packed
 // ghost spinor buffer.
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
#endif

#ifdef MULTI_GPU
 // Backward hop multiplies by the link attached to the *neighbour*
 // site; Vh+face_idx selects the ghost copy of that link in the
 // exterior kernel.
 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
#else
 const int ga_idx = sp_idx;
#endif


#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // Threads on the low-z edge of the thread block have no in-block
 // neighbour cached in shared memory, so they load from device memory.
 if (threadIdx.z == 0 && blockDim.z < X3) {
 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

 // project spinor into half spinors: a = psi_0 + i*psi_2, b = psi_1 - i*psi_3
 a0_re = +i00_re-i20_im;
 a0_im = +i00_im+i20_re;
 a1_re = +i01_re-i21_im;
 a1_im = +i01_im+i21_re;
 a2_re = +i02_re-i22_im;
 a2_im = +i02_im+i22_re;
 b0_re = +i10_re+i30_im;
 b0_im = +i10_im-i30_re;
 b1_re = +i11_re+i31_im;
 b1_im = +i11_im-i31_re;
 b2_re = +i12_re+i32_im;
 b2_im = +i12_im-i32_re;
 } else {
 // load spinor from shared memory (neighbour thread's site); the tx
 // shift accounts for the checkerboard x-offset of the neighbour
 int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
 int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
 READ_SPINOR_SHARED(tx, threadIdx.y, tz);

 // project spinor into half spinors
 a0_re = +i00_re-i20_im;
 a0_im = +i00_im+i20_re;
 a1_re = +i01_re-i21_im;
 a1_im = +i01_im+i21_re;
 a2_re = +i02_re-i22_im;
 a2_im = +i02_im+i22_re;
 b0_re = +i10_re+i30_im;
 b0_im = +i10_im-i30_re;
 b1_re = +i11_re+i31_im;
 b1_im = +i11_im-i31_re;
 b2_re = +i12_re+i32_im;
 b2_im = +i12_im-i32_re;
 }

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory; the ghost payload is copied
 // straight into the half-spinor registers with no projection, so it
 // was evidently spin-projected on the sending side
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory (direction slot 5 = Z backward)
 READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);

 // reconstruct gauge matrix

 // multiply row 0
 // NOTE(review): the gT** registers presumably hold the hermitian
 // conjugate of the link (backward hop applies U^dagger); defined by
 // the READ_GAUGE_MATRIX / reconstruction macros outside this listing.
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // accumulate into the output spinor with the P2+ reconstruction
 // signs: row 2 gets -i*A, row 3 gets +i*B (matches the projector
 // matrix printed above)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += A0_im;
 o20_im -= A0_re;
 o30_re -= B0_im;
 o30_im += B0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += A1_im;
 o21_im -= A1_re;
 o31_re -= B1_im;
 o31_im += B1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += A2_im;
 o22_im -= A2_re;
 o32_re -= B2_im;
 o32_im += B2_re;

}
1930 
#ifdef MULTI_GPU
// Positive-T hop: taken by interior sites unless the site sits on the
// x4==X4m1 face of a partitioned T dimension; the T-exterior kernel
// then updates exactly those boundary sites from ghost-zone data.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
 (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
#endif
{
 // Projector P3-
 // 0 0 0 0
 // 0 0 0 0
 // 0 0 2 0
 // 0 0 0 2
 //
 // Only the lower two spin components survive, so only psi_2/psi_3
 // are loaded (READ_SPINOR_DOWN) and only o2*/o3* are updated below.

#ifdef MULTI_GPU
 // Neighbour checkerboard index: wrap through -X4X3X2X1mX3X2X1 at the
 // x4==X4m1 boundary for interior sites; exterior sites index the
 // packed ghost spinor buffer.
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
#endif

 // Forward hop uses the link attached to this site.
 const int ga_idx = sid;

 // Temporal-gauge-fixed shortcut: for sites below the last time slice
 // the stored link is treated as the identity (see the "identity
 // gauge matrix" branch), so the SU(3) multiply is skipped entirely.
 if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
 {

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory (lower spin components only)
 READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

 // project spinor into half spinors: a = 2*psi_2, b = 2*psi_3
 a0_re = +2*i20_re;
 a0_im = +2*i20_im;
 a1_re = +2*i21_re;
 a1_im = +2*i21_im;
 a2_re = +2*i22_re;
 a2_im = +2*i22_im;
 b0_re = +2*i30_re;
 b0_im = +2*i30_im;
 b1_re = +2*i31_re;
 b1_im = +2*i31_im;
 b2_re = +2*i32_re;
 b2_im = +2*i32_im;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
 // NOTE(review): TPROJSCALE is defined outside this listing;
 // presumably it carries the factor of 2 (or 1 for a rescaled
 // exchange convention) applied on the device-side projection above.
 const int t_proj_scale = TPROJSCALE;

 // read half spinor from device memory; (SPINOR_HOP/2)*sp_stride_pad
 // selects the forward-face half of the ghost buffer
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;

 }
#endif // MULTI_GPU

 // identity gauge matrix

 o20_re += A0_re;
 o20_im += A0_im;
 o30_re += B0_re;
 o30_im += B0_im;

 o21_re += A1_re;
 o21_im += A1_im;
 o31_re += B1_re;
 o31_im += B1_im;

 o22_re += A2_re;
 o22_im += A2_im;
 o32_re += B2_re;
 o32_im += B2_im;

 } else {

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory (lower spin components only)
 READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

 // project spinor into half spinors: a = 2*psi_2, b = 2*psi_3
 a0_re = +2*i20_re;
 a0_im = +2*i20_im;
 a1_re = +2*i21_re;
 a1_im = +2*i21_im;
 a2_re = +2*i22_re;
 a2_im = +2*i22_im;
 b0_re = +2*i30_re;
 b0_im = +2*i30_im;
 b1_re = +2*i31_re;
 b1_im = +2*i31_im;
 b2_re = +2*i32_re;
 b2_im = +2*i32_im;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
 const int t_proj_scale = TPROJSCALE;

 // read half spinor from device memory (forward-face ghost buffer,
 // already spin-projected on the sending side)
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
 a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
 a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
 b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
 b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
 b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory (direction slot 6 = T forward)
 READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);

 // reconstruct gauge matrix

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // accumulate: P3- contributes only to the lower spin components
 o20_re += A0_re;
 o20_im += A0_im;
 o30_re += B0_re;
 o30_im += B0_im;

 o21_re += A1_re;
 o21_im += A1_im;
 o31_re += B1_re;
 o31_im += B1_im;

 o22_re += A2_re;
 o22_im += A2_im;
 o32_re += B2_re;
 o32_im += B2_im;

 }
}
2184 
2185 #ifdef MULTI_GPU
2186 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2187  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2188 #endif
2189 {
2190  // Projector P3+
2191  // 2 0 0 0
2192  // 0 2 0 0
2193  // 0 0 0 0
2194  // 0 0 0 0
2195 
2196 #ifdef MULTI_GPU
2197  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2198  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2199 #else
2200  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2201 #endif
2202 
2203 #ifdef MULTI_GPU
2204  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2205 #else
2206  const int ga_idx = sp_idx;
2207 #endif
2208 
2209  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2210  {
2217 
2218 #ifdef MULTI_GPU
2219  if (kernel_type == INTERIOR_KERNEL) {
2220 #endif
2221 
2222  // read spinor from device memory
2223  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2224 
2225  // project spinor into half spinors
2226  a0_re = +2*i00_re;
2227  a0_im = +2*i00_im;
2228  a1_re = +2*i01_re;
2229  a1_im = +2*i01_im;
2230  a2_re = +2*i02_re;
2231  a2_im = +2*i02_im;
2232  b0_re = +2*i10_re;
2233  b0_im = +2*i10_im;
2234  b1_re = +2*i11_re;
2235  b1_im = +2*i11_im;
2236  b2_re = +2*i12_re;
2237  b2_im = +2*i12_im;
2238 
2239 #ifdef MULTI_GPU
2240  } else {
2241 
2242  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2243  const int t_proj_scale = TPROJSCALE;
2244 
2245  // read half spinor from device memory
2246  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2247 
2248  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2249  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2250  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2251  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2252  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2253  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2254 
2255  }
2256 #endif // MULTI_GPU
2257 
2258  // identity gauge matrix
2265 
2266  o00_re += A0_re;
2267  o00_im += A0_im;
2268  o10_re += B0_re;
2269  o10_im += B0_im;
2270 
2271  o01_re += A1_re;
2272  o01_im += A1_im;
2273  o11_re += B1_re;
2274  o11_im += B1_im;
2275 
2276  o02_re += A2_re;
2277  o02_im += A2_im;
2278  o12_re += B2_re;
2279  o12_im += B2_im;
2280 
2281  } else {
2288 
2289 #ifdef MULTI_GPU
2290  if (kernel_type == INTERIOR_KERNEL) {
2291 #endif
2292 
2293  // read spinor from device memory
2294  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2295 
2296  // project spinor into half spinors
2297  a0_re = +2*i00_re;
2298  a0_im = +2*i00_im;
2299  a1_re = +2*i01_re;
2300  a1_im = +2*i01_im;
2301  a2_re = +2*i02_re;
2302  a2_im = +2*i02_im;
2303  b0_re = +2*i10_re;
2304  b0_im = +2*i10_im;
2305  b1_re = +2*i11_re;
2306  b1_im = +2*i11_im;
2307  b2_re = +2*i12_re;
2308  b2_im = +2*i12_im;
2309 
2310 #ifdef MULTI_GPU
2311  } else {
2312 
2313  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2314  const int t_proj_scale = TPROJSCALE;
2315 
2316  // read half spinor from device memory
2317  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2318 
2319  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2320  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2321  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2322  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2323  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2324  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2325 
2326  }
2327 #endif // MULTI_GPU
2328 
2329  // read gauge matrix from device memory
 // NOTE(review): direction index 7 appears to be the backward T link --
 // the conjugate-transpose elements (gT..) are used in the multiply
 // below; confirm against the generator's direction numbering.
2330  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2331 
2332  // reconstruct gauge matrix
2334 
2335  // multiply row 0
2336  spinorFloat A0_re = 0;
2337  A0_re += gT00_re * a0_re;
2338  A0_re -= gT00_im * a0_im;
2339  A0_re += gT01_re * a1_re;
2340  A0_re -= gT01_im * a1_im;
2341  A0_re += gT02_re * a2_re;
2342  A0_re -= gT02_im * a2_im;
2343  spinorFloat A0_im = 0;
2344  A0_im += gT00_re * a0_im;
2345  A0_im += gT00_im * a0_re;
2346  A0_im += gT01_re * a1_im;
2347  A0_im += gT01_im * a1_re;
2348  A0_im += gT02_re * a2_im;
2349  A0_im += gT02_im * a2_re;
2350  spinorFloat B0_re = 0;
2351  B0_re += gT00_re * b0_re;
2352  B0_re -= gT00_im * b0_im;
2353  B0_re += gT01_re * b1_re;
2354  B0_re -= gT01_im * b1_im;
2355  B0_re += gT02_re * b2_re;
2356  B0_re -= gT02_im * b2_im;
2357  spinorFloat B0_im = 0;
2358  B0_im += gT00_re * b0_im;
2359  B0_im += gT00_im * b0_re;
2360  B0_im += gT01_re * b1_im;
2361  B0_im += gT01_im * b1_re;
2362  B0_im += gT02_re * b2_im;
2363  B0_im += gT02_im * b2_re;
2364 
2365  // multiply row 1
2366  spinorFloat A1_re = 0;
2367  A1_re += gT10_re * a0_re;
2368  A1_re -= gT10_im * a0_im;
2369  A1_re += gT11_re * a1_re;
2370  A1_re -= gT11_im * a1_im;
2371  A1_re += gT12_re * a2_re;
2372  A1_re -= gT12_im * a2_im;
2373  spinorFloat A1_im = 0;
2374  A1_im += gT10_re * a0_im;
2375  A1_im += gT10_im * a0_re;
2376  A1_im += gT11_re * a1_im;
2377  A1_im += gT11_im * a1_re;
2378  A1_im += gT12_re * a2_im;
2379  A1_im += gT12_im * a2_re;
2380  spinorFloat B1_re = 0;
2381  B1_re += gT10_re * b0_re;
2382  B1_re -= gT10_im * b0_im;
2383  B1_re += gT11_re * b1_re;
2384  B1_re -= gT11_im * b1_im;
2385  B1_re += gT12_re * b2_re;
2386  B1_re -= gT12_im * b2_im;
2387  spinorFloat B1_im = 0;
2388  B1_im += gT10_re * b0_im;
2389  B1_im += gT10_im * b0_re;
2390  B1_im += gT11_re * b1_im;
2391  B1_im += gT11_im * b1_re;
2392  B1_im += gT12_re * b2_im;
2393  B1_im += gT12_im * b2_re;
2394 
2395  // multiply row 2
2396  spinorFloat A2_re = 0;
2397  A2_re += gT20_re * a0_re;
2398  A2_re -= gT20_im * a0_im;
2399  A2_re += gT21_re * a1_re;
2400  A2_re -= gT21_im * a1_im;
2401  A2_re += gT22_re * a2_re;
2402  A2_re -= gT22_im * a2_im;
2403  spinorFloat A2_im = 0;
2404  A2_im += gT20_re * a0_im;
2405  A2_im += gT20_im * a0_re;
2406  A2_im += gT21_re * a1_im;
2407  A2_im += gT21_im * a1_re;
2408  A2_im += gT22_re * a2_im;
2409  A2_im += gT22_im * a2_re;
2410  spinorFloat B2_re = 0;
2411  B2_re += gT20_re * b0_re;
2412  B2_re -= gT20_im * b0_im;
2413  B2_re += gT21_re * b1_re;
2414  B2_re -= gT21_im * b1_im;
2415  B2_re += gT22_re * b2_re;
2416  B2_re -= gT22_im * b2_im;
2417  spinorFloat B2_im = 0;
2418  B2_im += gT20_re * b0_im;
2419  B2_im += gT20_im * b0_re;
2420  B2_im += gT21_re * b1_im;
2421  B2_im += gT21_im * b1_re;
2422  B2_im += gT22_re * b2_im;
2423  B2_im += gT22_im * b2_re;
2424 
2425  o00_re += A0_re;
2426  o00_im += A0_im;
2427  o10_re += B0_re;
2428  o10_im += B0_im;
2429 
2430  o01_re += A1_re;
2431  o01_im += A1_im;
2432  o11_re += B1_re;
2433  o11_im += B1_im;
2434 
2435  o02_re += A2_re;
2436  o02_im += A2_im;
2437  o12_re += B2_re;
2438  o12_im += B2_im;
2439 
2440  }
2441 }
2442 
2443 #ifdef MULTI_GPU
2444 
2445 int incomplete = 0; // Have all 8 contributions been computed for this site?
2446 
 // The cases deliberately fall through: the interior kernel must also
 // check every partitioned dimension below it, and each exterior kernel
 // checks only the dimensions whose face kernels run after it.  A site
 // on a partitioned boundary stays "incomplete" until the last face
 // kernel that touches it, which is the only one allowed to write out.
2447 switch(kernel_type) { // intentional fall-through
2448 case INTERIOR_KERNEL:
2449  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2450 case EXTERIOR_KERNEL_T:
2451  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2452 case EXTERIOR_KERNEL_Z:
2453  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2454 case EXTERIOR_KERNEL_Y:
2455  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2456 }
2457 
 // only finalise (twist + xpay + store) once all 8 hop contributions are in
2458 if (!incomplete)
2459 #endif // MULTI_GPU
2460 {
2461 #ifdef DSLASH_XPAY
 // xpay: read the accumulator spinor x and form out = b*(D psi) + x,
 // with the clover/twist term applied per the branch below
2462  READ_ACCUM(ACCUMTEX, param.sp_stride)
2463 
2464 #ifndef CLOVER_TWIST_XPAY
 // apply the inverse twisted-clover term to the hop sum o before the
 // axpy; the accumulator is added untwisted
2465  //perform invert twist first:
2466  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2467  o00_re = b*o00_re + acc00_re;
2468  o00_im = b*o00_im + acc00_im;
2469  o01_re = b*o01_re + acc01_re;
2470  o01_im = b*o01_im + acc01_im;
2471  o02_re = b*o02_re + acc02_re;
2472  o02_im = b*o02_im + acc02_im;
2473  o10_re = b*o10_re + acc10_re;
2474  o10_im = b*o10_im + acc10_im;
2475  o11_re = b*o11_re + acc11_re;
2476  o11_im = b*o11_im + acc11_im;
2477  o12_re = b*o12_re + acc12_re;
2478  o12_im = b*o12_im + acc12_im;
2479  o20_re = b*o20_re + acc20_re;
2480  o20_im = b*o20_im + acc20_im;
2481  o21_re = b*o21_re + acc21_re;
2482  o21_im = b*o21_im + acc21_im;
2483  o22_re = b*o22_re + acc22_re;
2484  o22_im = b*o22_im + acc22_im;
2485  o30_re = b*o30_re + acc30_re;
2486  o30_im = b*o30_im + acc30_im;
2487  o31_re = b*o31_re + acc31_re;
2488  o31_im = b*o31_im + acc31_im;
2489  o32_re = b*o32_re + acc32_re;
2490  o32_im = b*o32_im + acc32_im;
2491 #else
 // apply the (forward) twisted-clover term to the accumulator instead;
 // the axpy lines are otherwise identical to the branch above
2492  APPLY_CLOVER_TWIST(c, a, acc);
2493  o00_re = b*o00_re + acc00_re;
2494  o00_im = b*o00_im + acc00_im;
2495  o01_re = b*o01_re + acc01_re;
2496  o01_im = b*o01_im + acc01_im;
2497  o02_re = b*o02_re + acc02_re;
2498  o02_im = b*o02_im + acc02_im;
2499  o10_re = b*o10_re + acc10_re;
2500  o10_im = b*o10_im + acc10_im;
2501  o11_re = b*o11_re + acc11_re;
2502  o11_im = b*o11_im + acc11_im;
2503  o12_re = b*o12_re + acc12_re;
2504  o12_im = b*o12_im + acc12_im;
2505  o20_re = b*o20_re + acc20_re;
2506  o20_im = b*o20_im + acc20_im;
2507  o21_re = b*o21_re + acc21_re;
2508  o21_im = b*o21_im + acc21_im;
2509  o22_re = b*o22_re + acc22_re;
2510  o22_im = b*o22_im + acc22_im;
2511  o30_re = b*o30_re + acc30_re;
2512  o30_im = b*o30_im + acc30_im;
2513  o31_re = b*o31_re + acc31_re;
2514  o31_im = b*o31_im + acc31_im;
2515  o32_re = b*o32_re + acc32_re;
2516  o32_im = b*o32_im + acc32_im;
2517 #endif//CLOVER_TWIST_XPAY
2518 #else //no XPAY
 // no accumulator: just apply the inverse twisted-clover term to the result
2519  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2520 #endif
2521 }
2522 
2523 // write spinor field back to device memory
2524 WRITE_SPINOR(param.sp_stride);
2525 
2526 // undefine to prevent warning when precision is changed
2527 #undef spinorFloat
2528 #undef WRITE_SPINOR_SHARED
2529 #undef READ_SPINOR_SHARED
2530 #undef SHARED_STRIDE
2531 
2532 #undef g00_re
2533 #undef g00_im
2534 #undef g01_re
2535 #undef g01_im
2536 #undef g02_re
2537 #undef g02_im
2538 #undef g10_re
2539 #undef g10_im
2540 #undef g11_re
2541 #undef g11_im
2542 #undef g12_re
2543 #undef g12_im
2544 #undef g20_re
2545 #undef g20_im
2546 #undef g21_re
2547 #undef g21_im
2548 #undef g22_re
2549 #undef g22_im
2550 
2551 #undef i00_re
2552 #undef i00_im
2553 #undef i01_re
2554 #undef i01_im
2555 #undef i02_re
2556 #undef i02_im
2557 #undef i10_re
2558 #undef i10_im
2559 #undef i11_re
2560 #undef i11_im
2561 #undef i12_re
2562 #undef i12_im
2563 #undef i20_re
2564 #undef i20_im
2565 #undef i21_re
2566 #undef i21_im
2567 #undef i22_re
2568 #undef i22_im
2569 #undef i30_re
2570 #undef i30_im
2571 #undef i31_re
2572 #undef i31_im
2573 #undef i32_re
2574 #undef i32_im
2575 
2576 #undef c00_00_re
2577 #undef c01_01_re
2578 #undef c02_02_re
2579 #undef c10_10_re
2580 #undef c11_11_re
2581 #undef c12_12_re
2582 #undef c01_00_re
2583 #undef c01_00_im
2584 #undef c02_00_re
2585 #undef c02_00_im
2586 #undef c10_00_re
2587 #undef c10_00_im
2588 #undef c11_00_re
2589 #undef c11_00_im
2590 #undef c12_00_re
2591 #undef c12_00_im
2592 #undef c02_01_re
2593 #undef c02_01_im
2594 #undef c10_01_re
2595 #undef c10_01_im
2596 #undef c11_01_re
2597 #undef c11_01_im
2598 #undef c12_01_re
2599 #undef c12_01_im
2600 #undef c10_02_re
2601 #undef c10_02_im
2602 #undef c11_02_re
2603 #undef c11_02_im
2604 #undef c12_02_re
2605 #undef c12_02_im
2606 #undef c11_10_re
2607 #undef c11_10_im
2608 #undef c12_10_re
2609 #undef c12_10_im
2610 #undef c12_11_re
2611 #undef c12_11_im
2612 
2613 #undef cinv00_00_re
2614 #undef cinv01_01_re
2615 #undef cinv02_02_re
2616 #undef cinv10_10_re
2617 #undef cinv11_11_re
2618 #undef cinv12_12_re
2619 #undef cinv01_00_re
2620 #undef cinv01_00_im
2621 #undef cinv02_00_re
2622 #undef cinv02_00_im
2623 #undef cinv10_00_re
2624 #undef cinv10_00_im
2625 #undef cinv11_00_re
2626 #undef cinv11_00_im
2627 #undef cinv12_00_re
2628 #undef cinv12_00_im
2629 #undef cinv02_01_re
2630 #undef cinv02_01_im
2631 #undef cinv10_01_re
2632 #undef cinv10_01_im
2633 #undef cinv11_01_re
2634 #undef cinv11_01_im
2635 #undef cinv12_01_re
2636 #undef cinv12_01_im
2637 #undef cinv10_02_re
2638 #undef cinv10_02_im
2639 #undef cinv11_02_re
2640 #undef cinv11_02_im
2641 #undef cinv12_02_re
2642 #undef cinv12_02_im
2643 #undef cinv11_10_re
2644 #undef cinv11_10_im
2645 #undef cinv12_10_re
2646 #undef cinv12_10_im
2647 #undef cinv12_11_re
2648 #undef cinv12_11_im
2649 
2650 #undef acc00_re
2651 #undef acc00_im
2652 #undef acc01_re
2653 #undef acc01_im
2654 #undef acc02_re
2655 #undef acc02_im
2656 #undef acc10_re
2657 #undef acc10_im
2658 #undef acc11_re
2659 #undef acc11_im
2660 #undef acc12_re
2661 #undef acc12_im
2662 #undef acc20_re
2663 #undef acc20_im
2664 #undef acc21_re
2665 #undef acc21_im
2666 #undef acc22_re
2667 #undef acc22_im
2668 #undef acc30_re
2669 #undef acc30_im
2670 #undef acc31_re
2671 #undef acc31_im
2672 #undef acc32_re
2673 #undef acc32_im
2674 
2675 
2676 #undef o00_re
2677 #undef o00_im
2678 #undef o01_re
2679 #undef o01_im
2680 #undef o02_re
2681 #undef o02_im
2682 #undef o10_re
2683 #undef o10_im
2684 #undef o11_re
2685 #undef o11_im
2686 #undef o12_re
2687 #undef o12_im
2688 #undef o20_re
2689 #undef o20_im
2690 #undef o21_re
2691 #undef o21_im
2692 #undef o22_re
2693 #undef o22_im
2694 #undef o30_re
2695 #undef o30_im
2696 #undef o31_re
2697 #undef o31_im
2698 #undef o32_re
2699 #undef o32_im
2700 
2701 #undef VOLATILE
#define i22_re
#define g21_im
spinorFloat B2_im
#define WRITE_SPINOR_SHARED
__constant__ int Vh
#define acc30_re
float4 C3
const int ga_idx
VOLATILE spinorFloat o12_re
spinorFloat a1_im
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:1
__constant__ int X2
#define g21_re
#define acc22_im
spinorFloat A1_im
#define i00_re
VOLATILE spinorFloat o31_re
#define g00_re
__constant__ int X2X1mX1
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re
#define g12_re
#define g10_re
#define acc31_im
#define acc20_im
spinorFloat a1_re
__constant__ int X3X2X1mX2X1
#define gT02_re
__constant__ int X1
#define i10_im
#define spinorFloat
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
VOLATILE spinorFloat o32_im
int sp_idx
#define acc00_im
#define VOLATILE
spinorFloat B0_im
spinorFloat A0_re
#define i30_im
#define acc10_im
#define gT00_re
#define acc32_im
float4 C6
#define gT10_im
#define g11_im
float4 C0
__constant__ int X3X2X1
#define acc02_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
float4 C7
VOLATILE spinorFloat o30_im
spinorFloat b0_im
#define acc31_re
spinorFloat B0_re
#define acc22_re
#define gT00_im
#define gT12_im
#define gT20_re
#define i10_re
#define acc21_re
VOLATILE spinorFloat o20_re
WRITE_SPINOR(param.sp_stride)
spinorFloat b2_im
#define gT01_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define i31_re
#define g12_im
#define i31_im
#define gT12_re
#define i20_re
QudaGaugeParam param
Definition: pack_test.cpp:17
spinorFloat b1_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define i02_im
spinorFloat B2_re
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
Definition: tmc_core.h:432
#define g20_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_re
#define g02_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat b1_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o21_im
#define acc21_im
#define g20_im
VOLATILE spinorFloat o10_re
#define GAUGE0TEX
Definition: covDev.h:112
VOLATILE spinorFloat o02_re
const int dims[]
#define acc01_im
VOLATILE spinorFloat o00_re
#define gT22_re
#define acc00_re
#define acc11_im
spinorFloat A0_im
spinorFloat A2_im
#define g10_im
VOLATILE spinorFloat o20_im
#define i32_re
float4 C1
#define acc32_re
#define g02_im
__constant__ int X2m1
#define i22_im
spinorFloat a2_re
__syncthreads()
#define SPINORTEX
Definition: clover_def.h:40
VOLATILE spinorFloat o11_im
spinorFloat B1_im
__constant__ int gauge_fixed
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o00_im
__constant__ int X4X3X2X1mX3X2X1
#define READ_SPINOR_SHARED
#define i00_im
#define i21_im
#define i21_re
#define SPINOR_HOP
Definition: covDev.h:158
#define g11_re
#define gT11_im
RECONSTRUCT_GAUGE_MATRIX(0)
__constant__ int ga_stride
#define g01_im
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i02_re
#define i11_re
VOLATILE spinorFloat o31_im
spinorFloat B1_re
spinorFloat b0_re
#define i30_re
#define gT01_im
#define i01_im
__constant__ int X1m1
#define gT11_re
#define acc01_re
__constant__ int X3
#define acc12_re
#define gT21_im
#define gT22_im
#define gT21_re
#define i12_re
#define acc02_im
#define GAUGE1TEX
Definition: covDev.h:113
spinorFloat a2_im
#define i01_re
float4 C5
#define gT10_re
#define acc20_re
#define g01_re
float4 C8
float4 C2
#define g22_im
#define i20_im
__constant__ int X4m1
VOLATILE spinorFloat o21_re
spinorFloat a0_im
#define gT20_im
float4 C4
#define acc11_re
spinorFloat A1_re
#define i32_im
spinorFloat A2_re
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
spinorFloat a0_re
#define acc12_im
#define g22_re
__constant__ int X4X3X2X1hmX3X2X1h
#define i12_im
#define acc30_im
coordsFromIndex3D< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o12_im
#define gT02_im
KernelType kernel_type
spinorFloat b2_re
__constant__ int X4
#define g00_im
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
VOLATILE spinorFloat o01_im
#define i11_im
#define acc10_re
__constant__ int X2X1