QUDA  0.9.0
tmc_dslash_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// This kernel variant keeps the spinor working set entirely in registers:
// zero shared-memory floats are reserved per thread.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// VOLATILE expands to nothing under the NVVM back end (CUDA >= 4.1
// targeting compute capability >= 2.0) and to the 'volatile' keyword under
// the older Open64 back end, where the qualifier is needed to keep the
// compiler from mis-optimizing the accumulator registers (per the
// compiler-name comments on the #if/#else below).
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
// input spinor
// Map the 12 complex components of the input spinor (i<spin><color>_re/_im,
// spin 0..3, color 0..2) and of the accumulator spinor (acc<spin><color>)
// onto the vector registers loaded by the READ_SPINOR/READ_ACCUM macros.
// Double precision packs one complex number per double2 (I0..I11,
// accum0..accum11, .x = re, .y = im); single/half precision packs two
// complex numbers per float4 (I0..I5, accum0..accum5).
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
// gauge link
// Map the 3x3 complex gauge-link matrix (g<row><col>_re/_im) onto the
// registers filled by READ_GAUGE_MATRIX.  The FLOAT2 layout stores one
// complex entry per two-component register (G0..G8); otherwise entries are
// packed two per float4 (G0..G4, with G4.z/G4.w unused by these names —
// presumably left free for reconstruction storage; confirm against
// read_gauge.h).
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
// conjugated gauge link
// gT is the Hermitian conjugate U^dag of the link above: transpose the
// row/column indices and negate the imaginary parts.  Used by the
// backward-direction hops below.
157 // conjugated gauge link
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
// first chiral block of clover term
// The 6x6 chiral block is Hermitian: storage holds the 6 real diagonal
// entries followed by the 15 complex strictly-lower-triangular entries
// c<row>_<col> (row > col).  Double precision uses C0..C17 (one complex or
// two reals per double2); single/half uses C0..C8 (float4).
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// Upper-triangular entries (row < col) follow from Hermiticity:
// c<r>_<c> = conj(c<c>_<r>), i.e. same real part, negated imaginary part.
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
// The second chiral block (spin 2,3) aliases the names of the first block,
// so the same C registers are reused after the second block is loaded.
// NOTE(review): the "(reuses C0,...,C9)" range in the generated comment
// looks stale — the storage above spans C0..C17 (double) / C0..C8 (float4).
285 // second chiral block of clover term (reuses C0,...,C9)
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 
// first chiral block of inverted clover term
// Identical storage layout to the c## clover macros above, but for the
// inverse clover matrix (cinv##).  Both alias the SAME C registers, so
// c## and cinv## names must never be live at the same time — the register
// set holds whichever matrix was loaded last.
355 #ifdef CLOVER_DOUBLE
356 #define cinv00_00_re C0.x
357 #define cinv01_01_re C0.y
358 #define cinv02_02_re C1.x
359 #define cinv10_10_re C1.y
360 #define cinv11_11_re C2.x
361 #define cinv12_12_re C2.y
362 #define cinv01_00_re C3.x
363 #define cinv01_00_im C3.y
364 #define cinv02_00_re C4.x
365 #define cinv02_00_im C4.y
366 #define cinv10_00_re C5.x
367 #define cinv10_00_im C5.y
368 #define cinv11_00_re C6.x
369 #define cinv11_00_im C6.y
370 #define cinv12_00_re C7.x
371 #define cinv12_00_im C7.y
372 #define cinv02_01_re C8.x
373 #define cinv02_01_im C8.y
374 #define cinv10_01_re C9.x
375 #define cinv10_01_im C9.y
376 #define cinv11_01_re C10.x
377 #define cinv11_01_im C10.y
378 #define cinv12_01_re C11.x
379 #define cinv12_01_im C11.y
380 #define cinv10_02_re C12.x
381 #define cinv10_02_im C12.y
382 #define cinv11_02_re C13.x
383 #define cinv11_02_im C13.y
384 #define cinv12_02_re C14.x
385 #define cinv12_02_im C14.y
386 #define cinv11_10_re C15.x
387 #define cinv11_10_im C15.y
388 #define cinv12_10_re C16.x
389 #define cinv12_10_im C16.y
390 #define cinv12_11_re C17.x
391 #define cinv12_11_im C17.y
392 #else
393 #define cinv00_00_re C0.x
394 #define cinv01_01_re C0.y
395 #define cinv02_02_re C0.z
396 #define cinv10_10_re C0.w
397 #define cinv11_11_re C1.x
398 #define cinv12_12_re C1.y
399 #define cinv01_00_re C1.z
400 #define cinv01_00_im C1.w
401 #define cinv02_00_re C2.x
402 #define cinv02_00_im C2.y
403 #define cinv10_00_re C2.z
404 #define cinv10_00_im C2.w
405 #define cinv11_00_re C3.x
406 #define cinv11_00_im C3.y
407 #define cinv12_00_re C3.z
408 #define cinv12_00_im C3.w
409 #define cinv02_01_re C4.x
410 #define cinv02_01_im C4.y
411 #define cinv10_01_re C4.z
412 #define cinv10_01_im C4.w
413 #define cinv11_01_re C5.x
414 #define cinv11_01_im C5.y
415 #define cinv12_01_re C5.z
416 #define cinv12_01_im C5.w
417 #define cinv10_02_re C6.x
418 #define cinv10_02_im C6.y
419 #define cinv11_02_re C6.z
420 #define cinv11_02_im C6.w
421 #define cinv12_02_re C7.x
422 #define cinv12_02_im C7.y
423 #define cinv11_10_re C7.z
424 #define cinv11_10_im C7.w
425 #define cinv12_10_re C8.x
426 #define cinv12_10_im C8.y
427 #define cinv12_11_re C8.z
428 #define cinv12_11_im C8.w
429 #endif // CLOVER_DOUBLE
430 
// Upper triangle by Hermiticity: cinv<r>_<c> = conj(cinv<c>_<r>).
431 #define cinv00_01_re (+cinv01_00_re)
432 #define cinv00_01_im (-cinv01_00_im)
433 #define cinv00_02_re (+cinv02_00_re)
434 #define cinv00_02_im (-cinv02_00_im)
435 #define cinv01_02_re (+cinv02_01_re)
436 #define cinv01_02_im (-cinv02_01_im)
437 #define cinv00_10_re (+cinv10_00_re)
438 #define cinv00_10_im (-cinv10_00_im)
439 #define cinv01_10_re (+cinv10_01_re)
440 #define cinv01_10_im (-cinv10_01_im)
441 #define cinv02_10_re (+cinv10_02_re)
442 #define cinv02_10_im (-cinv10_02_im)
443 #define cinv00_11_re (+cinv11_00_re)
444 #define cinv00_11_im (-cinv11_00_im)
445 #define cinv01_11_re (+cinv11_01_re)
446 #define cinv01_11_im (-cinv11_01_im)
447 #define cinv02_11_re (+cinv11_02_re)
448 #define cinv02_11_im (-cinv11_02_im)
449 #define cinv10_11_re (+cinv11_10_re)
450 #define cinv10_11_im (-cinv11_10_im)
451 #define cinv00_12_re (+cinv12_00_re)
452 #define cinv00_12_im (-cinv12_00_im)
453 #define cinv01_12_re (+cinv12_01_re)
454 #define cinv01_12_im (-cinv12_01_im)
455 #define cinv02_12_re (+cinv12_02_re)
456 #define cinv02_12_im (-cinv12_02_im)
457 #define cinv10_12_re (+cinv12_10_re)
458 #define cinv10_12_im (-cinv12_10_im)
459 #define cinv11_12_re (+cinv12_11_re)
460 #define cinv11_12_im (-cinv12_11_im)
461 
// Second chiral block of the inverse: aliases the first-block names so the
// same registers serve spin components 2,3 after a re-load.
462 // second chiral block of inverted clover term (reuses C0,...,C9)
463 #define cinv20_20_re cinv00_00_re
464 #define cinv21_20_re cinv01_00_re
465 #define cinv21_20_im cinv01_00_im
466 #define cinv22_20_re cinv02_00_re
467 #define cinv22_20_im cinv02_00_im
468 #define cinv30_20_re cinv10_00_re
469 #define cinv30_20_im cinv10_00_im
470 #define cinv31_20_re cinv11_00_re
471 #define cinv31_20_im cinv11_00_im
472 #define cinv32_20_re cinv12_00_re
473 #define cinv32_20_im cinv12_00_im
474 #define cinv20_21_re cinv00_01_re
475 #define cinv20_21_im cinv00_01_im
476 #define cinv21_21_re cinv01_01_re
477 #define cinv22_21_re cinv02_01_re
478 #define cinv22_21_im cinv02_01_im
479 #define cinv30_21_re cinv10_01_re
480 #define cinv30_21_im cinv10_01_im
481 #define cinv31_21_re cinv11_01_re
482 #define cinv31_21_im cinv11_01_im
483 #define cinv32_21_re cinv12_01_re
484 #define cinv32_21_im cinv12_01_im
485 #define cinv20_22_re cinv00_02_re
486 #define cinv20_22_im cinv00_02_im
487 #define cinv21_22_re cinv01_02_re
488 #define cinv21_22_im cinv01_02_im
489 #define cinv22_22_re cinv02_02_re
490 #define cinv30_22_re cinv10_02_re
491 #define cinv30_22_im cinv10_02_im
492 #define cinv31_22_re cinv11_02_re
493 #define cinv31_22_im cinv11_02_im
494 #define cinv32_22_re cinv12_02_re
495 #define cinv32_22_im cinv12_02_im
496 #define cinv20_30_re cinv00_10_re
497 #define cinv20_30_im cinv00_10_im
498 #define cinv21_30_re cinv01_10_re
499 #define cinv21_30_im cinv01_10_im
500 #define cinv22_30_re cinv02_10_re
501 #define cinv22_30_im cinv02_10_im
502 #define cinv30_30_re cinv10_10_re
503 #define cinv31_30_re cinv11_10_re
504 #define cinv31_30_im cinv11_10_im
505 #define cinv32_30_re cinv12_10_re
506 #define cinv32_30_im cinv12_10_im
507 #define cinv20_31_re cinv00_11_re
508 #define cinv20_31_im cinv00_11_im
509 #define cinv21_31_re cinv01_11_re
510 #define cinv21_31_im cinv01_11_im
511 #define cinv22_31_re cinv02_11_re
512 #define cinv22_31_im cinv02_11_im
513 #define cinv30_31_re cinv10_11_re
514 #define cinv30_31_im cinv10_11_im
515 #define cinv31_31_re cinv11_11_re
516 #define cinv32_31_re cinv12_11_re
517 #define cinv32_31_im cinv12_11_im
518 #define cinv20_32_re cinv00_12_re
519 #define cinv20_32_im cinv00_12_im
520 #define cinv21_32_re cinv01_12_re
521 #define cinv21_32_im cinv01_12_im
522 #define cinv22_32_re cinv02_12_re
523 #define cinv22_32_im cinv02_12_im
524 #define cinv30_32_re cinv10_12_re
525 #define cinv30_32_im cinv10_12_im
526 #define cinv31_32_re cinv11_12_re
527 #define cinv31_32_im cinv11_12_im
528 #define cinv32_32_re cinv12_12_re
529 
530 
531 
// Backing registers for one chiral block of the (inverse) clover matrix.
// They are declared up front so the ASSN-style clover macros can assign
// into them, instead of the READ macros declaring them locally.
532 // declare C## here and use ASSN below instead of READ
533 #ifdef CLOVER_DOUBLE
534 double2 C0;
535 double2 C1;
536 double2 C2;
537 double2 C3;
538 double2 C4;
539 double2 C5;
540 double2 C6;
541 double2 C7;
542 double2 C8;
543 double2 C9;
544 double2 C10;
545 double2 C11;
546 double2 C12;
547 double2 C13;
548 double2 C14;
549 double2 C15;
550 double2 C16;
551 double2 C17;
552 #else
553 float4 C0;
554 float4 C1;
555 float4 C2;
556 float4 C3;
557 float4 C4;
558 float4 C5;
559 float4 C6;
560 float4 C7;
561 float4 C8;
562 
// K is only needed at half precision (DD_PREC==2 is labelled "half
// precision" elsewhere in this file); presumably the per-site norm scale
// used when unpacking the short-stored clover — confirm in read_clover.h.
563 #if (DD_PREC==2)
564 float K;
565 #endif
566 
567 #endif // CLOVER_DOUBLE
// NOTE(review): original lines 569-592 (the "output spinor" o##_re/o##_im
// declarations referenced throughout the kernel body below) are missing
// from this capture — the numbering jumps from 568 to 593.  Restore them
// from the generator before compiling.
568 // output spinor
593 
594 #include "read_gauge.h"
595 #include "io_spinor.h"
596 #include "read_clover.h"
597 #include "tmc_core.h"
598 
// Site bookkeeping, filled in by the coordinate helpers below.
// coord: lattice coordinates of this thread's site (written by
//        coordsFromFaceIndex in the exterior path; 5 slots — the 5th is
//        presumably for 5-d/domain-wall compatibility, TODO confirm).
599 int coord[5];
// X: full-lattice linear site index (output of the coordinate helpers).
600 int X;
601 
// sid: this thread's site/thread index, blockIdx.x*blockDim.x+threadIdx.x.
602 int sid;
603 
// Thread-index setup and output-spinor initialization.  The interior
// kernel zeroes the 4x3 complex output accumulators; the exterior (halo)
// kernel decomposes sid into a face number and an intra-face index, and
// starts from the previously accumulated spinor (i##) so each boundary
// pass adds its contribution on top of the interior result.
// NOTE(review): original lines 613 (interior coordinate computation) and
// 644 (the load that fills i## before the o## = i## copies) are missing
// from this capture — the numbering skips them.
604 #ifdef MULTI_GPU
605 int face_idx;
606 if (kernel_type == INTERIOR_KERNEL) {
607 #endif
608 
609  sid = blockIdx.x*blockDim.x + threadIdx.x;
// guard: the launch grid may be padded beyond the active site count
610  if (sid >= param.threads) return;
611 
612  // Assume even dimensions
614 
// zero the output accumulators (o## declared in the omitted
// "output spinor" section, original lines 569-592)
615  o00_re = 0; o00_im = 0;
616  o01_re = 0; o01_im = 0;
617  o02_re = 0; o02_im = 0;
618  o10_re = 0; o10_im = 0;
619  o11_re = 0; o11_im = 0;
620  o12_re = 0; o12_im = 0;
621  o20_re = 0; o20_im = 0;
622  o21_re = 0; o21_im = 0;
623  o22_re = 0; o22_im = 0;
624  o30_re = 0; o30_im = 0;
625  o31_re = 0; o31_im = 0;
626  o32_re = 0; o32_im = 0;
627 
628 #ifdef MULTI_GPU
629 } else { // exterior kernel
630 
631  sid = blockIdx.x*blockDim.x + threadIdx.x;
632  if (sid >= param.threads) return;
633 
// split sid into which of the two faces (backward/forward) this thread
// updates, and the index within that face
634  const int face_volume = (param.threads >> 1); // volume of one face
635  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
636  face_idx = sid - face_num*face_volume; // index into the respective face
637 
638  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
639  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
640  //sp_idx = face_idx + param.ghostOffset[dim];
641 
642  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
643 
645 
// seed the output with the interior kernel's partial result
646  o00_re = i00_re; o00_im = i00_im;
647  o01_re = i01_re; o01_im = i01_im;
648  o02_re = i02_re; o02_im = i02_im;
649  o10_re = i10_re; o10_im = i10_im;
650  o11_re = i11_re; o11_im = i11_im;
651  o12_re = i12_re; o12_im = i12_im;
652  o20_re = i20_re; o20_im = i20_im;
653  o21_re = i21_re; o21_im = i21_im;
654  o22_re = i22_re; o22_im = i22_im;
655  o30_re = i30_re; o30_im = i30_im;
656  o31_re = i31_re; o31_im = i31_im;
657  o32_re = i32_re; o32_im = i32_im;
658 }
659 #endif // MULTI_GPU
660 
661 
// ---------------------------------------------------------------------------
// Hopping term, X direction, forward neighbor, projector P0- (matrix below).
// Interior threads gather the spinor at x+1, project it to two half-spinors
// (a*, b*), multiply by the local link U (rows g0*,g1*,g2*), and accumulate
// the four reconstructed spin components into o##.  Exterior threads read
// the pre-projected half-spinor from the ghost zone instead.
// NOTE(review): this capture drops original lines 685-690, 735, 738, 745,
// 752, 759, 768, 775, 782, 789, 798, 805, 812, 819 — in the sibling block
// below these hold the 'spinorFloat a0_re, ...' declarations, the gauge
// reconstruction macro, and the 'spinorFloat A0_re = 0;' initializers.
// Restore them from the generator before compiling.
662 #ifdef MULTI_GPU
// run this hop for interior sites not on the forward X face, or for
// exterior-X threads sitting exactly on that face
663 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
664  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
665 #endif
666 {
667  // Projector P0-
668  // 1 0 0 -i
669  // 0 1 -i 0
670  // 0 i 1 0
671  // i 0 0 1
672 
673 #ifdef MULTI_GPU
// neighbor checkerboard index: wrap at the forward X boundary for the
// interior kernel; index into the forward ([1]) ghost zone otherwise
674  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
675  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
676 #if (DD_PREC==2) // half precision
677  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
678 #endif
679 #else
680  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
681 #endif
682 
// forward hop uses the link stored at this site
683  const int ga_idx = sid;
684 
691 
692 #ifdef MULTI_GPU
693  if (kernel_type == INTERIOR_KERNEL) {
694 #endif
695 
696  // read spinor from device memory
697  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
698 
699  // project spinor into half spinors
700  a0_re = +i00_re+i30_im;
701  a0_im = +i00_im-i30_re;
702  a1_re = +i01_re+i31_im;
703  a1_im = +i01_im-i31_re;
704  a2_re = +i02_re+i32_im;
705  a2_im = +i02_im-i32_re;
706  b0_re = +i10_re+i20_im;
707  b0_im = +i10_im-i20_re;
708  b1_re = +i11_re+i21_im;
709  b1_im = +i11_im-i21_re;
710  b2_re = +i12_re+i22_im;
711  b2_im = +i12_im-i22_re;
712 
713 #ifdef MULTI_GPU
714  } else {
715 
716  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
717 
718  // read half spinor from device memory
719  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
720 
// ghost spinor is already projected: copy straight into the half spinors
721  a0_re = i00_re; a0_im = i00_im;
722  a1_re = i01_re; a1_im = i01_im;
723  a2_re = i02_re; a2_im = i02_im;
724  b0_re = i10_re; b0_im = i10_im;
725  b1_re = i11_re; b1_im = i11_im;
726  b2_re = i12_re; b2_im = i12_im;
727 
728  }
729 #endif // MULTI_GPU
730 
731  // read gauge matrix from device memory
732  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
733 
734  // reconstruct gauge matrix
736 
// complex 3x3 matrix-vector products: A = U*a, B = U*b, one row at a time
737  // multiply row 0
739  A0_re += g00_re * a0_re;
740  A0_re -= g00_im * a0_im;
741  A0_re += g01_re * a1_re;
742  A0_re -= g01_im * a1_im;
743  A0_re += g02_re * a2_re;
744  A0_re -= g02_im * a2_im;
746  A0_im += g00_re * a0_im;
747  A0_im += g00_im * a0_re;
748  A0_im += g01_re * a1_im;
749  A0_im += g01_im * a1_re;
750  A0_im += g02_re * a2_im;
751  A0_im += g02_im * a2_re;
753  B0_re += g00_re * b0_re;
754  B0_re -= g00_im * b0_im;
755  B0_re += g01_re * b1_re;
756  B0_re -= g01_im * b1_im;
757  B0_re += g02_re * b2_re;
758  B0_re -= g02_im * b2_im;
760  B0_im += g00_re * b0_im;
761  B0_im += g00_im * b0_re;
762  B0_im += g01_re * b1_im;
763  B0_im += g01_im * b1_re;
764  B0_im += g02_re * b2_im;
765  B0_im += g02_im * b2_re;
766 
767  // multiply row 1
769  A1_re += g10_re * a0_re;
770  A1_re -= g10_im * a0_im;
771  A1_re += g11_re * a1_re;
772  A1_re -= g11_im * a1_im;
773  A1_re += g12_re * a2_re;
774  A1_re -= g12_im * a2_im;
776  A1_im += g10_re * a0_im;
777  A1_im += g10_im * a0_re;
778  A1_im += g11_re * a1_im;
779  A1_im += g11_im * a1_re;
780  A1_im += g12_re * a2_im;
781  A1_im += g12_im * a2_re;
783  B1_re += g10_re * b0_re;
784  B1_re -= g10_im * b0_im;
785  B1_re += g11_re * b1_re;
786  B1_re -= g11_im * b1_im;
787  B1_re += g12_re * b2_re;
788  B1_re -= g12_im * b2_im;
790  B1_im += g10_re * b0_im;
791  B1_im += g10_im * b0_re;
792  B1_im += g11_re * b1_im;
793  B1_im += g11_im * b1_re;
794  B1_im += g12_re * b2_im;
795  B1_im += g12_im * b2_re;
796 
797  // multiply row 2
799  A2_re += g20_re * a0_re;
800  A2_re -= g20_im * a0_im;
801  A2_re += g21_re * a1_re;
802  A2_re -= g21_im * a1_im;
803  A2_re += g22_re * a2_re;
804  A2_re -= g22_im * a2_im;
806  A2_im += g20_re * a0_im;
807  A2_im += g20_im * a0_re;
808  A2_im += g21_re * a1_im;
809  A2_im += g21_im * a1_re;
810  A2_im += g22_re * a2_im;
811  A2_im += g22_im * a2_re;
813  B2_re += g20_re * b0_re;
814  B2_re -= g20_im * b0_im;
815  B2_re += g21_re * b1_re;
816  B2_re -= g21_im * b1_im;
817  B2_re += g22_re * b2_re;
818  B2_re -= g22_im * b2_im;
820  B2_im += g20_re * b0_im;
821  B2_im += g20_im * b0_re;
822  B2_im += g21_re * b1_im;
823  B2_im += g21_im * b1_re;
824  B2_im += g22_re * b2_im;
825  B2_im += g22_im * b2_re;
826 
// reconstruct the four spin components from the two half-spinors
// (spin 2 = +i*B, spin 3 = +i*A for this projector) and accumulate
827  o00_re += A0_re;
828  o00_im += A0_im;
829  o10_re += B0_re;
830  o10_im += B0_im;
831  o20_re -= B0_im;
832  o20_im += B0_re;
833  o30_re -= A0_im;
834  o30_im += A0_re;
835 
836  o01_re += A1_re;
837  o01_im += A1_im;
838  o11_re += B1_re;
839  o11_im += B1_im;
840  o21_re -= B1_im;
841  o21_im += B1_re;
842  o31_re -= A1_im;
843  o31_im += A1_re;
844 
845  o02_re += A2_re;
846  o02_im += A2_im;
847  o12_re += B2_re;
848  o12_im += B2_im;
849  o22_re -= B2_im;
850  o22_im += B2_re;
851  o32_re -= A2_im;
852  o32_im += A2_re;
853 
854 }
855 
// ---------------------------------------------------------------------------
// Hopping term, X direction, backward neighbor, projector P0+ (matrix
// below).  Same structure as the forward hop, but the spinor comes from
// x-1 (ghost zone [0] when on the backward face), the link lives at the
// neighbor site (or in the Vh+face_idx ghost-link slot for the exterior
// kernel), and the conjugated link gT## = U^dag is applied.
// NOTE(review): this capture drops original lines 883-888 (the
// 'spinorFloat a0_re, ...' declarations) and 933 (the gauge
// reconstruction macro) — restore from the generator before compiling.
856 #ifdef MULTI_GPU
// run for interior sites not on the backward X face, or for exterior-X
// threads sitting exactly on that face
857 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
858  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
859 #endif
860 {
861  // Projector P0+
862  // 1 0 0 i
863  // 0 1 i 0
864  // 0 -i 1 0
865  // -i 0 0 1
866 
867 #ifdef MULTI_GPU
// neighbor checkerboard index: wrap at the backward X boundary for the
// interior kernel; index into the backward ([0]) ghost zone otherwise
868  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
869  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
870 #if (DD_PREC==2) // half precision
871  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
872 #endif
873 #else
874  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
875 #endif
876 
877 #ifdef MULTI_GPU
// backward hop uses the link stored at the neighbor site; the exterior
// kernel reads it from the exchanged ghost links past the Vh body
878  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
879 #else
880  const int ga_idx = sp_idx;
881 #endif
882 
889 
890 #ifdef MULTI_GPU
891  if (kernel_type == INTERIOR_KERNEL) {
892 #endif
893 
894  // read spinor from device memory
895  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
896 
897  // project spinor into half spinors
898  a0_re = +i00_re-i30_im;
899  a0_im = +i00_im+i30_re;
900  a1_re = +i01_re-i31_im;
901  a1_im = +i01_im+i31_re;
902  a2_re = +i02_re-i32_im;
903  a2_im = +i02_im+i32_re;
904  b0_re = +i10_re-i20_im;
905  b0_im = +i10_im+i20_re;
906  b1_re = +i11_re-i21_im;
907  b1_im = +i11_im+i21_re;
908  b2_re = +i12_re-i22_im;
909  b2_im = +i12_im+i22_re;
910 
911 #ifdef MULTI_GPU
912  } else {
913 
914  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
915 
916  // read half spinor from device memory
917  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
918 
// ghost spinor is already projected: copy straight into the half spinors
919  a0_re = i00_re; a0_im = i00_im;
920  a1_re = i01_re; a1_im = i01_im;
921  a2_re = i02_re; a2_im = i02_im;
922  b0_re = i10_re; b0_im = i10_im;
923  b1_re = i11_re; b1_im = i11_im;
924  b2_re = i12_re; b2_im = i12_im;
925 
926  }
927 #endif // MULTI_GPU
928 
929  // read gauge matrix from device memory
930  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
931 
932  // reconstruct gauge matrix
934 
// complex 3x3 matrix-vector products with the conjugated link:
// A = U^dag * a, B = U^dag * b, one row at a time
935  // multiply row 0
936  spinorFloat A0_re = 0;
937  A0_re += gT00_re * a0_re;
938  A0_re -= gT00_im * a0_im;
939  A0_re += gT01_re * a1_re;
940  A0_re -= gT01_im * a1_im;
941  A0_re += gT02_re * a2_re;
942  A0_re -= gT02_im * a2_im;
943  spinorFloat A0_im = 0;
944  A0_im += gT00_re * a0_im;
945  A0_im += gT00_im * a0_re;
946  A0_im += gT01_re * a1_im;
947  A0_im += gT01_im * a1_re;
948  A0_im += gT02_re * a2_im;
949  A0_im += gT02_im * a2_re;
950  spinorFloat B0_re = 0;
951  B0_re += gT00_re * b0_re;
952  B0_re -= gT00_im * b0_im;
953  B0_re += gT01_re * b1_re;
954  B0_re -= gT01_im * b1_im;
955  B0_re += gT02_re * b2_re;
956  B0_re -= gT02_im * b2_im;
957  spinorFloat B0_im = 0;
958  B0_im += gT00_re * b0_im;
959  B0_im += gT00_im * b0_re;
960  B0_im += gT01_re * b1_im;
961  B0_im += gT01_im * b1_re;
962  B0_im += gT02_re * b2_im;
963  B0_im += gT02_im * b2_re;
964 
965  // multiply row 1
966  spinorFloat A1_re = 0;
967  A1_re += gT10_re * a0_re;
968  A1_re -= gT10_im * a0_im;
969  A1_re += gT11_re * a1_re;
970  A1_re -= gT11_im * a1_im;
971  A1_re += gT12_re * a2_re;
972  A1_re -= gT12_im * a2_im;
973  spinorFloat A1_im = 0;
974  A1_im += gT10_re * a0_im;
975  A1_im += gT10_im * a0_re;
976  A1_im += gT11_re * a1_im;
977  A1_im += gT11_im * a1_re;
978  A1_im += gT12_re * a2_im;
979  A1_im += gT12_im * a2_re;
980  spinorFloat B1_re = 0;
981  B1_re += gT10_re * b0_re;
982  B1_re -= gT10_im * b0_im;
983  B1_re += gT11_re * b1_re;
984  B1_re -= gT11_im * b1_im;
985  B1_re += gT12_re * b2_re;
986  B1_re -= gT12_im * b2_im;
987  spinorFloat B1_im = 0;
988  B1_im += gT10_re * b0_im;
989  B1_im += gT10_im * b0_re;
990  B1_im += gT11_re * b1_im;
991  B1_im += gT11_im * b1_re;
992  B1_im += gT12_re * b2_im;
993  B1_im += gT12_im * b2_re;
994 
995  // multiply row 2
996  spinorFloat A2_re = 0;
997  A2_re += gT20_re * a0_re;
998  A2_re -= gT20_im * a0_im;
999  A2_re += gT21_re * a1_re;
1000  A2_re -= gT21_im * a1_im;
1001  A2_re += gT22_re * a2_re;
1002  A2_re -= gT22_im * a2_im;
1003  spinorFloat A2_im = 0;
1004  A2_im += gT20_re * a0_im;
1005  A2_im += gT20_im * a0_re;
1006  A2_im += gT21_re * a1_im;
1007  A2_im += gT21_im * a1_re;
1008  A2_im += gT22_re * a2_im;
1009  A2_im += gT22_im * a2_re;
1010  spinorFloat B2_re = 0;
1011  B2_re += gT20_re * b0_re;
1012  B2_re -= gT20_im * b0_im;
1013  B2_re += gT21_re * b1_re;
1014  B2_re -= gT21_im * b1_im;
1015  B2_re += gT22_re * b2_re;
1016  B2_re -= gT22_im * b2_im;
1017  spinorFloat B2_im = 0;
1018  B2_im += gT20_re * b0_im;
1019  B2_im += gT20_im * b0_re;
1020  B2_im += gT21_re * b1_im;
1021  B2_im += gT21_im * b1_re;
1022  B2_im += gT22_re * b2_im;
1023  B2_im += gT22_im * b2_re;
1024 
// reconstruct the four spin components (spin 2 = -i*B, spin 3 = -i*A for
// this projector — opposite sign to the forward hop) and accumulate
1025  o00_re += A0_re;
1026  o00_im += A0_im;
1027  o10_re += B0_re;
1028  o10_im += B0_im;
1029  o20_re += B0_im;
1030  o20_im -= B0_re;
1031  o30_re += A0_im;
1032  o30_im -= A0_re;
1033 
1034  o01_re += A1_re;
1035  o01_im += A1_im;
1036  o11_re += B1_re;
1037  o11_im += B1_im;
1038  o21_re += B1_im;
1039  o21_im -= B1_re;
1040  o31_re += A1_im;
1041  o31_im -= A1_re;
1042 
1043  o02_re += A2_re;
1044  o02_im += A2_im;
1045  o12_re += B2_re;
1046  o12_im += B2_im;
1047  o22_re += B2_im;
1048  o22_im -= B2_re;
1049  o32_re += A2_im;
1050  o32_im -= A2_re;
1051 
1052 }
1053 
1054 #ifdef MULTI_GPU
1055 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
1056  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
1057 #endif
1058 {
1059  // Projector P1-
1060  // 1 0 0 -1
1061  // 0 1 1 0
1062  // 0 1 1 0
1063  // -1 0 0 1
1064 
1065 #ifdef MULTI_GPU
1066  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
1067  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1068 #if (DD_PREC==2) // half precision
1069  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1070 #endif
1071 #else
1072  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
1073 #endif
1074 
1075  const int ga_idx = sid;
1076 
1083 
1084 #ifdef MULTI_GPU
1085  if (kernel_type == INTERIOR_KERNEL) {
1086 #endif
1087 
1088  // read spinor from device memory
1089  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1090 
1091  // project spinor into half spinors
1092  a0_re = +i00_re-i30_re;
1093  a0_im = +i00_im-i30_im;
1094  a1_re = +i01_re-i31_re;
1095  a1_im = +i01_im-i31_im;
1096  a2_re = +i02_re-i32_re;
1097  a2_im = +i02_im-i32_im;
1098  b0_re = +i10_re+i20_re;
1099  b0_im = +i10_im+i20_im;
1100  b1_re = +i11_re+i21_re;
1101  b1_im = +i11_im+i21_im;
1102  b2_re = +i12_re+i22_re;
1103  b2_im = +i12_im+i22_im;
1104 
1105 #ifdef MULTI_GPU
1106  } else {
1107 
1108  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1109 
1110  // read half spinor from device memory
1111  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
1112 
1113  a0_re = i00_re; a0_im = i00_im;
1114  a1_re = i01_re; a1_im = i01_im;
1115  a2_re = i02_re; a2_im = i02_im;
1116  b0_re = i10_re; b0_im = i10_im;
1117  b1_re = i11_re; b1_im = i11_im;
1118  b2_re = i12_re; b2_im = i12_im;
1119 
1120  }
1121 #endif // MULTI_GPU
1122 
1123  // read gauge matrix from device memory
1124  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
1125 
1126  // reconstruct gauge matrix
1128 
1129  // multiply row 0
1130  spinorFloat A0_re = 0;
1131  A0_re += g00_re * a0_re;
1132  A0_re -= g00_im * a0_im;
1133  A0_re += g01_re * a1_re;
1134  A0_re -= g01_im * a1_im;
1135  A0_re += g02_re * a2_re;
1136  A0_re -= g02_im * a2_im;
1137  spinorFloat A0_im = 0;
1138  A0_im += g00_re * a0_im;
1139  A0_im += g00_im * a0_re;
1140  A0_im += g01_re * a1_im;
1141  A0_im += g01_im * a1_re;
1142  A0_im += g02_re * a2_im;
1143  A0_im += g02_im * a2_re;
1144  spinorFloat B0_re = 0;
1145  B0_re += g00_re * b0_re;
1146  B0_re -= g00_im * b0_im;
1147  B0_re += g01_re * b1_re;
1148  B0_re -= g01_im * b1_im;
1149  B0_re += g02_re * b2_re;
1150  B0_re -= g02_im * b2_im;
1151  spinorFloat B0_im = 0;
1152  B0_im += g00_re * b0_im;
1153  B0_im += g00_im * b0_re;
1154  B0_im += g01_re * b1_im;
1155  B0_im += g01_im * b1_re;
1156  B0_im += g02_re * b2_im;
1157  B0_im += g02_im * b2_re;
1158 
1159  // multiply row 1
1160  spinorFloat A1_re = 0;
1161  A1_re += g10_re * a0_re;
1162  A1_re -= g10_im * a0_im;
1163  A1_re += g11_re * a1_re;
1164  A1_re -= g11_im * a1_im;
1165  A1_re += g12_re * a2_re;
1166  A1_re -= g12_im * a2_im;
1167  spinorFloat A1_im = 0;
1168  A1_im += g10_re * a0_im;
1169  A1_im += g10_im * a0_re;
1170  A1_im += g11_re * a1_im;
1171  A1_im += g11_im * a1_re;
1172  A1_im += g12_re * a2_im;
1173  A1_im += g12_im * a2_re;
1174  spinorFloat B1_re = 0;
1175  B1_re += g10_re * b0_re;
1176  B1_re -= g10_im * b0_im;
1177  B1_re += g11_re * b1_re;
1178  B1_re -= g11_im * b1_im;
1179  B1_re += g12_re * b2_re;
1180  B1_re -= g12_im * b2_im;
1181  spinorFloat B1_im = 0;
1182  B1_im += g10_re * b0_im;
1183  B1_im += g10_im * b0_re;
1184  B1_im += g11_re * b1_im;
1185  B1_im += g11_im * b1_re;
1186  B1_im += g12_re * b2_im;
1187  B1_im += g12_im * b2_re;
1188 
1189  // multiply row 2
1190  spinorFloat A2_re = 0;
1191  A2_re += g20_re * a0_re;
1192  A2_re -= g20_im * a0_im;
1193  A2_re += g21_re * a1_re;
1194  A2_re -= g21_im * a1_im;
1195  A2_re += g22_re * a2_re;
1196  A2_re -= g22_im * a2_im;
1197  spinorFloat A2_im = 0;
1198  A2_im += g20_re * a0_im;
1199  A2_im += g20_im * a0_re;
1200  A2_im += g21_re * a1_im;
1201  A2_im += g21_im * a1_re;
1202  A2_im += g22_re * a2_im;
1203  A2_im += g22_im * a2_re;
1204  spinorFloat B2_re = 0;
1205  B2_re += g20_re * b0_re;
1206  B2_re -= g20_im * b0_im;
1207  B2_re += g21_re * b1_re;
1208  B2_re -= g21_im * b1_im;
1209  B2_re += g22_re * b2_re;
1210  B2_re -= g22_im * b2_im;
1211  spinorFloat B2_im = 0;
1212  B2_im += g20_re * b0_im;
1213  B2_im += g20_im * b0_re;
1214  B2_im += g21_re * b1_im;
1215  B2_im += g21_im * b1_re;
1216  B2_im += g22_re * b2_im;
1217  B2_im += g22_im * b2_re;
1218 
1219  o00_re += A0_re;
1220  o00_im += A0_im;
1221  o10_re += B0_re;
1222  o10_im += B0_im;
1223  o20_re += B0_re;
1224  o20_im += B0_im;
1225  o30_re -= A0_re;
1226  o30_im -= A0_im;
1227 
1228  o01_re += A1_re;
1229  o01_im += A1_im;
1230  o11_re += B1_re;
1231  o11_im += B1_im;
1232  o21_re += B1_re;
1233  o21_im += B1_im;
1234  o31_re -= A1_re;
1235  o31_im -= A1_im;
1236 
1237  o02_re += A2_re;
1238  o02_im += A2_im;
1239  o12_re += B2_re;
1240  o12_im += B2_im;
1241  o22_re += B2_re;
1242  o22_im += B2_im;
1243  o32_re -= A2_re;
1244  o32_im -= A2_im;
1245 
1246 }
1247 
// --------------------------------------------------------------------------
// Backward hop in the Y dimension (mu = 1, negative direction): gather the
// neighbor spinor at y-1 (or the ghost-zone half spinor in the exterior
// kernel), apply the P1+ projector below, multiply the two half spinors by
// the conjugated gauge link (gT**), and accumulate into the output spinor o.
// NOTE(review): machine-generated code -- comments only, code untouched.
// --------------------------------------------------------------------------
1248 #ifdef MULTI_GPU
1249 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1250  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1251 #endif
1252 {
1253  // Projector P1+
1254  // 1 0 0 1
1255  // 0 1 -1 0
1256  // 0 -1 1 0
1257  // 1 0 0 1
1258 
// Half-lattice (checkerboard) index of the y-1 neighbor, with wrap-around at
// coord[1]==0; the exterior kernel instead indexes the ghost buffer
// (ghostOffset[...][0] = back face of this dimension).
1259 #ifdef MULTI_GPU
1260  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1261  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1262 #if (DD_PREC==2) // half precision
1263  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1264 #endif
1265 #else
1266  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1267 #endif
1268 
// The backward link lives at the neighbor site; ghost links are stored
// appended after the Vh body links.
1269 #ifdef MULTI_GPU
1270  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1271 #else
1272  const int ga_idx = sp_idx;
1273 #endif
1274 
1281 
1282 #ifdef MULTI_GPU
1283  if (kernel_type == INTERIOR_KERNEL) {
1284 #endif
1285 
1286  // read spinor from device memory
1287  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1288 
1289  // project spinor into half spinors
1290  a0_re = +i00_re+i30_re;
1291  a0_im = +i00_im+i30_im;
1292  a1_re = +i01_re+i31_re;
1293  a1_im = +i01_im+i31_im;
1294  a2_re = +i02_re+i32_re;
1295  a2_im = +i02_im+i32_im;
1296  b0_re = +i10_re-i20_re;
1297  b0_im = +i10_im-i20_im;
1298  b1_re = +i11_re-i21_re;
1299  b1_im = +i11_im-i21_im;
1300  b2_re = +i12_re-i22_re;
1301  b2_im = +i12_im-i22_im;
1302 
1303 #ifdef MULTI_GPU
1304  } else {
1305 
1306  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1307 
// The ghost buffer is copied through without projecting -- presumably the
// sender packed an already-projected half spinor (face index 3 = backward Y).
1308  // read half spinor from device memory
1309  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1310 
1311  a0_re = i00_re; a0_im = i00_im;
1312  a1_re = i01_re; a1_im = i01_im;
1313  a2_re = i02_re; a2_im = i02_im;
1314  b0_re = i10_re; b0_im = i10_im;
1315  b1_re = i11_re; b1_im = i11_im;
1316  b2_re = i12_re; b2_im = i12_im;
1317 
1318  }
1319 #endif // MULTI_GPU
1320 
1321  // read gauge matrix from device memory
1322  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1323 
1324  // reconstruct gauge matrix
1326 
// (A,B) = U^dagger * (a,b): complex 3x3 matrix-vector products, expanded
// per color row; gT** are the conjugate-transposed link elements.
1327  // multiply row 0
1328  spinorFloat A0_re = 0;
1329  A0_re += gT00_re * a0_re;
1330  A0_re -= gT00_im * a0_im;
1331  A0_re += gT01_re * a1_re;
1332  A0_re -= gT01_im * a1_im;
1333  A0_re += gT02_re * a2_re;
1334  A0_re -= gT02_im * a2_im;
1335  spinorFloat A0_im = 0;
1336  A0_im += gT00_re * a0_im;
1337  A0_im += gT00_im * a0_re;
1338  A0_im += gT01_re * a1_im;
1339  A0_im += gT01_im * a1_re;
1340  A0_im += gT02_re * a2_im;
1341  A0_im += gT02_im * a2_re;
1342  spinorFloat B0_re = 0;
1343  B0_re += gT00_re * b0_re;
1344  B0_re -= gT00_im * b0_im;
1345  B0_re += gT01_re * b1_re;
1346  B0_re -= gT01_im * b1_im;
1347  B0_re += gT02_re * b2_re;
1348  B0_re -= gT02_im * b2_im;
1349  spinorFloat B0_im = 0;
1350  B0_im += gT00_re * b0_im;
1351  B0_im += gT00_im * b0_re;
1352  B0_im += gT01_re * b1_im;
1353  B0_im += gT01_im * b1_re;
1354  B0_im += gT02_re * b2_im;
1355  B0_im += gT02_im * b2_re;
1356 
1357  // multiply row 1
1358  spinorFloat A1_re = 0;
1359  A1_re += gT10_re * a0_re;
1360  A1_re -= gT10_im * a0_im;
1361  A1_re += gT11_re * a1_re;
1362  A1_re -= gT11_im * a1_im;
1363  A1_re += gT12_re * a2_re;
1364  A1_re -= gT12_im * a2_im;
1365  spinorFloat A1_im = 0;
1366  A1_im += gT10_re * a0_im;
1367  A1_im += gT10_im * a0_re;
1368  A1_im += gT11_re * a1_im;
1369  A1_im += gT11_im * a1_re;
1370  A1_im += gT12_re * a2_im;
1371  A1_im += gT12_im * a2_re;
1372  spinorFloat B1_re = 0;
1373  B1_re += gT10_re * b0_re;
1374  B1_re -= gT10_im * b0_im;
1375  B1_re += gT11_re * b1_re;
1376  B1_re -= gT11_im * b1_im;
1377  B1_re += gT12_re * b2_re;
1378  B1_re -= gT12_im * b2_im;
1379  spinorFloat B1_im = 0;
1380  B1_im += gT10_re * b0_im;
1381  B1_im += gT10_im * b0_re;
1382  B1_im += gT11_re * b1_im;
1383  B1_im += gT11_im * b1_re;
1384  B1_im += gT12_re * b2_im;
1385  B1_im += gT12_im * b2_re;
1386 
1387  // multiply row 2
1388  spinorFloat A2_re = 0;
1389  A2_re += gT20_re * a0_re;
1390  A2_re -= gT20_im * a0_im;
1391  A2_re += gT21_re * a1_re;
1392  A2_re -= gT21_im * a1_im;
1393  A2_re += gT22_re * a2_re;
1394  A2_re -= gT22_im * a2_im;
1395  spinorFloat A2_im = 0;
1396  A2_im += gT20_re * a0_im;
1397  A2_im += gT20_im * a0_re;
1398  A2_im += gT21_re * a1_im;
1399  A2_im += gT21_im * a1_re;
1400  A2_im += gT22_re * a2_im;
1401  A2_im += gT22_im * a2_re;
1402  spinorFloat B2_re = 0;
1403  B2_re += gT20_re * b0_re;
1404  B2_re -= gT20_im * b0_im;
1405  B2_re += gT21_re * b1_re;
1406  B2_re -= gT21_im * b1_im;
1407  B2_re += gT22_re * b2_re;
1408  B2_re -= gT22_im * b2_im;
1409  spinorFloat B2_im = 0;
1410  B2_im += gT20_re * b0_im;
1411  B2_im += gT20_im * b0_re;
1412  B2_im += gT21_re * b1_im;
1413  B2_im += gT21_im * b1_re;
1414  B2_im += gT22_re * b2_im;
1415  B2_im += gT22_im * b2_re;
1416 
// Reconstruct the full spinor from the projected halves: per the P1+ matrix
// above, spin row 2 = -B and spin row 3 = +A, so only A and B are computed.
1417  o00_re += A0_re;
1418  o00_im += A0_im;
1419  o10_re += B0_re;
1420  o10_im += B0_im;
1421  o20_re -= B0_re;
1422  o20_im -= B0_im;
1423  o30_re += A0_re;
1424  o30_im += A0_im;
1425 
1426  o01_re += A1_re;
1427  o01_im += A1_im;
1428  o11_re += B1_re;
1429  o11_im += B1_im;
1430  o21_re -= B1_re;
1431  o21_im -= B1_im;
1432  o31_re += A1_re;
1433  o31_im += A1_im;
1434 
1435  o02_re += A2_re;
1436  o02_im += A2_im;
1437  o12_re += B2_re;
1438  o12_im += B2_im;
1439  o22_re -= B2_re;
1440  o22_im -= B2_im;
1441  o32_re += A2_re;
1442  o32_im += A2_im;
1443 
1444 }
1445 
// --------------------------------------------------------------------------
// Forward hop in the Z dimension (mu = 2, positive direction): gather the
// neighbor spinor at z+1 (or the ghost half spinor in the exterior kernel),
// apply the P2- projector below, multiply by the local gauge link (g**),
// and accumulate into the output spinor o.
// NOTE(review): machine-generated code -- comments only, code untouched.
// --------------------------------------------------------------------------
1446 #ifdef MULTI_GPU
1447 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1448  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1449 #endif
1450 {
1451  // Projector P2-
1452  // 1 0 -i 0
1453  // 0 1 0 i
1454  // i 0 1 0
1455  // 0 -i 0 1
1456 
// Half-lattice index of the z+1 neighbor, wrapping at the upper boundary;
// the exterior kernel indexes the ghost buffer ([1] = forward face).
1457 #ifdef MULTI_GPU
1458  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1459  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1460 #if (DD_PREC==2) // half precision
1461  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1462 #endif
1463 #else
1464  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1465 #endif
1466 
// The forward link is stored at this site, so index with the thread's own sid.
1467  const int ga_idx = sid;
1468 
1475 
1476 #ifdef MULTI_GPU
1477  if (kernel_type == INTERIOR_KERNEL) {
1478 #endif
1479 
1480  // read spinor from device memory
1481  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1482 
// Apply P2-; the +/- i entries of the projector mix real and imaginary parts.
1483  // project spinor into half spinors
1484  a0_re = +i00_re+i20_im;
1485  a0_im = +i00_im-i20_re;
1486  a1_re = +i01_re+i21_im;
1487  a1_im = +i01_im-i21_re;
1488  a2_re = +i02_re+i22_im;
1489  a2_im = +i02_im-i22_re;
1490  b0_re = +i10_re-i30_im;
1491  b0_im = +i10_im+i30_re;
1492  b1_re = +i11_re-i31_im;
1493  b1_im = +i11_im+i31_re;
1494  b2_re = +i12_re-i32_im;
1495  b2_im = +i12_im+i32_re;
1496 
1497 #ifdef MULTI_GPU
1498  } else {
1499 
1500  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1501 
// Ghost buffer is copied through unprojected -- presumably pre-projected by
// the sender (face index 4 = forward Z).
1502  // read half spinor from device memory
1503  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1504 
1505  a0_re = i00_re; a0_im = i00_im;
1506  a1_re = i01_re; a1_im = i01_im;
1507  a2_re = i02_re; a2_im = i02_im;
1508  b0_re = i10_re; b0_im = i10_im;
1509  b1_re = i11_re; b1_im = i11_im;
1510  b2_re = i12_re; b2_im = i12_im;
1511 
1512  }
1513 #endif // MULTI_GPU
1514 
1515  // read gauge matrix from device memory
1516  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1517 
1518  // reconstruct gauge matrix
1520 
// (A,B) = U * (a,b): complex 3x3 matrix-vector products, expanded per row.
1521  // multiply row 0
1522  spinorFloat A0_re = 0;
1523  A0_re += g00_re * a0_re;
1524  A0_re -= g00_im * a0_im;
1525  A0_re += g01_re * a1_re;
1526  A0_re -= g01_im * a1_im;
1527  A0_re += g02_re * a2_re;
1528  A0_re -= g02_im * a2_im;
1529  spinorFloat A0_im = 0;
1530  A0_im += g00_re * a0_im;
1531  A0_im += g00_im * a0_re;
1532  A0_im += g01_re * a1_im;
1533  A0_im += g01_im * a1_re;
1534  A0_im += g02_re * a2_im;
1535  A0_im += g02_im * a2_re;
1536  spinorFloat B0_re = 0;
1537  B0_re += g00_re * b0_re;
1538  B0_re -= g00_im * b0_im;
1539  B0_re += g01_re * b1_re;
1540  B0_re -= g01_im * b1_im;
1541  B0_re += g02_re * b2_re;
1542  B0_re -= g02_im * b2_im;
1543  spinorFloat B0_im = 0;
1544  B0_im += g00_re * b0_im;
1545  B0_im += g00_im * b0_re;
1546  B0_im += g01_re * b1_im;
1547  B0_im += g01_im * b1_re;
1548  B0_im += g02_re * b2_im;
1549  B0_im += g02_im * b2_re;
1550 
1551  // multiply row 1
1552  spinorFloat A1_re = 0;
1553  A1_re += g10_re * a0_re;
1554  A1_re -= g10_im * a0_im;
1555  A1_re += g11_re * a1_re;
1556  A1_re -= g11_im * a1_im;
1557  A1_re += g12_re * a2_re;
1558  A1_re -= g12_im * a2_im;
1559  spinorFloat A1_im = 0;
1560  A1_im += g10_re * a0_im;
1561  A1_im += g10_im * a0_re;
1562  A1_im += g11_re * a1_im;
1563  A1_im += g11_im * a1_re;
1564  A1_im += g12_re * a2_im;
1565  A1_im += g12_im * a2_re;
1566  spinorFloat B1_re = 0;
1567  B1_re += g10_re * b0_re;
1568  B1_re -= g10_im * b0_im;
1569  B1_re += g11_re * b1_re;
1570  B1_re -= g11_im * b1_im;
1571  B1_re += g12_re * b2_re;
1572  B1_re -= g12_im * b2_im;
1573  spinorFloat B1_im = 0;
1574  B1_im += g10_re * b0_im;
1575  B1_im += g10_im * b0_re;
1576  B1_im += g11_re * b1_im;
1577  B1_im += g11_im * b1_re;
1578  B1_im += g12_re * b2_im;
1579  B1_im += g12_im * b2_re;
1580 
1581  // multiply row 2
1582  spinorFloat A2_re = 0;
1583  A2_re += g20_re * a0_re;
1584  A2_re -= g20_im * a0_im;
1585  A2_re += g21_re * a1_re;
1586  A2_re -= g21_im * a1_im;
1587  A2_re += g22_re * a2_re;
1588  A2_re -= g22_im * a2_im;
1589  spinorFloat A2_im = 0;
1590  A2_im += g20_re * a0_im;
1591  A2_im += g20_im * a0_re;
1592  A2_im += g21_re * a1_im;
1593  A2_im += g21_im * a1_re;
1594  A2_im += g22_re * a2_im;
1595  A2_im += g22_im * a2_re;
1596  spinorFloat B2_re = 0;
1597  B2_re += g20_re * b0_re;
1598  B2_re -= g20_im * b0_im;
1599  B2_re += g21_re * b1_re;
1600  B2_re -= g21_im * b1_im;
1601  B2_re += g22_re * b2_re;
1602  B2_re -= g22_im * b2_im;
1603  spinorFloat B2_im = 0;
1604  B2_im += g20_re * b0_im;
1605  B2_im += g20_im * b0_re;
1606  B2_im += g21_re * b1_im;
1607  B2_im += g21_im * b1_re;
1608  B2_im += g22_re * b2_im;
1609  B2_im += g22_im * b2_re;
1610 
// Reconstruct: per the P2- matrix, spin row 2 = +i*A and spin row 3 = -i*B
// (multiplying by i swaps re/im with a sign flip).
1611  o00_re += A0_re;
1612  o00_im += A0_im;
1613  o10_re += B0_re;
1614  o10_im += B0_im;
1615  o20_re -= A0_im;
1616  o20_im += A0_re;
1617  o30_re += B0_im;
1618  o30_im -= B0_re;
1619 
1620  o01_re += A1_re;
1621  o01_im += A1_im;
1622  o11_re += B1_re;
1623  o11_im += B1_im;
1624  o21_re -= A1_im;
1625  o21_im += A1_re;
1626  o31_re += B1_im;
1627  o31_im -= B1_re;
1628 
1629  o02_re += A2_re;
1630  o02_im += A2_im;
1631  o12_re += B2_re;
1632  o12_im += B2_im;
1633  o22_re -= A2_im;
1634  o22_im += A2_re;
1635  o32_re += B2_im;
1636  o32_im -= B2_re;
1637 
1638 }
1639 
// --------------------------------------------------------------------------
// Backward hop in the Z dimension (mu = 2, negative direction): gather the
// neighbor spinor at z-1 (or the ghost half spinor in the exterior kernel),
// apply the P2+ projector below, multiply by the conjugated gauge link
// (gT**), and accumulate into the output spinor o.
// NOTE(review): machine-generated code -- comments only, code untouched.
// --------------------------------------------------------------------------
1640 #ifdef MULTI_GPU
1641 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1642  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1643 #endif
1644 {
1645  // Projector P2+
1646  // 1 0 i 0
1647  // 0 1 0 -i
1648  // -i 0 1 0
1649  // 0 i 0 1
1650 
// Half-lattice index of the z-1 neighbor, wrapping at coord[2]==0; the
// exterior kernel indexes the ghost buffer ([0] = backward face).
1651 #ifdef MULTI_GPU
1652  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1653  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1654 #if (DD_PREC==2) // half precision
1655  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1656 #endif
1657 #else
1658  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1659 #endif
1660 
// Backward link lives at the neighbor site; ghost links follow the Vh body.
1661 #ifdef MULTI_GPU
1662  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1663 #else
1664  const int ga_idx = sp_idx;
1665 #endif
1666 
1673 
1674 #ifdef MULTI_GPU
1675  if (kernel_type == INTERIOR_KERNEL) {
1676 #endif
1677 
1678  // read spinor from device memory
1679  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1680 
// Apply P2+; signs of the re/im mixing are opposite to the P2- block above.
1681  // project spinor into half spinors
1682  a0_re = +i00_re-i20_im;
1683  a0_im = +i00_im+i20_re;
1684  a1_re = +i01_re-i21_im;
1685  a1_im = +i01_im+i21_re;
1686  a2_re = +i02_re-i22_im;
1687  a2_im = +i02_im+i22_re;
1688  b0_re = +i10_re+i30_im;
1689  b0_im = +i10_im-i30_re;
1690  b1_re = +i11_re+i31_im;
1691  b1_im = +i11_im-i31_re;
1692  b2_re = +i12_re+i32_im;
1693  b2_im = +i12_im-i32_re;
1694 
1695 #ifdef MULTI_GPU
1696  } else {
1697 
1698  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1699 
// Ghost buffer is copied through unprojected -- presumably pre-projected by
// the sender (face index 5 = backward Z).
1700  // read half spinor from device memory
1701  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1702 
1703  a0_re = i00_re; a0_im = i00_im;
1704  a1_re = i01_re; a1_im = i01_im;
1705  a2_re = i02_re; a2_im = i02_im;
1706  b0_re = i10_re; b0_im = i10_im;
1707  b1_re = i11_re; b1_im = i11_im;
1708  b2_re = i12_re; b2_im = i12_im;
1709 
1710  }
1711 #endif // MULTI_GPU
1712 
1713  // read gauge matrix from device memory
1714  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1715 
1716  // reconstruct gauge matrix
1718 
// (A,B) = U^dagger * (a,b): complex 3x3 matrix-vector products per row.
1719  // multiply row 0
1720  spinorFloat A0_re = 0;
1721  A0_re += gT00_re * a0_re;
1722  A0_re -= gT00_im * a0_im;
1723  A0_re += gT01_re * a1_re;
1724  A0_re -= gT01_im * a1_im;
1725  A0_re += gT02_re * a2_re;
1726  A0_re -= gT02_im * a2_im;
1727  spinorFloat A0_im = 0;
1728  A0_im += gT00_re * a0_im;
1729  A0_im += gT00_im * a0_re;
1730  A0_im += gT01_re * a1_im;
1731  A0_im += gT01_im * a1_re;
1732  A0_im += gT02_re * a2_im;
1733  A0_im += gT02_im * a2_re;
1734  spinorFloat B0_re = 0;
1735  B0_re += gT00_re * b0_re;
1736  B0_re -= gT00_im * b0_im;
1737  B0_re += gT01_re * b1_re;
1738  B0_re -= gT01_im * b1_im;
1739  B0_re += gT02_re * b2_re;
1740  B0_re -= gT02_im * b2_im;
1741  spinorFloat B0_im = 0;
1742  B0_im += gT00_re * b0_im;
1743  B0_im += gT00_im * b0_re;
1744  B0_im += gT01_re * b1_im;
1745  B0_im += gT01_im * b1_re;
1746  B0_im += gT02_re * b2_im;
1747  B0_im += gT02_im * b2_re;
1748 
1749  // multiply row 1
1750  spinorFloat A1_re = 0;
1751  A1_re += gT10_re * a0_re;
1752  A1_re -= gT10_im * a0_im;
1753  A1_re += gT11_re * a1_re;
1754  A1_re -= gT11_im * a1_im;
1755  A1_re += gT12_re * a2_re;
1756  A1_re -= gT12_im * a2_im;
1757  spinorFloat A1_im = 0;
1758  A1_im += gT10_re * a0_im;
1759  A1_im += gT10_im * a0_re;
1760  A1_im += gT11_re * a1_im;
1761  A1_im += gT11_im * a1_re;
1762  A1_im += gT12_re * a2_im;
1763  A1_im += gT12_im * a2_re;
1764  spinorFloat B1_re = 0;
1765  B1_re += gT10_re * b0_re;
1766  B1_re -= gT10_im * b0_im;
1767  B1_re += gT11_re * b1_re;
1768  B1_re -= gT11_im * b1_im;
1769  B1_re += gT12_re * b2_re;
1770  B1_re -= gT12_im * b2_im;
1771  spinorFloat B1_im = 0;
1772  B1_im += gT10_re * b0_im;
1773  B1_im += gT10_im * b0_re;
1774  B1_im += gT11_re * b1_im;
1775  B1_im += gT11_im * b1_re;
1776  B1_im += gT12_re * b2_im;
1777  B1_im += gT12_im * b2_re;
1778 
1779  // multiply row 2
1780  spinorFloat A2_re = 0;
1781  A2_re += gT20_re * a0_re;
1782  A2_re -= gT20_im * a0_im;
1783  A2_re += gT21_re * a1_re;
1784  A2_re -= gT21_im * a1_im;
1785  A2_re += gT22_re * a2_re;
1786  A2_re -= gT22_im * a2_im;
1787  spinorFloat A2_im = 0;
1788  A2_im += gT20_re * a0_im;
1789  A2_im += gT20_im * a0_re;
1790  A2_im += gT21_re * a1_im;
1791  A2_im += gT21_im * a1_re;
1792  A2_im += gT22_re * a2_im;
1793  A2_im += gT22_im * a2_re;
1794  spinorFloat B2_re = 0;
1795  B2_re += gT20_re * b0_re;
1796  B2_re -= gT20_im * b0_im;
1797  B2_re += gT21_re * b1_re;
1798  B2_re -= gT21_im * b1_im;
1799  B2_re += gT22_re * b2_re;
1800  B2_re -= gT22_im * b2_im;
1801  spinorFloat B2_im = 0;
1802  B2_im += gT20_re * b0_im;
1803  B2_im += gT20_im * b0_re;
1804  B2_im += gT21_re * b1_im;
1805  B2_im += gT21_im * b1_re;
1806  B2_im += gT22_re * b2_im;
1807  B2_im += gT22_im * b2_re;
1808 
// Reconstruct: per the P2+ matrix, spin row 2 = -i*A and spin row 3 = +i*B.
1809  o00_re += A0_re;
1810  o00_im += A0_im;
1811  o10_re += B0_re;
1812  o10_im += B0_im;
1813  o20_re += A0_im;
1814  o20_im -= A0_re;
1815  o30_re -= B0_im;
1816  o30_im += B0_re;
1817 
1818  o01_re += A1_re;
1819  o01_im += A1_im;
1820  o11_re += B1_re;
1821  o11_im += B1_im;
1822  o21_re += A1_im;
1823  o21_im -= A1_re;
1824  o31_re -= B1_im;
1825  o31_im += B1_re;
1826 
1827  o02_re += A2_re;
1828  o02_im += A2_im;
1829  o12_re += B2_re;
1830  o12_im += B2_im;
1831  o22_re += A2_im;
1832  o22_im -= A2_re;
1833  o32_re -= B2_im;
1834  o32_im += B2_re;
1835 
1836 }
1837 
// --------------------------------------------------------------------------
// Forward hop in the T dimension (mu = 3, positive direction): projector P3-
// annihilates the upper spin components, so only the lower half is gathered
// and scaled by 2. A fast path skips the gauge multiply when the field is
// gauge fixed (temporal links are unity away from the boundary).
// NOTE(review): this doc listing elides several generated lines (gaps in the
// embedded numbering, e.g. the spinor-read macro at 1875/1946 and the
// identity-gauge A*/B* assignments at 1911-1916); code kept byte-identical.
// --------------------------------------------------------------------------
1838 #ifdef MULTI_GPU
1839 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1840  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1841 #endif
1842 {
1843  // Projector P3-
1844  // 0 0 0 0
1845  // 0 0 0 0
1846  // 0 0 2 0
1847  // 0 0 0 2
1848 
// Half-lattice index of the t+1 neighbor, wrapping at the upper boundary;
// the exterior kernel indexes the ghost buffer ([1] = forward face).
1849 #ifdef MULTI_GPU
1850  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1851  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1852 #if (DD_PREC==2) // half precision
1853  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1854 #endif
1855 #else
1856  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1857 #endif
1858 
// Forward link is stored at this site.
1859  const int ga_idx = sid;
1860 
// Gauge-fixed fast path: for sites below the last time slice the temporal
// link is the identity, so the SU(3) multiply can be skipped entirely.
1861  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1862  {
1869 
1870 #ifdef MULTI_GPU
1871  if (kernel_type == INTERIOR_KERNEL) {
1872 #endif
1873 
1874  // read spinor from device memory
1876 
// Only the lower spin components survive P3-, each doubled.
1877  // project spinor into half spinors
1878  a0_re = +2*i20_re;
1879  a0_im = +2*i20_im;
1880  a1_re = +2*i21_re;
1881  a1_im = +2*i21_im;
1882  a2_re = +2*i22_re;
1883  a2_im = +2*i22_im;
1884  b0_re = +2*i30_re;
1885  b0_im = +2*i30_im;
1886  b1_re = +2*i31_re;
1887  b1_im = +2*i31_im;
1888  b2_re = +2*i32_re;
1889  b2_im = +2*i32_im;
1890 
1891 #ifdef MULTI_GPU
1892  } else {
1893 
1894  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
// NOTE(review): declared int although it scales floats; TPROJSCALE is
// presumably an integral factor (e.g. 1 or 2) -- confirm against the macro.
1895  const int t_proj_scale = TPROJSCALE;
1896 
1897  // read half spinor from device memory
1898  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1899 
1900  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1901  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1902  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1903  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1904  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1905  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1906 
1907  }
1908 #endif // MULTI_GPU
1909 
// NOTE(review): the A*/B* = a/b assignments for this identity-link path are
// elided in this listing (original lines 1911-1916).
1910  // identity gauge matrix
1917 
1918  o20_re += A0_re;
1919  o20_im += A0_im;
1920  o30_re += B0_re;
1921  o30_im += B0_im;
1922 
1923  o21_re += A1_re;
1924  o21_im += A1_im;
1925  o31_re += B1_re;
1926  o31_im += B1_im;
1927 
1928  o22_re += A2_re;
1929  o22_im += A2_im;
1930  o32_re += B2_re;
1931  o32_im += B2_im;
1932 
// General path: full SU(3) multiply by the temporal link.
1933  } else {
1940 
1941 #ifdef MULTI_GPU
1942  if (kernel_type == INTERIOR_KERNEL) {
1943 #endif
1944 
1945  // read spinor from device memory
1947 
1948  // project spinor into half spinors
1949  a0_re = +2*i20_re;
1950  a0_im = +2*i20_im;
1951  a1_re = +2*i21_re;
1952  a1_im = +2*i21_im;
1953  a2_re = +2*i22_re;
1954  a2_im = +2*i22_im;
1955  b0_re = +2*i30_re;
1956  b0_im = +2*i30_im;
1957  b1_re = +2*i31_re;
1958  b1_im = +2*i31_im;
1959  b2_re = +2*i32_re;
1960  b2_im = +2*i32_im;
1961 
1962 #ifdef MULTI_GPU
1963  } else {
1964 
1965  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1966  const int t_proj_scale = TPROJSCALE;
1967 
1968  // read half spinor from device memory
1969  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1970 
1971  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1972  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1973  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1974  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1975  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1976  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1977 
1978  }
1979 #endif // MULTI_GPU
1980 
1981  // read gauge matrix from device memory
1982  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1983 
1984  // reconstruct gauge matrix
1986 
// (A,B) = U * (a,b): complex 3x3 matrix-vector products per row.
1987  // multiply row 0
1988  spinorFloat A0_re = 0;
1989  A0_re += g00_re * a0_re;
1990  A0_re -= g00_im * a0_im;
1991  A0_re += g01_re * a1_re;
1992  A0_re -= g01_im * a1_im;
1993  A0_re += g02_re * a2_re;
1994  A0_re -= g02_im * a2_im;
1995  spinorFloat A0_im = 0;
1996  A0_im += g00_re * a0_im;
1997  A0_im += g00_im * a0_re;
1998  A0_im += g01_re * a1_im;
1999  A0_im += g01_im * a1_re;
2000  A0_im += g02_re * a2_im;
2001  A0_im += g02_im * a2_re;
2002  spinorFloat B0_re = 0;
2003  B0_re += g00_re * b0_re;
2004  B0_re -= g00_im * b0_im;
2005  B0_re += g01_re * b1_re;
2006  B0_re -= g01_im * b1_im;
2007  B0_re += g02_re * b2_re;
2008  B0_re -= g02_im * b2_im;
2009  spinorFloat B0_im = 0;
2010  B0_im += g00_re * b0_im;
2011  B0_im += g00_im * b0_re;
2012  B0_im += g01_re * b1_im;
2013  B0_im += g01_im * b1_re;
2014  B0_im += g02_re * b2_im;
2015  B0_im += g02_im * b2_re;
2016 
2017  // multiply row 1
2018  spinorFloat A1_re = 0;
2019  A1_re += g10_re * a0_re;
2020  A1_re -= g10_im * a0_im;
2021  A1_re += g11_re * a1_re;
2022  A1_re -= g11_im * a1_im;
2023  A1_re += g12_re * a2_re;
2024  A1_re -= g12_im * a2_im;
2025  spinorFloat A1_im = 0;
2026  A1_im += g10_re * a0_im;
2027  A1_im += g10_im * a0_re;
2028  A1_im += g11_re * a1_im;
2029  A1_im += g11_im * a1_re;
2030  A1_im += g12_re * a2_im;
2031  A1_im += g12_im * a2_re;
2032  spinorFloat B1_re = 0;
2033  B1_re += g10_re * b0_re;
2034  B1_re -= g10_im * b0_im;
2035  B1_re += g11_re * b1_re;
2036  B1_re -= g11_im * b1_im;
2037  B1_re += g12_re * b2_re;
2038  B1_re -= g12_im * b2_im;
2039  spinorFloat B1_im = 0;
2040  B1_im += g10_re * b0_im;
2041  B1_im += g10_im * b0_re;
2042  B1_im += g11_re * b1_im;
2043  B1_im += g11_im * b1_re;
2044  B1_im += g12_re * b2_im;
2045  B1_im += g12_im * b2_re;
2046 
2047  // multiply row 2
2048  spinorFloat A2_re = 0;
2049  A2_re += g20_re * a0_re;
2050  A2_re -= g20_im * a0_im;
2051  A2_re += g21_re * a1_re;
2052  A2_re -= g21_im * a1_im;
2053  A2_re += g22_re * a2_re;
2054  A2_re -= g22_im * a2_im;
2055  spinorFloat A2_im = 0;
2056  A2_im += g20_re * a0_im;
2057  A2_im += g20_im * a0_re;
2058  A2_im += g21_re * a1_im;
2059  A2_im += g21_im * a1_re;
2060  A2_im += g22_re * a2_im;
2061  A2_im += g22_im * a2_re;
2062  spinorFloat B2_re = 0;
2063  B2_re += g20_re * b0_re;
2064  B2_re -= g20_im * b0_im;
2065  B2_re += g21_re * b1_re;
2066  B2_re -= g21_im * b1_im;
2067  B2_re += g22_re * b2_re;
2068  B2_re -= g22_im * b2_im;
2069  spinorFloat B2_im = 0;
2070  B2_im += g20_re * b0_im;
2071  B2_im += g20_im * b0_re;
2072  B2_im += g21_re * b1_im;
2073  B2_im += g21_im * b1_re;
2074  B2_im += g22_re * b2_im;
2075  B2_im += g22_im * b2_re;
2076 
// P3- only touches the lower spin components of the output.
2077  o20_re += A0_re;
2078  o20_im += A0_im;
2079  o30_re += B0_re;
2080  o30_im += B0_im;
2081 
2082  o21_re += A1_re;
2083  o21_im += A1_im;
2084  o31_re += B1_re;
2085  o31_im += B1_im;
2086 
2087  o22_re += A2_re;
2088  o22_im += A2_im;
2089  o32_re += B2_re;
2090  o32_im += B2_im;
2091 
2092  }
2093 }
2094 
// --------------------------------------------------------------------------
// Backward hop in the T dimension (mu = 3, negative direction): projector
// P3+ annihilates the lower spin components, so only the upper half is
// gathered (scaled by 2) and multiplied by the conjugated link (gT**).
// A fast path skips the multiply when the gauge field is fixed.
// NOTE(review): machine-generated code -- comments only, code untouched;
// gaps in the embedded numbering are lines elided by the doc renderer.
// --------------------------------------------------------------------------
2095 #ifdef MULTI_GPU
2096 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
2097  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
2098 #endif
2099 {
2100  // Projector P3+
2101  // 2 0 0 0
2102  // 0 2 0 0
2103  // 0 0 0 0
2104  // 0 0 0 0
2105 
// Half-lattice index of the t-1 neighbor, wrapping at coord[3]==0; the
// exterior kernel indexes the ghost buffer ([0] = backward face).
2106 #ifdef MULTI_GPU
2107  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
2108  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
2109 #if (DD_PREC==2) // half precision
2110  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2111 #endif
2112 #else
2113  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
2114 #endif
2115 
// Backward link lives at the neighbor site; ghost links follow the Vh body.
2116 #ifdef MULTI_GPU
2117  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2118 #else
2119  const int ga_idx = sp_idx;
2120 #endif
2121 
// Gauge-fixed fast path: identity temporal link, no SU(3) multiply needed.
2122  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2123  {
2130 
2131 #ifdef MULTI_GPU
2132  if (kernel_type == INTERIOR_KERNEL) {
2133 #endif
2134 
// READ_SPINOR_UP loads only the upper spin components -- the lower half is
// projected away by P3+ and never needed.
2135  // read spinor from device memory
2136  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2137 
2138  // project spinor into half spinors
2139  a0_re = +2*i00_re;
2140  a0_im = +2*i00_im;
2141  a1_re = +2*i01_re;
2142  a1_im = +2*i01_im;
2143  a2_re = +2*i02_re;
2144  a2_im = +2*i02_im;
2145  b0_re = +2*i10_re;
2146  b0_im = +2*i10_im;
2147  b1_re = +2*i11_re;
2148  b1_im = +2*i11_im;
2149  b2_re = +2*i12_re;
2150  b2_im = +2*i12_im;
2151 
2152 #ifdef MULTI_GPU
2153  } else {
2154 
2155  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2156  const int t_proj_scale = TPROJSCALE;
2157 
2158  // read half spinor from device memory
2159  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2160 
2161  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2162  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2163  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2164  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2165  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2166  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2167 
2168  }
2169 #endif // MULTI_GPU
2170 
// NOTE(review): the A*/B* = a/b assignments for this identity-link path are
// elided in this listing (original lines 2172-2177).
2171  // identity gauge matrix
2178 
2179  o00_re += A0_re;
2180  o00_im += A0_im;
2181  o10_re += B0_re;
2182  o10_im += B0_im;
2183 
2184  o01_re += A1_re;
2185  o01_im += A1_im;
2186  o11_re += B1_re;
2187  o11_im += B1_im;
2188 
2189  o02_re += A2_re;
2190  o02_im += A2_im;
2191  o12_re += B2_re;
2192  o12_im += B2_im;
2193 
// General path: full SU(3)^dagger multiply by the neighbor's temporal link.
2194  } else {
2201 
2202 #ifdef MULTI_GPU
2203  if (kernel_type == INTERIOR_KERNEL) {
2204 #endif
2205 
2206  // read spinor from device memory
2207  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2208 
2209  // project spinor into half spinors
2210  a0_re = +2*i00_re;
2211  a0_im = +2*i00_im;
2212  a1_re = +2*i01_re;
2213  a1_im = +2*i01_im;
2214  a2_re = +2*i02_re;
2215  a2_im = +2*i02_im;
2216  b0_re = +2*i10_re;
2217  b0_im = +2*i10_im;
2218  b1_re = +2*i11_re;
2219  b1_im = +2*i11_im;
2220  b2_re = +2*i12_re;
2221  b2_im = +2*i12_im;
2222 
2223 #ifdef MULTI_GPU
2224  } else {
2225 
2226  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2227  const int t_proj_scale = TPROJSCALE;
2228 
2229  // read half spinor from device memory
2230  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2231 
2232  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2233  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2234  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2235  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2236  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2237  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2238 
2239  }
2240 #endif // MULTI_GPU
2241 
2242  // read gauge matrix from device memory
2243  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2244 
2245  // reconstruct gauge matrix
2247 
// (A,B) = U^dagger * (a,b): complex 3x3 matrix-vector products per row.
2248  // multiply row 0
2249  spinorFloat A0_re = 0;
2250  A0_re += gT00_re * a0_re;
2251  A0_re -= gT00_im * a0_im;
2252  A0_re += gT01_re * a1_re;
2253  A0_re -= gT01_im * a1_im;
2254  A0_re += gT02_re * a2_re;
2255  A0_re -= gT02_im * a2_im;
2256  spinorFloat A0_im = 0;
2257  A0_im += gT00_re * a0_im;
2258  A0_im += gT00_im * a0_re;
2259  A0_im += gT01_re * a1_im;
2260  A0_im += gT01_im * a1_re;
2261  A0_im += gT02_re * a2_im;
2262  A0_im += gT02_im * a2_re;
2263  spinorFloat B0_re = 0;
2264  B0_re += gT00_re * b0_re;
2265  B0_re -= gT00_im * b0_im;
2266  B0_re += gT01_re * b1_re;
2267  B0_re -= gT01_im * b1_im;
2268  B0_re += gT02_re * b2_re;
2269  B0_re -= gT02_im * b2_im;
2270  spinorFloat B0_im = 0;
2271  B0_im += gT00_re * b0_im;
2272  B0_im += gT00_im * b0_re;
2273  B0_im += gT01_re * b1_im;
2274  B0_im += gT01_im * b1_re;
2275  B0_im += gT02_re * b2_im;
2276  B0_im += gT02_im * b2_re;
2277 
2278  // multiply row 1
2279  spinorFloat A1_re = 0;
2280  A1_re += gT10_re * a0_re;
2281  A1_re -= gT10_im * a0_im;
2282  A1_re += gT11_re * a1_re;
2283  A1_re -= gT11_im * a1_im;
2284  A1_re += gT12_re * a2_re;
2285  A1_re -= gT12_im * a2_im;
2286  spinorFloat A1_im = 0;
2287  A1_im += gT10_re * a0_im;
2288  A1_im += gT10_im * a0_re;
2289  A1_im += gT11_re * a1_im;
2290  A1_im += gT11_im * a1_re;
2291  A1_im += gT12_re * a2_im;
2292  A1_im += gT12_im * a2_re;
2293  spinorFloat B1_re = 0;
2294  B1_re += gT10_re * b0_re;
2295  B1_re -= gT10_im * b0_im;
2296  B1_re += gT11_re * b1_re;
2297  B1_re -= gT11_im * b1_im;
2298  B1_re += gT12_re * b2_re;
2299  B1_re -= gT12_im * b2_im;
2300  spinorFloat B1_im = 0;
2301  B1_im += gT10_re * b0_im;
2302  B1_im += gT10_im * b0_re;
2303  B1_im += gT11_re * b1_im;
2304  B1_im += gT11_im * b1_re;
2305  B1_im += gT12_re * b2_im;
2306  B1_im += gT12_im * b2_re;
2307 
2308  // multiply row 2
2309  spinorFloat A2_re = 0;
2310  A2_re += gT20_re * a0_re;
2311  A2_re -= gT20_im * a0_im;
2312  A2_re += gT21_re * a1_re;
2313  A2_re -= gT21_im * a1_im;
2314  A2_re += gT22_re * a2_re;
2315  A2_re -= gT22_im * a2_im;
2316  spinorFloat A2_im = 0;
2317  A2_im += gT20_re * a0_im;
2318  A2_im += gT20_im * a0_re;
2319  A2_im += gT21_re * a1_im;
2320  A2_im += gT21_im * a1_re;
2321  A2_im += gT22_re * a2_im;
2322  A2_im += gT22_im * a2_re;
2323  spinorFloat B2_re = 0;
2324  B2_re += gT20_re * b0_re;
2325  B2_re -= gT20_im * b0_im;
2326  B2_re += gT21_re * b1_re;
2327  B2_re -= gT21_im * b1_im;
2328  B2_re += gT22_re * b2_re;
2329  B2_re -= gT22_im * b2_im;
2330  spinorFloat B2_im = 0;
2331  B2_im += gT20_re * b0_im;
2332  B2_im += gT20_im * b0_re;
2333  B2_im += gT21_re * b1_im;
2334  B2_im += gT21_im * b1_re;
2335  B2_im += gT22_re * b2_im;
2336  B2_im += gT22_im * b2_re;
2337 
// P3+ only touches the upper spin components of the output.
2338  o00_re += A0_re;
2339  o00_im += A0_im;
2340  o10_re += B0_re;
2341  o10_im += B0_im;
2342 
2343  o01_re += A1_re;
2344  o01_im += A1_im;
2345  o11_re += B1_re;
2346  o11_im += B1_im;
2347 
2348  o02_re += A2_re;
2349  o02_im += A2_im;
2350  o12_re += B2_re;
2351  o12_im += B2_im;
2352 
2353  }
2354 }
2355 
2356 #ifdef MULTI_GPU
2357 
2358 int incomplete = 0; // Have all 8 contributions been computed for this site?
2359 
2360 switch(kernel_type) { // intentional fall-through
2361 
2362 case INTERIOR_KERNEL:
2363  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2364 case EXTERIOR_KERNEL_T:
2365  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2366 case EXTERIOR_KERNEL_Z:
2367  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2368 case EXTERIOR_KERNEL_Y:
2369  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2370 }
2371 
2372 if (!incomplete)
2373 #endif // MULTI_GPU
2374 {
2375 #ifdef SPINOR_DOUBLE
2376  spinorFloat a = param.a;
2377 #else
2378  spinorFloat a = param.a_f;
2379 #endif
2380 #ifdef DSLASH_XPAY
2381 #ifdef SPINOR_DOUBLE
2382  spinorFloat b = param.b;
2383 #else
2384  spinorFloat b = param.b_f;
2385 #endif
2386  READ_ACCUM(ACCUMTEX, param.sp_stride)
2387 
2388 #ifndef CLOVER_TWIST_XPAY
2389  //perform invert twist first:
2390 #ifndef DYNAMIC_CLOVER
2391  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2392 #else
2394 #endif
2395  o00_re = b*o00_re + acc00_re;
2396  o00_im = b*o00_im + acc00_im;
2397  o01_re = b*o01_re + acc01_re;
2398  o01_im = b*o01_im + acc01_im;
2399  o02_re = b*o02_re + acc02_re;
2400  o02_im = b*o02_im + acc02_im;
2401  o10_re = b*o10_re + acc10_re;
2402  o10_im = b*o10_im + acc10_im;
2403  o11_re = b*o11_re + acc11_re;
2404  o11_im = b*o11_im + acc11_im;
2405  o12_re = b*o12_re + acc12_re;
2406  o12_im = b*o12_im + acc12_im;
2407  o20_re = b*o20_re + acc20_re;
2408  o20_im = b*o20_im + acc20_im;
2409  o21_re = b*o21_re + acc21_re;
2410  o21_im = b*o21_im + acc21_im;
2411  o22_re = b*o22_re + acc22_re;
2412  o22_im = b*o22_im + acc22_im;
2413  o30_re = b*o30_re + acc30_re;
2414  o30_im = b*o30_im + acc30_im;
2415  o31_re = b*o31_re + acc31_re;
2416  o31_im = b*o31_im + acc31_im;
2417  o32_re = b*o32_re + acc32_re;
2418  o32_im = b*o32_im + acc32_im;
2419 #else
2420  APPLY_CLOVER_TWIST(c, a, acc);
2421  o00_re = b*o00_re + acc00_re;
2422  o00_im = b*o00_im + acc00_im;
2423  o01_re = b*o01_re + acc01_re;
2424  o01_im = b*o01_im + acc01_im;
2425  o02_re = b*o02_re + acc02_re;
2426  o02_im = b*o02_im + acc02_im;
2427  o10_re = b*o10_re + acc10_re;
2428  o10_im = b*o10_im + acc10_im;
2429  o11_re = b*o11_re + acc11_re;
2430  o11_im = b*o11_im + acc11_im;
2431  o12_re = b*o12_re + acc12_re;
2432  o12_im = b*o12_im + acc12_im;
2433  o20_re = b*o20_re + acc20_re;
2434  o20_im = b*o20_im + acc20_im;
2435  o21_re = b*o21_re + acc21_re;
2436  o21_im = b*o21_im + acc21_im;
2437  o22_re = b*o22_re + acc22_re;
2438  o22_im = b*o22_im + acc22_im;
2439  o30_re = b*o30_re + acc30_re;
2440  o30_im = b*o30_im + acc30_im;
2441  o31_re = b*o31_re + acc31_re;
2442  o31_im = b*o31_im + acc31_im;
2443  o32_re = b*o32_re + acc32_re;
2444  o32_im = b*o32_im + acc32_im;
2445 #endif//CLOVER_TWIST_XPAY
2446 #else //no XPAY
2447 #ifndef DYNAMIC_CLOVER
2448  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2449 #else
2451 #endif
2452 #endif
2453 }
2454 
2455 // write spinor field back to device memory
2456 WRITE_SPINOR(param.sp_stride);
2457 
2458 // undefine to prevent warning when precision is changed
// spinorFloat is double or float depending on SPINOR_DOUBLE (see top of header)
2459 #undef spinorFloat
// g00..g22: gauge matrix component accessors — presumably #define'd earlier in
// this header alongside the spinor accessors (definition site not in view)
2460 #undef g00_re
2461 #undef g00_im
2462 #undef g01_re
2463 #undef g01_im
2464 #undef g02_re
2465 #undef g02_im
2466 #undef g10_re
2467 #undef g10_im
2468 #undef g11_re
2469 #undef g11_im
2470 #undef g12_re
2471 #undef g12_im
2472 #undef g20_re
2473 #undef g20_im
2474 #undef g21_re
2475 #undef g21_im
2476 #undef g22_re
2477 #undef g22_im
2478 
// i00..i32: input spinor accessors, mapped to registers I0..I11 (precision-dependent
// mapping: one double2 or half of a float4 per color/spin component)
2479 #undef i00_re
2480 #undef i00_im
2481 #undef i01_re
2482 #undef i01_im
2483 #undef i02_re
2484 #undef i02_im
2485 #undef i10_re
2486 #undef i10_im
2487 #undef i11_re
2488 #undef i11_im
2489 #undef i12_re
2490 #undef i12_im
2491 #undef i20_re
2492 #undef i20_im
2493 #undef i21_re
2494 #undef i21_im
2495 #undef i22_re
2496 #undef i22_im
2497 #undef i30_re
2498 #undef i30_im
2499 #undef i31_re
2500 #undef i31_im
2501 #undef i32_re
2502 #undef i32_im
2503 
// c*_*: clover-term matrix element accessors (used by APPLY_CLOVER_TWIST* above;
// naming suggests the lower-triangular/diagonal storage of the Hermitian clover block
// — definition site not in view, confirm against tmc_core.h)
2504 #undef c00_00_re
2505 #undef c01_01_re
2506 #undef c02_02_re
2507 #undef c10_10_re
2508 #undef c11_11_re
2509 #undef c12_12_re
2510 #undef c01_00_re
2511 #undef c01_00_im
2512 #undef c02_00_re
2513 #undef c02_00_im
2514 #undef c10_00_re
2515 #undef c10_00_im
2516 #undef c11_00_re
2517 #undef c11_00_im
2518 #undef c12_00_re
2519 #undef c12_00_im
2520 #undef c02_01_re
2521 #undef c02_01_im
2522 #undef c10_01_re
2523 #undef c10_01_im
2524 #undef c11_01_re
2525 #undef c11_01_im
2526 #undef c12_01_re
2527 #undef c12_01_im
2528 #undef c10_02_re
2529 #undef c10_02_im
2530 #undef c11_02_re
2531 #undef c11_02_im
2532 #undef c12_02_re
2533 #undef c12_02_im
2534 #undef c11_10_re
2535 #undef c11_10_im
2536 #undef c12_10_re
2537 #undef c12_10_im
2538 #undef c12_11_re
2539 #undef c12_11_im
2540 
// cinv*_*: same layout as c*_* but for the inverse clover term, consumed by
// APPLY_CLOVER_TWIST_INV(c, cinv, a, o) above
2541 #undef cinv00_00_re
2542 #undef cinv01_01_re
2543 #undef cinv02_02_re
2544 #undef cinv10_10_re
2545 #undef cinv11_11_re
2546 #undef cinv12_12_re
2547 #undef cinv01_00_re
2548 #undef cinv01_00_im
2549 #undef cinv02_00_re
2550 #undef cinv02_00_im
2551 #undef cinv10_00_re
2552 #undef cinv10_00_im
2553 #undef cinv11_00_re
2554 #undef cinv11_00_im
2555 #undef cinv12_00_re
2556 #undef cinv12_00_im
2557 #undef cinv02_01_re
2558 #undef cinv02_01_im
2559 #undef cinv10_01_re
2560 #undef cinv10_01_im
2561 #undef cinv11_01_re
2562 #undef cinv11_01_im
2563 #undef cinv12_01_re
2564 #undef cinv12_01_im
2565 #undef cinv10_02_re
2566 #undef cinv10_02_im
2567 #undef cinv11_02_re
2568 #undef cinv11_02_im
2569 #undef cinv12_02_re
2570 #undef cinv12_02_im
2571 #undef cinv11_10_re
2572 #undef cinv11_10_im
2573 #undef cinv12_10_re
2574 #undef cinv12_10_im
2575 #undef cinv12_11_re
2576 #undef cinv12_11_im
2577 
// acc00..acc32: accumulator spinor accessors, mapped to registers accum0..accum11
// (read by READ_ACCUM and the XPAY update above)
2578 #undef acc00_re
2579 #undef acc00_im
2580 #undef acc01_re
2581 #undef acc01_im
2582 #undef acc02_re
2583 #undef acc02_im
2584 #undef acc10_re
2585 #undef acc10_im
2586 #undef acc11_re
2587 #undef acc11_im
2588 #undef acc12_re
2589 #undef acc12_im
2590 #undef acc20_re
2591 #undef acc20_im
2592 #undef acc21_re
2593 #undef acc21_im
2594 #undef acc22_re
2595 #undef acc22_im
2596 #undef acc30_re
2597 #undef acc30_im
2598 #undef acc31_re
2599 #undef acc31_im
2600 #undef acc32_re
2601 #undef acc32_im
2602 
2603 
2604 
// compiler-dependent qualifier: empty under the NVVM compiler (CUDA >= 4.01,
// compute capability >= 2.0), 'volatile' under the legacy Open64 compiler
2605 #undef VOLATILE
#define gT22_im
#define g00_re
#define g01_im
spinorFloat b0_re
#define g20_re
#define i12_im
VOLATILE spinorFloat o00_re
spinorFloat A2_re
dim3 dim3 blockDim
#define i01_im
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:832
VOLATILE spinorFloat o10_re
#define g21_im
#define i31_re
#define g12_re
spinorFloat a0_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
spinorFloat b2_im
#define i20_im
#define g21_re
float4 C1
#define acc11_im
#define g10_im
RECONSTRUCT_GAUGE_MATRIX(0)
APPLY_CLOVER_TWIST_INV(c, cinv, a, o)
spinorFloat B2_im
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o32_im
spinorFloat B1_re
int sp_idx
#define acc31_re
#define gT12_re
float4 C0
#define g02_re
#define acc32_re
VOLATILE spinorFloat o01_re
#define i21_re
#define gT02_re
const int ga_idx
#define gT01_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define APPLY_CLOVER_TWIST_DYN_INV(c, a, reg)
Definition: tmc_core.h:2004
#define i11_re
#define i11_im
#define i21_im
#define g11_re
#define acc12_im
VOLATILE spinorFloat o30_re
#define acc01_re
spinorFloat A0_im
VOLATILE spinorFloat o20_re
spinorFloat a2_re
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
#define GAUGE0TEX
float4 C6
#define gT20_im
#define gT22_re
#define g00_im
spinorFloat A0_re
spinorFloat A1_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
#define i22_im
#define i30_re
#define gT21_re
#define VOLATILE
VOLATILE spinorFloat o01_im
#define acc01_im
#define acc30_re
#define acc20_re
#define g01_re
#define i30_im
WRITE_SPINOR(param.sp_stride)
#define spinorFloat
spinorFloat a0_re
#define acc10_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o11_re
#define gT11_re
#define acc12_re
VOLATILE spinorFloat o12_im
#define GAUGE1TEX
#define acc00_re
#define i31_im
#define gT10_re
#define SPINORTEX
VOLATILE spinorFloat o02_re
#define acc21_re
VOLATILE spinorFloat o31_re
float4 C8
VOLATILE spinorFloat o21_re
#define acc00_im
#define acc22_im
spinorFloat B0_re
#define READ_INTERMEDIATE_SPINOR
#define i20_re
#define i32_re
VOLATILE spinorFloat o30_im
int coord[5]
VOLATILE spinorFloat o10_im
#define g02_im
int X[4]
Definition: quda.h:29
float4 C3
#define acc02_re
VOLATILE spinorFloat o02_im
#define i00_im
spinorFloat B1_im
float4 C2
#define READ_SPINOR_GHOST
#define i22_re
VOLATILE spinorFloat o12_re
#define gT21_im
VOLATILE spinorFloat o00_im
spinorFloat b2_re
#define gT02_im
VOLATILE spinorFloat o22_im
#define acc31_im
#define acc30_im
#define acc32_im
#define i02_re
#define i32_im
#define acc02_im
float4 C7
#define INTERTEX
spinorFloat b0_im
#define g22_re
#define acc10_im
spinorFloat a1_im
spinorFloat b1_im
int face_idx
#define gT20_re
#define gT12_im
#define acc22_re
float4 C4
spinorFloat A1_re
spinorFloat B2_re
#define acc20_im
spinorFloat B0_im
spinorFloat a1_re
const void * c
#define i02_im
const int face_num
#define TPROJSCALE
#define g22_im
#define gT01_im
#define i10_im
VOLATILE spinorFloat o11_im
#define GHOSTSPINORTEX
#define i12_re
spinorFloat a2_im
VOLATILE spinorFloat o31_im
#define g10_re
float4 C5
#define acc11_re
#define g12_im
#define gT10_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define gT11_im
VOLATILE spinorFloat o22_re
#define gT00_im
#define a
#define acc21_im
#define i10_re
VOLATILE spinorFloat o20_im
#define i00_re
#define g20_im
#define gT00_re
#define i01_re
#define g11_im
spinorFloat A2_im
spinorFloat b1_re
VOLATILE spinorFloat o32_re