QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tmc_dslash_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// NOTE(review): the numeric prefix on every line is Doxygen listing residue,
// not part of the code.  This header appears to be auto-generated kernel-body
// code that is textually #include'd into a dslash kernel -- it is not a
// standalone translation unit.  Confirm against the QUDA dslash generator.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// VOLATILE expands to nothing under the NVVM back end (CUDA >= 4.1 targeting
// sm_20+); the older Open64 back end gets a real 'volatile' qualifier,
// presumably to suppress unsafe register optimizations -- TODO confirm the
// original motivation.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Named accessors i<spin><color>_{re,im} for the 4-spin x 3-color complex
// input spinor, and acc<spin><color>_{re,im} for the accumulator spinor.
// Double precision: one complex number per two-component register
// (I0..I11 / accum0..accum11, .x = re, .y = im).  Single precision: two
// complex numbers packed per four-component register (I0..I5 / accum0..accum5).
// NOTE(review): the I#/accum# registers themselves are declared by the
// READ_SPINOR/ASSN machinery outside this view -- confirm in io_spinor.h.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
// accumulator spinor (same component layout as the input spinor above)
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
// single precision: two complex numbers per four-component register
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
// accumulator spinor (same packed layout as the input spinor above)
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Named accessors g<row><col>_{re,im} for the 3x3 complex SU(3) gauge link.
// GAUGE_FLOAT2 layout: one complex entry per two-component register G0..G8;
// otherwise two complex entries are packed per four-component register G0..G4.
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// Hermitian conjugate (dagger) of the gauge link: gT_ij = conj(g_ji).
// Defined in terms of the g## accessors above, so it tracks whichever
// register layout (GAUGE_FLOAT2 or float4) is active.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of clover term
// Accessors c<i>_<j>_{re,im} for the 6x6 Hermitian chiral block: only the
// real diagonal (c00_00..c12_12) and the lower triangle are stored in the
// C registers; the upper triangle is reconstructed by conjugation below.
// CLOVER_DOUBLE uses one real/complex entry per double2 register (C0..C17);
// otherwise entries are packed two-per-float4 register (C0..C8).
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// upper triangle of the Hermitian block: c_ij = conj(c_ji)
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of clover term (reuses C0,...,C9)
// NOTE(review): the register list in the comment above looks stale -- the
// double-precision branch uses C0..C17 and the single-precision branch C0..C8.
// The aliases below simply rename the first-block accessors (spins 2,3 map
// onto spins 0,1); presumably the C registers are re-loaded with the
// second-block data before these are used -- confirm in tmc_core.h.
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 
354 // first chiral block of inverted clover term
// Mirrors the clover-term accessors above, but for the inverted clover
// matrix; maps onto the same C registers, so the two sets must not be
// live at the same time.  Same Hermitian storage: real diagonal plus
// lower triangle stored, upper triangle reconstructed by conjugation below.
355 #ifdef CLOVER_DOUBLE
356 #define cinv00_00_re C0.x
357 #define cinv01_01_re C0.y
358 #define cinv02_02_re C1.x
359 #define cinv10_10_re C1.y
360 #define cinv11_11_re C2.x
361 #define cinv12_12_re C2.y
362 #define cinv01_00_re C3.x
363 #define cinv01_00_im C3.y
364 #define cinv02_00_re C4.x
365 #define cinv02_00_im C4.y
366 #define cinv10_00_re C5.x
367 #define cinv10_00_im C5.y
368 #define cinv11_00_re C6.x
369 #define cinv11_00_im C6.y
370 #define cinv12_00_re C7.x
371 #define cinv12_00_im C7.y
372 #define cinv02_01_re C8.x
373 #define cinv02_01_im C8.y
374 #define cinv10_01_re C9.x
375 #define cinv10_01_im C9.y
376 #define cinv11_01_re C10.x
377 #define cinv11_01_im C10.y
378 #define cinv12_01_re C11.x
379 #define cinv12_01_im C11.y
380 #define cinv10_02_re C12.x
381 #define cinv10_02_im C12.y
382 #define cinv11_02_re C13.x
383 #define cinv11_02_im C13.y
384 #define cinv12_02_re C14.x
385 #define cinv12_02_im C14.y
386 #define cinv11_10_re C15.x
387 #define cinv11_10_im C15.y
388 #define cinv12_10_re C16.x
389 #define cinv12_10_im C16.y
390 #define cinv12_11_re C17.x
391 #define cinv12_11_im C17.y
392 #else
393 #define cinv00_00_re C0.x
394 #define cinv01_01_re C0.y
395 #define cinv02_02_re C0.z
396 #define cinv10_10_re C0.w
397 #define cinv11_11_re C1.x
398 #define cinv12_12_re C1.y
399 #define cinv01_00_re C1.z
400 #define cinv01_00_im C1.w
401 #define cinv02_00_re C2.x
402 #define cinv02_00_im C2.y
403 #define cinv10_00_re C2.z
404 #define cinv10_00_im C2.w
405 #define cinv11_00_re C3.x
406 #define cinv11_00_im C3.y
407 #define cinv12_00_re C3.z
408 #define cinv12_00_im C3.w
409 #define cinv02_01_re C4.x
410 #define cinv02_01_im C4.y
411 #define cinv10_01_re C4.z
412 #define cinv10_01_im C4.w
413 #define cinv11_01_re C5.x
414 #define cinv11_01_im C5.y
415 #define cinv12_01_re C5.z
416 #define cinv12_01_im C5.w
417 #define cinv10_02_re C6.x
418 #define cinv10_02_im C6.y
419 #define cinv11_02_re C6.z
420 #define cinv11_02_im C6.w
421 #define cinv12_02_re C7.x
422 #define cinv12_02_im C7.y
423 #define cinv11_10_re C7.z
424 #define cinv11_10_im C7.w
425 #define cinv12_10_re C8.x
426 #define cinv12_10_im C8.y
427 #define cinv12_11_re C8.z
428 #define cinv12_11_im C8.w
429 #endif // CLOVER_DOUBLE
430 
// upper triangle of the Hermitian block: cinv_ij = conj(cinv_ji)
431 #define cinv00_01_re (+cinv01_00_re)
432 #define cinv00_01_im (-cinv01_00_im)
433 #define cinv00_02_re (+cinv02_00_re)
434 #define cinv00_02_im (-cinv02_00_im)
435 #define cinv01_02_re (+cinv02_01_re)
436 #define cinv01_02_im (-cinv02_01_im)
437 #define cinv00_10_re (+cinv10_00_re)
438 #define cinv00_10_im (-cinv10_00_im)
439 #define cinv01_10_re (+cinv10_01_re)
440 #define cinv01_10_im (-cinv10_01_im)
441 #define cinv02_10_re (+cinv10_02_re)
442 #define cinv02_10_im (-cinv10_02_im)
443 #define cinv00_11_re (+cinv11_00_re)
444 #define cinv00_11_im (-cinv11_00_im)
445 #define cinv01_11_re (+cinv11_01_re)
446 #define cinv01_11_im (-cinv11_01_im)
447 #define cinv02_11_re (+cinv11_02_re)
448 #define cinv02_11_im (-cinv11_02_im)
449 #define cinv10_11_re (+cinv11_10_re)
450 #define cinv10_11_im (-cinv11_10_im)
451 #define cinv00_12_re (+cinv12_00_re)
452 #define cinv00_12_im (-cinv12_00_im)
453 #define cinv01_12_re (+cinv12_01_re)
454 #define cinv01_12_im (-cinv12_01_im)
455 #define cinv02_12_re (+cinv12_02_re)
456 #define cinv02_12_im (-cinv12_02_im)
457 #define cinv10_12_re (+cinv12_10_re)
458 #define cinv10_12_im (-cinv12_10_im)
459 #define cinv11_12_re (+cinv12_11_re)
460 #define cinv11_12_im (-cinv12_11_im)
461 
462 // second chiral block of inverted clover term (reuses C0,...,C9)
// NOTE(review): the register list in the comment above looks stale -- the
// double-precision branch uses C0..C17 and the single-precision branch C0..C8.
// The aliases below rename the first-block accessors (spins 2,3 map onto
// spins 0,1); presumably the C registers are re-loaded with the second-block
// data before these are used -- confirm in tmc_core.h.
463 #define cinv20_20_re cinv00_00_re
464 #define cinv21_20_re cinv01_00_re
465 #define cinv21_20_im cinv01_00_im
466 #define cinv22_20_re cinv02_00_re
467 #define cinv22_20_im cinv02_00_im
468 #define cinv30_20_re cinv10_00_re
469 #define cinv30_20_im cinv10_00_im
470 #define cinv31_20_re cinv11_00_re
471 #define cinv31_20_im cinv11_00_im
472 #define cinv32_20_re cinv12_00_re
473 #define cinv32_20_im cinv12_00_im
474 #define cinv20_21_re cinv00_01_re
475 #define cinv20_21_im cinv00_01_im
476 #define cinv21_21_re cinv01_01_re
477 #define cinv22_21_re cinv02_01_re
478 #define cinv22_21_im cinv02_01_im
479 #define cinv30_21_re cinv10_01_re
480 #define cinv30_21_im cinv10_01_im
481 #define cinv31_21_re cinv11_01_re
482 #define cinv31_21_im cinv11_01_im
483 #define cinv32_21_re cinv12_01_re
484 #define cinv32_21_im cinv12_01_im
485 #define cinv20_22_re cinv00_02_re
486 #define cinv20_22_im cinv00_02_im
487 #define cinv21_22_re cinv01_02_re
488 #define cinv21_22_im cinv01_02_im
489 #define cinv22_22_re cinv02_02_re
490 #define cinv30_22_re cinv10_02_re
491 #define cinv30_22_im cinv10_02_im
492 #define cinv31_22_re cinv11_02_re
493 #define cinv31_22_im cinv11_02_im
494 #define cinv32_22_re cinv12_02_re
495 #define cinv32_22_im cinv12_02_im
496 #define cinv20_30_re cinv00_10_re
497 #define cinv20_30_im cinv00_10_im
498 #define cinv21_30_re cinv01_10_re
499 #define cinv21_30_im cinv01_10_im
500 #define cinv22_30_re cinv02_10_re
501 #define cinv22_30_im cinv02_10_im
502 #define cinv30_30_re cinv10_10_re
503 #define cinv31_30_re cinv11_10_re
504 #define cinv31_30_im cinv11_10_im
505 #define cinv32_30_re cinv12_10_re
506 #define cinv32_30_im cinv12_10_im
507 #define cinv20_31_re cinv00_11_re
508 #define cinv20_31_im cinv00_11_im
509 #define cinv21_31_re cinv01_11_re
510 #define cinv21_31_im cinv01_11_im
511 #define cinv22_31_re cinv02_11_re
512 #define cinv22_31_im cinv02_11_im
513 #define cinv30_31_re cinv10_11_re
514 #define cinv30_31_im cinv10_11_im
515 #define cinv31_31_re cinv11_11_re
516 #define cinv32_31_re cinv12_11_re
517 #define cinv32_31_im cinv12_11_im
518 #define cinv20_32_re cinv00_12_re
519 #define cinv20_32_im cinv00_12_im
520 #define cinv21_32_re cinv01_12_re
521 #define cinv21_32_im cinv01_12_im
522 #define cinv22_32_re cinv02_12_re
523 #define cinv22_32_im cinv02_12_im
524 #define cinv30_32_re cinv10_12_re
525 #define cinv30_32_im cinv10_12_im
526 #define cinv31_32_re cinv11_12_re
527 #define cinv31_32_im cinv11_12_im
528 #define cinv32_32_re cinv12_12_re
529 
530 
531 
532 // declare C## here and use ASSN below instead of READ
// Registers backing the clover / inverted-clover accessor macros above.
// Double precision: 18 double2 registers (one real/complex entry each).
// Single/half precision: 9 float4 registers (entries packed two-per-register).
533 #ifdef CLOVER_DOUBLE
534 double2 C0;
535 double2 C1;
536 double2 C2;
537 double2 C3;
538 double2 C4;
539 double2 C5;
540 double2 C6;
541 double2 C7;
542 double2 C8;
543 double2 C9;
544 double2 C10;
545 double2 C11;
546 double2 C12;
547 double2 C13;
548 double2 C14;
549 double2 C15;
550 double2 C16;
551 double2 C17;
552 #else
553 float4 C0;
554 float4 C1;
555 float4 C2;
556 float4 C3;
557 float4 C4;
558 float4 C5;
559 float4 C6;
560 float4 C7;
561 float4 C8;
562 
563 #if (DD_PREC==2)
// NOTE(review): K is presumably the per-site norm/scale factor used when
// reading the half-precision clover field -- confirm in read_clover.h.
564 float K;
565 #endif
566 
567 #endif // CLOVER_DOUBLE
568 // output spinor
593 
594 #include "read_gauge.h"
595 #include "io_spinor.h"
596 #include "read_clover.h"
597 #include "tmc_core.h"
598 
599 int x1, x2, x3, x4;
600 int X;
601 
602 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
603 int sp_norm_idx;
604 #endif // MULTI_GPU half precision
605 
606 int sid;
607 
608 #ifdef MULTI_GPU
609 int face_idx;
611 #endif
612 
613  sid = blockIdx.x*blockDim.x + threadIdx.x;
614  if (sid >= param.threads) return;
615 
616  // Inline by hand for the moment and assume even dimensions
617  const int dims[] = {X1, X2, X3, X4};
618  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
619 
620  o00_re = 0; o00_im = 0;
621  o01_re = 0; o01_im = 0;
622  o02_re = 0; o02_im = 0;
623  o10_re = 0; o10_im = 0;
624  o11_re = 0; o11_im = 0;
625  o12_re = 0; o12_im = 0;
626  o20_re = 0; o20_im = 0;
627  o21_re = 0; o21_im = 0;
628  o22_re = 0; o22_im = 0;
629  o30_re = 0; o30_im = 0;
630  o31_re = 0; o31_im = 0;
631  o32_re = 0; o32_im = 0;
632 
633 #ifdef MULTI_GPU
634 } else { // exterior kernel
635 
636  sid = blockIdx.x*blockDim.x + threadIdx.x;
637  if (sid >= param.threads) return;
638 
639  const int dim = static_cast<int>(kernel_type);
640  const int face_volume = (param.threads >> 1); // volume of one face
641  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
642  face_idx = sid - face_num*face_volume; // index into the respective face
643 
644  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
645  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
646  //sp_idx = face_idx + param.ghostOffset[dim];
647 
648 #if (DD_PREC==2) // half precision
649  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
650 #endif
651 
652  const int dims[] = {X1, X2, X3, X4};
653  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
654 
655  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
656 
657  o00_re = i00_re; o00_im = i00_im;
658  o01_re = i01_re; o01_im = i01_im;
659  o02_re = i02_re; o02_im = i02_im;
660  o10_re = i10_re; o10_im = i10_im;
661  o11_re = i11_re; o11_im = i11_im;
662  o12_re = i12_re; o12_im = i12_im;
663  o20_re = i20_re; o20_im = i20_im;
664  o21_re = i21_re; o21_im = i21_im;
665  o22_re = i22_re; o22_im = i22_im;
666  o30_re = i30_re; o30_im = i30_im;
667  o31_re = i31_re; o31_im = i31_im;
668  o32_re = i32_re; o32_im = i32_im;
669 }
670 #endif // MULTI_GPU
671 
672 
673 #ifdef MULTI_GPU
674 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
676 #endif
677 {
678  // Projector P0-
679  // 1 0 0 -i
680  // 0 1 -i 0
681  // 0 i 1 0
682  // i 0 0 1
683 
684 #ifdef MULTI_GPU
685  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
686  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
687 #else
688  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
689 #endif
690 
691  const int ga_idx = sid;
692 
699 
700 #ifdef MULTI_GPU
701  if (kernel_type == INTERIOR_KERNEL) {
702 #endif
703 
704  // read spinor from device memory
705  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
706 
707  // project spinor into half spinors
708  a0_re = +i00_re+i30_im;
709  a0_im = +i00_im-i30_re;
710  a1_re = +i01_re+i31_im;
711  a1_im = +i01_im-i31_re;
712  a2_re = +i02_re+i32_im;
713  a2_im = +i02_im-i32_re;
714  b0_re = +i10_re+i20_im;
715  b0_im = +i10_im-i20_re;
716  b1_re = +i11_re+i21_im;
717  b1_im = +i11_im-i21_re;
718  b2_re = +i12_re+i22_im;
719  b2_im = +i12_im-i22_re;
720 
721 #ifdef MULTI_GPU
722  } else {
723 
724  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
725 
726  // read half spinor from device memory
727  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
728 
729  a0_re = i00_re; a0_im = i00_im;
730  a1_re = i01_re; a1_im = i01_im;
731  a2_re = i02_re; a2_im = i02_im;
732  b0_re = i10_re; b0_im = i10_im;
733  b1_re = i11_re; b1_im = i11_im;
734  b2_re = i12_re; b2_im = i12_im;
735 
736  }
737 #endif // MULTI_GPU
738 
739  // read gauge matrix from device memory
740  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
741 
742  // reconstruct gauge matrix
744 
745  // multiply row 0
747  A0_re += g00_re * a0_re;
748  A0_re -= g00_im * a0_im;
749  A0_re += g01_re * a1_re;
750  A0_re -= g01_im * a1_im;
751  A0_re += g02_re * a2_re;
752  A0_re -= g02_im * a2_im;
754  A0_im += g00_re * a0_im;
755  A0_im += g00_im * a0_re;
756  A0_im += g01_re * a1_im;
757  A0_im += g01_im * a1_re;
758  A0_im += g02_re * a2_im;
759  A0_im += g02_im * a2_re;
761  B0_re += g00_re * b0_re;
762  B0_re -= g00_im * b0_im;
763  B0_re += g01_re * b1_re;
764  B0_re -= g01_im * b1_im;
765  B0_re += g02_re * b2_re;
766  B0_re -= g02_im * b2_im;
768  B0_im += g00_re * b0_im;
769  B0_im += g00_im * b0_re;
770  B0_im += g01_re * b1_im;
771  B0_im += g01_im * b1_re;
772  B0_im += g02_re * b2_im;
773  B0_im += g02_im * b2_re;
774 
775  // multiply row 1
777  A1_re += g10_re * a0_re;
778  A1_re -= g10_im * a0_im;
779  A1_re += g11_re * a1_re;
780  A1_re -= g11_im * a1_im;
781  A1_re += g12_re * a2_re;
782  A1_re -= g12_im * a2_im;
784  A1_im += g10_re * a0_im;
785  A1_im += g10_im * a0_re;
786  A1_im += g11_re * a1_im;
787  A1_im += g11_im * a1_re;
788  A1_im += g12_re * a2_im;
789  A1_im += g12_im * a2_re;
791  B1_re += g10_re * b0_re;
792  B1_re -= g10_im * b0_im;
793  B1_re += g11_re * b1_re;
794  B1_re -= g11_im * b1_im;
795  B1_re += g12_re * b2_re;
796  B1_re -= g12_im * b2_im;
798  B1_im += g10_re * b0_im;
799  B1_im += g10_im * b0_re;
800  B1_im += g11_re * b1_im;
801  B1_im += g11_im * b1_re;
802  B1_im += g12_re * b2_im;
803  B1_im += g12_im * b2_re;
804 
805  // multiply row 2
807  A2_re += g20_re * a0_re;
808  A2_re -= g20_im * a0_im;
809  A2_re += g21_re * a1_re;
810  A2_re -= g21_im * a1_im;
811  A2_re += g22_re * a2_re;
812  A2_re -= g22_im * a2_im;
814  A2_im += g20_re * a0_im;
815  A2_im += g20_im * a0_re;
816  A2_im += g21_re * a1_im;
817  A2_im += g21_im * a1_re;
818  A2_im += g22_re * a2_im;
819  A2_im += g22_im * a2_re;
821  B2_re += g20_re * b0_re;
822  B2_re -= g20_im * b0_im;
823  B2_re += g21_re * b1_re;
824  B2_re -= g21_im * b1_im;
825  B2_re += g22_re * b2_re;
826  B2_re -= g22_im * b2_im;
828  B2_im += g20_re * b0_im;
829  B2_im += g20_im * b0_re;
830  B2_im += g21_re * b1_im;
831  B2_im += g21_im * b1_re;
832  B2_im += g22_re * b2_im;
833  B2_im += g22_im * b2_re;
834 
835  o00_re += A0_re;
836  o00_im += A0_im;
837  o10_re += B0_re;
838  o10_im += B0_im;
839  o20_re -= B0_im;
840  o20_im += B0_re;
841  o30_re -= A0_im;
842  o30_im += A0_re;
843 
844  o01_re += A1_re;
845  o01_im += A1_im;
846  o11_re += B1_re;
847  o11_im += B1_im;
848  o21_re -= B1_im;
849  o21_im += B1_re;
850  o31_re -= A1_im;
851  o31_im += A1_re;
852 
853  o02_re += A2_re;
854  o02_im += A2_im;
855  o12_re += B2_re;
856  o12_im += B2_im;
857  o22_re -= B2_im;
858  o22_im += B2_re;
859  o32_re -= A2_im;
860  o32_im += A2_re;
861 
862 }
863 
864 #ifdef MULTI_GPU
865 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
866  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
867 #endif
868 {
869  // Projector P0+
870  // 1 0 0 i
871  // 0 1 i 0
872  // 0 -i 1 0
873  // -i 0 0 1
874 
875 #ifdef MULTI_GPU
876  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
877  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
878 #else
879  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
880 #endif
881 
882 #ifdef MULTI_GPU
883  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
884 #else
885  const int ga_idx = sp_idx;
886 #endif
887 
894 
895 #ifdef MULTI_GPU
896  if (kernel_type == INTERIOR_KERNEL) {
897 #endif
898 
899  // read spinor from device memory
900  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
901 
902  // project spinor into half spinors
903  a0_re = +i00_re-i30_im;
904  a0_im = +i00_im+i30_re;
905  a1_re = +i01_re-i31_im;
906  a1_im = +i01_im+i31_re;
907  a2_re = +i02_re-i32_im;
908  a2_im = +i02_im+i32_re;
909  b0_re = +i10_re-i20_im;
910  b0_im = +i10_im+i20_re;
911  b1_re = +i11_re-i21_im;
912  b1_im = +i11_im+i21_re;
913  b2_re = +i12_re-i22_im;
914  b2_im = +i12_im+i22_re;
915 
916 #ifdef MULTI_GPU
917  } else {
918 
919  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
920 
921  // read half spinor from device memory
922  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
923 
924  a0_re = i00_re; a0_im = i00_im;
925  a1_re = i01_re; a1_im = i01_im;
926  a2_re = i02_re; a2_im = i02_im;
927  b0_re = i10_re; b0_im = i10_im;
928  b1_re = i11_re; b1_im = i11_im;
929  b2_re = i12_re; b2_im = i12_im;
930 
931  }
932 #endif // MULTI_GPU
933 
934  // read gauge matrix from device memory
935  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
936 
937  // reconstruct gauge matrix
939 
940  // multiply row 0
941  spinorFloat A0_re = 0;
942  A0_re += gT00_re * a0_re;
943  A0_re -= gT00_im * a0_im;
944  A0_re += gT01_re * a1_re;
945  A0_re -= gT01_im * a1_im;
946  A0_re += gT02_re * a2_re;
947  A0_re -= gT02_im * a2_im;
948  spinorFloat A0_im = 0;
949  A0_im += gT00_re * a0_im;
950  A0_im += gT00_im * a0_re;
951  A0_im += gT01_re * a1_im;
952  A0_im += gT01_im * a1_re;
953  A0_im += gT02_re * a2_im;
954  A0_im += gT02_im * a2_re;
955  spinorFloat B0_re = 0;
956  B0_re += gT00_re * b0_re;
957  B0_re -= gT00_im * b0_im;
958  B0_re += gT01_re * b1_re;
959  B0_re -= gT01_im * b1_im;
960  B0_re += gT02_re * b2_re;
961  B0_re -= gT02_im * b2_im;
962  spinorFloat B0_im = 0;
963  B0_im += gT00_re * b0_im;
964  B0_im += gT00_im * b0_re;
965  B0_im += gT01_re * b1_im;
966  B0_im += gT01_im * b1_re;
967  B0_im += gT02_re * b2_im;
968  B0_im += gT02_im * b2_re;
969 
970  // multiply row 1
971  spinorFloat A1_re = 0;
972  A1_re += gT10_re * a0_re;
973  A1_re -= gT10_im * a0_im;
974  A1_re += gT11_re * a1_re;
975  A1_re -= gT11_im * a1_im;
976  A1_re += gT12_re * a2_re;
977  A1_re -= gT12_im * a2_im;
978  spinorFloat A1_im = 0;
979  A1_im += gT10_re * a0_im;
980  A1_im += gT10_im * a0_re;
981  A1_im += gT11_re * a1_im;
982  A1_im += gT11_im * a1_re;
983  A1_im += gT12_re * a2_im;
984  A1_im += gT12_im * a2_re;
985  spinorFloat B1_re = 0;
986  B1_re += gT10_re * b0_re;
987  B1_re -= gT10_im * b0_im;
988  B1_re += gT11_re * b1_re;
989  B1_re -= gT11_im * b1_im;
990  B1_re += gT12_re * b2_re;
991  B1_re -= gT12_im * b2_im;
992  spinorFloat B1_im = 0;
993  B1_im += gT10_re * b0_im;
994  B1_im += gT10_im * b0_re;
995  B1_im += gT11_re * b1_im;
996  B1_im += gT11_im * b1_re;
997  B1_im += gT12_re * b2_im;
998  B1_im += gT12_im * b2_re;
999 
1000  // multiply row 2
1001  spinorFloat A2_re = 0;
1002  A2_re += gT20_re * a0_re;
1003  A2_re -= gT20_im * a0_im;
1004  A2_re += gT21_re * a1_re;
1005  A2_re -= gT21_im * a1_im;
1006  A2_re += gT22_re * a2_re;
1007  A2_re -= gT22_im * a2_im;
1008  spinorFloat A2_im = 0;
1009  A2_im += gT20_re * a0_im;
1010  A2_im += gT20_im * a0_re;
1011  A2_im += gT21_re * a1_im;
1012  A2_im += gT21_im * a1_re;
1013  A2_im += gT22_re * a2_im;
1014  A2_im += gT22_im * a2_re;
1015  spinorFloat B2_re = 0;
1016  B2_re += gT20_re * b0_re;
1017  B2_re -= gT20_im * b0_im;
1018  B2_re += gT21_re * b1_re;
1019  B2_re -= gT21_im * b1_im;
1020  B2_re += gT22_re * b2_re;
1021  B2_re -= gT22_im * b2_im;
1022  spinorFloat B2_im = 0;
1023  B2_im += gT20_re * b0_im;
1024  B2_im += gT20_im * b0_re;
1025  B2_im += gT21_re * b1_im;
1026  B2_im += gT21_im * b1_re;
1027  B2_im += gT22_re * b2_im;
1028  B2_im += gT22_im * b2_re;
1029 
1030  o00_re += A0_re;
1031  o00_im += A0_im;
1032  o10_re += B0_re;
1033  o10_im += B0_im;
1034  o20_re += B0_im;
1035  o20_im -= B0_re;
1036  o30_re += A0_im;
1037  o30_im -= A0_re;
1038 
1039  o01_re += A1_re;
1040  o01_im += A1_im;
1041  o11_re += B1_re;
1042  o11_im += B1_im;
1043  o21_re += B1_im;
1044  o21_im -= B1_re;
1045  o31_re += A1_im;
1046  o31_im -= A1_re;
1047 
1048  o02_re += A2_re;
1049  o02_im += A2_im;
1050  o12_re += B2_re;
1051  o12_im += B2_im;
1052  o22_re += B2_im;
1053  o22_im -= B2_re;
1054  o32_re += A2_im;
1055  o32_im -= A2_re;
1056 
1057 }
1058 
1059 #ifdef MULTI_GPU
1060 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1062 #endif
1063 {
1064  // Projector P1-
1065  // 1 0 0 -1
1066  // 0 1 1 0
1067  // 0 1 1 0
1068  // -1 0 0 1
1069 
1070 #ifdef MULTI_GPU
1071  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1072  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1073 #else
1074  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1075 #endif
1076 
1077  const int ga_idx = sid;
1078 
1085 
1086 #ifdef MULTI_GPU
1087  if (kernel_type == INTERIOR_KERNEL) {
1088 #endif
1089 
1090  // read spinor from device memory
1091  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1092 
1093  // project spinor into half spinors
1094  a0_re = +i00_re-i30_re;
1095  a0_im = +i00_im-i30_im;
1096  a1_re = +i01_re-i31_re;
1097  a1_im = +i01_im-i31_im;
1098  a2_re = +i02_re-i32_re;
1099  a2_im = +i02_im-i32_im;
1100  b0_re = +i10_re+i20_re;
1101  b0_im = +i10_im+i20_im;
1102  b1_re = +i11_re+i21_re;
1103  b1_im = +i11_im+i21_im;
1104  b2_re = +i12_re+i22_re;
1105  b2_im = +i12_im+i22_im;
1106 
1107 #ifdef MULTI_GPU
1108  } else {
1109 
1110  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1111 
1112  // read half spinor from device memory
1113  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1114 
1115  a0_re = i00_re; a0_im = i00_im;
1116  a1_re = i01_re; a1_im = i01_im;
1117  a2_re = i02_re; a2_im = i02_im;
1118  b0_re = i10_re; b0_im = i10_im;
1119  b1_re = i11_re; b1_im = i11_im;
1120  b2_re = i12_re; b2_im = i12_im;
1121 
1122  }
1123 #endif // MULTI_GPU
1124 
1125  // read gauge matrix from device memory
1126  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1127 
1128  // reconstruct gauge matrix
1130 
1131  // multiply row 0
1132  spinorFloat A0_re = 0;
1133  A0_re += g00_re * a0_re;
1134  A0_re -= g00_im * a0_im;
1135  A0_re += g01_re * a1_re;
1136  A0_re -= g01_im * a1_im;
1137  A0_re += g02_re * a2_re;
1138  A0_re -= g02_im * a2_im;
1139  spinorFloat A0_im = 0;
1140  A0_im += g00_re * a0_im;
1141  A0_im += g00_im * a0_re;
1142  A0_im += g01_re * a1_im;
1143  A0_im += g01_im * a1_re;
1144  A0_im += g02_re * a2_im;
1145  A0_im += g02_im * a2_re;
1146  spinorFloat B0_re = 0;
1147  B0_re += g00_re * b0_re;
1148  B0_re -= g00_im * b0_im;
1149  B0_re += g01_re * b1_re;
1150  B0_re -= g01_im * b1_im;
1151  B0_re += g02_re * b2_re;
1152  B0_re -= g02_im * b2_im;
1153  spinorFloat B0_im = 0;
1154  B0_im += g00_re * b0_im;
1155  B0_im += g00_im * b0_re;
1156  B0_im += g01_re * b1_im;
1157  B0_im += g01_im * b1_re;
1158  B0_im += g02_re * b2_im;
1159  B0_im += g02_im * b2_re;
1160 
1161  // multiply row 1
1162  spinorFloat A1_re = 0;
1163  A1_re += g10_re * a0_re;
1164  A1_re -= g10_im * a0_im;
1165  A1_re += g11_re * a1_re;
1166  A1_re -= g11_im * a1_im;
1167  A1_re += g12_re * a2_re;
1168  A1_re -= g12_im * a2_im;
1169  spinorFloat A1_im = 0;
1170  A1_im += g10_re * a0_im;
1171  A1_im += g10_im * a0_re;
1172  A1_im += g11_re * a1_im;
1173  A1_im += g11_im * a1_re;
1174  A1_im += g12_re * a2_im;
1175  A1_im += g12_im * a2_re;
1176  spinorFloat B1_re = 0;
1177  B1_re += g10_re * b0_re;
1178  B1_re -= g10_im * b0_im;
1179  B1_re += g11_re * b1_re;
1180  B1_re -= g11_im * b1_im;
1181  B1_re += g12_re * b2_re;
1182  B1_re -= g12_im * b2_im;
1183  spinorFloat B1_im = 0;
1184  B1_im += g10_re * b0_im;
1185  B1_im += g10_im * b0_re;
1186  B1_im += g11_re * b1_im;
1187  B1_im += g11_im * b1_re;
1188  B1_im += g12_re * b2_im;
1189  B1_im += g12_im * b2_re;
1190 
1191  // multiply row 2
1192  spinorFloat A2_re = 0;
1193  A2_re += g20_re * a0_re;
1194  A2_re -= g20_im * a0_im;
1195  A2_re += g21_re * a1_re;
1196  A2_re -= g21_im * a1_im;
1197  A2_re += g22_re * a2_re;
1198  A2_re -= g22_im * a2_im;
1199  spinorFloat A2_im = 0;
1200  A2_im += g20_re * a0_im;
1201  A2_im += g20_im * a0_re;
1202  A2_im += g21_re * a1_im;
1203  A2_im += g21_im * a1_re;
1204  A2_im += g22_re * a2_im;
1205  A2_im += g22_im * a2_re;
1206  spinorFloat B2_re = 0;
1207  B2_re += g20_re * b0_re;
1208  B2_re -= g20_im * b0_im;
1209  B2_re += g21_re * b1_re;
1210  B2_re -= g21_im * b1_im;
1211  B2_re += g22_re * b2_re;
1212  B2_re -= g22_im * b2_im;
1213  spinorFloat B2_im = 0;
1214  B2_im += g20_re * b0_im;
1215  B2_im += g20_im * b0_re;
1216  B2_im += g21_re * b1_im;
1217  B2_im += g21_im * b1_re;
1218  B2_im += g22_re * b2_im;
1219  B2_im += g22_im * b2_re;
1220 
1221  o00_re += A0_re;
1222  o00_im += A0_im;
1223  o10_re += B0_re;
1224  o10_im += B0_im;
1225  o20_re += B0_re;
1226  o20_im += B0_im;
1227  o30_re -= A0_re;
1228  o30_im -= A0_im;
1229 
1230  o01_re += A1_re;
1231  o01_im += A1_im;
1232  o11_re += B1_re;
1233  o11_im += B1_im;
1234  o21_re += B1_re;
1235  o21_im += B1_im;
1236  o31_re -= A1_re;
1237  o31_im -= A1_im;
1238 
1239  o02_re += A2_re;
1240  o02_im += A2_im;
1241  o12_re += B2_re;
1242  o12_im += B2_im;
1243  o22_re += B2_re;
1244  o22_im += B2_im;
1245  o32_re -= A2_re;
1246  o32_im -= A2_im;
1247 
1248 }
1249 
1250 #ifdef MULTI_GPU
1251 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1252  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1253 #endif
1254 {
1255  // Projector P1+
1256  // 1 0 0 1
1257  // 0 1 -1 0
1258  // 0 -1 1 0
1259  // 1 0 0 1
1260 
1261 #ifdef MULTI_GPU
1262  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1263  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1264 #else
1265  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1266 #endif
1267 
1268 #ifdef MULTI_GPU
1269  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1270 #else
1271  const int ga_idx = sp_idx;
1272 #endif
1273 
1280 
1281 #ifdef MULTI_GPU
1282  if (kernel_type == INTERIOR_KERNEL) {
1283 #endif
1284 
1285  // read spinor from device memory
1286  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1287 
1288  // project spinor into half spinors
1289  a0_re = +i00_re+i30_re;
1290  a0_im = +i00_im+i30_im;
1291  a1_re = +i01_re+i31_re;
1292  a1_im = +i01_im+i31_im;
1293  a2_re = +i02_re+i32_re;
1294  a2_im = +i02_im+i32_im;
1295  b0_re = +i10_re-i20_re;
1296  b0_im = +i10_im-i20_im;
1297  b1_re = +i11_re-i21_re;
1298  b1_im = +i11_im-i21_im;
1299  b2_re = +i12_re-i22_re;
1300  b2_im = +i12_im-i22_im;
1301 
1302 #ifdef MULTI_GPU
1303  } else {
1304 
1305  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1306 
1307  // read half spinor from device memory
1308  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1309 
1310  a0_re = i00_re; a0_im = i00_im;
1311  a1_re = i01_re; a1_im = i01_im;
1312  a2_re = i02_re; a2_im = i02_im;
1313  b0_re = i10_re; b0_im = i10_im;
1314  b1_re = i11_re; b1_im = i11_im;
1315  b2_re = i12_re; b2_im = i12_im;
1316 
1317  }
1318 #endif // MULTI_GPU
1319 
1320  // read gauge matrix from device memory
1321  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1322 
1323  // reconstruct gauge matrix
1324  RECONSTRUCT_GAUGE_MATRIX(3);
1325 
1326  // multiply row 0
1327  spinorFloat A0_re = 0;
1328  A0_re += gT00_re * a0_re;
1329  A0_re -= gT00_im * a0_im;
1330  A0_re += gT01_re * a1_re;
1331  A0_re -= gT01_im * a1_im;
1332  A0_re += gT02_re * a2_re;
1333  A0_re -= gT02_im * a2_im;
1334  spinorFloat A0_im = 0;
1335  A0_im += gT00_re * a0_im;
1336  A0_im += gT00_im * a0_re;
1337  A0_im += gT01_re * a1_im;
1338  A0_im += gT01_im * a1_re;
1339  A0_im += gT02_re * a2_im;
1340  A0_im += gT02_im * a2_re;
1341  spinorFloat B0_re = 0;
1342  B0_re += gT00_re * b0_re;
1343  B0_re -= gT00_im * b0_im;
1344  B0_re += gT01_re * b1_re;
1345  B0_re -= gT01_im * b1_im;
1346  B0_re += gT02_re * b2_re;
1347  B0_re -= gT02_im * b2_im;
1348  spinorFloat B0_im = 0;
1349  B0_im += gT00_re * b0_im;
1350  B0_im += gT00_im * b0_re;
1351  B0_im += gT01_re * b1_im;
1352  B0_im += gT01_im * b1_re;
1353  B0_im += gT02_re * b2_im;
1354  B0_im += gT02_im * b2_re;
1355 
1356  // multiply row 1
1357  spinorFloat A1_re = 0;
1358  A1_re += gT10_re * a0_re;
1359  A1_re -= gT10_im * a0_im;
1360  A1_re += gT11_re * a1_re;
1361  A1_re -= gT11_im * a1_im;
1362  A1_re += gT12_re * a2_re;
1363  A1_re -= gT12_im * a2_im;
1364  spinorFloat A1_im = 0;
1365  A1_im += gT10_re * a0_im;
1366  A1_im += gT10_im * a0_re;
1367  A1_im += gT11_re * a1_im;
1368  A1_im += gT11_im * a1_re;
1369  A1_im += gT12_re * a2_im;
1370  A1_im += gT12_im * a2_re;
1371  spinorFloat B1_re = 0;
1372  B1_re += gT10_re * b0_re;
1373  B1_re -= gT10_im * b0_im;
1374  B1_re += gT11_re * b1_re;
1375  B1_re -= gT11_im * b1_im;
1376  B1_re += gT12_re * b2_re;
1377  B1_re -= gT12_im * b2_im;
1378  spinorFloat B1_im = 0;
1379  B1_im += gT10_re * b0_im;
1380  B1_im += gT10_im * b0_re;
1381  B1_im += gT11_re * b1_im;
1382  B1_im += gT11_im * b1_re;
1383  B1_im += gT12_re * b2_im;
1384  B1_im += gT12_im * b2_re;
1385 
1386  // multiply row 2
1387  spinorFloat A2_re = 0;
1388  A2_re += gT20_re * a0_re;
1389  A2_re -= gT20_im * a0_im;
1390  A2_re += gT21_re * a1_re;
1391  A2_re -= gT21_im * a1_im;
1392  A2_re += gT22_re * a2_re;
1393  A2_re -= gT22_im * a2_im;
1394  spinorFloat A2_im = 0;
1395  A2_im += gT20_re * a0_im;
1396  A2_im += gT20_im * a0_re;
1397  A2_im += gT21_re * a1_im;
1398  A2_im += gT21_im * a1_re;
1399  A2_im += gT22_re * a2_im;
1400  A2_im += gT22_im * a2_re;
1401  spinorFloat B2_re = 0;
1402  B2_re += gT20_re * b0_re;
1403  B2_re -= gT20_im * b0_im;
1404  B2_re += gT21_re * b1_re;
1405  B2_re -= gT21_im * b1_im;
1406  B2_re += gT22_re * b2_re;
1407  B2_re -= gT22_im * b2_im;
1408  spinorFloat B2_im = 0;
1409  B2_im += gT20_re * b0_im;
1410  B2_im += gT20_im * b0_re;
1411  B2_im += gT21_re * b1_im;
1412  B2_im += gT21_im * b1_re;
1413  B2_im += gT22_re * b2_im;
1414  B2_im += gT22_im * b2_re;
1415 
1416  o00_re += A0_re;
1417  o00_im += A0_im;
1418  o10_re += B0_re;
1419  o10_im += B0_im;
1420  o20_re -= B0_re;
1421  o20_im -= B0_im;
1422  o30_re += A0_re;
1423  o30_im += A0_im;
1424 
1425  o01_re += A1_re;
1426  o01_im += A1_im;
1427  o11_re += B1_re;
1428  o11_im += B1_im;
1429  o21_re -= B1_re;
1430  o21_im -= B1_im;
1431  o31_re += A1_re;
1432  o31_im += A1_im;
1433 
1434  o02_re += A2_re;
1435  o02_im += A2_im;
1436  o12_re += B2_re;
1437  o12_im += B2_im;
1438  o22_re -= B2_re;
1439  o22_im -= B2_im;
1440  o32_re += A2_re;
1441  o32_im += A2_im;
1442 
1443 }
1444 
1445 #ifdef MULTI_GPU
1446 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1447  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1448 #endif
1449 {
1450  // Projector P2-
1451  // 1 0 -i 0
1452  // 0 1 0 i
1453  // i 0 1 0
1454  // 0 -i 0 1
1455 
1456 #ifdef MULTI_GPU
1457  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1458  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1459 #else
1460  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1461 #endif
1462 
1463  const int ga_idx = sid;
1464 
1465  spinorFloat a0_re, a0_im;
1466  spinorFloat a1_re, a1_im;
1467  spinorFloat a2_re, a2_im;
1468  spinorFloat b0_re, b0_im;
1469  spinorFloat b1_re, b1_im;
1470  spinorFloat b2_re, b2_im;
1471 
1472 #ifdef MULTI_GPU
1473  if (kernel_type == INTERIOR_KERNEL) {
1474 #endif
1475 
1476  // read spinor from device memory
1477  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1478 
1479  // project spinor into half spinors
1480  a0_re = +i00_re+i20_im;
1481  a0_im = +i00_im-i20_re;
1482  a1_re = +i01_re+i21_im;
1483  a1_im = +i01_im-i21_re;
1484  a2_re = +i02_re+i22_im;
1485  a2_im = +i02_im-i22_re;
1486  b0_re = +i10_re-i30_im;
1487  b0_im = +i10_im+i30_re;
1488  b1_re = +i11_re-i31_im;
1489  b1_im = +i11_im+i31_re;
1490  b2_re = +i12_re-i32_im;
1491  b2_im = +i12_im+i32_re;
1492 
1493 #ifdef MULTI_GPU
1494  } else {
1495 
1496  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1497 
1498  // read half spinor from device memory
1499  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1500 
1501  a0_re = i00_re; a0_im = i00_im;
1502  a1_re = i01_re; a1_im = i01_im;
1503  a2_re = i02_re; a2_im = i02_im;
1504  b0_re = i10_re; b0_im = i10_im;
1505  b1_re = i11_re; b1_im = i11_im;
1506  b2_re = i12_re; b2_im = i12_im;
1507 
1508  }
1509 #endif // MULTI_GPU
1510 
1511  // read gauge matrix from device memory
1512  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1513 
1514  // reconstruct gauge matrix
1515  RECONSTRUCT_GAUGE_MATRIX(4);
1516 
1517  // multiply row 0
1518  spinorFloat A0_re = 0;
1519  A0_re += g00_re * a0_re;
1520  A0_re -= g00_im * a0_im;
1521  A0_re += g01_re * a1_re;
1522  A0_re -= g01_im * a1_im;
1523  A0_re += g02_re * a2_re;
1524  A0_re -= g02_im * a2_im;
1525  spinorFloat A0_im = 0;
1526  A0_im += g00_re * a0_im;
1527  A0_im += g00_im * a0_re;
1528  A0_im += g01_re * a1_im;
1529  A0_im += g01_im * a1_re;
1530  A0_im += g02_re * a2_im;
1531  A0_im += g02_im * a2_re;
1532  spinorFloat B0_re = 0;
1533  B0_re += g00_re * b0_re;
1534  B0_re -= g00_im * b0_im;
1535  B0_re += g01_re * b1_re;
1536  B0_re -= g01_im * b1_im;
1537  B0_re += g02_re * b2_re;
1538  B0_re -= g02_im * b2_im;
1539  spinorFloat B0_im = 0;
1540  B0_im += g00_re * b0_im;
1541  B0_im += g00_im * b0_re;
1542  B0_im += g01_re * b1_im;
1543  B0_im += g01_im * b1_re;
1544  B0_im += g02_re * b2_im;
1545  B0_im += g02_im * b2_re;
1546 
1547  // multiply row 1
1548  spinorFloat A1_re = 0;
1549  A1_re += g10_re * a0_re;
1550  A1_re -= g10_im * a0_im;
1551  A1_re += g11_re * a1_re;
1552  A1_re -= g11_im * a1_im;
1553  A1_re += g12_re * a2_re;
1554  A1_re -= g12_im * a2_im;
1555  spinorFloat A1_im = 0;
1556  A1_im += g10_re * a0_im;
1557  A1_im += g10_im * a0_re;
1558  A1_im += g11_re * a1_im;
1559  A1_im += g11_im * a1_re;
1560  A1_im += g12_re * a2_im;
1561  A1_im += g12_im * a2_re;
1562  spinorFloat B1_re = 0;
1563  B1_re += g10_re * b0_re;
1564  B1_re -= g10_im * b0_im;
1565  B1_re += g11_re * b1_re;
1566  B1_re -= g11_im * b1_im;
1567  B1_re += g12_re * b2_re;
1568  B1_re -= g12_im * b2_im;
1569  spinorFloat B1_im = 0;
1570  B1_im += g10_re * b0_im;
1571  B1_im += g10_im * b0_re;
1572  B1_im += g11_re * b1_im;
1573  B1_im += g11_im * b1_re;
1574  B1_im += g12_re * b2_im;
1575  B1_im += g12_im * b2_re;
1576 
1577  // multiply row 2
1578  spinorFloat A2_re = 0;
1579  A2_re += g20_re * a0_re;
1580  A2_re -= g20_im * a0_im;
1581  A2_re += g21_re * a1_re;
1582  A2_re -= g21_im * a1_im;
1583  A2_re += g22_re * a2_re;
1584  A2_re -= g22_im * a2_im;
1585  spinorFloat A2_im = 0;
1586  A2_im += g20_re * a0_im;
1587  A2_im += g20_im * a0_re;
1588  A2_im += g21_re * a1_im;
1589  A2_im += g21_im * a1_re;
1590  A2_im += g22_re * a2_im;
1591  A2_im += g22_im * a2_re;
1592  spinorFloat B2_re = 0;
1593  B2_re += g20_re * b0_re;
1594  B2_re -= g20_im * b0_im;
1595  B2_re += g21_re * b1_re;
1596  B2_re -= g21_im * b1_im;
1597  B2_re += g22_re * b2_re;
1598  B2_re -= g22_im * b2_im;
1599  spinorFloat B2_im = 0;
1600  B2_im += g20_re * b0_im;
1601  B2_im += g20_im * b0_re;
1602  B2_im += g21_re * b1_im;
1603  B2_im += g21_im * b1_re;
1604  B2_im += g22_re * b2_im;
1605  B2_im += g22_im * b2_re;
1606 
1607  o00_re += A0_re;
1608  o00_im += A0_im;
1609  o10_re += B0_re;
1610  o10_im += B0_im;
1611  o20_re -= A0_im;
1612  o20_im += A0_re;
1613  o30_re += B0_im;
1614  o30_im -= B0_re;
1615 
1616  o01_re += A1_re;
1617  o01_im += A1_im;
1618  o11_re += B1_re;
1619  o11_im += B1_im;
1620  o21_re -= A1_im;
1621  o21_im += A1_re;
1622  o31_re += B1_im;
1623  o31_im -= B1_re;
1624 
1625  o02_re += A2_re;
1626  o02_im += A2_im;
1627  o12_re += B2_re;
1628  o12_im += B2_im;
1629  o22_re -= A2_im;
1630  o22_im += A2_re;
1631  o32_re += B2_im;
1632  o32_im -= B2_re;
1633 
1634 }
1635 
1636 #ifdef MULTI_GPU
1637 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1638  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1639 #endif
1640 {
1641  // Projector P2+
1642  // 1 0 i 0
1643  // 0 1 0 -i
1644  // -i 0 1 0
1645  // 0 i 0 1
1646 
1647 #ifdef MULTI_GPU
1648  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1649  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1650 #else
1651  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1652 #endif
1653 
1654 #ifdef MULTI_GPU
1655  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1656 #else
1657  const int ga_idx = sp_idx;
1658 #endif
1659 
1660  spinorFloat a0_re, a0_im;
1661  spinorFloat a1_re, a1_im;
1662  spinorFloat a2_re, a2_im;
1663  spinorFloat b0_re, b0_im;
1664  spinorFloat b1_re, b1_im;
1665  spinorFloat b2_re, b2_im;
1666 
1667 #ifdef MULTI_GPU
1668  if (kernel_type == INTERIOR_KERNEL) {
1669 #endif
1670 
1671  // read spinor from device memory
1672  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1673 
1674  // project spinor into half spinors
1675  a0_re = +i00_re-i20_im;
1676  a0_im = +i00_im+i20_re;
1677  a1_re = +i01_re-i21_im;
1678  a1_im = +i01_im+i21_re;
1679  a2_re = +i02_re-i22_im;
1680  a2_im = +i02_im+i22_re;
1681  b0_re = +i10_re+i30_im;
1682  b0_im = +i10_im-i30_re;
1683  b1_re = +i11_re+i31_im;
1684  b1_im = +i11_im-i31_re;
1685  b2_re = +i12_re+i32_im;
1686  b2_im = +i12_im-i32_re;
1687 
1688 #ifdef MULTI_GPU
1689  } else {
1690 
1691  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1692 
1693  // read half spinor from device memory
1694  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1695 
1696  a0_re = i00_re; a0_im = i00_im;
1697  a1_re = i01_re; a1_im = i01_im;
1698  a2_re = i02_re; a2_im = i02_im;
1699  b0_re = i10_re; b0_im = i10_im;
1700  b1_re = i11_re; b1_im = i11_im;
1701  b2_re = i12_re; b2_im = i12_im;
1702 
1703  }
1704 #endif // MULTI_GPU
1705 
1706  // read gauge matrix from device memory
1707  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1708 
1709  // reconstruct gauge matrix
1710  RECONSTRUCT_GAUGE_MATRIX(5);
1711 
1712  // multiply row 0
1713  spinorFloat A0_re = 0;
1714  A0_re += gT00_re * a0_re;
1715  A0_re -= gT00_im * a0_im;
1716  A0_re += gT01_re * a1_re;
1717  A0_re -= gT01_im * a1_im;
1718  A0_re += gT02_re * a2_re;
1719  A0_re -= gT02_im * a2_im;
1720  spinorFloat A0_im = 0;
1721  A0_im += gT00_re * a0_im;
1722  A0_im += gT00_im * a0_re;
1723  A0_im += gT01_re * a1_im;
1724  A0_im += gT01_im * a1_re;
1725  A0_im += gT02_re * a2_im;
1726  A0_im += gT02_im * a2_re;
1727  spinorFloat B0_re = 0;
1728  B0_re += gT00_re * b0_re;
1729  B0_re -= gT00_im * b0_im;
1730  B0_re += gT01_re * b1_re;
1731  B0_re -= gT01_im * b1_im;
1732  B0_re += gT02_re * b2_re;
1733  B0_re -= gT02_im * b2_im;
1734  spinorFloat B0_im = 0;
1735  B0_im += gT00_re * b0_im;
1736  B0_im += gT00_im * b0_re;
1737  B0_im += gT01_re * b1_im;
1738  B0_im += gT01_im * b1_re;
1739  B0_im += gT02_re * b2_im;
1740  B0_im += gT02_im * b2_re;
1741 
1742  // multiply row 1
1743  spinorFloat A1_re = 0;
1744  A1_re += gT10_re * a0_re;
1745  A1_re -= gT10_im * a0_im;
1746  A1_re += gT11_re * a1_re;
1747  A1_re -= gT11_im * a1_im;
1748  A1_re += gT12_re * a2_re;
1749  A1_re -= gT12_im * a2_im;
1750  spinorFloat A1_im = 0;
1751  A1_im += gT10_re * a0_im;
1752  A1_im += gT10_im * a0_re;
1753  A1_im += gT11_re * a1_im;
1754  A1_im += gT11_im * a1_re;
1755  A1_im += gT12_re * a2_im;
1756  A1_im += gT12_im * a2_re;
1757  spinorFloat B1_re = 0;
1758  B1_re += gT10_re * b0_re;
1759  B1_re -= gT10_im * b0_im;
1760  B1_re += gT11_re * b1_re;
1761  B1_re -= gT11_im * b1_im;
1762  B1_re += gT12_re * b2_re;
1763  B1_re -= gT12_im * b2_im;
1764  spinorFloat B1_im = 0;
1765  B1_im += gT10_re * b0_im;
1766  B1_im += gT10_im * b0_re;
1767  B1_im += gT11_re * b1_im;
1768  B1_im += gT11_im * b1_re;
1769  B1_im += gT12_re * b2_im;
1770  B1_im += gT12_im * b2_re;
1771 
1772  // multiply row 2
1773  spinorFloat A2_re = 0;
1774  A2_re += gT20_re * a0_re;
1775  A2_re -= gT20_im * a0_im;
1776  A2_re += gT21_re * a1_re;
1777  A2_re -= gT21_im * a1_im;
1778  A2_re += gT22_re * a2_re;
1779  A2_re -= gT22_im * a2_im;
1780  spinorFloat A2_im = 0;
1781  A2_im += gT20_re * a0_im;
1782  A2_im += gT20_im * a0_re;
1783  A2_im += gT21_re * a1_im;
1784  A2_im += gT21_im * a1_re;
1785  A2_im += gT22_re * a2_im;
1786  A2_im += gT22_im * a2_re;
1787  spinorFloat B2_re = 0;
1788  B2_re += gT20_re * b0_re;
1789  B2_re -= gT20_im * b0_im;
1790  B2_re += gT21_re * b1_re;
1791  B2_re -= gT21_im * b1_im;
1792  B2_re += gT22_re * b2_re;
1793  B2_re -= gT22_im * b2_im;
1794  spinorFloat B2_im = 0;
1795  B2_im += gT20_re * b0_im;
1796  B2_im += gT20_im * b0_re;
1797  B2_im += gT21_re * b1_im;
1798  B2_im += gT21_im * b1_re;
1799  B2_im += gT22_re * b2_im;
1800  B2_im += gT22_im * b2_re;
1801 
1802  o00_re += A0_re;
1803  o00_im += A0_im;
1804  o10_re += B0_re;
1805  o10_im += B0_im;
1806  o20_re += A0_im;
1807  o20_im -= A0_re;
1808  o30_re -= B0_im;
1809  o30_im += B0_re;
1810 
1811  o01_re += A1_re;
1812  o01_im += A1_im;
1813  o11_re += B1_re;
1814  o11_im += B1_im;
1815  o21_re += A1_im;
1816  o21_im -= A1_re;
1817  o31_re -= B1_im;
1818  o31_im += B1_re;
1819 
1820  o02_re += A2_re;
1821  o02_im += A2_im;
1822  o12_re += B2_re;
1823  o12_im += B2_im;
1824  o22_re += A2_im;
1825  o22_im -= A2_re;
1826  o32_re -= B2_im;
1827  o32_im += B2_re;
1828 
1829 }
1830 
1831 #ifdef MULTI_GPU
1832 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1833  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1834 #endif
1835 {
1836  // Projector P3-
1837  // 0 0 0 0
1838  // 0 0 0 0
1839  // 0 0 2 0
1840  // 0 0 0 2
1841 
1842 #ifdef MULTI_GPU
1843  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1844  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1845 #else
1846  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1847 #endif
1848 
1849  const int ga_idx = sid;
1850 
1851  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1852  {
1853  spinorFloat a0_re, a0_im;
1854  spinorFloat a1_re, a1_im;
1855  spinorFloat a2_re, a2_im;
1856  spinorFloat b0_re, b0_im;
1857  spinorFloat b1_re, b1_im;
1858  spinorFloat b2_re, b2_im;
1859 
1860 #ifdef MULTI_GPU
1861  if (kernel_type == INTERIOR_KERNEL) {
1862 #endif
1863 
1864  // read spinor from device memory
1865  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1866 
1867  // project spinor into half spinors
1868  a0_re = +2*i20_re;
1869  a0_im = +2*i20_im;
1870  a1_re = +2*i21_re;
1871  a1_im = +2*i21_im;
1872  a2_re = +2*i22_re;
1873  a2_im = +2*i22_im;
1874  b0_re = +2*i30_re;
1875  b0_im = +2*i30_im;
1876  b1_re = +2*i31_re;
1877  b1_im = +2*i31_im;
1878  b2_re = +2*i32_re;
1879  b2_im = +2*i32_im;
1880 
1881 #ifdef MULTI_GPU
1882  } else {
1883 
1884  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1885  const int t_proj_scale = TPROJSCALE;
1886 
1887  // read half spinor from device memory
1888  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1889 
1890  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1891  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1892  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1893  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1894  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1895  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1896 
1897  }
1898 #endif // MULTI_GPU
1899 
1900  // identity gauge matrix
1901  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
1902  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
1903  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
1904  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
1905  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
1906  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
1907 
1908  o20_re += A0_re;
1909  o20_im += A0_im;
1910  o30_re += B0_re;
1911  o30_im += B0_im;
1912 
1913  o21_re += A1_re;
1914  o21_im += A1_im;
1915  o31_re += B1_re;
1916  o31_im += B1_im;
1917 
1918  o22_re += A2_re;
1919  o22_im += A2_im;
1920  o32_re += B2_re;
1921  o32_im += B2_im;
1922 
1923  } else {
1924  spinorFloat a0_re, a0_im;
1925  spinorFloat a1_re, a1_im;
1926  spinorFloat a2_re, a2_im;
1927  spinorFloat b0_re, b0_im;
1928  spinorFloat b1_re, b1_im;
1929  spinorFloat b2_re, b2_im;
1930 
1931 #ifdef MULTI_GPU
1932  if (kernel_type == INTERIOR_KERNEL) {
1933 #endif
1934 
1935  // read spinor from device memory
1936  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1937 
1938  // project spinor into half spinors
1939  a0_re = +2*i20_re;
1940  a0_im = +2*i20_im;
1941  a1_re = +2*i21_re;
1942  a1_im = +2*i21_im;
1943  a2_re = +2*i22_re;
1944  a2_im = +2*i22_im;
1945  b0_re = +2*i30_re;
1946  b0_im = +2*i30_im;
1947  b1_re = +2*i31_re;
1948  b1_im = +2*i31_im;
1949  b2_re = +2*i32_re;
1950  b2_im = +2*i32_im;
1951 
1952 #ifdef MULTI_GPU
1953  } else {
1954 
1955  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1956  const int t_proj_scale = TPROJSCALE;
1957 
1958  // read half spinor from device memory
1959  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1960 
1961  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1962  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1963  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1964  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1965  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1966  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1967 
1968  }
1969 #endif // MULTI_GPU
1970 
1971  // read gauge matrix from device memory
1972  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1973 
1974  // reconstruct gauge matrix
1975  RECONSTRUCT_GAUGE_MATRIX(6);
1976 
1977  // multiply row 0
1978  spinorFloat A0_re = 0;
1979  A0_re += g00_re * a0_re;
1980  A0_re -= g00_im * a0_im;
1981  A0_re += g01_re * a1_re;
1982  A0_re -= g01_im * a1_im;
1983  A0_re += g02_re * a2_re;
1984  A0_re -= g02_im * a2_im;
1985  spinorFloat A0_im = 0;
1986  A0_im += g00_re * a0_im;
1987  A0_im += g00_im * a0_re;
1988  A0_im += g01_re * a1_im;
1989  A0_im += g01_im * a1_re;
1990  A0_im += g02_re * a2_im;
1991  A0_im += g02_im * a2_re;
1992  spinorFloat B0_re = 0;
1993  B0_re += g00_re * b0_re;
1994  B0_re -= g00_im * b0_im;
1995  B0_re += g01_re * b1_re;
1996  B0_re -= g01_im * b1_im;
1997  B0_re += g02_re * b2_re;
1998  B0_re -= g02_im * b2_im;
1999  spinorFloat B0_im = 0;
2000  B0_im += g00_re * b0_im;
2001  B0_im += g00_im * b0_re;
2002  B0_im += g01_re * b1_im;
2003  B0_im += g01_im * b1_re;
2004  B0_im += g02_re * b2_im;
2005  B0_im += g02_im * b2_re;
2006 
2007  // multiply row 1
2008  spinorFloat A1_re = 0;
2009  A1_re += g10_re * a0_re;
2010  A1_re -= g10_im * a0_im;
2011  A1_re += g11_re * a1_re;
2012  A1_re -= g11_im * a1_im;
2013  A1_re += g12_re * a2_re;
2014  A1_re -= g12_im * a2_im;
2015  spinorFloat A1_im = 0;
2016  A1_im += g10_re * a0_im;
2017  A1_im += g10_im * a0_re;
2018  A1_im += g11_re * a1_im;
2019  A1_im += g11_im * a1_re;
2020  A1_im += g12_re * a2_im;
2021  A1_im += g12_im * a2_re;
2022  spinorFloat B1_re = 0;
2023  B1_re += g10_re * b0_re;
2024  B1_re -= g10_im * b0_im;
2025  B1_re += g11_re * b1_re;
2026  B1_re -= g11_im * b1_im;
2027  B1_re += g12_re * b2_re;
2028  B1_re -= g12_im * b2_im;
2029  spinorFloat B1_im = 0;
2030  B1_im += g10_re * b0_im;
2031  B1_im += g10_im * b0_re;
2032  B1_im += g11_re * b1_im;
2033  B1_im += g11_im * b1_re;
2034  B1_im += g12_re * b2_im;
2035  B1_im += g12_im * b2_re;
2036 
2037  // multiply row 2
2038  spinorFloat A2_re = 0;
2039  A2_re += g20_re * a0_re;
2040  A2_re -= g20_im * a0_im;
2041  A2_re += g21_re * a1_re;
2042  A2_re -= g21_im * a1_im;
2043  A2_re += g22_re * a2_re;
2044  A2_re -= g22_im * a2_im;
2045  spinorFloat A2_im = 0;
2046  A2_im += g20_re * a0_im;
2047  A2_im += g20_im * a0_re;
2048  A2_im += g21_re * a1_im;
2049  A2_im += g21_im * a1_re;
2050  A2_im += g22_re * a2_im;
2051  A2_im += g22_im * a2_re;
2052  spinorFloat B2_re = 0;
2053  B2_re += g20_re * b0_re;
2054  B2_re -= g20_im * b0_im;
2055  B2_re += g21_re * b1_re;
2056  B2_re -= g21_im * b1_im;
2057  B2_re += g22_re * b2_re;
2058  B2_re -= g22_im * b2_im;
2059  spinorFloat B2_im = 0;
2060  B2_im += g20_re * b0_im;
2061  B2_im += g20_im * b0_re;
2062  B2_im += g21_re * b1_im;
2063  B2_im += g21_im * b1_re;
2064  B2_im += g22_re * b2_im;
2065  B2_im += g22_im * b2_re;
2066 
2067  o20_re += A0_re;
2068  o20_im += A0_im;
2069  o30_re += B0_re;
2070  o30_im += B0_im;
2071 
2072  o21_re += A1_re;
2073  o21_im += A1_im;
2074  o31_re += B1_re;
2075  o31_im += B1_im;
2076 
2077  o22_re += A2_re;
2078  o22_im += A2_im;
2079  o32_re += B2_re;
2080  o32_im += B2_im;
2081 
2082  }
2083 }
2084 
2085 #ifdef MULTI_GPU
2086 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2087  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2088 #endif
2089 {
2090  // Projector P3+
2091  // 2 0 0 0
2092  // 0 2 0 0
2093  // 0 0 0 0
2094  // 0 0 0 0
2095 
2096 #ifdef MULTI_GPU
2097  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2098  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2099 #else
2100  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2101 #endif
2102 
2103 #ifdef MULTI_GPU
2104  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2105 #else
2106  const int ga_idx = sp_idx;
2107 #endif
2108 
2109  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2110  {
2111  spinorFloat a0_re, a0_im;
2112  spinorFloat a1_re, a1_im;
2113  spinorFloat a2_re, a2_im;
2114  spinorFloat b0_re, b0_im;
2115  spinorFloat b1_re, b1_im;
2116  spinorFloat b2_re, b2_im;
2117 
2118 #ifdef MULTI_GPU
2119  if (kernel_type == INTERIOR_KERNEL) {
2120 #endif
2121 
2122  // read spinor from device memory
2123  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2124 
2125  // project spinor into half spinors
2126  a0_re = +2*i00_re;
2127  a0_im = +2*i00_im;
2128  a1_re = +2*i01_re;
2129  a1_im = +2*i01_im;
2130  a2_re = +2*i02_re;
2131  a2_im = +2*i02_im;
2132  b0_re = +2*i10_re;
2133  b0_im = +2*i10_im;
2134  b1_re = +2*i11_re;
2135  b1_im = +2*i11_im;
2136  b2_re = +2*i12_re;
2137  b2_im = +2*i12_im;
2138 
2139 #ifdef MULTI_GPU
2140  } else {
2141 
2142  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2143  const int t_proj_scale = TPROJSCALE;
2144 
2145  // read half spinor from device memory
2146  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2147 
2148  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2149  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2150  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2151  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2152  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2153  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2154 
2155  }
2156 #endif // MULTI_GPU
2157 
2158  // identity gauge matrix
2159  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2160  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2161  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2162  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2163  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2164  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2165 
2166  o00_re += A0_re;
2167  o00_im += A0_im;
2168  o10_re += B0_re;
2169  o10_im += B0_im;
2170 
2171  o01_re += A1_re;
2172  o01_im += A1_im;
2173  o11_re += B1_re;
2174  o11_im += B1_im;
2175 
2176  o02_re += A2_re;
2177  o02_im += A2_im;
2178  o12_re += B2_re;
2179  o12_im += B2_im;
2180 
2181  } else {
2182  spinorFloat a0_re, a0_im;
2183  spinorFloat a1_re, a1_im;
2184  spinorFloat a2_re, a2_im;
2185  spinorFloat b0_re, b0_im;
2186  spinorFloat b1_re, b1_im;
2187  spinorFloat b2_re, b2_im;
2188 
2189 #ifdef MULTI_GPU
2190  if (kernel_type == INTERIOR_KERNEL) {
2191 #endif
2192 
2193  // read spinor from device memory
2194  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2195 
2196  // project spinor into half spinors
2197  a0_re = +2*i00_re;
2198  a0_im = +2*i00_im;
2199  a1_re = +2*i01_re;
2200  a1_im = +2*i01_im;
2201  a2_re = +2*i02_re;
2202  a2_im = +2*i02_im;
2203  b0_re = +2*i10_re;
2204  b0_im = +2*i10_im;
2205  b1_re = +2*i11_re;
2206  b1_im = +2*i11_im;
2207  b2_re = +2*i12_re;
2208  b2_im = +2*i12_im;
2209 
2210 #ifdef MULTI_GPU
2211  } else {
2212 
2213  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2214  const int t_proj_scale = TPROJSCALE;
2215 
2216  // read half spinor from device memory
2217  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2218 
2219  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2220  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2221  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2222  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2223  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2224  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2225 
2226  }
2227 #endif // MULTI_GPU
2228 
2229  // read gauge matrix from device memory
2230  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2231 
2232  // reconstruct gauge matrix
2233  RECONSTRUCT_GAUGE_MATRIX(7);
2234 
2235  // multiply row 0
2236  spinorFloat A0_re = 0;
2237  A0_re += gT00_re * a0_re;
2238  A0_re -= gT00_im * a0_im;
2239  A0_re += gT01_re * a1_re;
2240  A0_re -= gT01_im * a1_im;
2241  A0_re += gT02_re * a2_re;
2242  A0_re -= gT02_im * a2_im;
2243  spinorFloat A0_im = 0;
2244  A0_im += gT00_re * a0_im;
2245  A0_im += gT00_im * a0_re;
2246  A0_im += gT01_re * a1_im;
2247  A0_im += gT01_im * a1_re;
2248  A0_im += gT02_re * a2_im;
2249  A0_im += gT02_im * a2_re;
2250  spinorFloat B0_re = 0;
2251  B0_re += gT00_re * b0_re;
2252  B0_re -= gT00_im * b0_im;
2253  B0_re += gT01_re * b1_re;
2254  B0_re -= gT01_im * b1_im;
2255  B0_re += gT02_re * b2_re;
2256  B0_re -= gT02_im * b2_im;
2257  spinorFloat B0_im = 0;
2258  B0_im += gT00_re * b0_im;
2259  B0_im += gT00_im * b0_re;
2260  B0_im += gT01_re * b1_im;
2261  B0_im += gT01_im * b1_re;
2262  B0_im += gT02_re * b2_im;
2263  B0_im += gT02_im * b2_re;
2264 
2265  // multiply row 1
2266  spinorFloat A1_re = 0;
2267  A1_re += gT10_re * a0_re;
2268  A1_re -= gT10_im * a0_im;
2269  A1_re += gT11_re * a1_re;
2270  A1_re -= gT11_im * a1_im;
2271  A1_re += gT12_re * a2_re;
2272  A1_re -= gT12_im * a2_im;
2273  spinorFloat A1_im = 0;
2274  A1_im += gT10_re * a0_im;
2275  A1_im += gT10_im * a0_re;
2276  A1_im += gT11_re * a1_im;
2277  A1_im += gT11_im * a1_re;
2278  A1_im += gT12_re * a2_im;
2279  A1_im += gT12_im * a2_re;
2280  spinorFloat B1_re = 0;
2281  B1_re += gT10_re * b0_re;
2282  B1_re -= gT10_im * b0_im;
2283  B1_re += gT11_re * b1_re;
2284  B1_re -= gT11_im * b1_im;
2285  B1_re += gT12_re * b2_re;
2286  B1_re -= gT12_im * b2_im;
2287  spinorFloat B1_im = 0;
2288  B1_im += gT10_re * b0_im;
2289  B1_im += gT10_im * b0_re;
2290  B1_im += gT11_re * b1_im;
2291  B1_im += gT11_im * b1_re;
2292  B1_im += gT12_re * b2_im;
2293  B1_im += gT12_im * b2_re;
2294 
2295  // multiply row 2
2296  spinorFloat A2_re = 0;
2297  A2_re += gT20_re * a0_re;
2298  A2_re -= gT20_im * a0_im;
2299  A2_re += gT21_re * a1_re;
2300  A2_re -= gT21_im * a1_im;
2301  A2_re += gT22_re * a2_re;
2302  A2_re -= gT22_im * a2_im;
2303  spinorFloat A2_im = 0;
2304  A2_im += gT20_re * a0_im;
2305  A2_im += gT20_im * a0_re;
2306  A2_im += gT21_re * a1_im;
2307  A2_im += gT21_im * a1_re;
2308  A2_im += gT22_re * a2_im;
2309  A2_im += gT22_im * a2_re;
2310  spinorFloat B2_re = 0;
2311  B2_re += gT20_re * b0_re;
2312  B2_re -= gT20_im * b0_im;
2313  B2_re += gT21_re * b1_re;
2314  B2_re -= gT21_im * b1_im;
2315  B2_re += gT22_re * b2_re;
2316  B2_re -= gT22_im * b2_im;
2317  spinorFloat B2_im = 0;
2318  B2_im += gT20_re * b0_im;
2319  B2_im += gT20_im * b0_re;
2320  B2_im += gT21_re * b1_im;
2321  B2_im += gT21_im * b1_re;
2322  B2_im += gT22_re * b2_im;
2323  B2_im += gT22_im * b2_re;
2324 
2325  o00_re += A0_re;
2326  o00_im += A0_im;
2327  o10_re += B0_re;
2328  o10_im += B0_im;
2329 
2330  o01_re += A1_re;
2331  o01_im += A1_im;
2332  o11_re += B1_re;
2333  o11_im += B1_im;
2334 
2335  o02_re += A2_re;
2336  o02_im += A2_im;
2337  o12_re += B2_re;
2338  o12_im += B2_im;
2339 
2340  }
2341 }
2342 
2343 #ifdef MULTI_GPU
2344 
 // When the lattice is partitioned across GPUs, a site on a partitioned
 // boundary cannot be finished by the current kernel alone: exterior
 // kernels for the remaining dimensions still have to add their face
 // contributions.  Only apply the twist/xpay and write the result once the
 // site is complete.
2345 int incomplete = 0; // Have all 8 contributions been computed for this site?
2346 
2347 switch(kernel_type) { // intentional fall-through
 // INTERIOR_KERNEL checks every partitioned dimension; each exterior
 // kernel falls through to check only the dimensions handled after it
 // (T -> Z -> Y -> X ordering below).
2348 case INTERIOR_KERNEL:
2349  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2350 case EXTERIOR_KERNEL_T:
2351  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2352 case EXTERIOR_KERNEL_Z:
2353  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2354 case EXTERIOR_KERNEL_Y:
2355  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2356 }
2357 
2358 if (!incomplete)
2359 #endif // MULTI_GPU
2360 {
2361 #ifdef DSLASH_XPAY
 // xpay epilogue: o = b * (twisted-clover term applied to o) + accumulator
2362  READ_ACCUM(ACCUMTEX, param.sp_stride)
2363 
2364 #ifndef CLOVER_TWIST_XPAY
2365  //perform invert twist first:
 // Apply the inverse twisted-clover term to the dslash result o (macro from
 // tmc_core.h), then axpy against the untouched accumulator, component-wise.
2366  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2367  o00_re = b*o00_re + acc00_re;
2368  o00_im = b*o00_im + acc00_im;
2369  o01_re = b*o01_re + acc01_re;
2370  o01_im = b*o01_im + acc01_im;
2371  o02_re = b*o02_re + acc02_re;
2372  o02_im = b*o02_im + acc02_im;
2373  o10_re = b*o10_re + acc10_re;
2374  o10_im = b*o10_im + acc10_im;
2375  o11_re = b*o11_re + acc11_re;
2376  o11_im = b*o11_im + acc11_im;
2377  o12_re = b*o12_re + acc12_re;
2378  o12_im = b*o12_im + acc12_im;
2379  o20_re = b*o20_re + acc20_re;
2380  o20_im = b*o20_im + acc20_im;
2381  o21_re = b*o21_re + acc21_re;
2382  o21_im = b*o21_im + acc21_im;
2383  o22_re = b*o22_re + acc22_re;
2384  o22_im = b*o22_im + acc22_im;
2385  o30_re = b*o30_re + acc30_re;
2386  o30_im = b*o30_im + acc30_im;
2387  o31_re = b*o31_re + acc31_re;
2388  o31_im = b*o31_im + acc31_im;
2389  o32_re = b*o32_re + acc32_re;
2390  o32_im = b*o32_im + acc32_im;
2391 #else
 // CLOVER_TWIST_XPAY variant: the (forward) twist is applied to the
 // accumulator instead, and o is axpy'ed against the twisted accumulator.
2392  APPLY_CLOVER_TWIST(c, a, acc);
2393  o00_re = b*o00_re + acc00_re;
2394  o00_im = b*o00_im + acc00_im;
2395  o01_re = b*o01_re + acc01_re;
2396  o01_im = b*o01_im + acc01_im;
2397  o02_re = b*o02_re + acc02_re;
2398  o02_im = b*o02_im + acc02_im;
2399  o10_re = b*o10_re + acc10_re;
2400  o10_im = b*o10_im + acc10_im;
2401  o11_re = b*o11_re + acc11_re;
2402  o11_im = b*o11_im + acc11_im;
2403  o12_re = b*o12_re + acc12_re;
2404  o12_im = b*o12_im + acc12_im;
2405  o20_re = b*o20_re + acc20_re;
2406  o20_im = b*o20_im + acc20_im;
2407  o21_re = b*o21_re + acc21_re;
2408  o21_im = b*o21_im + acc21_im;
2409  o22_re = b*o22_re + acc22_re;
2410  o22_im = b*o22_im + acc22_im;
2411  o30_re = b*o30_re + acc30_re;
2412  o30_im = b*o30_im + acc30_im;
2413  o31_re = b*o31_re + acc31_re;
2414  o31_im = b*o31_im + acc31_im;
2415  o32_re = b*o32_re + acc32_re;
2416  o32_im = b*o32_im + acc32_im;
2417 #endif//CLOVER_TWIST_XPAY
2418 #else //no XPAY
 // Plain dslash: apply only the inverse twisted-clover term to the result.
2419  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2420 #endif
2421 }
2422 
2423 // write spinor field back to device memory
2424 WRITE_SPINOR(param.sp_stride);
2425 
2426 // undefine to prevent warning when precision is changed
 // spinor base type and gauge-link matrix element accessors
2427 #undef spinorFloat
2428 #undef g00_re
2429 #undef g00_im
2430 #undef g01_re
2431 #undef g01_im
2432 #undef g02_re
2433 #undef g02_im
2434 #undef g10_re
2435 #undef g10_im
2436 #undef g11_re
2437 #undef g11_im
2438 #undef g12_re
2439 #undef g12_im
2440 #undef g20_re
2441 #undef g20_im
2442 #undef g21_re
2443 #undef g21_im
2444 #undef g22_re
2445 #undef g22_im
2446 
 // input-spinor component accessors (defined at the top of this file)
2447 #undef i00_re
2448 #undef i00_im
2449 #undef i01_re
2450 #undef i01_im
2451 #undef i02_re
2452 #undef i02_im
2453 #undef i10_re
2454 #undef i10_im
2455 #undef i11_re
2456 #undef i11_im
2457 #undef i12_re
2458 #undef i12_im
2459 #undef i20_re
2460 #undef i20_im
2461 #undef i21_re
2462 #undef i21_im
2463 #undef i22_re
2464 #undef i22_im
2465 #undef i30_re
2466 #undef i30_im
2467 #undef i31_re
2468 #undef i31_im
2469 #undef i32_re
2470 #undef i32_im
2471 
 // clover-matrix element accessors
2472 #undef c00_00_re
2473 #undef c01_01_re
2474 #undef c02_02_re
2475 #undef c10_10_re
2476 #undef c11_11_re
2477 #undef c12_12_re
2478 #undef c01_00_re
2479 #undef c01_00_im
2480 #undef c02_00_re
2481 #undef c02_00_im
2482 #undef c10_00_re
2483 #undef c10_00_im
2484 #undef c11_00_re
2485 #undef c11_00_im
2486 #undef c12_00_re
2487 #undef c12_00_im
2488 #undef c02_01_re
2489 #undef c02_01_im
2490 #undef c10_01_re
2491 #undef c10_01_im
2492 #undef c11_01_re
2493 #undef c11_01_im
2494 #undef c12_01_re
2495 #undef c12_01_im
2496 #undef c10_02_re
2497 #undef c10_02_im
2498 #undef c11_02_re
2499 #undef c11_02_im
2500 #undef c12_02_re
2501 #undef c12_02_im
2502 #undef c11_10_re
2503 #undef c11_10_im
2504 #undef c12_10_re
2505 #undef c12_10_im
2506 #undef c12_11_re
2507 #undef c12_11_im
2508 
 // inverse-clover element accessors (same layout as the clover block above)
2509 #undef cinv00_00_re
2510 #undef cinv01_01_re
2511 #undef cinv02_02_re
2512 #undef cinv10_10_re
2513 #undef cinv11_11_re
2514 #undef cinv12_12_re
2515 #undef cinv01_00_re
2516 #undef cinv01_00_im
2517 #undef cinv02_00_re
2518 #undef cinv02_00_im
2519 #undef cinv10_00_re
2520 #undef cinv10_00_im
2521 #undef cinv11_00_re
2522 #undef cinv11_00_im
2523 #undef cinv12_00_re
2524 #undef cinv12_00_im
2525 #undef cinv02_01_re
2526 #undef cinv02_01_im
2527 #undef cinv10_01_re
2528 #undef cinv10_01_im
2529 #undef cinv11_01_re
2530 #undef cinv11_01_im
2531 #undef cinv12_01_re
2532 #undef cinv12_01_im
2533 #undef cinv10_02_re
2534 #undef cinv10_02_im
2535 #undef cinv11_02_re
2536 #undef cinv11_02_im
2537 #undef cinv12_02_re
2538 #undef cinv12_02_im
2539 #undef cinv11_10_re
2540 #undef cinv11_10_im
2541 #undef cinv12_10_re
2542 #undef cinv12_10_im
2543 #undef cinv12_11_re
2544 #undef cinv12_11_im
2545 
 // accumulator-spinor accessors (the xpay input read via READ_ACCUM)
2546 #undef acc00_re
2547 #undef acc00_im
2548 #undef acc01_re
2549 #undef acc01_im
2550 #undef acc02_re
2551 #undef acc02_im
2552 #undef acc10_re
2553 #undef acc10_im
2554 #undef acc11_re
2555 #undef acc11_im
2556 #undef acc12_re
2557 #undef acc12_im
2558 #undef acc20_re
2559 #undef acc20_im
2560 #undef acc21_re
2561 #undef acc21_im
2562 #undef acc22_re
2563 #undef acc22_im
2564 #undef acc30_re
2565 #undef acc30_im
2566 #undef acc31_re
2567 #undef acc31_im
2568 #undef acc32_re
2569 #undef acc32_im
2570 
2571 
2572 
 // compiler-dependent VOLATILE qualifier, defined at the top of this file
2573 #undef VOLATILE
#define gT22_im
#define g00_re
#define g01_im
spinorFloat b0_re
#define g20_re
#define i12_im
VOLATILE spinorFloat o00_re
__constant__ int Vh
spinorFloat A2_re
#define i01_im
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:1
VOLATILE spinorFloat o10_re
__constant__ int X2
#define g21_im
#define i31_re
#define g12_re
__constant__ int X2X1mX1
spinorFloat a0_im
spinorFloat b2_im
#define i20_im
#define g21_re
float4 C1
#define acc11_im
#define g10_im
RECONSTRUCT_GAUGE_MATRIX(0)
__constant__ int X3X2X1mX2X1
spinorFloat B2_im
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o32_im
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
spinorFloat B1_re
int sp_idx
#define acc31_re
#define gT12_re
float4 C0
#define g02_re
#define acc32_re
VOLATILE spinorFloat o01_re
#define i21_re
#define gT02_re
const int ga_idx
#define gT01_re
__constant__ int X3X2X1
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i11_re
#define i11_im
#define i21_im
#define g11_re
#define acc12_im
VOLATILE spinorFloat o30_re
#define acc01_re
spinorFloat A0_im
VOLATILE spinorFloat o20_re
spinorFloat a2_re
float4 C6
#define gT20_im
#define gT22_re
#define g00_im
spinorFloat A0_re
spinorFloat A1_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i22_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define i30_re
#define gT21_re
#define VOLATILE
VOLATILE spinorFloat o01_im
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
Definition: tmc_core.h:432
#define acc01_im
#define acc30_re
#define acc20_re
#define g01_re
#define i30_im
WRITE_SPINOR(param.sp_stride)
#define spinorFloat
spinorFloat a0_re
#define acc10_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o11_re
#define gT11_re
#define GAUGE0TEX
Definition: covDev.h:112
#define acc12_re
VOLATILE spinorFloat o12_im
#define acc00_re
#define i31_im
#define gT10_re
VOLATILE spinorFloat o02_re
#define acc21_re
VOLATILE spinorFloat o31_re
float4 C8
VOLATILE spinorFloat o21_re
#define acc00_im
#define acc22_im
spinorFloat B0_re
#define i20_re
__constant__ int X2m1
#define i32_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o10_im
#define g02_im
#define SPINORTEX
Definition: clover_def.h:40
__constant__ int gauge_fixed
float4 C3
#define acc02_re
VOLATILE spinorFloat o02_im
#define i00_im
__constant__ int X4X3X2X1mX3X2X1
spinorFloat B1_im
const int dims[]
float4 C2
#define SPINOR_HOP
Definition: covDev.h:158
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define i22_re
VOLATILE spinorFloat o12_re
#define gT21_im
VOLATILE spinorFloat o00_im
spinorFloat b2_re
__constant__ int ga_stride
#define gT02_im
VOLATILE spinorFloat o22_im
#define acc31_im
#define acc30_im
#define acc32_im
__constant__ int X1m1
#define i02_re
__constant__ int X3
#define i32_im
#define acc02_im
float4 C7
spinorFloat b0_im
#define g22_re
#define acc10_im
spinorFloat a1_im
spinorFloat b1_im
#define gT20_re
#define GAUGE1TEX
Definition: covDev.h:113
#define gT12_im
#define acc22_re
float4 C4
spinorFloat A1_re
spinorFloat B2_re
#define acc20_im
spinorFloat B0_im
spinorFloat a1_re
#define i02_im
#define g22_im
#define gT01_im
__constant__ int X4m1
#define i10_im
VOLATILE spinorFloat o11_im
#define i12_re
spinorFloat a2_im
VOLATILE spinorFloat o31_im
#define g10_re
float4 C5
#define acc11_re
#define g12_im
#define gT10_im
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define gT11_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
VOLATILE spinorFloat o22_re
#define gT00_im
__constant__ int X4X3X2X1hmX3X2X1h
#define acc21_im
#define i10_re
KernelType kernel_type
VOLATILE spinorFloat o20_im
#define i00_re
#define g20_im
#define gT00_re
#define i01_re
#define g11_im
spinorFloat A2_im
__constant__ int X4
spinorFloat b1_re
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
__constant__ int X2X1
VOLATILE spinorFloat o32_re