QUDA  v0.7.0
A library for QCD on GPUs
tmc_dslash_dagger_gt200_core.h
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
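// VOLATILE expands to the volatile qualifier only for the older Open64 back
// end; the NVVM (LLVM) back end, used from CUDA 4.1 on sm_20 and newer, does
// not appear to need it.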
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
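// The 12 complex spinor components (4 spins x 3 colours) are unpacked from the
// load registers: in double precision one complex number per double2 register
// (I0..I11, accum0..accum11); in single and half precision two complex numbers
// per float4 register (I0..I5, accum0..accum5).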
113 
114 // gauge link
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
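// The gauge link uses the same packing: with double2/float2 storage each
// register holds one complex entry (G0..G8 cover the full 3x3 matrix); with
// float4 storage two complex entries per register (G0..G4), where the third
// row may be filled in by the gauge reconstruction step below rather than
// loaded directly.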
156 
157 // conjugated gauge link
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
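// gTij is the Hermitian conjugate of the link, gT_ij = conj(g_ji); it is used
// for the hops in the negative directions below.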
177 // first chiral block of clover term
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
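// Each 6x6 chiral block of the clover term is Hermitian, so only the six real
// diagonal entries and the fifteen lower-triangular complex entries are
// stored; the upper triangle is recovered by conjugation, as above.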
285 // second chiral block of clover term (reuses the C registers)
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 
354 // first chiral block of inverted clover term
355 #ifdef CLOVER_DOUBLE
356 #define cinv00_00_re C0.x
357 #define cinv01_01_re C0.y
358 #define cinv02_02_re C1.x
359 #define cinv10_10_re C1.y
360 #define cinv11_11_re C2.x
361 #define cinv12_12_re C2.y
362 #define cinv01_00_re C3.x
363 #define cinv01_00_im C3.y
364 #define cinv02_00_re C4.x
365 #define cinv02_00_im C4.y
366 #define cinv10_00_re C5.x
367 #define cinv10_00_im C5.y
368 #define cinv11_00_re C6.x
369 #define cinv11_00_im C6.y
370 #define cinv12_00_re C7.x
371 #define cinv12_00_im C7.y
372 #define cinv02_01_re C8.x
373 #define cinv02_01_im C8.y
374 #define cinv10_01_re C9.x
375 #define cinv10_01_im C9.y
376 #define cinv11_01_re C10.x
377 #define cinv11_01_im C10.y
378 #define cinv12_01_re C11.x
379 #define cinv12_01_im C11.y
380 #define cinv10_02_re C12.x
381 #define cinv10_02_im C12.y
382 #define cinv11_02_re C13.x
383 #define cinv11_02_im C13.y
384 #define cinv12_02_re C14.x
385 #define cinv12_02_im C14.y
386 #define cinv11_10_re C15.x
387 #define cinv11_10_im C15.y
388 #define cinv12_10_re C16.x
389 #define cinv12_10_im C16.y
390 #define cinv12_11_re C17.x
391 #define cinv12_11_im C17.y
392 #else
393 #define cinv00_00_re C0.x
394 #define cinv01_01_re C0.y
395 #define cinv02_02_re C0.z
396 #define cinv10_10_re C0.w
397 #define cinv11_11_re C1.x
398 #define cinv12_12_re C1.y
399 #define cinv01_00_re C1.z
400 #define cinv01_00_im C1.w
401 #define cinv02_00_re C2.x
402 #define cinv02_00_im C2.y
403 #define cinv10_00_re C2.z
404 #define cinv10_00_im C2.w
405 #define cinv11_00_re C3.x
406 #define cinv11_00_im C3.y
407 #define cinv12_00_re C3.z
408 #define cinv12_00_im C3.w
409 #define cinv02_01_re C4.x
410 #define cinv02_01_im C4.y
411 #define cinv10_01_re C4.z
412 #define cinv10_01_im C4.w
413 #define cinv11_01_re C5.x
414 #define cinv11_01_im C5.y
415 #define cinv12_01_re C5.z
416 #define cinv12_01_im C5.w
417 #define cinv10_02_re C6.x
418 #define cinv10_02_im C6.y
419 #define cinv11_02_re C6.z
420 #define cinv11_02_im C6.w
421 #define cinv12_02_re C7.x
422 #define cinv12_02_im C7.y
423 #define cinv11_10_re C7.z
424 #define cinv11_10_im C7.w
425 #define cinv12_10_re C8.x
426 #define cinv12_10_im C8.y
427 #define cinv12_11_re C8.z
428 #define cinv12_11_im C8.w
429 #endif // CLOVER_DOUBLE
430 
431 #define cinv00_01_re (+cinv01_00_re)
432 #define cinv00_01_im (-cinv01_00_im)
433 #define cinv00_02_re (+cinv02_00_re)
434 #define cinv00_02_im (-cinv02_00_im)
435 #define cinv01_02_re (+cinv02_01_re)
436 #define cinv01_02_im (-cinv02_01_im)
437 #define cinv00_10_re (+cinv10_00_re)
438 #define cinv00_10_im (-cinv10_00_im)
439 #define cinv01_10_re (+cinv10_01_re)
440 #define cinv01_10_im (-cinv10_01_im)
441 #define cinv02_10_re (+cinv10_02_re)
442 #define cinv02_10_im (-cinv10_02_im)
443 #define cinv00_11_re (+cinv11_00_re)
444 #define cinv00_11_im (-cinv11_00_im)
445 #define cinv01_11_re (+cinv11_01_re)
446 #define cinv01_11_im (-cinv11_01_im)
447 #define cinv02_11_re (+cinv11_02_re)
448 #define cinv02_11_im (-cinv11_02_im)
449 #define cinv10_11_re (+cinv11_10_re)
450 #define cinv10_11_im (-cinv11_10_im)
451 #define cinv00_12_re (+cinv12_00_re)
452 #define cinv00_12_im (-cinv12_00_im)
453 #define cinv01_12_re (+cinv12_01_re)
454 #define cinv01_12_im (-cinv12_01_im)
455 #define cinv02_12_re (+cinv12_02_re)
456 #define cinv02_12_im (-cinv12_02_im)
457 #define cinv10_12_re (+cinv12_10_re)
458 #define cinv10_12_im (-cinv12_10_im)
459 #define cinv11_12_re (+cinv12_11_re)
460 #define cinv11_12_im (-cinv12_11_im)
461 
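// The inverse clover term uses the same Hermitian packing, so the upper
// triangle is completed in the same way.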
462 // second chiral block of inverted clover term (reuses the C registers)
463 #define cinv20_20_re cinv00_00_re
464 #define cinv21_20_re cinv01_00_re
465 #define cinv21_20_im cinv01_00_im
466 #define cinv22_20_re cinv02_00_re
467 #define cinv22_20_im cinv02_00_im
468 #define cinv30_20_re cinv10_00_re
469 #define cinv30_20_im cinv10_00_im
470 #define cinv31_20_re cinv11_00_re
471 #define cinv31_20_im cinv11_00_im
472 #define cinv32_20_re cinv12_00_re
473 #define cinv32_20_im cinv12_00_im
474 #define cinv20_21_re cinv00_01_re
475 #define cinv20_21_im cinv00_01_im
476 #define cinv21_21_re cinv01_01_re
477 #define cinv22_21_re cinv02_01_re
478 #define cinv22_21_im cinv02_01_im
479 #define cinv30_21_re cinv10_01_re
480 #define cinv30_21_im cinv10_01_im
481 #define cinv31_21_re cinv11_01_re
482 #define cinv31_21_im cinv11_01_im
483 #define cinv32_21_re cinv12_01_re
484 #define cinv32_21_im cinv12_01_im
485 #define cinv20_22_re cinv00_02_re
486 #define cinv20_22_im cinv00_02_im
487 #define cinv21_22_re cinv01_02_re
488 #define cinv21_22_im cinv01_02_im
489 #define cinv22_22_re cinv02_02_re
490 #define cinv30_22_re cinv10_02_re
491 #define cinv30_22_im cinv10_02_im
492 #define cinv31_22_re cinv11_02_re
493 #define cinv31_22_im cinv11_02_im
494 #define cinv32_22_re cinv12_02_re
495 #define cinv32_22_im cinv12_02_im
496 #define cinv20_30_re cinv00_10_re
497 #define cinv20_30_im cinv00_10_im
498 #define cinv21_30_re cinv01_10_re
499 #define cinv21_30_im cinv01_10_im
500 #define cinv22_30_re cinv02_10_re
501 #define cinv22_30_im cinv02_10_im
502 #define cinv30_30_re cinv10_10_re
503 #define cinv31_30_re cinv11_10_re
504 #define cinv31_30_im cinv11_10_im
505 #define cinv32_30_re cinv12_10_re
506 #define cinv32_30_im cinv12_10_im
507 #define cinv20_31_re cinv00_11_re
508 #define cinv20_31_im cinv00_11_im
509 #define cinv21_31_re cinv01_11_re
510 #define cinv21_31_im cinv01_11_im
511 #define cinv22_31_re cinv02_11_re
512 #define cinv22_31_im cinv02_11_im
513 #define cinv30_31_re cinv10_11_re
514 #define cinv30_31_im cinv10_11_im
515 #define cinv31_31_re cinv11_11_re
516 #define cinv32_31_re cinv12_11_re
517 #define cinv32_31_im cinv12_11_im
518 #define cinv20_32_re cinv00_12_re
519 #define cinv20_32_im cinv00_12_im
520 #define cinv21_32_re cinv01_12_re
521 #define cinv21_32_im cinv01_12_im
522 #define cinv22_32_re cinv02_12_re
523 #define cinv22_32_im cinv02_12_im
524 #define cinv30_32_re cinv10_12_re
525 #define cinv30_32_im cinv10_12_im
526 #define cinv31_32_re cinv11_12_re
527 #define cinv31_32_im cinv11_12_im
528 #define cinv32_32_re cinv12_12_re
529 
530 
531 #ifndef CLOVER_TWIST_INV_DSLASH
532 
533 // declare C## here and use ASSN below instead of READ
534 #ifdef CLOVER_DOUBLE
535 double2 C0;
536 double2 C1;
537 double2 C2;
538 double2 C3;
539 double2 C4;
540 double2 C5;
541 double2 C6;
542 double2 C7;
543 double2 C8;
544 double2 C9;
545 double2 C10;
546 double2 C11;
547 double2 C12;
548 double2 C13;
549 double2 C14;
550 double2 C15;
551 double2 C16;
552 double2 C17;
553 #else
554 float4 C0;
555 float4 C1;
556 float4 C2;
557 float4 C3;
558 float4 C4;
559 float4 C5;
560 float4 C6;
561 float4 C7;
562 float4 C8;
563 
564 #if (DD_PREC==2)
565 float K;
566 #endif
567 
568 #endif // CLOVER_DOUBLE
569 #endif
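// When the twisted-clover inverse is not applied inside the dslash, the C
// registers above are declared once and later filled by assignment (ASSN)
// rather than by a declaring READ, which lets the later clover loads reuse
// the same registers.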
570 
571 // output spinor
596 
597 #include "read_gauge.h"
598 #include "io_spinor.h"
599 #include "read_clover.h"
600 #include "tmc_core.h"
601 
602 int x1, x2, x3, x4;
603 int X;
604 
605 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
606 int sp_norm_idx;
607 #endif // MULTI_GPU half precision
608 
609 int sid;
610 
611 #ifdef MULTI_GPU
612 int face_idx;
613 if (kernel_type == INTERIOR_KERNEL) {
614 #endif
615 
616  sid = blockIdx.x*blockDim.x + threadIdx.x;
617  if (sid >= param.threads) return;
618 
619  // Inline by hand for the moment and assume even dimensions
620  const int dims[] = {X1, X2, X3, X4};
621  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
622 
623  o00_re = 0; o00_im = 0;
624  o01_re = 0; o01_im = 0;
625  o02_re = 0; o02_im = 0;
626  o10_re = 0; o10_im = 0;
627  o11_re = 0; o11_im = 0;
628  o12_re = 0; o12_im = 0;
629  o20_re = 0; o20_im = 0;
630  o21_re = 0; o21_im = 0;
631  o22_re = 0; o22_im = 0;
632  o30_re = 0; o30_im = 0;
633  o31_re = 0; o31_im = 0;
634  o32_re = 0; o32_im = 0;
635 
636 #ifdef MULTI_GPU
637 } else { // exterior kernel
638 
639  sid = blockIdx.x*blockDim.x + threadIdx.x;
640  if (sid >= param.threads) return;
641 
642  const int dim = static_cast<int>(kernel_type);
643  const int face_volume = (param.threads >> 1); // volume of one face
644  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
645  face_idx = sid - face_num*face_volume; // index into the respective face
646 
647  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
648  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
649  //sp_idx = face_idx + param.ghostOffset[dim];
650 
651 #if (DD_PREC==2) // half precision
652  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
653 #endif
654 
655  const int dims[] = {X1, X2, X3, X4};
656  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
657 
658  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
659 
660  o00_re = i00_re; o00_im = i00_im;
661  o01_re = i01_re; o01_im = i01_im;
662  o02_re = i02_re; o02_im = i02_im;
663  o10_re = i10_re; o10_im = i10_im;
664  o11_re = i11_re; o11_im = i11_im;
665  o12_re = i12_re; o12_im = i12_im;
666  o20_re = i20_re; o20_im = i20_im;
667  o21_re = i21_re; o21_im = i21_im;
668  o22_re = i22_re; o22_im = i22_im;
669  o30_re = i30_re; o30_im = i30_im;
670  o31_re = i31_re; o31_im = i31_im;
671  o32_re = i32_re; o32_im = i32_im;
672 }
673 #endif // MULTI_GPU
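// At this point the output spinor holds the starting value of the
// accumulation: zero in the interior kernel, or the partial interior result
// re-read from device memory in the exterior (face-update) kernels.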
674 
675 
676 #ifdef MULTI_GPU
677 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
678  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
679 #endif
680 {
681  // Projector P0+
682  // 1 0 0 i
683  // 0 1 i 0
684  // 0 -i 1 0
685  // -i 0 0 1
686 
687 #ifdef MULTI_GPU
688  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
689  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
690 #else
691  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
692 #endif
693 
694  const int ga_idx = sid;
695 
702 
703 #ifdef MULTI_GPU
704  if (kernel_type == INTERIOR_KERNEL) {
705 #endif
706 
707  // read spinor from device memory
708  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
709 
710  // project spinor into half spinors
711  a0_re = +i00_re-i30_im;
712  a0_im = +i00_im+i30_re;
713  a1_re = +i01_re-i31_im;
714  a1_im = +i01_im+i31_re;
715  a2_re = +i02_re-i32_im;
716  a2_im = +i02_im+i32_re;
717  b0_re = +i10_re-i20_im;
718  b0_im = +i10_im+i20_re;
719  b1_re = +i11_re-i21_im;
720  b1_im = +i11_im+i21_re;
721  b2_re = +i12_re-i22_im;
722  b2_im = +i12_im+i22_re;
723 
724 #ifdef MULTI_GPU
725  } else {
726 
727  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
728 
729  // read half spinor from device memory
730  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
731 
732  a0_re = i00_re; a0_im = i00_im;
733  a1_re = i01_re; a1_im = i01_im;
734  a2_re = i02_re; a2_im = i02_im;
735  b0_re = i10_re; b0_im = i10_im;
736  b1_re = i11_re; b1_im = i11_im;
737  b2_re = i12_re; b2_im = i12_im;
738 
739  }
740 #endif // MULTI_GPU
741 
742  // read gauge matrix from device memory
743  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
744 
745  // reconstruct gauge matrix
747 
748  // multiply row 0
749  spinorFloat A0_re = 0;
750  A0_re += g00_re * a0_re;
751  A0_re -= g00_im * a0_im;
752  A0_re += g01_re * a1_re;
753  A0_re -= g01_im * a1_im;
754  A0_re += g02_re * a2_re;
755  A0_re -= g02_im * a2_im;
756  spinorFloat A0_im = 0;
757  A0_im += g00_re * a0_im;
758  A0_im += g00_im * a0_re;
759  A0_im += g01_re * a1_im;
760  A0_im += g01_im * a1_re;
761  A0_im += g02_re * a2_im;
762  A0_im += g02_im * a2_re;
763  spinorFloat B0_re = 0;
764  B0_re += g00_re * b0_re;
765  B0_re -= g00_im * b0_im;
766  B0_re += g01_re * b1_re;
767  B0_re -= g01_im * b1_im;
768  B0_re += g02_re * b2_re;
769  B0_re -= g02_im * b2_im;
770  spinorFloat B0_im = 0;
771  B0_im += g00_re * b0_im;
772  B0_im += g00_im * b0_re;
773  B0_im += g01_re * b1_im;
774  B0_im += g01_im * b1_re;
775  B0_im += g02_re * b2_im;
776  B0_im += g02_im * b2_re;
777 
778  // multiply row 1
779  spinorFloat A1_re = 0;
780  A1_re += g10_re * a0_re;
781  A1_re -= g10_im * a0_im;
782  A1_re += g11_re * a1_re;
783  A1_re -= g11_im * a1_im;
784  A1_re += g12_re * a2_re;
785  A1_re -= g12_im * a2_im;
786  spinorFloat A1_im = 0;
787  A1_im += g10_re * a0_im;
788  A1_im += g10_im * a0_re;
789  A1_im += g11_re * a1_im;
790  A1_im += g11_im * a1_re;
791  A1_im += g12_re * a2_im;
792  A1_im += g12_im * a2_re;
793  spinorFloat B1_re = 0;
794  B1_re += g10_re * b0_re;
795  B1_re -= g10_im * b0_im;
796  B1_re += g11_re * b1_re;
797  B1_re -= g11_im * b1_im;
798  B1_re += g12_re * b2_re;
799  B1_re -= g12_im * b2_im;
800  spinorFloat B1_im = 0;
801  B1_im += g10_re * b0_im;
802  B1_im += g10_im * b0_re;
803  B1_im += g11_re * b1_im;
804  B1_im += g11_im * b1_re;
805  B1_im += g12_re * b2_im;
806  B1_im += g12_im * b2_re;
807 
808  // multiply row 2
809  spinorFloat A2_re = 0;
810  A2_re += g20_re * a0_re;
811  A2_re -= g20_im * a0_im;
812  A2_re += g21_re * a1_re;
813  A2_re -= g21_im * a1_im;
814  A2_re += g22_re * a2_re;
815  A2_re -= g22_im * a2_im;
816  spinorFloat A2_im = 0;
817  A2_im += g20_re * a0_im;
818  A2_im += g20_im * a0_re;
819  A2_im += g21_re * a1_im;
820  A2_im += g21_im * a1_re;
821  A2_im += g22_re * a2_im;
822  A2_im += g22_im * a2_re;
823  spinorFloat B2_re = 0;
824  B2_re += g20_re * b0_re;
825  B2_re -= g20_im * b0_im;
826  B2_re += g21_re * b1_re;
827  B2_re -= g21_im * b1_im;
828  B2_re += g22_re * b2_re;
829  B2_re -= g22_im * b2_im;
830  spinorFloat B2_im = 0;
831  B2_im += g20_re * b0_im;
832  B2_im += g20_im * b0_re;
833  B2_im += g21_re * b1_im;
834  B2_im += g21_im * b1_re;
835  B2_im += g22_re * b2_im;
836  B2_im += g22_im * b2_re;
837 
838  o00_re += A0_re;
839  o00_im += A0_im;
840  o10_re += B0_re;
841  o10_im += B0_im;
842  o20_re += B0_im;
843  o20_im -= B0_re;
844  o30_re += A0_im;
845  o30_im -= A0_re;
846 
847  o01_re += A1_re;
848  o01_im += A1_im;
849  o11_re += B1_re;
850  o11_im += B1_im;
851  o21_re += B1_im;
852  o21_im -= B1_re;
853  o31_re += A1_im;
854  o31_im -= A1_re;
855 
856  o02_re += A2_re;
857  o02_im += A2_im;
858  o12_re += B2_re;
859  o12_im += B2_im;
860  o22_re += B2_im;
861  o22_im -= B2_re;
862  o32_re += A2_im;
863  o32_im -= A2_re;
864 
865 }
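// Every directional hop follows the pattern above: the projector (P0+ here,
// written out in the comment) has rank two, so the neighbour spinor collapses
// to two colour vectors a = (psi_0 + i psi_3) and b = (psi_1 + i psi_2), with
// the subscript labelling spin; these are multiplied by the SU(3) link as a
// complex multiply-accumulate over colour, and the four output spin components
// are rebuilt from A = U a and B = U b using the projector's fixed phases
// (here the lower spin components receive -i B and -i A).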
866 
867 #ifdef MULTI_GPU
868 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
869  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
870 #endif
871 {
872  // Projector P0-
873  // 1 0 0 -i
874  // 0 1 -i 0
875  // 0 i 1 0
876  // i 0 0 1
877 
878 #ifdef MULTI_GPU
879  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
880  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
881 #else
882  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
883 #endif
884 
885 #ifdef MULTI_GPU
886  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
887 #else
888  const int ga_idx = sp_idx;
889 #endif
890 
897 
898 #ifdef MULTI_GPU
899  if (kernel_type == INTERIOR_KERNEL) {
900 #endif
901 
902  // read spinor from device memory
903  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
904 
905  // project spinor into half spinors
906  a0_re = +i00_re+i30_im;
907  a0_im = +i00_im-i30_re;
908  a1_re = +i01_re+i31_im;
909  a1_im = +i01_im-i31_re;
910  a2_re = +i02_re+i32_im;
911  a2_im = +i02_im-i32_re;
912  b0_re = +i10_re+i20_im;
913  b0_im = +i10_im-i20_re;
914  b1_re = +i11_re+i21_im;
915  b1_im = +i11_im-i21_re;
916  b2_re = +i12_re+i22_im;
917  b2_im = +i12_im-i22_re;
918 
919 #ifdef MULTI_GPU
920  } else {
921 
922  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
923 
924  // read half spinor from device memory
925  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
926 
927  a0_re = i00_re; a0_im = i00_im;
928  a1_re = i01_re; a1_im = i01_im;
929  a2_re = i02_re; a2_im = i02_im;
930  b0_re = i10_re; b0_im = i10_im;
931  b1_re = i11_re; b1_im = i11_im;
932  b2_re = i12_re; b2_im = i12_im;
933 
934  }
935 #endif // MULTI_GPU
936 
937  // read gauge matrix from device memory
938  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
939 
940  // reconstruct gauge matrix
942 
943  // multiply row 0
944  spinorFloat A0_re = 0;
945  A0_re += gT00_re * a0_re;
946  A0_re -= gT00_im * a0_im;
947  A0_re += gT01_re * a1_re;
948  A0_re -= gT01_im * a1_im;
949  A0_re += gT02_re * a2_re;
950  A0_re -= gT02_im * a2_im;
951  spinorFloat A0_im = 0;
952  A0_im += gT00_re * a0_im;
953  A0_im += gT00_im * a0_re;
954  A0_im += gT01_re * a1_im;
955  A0_im += gT01_im * a1_re;
956  A0_im += gT02_re * a2_im;
957  A0_im += gT02_im * a2_re;
958  spinorFloat B0_re = 0;
959  B0_re += gT00_re * b0_re;
960  B0_re -= gT00_im * b0_im;
961  B0_re += gT01_re * b1_re;
962  B0_re -= gT01_im * b1_im;
963  B0_re += gT02_re * b2_re;
964  B0_re -= gT02_im * b2_im;
965  spinorFloat B0_im = 0;
966  B0_im += gT00_re * b0_im;
967  B0_im += gT00_im * b0_re;
968  B0_im += gT01_re * b1_im;
969  B0_im += gT01_im * b1_re;
970  B0_im += gT02_re * b2_im;
971  B0_im += gT02_im * b2_re;
972 
973  // multiply row 1
974  spinorFloat A1_re = 0;
975  A1_re += gT10_re * a0_re;
976  A1_re -= gT10_im * a0_im;
977  A1_re += gT11_re * a1_re;
978  A1_re -= gT11_im * a1_im;
979  A1_re += gT12_re * a2_re;
980  A1_re -= gT12_im * a2_im;
981  spinorFloat A1_im = 0;
982  A1_im += gT10_re * a0_im;
983  A1_im += gT10_im * a0_re;
984  A1_im += gT11_re * a1_im;
985  A1_im += gT11_im * a1_re;
986  A1_im += gT12_re * a2_im;
987  A1_im += gT12_im * a2_re;
988  spinorFloat B1_re = 0;
989  B1_re += gT10_re * b0_re;
990  B1_re -= gT10_im * b0_im;
991  B1_re += gT11_re * b1_re;
992  B1_re -= gT11_im * b1_im;
993  B1_re += gT12_re * b2_re;
994  B1_re -= gT12_im * b2_im;
995  spinorFloat B1_im = 0;
996  B1_im += gT10_re * b0_im;
997  B1_im += gT10_im * b0_re;
998  B1_im += gT11_re * b1_im;
999  B1_im += gT11_im * b1_re;
1000  B1_im += gT12_re * b2_im;
1001  B1_im += gT12_im * b2_re;
1002 
1003  // multiply row 2
1004  spinorFloat A2_re = 0;
1005  A2_re += gT20_re * a0_re;
1006  A2_re -= gT20_im * a0_im;
1007  A2_re += gT21_re * a1_re;
1008  A2_re -= gT21_im * a1_im;
1009  A2_re += gT22_re * a2_re;
1010  A2_re -= gT22_im * a2_im;
1011  spinorFloat A2_im = 0;
1012  A2_im += gT20_re * a0_im;
1013  A2_im += gT20_im * a0_re;
1014  A2_im += gT21_re * a1_im;
1015  A2_im += gT21_im * a1_re;
1016  A2_im += gT22_re * a2_im;
1017  A2_im += gT22_im * a2_re;
1018  spinorFloat B2_re = 0;
1019  B2_re += gT20_re * b0_re;
1020  B2_re -= gT20_im * b0_im;
1021  B2_re += gT21_re * b1_re;
1022  B2_re -= gT21_im * b1_im;
1023  B2_re += gT22_re * b2_re;
1024  B2_re -= gT22_im * b2_im;
1025  spinorFloat B2_im = 0;
1026  B2_im += gT20_re * b0_im;
1027  B2_im += gT20_im * b0_re;
1028  B2_im += gT21_re * b1_im;
1029  B2_im += gT21_im * b1_re;
1030  B2_im += gT22_re * b2_im;
1031  B2_im += gT22_im * b2_re;
1032 
1033  o00_re += A0_re;
1034  o00_im += A0_im;
1035  o10_re += B0_re;
1036  o10_im += B0_im;
1037  o20_re -= B0_im;
1038  o20_im += B0_re;
1039  o30_re -= A0_im;
1040  o30_im += A0_re;
1041 
1042  o01_re += A1_re;
1043  o01_im += A1_im;
1044  o11_re += B1_re;
1045  o11_im += B1_im;
1046  o21_re -= B1_im;
1047  o21_im += B1_re;
1048  o31_re -= A1_im;
1049  o31_im += A1_re;
1050 
1051  o02_re += A2_re;
1052  o02_im += A2_im;
1053  o12_re += B2_re;
1054  o12_im += B2_im;
1055  o22_re -= B2_im;
1056  o22_im += B2_re;
1057  o32_re -= A2_im;
1058  o32_im += A2_re;
1059 
1060 }
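// The backward hop uses the Hermitian conjugate gT of the link stored at the
// neighbouring site (ga_idx = sp_idx, or Vh+face_idx for a ghost link) and the
// opposite-sign projector, so the lower spin components pick up the opposite
// phases (+i B and +i A here).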
1061 
1062 #ifdef MULTI_GPU
1063 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1064  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1065 #endif
1066 {
1067  // Projector P1+
1068  // 1 0 0 1
1069  // 0 1 -1 0
1070  // 0 -1 1 0
1071  // 1 0 0 1
1072 
1073 #ifdef MULTI_GPU
1074  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1075  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1076 #else
1077  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1078 #endif
1079 
1080  const int ga_idx = sid;
1081 
1088 
1089 #ifdef MULTI_GPU
1090  if (kernel_type == INTERIOR_KERNEL) {
1091 #endif
1092 
1093  // read spinor from device memory
1094  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1095 
1096  // project spinor into half spinors
1097  a0_re = +i00_re+i30_re;
1098  a0_im = +i00_im+i30_im;
1099  a1_re = +i01_re+i31_re;
1100  a1_im = +i01_im+i31_im;
1101  a2_re = +i02_re+i32_re;
1102  a2_im = +i02_im+i32_im;
1103  b0_re = +i10_re-i20_re;
1104  b0_im = +i10_im-i20_im;
1105  b1_re = +i11_re-i21_re;
1106  b1_im = +i11_im-i21_im;
1107  b2_re = +i12_re-i22_re;
1108  b2_im = +i12_im-i22_im;
1109 
1110 #ifdef MULTI_GPU
1111  } else {
1112 
1113  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1114 
1115  // read half spinor from device memory
1116  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1117 
1118  a0_re = i00_re; a0_im = i00_im;
1119  a1_re = i01_re; a1_im = i01_im;
1120  a2_re = i02_re; a2_im = i02_im;
1121  b0_re = i10_re; b0_im = i10_im;
1122  b1_re = i11_re; b1_im = i11_im;
1123  b2_re = i12_re; b2_im = i12_im;
1124 
1125  }
1126 #endif // MULTI_GPU
1127 
1128  // read gauge matrix from device memory
1129  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1130 
1131  // reconstruct gauge matrix
1133 
1134  // multiply row 0
1135  spinorFloat A0_re = 0;
1136  A0_re += g00_re * a0_re;
1137  A0_re -= g00_im * a0_im;
1138  A0_re += g01_re * a1_re;
1139  A0_re -= g01_im * a1_im;
1140  A0_re += g02_re * a2_re;
1141  A0_re -= g02_im * a2_im;
1142  spinorFloat A0_im = 0;
1143  A0_im += g00_re * a0_im;
1144  A0_im += g00_im * a0_re;
1145  A0_im += g01_re * a1_im;
1146  A0_im += g01_im * a1_re;
1147  A0_im += g02_re * a2_im;
1148  A0_im += g02_im * a2_re;
1149  spinorFloat B0_re = 0;
1150  B0_re += g00_re * b0_re;
1151  B0_re -= g00_im * b0_im;
1152  B0_re += g01_re * b1_re;
1153  B0_re -= g01_im * b1_im;
1154  B0_re += g02_re * b2_re;
1155  B0_re -= g02_im * b2_im;
1156  spinorFloat B0_im = 0;
1157  B0_im += g00_re * b0_im;
1158  B0_im += g00_im * b0_re;
1159  B0_im += g01_re * b1_im;
1160  B0_im += g01_im * b1_re;
1161  B0_im += g02_re * b2_im;
1162  B0_im += g02_im * b2_re;
1163 
1164  // multiply row 1
1165  spinorFloat A1_re = 0;
1166  A1_re += g10_re * a0_re;
1167  A1_re -= g10_im * a0_im;
1168  A1_re += g11_re * a1_re;
1169  A1_re -= g11_im * a1_im;
1170  A1_re += g12_re * a2_re;
1171  A1_re -= g12_im * a2_im;
1172  spinorFloat A1_im = 0;
1173  A1_im += g10_re * a0_im;
1174  A1_im += g10_im * a0_re;
1175  A1_im += g11_re * a1_im;
1176  A1_im += g11_im * a1_re;
1177  A1_im += g12_re * a2_im;
1178  A1_im += g12_im * a2_re;
1179  spinorFloat B1_re = 0;
1180  B1_re += g10_re * b0_re;
1181  B1_re -= g10_im * b0_im;
1182  B1_re += g11_re * b1_re;
1183  B1_re -= g11_im * b1_im;
1184  B1_re += g12_re * b2_re;
1185  B1_re -= g12_im * b2_im;
1186  spinorFloat B1_im = 0;
1187  B1_im += g10_re * b0_im;
1188  B1_im += g10_im * b0_re;
1189  B1_im += g11_re * b1_im;
1190  B1_im += g11_im * b1_re;
1191  B1_im += g12_re * b2_im;
1192  B1_im += g12_im * b2_re;
1193 
1194  // multiply row 2
1195  spinorFloat A2_re = 0;
1196  A2_re += g20_re * a0_re;
1197  A2_re -= g20_im * a0_im;
1198  A2_re += g21_re * a1_re;
1199  A2_re -= g21_im * a1_im;
1200  A2_re += g22_re * a2_re;
1201  A2_re -= g22_im * a2_im;
1202  spinorFloat A2_im = 0;
1203  A2_im += g20_re * a0_im;
1204  A2_im += g20_im * a0_re;
1205  A2_im += g21_re * a1_im;
1206  A2_im += g21_im * a1_re;
1207  A2_im += g22_re * a2_im;
1208  A2_im += g22_im * a2_re;
1209  spinorFloat B2_re = 0;
1210  B2_re += g20_re * b0_re;
1211  B2_re -= g20_im * b0_im;
1212  B2_re += g21_re * b1_re;
1213  B2_re -= g21_im * b1_im;
1214  B2_re += g22_re * b2_re;
1215  B2_re -= g22_im * b2_im;
1216  spinorFloat B2_im = 0;
1217  B2_im += g20_re * b0_im;
1218  B2_im += g20_im * b0_re;
1219  B2_im += g21_re * b1_im;
1220  B2_im += g21_im * b1_re;
1221  B2_im += g22_re * b2_im;
1222  B2_im += g22_im * b2_re;
1223 
1224  o00_re += A0_re;
1225  o00_im += A0_im;
1226  o10_re += B0_re;
1227  o10_im += B0_im;
1228  o20_re -= B0_re;
1229  o20_im -= B0_im;
1230  o30_re += A0_re;
1231  o30_im += A0_im;
1232 
1233  o01_re += A1_re;
1234  o01_im += A1_im;
1235  o11_re += B1_re;
1236  o11_im += B1_im;
1237  o21_re -= B1_re;
1238  o21_im -= B1_im;
1239  o31_re += A1_re;
1240  o31_im += A1_im;
1241 
1242  o02_re += A2_re;
1243  o02_im += A2_im;
1244  o12_re += B2_re;
1245  o12_im += B2_im;
1246  o22_re -= B2_re;
1247  o22_im -= B2_im;
1248  o32_re += A2_re;
1249  o32_im += A2_im;
1250 
1251 }
1252 
1253 #ifdef MULTI_GPU
1254 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1255  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1256 #endif
1257 {
1258  // Projector P1-
1259  // 1 0 0 -1
1260  // 0 1 1 0
1261  // 0 1 1 0
1262  // -1 0 0 1
1263 
1264 #ifdef MULTI_GPU
1265  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1266  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1267 #else
1268  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1269 #endif
1270 
1271 #ifdef MULTI_GPU
1272  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1273 #else
1274  const int ga_idx = sp_idx;
1275 #endif
1276 
1283 
1284 #ifdef MULTI_GPU
1285  if (kernel_type == INTERIOR_KERNEL) {
1286 #endif
1287 
1288  // read spinor from device memory
1289  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1290 
1291  // project spinor into half spinors
1292  a0_re = +i00_re-i30_re;
1293  a0_im = +i00_im-i30_im;
1294  a1_re = +i01_re-i31_re;
1295  a1_im = +i01_im-i31_im;
1296  a2_re = +i02_re-i32_re;
1297  a2_im = +i02_im-i32_im;
1298  b0_re = +i10_re+i20_re;
1299  b0_im = +i10_im+i20_im;
1300  b1_re = +i11_re+i21_re;
1301  b1_im = +i11_im+i21_im;
1302  b2_re = +i12_re+i22_re;
1303  b2_im = +i12_im+i22_im;
1304 
1305 #ifdef MULTI_GPU
1306  } else {
1307 
1308  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1309 
1310  // read half spinor from device memory
1311  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1312 
1313  a0_re = i00_re; a0_im = i00_im;
1314  a1_re = i01_re; a1_im = i01_im;
1315  a2_re = i02_re; a2_im = i02_im;
1316  b0_re = i10_re; b0_im = i10_im;
1317  b1_re = i11_re; b1_im = i11_im;
1318  b2_re = i12_re; b2_im = i12_im;
1319 
1320  }
1321 #endif // MULTI_GPU
1322 
1323  // read gauge matrix from device memory
1324  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1325 
1326  // reconstruct gauge matrix
1328 
1329  // multiply row 0
1330  spinorFloat A0_re = 0;
1331  A0_re += gT00_re * a0_re;
1332  A0_re -= gT00_im * a0_im;
1333  A0_re += gT01_re * a1_re;
1334  A0_re -= gT01_im * a1_im;
1335  A0_re += gT02_re * a2_re;
1336  A0_re -= gT02_im * a2_im;
1337  spinorFloat A0_im = 0;
1338  A0_im += gT00_re * a0_im;
1339  A0_im += gT00_im * a0_re;
1340  A0_im += gT01_re * a1_im;
1341  A0_im += gT01_im * a1_re;
1342  A0_im += gT02_re * a2_im;
1343  A0_im += gT02_im * a2_re;
1344  spinorFloat B0_re = 0;
1345  B0_re += gT00_re * b0_re;
1346  B0_re -= gT00_im * b0_im;
1347  B0_re += gT01_re * b1_re;
1348  B0_re -= gT01_im * b1_im;
1349  B0_re += gT02_re * b2_re;
1350  B0_re -= gT02_im * b2_im;
1351  spinorFloat B0_im = 0;
1352  B0_im += gT00_re * b0_im;
1353  B0_im += gT00_im * b0_re;
1354  B0_im += gT01_re * b1_im;
1355  B0_im += gT01_im * b1_re;
1356  B0_im += gT02_re * b2_im;
1357  B0_im += gT02_im * b2_re;
1358 
1359  // multiply row 1
1360  spinorFloat A1_re = 0;
1361  A1_re += gT10_re * a0_re;
1362  A1_re -= gT10_im * a0_im;
1363  A1_re += gT11_re * a1_re;
1364  A1_re -= gT11_im * a1_im;
1365  A1_re += gT12_re * a2_re;
1366  A1_re -= gT12_im * a2_im;
1367  spinorFloat A1_im = 0;
1368  A1_im += gT10_re * a0_im;
1369  A1_im += gT10_im * a0_re;
1370  A1_im += gT11_re * a1_im;
1371  A1_im += gT11_im * a1_re;
1372  A1_im += gT12_re * a2_im;
1373  A1_im += gT12_im * a2_re;
1374  spinorFloat B1_re = 0;
1375  B1_re += gT10_re * b0_re;
1376  B1_re -= gT10_im * b0_im;
1377  B1_re += gT11_re * b1_re;
1378  B1_re -= gT11_im * b1_im;
1379  B1_re += gT12_re * b2_re;
1380  B1_re -= gT12_im * b2_im;
1381  spinorFloat B1_im = 0;
1382  B1_im += gT10_re * b0_im;
1383  B1_im += gT10_im * b0_re;
1384  B1_im += gT11_re * b1_im;
1385  B1_im += gT11_im * b1_re;
1386  B1_im += gT12_re * b2_im;
1387  B1_im += gT12_im * b2_re;
1388 
1389  // multiply row 2
1390  spinorFloat A2_re = 0;
1391  A2_re += gT20_re * a0_re;
1392  A2_re -= gT20_im * a0_im;
1393  A2_re += gT21_re * a1_re;
1394  A2_re -= gT21_im * a1_im;
1395  A2_re += gT22_re * a2_re;
1396  A2_re -= gT22_im * a2_im;
1397  spinorFloat A2_im = 0;
1398  A2_im += gT20_re * a0_im;
1399  A2_im += gT20_im * a0_re;
1400  A2_im += gT21_re * a1_im;
1401  A2_im += gT21_im * a1_re;
1402  A2_im += gT22_re * a2_im;
1403  A2_im += gT22_im * a2_re;
1404  spinorFloat B2_re = 0;
1405  B2_re += gT20_re * b0_re;
1406  B2_re -= gT20_im * b0_im;
1407  B2_re += gT21_re * b1_re;
1408  B2_re -= gT21_im * b1_im;
1409  B2_re += gT22_re * b2_re;
1410  B2_re -= gT22_im * b2_im;
1411  spinorFloat B2_im = 0;
1412  B2_im += gT20_re * b0_im;
1413  B2_im += gT20_im * b0_re;
1414  B2_im += gT21_re * b1_im;
1415  B2_im += gT21_im * b1_re;
1416  B2_im += gT22_re * b2_im;
1417  B2_im += gT22_im * b2_re;
1418 
1419  o00_re += A0_re;
1420  o00_im += A0_im;
1421  o10_re += B0_re;
1422  o10_im += B0_im;
1423  o20_re += B0_re;
1424  o20_im += B0_im;
1425  o30_re -= A0_re;
1426  o30_im -= A0_im;
1427 
1428  o01_re += A1_re;
1429  o01_im += A1_im;
1430  o11_re += B1_re;
1431  o11_im += B1_im;
1432  o21_re += B1_re;
1433  o21_im += B1_im;
1434  o31_re -= A1_re;
1435  o31_im -= A1_im;
1436 
1437  o02_re += A2_re;
1438  o02_im += A2_im;
1439  o12_re += B2_re;
1440  o12_im += B2_im;
1441  o22_re += B2_re;
1442  o22_im += B2_im;
1443  o32_re -= A2_re;
1444  o32_im -= A2_im;
1445 
1446 }
1447 
1448 #ifdef MULTI_GPU
1449 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1450  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1451 #endif
1452 {
1453  // Projector P2+
1454  // 1 0 i 0
1455  // 0 1 0 -i
1456  // -i 0 1 0
1457  // 0 i 0 1
1458 
1459 #ifdef MULTI_GPU
1460  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1461  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1462 #else
1463  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1464 #endif
1465 
1466  const int ga_idx = sid;
1467 
1474 
1475 #ifdef MULTI_GPU
1476  if (kernel_type == INTERIOR_KERNEL) {
1477 #endif
1478 
1479  // read spinor from device memory
1480  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1481 
1482  // project spinor into half spinors
1483  a0_re = +i00_re-i20_im;
1484  a0_im = +i00_im+i20_re;
1485  a1_re = +i01_re-i21_im;
1486  a1_im = +i01_im+i21_re;
1487  a2_re = +i02_re-i22_im;
1488  a2_im = +i02_im+i22_re;
1489  b0_re = +i10_re+i30_im;
1490  b0_im = +i10_im-i30_re;
1491  b1_re = +i11_re+i31_im;
1492  b1_im = +i11_im-i31_re;
1493  b2_re = +i12_re+i32_im;
1494  b2_im = +i12_im-i32_re;
1495 
1496 #ifdef MULTI_GPU
1497  } else {
1498 
1499  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1500 
1501  // read half spinor from device memory
1502  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1503 
1504  a0_re = i00_re; a0_im = i00_im;
1505  a1_re = i01_re; a1_im = i01_im;
1506  a2_re = i02_re; a2_im = i02_im;
1507  b0_re = i10_re; b0_im = i10_im;
1508  b1_re = i11_re; b1_im = i11_im;
1509  b2_re = i12_re; b2_im = i12_im;
1510 
1511  }
1512 #endif // MULTI_GPU
1513 
1514  // read gauge matrix from device memory
1515  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1516 
1517  // reconstruct gauge matrix
1519 
1520  // multiply row 0
1521  spinorFloat A0_re = 0;
1522  A0_re += g00_re * a0_re;
1523  A0_re -= g00_im * a0_im;
1524  A0_re += g01_re * a1_re;
1525  A0_re -= g01_im * a1_im;
1526  A0_re += g02_re * a2_re;
1527  A0_re -= g02_im * a2_im;
1528  spinorFloat A0_im = 0;
1529  A0_im += g00_re * a0_im;
1530  A0_im += g00_im * a0_re;
1531  A0_im += g01_re * a1_im;
1532  A0_im += g01_im * a1_re;
1533  A0_im += g02_re * a2_im;
1534  A0_im += g02_im * a2_re;
1535  spinorFloat B0_re = 0;
1536  B0_re += g00_re * b0_re;
1537  B0_re -= g00_im * b0_im;
1538  B0_re += g01_re * b1_re;
1539  B0_re -= g01_im * b1_im;
1540  B0_re += g02_re * b2_re;
1541  B0_re -= g02_im * b2_im;
1542  spinorFloat B0_im = 0;
1543  B0_im += g00_re * b0_im;
1544  B0_im += g00_im * b0_re;
1545  B0_im += g01_re * b1_im;
1546  B0_im += g01_im * b1_re;
1547  B0_im += g02_re * b2_im;
1548  B0_im += g02_im * b2_re;
1549 
1550  // multiply row 1
1551  spinorFloat A1_re = 0;
1552  A1_re += g10_re * a0_re;
1553  A1_re -= g10_im * a0_im;
1554  A1_re += g11_re * a1_re;
1555  A1_re -= g11_im * a1_im;
1556  A1_re += g12_re * a2_re;
1557  A1_re -= g12_im * a2_im;
1558  spinorFloat A1_im = 0;
1559  A1_im += g10_re * a0_im;
1560  A1_im += g10_im * a0_re;
1561  A1_im += g11_re * a1_im;
1562  A1_im += g11_im * a1_re;
1563  A1_im += g12_re * a2_im;
1564  A1_im += g12_im * a2_re;
1565  spinorFloat B1_re = 0;
1566  B1_re += g10_re * b0_re;
1567  B1_re -= g10_im * b0_im;
1568  B1_re += g11_re * b1_re;
1569  B1_re -= g11_im * b1_im;
1570  B1_re += g12_re * b2_re;
1571  B1_re -= g12_im * b2_im;
1572  spinorFloat B1_im = 0;
1573  B1_im += g10_re * b0_im;
1574  B1_im += g10_im * b0_re;
1575  B1_im += g11_re * b1_im;
1576  B1_im += g11_im * b1_re;
1577  B1_im += g12_re * b2_im;
1578  B1_im += g12_im * b2_re;
1579 
1580  // multiply row 2
1581  spinorFloat A2_re = 0;
1582  A2_re += g20_re * a0_re;
1583  A2_re -= g20_im * a0_im;
1584  A2_re += g21_re * a1_re;
1585  A2_re -= g21_im * a1_im;
1586  A2_re += g22_re * a2_re;
1587  A2_re -= g22_im * a2_im;
1588  spinorFloat A2_im = 0;
1589  A2_im += g20_re * a0_im;
1590  A2_im += g20_im * a0_re;
1591  A2_im += g21_re * a1_im;
1592  A2_im += g21_im * a1_re;
1593  A2_im += g22_re * a2_im;
1594  A2_im += g22_im * a2_re;
1595  spinorFloat B2_re = 0;
1596  B2_re += g20_re * b0_re;
1597  B2_re -= g20_im * b0_im;
1598  B2_re += g21_re * b1_re;
1599  B2_re -= g21_im * b1_im;
1600  B2_re += g22_re * b2_re;
1601  B2_re -= g22_im * b2_im;
1602  spinorFloat B2_im = 0;
1603  B2_im += g20_re * b0_im;
1604  B2_im += g20_im * b0_re;
1605  B2_im += g21_re * b1_im;
1606  B2_im += g21_im * b1_re;
1607  B2_im += g22_re * b2_im;
1608  B2_im += g22_im * b2_re;
1609 
1610  o00_re += A0_re;
1611  o00_im += A0_im;
1612  o10_re += B0_re;
1613  o10_im += B0_im;
1614  o20_re += A0_im;
1615  o20_im -= A0_re;
1616  o30_re -= B0_im;
1617  o30_im += B0_re;
1618 
1619  o01_re += A1_re;
1620  o01_im += A1_im;
1621  o11_re += B1_re;
1622  o11_im += B1_im;
1623  o21_re += A1_im;
1624  o21_im -= A1_re;
1625  o31_re -= B1_im;
1626  o31_im += B1_re;
1627 
1628  o02_re += A2_re;
1629  o02_im += A2_im;
1630  o12_re += B2_re;
1631  o12_im += B2_im;
1632  o22_re += A2_im;
1633  o22_im -= A2_re;
1634  o32_re -= B2_im;
1635  o32_im += B2_re;
1636 
1637 }
1638 
1639 #ifdef MULTI_GPU
1640 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1641  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1642 #endif
1643 {
1644  // Projector P2-
1645  // 1 0 -i 0
1646  // 0 1 0 i
1647  // i 0 1 0
1648  // 0 -i 0 1
1649 
1650 #ifdef MULTI_GPU
1651  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1652  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1653 #else
1654  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1655 #endif
1656 
1657 #ifdef MULTI_GPU
1658  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1659 #else
1660  const int ga_idx = sp_idx;
1661 #endif
1662 
1669 
1670 #ifdef MULTI_GPU
1671  if (kernel_type == INTERIOR_KERNEL) {
1672 #endif
1673 
1674  // read spinor from device memory
1675  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1676 
1677  // project spinor into half spinors
1678  a0_re = +i00_re+i20_im;
1679  a0_im = +i00_im-i20_re;
1680  a1_re = +i01_re+i21_im;
1681  a1_im = +i01_im-i21_re;
1682  a2_re = +i02_re+i22_im;
1683  a2_im = +i02_im-i22_re;
1684  b0_re = +i10_re-i30_im;
1685  b0_im = +i10_im+i30_re;
1686  b1_re = +i11_re-i31_im;
1687  b1_im = +i11_im+i31_re;
1688  b2_re = +i12_re-i32_im;
1689  b2_im = +i12_im+i32_re;
1690 
1691 #ifdef MULTI_GPU
1692  } else {
1693 
1694  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1695 
1696  // read half spinor from device memory
1697  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1698 
1699  a0_re = i00_re; a0_im = i00_im;
1700  a1_re = i01_re; a1_im = i01_im;
1701  a2_re = i02_re; a2_im = i02_im;
1702  b0_re = i10_re; b0_im = i10_im;
1703  b1_re = i11_re; b1_im = i11_im;
1704  b2_re = i12_re; b2_im = i12_im;
1705 
1706  }
1707 #endif // MULTI_GPU
1708 
1709  // read gauge matrix from device memory
1710  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1711 
1712  // reconstruct gauge matrix
1714 
1715  // multiply row 0
1716  spinorFloat A0_re = 0;
1717  A0_re += gT00_re * a0_re;
1718  A0_re -= gT00_im * a0_im;
1719  A0_re += gT01_re * a1_re;
1720  A0_re -= gT01_im * a1_im;
1721  A0_re += gT02_re * a2_re;
1722  A0_re -= gT02_im * a2_im;
1723  spinorFloat A0_im = 0;
1724  A0_im += gT00_re * a0_im;
1725  A0_im += gT00_im * a0_re;
1726  A0_im += gT01_re * a1_im;
1727  A0_im += gT01_im * a1_re;
1728  A0_im += gT02_re * a2_im;
1729  A0_im += gT02_im * a2_re;
1730  spinorFloat B0_re = 0;
1731  B0_re += gT00_re * b0_re;
1732  B0_re -= gT00_im * b0_im;
1733  B0_re += gT01_re * b1_re;
1734  B0_re -= gT01_im * b1_im;
1735  B0_re += gT02_re * b2_re;
1736  B0_re -= gT02_im * b2_im;
1737  spinorFloat B0_im = 0;
1738  B0_im += gT00_re * b0_im;
1739  B0_im += gT00_im * b0_re;
1740  B0_im += gT01_re * b1_im;
1741  B0_im += gT01_im * b1_re;
1742  B0_im += gT02_re * b2_im;
1743  B0_im += gT02_im * b2_re;
1744 
1745  // multiply row 1
1746  spinorFloat A1_re = 0;
1747  A1_re += gT10_re * a0_re;
1748  A1_re -= gT10_im * a0_im;
1749  A1_re += gT11_re * a1_re;
1750  A1_re -= gT11_im * a1_im;
1751  A1_re += gT12_re * a2_re;
1752  A1_re -= gT12_im * a2_im;
1753  spinorFloat A1_im = 0;
1754  A1_im += gT10_re * a0_im;
1755  A1_im += gT10_im * a0_re;
1756  A1_im += gT11_re * a1_im;
1757  A1_im += gT11_im * a1_re;
1758  A1_im += gT12_re * a2_im;
1759  A1_im += gT12_im * a2_re;
1760  spinorFloat B1_re = 0;
1761  B1_re += gT10_re * b0_re;
1762  B1_re -= gT10_im * b0_im;
1763  B1_re += gT11_re * b1_re;
1764  B1_re -= gT11_im * b1_im;
1765  B1_re += gT12_re * b2_re;
1766  B1_re -= gT12_im * b2_im;
1767  spinorFloat B1_im = 0;
1768  B1_im += gT10_re * b0_im;
1769  B1_im += gT10_im * b0_re;
1770  B1_im += gT11_re * b1_im;
1771  B1_im += gT11_im * b1_re;
1772  B1_im += gT12_re * b2_im;
1773  B1_im += gT12_im * b2_re;
1774 
1775  // multiply row 2
1776  spinorFloat A2_re = 0;
1777  A2_re += gT20_re * a0_re;
1778  A2_re -= gT20_im * a0_im;
1779  A2_re += gT21_re * a1_re;
1780  A2_re -= gT21_im * a1_im;
1781  A2_re += gT22_re * a2_re;
1782  A2_re -= gT22_im * a2_im;
1783  spinorFloat A2_im = 0;
1784  A2_im += gT20_re * a0_im;
1785  A2_im += gT20_im * a0_re;
1786  A2_im += gT21_re * a1_im;
1787  A2_im += gT21_im * a1_re;
1788  A2_im += gT22_re * a2_im;
1789  A2_im += gT22_im * a2_re;
1790  spinorFloat B2_re = 0;
1791  B2_re += gT20_re * b0_re;
1792  B2_re -= gT20_im * b0_im;
1793  B2_re += gT21_re * b1_re;
1794  B2_re -= gT21_im * b1_im;
1795  B2_re += gT22_re * b2_re;
1796  B2_re -= gT22_im * b2_im;
1797  spinorFloat B2_im = 0;
1798  B2_im += gT20_re * b0_im;
1799  B2_im += gT20_im * b0_re;
1800  B2_im += gT21_re * b1_im;
1801  B2_im += gT21_im * b1_re;
1802  B2_im += gT22_re * b2_im;
1803  B2_im += gT22_im * b2_re;
1804 
1805  o00_re += A0_re;
1806  o00_im += A0_im;
1807  o10_re += B0_re;
1808  o10_im += B0_im;
1809  o20_re -= A0_im;
1810  o20_im += A0_re;
1811  o30_re += B0_im;
1812  o30_im -= B0_re;
1813 
1814  o01_re += A1_re;
1815  o01_im += A1_im;
1816  o11_re += B1_re;
1817  o11_im += B1_im;
1818  o21_re -= A1_im;
1819  o21_im += A1_re;
1820  o31_re += B1_im;
1821  o31_im -= B1_re;
1822 
1823  o02_re += A2_re;
1824  o02_im += A2_im;
1825  o12_re += B2_re;
1826  o12_im += B2_im;
1827  o22_re -= A2_im;
1828  o22_im += A2_re;
1829  o32_re += B2_im;
1830  o32_im -= B2_re;
1831 
1832 }
1833 
1834 #ifdef MULTI_GPU
1835 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1836  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1837 #endif
1838 {
1839  // Projector P3+
1840  // 2 0 0 0
1841  // 0 2 0 0
1842  // 0 0 0 0
1843  // 0 0 0 0
1844 
1845 #ifdef MULTI_GPU
1846  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1847  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1848 #else
1849  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1850 #endif
1851 
1852  const int ga_idx = sid;
1853 
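// Under temporal gauge fixing (gauge_fixed) the time-direction links are
// assumed to be unit matrices on all but the last time slice, so for sites
// with ga_idx below that slice the SU(3) multiplication is skipped and the
// projected half spinors are accumulated directly (the "identity gauge
// matrix" branch below).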
1854  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1855 {
1862 
1863 #ifdef MULTI_GPU
1864  if (kernel_type == INTERIOR_KERNEL) {
1865 #endif
1866 
1867  // read spinor from device memory
1868  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1869 
1870  // project spinor into half spinors
1871  a0_re = +2*i00_re;
1872  a0_im = +2*i00_im;
1873  a1_re = +2*i01_re;
1874  a1_im = +2*i01_im;
1875  a2_re = +2*i02_re;
1876  a2_im = +2*i02_im;
1877  b0_re = +2*i10_re;
1878  b0_im = +2*i10_im;
1879  b1_re = +2*i11_re;
1880  b1_im = +2*i11_im;
1881  b2_re = +2*i12_re;
1882  b2_im = +2*i12_im;
1883 
1884 #ifdef MULTI_GPU
1885  } else {
1886 
1887  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1888  const int t_proj_scale = TPROJSCALE;
1889 
1890  // read half spinor from device memory
1891  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1892 
1893  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1894  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1895  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1896  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1897  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1898  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1899 
1900  }
1901 #endif // MULTI_GPU
1902 
1903  // identity gauge matrix
1910 
1911  o00_re += A0_re;
1912  o00_im += A0_im;
1913  o10_re += B0_re;
1914  o10_im += B0_im;
1915 
1916  o01_re += A1_re;
1917  o01_im += A1_im;
1918  o11_re += B1_re;
1919  o11_im += B1_im;
1920 
1921  o02_re += A2_re;
1922  o02_im += A2_im;
1923  o12_re += B2_re;
1924  o12_im += B2_im;
1925 
1926  } else {
1933 
1934 #ifdef MULTI_GPU
1935  if (kernel_type == INTERIOR_KERNEL) {
1936 #endif
1937 
1938  // read spinor from device memory
1939  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1940 
1941  // project spinor into half spinors
1942  a0_re = +2*i00_re;
1943  a0_im = +2*i00_im;
1944  a1_re = +2*i01_re;
1945  a1_im = +2*i01_im;
1946  a2_re = +2*i02_re;
1947  a2_im = +2*i02_im;
1948  b0_re = +2*i10_re;
1949  b0_im = +2*i10_im;
1950  b1_re = +2*i11_re;
1951  b1_im = +2*i11_im;
1952  b2_re = +2*i12_re;
1953  b2_im = +2*i12_im;
1954 
1955 #ifdef MULTI_GPU
1956  } else {
1957 
1958  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1959  const int t_proj_scale = TPROJSCALE;
1960 
1961  // read half spinor from device memory
1962  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1963 
1964  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1965  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1966  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1967  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1968  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1969  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1970 
1971  }
1972 #endif // MULTI_GPU
1973 
1974  // read gauge matrix from device memory
1975  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1976 
1977  // reconstruct gauge matrix
1979 
1980  // multiply row 0
1981  spinorFloat A0_re = 0;
1982  A0_re += g00_re * a0_re;
1983  A0_re -= g00_im * a0_im;
1984  A0_re += g01_re * a1_re;
1985  A0_re -= g01_im * a1_im;
1986  A0_re += g02_re * a2_re;
1987  A0_re -= g02_im * a2_im;
1988  spinorFloat A0_im = 0;
1989  A0_im += g00_re * a0_im;
1990  A0_im += g00_im * a0_re;
1991  A0_im += g01_re * a1_im;
1992  A0_im += g01_im * a1_re;
1993  A0_im += g02_re * a2_im;
1994  A0_im += g02_im * a2_re;
1995  spinorFloat B0_re = 0;
1996  B0_re += g00_re * b0_re;
1997  B0_re -= g00_im * b0_im;
1998  B0_re += g01_re * b1_re;
1999  B0_re -= g01_im * b1_im;
2000  B0_re += g02_re * b2_re;
2001  B0_re -= g02_im * b2_im;
2002  spinorFloat B0_im = 0;
2003  B0_im += g00_re * b0_im;
2004  B0_im += g00_im * b0_re;
2005  B0_im += g01_re * b1_im;
2006  B0_im += g01_im * b1_re;
2007  B0_im += g02_re * b2_im;
2008  B0_im += g02_im * b2_re;
2009 
2010  // multiply row 1
2011  spinorFloat A1_re = 0;
2012  A1_re += g10_re * a0_re;
2013  A1_re -= g10_im * a0_im;
2014  A1_re += g11_re * a1_re;
2015  A1_re -= g11_im * a1_im;
2016  A1_re += g12_re * a2_re;
2017  A1_re -= g12_im * a2_im;
2018  spinorFloat A1_im = 0;
2019  A1_im += g10_re * a0_im;
2020  A1_im += g10_im * a0_re;
2021  A1_im += g11_re * a1_im;
2022  A1_im += g11_im * a1_re;
2023  A1_im += g12_re * a2_im;
2024  A1_im += g12_im * a2_re;
2025  spinorFloat B1_re = 0;
2026  B1_re += g10_re * b0_re;
2027  B1_re -= g10_im * b0_im;
2028  B1_re += g11_re * b1_re;
2029  B1_re -= g11_im * b1_im;
2030  B1_re += g12_re * b2_re;
2031  B1_re -= g12_im * b2_im;
2032  spinorFloat B1_im = 0;
2033  B1_im += g10_re * b0_im;
2034  B1_im += g10_im * b0_re;
2035  B1_im += g11_re * b1_im;
2036  B1_im += g11_im * b1_re;
2037  B1_im += g12_re * b2_im;
2038  B1_im += g12_im * b2_re;
2039 
2040  // multiply row 2
2041  spinorFloat A2_re = 0;
2042  A2_re += g20_re * a0_re;
2043  A2_re -= g20_im * a0_im;
2044  A2_re += g21_re * a1_re;
2045  A2_re -= g21_im * a1_im;
2046  A2_re += g22_re * a2_re;
2047  A2_re -= g22_im * a2_im;
2048  spinorFloat A2_im = 0;
2049  A2_im += g20_re * a0_im;
2050  A2_im += g20_im * a0_re;
2051  A2_im += g21_re * a1_im;
2052  A2_im += g21_im * a1_re;
2053  A2_im += g22_re * a2_im;
2054  A2_im += g22_im * a2_re;
2055  spinorFloat B2_re = 0;
2056  B2_re += g20_re * b0_re;
2057  B2_re -= g20_im * b0_im;
2058  B2_re += g21_re * b1_re;
2059  B2_re -= g21_im * b1_im;
2060  B2_re += g22_re * b2_re;
2061  B2_re -= g22_im * b2_im;
2062  spinorFloat B2_im = 0;
2063  B2_im += g20_re * b0_im;
2064  B2_im += g20_im * b0_re;
2065  B2_im += g21_re * b1_im;
2066  B2_im += g21_im * b1_re;
2067  B2_im += g22_re * b2_im;
2068  B2_im += g22_im * b2_re;
2069 
2070  o00_re += A0_re;
2071  o00_im += A0_im;
2072  o10_re += B0_re;
2073  o10_im += B0_im;
2074 
2075  o01_re += A1_re;
2076  o01_im += A1_im;
2077  o11_re += B1_re;
2078  o11_im += B1_im;
2079 
2080  o02_re += A2_re;
2081  o02_im += A2_im;
2082  o12_re += B2_re;
2083  o12_im += B2_im;
2084 
2085  }
2086 }
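// For the time direction the projectors are diagonal in this basis and of
// rank two (the matrices of 2s in the comments above), so only the upper or
// lower two spin components are read (READ_SPINOR_UP / READ_SPINOR_DOWN) and
// scaled by 2; ghost-zone spinors arrive already spin projected and are
// scaled by TPROJSCALE instead.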
2087 
2088 #ifdef MULTI_GPU
2089 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2090  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2091 #endif
2092 {
2093  // Projector P3-
2094  // 0 0 0 0
2095  // 0 0 0 0
2096  // 0 0 2 0
2097  // 0 0 0 2
2098 
2099 #ifdef MULTI_GPU
2100  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2101  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2102 #else
2103  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2104 #endif
2105 
2106 #ifdef MULTI_GPU
2107  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2108 #else
2109  const int ga_idx = sp_idx;
2110 #endif
2111 
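// Annotation: neighbour indexing for the -T hop. X is the full lexicographic
// site index, X3X2X1 the volume of one time slice, and X4X3X2X1mX3X2X1 =
// X4*X3*X2*X1 - X3*X2*X1, so a site with x4 == 0 wraps to the last time
// slice; the final >> 1 converts the full index into a checkerboard
// (half-lattice) index. In the exterior T kernel the spinor is instead read
// from the ghost buffer at param.ghostOffset, addressed by face_idx.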
2112  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2113  {
2114  spinorFloat a0_re, a0_im;
2115  spinorFloat a1_re, a1_im;
2116  spinorFloat a2_re, a2_im;
2117  spinorFloat b0_re, b0_im;
2118  spinorFloat b1_re, b1_im;
2119  spinorFloat b2_re, b2_im;
2120 
2121 #ifdef MULTI_GPU
2122  if (kernel_type == INTERIOR_KERNEL) {
2123 #endif
2124 
2125  // read spinor from device memory
2126  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2127 
2128  // project spinor into half spinors
2129  a0_re = +2*i20_re;
2130  a0_im = +2*i20_im;
2131  a1_re = +2*i21_re;
2132  a1_im = +2*i21_im;
2133  a2_re = +2*i22_re;
2134  a2_im = +2*i22_im;
2135  b0_re = +2*i30_re;
2136  b0_im = +2*i30_im;
2137  b1_re = +2*i31_re;
2138  b1_im = +2*i31_im;
2139  b2_re = +2*i32_re;
2140  b2_im = +2*i32_im;
2141 
2142 #ifdef MULTI_GPU
2143  } else {
2144 
2145  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2146  const int t_proj_scale = TPROJSCALE;
2147 
2148  // read half spinor from device memory
2149  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2150 
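      // Annotation: exterior path. The T face buffer already holds the
      // projected half spinor, so only the i0x/i1x components are read and
      // then rescaled by t_proj_scale (TPROJSCALE), which presumably
      // compensates for the projector normalization used when the face was
      // packed; the interior path above applies the factor of 2 explicitly.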
2151  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2152  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2153  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2154  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2155  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2156  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2157 
2158  }
2159 #endif // MULTI_GPU
2160 
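      // Annotation: gauge_fixed path. With temporal gauge fixing the
      // time-direction links are unit matrices away from the last time slice
      // (ga_idx < X4X3X2X1hmX3X2X1h), so the link multiplication reduces to a
      // copy of the projected half spinors.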
2161  // identity gauge matrix
2162  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2163  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2164  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2165  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2166  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2167  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2168 
2169  o20_re += A0_re;
2170  o20_im += A0_im;
2171  o30_re += B0_re;
2172  o30_im += B0_im;
2173 
2174  o21_re += A1_re;
2175  o21_im += A1_im;
2176  o31_re += B1_re;
2177  o31_im += B1_im;
2178 
2179  o22_re += A2_re;
2180  o22_im += A2_im;
2181  o32_re += B2_re;
2182  o32_im += B2_im;
2183 
2184  } else {
2185  spinorFloat a0_re, a0_im;
2186  spinorFloat a1_re, a1_im;
2187  spinorFloat a2_re, a2_im;
2188  spinorFloat b0_re, b0_im;
2189  spinorFloat b1_re, b1_im;
2190  spinorFloat b2_re, b2_im;
2191 
2192 #ifdef MULTI_GPU
2193  if (kernel_type == INTERIOR_KERNEL) {
2194 #endif
2195 
2196  // read spinor from device memory
2197  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2198 
2199  // project spinor into half spinors
2200  a0_re = +2*i20_re;
2201  a0_im = +2*i20_im;
2202  a1_re = +2*i21_re;
2203  a1_im = +2*i21_im;
2204  a2_re = +2*i22_re;
2205  a2_im = +2*i22_im;
2206  b0_re = +2*i30_re;
2207  b0_im = +2*i30_im;
2208  b1_re = +2*i31_re;
2209  b1_im = +2*i31_im;
2210  b2_re = +2*i32_re;
2211  b2_im = +2*i32_im;
2212 
2213 #ifdef MULTI_GPU
2214  } else {
2215 
2216  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2217  const int t_proj_scale = TPROJSCALE;
2218 
2219  // read half spinor from device memory
2220  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2221 
2222  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2223  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2224  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2225  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2226  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2227  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2228 
2229  }
2230 #endif // MULTI_GPU
2231 
2232  // read gauge matrix from device memory
2233  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2234 
2235  // reconstruct gauge matrix
2236  RECONSTRUCT_GAUGE_MATRIX(7);
2237 
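// Annotation: backward hop. The link belongs to the neighbouring site and
// enters as its Hermitian conjugate, so the gT accessors below read the
// transposed, conjugated elements (gTij = conj(gji)); otherwise the multiply
// has the same complex multiply-add structure as the forward hop above.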
2238  // multiply row 0
2239  spinorFloat A0_re = 0;
2240  A0_re += gT00_re * a0_re;
2241  A0_re -= gT00_im * a0_im;
2242  A0_re += gT01_re * a1_re;
2243  A0_re -= gT01_im * a1_im;
2244  A0_re += gT02_re * a2_re;
2245  A0_re -= gT02_im * a2_im;
2246  spinorFloat A0_im = 0;
2247  A0_im += gT00_re * a0_im;
2248  A0_im += gT00_im * a0_re;
2249  A0_im += gT01_re * a1_im;
2250  A0_im += gT01_im * a1_re;
2251  A0_im += gT02_re * a2_im;
2252  A0_im += gT02_im * a2_re;
2253  spinorFloat B0_re = 0;
2254  B0_re += gT00_re * b0_re;
2255  B0_re -= gT00_im * b0_im;
2256  B0_re += gT01_re * b1_re;
2257  B0_re -= gT01_im * b1_im;
2258  B0_re += gT02_re * b2_re;
2259  B0_re -= gT02_im * b2_im;
2260  spinorFloat B0_im = 0;
2261  B0_im += gT00_re * b0_im;
2262  B0_im += gT00_im * b0_re;
2263  B0_im += gT01_re * b1_im;
2264  B0_im += gT01_im * b1_re;
2265  B0_im += gT02_re * b2_im;
2266  B0_im += gT02_im * b2_re;
2267 
2268  // multiply row 1
2269  spinorFloat A1_re = 0;
2270  A1_re += gT10_re * a0_re;
2271  A1_re -= gT10_im * a0_im;
2272  A1_re += gT11_re * a1_re;
2273  A1_re -= gT11_im * a1_im;
2274  A1_re += gT12_re * a2_re;
2275  A1_re -= gT12_im * a2_im;
2276  spinorFloat A1_im = 0;
2277  A1_im += gT10_re * a0_im;
2278  A1_im += gT10_im * a0_re;
2279  A1_im += gT11_re * a1_im;
2280  A1_im += gT11_im * a1_re;
2281  A1_im += gT12_re * a2_im;
2282  A1_im += gT12_im * a2_re;
2283  spinorFloat B1_re = 0;
2284  B1_re += gT10_re * b0_re;
2285  B1_re -= gT10_im * b0_im;
2286  B1_re += gT11_re * b1_re;
2287  B1_re -= gT11_im * b1_im;
2288  B1_re += gT12_re * b2_re;
2289  B1_re -= gT12_im * b2_im;
2290  spinorFloat B1_im = 0;
2291  B1_im += gT10_re * b0_im;
2292  B1_im += gT10_im * b0_re;
2293  B1_im += gT11_re * b1_im;
2294  B1_im += gT11_im * b1_re;
2295  B1_im += gT12_re * b2_im;
2296  B1_im += gT12_im * b2_re;
2297 
2298  // multiply row 2
2299  spinorFloat A2_re = 0;
2300  A2_re += gT20_re * a0_re;
2301  A2_re -= gT20_im * a0_im;
2302  A2_re += gT21_re * a1_re;
2303  A2_re -= gT21_im * a1_im;
2304  A2_re += gT22_re * a2_re;
2305  A2_re -= gT22_im * a2_im;
2306  spinorFloat A2_im = 0;
2307  A2_im += gT20_re * a0_im;
2308  A2_im += gT20_im * a0_re;
2309  A2_im += gT21_re * a1_im;
2310  A2_im += gT21_im * a1_re;
2311  A2_im += gT22_re * a2_im;
2312  A2_im += gT22_im * a2_re;
2313  spinorFloat B2_re = 0;
2314  B2_re += gT20_re * b0_re;
2315  B2_re -= gT20_im * b0_im;
2316  B2_re += gT21_re * b1_re;
2317  B2_re -= gT21_im * b1_im;
2318  B2_re += gT22_re * b2_re;
2319  B2_re -= gT22_im * b2_im;
2320  spinorFloat B2_im = 0;
2321  B2_im += gT20_re * b0_im;
2322  B2_im += gT20_im * b0_re;
2323  B2_im += gT21_re * b1_im;
2324  B2_im += gT21_im * b1_re;
2325  B2_im += gT22_re * b2_im;
2326  B2_im += gT22_im * b2_re;
2327 
2328  o20_re += A0_re;
2329  o20_im += A0_im;
2330  o30_re += B0_re;
2331  o30_im += B0_im;
2332 
2333  o21_re += A1_re;
2334  o21_im += A1_im;
2335  o31_re += B1_re;
2336  o31_im += B1_im;
2337 
2338  o22_re += A2_re;
2339  o22_im += A2_im;
2340  o32_re += B2_re;
2341  o32_im += B2_im;
2342 
2343  }
2344 }
2345 
2346 #ifdef MULTI_GPU
2347 
2348 int incomplete = 0; // Have all 8 contributions been computed for this site?
2349 
2350 switch(kernel_type) { // intentional fall-through
2351 case INTERIOR_KERNEL:
2352  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2353 case EXTERIOR_KERNEL_T:
2354  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2355 case EXTERIOR_KERNEL_Z:
2356  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2357 case EXTERIOR_KERNEL_Y:
2358  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2359 }
2360 
2361 if (!incomplete)
2362 #endif // MULTI_GPU
2363 {
2364 #ifdef DSLASH_XPAY
2365  READ_ACCUM(ACCUMTEX, param.sp_stride)
2366 
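  // Annotation: DSLASH_XPAY path. The accumulator spinor acc is read and the
  // hop sum is combined as o = b*o + acc. Unless the inverse twisted-clover
  // term was already folded into the dslash (CLOVER_TWIST_INV_DSLASH), it is
  // applied here: APPLY_CLOVER_TWIST_INV multiplies o by the inverse
  // (clover + twist) term, while the CLOVER_TWIST_XPAY variant instead
  // twists the accumulator via APPLY_CLOVER_TWIST(c, -a, acc).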
2367 #ifndef CLOVER_TWIST_INV_DSLASH
2368 #ifndef CLOVER_TWIST_XPAY
2369  // apply the inverse twisted-clover term first:
2370  APPLY_CLOVER_TWIST_INV(c, cinv, -a, o);
2371 #else
2372  APPLY_CLOVER_TWIST(c, -a, acc);
2373 #endif
2374 #endif
2375  o00_re = b*o00_re + acc00_re;
2376  o00_im = b*o00_im + acc00_im;
2377  o01_re = b*o01_re + acc01_re;
2378  o01_im = b*o01_im + acc01_im;
2379  o02_re = b*o02_re + acc02_re;
2380  o02_im = b*o02_im + acc02_im;
2381  o10_re = b*o10_re + acc10_re;
2382  o10_im = b*o10_im + acc10_im;
2383  o11_re = b*o11_re + acc11_re;
2384  o11_im = b*o11_im + acc11_im;
2385  o12_re = b*o12_re + acc12_re;
2386  o12_im = b*o12_im + acc12_im;
2387  o20_re = b*o20_re + acc20_re;
2388  o20_im = b*o20_im + acc20_im;
2389  o21_re = b*o21_re + acc21_re;
2390  o21_im = b*o21_im + acc21_im;
2391  o22_re = b*o22_re + acc22_re;
2392  o22_im = b*o22_im + acc22_im;
2393  o30_re = b*o30_re + acc30_re;
2394  o30_im = b*o30_im + acc30_im;
2395  o31_re = b*o31_re + acc31_re;
2396  o31_im = b*o31_im + acc31_im;
2397  o32_re = b*o32_re + acc32_re;
2398  o32_im = b*o32_im + acc32_im;
2399 #else //no XPAY
2400 #ifndef CLOVER_TWIST_INV_DSLASH
2401  APPLY_CLOVER_TWIST_INV(c, cinv, -a, o);
2402 #endif
2403 #endif
2404 }
2405 
2406 // write spinor field back to device memory
2407 WRITE_SPINOR(param.sp_stride);
2408 
2409 // undefine to prevent warning when precision is changed
2410 #undef spinorFloat
2411 #undef g00_re
2412 #undef g00_im
2413 #undef g01_re
2414 #undef g01_im
2415 #undef g02_re
2416 #undef g02_im
2417 #undef g10_re
2418 #undef g10_im
2419 #undef g11_re
2420 #undef g11_im
2421 #undef g12_re
2422 #undef g12_im
2423 #undef g20_re
2424 #undef g20_im
2425 #undef g21_re
2426 #undef g21_im
2427 #undef g22_re
2428 #undef g22_im
2429 
2430 #undef i00_re
2431 #undef i00_im
2432 #undef i01_re
2433 #undef i01_im
2434 #undef i02_re
2435 #undef i02_im
2436 #undef i10_re
2437 #undef i10_im
2438 #undef i11_re
2439 #undef i11_im
2440 #undef i12_re
2441 #undef i12_im
2442 #undef i20_re
2443 #undef i20_im
2444 #undef i21_re
2445 #undef i21_im
2446 #undef i22_re
2447 #undef i22_im
2448 #undef i30_re
2449 #undef i30_im
2450 #undef i31_re
2451 #undef i31_im
2452 #undef i32_re
2453 #undef i32_im
2454 
2455 #undef c00_00_re
2456 #undef c01_01_re
2457 #undef c02_02_re
2458 #undef c10_10_re
2459 #undef c11_11_re
2460 #undef c12_12_re
2461 #undef c01_00_re
2462 #undef c01_00_im
2463 #undef c02_00_re
2464 #undef c02_00_im
2465 #undef c10_00_re
2466 #undef c10_00_im
2467 #undef c11_00_re
2468 #undef c11_00_im
2469 #undef c12_00_re
2470 #undef c12_00_im
2471 #undef c02_01_re
2472 #undef c02_01_im
2473 #undef c10_01_re
2474 #undef c10_01_im
2475 #undef c11_01_re
2476 #undef c11_01_im
2477 #undef c12_01_re
2478 #undef c12_01_im
2479 #undef c10_02_re
2480 #undef c10_02_im
2481 #undef c11_02_re
2482 #undef c11_02_im
2483 #undef c12_02_re
2484 #undef c12_02_im
2485 #undef c11_10_re
2486 #undef c11_10_im
2487 #undef c12_10_re
2488 #undef c12_10_im
2489 #undef c12_11_re
2490 #undef c12_11_im
2491 
2492 #undef cinv00_00_re
2493 #undef cinv01_01_re
2494 #undef cinv02_02_re
2495 #undef cinv10_10_re
2496 #undef cinv11_11_re
2497 #undef cinv12_12_re
2498 #undef cinv01_00_re
2499 #undef cinv01_00_im
2500 #undef cinv02_00_re
2501 #undef cinv02_00_im
2502 #undef cinv10_00_re
2503 #undef cinv10_00_im
2504 #undef cinv11_00_re
2505 #undef cinv11_00_im
2506 #undef cinv12_00_re
2507 #undef cinv12_00_im
2508 #undef cinv02_01_re
2509 #undef cinv02_01_im
2510 #undef cinv10_01_re
2511 #undef cinv10_01_im
2512 #undef cinv11_01_re
2513 #undef cinv11_01_im
2514 #undef cinv12_01_re
2515 #undef cinv12_01_im
2516 #undef cinv10_02_re
2517 #undef cinv10_02_im
2518 #undef cinv11_02_re
2519 #undef cinv11_02_im
2520 #undef cinv12_02_re
2521 #undef cinv12_02_im
2522 #undef cinv11_10_re
2523 #undef cinv11_10_im
2524 #undef cinv12_10_re
2525 #undef cinv12_10_im
2526 #undef cinv12_11_re
2527 #undef cinv12_11_im
2528 
2529 #undef acc00_re
2530 #undef acc00_im
2531 #undef acc01_re
2532 #undef acc01_im
2533 #undef acc02_re
2534 #undef acc02_im
2535 #undef acc10_re
2536 #undef acc10_im
2537 #undef acc11_re
2538 #undef acc11_im
2539 #undef acc12_re
2540 #undef acc12_im
2541 #undef acc20_re
2542 #undef acc20_im
2543 #undef acc21_re
2544 #undef acc21_im
2545 #undef acc22_re
2546 #undef acc22_im
2547 #undef acc30_re
2548 #undef acc30_im
2549 #undef acc31_re
2550 #undef acc31_im
2551 #undef acc32_re
2552 #undef acc32_im
2553 
2554 
2555 
2556 #undef VOLATILE