QUDA  0.9.0
tmc_dslash_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// VOLATILE qualifies shared-memory spinor storage. The NVVM back end (CUDA >= 4.1,
// sm_20+) gets an empty definition; the older Open64 back end gets `volatile` —
// presumably to keep it from caching shared-memory values in registers (TODO confirm).
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// The 12 complex components of the input spinor (4 spins x 3 colors) live in
// registers I0..In; these macros name their real/imaginary parts as i<spin><color>_re/_im.
// acc<spin><color>_re/_im alias the accumulator spinor registers accum0.. the same way.
// Double precision: one complex number per double2 register (I0..I11, accum0..accum11).
#ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
// Single/half precision: two complex numbers per float4 register (I0..I5, accum0..accum5).
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
// g<row><col>_re/_im name the complex entries of the 3x3 gauge link matrix held
// in registers G0.. — one complex entry per G register under GAUGE_FLOAT2 (G0..G8),
// two complex entries per float4 register otherwise (G0..G4).
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link
// gT is the Hermitian conjugate (dagger) of the link: gT[i][j] = conj(g[j][i]),
// i.e. transposed indices with negated imaginary parts.
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // first chiral block of clover term
// Packed storage of one (Hermitian) 6x6 chiral block: the 6 real diagonal
// elements first (c<a>_<a>_re), then the 15 complex elements below the diagonal
// (c<a>_<b> with a > b). Double precision packs them into C0..C17 (double2);
// single/half into C0..C8 (float4).
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
// Elements above the diagonal follow from Hermiticity of the clover block:
// c<a>_<b> = conj(c<b>_<a>) — same real part, negated imaginary part.
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
289 // second chiral block of clover term (reuses C0,...,C9)
// The second block (spin indices 2,3) aliases the first-block macros with spin
// indices 0,1 — the same C registers are reused after being reloaded for this block.
// NOTE(review): the registers involved are C0..C17 (double) or C0..C8 (single);
// the "C0,...,C9" in the original comment looks stale — confirm against the generator.
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
357 
358 // first chiral block of inverted clover term
// Same packed layout as the clover term above, but for the inverted clover:
// 6 real diagonal elements, then the 15 complex below-diagonal elements,
// in C0..C17 (double2) or C0..C8 (float4). The same C registers are shared
// with the (non-inverted) clover macros, so only one term is live at a time.
359 #ifdef CLOVER_DOUBLE
360 #define cinv00_00_re C0.x
361 #define cinv01_01_re C0.y
362 #define cinv02_02_re C1.x
363 #define cinv10_10_re C1.y
364 #define cinv11_11_re C2.x
365 #define cinv12_12_re C2.y
366 #define cinv01_00_re C3.x
367 #define cinv01_00_im C3.y
368 #define cinv02_00_re C4.x
369 #define cinv02_00_im C4.y
370 #define cinv10_00_re C5.x
371 #define cinv10_00_im C5.y
372 #define cinv11_00_re C6.x
373 #define cinv11_00_im C6.y
374 #define cinv12_00_re C7.x
375 #define cinv12_00_im C7.y
376 #define cinv02_01_re C8.x
377 #define cinv02_01_im C8.y
378 #define cinv10_01_re C9.x
379 #define cinv10_01_im C9.y
380 #define cinv11_01_re C10.x
381 #define cinv11_01_im C10.y
382 #define cinv12_01_re C11.x
383 #define cinv12_01_im C11.y
384 #define cinv10_02_re C12.x
385 #define cinv10_02_im C12.y
386 #define cinv11_02_re C13.x
387 #define cinv11_02_im C13.y
388 #define cinv12_02_re C14.x
389 #define cinv12_02_im C14.y
390 #define cinv11_10_re C15.x
391 #define cinv11_10_im C15.y
392 #define cinv12_10_re C16.x
393 #define cinv12_10_im C16.y
394 #define cinv12_11_re C17.x
395 #define cinv12_11_im C17.y
396 #else
397 #define cinv00_00_re C0.x
398 #define cinv01_01_re C0.y
399 #define cinv02_02_re C0.z
400 #define cinv10_10_re C0.w
401 #define cinv11_11_re C1.x
402 #define cinv12_12_re C1.y
403 #define cinv01_00_re C1.z
404 #define cinv01_00_im C1.w
405 #define cinv02_00_re C2.x
406 #define cinv02_00_im C2.y
407 #define cinv10_00_re C2.z
408 #define cinv10_00_im C2.w
409 #define cinv11_00_re C3.x
410 #define cinv11_00_im C3.y
411 #define cinv12_00_re C3.z
412 #define cinv12_00_im C3.w
413 #define cinv02_01_re C4.x
414 #define cinv02_01_im C4.y
415 #define cinv10_01_re C4.z
416 #define cinv10_01_im C4.w
417 #define cinv11_01_re C5.x
418 #define cinv11_01_im C5.y
419 #define cinv12_01_re C5.z
420 #define cinv12_01_im C5.w
421 #define cinv10_02_re C6.x
422 #define cinv10_02_im C6.y
423 #define cinv11_02_re C6.z
424 #define cinv11_02_im C6.w
425 #define cinv12_02_re C7.x
426 #define cinv12_02_im C7.y
427 #define cinv11_10_re C7.z
428 #define cinv11_10_im C7.w
429 #define cinv12_10_re C8.x
430 #define cinv12_10_im C8.y
431 #define cinv12_11_re C8.z
432 #define cinv12_11_im C8.w
433 #endif // CLOVER_DOUBLE
434 
// Above-diagonal elements of the inverted clover block from Hermiticity:
// cinv<a>_<b> = conj(cinv<b>_<a>).
435 #define cinv00_01_re (+cinv01_00_re)
436 #define cinv00_01_im (-cinv01_00_im)
437 #define cinv00_02_re (+cinv02_00_re)
438 #define cinv00_02_im (-cinv02_00_im)
439 #define cinv01_02_re (+cinv02_01_re)
440 #define cinv01_02_im (-cinv02_01_im)
441 #define cinv00_10_re (+cinv10_00_re)
442 #define cinv00_10_im (-cinv10_00_im)
443 #define cinv01_10_re (+cinv10_01_re)
444 #define cinv01_10_im (-cinv10_01_im)
445 #define cinv02_10_re (+cinv10_02_re)
446 #define cinv02_10_im (-cinv10_02_im)
447 #define cinv00_11_re (+cinv11_00_re)
448 #define cinv00_11_im (-cinv11_00_im)
449 #define cinv01_11_re (+cinv11_01_re)
450 #define cinv01_11_im (-cinv11_01_im)
451 #define cinv02_11_re (+cinv11_02_re)
452 #define cinv02_11_im (-cinv11_02_im)
453 #define cinv10_11_re (+cinv11_10_re)
454 #define cinv10_11_im (-cinv11_10_im)
455 #define cinv00_12_re (+cinv12_00_re)
456 #define cinv00_12_im (-cinv12_00_im)
457 #define cinv01_12_re (+cinv12_01_re)
458 #define cinv01_12_im (-cinv12_01_im)
459 #define cinv02_12_re (+cinv12_02_re)
460 #define cinv02_12_im (-cinv12_02_im)
461 #define cinv10_12_re (+cinv12_10_re)
462 #define cinv10_12_im (-cinv12_10_im)
463 #define cinv11_12_re (+cinv12_11_re)
464 #define cinv11_12_im (-cinv12_11_im)
465 
466 // second chiral block of inverted clover term (reuses C0,...,C9)
// As for the clover term: the second block (spins 2,3) aliases the first-block
// macros with spins 0,1 — the same C registers are reused for this block.
// NOTE(review): registers are C0..C17 (double) / C0..C8 (single); the "C0,...,C9"
// in the original comment looks stale — confirm against the generator.
467 #define cinv20_20_re cinv00_00_re
468 #define cinv21_20_re cinv01_00_re
469 #define cinv21_20_im cinv01_00_im
470 #define cinv22_20_re cinv02_00_re
471 #define cinv22_20_im cinv02_00_im
472 #define cinv30_20_re cinv10_00_re
473 #define cinv30_20_im cinv10_00_im
474 #define cinv31_20_re cinv11_00_re
475 #define cinv31_20_im cinv11_00_im
476 #define cinv32_20_re cinv12_00_re
477 #define cinv32_20_im cinv12_00_im
478 #define cinv20_21_re cinv00_01_re
479 #define cinv20_21_im cinv00_01_im
480 #define cinv21_21_re cinv01_01_re
481 #define cinv22_21_re cinv02_01_re
482 #define cinv22_21_im cinv02_01_im
483 #define cinv30_21_re cinv10_01_re
484 #define cinv30_21_im cinv10_01_im
485 #define cinv31_21_re cinv11_01_re
486 #define cinv31_21_im cinv11_01_im
487 #define cinv32_21_re cinv12_01_re
488 #define cinv32_21_im cinv12_01_im
489 #define cinv20_22_re cinv00_02_re
490 #define cinv20_22_im cinv00_02_im
491 #define cinv21_22_re cinv01_02_re
492 #define cinv21_22_im cinv01_02_im
493 #define cinv22_22_re cinv02_02_re
494 #define cinv30_22_re cinv10_02_re
495 #define cinv30_22_im cinv10_02_im
496 #define cinv31_22_re cinv11_02_re
497 #define cinv31_22_im cinv11_02_im
498 #define cinv32_22_re cinv12_02_re
499 #define cinv32_22_im cinv12_02_im
500 #define cinv20_30_re cinv00_10_re
501 #define cinv20_30_im cinv00_10_im
502 #define cinv21_30_re cinv01_10_re
503 #define cinv21_30_im cinv01_10_im
504 #define cinv22_30_re cinv02_10_re
505 #define cinv22_30_im cinv02_10_im
506 #define cinv30_30_re cinv10_10_re
507 #define cinv31_30_re cinv11_10_re
508 #define cinv31_30_im cinv11_10_im
509 #define cinv32_30_re cinv12_10_re
510 #define cinv32_30_im cinv12_10_im
511 #define cinv20_31_re cinv00_11_re
512 #define cinv20_31_im cinv00_11_im
513 #define cinv21_31_re cinv01_11_re
514 #define cinv21_31_im cinv01_11_im
515 #define cinv22_31_re cinv02_11_re
516 #define cinv22_31_im cinv02_11_im
517 #define cinv30_31_re cinv10_11_re
518 #define cinv30_31_im cinv10_11_im
519 #define cinv31_31_re cinv11_11_re
520 #define cinv32_31_re cinv12_11_re
521 #define cinv32_31_im cinv12_11_im
522 #define cinv20_32_re cinv00_12_re
523 #define cinv20_32_im cinv00_12_im
524 #define cinv21_32_re cinv01_12_re
525 #define cinv21_32_im cinv01_12_im
526 #define cinv22_32_re cinv02_12_re
527 #define cinv22_32_im cinv02_12_im
528 #define cinv30_32_re cinv10_12_re
529 #define cinv30_32_im cinv10_12_im
530 #define cinv31_32_re cinv11_12_re
531 #define cinv31_32_im cinv11_12_im
532 #define cinv32_32_re cinv12_12_re
533 
534 
535 
536 // declare C## here and use ASSN below instead of READ
// Declaring the C registers once up front lets the clover-read macros assign
// into them (ASSN) instead of declaring them at each read site (READ).
537 #ifdef CLOVER_DOUBLE
// 18 double2 registers: one chiral block = 6 real diagonal + 15 complex
// off-diagonal elements = 36 reals.
538 double2 C0;
539 double2 C1;
540 double2 C2;
541 double2 C3;
542 double2 C4;
543 double2 C5;
544 double2 C6;
545 double2 C7;
546 double2 C8;
547 double2 C9;
548 double2 C10;
549 double2 C11;
550 double2 C12;
551 double2 C13;
552 double2 C14;
553 double2 C15;
554 double2 C16;
555 double2 C17;
556 #else
// 9 float4 registers hold the same 36 reals in single/half precision.
557 float4 C0;
558 float4 C1;
559 float4 C2;
560 float4 C3;
561 float4 C4;
562 float4 C5;
563 float4 C6;
564 float4 C7;
565 float4 C8;
566 
567 #if (DD_PREC==2)
// K: extra scalar needed only at half precision — presumably the per-site
// norm/scale factor applied by the half-precision clover read — TODO confirm.
568 float K;
569 #endif
570 
571 #endif // CLOVER_DOUBLE
572 // output spinor
// NOTE(review): the scraped listing omits original lines 573-596 here — the
// o##_re/_im output-spinor register definitions used throughout the kernel body
// below — confirm against the generated source.
597 
// Shared-memory stride in elements; 16*sizeof(double) = 32*sizeof(float) = 128 bytes.
598 #ifdef SPINOR_DOUBLE
599 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
600 #else
601 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
602 #endif
603 
604 #include "read_gauge.h"
605 #include "io_spinor.h"
606 #include "read_clover.h"
607 #include "tmc_core.h"
608 
// coord: lattice coordinates of this site (5 entries — presumably the 5th is for
// 5-d/domain-wall-style layouts; TODO confirm), filled by the coords* helpers below.
609 int coord[5];
// X: full lattice site index, used to form neighbor indices.
610 int X;
611 
// sid: thread/site id; set explicitly for the exterior kernel below.
612 int sid;
613 
614 #ifdef MULTI_GPU
615 int face_idx;
616 if (kernel_type == INTERIOR_KERNEL) {
617 #endif
618 
619  // Assume even dimensions
// NOTE(review): the scraped listing omits original line 620 here — presumably the
// interior-kernel computation of sid/X/coord — confirm against the generated source.
621 
622  // only need to check Y and Z dims currently since X and T set to match exactly
623  if (coord[1] >= param.dc.X[1]) return;
624  if (coord[2] >= param.dc.X[2]) return;
625 
// Interior kernel: start the output spinor accumulator at zero.
626  o00_re = 0; o00_im = 0;
627  o01_re = 0; o01_im = 0;
628  o02_re = 0; o02_im = 0;
629  o10_re = 0; o10_im = 0;
630  o11_re = 0; o11_im = 0;
631  o12_re = 0; o12_im = 0;
632  o20_re = 0; o20_im = 0;
633  o21_re = 0; o21_im = 0;
634  o22_re = 0; o22_im = 0;
635  o30_re = 0; o30_im = 0;
636  o31_re = 0; o31_im = 0;
637  o32_re = 0; o32_im = 0;
638 
639 #ifdef MULTI_GPU
640 } else { // exterior kernel
641 
642  sid = blockIdx.x*blockDim.x + threadIdx.x;
643  if (sid >= param.threads) return;
644 
645  const int face_volume = (param.threads >> 1); // volume of one face
646  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
647  face_idx = sid - face_num*face_volume; // index into the respective face
648 
649  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
650  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
651  //sp_idx = face_idx + param.ghostOffset[dim];
652 
653  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
// NOTE(review): listing omits original lines 654-655 — presumably the read of the
// partially accumulated result into the i## registers — confirm.
656 
// Exterior kernel: seed the output with the previously accumulated interior result.
657  o00_re = i00_re; o00_im = i00_im;
658  o01_re = i01_re; o01_im = i01_im;
659  o02_re = i02_re; o02_im = i02_im;
660  o10_re = i10_re; o10_im = i10_im;
661  o11_re = i11_re; o11_im = i11_im;
662  o12_re = i12_re; o12_im = i12_im;
663  o20_re = i20_re; o20_im = i20_im;
664  o21_re = i21_re; o21_im = i21_im;
665  o22_re = i22_re; o22_im = i22_im;
666  o30_re = i30_re; o30_im = i30_im;
667  o31_re = i31_re; o31_im = i31_im;
668  o32_re = i32_re; o32_im = i32_im;
669 }
670 #endif // MULTI_GPU
671 
672 
// ---- Hop from the +X neighbor, projected with P0- ----
// Interior threads handle this direction unless the site lies on the +X boundary
// of a partitioned dimension; that case is picked up by EXTERIOR_KERNEL_X.
673 #ifdef MULTI_GPU
674 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
675  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
676 #endif
677 {
678  // Projector P0-
679  // 1 0 0 -i
680  // 0 1 -i 0
681  // 0 i 1 0
682  // i 0 0 1
683 
// sp_idx: checkerboarded index of the +X neighbor (with wrap-around for the
// single-GPU case); exterior threads index into the ghost zone instead.
684 #ifdef MULTI_GPU
685  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
686  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
687 #if (DD_PREC==2) // half precision
688  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
689 #endif
690 #else
691  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
692 #endif
693 
694  const int ga_idx = sid;
// NOTE(review): the scraped listing omits original lines 695-701 here — presumably
// the declarations of the half-spinor registers a0..a2/b0..b2 — confirm against the
// generated source.
702 
703 #ifdef MULTI_GPU
704  if (kernel_type == INTERIOR_KERNEL) {
705 #endif
706 
707  // read spinor from device memory
708  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
709 
710  // store spinor into shared memory
711  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
712 
713  // project spinor into half spinors
714  a0_re = +i00_re+i30_im;
715  a0_im = +i00_im-i30_re;
716  a1_re = +i01_re+i31_im;
717  a1_im = +i01_im-i31_re;
718  a2_re = +i02_re+i32_im;
719  a2_im = +i02_im-i32_re;
720  b0_re = +i10_re+i20_im;
721  b0_im = +i10_im-i20_re;
722  b1_re = +i11_re+i21_im;
723  b1_im = +i11_im-i21_re;
724  b2_re = +i12_re+i22_im;
725  b2_im = +i12_im-i22_re;
726 
727 #ifdef MULTI_GPU
728  } else {
729 
730  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
731 
732  // read half spinor from device memory
// Ghost spinors are already spin-projected on the sending side, so no projection here.
733  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
734 
735  a0_re = i00_re; a0_im = i00_im;
736  a1_re = i01_re; a1_im = i01_im;
737  a2_re = i02_re; a2_im = i02_im;
738  b0_re = i10_re; b0_im = i10_im;
739  b1_re = i11_re; b1_im = i11_im;
740  b2_re = i12_re; b2_im = i12_im;
741 
742  }
743 #endif // MULTI_GPU
744 
745  // read gauge matrix from device memory
746  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
747 
748  // reconstruct gauge matrix
// NOTE(review): listing omits original line 749 (the RECONSTRUCT_GAUGE_MATRIX call)
// and the `spinorFloat ?_re = 0;` declaration lines (752, 759, 766, ...) in the
// multiplies below — confirm against the generated source.
750 
// Multiply both half spinors (a, b) by the gauge link g: A = g*a, B = g*b,
// one complex row at a time.
751  // multiply row 0
753  A0_re += g00_re * a0_re;
754  A0_re -= g00_im * a0_im;
755  A0_re += g01_re * a1_re;
756  A0_re -= g01_im * a1_im;
757  A0_re += g02_re * a2_re;
758  A0_re -= g02_im * a2_im;
760  A0_im += g00_re * a0_im;
761  A0_im += g00_im * a0_re;
762  A0_im += g01_re * a1_im;
763  A0_im += g01_im * a1_re;
764  A0_im += g02_re * a2_im;
765  A0_im += g02_im * a2_re;
767  B0_re += g00_re * b0_re;
768  B0_re -= g00_im * b0_im;
769  B0_re += g01_re * b1_re;
770  B0_re -= g01_im * b1_im;
771  B0_re += g02_re * b2_re;
772  B0_re -= g02_im * b2_im;
774  B0_im += g00_re * b0_im;
775  B0_im += g00_im * b0_re;
776  B0_im += g01_re * b1_im;
777  B0_im += g01_im * b1_re;
778  B0_im += g02_re * b2_im;
779  B0_im += g02_im * b2_re;
780 
781  // multiply row 1
783  A1_re += g10_re * a0_re;
784  A1_re -= g10_im * a0_im;
785  A1_re += g11_re * a1_re;
786  A1_re -= g11_im * a1_im;
787  A1_re += g12_re * a2_re;
788  A1_re -= g12_im * a2_im;
790  A1_im += g10_re * a0_im;
791  A1_im += g10_im * a0_re;
792  A1_im += g11_re * a1_im;
793  A1_im += g11_im * a1_re;
794  A1_im += g12_re * a2_im;
795  A1_im += g12_im * a2_re;
797  B1_re += g10_re * b0_re;
798  B1_re -= g10_im * b0_im;
799  B1_re += g11_re * b1_re;
800  B1_re -= g11_im * b1_im;
801  B1_re += g12_re * b2_re;
802  B1_re -= g12_im * b2_im;
804  B1_im += g10_re * b0_im;
805  B1_im += g10_im * b0_re;
806  B1_im += g11_re * b1_im;
807  B1_im += g11_im * b1_re;
808  B1_im += g12_re * b2_im;
809  B1_im += g12_im * b2_re;
810 
811  // multiply row 2
813  A2_re += g20_re * a0_re;
814  A2_re -= g20_im * a0_im;
815  A2_re += g21_re * a1_re;
816  A2_re -= g21_im * a1_im;
817  A2_re += g22_re * a2_re;
818  A2_re -= g22_im * a2_im;
820  A2_im += g20_re * a0_im;
821  A2_im += g20_im * a0_re;
822  A2_im += g21_re * a1_im;
823  A2_im += g21_im * a1_re;
824  A2_im += g22_re * a2_im;
825  A2_im += g22_im * a2_re;
827  B2_re += g20_re * b0_re;
828  B2_re -= g20_im * b0_im;
829  B2_re += g21_re * b1_re;
830  B2_re -= g21_im * b1_im;
831  B2_re += g22_re * b2_re;
832  B2_re -= g22_im * b2_im;
834  B2_im += g20_re * b0_im;
835  B2_im += g20_im * b0_re;
836  B2_im += g21_re * b1_im;
837  B2_im += g21_im * b1_re;
838  B2_im += g22_re * b2_im;
839  B2_im += g22_im * b2_re;
840 
// Reconstruct the full spinor from the half spinors (undoing the P0- projection)
// and accumulate into the output.
841  o00_re += A0_re;
842  o00_im += A0_im;
843  o10_re += B0_re;
844  o10_im += B0_im;
845  o20_re -= B0_im;
846  o20_im += B0_re;
847  o30_re -= A0_im;
848  o30_im += A0_re;
849 
850  o01_re += A1_re;
851  o01_im += A1_im;
852  o11_re += B1_re;
853  o11_im += B1_im;
854  o21_re -= B1_im;
855  o21_im += B1_re;
856  o31_re -= A1_im;
857  o31_im += A1_re;
858 
859  o02_re += A2_re;
860  o02_im += A2_im;
861  o12_re += B2_re;
862  o12_im += B2_im;
863  o22_re -= B2_im;
864  o22_im += B2_re;
865  o32_re -= A2_im;
866  o32_im += A2_re;
867 
868 }
869 
// ---- Hop from the -X neighbor, projected with P0+ ----
// Interior threads handle this direction unless the site lies on the -X boundary
// of a partitioned dimension; that case is picked up by EXTERIOR_KERNEL_X.
870 #ifdef MULTI_GPU
871 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
872  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
873 #endif
874 {
875  // Projector P0+
876  // 1 0 0 i
877  // 0 1 i 0
878  // 0 -i 1 0
879  // -i 0 0 1
880 
// sp_idx: checkerboarded index of the -X neighbor (with wrap-around for the
// single-GPU case); exterior threads index into the ghost zone instead.
881 #ifdef MULTI_GPU
882  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
883  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
884 #if (DD_PREC==2) // half precision
885  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
886 #endif
887 #else
888  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
889 #endif
890 
// Backward hop: the link lives at the neighbor site (ghost links past Vh for
// exterior threads).
891 #ifdef MULTI_GPU
892  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
893 #else
894  const int ga_idx = sp_idx;
895 #endif
// NOTE(review): the scraped listing omits original lines 896-902 here — presumably
// the declarations of the half-spinor registers a0..a2/b0..b2 — confirm against the
// generated source.
903 
904 #ifdef MULTI_GPU
905  if (kernel_type == INTERIOR_KERNEL) {
906 #endif
907 
908  // load spinor from shared memory
// The -X neighbor was staged in shared memory by the +X pass; fetch it from the
// adjacent thread column (wrapping within the block).
909  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
910  __syncthreads();
911  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
912 
913  // project spinor into half spinors
914  a0_re = +i00_re-i30_im;
915  a0_im = +i00_im+i30_re;
916  a1_re = +i01_re-i31_im;
917  a1_im = +i01_im+i31_re;
918  a2_re = +i02_re-i32_im;
919  a2_im = +i02_im+i32_re;
920  b0_re = +i10_re-i20_im;
921  b0_im = +i10_im+i20_re;
922  b1_re = +i11_re-i21_im;
923  b1_im = +i11_im+i21_re;
924  b2_re = +i12_re-i22_im;
925  b2_im = +i12_im+i22_re;
926 
927 #ifdef MULTI_GPU
928  } else {
929 
930  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
931 
932  // read half spinor from device memory
// Ghost spinors are already spin-projected on the sending side, so no projection here.
933  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
934 
935  a0_re = i00_re; a0_im = i00_im;
936  a1_re = i01_re; a1_im = i01_im;
937  a2_re = i02_re; a2_im = i02_im;
938  b0_re = i10_re; b0_im = i10_im;
939  b1_re = i11_re; b1_im = i11_im;
940  b2_re = i12_re; b2_im = i12_im;
941 
942  }
943 #endif // MULTI_GPU
944 
945  // read gauge matrix from device memory
946  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
947 
948  // reconstruct gauge matrix
// NOTE(review): listing omits original line 949 — presumably the
// RECONSTRUCT_GAUGE_MATRIX call — confirm against the generated source.
950 
// Multiply both half spinors by the conjugated (daggered) link: A = g^dag * a,
// B = g^dag * b, one complex row at a time.
951  // multiply row 0
952  spinorFloat A0_re = 0;
953  A0_re += gT00_re * a0_re;
954  A0_re -= gT00_im * a0_im;
955  A0_re += gT01_re * a1_re;
956  A0_re -= gT01_im * a1_im;
957  A0_re += gT02_re * a2_re;
958  A0_re -= gT02_im * a2_im;
959  spinorFloat A0_im = 0;
960  A0_im += gT00_re * a0_im;
961  A0_im += gT00_im * a0_re;
962  A0_im += gT01_re * a1_im;
963  A0_im += gT01_im * a1_re;
964  A0_im += gT02_re * a2_im;
965  A0_im += gT02_im * a2_re;
966  spinorFloat B0_re = 0;
967  B0_re += gT00_re * b0_re;
968  B0_re -= gT00_im * b0_im;
969  B0_re += gT01_re * b1_re;
970  B0_re -= gT01_im * b1_im;
971  B0_re += gT02_re * b2_re;
972  B0_re -= gT02_im * b2_im;
973  spinorFloat B0_im = 0;
974  B0_im += gT00_re * b0_im;
975  B0_im += gT00_im * b0_re;
976  B0_im += gT01_re * b1_im;
977  B0_im += gT01_im * b1_re;
978  B0_im += gT02_re * b2_im;
979  B0_im += gT02_im * b2_re;
980 
981  // multiply row 1
982  spinorFloat A1_re = 0;
983  A1_re += gT10_re * a0_re;
984  A1_re -= gT10_im * a0_im;
985  A1_re += gT11_re * a1_re;
986  A1_re -= gT11_im * a1_im;
987  A1_re += gT12_re * a2_re;
988  A1_re -= gT12_im * a2_im;
989  spinorFloat A1_im = 0;
990  A1_im += gT10_re * a0_im;
991  A1_im += gT10_im * a0_re;
992  A1_im += gT11_re * a1_im;
993  A1_im += gT11_im * a1_re;
994  A1_im += gT12_re * a2_im;
995  A1_im += gT12_im * a2_re;
996  spinorFloat B1_re = 0;
997  B1_re += gT10_re * b0_re;
998  B1_re -= gT10_im * b0_im;
999  B1_re += gT11_re * b1_re;
1000  B1_re -= gT11_im * b1_im;
1001  B1_re += gT12_re * b2_re;
1002  B1_re -= gT12_im * b2_im;
1003  spinorFloat B1_im = 0;
1004  B1_im += gT10_re * b0_im;
1005  B1_im += gT10_im * b0_re;
1006  B1_im += gT11_re * b1_im;
1007  B1_im += gT11_im * b1_re;
1008  B1_im += gT12_re * b2_im;
1009  B1_im += gT12_im * b2_re;
1010 
1011  // multiply row 2
1012  spinorFloat A2_re = 0;
1013  A2_re += gT20_re * a0_re;
1014  A2_re -= gT20_im * a0_im;
1015  A2_re += gT21_re * a1_re;
1016  A2_re -= gT21_im * a1_im;
1017  A2_re += gT22_re * a2_re;
1018  A2_re -= gT22_im * a2_im;
1019  spinorFloat A2_im = 0;
1020  A2_im += gT20_re * a0_im;
1021  A2_im += gT20_im * a0_re;
1022  A2_im += gT21_re * a1_im;
1023  A2_im += gT21_im * a1_re;
1024  A2_im += gT22_re * a2_im;
1025  A2_im += gT22_im * a2_re;
1026  spinorFloat B2_re = 0;
1027  B2_re += gT20_re * b0_re;
1028  B2_re -= gT20_im * b0_im;
1029  B2_re += gT21_re * b1_re;
1030  B2_re -= gT21_im * b1_im;
1031  B2_re += gT22_re * b2_re;
1032  B2_re -= gT22_im * b2_im;
1033  spinorFloat B2_im = 0;
1034  B2_im += gT20_re * b0_im;
1035  B2_im += gT20_im * b0_re;
1036  B2_im += gT21_re * b1_im;
1037  B2_im += gT21_im * b1_re;
1038  B2_im += gT22_re * b2_im;
1039  B2_im += gT22_im * b2_re;
1040 
// Reconstruct the full spinor (undoing the P0+ projection — opposite signs to
// the P0- branch above) and accumulate into the output.
1041  o00_re += A0_re;
1042  o00_im += A0_im;
1043  o10_re += B0_re;
1044  o10_im += B0_im;
1045  o20_re += B0_im;
1046  o20_im -= B0_re;
1047  o30_re += A0_im;
1048  o30_im -= A0_re;
1049 
1050  o01_re += A1_re;
1051  o01_im += A1_im;
1052  o11_re += B1_re;
1053  o11_im += B1_im;
1054  o21_re += B1_im;
1055  o21_im -= B1_re;
1056  o31_re += A1_im;
1057  o31_im -= A1_re;
1058 
1059  o02_re += A2_re;
1060  o02_im += A2_im;
1061  o12_re += B2_re;
1062  o12_im += B2_im;
1063  o22_re += B2_im;
1064  o22_im -= B2_re;
1065  o32_re += A2_im;
1066  o32_im -= A2_re;
1067 
1068 }
1069 
1070 #ifdef MULTI_GPU
1071 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
1072  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
1073 #endif
1074 {
1075  // Projector P1-
1076  // 1 0 0 -1
1077  // 0 1 1 0
1078  // 0 1 1 0
1079  // -1 0 0 1
1080 
1081 #ifdef MULTI_GPU
1082  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
1083  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1084 #if (DD_PREC==2) // half precision
1085  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1086 #endif
1087 #else
1088  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
1089 #endif
1090 
1091  const int ga_idx = sid;
1092 
1099 
1100 #ifdef MULTI_GPU
1101  if (kernel_type == INTERIOR_KERNEL) {
1102 #endif
1103 
1104  if (threadIdx.y == blockDim.y-1 && blockDim.y < param.dc.X[1] ) {
1105  // read spinor from device memory
1106  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1107 
1108  // project spinor into half spinors
1109  a0_re = +i00_re-i30_re;
1110  a0_im = +i00_im-i30_im;
1111  a1_re = +i01_re-i31_re;
1112  a1_im = +i01_im-i31_im;
1113  a2_re = +i02_re-i32_re;
1114  a2_im = +i02_im-i32_im;
1115  b0_re = +i10_re+i20_re;
1116  b0_im = +i10_im+i20_im;
1117  b1_re = +i11_re+i21_re;
1118  b1_im = +i11_im+i21_im;
1119  b2_re = +i12_re+i22_re;
1120  b2_im = +i12_im+i22_im;
1121  } else {
1122  // load spinor from shared memory
1123  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1124  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
1125  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1126 
1127  // project spinor into half spinors
1128  a0_re = +i00_re-i30_re;
1129  a0_im = +i00_im-i30_im;
1130  a1_re = +i01_re-i31_re;
1131  a1_im = +i01_im-i31_im;
1132  a2_re = +i02_re-i32_re;
1133  a2_im = +i02_im-i32_im;
1134  b0_re = +i10_re+i20_re;
1135  b0_im = +i10_im+i20_im;
1136  b1_re = +i11_re+i21_re;
1137  b1_im = +i11_im+i21_im;
1138  b2_re = +i12_re+i22_re;
1139  b2_im = +i12_im+i22_im;
1140  }
1141 
1142 #ifdef MULTI_GPU
1143  } else {
1144 
1145  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1146 
1147  // read half spinor from device memory
1148  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
1149 
1150  a0_re = i00_re; a0_im = i00_im;
1151  a1_re = i01_re; a1_im = i01_im;
1152  a2_re = i02_re; a2_im = i02_im;
1153  b0_re = i10_re; b0_im = i10_im;
1154  b1_re = i11_re; b1_im = i11_im;
1155  b2_re = i12_re; b2_im = i12_im;
1156 
1157  }
1158 #endif // MULTI_GPU
1159 
1160  // read gauge matrix from device memory
1161  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
1162 
1163  // reconstruct gauge matrix
1165 
1166  // multiply row 0
1167  spinorFloat A0_re = 0;
1168  A0_re += g00_re * a0_re;
1169  A0_re -= g00_im * a0_im;
1170  A0_re += g01_re * a1_re;
1171  A0_re -= g01_im * a1_im;
1172  A0_re += g02_re * a2_re;
1173  A0_re -= g02_im * a2_im;
1174  spinorFloat A0_im = 0;
1175  A0_im += g00_re * a0_im;
1176  A0_im += g00_im * a0_re;
1177  A0_im += g01_re * a1_im;
1178  A0_im += g01_im * a1_re;
1179  A0_im += g02_re * a2_im;
1180  A0_im += g02_im * a2_re;
1181  spinorFloat B0_re = 0;
1182  B0_re += g00_re * b0_re;
1183  B0_re -= g00_im * b0_im;
1184  B0_re += g01_re * b1_re;
1185  B0_re -= g01_im * b1_im;
1186  B0_re += g02_re * b2_re;
1187  B0_re -= g02_im * b2_im;
1188  spinorFloat B0_im = 0;
1189  B0_im += g00_re * b0_im;
1190  B0_im += g00_im * b0_re;
1191  B0_im += g01_re * b1_im;
1192  B0_im += g01_im * b1_re;
1193  B0_im += g02_re * b2_im;
1194  B0_im += g02_im * b2_re;
1195 
1196  // multiply row 1
1197  spinorFloat A1_re = 0;
1198  A1_re += g10_re * a0_re;
1199  A1_re -= g10_im * a0_im;
1200  A1_re += g11_re * a1_re;
1201  A1_re -= g11_im * a1_im;
1202  A1_re += g12_re * a2_re;
1203  A1_re -= g12_im * a2_im;
1204  spinorFloat A1_im = 0;
1205  A1_im += g10_re * a0_im;
1206  A1_im += g10_im * a0_re;
1207  A1_im += g11_re * a1_im;
1208  A1_im += g11_im * a1_re;
1209  A1_im += g12_re * a2_im;
1210  A1_im += g12_im * a2_re;
1211  spinorFloat B1_re = 0;
1212  B1_re += g10_re * b0_re;
1213  B1_re -= g10_im * b0_im;
1214  B1_re += g11_re * b1_re;
1215  B1_re -= g11_im * b1_im;
1216  B1_re += g12_re * b2_re;
1217  B1_re -= g12_im * b2_im;
1218  spinorFloat B1_im = 0;
1219  B1_im += g10_re * b0_im;
1220  B1_im += g10_im * b0_re;
1221  B1_im += g11_re * b1_im;
1222  B1_im += g11_im * b1_re;
1223  B1_im += g12_re * b2_im;
1224  B1_im += g12_im * b2_re;
1225 
1226  // multiply row 2
1227  spinorFloat A2_re = 0;
1228  A2_re += g20_re * a0_re;
1229  A2_re -= g20_im * a0_im;
1230  A2_re += g21_re * a1_re;
1231  A2_re -= g21_im * a1_im;
1232  A2_re += g22_re * a2_re;
1233  A2_re -= g22_im * a2_im;
1234  spinorFloat A2_im = 0;
1235  A2_im += g20_re * a0_im;
1236  A2_im += g20_im * a0_re;
1237  A2_im += g21_re * a1_im;
1238  A2_im += g21_im * a1_re;
1239  A2_im += g22_re * a2_im;
1240  A2_im += g22_im * a2_re;
1241  spinorFloat B2_re = 0;
1242  B2_re += g20_re * b0_re;
1243  B2_re -= g20_im * b0_im;
1244  B2_re += g21_re * b1_re;
1245  B2_re -= g21_im * b1_im;
1246  B2_re += g22_re * b2_re;
1247  B2_re -= g22_im * b2_im;
1248  spinorFloat B2_im = 0;
1249  B2_im += g20_re * b0_im;
1250  B2_im += g20_im * b0_re;
1251  B2_im += g21_re * b1_im;
1252  B2_im += g21_im * b1_re;
1253  B2_im += g22_re * b2_im;
1254  B2_im += g22_im * b2_re;
1255 
1256  o00_re += A0_re;
1257  o00_im += A0_im;
1258  o10_re += B0_re;
1259  o10_im += B0_im;
1260  o20_re += B0_re;
1261  o20_im += B0_im;
1262  o30_re -= A0_re;
1263  o30_im -= A0_im;
1264 
1265  o01_re += A1_re;
1266  o01_im += A1_im;
1267  o11_re += B1_re;
1268  o11_im += B1_im;
1269  o21_re += B1_re;
1270  o21_im += B1_im;
1271  o31_re -= A1_re;
1272  o31_im -= A1_im;
1273 
1274  o02_re += A2_re;
1275  o02_im += A2_im;
1276  o12_re += B2_re;
1277  o12_im += B2_im;
1278  o22_re += B2_re;
1279  o22_im += B2_im;
1280  o32_re -= A2_re;
1281  o32_im -= A2_im;
1282 
1283 }
1284 
1285 #ifdef MULTI_GPU
1286 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1287  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1288 #endif
 // Backward-Y hop: gather the spinor one site back in Y (with periodic wrap at
 // coord[1]==0), project with P1+ below, multiply by the gT?? link components
 // (presumably the daggered link for the reverse hop — matches the gT naming),
 // and accumulate the reconstructed result into the o??_re/_im output registers.
1289 {
1290  // Projector P1+
1291  // 1 0 0 1
1292  // 0 1 -1 0
1293  // 0 -1 1 0
1294  // 1 0 0 1
1295 
 // sp_idx: checkerboarded (even/odd) index of the Y-1 neighbor — hence the >>1
 // on the full lattice index. Exterior kernels instead index into the ghost
 // zone received from the neighboring GPU ([0] = backward face).
1296 #ifdef MULTI_GPU
1297  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1298  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1299 #if (DD_PREC==2) // half precision
1300  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1301 #endif
1302 #else
1303  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1304 #endif
1305 
 // Backward hop: the link variable lives on the neighbor site (sp_idx); for the
 // exterior kernel it comes from the ghost gauge field stored past Vh.
1306 #ifdef MULTI_GPU
1307  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1308 #else
1309  const int ga_idx = sp_idx;
1310 #endif
1311 
1318 
1319 #ifdef MULTI_GPU
1320  if (kernel_type == INTERIOR_KERNEL) {
1321 #endif
1322 
 // Threads on the low-Y edge of the thread block (when the block does not span
 // the full Y extent) must reload the neighbor spinor from global memory; all
 // other threads reuse the copy staged in shared memory (see the else branch).
1323  if (threadIdx.y == 0 && blockDim.y < param.dc.X[1]) {
1324  // read spinor from device memory
1325  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1326 
1327  // project spinor into half spinors
1328  a0_re = +i00_re+i30_re;
1329  a0_im = +i00_im+i30_im;
1330  a1_re = +i01_re+i31_re;
1331  a1_im = +i01_im+i31_im;
1332  a2_re = +i02_re+i32_re;
1333  a2_im = +i02_im+i32_im;
1334  b0_re = +i10_re-i20_re;
1335  b0_im = +i10_im-i20_im;
1336  b1_re = +i11_re-i21_re;
1337  b1_im = +i11_im-i21_im;
1338  b2_re = +i12_re-i22_re;
1339  b2_im = +i12_im-i22_im;
1340  } else {
1341  // load spinor from shared memory
 // tx compensates for the checkerboard x-offset; ty wraps one row back in Y.
 // NOTE(review): assumes the spinor was staged earlier by a WRITE_SPINOR_SHARED
 // in a part of the kernel outside this listing — confirm against the full file.
1342  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1343  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1344  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1345 
1346  // project spinor into half spinors
1347  a0_re = +i00_re+i30_re;
1348  a0_im = +i00_im+i30_im;
1349  a1_re = +i01_re+i31_re;
1350  a1_im = +i01_im+i31_im;
1351  a2_re = +i02_re+i32_re;
1352  a2_im = +i02_im+i32_im;
1353  b0_re = +i10_re-i20_re;
1354  b0_im = +i10_im-i20_im;
1355  b1_re = +i11_re-i21_re;
1356  b1_im = +i11_im-i21_im;
1357  b2_re = +i12_re-i22_re;
1358  b2_im = +i12_im-i22_im;
1359  }
1360 
1361 #ifdef MULTI_GPU
1362  } else {
1363 
1364  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1365 
1366  // read half spinor from device memory
 // NOTE(review): the ghost spinor appears to arrive pre-projected — only the
 // upper six complex components (i0?,i1?) are consumed, with no P1+ arithmetic.
1367  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1368 
1369  a0_re = i00_re; a0_im = i00_im;
1370  a1_re = i01_re; a1_im = i01_im;
1371  a2_re = i02_re; a2_im = i02_im;
1372  b0_re = i10_re; b0_im = i10_im;
1373  b1_re = i11_re; b1_im = i11_im;
1374  b2_re = i12_re; b2_im = i12_im;
1375 
1376  }
1377 #endif // MULTI_GPU
1378 
1379  // read gauge matrix from device memory
1380  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1381 
1382  // reconstruct gauge matrix
1384 
 // 3x3 complex matrix-vector products: (A,B) = gT * (a,b), fully unrolled.
1385  // multiply row 0
1386  spinorFloat A0_re = 0;
1387  A0_re += gT00_re * a0_re;
1388  A0_re -= gT00_im * a0_im;
1389  A0_re += gT01_re * a1_re;
1390  A0_re -= gT01_im * a1_im;
1391  A0_re += gT02_re * a2_re;
1392  A0_re -= gT02_im * a2_im;
1393  spinorFloat A0_im = 0;
1394  A0_im += gT00_re * a0_im;
1395  A0_im += gT00_im * a0_re;
1396  A0_im += gT01_re * a1_im;
1397  A0_im += gT01_im * a1_re;
1398  A0_im += gT02_re * a2_im;
1399  A0_im += gT02_im * a2_re;
1400  spinorFloat B0_re = 0;
1401  B0_re += gT00_re * b0_re;
1402  B0_re -= gT00_im * b0_im;
1403  B0_re += gT01_re * b1_re;
1404  B0_re -= gT01_im * b1_im;
1405  B0_re += gT02_re * b2_re;
1406  B0_re -= gT02_im * b2_im;
1407  spinorFloat B0_im = 0;
1408  B0_im += gT00_re * b0_im;
1409  B0_im += gT00_im * b0_re;
1410  B0_im += gT01_re * b1_im;
1411  B0_im += gT01_im * b1_re;
1412  B0_im += gT02_re * b2_im;
1413  B0_im += gT02_im * b2_re;
1414 
1415  // multiply row 1
1416  spinorFloat A1_re = 0;
1417  A1_re += gT10_re * a0_re;
1418  A1_re -= gT10_im * a0_im;
1419  A1_re += gT11_re * a1_re;
1420  A1_re -= gT11_im * a1_im;
1421  A1_re += gT12_re * a2_re;
1422  A1_re -= gT12_im * a2_im;
1423  spinorFloat A1_im = 0;
1424  A1_im += gT10_re * a0_im;
1425  A1_im += gT10_im * a0_re;
1426  A1_im += gT11_re * a1_im;
1427  A1_im += gT11_im * a1_re;
1428  A1_im += gT12_re * a2_im;
1429  A1_im += gT12_im * a2_re;
1430  spinorFloat B1_re = 0;
1431  B1_re += gT10_re * b0_re;
1432  B1_re -= gT10_im * b0_im;
1433  B1_re += gT11_re * b1_re;
1434  B1_re -= gT11_im * b1_im;
1435  B1_re += gT12_re * b2_re;
1436  B1_re -= gT12_im * b2_im;
1437  spinorFloat B1_im = 0;
1438  B1_im += gT10_re * b0_im;
1439  B1_im += gT10_im * b0_re;
1440  B1_im += gT11_re * b1_im;
1441  B1_im += gT11_im * b1_re;
1442  B1_im += gT12_re * b2_im;
1443  B1_im += gT12_im * b2_re;
1444 
1445  // multiply row 2
1446  spinorFloat A2_re = 0;
1447  A2_re += gT20_re * a0_re;
1448  A2_re -= gT20_im * a0_im;
1449  A2_re += gT21_re * a1_re;
1450  A2_re -= gT21_im * a1_im;
1451  A2_re += gT22_re * a2_re;
1452  A2_re -= gT22_im * a2_im;
1453  spinorFloat A2_im = 0;
1454  A2_im += gT20_re * a0_im;
1455  A2_im += gT20_im * a0_re;
1456  A2_im += gT21_re * a1_im;
1457  A2_im += gT21_im * a1_re;
1458  A2_im += gT22_re * a2_im;
1459  A2_im += gT22_im * a2_re;
1460  spinorFloat B2_re = 0;
1461  B2_re += gT20_re * b0_re;
1462  B2_re -= gT20_im * b0_im;
1463  B2_re += gT21_re * b1_re;
1464  B2_re -= gT21_im * b1_im;
1465  B2_re += gT22_re * b2_re;
1466  B2_re -= gT22_im * b2_im;
1467  spinorFloat B2_im = 0;
1468  B2_im += gT20_re * b0_im;
1469  B2_im += gT20_im * b0_re;
1470  B2_im += gT21_re * b1_im;
1471  B2_im += gT21_im * b1_re;
1472  B2_im += gT22_re * b2_im;
1473  B2_im += gT22_im * b2_re;
1474 
 // Reconstruct the full four-spinor from the two half-spinors using the P1+
 // structure visible above: lower components are (row2 = -B, row3 = +A).
1475  o00_re += A0_re;
1476  o00_im += A0_im;
1477  o10_re += B0_re;
1478  o10_im += B0_im;
1479  o20_re -= B0_re;
1480  o20_im -= B0_im;
1481  o30_re += A0_re;
1482  o30_im += A0_im;
1483 
1484  o01_re += A1_re;
1485  o01_im += A1_im;
1486  o11_re += B1_re;
1487  o11_im += B1_im;
1488  o21_re -= B1_re;
1489  o21_im -= B1_im;
1490  o31_re += A1_re;
1491  o31_im += A1_im;
1492 
1493  o02_re += A2_re;
1494  o02_im += A2_im;
1495  o12_re += B2_re;
1496  o12_im += B2_im;
1497  o22_re -= B2_re;
1498  o22_im -= B2_im;
1499  o32_re += A2_re;
1500  o32_im += A2_im;
1501 
1502 }
1503 
1504 #ifdef MULTI_GPU
1505 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1506  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1507 #endif
 // Forward-Z hop: gather the spinor one site forward in Z (wrapping at the
 // upper Z boundary), project with P2- below, multiply by the g?? link of this
 // site (ga_idx = sid), and accumulate into the o??_re/_im output registers.
1508 {
1509  // Projector P2-
1510  // 1 0 -i 0
1511  // 0 1 0 i
1512  // i 0 1 0
1513  // 0 -i 0 1
1514 
 // sp_idx: checkerboarded index of the Z+1 neighbor (>>1 halves the full
 // index); exterior kernels read the forward ([1]) ghost zone instead.
1515 #ifdef MULTI_GPU
1516  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1517  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1518 #if (DD_PREC==2) // half precision
1519  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1520 #endif
1521 #else
1522  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1523 #endif
1524 
 // Forward hop: the link is attached to this site, so index by sid directly.
1525  const int ga_idx = sid;
1526 
1533 
1534 #ifdef MULTI_GPU
1535  if (kernel_type == INTERIOR_KERNEL) {
1536 #endif
1537 
 // High-Z edge threads of the block reload from global memory; interior
 // threads reuse the shared-memory staging (tz wraps one slice forward).
 // NOTE(review): this test uses the X3 macro while the Y-direction hops use
 // param.dc.X[1] — presumably equivalent; confirm X3's definition upstream.
1538  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1539  // read spinor from device memory
1540  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1541 
1542  // project spinor into half spinors
1543  a0_re = +i00_re+i20_im;
1544  a0_im = +i00_im-i20_re;
1545  a1_re = +i01_re+i21_im;
1546  a1_im = +i01_im-i21_re;
1547  a2_re = +i02_re+i22_im;
1548  a2_im = +i02_im-i22_re;
1549  b0_re = +i10_re-i30_im;
1550  b0_im = +i10_im+i30_re;
1551  b1_re = +i11_re-i31_im;
1552  b1_im = +i11_im+i31_re;
1553  b2_re = +i12_re-i32_im;
1554  b2_im = +i12_im+i32_re;
1555  } else {
1556  // load spinor from shared memory
1557  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1558  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1559  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1560 
1561  // project spinor into half spinors
1562  a0_re = +i00_re+i20_im;
1563  a0_im = +i00_im-i20_re;
1564  a1_re = +i01_re+i21_im;
1565  a1_im = +i01_im-i21_re;
1566  a2_re = +i02_re+i22_im;
1567  a2_im = +i02_im-i22_re;
1568  b0_re = +i10_re-i30_im;
1569  b0_im = +i10_im+i30_re;
1570  b1_re = +i11_re-i31_im;
1571  b1_im = +i11_im+i31_re;
1572  b2_re = +i12_re-i32_im;
1573  b2_im = +i12_im+i32_re;
1574  }
1575 
1576 #ifdef MULTI_GPU
1577  } else {
1578 
1579  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1580 
1581  // read half spinor from device memory
 // NOTE(review): ghost data appears pre-projected — only i0?/i1? are read.
1582  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1583 
1584  a0_re = i00_re; a0_im = i00_im;
1585  a1_re = i01_re; a1_im = i01_im;
1586  a2_re = i02_re; a2_im = i02_im;
1587  b0_re = i10_re; b0_im = i10_im;
1588  b1_re = i11_re; b1_im = i11_im;
1589  b2_re = i12_re; b2_im = i12_im;
1590 
1591  }
1592 #endif // MULTI_GPU
1593 
1594  // read gauge matrix from device memory
1595  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1596 
1597  // reconstruct gauge matrix
1599 
 // 3x3 complex matrix-vector products: (A,B) = g * (a,b), fully unrolled.
1600  // multiply row 0
1601  spinorFloat A0_re = 0;
1602  A0_re += g00_re * a0_re;
1603  A0_re -= g00_im * a0_im;
1604  A0_re += g01_re * a1_re;
1605  A0_re -= g01_im * a1_im;
1606  A0_re += g02_re * a2_re;
1607  A0_re -= g02_im * a2_im;
1608  spinorFloat A0_im = 0;
1609  A0_im += g00_re * a0_im;
1610  A0_im += g00_im * a0_re;
1611  A0_im += g01_re * a1_im;
1612  A0_im += g01_im * a1_re;
1613  A0_im += g02_re * a2_im;
1614  A0_im += g02_im * a2_re;
1615  spinorFloat B0_re = 0;
1616  B0_re += g00_re * b0_re;
1617  B0_re -= g00_im * b0_im;
1618  B0_re += g01_re * b1_re;
1619  B0_re -= g01_im * b1_im;
1620  B0_re += g02_re * b2_re;
1621  B0_re -= g02_im * b2_im;
1622  spinorFloat B0_im = 0;
1623  B0_im += g00_re * b0_im;
1624  B0_im += g00_im * b0_re;
1625  B0_im += g01_re * b1_im;
1626  B0_im += g01_im * b1_re;
1627  B0_im += g02_re * b2_im;
1628  B0_im += g02_im * b2_re;
1629 
1630  // multiply row 1
1631  spinorFloat A1_re = 0;
1632  A1_re += g10_re * a0_re;
1633  A1_re -= g10_im * a0_im;
1634  A1_re += g11_re * a1_re;
1635  A1_re -= g11_im * a1_im;
1636  A1_re += g12_re * a2_re;
1637  A1_re -= g12_im * a2_im;
1638  spinorFloat A1_im = 0;
1639  A1_im += g10_re * a0_im;
1640  A1_im += g10_im * a0_re;
1641  A1_im += g11_re * a1_im;
1642  A1_im += g11_im * a1_re;
1643  A1_im += g12_re * a2_im;
1644  A1_im += g12_im * a2_re;
1645  spinorFloat B1_re = 0;
1646  B1_re += g10_re * b0_re;
1647  B1_re -= g10_im * b0_im;
1648  B1_re += g11_re * b1_re;
1649  B1_re -= g11_im * b1_im;
1650  B1_re += g12_re * b2_re;
1651  B1_re -= g12_im * b2_im;
1652  spinorFloat B1_im = 0;
1653  B1_im += g10_re * b0_im;
1654  B1_im += g10_im * b0_re;
1655  B1_im += g11_re * b1_im;
1656  B1_im += g11_im * b1_re;
1657  B1_im += g12_re * b2_im;
1658  B1_im += g12_im * b2_re;
1659 
1660  // multiply row 2
1661  spinorFloat A2_re = 0;
1662  A2_re += g20_re * a0_re;
1663  A2_re -= g20_im * a0_im;
1664  A2_re += g21_re * a1_re;
1665  A2_re -= g21_im * a1_im;
1666  A2_re += g22_re * a2_re;
1667  A2_re -= g22_im * a2_im;
1668  spinorFloat A2_im = 0;
1669  A2_im += g20_re * a0_im;
1670  A2_im += g20_im * a0_re;
1671  A2_im += g21_re * a1_im;
1672  A2_im += g21_im * a1_re;
1673  A2_im += g22_re * a2_im;
1674  A2_im += g22_im * a2_re;
1675  spinorFloat B2_re = 0;
1676  B2_re += g20_re * b0_re;
1677  B2_re -= g20_im * b0_im;
1678  B2_re += g21_re * b1_re;
1679  B2_re -= g21_im * b1_im;
1680  B2_re += g22_re * b2_re;
1681  B2_re -= g22_im * b2_im;
1682  spinorFloat B2_im = 0;
1683  B2_im += g20_re * b0_im;
1684  B2_im += g20_im * b0_re;
1685  B2_im += g21_re * b1_im;
1686  B2_im += g21_im * b1_re;
1687  B2_im += g22_re * b2_im;
1688  B2_im += g22_im * b2_re;
1689 
 // Reconstruct via the P2- structure: lower components pick up +/-i times the
 // half spinors, i.e. (row2 = +i*A, row3 = -i*B) in re/im component form.
1690  o00_re += A0_re;
1691  o00_im += A0_im;
1692  o10_re += B0_re;
1693  o10_im += B0_im;
1694  o20_re -= A0_im;
1695  o20_im += A0_re;
1696  o30_re += B0_im;
1697  o30_im -= B0_re;
1698 
1699  o01_re += A1_re;
1700  o01_im += A1_im;
1701  o11_re += B1_re;
1702  o11_im += B1_im;
1703  o21_re -= A1_im;
1704  o21_im += A1_re;
1705  o31_re += B1_im;
1706  o31_im -= B1_re;
1707 
1708  o02_re += A2_re;
1709  o02_im += A2_im;
1710  o12_re += B2_re;
1711  o12_im += B2_im;
1712  o22_re -= A2_im;
1713  o22_im += A2_re;
1714  o32_re += B2_im;
1715  o32_im -= B2_re;
1716 
1717 }
1718 
1719 #ifdef MULTI_GPU
1720 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1721  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1722 #endif
 // Backward-Z hop: gather the spinor one site back in Z (wrapping at
 // coord[2]==0), project with P2+ below, multiply by the gT?? link components
 // of the neighbor site, and accumulate into the o??_re/_im output registers.
1723 {
1724  // Projector P2+
1725  // 1 0 i 0
1726  // 0 1 0 -i
1727  // -i 0 1 0
1728  // 0 i 0 1
1729 
 // sp_idx: checkerboarded index of the Z-1 neighbor; exterior kernels read
 // the backward ([0]) ghost zone instead.
1730 #ifdef MULTI_GPU
1731  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1732  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1733 #if (DD_PREC==2) // half precision
1734  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1735 #endif
1736 #else
1737  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1738 #endif
1739 
 // Backward hop: the link lives on the neighbor site; the exterior kernel
 // uses the ghost gauge field stored past Vh.
1740 #ifdef MULTI_GPU
1741  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1742 #else
1743  const int ga_idx = sp_idx;
1744 #endif
1745 
1752 
1753 #ifdef MULTI_GPU
1754  if (kernel_type == INTERIOR_KERNEL) {
1755 #endif
1756 
 // Low-Z edge threads reload from global memory; interior threads reuse the
 // shared-memory staging (tz wraps one slice back).
1757  if (threadIdx.z == 0 && blockDim.z < X3) {
1758  // read spinor from device memory
1759  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1760 
1761  // project spinor into half spinors
1762  a0_re = +i00_re-i20_im;
1763  a0_im = +i00_im+i20_re;
1764  a1_re = +i01_re-i21_im;
1765  a1_im = +i01_im+i21_re;
1766  a2_re = +i02_re-i22_im;
1767  a2_im = +i02_im+i22_re;
1768  b0_re = +i10_re+i30_im;
1769  b0_im = +i10_im-i30_re;
1770  b1_re = +i11_re+i31_im;
1771  b1_im = +i11_im-i31_re;
1772  b2_re = +i12_re+i32_im;
1773  b2_im = +i12_im-i32_re;
1774  } else {
1775  // load spinor from shared memory
1776  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1777  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1778  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1779 
1780  // project spinor into half spinors
1781  a0_re = +i00_re-i20_im;
1782  a0_im = +i00_im+i20_re;
1783  a1_re = +i01_re-i21_im;
1784  a1_im = +i01_im+i21_re;
1785  a2_re = +i02_re-i22_im;
1786  a2_im = +i02_im+i22_re;
1787  b0_re = +i10_re+i30_im;
1788  b0_im = +i10_im-i30_re;
1789  b1_re = +i11_re+i31_im;
1790  b1_im = +i11_im-i31_re;
1791  b2_re = +i12_re+i32_im;
1792  b2_im = +i12_im-i32_re;
1793  }
1794 
1795 #ifdef MULTI_GPU
1796  } else {
1797 
1798  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1799 
1800  // read half spinor from device memory
 // NOTE(review): ghost data appears pre-projected — only i0?/i1? are read.
1801  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1802 
1803  a0_re = i00_re; a0_im = i00_im;
1804  a1_re = i01_re; a1_im = i01_im;
1805  a2_re = i02_re; a2_im = i02_im;
1806  b0_re = i10_re; b0_im = i10_im;
1807  b1_re = i11_re; b1_im = i11_im;
1808  b2_re = i12_re; b2_im = i12_im;
1809 
1810  }
1811 #endif // MULTI_GPU
1812 
1813  // read gauge matrix from device memory
1814  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1815 
1816  // reconstruct gauge matrix
1818 
 // 3x3 complex matrix-vector products: (A,B) = gT * (a,b), fully unrolled.
1819  // multiply row 0
1820  spinorFloat A0_re = 0;
1821  A0_re += gT00_re * a0_re;
1822  A0_re -= gT00_im * a0_im;
1823  A0_re += gT01_re * a1_re;
1824  A0_re -= gT01_im * a1_im;
1825  A0_re += gT02_re * a2_re;
1826  A0_re -= gT02_im * a2_im;
1827  spinorFloat A0_im = 0;
1828  A0_im += gT00_re * a0_im;
1829  A0_im += gT00_im * a0_re;
1830  A0_im += gT01_re * a1_im;
1831  A0_im += gT01_im * a1_re;
1832  A0_im += gT02_re * a2_im;
1833  A0_im += gT02_im * a2_re;
1834  spinorFloat B0_re = 0;
1835  B0_re += gT00_re * b0_re;
1836  B0_re -= gT00_im * b0_im;
1837  B0_re += gT01_re * b1_re;
1838  B0_re -= gT01_im * b1_im;
1839  B0_re += gT02_re * b2_re;
1840  B0_re -= gT02_im * b2_im;
1841  spinorFloat B0_im = 0;
1842  B0_im += gT00_re * b0_im;
1843  B0_im += gT00_im * b0_re;
1844  B0_im += gT01_re * b1_im;
1845  B0_im += gT01_im * b1_re;
1846  B0_im += gT02_re * b2_im;
1847  B0_im += gT02_im * b2_re;
1848 
1849  // multiply row 1
1850  spinorFloat A1_re = 0;
1851  A1_re += gT10_re * a0_re;
1852  A1_re -= gT10_im * a0_im;
1853  A1_re += gT11_re * a1_re;
1854  A1_re -= gT11_im * a1_im;
1855  A1_re += gT12_re * a2_re;
1856  A1_re -= gT12_im * a2_im;
1857  spinorFloat A1_im = 0;
1858  A1_im += gT10_re * a0_im;
1859  A1_im += gT10_im * a0_re;
1860  A1_im += gT11_re * a1_im;
1861  A1_im += gT11_im * a1_re;
1862  A1_im += gT12_re * a2_im;
1863  A1_im += gT12_im * a2_re;
1864  spinorFloat B1_re = 0;
1865  B1_re += gT10_re * b0_re;
1866  B1_re -= gT10_im * b0_im;
1867  B1_re += gT11_re * b1_re;
1868  B1_re -= gT11_im * b1_im;
1869  B1_re += gT12_re * b2_re;
1870  B1_re -= gT12_im * b2_im;
1871  spinorFloat B1_im = 0;
1872  B1_im += gT10_re * b0_im;
1873  B1_im += gT10_im * b0_re;
1874  B1_im += gT11_re * b1_im;
1875  B1_im += gT11_im * b1_re;
1876  B1_im += gT12_re * b2_im;
1877  B1_im += gT12_im * b2_re;
1878 
1879  // multiply row 2
1880  spinorFloat A2_re = 0;
1881  A2_re += gT20_re * a0_re;
1882  A2_re -= gT20_im * a0_im;
1883  A2_re += gT21_re * a1_re;
1884  A2_re -= gT21_im * a1_im;
1885  A2_re += gT22_re * a2_re;
1886  A2_re -= gT22_im * a2_im;
1887  spinorFloat A2_im = 0;
1888  A2_im += gT20_re * a0_im;
1889  A2_im += gT20_im * a0_re;
1890  A2_im += gT21_re * a1_im;
1891  A2_im += gT21_im * a1_re;
1892  A2_im += gT22_re * a2_im;
1893  A2_im += gT22_im * a2_re;
1894  spinorFloat B2_re = 0;
1895  B2_re += gT20_re * b0_re;
1896  B2_re -= gT20_im * b0_im;
1897  B2_re += gT21_re * b1_re;
1898  B2_re -= gT21_im * b1_im;
1899  B2_re += gT22_re * b2_re;
1900  B2_re -= gT22_im * b2_im;
1901  spinorFloat B2_im = 0;
1902  B2_im += gT20_re * b0_im;
1903  B2_im += gT20_im * b0_re;
1904  B2_im += gT21_re * b1_im;
1905  B2_im += gT21_im * b1_re;
1906  B2_im += gT22_re * b2_im;
1907  B2_im += gT22_im * b2_re;
1908 
 // Reconstruct via the P2+ structure (opposite signs to the P2- hop):
 // (row2 = -i*A, row3 = +i*B) in re/im component form.
1909  o00_re += A0_re;
1910  o00_im += A0_im;
1911  o10_re += B0_re;
1912  o10_im += B0_im;
1913  o20_re += A0_im;
1914  o20_im -= A0_re;
1915  o30_re -= B0_im;
1916  o30_im += B0_re;
1917 
1918  o01_re += A1_re;
1919  o01_im += A1_im;
1920  o11_re += B1_re;
1921  o11_im += B1_im;
1922  o21_re += A1_im;
1923  o21_im -= A1_re;
1924  o31_re -= B1_im;
1925  o31_im += B1_re;
1926 
1927  o02_re += A2_re;
1928  o02_im += A2_im;
1929  o12_re += B2_re;
1930  o12_im += B2_im;
1931  o22_re += A2_im;
1932  o22_im -= A2_re;
1933  o32_re -= B2_im;
1934  o32_im += B2_re;
1935 
1936 }
1937 
1938 #ifdef MULTI_GPU
1939 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1940  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1941 #endif
 // Forward-T hop: P3- projects onto the lower two spin components (scaled by
 // 2), so only a0..b2 from i2?/i3? are needed and only o2?/o3? are updated.
 // A fast path skips the SU(3) multiply when temporal gauge fixing makes the
 // link trivial away from the last time slice.
1942 {
1943  // Projector P3-
1944  // 0 0 0 0
1945  // 0 0 0 0
1946  // 0 0 2 0
1947  // 0 0 0 2
1948 
 // sp_idx: checkerboarded index of the T+1 neighbor; exterior kernels read
 // the forward ([1]) ghost zone.
1949 #ifdef MULTI_GPU
1950  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1951  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1952 #if (DD_PREC==2) // half precision
1953  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1954 #endif
1955 #else
1956  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1957 #endif
1958 
1959  const int ga_idx = sid;
1960 
 // Gauge-fixed fast path: for sites below the last time slice the temporal
 // link is the identity, so the spin projection result is accumulated
 // directly without loading or multiplying a gauge matrix.
1961  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1962  {
1969 
1970 #ifdef MULTI_GPU
1971  if (kernel_type == INTERIOR_KERNEL) {
1972 #endif
1973 
1974  // read spinor from device memory
 // NOTE(review): the READ_SPINOR_DOWN call on the elided source line (1975)
 // is hidden by this listing — only the lower components are needed here.
1976 
1977  // project spinor into half spinors
1978  a0_re = +2*i20_re;
1979  a0_im = +2*i20_im;
1980  a1_re = +2*i21_re;
1981  a1_im = +2*i21_im;
1982  a2_re = +2*i22_re;
1983  a2_im = +2*i22_im;
1984  b0_re = +2*i30_re;
1985  b0_im = +2*i30_im;
1986  b1_re = +2*i31_re;
1987  b1_im = +2*i31_im;
1988  b2_re = +2*i32_re;
1989  b2_im = +2*i32_im;
1990 
1991 #ifdef MULTI_GPU
1992  } else {
1993 
1994  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
 // t_proj_scale compensates for the projector normalization applied when the
 // half spinor was packed on the neighboring GPU (TPROJSCALE macro).
1995  const int t_proj_scale = TPROJSCALE;
1996 
1997  // read half spinor from device memory
1998  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1999 
2000  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2001  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2002  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2003  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2004  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2005  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2006 
2007  }
2008 #endif // MULTI_GPU
2009 
2010  // identity gauge matrix
 // NOTE(review): the assignments of A0..B2 from a0..b2 for this identity-link
 // branch sit on source lines 2011-2016, elided from this listing.
2017 
2018  o20_re += A0_re;
2019  o20_im += A0_im;
2020  o30_re += B0_re;
2021  o30_im += B0_im;
2022 
2023  o21_re += A1_re;
2024  o21_im += A1_im;
2025  o31_re += B1_re;
2026  o31_im += B1_im;
2027 
2028  o22_re += A2_re;
2029  o22_im += A2_im;
2030  o32_re += B2_re;
2031  o32_im += B2_im;
2032 
 // General path: load the temporal link and do the full SU(3) multiply.
2033  } else {
2040 
2041 #ifdef MULTI_GPU
2042  if (kernel_type == INTERIOR_KERNEL) {
2043 #endif
2044 
2045  // read spinor from device memory
 // NOTE(review): the READ_SPINOR_DOWN call (source line 2046) is elided here.
2047 
2048  // project spinor into half spinors
2049  a0_re = +2*i20_re;
2050  a0_im = +2*i20_im;
2051  a1_re = +2*i21_re;
2052  a1_im = +2*i21_im;
2053  a2_re = +2*i22_re;
2054  a2_im = +2*i22_im;
2055  b0_re = +2*i30_re;
2056  b0_im = +2*i30_im;
2057  b1_re = +2*i31_re;
2058  b1_im = +2*i31_im;
2059  b2_re = +2*i32_re;
2060  b2_im = +2*i32_im;
2061 
2062 #ifdef MULTI_GPU
2063  } else {
2064 
2065  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2066  const int t_proj_scale = TPROJSCALE;
2067 
2068  // read half spinor from device memory
2069  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2070 
2071  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2072  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2073  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2074  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2075  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2076  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2077 
2078  }
2079 #endif // MULTI_GPU
2080 
2081  // read gauge matrix from device memory
2082  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
2083 
2084  // reconstruct gauge matrix
2086 
 // 3x3 complex matrix-vector products: (A,B) = g * (a,b), fully unrolled.
2087  // multiply row 0
2088  spinorFloat A0_re = 0;
2089  A0_re += g00_re * a0_re;
2090  A0_re -= g00_im * a0_im;
2091  A0_re += g01_re * a1_re;
2092  A0_re -= g01_im * a1_im;
2093  A0_re += g02_re * a2_re;
2094  A0_re -= g02_im * a2_im;
2095  spinorFloat A0_im = 0;
2096  A0_im += g00_re * a0_im;
2097  A0_im += g00_im * a0_re;
2098  A0_im += g01_re * a1_im;
2099  A0_im += g01_im * a1_re;
2100  A0_im += g02_re * a2_im;
2101  A0_im += g02_im * a2_re;
2102  spinorFloat B0_re = 0;
2103  B0_re += g00_re * b0_re;
2104  B0_re -= g00_im * b0_im;
2105  B0_re += g01_re * b1_re;
2106  B0_re -= g01_im * b1_im;
2107  B0_re += g02_re * b2_re;
2108  B0_re -= g02_im * b2_im;
2109  spinorFloat B0_im = 0;
2110  B0_im += g00_re * b0_im;
2111  B0_im += g00_im * b0_re;
2112  B0_im += g01_re * b1_im;
2113  B0_im += g01_im * b1_re;
2114  B0_im += g02_re * b2_im;
2115  B0_im += g02_im * b2_re;
2116 
2117  // multiply row 1
2118  spinorFloat A1_re = 0;
2119  A1_re += g10_re * a0_re;
2120  A1_re -= g10_im * a0_im;
2121  A1_re += g11_re * a1_re;
2122  A1_re -= g11_im * a1_im;
2123  A1_re += g12_re * a2_re;
2124  A1_re -= g12_im * a2_im;
2125  spinorFloat A1_im = 0;
2126  A1_im += g10_re * a0_im;
2127  A1_im += g10_im * a0_re;
2128  A1_im += g11_re * a1_im;
2129  A1_im += g11_im * a1_re;
2130  A1_im += g12_re * a2_im;
2131  A1_im += g12_im * a2_re;
2132  spinorFloat B1_re = 0;
2133  B1_re += g10_re * b0_re;
2134  B1_re -= g10_im * b0_im;
2135  B1_re += g11_re * b1_re;
2136  B1_re -= g11_im * b1_im;
2137  B1_re += g12_re * b2_re;
2138  B1_re -= g12_im * b2_im;
2139  spinorFloat B1_im = 0;
2140  B1_im += g10_re * b0_im;
2141  B1_im += g10_im * b0_re;
2142  B1_im += g11_re * b1_im;
2143  B1_im += g11_im * b1_re;
2144  B1_im += g12_re * b2_im;
2145  B1_im += g12_im * b2_re;
2146 
2147  // multiply row 2
2148  spinorFloat A2_re = 0;
2149  A2_re += g20_re * a0_re;
2150  A2_re -= g20_im * a0_im;
2151  A2_re += g21_re * a1_re;
2152  A2_re -= g21_im * a1_im;
2153  A2_re += g22_re * a2_re;
2154  A2_re -= g22_im * a2_im;
2155  spinorFloat A2_im = 0;
2156  A2_im += g20_re * a0_im;
2157  A2_im += g20_im * a0_re;
2158  A2_im += g21_re * a1_im;
2159  A2_im += g21_im * a1_re;
2160  A2_im += g22_re * a2_im;
2161  A2_im += g22_im * a2_re;
2162  spinorFloat B2_re = 0;
2163  B2_re += g20_re * b0_re;
2164  B2_re -= g20_im * b0_im;
2165  B2_re += g21_re * b1_re;
2166  B2_re -= g21_im * b1_im;
2167  B2_re += g22_re * b2_re;
2168  B2_re -= g22_im * b2_im;
2169  spinorFloat B2_im = 0;
2170  B2_im += g20_re * b0_im;
2171  B2_im += g20_im * b0_re;
2172  B2_im += g21_re * b1_im;
2173  B2_im += g21_im * b1_re;
2174  B2_im += g22_re * b2_im;
2175  B2_im += g22_im * b2_re;
2176 
 // P3- only populates the lower spin components, so only o2?/o3? accumulate.
2177  o20_re += A0_re;
2178  o20_im += A0_im;
2179  o30_re += B0_re;
2180  o30_im += B0_im;
2181 
2182  o21_re += A1_re;
2183  o21_im += A1_im;
2184  o31_re += B1_re;
2185  o31_im += B1_im;
2186 
2187  o22_re += A2_re;
2188  o22_im += A2_im;
2189  o32_re += B2_re;
2190  o32_im += B2_im;
2191 
2192  }
2193 }
2194 
2195 #ifdef MULTI_GPU
2196 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
2197  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
2198 #endif
2199 {
2200  // Projector P3+
2201  // 2 0 0 0
2202  // 0 2 0 0
2203  // 0 0 0 0
2204  // 0 0 0 0
2205 
2206 #ifdef MULTI_GPU
2207  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
2208  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
2209 #if (DD_PREC==2) // half precision
2210  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2211 #endif
2212 #else
2213  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
2214 #endif
2215 
2216 #ifdef MULTI_GPU
2217  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2218 #else
2219  const int ga_idx = sp_idx;
2220 #endif
2221 
// Backward temporal hop (direction index 7). Two paths:
//  - gauge-fixed fast path: the time link is the identity in the bulk
//    (ga_idx below the last time-slice), so no gauge read/multiply is needed;
//  - general path: read the link and apply its conjugate transpose (gT* accessors
//    — presumably U^dagger for the backward hop; confirm in the gauge macros).
// In both paths only the first two spin components (a = spin 0, b = spin 1)
// contribute, consistent with the temporal projector applied below.
2222  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2223  {
2230 
2231 #ifdef MULTI_GPU
2232  if (kernel_type == INTERIOR_KERNEL) {
2233 #endif
2234 
2235  // read spinor from device memory
2236  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2237 
2238  // project spinor into half spinors
// The projector leaves the upper spin components with weight 2; the lower
// components vanish, so only i0x/i1x are read (READ_SPINOR_UP above).
2239  a0_re = +2*i00_re;
2240  a0_im = +2*i00_im;
2241  a1_re = +2*i01_re;
2242  a1_im = +2*i01_im;
2243  a2_re = +2*i02_re;
2244  a2_im = +2*i02_im;
2245  b0_re = +2*i10_re;
2246  b0_im = +2*i10_im;
2247  b1_re = +2*i11_re;
2248  b1_im = +2*i11_im;
2249  b2_re = +2*i12_re;
2250  b2_im = +2*i12_im;
2251 
2252 #ifdef MULTI_GPU
2253  } else {
// Exterior kernel: the neighbor lives on another GPU, so read the
// pre-projected half spinor from the ghost zone for this face.
2254 
2255  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
// NOTE(review): declared int — if TPROJSCALE can be fractional (e.g. 0.5)
// this truncates; confirm the TPROJSCALE definition.
2256  const int t_proj_scale = TPROJSCALE;
2257 
2258  // read half spinor from device memory
2259  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2260 
2261  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2262  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2263  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2264  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2265  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2266  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2267 
2268  }
2269 #endif // MULTI_GPU
2270 
2271  // identity gauge matrix
// NOTE(review): original lines 2272-2277 are elided in this listing
// (numbering jump) — presumably the trivial A/B assignments for the
// identity link; confirm against the generated source.
2278 
// Accumulate into the upper spin rows of the output; the lower rows (o2x,
// o3x) receive no contribution from this direction here.
2279  o00_re += A0_re;
2280  o00_im += A0_im;
2281  o10_re += B0_re;
2282  o10_im += B0_im;
2283 
2284  o01_re += A1_re;
2285  o01_im += A1_im;
2286  o11_re += B1_re;
2287  o11_im += B1_im;
2288 
2289  o02_re += A2_re;
2290  o02_im += A2_im;
2291  o12_re += B2_re;
2292  o12_im += B2_im;
2293 
2294  } else {
// General (non-gauge-fixed or boundary) path: same spinor projection as
// above, followed by an explicit 3x3 color-matrix multiply with the link.
// NOTE(review): original lines 2295-2300 elided in this listing.
2301 
2302 #ifdef MULTI_GPU
2303  if (kernel_type == INTERIOR_KERNEL) {
2304 #endif
2305 
2306  // read spinor from device memory
2307  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2308 
2309  // project spinor into half spinors
2310  a0_re = +2*i00_re;
2311  a0_im = +2*i00_im;
2312  a1_re = +2*i01_re;
2313  a1_im = +2*i01_im;
2314  a2_re = +2*i02_re;
2315  a2_im = +2*i02_im;
2316  b0_re = +2*i10_re;
2317  b0_im = +2*i10_im;
2318  b1_re = +2*i11_re;
2319  b1_im = +2*i11_im;
2320  b2_re = +2*i12_re;
2321  b2_im = +2*i12_im;
2322 
2323 #ifdef MULTI_GPU
2324  } else {
2325 
2326  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
// NOTE(review): int declaration — same truncation concern as above.
2327  const int t_proj_scale = TPROJSCALE;
2328 
2329  // read half spinor from device memory
2330  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2331 
2332  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2333  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2334  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2335  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2336  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2337  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2338 
2339  }
2340 #endif // MULTI_GPU
2341 
2342  // read gauge matrix from device memory
2343  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2344 
2345  // reconstruct gauge matrix
// NOTE(review): original line 2346 elided in this listing — presumably
// RECONSTRUCT_GAUGE_MATRIX(7); confirm against the generated source.
2347 
// Complex matrix-vector products: (A,B) = gT * (a,b), with gT entries
// accessed via gT??_re/gT??_im. Each output component is an explicit
// real/imag expansion of the complex dot product of one matrix row with
// the color vector.
2348  // multiply row 0
2349  spinorFloat A0_re = 0;
2350  A0_re += gT00_re * a0_re;
2351  A0_re -= gT00_im * a0_im;
2352  A0_re += gT01_re * a1_re;
2353  A0_re -= gT01_im * a1_im;
2354  A0_re += gT02_re * a2_re;
2355  A0_re -= gT02_im * a2_im;
2356  spinorFloat A0_im = 0;
2357  A0_im += gT00_re * a0_im;
2358  A0_im += gT00_im * a0_re;
2359  A0_im += gT01_re * a1_im;
2360  A0_im += gT01_im * a1_re;
2361  A0_im += gT02_re * a2_im;
2362  A0_im += gT02_im * a2_re;
2363  spinorFloat B0_re = 0;
2364  B0_re += gT00_re * b0_re;
2365  B0_re -= gT00_im * b0_im;
2366  B0_re += gT01_re * b1_re;
2367  B0_re -= gT01_im * b1_im;
2368  B0_re += gT02_re * b2_re;
2369  B0_re -= gT02_im * b2_im;
2370  spinorFloat B0_im = 0;
2371  B0_im += gT00_re * b0_im;
2372  B0_im += gT00_im * b0_re;
2373  B0_im += gT01_re * b1_im;
2374  B0_im += gT01_im * b1_re;
2375  B0_im += gT02_re * b2_im;
2376  B0_im += gT02_im * b2_re;
2377 
2378  // multiply row 1
2379  spinorFloat A1_re = 0;
2380  A1_re += gT10_re * a0_re;
2381  A1_re -= gT10_im * a0_im;
2382  A1_re += gT11_re * a1_re;
2383  A1_re -= gT11_im * a1_im;
2384  A1_re += gT12_re * a2_re;
2385  A1_re -= gT12_im * a2_im;
2386  spinorFloat A1_im = 0;
2387  A1_im += gT10_re * a0_im;
2388  A1_im += gT10_im * a0_re;
2389  A1_im += gT11_re * a1_im;
2390  A1_im += gT11_im * a1_re;
2391  A1_im += gT12_re * a2_im;
2392  A1_im += gT12_im * a2_re;
2393  spinorFloat B1_re = 0;
2394  B1_re += gT10_re * b0_re;
2395  B1_re -= gT10_im * b0_im;
2396  B1_re += gT11_re * b1_re;
2397  B1_re -= gT11_im * b1_im;
2398  B1_re += gT12_re * b2_re;
2399  B1_re -= gT12_im * b2_im;
2400  spinorFloat B1_im = 0;
2401  B1_im += gT10_re * b0_im;
2402  B1_im += gT10_im * b0_re;
2403  B1_im += gT11_re * b1_im;
2404  B1_im += gT11_im * b1_re;
2405  B1_im += gT12_re * b2_im;
2406  B1_im += gT12_im * b2_re;
2407 
2408  // multiply row 2
2409  spinorFloat A2_re = 0;
2410  A2_re += gT20_re * a0_re;
2411  A2_re -= gT20_im * a0_im;
2412  A2_re += gT21_re * a1_re;
2413  A2_re -= gT21_im * a1_im;
2414  A2_re += gT22_re * a2_re;
2415  A2_re -= gT22_im * a2_im;
2416  spinorFloat A2_im = 0;
2417  A2_im += gT20_re * a0_im;
2418  A2_im += gT20_im * a0_re;
2419  A2_im += gT21_re * a1_im;
2420  A2_im += gT21_im * a1_re;
2421  A2_im += gT22_re * a2_im;
2422  A2_im += gT22_im * a2_re;
2423  spinorFloat B2_re = 0;
2424  B2_re += gT20_re * b0_re;
2425  B2_re -= gT20_im * b0_im;
2426  B2_re += gT21_re * b1_re;
2427  B2_re -= gT21_im * b1_im;
2428  B2_re += gT22_re * b2_re;
2429  B2_re -= gT22_im * b2_im;
2430  spinorFloat B2_im = 0;
2431  B2_im += gT20_re * b0_im;
2432  B2_im += gT20_im * b0_re;
2433  B2_im += gT21_re * b1_im;
2434  B2_im += gT21_im * b1_re;
2435  B2_im += gT22_re * b2_im;
2436  B2_im += gT22_im * b2_re;
2437 
// Accumulate into the upper spin rows of the output spinor.
2438  o00_re += A0_re;
2439  o00_im += A0_im;
2440  o10_re += B0_re;
2441  o10_im += B0_im;
2442 
2443  o01_re += A1_re;
2444  o01_im += A1_im;
2445  o11_re += B1_re;
2446  o11_im += B1_im;
2447 
2448  o02_re += A2_re;
2449  o02_im += A2_im;
2450  o12_re += B2_re;
2451  o12_im += B2_im;
2452 
2453  }
2454 }
2455 
2456 #ifdef MULTI_GPU
2457 
2458 int incomplete = 0; // Have all 8 contributions been computed for this site?
2459 
// Cascading boundary test: each case deliberately falls through so that an
// earlier kernel type also checks every later (lower-index) partitioned
// dimension. A site on any still-pending communicated boundary is left for
// the corresponding exterior kernel to finish.
2460 switch(kernel_type) { // intentional fall-through
2461 
2462 case INTERIOR_KERNEL:
2463  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2464 case EXTERIOR_KERNEL_T:
2465  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2466 case EXTERIOR_KERNEL_Z:
2467  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2468 case EXTERIOR_KERNEL_Y:
2469  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2470 }
2471 
// Only apply the twist/clover term and (optionally) the xpay accumulation
// once all hop contributions for this site are in.
2472 if (!incomplete)
2473 #endif // MULTI_GPU
2474 {
// Twist coefficient, taken from the field matching the spinor precision.
2475 #ifdef SPINOR_DOUBLE
2476  spinorFloat a = param.a;
2477 #else
2478  spinorFloat a = param.a_f;
2479 #endif
2480 #ifdef DSLASH_XPAY
// xpay scale factor b, same precision selection as a.
2481 #ifdef SPINOR_DOUBLE
2482  spinorFloat b = param.b;
2483 #else
2484  spinorFloat b = param.b_f;
2485 #endif
// Load the accumulator spinor (acc*) that the dslash result is folded into.
2486  READ_ACCUM(ACCUMTEX, param.sp_stride)
2487 
2488 #ifndef CLOVER_TWIST_XPAY
2489  //perform invert twist first:
2490 #ifndef DYNAMIC_CLOVER
// Apply the inverse twisted-clover term to the output spinor o.
2491  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2492 #else
// NOTE(review): the DYNAMIC_CLOVER alternative (original line 2493) is
// elided in this listing — presumably APPLY_CLOVER_TWIST_DYN_INV; confirm
// against tmc_core.h.
2494 #endif
// o = b*o + acc, component by component (all four spin rows, three colors).
2495  o00_re = b*o00_re + acc00_re;
2496  o00_im = b*o00_im + acc00_im;
2497  o01_re = b*o01_re + acc01_re;
2498  o01_im = b*o01_im + acc01_im;
2499  o02_re = b*o02_re + acc02_re;
2500  o02_im = b*o02_im + acc02_im;
2501  o10_re = b*o10_re + acc10_re;
2502  o10_im = b*o10_im + acc10_im;
2503  o11_re = b*o11_re + acc11_re;
2504  o11_im = b*o11_im + acc11_im;
2505  o12_re = b*o12_re + acc12_re;
2506  o12_im = b*o12_im + acc12_im;
2507  o20_re = b*o20_re + acc20_re;
2508  o20_im = b*o20_im + acc20_im;
2509  o21_re = b*o21_re + acc21_re;
2510  o21_im = b*o21_im + acc21_im;
2511  o22_re = b*o22_re + acc22_re;
2512  o22_im = b*o22_im + acc22_im;
2513  o30_re = b*o30_re + acc30_re;
2514  o30_im = b*o30_im + acc30_im;
2515  o31_re = b*o31_re + acc31_re;
2516  o31_im = b*o31_im + acc31_im;
2517  o32_re = b*o32_re + acc32_re;
2518  o32_im = b*o32_im + acc32_im;
2519 #else
// CLOVER_TWIST_XPAY variant: twist is applied to the accumulator instead,
// then the same o = b*o + acc combination is performed.
2520  APPLY_CLOVER_TWIST(c, a, acc);
2521  o00_re = b*o00_re + acc00_re;
2522  o00_im = b*o00_im + acc00_im;
2523  o01_re = b*o01_re + acc01_re;
2524  o01_im = b*o01_im + acc01_im;
2525  o02_re = b*o02_re + acc02_re;
2526  o02_im = b*o02_im + acc02_im;
2527  o10_re = b*o10_re + acc10_re;
2528  o10_im = b*o10_im + acc10_im;
2529  o11_re = b*o11_re + acc11_re;
2530  o11_im = b*o11_im + acc11_im;
2531  o12_re = b*o12_re + acc12_re;
2532  o12_im = b*o12_im + acc12_im;
2533  o20_re = b*o20_re + acc20_re;
2534  o20_im = b*o20_im + acc20_im;
2535  o21_re = b*o21_re + acc21_re;
2536  o21_im = b*o21_im + acc21_im;
2537  o22_re = b*o22_re + acc22_re;
2538  o22_im = b*o22_im + acc22_im;
2539  o30_re = b*o30_re + acc30_re;
2540  o30_im = b*o30_im + acc30_im;
2541  o31_re = b*o31_re + acc31_re;
2542  o31_im = b*o31_im + acc31_im;
2543  o32_re = b*o32_re + acc32_re;
2544  o32_im = b*o32_im + acc32_im;
2545 #endif//CLOVER_TWIST_XPAY
2546 #else //no XPAY
// Plain dslash: just apply the inverse twisted-clover term to o.
2547 #ifndef DYNAMIC_CLOVER
2548  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2549 #else
// NOTE(review): DYNAMIC_CLOVER alternative (original line 2550) elided in
// this listing — confirm against tmc_core.h.
2551 #endif
2552 #endif
2553 }
2554 
2555 // write spinor field back to device memory
2556 WRITE_SPINOR(param.sp_stride);
2557 
2558 // undefine to prevent warning when precision is changed
2559 #undef spinorFloat
2560 #undef WRITE_SPINOR_SHARED
2561 #undef READ_SPINOR_SHARED
2562 #undef SHARED_STRIDE
2563 
2564 #undef g00_re
2565 #undef g00_im
2566 #undef g01_re
2567 #undef g01_im
2568 #undef g02_re
2569 #undef g02_im
2570 #undef g10_re
2571 #undef g10_im
2572 #undef g11_re
2573 #undef g11_im
2574 #undef g12_re
2575 #undef g12_im
2576 #undef g20_re
2577 #undef g20_im
2578 #undef g21_re
2579 #undef g21_im
2580 #undef g22_re
2581 #undef g22_im
2582 
2583 #undef i00_re
2584 #undef i00_im
2585 #undef i01_re
2586 #undef i01_im
2587 #undef i02_re
2588 #undef i02_im
2589 #undef i10_re
2590 #undef i10_im
2591 #undef i11_re
2592 #undef i11_im
2593 #undef i12_re
2594 #undef i12_im
2595 #undef i20_re
2596 #undef i20_im
2597 #undef i21_re
2598 #undef i21_im
2599 #undef i22_re
2600 #undef i22_im
2601 #undef i30_re
2602 #undef i30_im
2603 #undef i31_re
2604 #undef i31_im
2605 #undef i32_re
2606 #undef i32_im
2607 
2608 #undef c00_00_re
2609 #undef c01_01_re
2610 #undef c02_02_re
2611 #undef c10_10_re
2612 #undef c11_11_re
2613 #undef c12_12_re
2614 #undef c01_00_re
2615 #undef c01_00_im
2616 #undef c02_00_re
2617 #undef c02_00_im
2618 #undef c10_00_re
2619 #undef c10_00_im
2620 #undef c11_00_re
2621 #undef c11_00_im
2622 #undef c12_00_re
2623 #undef c12_00_im
2624 #undef c02_01_re
2625 #undef c02_01_im
2626 #undef c10_01_re
2627 #undef c10_01_im
2628 #undef c11_01_re
2629 #undef c11_01_im
2630 #undef c12_01_re
2631 #undef c12_01_im
2632 #undef c10_02_re
2633 #undef c10_02_im
2634 #undef c11_02_re
2635 #undef c11_02_im
2636 #undef c12_02_re
2637 #undef c12_02_im
2638 #undef c11_10_re
2639 #undef c11_10_im
2640 #undef c12_10_re
2641 #undef c12_10_im
2642 #undef c12_11_re
2643 #undef c12_11_im
2644 
2645 #undef cinv00_00_re
2646 #undef cinv01_01_re
2647 #undef cinv02_02_re
2648 #undef cinv10_10_re
2649 #undef cinv11_11_re
2650 #undef cinv12_12_re
2651 #undef cinv01_00_re
2652 #undef cinv01_00_im
2653 #undef cinv02_00_re
2654 #undef cinv02_00_im
2655 #undef cinv10_00_re
2656 #undef cinv10_00_im
2657 #undef cinv11_00_re
2658 #undef cinv11_00_im
2659 #undef cinv12_00_re
2660 #undef cinv12_00_im
2661 #undef cinv02_01_re
2662 #undef cinv02_01_im
2663 #undef cinv10_01_re
2664 #undef cinv10_01_im
2665 #undef cinv11_01_re
2666 #undef cinv11_01_im
2667 #undef cinv12_01_re
2668 #undef cinv12_01_im
2669 #undef cinv10_02_re
2670 #undef cinv10_02_im
2671 #undef cinv11_02_re
2672 #undef cinv11_02_im
2673 #undef cinv12_02_re
2674 #undef cinv12_02_im
2675 #undef cinv11_10_re
2676 #undef cinv11_10_im
2677 #undef cinv12_10_re
2678 #undef cinv12_10_im
2679 #undef cinv12_11_re
2680 #undef cinv12_11_im
2681 
2682 #undef acc00_re
2683 #undef acc00_im
2684 #undef acc01_re
2685 #undef acc01_im
2686 #undef acc02_re
2687 #undef acc02_im
2688 #undef acc10_re
2689 #undef acc10_im
2690 #undef acc11_re
2691 #undef acc11_im
2692 #undef acc12_re
2693 #undef acc12_im
2694 #undef acc20_re
2695 #undef acc20_im
2696 #undef acc21_re
2697 #undef acc21_im
2698 #undef acc22_re
2699 #undef acc22_im
2700 #undef acc30_re
2701 #undef acc30_im
2702 #undef acc31_re
2703 #undef acc31_im
2704 #undef acc32_re
2705 #undef acc32_im
2706 
2707 
2708 #undef o00_re
2709 #undef o00_im
2710 #undef o01_re
2711 #undef o01_im
2712 #undef o02_re
2713 #undef o02_im
2714 #undef o10_re
2715 #undef o10_im
2716 #undef o11_re
2717 #undef o11_im
2718 #undef o12_re
2719 #undef o12_im
2720 #undef o20_re
2721 #undef o20_im
2722 #undef o21_re
2723 #undef o21_im
2724 #undef o22_re
2725 #undef o22_im
2726 #undef o30_re
2727 #undef o30_im
2728 #undef o31_re
2729 #undef o31_im
2730 #undef o32_re
2731 #undef o32_im
2732 
2733 #undef VOLATILE
#define i22_re
#define g21_im
spinorFloat B2_im
#define WRITE_SPINOR_SHARED
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
dim3 dim3 blockDim
#define acc30_re
float4 C3
const int ga_idx
VOLATILE spinorFloat o12_re
spinorFloat a1_im
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:832
#define g21_re
#define acc22_im
spinorFloat A1_im
#define i00_re
VOLATILE spinorFloat o31_re
#define g00_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re
#define g12_re
#define g10_re
#define acc31_im
#define acc20_im
spinorFloat a1_re
#define gT02_re
#define i10_im
#define spinorFloat
VOLATILE spinorFloat o32_im
int sp_idx
#define acc00_im
#define VOLATILE
spinorFloat B0_im
spinorFloat A0_re
#define i30_im
#define acc10_im
#define gT00_re
#define acc32_im
float4 C6
#define gT10_im
#define g11_im
float4 C0
#define acc02_re
#define APPLY_CLOVER_TWIST_DYN_INV(c, a, reg)
Definition: tmc_core.h:2004
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
float4 C7
VOLATILE spinorFloat o30_im
spinorFloat b0_im
#define acc31_re
spinorFloat B0_re
#define acc22_re
#define gT00_im
#define gT12_im
#define gT20_re
#define i10_re
#define acc21_re
#define GAUGE0TEX
VOLATILE spinorFloat o20_re
WRITE_SPINOR(param.sp_stride)
spinorFloat b2_im
#define gT01_re
#define i31_re
#define g12_im
#define i31_im
#define gT12_re
#define i20_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
spinorFloat b1_re
#define i02_im
spinorFloat B2_re
#define g20_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_re
#define g02_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat b1_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o21_im
#define acc21_im
#define g20_im
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o02_re
#define GAUGE1TEX
int coord[5]
#define SPINORTEX
#define acc01_im
VOLATILE spinorFloat o00_re
#define gT22_re
#define acc00_re
#define acc11_im
spinorFloat A0_im
spinorFloat A2_im
#define g10_im
VOLATILE spinorFloat o20_im
#define i32_re
float4 C1
#define acc32_re
#define READ_INTERMEDIATE_SPINOR
#define g02_im
#define i22_im
spinorFloat a2_re
__syncthreads()
VOLATILE spinorFloat o11_im
spinorFloat B1_im
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o00_im
#define READ_SPINOR_SHARED
#define i00_im
#define i21_im
#define i21_re
#define READ_SPINOR_GHOST
#define g11_re
#define gT11_im
RECONSTRUCT_GAUGE_MATRIX(0)
#define g01_im
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
APPLY_CLOVER_TWIST_INV(c, cinv, a, o)
#define i02_re
#define i11_re
VOLATILE spinorFloat o31_im
spinorFloat B1_re
spinorFloat b0_re
#define i30_re
#define gT01_im
#define i01_im
#define gT11_re
#define acc01_re
#define acc12_re
#define gT21_im
#define gT22_im
#define INTERTEX
#define gT21_re
#define i12_re
#define acc02_im
int face_idx
spinorFloat a2_im
#define i01_re
float4 C5
#define gT10_re
#define acc20_re
#define g01_re
const void * c
float4 C8
const int face_num
#define TPROJSCALE
float4 C2
#define g22_im
#define i20_im
VOLATILE spinorFloat o21_re
spinorFloat a0_im
#define gT20_im
#define GHOSTSPINORTEX
float4 C4
#define acc11_re
spinorFloat A1_re
#define i32_im
spinorFloat A2_re
spinorFloat a0_re
#define acc12_im
#define g22_re
#define i12_im
#define acc30_im
#define a
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o12_im
#define gT02_im
spinorFloat b2_re
#define g00_im
VOLATILE spinorFloat o01_im
#define i11_im
#define acc10_re
coordsFromIndex3D< EVEN_X >(X, coord, sid, param)