QUDA 0.9.0 — generated source listing of asym_wilson_clover_dslash_gt200_core.h
(asymmetric Wilson-clover dslash CUDA kernel core, GT200-generation target).
This file is auto-generated; see the QUDA documentation for the generator.
1 // *** CUDA DSLASH ***
2 
// This kernel variant stages nothing in shared memory; the whole spinor
// lives in registers, hence zero shared floats per thread.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// Select a register qualifier per compiler back end: the NVVM back end
// (CUDA >= 4.1 on sm_20+) needs none, while the older Open64 back end gets
// `volatile` — presumably to suppress harmful register optimization;
// NOTE(review): confirm rationale against QUDA build notes.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Map the 12 complex (24 real) spin/color components of the input spinor
// (i<spin><color>_{re,im}) and of the accumulator spinor
// (acc<spin><color>_{re,im}) onto the raw load registers.
// Double precision: twelve 2-component registers I0..I11 / accum0..accum11.
// Single precision: six 4-component registers I0..I5 / accum0..accum5.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Map the 3x3 complex gauge-link matrix g<row><col>_{re,im} onto load
// registers.  FLOAT2 layout (also used for double precision): nine
// 2-component registers G0..G8.  Otherwise: five 4-component registers
// G0..G4 holding the 18 reals.
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// Hermitian conjugate of the gauge link: gT[i][j] = conj(g[j][i]).
// Used when hopping in the negative direction (U^dagger multiply).
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of inverted clover term
// Storage for one 6x6 Hermitian block of the (inverse) clover matrix:
// the 6 diagonal entries are real (c<r>_<r>_re), the 15 lower-triangle
// entries are complex.  Double precision: eighteen 2-component registers
// C0..C17; single precision: nine 4-component registers C0..C8.
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// Upper-triangle clover entries derived by Hermitian symmetry:
// c[i][j] = conj(c[j][i]) (real part copied, imaginary part negated).
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of inverted clover term (reuses C0,...,C9)
// The second 6x6 chiral block (spin components 2,3) aliases the same
// registers/macros as the first block; the block's data is reloaded into
// those registers before use.  Diagonal entries are real, so no _im
// macro exists for c20_20, c21_21, c22_22, c30_30, c31_31, c32_32.
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
378 
379 #include "read_gauge.h"
380 #include "read_clover.h"
381 #include "io_spinor.h"
382 
// Kernel-body preamble (this header is included inside a kernel body).
// coord[]: the site's lattice coordinates; X: full lattice index;
// sid: this thread's checkerboarded site index.
383 int coord[5];
384 int X;
385 
386 int sid;
387 
388 #ifdef MULTI_GPU
389 int face_idx;
390 if (kernel_type == INTERIOR_KERNEL) {
391 #endif
392 
// One thread per (even-parity) site; guard against the grid overshooting
// the thread count.
393  sid = blockIdx.x*blockDim.x + threadIdx.x;
394  if (sid >= param.threads) return;
395 
396  // Assume even dimensions
// NOTE(review): the coordinate computation appears elided in this listing
// (embedded-line gap 396->398) — consult the full generated file.
398 
// Zero the output spinor o<spin><color> before accumulating hop terms.
399  o00_re = 0; o00_im = 0;
400  o01_re = 0; o01_im = 0;
401  o02_re = 0; o02_im = 0;
402  o10_re = 0; o10_im = 0;
403  o11_re = 0; o11_im = 0;
404  o12_re = 0; o12_im = 0;
405  o20_re = 0; o20_im = 0;
406  o21_re = 0; o21_im = 0;
407  o22_re = 0; o22_im = 0;
408  o30_re = 0; o30_im = 0;
409  o31_re = 0; o31_im = 0;
410  o32_re = 0; o32_im = 0;
// XPAY variant: the output starts from (inverse-clover applied) accumulator
// spinor rather than zero; the hop terms added later are scaled by `a`.
411 #ifdef DSLASH_CLOVER_XPAY
412 
413  READ_ACCUM(ACCUMTEX, param.sp_stride)
414 
415 #ifdef DSLASH_CLOVER
416 
417  // change to chiral basis
// Rotate each color component of the accumulator from the UKQCD-style
// basis into the chiral basis, where the clover term is block diagonal.
418  {
419  spinorFloat a00_re = -acc10_re - acc30_re;
420  spinorFloat a00_im = -acc10_im - acc30_im;
421  spinorFloat a10_re = acc00_re + acc20_re;
422  spinorFloat a10_im = acc00_im + acc20_im;
423  spinorFloat a20_re = -acc10_re + acc30_re;
424  spinorFloat a20_im = -acc10_im + acc30_im;
425  spinorFloat a30_re = acc00_re - acc20_re;
426  spinorFloat a30_im = acc00_im - acc20_im;
427 
428  acc00_re = a00_re; acc00_im = a00_im;
429  acc10_re = a10_re; acc10_im = a10_im;
430  acc20_re = a20_re; acc20_im = a20_im;
431  acc30_re = a30_re; acc30_im = a30_im;
432  }
433 
434  {
435  spinorFloat a01_re = -acc11_re - acc31_re;
436  spinorFloat a01_im = -acc11_im - acc31_im;
437  spinorFloat a11_re = acc01_re + acc21_re;
438  spinorFloat a11_im = acc01_im + acc21_im;
439  spinorFloat a21_re = -acc11_re + acc31_re;
440  spinorFloat a21_im = -acc11_im + acc31_im;
441  spinorFloat a31_re = acc01_re - acc21_re;
442  spinorFloat a31_im = acc01_im - acc21_im;
443 
444  acc01_re = a01_re; acc01_im = a01_im;
445  acc11_re = a11_re; acc11_im = a11_im;
446  acc21_re = a21_re; acc21_im = a21_im;
447  acc31_re = a31_re; acc31_im = a31_im;
448  }
449 
450  {
451  spinorFloat a02_re = -acc12_re - acc32_re;
452  spinorFloat a02_im = -acc12_im - acc32_im;
453  spinorFloat a12_re = acc02_re + acc22_re;
454  spinorFloat a12_im = acc02_im + acc22_im;
455  spinorFloat a22_re = -acc12_re + acc32_re;
456  spinorFloat a22_im = -acc12_im + acc32_im;
457  spinorFloat a32_re = acc02_re - acc22_re;
458  spinorFloat a32_im = acc02_im - acc22_im;
459 
460  acc02_re = a02_re; acc02_im = a02_im;
461  acc12_re = a12_re; acc12_im = a12_im;
462  acc22_re = a22_re; acc22_im = a22_im;
463  acc32_re = a32_re; acc32_im = a32_im;
464  }
465 
466  // apply first chiral block
// Complex 6x6 Hermitian matrix-vector product: acc(0,1 spin block) =
// C * acc, expanded into scalar fused multiply-adds.
// NOTE(review): the clover-read macro appears elided in this listing
// (gap 467->469) — confirm against the full generated file.
467  {
469 
470  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
471  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
472  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
473  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
474  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
475  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
476 
477  a00_re += c00_00_re * acc00_re;
478  a00_im += c00_00_re * acc00_im;
479  a00_re += c00_01_re * acc01_re;
480  a00_re -= c00_01_im * acc01_im;
481  a00_im += c00_01_re * acc01_im;
482  a00_im += c00_01_im * acc01_re;
483  a00_re += c00_02_re * acc02_re;
484  a00_re -= c00_02_im * acc02_im;
485  a00_im += c00_02_re * acc02_im;
486  a00_im += c00_02_im * acc02_re;
487  a00_re += c00_10_re * acc10_re;
488  a00_re -= c00_10_im * acc10_im;
489  a00_im += c00_10_re * acc10_im;
490  a00_im += c00_10_im * acc10_re;
491  a00_re += c00_11_re * acc11_re;
492  a00_re -= c00_11_im * acc11_im;
493  a00_im += c00_11_re * acc11_im;
494  a00_im += c00_11_im * acc11_re;
495  a00_re += c00_12_re * acc12_re;
496  a00_re -= c00_12_im * acc12_im;
497  a00_im += c00_12_re * acc12_im;
498  a00_im += c00_12_im * acc12_re;
499 
500  a01_re += c01_00_re * acc00_re;
501  a01_re -= c01_00_im * acc00_im;
502  a01_im += c01_00_re * acc00_im;
503  a01_im += c01_00_im * acc00_re;
504  a01_re += c01_01_re * acc01_re;
505  a01_im += c01_01_re * acc01_im;
506  a01_re += c01_02_re * acc02_re;
507  a01_re -= c01_02_im * acc02_im;
508  a01_im += c01_02_re * acc02_im;
509  a01_im += c01_02_im * acc02_re;
510  a01_re += c01_10_re * acc10_re;
511  a01_re -= c01_10_im * acc10_im;
512  a01_im += c01_10_re * acc10_im;
513  a01_im += c01_10_im * acc10_re;
514  a01_re += c01_11_re * acc11_re;
515  a01_re -= c01_11_im * acc11_im;
516  a01_im += c01_11_re * acc11_im;
517  a01_im += c01_11_im * acc11_re;
518  a01_re += c01_12_re * acc12_re;
519  a01_re -= c01_12_im * acc12_im;
520  a01_im += c01_12_re * acc12_im;
521  a01_im += c01_12_im * acc12_re;
522 
523  a02_re += c02_00_re * acc00_re;
524  a02_re -= c02_00_im * acc00_im;
525  a02_im += c02_00_re * acc00_im;
526  a02_im += c02_00_im * acc00_re;
527  a02_re += c02_01_re * acc01_re;
528  a02_re -= c02_01_im * acc01_im;
529  a02_im += c02_01_re * acc01_im;
530  a02_im += c02_01_im * acc01_re;
531  a02_re += c02_02_re * acc02_re;
532  a02_im += c02_02_re * acc02_im;
533  a02_re += c02_10_re * acc10_re;
534  a02_re -= c02_10_im * acc10_im;
535  a02_im += c02_10_re * acc10_im;
536  a02_im += c02_10_im * acc10_re;
537  a02_re += c02_11_re * acc11_re;
538  a02_re -= c02_11_im * acc11_im;
539  a02_im += c02_11_re * acc11_im;
540  a02_im += c02_11_im * acc11_re;
541  a02_re += c02_12_re * acc12_re;
542  a02_re -= c02_12_im * acc12_im;
543  a02_im += c02_12_re * acc12_im;
544  a02_im += c02_12_im * acc12_re;
545 
546  a10_re += c10_00_re * acc00_re;
547  a10_re -= c10_00_im * acc00_im;
548  a10_im += c10_00_re * acc00_im;
549  a10_im += c10_00_im * acc00_re;
550  a10_re += c10_01_re * acc01_re;
551  a10_re -= c10_01_im * acc01_im;
552  a10_im += c10_01_re * acc01_im;
553  a10_im += c10_01_im * acc01_re;
554  a10_re += c10_02_re * acc02_re;
555  a10_re -= c10_02_im * acc02_im;
556  a10_im += c10_02_re * acc02_im;
557  a10_im += c10_02_im * acc02_re;
558  a10_re += c10_10_re * acc10_re;
559  a10_im += c10_10_re * acc10_im;
560  a10_re += c10_11_re * acc11_re;
561  a10_re -= c10_11_im * acc11_im;
562  a10_im += c10_11_re * acc11_im;
563  a10_im += c10_11_im * acc11_re;
564  a10_re += c10_12_re * acc12_re;
565  a10_re -= c10_12_im * acc12_im;
566  a10_im += c10_12_re * acc12_im;
567  a10_im += c10_12_im * acc12_re;
568 
569  a11_re += c11_00_re * acc00_re;
570  a11_re -= c11_00_im * acc00_im;
571  a11_im += c11_00_re * acc00_im;
572  a11_im += c11_00_im * acc00_re;
573  a11_re += c11_01_re * acc01_re;
574  a11_re -= c11_01_im * acc01_im;
575  a11_im += c11_01_re * acc01_im;
576  a11_im += c11_01_im * acc01_re;
577  a11_re += c11_02_re * acc02_re;
578  a11_re -= c11_02_im * acc02_im;
579  a11_im += c11_02_re * acc02_im;
580  a11_im += c11_02_im * acc02_re;
581  a11_re += c11_10_re * acc10_re;
582  a11_re -= c11_10_im * acc10_im;
583  a11_im += c11_10_re * acc10_im;
584  a11_im += c11_10_im * acc10_re;
585  a11_re += c11_11_re * acc11_re;
586  a11_im += c11_11_re * acc11_im;
587  a11_re += c11_12_re * acc12_re;
588  a11_re -= c11_12_im * acc12_im;
589  a11_im += c11_12_re * acc12_im;
590  a11_im += c11_12_im * acc12_re;
591 
592  a12_re += c12_00_re * acc00_re;
593  a12_re -= c12_00_im * acc00_im;
594  a12_im += c12_00_re * acc00_im;
595  a12_im += c12_00_im * acc00_re;
596  a12_re += c12_01_re * acc01_re;
597  a12_re -= c12_01_im * acc01_im;
598  a12_im += c12_01_re * acc01_im;
599  a12_im += c12_01_im * acc01_re;
600  a12_re += c12_02_re * acc02_re;
601  a12_re -= c12_02_im * acc02_im;
602  a12_im += c12_02_re * acc02_im;
603  a12_im += c12_02_im * acc02_re;
604  a12_re += c12_10_re * acc10_re;
605  a12_re -= c12_10_im * acc10_im;
606  a12_im += c12_10_re * acc10_im;
607  a12_im += c12_10_im * acc10_re;
608  a12_re += c12_11_re * acc11_re;
609  a12_re -= c12_11_im * acc11_im;
610  a12_im += c12_11_re * acc11_im;
611  a12_im += c12_11_im * acc11_re;
612  a12_re += c12_12_re * acc12_re;
613  a12_im += c12_12_re * acc12_im;
614 
615  acc00_re = a00_re; acc00_im = a00_im;
616  acc01_re = a01_re; acc01_im = a01_im;
617  acc02_re = a02_re; acc02_im = a02_im;
618  acc10_re = a10_re; acc10_im = a10_im;
619  acc11_re = a11_re; acc11_im = a11_im;
620  acc12_re = a12_re; acc12_im = a12_im;
621 
622  }
623 
624  // apply second chiral block
// Same 6x6 Hermitian multiply for the (2,3) spin block, using the aliased
// second-block macros (clover-read for this block elided, gap 625->627).
625  {
627 
628  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
629  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
630  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
631  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
632  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
633  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
634 
635  a20_re += c20_20_re * acc20_re;
636  a20_im += c20_20_re * acc20_im;
637  a20_re += c20_21_re * acc21_re;
638  a20_re -= c20_21_im * acc21_im;
639  a20_im += c20_21_re * acc21_im;
640  a20_im += c20_21_im * acc21_re;
641  a20_re += c20_22_re * acc22_re;
642  a20_re -= c20_22_im * acc22_im;
643  a20_im += c20_22_re * acc22_im;
644  a20_im += c20_22_im * acc22_re;
645  a20_re += c20_30_re * acc30_re;
646  a20_re -= c20_30_im * acc30_im;
647  a20_im += c20_30_re * acc30_im;
648  a20_im += c20_30_im * acc30_re;
649  a20_re += c20_31_re * acc31_re;
650  a20_re -= c20_31_im * acc31_im;
651  a20_im += c20_31_re * acc31_im;
652  a20_im += c20_31_im * acc31_re;
653  a20_re += c20_32_re * acc32_re;
654  a20_re -= c20_32_im * acc32_im;
655  a20_im += c20_32_re * acc32_im;
656  a20_im += c20_32_im * acc32_re;
657 
658  a21_re += c21_20_re * acc20_re;
659  a21_re -= c21_20_im * acc20_im;
660  a21_im += c21_20_re * acc20_im;
661  a21_im += c21_20_im * acc20_re;
662  a21_re += c21_21_re * acc21_re;
663  a21_im += c21_21_re * acc21_im;
664  a21_re += c21_22_re * acc22_re;
665  a21_re -= c21_22_im * acc22_im;
666  a21_im += c21_22_re * acc22_im;
667  a21_im += c21_22_im * acc22_re;
668  a21_re += c21_30_re * acc30_re;
669  a21_re -= c21_30_im * acc30_im;
670  a21_im += c21_30_re * acc30_im;
671  a21_im += c21_30_im * acc30_re;
672  a21_re += c21_31_re * acc31_re;
673  a21_re -= c21_31_im * acc31_im;
674  a21_im += c21_31_re * acc31_im;
675  a21_im += c21_31_im * acc31_re;
676  a21_re += c21_32_re * acc32_re;
677  a21_re -= c21_32_im * acc32_im;
678  a21_im += c21_32_re * acc32_im;
679  a21_im += c21_32_im * acc32_re;
680 
681  a22_re += c22_20_re * acc20_re;
682  a22_re -= c22_20_im * acc20_im;
683  a22_im += c22_20_re * acc20_im;
684  a22_im += c22_20_im * acc20_re;
685  a22_re += c22_21_re * acc21_re;
686  a22_re -= c22_21_im * acc21_im;
687  a22_im += c22_21_re * acc21_im;
688  a22_im += c22_21_im * acc21_re;
689  a22_re += c22_22_re * acc22_re;
690  a22_im += c22_22_re * acc22_im;
691  a22_re += c22_30_re * acc30_re;
692  a22_re -= c22_30_im * acc30_im;
693  a22_im += c22_30_re * acc30_im;
694  a22_im += c22_30_im * acc30_re;
695  a22_re += c22_31_re * acc31_re;
696  a22_re -= c22_31_im * acc31_im;
697  a22_im += c22_31_re * acc31_im;
698  a22_im += c22_31_im * acc31_re;
699  a22_re += c22_32_re * acc32_re;
700  a22_re -= c22_32_im * acc32_im;
701  a22_im += c22_32_re * acc32_im;
702  a22_im += c22_32_im * acc32_re;
703 
704  a30_re += c30_20_re * acc20_re;
705  a30_re -= c30_20_im * acc20_im;
706  a30_im += c30_20_re * acc20_im;
707  a30_im += c30_20_im * acc20_re;
708  a30_re += c30_21_re * acc21_re;
709  a30_re -= c30_21_im * acc21_im;
710  a30_im += c30_21_re * acc21_im;
711  a30_im += c30_21_im * acc21_re;
712  a30_re += c30_22_re * acc22_re;
713  a30_re -= c30_22_im * acc22_im;
714  a30_im += c30_22_re * acc22_im;
715  a30_im += c30_22_im * acc22_re;
716  a30_re += c30_30_re * acc30_re;
717  a30_im += c30_30_re * acc30_im;
718  a30_re += c30_31_re * acc31_re;
719  a30_re -= c30_31_im * acc31_im;
720  a30_im += c30_31_re * acc31_im;
721  a30_im += c30_31_im * acc31_re;
722  a30_re += c30_32_re * acc32_re;
723  a30_re -= c30_32_im * acc32_im;
724  a30_im += c30_32_re * acc32_im;
725  a30_im += c30_32_im * acc32_re;
726 
727  a31_re += c31_20_re * acc20_re;
728  a31_re -= c31_20_im * acc20_im;
729  a31_im += c31_20_re * acc20_im;
730  a31_im += c31_20_im * acc20_re;
731  a31_re += c31_21_re * acc21_re;
732  a31_re -= c31_21_im * acc21_im;
733  a31_im += c31_21_re * acc21_im;
734  a31_im += c31_21_im * acc21_re;
735  a31_re += c31_22_re * acc22_re;
736  a31_re -= c31_22_im * acc22_im;
737  a31_im += c31_22_re * acc22_im;
738  a31_im += c31_22_im * acc22_re;
739  a31_re += c31_30_re * acc30_re;
740  a31_re -= c31_30_im * acc30_im;
741  a31_im += c31_30_re * acc30_im;
742  a31_im += c31_30_im * acc30_re;
743  a31_re += c31_31_re * acc31_re;
744  a31_im += c31_31_re * acc31_im;
745  a31_re += c31_32_re * acc32_re;
746  a31_re -= c31_32_im * acc32_im;
747  a31_im += c31_32_re * acc32_im;
748  a31_im += c31_32_im * acc32_re;
749 
750  a32_re += c32_20_re * acc20_re;
751  a32_re -= c32_20_im * acc20_im;
752  a32_im += c32_20_re * acc20_im;
753  a32_im += c32_20_im * acc20_re;
754  a32_re += c32_21_re * acc21_re;
755  a32_re -= c32_21_im * acc21_im;
756  a32_im += c32_21_re * acc21_im;
757  a32_im += c32_21_im * acc21_re;
758  a32_re += c32_22_re * acc22_re;
759  a32_re -= c32_22_im * acc22_im;
760  a32_im += c32_22_re * acc22_im;
761  a32_im += c32_22_im * acc22_re;
762  a32_re += c32_30_re * acc30_re;
763  a32_re -= c32_30_im * acc30_im;
764  a32_im += c32_30_re * acc30_im;
765  a32_im += c32_30_im * acc30_re;
766  a32_re += c32_31_re * acc31_re;
767  a32_re -= c32_31_im * acc31_im;
768  a32_im += c32_31_re * acc31_im;
769  a32_im += c32_31_im * acc31_re;
770  a32_re += c32_32_re * acc32_re;
771  a32_im += c32_32_re * acc32_im;
772 
773  acc20_re = a20_re; acc20_im = a20_im;
774  acc21_re = a21_re; acc21_im = a21_im;
775  acc22_re = a22_re; acc22_im = a22_im;
776  acc30_re = a30_re; acc30_im = a30_im;
777  acc31_re = a31_re; acc31_im = a31_im;
778  acc32_re = a32_re; acc32_im = a32_im;
779 
780  }
781 
782  // change back from chiral basis
783  // (note: required factor of 1/2 is included in clover term normalization)
// Inverse of the basis rotation above, applied per color component.
784  {
785  spinorFloat a00_re = acc10_re + acc30_re;
786  spinorFloat a00_im = acc10_im + acc30_im;
787  spinorFloat a10_re = -acc00_re - acc20_re;
788  spinorFloat a10_im = -acc00_im - acc20_im;
789  spinorFloat a20_re = acc10_re - acc30_re;
790  spinorFloat a20_im = acc10_im - acc30_im;
791  spinorFloat a30_re = -acc00_re + acc20_re;
792  spinorFloat a30_im = -acc00_im + acc20_im;
793 
794  acc00_re = a00_re; acc00_im = a00_im;
795  acc10_re = a10_re; acc10_im = a10_im;
796  acc20_re = a20_re; acc20_im = a20_im;
797  acc30_re = a30_re; acc30_im = a30_im;
798  }
799 
800  {
801  spinorFloat a01_re = acc11_re + acc31_re;
802  spinorFloat a01_im = acc11_im + acc31_im;
803  spinorFloat a11_re = -acc01_re - acc21_re;
804  spinorFloat a11_im = -acc01_im - acc21_im;
805  spinorFloat a21_re = acc11_re - acc31_re;
806  spinorFloat a21_im = acc11_im - acc31_im;
807  spinorFloat a31_re = -acc01_re + acc21_re;
808  spinorFloat a31_im = -acc01_im + acc21_im;
809 
810  acc01_re = a01_re; acc01_im = a01_im;
811  acc11_re = a11_re; acc11_im = a11_im;
812  acc21_re = a21_re; acc21_im = a21_im;
813  acc31_re = a31_re; acc31_im = a31_im;
814  }
815 
816  {
817  spinorFloat a02_re = acc12_re + acc32_re;
818  spinorFloat a02_im = acc12_im + acc32_im;
819  spinorFloat a12_re = -acc02_re - acc22_re;
820  spinorFloat a12_im = -acc02_im - acc22_im;
821  spinorFloat a22_re = acc12_re - acc32_re;
822  spinorFloat a22_im = acc12_im - acc32_im;
823  spinorFloat a32_re = -acc02_re + acc22_re;
824  spinorFloat a32_im = -acc02_im + acc22_im;
825 
826  acc02_re = a02_re; acc02_im = a02_im;
827  acc12_re = a12_re; acc12_im = a12_im;
828  acc22_re = a22_re; acc22_im = a22_im;
829  acc32_re = a32_re; acc32_im = a32_im;
830  }
831 
832 #endif // DSLASH_CLOVER
833 
// Seed the output with the (clover-rotated) accumulator; the directional
// hop terms below are then added on top, scaled by `a` (asymmetric xpay).
834  o00_re = acc00_re;
835  o00_im = acc00_im;
836  o01_re = acc01_re;
837  o01_im = acc01_im;
838  o02_re = acc02_re;
839  o02_im = acc02_im;
840  o10_re = acc10_re;
841  o10_im = acc10_im;
842  o11_re = acc11_re;
843  o11_im = acc11_im;
844  o12_re = acc12_re;
845  o12_im = acc12_im;
846  o20_re = acc20_re;
847  o20_im = acc20_im;
848  o21_re = acc21_re;
849  o21_im = acc21_im;
850  o22_re = acc22_re;
851  o22_im = acc22_im;
852  o30_re = acc30_re;
853  o30_im = acc30_im;
854  o31_re = acc31_re;
855  o31_im = acc31_im;
856  o32_re = acc32_re;
857  o32_im = acc32_im;
858 #endif // DSLASH_CLOVER_XPAY
859 
860 #ifdef MULTI_GPU
861 } else { // exterior kernel
862 
// Exterior pass: each thread updates one boundary site; sid indexes the
// two opposing faces of this dimension (face 0 then face 1).
863  sid = blockIdx.x*blockDim.x + threadIdx.x;
864  if (sid >= param.threads) return;
865 
866  const int face_volume = (param.threads >> 1); // volume of one face
867  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
868  face_idx = sid - face_num*face_volume; // index into the respective face
869 
870  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
871  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
872  //sp_idx = face_idx + param.ghostOffset[dim];
873 
874  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
875 
// NOTE(review): the read of the intermediate spinor (providing i00..i32
// below) appears elided in this listing (gap 875->877) — confirm against
// the full generated file.
877 
// Start from the interior kernel's partial result for this site.
878  o00_re = i00_re; o00_im = i00_im;
879  o01_re = i01_re; o01_im = i01_im;
880  o02_re = i02_re; o02_im = i02_im;
881  o10_re = i10_re; o10_im = i10_im;
882  o11_re = i11_re; o11_im = i11_im;
883  o12_re = i12_re; o12_im = i12_im;
884  o20_re = i20_re; o20_im = i20_im;
885  o21_re = i21_re; o21_im = i21_im;
886  o22_re = i22_re; o22_im = i22_im;
887  o30_re = i30_re; o30_im = i30_im;
888  o31_re = i31_re; o31_im = i31_im;
889  o32_re = i32_re; o32_im = i32_im;
890 }
891 #endif // MULTI_GPU
892 
893 
// --- Hop in the +X direction ---
// Interior kernel: skip the term when the neighbor lives on another GPU
// (ghost dimension) and we sit on the boundary; exterior X kernel handles
// exactly those boundary sites.
894 #ifdef MULTI_GPU
895 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
896  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
897 #endif
898 {
899  // Projector P0-
900  // 1 0 0 -i
901  // 0 1 -i 0
902  // 0 i 1 0
903  // i 0 0 1
904 
// Neighbor spinor index: wrap around in X for the interior kernel, or
// index into the ghost zone for the exterior kernel.
905 #ifdef MULTI_GPU
906  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
907  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
908 #if (DD_PREC==2) // half precision
909  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
910 #endif
911 #else
912  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
913 #endif
914 
// Forward hop uses the link stored at this site.
915  const int ga_idx = sid;
916 
// NOTE(review): the declarations of the half-spinor registers a0..b2 and
// accumulators A0..B2 appear elided in this listing (gap 916->923) —
// confirm against the full generated file.
923 
924 #ifdef MULTI_GPU
925  if (kernel_type == INTERIOR_KERNEL) {
926 #endif
927 
928  // read spinor from device memory
929  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
930 
931  // project spinor into half spinors
// (1 - gamma_0) projection: only two independent spin components remain.
932  a0_re = +i00_re+i30_im;
933  a0_im = +i00_im-i30_re;
934  a1_re = +i01_re+i31_im;
935  a1_im = +i01_im-i31_re;
936  a2_re = +i02_re+i32_im;
937  a2_im = +i02_im-i32_re;
938  b0_re = +i10_re+i20_im;
939  b0_im = +i10_im-i20_re;
940  b1_re = +i11_re+i21_im;
941  b1_im = +i11_im-i21_re;
942  b2_re = +i12_re+i22_im;
943  b2_im = +i12_im-i22_re;
944 
945 #ifdef MULTI_GPU
946  } else {
947 
948  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
949 
950  // read half spinor from device memory
// Ghost spinors are already spin projected on the sending GPU.
951  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
952 
953  a0_re = i00_re; a0_im = i00_im;
954  a1_re = i01_re; a1_im = i01_im;
955  a2_re = i02_re; a2_im = i02_im;
956  b0_re = i10_re; b0_im = i10_im;
957  b1_re = i11_re; b1_im = i11_im;
958  b2_re = i12_re; b2_im = i12_im;
959 
960  }
961 #endif // MULTI_GPU
962 
963  // read gauge matrix from device memory
964  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
965 
966  // reconstruct gauge matrix
// NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation appears elided in
// this listing (gap 966->968) — confirm against the full generated file.
968 
// Complex 3x3 * half-spinor products: (A,B) = U * (a,b), expanded into
// scalar fused multiply-adds.
969  // multiply row 0
971  A0_re += g00_re * a0_re;
972  A0_re -= g00_im * a0_im;
973  A0_re += g01_re * a1_re;
974  A0_re -= g01_im * a1_im;
975  A0_re += g02_re * a2_re;
976  A0_re -= g02_im * a2_im;
978  A0_im += g00_re * a0_im;
979  A0_im += g00_im * a0_re;
980  A0_im += g01_re * a1_im;
981  A0_im += g01_im * a1_re;
982  A0_im += g02_re * a2_im;
983  A0_im += g02_im * a2_re;
985  B0_re += g00_re * b0_re;
986  B0_re -= g00_im * b0_im;
987  B0_re += g01_re * b1_re;
988  B0_re -= g01_im * b1_im;
989  B0_re += g02_re * b2_re;
990  B0_re -= g02_im * b2_im;
992  B0_im += g00_re * b0_im;
993  B0_im += g00_im * b0_re;
994  B0_im += g01_re * b1_im;
995  B0_im += g01_im * b1_re;
996  B0_im += g02_re * b2_im;
997  B0_im += g02_im * b2_re;
998 
999  // multiply row 1
1001  A1_re += g10_re * a0_re;
1002  A1_re -= g10_im * a0_im;
1003  A1_re += g11_re * a1_re;
1004  A1_re -= g11_im * a1_im;
1005  A1_re += g12_re * a2_re;
1006  A1_re -= g12_im * a2_im;
1008  A1_im += g10_re * a0_im;
1009  A1_im += g10_im * a0_re;
1010  A1_im += g11_re * a1_im;
1011  A1_im += g11_im * a1_re;
1012  A1_im += g12_re * a2_im;
1013  A1_im += g12_im * a2_re;
1015  B1_re += g10_re * b0_re;
1016  B1_re -= g10_im * b0_im;
1017  B1_re += g11_re * b1_re;
1018  B1_re -= g11_im * b1_im;
1019  B1_re += g12_re * b2_re;
1020  B1_re -= g12_im * b2_im;
1022  B1_im += g10_re * b0_im;
1023  B1_im += g10_im * b0_re;
1024  B1_im += g11_re * b1_im;
1025  B1_im += g11_im * b1_re;
1026  B1_im += g12_re * b2_im;
1027  B1_im += g12_im * b2_re;
1028 
1029  // multiply row 2
1031  A2_re += g20_re * a0_re;
1032  A2_re -= g20_im * a0_im;
1033  A2_re += g21_re * a1_re;
1034  A2_re -= g21_im * a1_im;
1035  A2_re += g22_re * a2_re;
1036  A2_re -= g22_im * a2_im;
1038  A2_im += g20_re * a0_im;
1039  A2_im += g20_im * a0_re;
1040  A2_im += g21_re * a1_im;
1041  A2_im += g21_im * a1_re;
1042  A2_im += g22_re * a2_im;
1043  A2_im += g22_im * a2_re;
1045  B2_re += g20_re * b0_re;
1046  B2_re -= g20_im * b0_im;
1047  B2_re += g21_re * b1_re;
1048  B2_re -= g21_im * b1_im;
1049  B2_re += g22_re * b2_re;
1050  B2_re -= g22_im * b2_im;
1052  B2_im += g20_re * b0_im;
1053  B2_im += g20_im * b0_re;
1054  B2_im += g21_re * b1_im;
1055  B2_im += g21_im * b1_re;
1056  B2_im += g22_re * b2_im;
1057  B2_im += g22_im * b2_re;
1058 
// Asymmetric variant: scale this hop term by param.a before accumulating.
// NOTE(review): the single-precision definition of `a` appears elided in
// this listing (gap 1061->1063) — confirm against the full generated file.
1059 #ifdef SPINOR_DOUBLE
1060  spinorFloat a = param.a;
1061 #else
1063 #endif
// Reconstruct the full spinor from the projected halves and accumulate:
// rows 2,3 follow from rows 0,1 via the P0- projector structure.
1064  o00_re += a*A0_re;
1065  o00_im += a*A0_im;
1066  o10_re += a*B0_re;
1067  o10_im += a*B0_im;
1068  o20_re -= a*B0_im;
1069  o20_im += a*B0_re;
1070  o30_re -= a*A0_im;
1071  o30_im += a*A0_re;
1072 
1073  o01_re += a*A1_re;
1074  o01_im += a*A1_im;
1075  o11_re += a*B1_re;
1076  o11_im += a*B1_im;
1077  o21_re -= a*B1_im;
1078  o21_im += a*B1_re;
1079  o31_re -= a*A1_im;
1080  o31_im += a*A1_re;
1081 
1082  o02_re += a*A2_re;
1083  o02_im += a*A2_im;
1084  o12_re += a*B2_re;
1085  o12_im += a*B2_im;
1086  o22_re -= a*B2_im;
1087  o22_im += a*B2_re;
1088  o32_re -= a*A2_im;
1089  o32_im += a*A2_re;
1090 
1091 }
1092 
// Backward-X hopping term: gather the neighbor spinor at x-1 (or the X ghost
// face on the MPI boundary), project it to a half spinor with P0+ = 1 + gamma_0
// (matrix below), multiply by the conjugate-transposed link (gT* accessors from
// GAUGE1TEX), and axpy the result into the output spinor with coefficient `a`.
1093 #ifdef MULTI_GPU
1094 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
1095  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
1096 #endif
1097 {
1098  // Projector P0+
1099  // 1 0 0 i
1100  // 0 1 i 0
1101  // 0 -i 1 0
1102  // -i 0 0 1
1103 
// Neighbor (checkerboard) spinor index: wrap around the local X extent for the
// interior kernel, or index into the packed ghost buffer for the exterior one.
1104 #ifdef MULTI_GPU
1105  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
1106  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1107 #if (DD_PREC==2) // half precision
1108  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1109 #endif
1110 #else
1111  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
1112 #endif
1113 
// Backward direction: the link lives at the neighbor site (ghost links are
// stored past the Vh body links).
1114 #ifdef MULTI_GPU
1115  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1116 #else
1117  const int ga_idx = sp_idx;
1118 #endif
1119 
// NOTE(review): listing lines 1120-1125 are absent from this extraction --
// presumably the declarations of the half-spinor registers a0..b2 used below;
// confirm against the original generated source.
1126 
1127 #ifdef MULTI_GPU
1128  if (kernel_type == INTERIOR_KERNEL) {
1129 #endif
1130 
1131  // read spinor from device memory
1132  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1133 
1134  // project spinor into half spinors
1135  a0_re = +i00_re-i30_im;
1136  a0_im = +i00_im+i30_re;
1137  a1_re = +i01_re-i31_im;
1138  a1_im = +i01_im+i31_re;
1139  a2_re = +i02_re-i32_im;
1140  a2_im = +i02_im+i32_re;
1141  b0_re = +i10_re-i20_im;
1142  b0_im = +i10_im+i20_re;
1143  b1_re = +i11_re-i21_im;
1144  b1_im = +i11_im+i21_re;
1145  b2_re = +i12_re-i22_im;
1146  b2_im = +i12_im+i22_re;
1147 
1148 #ifdef MULTI_GPU
1149  } else {
1150 
1151  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1152 
// Exterior kernel: the ghost buffer already holds the projected half spinor,
// so just copy it into the a/b registers.
1153  // read half spinor from device memory
1154  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
1155 
1156  a0_re = i00_re; a0_im = i00_im;
1157  a1_re = i01_re; a1_im = i01_im;
1158  a2_re = i02_re; a2_im = i02_im;
1159  b0_re = i10_re; b0_im = i10_im;
1160  b1_re = i11_re; b1_im = i11_im;
1161  b2_re = i12_re; b2_im = i12_im;
1162 
1163  }
1164 #endif // MULTI_GPU
1165 
1166  // read gauge matrix from device memory
1167  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
1168 
1169  // reconstruct gauge matrix
// NOTE(review): listing line 1170 is absent from this extraction -- presumably
// the gauge-reconstruction macro invocation, given the preceding comment.
1171 
// Dense 3x3 complex multiply of both half-spinor columns (a, b) by the
// daggered link; accumulation order is fixed by the generator -- do not reorder.
1172  // multiply row 0
1173  spinorFloat A0_re = 0;
1174  A0_re += gT00_re * a0_re;
1175  A0_re -= gT00_im * a0_im;
1176  A0_re += gT01_re * a1_re;
1177  A0_re -= gT01_im * a1_im;
1178  A0_re += gT02_re * a2_re;
1179  A0_re -= gT02_im * a2_im;
1180  spinorFloat A0_im = 0;
1181  A0_im += gT00_re * a0_im;
1182  A0_im += gT00_im * a0_re;
1183  A0_im += gT01_re * a1_im;
1184  A0_im += gT01_im * a1_re;
1185  A0_im += gT02_re * a2_im;
1186  A0_im += gT02_im * a2_re;
1187  spinorFloat B0_re = 0;
1188  B0_re += gT00_re * b0_re;
1189  B0_re -= gT00_im * b0_im;
1190  B0_re += gT01_re * b1_re;
1191  B0_re -= gT01_im * b1_im;
1192  B0_re += gT02_re * b2_re;
1193  B0_re -= gT02_im * b2_im;
1194  spinorFloat B0_im = 0;
1195  B0_im += gT00_re * b0_im;
1196  B0_im += gT00_im * b0_re;
1197  B0_im += gT01_re * b1_im;
1198  B0_im += gT01_im * b1_re;
1199  B0_im += gT02_re * b2_im;
1200  B0_im += gT02_im * b2_re;
1201 
1202  // multiply row 1
1203  spinorFloat A1_re = 0;
1204  A1_re += gT10_re * a0_re;
1205  A1_re -= gT10_im * a0_im;
1206  A1_re += gT11_re * a1_re;
1207  A1_re -= gT11_im * a1_im;
1208  A1_re += gT12_re * a2_re;
1209  A1_re -= gT12_im * a2_im;
1210  spinorFloat A1_im = 0;
1211  A1_im += gT10_re * a0_im;
1212  A1_im += gT10_im * a0_re;
1213  A1_im += gT11_re * a1_im;
1214  A1_im += gT11_im * a1_re;
1215  A1_im += gT12_re * a2_im;
1216  A1_im += gT12_im * a2_re;
1217  spinorFloat B1_re = 0;
1218  B1_re += gT10_re * b0_re;
1219  B1_re -= gT10_im * b0_im;
1220  B1_re += gT11_re * b1_re;
1221  B1_re -= gT11_im * b1_im;
1222  B1_re += gT12_re * b2_re;
1223  B1_re -= gT12_im * b2_im;
1224  spinorFloat B1_im = 0;
1225  B1_im += gT10_re * b0_im;
1226  B1_im += gT10_im * b0_re;
1227  B1_im += gT11_re * b1_im;
1228  B1_im += gT11_im * b1_re;
1229  B1_im += gT12_re * b2_im;
1230  B1_im += gT12_im * b2_re;
1231 
1232  // multiply row 2
1233  spinorFloat A2_re = 0;
1234  A2_re += gT20_re * a0_re;
1235  A2_re -= gT20_im * a0_im;
1236  A2_re += gT21_re * a1_re;
1237  A2_re -= gT21_im * a1_im;
1238  A2_re += gT22_re * a2_re;
1239  A2_re -= gT22_im * a2_im;
1240  spinorFloat A2_im = 0;
1241  A2_im += gT20_re * a0_im;
1242  A2_im += gT20_im * a0_re;
1243  A2_im += gT21_re * a1_im;
1244  A2_im += gT21_im * a1_re;
1245  A2_im += gT22_re * a2_im;
1246  A2_im += gT22_im * a2_re;
1247  spinorFloat B2_re = 0;
1248  B2_re += gT20_re * b0_re;
1249  B2_re -= gT20_im * b0_im;
1250  B2_re += gT21_re * b1_re;
1251  B2_re -= gT21_im * b1_im;
1252  B2_re += gT22_re * b2_re;
1253  B2_re -= gT22_im * b2_im;
1254  spinorFloat B2_im = 0;
1255  B2_im += gT20_re * b0_im;
1256  B2_im += gT20_im * b0_re;
1257  B2_im += gT21_re * b1_im;
1258  B2_im += gT21_im * b1_re;
1259  B2_im += gT22_re * b2_im;
1260  B2_im += gT22_im * b2_re;
1261 
// Scale factor: double- vs single-precision field of the kernel parameter
// struct, selected at compile time to match the spinor precision.
1262 #ifdef SPINOR_DOUBLE
1263  spinorFloat a = param.a;
1264 #else
1265  spinorFloat a = param.a_f;
1266 #endif
// Reconstruct the full spinor from the two half-spinor columns using the
// P0+ spin structure (lower components are +/- i times the upper ones) and
// accumulate into the output registers.
1267  o00_re += a*A0_re;
1268  o00_im += a*A0_im;
1269  o10_re += a*B0_re;
1270  o10_im += a*B0_im;
1271  o20_re += a*B0_im;
1272  o20_im -= a*B0_re;
1273  o30_re += a*A0_im;
1274  o30_im -= a*A0_re;
1275 
1276  o01_re += a*A1_re;
1277  o01_im += a*A1_im;
1278  o11_re += a*B1_re;
1279  o11_im += a*B1_im;
1280  o21_re += a*B1_im;
1281  o21_im -= a*B1_re;
1282  o31_re += a*A1_im;
1283  o31_im -= a*A1_re;
1284 
1285  o02_re += a*A2_re;
1286  o02_im += a*A2_im;
1287  o12_re += a*B2_re;
1288  o12_im += a*B2_im;
1289  o22_re += a*B2_im;
1290  o22_im -= a*B2_re;
1291  o32_re += a*A2_im;
1292  o32_im -= a*A2_re;
1293 
1294 }
1295 
// Forward-Y hopping term: gather the neighbor spinor at y+1 (or the Y ghost
// face), project with P1- (matrix below), multiply by the local link (g*
// accessors from GAUGE0TEX at this site, ga_idx = sid), and axpy into the
// output spinor with coefficient `a`.
1296 #ifdef MULTI_GPU
1297 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
1298  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
1299 #endif
1300 {
1301  // Projector P1-
1302  // 1 0 0 -1
1303  // 0 1 1 0
1304  // 0 1 1 0
1305  // -1 0 0 1
1306 
// Neighbor index: wrap at the top of the local Y extent, else step by one X row.
1307 #ifdef MULTI_GPU
1308  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
1309  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1310 #if (DD_PREC==2) // half precision
1311  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1312 #endif
1313 #else
1314  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
1315 #endif
1316 
// Forward direction: the link is attached to the current site.
1317  const int ga_idx = sid;
1318 
// NOTE(review): listing lines 1319-1324 are absent from this extraction --
// presumably the half-spinor register declarations a0..b2; confirm against
// the original generated source.
1325 
1326 #ifdef MULTI_GPU
1327  if (kernel_type == INTERIOR_KERNEL) {
1328 #endif
1329 
1330  // read spinor from device memory
1331  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1332 
1333  // project spinor into half spinors
1334  a0_re = +i00_re-i30_re;
1335  a0_im = +i00_im-i30_im;
1336  a1_re = +i01_re-i31_re;
1337  a1_im = +i01_im-i31_im;
1338  a2_re = +i02_re-i32_re;
1339  a2_im = +i02_im-i32_im;
1340  b0_re = +i10_re+i20_re;
1341  b0_im = +i10_im+i20_im;
1342  b1_re = +i11_re+i21_re;
1343  b1_im = +i11_im+i21_im;
1344  b2_re = +i12_re+i22_re;
1345  b2_im = +i12_im+i22_im;
1346 
1347 #ifdef MULTI_GPU
1348  } else {
1349 
1350  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1351 
// Exterior kernel: ghost buffer already contains the projected half spinor.
1352  // read half spinor from device memory
1353  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
1354 
1355  a0_re = i00_re; a0_im = i00_im;
1356  a1_re = i01_re; a1_im = i01_im;
1357  a2_re = i02_re; a2_im = i02_im;
1358  b0_re = i10_re; b0_im = i10_im;
1359  b1_re = i11_re; b1_im = i11_im;
1360  b2_re = i12_re; b2_im = i12_im;
1361 
1362  }
1363 #endif // MULTI_GPU
1364 
1365  // read gauge matrix from device memory
1366  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
1367 
1368  // reconstruct gauge matrix
// NOTE(review): listing line 1369 is absent from this extraction --
// presumably the gauge-reconstruction macro invocation.
1370 
// 3x3 complex multiply of both half-spinor columns by the link; the
// generator's accumulation order is load-bearing for bit reproducibility.
1371  // multiply row 0
1372  spinorFloat A0_re = 0;
1373  A0_re += g00_re * a0_re;
1374  A0_re -= g00_im * a0_im;
1375  A0_re += g01_re * a1_re;
1376  A0_re -= g01_im * a1_im;
1377  A0_re += g02_re * a2_re;
1378  A0_re -= g02_im * a2_im;
1379  spinorFloat A0_im = 0;
1380  A0_im += g00_re * a0_im;
1381  A0_im += g00_im * a0_re;
1382  A0_im += g01_re * a1_im;
1383  A0_im += g01_im * a1_re;
1384  A0_im += g02_re * a2_im;
1385  A0_im += g02_im * a2_re;
1386  spinorFloat B0_re = 0;
1387  B0_re += g00_re * b0_re;
1388  B0_re -= g00_im * b0_im;
1389  B0_re += g01_re * b1_re;
1390  B0_re -= g01_im * b1_im;
1391  B0_re += g02_re * b2_re;
1392  B0_re -= g02_im * b2_im;
1393  spinorFloat B0_im = 0;
1394  B0_im += g00_re * b0_im;
1395  B0_im += g00_im * b0_re;
1396  B0_im += g01_re * b1_im;
1397  B0_im += g01_im * b1_re;
1398  B0_im += g02_re * b2_im;
1399  B0_im += g02_im * b2_re;
1400 
1401  // multiply row 1
1402  spinorFloat A1_re = 0;
1403  A1_re += g10_re * a0_re;
1404  A1_re -= g10_im * a0_im;
1405  A1_re += g11_re * a1_re;
1406  A1_re -= g11_im * a1_im;
1407  A1_re += g12_re * a2_re;
1408  A1_re -= g12_im * a2_im;
1409  spinorFloat A1_im = 0;
1410  A1_im += g10_re * a0_im;
1411  A1_im += g10_im * a0_re;
1412  A1_im += g11_re * a1_im;
1413  A1_im += g11_im * a1_re;
1414  A1_im += g12_re * a2_im;
1415  A1_im += g12_im * a2_re;
1416  spinorFloat B1_re = 0;
1417  B1_re += g10_re * b0_re;
1418  B1_re -= g10_im * b0_im;
1419  B1_re += g11_re * b1_re;
1420  B1_re -= g11_im * b1_im;
1421  B1_re += g12_re * b2_re;
1422  B1_re -= g12_im * b2_im;
1423  spinorFloat B1_im = 0;
1424  B1_im += g10_re * b0_im;
1425  B1_im += g10_im * b0_re;
1426  B1_im += g11_re * b1_im;
1427  B1_im += g11_im * b1_re;
1428  B1_im += g12_re * b2_im;
1429  B1_im += g12_im * b2_re;
1430 
1431  // multiply row 2
1432  spinorFloat A2_re = 0;
1433  A2_re += g20_re * a0_re;
1434  A2_re -= g20_im * a0_im;
1435  A2_re += g21_re * a1_re;
1436  A2_re -= g21_im * a1_im;
1437  A2_re += g22_re * a2_re;
1438  A2_re -= g22_im * a2_im;
1439  spinorFloat A2_im = 0;
1440  A2_im += g20_re * a0_im;
1441  A2_im += g20_im * a0_re;
1442  A2_im += g21_re * a1_im;
1443  A2_im += g21_im * a1_re;
1444  A2_im += g22_re * a2_im;
1445  A2_im += g22_im * a2_re;
1446  spinorFloat B2_re = 0;
1447  B2_re += g20_re * b0_re;
1448  B2_re -= g20_im * b0_im;
1449  B2_re += g21_re * b1_re;
1450  B2_re -= g21_im * b1_im;
1451  B2_re += g22_re * b2_re;
1452  B2_re -= g22_im * b2_im;
1453  spinorFloat B2_im = 0;
1454  B2_im += g20_re * b0_im;
1455  B2_im += g20_im * b0_re;
1456  B2_im += g21_re * b1_im;
1457  B2_im += g21_im * b1_re;
1458  B2_im += g22_re * b2_im;
1459  B2_im += g22_im * b2_re;
1460 
// Precision-matched scale factor for the axpy accumulation.
1461 #ifdef SPINOR_DOUBLE
1462  spinorFloat a = param.a;
1463 #else
1464  spinorFloat a = param.a_f;
1465 #endif
// Reconstruct the full spinor from the half-spinor columns using the P1-
// spin structure (real +/- couplings, no factor of i) and accumulate.
1466  o00_re += a*A0_re;
1467  o00_im += a*A0_im;
1468  o10_re += a*B0_re;
1469  o10_im += a*B0_im;
1470  o20_re += a*B0_re;
1471  o20_im += a*B0_im;
1472  o30_re -= a*A0_re;
1473  o30_im -= a*A0_im;
1474 
1475  o01_re += a*A1_re;
1476  o01_im += a*A1_im;
1477  o11_re += a*B1_re;
1478  o11_im += a*B1_im;
1479  o21_re += a*B1_re;
1480  o21_im += a*B1_im;
1481  o31_re -= a*A1_re;
1482  o31_im -= a*A1_im;
1483 
1484  o02_re += a*A2_re;
1485  o02_im += a*A2_im;
1486  o12_re += a*B2_re;
1487  o12_im += a*B2_im;
1488  o22_re += a*B2_re;
1489  o22_im += a*B2_im;
1490  o32_re -= a*A2_re;
1491  o32_im -= a*A2_im;
1492 
1493 }
1494 
// Backward-Y hopping term: gather the neighbor spinor at y-1 (or the Y ghost
// face), project with P1+ (matrix below), multiply by the conjugate-transposed
// neighbor link (gT* accessors from GAUGE1TEX), and axpy into the output
// spinor with coefficient `a`.
1495 #ifdef MULTI_GPU
1496 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1497  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1498 #endif
1499 {
1500  // Projector P1+
1501  // 1 0 0 1
1502  // 0 1 -1 0
1503  // 0 -1 1 0
1504  // 1 0 0 1
1505 
// Neighbor index: wrap at the bottom of the local Y extent, else step back one X row.
1506 #ifdef MULTI_GPU
1507  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1508  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1509 #if (DD_PREC==2) // half precision
1510  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1511 #endif
1512 #else
1513  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1514 #endif
1515 
// Backward direction: the link lives at the neighbor site (ghost links past Vh).
1516 #ifdef MULTI_GPU
1517  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1518 #else
1519  const int ga_idx = sp_idx;
1520 #endif
1521 
// NOTE(review): listing lines 1522-1527 are absent from this extraction --
// presumably the half-spinor register declarations a0..b2; confirm against
// the original generated source.
1528 
1529 #ifdef MULTI_GPU
1530  if (kernel_type == INTERIOR_KERNEL) {
1531 #endif
1532 
1533  // read spinor from device memory
1534  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1535 
1536  // project spinor into half spinors
1537  a0_re = +i00_re+i30_re;
1538  a0_im = +i00_im+i30_im;
1539  a1_re = +i01_re+i31_re;
1540  a1_im = +i01_im+i31_im;
1541  a2_re = +i02_re+i32_re;
1542  a2_im = +i02_im+i32_im;
1543  b0_re = +i10_re-i20_re;
1544  b0_im = +i10_im-i20_im;
1545  b1_re = +i11_re-i21_re;
1546  b1_im = +i11_im-i21_im;
1547  b2_re = +i12_re-i22_re;
1548  b2_im = +i12_im-i22_im;
1549 
1550 #ifdef MULTI_GPU
1551  } else {
1552 
1553  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1554 
// Exterior kernel: ghost buffer already contains the projected half spinor.
1555  // read half spinor from device memory
1556  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1557 
1558  a0_re = i00_re; a0_im = i00_im;
1559  a1_re = i01_re; a1_im = i01_im;
1560  a2_re = i02_re; a2_im = i02_im;
1561  b0_re = i10_re; b0_im = i10_im;
1562  b1_re = i11_re; b1_im = i11_im;
1563  b2_re = i12_re; b2_im = i12_im;
1564 
1565  }
1566 #endif // MULTI_GPU
1567 
1568  // read gauge matrix from device memory
1569  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1570 
1571  // reconstruct gauge matrix
// NOTE(review): listing line 1572 is absent from this extraction --
// presumably the gauge-reconstruction macro invocation.
1573 
// 3x3 complex multiply of both half-spinor columns by the daggered link;
// keep the generator's accumulation order.
1574  // multiply row 0
1575  spinorFloat A0_re = 0;
1576  A0_re += gT00_re * a0_re;
1577  A0_re -= gT00_im * a0_im;
1578  A0_re += gT01_re * a1_re;
1579  A0_re -= gT01_im * a1_im;
1580  A0_re += gT02_re * a2_re;
1581  A0_re -= gT02_im * a2_im;
1582  spinorFloat A0_im = 0;
1583  A0_im += gT00_re * a0_im;
1584  A0_im += gT00_im * a0_re;
1585  A0_im += gT01_re * a1_im;
1586  A0_im += gT01_im * a1_re;
1587  A0_im += gT02_re * a2_im;
1588  A0_im += gT02_im * a2_re;
1589  spinorFloat B0_re = 0;
1590  B0_re += gT00_re * b0_re;
1591  B0_re -= gT00_im * b0_im;
1592  B0_re += gT01_re * b1_re;
1593  B0_re -= gT01_im * b1_im;
1594  B0_re += gT02_re * b2_re;
1595  B0_re -= gT02_im * b2_im;
1596  spinorFloat B0_im = 0;
1597  B0_im += gT00_re * b0_im;
1598  B0_im += gT00_im * b0_re;
1599  B0_im += gT01_re * b1_im;
1600  B0_im += gT01_im * b1_re;
1601  B0_im += gT02_re * b2_im;
1602  B0_im += gT02_im * b2_re;
1603 
1604  // multiply row 1
1605  spinorFloat A1_re = 0;
1606  A1_re += gT10_re * a0_re;
1607  A1_re -= gT10_im * a0_im;
1608  A1_re += gT11_re * a1_re;
1609  A1_re -= gT11_im * a1_im;
1610  A1_re += gT12_re * a2_re;
1611  A1_re -= gT12_im * a2_im;
1612  spinorFloat A1_im = 0;
1613  A1_im += gT10_re * a0_im;
1614  A1_im += gT10_im * a0_re;
1615  A1_im += gT11_re * a1_im;
1616  A1_im += gT11_im * a1_re;
1617  A1_im += gT12_re * a2_im;
1618  A1_im += gT12_im * a2_re;
1619  spinorFloat B1_re = 0;
1620  B1_re += gT10_re * b0_re;
1621  B1_re -= gT10_im * b0_im;
1622  B1_re += gT11_re * b1_re;
1623  B1_re -= gT11_im * b1_im;
1624  B1_re += gT12_re * b2_re;
1625  B1_re -= gT12_im * b2_im;
1626  spinorFloat B1_im = 0;
1627  B1_im += gT10_re * b0_im;
1628  B1_im += gT10_im * b0_re;
1629  B1_im += gT11_re * b1_im;
1630  B1_im += gT11_im * b1_re;
1631  B1_im += gT12_re * b2_im;
1632  B1_im += gT12_im * b2_re;
1633 
1634  // multiply row 2
1635  spinorFloat A2_re = 0;
1636  A2_re += gT20_re * a0_re;
1637  A2_re -= gT20_im * a0_im;
1638  A2_re += gT21_re * a1_re;
1639  A2_re -= gT21_im * a1_im;
1640  A2_re += gT22_re * a2_re;
1641  A2_re -= gT22_im * a2_im;
1642  spinorFloat A2_im = 0;
1643  A2_im += gT20_re * a0_im;
1644  A2_im += gT20_im * a0_re;
1645  A2_im += gT21_re * a1_im;
1646  A2_im += gT21_im * a1_re;
1647  A2_im += gT22_re * a2_im;
1648  A2_im += gT22_im * a2_re;
1649  spinorFloat B2_re = 0;
1650  B2_re += gT20_re * b0_re;
1651  B2_re -= gT20_im * b0_im;
1652  B2_re += gT21_re * b1_re;
1653  B2_re -= gT21_im * b1_im;
1654  B2_re += gT22_re * b2_re;
1655  B2_re -= gT22_im * b2_im;
1656  spinorFloat B2_im = 0;
1657  B2_im += gT20_re * b0_im;
1658  B2_im += gT20_im * b0_re;
1659  B2_im += gT21_re * b1_im;
1660  B2_im += gT21_im * b1_re;
1661  B2_im += gT22_re * b2_im;
1662  B2_im += gT22_im * b2_re;
1663 
// Precision-matched scale factor for the axpy accumulation.
1664 #ifdef SPINOR_DOUBLE
1665  spinorFloat a = param.a;
1666 #else
1667  spinorFloat a = param.a_f;
1668 #endif
// Reconstruct the full spinor with the P1+ spin structure (real couplings,
// signs opposite to the P1- block) and accumulate.
1669  o00_re += a*A0_re;
1670  o00_im += a*A0_im;
1671  o10_re += a*B0_re;
1672  o10_im += a*B0_im;
1673  o20_re -= a*B0_re;
1674  o20_im -= a*B0_im;
1675  o30_re += a*A0_re;
1676  o30_im += a*A0_im;
1677 
1678  o01_re += a*A1_re;
1679  o01_im += a*A1_im;
1680  o11_re += a*B1_re;
1681  o11_im += a*B1_im;
1682  o21_re -= a*B1_re;
1683  o21_im -= a*B1_im;
1684  o31_re += a*A1_re;
1685  o31_im += a*A1_im;
1686 
1687  o02_re += a*A2_re;
1688  o02_im += a*A2_im;
1689  o12_re += a*B2_re;
1690  o12_im += a*B2_im;
1691  o22_re -= a*B2_re;
1692  o22_im -= a*B2_im;
1693  o32_re += a*A2_re;
1694  o32_im += a*A2_im;
1695 
1696 }
1697 
// Forward-Z hopping term: gather the neighbor spinor at z+1 (or the Z ghost
// face), project with P2- (matrix below), multiply by the local link (g*
// accessors from GAUGE0TEX at this site, ga_idx = sid), and axpy into the
// output spinor with coefficient `a`.
1698 #ifdef MULTI_GPU
1699 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1700  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1701 #endif
1702 {
1703  // Projector P2-
1704  // 1 0 -i 0
1705  // 0 1 0 i
1706  // i 0 1 0
1707  // 0 -i 0 1
1708 
// Neighbor index: wrap at the top of the local Z extent, else step by one XY plane.
1709 #ifdef MULTI_GPU
1710  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1711  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1712 #if (DD_PREC==2) // half precision
1713  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1714 #endif
1715 #else
1716  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1717 #endif
1718 
// Forward direction: the link is attached to the current site.
1719  const int ga_idx = sid;
1720 
// NOTE(review): listing lines 1721-1726 are absent from this extraction --
// presumably the half-spinor register declarations a0..b2; confirm against
// the original generated source.
1727 
1728 #ifdef MULTI_GPU
1729  if (kernel_type == INTERIOR_KERNEL) {
1730 #endif
1731 
1732  // read spinor from device memory
1733  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1734 
1735  // project spinor into half spinors
1736  a0_re = +i00_re+i20_im;
1737  a0_im = +i00_im-i20_re;
1738  a1_re = +i01_re+i21_im;
1739  a1_im = +i01_im-i21_re;
1740  a2_re = +i02_re+i22_im;
1741  a2_im = +i02_im-i22_re;
1742  b0_re = +i10_re-i30_im;
1743  b0_im = +i10_im+i30_re;
1744  b1_re = +i11_re-i31_im;
1745  b1_im = +i11_im+i31_re;
1746  b2_re = +i12_re-i32_im;
1747  b2_im = +i12_im+i32_re;
1748 
1749 #ifdef MULTI_GPU
1750  } else {
1751 
1752  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1753 
// Exterior kernel: ghost buffer already contains the projected half spinor.
1754  // read half spinor from device memory
1755  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1756 
1757  a0_re = i00_re; a0_im = i00_im;
1758  a1_re = i01_re; a1_im = i01_im;
1759  a2_re = i02_re; a2_im = i02_im;
1760  b0_re = i10_re; b0_im = i10_im;
1761  b1_re = i11_re; b1_im = i11_im;
1762  b2_re = i12_re; b2_im = i12_im;
1763 
1764  }
1765 #endif // MULTI_GPU
1766 
1767  // read gauge matrix from device memory
1768  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1769 
1770  // reconstruct gauge matrix
// NOTE(review): listing line 1771 is absent from this extraction --
// presumably the gauge-reconstruction macro invocation.
1772 
// 3x3 complex multiply of both half-spinor columns by the link; keep the
// generator's accumulation order.
1773  // multiply row 0
1774  spinorFloat A0_re = 0;
1775  A0_re += g00_re * a0_re;
1776  A0_re -= g00_im * a0_im;
1777  A0_re += g01_re * a1_re;
1778  A0_re -= g01_im * a1_im;
1779  A0_re += g02_re * a2_re;
1780  A0_re -= g02_im * a2_im;
1781  spinorFloat A0_im = 0;
1782  A0_im += g00_re * a0_im;
1783  A0_im += g00_im * a0_re;
1784  A0_im += g01_re * a1_im;
1785  A0_im += g01_im * a1_re;
1786  A0_im += g02_re * a2_im;
1787  A0_im += g02_im * a2_re;
1788  spinorFloat B0_re = 0;
1789  B0_re += g00_re * b0_re;
1790  B0_re -= g00_im * b0_im;
1791  B0_re += g01_re * b1_re;
1792  B0_re -= g01_im * b1_im;
1793  B0_re += g02_re * b2_re;
1794  B0_re -= g02_im * b2_im;
1795  spinorFloat B0_im = 0;
1796  B0_im += g00_re * b0_im;
1797  B0_im += g00_im * b0_re;
1798  B0_im += g01_re * b1_im;
1799  B0_im += g01_im * b1_re;
1800  B0_im += g02_re * b2_im;
1801  B0_im += g02_im * b2_re;
1802 
1803  // multiply row 1
1804  spinorFloat A1_re = 0;
1805  A1_re += g10_re * a0_re;
1806  A1_re -= g10_im * a0_im;
1807  A1_re += g11_re * a1_re;
1808  A1_re -= g11_im * a1_im;
1809  A1_re += g12_re * a2_re;
1810  A1_re -= g12_im * a2_im;
1811  spinorFloat A1_im = 0;
1812  A1_im += g10_re * a0_im;
1813  A1_im += g10_im * a0_re;
1814  A1_im += g11_re * a1_im;
1815  A1_im += g11_im * a1_re;
1816  A1_im += g12_re * a2_im;
1817  A1_im += g12_im * a2_re;
1818  spinorFloat B1_re = 0;
1819  B1_re += g10_re * b0_re;
1820  B1_re -= g10_im * b0_im;
1821  B1_re += g11_re * b1_re;
1822  B1_re -= g11_im * b1_im;
1823  B1_re += g12_re * b2_re;
1824  B1_re -= g12_im * b2_im;
1825  spinorFloat B1_im = 0;
1826  B1_im += g10_re * b0_im;
1827  B1_im += g10_im * b0_re;
1828  B1_im += g11_re * b1_im;
1829  B1_im += g11_im * b1_re;
1830  B1_im += g12_re * b2_im;
1831  B1_im += g12_im * b2_re;
1832 
1833  // multiply row 2
1834  spinorFloat A2_re = 0;
1835  A2_re += g20_re * a0_re;
1836  A2_re -= g20_im * a0_im;
1837  A2_re += g21_re * a1_re;
1838  A2_re -= g21_im * a1_im;
1839  A2_re += g22_re * a2_re;
1840  A2_re -= g22_im * a2_im;
1841  spinorFloat A2_im = 0;
1842  A2_im += g20_re * a0_im;
1843  A2_im += g20_im * a0_re;
1844  A2_im += g21_re * a1_im;
1845  A2_im += g21_im * a1_re;
1846  A2_im += g22_re * a2_im;
1847  A2_im += g22_im * a2_re;
1848  spinorFloat B2_re = 0;
1849  B2_re += g20_re * b0_re;
1850  B2_re -= g20_im * b0_im;
1851  B2_re += g21_re * b1_re;
1852  B2_re -= g21_im * b1_im;
1853  B2_re += g22_re * b2_re;
1854  B2_re -= g22_im * b2_im;
1855  spinorFloat B2_im = 0;
1856  B2_im += g20_re * b0_im;
1857  B2_im += g20_im * b0_re;
1858  B2_im += g21_re * b1_im;
1859  B2_im += g21_im * b1_re;
1860  B2_im += g22_re * b2_im;
1861  B2_im += g22_im * b2_re;
1862 
// Precision-matched scale factor for the axpy accumulation.
1863 #ifdef SPINOR_DOUBLE
1864  spinorFloat a = param.a;
1865 #else
1866  spinorFloat a = param.a_f;
1867 #endif
// Reconstruct the full spinor with the P2- spin structure (lower components
// are +/- i times the upper ones, crossed between the A and B columns) and
// accumulate.
1868  o00_re += a*A0_re;
1869  o00_im += a*A0_im;
1870  o10_re += a*B0_re;
1871  o10_im += a*B0_im;
1872  o20_re -= a*A0_im;
1873  o20_im += a*A0_re;
1874  o30_re += a*B0_im;
1875  o30_im -= a*B0_re;
1876 
1877  o01_re += a*A1_re;
1878  o01_im += a*A1_im;
1879  o11_re += a*B1_re;
1880  o11_im += a*B1_im;
1881  o21_re -= a*A1_im;
1882  o21_im += a*A1_re;
1883  o31_re += a*B1_im;
1884  o31_im -= a*B1_re;
1885 
1886  o02_re += a*A2_re;
1887  o02_im += a*A2_im;
1888  o12_re += a*B2_re;
1889  o12_im += a*B2_im;
1890  o22_re -= a*A2_im;
1891  o22_im += a*A2_re;
1892  o32_re += a*B2_im;
1893  o32_im -= a*B2_re;
1894 
1895 }
1896 
// Backward-Z hopping term: gather the neighbor spinor at z-1 (or the Z ghost
// face), project with P2+ (matrix below), multiply by the conjugate-transposed
// neighbor link (gT* accessors from GAUGE1TEX), and axpy into the output
// spinor with coefficient `a`.
1897 #ifdef MULTI_GPU
1898 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1899  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1900 #endif
1901 {
1902  // Projector P2+
1903  // 1 0 i 0
1904  // 0 1 0 -i
1905  // -i 0 1 0
1906  // 0 i 0 1
1907 
// Neighbor index: wrap at the bottom of the local Z extent, else step back one XY plane.
1908 #ifdef MULTI_GPU
1909  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1910  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1911 #if (DD_PREC==2) // half precision
1912  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1913 #endif
1914 #else
1915  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1916 #endif
1917 
// Backward direction: the link lives at the neighbor site (ghost links past Vh).
1918 #ifdef MULTI_GPU
1919  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1920 #else
1921  const int ga_idx = sp_idx;
1922 #endif
1923 
// NOTE(review): listing lines 1924-1929 are absent from this extraction --
// presumably the half-spinor register declarations a0..b2; confirm against
// the original generated source.
1930 
1931 #ifdef MULTI_GPU
1932  if (kernel_type == INTERIOR_KERNEL) {
1933 #endif
1934 
1935  // read spinor from device memory
1936  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1937 
1938  // project spinor into half spinors
1939  a0_re = +i00_re-i20_im;
1940  a0_im = +i00_im+i20_re;
1941  a1_re = +i01_re-i21_im;
1942  a1_im = +i01_im+i21_re;
1943  a2_re = +i02_re-i22_im;
1944  a2_im = +i02_im+i22_re;
1945  b0_re = +i10_re+i30_im;
1946  b0_im = +i10_im-i30_re;
1947  b1_re = +i11_re+i31_im;
1948  b1_im = +i11_im-i31_re;
1949  b2_re = +i12_re+i32_im;
1950  b2_im = +i12_im-i32_re;
1951 
1952 #ifdef MULTI_GPU
1953  } else {
1954 
1955  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1956 
// Exterior kernel: ghost buffer already contains the projected half spinor.
1957  // read half spinor from device memory
1958  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1959 
1960  a0_re = i00_re; a0_im = i00_im;
1961  a1_re = i01_re; a1_im = i01_im;
1962  a2_re = i02_re; a2_im = i02_im;
1963  b0_re = i10_re; b0_im = i10_im;
1964  b1_re = i11_re; b1_im = i11_im;
1965  b2_re = i12_re; b2_im = i12_im;
1966 
1967  }
1968 #endif // MULTI_GPU
1969 
1970  // read gauge matrix from device memory
1971  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1972 
1973  // reconstruct gauge matrix
// NOTE(review): listing line 1974 is absent from this extraction --
// presumably the gauge-reconstruction macro invocation.
1975 
// 3x3 complex multiply of both half-spinor columns by the daggered link;
// keep the generator's accumulation order.
1976  // multiply row 0
1977  spinorFloat A0_re = 0;
1978  A0_re += gT00_re * a0_re;
1979  A0_re -= gT00_im * a0_im;
1980  A0_re += gT01_re * a1_re;
1981  A0_re -= gT01_im * a1_im;
1982  A0_re += gT02_re * a2_re;
1983  A0_re -= gT02_im * a2_im;
1984  spinorFloat A0_im = 0;
1985  A0_im += gT00_re * a0_im;
1986  A0_im += gT00_im * a0_re;
1987  A0_im += gT01_re * a1_im;
1988  A0_im += gT01_im * a1_re;
1989  A0_im += gT02_re * a2_im;
1990  A0_im += gT02_im * a2_re;
1991  spinorFloat B0_re = 0;
1992  B0_re += gT00_re * b0_re;
1993  B0_re -= gT00_im * b0_im;
1994  B0_re += gT01_re * b1_re;
1995  B0_re -= gT01_im * b1_im;
1996  B0_re += gT02_re * b2_re;
1997  B0_re -= gT02_im * b2_im;
1998  spinorFloat B0_im = 0;
1999  B0_im += gT00_re * b0_im;
2000  B0_im += gT00_im * b0_re;
2001  B0_im += gT01_re * b1_im;
2002  B0_im += gT01_im * b1_re;
2003  B0_im += gT02_re * b2_im;
2004  B0_im += gT02_im * b2_re;
2005 
2006  // multiply row 1
2007  spinorFloat A1_re = 0;
2008  A1_re += gT10_re * a0_re;
2009  A1_re -= gT10_im * a0_im;
2010  A1_re += gT11_re * a1_re;
2011  A1_re -= gT11_im * a1_im;
2012  A1_re += gT12_re * a2_re;
2013  A1_re -= gT12_im * a2_im;
2014  spinorFloat A1_im = 0;
2015  A1_im += gT10_re * a0_im;
2016  A1_im += gT10_im * a0_re;
2017  A1_im += gT11_re * a1_im;
2018  A1_im += gT11_im * a1_re;
2019  A1_im += gT12_re * a2_im;
2020  A1_im += gT12_im * a2_re;
2021  spinorFloat B1_re = 0;
2022  B1_re += gT10_re * b0_re;
2023  B1_re -= gT10_im * b0_im;
2024  B1_re += gT11_re * b1_re;
2025  B1_re -= gT11_im * b1_im;
2026  B1_re += gT12_re * b2_re;
2027  B1_re -= gT12_im * b2_im;
2028  spinorFloat B1_im = 0;
2029  B1_im += gT10_re * b0_im;
2030  B1_im += gT10_im * b0_re;
2031  B1_im += gT11_re * b1_im;
2032  B1_im += gT11_im * b1_re;
2033  B1_im += gT12_re * b2_im;
2034  B1_im += gT12_im * b2_re;
2035 
2036  // multiply row 2
2037  spinorFloat A2_re = 0;
2038  A2_re += gT20_re * a0_re;
2039  A2_re -= gT20_im * a0_im;
2040  A2_re += gT21_re * a1_re;
2041  A2_re -= gT21_im * a1_im;
2042  A2_re += gT22_re * a2_re;
2043  A2_re -= gT22_im * a2_im;
2044  spinorFloat A2_im = 0;
2045  A2_im += gT20_re * a0_im;
2046  A2_im += gT20_im * a0_re;
2047  A2_im += gT21_re * a1_im;
2048  A2_im += gT21_im * a1_re;
2049  A2_im += gT22_re * a2_im;
2050  A2_im += gT22_im * a2_re;
2051  spinorFloat B2_re = 0;
2052  B2_re += gT20_re * b0_re;
2053  B2_re -= gT20_im * b0_im;
2054  B2_re += gT21_re * b1_re;
2055  B2_re -= gT21_im * b1_im;
2056  B2_re += gT22_re * b2_re;
2057  B2_re -= gT22_im * b2_im;
2058  spinorFloat B2_im = 0;
2059  B2_im += gT20_re * b0_im;
2060  B2_im += gT20_im * b0_re;
2061  B2_im += gT21_re * b1_im;
2062  B2_im += gT21_im * b1_re;
2063  B2_im += gT22_re * b2_im;
2064  B2_im += gT22_im * b2_re;
2065 
// Precision-matched scale factor for the axpy accumulation.
2066 #ifdef SPINOR_DOUBLE
2067  spinorFloat a = param.a;
2068 #else
2069  spinorFloat a = param.a_f;
2070 #endif
// Reconstruct the full spinor with the P2+ spin structure (signs opposite to
// the P2- block) and accumulate.
2071  o00_re += a*A0_re;
2072  o00_im += a*A0_im;
2073  o10_re += a*B0_re;
2074  o10_im += a*B0_im;
2075  o20_re += a*A0_im;
2076  o20_im -= a*A0_re;
2077  o30_re -= a*B0_im;
2078  o30_im += a*B0_re;
2079 
2080  o01_re += a*A1_re;
2081  o01_im += a*A1_im;
2082  o11_re += a*B1_re;
2083  o11_im += a*B1_im;
2084  o21_re += a*A1_im;
2085  o21_im -= a*A1_re;
2086  o31_re -= a*B1_im;
2087  o31_im += a*B1_re;
2088 
2089  o02_re += a*A2_re;
2090  o02_im += a*A2_im;
2091  o12_re += a*B2_re;
2092  o12_im += a*B2_im;
2093  o22_re += a*A2_im;
2094  o22_im -= a*A2_re;
2095  o32_re -= a*B2_im;
2096  o32_im += a*B2_re;
2097 
2098 }
2099 
2100 #ifdef MULTI_GPU
2101 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
2102  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
2103 #endif
2104 {
2105  // Projector P3-
2106  // 0 0 0 0
2107  // 0 0 0 0
2108  // 0 0 2 0
2109  // 0 0 0 2
2110 
2111 #ifdef MULTI_GPU
2112  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
2113  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
2114 #if (DD_PREC==2) // half precision
2115  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
2116 #endif
2117 #else
2118  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
2119 #endif
2120 
2121  const int ga_idx = sid;
2122 
2123  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2124  {
2131 
2132 #ifdef MULTI_GPU
2133  if (kernel_type == INTERIOR_KERNEL) {
2134 #endif
2135 
2136  // read spinor from device memory
2138 
2139  // project spinor into half spinors
2140  a0_re = +2*i20_re;
2141  a0_im = +2*i20_im;
2142  a1_re = +2*i21_re;
2143  a1_im = +2*i21_im;
2144  a2_re = +2*i22_re;
2145  a2_im = +2*i22_im;
2146  b0_re = +2*i30_re;
2147  b0_im = +2*i30_im;
2148  b1_re = +2*i31_re;
2149  b1_im = +2*i31_im;
2150  b2_re = +2*i32_re;
2151  b2_im = +2*i32_im;
2152 
2153 #ifdef MULTI_GPU
2154  } else {
2155 
2156  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2157  const int t_proj_scale = TPROJSCALE;
2158 
2159  // read half spinor from device memory
2160  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2161 
2162  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2163  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2164  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2165  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2166  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2167  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2168 
2169  }
2170 #endif // MULTI_GPU
2171 
2172  // identity gauge matrix
2179 
2180 #ifdef SPINOR_DOUBLE
2181  spinorFloat a = param.a;
2182 #else
2183  spinorFloat a = param.a_f;
2184 #endif
2185  o20_re += a*A0_re;
2186  o20_im += a*A0_im;
2187  o30_re += a*B0_re;
2188  o30_im += a*B0_im;
2189 
2190  o21_re += a*A1_re;
2191  o21_im += a*A1_im;
2192  o31_re += a*B1_re;
2193  o31_im += a*B1_im;
2194 
2195  o22_re += a*A2_re;
2196  o22_im += a*A2_im;
2197  o32_re += a*B2_re;
2198  o32_im += a*B2_im;
2199 
// --- Non-gauge-fixed path: same t-direction contribution, but with an explicit
// SU(3) link multiply (gauge matrix read as direction index 6 from GAUGE0TEX).
2200  } else {
// NOTE(review): listing lines 2201-2206 are folded out of this rendering.
2207 
2208 #ifdef MULTI_GPU
2209  if (kernel_type == INTERIOR_KERNEL) {
2210 #endif
2211 
2212  // read spinor from device memory
// NOTE(review): listing line 2213 (presumably a READ_SPINOR(...) call, cf. the
// READ_SPINOR_UP calls in later sections) is folded out of this rendering.
2214 
2215  // project spinor into half spinors
// Lower spin components (rows 2/3) weighted by +2, as in the projector matrix
// documented at the "Projector P3+" section below.
2216  a0_re = +2*i20_re;
2217  a0_im = +2*i20_im;
2218  a1_re = +2*i21_re;
2219  a1_im = +2*i21_im;
2220  a2_re = +2*i22_re;
2221  a2_im = +2*i22_im;
2222  b0_re = +2*i30_re;
2223  b0_im = +2*i30_im;
2224  b1_re = +2*i31_re;
2225  b1_im = +2*i31_im;
2226  b2_re = +2*i32_re;
2227  b2_im = +2*i32_im;
2228 
2229 #ifdef MULTI_GPU
// Exterior kernel: fetch the already-projected half spinor from the ghost zone.
2230  } else {
2231 
2232  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2233  const int t_proj_scale = TPROJSCALE;
2234 
2235  // read half spinor from device memory
2236  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2237 
2238  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2239  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2240  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2241  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2242  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2243  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2244 
2245  }
2246 #endif // MULTI_GPU
2247 
2248  // read gauge matrix from device memory
2249  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
2250 
2251  // reconstruct gauge matrix
// NOTE(review): listing line 2252 (presumably RECONSTRUCT_GAUGE_MATRIX(6), cf.
// the cross-reference list at the end of this file) is folded out.
2253 
// Complex 3x3 matrix-vector products: (A,B) = U * (a,b), expanded per component
// with the usual (re*re - im*im, re*im + im*re) complex-multiply pattern.
2254  // multiply row 0
2255  spinorFloat A0_re = 0;
2256  A0_re += g00_re * a0_re;
2257  A0_re -= g00_im * a0_im;
2258  A0_re += g01_re * a1_re;
2259  A0_re -= g01_im * a1_im;
2260  A0_re += g02_re * a2_re;
2261  A0_re -= g02_im * a2_im;
2262  spinorFloat A0_im = 0;
2263  A0_im += g00_re * a0_im;
2264  A0_im += g00_im * a0_re;
2265  A0_im += g01_re * a1_im;
2266  A0_im += g01_im * a1_re;
2267  A0_im += g02_re * a2_im;
2268  A0_im += g02_im * a2_re;
2269  spinorFloat B0_re = 0;
2270  B0_re += g00_re * b0_re;
2271  B0_re -= g00_im * b0_im;
2272  B0_re += g01_re * b1_re;
2273  B0_re -= g01_im * b1_im;
2274  B0_re += g02_re * b2_re;
2275  B0_re -= g02_im * b2_im;
2276  spinorFloat B0_im = 0;
2277  B0_im += g00_re * b0_im;
2278  B0_im += g00_im * b0_re;
2279  B0_im += g01_re * b1_im;
2280  B0_im += g01_im * b1_re;
2281  B0_im += g02_re * b2_im;
2282  B0_im += g02_im * b2_re;
2283 
2284  // multiply row 1
2285  spinorFloat A1_re = 0;
2286  A1_re += g10_re * a0_re;
2287  A1_re -= g10_im * a0_im;
2288  A1_re += g11_re * a1_re;
2289  A1_re -= g11_im * a1_im;
2290  A1_re += g12_re * a2_re;
2291  A1_re -= g12_im * a2_im;
2292  spinorFloat A1_im = 0;
2293  A1_im += g10_re * a0_im;
2294  A1_im += g10_im * a0_re;
2295  A1_im += g11_re * a1_im;
2296  A1_im += g11_im * a1_re;
2297  A1_im += g12_re * a2_im;
2298  A1_im += g12_im * a2_re;
2299  spinorFloat B1_re = 0;
2300  B1_re += g10_re * b0_re;
2301  B1_re -= g10_im * b0_im;
2302  B1_re += g11_re * b1_re;
2303  B1_re -= g11_im * b1_im;
2304  B1_re += g12_re * b2_re;
2305  B1_re -= g12_im * b2_im;
2306  spinorFloat B1_im = 0;
2307  B1_im += g10_re * b0_im;
2308  B1_im += g10_im * b0_re;
2309  B1_im += g11_re * b1_im;
2310  B1_im += g11_im * b1_re;
2311  B1_im += g12_re * b2_im;
2312  B1_im += g12_im * b2_re;
2313 
2314  // multiply row 2
2315  spinorFloat A2_re = 0;
2316  A2_re += g20_re * a0_re;
2317  A2_re -= g20_im * a0_im;
2318  A2_re += g21_re * a1_re;
2319  A2_re -= g21_im * a1_im;
2320  A2_re += g22_re * a2_re;
2321  A2_re -= g22_im * a2_im;
2322  spinorFloat A2_im = 0;
2323  A2_im += g20_re * a0_im;
2324  A2_im += g20_im * a0_re;
2325  A2_im += g21_re * a1_im;
2326  A2_im += g21_im * a1_re;
2327  A2_im += g22_re * a2_im;
2328  A2_im += g22_im * a2_re;
2329  spinorFloat B2_re = 0;
2330  B2_re += g20_re * b0_re;
2331  B2_re -= g20_im * b0_im;
2332  B2_re += g21_re * b1_re;
2333  B2_re -= g21_im * b1_im;
2334  B2_re += g22_re * b2_re;
2335  B2_re -= g22_im * b2_im;
2336  spinorFloat B2_im = 0;
2337  B2_im += g20_re * b0_im;
2338  B2_im += g20_im * b0_re;
2339  B2_im += g21_re * b1_im;
2340  B2_im += g21_im * b1_re;
2341  B2_im += g22_re * b2_im;
2342  B2_im += g22_im * b2_re;
2343 
// Accumulate a * (U * P(psi)) into output spin components 2/3 (o2x/o3x).
2344 #ifdef SPINOR_DOUBLE
2345  spinorFloat a = param.a;
2346 #else
2347  spinorFloat a = param.a_f;
2348 #endif
2349  o20_re += a*A0_re;
2350  o20_im += a*A0_im;
2351  o30_re += a*B0_re;
2352  o30_im += a*B0_im;
2353 
2354  o21_re += a*A1_re;
2355  o21_im += a*A1_im;
2356  o31_re += a*B1_re;
2357  o31_im += a*B1_im;
2358 
2359  o22_re += a*A2_re;
2360  o22_im += a*A2_im;
2361  o32_re += a*B2_re;
2362  o32_im += a*B2_im;
2363 
// End of gauge_fixed/else branch and of this direction's contribution.
2364  }
2365 }
2366 
// --- t-forward neighbour contribution with projector P3+ (diag(2,2,0,0)) ---
// MULTI_GPU guard: the interior kernel handles this site only when the
// backward-t neighbour is local (coord[3]>0 or no ghost in dim 3); the
// EXTERIOR_KERNEL_T pass handles the coord[3]==0 boundary sites.
2367 #ifdef MULTI_GPU
2368 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
2369  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
2370 #endif
2371 {
2372  // Projector P3+
2373  // 2 0 0 0
2374  // 0 2 0 0
2375  // 0 0 0 0
2376  // 0 0 0 0
2377 
// Neighbour spinor index: wrap around the t-boundary (coord[3]==0) or step one
// t-slice back (X - X3X2X1); >>1 converts to a checkerboard (half-lattice) index.
// Exterior kernels instead index into the packed ghost buffer via face_idx.
2378 #ifdef MULTI_GPU
2379  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
2380  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
2381 #if (DD_PREC==2) // half precision
// Half precision stores per-site norms separately; index the ghost norm array.
2382  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2383 #endif
2384 #else
2385  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
2386 #endif
2387 
// Gauge link index: boundary links are stored past the local volume (Vh+face_idx).
2388 #ifdef MULTI_GPU
2389  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2390 #else
2391  const int ga_idx = sp_idx;
2392 #endif
2393 
// Gauge-fixed fast path: in the temporal-gauge-fixed interior (all but the last
// t-slice of links), the link is the identity and the 3x3 multiply is skipped.
2394  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2395  {
// NOTE(review): listing lines 2396-2401 are folded out of this rendering; they
// presumably declare the half-spinor temporaries a0..b2 — confirm against the
// generated source.
2402 
2403 #ifdef MULTI_GPU
2404  if (kernel_type == INTERIOR_KERNEL) {
2405 #endif
2406 
2407  // read spinor from device memory
// READ_SPINOR_UP fetches only the upper spin components, which is all P3+ needs.
2408  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2409 
2410  // project spinor into half spinors
// P3+ keeps spin rows 0/1 (i0x/i1x) with weight +2, matching the matrix above.
2411  a0_re = +2*i00_re;
2412  a0_im = +2*i00_im;
2413  a1_re = +2*i01_re;
2414  a1_im = +2*i01_im;
2415  a2_re = +2*i02_re;
2416  a2_im = +2*i02_im;
2417  b0_re = +2*i10_re;
2418  b0_im = +2*i10_im;
2419  b1_re = +2*i11_re;
2420  b1_im = +2*i11_im;
2421  b2_re = +2*i12_re;
2422  b2_im = +2*i12_im;
2423 
2424 #ifdef MULTI_GPU
// Exterior kernel: read the pre-projected half spinor from ghost zone 7 (t+).
2425  } else {
2426 
2427  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2428  const int t_proj_scale = TPROJSCALE;
2429 
2430  // read half spinor from device memory
2431  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2432 
2433  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2434  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2435  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2436  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2437  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2438  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2439 
2440  }
2441 #endif // MULTI_GPU
2442 
2443  // identity gauge matrix
// NOTE(review): listing lines 2444-2449 are folded out of this rendering; they
// presumably set A0..B2 = a0..b2 (identity-link multiply) — confirm.
2450 
// Accumulate a * P3+(psi) into output spin components 0/1 (o0x/o1x).
2451 #ifdef SPINOR_DOUBLE
2452  spinorFloat a = param.a;
2453 #else
2454  spinorFloat a = param.a_f;
2455 #endif
2456  o00_re += a*A0_re;
2457  o00_im += a*A0_im;
2458  o10_re += a*B0_re;
2459  o10_im += a*B0_im;
2460 
2461  o01_re += a*A1_re;
2462  o01_im += a*A1_im;
2463  o11_re += a*B1_re;
2464  o11_im += a*B1_im;
2465 
2466  o02_re += a*A2_re;
2467  o02_im += a*A2_im;
2468  o12_re += a*B2_re;
2469  o12_im += a*B2_im;
2470 
// --- P3+ non-gauge-fixed path: multiply by U^dagger of the backward-t link ---
// (gT* macros denote the conjugate-transposed gauge matrix, read from GAUGE1TEX
// with direction index 7).
2471  } else {
// NOTE(review): listing lines 2472-2477 are folded out of this rendering.
2478 
2479 #ifdef MULTI_GPU
2480  if (kernel_type == INTERIOR_KERNEL) {
2481 #endif
2482 
2483  // read spinor from device memory
// Only the upper spin components are needed for P3+.
2484  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2485 
2486  // project spinor into half spinors
2487  a0_re = +2*i00_re;
2488  a0_im = +2*i00_im;
2489  a1_re = +2*i01_re;
2490  a1_im = +2*i01_im;
2491  a2_re = +2*i02_re;
2492  a2_im = +2*i02_im;
2493  b0_re = +2*i10_re;
2494  b0_im = +2*i10_im;
2495  b1_re = +2*i11_re;
2496  b1_im = +2*i11_im;
2497  b2_re = +2*i12_re;
2498  b2_im = +2*i12_im;
2499 
2500 #ifdef MULTI_GPU
// Exterior kernel: pre-projected half spinor from ghost zone 7 (t+ face).
2501  } else {
2502 
2503  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2504  const int t_proj_scale = TPROJSCALE;
2505 
2506  // read half spinor from device memory
2507  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2508 
2509  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2510  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2511  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2512  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2513  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2514  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2515 
2516  }
2517 #endif // MULTI_GPU
2518 
2519  // read gauge matrix from device memory
2520  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2521 
2522  // reconstruct gauge matrix
// NOTE(review): listing line 2523 (presumably RECONSTRUCT_GAUGE_MATRIX(7)) is
// folded out of this rendering.
2524 
// (A,B) = U^dagger * (a,b), expanded complex multiply using the gT* macros.
2525  // multiply row 0
2526  spinorFloat A0_re = 0;
2527  A0_re += gT00_re * a0_re;
2528  A0_re -= gT00_im * a0_im;
2529  A0_re += gT01_re * a1_re;
2530  A0_re -= gT01_im * a1_im;
2531  A0_re += gT02_re * a2_re;
2532  A0_re -= gT02_im * a2_im;
2533  spinorFloat A0_im = 0;
2534  A0_im += gT00_re * a0_im;
2535  A0_im += gT00_im * a0_re;
2536  A0_im += gT01_re * a1_im;
2537  A0_im += gT01_im * a1_re;
2538  A0_im += gT02_re * a2_im;
2539  A0_im += gT02_im * a2_re;
2540  spinorFloat B0_re = 0;
2541  B0_re += gT00_re * b0_re;
2542  B0_re -= gT00_im * b0_im;
2543  B0_re += gT01_re * b1_re;
2544  B0_re -= gT01_im * b1_im;
2545  B0_re += gT02_re * b2_re;
2546  B0_re -= gT02_im * b2_im;
2547  spinorFloat B0_im = 0;
2548  B0_im += gT00_re * b0_im;
2549  B0_im += gT00_im * b0_re;
2550  B0_im += gT01_re * b1_im;
2551  B0_im += gT01_im * b1_re;
2552  B0_im += gT02_re * b2_im;
2553  B0_im += gT02_im * b2_re;
2554 
2555  // multiply row 1
2556  spinorFloat A1_re = 0;
2557  A1_re += gT10_re * a0_re;
2558  A1_re -= gT10_im * a0_im;
2559  A1_re += gT11_re * a1_re;
2560  A1_re -= gT11_im * a1_im;
2561  A1_re += gT12_re * a2_re;
2562  A1_re -= gT12_im * a2_im;
2563  spinorFloat A1_im = 0;
2564  A1_im += gT10_re * a0_im;
2565  A1_im += gT10_im * a0_re;
2566  A1_im += gT11_re * a1_im;
2567  A1_im += gT11_im * a1_re;
2568  A1_im += gT12_re * a2_im;
2569  A1_im += gT12_im * a2_re;
2570  spinorFloat B1_re = 0;
2571  B1_re += gT10_re * b0_re;
2572  B1_re -= gT10_im * b0_im;
2573  B1_re += gT11_re * b1_re;
2574  B1_re -= gT11_im * b1_im;
2575  B1_re += gT12_re * b2_re;
2576  B1_re -= gT12_im * b2_im;
2577  spinorFloat B1_im = 0;
2578  B1_im += gT10_re * b0_im;
2579  B1_im += gT10_im * b0_re;
2580  B1_im += gT11_re * b1_im;
2581  B1_im += gT11_im * b1_re;
2582  B1_im += gT12_re * b2_im;
2583  B1_im += gT12_im * b2_re;
2584 
2585  // multiply row 2
2586  spinorFloat A2_re = 0;
2587  A2_re += gT20_re * a0_re;
2588  A2_re -= gT20_im * a0_im;
2589  A2_re += gT21_re * a1_re;
2590  A2_re -= gT21_im * a1_im;
2591  A2_re += gT22_re * a2_re;
2592  A2_re -= gT22_im * a2_im;
2593  spinorFloat A2_im = 0;
2594  A2_im += gT20_re * a0_im;
2595  A2_im += gT20_im * a0_re;
2596  A2_im += gT21_re * a1_im;
2597  A2_im += gT21_im * a1_re;
2598  A2_im += gT22_re * a2_im;
2599  A2_im += gT22_im * a2_re;
2600  spinorFloat B2_re = 0;
2601  B2_re += gT20_re * b0_re;
2602  B2_re -= gT20_im * b0_im;
2603  B2_re += gT21_re * b1_re;
2604  B2_re -= gT21_im * b1_im;
2605  B2_re += gT22_re * b2_re;
2606  B2_re -= gT22_im * b2_im;
2607  spinorFloat B2_im = 0;
2608  B2_im += gT20_re * b0_im;
2609  B2_im += gT20_im * b0_re;
2610  B2_im += gT21_re * b1_im;
2611  B2_im += gT21_im * b1_re;
2612  B2_im += gT22_re * b2_im;
2613  B2_im += gT22_im * b2_re;
2614 
// Accumulate a * (U^dagger * P3+(psi)) into output spin components 0/1.
2615 #ifdef SPINOR_DOUBLE
2616  spinorFloat a = param.a;
2617 #else
2618  spinorFloat a = param.a_f;
2619 #endif
2620  o00_re += a*A0_re;
2621  o00_im += a*A0_im;
2622  o10_re += a*B0_re;
2623  o10_im += a*B0_im;
2624 
2625  o01_re += a*A1_re;
2626  o01_im += a*A1_im;
2627  o11_re += a*B1_re;
2628  o11_im += a*B1_im;
2629 
2630  o02_re += a*A2_re;
2631  o02_im += a*A2_im;
2632  o12_re += a*B2_re;
2633  o12_im += a*B2_im;
2634 
// End of gauge_fixed/else branch and of the P3+ contribution.
2635  }
2636 }
2637 
2638 
2639 
// Kernel epilogue: store the accumulated output spinor (o00..o32) for this site.
2640 // write spinor field back to device memory
2641 WRITE_SPINOR(param.sp_stride);
2642 
// Undefine every per-precision macro so this core file can be re-included with
// a different precision (SPINOR_DOUBLE toggled) without redefinition warnings.
2643 // undefine to prevent warning when precision is changed
2644 #undef spinorFloat
// Gauge-matrix component macros (g00..g22).
2645 #undef g00_re
2646 #undef g00_im
2647 #undef g01_re
2648 #undef g01_im
2649 #undef g02_re
2650 #undef g02_im
2651 #undef g10_re
2652 #undef g10_im
2653 #undef g11_re
2654 #undef g11_im
2655 #undef g12_re
2656 #undef g12_im
2657 #undef g20_re
2658 #undef g20_im
2659 #undef g21_re
2660 #undef g21_im
2661 #undef g22_re
2662 #undef g22_im
2663 
// Input-spinor component macros (i00..i32, mapped to I0..I11 registers above).
2664 #undef i00_re
2665 #undef i00_im
2666 #undef i01_re
2667 #undef i01_im
2668 #undef i02_re
2669 #undef i02_im
2670 #undef i10_re
2671 #undef i10_im
2672 #undef i11_re
2673 #undef i11_im
2674 #undef i12_re
2675 #undef i12_im
2676 #undef i20_re
2677 #undef i20_im
2678 #undef i21_re
2679 #undef i21_im
2680 #undef i22_re
2681 #undef i22_im
2682 #undef i30_re
2683 #undef i30_im
2684 #undef i31_re
2685 #undef i31_im
2686 #undef i32_re
2687 #undef i32_im
2688 
// Accumulator-spinor macros (acc00..acc32, mapped to accum0..accum11 above).
2689 #undef acc00_re
2690 #undef acc00_im
2691 #undef acc01_re
2692 #undef acc01_im
2693 #undef acc02_re
2694 #undef acc02_im
2695 #undef acc10_re
2696 #undef acc10_im
2697 #undef acc11_re
2698 #undef acc11_im
2699 #undef acc12_re
2700 #undef acc12_im
2701 #undef acc20_re
2702 #undef acc20_im
2703 #undef acc21_re
2704 #undef acc21_im
2705 #undef acc22_re
2706 #undef acc22_im
2707 #undef acc30_re
2708 #undef acc30_im
2709 #undef acc31_re
2710 #undef acc31_im
2711 #undef acc32_re
2712 #undef acc32_im
2713 
// Clover-term component macros (defined in a part of the file outside this view).
2714 #undef c00_00_re
2715 #undef c01_01_re
2716 #undef c02_02_re
2717 #undef c10_10_re
2718 #undef c11_11_re
2719 #undef c12_12_re
2720 #undef c01_00_re
2721 #undef c01_00_im
2722 #undef c02_00_re
2723 #undef c02_00_im
2724 #undef c10_00_re
2725 #undef c10_00_im
2726 #undef c11_00_re
2727 #undef c11_00_im
2728 #undef c12_00_re
2729 #undef c12_00_im
2730 #undef c02_01_re
2731 #undef c02_01_im
2732 #undef c10_01_re
2733 #undef c10_01_im
2734 #undef c11_01_re
2735 #undef c11_01_im
2736 #undef c12_01_re
2737 #undef c12_01_im
2738 #undef c10_02_re
2739 #undef c10_02_im
2740 #undef c11_02_re
2741 #undef c11_02_im
2742 #undef c12_02_re
2743 #undef c12_02_im
2744 #undef c11_10_re
2745 #undef c11_10_im
2746 #undef c12_10_re
2747 #undef c12_10_im
2748 #undef c12_11_re
2749 #undef c12_11_im
2750 
2751 
2752 #undef VOLATILE
dim3 dim3 blockDim
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o30_im
int sp_idx
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o32_im
#define GAUGE0TEX
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o01_im
QudaGaugeParam param
Definition: pack_test.cpp:17
RECONSTRUCT_GAUGE_MATRIX(0)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o00_im
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
#define GAUGE1TEX
#define SPINORTEX
WRITE_SPINOR(param.sp_stride)
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o31_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o00_re
#define READ_SPINOR_GHOST
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o20_re
#define CLOVERTEX
#define INTERTEX
VOLATILE spinorFloat o22_im
int face_idx
const int face_num
#define TPROJSCALE
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o02_re
#define GHOSTSPINORTEX
#define READ_CLOVER
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o31_im