QUDA  0.9.0
asym_wilson_clover_dslash_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// NOTE(review): this header appears to be machine-generated kernel-body code,
// #include'd into a kernel whose signature lives elsewhere — edit the generator,
// not this file.
// 24 floats per thread = 12 complex components = one 4-spin x 3-color spinor.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// VOLATILE is a compiler workaround: the legacy Open64 backend needed
// 'volatile' on shared-memory accesses, while the NVVM backend
// (CUDA >= 4.1, sm_20+) does not, so it expands to nothing there.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Naming convention: i<s><c>_re/_im is the real/imaginary part of the input
// spinor component with spin index s (0-3) and color index c (0-2); acc<s><c>
// likewise for the accumulator spinor. The macros map these logical names onto
// the packed vector registers: twelve double2 (I0..I11 / accum0..accum11) in
// double precision, six float4 (I0..I5 / accum0..accum5) in single precision.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
// Single precision: same 24 scalars repacked two-complex-per-float4.
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
// g<r><c>_re/_im is the (row r, column c) entry of the 3x3 SU(3) link matrix,
// mapped onto packed gauge registers: nine float2/double2 (G0..G8) when
// GAUGE_FLOAT2 is set, otherwise float4 packing (G0..G4, two complex per
// register; G4.z/G4.w unused for the 18-real-number layout).
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link
// gT<r><c> is the Hermitian conjugate (dagger) of the link matrix above:
// gT[r][c] = conj(g[c][r]). Used when hopping in the negative direction.
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // first chiral block of inverted clover term
// c<sc>_<s'c'>_re/_im is entry (row sc, column s'c') of the 6x6 Hermitian
// clover sub-matrix acting on the first chiral half (spins 0-1, colors 0-2).
// Only the lower triangle plus the real diagonal is stored: 6 real diagonal
// entries followed by 15 complex off-diagonals, packed into C0..C17 (double2)
// or C0..C8 (float4).
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
// Upper triangle of the Hermitian clover sub-matrix, reconstructed from the
// stored lower triangle: c[i][j] = conj(c[j][i]).
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
289 // second chiral block of inverted clover term (reuses C0,...,C9)
// Entries for the second chiral half (spins 2-3) alias the first-block names:
// after the second-block data is loaded into the same C registers, index
// (2+s, c) maps onto the first-block macro for (s, c). No new storage.
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
357 // output spinor
// NOTE(review): original lines 358-381 (the o<s><c>_re/_im output-spinor
// declarations) are elided in this view of the file.
382 
// Shared-memory stride chosen so consecutive threads hit distinct 32-bit
// banks on Fermi: 16 for 8-byte doubles, 32 for 4-byte floats.
383 #ifdef SPINOR_DOUBLE
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
385 #else
386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
387 #endif
388 
389 #include "read_gauge.h"
390 #include "read_clover.h"
391 #include "io_spinor.h"
392 
// Kernel-body fragment (this header is included inside the dslash kernel).
// coord[] = 5d lattice coordinates of this thread's site, X = full linear
// site index, sid = checkerboarded (single-parity) site index.
393 int coord[5];
394 int X;
395 
396 int sid;
397 
398 #ifdef MULTI_GPU
399 int face_idx;
// Interior kernel: update bulk sites; the matching else-branch (exterior
// kernel) handles boundary faces using ghost-zone data.
400 if (kernel_type == INTERIOR_KERNEL) {
401 #endif
402 
403  // Assume even dimensions
404  coordsFromIndex3D<EVEN_X>(X, x, sid, param.parity, param.dc.X);
405 
406  // only need to check Y and Z dims currently since X and T set to match exactly
407  if (coord[1] >= param.dc.X[1]) return;
408  if (coord[2] >= param.dc.X[2]) return;
409 
 // zero the output spinor accumulator (all 12 complex components)
410  o00_re = 0; o00_im = 0;
411  o01_re = 0; o01_im = 0;
412  o02_re = 0; o02_im = 0;
413  o10_re = 0; o10_im = 0;
414  o11_re = 0; o11_im = 0;
415  o12_re = 0; o12_im = 0;
416  o20_re = 0; o20_im = 0;
417  o21_re = 0; o21_im = 0;
418  o22_re = 0; o22_im = 0;
419  o30_re = 0; o30_im = 0;
420  o31_re = 0; o31_im = 0;
421  o32_re = 0; o32_im = 0;
422 #ifdef DSLASH_CLOVER_XPAY
423 
424  READ_ACCUM(ACCUMTEX, param.sp_stride)
425 
426 #ifdef DSLASH_CLOVER
427 
428  // change to chiral basis
429  {
430  spinorFloat a00_re = -acc10_re - acc30_re;
431  spinorFloat a00_im = -acc10_im - acc30_im;
432  spinorFloat a10_re = acc00_re + acc20_re;
433  spinorFloat a10_im = acc00_im + acc20_im;
434  spinorFloat a20_re = -acc10_re + acc30_re;
435  spinorFloat a20_im = -acc10_im + acc30_im;
436  spinorFloat a30_re = acc00_re - acc20_re;
437  spinorFloat a30_im = acc00_im - acc20_im;
438 
439  acc00_re = a00_re; acc00_im = a00_im;
440  acc10_re = a10_re; acc10_im = a10_im;
441  acc20_re = a20_re; acc20_im = a20_im;
442  acc30_re = a30_re; acc30_im = a30_im;
443  }
444 
445  {
446  spinorFloat a01_re = -acc11_re - acc31_re;
447  spinorFloat a01_im = -acc11_im - acc31_im;
448  spinorFloat a11_re = acc01_re + acc21_re;
449  spinorFloat a11_im = acc01_im + acc21_im;
450  spinorFloat a21_re = -acc11_re + acc31_re;
451  spinorFloat a21_im = -acc11_im + acc31_im;
452  spinorFloat a31_re = acc01_re - acc21_re;
453  spinorFloat a31_im = acc01_im - acc21_im;
454 
455  acc01_re = a01_re; acc01_im = a01_im;
456  acc11_re = a11_re; acc11_im = a11_im;
457  acc21_re = a21_re; acc21_im = a21_im;
458  acc31_re = a31_re; acc31_im = a31_im;
459  }
460 
461  {
462  spinorFloat a02_re = -acc12_re - acc32_re;
463  spinorFloat a02_im = -acc12_im - acc32_im;
464  spinorFloat a12_re = acc02_re + acc22_re;
465  spinorFloat a12_im = acc02_im + acc22_im;
466  spinorFloat a22_re = -acc12_re + acc32_re;
467  spinorFloat a22_im = -acc12_im + acc32_im;
468  spinorFloat a32_re = acc02_re - acc22_re;
469  spinorFloat a32_im = acc02_im - acc22_im;
470 
471  acc02_re = a02_re; acc02_im = a02_im;
472  acc12_re = a12_re; acc12_im = a12_im;
473  acc22_re = a22_re; acc22_im = a22_im;
474  acc32_re = a32_re; acc32_im = a32_im;
475  }
476 
477  // apply first chiral block
478  {
480 
481  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
482  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
483  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
484  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
485  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
486  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
487 
488  a00_re += c00_00_re * acc00_re;
489  a00_im += c00_00_re * acc00_im;
490  a00_re += c00_01_re * acc01_re;
491  a00_re -= c00_01_im * acc01_im;
492  a00_im += c00_01_re * acc01_im;
493  a00_im += c00_01_im * acc01_re;
494  a00_re += c00_02_re * acc02_re;
495  a00_re -= c00_02_im * acc02_im;
496  a00_im += c00_02_re * acc02_im;
497  a00_im += c00_02_im * acc02_re;
498  a00_re += c00_10_re * acc10_re;
499  a00_re -= c00_10_im * acc10_im;
500  a00_im += c00_10_re * acc10_im;
501  a00_im += c00_10_im * acc10_re;
502  a00_re += c00_11_re * acc11_re;
503  a00_re -= c00_11_im * acc11_im;
504  a00_im += c00_11_re * acc11_im;
505  a00_im += c00_11_im * acc11_re;
506  a00_re += c00_12_re * acc12_re;
507  a00_re -= c00_12_im * acc12_im;
508  a00_im += c00_12_re * acc12_im;
509  a00_im += c00_12_im * acc12_re;
510 
511  a01_re += c01_00_re * acc00_re;
512  a01_re -= c01_00_im * acc00_im;
513  a01_im += c01_00_re * acc00_im;
514  a01_im += c01_00_im * acc00_re;
515  a01_re += c01_01_re * acc01_re;
516  a01_im += c01_01_re * acc01_im;
517  a01_re += c01_02_re * acc02_re;
518  a01_re -= c01_02_im * acc02_im;
519  a01_im += c01_02_re * acc02_im;
520  a01_im += c01_02_im * acc02_re;
521  a01_re += c01_10_re * acc10_re;
522  a01_re -= c01_10_im * acc10_im;
523  a01_im += c01_10_re * acc10_im;
524  a01_im += c01_10_im * acc10_re;
525  a01_re += c01_11_re * acc11_re;
526  a01_re -= c01_11_im * acc11_im;
527  a01_im += c01_11_re * acc11_im;
528  a01_im += c01_11_im * acc11_re;
529  a01_re += c01_12_re * acc12_re;
530  a01_re -= c01_12_im * acc12_im;
531  a01_im += c01_12_re * acc12_im;
532  a01_im += c01_12_im * acc12_re;
533 
534  a02_re += c02_00_re * acc00_re;
535  a02_re -= c02_00_im * acc00_im;
536  a02_im += c02_00_re * acc00_im;
537  a02_im += c02_00_im * acc00_re;
538  a02_re += c02_01_re * acc01_re;
539  a02_re -= c02_01_im * acc01_im;
540  a02_im += c02_01_re * acc01_im;
541  a02_im += c02_01_im * acc01_re;
542  a02_re += c02_02_re * acc02_re;
543  a02_im += c02_02_re * acc02_im;
544  a02_re += c02_10_re * acc10_re;
545  a02_re -= c02_10_im * acc10_im;
546  a02_im += c02_10_re * acc10_im;
547  a02_im += c02_10_im * acc10_re;
548  a02_re += c02_11_re * acc11_re;
549  a02_re -= c02_11_im * acc11_im;
550  a02_im += c02_11_re * acc11_im;
551  a02_im += c02_11_im * acc11_re;
552  a02_re += c02_12_re * acc12_re;
553  a02_re -= c02_12_im * acc12_im;
554  a02_im += c02_12_re * acc12_im;
555  a02_im += c02_12_im * acc12_re;
556 
557  a10_re += c10_00_re * acc00_re;
558  a10_re -= c10_00_im * acc00_im;
559  a10_im += c10_00_re * acc00_im;
560  a10_im += c10_00_im * acc00_re;
561  a10_re += c10_01_re * acc01_re;
562  a10_re -= c10_01_im * acc01_im;
563  a10_im += c10_01_re * acc01_im;
564  a10_im += c10_01_im * acc01_re;
565  a10_re += c10_02_re * acc02_re;
566  a10_re -= c10_02_im * acc02_im;
567  a10_im += c10_02_re * acc02_im;
568  a10_im += c10_02_im * acc02_re;
569  a10_re += c10_10_re * acc10_re;
570  a10_im += c10_10_re * acc10_im;
571  a10_re += c10_11_re * acc11_re;
572  a10_re -= c10_11_im * acc11_im;
573  a10_im += c10_11_re * acc11_im;
574  a10_im += c10_11_im * acc11_re;
575  a10_re += c10_12_re * acc12_re;
576  a10_re -= c10_12_im * acc12_im;
577  a10_im += c10_12_re * acc12_im;
578  a10_im += c10_12_im * acc12_re;
579 
580  a11_re += c11_00_re * acc00_re;
581  a11_re -= c11_00_im * acc00_im;
582  a11_im += c11_00_re * acc00_im;
583  a11_im += c11_00_im * acc00_re;
584  a11_re += c11_01_re * acc01_re;
585  a11_re -= c11_01_im * acc01_im;
586  a11_im += c11_01_re * acc01_im;
587  a11_im += c11_01_im * acc01_re;
588  a11_re += c11_02_re * acc02_re;
589  a11_re -= c11_02_im * acc02_im;
590  a11_im += c11_02_re * acc02_im;
591  a11_im += c11_02_im * acc02_re;
592  a11_re += c11_10_re * acc10_re;
593  a11_re -= c11_10_im * acc10_im;
594  a11_im += c11_10_re * acc10_im;
595  a11_im += c11_10_im * acc10_re;
596  a11_re += c11_11_re * acc11_re;
597  a11_im += c11_11_re * acc11_im;
598  a11_re += c11_12_re * acc12_re;
599  a11_re -= c11_12_im * acc12_im;
600  a11_im += c11_12_re * acc12_im;
601  a11_im += c11_12_im * acc12_re;
602 
603  a12_re += c12_00_re * acc00_re;
604  a12_re -= c12_00_im * acc00_im;
605  a12_im += c12_00_re * acc00_im;
606  a12_im += c12_00_im * acc00_re;
607  a12_re += c12_01_re * acc01_re;
608  a12_re -= c12_01_im * acc01_im;
609  a12_im += c12_01_re * acc01_im;
610  a12_im += c12_01_im * acc01_re;
611  a12_re += c12_02_re * acc02_re;
612  a12_re -= c12_02_im * acc02_im;
613  a12_im += c12_02_re * acc02_im;
614  a12_im += c12_02_im * acc02_re;
615  a12_re += c12_10_re * acc10_re;
616  a12_re -= c12_10_im * acc10_im;
617  a12_im += c12_10_re * acc10_im;
618  a12_im += c12_10_im * acc10_re;
619  a12_re += c12_11_re * acc11_re;
620  a12_re -= c12_11_im * acc11_im;
621  a12_im += c12_11_re * acc11_im;
622  a12_im += c12_11_im * acc11_re;
623  a12_re += c12_12_re * acc12_re;
624  a12_im += c12_12_re * acc12_im;
625 
626  acc00_re = a00_re; acc00_im = a00_im;
627  acc01_re = a01_re; acc01_im = a01_im;
628  acc02_re = a02_re; acc02_im = a02_im;
629  acc10_re = a10_re; acc10_im = a10_im;
630  acc11_re = a11_re; acc11_im = a11_im;
631  acc12_re = a12_re; acc12_im = a12_im;
632 
633  }
634 
635  // apply second chiral block
636  {
638 
639  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
640  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
641  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
642  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
643  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
644  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
645 
646  a20_re += c20_20_re * acc20_re;
647  a20_im += c20_20_re * acc20_im;
648  a20_re += c20_21_re * acc21_re;
649  a20_re -= c20_21_im * acc21_im;
650  a20_im += c20_21_re * acc21_im;
651  a20_im += c20_21_im * acc21_re;
652  a20_re += c20_22_re * acc22_re;
653  a20_re -= c20_22_im * acc22_im;
654  a20_im += c20_22_re * acc22_im;
655  a20_im += c20_22_im * acc22_re;
656  a20_re += c20_30_re * acc30_re;
657  a20_re -= c20_30_im * acc30_im;
658  a20_im += c20_30_re * acc30_im;
659  a20_im += c20_30_im * acc30_re;
660  a20_re += c20_31_re * acc31_re;
661  a20_re -= c20_31_im * acc31_im;
662  a20_im += c20_31_re * acc31_im;
663  a20_im += c20_31_im * acc31_re;
664  a20_re += c20_32_re * acc32_re;
665  a20_re -= c20_32_im * acc32_im;
666  a20_im += c20_32_re * acc32_im;
667  a20_im += c20_32_im * acc32_re;
668 
669  a21_re += c21_20_re * acc20_re;
670  a21_re -= c21_20_im * acc20_im;
671  a21_im += c21_20_re * acc20_im;
672  a21_im += c21_20_im * acc20_re;
673  a21_re += c21_21_re * acc21_re;
674  a21_im += c21_21_re * acc21_im;
675  a21_re += c21_22_re * acc22_re;
676  a21_re -= c21_22_im * acc22_im;
677  a21_im += c21_22_re * acc22_im;
678  a21_im += c21_22_im * acc22_re;
679  a21_re += c21_30_re * acc30_re;
680  a21_re -= c21_30_im * acc30_im;
681  a21_im += c21_30_re * acc30_im;
682  a21_im += c21_30_im * acc30_re;
683  a21_re += c21_31_re * acc31_re;
684  a21_re -= c21_31_im * acc31_im;
685  a21_im += c21_31_re * acc31_im;
686  a21_im += c21_31_im * acc31_re;
687  a21_re += c21_32_re * acc32_re;
688  a21_re -= c21_32_im * acc32_im;
689  a21_im += c21_32_re * acc32_im;
690  a21_im += c21_32_im * acc32_re;
691 
692  a22_re += c22_20_re * acc20_re;
693  a22_re -= c22_20_im * acc20_im;
694  a22_im += c22_20_re * acc20_im;
695  a22_im += c22_20_im * acc20_re;
696  a22_re += c22_21_re * acc21_re;
697  a22_re -= c22_21_im * acc21_im;
698  a22_im += c22_21_re * acc21_im;
699  a22_im += c22_21_im * acc21_re;
700  a22_re += c22_22_re * acc22_re;
701  a22_im += c22_22_re * acc22_im;
702  a22_re += c22_30_re * acc30_re;
703  a22_re -= c22_30_im * acc30_im;
704  a22_im += c22_30_re * acc30_im;
705  a22_im += c22_30_im * acc30_re;
706  a22_re += c22_31_re * acc31_re;
707  a22_re -= c22_31_im * acc31_im;
708  a22_im += c22_31_re * acc31_im;
709  a22_im += c22_31_im * acc31_re;
710  a22_re += c22_32_re * acc32_re;
711  a22_re -= c22_32_im * acc32_im;
712  a22_im += c22_32_re * acc32_im;
713  a22_im += c22_32_im * acc32_re;
714 
715  a30_re += c30_20_re * acc20_re;
716  a30_re -= c30_20_im * acc20_im;
717  a30_im += c30_20_re * acc20_im;
718  a30_im += c30_20_im * acc20_re;
719  a30_re += c30_21_re * acc21_re;
720  a30_re -= c30_21_im * acc21_im;
721  a30_im += c30_21_re * acc21_im;
722  a30_im += c30_21_im * acc21_re;
723  a30_re += c30_22_re * acc22_re;
724  a30_re -= c30_22_im * acc22_im;
725  a30_im += c30_22_re * acc22_im;
726  a30_im += c30_22_im * acc22_re;
727  a30_re += c30_30_re * acc30_re;
728  a30_im += c30_30_re * acc30_im;
729  a30_re += c30_31_re * acc31_re;
730  a30_re -= c30_31_im * acc31_im;
731  a30_im += c30_31_re * acc31_im;
732  a30_im += c30_31_im * acc31_re;
733  a30_re += c30_32_re * acc32_re;
734  a30_re -= c30_32_im * acc32_im;
735  a30_im += c30_32_re * acc32_im;
736  a30_im += c30_32_im * acc32_re;
737 
738  a31_re += c31_20_re * acc20_re;
739  a31_re -= c31_20_im * acc20_im;
740  a31_im += c31_20_re * acc20_im;
741  a31_im += c31_20_im * acc20_re;
742  a31_re += c31_21_re * acc21_re;
743  a31_re -= c31_21_im * acc21_im;
744  a31_im += c31_21_re * acc21_im;
745  a31_im += c31_21_im * acc21_re;
746  a31_re += c31_22_re * acc22_re;
747  a31_re -= c31_22_im * acc22_im;
748  a31_im += c31_22_re * acc22_im;
749  a31_im += c31_22_im * acc22_re;
750  a31_re += c31_30_re * acc30_re;
751  a31_re -= c31_30_im * acc30_im;
752  a31_im += c31_30_re * acc30_im;
753  a31_im += c31_30_im * acc30_re;
754  a31_re += c31_31_re * acc31_re;
755  a31_im += c31_31_re * acc31_im;
756  a31_re += c31_32_re * acc32_re;
757  a31_re -= c31_32_im * acc32_im;
758  a31_im += c31_32_re * acc32_im;
759  a31_im += c31_32_im * acc32_re;
760 
761  a32_re += c32_20_re * acc20_re;
762  a32_re -= c32_20_im * acc20_im;
763  a32_im += c32_20_re * acc20_im;
764  a32_im += c32_20_im * acc20_re;
765  a32_re += c32_21_re * acc21_re;
766  a32_re -= c32_21_im * acc21_im;
767  a32_im += c32_21_re * acc21_im;
768  a32_im += c32_21_im * acc21_re;
769  a32_re += c32_22_re * acc22_re;
770  a32_re -= c32_22_im * acc22_im;
771  a32_im += c32_22_re * acc22_im;
772  a32_im += c32_22_im * acc22_re;
773  a32_re += c32_30_re * acc30_re;
774  a32_re -= c32_30_im * acc30_im;
775  a32_im += c32_30_re * acc30_im;
776  a32_im += c32_30_im * acc30_re;
777  a32_re += c32_31_re * acc31_re;
778  a32_re -= c32_31_im * acc31_im;
779  a32_im += c32_31_re * acc31_im;
780  a32_im += c32_31_im * acc31_re;
781  a32_re += c32_32_re * acc32_re;
782  a32_im += c32_32_re * acc32_im;
783 
784  acc20_re = a20_re; acc20_im = a20_im;
785  acc21_re = a21_re; acc21_im = a21_im;
786  acc22_re = a22_re; acc22_im = a22_im;
787  acc30_re = a30_re; acc30_im = a30_im;
788  acc31_re = a31_re; acc31_im = a31_im;
789  acc32_re = a32_re; acc32_im = a32_im;
790 
791  }
792 
793  // change back from chiral basis
794  // (note: required factor of 1/2 is included in clover term normalization)
795  {
796  spinorFloat a00_re = acc10_re + acc30_re;
797  spinorFloat a00_im = acc10_im + acc30_im;
798  spinorFloat a10_re = -acc00_re - acc20_re;
799  spinorFloat a10_im = -acc00_im - acc20_im;
800  spinorFloat a20_re = acc10_re - acc30_re;
801  spinorFloat a20_im = acc10_im - acc30_im;
802  spinorFloat a30_re = -acc00_re + acc20_re;
803  spinorFloat a30_im = -acc00_im + acc20_im;
804 
805  acc00_re = a00_re; acc00_im = a00_im;
806  acc10_re = a10_re; acc10_im = a10_im;
807  acc20_re = a20_re; acc20_im = a20_im;
808  acc30_re = a30_re; acc30_im = a30_im;
809  }
810 
811  {
812  spinorFloat a01_re = acc11_re + acc31_re;
813  spinorFloat a01_im = acc11_im + acc31_im;
814  spinorFloat a11_re = -acc01_re - acc21_re;
815  spinorFloat a11_im = -acc01_im - acc21_im;
816  spinorFloat a21_re = acc11_re - acc31_re;
817  spinorFloat a21_im = acc11_im - acc31_im;
818  spinorFloat a31_re = -acc01_re + acc21_re;
819  spinorFloat a31_im = -acc01_im + acc21_im;
820 
821  acc01_re = a01_re; acc01_im = a01_im;
822  acc11_re = a11_re; acc11_im = a11_im;
823  acc21_re = a21_re; acc21_im = a21_im;
824  acc31_re = a31_re; acc31_im = a31_im;
825  }
826 
827  {
828  spinorFloat a02_re = acc12_re + acc32_re;
829  spinorFloat a02_im = acc12_im + acc32_im;
830  spinorFloat a12_re = -acc02_re - acc22_re;
831  spinorFloat a12_im = -acc02_im - acc22_im;
832  spinorFloat a22_re = acc12_re - acc32_re;
833  spinorFloat a22_im = acc12_im - acc32_im;
834  spinorFloat a32_re = -acc02_re + acc22_re;
835  spinorFloat a32_im = -acc02_im + acc22_im;
836 
837  acc02_re = a02_re; acc02_im = a02_im;
838  acc12_re = a12_re; acc12_im = a12_im;
839  acc22_re = a22_re; acc22_im = a22_im;
840  acc32_re = a32_re; acc32_im = a32_im;
841  }
842 
843 #endif // DSLASH_CLOVER
844 
845  o00_re = acc00_re;
846  o00_im = acc00_im;
847  o01_re = acc01_re;
848  o01_im = acc01_im;
849  o02_re = acc02_re;
850  o02_im = acc02_im;
851  o10_re = acc10_re;
852  o10_im = acc10_im;
853  o11_re = acc11_re;
854  o11_im = acc11_im;
855  o12_re = acc12_re;
856  o12_im = acc12_im;
857  o20_re = acc20_re;
858  o20_im = acc20_im;
859  o21_re = acc21_re;
860  o21_im = acc21_im;
861  o22_re = acc22_re;
862  o22_im = acc22_im;
863  o30_re = acc30_re;
864  o30_im = acc30_im;
865  o31_re = acc31_re;
866  o31_im = acc31_im;
867  o32_re = acc32_re;
868  o32_im = acc32_im;
869 #endif // DSLASH_CLOVER_XPAY
870 
871 #ifdef MULTI_GPU
// Exterior kernel: one thread per boundary-face site; recompute sid from
// the launch geometry rather than from the interior site indexing.
872 } else { // exterior kernel
873 
874  sid = blockIdx.x*blockDim.x + threadIdx.x;
875  if (sid >= param.threads) return;
876 
877  const int face_volume = (param.threads >> 1); // volume of one face
878  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
879  face_idx = sid - face_num*face_volume; // index into the respective face
880 
881  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
882  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
883  //sp_idx = face_idx + param.ghostOffset[dim];
884 
885  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
886 
888 
 // seed the output with the partially-accumulated interior result that was
 // read into the input-spinor registers (presumably by an elided READ just
 // above -- original line 887 is missing from this view; confirm in the
 // generator output)
889  o00_re = i00_re; o00_im = i00_im;
890  o01_re = i01_re; o01_im = i01_im;
891  o02_re = i02_re; o02_im = i02_im;
892  o10_re = i10_re; o10_im = i10_im;
893  o11_re = i11_re; o11_im = i11_im;
894  o12_re = i12_re; o12_im = i12_im;
895  o20_re = i20_re; o20_im = i20_im;
896  o21_re = i21_re; o21_im = i21_im;
897  o22_re = i22_re; o22_im = i22_im;
898  o30_re = i30_re; o30_im = i30_im;
899  o31_re = i31_re; o31_im = i31_im;
900  o32_re = i32_re; o32_im = i32_im;
901 }
902 #endif // MULTI_GPU
903 
904 
905 #ifdef MULTI_GPU
906 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
907  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
908 #endif
909 {
910  // Projector P0-
911  // 1 0 0 -i
912  // 0 1 -i 0
913  // 0 i 1 0
914  // i 0 0 1
915 
916 #ifdef MULTI_GPU
917  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
918  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
919 #if (DD_PREC==2) // half precision
920  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
921 #endif
922 #else
923  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
924 #endif
925 
926  const int ga_idx = sid;
927 
934 
935 #ifdef MULTI_GPU
936  if (kernel_type == INTERIOR_KERNEL) {
937 #endif
938 
939  // read spinor from device memory
940  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
941 
942  // store spinor into shared memory
943  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
944 
945  // project spinor into half spinors
946  a0_re = +i00_re+i30_im;
947  a0_im = +i00_im-i30_re;
948  a1_re = +i01_re+i31_im;
949  a1_im = +i01_im-i31_re;
950  a2_re = +i02_re+i32_im;
951  a2_im = +i02_im-i32_re;
952  b0_re = +i10_re+i20_im;
953  b0_im = +i10_im-i20_re;
954  b1_re = +i11_re+i21_im;
955  b1_im = +i11_im-i21_re;
956  b2_re = +i12_re+i22_im;
957  b2_im = +i12_im-i22_re;
958 
959 #ifdef MULTI_GPU
960  } else {
961 
962  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
963 
964  // read half spinor from device memory
965  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
966 
967  a0_re = i00_re; a0_im = i00_im;
968  a1_re = i01_re; a1_im = i01_im;
969  a2_re = i02_re; a2_im = i02_im;
970  b0_re = i10_re; b0_im = i10_im;
971  b1_re = i11_re; b1_im = i11_im;
972  b2_re = i12_re; b2_im = i12_im;
973 
974  }
975 #endif // MULTI_GPU
976 
977  // read gauge matrix from device memory
978  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
979 
980  // reconstruct gauge matrix
982 
983  // multiply row 0
985  A0_re += g00_re * a0_re;
986  A0_re -= g00_im * a0_im;
987  A0_re += g01_re * a1_re;
988  A0_re -= g01_im * a1_im;
989  A0_re += g02_re * a2_re;
990  A0_re -= g02_im * a2_im;
992  A0_im += g00_re * a0_im;
993  A0_im += g00_im * a0_re;
994  A0_im += g01_re * a1_im;
995  A0_im += g01_im * a1_re;
996  A0_im += g02_re * a2_im;
997  A0_im += g02_im * a2_re;
999  B0_re += g00_re * b0_re;
1000  B0_re -= g00_im * b0_im;
1001  B0_re += g01_re * b1_re;
1002  B0_re -= g01_im * b1_im;
1003  B0_re += g02_re * b2_re;
1004  B0_re -= g02_im * b2_im;
1006  B0_im += g00_re * b0_im;
1007  B0_im += g00_im * b0_re;
1008  B0_im += g01_re * b1_im;
1009  B0_im += g01_im * b1_re;
1010  B0_im += g02_re * b2_im;
1011  B0_im += g02_im * b2_re;
1012 
1013  // multiply row 1
1015  A1_re += g10_re * a0_re;
1016  A1_re -= g10_im * a0_im;
1017  A1_re += g11_re * a1_re;
1018  A1_re -= g11_im * a1_im;
1019  A1_re += g12_re * a2_re;
1020  A1_re -= g12_im * a2_im;
1022  A1_im += g10_re * a0_im;
1023  A1_im += g10_im * a0_re;
1024  A1_im += g11_re * a1_im;
1025  A1_im += g11_im * a1_re;
1026  A1_im += g12_re * a2_im;
1027  A1_im += g12_im * a2_re;
1029  B1_re += g10_re * b0_re;
1030  B1_re -= g10_im * b0_im;
1031  B1_re += g11_re * b1_re;
1032  B1_re -= g11_im * b1_im;
1033  B1_re += g12_re * b2_re;
1034  B1_re -= g12_im * b2_im;
1036  B1_im += g10_re * b0_im;
1037  B1_im += g10_im * b0_re;
1038  B1_im += g11_re * b1_im;
1039  B1_im += g11_im * b1_re;
1040  B1_im += g12_re * b2_im;
1041  B1_im += g12_im * b2_re;
1042 
1043  // multiply row 2
1045  A2_re += g20_re * a0_re;
1046  A2_re -= g20_im * a0_im;
1047  A2_re += g21_re * a1_re;
1048  A2_re -= g21_im * a1_im;
1049  A2_re += g22_re * a2_re;
1050  A2_re -= g22_im * a2_im;
1052  A2_im += g20_re * a0_im;
1053  A2_im += g20_im * a0_re;
1054  A2_im += g21_re * a1_im;
1055  A2_im += g21_im * a1_re;
1056  A2_im += g22_re * a2_im;
1057  A2_im += g22_im * a2_re;
1059  B2_re += g20_re * b0_re;
1060  B2_re -= g20_im * b0_im;
1061  B2_re += g21_re * b1_re;
1062  B2_re -= g21_im * b1_im;
1063  B2_re += g22_re * b2_re;
1064  B2_re -= g22_im * b2_im;
1066  B2_im += g20_re * b0_im;
1067  B2_im += g20_im * b0_re;
1068  B2_im += g21_re * b1_im;
1069  B2_im += g21_im * b1_re;
1070  B2_im += g22_re * b2_im;
1071  B2_im += g22_im * b2_re;
1072 
1073 #ifdef SPINOR_DOUBLE
1074  spinorFloat a = param.a;
1075 #else
1077 #endif
1078  o00_re += a*A0_re;
1079  o00_im += a*A0_im;
1080  o10_re += a*B0_re;
1081  o10_im += a*B0_im;
1082  o20_re -= a*B0_im;
1083  o20_im += a*B0_re;
1084  o30_re -= a*A0_im;
1085  o30_im += a*A0_re;
1086 
1087  o01_re += a*A1_re;
1088  o01_im += a*A1_im;
1089  o11_re += a*B1_re;
1090  o11_im += a*B1_im;
1091  o21_re -= a*B1_im;
1092  o21_im += a*B1_re;
1093  o31_re -= a*A1_im;
1094  o31_im += a*A1_re;
1095 
1096  o02_re += a*A2_re;
1097  o02_im += a*A2_im;
1098  o12_re += a*B2_re;
1099  o12_im += a*B2_im;
1100  o22_re -= a*B2_im;
1101  o22_im += a*B2_re;
1102  o32_re -= a*A2_im;
1103  o32_im += a*A2_re;
1104 
1105 }
1106 
1107 #ifdef MULTI_GPU
1108 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
1109  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
1110 #endif
1111 {
1112  // Projector P0+
1113  // 1 0 0 i
1114  // 0 1 i 0
1115  // 0 -i 1 0
1116  // -i 0 0 1
1117 
1118 #ifdef MULTI_GPU
1119  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
1120  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1121 #if (DD_PREC==2) // half precision
1122  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1123 #endif
1124 #else
1125  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
1126 #endif
1127 
1128 #ifdef MULTI_GPU
1129  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1130 #else
1131  const int ga_idx = sp_idx;
1132 #endif
1133 
1140 
1141 #ifdef MULTI_GPU
1142  if (kernel_type == INTERIOR_KERNEL) {
1143 #endif
1144 
1145  // load spinor from shared memory
1146  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
1147  __syncthreads();
1148  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
1149 
1150  // project spinor into half spinors
1151  a0_re = +i00_re-i30_im;
1152  a0_im = +i00_im+i30_re;
1153  a1_re = +i01_re-i31_im;
1154  a1_im = +i01_im+i31_re;
1155  a2_re = +i02_re-i32_im;
1156  a2_im = +i02_im+i32_re;
1157  b0_re = +i10_re-i20_im;
1158  b0_im = +i10_im+i20_re;
1159  b1_re = +i11_re-i21_im;
1160  b1_im = +i11_im+i21_re;
1161  b2_re = +i12_re-i22_im;
1162  b2_im = +i12_im+i22_re;
1163 
1164 #ifdef MULTI_GPU
1165  } else {
1166 
1167  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1168 
1169  // read half spinor from device memory
1170  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
1171 
1172  a0_re = i00_re; a0_im = i00_im;
1173  a1_re = i01_re; a1_im = i01_im;
1174  a2_re = i02_re; a2_im = i02_im;
1175  b0_re = i10_re; b0_im = i10_im;
1176  b1_re = i11_re; b1_im = i11_im;
1177  b2_re = i12_re; b2_im = i12_im;
1178 
1179  }
1180 #endif // MULTI_GPU
1181 
1182  // read gauge matrix from device memory
1183  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
1184 
1185  // reconstruct gauge matrix
1187 
1188  // multiply row 0
1189  spinorFloat A0_re = 0;
1190  A0_re += gT00_re * a0_re;
1191  A0_re -= gT00_im * a0_im;
1192  A0_re += gT01_re * a1_re;
1193  A0_re -= gT01_im * a1_im;
1194  A0_re += gT02_re * a2_re;
1195  A0_re -= gT02_im * a2_im;
1196  spinorFloat A0_im = 0;
1197  A0_im += gT00_re * a0_im;
1198  A0_im += gT00_im * a0_re;
1199  A0_im += gT01_re * a1_im;
1200  A0_im += gT01_im * a1_re;
1201  A0_im += gT02_re * a2_im;
1202  A0_im += gT02_im * a2_re;
1203  spinorFloat B0_re = 0;
1204  B0_re += gT00_re * b0_re;
1205  B0_re -= gT00_im * b0_im;
1206  B0_re += gT01_re * b1_re;
1207  B0_re -= gT01_im * b1_im;
1208  B0_re += gT02_re * b2_re;
1209  B0_re -= gT02_im * b2_im;
1210  spinorFloat B0_im = 0;
1211  B0_im += gT00_re * b0_im;
1212  B0_im += gT00_im * b0_re;
1213  B0_im += gT01_re * b1_im;
1214  B0_im += gT01_im * b1_re;
1215  B0_im += gT02_re * b2_im;
1216  B0_im += gT02_im * b2_re;
1217 
1218  // multiply row 1
1219  spinorFloat A1_re = 0;
1220  A1_re += gT10_re * a0_re;
1221  A1_re -= gT10_im * a0_im;
1222  A1_re += gT11_re * a1_re;
1223  A1_re -= gT11_im * a1_im;
1224  A1_re += gT12_re * a2_re;
1225  A1_re -= gT12_im * a2_im;
1226  spinorFloat A1_im = 0;
1227  A1_im += gT10_re * a0_im;
1228  A1_im += gT10_im * a0_re;
1229  A1_im += gT11_re * a1_im;
1230  A1_im += gT11_im * a1_re;
1231  A1_im += gT12_re * a2_im;
1232  A1_im += gT12_im * a2_re;
1233  spinorFloat B1_re = 0;
1234  B1_re += gT10_re * b0_re;
1235  B1_re -= gT10_im * b0_im;
1236  B1_re += gT11_re * b1_re;
1237  B1_re -= gT11_im * b1_im;
1238  B1_re += gT12_re * b2_re;
1239  B1_re -= gT12_im * b2_im;
1240  spinorFloat B1_im = 0;
1241  B1_im += gT10_re * b0_im;
1242  B1_im += gT10_im * b0_re;
1243  B1_im += gT11_re * b1_im;
1244  B1_im += gT11_im * b1_re;
1245  B1_im += gT12_re * b2_im;
1246  B1_im += gT12_im * b2_re;
1247 
1248  // multiply row 2
1249  spinorFloat A2_re = 0;
1250  A2_re += gT20_re * a0_re;
1251  A2_re -= gT20_im * a0_im;
1252  A2_re += gT21_re * a1_re;
1253  A2_re -= gT21_im * a1_im;
1254  A2_re += gT22_re * a2_re;
1255  A2_re -= gT22_im * a2_im;
1256  spinorFloat A2_im = 0;
1257  A2_im += gT20_re * a0_im;
1258  A2_im += gT20_im * a0_re;
1259  A2_im += gT21_re * a1_im;
1260  A2_im += gT21_im * a1_re;
1261  A2_im += gT22_re * a2_im;
1262  A2_im += gT22_im * a2_re;
1263  spinorFloat B2_re = 0;
1264  B2_re += gT20_re * b0_re;
1265  B2_re -= gT20_im * b0_im;
1266  B2_re += gT21_re * b1_re;
1267  B2_re -= gT21_im * b1_im;
1268  B2_re += gT22_re * b2_re;
1269  B2_re -= gT22_im * b2_im;
1270  spinorFloat B2_im = 0;
1271  B2_im += gT20_re * b0_im;
1272  B2_im += gT20_im * b0_re;
1273  B2_im += gT21_re * b1_im;
1274  B2_im += gT21_im * b1_re;
1275  B2_im += gT22_re * b2_im;
1276  B2_im += gT22_im * b2_re;
1277 
1278 #ifdef SPINOR_DOUBLE
1279  spinorFloat a = param.a;
1280 #else
1281  spinorFloat a = param.a_f;
1282 #endif
1283  o00_re += a*A0_re;
1284  o00_im += a*A0_im;
1285  o10_re += a*B0_re;
1286  o10_im += a*B0_im;
1287  o20_re += a*B0_im;
1288  o20_im -= a*B0_re;
1289  o30_re += a*A0_im;
1290  o30_im -= a*A0_re;
1291 
1292  o01_re += a*A1_re;
1293  o01_im += a*A1_im;
1294  o11_re += a*B1_re;
1295  o11_im += a*B1_im;
1296  o21_re += a*B1_im;
1297  o21_im -= a*B1_re;
1298  o31_re += a*A1_im;
1299  o31_im -= a*A1_re;
1300 
1301  o02_re += a*A2_re;
1302  o02_im += a*A2_im;
1303  o12_re += a*B2_re;
1304  o12_im += a*B2_im;
1305  o22_re += a*B2_im;
1306  o22_im -= a*B2_re;
1307  o32_re += a*A2_im;
1308  o32_im -= a*A2_re;
1309 
1310 }
1311 
1312 #ifdef MULTI_GPU
1313 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
1314  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
1315 #endif
1316 {
1317  // Projector P1-
1318  // 1 0 0 -1
1319  // 0 1 1 0
1320  // 0 1 1 0
1321  // -1 0 0 1
1322 
1323 #ifdef MULTI_GPU
1324  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
1325  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1326 #if (DD_PREC==2) // half precision
1327  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1328 #endif
1329 #else
1330  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
1331 #endif
1332 
1333  const int ga_idx = sid;
1334 
1341 
1342 #ifdef MULTI_GPU
1343  if (kernel_type == INTERIOR_KERNEL) {
1344 #endif
1345 
1346  if (threadIdx.y == blockDim.y-1 && blockDim.y < param.dc.X[1] ) {
1347  // read spinor from device memory
1348  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1349 
1350  // project spinor into half spinors
1351  a0_re = +i00_re-i30_re;
1352  a0_im = +i00_im-i30_im;
1353  a1_re = +i01_re-i31_re;
1354  a1_im = +i01_im-i31_im;
1355  a2_re = +i02_re-i32_re;
1356  a2_im = +i02_im-i32_im;
1357  b0_re = +i10_re+i20_re;
1358  b0_im = +i10_im+i20_im;
1359  b1_re = +i11_re+i21_re;
1360  b1_im = +i11_im+i21_im;
1361  b2_re = +i12_re+i22_re;
1362  b2_im = +i12_im+i22_im;
1363  } else {
1364  // load spinor from shared memory
1365  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1366  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
1367  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1368 
1369  // project spinor into half spinors
1370  a0_re = +i00_re-i30_re;
1371  a0_im = +i00_im-i30_im;
1372  a1_re = +i01_re-i31_re;
1373  a1_im = +i01_im-i31_im;
1374  a2_re = +i02_re-i32_re;
1375  a2_im = +i02_im-i32_im;
1376  b0_re = +i10_re+i20_re;
1377  b0_im = +i10_im+i20_im;
1378  b1_re = +i11_re+i21_re;
1379  b1_im = +i11_im+i21_im;
1380  b2_re = +i12_re+i22_re;
1381  b2_im = +i12_im+i22_im;
1382  }
1383 
1384 #ifdef MULTI_GPU
1385  } else {
1386 
1387  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1388 
1389  // read half spinor from device memory
1390  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
1391 
1392  a0_re = i00_re; a0_im = i00_im;
1393  a1_re = i01_re; a1_im = i01_im;
1394  a2_re = i02_re; a2_im = i02_im;
1395  b0_re = i10_re; b0_im = i10_im;
1396  b1_re = i11_re; b1_im = i11_im;
1397  b2_re = i12_re; b2_im = i12_im;
1398 
1399  }
1400 #endif // MULTI_GPU
1401 
1402  // read gauge matrix from device memory
1403  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
1404 
1405  // reconstruct gauge matrix
1407 
1408  // multiply row 0
1409  spinorFloat A0_re = 0;
1410  A0_re += g00_re * a0_re;
1411  A0_re -= g00_im * a0_im;
1412  A0_re += g01_re * a1_re;
1413  A0_re -= g01_im * a1_im;
1414  A0_re += g02_re * a2_re;
1415  A0_re -= g02_im * a2_im;
1416  spinorFloat A0_im = 0;
1417  A0_im += g00_re * a0_im;
1418  A0_im += g00_im * a0_re;
1419  A0_im += g01_re * a1_im;
1420  A0_im += g01_im * a1_re;
1421  A0_im += g02_re * a2_im;
1422  A0_im += g02_im * a2_re;
1423  spinorFloat B0_re = 0;
1424  B0_re += g00_re * b0_re;
1425  B0_re -= g00_im * b0_im;
1426  B0_re += g01_re * b1_re;
1427  B0_re -= g01_im * b1_im;
1428  B0_re += g02_re * b2_re;
1429  B0_re -= g02_im * b2_im;
1430  spinorFloat B0_im = 0;
1431  B0_im += g00_re * b0_im;
1432  B0_im += g00_im * b0_re;
1433  B0_im += g01_re * b1_im;
1434  B0_im += g01_im * b1_re;
1435  B0_im += g02_re * b2_im;
1436  B0_im += g02_im * b2_re;
1437 
1438  // multiply row 1
1439  spinorFloat A1_re = 0;
1440  A1_re += g10_re * a0_re;
1441  A1_re -= g10_im * a0_im;
1442  A1_re += g11_re * a1_re;
1443  A1_re -= g11_im * a1_im;
1444  A1_re += g12_re * a2_re;
1445  A1_re -= g12_im * a2_im;
1446  spinorFloat A1_im = 0;
1447  A1_im += g10_re * a0_im;
1448  A1_im += g10_im * a0_re;
1449  A1_im += g11_re * a1_im;
1450  A1_im += g11_im * a1_re;
1451  A1_im += g12_re * a2_im;
1452  A1_im += g12_im * a2_re;
1453  spinorFloat B1_re = 0;
1454  B1_re += g10_re * b0_re;
1455  B1_re -= g10_im * b0_im;
1456  B1_re += g11_re * b1_re;
1457  B1_re -= g11_im * b1_im;
1458  B1_re += g12_re * b2_re;
1459  B1_re -= g12_im * b2_im;
1460  spinorFloat B1_im = 0;
1461  B1_im += g10_re * b0_im;
1462  B1_im += g10_im * b0_re;
1463  B1_im += g11_re * b1_im;
1464  B1_im += g11_im * b1_re;
1465  B1_im += g12_re * b2_im;
1466  B1_im += g12_im * b2_re;
1467 
1468  // multiply row 2
1469  spinorFloat A2_re = 0;
1470  A2_re += g20_re * a0_re;
1471  A2_re -= g20_im * a0_im;
1472  A2_re += g21_re * a1_re;
1473  A2_re -= g21_im * a1_im;
1474  A2_re += g22_re * a2_re;
1475  A2_re -= g22_im * a2_im;
1476  spinorFloat A2_im = 0;
1477  A2_im += g20_re * a0_im;
1478  A2_im += g20_im * a0_re;
1479  A2_im += g21_re * a1_im;
1480  A2_im += g21_im * a1_re;
1481  A2_im += g22_re * a2_im;
1482  A2_im += g22_im * a2_re;
1483  spinorFloat B2_re = 0;
1484  B2_re += g20_re * b0_re;
1485  B2_re -= g20_im * b0_im;
1486  B2_re += g21_re * b1_re;
1487  B2_re -= g21_im * b1_im;
1488  B2_re += g22_re * b2_re;
1489  B2_re -= g22_im * b2_im;
1490  spinorFloat B2_im = 0;
1491  B2_im += g20_re * b0_im;
1492  B2_im += g20_im * b0_re;
1493  B2_im += g21_re * b1_im;
1494  B2_im += g21_im * b1_re;
1495  B2_im += g22_re * b2_im;
1496  B2_im += g22_im * b2_re;
1497 
1498 #ifdef SPINOR_DOUBLE
1499  spinorFloat a = param.a;
1500 #else
1501  spinorFloat a = param.a_f;
1502 #endif
1503  o00_re += a*A0_re;
1504  o00_im += a*A0_im;
1505  o10_re += a*B0_re;
1506  o10_im += a*B0_im;
1507  o20_re += a*B0_re;
1508  o20_im += a*B0_im;
1509  o30_re -= a*A0_re;
1510  o30_im -= a*A0_im;
1511 
1512  o01_re += a*A1_re;
1513  o01_im += a*A1_im;
1514  o11_re += a*B1_re;
1515  o11_im += a*B1_im;
1516  o21_re += a*B1_re;
1517  o21_im += a*B1_im;
1518  o31_re -= a*A1_re;
1519  o31_im -= a*A1_im;
1520 
1521  o02_re += a*A2_re;
1522  o02_im += a*A2_im;
1523  o12_re += a*B2_re;
1524  o12_im += a*B2_im;
1525  o22_re += a*B2_re;
1526  o22_im += a*B2_im;
1527  o32_re -= a*A2_re;
1528  o32_im -= a*A2_im;
1529 
1530 }
1531 
1532 #ifdef MULTI_GPU
1533 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1534  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1535 #endif
1536 {
1537  // Projector P1+
1538  // 1 0 0 1
1539  // 0 1 -1 0
1540  // 0 -1 1 0
1541  // 1 0 0 1
1542 
1543 #ifdef MULTI_GPU
1544  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1545  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1546 #if (DD_PREC==2) // half precision
1547  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1548 #endif
1549 #else
1550  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1551 #endif
1552 
1553 #ifdef MULTI_GPU
1554  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1555 #else
1556  const int ga_idx = sp_idx;
1557 #endif
1558 
1565 
1566 #ifdef MULTI_GPU
1567  if (kernel_type == INTERIOR_KERNEL) {
1568 #endif
1569 
1570  if (threadIdx.y == 0 && blockDim.y < param.dc.X[1]) {
1571  // read spinor from device memory
1572  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1573 
1574  // project spinor into half spinors
1575  a0_re = +i00_re+i30_re;
1576  a0_im = +i00_im+i30_im;
1577  a1_re = +i01_re+i31_re;
1578  a1_im = +i01_im+i31_im;
1579  a2_re = +i02_re+i32_re;
1580  a2_im = +i02_im+i32_im;
1581  b0_re = +i10_re-i20_re;
1582  b0_im = +i10_im-i20_im;
1583  b1_re = +i11_re-i21_re;
1584  b1_im = +i11_im-i21_im;
1585  b2_re = +i12_re-i22_re;
1586  b2_im = +i12_im-i22_im;
1587  } else {
1588  // load spinor from shared memory
1589  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1590  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1591  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1592 
1593  // project spinor into half spinors
1594  a0_re = +i00_re+i30_re;
1595  a0_im = +i00_im+i30_im;
1596  a1_re = +i01_re+i31_re;
1597  a1_im = +i01_im+i31_im;
1598  a2_re = +i02_re+i32_re;
1599  a2_im = +i02_im+i32_im;
1600  b0_re = +i10_re-i20_re;
1601  b0_im = +i10_im-i20_im;
1602  b1_re = +i11_re-i21_re;
1603  b1_im = +i11_im-i21_im;
1604  b2_re = +i12_re-i22_re;
1605  b2_im = +i12_im-i22_im;
1606  }
1607 
1608 #ifdef MULTI_GPU
1609  } else {
1610 
1611  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1612 
1613  // read half spinor from device memory
1614  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1615 
1616  a0_re = i00_re; a0_im = i00_im;
1617  a1_re = i01_re; a1_im = i01_im;
1618  a2_re = i02_re; a2_im = i02_im;
1619  b0_re = i10_re; b0_im = i10_im;
1620  b1_re = i11_re; b1_im = i11_im;
1621  b2_re = i12_re; b2_im = i12_im;
1622 
1623  }
1624 #endif // MULTI_GPU
1625 
1626  // read gauge matrix from device memory
1627  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1628 
1629  // reconstruct gauge matrix
1631 
1632  // multiply row 0
1633  spinorFloat A0_re = 0;
1634  A0_re += gT00_re * a0_re;
1635  A0_re -= gT00_im * a0_im;
1636  A0_re += gT01_re * a1_re;
1637  A0_re -= gT01_im * a1_im;
1638  A0_re += gT02_re * a2_re;
1639  A0_re -= gT02_im * a2_im;
1640  spinorFloat A0_im = 0;
1641  A0_im += gT00_re * a0_im;
1642  A0_im += gT00_im * a0_re;
1643  A0_im += gT01_re * a1_im;
1644  A0_im += gT01_im * a1_re;
1645  A0_im += gT02_re * a2_im;
1646  A0_im += gT02_im * a2_re;
1647  spinorFloat B0_re = 0;
1648  B0_re += gT00_re * b0_re;
1649  B0_re -= gT00_im * b0_im;
1650  B0_re += gT01_re * b1_re;
1651  B0_re -= gT01_im * b1_im;
1652  B0_re += gT02_re * b2_re;
1653  B0_re -= gT02_im * b2_im;
1654  spinorFloat B0_im = 0;
1655  B0_im += gT00_re * b0_im;
1656  B0_im += gT00_im * b0_re;
1657  B0_im += gT01_re * b1_im;
1658  B0_im += gT01_im * b1_re;
1659  B0_im += gT02_re * b2_im;
1660  B0_im += gT02_im * b2_re;
1661 
1662  // multiply row 1
1663  spinorFloat A1_re = 0;
1664  A1_re += gT10_re * a0_re;
1665  A1_re -= gT10_im * a0_im;
1666  A1_re += gT11_re * a1_re;
1667  A1_re -= gT11_im * a1_im;
1668  A1_re += gT12_re * a2_re;
1669  A1_re -= gT12_im * a2_im;
1670  spinorFloat A1_im = 0;
1671  A1_im += gT10_re * a0_im;
1672  A1_im += gT10_im * a0_re;
1673  A1_im += gT11_re * a1_im;
1674  A1_im += gT11_im * a1_re;
1675  A1_im += gT12_re * a2_im;
1676  A1_im += gT12_im * a2_re;
1677  spinorFloat B1_re = 0;
1678  B1_re += gT10_re * b0_re;
1679  B1_re -= gT10_im * b0_im;
1680  B1_re += gT11_re * b1_re;
1681  B1_re -= gT11_im * b1_im;
1682  B1_re += gT12_re * b2_re;
1683  B1_re -= gT12_im * b2_im;
1684  spinorFloat B1_im = 0;
1685  B1_im += gT10_re * b0_im;
1686  B1_im += gT10_im * b0_re;
1687  B1_im += gT11_re * b1_im;
1688  B1_im += gT11_im * b1_re;
1689  B1_im += gT12_re * b2_im;
1690  B1_im += gT12_im * b2_re;
1691 
1692  // multiply row 2
1693  spinorFloat A2_re = 0;
1694  A2_re += gT20_re * a0_re;
1695  A2_re -= gT20_im * a0_im;
1696  A2_re += gT21_re * a1_re;
1697  A2_re -= gT21_im * a1_im;
1698  A2_re += gT22_re * a2_re;
1699  A2_re -= gT22_im * a2_im;
1700  spinorFloat A2_im = 0;
1701  A2_im += gT20_re * a0_im;
1702  A2_im += gT20_im * a0_re;
1703  A2_im += gT21_re * a1_im;
1704  A2_im += gT21_im * a1_re;
1705  A2_im += gT22_re * a2_im;
1706  A2_im += gT22_im * a2_re;
1707  spinorFloat B2_re = 0;
1708  B2_re += gT20_re * b0_re;
1709  B2_re -= gT20_im * b0_im;
1710  B2_re += gT21_re * b1_re;
1711  B2_re -= gT21_im * b1_im;
1712  B2_re += gT22_re * b2_re;
1713  B2_re -= gT22_im * b2_im;
1714  spinorFloat B2_im = 0;
1715  B2_im += gT20_re * b0_im;
1716  B2_im += gT20_im * b0_re;
1717  B2_im += gT21_re * b1_im;
1718  B2_im += gT21_im * b1_re;
1719  B2_im += gT22_re * b2_im;
1720  B2_im += gT22_im * b2_re;
1721 
1722 #ifdef SPINOR_DOUBLE
1723  spinorFloat a = param.a;
1724 #else
1725  spinorFloat a = param.a_f;
1726 #endif
1727  o00_re += a*A0_re;
1728  o00_im += a*A0_im;
1729  o10_re += a*B0_re;
1730  o10_im += a*B0_im;
1731  o20_re -= a*B0_re;
1732  o20_im -= a*B0_im;
1733  o30_re += a*A0_re;
1734  o30_im += a*A0_im;
1735 
1736  o01_re += a*A1_re;
1737  o01_im += a*A1_im;
1738  o11_re += a*B1_re;
1739  o11_im += a*B1_im;
1740  o21_re -= a*B1_re;
1741  o21_im -= a*B1_im;
1742  o31_re += a*A1_re;
1743  o31_im += a*A1_im;
1744 
1745  o02_re += a*A2_re;
1746  o02_im += a*A2_im;
1747  o12_re += a*B2_re;
1748  o12_im += a*B2_im;
1749  o22_re -= a*B2_re;
1750  o22_im -= a*B2_im;
1751  o32_re += a*A2_re;
1752  o32_im += a*A2_im;
1753 
1754 }
1755 
1756 #ifdef MULTI_GPU
1757 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1758  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1759 #endif
1760 {
1761  // Projector P2-
1762  // 1 0 -i 0
1763  // 0 1 0 i
1764  // i 0 1 0
1765  // 0 -i 0 1
1766 
1767 #ifdef MULTI_GPU
1768  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1769  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1770 #if (DD_PREC==2) // half precision
1771  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1772 #endif
1773 #else
1774  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1775 #endif
1776 
1777  const int ga_idx = sid;
1778 
1785 
1786 #ifdef MULTI_GPU
1787  if (kernel_type == INTERIOR_KERNEL) {
1788 #endif
1789 
1790  if (threadIdx.z == blockDim.z-1 && blockDim.z < param.dc.X[2]) {
1791  // read spinor from device memory
1792  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1793 
1794  // project spinor into half spinors
1795  a0_re = +i00_re+i20_im;
1796  a0_im = +i00_im-i20_re;
1797  a1_re = +i01_re+i21_im;
1798  a1_im = +i01_im-i21_re;
1799  a2_re = +i02_re+i22_im;
1800  a2_im = +i02_im-i22_re;
1801  b0_re = +i10_re-i30_im;
1802  b0_im = +i10_im+i30_re;
1803  b1_re = +i11_re-i31_im;
1804  b1_im = +i11_im+i31_re;
1805  b2_re = +i12_re-i32_im;
1806  b2_im = +i12_im+i32_re;
1807  } else {
1808  // load spinor from shared memory
1809  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1810  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1811  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1812 
1813  // project spinor into half spinors
1814  a0_re = +i00_re+i20_im;
1815  a0_im = +i00_im-i20_re;
1816  a1_re = +i01_re+i21_im;
1817  a1_im = +i01_im-i21_re;
1818  a2_re = +i02_re+i22_im;
1819  a2_im = +i02_im-i22_re;
1820  b0_re = +i10_re-i30_im;
1821  b0_im = +i10_im+i30_re;
1822  b1_re = +i11_re-i31_im;
1823  b1_im = +i11_im+i31_re;
1824  b2_re = +i12_re-i32_im;
1825  b2_im = +i12_im+i32_re;
1826  }
1827 
1828 #ifdef MULTI_GPU
1829  } else {
1830 
1831  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1832 
1833  // read half spinor from device memory
1834  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1835 
1836  a0_re = i00_re; a0_im = i00_im;
1837  a1_re = i01_re; a1_im = i01_im;
1838  a2_re = i02_re; a2_im = i02_im;
1839  b0_re = i10_re; b0_im = i10_im;
1840  b1_re = i11_re; b1_im = i11_im;
1841  b2_re = i12_re; b2_im = i12_im;
1842 
1843  }
1844 #endif // MULTI_GPU
1845 
1846  // read gauge matrix from device memory
1847  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1848 
1849  // reconstruct gauge matrix
1851 
1852  // multiply row 0
1853  spinorFloat A0_re = 0;
1854  A0_re += g00_re * a0_re;
1855  A0_re -= g00_im * a0_im;
1856  A0_re += g01_re * a1_re;
1857  A0_re -= g01_im * a1_im;
1858  A0_re += g02_re * a2_re;
1859  A0_re -= g02_im * a2_im;
1860  spinorFloat A0_im = 0;
1861  A0_im += g00_re * a0_im;
1862  A0_im += g00_im * a0_re;
1863  A0_im += g01_re * a1_im;
1864  A0_im += g01_im * a1_re;
1865  A0_im += g02_re * a2_im;
1866  A0_im += g02_im * a2_re;
1867  spinorFloat B0_re = 0;
1868  B0_re += g00_re * b0_re;
1869  B0_re -= g00_im * b0_im;
1870  B0_re += g01_re * b1_re;
1871  B0_re -= g01_im * b1_im;
1872  B0_re += g02_re * b2_re;
1873  B0_re -= g02_im * b2_im;
1874  spinorFloat B0_im = 0;
1875  B0_im += g00_re * b0_im;
1876  B0_im += g00_im * b0_re;
1877  B0_im += g01_re * b1_im;
1878  B0_im += g01_im * b1_re;
1879  B0_im += g02_re * b2_im;
1880  B0_im += g02_im * b2_re;
1881 
1882  // multiply row 1
1883  spinorFloat A1_re = 0;
1884  A1_re += g10_re * a0_re;
1885  A1_re -= g10_im * a0_im;
1886  A1_re += g11_re * a1_re;
1887  A1_re -= g11_im * a1_im;
1888  A1_re += g12_re * a2_re;
1889  A1_re -= g12_im * a2_im;
1890  spinorFloat A1_im = 0;
1891  A1_im += g10_re * a0_im;
1892  A1_im += g10_im * a0_re;
1893  A1_im += g11_re * a1_im;
1894  A1_im += g11_im * a1_re;
1895  A1_im += g12_re * a2_im;
1896  A1_im += g12_im * a2_re;
1897  spinorFloat B1_re = 0;
1898  B1_re += g10_re * b0_re;
1899  B1_re -= g10_im * b0_im;
1900  B1_re += g11_re * b1_re;
1901  B1_re -= g11_im * b1_im;
1902  B1_re += g12_re * b2_re;
1903  B1_re -= g12_im * b2_im;
1904  spinorFloat B1_im = 0;
1905  B1_im += g10_re * b0_im;
1906  B1_im += g10_im * b0_re;
1907  B1_im += g11_re * b1_im;
1908  B1_im += g11_im * b1_re;
1909  B1_im += g12_re * b2_im;
1910  B1_im += g12_im * b2_re;
1911 
1912  // multiply row 2
1913  spinorFloat A2_re = 0;
1914  A2_re += g20_re * a0_re;
1915  A2_re -= g20_im * a0_im;
1916  A2_re += g21_re * a1_re;
1917  A2_re -= g21_im * a1_im;
1918  A2_re += g22_re * a2_re;
1919  A2_re -= g22_im * a2_im;
1920  spinorFloat A2_im = 0;
1921  A2_im += g20_re * a0_im;
1922  A2_im += g20_im * a0_re;
1923  A2_im += g21_re * a1_im;
1924  A2_im += g21_im * a1_re;
1925  A2_im += g22_re * a2_im;
1926  A2_im += g22_im * a2_re;
1927  spinorFloat B2_re = 0;
1928  B2_re += g20_re * b0_re;
1929  B2_re -= g20_im * b0_im;
1930  B2_re += g21_re * b1_re;
1931  B2_re -= g21_im * b1_im;
1932  B2_re += g22_re * b2_re;
1933  B2_re -= g22_im * b2_im;
1934  spinorFloat B2_im = 0;
1935  B2_im += g20_re * b0_im;
1936  B2_im += g20_im * b0_re;
1937  B2_im += g21_re * b1_im;
1938  B2_im += g21_im * b1_re;
1939  B2_im += g22_re * b2_im;
1940  B2_im += g22_im * b2_re;
1941 
1942 #ifdef SPINOR_DOUBLE
1943  spinorFloat a = param.a;
1944 #else
1945  spinorFloat a = param.a_f;
1946 #endif
1947  o00_re += a*A0_re;
1948  o00_im += a*A0_im;
1949  o10_re += a*B0_re;
1950  o10_im += a*B0_im;
1951  o20_re -= a*A0_im;
1952  o20_im += a*A0_re;
1953  o30_re += a*B0_im;
1954  o30_im -= a*B0_re;
1955 
1956  o01_re += a*A1_re;
1957  o01_im += a*A1_im;
1958  o11_re += a*B1_re;
1959  o11_im += a*B1_im;
1960  o21_re -= a*A1_im;
1961  o21_im += a*A1_re;
1962  o31_re += a*B1_im;
1963  o31_im -= a*B1_re;
1964 
1965  o02_re += a*A2_re;
1966  o02_im += a*A2_im;
1967  o12_re += a*B2_re;
1968  o12_im += a*B2_im;
1969  o22_re -= a*A2_im;
1970  o22_im += a*A2_re;
1971  o32_re += a*B2_im;
1972  o32_im -= a*B2_re;
1973 
1974 }
1975 
1976 #ifdef MULTI_GPU
1977 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1978  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1979 #endif
1980 {
1981  // Projector P2+
1982  // 1 0 i 0
1983  // 0 1 0 -i
1984  // -i 0 1 0
1985  // 0 i 0 1
1986 
1987 #ifdef MULTI_GPU
1988  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1989  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1990 #if (DD_PREC==2) // half precision
1991  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1992 #endif
1993 #else
1994  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1995 #endif
1996 
1997 #ifdef MULTI_GPU
1998  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1999 #else
2000  const int ga_idx = sp_idx;
2001 #endif
2002 
2009 
2010 #ifdef MULTI_GPU
2011  if (kernel_type == INTERIOR_KERNEL) {
2012 #endif
2013 
2014  if (threadIdx.z == 0 && blockDim.z < param.dc.X[2]) {
2015  // read spinor from device memory
2016  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2017 
2018  // project spinor into half spinors
2019  a0_re = +i00_re-i20_im;
2020  a0_im = +i00_im+i20_re;
2021  a1_re = +i01_re-i21_im;
2022  a1_im = +i01_im+i21_re;
2023  a2_re = +i02_re-i22_im;
2024  a2_im = +i02_im+i22_re;
2025  b0_re = +i10_re+i30_im;
2026  b0_im = +i10_im-i30_re;
2027  b1_re = +i11_re+i31_im;
2028  b1_im = +i11_im-i31_re;
2029  b2_re = +i12_re+i32_im;
2030  b2_im = +i12_im-i32_re;
2031  } else {
2032  // load spinor from shared memory
2033  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
2034  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
2035  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
2036 
2037  // project spinor into half spinors
2038  a0_re = +i00_re-i20_im;
2039  a0_im = +i00_im+i20_re;
2040  a1_re = +i01_re-i21_im;
2041  a1_im = +i01_im+i21_re;
2042  a2_re = +i02_re-i22_im;
2043  a2_im = +i02_im+i22_re;
2044  b0_re = +i10_re+i30_im;
2045  b0_im = +i10_im-i30_re;
2046  b1_re = +i11_re+i31_im;
2047  b1_im = +i11_im-i31_re;
2048  b2_re = +i12_re+i32_im;
2049  b2_im = +i12_im-i32_re;
2050  }
2051 
2052 #ifdef MULTI_GPU
2053  } else {
2054 
2055  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2056 
2057  // read half spinor from device memory
2058  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
2059 
2060  a0_re = i00_re; a0_im = i00_im;
2061  a1_re = i01_re; a1_im = i01_im;
2062  a2_re = i02_re; a2_im = i02_im;
2063  b0_re = i10_re; b0_im = i10_im;
2064  b1_re = i11_re; b1_im = i11_im;
2065  b2_re = i12_re; b2_im = i12_im;
2066 
2067  }
2068 #endif // MULTI_GPU
2069 
2070  // read gauge matrix from device memory
2071  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
2072 
2073  // reconstruct gauge matrix
2075 
2076  // multiply row 0
2077  spinorFloat A0_re = 0;
2078  A0_re += gT00_re * a0_re;
2079  A0_re -= gT00_im * a0_im;
2080  A0_re += gT01_re * a1_re;
2081  A0_re -= gT01_im * a1_im;
2082  A0_re += gT02_re * a2_re;
2083  A0_re -= gT02_im * a2_im;
2084  spinorFloat A0_im = 0;
2085  A0_im += gT00_re * a0_im;
2086  A0_im += gT00_im * a0_re;
2087  A0_im += gT01_re * a1_im;
2088  A0_im += gT01_im * a1_re;
2089  A0_im += gT02_re * a2_im;
2090  A0_im += gT02_im * a2_re;
2091  spinorFloat B0_re = 0;
2092  B0_re += gT00_re * b0_re;
2093  B0_re -= gT00_im * b0_im;
2094  B0_re += gT01_re * b1_re;
2095  B0_re -= gT01_im * b1_im;
2096  B0_re += gT02_re * b2_re;
2097  B0_re -= gT02_im * b2_im;
2098  spinorFloat B0_im = 0;
2099  B0_im += gT00_re * b0_im;
2100  B0_im += gT00_im * b0_re;
2101  B0_im += gT01_re * b1_im;
2102  B0_im += gT01_im * b1_re;
2103  B0_im += gT02_re * b2_im;
2104  B0_im += gT02_im * b2_re;
2105 
2106  // multiply row 1
2107  spinorFloat A1_re = 0;
2108  A1_re += gT10_re * a0_re;
2109  A1_re -= gT10_im * a0_im;
// NOTE(review): this chunk opens mid-block -- it is the tail of a direction
// term whose start lies above this view: rows 1 and 2 of the conjugate-link
// (gT*) SU(3) multiply against the projected half-spinors (a0..a2, b0..b2),
// followed by the scaled accumulation into the output spinor.
2110  A1_re += gT11_re * a1_re;
2111  A1_re -= gT11_im * a1_im;
2112  A1_re += gT12_re * a2_re;
2113  A1_re -= gT12_im * a2_im;
2114  spinorFloat A1_im = 0;
2115  A1_im += gT10_re * a0_im;
2116  A1_im += gT10_im * a0_re;
2117  A1_im += gT11_re * a1_im;
2118  A1_im += gT11_im * a1_re;
2119  A1_im += gT12_re * a2_im;
2120  A1_im += gT12_im * a2_re;
2121  spinorFloat B1_re = 0;
2122  B1_re += gT10_re * b0_re;
2123  B1_re -= gT10_im * b0_im;
2124  B1_re += gT11_re * b1_re;
2125  B1_re -= gT11_im * b1_im;
2126  B1_re += gT12_re * b2_re;
2127  B1_re -= gT12_im * b2_im;
2128  spinorFloat B1_im = 0;
2129  B1_im += gT10_re * b0_im;
2130  B1_im += gT10_im * b0_re;
2131  B1_im += gT11_re * b1_im;
2132  B1_im += gT11_im * b1_re;
2133  B1_im += gT12_re * b2_im;
2134  B1_im += gT12_im * b2_re;
2135 
2136  // multiply row 2
2137  spinorFloat A2_re = 0;
2138  A2_re += gT20_re * a0_re;
2139  A2_re -= gT20_im * a0_im;
2140  A2_re += gT21_re * a1_re;
2141  A2_re -= gT21_im * a1_im;
2142  A2_re += gT22_re * a2_re;
2143  A2_re -= gT22_im * a2_im;
2144  spinorFloat A2_im = 0;
2145  A2_im += gT20_re * a0_im;
2146  A2_im += gT20_im * a0_re;
2147  A2_im += gT21_re * a1_im;
2148  A2_im += gT21_im * a1_re;
2149  A2_im += gT22_re * a2_im;
2150  A2_im += gT22_im * a2_re;
2151  spinorFloat B2_re = 0;
2152  B2_re += gT20_re * b0_re;
2153  B2_re -= gT20_im * b0_im;
2154  B2_re += gT21_re * b1_re;
2155  B2_re -= gT21_im * b1_im;
2156  B2_re += gT22_re * b2_re;
2157  B2_re -= gT22_im * b2_im;
2158  spinorFloat B2_im = 0;
2159  B2_im += gT20_re * b0_im;
2160  B2_im += gT20_im * b0_re;
2161  B2_im += gT21_re * b1_im;
2162  B2_im += gT21_im * b1_re;
2163  B2_im += gT22_re * b2_im;
2164  B2_im += gT22_im * b2_re;
2165 
// the accumulation scale lives in the precision of the spinor:
// param.a for double builds, param.a_f for single/half builds
2166 #ifdef SPINOR_DOUBLE
2167  spinorFloat a = param.a;
2168 #else
2169  spinorFloat a = param.a_f;
2170 #endif
// accumulate a*(projected, link-multiplied half-spinors) into the output:
// upper components get +a*A and +a*B directly; the lower-component pattern
// (re += a*im, im -= a*re) adds a*(-i)*A, and (re -= a*im, im += a*re)
// adds a*(+i)*B -- the gamma structure of the projector for this direction
2171  o00_re += a*A0_re;
2172  o00_im += a*A0_im;
2173  o10_re += a*B0_re;
2174  o10_im += a*B0_im;
2175  o20_re += a*A0_im;
2176  o20_im -= a*A0_re;
2177  o30_re -= a*B0_im;
2178  o30_im += a*B0_re;
2179 
2180  o01_re += a*A1_re;
2181  o01_im += a*A1_im;
2182  o11_re += a*B1_re;
2183  o11_im += a*B1_im;
2184  o21_re += a*A1_im;
2185  o21_im -= a*A1_re;
2186  o31_re -= a*B1_im;
2187  o31_im += a*B1_re;
2188 
2189  o02_re += a*A2_re;
2190  o02_im += a*A2_im;
2191  o12_re += a*B2_re;
2192  o12_im += a*B2_im;
2193  o22_re += a*A2_im;
2194  o22_im -= a*A2_re;
2195  o32_re -= a*B2_im;
2196  o32_im += a*B2_re;
2197 
2198 }
2199 
// T-forward neighbor contribution (dimension 3, + direction).
// P3- zeroes the upper two spin components and doubles the lower two
// (diagonal 0,0,2,2 below), so only the o2x/o3x output components are
// updated in this block.
2200 #ifdef MULTI_GPU
2201 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
2202  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
2203 #endif
2204 {
2205  // Projector P3-
2206  // 0 0 0 0
2207  // 0 0 0 0
2208  // 0 0 2 0
2209  // 0 0 0 2
2210 
// interior kernel: checkerboard index of the +T neighbor (with periodic
// wrap-around when coord[3] is at the lattice edge); exterior kernel:
// index into the received T-face ghost zone
2211 #ifdef MULTI_GPU
2212  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
2213  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
2214 #if (DD_PREC==2) // half precision
2215  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
2216 #endif
2217 #else
2218  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
2219 #endif
2220 
2221  const int ga_idx = sid;
2222 
// temporally gauge-fixed case away from the time boundary: the link is the
// identity (see "identity gauge matrix" below), so the gauge-field load and
// SU(3) multiply are skipped entirely
2223  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2224  {
2231 
2232 #ifdef MULTI_GPU
2233  if (kernel_type == INTERIOR_KERNEL) {
2234 #endif
2235 
2236  // read spinor from device memory
// NOTE(review): the read macro itself is missing from this listing (the
// embedded line numbers jump 2236 -> 2238); presumably READ_SPINOR_DOWN --
// confirm against the generated source
2238 
2239  // project spinor into half spinors
2240  a0_re = +2*i20_re;
2241  a0_im = +2*i20_im;
2242  a1_re = +2*i21_re;
2243  a1_im = +2*i21_im;
2244  a2_re = +2*i22_re;
2245  a2_im = +2*i22_im;
2246  b0_re = +2*i30_re;
2247  b0_im = +2*i30_im;
2248  b1_re = +2*i31_re;
2249  b1_im = +2*i31_im;
2250  b2_re = +2*i32_re;
2251  b2_im = +2*i32_im;
2252 
2253 #ifdef MULTI_GPU
2254  } else {
2255 
2256  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2257  const int t_proj_scale = TPROJSCALE;
2258 
// ghost spinor already arrives projected to a half spinor; only rescale
2259  // read half spinor from device memory
2260  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2261 
2262  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2263  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2264  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2265  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2266  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2267  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2268 
2269  }
2270 #endif // MULTI_GPU
2271 
2272  // identity gauge matrix
// NOTE(review): the identity-multiply assignments (A0..B2 set from the a/b
// half-spinors) were elided from this listing (embedded numbers jump
// 2272 -> 2279); the A0_re/B0_re/... uses below depend on them
2279 
2280 #ifdef SPINOR_DOUBLE
2281  spinorFloat a = param.a;
2282 #else
2283  spinorFloat a = param.a_f;
2284 #endif
// P3- projects out the upper half, so only the lower spin components
// accumulate here
2285  o20_re += a*A0_re;
2286  o20_im += a*A0_im;
2287  o30_re += a*B0_re;
2288  o30_im += a*B0_im;
2289 
2290  o21_re += a*A1_re;
2291  o21_im += a*A1_im;
2292  o31_re += a*B1_re;
2293  o31_im += a*B1_im;
2294 
2295  o22_re += a*A2_re;
2296  o22_im += a*A2_im;
2297  o32_re += a*B2_re;
2298  o32_im += a*B2_im;
2299 
// general (non-gauge-fixed) case: load the +T link and multiply
2300  } else {
2307 
2308 #ifdef MULTI_GPU
2309  if (kernel_type == INTERIOR_KERNEL) {
2310 #endif
2311 
2312  // read spinor from device memory
// NOTE(review): the read macro is elided here as well (numbers jump
// 2312 -> 2314); presumably READ_SPINOR_DOWN -- confirm
2314 
2315  // project spinor into half spinors
2316  a0_re = +2*i20_re;
2317  a0_im = +2*i20_im;
2318  a1_re = +2*i21_re;
2319  a1_im = +2*i21_im;
2320  a2_re = +2*i22_re;
2321  a2_im = +2*i22_im;
2322  b0_re = +2*i30_re;
2323  b0_im = +2*i30_im;
2324  b1_re = +2*i31_re;
2325  b1_im = +2*i31_im;
2326  b2_re = +2*i32_re;
2327  b2_im = +2*i32_im;
2328 
2329 #ifdef MULTI_GPU
2330  } else {
2331 
2332  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2333  const int t_proj_scale = TPROJSCALE;
2334 
2335  // read half spinor from device memory
2336  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2337 
2338  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2339  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2340  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2341  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2342  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2343  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2344 
2345  }
2346 #endif // MULTI_GPU
2347 
// direction index 6 -- consistent with 2*dim for dim==3, forward
// (TODO confirm against the gauge-field layout convention)
2348  // read gauge matrix from device memory
2349  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
2350 
2351  // reconstruct gauge matrix
// NOTE(review): the RECONSTRUCT_GAUGE_MATRIX call was elided from this
// listing (embedded numbers jump 2351 -> 2353)
2353 
// U * (a,b): plain (non-daggered) link elements g##, since the +T link
// lives at this site
2354  // multiply row 0
2355  spinorFloat A0_re = 0;
2356  A0_re += g00_re * a0_re;
2357  A0_re -= g00_im * a0_im;
2358  A0_re += g01_re * a1_re;
2359  A0_re -= g01_im * a1_im;
2360  A0_re += g02_re * a2_re;
2361  A0_re -= g02_im * a2_im;
2362  spinorFloat A0_im = 0;
2363  A0_im += g00_re * a0_im;
2364  A0_im += g00_im * a0_re;
2365  A0_im += g01_re * a1_im;
2366  A0_im += g01_im * a1_re;
2367  A0_im += g02_re * a2_im;
2368  A0_im += g02_im * a2_re;
2369  spinorFloat B0_re = 0;
2370  B0_re += g00_re * b0_re;
2371  B0_re -= g00_im * b0_im;
2372  B0_re += g01_re * b1_re;
2373  B0_re -= g01_im * b1_im;
2374  B0_re += g02_re * b2_re;
2375  B0_re -= g02_im * b2_im;
2376  spinorFloat B0_im = 0;
2377  B0_im += g00_re * b0_im;
2378  B0_im += g00_im * b0_re;
2379  B0_im += g01_re * b1_im;
2380  B0_im += g01_im * b1_re;
2381  B0_im += g02_re * b2_im;
2382  B0_im += g02_im * b2_re;
2383 
2384  // multiply row 1
2385  spinorFloat A1_re = 0;
2386  A1_re += g10_re * a0_re;
2387  A1_re -= g10_im * a0_im;
2388  A1_re += g11_re * a1_re;
2389  A1_re -= g11_im * a1_im;
2390  A1_re += g12_re * a2_re;
2391  A1_re -= g12_im * a2_im;
2392  spinorFloat A1_im = 0;
2393  A1_im += g10_re * a0_im;
2394  A1_im += g10_im * a0_re;
2395  A1_im += g11_re * a1_im;
2396  A1_im += g11_im * a1_re;
2397  A1_im += g12_re * a2_im;
2398  A1_im += g12_im * a2_re;
2399  spinorFloat B1_re = 0;
2400  B1_re += g10_re * b0_re;
2401  B1_re -= g10_im * b0_im;
2402  B1_re += g11_re * b1_re;
2403  B1_re -= g11_im * b1_im;
2404  B1_re += g12_re * b2_re;
2405  B1_re -= g12_im * b2_im;
2406  spinorFloat B1_im = 0;
2407  B1_im += g10_re * b0_im;
2408  B1_im += g10_im * b0_re;
2409  B1_im += g11_re * b1_im;
2410  B1_im += g11_im * b1_re;
2411  B1_im += g12_re * b2_im;
2412  B1_im += g12_im * b2_re;
2413 
2414  // multiply row 2
2415  spinorFloat A2_re = 0;
2416  A2_re += g20_re * a0_re;
2417  A2_re -= g20_im * a0_im;
2418  A2_re += g21_re * a1_re;
2419  A2_re -= g21_im * a1_im;
2420  A2_re += g22_re * a2_re;
2421  A2_re -= g22_im * a2_im;
2422  spinorFloat A2_im = 0;
2423  A2_im += g20_re * a0_im;
2424  A2_im += g20_im * a0_re;
2425  A2_im += g21_re * a1_im;
2426  A2_im += g21_im * a1_re;
2427  A2_im += g22_re * a2_im;
2428  A2_im += g22_im * a2_re;
2429  spinorFloat B2_re = 0;
2430  B2_re += g20_re * b0_re;
2431  B2_re -= g20_im * b0_im;
2432  B2_re += g21_re * b1_re;
2433  B2_re -= g21_im * b1_im;
2434  B2_re += g22_re * b2_re;
2435  B2_re -= g22_im * b2_im;
2436  spinorFloat B2_im = 0;
2437  B2_im += g20_re * b0_im;
2438  B2_im += g20_im * b0_re;
2439  B2_im += g21_re * b1_im;
2440  B2_im += g21_im * b1_re;
2441  B2_im += g22_re * b2_im;
2442  B2_im += g22_im * b2_re;
2443 
// precision-matched accumulation scale, as in every direction block
2444 #ifdef SPINOR_DOUBLE
2445  spinorFloat a = param.a;
2446 #else
2447  spinorFloat a = param.a_f;
2448 #endif
2449  o20_re += a*A0_re;
2450  o20_im += a*A0_im;
2451  o30_re += a*B0_re;
2452  o30_im += a*B0_im;
2453 
2454  o21_re += a*A1_re;
2455  o21_im += a*A1_im;
2456  o31_re += a*B1_re;
2457  o31_im += a*B1_im;
2458 
2459  o22_re += a*A2_re;
2460  o22_im += a*A2_im;
2461  o32_re += a*B2_re;
2462  o32_im += a*B2_im;
2463 
2464  }
2465 }
2466 
// T-backward neighbor contribution (dimension 3, - direction).
// P3+ zeroes the lower two spin components and doubles the upper two
// (diagonal 2,2,0,0 below), so only the o0x/o1x output components are
// updated in this block. The link is taken daggered (gT##) because the
// -T link belongs to the neighboring site.
2467 #ifdef MULTI_GPU
2468 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
2469  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
2470 #endif
2471 {
2472  // Projector P3+
2473  // 2 0 0 0
2474  // 0 2 0 0
2475  // 0 0 0 0
2476  // 0 0 0 0
2477 
// interior kernel: checkerboard index of the -T neighbor (with periodic
// wrap-around at coord[3]==0); exterior kernel: ghost-zone index
2478 #ifdef MULTI_GPU
2479  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
2480  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
2481 #if (DD_PREC==2) // half precision
2482  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2483 #endif
2484 #else
2485  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
2486 #endif
2487 
// the -T link is stored at the neighbor site (sp_idx); for the exterior
// kernel it comes from the ghost gauge field appended after the Vh body
2488 #ifdef MULTI_GPU
2489  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2490 #else
2491  const int ga_idx = sp_idx;
2492 #endif
2493 
// temporally gauge-fixed case away from the time boundary: identity link,
// gauge load and multiply skipped
2494  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2495  {
2502 
2503 #ifdef MULTI_GPU
2504  if (kernel_type == INTERIOR_KERNEL) {
2505 #endif
2506 
2507  // read spinor from device memory
2508  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2509 
2510  // project spinor into half spinors
2511  a0_re = +2*i00_re;
2512  a0_im = +2*i00_im;
2513  a1_re = +2*i01_re;
2514  a1_im = +2*i01_im;
2515  a2_re = +2*i02_re;
2516  a2_im = +2*i02_im;
2517  b0_re = +2*i10_re;
2518  b0_im = +2*i10_im;
2519  b1_re = +2*i11_re;
2520  b1_im = +2*i11_im;
2521  b2_re = +2*i12_re;
2522  b2_im = +2*i12_im;
2523 
2524 #ifdef MULTI_GPU
2525  } else {
2526 
2527  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2528  const int t_proj_scale = TPROJSCALE;
2529 
// ghost spinor arrives pre-projected; only rescale
2530  // read half spinor from device memory
2531  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2532 
2533  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2534  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2535  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2536  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2537  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2538  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2539 
2540  }
2541 #endif // MULTI_GPU
2542 
2543  // identity gauge matrix
// NOTE(review): the identity-multiply assignments (A0..B2 set from the a/b
// half-spinors) were elided from this listing (embedded numbers jump
// 2543 -> 2550); the A0_re/B0_re/... uses below depend on them
2550 
2551 #ifdef SPINOR_DOUBLE
2552  spinorFloat a = param.a;
2553 #else
2554  spinorFloat a = param.a_f;
2555 #endif
// P3+ projects out the lower half, so only the upper spin components
// accumulate here
2556  o00_re += a*A0_re;
2557  o00_im += a*A0_im;
2558  o10_re += a*B0_re;
2559  o10_im += a*B0_im;
2560 
2561  o01_re += a*A1_re;
2562  o01_im += a*A1_im;
2563  o11_re += a*B1_re;
2564  o11_im += a*B1_im;
2565 
2566  o02_re += a*A2_re;
2567  o02_im += a*A2_im;
2568  o12_re += a*B2_re;
2569  o12_im += a*B2_im;
2570 
// general (non-gauge-fixed) case: load the neighbor's +T link and apply
// its conjugate transpose
2571  } else {
2578 
2579 #ifdef MULTI_GPU
2580  if (kernel_type == INTERIOR_KERNEL) {
2581 #endif
2582 
2583  // read spinor from device memory
2584  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2585 
2586  // project spinor into half spinors
2587  a0_re = +2*i00_re;
2588  a0_im = +2*i00_im;
2589  a1_re = +2*i01_re;
2590  a1_im = +2*i01_im;
2591  a2_re = +2*i02_re;
2592  a2_im = +2*i02_im;
2593  b0_re = +2*i10_re;
2594  b0_im = +2*i10_im;
2595  b1_re = +2*i11_re;
2596  b1_im = +2*i11_im;
2597  b2_re = +2*i12_re;
2598  b2_im = +2*i12_im;
2599 
2600 #ifdef MULTI_GPU
2601  } else {
2602 
2603  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2604  const int t_proj_scale = TPROJSCALE;
2605 
2606  // read half spinor from device memory
2607  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2608 
2609  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2610  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2611  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2612  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2613  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2614  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2615 
2616  }
2617 #endif // MULTI_GPU
2618 
// direction index 7 -- consistent with 2*dim+1 for dim==3, backward
// (TODO confirm against the gauge-field layout convention)
2619  // read gauge matrix from device memory
2620  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2621 
2622  // reconstruct gauge matrix
// NOTE(review): the RECONSTRUCT_GAUGE_MATRIX call was elided from this
// listing (embedded numbers jump 2622 -> 2624)
2624 
// U^dagger * (a,b): daggered link elements gT##
2625  // multiply row 0
2626  spinorFloat A0_re = 0;
2627  A0_re += gT00_re * a0_re;
2628  A0_re -= gT00_im * a0_im;
2629  A0_re += gT01_re * a1_re;
2630  A0_re -= gT01_im * a1_im;
2631  A0_re += gT02_re * a2_re;
2632  A0_re -= gT02_im * a2_im;
2633  spinorFloat A0_im = 0;
2634  A0_im += gT00_re * a0_im;
2635  A0_im += gT00_im * a0_re;
2636  A0_im += gT01_re * a1_im;
2637  A0_im += gT01_im * a1_re;
2638  A0_im += gT02_re * a2_im;
2639  A0_im += gT02_im * a2_re;
2640  spinorFloat B0_re = 0;
2641  B0_re += gT00_re * b0_re;
2642  B0_re -= gT00_im * b0_im;
2643  B0_re += gT01_re * b1_re;
2644  B0_re -= gT01_im * b1_im;
2645  B0_re += gT02_re * b2_re;
2646  B0_re -= gT02_im * b2_im;
2647  spinorFloat B0_im = 0;
2648  B0_im += gT00_re * b0_im;
2649  B0_im += gT00_im * b0_re;
2650  B0_im += gT01_re * b1_im;
2651  B0_im += gT01_im * b1_re;
2652  B0_im += gT02_re * b2_im;
2653  B0_im += gT02_im * b2_re;
2654 
2655  // multiply row 1
2656  spinorFloat A1_re = 0;
2657  A1_re += gT10_re * a0_re;
2658  A1_re -= gT10_im * a0_im;
2659  A1_re += gT11_re * a1_re;
2660  A1_re -= gT11_im * a1_im;
2661  A1_re += gT12_re * a2_re;
2662  A1_re -= gT12_im * a2_im;
2663  spinorFloat A1_im = 0;
2664  A1_im += gT10_re * a0_im;
2665  A1_im += gT10_im * a0_re;
2666  A1_im += gT11_re * a1_im;
2667  A1_im += gT11_im * a1_re;
2668  A1_im += gT12_re * a2_im;
2669  A1_im += gT12_im * a2_re;
2670  spinorFloat B1_re = 0;
2671  B1_re += gT10_re * b0_re;
2672  B1_re -= gT10_im * b0_im;
2673  B1_re += gT11_re * b1_re;
2674  B1_re -= gT11_im * b1_im;
2675  B1_re += gT12_re * b2_re;
2676  B1_re -= gT12_im * b2_im;
2677  spinorFloat B1_im = 0;
2678  B1_im += gT10_re * b0_im;
2679  B1_im += gT10_im * b0_re;
2680  B1_im += gT11_re * b1_im;
2681  B1_im += gT11_im * b1_re;
2682  B1_im += gT12_re * b2_im;
2683  B1_im += gT12_im * b2_re;
2684 
2685  // multiply row 2
2686  spinorFloat A2_re = 0;
2687  A2_re += gT20_re * a0_re;
2688  A2_re -= gT20_im * a0_im;
2689  A2_re += gT21_re * a1_re;
2690  A2_re -= gT21_im * a1_im;
2691  A2_re += gT22_re * a2_re;
2692  A2_re -= gT22_im * a2_im;
2693  spinorFloat A2_im = 0;
2694  A2_im += gT20_re * a0_im;
2695  A2_im += gT20_im * a0_re;
2696  A2_im += gT21_re * a1_im;
2697  A2_im += gT21_im * a1_re;
2698  A2_im += gT22_re * a2_im;
2699  A2_im += gT22_im * a2_re;
2700  spinorFloat B2_re = 0;
2701  B2_re += gT20_re * b0_re;
2702  B2_re -= gT20_im * b0_im;
2703  B2_re += gT21_re * b1_re;
2704  B2_re -= gT21_im * b1_im;
2705  B2_re += gT22_re * b2_re;
2706  B2_re -= gT22_im * b2_im;
2707  spinorFloat B2_im = 0;
2708  B2_im += gT20_re * b0_im;
2709  B2_im += gT20_im * b0_re;
2710  B2_im += gT21_re * b1_im;
2711  B2_im += gT21_im * b1_re;
2712  B2_im += gT22_re * b2_im;
2713  B2_im += gT22_im * b2_re;
2714 
// precision-matched accumulation scale, as in every direction block
2715 #ifdef SPINOR_DOUBLE
2716  spinorFloat a = param.a;
2717 #else
2718  spinorFloat a = param.a_f;
2719 #endif
2720  o00_re += a*A0_re;
2721  o00_im += a*A0_im;
2722  o10_re += a*B0_re;
2723  o10_im += a*B0_im;
2724 
2725  o01_re += a*A1_re;
2726  o01_im += a*A1_im;
2727  o11_re += a*B1_re;
2728  o11_im += a*B1_im;
2729 
2730  o02_re += a*A2_re;
2731  o02_im += a*A2_im;
2732  o12_re += a*B2_re;
2733  o12_im += a*B2_im;
2734 
2735  }
2736 }
2737 
2738 
2739 
// final store: writes the fully accumulated output spinor (o00..o32) for
// this site back to global memory
2740 // write spinor field back to device memory
2741 WRITE_SPINOR(param.sp_stride);
2742 
2743 // undefine to prevent warning when precision is changed
2744 #undef spinorFloat
2745 #undef WRITE_SPINOR_SHARED
2746 #undef READ_SPINOR_SHARED
2747 #undef SHARED_STRIDE
2748 
2749 #undef g00_re
2750 #undef g00_im
2751 #undef g01_re
2752 #undef g01_im
2753 #undef g02_re
2754 #undef g02_im
2755 #undef g10_re
2756 #undef g10_im
2757 #undef g11_re
2758 #undef g11_im
2759 #undef g12_re
2760 #undef g12_im
2761 #undef g20_re
2762 #undef g20_im
2763 #undef g21_re
2764 #undef g21_im
2765 #undef g22_re
2766 #undef g22_im
2767 
2768 #undef i00_re
2769 #undef i00_im
2770 #undef i01_re
2771 #undef i01_im
2772 #undef i02_re
2773 #undef i02_im
2774 #undef i10_re
2775 #undef i10_im
2776 #undef i11_re
2777 #undef i11_im
2778 #undef i12_re
2779 #undef i12_im
2780 #undef i20_re
2781 #undef i20_im
2782 #undef i21_re
2783 #undef i21_im
2784 #undef i22_re
2785 #undef i22_im
2786 #undef i30_re
2787 #undef i30_im
2788 #undef i31_re
2789 #undef i31_im
2790 #undef i32_re
2791 #undef i32_im
2792 
2793 #undef acc00_re
2794 #undef acc00_im
2795 #undef acc01_re
2796 #undef acc01_im
2797 #undef acc02_re
2798 #undef acc02_im
2799 #undef acc10_re
2800 #undef acc10_im
2801 #undef acc11_re
2802 #undef acc11_im
2803 #undef acc12_re
2804 #undef acc12_im
2805 #undef acc20_re
2806 #undef acc20_im
2807 #undef acc21_re
2808 #undef acc21_im
2809 #undef acc22_re
2810 #undef acc22_im
2811 #undef acc30_re
2812 #undef acc30_im
2813 #undef acc31_re
2814 #undef acc31_im
2815 #undef acc32_re
2816 #undef acc32_im
2817 
2818 #undef c00_00_re
2819 #undef c01_01_re
2820 #undef c02_02_re
2821 #undef c10_10_re
2822 #undef c11_11_re
2823 #undef c12_12_re
2824 #undef c01_00_re
2825 #undef c01_00_im
2826 #undef c02_00_re
2827 #undef c02_00_im
2828 #undef c10_00_re
2829 #undef c10_00_im
2830 #undef c11_00_re
2831 #undef c11_00_im
2832 #undef c12_00_re
2833 #undef c12_00_im
2834 #undef c02_01_re
2835 #undef c02_01_im
2836 #undef c10_01_re
2837 #undef c10_01_im
2838 #undef c11_01_re
2839 #undef c11_01_im
2840 #undef c12_01_re
2841 #undef c12_01_im
2842 #undef c10_02_re
2843 #undef c10_02_im
2844 #undef c11_02_re
2845 #undef c11_02_im
2846 #undef c12_02_re
2847 #undef c12_02_im
2848 #undef c11_10_re
2849 #undef c11_10_im
2850 #undef c12_10_re
2851 #undef c12_10_im
2852 #undef c12_11_re
2853 #undef c12_11_im
2854 
2855 #undef o00_re
2856 #undef o00_im
2857 #undef o01_re
2858 #undef o01_im
2859 #undef o02_re
2860 #undef o02_im
2861 #undef o10_re
2862 #undef o10_im
2863 #undef o11_re
2864 #undef o11_im
2865 #undef o12_re
2866 #undef o12_im
2867 #undef o20_re
2868 #undef o20_im
2869 #undef o21_re
2870 #undef o21_im
2871 #undef o22_re
2872 #undef o22_im
2873 #undef o30_re
2874 #undef o30_im
2875 #undef o31_re
2876 #undef o31_im
2877 #undef o32_re
2878 #undef o32_im
2879 
2880 #undef VOLATILE
dim3 dim3 blockDim
VOLATILE spinorFloat o11_im
coordsFromIndex3D< EVEN_X >(X, x, sid, param.parity, param.dc.X)
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o11_re
int sp_idx
__syncthreads()
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o32_re
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o02_im
#define GAUGE0TEX
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o00_re
VOLATILE spinorFloat o00_im
QudaGaugeParam param
Definition: pack_test.cpp:17
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o32_im
#define GAUGE1TEX
VOLATILE spinorFloat o02_re
#define SPINORTEX
#define READ_INTERMEDIATE_SPINOR
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o30_im
#define READ_SPINOR_GHOST
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o20_im
#define CLOVERTEX
VOLATILE spinorFloat o20_re
VOLATILE spinorFloat o31_re
#define INTERTEX
VOLATILE spinorFloat o30_re
int face_idx
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
const int face_num
#define TPROJSCALE
WRITE_SPINOR(param.sp_stride)
#define GHOSTSPINORTEX
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o10_im
#define READ_CLOVER
VOLATILE spinorFloat o21_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o22_im