QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
asym_wilson_clover_dslash_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// Auto-generated dslash kernel core (per the listing title, the asymmetric
// Wilson-clover variant).  This header is textually #included into a kernel
// body; the macros below map logical spinor/gauge/clover components onto the
// FloatN registers filled by the READ_* macros used further down.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// Compiler workaround: the legacy Open64 back end needs 'volatile' on some
// locals to prevent miscompilation; the NVVM back end (CUDA >= 4.1, sm_20+)
// does not, so VOLATILE expands to nothing there.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Component naming: i<spin><color>_{re,im} with spin index 0..3 and color
// index 0..2 (12 complex components per site); acc* is the accumulator
// spinor read by READ_ACCUM for the xpay update.
12 #ifdef SPINOR_DOUBLE
// Double precision: one complex component per double2 register, I0..I11
// for the input spinor and accum0..accum11 for the accumulator.
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
// Single/half precision: two complex components packed per float4 register,
// I0..I5 for the input spinor and accum0..accum5 for the accumulator.
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// g<row><col>_{re,im}: entries of the 3x3 complex SU(3) link matrix.
// GAUGE_FLOAT2: one complex entry per two-component register G0..G8
// (all 18 reals stored explicitly).
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
// float4 storage: two complex entries packed per register, G0..G4.
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
156 
157 // conjugated gauge link
// Hermitian conjugate (dagger) of the link: gT_ij = conj(g_ji).  Used below
// for the hop in the negative direction (the x1-1 neighbor).
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of inverted clover term
// 6x6 Hermitian block indexed by the (spin,color) pairs 00,01,02,10,11,12.
// Storage order: the 6 real diagonal entries first, then the 15 complex
// strictly-lower-triangle entries c<row>_<col> (the diagonal is real, so it
// has no _im macros).
178 #ifdef CLOVER_DOUBLE
// Double precision: one real pair / one complex entry per double2 register
// C0..C17.
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
// Single/half precision: four reals packed per float4 register C0..C8.
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// Upper-triangle entries recovered from Hermiticity of the clover block:
// c_ij = conj(c_ji), hence the negated _im components.
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of inverted clover term (reuses the same C registers)
// The lower-right 6x6 block (spin indices 2,3 in place of 0,1) is pure
// renaming: each entry aliases the corresponding first-block macro.
// NOTE(review): this assumes the C registers are refilled with the second
// block's data before the "apply second chiral block" section uses these
// aliases — confirm against the READ_CLOVER calls elided from this listing.
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
378 
379 #include "read_gauge.h"
380 #include "read_clover.h"
381 #include "io_spinor.h"
382 
// Per-thread site bookkeeping (this code sits inside the kernel body into
// which this header is #included).
383 int x1, x2, x3, x4;
384 int X;
385 
386 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
387 int sp_norm_idx;
388 #endif // MULTI_GPU half precision
389 
390 int sid;
391 
392 #ifdef MULTI_GPU
393 int face_idx;
395 #endif
396 
// One thread per checkerboard lattice site; guard against grid overshoot.
397  sid = blockIdx.x*blockDim.x + threadIdx.x;
398  if (sid >= param.threads) return;
399 
// Recover the 4-d coordinates x1..x4 and full-lattice index X from sid.
400  // Inline by hand for the moment and assume even dimensions
401  const int dims[] = {X1, X2, X3, X4};
402  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
403 
// Zero all 12 complex components of the output spinor o<spin><color>
// (declared in the "output spinor" section elided from this listing).
404  o00_re = 0; o00_im = 0;
405  o01_re = 0; o01_im = 0;
406  o02_re = 0; o02_im = 0;
407  o10_re = 0; o10_im = 0;
408  o11_re = 0; o11_im = 0;
409  o12_re = 0; o12_im = 0;
410  o20_re = 0; o20_im = 0;
411  o21_re = 0; o21_im = 0;
412  o22_re = 0; o22_im = 0;
413  o30_re = 0; o30_im = 0;
414  o31_re = 0; o31_im = 0;
415  o32_re = 0; o32_im = 0;
416 #ifdef DSLASH_CLOVER_XPAY
417 
418  READ_ACCUM(ACCUMTEX, param.sp_stride)
419 
420 #ifdef DSLASH_CLOVER
421 
422  // change to chiral basis
423  {
430  spinorFloat a30_re = acc00_re - acc20_re;
431  spinorFloat a30_im = acc00_im - acc20_im;
432 
435  acc20_re = a20_re; acc20_im = a20_im;
436  acc30_re = a30_re; acc30_im = a30_im;
437  }
438 
439  {
446  spinorFloat a31_re = acc01_re - acc21_re;
447  spinorFloat a31_im = acc01_im - acc21_im;
448 
451  acc21_re = a21_re; acc21_im = a21_im;
452  acc31_re = a31_re; acc31_im = a31_im;
453  }
454 
455  {
462  spinorFloat a32_re = acc02_re - acc22_re;
463  spinorFloat a32_im = acc02_im - acc22_im;
464 
467  acc22_re = a22_re; acc22_im = a22_im;
468  acc32_re = a32_re; acc32_im = a32_im;
469  }
470 
471  // apply first chiral block
472  {
474 
481 
482  a00_re += c00_00_re * acc00_re;
483  a00_im += c00_00_re * acc00_im;
484  a00_re += c00_01_re * acc01_re;
485  a00_re -= c00_01_im * acc01_im;
486  a00_im += c00_01_re * acc01_im;
487  a00_im += c00_01_im * acc01_re;
488  a00_re += c00_02_re * acc02_re;
489  a00_re -= c00_02_im * acc02_im;
490  a00_im += c00_02_re * acc02_im;
491  a00_im += c00_02_im * acc02_re;
492  a00_re += c00_10_re * acc10_re;
493  a00_re -= c00_10_im * acc10_im;
494  a00_im += c00_10_re * acc10_im;
495  a00_im += c00_10_im * acc10_re;
496  a00_re += c00_11_re * acc11_re;
497  a00_re -= c00_11_im * acc11_im;
498  a00_im += c00_11_re * acc11_im;
499  a00_im += c00_11_im * acc11_re;
500  a00_re += c00_12_re * acc12_re;
501  a00_re -= c00_12_im * acc12_im;
502  a00_im += c00_12_re * acc12_im;
503  a00_im += c00_12_im * acc12_re;
504 
505  a01_re += c01_00_re * acc00_re;
506  a01_re -= c01_00_im * acc00_im;
507  a01_im += c01_00_re * acc00_im;
508  a01_im += c01_00_im * acc00_re;
509  a01_re += c01_01_re * acc01_re;
510  a01_im += c01_01_re * acc01_im;
511  a01_re += c01_02_re * acc02_re;
512  a01_re -= c01_02_im * acc02_im;
513  a01_im += c01_02_re * acc02_im;
514  a01_im += c01_02_im * acc02_re;
515  a01_re += c01_10_re * acc10_re;
516  a01_re -= c01_10_im * acc10_im;
517  a01_im += c01_10_re * acc10_im;
518  a01_im += c01_10_im * acc10_re;
519  a01_re += c01_11_re * acc11_re;
520  a01_re -= c01_11_im * acc11_im;
521  a01_im += c01_11_re * acc11_im;
522  a01_im += c01_11_im * acc11_re;
523  a01_re += c01_12_re * acc12_re;
524  a01_re -= c01_12_im * acc12_im;
525  a01_im += c01_12_re * acc12_im;
526  a01_im += c01_12_im * acc12_re;
527 
528  a02_re += c02_00_re * acc00_re;
529  a02_re -= c02_00_im * acc00_im;
530  a02_im += c02_00_re * acc00_im;
531  a02_im += c02_00_im * acc00_re;
532  a02_re += c02_01_re * acc01_re;
533  a02_re -= c02_01_im * acc01_im;
534  a02_im += c02_01_re * acc01_im;
535  a02_im += c02_01_im * acc01_re;
536  a02_re += c02_02_re * acc02_re;
537  a02_im += c02_02_re * acc02_im;
538  a02_re += c02_10_re * acc10_re;
539  a02_re -= c02_10_im * acc10_im;
540  a02_im += c02_10_re * acc10_im;
541  a02_im += c02_10_im * acc10_re;
542  a02_re += c02_11_re * acc11_re;
543  a02_re -= c02_11_im * acc11_im;
544  a02_im += c02_11_re * acc11_im;
545  a02_im += c02_11_im * acc11_re;
546  a02_re += c02_12_re * acc12_re;
547  a02_re -= c02_12_im * acc12_im;
548  a02_im += c02_12_re * acc12_im;
549  a02_im += c02_12_im * acc12_re;
550 
551  a10_re += c10_00_re * acc00_re;
552  a10_re -= c10_00_im * acc00_im;
553  a10_im += c10_00_re * acc00_im;
554  a10_im += c10_00_im * acc00_re;
555  a10_re += c10_01_re * acc01_re;
556  a10_re -= c10_01_im * acc01_im;
557  a10_im += c10_01_re * acc01_im;
558  a10_im += c10_01_im * acc01_re;
559  a10_re += c10_02_re * acc02_re;
560  a10_re -= c10_02_im * acc02_im;
561  a10_im += c10_02_re * acc02_im;
562  a10_im += c10_02_im * acc02_re;
563  a10_re += c10_10_re * acc10_re;
564  a10_im += c10_10_re * acc10_im;
565  a10_re += c10_11_re * acc11_re;
566  a10_re -= c10_11_im * acc11_im;
567  a10_im += c10_11_re * acc11_im;
568  a10_im += c10_11_im * acc11_re;
569  a10_re += c10_12_re * acc12_re;
570  a10_re -= c10_12_im * acc12_im;
571  a10_im += c10_12_re * acc12_im;
572  a10_im += c10_12_im * acc12_re;
573 
574  a11_re += c11_00_re * acc00_re;
575  a11_re -= c11_00_im * acc00_im;
576  a11_im += c11_00_re * acc00_im;
577  a11_im += c11_00_im * acc00_re;
578  a11_re += c11_01_re * acc01_re;
579  a11_re -= c11_01_im * acc01_im;
580  a11_im += c11_01_re * acc01_im;
581  a11_im += c11_01_im * acc01_re;
582  a11_re += c11_02_re * acc02_re;
583  a11_re -= c11_02_im * acc02_im;
584  a11_im += c11_02_re * acc02_im;
585  a11_im += c11_02_im * acc02_re;
586  a11_re += c11_10_re * acc10_re;
587  a11_re -= c11_10_im * acc10_im;
588  a11_im += c11_10_re * acc10_im;
589  a11_im += c11_10_im * acc10_re;
590  a11_re += c11_11_re * acc11_re;
591  a11_im += c11_11_re * acc11_im;
592  a11_re += c11_12_re * acc12_re;
593  a11_re -= c11_12_im * acc12_im;
594  a11_im += c11_12_re * acc12_im;
595  a11_im += c11_12_im * acc12_re;
596 
597  a12_re += c12_00_re * acc00_re;
598  a12_re -= c12_00_im * acc00_im;
599  a12_im += c12_00_re * acc00_im;
600  a12_im += c12_00_im * acc00_re;
601  a12_re += c12_01_re * acc01_re;
602  a12_re -= c12_01_im * acc01_im;
603  a12_im += c12_01_re * acc01_im;
604  a12_im += c12_01_im * acc01_re;
605  a12_re += c12_02_re * acc02_re;
606  a12_re -= c12_02_im * acc02_im;
607  a12_im += c12_02_re * acc02_im;
608  a12_im += c12_02_im * acc02_re;
609  a12_re += c12_10_re * acc10_re;
610  a12_re -= c12_10_im * acc10_im;
611  a12_im += c12_10_re * acc10_im;
612  a12_im += c12_10_im * acc10_re;
613  a12_re += c12_11_re * acc11_re;
614  a12_re -= c12_11_im * acc11_im;
615  a12_im += c12_11_re * acc11_im;
616  a12_im += c12_11_im * acc11_re;
617  a12_re += c12_12_re * acc12_re;
618  a12_im += c12_12_re * acc12_im;
619 
620  acc00_re = a00_re; acc00_im = a00_im;
621  acc01_re = a01_re; acc01_im = a01_im;
622  acc02_re = a02_re; acc02_im = a02_im;
623  acc10_re = a10_re; acc10_im = a10_im;
624  acc11_re = a11_re; acc11_im = a11_im;
625  acc12_re = a12_re; acc12_im = a12_im;
626 
627  }
628 
629  // apply second chiral block
630  {
632 
636  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
637  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
638  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
639 
640  a20_re += c20_20_re * acc20_re;
641  a20_im += c20_20_re * acc20_im;
642  a20_re += c20_21_re * acc21_re;
643  a20_re -= c20_21_im * acc21_im;
644  a20_im += c20_21_re * acc21_im;
645  a20_im += c20_21_im * acc21_re;
646  a20_re += c20_22_re * acc22_re;
647  a20_re -= c20_22_im * acc22_im;
648  a20_im += c20_22_re * acc22_im;
649  a20_im += c20_22_im * acc22_re;
650  a20_re += c20_30_re * acc30_re;
651  a20_re -= c20_30_im * acc30_im;
652  a20_im += c20_30_re * acc30_im;
653  a20_im += c20_30_im * acc30_re;
654  a20_re += c20_31_re * acc31_re;
655  a20_re -= c20_31_im * acc31_im;
656  a20_im += c20_31_re * acc31_im;
657  a20_im += c20_31_im * acc31_re;
658  a20_re += c20_32_re * acc32_re;
659  a20_re -= c20_32_im * acc32_im;
660  a20_im += c20_32_re * acc32_im;
661  a20_im += c20_32_im * acc32_re;
662 
663  a21_re += c21_20_re * acc20_re;
664  a21_re -= c21_20_im * acc20_im;
665  a21_im += c21_20_re * acc20_im;
666  a21_im += c21_20_im * acc20_re;
667  a21_re += c21_21_re * acc21_re;
668  a21_im += c21_21_re * acc21_im;
669  a21_re += c21_22_re * acc22_re;
670  a21_re -= c21_22_im * acc22_im;
671  a21_im += c21_22_re * acc22_im;
672  a21_im += c21_22_im * acc22_re;
673  a21_re += c21_30_re * acc30_re;
674  a21_re -= c21_30_im * acc30_im;
675  a21_im += c21_30_re * acc30_im;
676  a21_im += c21_30_im * acc30_re;
677  a21_re += c21_31_re * acc31_re;
678  a21_re -= c21_31_im * acc31_im;
679  a21_im += c21_31_re * acc31_im;
680  a21_im += c21_31_im * acc31_re;
681  a21_re += c21_32_re * acc32_re;
682  a21_re -= c21_32_im * acc32_im;
683  a21_im += c21_32_re * acc32_im;
684  a21_im += c21_32_im * acc32_re;
685 
686  a22_re += c22_20_re * acc20_re;
687  a22_re -= c22_20_im * acc20_im;
688  a22_im += c22_20_re * acc20_im;
689  a22_im += c22_20_im * acc20_re;
690  a22_re += c22_21_re * acc21_re;
691  a22_re -= c22_21_im * acc21_im;
692  a22_im += c22_21_re * acc21_im;
693  a22_im += c22_21_im * acc21_re;
694  a22_re += c22_22_re * acc22_re;
695  a22_im += c22_22_re * acc22_im;
696  a22_re += c22_30_re * acc30_re;
697  a22_re -= c22_30_im * acc30_im;
698  a22_im += c22_30_re * acc30_im;
699  a22_im += c22_30_im * acc30_re;
700  a22_re += c22_31_re * acc31_re;
701  a22_re -= c22_31_im * acc31_im;
702  a22_im += c22_31_re * acc31_im;
703  a22_im += c22_31_im * acc31_re;
704  a22_re += c22_32_re * acc32_re;
705  a22_re -= c22_32_im * acc32_im;
706  a22_im += c22_32_re * acc32_im;
707  a22_im += c22_32_im * acc32_re;
708 
709  a30_re += c30_20_re * acc20_re;
710  a30_re -= c30_20_im * acc20_im;
711  a30_im += c30_20_re * acc20_im;
712  a30_im += c30_20_im * acc20_re;
713  a30_re += c30_21_re * acc21_re;
714  a30_re -= c30_21_im * acc21_im;
715  a30_im += c30_21_re * acc21_im;
716  a30_im += c30_21_im * acc21_re;
717  a30_re += c30_22_re * acc22_re;
718  a30_re -= c30_22_im * acc22_im;
719  a30_im += c30_22_re * acc22_im;
720  a30_im += c30_22_im * acc22_re;
721  a30_re += c30_30_re * acc30_re;
722  a30_im += c30_30_re * acc30_im;
723  a30_re += c30_31_re * acc31_re;
724  a30_re -= c30_31_im * acc31_im;
725  a30_im += c30_31_re * acc31_im;
726  a30_im += c30_31_im * acc31_re;
727  a30_re += c30_32_re * acc32_re;
728  a30_re -= c30_32_im * acc32_im;
729  a30_im += c30_32_re * acc32_im;
730  a30_im += c30_32_im * acc32_re;
731 
732  a31_re += c31_20_re * acc20_re;
733  a31_re -= c31_20_im * acc20_im;
734  a31_im += c31_20_re * acc20_im;
735  a31_im += c31_20_im * acc20_re;
736  a31_re += c31_21_re * acc21_re;
737  a31_re -= c31_21_im * acc21_im;
738  a31_im += c31_21_re * acc21_im;
739  a31_im += c31_21_im * acc21_re;
740  a31_re += c31_22_re * acc22_re;
741  a31_re -= c31_22_im * acc22_im;
742  a31_im += c31_22_re * acc22_im;
743  a31_im += c31_22_im * acc22_re;
744  a31_re += c31_30_re * acc30_re;
745  a31_re -= c31_30_im * acc30_im;
746  a31_im += c31_30_re * acc30_im;
747  a31_im += c31_30_im * acc30_re;
748  a31_re += c31_31_re * acc31_re;
749  a31_im += c31_31_re * acc31_im;
750  a31_re += c31_32_re * acc32_re;
751  a31_re -= c31_32_im * acc32_im;
752  a31_im += c31_32_re * acc32_im;
753  a31_im += c31_32_im * acc32_re;
754 
755  a32_re += c32_20_re * acc20_re;
756  a32_re -= c32_20_im * acc20_im;
757  a32_im += c32_20_re * acc20_im;
758  a32_im += c32_20_im * acc20_re;
759  a32_re += c32_21_re * acc21_re;
760  a32_re -= c32_21_im * acc21_im;
761  a32_im += c32_21_re * acc21_im;
762  a32_im += c32_21_im * acc21_re;
763  a32_re += c32_22_re * acc22_re;
764  a32_re -= c32_22_im * acc22_im;
765  a32_im += c32_22_re * acc22_im;
766  a32_im += c32_22_im * acc22_re;
767  a32_re += c32_30_re * acc30_re;
768  a32_re -= c32_30_im * acc30_im;
769  a32_im += c32_30_re * acc30_im;
770  a32_im += c32_30_im * acc30_re;
771  a32_re += c32_31_re * acc31_re;
772  a32_re -= c32_31_im * acc31_im;
773  a32_im += c32_31_re * acc31_im;
774  a32_im += c32_31_im * acc31_re;
775  a32_re += c32_32_re * acc32_re;
776  a32_im += c32_32_re * acc32_im;
777 
778  acc20_re = a20_re; acc20_im = a20_im;
779  acc21_re = a21_re; acc21_im = a21_im;
780  acc22_re = a22_re; acc22_im = a22_im;
781  acc30_re = a30_re; acc30_im = a30_im;
782  acc31_re = a31_re; acc31_im = a31_im;
783  acc32_re = a32_re; acc32_im = a32_im;
784 
785  }
786 
787 // change back from chiral basis
788 // (note: required factor of 1/2 is included in clover term normalization)
// Each of the three braced blocks below rotates one color component
// (color 0, 1, 2 respectively); only spin indices mix.
789  {
790  spinorFloat a00_re = acc10_re + acc30_re;
791  spinorFloat a00_im = acc10_im + acc30_im;
792  spinorFloat a10_re = -acc00_re - acc20_re;
793  spinorFloat a10_im = -acc00_im - acc20_im;
794  spinorFloat a20_re = acc10_re - acc30_re;
795  spinorFloat a20_im = acc10_im - acc30_im;
796  spinorFloat a30_re = -acc00_re + acc20_re;
797  spinorFloat a30_im = -acc00_im + acc20_im;
798 
799  acc00_re = a00_re; acc00_im = a00_im;
800  acc10_re = a10_re; acc10_im = a10_im;
801  acc20_re = a20_re; acc20_im = a20_im;
802  acc30_re = a30_re; acc30_im = a30_im;
803  }
804 
805  {
806  spinorFloat a01_re = acc11_re + acc31_re;
807  spinorFloat a01_im = acc11_im + acc31_im;
808  spinorFloat a11_re = -acc01_re - acc21_re;
809  spinorFloat a11_im = -acc01_im - acc21_im;
810  spinorFloat a21_re = acc11_re - acc31_re;
811  spinorFloat a21_im = acc11_im - acc31_im;
812  spinorFloat a31_re = -acc01_re + acc21_re;
813  spinorFloat a31_im = -acc01_im + acc21_im;
814 
815  acc01_re = a01_re; acc01_im = a01_im;
816  acc11_re = a11_re; acc11_im = a11_im;
817  acc21_re = a21_re; acc21_im = a21_im;
818  acc31_re = a31_re; acc31_im = a31_im;
819  }
820 
821  {
822  spinorFloat a02_re = acc12_re + acc32_re;
823  spinorFloat a02_im = acc12_im + acc32_im;
824  spinorFloat a12_re = -acc02_re - acc22_re;
825  spinorFloat a12_im = -acc02_im - acc22_im;
826  spinorFloat a22_re = acc12_re - acc32_re;
827  spinorFloat a22_im = acc12_im - acc32_im;
828  spinorFloat a32_re = -acc02_re + acc22_re;
829  spinorFloat a32_im = -acc02_im + acc22_im;
830 
831  acc02_re = a02_re; acc02_im = a02_im;
832  acc12_re = a12_re; acc12_im = a12_im;
833  acc22_re = a22_re; acc22_im = a22_im;
834  acc32_re = a32_re; acc32_im = a32_im;
835  }
836 
837 #endif // DSLASH_CLOVER
838 
// Seed the output spinor with the (clover-rotated) accumulator before the
// hopping terms are added below.
839  o00_re = acc00_re;
840  o00_im = acc00_im;
841  o01_re = acc01_re;
842  o01_im = acc01_im;
843  o02_re = acc02_re;
844  o02_im = acc02_im;
845  o10_re = acc10_re;
846  o10_im = acc10_im;
847  o11_re = acc11_re;
848  o11_im = acc11_im;
849  o12_re = acc12_re;
850  o12_im = acc12_im;
851  o20_re = acc20_re;
852  o20_im = acc20_im;
853  o21_re = acc21_re;
854  o21_im = acc21_im;
855  o22_re = acc22_re;
856  o22_im = acc22_im;
857  o30_re = acc30_re;
858  o30_im = acc30_im;
859  o31_re = acc31_re;
860  o31_im = acc31_im;
861  o32_re = acc32_re;
862  o32_im = acc32_im;
863 #endif // DSLASH_CLOVER_XPAY
864 
865 #ifdef MULTI_GPU
// Exterior-kernel branch: updates boundary sites using ghost-zone data.
// NOTE(review): the '{' matched by the closing brace on listing line 901 is
// opened in code elided from this listing.
866 } else { // exterior kernel
867 
868  sid = blockIdx.x*blockDim.x + threadIdx.x;
869  if (sid >= param.threads) return;
870 
// Two faces per dimension: the upper half of the thread range handles the
// second face; face_idx is the thread's index within its face.
871  const int dim = static_cast<int>(kernel_type);
872  const int face_volume = (param.threads >> 1); // volume of one face
873  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
874  face_idx = sid - face_num*face_volume; // index into the respective face
875 
876  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
877  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
878  //sp_idx = face_idx + param.ghostOffset[dim];
879 
880 #if (DD_PREC==2) // half precision
881  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
882 #endif
883 
884  const int dims[] = {X1, X2, X3, X4};
885  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
886 
// Seed the output with the partial result produced by the interior pass,
// then accumulate the boundary hopping terms below.
887  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
888 
889  o00_re = i00_re; o00_im = i00_im;
890  o01_re = i01_re; o01_im = i01_im;
891  o02_re = i02_re; o02_im = i02_im;
892  o10_re = i10_re; o10_im = i10_im;
893  o11_re = i11_re; o11_im = i11_im;
894  o12_re = i12_re; o12_im = i12_im;
895  o20_re = i20_re; o20_im = i20_im;
896  o21_re = i21_re; o21_im = i21_im;
897  o22_re = i22_re; o22_im = i22_im;
898  o30_re = i30_re; o30_im = i30_im;
899  o31_re = i31_re; o31_im = i31_im;
900  o32_re = i32_re; o32_im = i32_im;
901 }
902 #endif // MULTI_GPU
903 
904 
905 #ifdef MULTI_GPU
906 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
907  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
908 #endif
909 {
910  // Projector P0-
911  // 1 0 0 -i
912  // 0 1 -i 0
913  // 0 i 1 0
914  // i 0 0 1
915 
916 #ifdef MULTI_GPU
917  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
918  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
919 #else
920  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
921 #endif
922 
923  const int ga_idx = sid;
924 
931 
932 #ifdef MULTI_GPU
933  if (kernel_type == INTERIOR_KERNEL) {
934 #endif
935 
936  // read spinor from device memory
937  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
938 
939  // project spinor into half spinors
940  a0_re = +i00_re+i30_im;
941  a0_im = +i00_im-i30_re;
942  a1_re = +i01_re+i31_im;
943  a1_im = +i01_im-i31_re;
944  a2_re = +i02_re+i32_im;
945  a2_im = +i02_im-i32_re;
946  b0_re = +i10_re+i20_im;
947  b0_im = +i10_im-i20_re;
948  b1_re = +i11_re+i21_im;
949  b1_im = +i11_im-i21_re;
950  b2_re = +i12_re+i22_im;
951  b2_im = +i12_im-i22_re;
952 
953 #ifdef MULTI_GPU
954  } else {
955 
956  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
957 
958  // read half spinor from device memory
959  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
960 
961  a0_re = i00_re; a0_im = i00_im;
962  a1_re = i01_re; a1_im = i01_im;
963  a2_re = i02_re; a2_im = i02_im;
964  b0_re = i10_re; b0_im = i10_im;
965  b1_re = i11_re; b1_im = i11_im;
966  b2_re = i12_re; b2_im = i12_im;
967 
968  }
969 #endif // MULTI_GPU
970 
971  // read gauge matrix from device memory
972  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
973 
974  // reconstruct gauge matrix
976 
977  // multiply row 0
979  A0_re += g00_re * a0_re;
980  A0_re -= g00_im * a0_im;
981  A0_re += g01_re * a1_re;
982  A0_re -= g01_im * a1_im;
983  A0_re += g02_re * a2_re;
984  A0_re -= g02_im * a2_im;
986  A0_im += g00_re * a0_im;
987  A0_im += g00_im * a0_re;
988  A0_im += g01_re * a1_im;
989  A0_im += g01_im * a1_re;
990  A0_im += g02_re * a2_im;
991  A0_im += g02_im * a2_re;
993  B0_re += g00_re * b0_re;
994  B0_re -= g00_im * b0_im;
995  B0_re += g01_re * b1_re;
996  B0_re -= g01_im * b1_im;
997  B0_re += g02_re * b2_re;
998  B0_re -= g02_im * b2_im;
1000  B0_im += g00_re * b0_im;
1001  B0_im += g00_im * b0_re;
1002  B0_im += g01_re * b1_im;
1003  B0_im += g01_im * b1_re;
1004  B0_im += g02_re * b2_im;
1005  B0_im += g02_im * b2_re;
1006 
1007  // multiply row 1
1009  A1_re += g10_re * a0_re;
1010  A1_re -= g10_im * a0_im;
1011  A1_re += g11_re * a1_re;
1012  A1_re -= g11_im * a1_im;
1013  A1_re += g12_re * a2_re;
1014  A1_re -= g12_im * a2_im;
1016  A1_im += g10_re * a0_im;
1017  A1_im += g10_im * a0_re;
1018  A1_im += g11_re * a1_im;
1019  A1_im += g11_im * a1_re;
1020  A1_im += g12_re * a2_im;
1021  A1_im += g12_im * a2_re;
1023  B1_re += g10_re * b0_re;
1024  B1_re -= g10_im * b0_im;
1025  B1_re += g11_re * b1_re;
1026  B1_re -= g11_im * b1_im;
1027  B1_re += g12_re * b2_re;
1028  B1_re -= g12_im * b2_im;
1030  B1_im += g10_re * b0_im;
1031  B1_im += g10_im * b0_re;
1032  B1_im += g11_re * b1_im;
1033  B1_im += g11_im * b1_re;
1034  B1_im += g12_re * b2_im;
1035  B1_im += g12_im * b2_re;
1036 
1037  // multiply row 2
1039  A2_re += g20_re * a0_re;
1040  A2_re -= g20_im * a0_im;
1041  A2_re += g21_re * a1_re;
1042  A2_re -= g21_im * a1_im;
1043  A2_re += g22_re * a2_re;
1044  A2_re -= g22_im * a2_im;
1046  A2_im += g20_re * a0_im;
1047  A2_im += g20_im * a0_re;
1048  A2_im += g21_re * a1_im;
1049  A2_im += g21_im * a1_re;
1050  A2_im += g22_re * a2_im;
1051  A2_im += g22_im * a2_re;
1053  B2_re += g20_re * b0_re;
1054  B2_re -= g20_im * b0_im;
1055  B2_re += g21_re * b1_re;
1056  B2_re -= g21_im * b1_im;
1057  B2_re += g22_re * b2_re;
1058  B2_re -= g22_im * b2_im;
1060  B2_im += g20_re * b0_im;
1061  B2_im += g20_im * b0_re;
1062  B2_im += g21_re * b1_im;
1063  B2_im += g21_im * b1_re;
1064  B2_im += g22_re * b2_im;
1065  B2_im += g22_im * b2_re;
1066 
1067  o00_re += a*A0_re;
1068  o00_im += a*A0_im;
1069  o10_re += a*B0_re;
1070  o10_im += a*B0_im;
1071  o20_re -= a*B0_im;
1072  o20_im += a*B0_re;
1073  o30_re -= a*A0_im;
1074  o30_im += a*A0_re;
1075 
1076  o01_re += a*A1_re;
1077  o01_im += a*A1_im;
1078  o11_re += a*B1_re;
1079  o11_im += a*B1_im;
1080  o21_re -= a*B1_im;
1081  o21_im += a*B1_re;
1082  o31_re -= a*A1_im;
1083  o31_im += a*A1_re;
1084 
1085  o02_re += a*A2_re;
1086  o02_im += a*A2_im;
1087  o12_re += a*B2_re;
1088  o12_im += a*B2_im;
1089  o22_re -= a*B2_im;
1090  o22_im += a*B2_re;
1091  o32_re -= a*A2_im;
1092  o32_im += a*A2_re;
1093 
1094 }
1095 
// X-direction backward hop of the (generated) asymmetric Wilson-clover dslash.
// Gathers the neighbor spinor at x1-1 (interior sites) or the pre-projected
// ghost half spinor for this face (MULTI_GPU exterior kernel), applies the
// P0+ projector tabulated below, multiplies both half spinors by the gT??
// link components (the gT naming suggests the conjugate-transposed link, as
// expected for a backward hop -- defined by the generated gauge macros), and
// accumulates a * (result) into the output registers o??_re/_im.
// NOTE(review): the leading numerals on every line, and the jumps in them
// (1119 -> 1126, 1169 -> 1171), are artifacts of the documentation scrape;
// at least the gauge-reconstruction macro call after the "reconstruct gauge
// matrix" comment appears to have been lost -- restore from the generated
// source before compiling.
1096 #ifdef MULTI_GPU
1097 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
1098  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
1099 #endif
1100 {
1101  // Projector P0+
1102  // 1 0 0 i
1103  // 0 1 i 0
1104  // 0 -i 1 0
1105  // -i 0 0 1
1106 
1107 #ifdef MULTI_GPU
 // Interior: checkerboarded neighbor-site index (periodic wrap at x1==0);
 // exterior: index into the packed ghost zone for this face.
1108  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
1109  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1110 #else
1111  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
1112 #endif
1113 
1114 #ifdef MULTI_GPU
 // Backward hop: the link lives at the neighbor site; exterior faces read
 // from the padded gauge region starting at Vh.
1115  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1116 #else
1117  const int ga_idx = sp_idx;
1118 #endif
1119 
1126 
1127 #ifdef MULTI_GPU
1128  if (kernel_type == INTERIOR_KERNEL) {
1129 #endif
1130 
1131  // read spinor from device memory
1132  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1133 
1134  // project spinor into half spinors
 // P0+ collapses the four spin components into two half spinors (a, b),
 // mixing in +/- i times the lower components per the matrix above.
1135  a0_re = +i00_re-i30_im;
1136  a0_im = +i00_im+i30_re;
1137  a1_re = +i01_re-i31_im;
1138  a1_im = +i01_im+i31_re;
1139  a2_re = +i02_re-i32_im;
1140  a2_im = +i02_im+i32_re;
1141  b0_re = +i10_re-i20_im;
1142  b0_im = +i10_im+i20_re;
1143  b1_re = +i11_re-i21_im;
1144  b1_im = +i11_im+i21_re;
1145  b2_re = +i12_re-i22_im;
1146  b2_im = +i12_im+i22_re;
1147 
1148 #ifdef MULTI_GPU
1149  } else {
1150 
1151  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1152 
1153  // read half spinor from device memory
 // Ghost-zone spinors arrive already projected, so they are copied as-is.
1154  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1155 
1156  a0_re = i00_re; a0_im = i00_im;
1157  a1_re = i01_re; a1_im = i01_im;
1158  a2_re = i02_re; a2_im = i02_im;
1159  b0_re = i10_re; b0_im = i10_im;
1160  b1_re = i11_re; b1_im = i11_im;
1161  b2_re = i12_re; b2_im = i12_im;
1162 
1163  }
1164 #endif // MULTI_GPU
1165 
1166  // read gauge matrix from device memory
1167  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
1168 
1169  // reconstruct gauge matrix
 // NOTE(review): embedded numbering jumps 1169 -> 1171 here; the gauge
 // reconstruction macro invocation was presumably lost in extraction.
1171 
1172  // multiply row 0
 // Fully unrolled complex 3x3 matrix-vector products: A = gT * a, B = gT * b.
1173  spinorFloat A0_re = 0;
1174  A0_re += gT00_re * a0_re;
1175  A0_re -= gT00_im * a0_im;
1176  A0_re += gT01_re * a1_re;
1177  A0_re -= gT01_im * a1_im;
1178  A0_re += gT02_re * a2_re;
1179  A0_re -= gT02_im * a2_im;
1180  spinorFloat A0_im = 0;
1181  A0_im += gT00_re * a0_im;
1182  A0_im += gT00_im * a0_re;
1183  A0_im += gT01_re * a1_im;
1184  A0_im += gT01_im * a1_re;
1185  A0_im += gT02_re * a2_im;
1186  A0_im += gT02_im * a2_re;
1187  spinorFloat B0_re = 0;
1188  B0_re += gT00_re * b0_re;
1189  B0_re -= gT00_im * b0_im;
1190  B0_re += gT01_re * b1_re;
1191  B0_re -= gT01_im * b1_im;
1192  B0_re += gT02_re * b2_re;
1193  B0_re -= gT02_im * b2_im;
1194  spinorFloat B0_im = 0;
1195  B0_im += gT00_re * b0_im;
1196  B0_im += gT00_im * b0_re;
1197  B0_im += gT01_re * b1_im;
1198  B0_im += gT01_im * b1_re;
1199  B0_im += gT02_re * b2_im;
1200  B0_im += gT02_im * b2_re;
1201 
1202  // multiply row 1
1203  spinorFloat A1_re = 0;
1204  A1_re += gT10_re * a0_re;
1205  A1_re -= gT10_im * a0_im;
1206  A1_re += gT11_re * a1_re;
1207  A1_re -= gT11_im * a1_im;
1208  A1_re += gT12_re * a2_re;
1209  A1_re -= gT12_im * a2_im;
1210  spinorFloat A1_im = 0;
1211  A1_im += gT10_re * a0_im;
1212  A1_im += gT10_im * a0_re;
1213  A1_im += gT11_re * a1_im;
1214  A1_im += gT11_im * a1_re;
1215  A1_im += gT12_re * a2_im;
1216  A1_im += gT12_im * a2_re;
1217  spinorFloat B1_re = 0;
1218  B1_re += gT10_re * b0_re;
1219  B1_re -= gT10_im * b0_im;
1220  B1_re += gT11_re * b1_re;
1221  B1_re -= gT11_im * b1_im;
1222  B1_re += gT12_re * b2_re;
1223  B1_re -= gT12_im * b2_im;
1224  spinorFloat B1_im = 0;
1225  B1_im += gT10_re * b0_im;
1226  B1_im += gT10_im * b0_re;
1227  B1_im += gT11_re * b1_im;
1228  B1_im += gT11_im * b1_re;
1229  B1_im += gT12_re * b2_im;
1230  B1_im += gT12_im * b2_re;
1231 
1232  // multiply row 2
1233  spinorFloat A2_re = 0;
1234  A2_re += gT20_re * a0_re;
1235  A2_re -= gT20_im * a0_im;
1236  A2_re += gT21_re * a1_re;
1237  A2_re -= gT21_im * a1_im;
1238  A2_re += gT22_re * a2_re;
1239  A2_re -= gT22_im * a2_im;
1240  spinorFloat A2_im = 0;
1241  A2_im += gT20_re * a0_im;
1242  A2_im += gT20_im * a0_re;
1243  A2_im += gT21_re * a1_im;
1244  A2_im += gT21_im * a1_re;
1245  A2_im += gT22_re * a2_im;
1246  A2_im += gT22_im * a2_re;
1247  spinorFloat B2_re = 0;
1248  B2_re += gT20_re * b0_re;
1249  B2_re -= gT20_im * b0_im;
1250  B2_re += gT21_re * b1_re;
1251  B2_re -= gT21_im * b1_im;
1252  B2_re += gT22_re * b2_re;
1253  B2_re -= gT22_im * b2_im;
1254  spinorFloat B2_im = 0;
1255  B2_im += gT20_re * b0_im;
1256  B2_im += gT20_im * b0_re;
1257  B2_im += gT21_re * b1_im;
1258  B2_im += gT21_im * b1_re;
1259  B2_im += gT22_re * b2_im;
1260  B2_im += gT22_im * b2_re;
1261 
 // Reassemble all four output spin components from (A, B) using the P0+
 // structure above; 'a' is the overall hop coefficient.
1262  o00_re += a*A0_re;
1263  o00_im += a*A0_im;
1264  o10_re += a*B0_re;
1265  o10_im += a*B0_im;
1266  o20_re += a*B0_im;
1267  o20_im -= a*B0_re;
1268  o30_re += a*A0_im;
1269  o30_im -= a*A0_re;
1270 
1271  o01_re += a*A1_re;
1272  o01_im += a*A1_im;
1273  o11_re += a*B1_re;
1274  o11_im += a*B1_im;
1275  o21_re += a*B1_im;
1276  o21_im -= a*B1_re;
1277  o31_re += a*A1_im;
1278  o31_im -= a*A1_re;
1279 
1280  o02_re += a*A2_re;
1281  o02_im += a*A2_im;
1282  o12_re += a*B2_re;
1283  o12_im += a*B2_im;
1284  o22_re += a*B2_im;
1285  o22_im -= a*B2_re;
1286  o32_re += a*A2_im;
1287  o32_im -= a*A2_re;
1288 
1289 }
1290 
// Y-direction forward hop (projector P1-). Gathers the spinor at x2+1 (or
// the ghost half spinor on the boundary), projects with P1-, multiplies by
// the g?? link components stored at this site (ga_idx == sid, as expected
// for a forward hop), and accumulates a * (result) into the output.
// NOTE(review): embedded line numbers jump 1310 -> 1317 and 1360 -> 1362;
// lines (presumably the gauge-reconstruction macro call) were lost in the
// extraction -- restore from the generated source before compiling.
1291 #ifdef MULTI_GPU
1292 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1293  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1294 #endif
1295 {
1296  // Projector P1-
1297  // 1 0 0 -1
1298  // 0 1 1 0
1299  // 0 1 1 0
1300  // -1 0 0 1
1301 
1302 #ifdef MULTI_GPU
 // Interior: neighbor site at x2+1 (periodic wrap at x2==X2m1);
 // exterior: index into the packed ghost zone.
1303  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1304  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1305 #else
1306  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1307 #endif
1308 
1309  const int ga_idx = sid;
1310 
1317 
1318 #ifdef MULTI_GPU
1319  if (kernel_type == INTERIOR_KERNEL) {
1320 #endif
1321 
1322  // read spinor from device memory
1323  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1324 
1325  // project spinor into half spinors
 // P1- mixes upper and lower spin components with real coefficients only.
1326  a0_re = +i00_re-i30_re;
1327  a0_im = +i00_im-i30_im;
1328  a1_re = +i01_re-i31_re;
1329  a1_im = +i01_im-i31_im;
1330  a2_re = +i02_re-i32_re;
1331  a2_im = +i02_im-i32_im;
1332  b0_re = +i10_re+i20_re;
1333  b0_im = +i10_im+i20_im;
1334  b1_re = +i11_re+i21_re;
1335  b1_im = +i11_im+i21_im;
1336  b2_re = +i12_re+i22_re;
1337  b2_im = +i12_im+i22_im;
1338 
1339 #ifdef MULTI_GPU
1340  } else {
1341 
1342  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1343 
1344  // read half spinor from device memory
 // Forward-face ghosts live in the second half of the ghost buffer,
 // hence the (SPINOR_HOP/2)*sp_stride_pad offset.
1345  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1346 
1347  a0_re = i00_re; a0_im = i00_im;
1348  a1_re = i01_re; a1_im = i01_im;
1349  a2_re = i02_re; a2_im = i02_im;
1350  b0_re = i10_re; b0_im = i10_im;
1351  b1_re = i11_re; b1_im = i11_im;
1352  b2_re = i12_re; b2_im = i12_im;
1353 
1354  }
1355 #endif // MULTI_GPU
1356 
1357  // read gauge matrix from device memory
1358  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1359 
1360  // reconstruct gauge matrix
 // NOTE(review): numbering jumps 1360 -> 1362; reconstruction macro call
 // presumably lost here.
1362 
1363  // multiply row 0
 // Unrolled complex 3x3 products: A = g * a, B = g * b.
1364  spinorFloat A0_re = 0;
1365  A0_re += g00_re * a0_re;
1366  A0_re -= g00_im * a0_im;
1367  A0_re += g01_re * a1_re;
1368  A0_re -= g01_im * a1_im;
1369  A0_re += g02_re * a2_re;
1370  A0_re -= g02_im * a2_im;
1371  spinorFloat A0_im = 0;
1372  A0_im += g00_re * a0_im;
1373  A0_im += g00_im * a0_re;
1374  A0_im += g01_re * a1_im;
1375  A0_im += g01_im * a1_re;
1376  A0_im += g02_re * a2_im;
1377  A0_im += g02_im * a2_re;
1378  spinorFloat B0_re = 0;
1379  B0_re += g00_re * b0_re;
1380  B0_re -= g00_im * b0_im;
1381  B0_re += g01_re * b1_re;
1382  B0_re -= g01_im * b1_im;
1383  B0_re += g02_re * b2_re;
1384  B0_re -= g02_im * b2_im;
1385  spinorFloat B0_im = 0;
1386  B0_im += g00_re * b0_im;
1387  B0_im += g00_im * b0_re;
1388  B0_im += g01_re * b1_im;
1389  B0_im += g01_im * b1_re;
1390  B0_im += g02_re * b2_im;
1391  B0_im += g02_im * b2_re;
1392 
1393  // multiply row 1
1394  spinorFloat A1_re = 0;
1395  A1_re += g10_re * a0_re;
1396  A1_re -= g10_im * a0_im;
1397  A1_re += g11_re * a1_re;
1398  A1_re -= g11_im * a1_im;
1399  A1_re += g12_re * a2_re;
1400  A1_re -= g12_im * a2_im;
1401  spinorFloat A1_im = 0;
1402  A1_im += g10_re * a0_im;
1403  A1_im += g10_im * a0_re;
1404  A1_im += g11_re * a1_im;
1405  A1_im += g11_im * a1_re;
1406  A1_im += g12_re * a2_im;
1407  A1_im += g12_im * a2_re;
1408  spinorFloat B1_re = 0;
1409  B1_re += g10_re * b0_re;
1410  B1_re -= g10_im * b0_im;
1411  B1_re += g11_re * b1_re;
1412  B1_re -= g11_im * b1_im;
1413  B1_re += g12_re * b2_re;
1414  B1_re -= g12_im * b2_im;
1415  spinorFloat B1_im = 0;
1416  B1_im += g10_re * b0_im;
1417  B1_im += g10_im * b0_re;
1418  B1_im += g11_re * b1_im;
1419  B1_im += g11_im * b1_re;
1420  B1_im += g12_re * b2_im;
1421  B1_im += g12_im * b2_re;
1422 
1423  // multiply row 2
1424  spinorFloat A2_re = 0;
1425  A2_re += g20_re * a0_re;
1426  A2_re -= g20_im * a0_im;
1427  A2_re += g21_re * a1_re;
1428  A2_re -= g21_im * a1_im;
1429  A2_re += g22_re * a2_re;
1430  A2_re -= g22_im * a2_im;
1431  spinorFloat A2_im = 0;
1432  A2_im += g20_re * a0_im;
1433  A2_im += g20_im * a0_re;
1434  A2_im += g21_re * a1_im;
1435  A2_im += g21_im * a1_re;
1436  A2_im += g22_re * a2_im;
1437  A2_im += g22_im * a2_re;
1438  spinorFloat B2_re = 0;
1439  B2_re += g20_re * b0_re;
1440  B2_re -= g20_im * b0_im;
1441  B2_re += g21_re * b1_re;
1442  B2_re -= g21_im * b1_im;
1443  B2_re += g22_re * b2_re;
1444  B2_re -= g22_im * b2_im;
1445  spinorFloat B2_im = 0;
1446  B2_im += g20_re * b0_im;
1447  B2_im += g20_im * b0_re;
1448  B2_im += g21_re * b1_im;
1449  B2_im += g21_im * b1_re;
1450  B2_im += g22_re * b2_im;
1451  B2_im += g22_im * b2_re;
1452 
 // Reassemble the four spin components using the P1- structure above.
1453  o00_re += a*A0_re;
1454  o00_im += a*A0_im;
1455  o10_re += a*B0_re;
1456  o10_im += a*B0_im;
1457  o20_re += a*B0_re;
1458  o20_im += a*B0_im;
1459  o30_re -= a*A0_re;
1460  o30_im -= a*A0_im;
1461 
1462  o01_re += a*A1_re;
1463  o01_im += a*A1_im;
1464  o11_re += a*B1_re;
1465  o11_im += a*B1_im;
1466  o21_re += a*B1_re;
1467  o21_im += a*B1_im;
1468  o31_re -= a*A1_re;
1469  o31_im -= a*A1_im;
1470 
1471  o02_re += a*A2_re;
1472  o02_im += a*A2_im;
1473  o12_re += a*B2_re;
1474  o12_im += a*B2_im;
1475  o22_re += a*B2_re;
1476  o22_im += a*B2_im;
1477  o32_re -= a*A2_re;
1478  o32_im -= a*A2_im;
1479 
1480 }
1481 
// Y-direction backward hop (projector P1+). Gathers the spinor at x2-1 (or
// the ghost half spinor), projects with P1+, multiplies by the gT?? link
// components (conjugate-transposed link of the neighbor site, judging by
// the gT naming), and accumulates a * (result) into the output.
// NOTE(review): embedded line numbers jump 1505 -> 1512 and 1555 -> 1557;
// lines (presumably the gauge-reconstruction macro call) were lost in the
// extraction -- restore from the generated source before compiling.
1482 #ifdef MULTI_GPU
1483 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1484  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1485 #endif
1486 {
1487  // Projector P1+
1488  // 1 0 0 1
1489  // 0 1 -1 0
1490  // 0 -1 1 0
1491  // 1 0 0 1
1492 
1493 #ifdef MULTI_GPU
 // Interior: neighbor site at x2-1 (periodic wrap at x2==0);
 // exterior: index into the packed ghost zone.
1494  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1495  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1496 #else
1497  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1498 #endif
1499 
1500 #ifdef MULTI_GPU
 // Backward hop: link stored at the neighbor site; exterior faces use the
 // padded gauge region starting at Vh.
1501  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1502 #else
1503  const int ga_idx = sp_idx;
1504 #endif
1505 
1512 
1513 #ifdef MULTI_GPU
1514  if (kernel_type == INTERIOR_KERNEL) {
1515 #endif
1516 
1517  // read spinor from device memory
1518  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1519 
1520  // project spinor into half spinors
1521  a0_re = +i00_re+i30_re;
1522  a0_im = +i00_im+i30_im;
1523  a1_re = +i01_re+i31_re;
1524  a1_im = +i01_im+i31_im;
1525  a2_re = +i02_re+i32_re;
1526  a2_im = +i02_im+i32_im;
1527  b0_re = +i10_re-i20_re;
1528  b0_im = +i10_im-i20_im;
1529  b1_re = +i11_re-i21_re;
1530  b1_im = +i11_im-i21_im;
1531  b2_re = +i12_re-i22_re;
1532  b2_im = +i12_im-i22_im;
1533 
1534 #ifdef MULTI_GPU
1535  } else {
1536 
1537  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1538 
1539  // read half spinor from device memory
 // Ghost spinors arrive already projected; copy them over unchanged.
1540  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1541 
1542  a0_re = i00_re; a0_im = i00_im;
1543  a1_re = i01_re; a1_im = i01_im;
1544  a2_re = i02_re; a2_im = i02_im;
1545  b0_re = i10_re; b0_im = i10_im;
1546  b1_re = i11_re; b1_im = i11_im;
1547  b2_re = i12_re; b2_im = i12_im;
1548 
1549  }
1550 #endif // MULTI_GPU
1551 
1552  // read gauge matrix from device memory
1553  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1554 
1555  // reconstruct gauge matrix
 // NOTE(review): numbering jumps 1555 -> 1557; reconstruction macro call
 // presumably lost here.
1557 
1558  // multiply row 0
 // Unrolled complex 3x3 products: A = gT * a, B = gT * b.
1559  spinorFloat A0_re = 0;
1560  A0_re += gT00_re * a0_re;
1561  A0_re -= gT00_im * a0_im;
1562  A0_re += gT01_re * a1_re;
1563  A0_re -= gT01_im * a1_im;
1564  A0_re += gT02_re * a2_re;
1565  A0_re -= gT02_im * a2_im;
1566  spinorFloat A0_im = 0;
1567  A0_im += gT00_re * a0_im;
1568  A0_im += gT00_im * a0_re;
1569  A0_im += gT01_re * a1_im;
1570  A0_im += gT01_im * a1_re;
1571  A0_im += gT02_re * a2_im;
1572  A0_im += gT02_im * a2_re;
1573  spinorFloat B0_re = 0;
1574  B0_re += gT00_re * b0_re;
1575  B0_re -= gT00_im * b0_im;
1576  B0_re += gT01_re * b1_re;
1577  B0_re -= gT01_im * b1_im;
1578  B0_re += gT02_re * b2_re;
1579  B0_re -= gT02_im * b2_im;
1580  spinorFloat B0_im = 0;
1581  B0_im += gT00_re * b0_im;
1582  B0_im += gT00_im * b0_re;
1583  B0_im += gT01_re * b1_im;
1584  B0_im += gT01_im * b1_re;
1585  B0_im += gT02_re * b2_im;
1586  B0_im += gT02_im * b2_re;
1587 
1588  // multiply row 1
1589  spinorFloat A1_re = 0;
1590  A1_re += gT10_re * a0_re;
1591  A1_re -= gT10_im * a0_im;
1592  A1_re += gT11_re * a1_re;
1593  A1_re -= gT11_im * a1_im;
1594  A1_re += gT12_re * a2_re;
1595  A1_re -= gT12_im * a2_im;
1596  spinorFloat A1_im = 0;
1597  A1_im += gT10_re * a0_im;
1598  A1_im += gT10_im * a0_re;
1599  A1_im += gT11_re * a1_im;
1600  A1_im += gT11_im * a1_re;
1601  A1_im += gT12_re * a2_im;
1602  A1_im += gT12_im * a2_re;
1603  spinorFloat B1_re = 0;
1604  B1_re += gT10_re * b0_re;
1605  B1_re -= gT10_im * b0_im;
1606  B1_re += gT11_re * b1_re;
1607  B1_re -= gT11_im * b1_im;
1608  B1_re += gT12_re * b2_re;
1609  B1_re -= gT12_im * b2_im;
1610  spinorFloat B1_im = 0;
1611  B1_im += gT10_re * b0_im;
1612  B1_im += gT10_im * b0_re;
1613  B1_im += gT11_re * b1_im;
1614  B1_im += gT11_im * b1_re;
1615  B1_im += gT12_re * b2_im;
1616  B1_im += gT12_im * b2_re;
1617 
1618  // multiply row 2
1619  spinorFloat A2_re = 0;
1620  A2_re += gT20_re * a0_re;
1621  A2_re -= gT20_im * a0_im;
1622  A2_re += gT21_re * a1_re;
1623  A2_re -= gT21_im * a1_im;
1624  A2_re += gT22_re * a2_re;
1625  A2_re -= gT22_im * a2_im;
1626  spinorFloat A2_im = 0;
1627  A2_im += gT20_re * a0_im;
1628  A2_im += gT20_im * a0_re;
1629  A2_im += gT21_re * a1_im;
1630  A2_im += gT21_im * a1_re;
1631  A2_im += gT22_re * a2_im;
1632  A2_im += gT22_im * a2_re;
1633  spinorFloat B2_re = 0;
1634  B2_re += gT20_re * b0_re;
1635  B2_re -= gT20_im * b0_im;
1636  B2_re += gT21_re * b1_re;
1637  B2_re -= gT21_im * b1_im;
1638  B2_re += gT22_re * b2_re;
1639  B2_re -= gT22_im * b2_im;
1640  spinorFloat B2_im = 0;
1641  B2_im += gT20_re * b0_im;
1642  B2_im += gT20_im * b0_re;
1643  B2_im += gT21_re * b1_im;
1644  B2_im += gT21_im * b1_re;
1645  B2_im += gT22_re * b2_im;
1646  B2_im += gT22_im * b2_re;
1647 
 // Reassemble the four spin components using the P1+ structure above.
1648  o00_re += a*A0_re;
1649  o00_im += a*A0_im;
1650  o10_re += a*B0_re;
1651  o10_im += a*B0_im;
1652  o20_re -= a*B0_re;
1653  o20_im -= a*B0_im;
1654  o30_re += a*A0_re;
1655  o30_im += a*A0_im;
1656 
1657  o01_re += a*A1_re;
1658  o01_im += a*A1_im;
1659  o11_re += a*B1_re;
1660  o11_im += a*B1_im;
1661  o21_re -= a*B1_re;
1662  o21_im -= a*B1_im;
1663  o31_re += a*A1_re;
1664  o31_im += a*A1_im;
1665 
1666  o02_re += a*A2_re;
1667  o02_im += a*A2_im;
1668  o12_re += a*B2_re;
1669  o12_im += a*B2_im;
1670  o22_re -= a*B2_re;
1671  o22_im -= a*B2_im;
1672  o32_re += a*A2_re;
1673  o32_im += a*A2_im;
1674 
1675 }
1676 
// Z-direction forward hop (projector P2-). Gathers the spinor at x3+1 (or
// the ghost half spinor), projects with P2-, multiplies by the g?? link at
// this site (ga_idx == sid), and accumulates a * (result) into the output.
// NOTE(review): embedded line numbers jump 1696 -> 1703 and 1746 -> 1748;
// lines (presumably the gauge-reconstruction macro call) were lost in the
// extraction -- restore from the generated source before compiling.
1677 #ifdef MULTI_GPU
1678 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1679  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1680 #endif
1681 {
1682  // Projector P2-
1683  // 1 0 -i 0
1684  // 0 1 0 i
1685  // i 0 1 0
1686  // 0 -i 0 1
1687 
1688 #ifdef MULTI_GPU
 // Interior: neighbor site at x3+1 (periodic wrap at x3==X3m1);
 // exterior: index into the packed ghost zone.
1689  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1690  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1691 #else
1692  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1693 #endif
1694 
1695  const int ga_idx = sid;
1696 
1703 
1704 #ifdef MULTI_GPU
1705  if (kernel_type == INTERIOR_KERNEL) {
1706 #endif
1707 
1708  // read spinor from device memory
1709  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1710 
1711  // project spinor into half spinors
 // P2- mixes components with +/- i coefficients per the matrix above.
1712  a0_re = +i00_re+i20_im;
1713  a0_im = +i00_im-i20_re;
1714  a1_re = +i01_re+i21_im;
1715  a1_im = +i01_im-i21_re;
1716  a2_re = +i02_re+i22_im;
1717  a2_im = +i02_im-i22_re;
1718  b0_re = +i10_re-i30_im;
1719  b0_im = +i10_im+i30_re;
1720  b1_re = +i11_re-i31_im;
1721  b1_im = +i11_im+i31_re;
1722  b2_re = +i12_re-i32_im;
1723  b2_im = +i12_im+i32_re;
1724 
1725 #ifdef MULTI_GPU
1726  } else {
1727 
1728  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1729 
1730  // read half spinor from device memory
 // Forward-face ghosts sit in the second half of the ghost buffer.
1731  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1732 
1733  a0_re = i00_re; a0_im = i00_im;
1734  a1_re = i01_re; a1_im = i01_im;
1735  a2_re = i02_re; a2_im = i02_im;
1736  b0_re = i10_re; b0_im = i10_im;
1737  b1_re = i11_re; b1_im = i11_im;
1738  b2_re = i12_re; b2_im = i12_im;
1739 
1740  }
1741 #endif // MULTI_GPU
1742 
1743  // read gauge matrix from device memory
1744  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1745 
1746  // reconstruct gauge matrix
 // NOTE(review): numbering jumps 1746 -> 1748; reconstruction macro call
 // presumably lost here.
1748 
1749  // multiply row 0
 // Unrolled complex 3x3 products: A = g * a, B = g * b.
1750  spinorFloat A0_re = 0;
1751  A0_re += g00_re * a0_re;
1752  A0_re -= g00_im * a0_im;
1753  A0_re += g01_re * a1_re;
1754  A0_re -= g01_im * a1_im;
1755  A0_re += g02_re * a2_re;
1756  A0_re -= g02_im * a2_im;
1757  spinorFloat A0_im = 0;
1758  A0_im += g00_re * a0_im;
1759  A0_im += g00_im * a0_re;
1760  A0_im += g01_re * a1_im;
1761  A0_im += g01_im * a1_re;
1762  A0_im += g02_re * a2_im;
1763  A0_im += g02_im * a2_re;
1764  spinorFloat B0_re = 0;
1765  B0_re += g00_re * b0_re;
1766  B0_re -= g00_im * b0_im;
1767  B0_re += g01_re * b1_re;
1768  B0_re -= g01_im * b1_im;
1769  B0_re += g02_re * b2_re;
1770  B0_re -= g02_im * b2_im;
1771  spinorFloat B0_im = 0;
1772  B0_im += g00_re * b0_im;
1773  B0_im += g00_im * b0_re;
1774  B0_im += g01_re * b1_im;
1775  B0_im += g01_im * b1_re;
1776  B0_im += g02_re * b2_im;
1777  B0_im += g02_im * b2_re;
1778 
1779  // multiply row 1
1780  spinorFloat A1_re = 0;
1781  A1_re += g10_re * a0_re;
1782  A1_re -= g10_im * a0_im;
1783  A1_re += g11_re * a1_re;
1784  A1_re -= g11_im * a1_im;
1785  A1_re += g12_re * a2_re;
1786  A1_re -= g12_im * a2_im;
1787  spinorFloat A1_im = 0;
1788  A1_im += g10_re * a0_im;
1789  A1_im += g10_im * a0_re;
1790  A1_im += g11_re * a1_im;
1791  A1_im += g11_im * a1_re;
1792  A1_im += g12_re * a2_im;
1793  A1_im += g12_im * a2_re;
1794  spinorFloat B1_re = 0;
1795  B1_re += g10_re * b0_re;
1796  B1_re -= g10_im * b0_im;
1797  B1_re += g11_re * b1_re;
1798  B1_re -= g11_im * b1_im;
1799  B1_re += g12_re * b2_re;
1800  B1_re -= g12_im * b2_im;
1801  spinorFloat B1_im = 0;
1802  B1_im += g10_re * b0_im;
1803  B1_im += g10_im * b0_re;
1804  B1_im += g11_re * b1_im;
1805  B1_im += g11_im * b1_re;
1806  B1_im += g12_re * b2_im;
1807  B1_im += g12_im * b2_re;
1808 
1809  // multiply row 2
1810  spinorFloat A2_re = 0;
1811  A2_re += g20_re * a0_re;
1812  A2_re -= g20_im * a0_im;
1813  A2_re += g21_re * a1_re;
1814  A2_re -= g21_im * a1_im;
1815  A2_re += g22_re * a2_re;
1816  A2_re -= g22_im * a2_im;
1817  spinorFloat A2_im = 0;
1818  A2_im += g20_re * a0_im;
1819  A2_im += g20_im * a0_re;
1820  A2_im += g21_re * a1_im;
1821  A2_im += g21_im * a1_re;
1822  A2_im += g22_re * a2_im;
1823  A2_im += g22_im * a2_re;
1824  spinorFloat B2_re = 0;
1825  B2_re += g20_re * b0_re;
1826  B2_re -= g20_im * b0_im;
1827  B2_re += g21_re * b1_re;
1828  B2_re -= g21_im * b1_im;
1829  B2_re += g22_re * b2_re;
1830  B2_re -= g22_im * b2_im;
1831  spinorFloat B2_im = 0;
1832  B2_im += g20_re * b0_im;
1833  B2_im += g20_im * b0_re;
1834  B2_im += g21_re * b1_im;
1835  B2_im += g21_im * b1_re;
1836  B2_im += g22_re * b2_im;
1837  B2_im += g22_im * b2_re;
1838 
 // Reassemble the four spin components using the P2- structure above.
1839  o00_re += a*A0_re;
1840  o00_im += a*A0_im;
1841  o10_re += a*B0_re;
1842  o10_im += a*B0_im;
1843  o20_re -= a*A0_im;
1844  o20_im += a*A0_re;
1845  o30_re += a*B0_im;
1846  o30_im -= a*B0_re;
1847 
1848  o01_re += a*A1_re;
1849  o01_im += a*A1_im;
1850  o11_re += a*B1_re;
1851  o11_im += a*B1_im;
1852  o21_re -= a*A1_im;
1853  o21_im += a*A1_re;
1854  o31_re += a*B1_im;
1855  o31_im -= a*B1_re;
1856 
1857  o02_re += a*A2_re;
1858  o02_im += a*A2_im;
1859  o12_re += a*B2_re;
1860  o12_im += a*B2_im;
1861  o22_re -= a*A2_im;
1862  o22_im += a*A2_re;
1863  o32_re += a*B2_im;
1864  o32_im -= a*B2_re;
1865 
1866 }
1867 
// Z-direction backward hop (projector P2+). Gathers the spinor at x3-1 (or
// the ghost half spinor), projects with P2+, multiplies by the gT?? link
// components (conjugate-transposed link of the neighbor site, judging by
// the gT naming), and accumulates a * (result) into the output.
// NOTE(review): embedded line numbers jump 1891 -> 1898 and 1941 -> 1943;
// lines (presumably the gauge-reconstruction macro call) were lost in the
// extraction -- restore from the generated source before compiling.
1868 #ifdef MULTI_GPU
1869 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1870  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1871 #endif
1872 {
1873  // Projector P2+
1874  // 1 0 i 0
1875  // 0 1 0 -i
1876  // -i 0 1 0
1877  // 0 i 0 1
1878 
1879 #ifdef MULTI_GPU
 // Interior: neighbor site at x3-1 (periodic wrap at x3==0);
 // exterior: index into the packed ghost zone.
1880  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1881  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1882 #else
1883  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1884 #endif
1885 
1886 #ifdef MULTI_GPU
 // Backward hop: link stored at the neighbor site; exterior faces use the
 // padded gauge region starting at Vh.
1887  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1888 #else
1889  const int ga_idx = sp_idx;
1890 #endif
1891 
1898 
1899 #ifdef MULTI_GPU
1900  if (kernel_type == INTERIOR_KERNEL) {
1901 #endif
1902 
1903  // read spinor from device memory
1904  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1905 
1906  // project spinor into half spinors
1907  a0_re = +i00_re-i20_im;
1908  a0_im = +i00_im+i20_re;
1909  a1_re = +i01_re-i21_im;
1910  a1_im = +i01_im+i21_re;
1911  a2_re = +i02_re-i22_im;
1912  a2_im = +i02_im+i22_re;
1913  b0_re = +i10_re+i30_im;
1914  b0_im = +i10_im-i30_re;
1915  b1_re = +i11_re+i31_im;
1916  b1_im = +i11_im-i31_re;
1917  b2_re = +i12_re+i32_im;
1918  b2_im = +i12_im-i32_re;
1919 
1920 #ifdef MULTI_GPU
1921  } else {
1922 
1923  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1924 
1925  // read half spinor from device memory
 // Ghost spinors arrive already projected; copy them over unchanged.
1926  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1927 
1928  a0_re = i00_re; a0_im = i00_im;
1929  a1_re = i01_re; a1_im = i01_im;
1930  a2_re = i02_re; a2_im = i02_im;
1931  b0_re = i10_re; b0_im = i10_im;
1932  b1_re = i11_re; b1_im = i11_im;
1933  b2_re = i12_re; b2_im = i12_im;
1934 
1935  }
1936 #endif // MULTI_GPU
1937 
1938  // read gauge matrix from device memory
1939  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1940 
1941  // reconstruct gauge matrix
 // NOTE(review): numbering jumps 1941 -> 1943; reconstruction macro call
 // presumably lost here.
1943 
1944  // multiply row 0
 // Unrolled complex 3x3 products: A = gT * a, B = gT * b.
1945  spinorFloat A0_re = 0;
1946  A0_re += gT00_re * a0_re;
1947  A0_re -= gT00_im * a0_im;
1948  A0_re += gT01_re * a1_re;
1949  A0_re -= gT01_im * a1_im;
1950  A0_re += gT02_re * a2_re;
1951  A0_re -= gT02_im * a2_im;
1952  spinorFloat A0_im = 0;
1953  A0_im += gT00_re * a0_im;
1954  A0_im += gT00_im * a0_re;
1955  A0_im += gT01_re * a1_im;
1956  A0_im += gT01_im * a1_re;
1957  A0_im += gT02_re * a2_im;
1958  A0_im += gT02_im * a2_re;
1959  spinorFloat B0_re = 0;
1960  B0_re += gT00_re * b0_re;
1961  B0_re -= gT00_im * b0_im;
1962  B0_re += gT01_re * b1_re;
1963  B0_re -= gT01_im * b1_im;
1964  B0_re += gT02_re * b2_re;
1965  B0_re -= gT02_im * b2_im;
1966  spinorFloat B0_im = 0;
1967  B0_im += gT00_re * b0_im;
1968  B0_im += gT00_im * b0_re;
1969  B0_im += gT01_re * b1_im;
1970  B0_im += gT01_im * b1_re;
1971  B0_im += gT02_re * b2_im;
1972  B0_im += gT02_im * b2_re;
1973 
1974  // multiply row 1
1975  spinorFloat A1_re = 0;
1976  A1_re += gT10_re * a0_re;
1977  A1_re -= gT10_im * a0_im;
1978  A1_re += gT11_re * a1_re;
1979  A1_re -= gT11_im * a1_im;
1980  A1_re += gT12_re * a2_re;
1981  A1_re -= gT12_im * a2_im;
1982  spinorFloat A1_im = 0;
1983  A1_im += gT10_re * a0_im;
1984  A1_im += gT10_im * a0_re;
1985  A1_im += gT11_re * a1_im;
1986  A1_im += gT11_im * a1_re;
1987  A1_im += gT12_re * a2_im;
1988  A1_im += gT12_im * a2_re;
1989  spinorFloat B1_re = 0;
1990  B1_re += gT10_re * b0_re;
1991  B1_re -= gT10_im * b0_im;
1992  B1_re += gT11_re * b1_re;
1993  B1_re -= gT11_im * b1_im;
1994  B1_re += gT12_re * b2_re;
1995  B1_re -= gT12_im * b2_im;
1996  spinorFloat B1_im = 0;
1997  B1_im += gT10_re * b0_im;
1998  B1_im += gT10_im * b0_re;
1999  B1_im += gT11_re * b1_im;
2000  B1_im += gT11_im * b1_re;
2001  B1_im += gT12_re * b2_im;
2002  B1_im += gT12_im * b2_re;
2003 
2004  // multiply row 2
2005  spinorFloat A2_re = 0;
2006  A2_re += gT20_re * a0_re;
2007  A2_re -= gT20_im * a0_im;
2008  A2_re += gT21_re * a1_re;
2009  A2_re -= gT21_im * a1_im;
2010  A2_re += gT22_re * a2_re;
2011  A2_re -= gT22_im * a2_im;
2012  spinorFloat A2_im = 0;
2013  A2_im += gT20_re * a0_im;
2014  A2_im += gT20_im * a0_re;
2015  A2_im += gT21_re * a1_im;
2016  A2_im += gT21_im * a1_re;
2017  A2_im += gT22_re * a2_im;
2018  A2_im += gT22_im * a2_re;
2019  spinorFloat B2_re = 0;
2020  B2_re += gT20_re * b0_re;
2021  B2_re -= gT20_im * b0_im;
2022  B2_re += gT21_re * b1_re;
2023  B2_re -= gT21_im * b1_im;
2024  B2_re += gT22_re * b2_re;
2025  B2_re -= gT22_im * b2_im;
2026  spinorFloat B2_im = 0;
2027  B2_im += gT20_re * b0_im;
2028  B2_im += gT20_im * b0_re;
2029  B2_im += gT21_re * b1_im;
2030  B2_im += gT21_im * b1_re;
2031  B2_im += gT22_re * b2_im;
2032  B2_im += gT22_im * b2_re;
2033 
 // Reassemble the four spin components using the P2+ structure above.
2034  o00_re += a*A0_re;
2035  o00_im += a*A0_im;
2036  o10_re += a*B0_re;
2037  o10_im += a*B0_im;
2038  o20_re += a*A0_im;
2039  o20_im -= a*A0_re;
2040  o30_re -= a*B0_im;
2041  o30_im += a*B0_re;
2042 
2043  o01_re += a*A1_re;
2044  o01_im += a*A1_im;
2045  o11_re += a*B1_re;
2046  o11_im += a*B1_im;
2047  o21_re += a*A1_im;
2048  o21_im -= a*A1_re;
2049  o31_re -= a*B1_im;
2050  o31_im += a*B1_re;
2051 
2052  o02_re += a*A2_re;
2053  o02_im += a*A2_im;
2054  o12_re += a*B2_re;
2055  o12_im += a*B2_im;
2056  o22_re += a*A2_im;
2057  o22_im -= a*A2_re;
2058  o32_re -= a*B2_im;
2059  o32_im += a*B2_re;
2060 
2061 }
2062 
// T-direction forward hop (projector P3- = diag(0,0,2,2)). Only the lower
// two spin components survive, so only o2?/o3? are updated. Two paths:
// a fast path for temporal-gauge-fixed links away from the boundary (the
// link is the identity, no matrix multiply) and a general path with the
// full link multiply.
// NOTE(review): this section lost several lines in extraction --
//  * the condition before the first "{" (embedded numbering jumps
//    2082 -> 2084); by analogy with the backward-T block below it is
//    presumably "if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)";
//  * the identity-gauge assignments after "// identity gauge matrix"
//    (2132 -> 2139), which presumably define the A?_*, B?_* values
//    accumulated right after.
// Restore both from the generated source before compiling.
2063 #ifdef MULTI_GPU
2064 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
2065  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
2066 #endif
2067 {
2068  // Projector P3-
2069  // 0 0 0 0
2070  // 0 0 0 0
2071  // 0 0 2 0
2072  // 0 0 0 2
2073 
2074 #ifdef MULTI_GPU
 // Interior: neighbor site at x4+1 (periodic wrap at x4==X4m1);
 // exterior: index into the packed ghost zone.
2075  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
2076  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2077 #else
2078  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
2079 #endif
2080 
2081  const int ga_idx = sid;
2082 
 // NOTE(review): the gauge-fixed fast-path condition line is missing here
 // (extraction artifact); see block header comment.
2084  {
2091 
2092 #ifdef MULTI_GPU
2093  if (kernel_type == INTERIOR_KERNEL) {
2094 #endif
2095 
2096  // read spinor from device memory
 // READ_SPINOR_DOWN: only the lower spin components are needed under P3-.
2097  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2098 
2099  // project spinor into half spinors
2100  a0_re = +2*i20_re;
2101  a0_im = +2*i20_im;
2102  a1_re = +2*i21_re;
2103  a1_im = +2*i21_im;
2104  a2_re = +2*i22_re;
2105  a2_im = +2*i22_im;
2106  b0_re = +2*i30_re;
2107  b0_im = +2*i30_im;
2108  b1_re = +2*i31_re;
2109  b1_im = +2*i31_im;
2110  b2_re = +2*i32_re;
2111  b2_im = +2*i32_im;
2112 
2113 #ifdef MULTI_GPU
2114  } else {
2115 
2116  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
 // TPROJSCALE compensates for the factor-of-2 projector applied (or not)
 // when the ghost half spinor was packed.
2117  const int t_proj_scale = TPROJSCALE;
2118 
2119  // read half spinor from device memory
2120  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2121 
2122  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2123  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2124  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2125  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2126  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2127  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2128 
2129  }
2130 #endif // MULTI_GPU
2131 
2132  // identity gauge matrix
 // NOTE(review): numbering jumps 2132 -> 2139; the assignments defining
 // A?_*/B?_* from a?_*/b?_* were lost in extraction.
2139 
2140  o20_re += a*A0_re;
2141  o20_im += a*A0_im;
2142  o30_re += a*B0_re;
2143  o30_im += a*B0_im;
2144 
2145  o21_re += a*A1_re;
2146  o21_im += a*A1_im;
2147  o31_re += a*B1_re;
2148  o31_im += a*B1_im;
2149 
2150  o22_re += a*A2_re;
2151  o22_im += a*A2_im;
2152  o32_re += a*B2_re;
2153  o32_im += a*B2_im;
2154 
 // General path: same gather/projection, followed by the full link multiply.
2155  } else {
2162 
2163 #ifdef MULTI_GPU
2164  if (kernel_type == INTERIOR_KERNEL) {
2165 #endif
2166 
2167  // read spinor from device memory
2168  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2169 
2170  // project spinor into half spinors
2171  a0_re = +2*i20_re;
2172  a0_im = +2*i20_im;
2173  a1_re = +2*i21_re;
2174  a1_im = +2*i21_im;
2175  a2_re = +2*i22_re;
2176  a2_im = +2*i22_im;
2177  b0_re = +2*i30_re;
2178  b0_im = +2*i30_im;
2179  b1_re = +2*i31_re;
2180  b1_im = +2*i31_im;
2181  b2_re = +2*i32_re;
2182  b2_im = +2*i32_im;
2183 
2184 #ifdef MULTI_GPU
2185  } else {
2186 
2187  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2188  const int t_proj_scale = TPROJSCALE;
2189 
2190  // read half spinor from device memory
2191  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2192 
2193  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2194  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2195  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2196  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2197  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2198  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2199 
2200  }
2201 #endif // MULTI_GPU
2202 
2203  // read gauge matrix from device memory
2204  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2205 
2206  // reconstruct gauge matrix
 // NOTE(review): numbering jumps 2206 -> 2208; reconstruction macro call
 // presumably lost here.
2208 
2209  // multiply row 0
 // Unrolled complex 3x3 products: A = g * a, B = g * b.
2210  spinorFloat A0_re = 0;
2211  A0_re += g00_re * a0_re;
2212  A0_re -= g00_im * a0_im;
2213  A0_re += g01_re * a1_re;
2214  A0_re -= g01_im * a1_im;
2215  A0_re += g02_re * a2_re;
2216  A0_re -= g02_im * a2_im;
2217  spinorFloat A0_im = 0;
2218  A0_im += g00_re * a0_im;
2219  A0_im += g00_im * a0_re;
2220  A0_im += g01_re * a1_im;
2221  A0_im += g01_im * a1_re;
2222  A0_im += g02_re * a2_im;
2223  A0_im += g02_im * a2_re;
2224  spinorFloat B0_re = 0;
2225  B0_re += g00_re * b0_re;
2226  B0_re -= g00_im * b0_im;
2227  B0_re += g01_re * b1_re;
2228  B0_re -= g01_im * b1_im;
2229  B0_re += g02_re * b2_re;
2230  B0_re -= g02_im * b2_im;
2231  spinorFloat B0_im = 0;
2232  B0_im += g00_re * b0_im;
2233  B0_im += g00_im * b0_re;
2234  B0_im += g01_re * b1_im;
2235  B0_im += g01_im * b1_re;
2236  B0_im += g02_re * b2_im;
2237  B0_im += g02_im * b2_re;
2238 
2239  // multiply row 1
2240  spinorFloat A1_re = 0;
2241  A1_re += g10_re * a0_re;
2242  A1_re -= g10_im * a0_im;
2243  A1_re += g11_re * a1_re;
2244  A1_re -= g11_im * a1_im;
2245  A1_re += g12_re * a2_re;
2246  A1_re -= g12_im * a2_im;
2247  spinorFloat A1_im = 0;
2248  A1_im += g10_re * a0_im;
2249  A1_im += g10_im * a0_re;
2250  A1_im += g11_re * a1_im;
2251  A1_im += g11_im * a1_re;
2252  A1_im += g12_re * a2_im;
2253  A1_im += g12_im * a2_re;
2254  spinorFloat B1_re = 0;
2255  B1_re += g10_re * b0_re;
2256  B1_re -= g10_im * b0_im;
2257  B1_re += g11_re * b1_re;
2258  B1_re -= g11_im * b1_im;
2259  B1_re += g12_re * b2_re;
2260  B1_re -= g12_im * b2_im;
2261  spinorFloat B1_im = 0;
2262  B1_im += g10_re * b0_im;
2263  B1_im += g10_im * b0_re;
2264  B1_im += g11_re * b1_im;
2265  B1_im += g11_im * b1_re;
2266  B1_im += g12_re * b2_im;
2267  B1_im += g12_im * b2_re;
2268 
2269  // multiply row 2
2270  spinorFloat A2_re = 0;
2271  A2_re += g20_re * a0_re;
2272  A2_re -= g20_im * a0_im;
2273  A2_re += g21_re * a1_re;
2274  A2_re -= g21_im * a1_im;
2275  A2_re += g22_re * a2_re;
2276  A2_re -= g22_im * a2_im;
2277  spinorFloat A2_im = 0;
2278  A2_im += g20_re * a0_im;
2279  A2_im += g20_im * a0_re;
2280  A2_im += g21_re * a1_im;
2281  A2_im += g21_im * a1_re;
2282  A2_im += g22_re * a2_im;
2283  A2_im += g22_im * a2_re;
2284  spinorFloat B2_re = 0;
2285  B2_re += g20_re * b0_re;
2286  B2_re -= g20_im * b0_im;
2287  B2_re += g21_re * b1_re;
2288  B2_re -= g21_im * b1_im;
2289  B2_re += g22_re * b2_re;
2290  B2_re -= g22_im * b2_im;
2291  spinorFloat B2_im = 0;
2292  B2_im += g20_re * b0_im;
2293  B2_im += g20_im * b0_re;
2294  B2_im += g21_re * b1_im;
2295  B2_im += g21_im * b1_re;
2296  B2_im += g22_re * b2_im;
2297  B2_im += g22_im * b2_re;
2298 
 // P3- only feeds the lower two spin components.
2299  o20_re += a*A0_re;
2300  o20_im += a*A0_im;
2301  o30_re += a*B0_re;
2302  o30_im += a*B0_im;
2303 
2304  o21_re += a*A1_re;
2305  o21_im += a*A1_im;
2306  o31_re += a*B1_re;
2307  o31_im += a*B1_im;
2308 
2309  o22_re += a*A2_re;
2310  o22_im += a*A2_im;
2311  o32_re += a*B2_re;
2312  o32_im += a*B2_im;
2313 
2314  }
2315 }
2316 
2317 #ifdef MULTI_GPU
2318 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2319  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2320 #endif
2321 {
2322  // Projector P3+
2323  // 2 0 0 0
2324  // 0 2 0 0
2325  // 0 0 0 0
2326  // 0 0 0 0
2327 
2328 #ifdef MULTI_GPU
2329  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2330  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2331 #else
2332  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2333 #endif
2334 
2335 #ifdef MULTI_GPU
2336  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2337 #else
2338  const int ga_idx = sp_idx;
2339 #endif
2340 
2341  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2342  {
2349 
2350 #ifdef MULTI_GPU
2351  if (kernel_type == INTERIOR_KERNEL) {
2352 #endif
2353 
2354  // read spinor from device memory
2355  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2356 
2357  // project spinor into half spinors
2358  a0_re = +2*i00_re;
2359  a0_im = +2*i00_im;
2360  a1_re = +2*i01_re;
2361  a1_im = +2*i01_im;
2362  a2_re = +2*i02_re;
2363  a2_im = +2*i02_im;
2364  b0_re = +2*i10_re;
2365  b0_im = +2*i10_im;
2366  b1_re = +2*i11_re;
2367  b1_im = +2*i11_im;
2368  b2_re = +2*i12_re;
2369  b2_im = +2*i12_im;
2370 
2371 #ifdef MULTI_GPU
2372  } else {
2373 
2374  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2375  const int t_proj_scale = TPROJSCALE;
2376 
2377  // read half spinor from device memory
2378  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2379 
2380  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2381  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2382  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2383  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2384  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2385  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2386 
2387  }
2388 #endif // MULTI_GPU
2389 
2390  // identity gauge matrix
2397 
2398  o00_re += a*A0_re;
2399  o00_im += a*A0_im;
2400  o10_re += a*B0_re;
2401  o10_im += a*B0_im;
2402 
2403  o01_re += a*A1_re;
2404  o01_im += a*A1_im;
2405  o11_re += a*B1_re;
2406  o11_im += a*B1_im;
2407 
2408  o02_re += a*A2_re;
2409  o02_im += a*A2_im;
2410  o12_re += a*B2_re;
2411  o12_im += a*B2_im;
2412 
2413  } else {
2420 
2421 #ifdef MULTI_GPU
2422  if (kernel_type == INTERIOR_KERNEL) {
2423 #endif
2424 
2425  // read spinor from device memory
2426  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2427 
2428  // project spinor into half spinors
2429  a0_re = +2*i00_re;
2430  a0_im = +2*i00_im;
2431  a1_re = +2*i01_re;
2432  a1_im = +2*i01_im;
2433  a2_re = +2*i02_re;
2434  a2_im = +2*i02_im;
2435  b0_re = +2*i10_re;
2436  b0_im = +2*i10_im;
2437  b1_re = +2*i11_re;
2438  b1_im = +2*i11_im;
2439  b2_re = +2*i12_re;
2440  b2_im = +2*i12_im;
2441 
2442 #ifdef MULTI_GPU
2443  } else {
2444 
2445  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2446  const int t_proj_scale = TPROJSCALE;
2447 
2448  // read half spinor from device memory
2449  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2450 
2451  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2452  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2453  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2454  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2455  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2456  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2457 
2458  }
2459 #endif // MULTI_GPU
2460 
2461  // read gauge matrix from device memory
2462  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2463 
2464  // reconstruct gauge matrix
2466 
2467  // multiply row 0
2468  spinorFloat A0_re = 0;
2469  A0_re += gT00_re * a0_re;
2470  A0_re -= gT00_im * a0_im;
2471  A0_re += gT01_re * a1_re;
2472  A0_re -= gT01_im * a1_im;
2473  A0_re += gT02_re * a2_re;
2474  A0_re -= gT02_im * a2_im;
2475  spinorFloat A0_im = 0;
2476  A0_im += gT00_re * a0_im;
2477  A0_im += gT00_im * a0_re;
2478  A0_im += gT01_re * a1_im;
2479  A0_im += gT01_im * a1_re;
2480  A0_im += gT02_re * a2_im;
2481  A0_im += gT02_im * a2_re;
2482  spinorFloat B0_re = 0;
2483  B0_re += gT00_re * b0_re;
2484  B0_re -= gT00_im * b0_im;
2485  B0_re += gT01_re * b1_re;
2486  B0_re -= gT01_im * b1_im;
2487  B0_re += gT02_re * b2_re;
2488  B0_re -= gT02_im * b2_im;
2489  spinorFloat B0_im = 0;
2490  B0_im += gT00_re * b0_im;
2491  B0_im += gT00_im * b0_re;
2492  B0_im += gT01_re * b1_im;
2493  B0_im += gT01_im * b1_re;
2494  B0_im += gT02_re * b2_im;
2495  B0_im += gT02_im * b2_re;
2496 
2497  // multiply row 1
2498  spinorFloat A1_re = 0;
2499  A1_re += gT10_re * a0_re;
2500  A1_re -= gT10_im * a0_im;
2501  A1_re += gT11_re * a1_re;
2502  A1_re -= gT11_im * a1_im;
2503  A1_re += gT12_re * a2_re;
2504  A1_re -= gT12_im * a2_im;
2505  spinorFloat A1_im = 0;
2506  A1_im += gT10_re * a0_im;
2507  A1_im += gT10_im * a0_re;
2508  A1_im += gT11_re * a1_im;
2509  A1_im += gT11_im * a1_re;
2510  A1_im += gT12_re * a2_im;
2511  A1_im += gT12_im * a2_re;
2512  spinorFloat B1_re = 0;
2513  B1_re += gT10_re * b0_re;
2514  B1_re -= gT10_im * b0_im;
2515  B1_re += gT11_re * b1_re;
2516  B1_re -= gT11_im * b1_im;
2517  B1_re += gT12_re * b2_re;
2518  B1_re -= gT12_im * b2_im;
2519  spinorFloat B1_im = 0;
2520  B1_im += gT10_re * b0_im;
2521  B1_im += gT10_im * b0_re;
2522  B1_im += gT11_re * b1_im;
2523  B1_im += gT11_im * b1_re;
2524  B1_im += gT12_re * b2_im;
2525  B1_im += gT12_im * b2_re;
2526 
2527  // multiply row 2
2528  spinorFloat A2_re = 0;
2529  A2_re += gT20_re * a0_re;
2530  A2_re -= gT20_im * a0_im;
2531  A2_re += gT21_re * a1_re;
2532  A2_re -= gT21_im * a1_im;
2533  A2_re += gT22_re * a2_re;
2534  A2_re -= gT22_im * a2_im;
2535  spinorFloat A2_im = 0;
2536  A2_im += gT20_re * a0_im;
2537  A2_im += gT20_im * a0_re;
2538  A2_im += gT21_re * a1_im;
2539  A2_im += gT21_im * a1_re;
2540  A2_im += gT22_re * a2_im;
2541  A2_im += gT22_im * a2_re;
2542  spinorFloat B2_re = 0;
2543  B2_re += gT20_re * b0_re;
2544  B2_re -= gT20_im * b0_im;
2545  B2_re += gT21_re * b1_re;
2546  B2_re -= gT21_im * b1_im;
2547  B2_re += gT22_re * b2_re;
2548  B2_re -= gT22_im * b2_im;
2549  spinorFloat B2_im = 0;
2550  B2_im += gT20_re * b0_im;
2551  B2_im += gT20_im * b0_re;
2552  B2_im += gT21_re * b1_im;
2553  B2_im += gT21_im * b1_re;
2554  B2_im += gT22_re * b2_im;
2555  B2_im += gT22_im * b2_re;
2556 
2557  o00_re += a*A0_re;
2558  o00_im += a*A0_im;
2559  o10_re += a*B0_re;
2560  o10_im += a*B0_im;
2561 
2562  o01_re += a*A1_re;
2563  o01_im += a*A1_im;
2564  o11_re += a*B1_re;
2565  o11_im += a*B1_im;
2566 
2567  o02_re += a*A2_re;
2568  o02_im += a*A2_im;
2569  o12_re += a*B2_re;
2570  o12_im += a*B2_im;
2571 
2572  }
2573 }
2574 
2575 
2576 
2577 // write spinor field back to device memory
2578 WRITE_SPINOR(param.sp_stride);
2579 
2580 // undefine to prevent warning when precision is changed
2581 #undef spinorFloat
2582 #undef g00_re
2583 #undef g00_im
2584 #undef g01_re
2585 #undef g01_im
2586 #undef g02_re
2587 #undef g02_im
2588 #undef g10_re
2589 #undef g10_im
2590 #undef g11_re
2591 #undef g11_im
2592 #undef g12_re
2593 #undef g12_im
2594 #undef g20_re
2595 #undef g20_im
2596 #undef g21_re
2597 #undef g21_im
2598 #undef g22_re
2599 #undef g22_im
2600 
2601 #undef i00_re
2602 #undef i00_im
2603 #undef i01_re
2604 #undef i01_im
2605 #undef i02_re
2606 #undef i02_im
2607 #undef i10_re
2608 #undef i10_im
2609 #undef i11_re
2610 #undef i11_im
2611 #undef i12_re
2612 #undef i12_im
2613 #undef i20_re
2614 #undef i20_im
2615 #undef i21_re
2616 #undef i21_im
2617 #undef i22_re
2618 #undef i22_im
2619 #undef i30_re
2620 #undef i30_im
2621 #undef i31_re
2622 #undef i31_im
2623 #undef i32_re
2624 #undef i32_im
2625 
2626 #undef acc00_re
2627 #undef acc00_im
2628 #undef acc01_re
2629 #undef acc01_im
2630 #undef acc02_re
2631 #undef acc02_im
2632 #undef acc10_re
2633 #undef acc10_im
2634 #undef acc11_re
2635 #undef acc11_im
2636 #undef acc12_re
2637 #undef acc12_im
2638 #undef acc20_re
2639 #undef acc20_im
2640 #undef acc21_re
2641 #undef acc21_im
2642 #undef acc22_re
2643 #undef acc22_im
2644 #undef acc30_re
2645 #undef acc30_im
2646 #undef acc31_re
2647 #undef acc31_im
2648 #undef acc32_re
2649 #undef acc32_im
2650 
2651 #undef c00_00_re
2652 #undef c01_01_re
2653 #undef c02_02_re
2654 #undef c10_10_re
2655 #undef c11_11_re
2656 #undef c12_12_re
2657 #undef c01_00_re
2658 #undef c01_00_im
2659 #undef c02_00_re
2660 #undef c02_00_im
2661 #undef c10_00_re
2662 #undef c10_00_im
2663 #undef c11_00_re
2664 #undef c11_00_im
2665 #undef c12_00_re
2666 #undef c12_00_im
2667 #undef c02_01_re
2668 #undef c02_01_im
2669 #undef c10_01_re
2670 #undef c10_01_im
2671 #undef c11_01_re
2672 #undef c11_01_im
2673 #undef c12_01_re
2674 #undef c12_01_im
2675 #undef c10_02_re
2676 #undef c10_02_im
2677 #undef c11_02_re
2678 #undef c11_02_im
2679 #undef c12_02_re
2680 #undef c12_02_im
2681 #undef c11_10_re
2682 #undef c11_10_im
2683 #undef c12_10_re
2684 #undef c12_10_im
2685 #undef c12_11_re
2686 #undef c12_11_im
2687 
2688 
2689 #undef VOLATILE
__constant__ int Vh
#define a22_re
Definition: llfat_core.h:131
__constant__ int X2
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_re
__constant__ int X2X1mX1
VOLATILE spinorFloat o11_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define CLOVERTEX
Definition: clover_def.h:101
#define a02_im
Definition: llfat_core.h:120
__constant__ int X3X2X1mX2X1
__constant__ int X1
VOLATILE spinorFloat o10_re
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
VOLATILE spinorFloat o30_im
int sp_idx
#define a22_im
Definition: llfat_core.h:132
__constant__ int X3X2X1
VOLATILE spinorFloat o02_im
#define a01_re
Definition: llfat_core.h:117
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o32_im
#define a02_re
Definition: llfat_core.h:119
#define a20_re
Definition: llfat_core.h:127
VOLATILE spinorFloat o20_im
#define a12_im
Definition: llfat_core.h:126
VOLATILE spinorFloat o01_im
#define a20_im
Definition: llfat_core.h:128
QudaGaugeParam param
Definition: pack_test.cpp:17
RECONSTRUCT_GAUGE_MATRIX(0)
__constant__ int ghostFace[QUDA_MAX_DIM+1]
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o00_im
#define a01_im
Definition: llfat_core.h:118
#define a12_re
Definition: llfat_core.h:125
#define GAUGE0TEX
Definition: covDev.h:112
#define a11_re
Definition: llfat_core.h:123
WRITE_SPINOR(param.sp_stride)
__constant__ int X2m1
VOLATILE spinorFloat o31_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define SPINORTEX
Definition: clover_def.h:40
__constant__ int gauge_fixed
VOLATILE spinorFloat o00_re
__constant__ int X4X3X2X1mX3X2X1
#define SPINOR_HOP
Definition: covDev.h:158
VOLATILE spinorFloat o10_im
__constant__ int ga_stride
VOLATILE spinorFloat o20_re
#define a00_re
Definition: llfat_core.h:115
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
__constant__ int X1m1
__constant__ int X3
#define a11_im
Definition: llfat_core.h:124
#define a10_re
Definition: llfat_core.h:121
VOLATILE spinorFloat o22_im
#define GAUGE1TEX
Definition: covDev.h:113
VOLATILE spinorFloat o21_re
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o22_re
#define a10_im
Definition: llfat_core.h:122
__constant__ int X4m1
#define a21_re
Definition: llfat_core.h:129
VOLATILE spinorFloat o02_re
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
VOLATILE spinorFloat o12_re
__constant__ int X4X3X2X1hmX3X2X1h
#define a21_im
Definition: llfat_core.h:130
KernelType kernel_type
VOLATILE spinorFloat o31_im
#define READ_CLOVER
Definition: clover_def.h:103
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
#define a00_im
Definition: llfat_core.h:116
__constant__ int X2X1