QUDA v0.7.0
A library for QCD on GPUs
asym_wilson_clover_dslash_dagger_g80_core.h
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
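// (note: the output spinor below lives in shared memory and is accessed
// through the pointer s; with the old Open64 front end the volatile
// qualifier appears to be needed to keep those accesses from being cached
// in registers, while the NVVM back end tracks them correctly and the
// qualifier can be dropped)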
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
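// (packing summary: the 24 real components of the input spinor occupy 12
// double2 vectors I0..I11 in double precision, or 6 float4 vectors I0..I5
// otherwise; the accumulator spinor accum0..accum11 / accum0..accum5 uses
// the same layout)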
113 
114 // gauge link
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
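// (gT is the Hermitian conjugate U^dagger of the gauge link, used for hops
// from the negative direction: color indices are transposed and imaginary
// parts negated, purely by macro remapping of the G registers, so no extra
// storage is consumed)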
176 
177 // first chiral block of inverted clover term
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
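// (each 6x6 chiral block of the inverted clover term is Hermitian: the six
// diagonal entries are real and only the lower triangle is stored, so the
// macros above recover the upper triangle as c_ij = conj(c_ji) by negating
// the imaginary parts)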
284 
285 // second chiral block of inverted clover term (reuses the C register names via the first-block macros)
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
354 #define o00_re s[0*SHARED_STRIDE]
355 #define o00_im s[1*SHARED_STRIDE]
356 #define o01_re s[2*SHARED_STRIDE]
357 #define o01_im s[3*SHARED_STRIDE]
358 #define o02_re s[4*SHARED_STRIDE]
359 #define o02_im s[5*SHARED_STRIDE]
360 #define o10_re s[6*SHARED_STRIDE]
361 #define o10_im s[7*SHARED_STRIDE]
362 #define o11_re s[8*SHARED_STRIDE]
363 #define o11_im s[9*SHARED_STRIDE]
364 #define o12_re s[10*SHARED_STRIDE]
365 #define o12_im s[11*SHARED_STRIDE]
366 #define o20_re s[12*SHARED_STRIDE]
367 #define o20_im s[13*SHARED_STRIDE]
368 #define o21_re s[14*SHARED_STRIDE]
369 #define o21_im s[15*SHARED_STRIDE]
370 #define o22_re s[16*SHARED_STRIDE]
371 #define o22_im s[17*SHARED_STRIDE]
372 #define o30_re s[18*SHARED_STRIDE]
373 VOLATILE spinorFloat o30_im;
374 VOLATILE spinorFloat o31_re;
375 VOLATILE spinorFloat o31_im;
376 VOLATILE spinorFloat o32_re;
377 VOLATILE spinorFloat o32_im;
378 
379 #ifdef SPINOR_DOUBLE
380 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
381 #else
382 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
383 #endif
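// (G80/GT200 shared memory has 16 banks of 32-bit words, so a stride of 16
// floats or 8 doubles spans all banks exactly once and threads with
// consecutive threadIdx.x never collide; thread t keeps component k of its
// output spinor at offset
//   DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(t/SHARED_STRIDE)
//     + (t%SHARED_STRIDE) + k*SHARED_STRIDE
// from s_data, as set up just below)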
384 
385 extern __shared__ char s_data[];
386 
387 VOLATILE spinorFloat *s = (spinorFloat*)s_data + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
388  + (threadIdx.x % SHARED_STRIDE);
389 
390 #include "read_gauge.h"
391 #include "read_clover.h"
392 #include "io_spinor.h"
393 
394 int x1, x2, x3, x4;
395 int X;
396 
397 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
398 int sp_norm_idx;
399 #endif // MULTI_GPU half precision
400 
401 int sid;
402 
403 #ifdef MULTI_GPU
404 int face_idx;
405 if (kernel_type == INTERIOR_KERNEL) {
406 #endif
407 
408  sid = blockIdx.x*blockDim.x + threadIdx.x;
409  if (sid >= param.threads) return;
410 
411  // Inline by hand for the moment and assume even dimensions
412  const int dims[] = {X1, X2, X3, X4};
413  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
414 
415  o00_re = 0; o00_im = 0;
416  o01_re = 0; o01_im = 0;
417  o02_re = 0; o02_im = 0;
418  o10_re = 0; o10_im = 0;
419  o11_re = 0; o11_im = 0;
420  o12_re = 0; o12_im = 0;
421  o20_re = 0; o20_im = 0;
422  o21_re = 0; o21_im = 0;
423  o22_re = 0; o22_im = 0;
424  o30_re = 0; o30_im = 0;
425  o31_re = 0; o31_im = 0;
426  o32_re = 0; o32_im = 0;
427 #ifdef DSLASH_CLOVER_XPAY
428 
429  READ_ACCUM(ACCUMTEX, param.sp_stride)
430 
431 #ifdef DSLASH_CLOVER
432 
433  // change to chiral basis
434  {
435  spinorFloat a00_re = -acc10_re - acc30_re;
436  spinorFloat a00_im = -acc10_im - acc30_im;
437  spinorFloat a10_re = acc00_re + acc20_re;
438  spinorFloat a10_im = acc00_im + acc20_im;
439  spinorFloat a20_re = -acc10_re + acc30_re;
440  spinorFloat a20_im = -acc10_im + acc30_im;
441  spinorFloat a30_re = acc00_re - acc20_re;
442  spinorFloat a30_im = acc00_im - acc20_im;
443 
444  acc00_re = a00_re; acc00_im = a00_im;
445  acc10_re = a10_re; acc10_im = a10_im;
446  acc20_re = a20_re; acc20_im = a20_im;
447  acc30_re = a30_re; acc30_im = a30_im;
448  }
449 
450  {
451  spinorFloat a01_re = -acc11_re - acc31_re;
452  spinorFloat a01_im = -acc11_im - acc31_im;
453  spinorFloat a11_re = acc01_re + acc21_re;
454  spinorFloat a11_im = acc01_im + acc21_im;
455  spinorFloat a21_re = -acc11_re + acc31_re;
456  spinorFloat a21_im = -acc11_im + acc31_im;
457  spinorFloat a31_re = acc01_re - acc21_re;
458  spinorFloat a31_im = acc01_im - acc21_im;
459 
460  acc01_re = a01_re; acc01_im = a01_im;
461  acc11_re = a11_re; acc11_im = a11_im;
462  acc21_re = a21_re; acc21_im = a21_im;
463  acc31_re = a31_re; acc31_im = a31_im;
464  }
465 
466  {
467  spinorFloat a02_re = -acc12_re - acc32_re;
468  spinorFloat a02_im = -acc12_im - acc32_im;
469  spinorFloat a12_re = acc02_re + acc22_re;
470  spinorFloat a12_im = acc02_im + acc22_im;
471  spinorFloat a22_re = -acc12_re + acc32_re;
472  spinorFloat a22_im = -acc12_im + acc32_im;
473  spinorFloat a32_re = acc02_re - acc22_re;
474  spinorFloat a32_im = acc02_im - acc22_im;
475 
476  acc02_re = a02_re; acc02_im = a02_im;
477  acc12_re = a12_re; acc12_im = a12_im;
478  acc22_re = a22_re; acc22_im = a22_im;
479  acc32_re = a32_re; acc32_im = a32_im;
480  }
481 
482  // apply first chiral block
483  {
484  READ_CLOVER(CLOVERTEX, 0)
485 
486  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
487  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
488  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
489  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
490  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
491  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
492 
493  a00_re += c00_00_re * acc00_re;
494  a00_im += c00_00_re * acc00_im;
495  a00_re += c00_01_re * acc01_re;
496  a00_re -= c00_01_im * acc01_im;
497  a00_im += c00_01_re * acc01_im;
498  a00_im += c00_01_im * acc01_re;
499  a00_re += c00_02_re * acc02_re;
500  a00_re -= c00_02_im * acc02_im;
501  a00_im += c00_02_re * acc02_im;
502  a00_im += c00_02_im * acc02_re;
503  a00_re += c00_10_re * acc10_re;
504  a00_re -= c00_10_im * acc10_im;
505  a00_im += c00_10_re * acc10_im;
506  a00_im += c00_10_im * acc10_re;
507  a00_re += c00_11_re * acc11_re;
508  a00_re -= c00_11_im * acc11_im;
509  a00_im += c00_11_re * acc11_im;
510  a00_im += c00_11_im * acc11_re;
511  a00_re += c00_12_re * acc12_re;
512  a00_re -= c00_12_im * acc12_im;
513  a00_im += c00_12_re * acc12_im;
514  a00_im += c00_12_im * acc12_re;
515 
516  a01_re += c01_00_re * acc00_re;
517  a01_re -= c01_00_im * acc00_im;
518  a01_im += c01_00_re * acc00_im;
519  a01_im += c01_00_im * acc00_re;
520  a01_re += c01_01_re * acc01_re;
521  a01_im += c01_01_re * acc01_im;
522  a01_re += c01_02_re * acc02_re;
523  a01_re -= c01_02_im * acc02_im;
524  a01_im += c01_02_re * acc02_im;
525  a01_im += c01_02_im * acc02_re;
526  a01_re += c01_10_re * acc10_re;
527  a01_re -= c01_10_im * acc10_im;
528  a01_im += c01_10_re * acc10_im;
529  a01_im += c01_10_im * acc10_re;
530  a01_re += c01_11_re * acc11_re;
531  a01_re -= c01_11_im * acc11_im;
532  a01_im += c01_11_re * acc11_im;
533  a01_im += c01_11_im * acc11_re;
534  a01_re += c01_12_re * acc12_re;
535  a01_re -= c01_12_im * acc12_im;
536  a01_im += c01_12_re * acc12_im;
537  a01_im += c01_12_im * acc12_re;
538 
539  a02_re += c02_00_re * acc00_re;
540  a02_re -= c02_00_im * acc00_im;
541  a02_im += c02_00_re * acc00_im;
542  a02_im += c02_00_im * acc00_re;
543  a02_re += c02_01_re * acc01_re;
544  a02_re -= c02_01_im * acc01_im;
545  a02_im += c02_01_re * acc01_im;
546  a02_im += c02_01_im * acc01_re;
547  a02_re += c02_02_re * acc02_re;
548  a02_im += c02_02_re * acc02_im;
549  a02_re += c02_10_re * acc10_re;
550  a02_re -= c02_10_im * acc10_im;
551  a02_im += c02_10_re * acc10_im;
552  a02_im += c02_10_im * acc10_re;
553  a02_re += c02_11_re * acc11_re;
554  a02_re -= c02_11_im * acc11_im;
555  a02_im += c02_11_re * acc11_im;
556  a02_im += c02_11_im * acc11_re;
557  a02_re += c02_12_re * acc12_re;
558  a02_re -= c02_12_im * acc12_im;
559  a02_im += c02_12_re * acc12_im;
560  a02_im += c02_12_im * acc12_re;
561 
562  a10_re += c10_00_re * acc00_re;
563  a10_re -= c10_00_im * acc00_im;
564  a10_im += c10_00_re * acc00_im;
565  a10_im += c10_00_im * acc00_re;
566  a10_re += c10_01_re * acc01_re;
567  a10_re -= c10_01_im * acc01_im;
568  a10_im += c10_01_re * acc01_im;
569  a10_im += c10_01_im * acc01_re;
570  a10_re += c10_02_re * acc02_re;
571  a10_re -= c10_02_im * acc02_im;
572  a10_im += c10_02_re * acc02_im;
573  a10_im += c10_02_im * acc02_re;
574  a10_re += c10_10_re * acc10_re;
575  a10_im += c10_10_re * acc10_im;
576  a10_re += c10_11_re * acc11_re;
577  a10_re -= c10_11_im * acc11_im;
578  a10_im += c10_11_re * acc11_im;
579  a10_im += c10_11_im * acc11_re;
580  a10_re += c10_12_re * acc12_re;
581  a10_re -= c10_12_im * acc12_im;
582  a10_im += c10_12_re * acc12_im;
583  a10_im += c10_12_im * acc12_re;
584 
585  a11_re += c11_00_re * acc00_re;
586  a11_re -= c11_00_im * acc00_im;
587  a11_im += c11_00_re * acc00_im;
588  a11_im += c11_00_im * acc00_re;
589  a11_re += c11_01_re * acc01_re;
590  a11_re -= c11_01_im * acc01_im;
591  a11_im += c11_01_re * acc01_im;
592  a11_im += c11_01_im * acc01_re;
593  a11_re += c11_02_re * acc02_re;
594  a11_re -= c11_02_im * acc02_im;
595  a11_im += c11_02_re * acc02_im;
596  a11_im += c11_02_im * acc02_re;
597  a11_re += c11_10_re * acc10_re;
598  a11_re -= c11_10_im * acc10_im;
599  a11_im += c11_10_re * acc10_im;
600  a11_im += c11_10_im * acc10_re;
601  a11_re += c11_11_re * acc11_re;
602  a11_im += c11_11_re * acc11_im;
603  a11_re += c11_12_re * acc12_re;
604  a11_re -= c11_12_im * acc12_im;
605  a11_im += c11_12_re * acc12_im;
606  a11_im += c11_12_im * acc12_re;
607 
608  a12_re += c12_00_re * acc00_re;
609  a12_re -= c12_00_im * acc00_im;
610  a12_im += c12_00_re * acc00_im;
611  a12_im += c12_00_im * acc00_re;
612  a12_re += c12_01_re * acc01_re;
613  a12_re -= c12_01_im * acc01_im;
614  a12_im += c12_01_re * acc01_im;
615  a12_im += c12_01_im * acc01_re;
616  a12_re += c12_02_re * acc02_re;
617  a12_re -= c12_02_im * acc02_im;
618  a12_im += c12_02_re * acc02_im;
619  a12_im += c12_02_im * acc02_re;
620  a12_re += c12_10_re * acc10_re;
621  a12_re -= c12_10_im * acc10_im;
622  a12_im += c12_10_re * acc10_im;
623  a12_im += c12_10_im * acc10_re;
624  a12_re += c12_11_re * acc11_re;
625  a12_re -= c12_11_im * acc11_im;
626  a12_im += c12_11_re * acc11_im;
627  a12_im += c12_11_im * acc11_re;
628  a12_re += c12_12_re * acc12_re;
629  a12_im += c12_12_re * acc12_im;
630 
631  acc00_re = a00_re; acc00_im = a00_im;
632  acc01_re = a01_re; acc01_im = a01_im;
633  acc02_re = a02_re; acc02_im = a02_im;
634  acc10_re = a10_re; acc10_im = a10_im;
635  acc11_re = a11_re; acc11_im = a11_im;
636  acc12_re = a12_re; acc12_im = a12_im;
637 
638  }
639 
640  // apply second chiral block
641  {
642  READ_CLOVER(CLOVERTEX, 1)
643 
644  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
645  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
646  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
647  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
648  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
649  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
650 
651  a20_re += c20_20_re * acc20_re;
652  a20_im += c20_20_re * acc20_im;
653  a20_re += c20_21_re * acc21_re;
654  a20_re -= c20_21_im * acc21_im;
655  a20_im += c20_21_re * acc21_im;
656  a20_im += c20_21_im * acc21_re;
657  a20_re += c20_22_re * acc22_re;
658  a20_re -= c20_22_im * acc22_im;
659  a20_im += c20_22_re * acc22_im;
660  a20_im += c20_22_im * acc22_re;
661  a20_re += c20_30_re * acc30_re;
662  a20_re -= c20_30_im * acc30_im;
663  a20_im += c20_30_re * acc30_im;
664  a20_im += c20_30_im * acc30_re;
665  a20_re += c20_31_re * acc31_re;
666  a20_re -= c20_31_im * acc31_im;
667  a20_im += c20_31_re * acc31_im;
668  a20_im += c20_31_im * acc31_re;
669  a20_re += c20_32_re * acc32_re;
670  a20_re -= c20_32_im * acc32_im;
671  a20_im += c20_32_re * acc32_im;
672  a20_im += c20_32_im * acc32_re;
673 
674  a21_re += c21_20_re * acc20_re;
675  a21_re -= c21_20_im * acc20_im;
676  a21_im += c21_20_re * acc20_im;
677  a21_im += c21_20_im * acc20_re;
678  a21_re += c21_21_re * acc21_re;
679  a21_im += c21_21_re * acc21_im;
680  a21_re += c21_22_re * acc22_re;
681  a21_re -= c21_22_im * acc22_im;
682  a21_im += c21_22_re * acc22_im;
683  a21_im += c21_22_im * acc22_re;
684  a21_re += c21_30_re * acc30_re;
685  a21_re -= c21_30_im * acc30_im;
686  a21_im += c21_30_re * acc30_im;
687  a21_im += c21_30_im * acc30_re;
688  a21_re += c21_31_re * acc31_re;
689  a21_re -= c21_31_im * acc31_im;
690  a21_im += c21_31_re * acc31_im;
691  a21_im += c21_31_im * acc31_re;
692  a21_re += c21_32_re * acc32_re;
693  a21_re -= c21_32_im * acc32_im;
694  a21_im += c21_32_re * acc32_im;
695  a21_im += c21_32_im * acc32_re;
696 
697  a22_re += c22_20_re * acc20_re;
698  a22_re -= c22_20_im * acc20_im;
699  a22_im += c22_20_re * acc20_im;
700  a22_im += c22_20_im * acc20_re;
701  a22_re += c22_21_re * acc21_re;
702  a22_re -= c22_21_im * acc21_im;
703  a22_im += c22_21_re * acc21_im;
704  a22_im += c22_21_im * acc21_re;
705  a22_re += c22_22_re * acc22_re;
706  a22_im += c22_22_re * acc22_im;
707  a22_re += c22_30_re * acc30_re;
708  a22_re -= c22_30_im * acc30_im;
709  a22_im += c22_30_re * acc30_im;
710  a22_im += c22_30_im * acc30_re;
711  a22_re += c22_31_re * acc31_re;
712  a22_re -= c22_31_im * acc31_im;
713  a22_im += c22_31_re * acc31_im;
714  a22_im += c22_31_im * acc31_re;
715  a22_re += c22_32_re * acc32_re;
716  a22_re -= c22_32_im * acc32_im;
717  a22_im += c22_32_re * acc32_im;
718  a22_im += c22_32_im * acc32_re;
719 
720  a30_re += c30_20_re * acc20_re;
721  a30_re -= c30_20_im * acc20_im;
722  a30_im += c30_20_re * acc20_im;
723  a30_im += c30_20_im * acc20_re;
724  a30_re += c30_21_re * acc21_re;
725  a30_re -= c30_21_im * acc21_im;
726  a30_im += c30_21_re * acc21_im;
727  a30_im += c30_21_im * acc21_re;
728  a30_re += c30_22_re * acc22_re;
729  a30_re -= c30_22_im * acc22_im;
730  a30_im += c30_22_re * acc22_im;
731  a30_im += c30_22_im * acc22_re;
732  a30_re += c30_30_re * acc30_re;
733  a30_im += c30_30_re * acc30_im;
734  a30_re += c30_31_re * acc31_re;
735  a30_re -= c30_31_im * acc31_im;
736  a30_im += c30_31_re * acc31_im;
737  a30_im += c30_31_im * acc31_re;
738  a30_re += c30_32_re * acc32_re;
739  a30_re -= c30_32_im * acc32_im;
740  a30_im += c30_32_re * acc32_im;
741  a30_im += c30_32_im * acc32_re;
742 
743  a31_re += c31_20_re * acc20_re;
744  a31_re -= c31_20_im * acc20_im;
745  a31_im += c31_20_re * acc20_im;
746  a31_im += c31_20_im * acc20_re;
747  a31_re += c31_21_re * acc21_re;
748  a31_re -= c31_21_im * acc21_im;
749  a31_im += c31_21_re * acc21_im;
750  a31_im += c31_21_im * acc21_re;
751  a31_re += c31_22_re * acc22_re;
752  a31_re -= c31_22_im * acc22_im;
753  a31_im += c31_22_re * acc22_im;
754  a31_im += c31_22_im * acc22_re;
755  a31_re += c31_30_re * acc30_re;
756  a31_re -= c31_30_im * acc30_im;
757  a31_im += c31_30_re * acc30_im;
758  a31_im += c31_30_im * acc30_re;
759  a31_re += c31_31_re * acc31_re;
760  a31_im += c31_31_re * acc31_im;
761  a31_re += c31_32_re * acc32_re;
762  a31_re -= c31_32_im * acc32_im;
763  a31_im += c31_32_re * acc32_im;
764  a31_im += c31_32_im * acc32_re;
765 
766  a32_re += c32_20_re * acc20_re;
767  a32_re -= c32_20_im * acc20_im;
768  a32_im += c32_20_re * acc20_im;
769  a32_im += c32_20_im * acc20_re;
770  a32_re += c32_21_re * acc21_re;
771  a32_re -= c32_21_im * acc21_im;
772  a32_im += c32_21_re * acc21_im;
773  a32_im += c32_21_im * acc21_re;
774  a32_re += c32_22_re * acc22_re;
775  a32_re -= c32_22_im * acc22_im;
776  a32_im += c32_22_re * acc22_im;
777  a32_im += c32_22_im * acc22_re;
778  a32_re += c32_30_re * acc30_re;
779  a32_re -= c32_30_im * acc30_im;
780  a32_im += c32_30_re * acc30_im;
781  a32_im += c32_30_im * acc30_re;
782  a32_re += c32_31_re * acc31_re;
783  a32_re -= c32_31_im * acc31_im;
784  a32_im += c32_31_re * acc31_im;
785  a32_im += c32_31_im * acc31_re;
786  a32_re += c32_32_re * acc32_re;
787  a32_im += c32_32_re * acc32_im;
788 
789  acc20_re = a20_re; acc20_im = a20_im;
790  acc21_re = a21_re; acc21_im = a21_im;
791  acc22_re = a22_re; acc22_im = a22_im;
792  acc30_re = a30_re; acc30_im = a30_im;
793  acc31_re = a31_re; acc31_im = a31_im;
794  acc32_re = a32_re; acc32_im = a32_im;
795 
796  }
797 
798  // change back from chiral basis
799  // (note: required factor of 1/2 is included in clover term normalization)
800  {
801  spinorFloat a00_re = acc10_re + acc30_re;
802  spinorFloat a00_im = acc10_im + acc30_im;
803  spinorFloat a10_re = -acc00_re - acc20_re;
804  spinorFloat a10_im = -acc00_im - acc20_im;
805  spinorFloat a20_re = acc10_re - acc30_re;
806  spinorFloat a20_im = acc10_im - acc30_im;
807  spinorFloat a30_re = -acc00_re + acc20_re;
808  spinorFloat a30_im = -acc00_im + acc20_im;
809 
810  acc00_re = a00_re; acc00_im = a00_im;
811  acc10_re = a10_re; acc10_im = a10_im;
812  acc20_re = a20_re; acc20_im = a20_im;
813  acc30_re = a30_re; acc30_im = a30_im;
814  }
815 
816  {
817  spinorFloat a01_re = acc11_re + acc31_re;
818  spinorFloat a01_im = acc11_im + acc31_im;
819  spinorFloat a11_re = -acc01_re - acc21_re;
820  spinorFloat a11_im = -acc01_im - acc21_im;
821  spinorFloat a21_re = acc11_re - acc31_re;
822  spinorFloat a21_im = acc11_im - acc31_im;
823  spinorFloat a31_re = -acc01_re + acc21_re;
824  spinorFloat a31_im = -acc01_im + acc21_im;
825 
826  acc01_re = a01_re; acc01_im = a01_im;
827  acc11_re = a11_re; acc11_im = a11_im;
828  acc21_re = a21_re; acc21_im = a21_im;
829  acc31_re = a31_re; acc31_im = a31_im;
830  }
831 
832  {
833  spinorFloat a02_re = acc12_re + acc32_re;
834  spinorFloat a02_im = acc12_im + acc32_im;
835  spinorFloat a12_re = -acc02_re - acc22_re;
836  spinorFloat a12_im = -acc02_im - acc22_im;
837  spinorFloat a22_re = acc12_re - acc32_re;
838  spinorFloat a22_im = acc12_im - acc32_im;
839  spinorFloat a32_re = -acc02_re + acc22_re;
840  spinorFloat a32_im = -acc02_im + acc22_im;
841 
842  acc02_re = a02_re; acc02_im = a02_im;
843  acc12_re = a12_re; acc12_im = a12_im;
844  acc22_re = a22_re; acc22_im = a22_im;
845  acc32_re = a32_re; acc32_im = a32_im;
846  }
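// (sanity check: composing the forward basis change at the top of this
// clover section with the inverse above yields twice the identity, e.g.
// a00 = acc10 + acc30 = (acc00+acc20) + (acc00-acc20) = 2*acc00; that
// factor of 2 is what the 1/2 folded into the clover term cancels)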
847 
848 #endif // DSLASH_CLOVER
849 
850  o00_re = acc00_re;
851  o00_im = acc00_im;
852  o01_re = acc01_re;
853  o01_im = acc01_im;
854  o02_re = acc02_re;
855  o02_im = acc02_im;
856  o10_re = acc10_re;
857  o10_im = acc10_im;
858  o11_re = acc11_re;
859  o11_im = acc11_im;
860  o12_re = acc12_re;
861  o12_im = acc12_im;
862  o20_re = acc20_re;
863  o20_im = acc20_im;
864  o21_re = acc21_re;
865  o21_im = acc21_im;
866  o22_re = acc22_re;
867  o22_im = acc22_im;
868  o30_re = acc30_re;
869  o30_im = acc30_im;
870  o31_re = acc31_re;
871  o31_im = acc31_im;
872  o32_re = acc32_re;
873  o32_im = acc32_im;
874 #endif // DSLASH_CLOVER_XPAY
875 
876 #ifdef MULTI_GPU
877 } else { // exterior kernel
878 
879  sid = blockIdx.x*blockDim.x + threadIdx.x;
880  if (sid >= param.threads) return;
881 
882  const int dim = static_cast<int>(kernel_type);
883  const int face_volume = (param.threads >> 1); // volume of one face
884  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
885  face_idx = sid - face_num*face_volume; // index into the respective face
886 
887  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
888  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
889  //sp_idx = face_idx + param.ghostOffset[dim];
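// (for an exterior kernel param.threads covers both faces of the dimension,
// so face_num selects the backward (0) or forward (1) face and face_idx is
// the position within that face)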
890 
891 #if (DD_PREC==2) // half precision
892  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
893 #endif
894 
895  const int dims[] = {X1, X2, X3, X4};
896  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
897 
898  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
899 
900  o00_re = i00_re; o00_im = i00_im;
901  o01_re = i01_re; o01_im = i01_im;
902  o02_re = i02_re; o02_im = i02_im;
903  o10_re = i10_re; o10_im = i10_im;
904  o11_re = i11_re; o11_im = i11_im;
905  o12_re = i12_re; o12_im = i12_im;
906  o20_re = i20_re; o20_im = i20_im;
907  o21_re = i21_re; o21_im = i21_im;
908  o22_re = i22_re; o22_im = i22_im;
909  o30_re = i30_re; o30_im = i30_im;
910  o31_re = i31_re; o31_im = i31_im;
911  o32_re = i32_re; o32_im = i32_im;
912 }
913 #endif // MULTI_GPU
914 
915 
916 #ifdef MULTI_GPU
917 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
918  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
919 #endif
920 {
921  // Projector P0+
922  // 1 0 0 i
923  // 0 1 i 0
924  // 0 -i 1 0
925  // -i 0 0 1
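// (this projector has rank 2: row 2 = -i * row 1 and row 3 = -i * row 0, so
// only the two half-spinor color triplets a (from row 0) and b (from row 1)
// are multiplied by the link, and the lower spin components are rebuilt
// from the products A and B with the same -i factors in the accumulation
// below)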
926 
927 #ifdef MULTI_GPU
928  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
929  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
930 #else
931  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
932 #endif
933 
934  const int ga_idx = sid;
935 
936  spinorFloat a0_re, a0_im;
937  spinorFloat a1_re, a1_im;
938  spinorFloat a2_re, a2_im;
939  spinorFloat b0_re, b0_im;
940  spinorFloat b1_re, b1_im;
941  spinorFloat b2_re, b2_im;
942 
943 #ifdef MULTI_GPU
944  if (kernel_type == INTERIOR_KERNEL) {
945 #endif
946 
947  // read spinor from device memory
948  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
949 
950  // project spinor into half spinors
951  a0_re = +i00_re-i30_im;
952  a0_im = +i00_im+i30_re;
953  a1_re = +i01_re-i31_im;
954  a1_im = +i01_im+i31_re;
955  a2_re = +i02_re-i32_im;
956  a2_im = +i02_im+i32_re;
957  b0_re = +i10_re-i20_im;
958  b0_im = +i10_im+i20_re;
959  b1_re = +i11_re-i21_im;
960  b1_im = +i11_im+i21_re;
961  b2_re = +i12_re-i22_im;
962  b2_im = +i12_im+i22_re;
963 
964 #ifdef MULTI_GPU
965  } else {
966 
967  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
968 
969  // read half spinor from device memory
970  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
971 
972  a0_re = i00_re; a0_im = i00_im;
973  a1_re = i01_re; a1_im = i01_im;
974  a2_re = i02_re; a2_im = i02_im;
975  b0_re = i10_re; b0_im = i10_im;
976  b1_re = i11_re; b1_im = i11_im;
977  b2_re = i12_re; b2_im = i12_im;
978 
979  }
980 #endif // MULTI_GPU
981 
982  // read gauge matrix from device memory
983  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
984 
985  // reconstruct gauge matrix
986  RECONSTRUCT_GAUGE_MATRIX(0);
987 
988  // multiply row 0
989  spinorFloat A0_re = 0;
990  A0_re += g00_re * a0_re;
991  A0_re -= g00_im * a0_im;
992  A0_re += g01_re * a1_re;
993  A0_re -= g01_im * a1_im;
994  A0_re += g02_re * a2_re;
995  A0_re -= g02_im * a2_im;
996  spinorFloat A0_im = 0;
997  A0_im += g00_re * a0_im;
998  A0_im += g00_im * a0_re;
999  A0_im += g01_re * a1_im;
1000  A0_im += g01_im * a1_re;
1001  A0_im += g02_re * a2_im;
1002  A0_im += g02_im * a2_re;
1003  spinorFloat B0_re = 0;
1004  B0_re += g00_re * b0_re;
1005  B0_re -= g00_im * b0_im;
1006  B0_re += g01_re * b1_re;
1007  B0_re -= g01_im * b1_im;
1008  B0_re += g02_re * b2_re;
1009  B0_re -= g02_im * b2_im;
1010  spinorFloat B0_im = 0;
1011  B0_im += g00_re * b0_im;
1012  B0_im += g00_im * b0_re;
1013  B0_im += g01_re * b1_im;
1014  B0_im += g01_im * b1_re;
1015  B0_im += g02_re * b2_im;
1016  B0_im += g02_im * b2_re;
1017 
1018  // multiply row 1
1019  spinorFloat A1_re = 0;
1020  A1_re += g10_re * a0_re;
1021  A1_re -= g10_im * a0_im;
1022  A1_re += g11_re * a1_re;
1023  A1_re -= g11_im * a1_im;
1024  A1_re += g12_re * a2_re;
1025  A1_re -= g12_im * a2_im;
1026  spinorFloat A1_im = 0;
1027  A1_im += g10_re * a0_im;
1028  A1_im += g10_im * a0_re;
1029  A1_im += g11_re * a1_im;
1030  A1_im += g11_im * a1_re;
1031  A1_im += g12_re * a2_im;
1032  A1_im += g12_im * a2_re;
1033  spinorFloat B1_re = 0;
1034  B1_re += g10_re * b0_re;
1035  B1_re -= g10_im * b0_im;
1036  B1_re += g11_re * b1_re;
1037  B1_re -= g11_im * b1_im;
1038  B1_re += g12_re * b2_re;
1039  B1_re -= g12_im * b2_im;
1040  spinorFloat B1_im = 0;
1041  B1_im += g10_re * b0_im;
1042  B1_im += g10_im * b0_re;
1043  B1_im += g11_re * b1_im;
1044  B1_im += g11_im * b1_re;
1045  B1_im += g12_re * b2_im;
1046  B1_im += g12_im * b2_re;
1047 
1048  // multiply row 2
1049  spinorFloat A2_re = 0;
1050  A2_re += g20_re * a0_re;
1051  A2_re -= g20_im * a0_im;
1052  A2_re += g21_re * a1_re;
1053  A2_re -= g21_im * a1_im;
1054  A2_re += g22_re * a2_re;
1055  A2_re -= g22_im * a2_im;
1056  spinorFloat A2_im = 0;
1057  A2_im += g20_re * a0_im;
1058  A2_im += g20_im * a0_re;
1059  A2_im += g21_re * a1_im;
1060  A2_im += g21_im * a1_re;
1061  A2_im += g22_re * a2_im;
1062  A2_im += g22_im * a2_re;
1063  spinorFloat B2_re = 0;
1064  B2_re += g20_re * b0_re;
1065  B2_re -= g20_im * b0_im;
1066  B2_re += g21_re * b1_re;
1067  B2_re -= g21_im * b1_im;
1068  B2_re += g22_re * b2_re;
1069  B2_re -= g22_im * b2_im;
1070  spinorFloat B2_im = 0;
1071  B2_im += g20_re * b0_im;
1072  B2_im += g20_im * b0_re;
1073  B2_im += g21_re * b1_im;
1074  B2_im += g21_im * b1_re;
1075  B2_im += g22_re * b2_im;
1076  B2_im += g22_im * b2_re;
1077 
1078  o00_re += a*A0_re;
1079  o00_im += a*A0_im;
1080  o10_re += a*B0_re;
1081  o10_im += a*B0_im;
1082  o20_re += a*B0_im;
1083  o20_im -= a*B0_re;
1084  o30_re += a*A0_im;
1085  o30_im -= a*A0_re;
1086 
1087  o01_re += a*A1_re;
1088  o01_im += a*A1_im;
1089  o11_re += a*B1_re;
1090  o11_im += a*B1_im;
1091  o21_re += a*B1_im;
1092  o21_im -= a*B1_re;
1093  o31_re += a*A1_im;
1094  o31_im -= a*A1_re;
1095 
1096  o02_re += a*A2_re;
1097  o02_im += a*A2_im;
1098  o12_re += a*B2_re;
1099  o12_im += a*B2_im;
1100  o22_re += a*B2_im;
1101  o22_im -= a*B2_re;
1102  o32_re += a*A2_im;
1103  o32_im -= a*A2_re;
1104 
1105 }
1106 
1107 #ifdef MULTI_GPU
1108 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
1109  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
1110 #endif
1111 {
1112  // Projector P0-
1113  // 1 0 0 -i
1114  // 0 1 -i 0
1115  // 0 i 1 0
1116  // i 0 0 1
1117 
1118 #ifdef MULTI_GPU
1119  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
1120  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1121 #else
1122  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
1123 #endif
1124 
1125 #ifdef MULTI_GPU
1126  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1127 #else
1128  const int ga_idx = sp_idx;
1129 #endif
1130 
1131  spinorFloat a0_re, a0_im;
1132  spinorFloat a1_re, a1_im;
1133  spinorFloat a2_re, a2_im;
1134  spinorFloat b0_re, b0_im;
1135  spinorFloat b1_re, b1_im;
1136  spinorFloat b2_re, b2_im;
1137 
1138 #ifdef MULTI_GPU
1139  if (kernel_type == INTERIOR_KERNEL) {
1140 #endif
1141 
1142  // read spinor from device memory
1143  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1144 
1145  // project spinor into half spinors
1146  a0_re = +i00_re+i30_im;
1147  a0_im = +i00_im-i30_re;
1148  a1_re = +i01_re+i31_im;
1149  a1_im = +i01_im-i31_re;
1150  a2_re = +i02_re+i32_im;
1151  a2_im = +i02_im-i32_re;
1152  b0_re = +i10_re+i20_im;
1153  b0_im = +i10_im-i20_re;
1154  b1_re = +i11_re+i21_im;
1155  b1_im = +i11_im-i21_re;
1156  b2_re = +i12_re+i22_im;
1157  b2_im = +i12_im-i22_re;
1158 
1159 #ifdef MULTI_GPU
1160  } else {
1161 
1162  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1163 
1164  // read half spinor from device memory
1165  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1166 
1167  a0_re = i00_re; a0_im = i00_im;
1168  a1_re = i01_re; a1_im = i01_im;
1169  a2_re = i02_re; a2_im = i02_im;
1170  b0_re = i10_re; b0_im = i10_im;
1171  b1_re = i11_re; b1_im = i11_im;
1172  b2_re = i12_re; b2_im = i12_im;
1173 
1174  }
1175 #endif // MULTI_GPU
1176 
1177  // read gauge matrix from device memory
1178  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
1179 
1180  // reconstruct gauge matrix
1181  RECONSTRUCT_GAUGE_MATRIX(1);
1182 
1183  // multiply row 0
1184  spinorFloat A0_re = 0;
1185  A0_re += gT00_re * a0_re;
1186  A0_re -= gT00_im * a0_im;
1187  A0_re += gT01_re * a1_re;
1188  A0_re -= gT01_im * a1_im;
1189  A0_re += gT02_re * a2_re;
1190  A0_re -= gT02_im * a2_im;
1191  spinorFloat A0_im = 0;
1192  A0_im += gT00_re * a0_im;
1193  A0_im += gT00_im * a0_re;
1194  A0_im += gT01_re * a1_im;
1195  A0_im += gT01_im * a1_re;
1196  A0_im += gT02_re * a2_im;
1197  A0_im += gT02_im * a2_re;
1198  spinorFloat B0_re = 0;
1199  B0_re += gT00_re * b0_re;
1200  B0_re -= gT00_im * b0_im;
1201  B0_re += gT01_re * b1_re;
1202  B0_re -= gT01_im * b1_im;
1203  B0_re += gT02_re * b2_re;
1204  B0_re -= gT02_im * b2_im;
1205  spinorFloat B0_im = 0;
1206  B0_im += gT00_re * b0_im;
1207  B0_im += gT00_im * b0_re;
1208  B0_im += gT01_re * b1_im;
1209  B0_im += gT01_im * b1_re;
1210  B0_im += gT02_re * b2_im;
1211  B0_im += gT02_im * b2_re;
1212 
1213  // multiply row 1
1214  spinorFloat A1_re = 0;
1215  A1_re += gT10_re * a0_re;
1216  A1_re -= gT10_im * a0_im;
1217  A1_re += gT11_re * a1_re;
1218  A1_re -= gT11_im * a1_im;
1219  A1_re += gT12_re * a2_re;
1220  A1_re -= gT12_im * a2_im;
1221  spinorFloat A1_im = 0;
1222  A1_im += gT10_re * a0_im;
1223  A1_im += gT10_im * a0_re;
1224  A1_im += gT11_re * a1_im;
1225  A1_im += gT11_im * a1_re;
1226  A1_im += gT12_re * a2_im;
1227  A1_im += gT12_im * a2_re;
1228  spinorFloat B1_re = 0;
1229  B1_re += gT10_re * b0_re;
1230  B1_re -= gT10_im * b0_im;
1231  B1_re += gT11_re * b1_re;
1232  B1_re -= gT11_im * b1_im;
1233  B1_re += gT12_re * b2_re;
1234  B1_re -= gT12_im * b2_im;
1235  spinorFloat B1_im = 0;
1236  B1_im += gT10_re * b0_im;
1237  B1_im += gT10_im * b0_re;
1238  B1_im += gT11_re * b1_im;
1239  B1_im += gT11_im * b1_re;
1240  B1_im += gT12_re * b2_im;
1241  B1_im += gT12_im * b2_re;
1242 
1243  // multiply row 2
1244  spinorFloat A2_re = 0;
1245  A2_re += gT20_re * a0_re;
1246  A2_re -= gT20_im * a0_im;
1247  A2_re += gT21_re * a1_re;
1248  A2_re -= gT21_im * a1_im;
1249  A2_re += gT22_re * a2_re;
1250  A2_re -= gT22_im * a2_im;
1251  spinorFloat A2_im = 0;
1252  A2_im += gT20_re * a0_im;
1253  A2_im += gT20_im * a0_re;
1254  A2_im += gT21_re * a1_im;
1255  A2_im += gT21_im * a1_re;
1256  A2_im += gT22_re * a2_im;
1257  A2_im += gT22_im * a2_re;
1258  spinorFloat B2_re = 0;
1259  B2_re += gT20_re * b0_re;
1260  B2_re -= gT20_im * b0_im;
1261  B2_re += gT21_re * b1_re;
1262  B2_re -= gT21_im * b1_im;
1263  B2_re += gT22_re * b2_re;
1264  B2_re -= gT22_im * b2_im;
1265  spinorFloat B2_im = 0;
1266  B2_im += gT20_re * b0_im;
1267  B2_im += gT20_im * b0_re;
1268  B2_im += gT21_re * b1_im;
1269  B2_im += gT21_im * b1_re;
1270  B2_im += gT22_re * b2_im;
1271  B2_im += gT22_im * b2_re;
1272 
1273  o00_re += a*A0_re;
1274  o00_im += a*A0_im;
1275  o10_re += a*B0_re;
1276  o10_im += a*B0_im;
1277  o20_re -= a*B0_im;
1278  o20_im += a*B0_re;
1279  o30_re -= a*A0_im;
1280  o30_im += a*A0_re;
1281 
1282  o01_re += a*A1_re;
1283  o01_im += a*A1_im;
1284  o11_re += a*B1_re;
1285  o11_im += a*B1_im;
1286  o21_re -= a*B1_im;
1287  o21_im += a*B1_re;
1288  o31_re -= a*A1_im;
1289  o31_im += a*A1_re;
1290 
1291  o02_re += a*A2_re;
1292  o02_im += a*A2_im;
1293  o12_re += a*B2_re;
1294  o12_im += a*B2_im;
1295  o22_re -= a*B2_im;
1296  o22_im += a*B2_re;
1297  o32_re -= a*A2_im;
1298  o32_im += a*A2_re;
1299 
1300 }
1301 
1302 #ifdef MULTI_GPU
1303 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1304  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1305 #endif
1306 {
1307  // Projector P1+
1308  // 1 0 0 1
1309  // 0 1 -1 0
1310  // 0 -1 1 0
1311  // 1 0 0 1
1312 
1313 #ifdef MULTI_GPU
1314  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1315  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1316 #else
1317  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1318 #endif
1319 
1320  const int ga_idx = sid;
1321 
1322  spinorFloat a0_re, a0_im;
1323  spinorFloat a1_re, a1_im;
1324  spinorFloat a2_re, a2_im;
1325  spinorFloat b0_re, b0_im;
1326  spinorFloat b1_re, b1_im;
1327  spinorFloat b2_re, b2_im;
1328 
1329 #ifdef MULTI_GPU
1330  if (kernel_type == INTERIOR_KERNEL) {
1331 #endif
1332 
1333  // read spinor from device memory
1334  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1335 
1336  // project spinor into half spinors
1337  a0_re = +i00_re+i30_re;
1338  a0_im = +i00_im+i30_im;
1339  a1_re = +i01_re+i31_re;
1340  a1_im = +i01_im+i31_im;
1341  a2_re = +i02_re+i32_re;
1342  a2_im = +i02_im+i32_im;
1343  b0_re = +i10_re-i20_re;
1344  b0_im = +i10_im-i20_im;
1345  b1_re = +i11_re-i21_re;
1346  b1_im = +i11_im-i21_im;
1347  b2_re = +i12_re-i22_re;
1348  b2_im = +i12_im-i22_im;
1349 
1350 #ifdef MULTI_GPU
1351  } else {
1352 
1353  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1354 
1355  // read half spinor from device memory
1356  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1357 
1358  a0_re = i00_re; a0_im = i00_im;
1359  a1_re = i01_re; a1_im = i01_im;
1360  a2_re = i02_re; a2_im = i02_im;
1361  b0_re = i10_re; b0_im = i10_im;
1362  b1_re = i11_re; b1_im = i11_im;
1363  b2_re = i12_re; b2_im = i12_im;
1364 
1365  }
1366 #endif // MULTI_GPU
1367 
1368  // read gauge matrix from device memory
1369  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1370 
1371  // reconstruct gauge matrix
1372  RECONSTRUCT_GAUGE_MATRIX(2);
1373 
1374  // multiply row 0
1375  spinorFloat A0_re = 0;
1376  A0_re += g00_re * a0_re;
1377  A0_re -= g00_im * a0_im;
1378  A0_re += g01_re * a1_re;
1379  A0_re -= g01_im * a1_im;
1380  A0_re += g02_re * a2_re;
1381  A0_re -= g02_im * a2_im;
1382  spinorFloat A0_im = 0;
1383  A0_im += g00_re * a0_im;
1384  A0_im += g00_im * a0_re;
1385  A0_im += g01_re * a1_im;
1386  A0_im += g01_im * a1_re;
1387  A0_im += g02_re * a2_im;
1388  A0_im += g02_im * a2_re;
1389  spinorFloat B0_re = 0;
1390  B0_re += g00_re * b0_re;
1391  B0_re -= g00_im * b0_im;
1392  B0_re += g01_re * b1_re;
1393  B0_re -= g01_im * b1_im;
1394  B0_re += g02_re * b2_re;
1395  B0_re -= g02_im * b2_im;
1396  spinorFloat B0_im = 0;
1397  B0_im += g00_re * b0_im;
1398  B0_im += g00_im * b0_re;
1399  B0_im += g01_re * b1_im;
1400  B0_im += g01_im * b1_re;
1401  B0_im += g02_re * b2_im;
1402  B0_im += g02_im * b2_re;
1403 
1404  // multiply row 1
1405  spinorFloat A1_re = 0;
1406  A1_re += g10_re * a0_re;
1407  A1_re -= g10_im * a0_im;
1408  A1_re += g11_re * a1_re;
1409  A1_re -= g11_im * a1_im;
1410  A1_re += g12_re * a2_re;
1411  A1_re -= g12_im * a2_im;
1412  spinorFloat A1_im = 0;
1413  A1_im += g10_re * a0_im;
1414  A1_im += g10_im * a0_re;
1415  A1_im += g11_re * a1_im;
1416  A1_im += g11_im * a1_re;
1417  A1_im += g12_re * a2_im;
1418  A1_im += g12_im * a2_re;
1419  spinorFloat B1_re = 0;
1420  B1_re += g10_re * b0_re;
1421  B1_re -= g10_im * b0_im;
1422  B1_re += g11_re * b1_re;
1423  B1_re -= g11_im * b1_im;
1424  B1_re += g12_re * b2_re;
1425  B1_re -= g12_im * b2_im;
1426  spinorFloat B1_im = 0;
1427  B1_im += g10_re * b0_im;
1428  B1_im += g10_im * b0_re;
1429  B1_im += g11_re * b1_im;
1430  B1_im += g11_im * b1_re;
1431  B1_im += g12_re * b2_im;
1432  B1_im += g12_im * b2_re;
1433 
1434  // multiply row 2
1435  spinorFloat A2_re = 0;
1436  A2_re += g20_re * a0_re;
1437  A2_re -= g20_im * a0_im;
1438  A2_re += g21_re * a1_re;
1439  A2_re -= g21_im * a1_im;
1440  A2_re += g22_re * a2_re;
1441  A2_re -= g22_im * a2_im;
1442  spinorFloat A2_im = 0;
1443  A2_im += g20_re * a0_im;
1444  A2_im += g20_im * a0_re;
1445  A2_im += g21_re * a1_im;
1446  A2_im += g21_im * a1_re;
1447  A2_im += g22_re * a2_im;
1448  A2_im += g22_im * a2_re;
1449  spinorFloat B2_re = 0;
1450  B2_re += g20_re * b0_re;
1451  B2_re -= g20_im * b0_im;
1452  B2_re += g21_re * b1_re;
1453  B2_re -= g21_im * b1_im;
1454  B2_re += g22_re * b2_re;
1455  B2_re -= g22_im * b2_im;
1456  spinorFloat B2_im = 0;
1457  B2_im += g20_re * b0_im;
1458  B2_im += g20_im * b0_re;
1459  B2_im += g21_re * b1_im;
1460  B2_im += g21_im * b1_re;
1461  B2_im += g22_re * b2_im;
1462  B2_im += g22_im * b2_re;
1463 
1464  o00_re += a*A0_re;
1465  o00_im += a*A0_im;
1466  o10_re += a*B0_re;
1467  o10_im += a*B0_im;
1468  o20_re -= a*B0_re;
1469  o20_im -= a*B0_im;
1470  o30_re += a*A0_re;
1471  o30_im += a*A0_im;
1472 
1473  o01_re += a*A1_re;
1474  o01_im += a*A1_im;
1475  o11_re += a*B1_re;
1476  o11_im += a*B1_im;
1477  o21_re -= a*B1_re;
1478  o21_im -= a*B1_im;
1479  o31_re += a*A1_re;
1480  o31_im += a*A1_im;
1481 
1482  o02_re += a*A2_re;
1483  o02_im += a*A2_im;
1484  o12_re += a*B2_re;
1485  o12_im += a*B2_im;
1486  o22_re -= a*B2_re;
1487  o22_im -= a*B2_im;
1488  o32_re += a*A2_re;
1489  o32_im += a*A2_im;
1490 
1491 }
1492 
1493 #ifdef MULTI_GPU
1494 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1495  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1496 #endif
1497 {
1498  // Projector P1-
1499  // 1 0 0 -1
1500  // 0 1 1 0
1501  // 0 1 1 0
1502  // -1 0 0 1
1503 
1504 #ifdef MULTI_GPU
1505  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1506  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1507 #else
1508  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1509 #endif
1510 
1511 #ifdef MULTI_GPU
1512  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1513 #else
1514  const int ga_idx = sp_idx;
1515 #endif
1516 
1517  spinorFloat a0_re, a0_im;
1518  spinorFloat a1_re, a1_im;
1519  spinorFloat a2_re, a2_im;
1520  spinorFloat b0_re, b0_im;
1521  spinorFloat b1_re, b1_im;
1522  spinorFloat b2_re, b2_im;
1523 
1524 #ifdef MULTI_GPU
1525  if (kernel_type == INTERIOR_KERNEL) {
1526 #endif
1527 
1528  // read spinor from device memory
1529  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1530 
1531  // project spinor into half spinors
1532  a0_re = +i00_re-i30_re;
1533  a0_im = +i00_im-i30_im;
1534  a1_re = +i01_re-i31_re;
1535  a1_im = +i01_im-i31_im;
1536  a2_re = +i02_re-i32_re;
1537  a2_im = +i02_im-i32_im;
1538  b0_re = +i10_re+i20_re;
1539  b0_im = +i10_im+i20_im;
1540  b1_re = +i11_re+i21_re;
1541  b1_im = +i11_im+i21_im;
1542  b2_re = +i12_re+i22_re;
1543  b2_im = +i12_im+i22_im;
1544 
1545 #ifdef MULTI_GPU
1546  } else {
1547 
1548  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1549 
1550  // read half spinor from device memory
1551  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1552 
1553  a0_re = i00_re; a0_im = i00_im;
1554  a1_re = i01_re; a1_im = i01_im;
1555  a2_re = i02_re; a2_im = i02_im;
1556  b0_re = i10_re; b0_im = i10_im;
1557  b1_re = i11_re; b1_im = i11_im;
1558  b2_re = i12_re; b2_im = i12_im;
1559 
1560  }
1561 #endif // MULTI_GPU
1562 
1563  // read gauge matrix from device memory
1564  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1565 
1566  // reconstruct gauge matrix
1567  RECONSTRUCT_GAUGE_MATRIX(3);
1568 
1569  // multiply row 0
1570  spinorFloat A0_re = 0;
1571  A0_re += gT00_re * a0_re;
1572  A0_re -= gT00_im * a0_im;
1573  A0_re += gT01_re * a1_re;
1574  A0_re -= gT01_im * a1_im;
1575  A0_re += gT02_re * a2_re;
1576  A0_re -= gT02_im * a2_im;
1577  spinorFloat A0_im = 0;
1578  A0_im += gT00_re * a0_im;
1579  A0_im += gT00_im * a0_re;
1580  A0_im += gT01_re * a1_im;
1581  A0_im += gT01_im * a1_re;
1582  A0_im += gT02_re * a2_im;
1583  A0_im += gT02_im * a2_re;
1584  spinorFloat B0_re = 0;
1585  B0_re += gT00_re * b0_re;
1586  B0_re -= gT00_im * b0_im;
1587  B0_re += gT01_re * b1_re;
1588  B0_re -= gT01_im * b1_im;
1589  B0_re += gT02_re * b2_re;
1590  B0_re -= gT02_im * b2_im;
1591  spinorFloat B0_im = 0;
1592  B0_im += gT00_re * b0_im;
1593  B0_im += gT00_im * b0_re;
1594  B0_im += gT01_re * b1_im;
1595  B0_im += gT01_im * b1_re;
1596  B0_im += gT02_re * b2_im;
1597  B0_im += gT02_im * b2_re;
1598 
1599  // multiply row 1
1600  spinorFloat A1_re = 0;
1601  A1_re += gT10_re * a0_re;
1602  A1_re -= gT10_im * a0_im;
1603  A1_re += gT11_re * a1_re;
1604  A1_re -= gT11_im * a1_im;
1605  A1_re += gT12_re * a2_re;
1606  A1_re -= gT12_im * a2_im;
1607  spinorFloat A1_im = 0;
1608  A1_im += gT10_re * a0_im;
1609  A1_im += gT10_im * a0_re;
1610  A1_im += gT11_re * a1_im;
1611  A1_im += gT11_im * a1_re;
1612  A1_im += gT12_re * a2_im;
1613  A1_im += gT12_im * a2_re;
1614  spinorFloat B1_re = 0;
1615  B1_re += gT10_re * b0_re;
1616  B1_re -= gT10_im * b0_im;
1617  B1_re += gT11_re * b1_re;
1618  B1_re -= gT11_im * b1_im;
1619  B1_re += gT12_re * b2_re;
1620  B1_re -= gT12_im * b2_im;
1621  spinorFloat B1_im = 0;
1622  B1_im += gT10_re * b0_im;
1623  B1_im += gT10_im * b0_re;
1624  B1_im += gT11_re * b1_im;
1625  B1_im += gT11_im * b1_re;
1626  B1_im += gT12_re * b2_im;
1627  B1_im += gT12_im * b2_re;
1628 
1629  // multiply row 2
1630  spinorFloat A2_re = 0;
1631  A2_re += gT20_re * a0_re;
1632  A2_re -= gT20_im * a0_im;
1633  A2_re += gT21_re * a1_re;
1634  A2_re -= gT21_im * a1_im;
1635  A2_re += gT22_re * a2_re;
1636  A2_re -= gT22_im * a2_im;
1637  spinorFloat A2_im = 0;
1638  A2_im += gT20_re * a0_im;
1639  A2_im += gT20_im * a0_re;
1640  A2_im += gT21_re * a1_im;
1641  A2_im += gT21_im * a1_re;
1642  A2_im += gT22_re * a2_im;
1643  A2_im += gT22_im * a2_re;
1644  spinorFloat B2_re = 0;
1645  B2_re += gT20_re * b0_re;
1646  B2_re -= gT20_im * b0_im;
1647  B2_re += gT21_re * b1_re;
1648  B2_re -= gT21_im * b1_im;
1649  B2_re += gT22_re * b2_re;
1650  B2_re -= gT22_im * b2_im;
1651  spinorFloat B2_im = 0;
1652  B2_im += gT20_re * b0_im;
1653  B2_im += gT20_im * b0_re;
1654  B2_im += gT21_re * b1_im;
1655  B2_im += gT21_im * b1_re;
1656  B2_im += gT22_re * b2_im;
1657  B2_im += gT22_im * b2_re;
1658 
1659  o00_re += a*A0_re;
1660  o00_im += a*A0_im;
1661  o10_re += a*B0_re;
1662  o10_im += a*B0_im;
1663  o20_re += a*B0_re;
1664  o20_im += a*B0_im;
1665  o30_re -= a*A0_re;
1666  o30_im -= a*A0_im;
1667 
1668  o01_re += a*A1_re;
1669  o01_im += a*A1_im;
1670  o11_re += a*B1_re;
1671  o11_im += a*B1_im;
1672  o21_re += a*B1_re;
1673  o21_im += a*B1_im;
1674  o31_re -= a*A1_re;
1675  o31_im -= a*A1_im;
1676 
1677  o02_re += a*A2_re;
1678  o02_im += a*A2_im;
1679  o12_re += a*B2_re;
1680  o12_im += a*B2_im;
1681  o22_re += a*B2_re;
1682  o22_im += a*B2_im;
1683  o32_re -= a*A2_re;
1684  o32_im -= a*A2_im;
1685 
1686 }
1687 
1688 #ifdef MULTI_GPU
1689 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1690  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1691 #endif
1692 {
1693  // Projector P2+
1694  // 1 0 i 0
1695  // 0 1 0 -i
1696  // -i 0 1 0
1697  // 0 i 0 1
1698 
1699 #ifdef MULTI_GPU
1700  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1701  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1702 #else
1703  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1704 #endif
1705 
1706  const int ga_idx = sid;
1707 
1708  spinorFloat a0_re, a0_im;
1709  spinorFloat a1_re, a1_im;
1710  spinorFloat a2_re, a2_im;
1711  spinorFloat b0_re, b0_im;
1712  spinorFloat b1_re, b1_im;
1713  spinorFloat b2_re, b2_im;
1714 
1715 #ifdef MULTI_GPU
1716  if (kernel_type == INTERIOR_KERNEL) {
1717 #endif
1718 
1719  // read spinor from device memory
1720  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1721 
1722  // project spinor into half spinors
1723  a0_re = +i00_re-i20_im;
1724  a0_im = +i00_im+i20_re;
1725  a1_re = +i01_re-i21_im;
1726  a1_im = +i01_im+i21_re;
1727  a2_re = +i02_re-i22_im;
1728  a2_im = +i02_im+i22_re;
1729  b0_re = +i10_re+i30_im;
1730  b0_im = +i10_im-i30_re;
1731  b1_re = +i11_re+i31_im;
1732  b1_im = +i11_im-i31_re;
1733  b2_re = +i12_re+i32_im;
1734  b2_im = +i12_im-i32_re;
1735 
1736 #ifdef MULTI_GPU
1737  } else {
1738 
1739  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1740 
1741  // read half spinor from device memory
1742  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1743 
1744  a0_re = i00_re; a0_im = i00_im;
1745  a1_re = i01_re; a1_im = i01_im;
1746  a2_re = i02_re; a2_im = i02_im;
1747  b0_re = i10_re; b0_im = i10_im;
1748  b1_re = i11_re; b1_im = i11_im;
1749  b2_re = i12_re; b2_im = i12_im;
1750 
1751  }
1752 #endif // MULTI_GPU
1753 
1754  // read gauge matrix from device memory
1755  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1756 
1757  // reconstruct gauge matrix
1758  RECONSTRUCT_GAUGE_MATRIX(4);
1759 
1760  // multiply row 0
1761  spinorFloat A0_re = 0;
1762  A0_re += g00_re * a0_re;
1763  A0_re -= g00_im * a0_im;
1764  A0_re += g01_re * a1_re;
1765  A0_re -= g01_im * a1_im;
1766  A0_re += g02_re * a2_re;
1767  A0_re -= g02_im * a2_im;
1768  spinorFloat A0_im = 0;
1769  A0_im += g00_re * a0_im;
1770  A0_im += g00_im * a0_re;
1771  A0_im += g01_re * a1_im;
1772  A0_im += g01_im * a1_re;
1773  A0_im += g02_re * a2_im;
1774  A0_im += g02_im * a2_re;
1775  spinorFloat B0_re = 0;
1776  B0_re += g00_re * b0_re;
1777  B0_re -= g00_im * b0_im;
1778  B0_re += g01_re * b1_re;
1779  B0_re -= g01_im * b1_im;
1780  B0_re += g02_re * b2_re;
1781  B0_re -= g02_im * b2_im;
1782  spinorFloat B0_im = 0;
1783  B0_im += g00_re * b0_im;
1784  B0_im += g00_im * b0_re;
1785  B0_im += g01_re * b1_im;
1786  B0_im += g01_im * b1_re;
1787  B0_im += g02_re * b2_im;
1788  B0_im += g02_im * b2_re;
1789 
1790  // multiply row 1
1791  spinorFloat A1_re = 0;
1792  A1_re += g10_re * a0_re;
1793  A1_re -= g10_im * a0_im;
1794  A1_re += g11_re * a1_re;
1795  A1_re -= g11_im * a1_im;
1796  A1_re += g12_re * a2_re;
1797  A1_re -= g12_im * a2_im;
1798  spinorFloat A1_im = 0;
1799  A1_im += g10_re * a0_im;
1800  A1_im += g10_im * a0_re;
1801  A1_im += g11_re * a1_im;
1802  A1_im += g11_im * a1_re;
1803  A1_im += g12_re * a2_im;
1804  A1_im += g12_im * a2_re;
1805  spinorFloat B1_re = 0;
1806  B1_re += g10_re * b0_re;
1807  B1_re -= g10_im * b0_im;
1808  B1_re += g11_re * b1_re;
1809  B1_re -= g11_im * b1_im;
1810  B1_re += g12_re * b2_re;
1811  B1_re -= g12_im * b2_im;
1812  spinorFloat B1_im = 0;
1813  B1_im += g10_re * b0_im;
1814  B1_im += g10_im * b0_re;
1815  B1_im += g11_re * b1_im;
1816  B1_im += g11_im * b1_re;
1817  B1_im += g12_re * b2_im;
1818  B1_im += g12_im * b2_re;
1819 
1820  // multiply row 2
1821  spinorFloat A2_re = 0;
1822  A2_re += g20_re * a0_re;
1823  A2_re -= g20_im * a0_im;
1824  A2_re += g21_re * a1_re;
1825  A2_re -= g21_im * a1_im;
1826  A2_re += g22_re * a2_re;
1827  A2_re -= g22_im * a2_im;
1828  spinorFloat A2_im = 0;
1829  A2_im += g20_re * a0_im;
1830  A2_im += g20_im * a0_re;
1831  A2_im += g21_re * a1_im;
1832  A2_im += g21_im * a1_re;
1833  A2_im += g22_re * a2_im;
1834  A2_im += g22_im * a2_re;
1835  spinorFloat B2_re = 0;
1836  B2_re += g20_re * b0_re;
1837  B2_re -= g20_im * b0_im;
1838  B2_re += g21_re * b1_re;
1839  B2_re -= g21_im * b1_im;
1840  B2_re += g22_re * b2_re;
1841  B2_re -= g22_im * b2_im;
1842  spinorFloat B2_im = 0;
1843  B2_im += g20_re * b0_im;
1844  B2_im += g20_im * b0_re;
1845  B2_im += g21_re * b1_im;
1846  B2_im += g21_im * b1_re;
1847  B2_im += g22_re * b2_im;
1848  B2_im += g22_im * b2_re;
1849 
1850  o00_re += a*A0_re;
1851  o00_im += a*A0_im;
1852  o10_re += a*B0_re;
1853  o10_im += a*B0_im;
1854  o20_re += a*A0_im;
1855  o20_im -= a*A0_re;
1856  o30_re -= a*B0_im;
1857  o30_im += a*B0_re;
1858 
1859  o01_re += a*A1_re;
1860  o01_im += a*A1_im;
1861  o11_re += a*B1_re;
1862  o11_im += a*B1_im;
1863  o21_re += a*A1_im;
1864  o21_im -= a*A1_re;
1865  o31_re -= a*B1_im;
1866  o31_im += a*B1_re;
1867 
1868  o02_re += a*A2_re;
1869  o02_im += a*A2_im;
1870  o12_re += a*B2_re;
1871  o12_im += a*B2_im;
1872  o22_re += a*A2_im;
1873  o22_im -= a*A2_re;
1874  o32_re -= a*B2_im;
1875  o32_im += a*B2_re;
1876 
1877 }
1878 
1879 #ifdef MULTI_GPU
1880 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1881  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1882 #endif
1883 {
1884  // Projector P2-
1885  // 1 0 -i 0
1886  // 0 1 0 i
1887  // i 0 1 0
1888  // 0 -i 0 1
1889 
1890 #ifdef MULTI_GPU
1891  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1892  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1893 #else
1894  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1895 #endif
1896 
1897 #ifdef MULTI_GPU
1898  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1899 #else
1900  const int ga_idx = sp_idx;
1901 #endif
1902 
1903  spinorFloat a0_re, a0_im;
1904  spinorFloat a1_re, a1_im;
1905  spinorFloat a2_re, a2_im;
1906  spinorFloat b0_re, b0_im;
1907  spinorFloat b1_re, b1_im;
1908  spinorFloat b2_re, b2_im;
1909 
1910 #ifdef MULTI_GPU
1911  if (kernel_type == INTERIOR_KERNEL) {
1912 #endif
1913 
1914  // read spinor from device memory
1915  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1916 
1917  // project spinor into half spinors
1918  a0_re = +i00_re+i20_im;
1919  a0_im = +i00_im-i20_re;
1920  a1_re = +i01_re+i21_im;
1921  a1_im = +i01_im-i21_re;
1922  a2_re = +i02_re+i22_im;
1923  a2_im = +i02_im-i22_re;
1924  b0_re = +i10_re-i30_im;
1925  b0_im = +i10_im+i30_re;
1926  b1_re = +i11_re-i31_im;
1927  b1_im = +i11_im+i31_re;
1928  b2_re = +i12_re-i32_im;
1929  b2_im = +i12_im+i32_re;
1930 
1931 #ifdef MULTI_GPU
1932  } else {
1933 
1934  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1935 
1936  // read half spinor from device memory
1937  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1938 
1939  a0_re = i00_re; a0_im = i00_im;
1940  a1_re = i01_re; a1_im = i01_im;
1941  a2_re = i02_re; a2_im = i02_im;
1942  b0_re = i10_re; b0_im = i10_im;
1943  b1_re = i11_re; b1_im = i11_im;
1944  b2_re = i12_re; b2_im = i12_im;
1945 
1946  }
1947 #endif // MULTI_GPU
1948 
1949  // read gauge matrix from device memory
1950  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1951 
1952  // reconstruct gauge matrix
1953  RECONSTRUCT_GAUGE_MATRIX(5);
1954 
1955  // multiply row 0
1956  spinorFloat A0_re = 0;
1957  A0_re += gT00_re * a0_re;
1958  A0_re -= gT00_im * a0_im;
1959  A0_re += gT01_re * a1_re;
1960  A0_re -= gT01_im * a1_im;
1961  A0_re += gT02_re * a2_re;
1962  A0_re -= gT02_im * a2_im;
1963  spinorFloat A0_im = 0;
1964  A0_im += gT00_re * a0_im;
1965  A0_im += gT00_im * a0_re;
1966  A0_im += gT01_re * a1_im;
1967  A0_im += gT01_im * a1_re;
1968  A0_im += gT02_re * a2_im;
1969  A0_im += gT02_im * a2_re;
1970  spinorFloat B0_re = 0;
1971  B0_re += gT00_re * b0_re;
1972  B0_re -= gT00_im * b0_im;
1973  B0_re += gT01_re * b1_re;
1974  B0_re -= gT01_im * b1_im;
1975  B0_re += gT02_re * b2_re;
1976  B0_re -= gT02_im * b2_im;
1977  spinorFloat B0_im = 0;
1978  B0_im += gT00_re * b0_im;
1979  B0_im += gT00_im * b0_re;
1980  B0_im += gT01_re * b1_im;
1981  B0_im += gT01_im * b1_re;
1982  B0_im += gT02_re * b2_im;
1983  B0_im += gT02_im * b2_re;
1984 
1985  // multiply row 1
1986  spinorFloat A1_re = 0;
1987  A1_re += gT10_re * a0_re;
1988  A1_re -= gT10_im * a0_im;
1989  A1_re += gT11_re * a1_re;
1990  A1_re -= gT11_im * a1_im;
1991  A1_re += gT12_re * a2_re;
1992  A1_re -= gT12_im * a2_im;
1993  spinorFloat A1_im = 0;
1994  A1_im += gT10_re * a0_im;
1995  A1_im += gT10_im * a0_re;
1996  A1_im += gT11_re * a1_im;
1997  A1_im += gT11_im * a1_re;
1998  A1_im += gT12_re * a2_im;
1999  A1_im += gT12_im * a2_re;
2000  spinorFloat B1_re = 0;
2001  B1_re += gT10_re * b0_re;
2002  B1_re -= gT10_im * b0_im;
2003  B1_re += gT11_re * b1_re;
2004  B1_re -= gT11_im * b1_im;
2005  B1_re += gT12_re * b2_re;
2006  B1_re -= gT12_im * b2_im;
2007  spinorFloat B1_im = 0;
2008  B1_im += gT10_re * b0_im;
2009  B1_im += gT10_im * b0_re;
2010  B1_im += gT11_re * b1_im;
2011  B1_im += gT11_im * b1_re;
2012  B1_im += gT12_re * b2_im;
2013  B1_im += gT12_im * b2_re;
2014 
2015  // multiply row 2
2016  spinorFloat A2_re = 0;
2017  A2_re += gT20_re * a0_re;
2018  A2_re -= gT20_im * a0_im;
2019  A2_re += gT21_re * a1_re;
2020  A2_re -= gT21_im * a1_im;
2021  A2_re += gT22_re * a2_re;
2022  A2_re -= gT22_im * a2_im;
2023  spinorFloat A2_im = 0;
2024  A2_im += gT20_re * a0_im;
2025  A2_im += gT20_im * a0_re;
2026  A2_im += gT21_re * a1_im;
2027  A2_im += gT21_im * a1_re;
2028  A2_im += gT22_re * a2_im;
2029  A2_im += gT22_im * a2_re;
2030  spinorFloat B2_re = 0;
2031  B2_re += gT20_re * b0_re;
2032  B2_re -= gT20_im * b0_im;
2033  B2_re += gT21_re * b1_re;
2034  B2_re -= gT21_im * b1_im;
2035  B2_re += gT22_re * b2_re;
2036  B2_re -= gT22_im * b2_im;
2037  spinorFloat B2_im = 0;
2038  B2_im += gT20_re * b0_im;
2039  B2_im += gT20_im * b0_re;
2040  B2_im += gT21_re * b1_im;
2041  B2_im += gT21_im * b1_re;
2042  B2_im += gT22_re * b2_im;
2043  B2_im += gT22_im * b2_re;
2044 
2045  o00_re += a*A0_re;
2046  o00_im += a*A0_im;
2047  o10_re += a*B0_re;
2048  o10_im += a*B0_im;
2049  o20_re -= a*A0_im;
2050  o20_im += a*A0_re;
2051  o30_re += a*B0_im;
2052  o30_im -= a*B0_re;
2053 
2054  o01_re += a*A1_re;
2055  o01_im += a*A1_im;
2056  o11_re += a*B1_re;
2057  o11_im += a*B1_im;
2058  o21_re -= a*A1_im;
2059  o21_im += a*A1_re;
2060  o31_re += a*B1_im;
2061  o31_im -= a*B1_re;
2062 
2063  o02_re += a*A2_re;
2064  o02_im += a*A2_im;
2065  o12_re += a*B2_re;
2066  o12_im += a*B2_im;
2067  o22_re -= a*A2_im;
2068  o22_im += a*A2_re;
2069  o32_re += a*B2_im;
2070  o32_im -= a*B2_re;
2071 
2072 }
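// Illustrative sketch (not part of the generated kernel): the gT** products
// above apply one row of U^dagger to a color triplet, each complex product
// (u - i*v)*(x + i*y) expanded into the re/im multiply-add pairs seen in the
// listing; mul_row_dagger is a hypothetical helper written in plain C++.
#include <complex>

using cplx = std::complex<double>;

// returns sum_c conj(U[c][row]) * a[c], matching gT[row][c] = conj(g[c][row])
static cplx mul_row_dagger(const cplx U[3][3], int row, const cplx a[3])
{
  cplx acc(0.0, 0.0);
  for (int c = 0; c < 3; ++c) acc += std::conj(U[c][row]) * a[c];
  return acc;
}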
2073 
2074 #ifdef MULTI_GPU
2075 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
2076  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
2077 #endif
2078 {
2079  // Projector P3+
2080  // 2 0 0 0
2081  // 0 2 0 0
2082  // 0 0 0 0
2083  // 0 0 0 0
2084 
2085 #ifdef MULTI_GPU
2086  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
2087  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2088 #else
2089  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
2090 #endif
2091 
2092  const int ga_idx = sid;
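 // Forward hops read the gauge link at the current site (sid), whereas the
 // backward hops above read it at the neighboring site: the link U_mu(x)
 // lives on the bond between x and x+mu.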
2093 
2094  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2095  {
2096  spinorFloat a0_re, a0_im;
2097  spinorFloat a1_re, a1_im;
2098  spinorFloat a2_re, a2_im;
2099  spinorFloat b0_re, b0_im;
2100  spinorFloat b1_re, b1_im;
2101  spinorFloat b2_re, b2_im;
2102 
2103 #ifdef MULTI_GPU
2104  if (kernel_type == INTERIOR_KERNEL) {
2105 #endif
2106 
2107  // read spinor from device memory
2108  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2109 
2110  // project spinor into half spinors
2111  a0_re = +2*i00_re;
2112  a0_im = +2*i00_im;
2113  a1_re = +2*i01_re;
2114  a1_im = +2*i01_im;
2115  a2_re = +2*i02_re;
2116  a2_im = +2*i02_im;
2117  b0_re = +2*i10_re;
2118  b0_im = +2*i10_im;
2119  b1_re = +2*i11_re;
2120  b1_im = +2*i11_im;
2121  b2_re = +2*i12_re;
2122  b2_im = +2*i12_im;
2123 
2124 #ifdef MULTI_GPU
2125  } else {
2126 
2127  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2128  const int t_proj_scale = TPROJSCALE;
2129 
2130  // read half spinor from device memory
2131  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2132 
2133  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2134  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2135  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2136  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2137  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2138  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2139 
2140  }
2141 #endif // MULTI_GPU
2142 
2143  // identity gauge matrix
2144  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2145  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2146  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2147  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2148  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2149  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2150 
2151  o00_re += a*A0_re;
2152  o00_im += a*A0_im;
2153  o10_re += a*B0_re;
2154  o10_im += a*B0_im;
2155 
2156  o01_re += a*A1_re;
2157  o01_im += a*A1_im;
2158  o11_re += a*B1_re;
2159  o11_im += a*B1_im;
2160 
2161  o02_re += a*A2_re;
2162  o02_im += a*A2_im;
2163  o12_re += a*B2_re;
2164  o12_im += a*B2_im;
2165 
2166  } else {
2167  spinorFloat a0_re, a0_im;
2168  spinorFloat a1_re, a1_im;
2169  spinorFloat a2_re, a2_im;
2170  spinorFloat b0_re, b0_im;
2171  spinorFloat b1_re, b1_im;
2172  spinorFloat b2_re, b2_im;
2173 
2174 #ifdef MULTI_GPU
2175  if (kernel_type == INTERIOR_KERNEL) {
2176 #endif
2177 
2178  // read spinor from device memory
2179  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2180 
2181  // project spinor into half spinors
2182  a0_re = +2*i00_re;
2183  a0_im = +2*i00_im;
2184  a1_re = +2*i01_re;
2185  a1_im = +2*i01_im;
2186  a2_re = +2*i02_re;
2187  a2_im = +2*i02_im;
2188  b0_re = +2*i10_re;
2189  b0_im = +2*i10_im;
2190  b1_re = +2*i11_re;
2191  b1_im = +2*i11_im;
2192  b2_re = +2*i12_re;
2193  b2_im = +2*i12_im;
2194 
2195 #ifdef MULTI_GPU
2196  } else {
2197 
2198  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2199  const int t_proj_scale = TPROJSCALE;
2200 
2201  // read half spinor from device memory
2202  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2203 
2204  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2205  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2206  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2207  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2208  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2209  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2210 
2211  }
2212 #endif // MULTI_GPU
2213 
2214  // read gauge matrix from device memory
2215  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2216 
2217  // reconstruct gauge matrix
2218  RECONSTRUCT_GAUGE_MATRIX(6);
2219 
2220  // multiply row 0
2221  spinorFloat A0_re = 0;
2222  A0_re += g00_re * a0_re;
2223  A0_re -= g00_im * a0_im;
2224  A0_re += g01_re * a1_re;
2225  A0_re -= g01_im * a1_im;
2226  A0_re += g02_re * a2_re;
2227  A0_re -= g02_im * a2_im;
2228  spinorFloat A0_im = 0;
2229  A0_im += g00_re * a0_im;
2230  A0_im += g00_im * a0_re;
2231  A0_im += g01_re * a1_im;
2232  A0_im += g01_im * a1_re;
2233  A0_im += g02_re * a2_im;
2234  A0_im += g02_im * a2_re;
2235  spinorFloat B0_re = 0;
2236  B0_re += g00_re * b0_re;
2237  B0_re -= g00_im * b0_im;
2238  B0_re += g01_re * b1_re;
2239  B0_re -= g01_im * b1_im;
2240  B0_re += g02_re * b2_re;
2241  B0_re -= g02_im * b2_im;
2242  spinorFloat B0_im = 0;
2243  B0_im += g00_re * b0_im;
2244  B0_im += g00_im * b0_re;
2245  B0_im += g01_re * b1_im;
2246  B0_im += g01_im * b1_re;
2247  B0_im += g02_re * b2_im;
2248  B0_im += g02_im * b2_re;
2249 
2250  // multiply row 1
2251  spinorFloat A1_re = 0;
2252  A1_re += g10_re * a0_re;
2253  A1_re -= g10_im * a0_im;
2254  A1_re += g11_re * a1_re;
2255  A1_re -= g11_im * a1_im;
2256  A1_re += g12_re * a2_re;
2257  A1_re -= g12_im * a2_im;
2258  spinorFloat A1_im = 0;
2259  A1_im += g10_re * a0_im;
2260  A1_im += g10_im * a0_re;
2261  A1_im += g11_re * a1_im;
2262  A1_im += g11_im * a1_re;
2263  A1_im += g12_re * a2_im;
2264  A1_im += g12_im * a2_re;
2265  spinorFloat B1_re = 0;
2266  B1_re += g10_re * b0_re;
2267  B1_re -= g10_im * b0_im;
2268  B1_re += g11_re * b1_re;
2269  B1_re -= g11_im * b1_im;
2270  B1_re += g12_re * b2_re;
2271  B1_re -= g12_im * b2_im;
2272  spinorFloat B1_im = 0;
2273  B1_im += g10_re * b0_im;
2274  B1_im += g10_im * b0_re;
2275  B1_im += g11_re * b1_im;
2276  B1_im += g11_im * b1_re;
2277  B1_im += g12_re * b2_im;
2278  B1_im += g12_im * b2_re;
2279 
2280  // multiply row 2
2281  spinorFloat A2_re = 0;
2282  A2_re += g20_re * a0_re;
2283  A2_re -= g20_im * a0_im;
2284  A2_re += g21_re * a1_re;
2285  A2_re -= g21_im * a1_im;
2286  A2_re += g22_re * a2_re;
2287  A2_re -= g22_im * a2_im;
2288  spinorFloat A2_im = 0;
2289  A2_im += g20_re * a0_im;
2290  A2_im += g20_im * a0_re;
2291  A2_im += g21_re * a1_im;
2292  A2_im += g21_im * a1_re;
2293  A2_im += g22_re * a2_im;
2294  A2_im += g22_im * a2_re;
2295  spinorFloat B2_re = 0;
2296  B2_re += g20_re * b0_re;
2297  B2_re -= g20_im * b0_im;
2298  B2_re += g21_re * b1_re;
2299  B2_re -= g21_im * b1_im;
2300  B2_re += g22_re * b2_re;
2301  B2_re -= g22_im * b2_im;
2302  spinorFloat B2_im = 0;
2303  B2_im += g20_re * b0_im;
2304  B2_im += g20_im * b0_re;
2305  B2_im += g21_re * b1_im;
2306  B2_im += g21_im * b1_re;
2307  B2_im += g22_re * b2_im;
2308  B2_im += g22_im * b2_re;
2309 
2310  o00_re += a*A0_re;
2311  o00_im += a*A0_im;
2312  o10_re += a*B0_re;
2313  o10_im += a*B0_im;
2314 
2315  o01_re += a*A1_re;
2316  o01_im += a*A1_im;
2317  o11_re += a*B1_re;
2318  o11_im += a*B1_im;
2319 
2320  o02_re += a*A2_re;
2321  o02_im += a*A2_im;
2322  o12_re += a*B2_re;
2323  o12_im += a*B2_im;
2324 
2325  }
2326 }
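// Illustrative sketch (not part of the generated kernel): in this basis the
// temporal projector carries weight 2 (P3+ = diag(2,2,0,0) above); interior
// threads fold the factor 2 into the projection, while exterior threads
// rescale the pre-packed face half by t_proj_scale = TPROJSCALE instead.
// project_t_plus below is a hypothetical helper for the interior case.
static void project_t_plus(const double in_re[4][3], const double in_im[4][3],
                           double a_re[2][3], double a_im[2][3])
{
  for (int s = 0; s < 2; ++s)       // only the upper two spin components survive
    for (int c = 0; c < 3; ++c) {
      a_re[s][c] = 2.0 * in_re[s][c];
      a_im[s][c] = 2.0 * in_im[s][c];
    }
}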
2327 
2328 #ifdef MULTI_GPU
2329 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2330  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2331 #endif
2332 {
2333  // Projector P3-
2334  // 0 0 0 0
2335  // 0 0 0 0
2336  // 0 0 2 0
2337  // 0 0 0 2
2338 
2339 #ifdef MULTI_GPU
2340  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2341  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2342 #else
2343  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2344 #endif
2345 
2346 #ifdef MULTI_GPU
2347  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2348 #else
2349  const int ga_idx = sp_idx;
2350 #endif
2351 
2352  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2353  {
2354  spinorFloat a0_re, a0_im;
2355  spinorFloat a1_re, a1_im;
2356  spinorFloat a2_re, a2_im;
2357  spinorFloat b0_re, b0_im;
2358  spinorFloat b1_re, b1_im;
2359  spinorFloat b2_re, b2_im;
2360 
2361 #ifdef MULTI_GPU
2362  if (kernel_type == INTERIOR_KERNEL) {
2363 #endif
2364 
2365  // read spinor from device memory
2366  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2367 
2368  // project spinor into half spinors
2369  a0_re = +2*i20_re;
2370  a0_im = +2*i20_im;
2371  a1_re = +2*i21_re;
2372  a1_im = +2*i21_im;
2373  a2_re = +2*i22_re;
2374  a2_im = +2*i22_im;
2375  b0_re = +2*i30_re;
2376  b0_im = +2*i30_im;
2377  b1_re = +2*i31_re;
2378  b1_im = +2*i31_im;
2379  b2_re = +2*i32_re;
2380  b2_im = +2*i32_im;
2381 
2382 #ifdef MULTI_GPU
2383  } else {
2384 
2385  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2386  const int t_proj_scale = TPROJSCALE;
2387 
2388  // read half spinor from device memory
2389  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2390 
2391  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2392  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2393  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2394  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2395  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2396  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2397 
2398  }
2399 #endif // MULTI_GPU
2400 
2401  // identity gauge matrix
2402  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2403  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2404  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2405  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2406  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2407  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2408 
2409  o20_re += a*A0_re;
2410  o20_im += a*A0_im;
2411  o30_re += a*B0_re;
2412  o30_im += a*B0_im;
2413 
2414  o21_re += a*A1_re;
2415  o21_im += a*A1_im;
2416  o31_re += a*B1_re;
2417  o31_im += a*B1_im;
2418 
2419  o22_re += a*A2_re;
2420  o22_im += a*A2_im;
2421  o32_re += a*B2_re;
2422  o32_im += a*B2_im;
2423 
2424  } else {
2425  spinorFloat a0_re, a0_im;
2426  spinorFloat a1_re, a1_im;
2427  spinorFloat a2_re, a2_im;
2428  spinorFloat b0_re, b0_im;
2429  spinorFloat b1_re, b1_im;
2430  spinorFloat b2_re, b2_im;
2431 
2432 #ifdef MULTI_GPU
2433  if (kernel_type == INTERIOR_KERNEL) {
2434 #endif
2435 
2436  // read spinor from device memory
2437  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2438 
2439  // project spinor into half spinors
2440  a0_re = +2*i20_re;
2441  a0_im = +2*i20_im;
2442  a1_re = +2*i21_re;
2443  a1_im = +2*i21_im;
2444  a2_re = +2*i22_re;
2445  a2_im = +2*i22_im;
2446  b0_re = +2*i30_re;
2447  b0_im = +2*i30_im;
2448  b1_re = +2*i31_re;
2449  b1_im = +2*i31_im;
2450  b2_re = +2*i32_re;
2451  b2_im = +2*i32_im;
2452 
2453 #ifdef MULTI_GPU
2454  } else {
2455 
2456  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2457  const int t_proj_scale = TPROJSCALE;
2458 
2459  // read half spinor from device memory
2460  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2461 
2462  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2463  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2464  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2465  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2466  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2467  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2468 
2469  }
2470 #endif // MULTI_GPU
2471 
2472  // read gauge matrix from device memory
2473  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2474 
2475  // reconstruct gauge matrix
2476  RECONSTRUCT_GAUGE_MATRIX(7);
2477 
2478  // multiply row 0
2479  spinorFloat A0_re = 0;
2480  A0_re += gT00_re * a0_re;
2481  A0_re -= gT00_im * a0_im;
2482  A0_re += gT01_re * a1_re;
2483  A0_re -= gT01_im * a1_im;
2484  A0_re += gT02_re * a2_re;
2485  A0_re -= gT02_im * a2_im;
2486  spinorFloat A0_im = 0;
2487  A0_im += gT00_re * a0_im;
2488  A0_im += gT00_im * a0_re;
2489  A0_im += gT01_re * a1_im;
2490  A0_im += gT01_im * a1_re;
2491  A0_im += gT02_re * a2_im;
2492  A0_im += gT02_im * a2_re;
2493  spinorFloat B0_re = 0;
2494  B0_re += gT00_re * b0_re;
2495  B0_re -= gT00_im * b0_im;
2496  B0_re += gT01_re * b1_re;
2497  B0_re -= gT01_im * b1_im;
2498  B0_re += gT02_re * b2_re;
2499  B0_re -= gT02_im * b2_im;
2500  spinorFloat B0_im = 0;
2501  B0_im += gT00_re * b0_im;
2502  B0_im += gT00_im * b0_re;
2503  B0_im += gT01_re * b1_im;
2504  B0_im += gT01_im * b1_re;
2505  B0_im += gT02_re * b2_im;
2506  B0_im += gT02_im * b2_re;
2507 
2508  // multiply row 1
2509  spinorFloat A1_re = 0;
2510  A1_re += gT10_re * a0_re;
2511  A1_re -= gT10_im * a0_im;
2512  A1_re += gT11_re * a1_re;
2513  A1_re -= gT11_im * a1_im;
2514  A1_re += gT12_re * a2_re;
2515  A1_re -= gT12_im * a2_im;
2516  spinorFloat A1_im = 0;
2517  A1_im += gT10_re * a0_im;
2518  A1_im += gT10_im * a0_re;
2519  A1_im += gT11_re * a1_im;
2520  A1_im += gT11_im * a1_re;
2521  A1_im += gT12_re * a2_im;
2522  A1_im += gT12_im * a2_re;
2523  spinorFloat B1_re = 0;
2524  B1_re += gT10_re * b0_re;
2525  B1_re -= gT10_im * b0_im;
2526  B1_re += gT11_re * b1_re;
2527  B1_re -= gT11_im * b1_im;
2528  B1_re += gT12_re * b2_re;
2529  B1_re -= gT12_im * b2_im;
2530  spinorFloat B1_im = 0;
2531  B1_im += gT10_re * b0_im;
2532  B1_im += gT10_im * b0_re;
2533  B1_im += gT11_re * b1_im;
2534  B1_im += gT11_im * b1_re;
2535  B1_im += gT12_re * b2_im;
2536  B1_im += gT12_im * b2_re;
2537 
2538  // multiply row 2
2539  spinorFloat A2_re = 0;
2540  A2_re += gT20_re * a0_re;
2541  A2_re -= gT20_im * a0_im;
2542  A2_re += gT21_re * a1_re;
2543  A2_re -= gT21_im * a1_im;
2544  A2_re += gT22_re * a2_re;
2545  A2_re -= gT22_im * a2_im;
2546  spinorFloat A2_im = 0;
2547  A2_im += gT20_re * a0_im;
2548  A2_im += gT20_im * a0_re;
2549  A2_im += gT21_re * a1_im;
2550  A2_im += gT21_im * a1_re;
2551  A2_im += gT22_re * a2_im;
2552  A2_im += gT22_im * a2_re;
2553  spinorFloat B2_re = 0;
2554  B2_re += gT20_re * b0_re;
2555  B2_re -= gT20_im * b0_im;
2556  B2_re += gT21_re * b1_re;
2557  B2_re -= gT21_im * b1_im;
2558  B2_re += gT22_re * b2_re;
2559  B2_re -= gT22_im * b2_im;
2560  spinorFloat B2_im = 0;
2561  B2_im += gT20_re * b0_im;
2562  B2_im += gT20_im * b0_re;
2563  B2_im += gT21_re * b1_im;
2564  B2_im += gT21_im * b1_re;
2565  B2_im += gT22_re * b2_im;
2566  B2_im += gT22_im * b2_re;
2567 
2568  o20_re += a*A0_re;
2569  o20_im += a*A0_im;
2570  o30_re += a*B0_re;
2571  o30_im += a*B0_im;
2572 
2573  o21_re += a*A1_re;
2574  o21_im += a*A1_im;
2575  o31_re += a*B1_re;
2576  o31_im += a*B1_im;
2577 
2578  o22_re += a*A2_re;
2579  o22_im += a*A2_im;
2580  o32_re += a*B2_re;
2581  o32_im += a*B2_im;
2582 
2583  }
2584 }
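// Illustrative sketch (not part of the generated kernel): the sp_idx
// arithmetic above is the usual even-odd neighbor lookup. With the listing's
// own names (X: full-lattice site index, X3X2X1: sites per time slice,
// X4X3X2X1mX3X2X1 = (X4-1)*X3*X2*X1), the final >>1 converts a full-lattice
// index to a checkerboard index; backward_t_neighbor is a hypothetical helper.
static int backward_t_neighbor(int X, int x4, int X3X2X1, int X4X3X2X1mX3X2X1)
{
  const int full = (x4 == 0) ? X + X4X3X2X1mX3X2X1   // wrap around the t boundary
                             : X - X3X2X1;           // ordinary backward hop
  return full >> 1;
}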
2585 
2586 
2587 
2588 // write spinor field back to device memory
2589 WRITE_SPINOR(param.sp_stride);
2590 
2591 // undefine to prevent warning when precision is changed
2592 #undef spinorFloat
2593 #undef SHARED_STRIDE
2594 
2595 #undef g00_re
2596 #undef g00_im
2597 #undef g01_re
2598 #undef g01_im
2599 #undef g02_re
2600 #undef g02_im
2601 #undef g10_re
2602 #undef g10_im
2603 #undef g11_re
2604 #undef g11_im
2605 #undef g12_re
2606 #undef g12_im
2607 #undef g20_re
2608 #undef g20_im
2609 #undef g21_re
2610 #undef g21_im
2611 #undef g22_re
2612 #undef g22_im
2613 
2614 #undef i00_re
2615 #undef i00_im
2616 #undef i01_re
2617 #undef i01_im
2618 #undef i02_re
2619 #undef i02_im
2620 #undef i10_re
2621 #undef i10_im
2622 #undef i11_re
2623 #undef i11_im
2624 #undef i12_re
2625 #undef i12_im
2626 #undef i20_re
2627 #undef i20_im
2628 #undef i21_re
2629 #undef i21_im
2630 #undef i22_re
2631 #undef i22_im
2632 #undef i30_re
2633 #undef i30_im
2634 #undef i31_re
2635 #undef i31_im
2636 #undef i32_re
2637 #undef i32_im
2638 
2639 #undef acc00_re
2640 #undef acc00_im
2641 #undef acc01_re
2642 #undef acc01_im
2643 #undef acc02_re
2644 #undef acc02_im
2645 #undef acc10_re
2646 #undef acc10_im
2647 #undef acc11_re
2648 #undef acc11_im
2649 #undef acc12_re
2650 #undef acc12_im
2651 #undef acc20_re
2652 #undef acc20_im
2653 #undef acc21_re
2654 #undef acc21_im
2655 #undef acc22_re
2656 #undef acc22_im
2657 #undef acc30_re
2658 #undef acc30_im
2659 #undef acc31_re
2660 #undef acc31_im
2661 #undef acc32_re
2662 #undef acc32_im
2663 
2664 #undef c00_00_re
2665 #undef c01_01_re
2666 #undef c02_02_re
2667 #undef c10_10_re
2668 #undef c11_11_re
2669 #undef c12_12_re
2670 #undef c01_00_re
2671 #undef c01_00_im
2672 #undef c02_00_re
2673 #undef c02_00_im
2674 #undef c10_00_re
2675 #undef c10_00_im
2676 #undef c11_00_re
2677 #undef c11_00_im
2678 #undef c12_00_re
2679 #undef c12_00_im
2680 #undef c02_01_re
2681 #undef c02_01_im
2682 #undef c10_01_re
2683 #undef c10_01_im
2684 #undef c11_01_re
2685 #undef c11_01_im
2686 #undef c12_01_re
2687 #undef c12_01_im
2688 #undef c10_02_re
2689 #undef c10_02_im
2690 #undef c11_02_re
2691 #undef c11_02_im
2692 #undef c12_02_re
2693 #undef c12_02_im
2694 #undef c11_10_re
2695 #undef c11_10_im
2696 #undef c12_10_re
2697 #undef c12_10_im
2698 #undef c12_11_re
2699 #undef c12_11_im
2700 
2701 #undef o00_re
2702 #undef o00_im
2703 #undef o01_re
2704 #undef o01_im
2705 #undef o02_re
2706 #undef o02_im
2707 #undef o10_re
2708 #undef o10_im
2709 #undef o11_re
2710 #undef o11_im
2711 #undef o12_re
2712 #undef o12_im
2713 #undef o20_re
2714 #undef o20_im
2715 #undef o21_re
2716 #undef o21_im
2717 #undef o22_re
2718 #undef o22_im
2719 #undef o30_re
2720 
2721 #undef VOLATILE
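// Illustrative sketch (an assumption, not QUDA's actual wrapper code): the
// #undef block above lets this core be re-included under a different
// precision, e.g.
#define SPINOR_DOUBLE
#include "asym_wilson_clover_dslash_dagger_g80_core.h" // double-precision body
#undef SPINOR_DOUBLE
#include "asym_wilson_clover_dslash_dagger_g80_core.h" // single-precision body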