QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_dslash_dagger_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_DOUBLE
156 
157 // conjugated gauge link
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // output spinor
202 
203 #include "read_gauge.h"
204 #include "read_clover.h"
205 #include "io_spinor.h"
206 
207 int x1, x2, x3, x4;
208 int X;
209 
210 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
211 int sp_norm_idx;
212 #endif // MULTI_GPU half precision
213 
214 int sid;
215 
216 #ifdef MULTI_GPU
217 int face_idx;
218 if (kernel_type == INTERIOR_KERNEL) {
219 #endif
220 
221  sid = blockIdx.x*blockDim.x + threadIdx.x;
222  if (sid >= param.threads) return;
223 
224  // Inline by hand for the moment and assume even dimensions
225  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity);
226 
227  o00_re = 0; o00_im = 0;
228  o01_re = 0; o01_im = 0;
229  o02_re = 0; o02_im = 0;
230  o10_re = 0; o10_im = 0;
231  o11_re = 0; o11_im = 0;
232  o12_re = 0; o12_im = 0;
233  o20_re = 0; o20_im = 0;
234  o21_re = 0; o21_im = 0;
235  o22_re = 0; o22_im = 0;
236  o30_re = 0; o30_im = 0;
237  o31_re = 0; o31_im = 0;
238  o32_re = 0; o32_im = 0;
239 
240 #ifdef MULTI_GPU
241 } else { // exterior kernel
242 
243  sid = blockIdx.x*blockDim.x + threadIdx.x;
244  if (sid >= param.threads) return;
245 
246  const int dim = static_cast<int>(kernel_type);
247  const int face_volume = (param.threads >> 1); // volume of one face
248  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
249  face_idx = sid - face_num*face_volume; // index into the respective face
250 
251  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
252  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
253  //sp_idx = face_idx + param.ghostOffset[dim];
254 
255 #if (DD_PREC==2) // half precision
256  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
257 #endif
258 
259  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity);
260 
262 
263  o00_re = i00_re; o00_im = i00_im;
264  o01_re = i01_re; o01_im = i01_im;
265  o02_re = i02_re; o02_im = i02_im;
266  o10_re = i10_re; o10_im = i10_im;
267  o11_re = i11_re; o11_im = i11_im;
268  o12_re = i12_re; o12_im = i12_im;
269  o20_re = i20_re; o20_im = i20_im;
270  o21_re = i21_re; o21_im = i21_im;
271  o22_re = i22_re; o22_im = i22_im;
272  o30_re = i30_re; o30_im = i30_im;
273  o31_re = i31_re; o31_im = i31_im;
274  o32_re = i32_re; o32_im = i32_im;
275 }
276 #endif // MULTI_GPU
277 
278 
279 #ifdef MULTI_GPU
280 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
281  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
282 #endif
283 {
284  // Projector P0+
285  // 1 0 0 i
286  // 0 1 i 0
287  // 0 -i 1 0
288  // -i 0 0 1
289 
290 #ifdef MULTI_GPU
291  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
292  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
293 #else
294  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
295 #endif
296 
297  const int ga_idx = sid;
298 
305 
306 #ifdef MULTI_GPU
307  if (kernel_type == INTERIOR_KERNEL) {
308 #endif
309 
310  // read spinor from device memory
311  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
312 
313  // project spinor into half spinors
314  a0_re = +i00_re-i30_im;
315  a0_im = +i00_im+i30_re;
316  a1_re = +i01_re-i31_im;
317  a1_im = +i01_im+i31_re;
318  a2_re = +i02_re-i32_im;
319  a2_im = +i02_im+i32_re;
320  b0_re = +i10_re-i20_im;
321  b0_im = +i10_im+i20_re;
322  b1_re = +i11_re-i21_im;
323  b1_im = +i11_im+i21_re;
324  b2_re = +i12_re-i22_im;
325  b2_im = +i12_im+i22_re;
326 
327 #ifdef MULTI_GPU
328  } else {
329 
330  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
331 
332  // read half spinor from device memory
333  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
334 
335  a0_re = i00_re; a0_im = i00_im;
336  a1_re = i01_re; a1_im = i01_im;
337  a2_re = i02_re; a2_im = i02_im;
338  b0_re = i10_re; b0_im = i10_im;
339  b1_re = i11_re; b1_im = i11_im;
340  b2_re = i12_re; b2_im = i12_im;
341 
342  }
343 #endif // MULTI_GPU
344 
345  // read gauge matrix from device memory
346  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
347 
348  // reconstruct gauge matrix
350 
351  // multiply row 0
353  A0_re += g00_re * a0_re;
354  A0_re -= g00_im * a0_im;
355  A0_re += g01_re * a1_re;
356  A0_re -= g01_im * a1_im;
357  A0_re += g02_re * a2_re;
358  A0_re -= g02_im * a2_im;
360  A0_im += g00_re * a0_im;
361  A0_im += g00_im * a0_re;
362  A0_im += g01_re * a1_im;
363  A0_im += g01_im * a1_re;
364  A0_im += g02_re * a2_im;
365  A0_im += g02_im * a2_re;
367  B0_re += g00_re * b0_re;
368  B0_re -= g00_im * b0_im;
369  B0_re += g01_re * b1_re;
370  B0_re -= g01_im * b1_im;
371  B0_re += g02_re * b2_re;
372  B0_re -= g02_im * b2_im;
374  B0_im += g00_re * b0_im;
375  B0_im += g00_im * b0_re;
376  B0_im += g01_re * b1_im;
377  B0_im += g01_im * b1_re;
378  B0_im += g02_re * b2_im;
379  B0_im += g02_im * b2_re;
380 
381  // multiply row 1
383  A1_re += g10_re * a0_re;
384  A1_re -= g10_im * a0_im;
385  A1_re += g11_re * a1_re;
386  A1_re -= g11_im * a1_im;
387  A1_re += g12_re * a2_re;
388  A1_re -= g12_im * a2_im;
390  A1_im += g10_re * a0_im;
391  A1_im += g10_im * a0_re;
392  A1_im += g11_re * a1_im;
393  A1_im += g11_im * a1_re;
394  A1_im += g12_re * a2_im;
395  A1_im += g12_im * a2_re;
397  B1_re += g10_re * b0_re;
398  B1_re -= g10_im * b0_im;
399  B1_re += g11_re * b1_re;
400  B1_re -= g11_im * b1_im;
401  B1_re += g12_re * b2_re;
402  B1_re -= g12_im * b2_im;
404  B1_im += g10_re * b0_im;
405  B1_im += g10_im * b0_re;
406  B1_im += g11_re * b1_im;
407  B1_im += g11_im * b1_re;
408  B1_im += g12_re * b2_im;
409  B1_im += g12_im * b2_re;
410 
411  // multiply row 2
413  A2_re += g20_re * a0_re;
414  A2_re -= g20_im * a0_im;
415  A2_re += g21_re * a1_re;
416  A2_re -= g21_im * a1_im;
417  A2_re += g22_re * a2_re;
418  A2_re -= g22_im * a2_im;
420  A2_im += g20_re * a0_im;
421  A2_im += g20_im * a0_re;
422  A2_im += g21_re * a1_im;
423  A2_im += g21_im * a1_re;
424  A2_im += g22_re * a2_im;
425  A2_im += g22_im * a2_re;
427  B2_re += g20_re * b0_re;
428  B2_re -= g20_im * b0_im;
429  B2_re += g21_re * b1_re;
430  B2_re -= g21_im * b1_im;
431  B2_re += g22_re * b2_re;
432  B2_re -= g22_im * b2_im;
434  B2_im += g20_re * b0_im;
435  B2_im += g20_im * b0_re;
436  B2_im += g21_re * b1_im;
437  B2_im += g21_im * b1_re;
438  B2_im += g22_re * b2_im;
439  B2_im += g22_im * b2_re;
440 
441  o00_re += A0_re;
442  o00_im += A0_im;
443  o10_re += B0_re;
444  o10_im += B0_im;
445  o20_re += B0_im;
446  o20_im -= B0_re;
447  o30_re += A0_im;
448  o30_im -= A0_re;
449 
450  o01_re += A1_re;
451  o01_im += A1_im;
452  o11_re += B1_re;
453  o11_im += B1_im;
454  o21_re += B1_im;
455  o21_im -= B1_re;
456  o31_re += A1_im;
457  o31_im -= A1_re;
458 
459  o02_re += A2_re;
460  o02_im += A2_im;
461  o12_re += B2_re;
462  o12_im += B2_im;
463  o22_re += B2_im;
464  o22_im -= B2_re;
465  o32_re += A2_im;
466  o32_im -= A2_re;
467 
468 }
469 
470 #ifdef MULTI_GPU
471 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
472  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
473 #endif
474 {
475  // Projector P0-
476  // 1 0 0 -i
477  // 0 1 -i 0
478  // 0 i 1 0
479  // i 0 0 1
480 
481 #ifdef MULTI_GPU
482  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
483  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
484 #else
485  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
486 #endif
487 
488 #ifdef MULTI_GPU
489  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
490 #else
491  const int ga_idx = sp_idx;
492 #endif
493 
500 
501 #ifdef MULTI_GPU
502  if (kernel_type == INTERIOR_KERNEL) {
503 #endif
504 
505  // read spinor from device memory
506  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
507 
508  // project spinor into half spinors
509  a0_re = +i00_re+i30_im;
510  a0_im = +i00_im-i30_re;
511  a1_re = +i01_re+i31_im;
512  a1_im = +i01_im-i31_re;
513  a2_re = +i02_re+i32_im;
514  a2_im = +i02_im-i32_re;
515  b0_re = +i10_re+i20_im;
516  b0_im = +i10_im-i20_re;
517  b1_re = +i11_re+i21_im;
518  b1_im = +i11_im-i21_re;
519  b2_re = +i12_re+i22_im;
520  b2_im = +i12_im-i22_re;
521 
522 #ifdef MULTI_GPU
523  } else {
524 
525  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
526 
527  // read half spinor from device memory
528  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
529 
530  a0_re = i00_re; a0_im = i00_im;
531  a1_re = i01_re; a1_im = i01_im;
532  a2_re = i02_re; a2_im = i02_im;
533  b0_re = i10_re; b0_im = i10_im;
534  b1_re = i11_re; b1_im = i11_im;
535  b2_re = i12_re; b2_im = i12_im;
536 
537  }
538 #endif // MULTI_GPU
539 
540  // read gauge matrix from device memory
541  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
542 
543  // reconstruct gauge matrix
545 
546  // multiply row 0
547  spinorFloat A0_re = 0;
548  A0_re += gT00_re * a0_re;
549  A0_re -= gT00_im * a0_im;
550  A0_re += gT01_re * a1_re;
551  A0_re -= gT01_im * a1_im;
552  A0_re += gT02_re * a2_re;
553  A0_re -= gT02_im * a2_im;
554  spinorFloat A0_im = 0;
555  A0_im += gT00_re * a0_im;
556  A0_im += gT00_im * a0_re;
557  A0_im += gT01_re * a1_im;
558  A0_im += gT01_im * a1_re;
559  A0_im += gT02_re * a2_im;
560  A0_im += gT02_im * a2_re;
561  spinorFloat B0_re = 0;
562  B0_re += gT00_re * b0_re;
563  B0_re -= gT00_im * b0_im;
564  B0_re += gT01_re * b1_re;
565  B0_re -= gT01_im * b1_im;
566  B0_re += gT02_re * b2_re;
567  B0_re -= gT02_im * b2_im;
568  spinorFloat B0_im = 0;
569  B0_im += gT00_re * b0_im;
570  B0_im += gT00_im * b0_re;
571  B0_im += gT01_re * b1_im;
572  B0_im += gT01_im * b1_re;
573  B0_im += gT02_re * b2_im;
574  B0_im += gT02_im * b2_re;
575 
576  // multiply row 1
577  spinorFloat A1_re = 0;
578  A1_re += gT10_re * a0_re;
579  A1_re -= gT10_im * a0_im;
580  A1_re += gT11_re * a1_re;
581  A1_re -= gT11_im * a1_im;
582  A1_re += gT12_re * a2_re;
583  A1_re -= gT12_im * a2_im;
584  spinorFloat A1_im = 0;
585  A1_im += gT10_re * a0_im;
586  A1_im += gT10_im * a0_re;
587  A1_im += gT11_re * a1_im;
588  A1_im += gT11_im * a1_re;
589  A1_im += gT12_re * a2_im;
590  A1_im += gT12_im * a2_re;
591  spinorFloat B1_re = 0;
592  B1_re += gT10_re * b0_re;
593  B1_re -= gT10_im * b0_im;
594  B1_re += gT11_re * b1_re;
595  B1_re -= gT11_im * b1_im;
596  B1_re += gT12_re * b2_re;
597  B1_re -= gT12_im * b2_im;
598  spinorFloat B1_im = 0;
599  B1_im += gT10_re * b0_im;
600  B1_im += gT10_im * b0_re;
601  B1_im += gT11_re * b1_im;
602  B1_im += gT11_im * b1_re;
603  B1_im += gT12_re * b2_im;
604  B1_im += gT12_im * b2_re;
605 
606  // multiply row 2
607  spinorFloat A2_re = 0;
608  A2_re += gT20_re * a0_re;
609  A2_re -= gT20_im * a0_im;
610  A2_re += gT21_re * a1_re;
611  A2_re -= gT21_im * a1_im;
612  A2_re += gT22_re * a2_re;
613  A2_re -= gT22_im * a2_im;
614  spinorFloat A2_im = 0;
615  A2_im += gT20_re * a0_im;
616  A2_im += gT20_im * a0_re;
617  A2_im += gT21_re * a1_im;
618  A2_im += gT21_im * a1_re;
619  A2_im += gT22_re * a2_im;
620  A2_im += gT22_im * a2_re;
621  spinorFloat B2_re = 0;
622  B2_re += gT20_re * b0_re;
623  B2_re -= gT20_im * b0_im;
624  B2_re += gT21_re * b1_re;
625  B2_re -= gT21_im * b1_im;
626  B2_re += gT22_re * b2_re;
627  B2_re -= gT22_im * b2_im;
628  spinorFloat B2_im = 0;
629  B2_im += gT20_re * b0_im;
630  B2_im += gT20_im * b0_re;
631  B2_im += gT21_re * b1_im;
632  B2_im += gT21_im * b1_re;
633  B2_im += gT22_re * b2_im;
634  B2_im += gT22_im * b2_re;
635 
636  o00_re += A0_re;
637  o00_im += A0_im;
638  o10_re += B0_re;
639  o10_im += B0_im;
640  o20_re -= B0_im;
641  o20_im += B0_re;
642  o30_re -= A0_im;
643  o30_im += A0_re;
644 
645  o01_re += A1_re;
646  o01_im += A1_im;
647  o11_re += B1_re;
648  o11_im += B1_im;
649  o21_re -= B1_im;
650  o21_im += B1_re;
651  o31_re -= A1_im;
652  o31_im += A1_re;
653 
654  o02_re += A2_re;
655  o02_im += A2_im;
656  o12_re += B2_re;
657  o12_im += B2_im;
658  o22_re -= B2_im;
659  o22_im += B2_re;
660  o32_re -= A2_im;
661  o32_im += A2_re;
662 
663 }
664 
665 #ifdef MULTI_GPU
666 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
667  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
668 #endif
669 {
670  // Projector P1+
671  // 1 0 0 1
672  // 0 1 -1 0
673  // 0 -1 1 0
674  // 1 0 0 1
675 
676 #ifdef MULTI_GPU
677  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
678  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
679 #else
680  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
681 #endif
682 
683  const int ga_idx = sid;
684 
691 
692 #ifdef MULTI_GPU
693  if (kernel_type == INTERIOR_KERNEL) {
694 #endif
695 
696  // read spinor from device memory
697  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
698 
699  // project spinor into half spinors
700  a0_re = +i00_re+i30_re;
701  a0_im = +i00_im+i30_im;
702  a1_re = +i01_re+i31_re;
703  a1_im = +i01_im+i31_im;
704  a2_re = +i02_re+i32_re;
705  a2_im = +i02_im+i32_im;
706  b0_re = +i10_re-i20_re;
707  b0_im = +i10_im-i20_im;
708  b1_re = +i11_re-i21_re;
709  b1_im = +i11_im-i21_im;
710  b2_re = +i12_re-i22_re;
711  b2_im = +i12_im-i22_im;
712 
713 #ifdef MULTI_GPU
714  } else {
715 
716  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
717 
718  // read half spinor from device memory
719  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
720 
721  a0_re = i00_re; a0_im = i00_im;
722  a1_re = i01_re; a1_im = i01_im;
723  a2_re = i02_re; a2_im = i02_im;
724  b0_re = i10_re; b0_im = i10_im;
725  b1_re = i11_re; b1_im = i11_im;
726  b2_re = i12_re; b2_im = i12_im;
727 
728  }
729 #endif // MULTI_GPU
730 
731  // read gauge matrix from device memory
732  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
733 
734  // reconstruct gauge matrix
736 
737  // multiply row 0
738  spinorFloat A0_re = 0;
739  A0_re += g00_re * a0_re;
740  A0_re -= g00_im * a0_im;
741  A0_re += g01_re * a1_re;
742  A0_re -= g01_im * a1_im;
743  A0_re += g02_re * a2_re;
744  A0_re -= g02_im * a2_im;
745  spinorFloat A0_im = 0;
746  A0_im += g00_re * a0_im;
747  A0_im += g00_im * a0_re;
748  A0_im += g01_re * a1_im;
749  A0_im += g01_im * a1_re;
750  A0_im += g02_re * a2_im;
751  A0_im += g02_im * a2_re;
752  spinorFloat B0_re = 0;
753  B0_re += g00_re * b0_re;
754  B0_re -= g00_im * b0_im;
755  B0_re += g01_re * b1_re;
756  B0_re -= g01_im * b1_im;
757  B0_re += g02_re * b2_re;
758  B0_re -= g02_im * b2_im;
759  spinorFloat B0_im = 0;
760  B0_im += g00_re * b0_im;
761  B0_im += g00_im * b0_re;
762  B0_im += g01_re * b1_im;
763  B0_im += g01_im * b1_re;
764  B0_im += g02_re * b2_im;
765  B0_im += g02_im * b2_re;
766 
767  // multiply row 1
768  spinorFloat A1_re = 0;
769  A1_re += g10_re * a0_re;
770  A1_re -= g10_im * a0_im;
771  A1_re += g11_re * a1_re;
772  A1_re -= g11_im * a1_im;
773  A1_re += g12_re * a2_re;
774  A1_re -= g12_im * a2_im;
775  spinorFloat A1_im = 0;
776  A1_im += g10_re * a0_im;
777  A1_im += g10_im * a0_re;
778  A1_im += g11_re * a1_im;
779  A1_im += g11_im * a1_re;
780  A1_im += g12_re * a2_im;
781  A1_im += g12_im * a2_re;
782  spinorFloat B1_re = 0;
783  B1_re += g10_re * b0_re;
784  B1_re -= g10_im * b0_im;
785  B1_re += g11_re * b1_re;
786  B1_re -= g11_im * b1_im;
787  B1_re += g12_re * b2_re;
788  B1_re -= g12_im * b2_im;
789  spinorFloat B1_im = 0;
790  B1_im += g10_re * b0_im;
791  B1_im += g10_im * b0_re;
792  B1_im += g11_re * b1_im;
793  B1_im += g11_im * b1_re;
794  B1_im += g12_re * b2_im;
795  B1_im += g12_im * b2_re;
796 
797  // multiply row 2
798  spinorFloat A2_re = 0;
799  A2_re += g20_re * a0_re;
800  A2_re -= g20_im * a0_im;
801  A2_re += g21_re * a1_re;
802  A2_re -= g21_im * a1_im;
803  A2_re += g22_re * a2_re;
804  A2_re -= g22_im * a2_im;
805  spinorFloat A2_im = 0;
806  A2_im += g20_re * a0_im;
807  A2_im += g20_im * a0_re;
808  A2_im += g21_re * a1_im;
809  A2_im += g21_im * a1_re;
810  A2_im += g22_re * a2_im;
811  A2_im += g22_im * a2_re;
812  spinorFloat B2_re = 0;
813  B2_re += g20_re * b0_re;
814  B2_re -= g20_im * b0_im;
815  B2_re += g21_re * b1_re;
816  B2_re -= g21_im * b1_im;
817  B2_re += g22_re * b2_re;
818  B2_re -= g22_im * b2_im;
819  spinorFloat B2_im = 0;
820  B2_im += g20_re * b0_im;
821  B2_im += g20_im * b0_re;
822  B2_im += g21_re * b1_im;
823  B2_im += g21_im * b1_re;
824  B2_im += g22_re * b2_im;
825  B2_im += g22_im * b2_re;
826 
827  o00_re += A0_re;
828  o00_im += A0_im;
829  o10_re += B0_re;
830  o10_im += B0_im;
831  o20_re -= B0_re;
832  o20_im -= B0_im;
833  o30_re += A0_re;
834  o30_im += A0_im;
835 
836  o01_re += A1_re;
837  o01_im += A1_im;
838  o11_re += B1_re;
839  o11_im += B1_im;
840  o21_re -= B1_re;
841  o21_im -= B1_im;
842  o31_re += A1_re;
843  o31_im += A1_im;
844 
845  o02_re += A2_re;
846  o02_im += A2_im;
847  o12_re += B2_re;
848  o12_im += B2_im;
849  o22_re -= B2_re;
850  o22_im -= B2_im;
851  o32_re += A2_re;
852  o32_im += A2_im;
853 
854 }
855 
856 #ifdef MULTI_GPU
857 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
858  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
859 #endif
860 {
861  // Projector P1-
862  // 1 0 0 -1
863  // 0 1 1 0
864  // 0 1 1 0
865  // -1 0 0 1
866 
867 #ifdef MULTI_GPU
868  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
869  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
870 #else
871  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
872 #endif
873 
874 #ifdef MULTI_GPU
875  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
876 #else
877  const int ga_idx = sp_idx;
878 #endif
879 
886 
887 #ifdef MULTI_GPU
888  if (kernel_type == INTERIOR_KERNEL) {
889 #endif
890 
891  // read spinor from device memory
892  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
893 
894  // project spinor into half spinors
895  a0_re = +i00_re-i30_re;
896  a0_im = +i00_im-i30_im;
897  a1_re = +i01_re-i31_re;
898  a1_im = +i01_im-i31_im;
899  a2_re = +i02_re-i32_re;
900  a2_im = +i02_im-i32_im;
901  b0_re = +i10_re+i20_re;
902  b0_im = +i10_im+i20_im;
903  b1_re = +i11_re+i21_re;
904  b1_im = +i11_im+i21_im;
905  b2_re = +i12_re+i22_re;
906  b2_im = +i12_im+i22_im;
907 
908 #ifdef MULTI_GPU
909  } else {
910 
911  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
912 
913  // read half spinor from device memory
914  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
915 
916  a0_re = i00_re; a0_im = i00_im;
917  a1_re = i01_re; a1_im = i01_im;
918  a2_re = i02_re; a2_im = i02_im;
919  b0_re = i10_re; b0_im = i10_im;
920  b1_re = i11_re; b1_im = i11_im;
921  b2_re = i12_re; b2_im = i12_im;
922 
923  }
924 #endif // MULTI_GPU
925 
926  // read gauge matrix from device memory
927  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
928 
929  // reconstruct gauge matrix
931 
932  // multiply row 0
933  spinorFloat A0_re = 0;
934  A0_re += gT00_re * a0_re;
935  A0_re -= gT00_im * a0_im;
936  A0_re += gT01_re * a1_re;
937  A0_re -= gT01_im * a1_im;
938  A0_re += gT02_re * a2_re;
939  A0_re -= gT02_im * a2_im;
940  spinorFloat A0_im = 0;
941  A0_im += gT00_re * a0_im;
942  A0_im += gT00_im * a0_re;
943  A0_im += gT01_re * a1_im;
944  A0_im += gT01_im * a1_re;
945  A0_im += gT02_re * a2_im;
946  A0_im += gT02_im * a2_re;
947  spinorFloat B0_re = 0;
948  B0_re += gT00_re * b0_re;
949  B0_re -= gT00_im * b0_im;
950  B0_re += gT01_re * b1_re;
951  B0_re -= gT01_im * b1_im;
952  B0_re += gT02_re * b2_re;
953  B0_re -= gT02_im * b2_im;
954  spinorFloat B0_im = 0;
955  B0_im += gT00_re * b0_im;
956  B0_im += gT00_im * b0_re;
957  B0_im += gT01_re * b1_im;
958  B0_im += gT01_im * b1_re;
959  B0_im += gT02_re * b2_im;
960  B0_im += gT02_im * b2_re;
961 
962  // multiply row 1
963  spinorFloat A1_re = 0;
964  A1_re += gT10_re * a0_re;
965  A1_re -= gT10_im * a0_im;
966  A1_re += gT11_re * a1_re;
967  A1_re -= gT11_im * a1_im;
968  A1_re += gT12_re * a2_re;
969  A1_re -= gT12_im * a2_im;
970  spinorFloat A1_im = 0;
971  A1_im += gT10_re * a0_im;
972  A1_im += gT10_im * a0_re;
973  A1_im += gT11_re * a1_im;
974  A1_im += gT11_im * a1_re;
975  A1_im += gT12_re * a2_im;
976  A1_im += gT12_im * a2_re;
977  spinorFloat B1_re = 0;
978  B1_re += gT10_re * b0_re;
979  B1_re -= gT10_im * b0_im;
980  B1_re += gT11_re * b1_re;
981  B1_re -= gT11_im * b1_im;
982  B1_re += gT12_re * b2_re;
983  B1_re -= gT12_im * b2_im;
984  spinorFloat B1_im = 0;
985  B1_im += gT10_re * b0_im;
986  B1_im += gT10_im * b0_re;
987  B1_im += gT11_re * b1_im;
988  B1_im += gT11_im * b1_re;
989  B1_im += gT12_re * b2_im;
990  B1_im += gT12_im * b2_re;
991 
992  // multiply row 2
993  spinorFloat A2_re = 0;
994  A2_re += gT20_re * a0_re;
995  A2_re -= gT20_im * a0_im;
996  A2_re += gT21_re * a1_re;
997  A2_re -= gT21_im * a1_im;
998  A2_re += gT22_re * a2_re;
999  A2_re -= gT22_im * a2_im;
1000  spinorFloat A2_im = 0;
1001  A2_im += gT20_re * a0_im;
1002  A2_im += gT20_im * a0_re;
1003  A2_im += gT21_re * a1_im;
1004  A2_im += gT21_im * a1_re;
1005  A2_im += gT22_re * a2_im;
1006  A2_im += gT22_im * a2_re;
1007  spinorFloat B2_re = 0;
1008  B2_re += gT20_re * b0_re;
1009  B2_re -= gT20_im * b0_im;
1010  B2_re += gT21_re * b1_re;
1011  B2_re -= gT21_im * b1_im;
1012  B2_re += gT22_re * b2_re;
1013  B2_re -= gT22_im * b2_im;
1014  spinorFloat B2_im = 0;
1015  B2_im += gT20_re * b0_im;
1016  B2_im += gT20_im * b0_re;
1017  B2_im += gT21_re * b1_im;
1018  B2_im += gT21_im * b1_re;
1019  B2_im += gT22_re * b2_im;
1020  B2_im += gT22_im * b2_re;
1021 
1022  o00_re += A0_re;
1023  o00_im += A0_im;
1024  o10_re += B0_re;
1025  o10_im += B0_im;
1026  o20_re += B0_re;
1027  o20_im += B0_im;
1028  o30_re -= A0_re;
1029  o30_im -= A0_im;
1030 
1031  o01_re += A1_re;
1032  o01_im += A1_im;
1033  o11_re += B1_re;
1034  o11_im += B1_im;
1035  o21_re += B1_re;
1036  o21_im += B1_im;
1037  o31_re -= A1_re;
1038  o31_im -= A1_im;
1039 
1040  o02_re += A2_re;
1041  o02_im += A2_im;
1042  o12_re += B2_re;
1043  o12_im += B2_im;
1044  o22_re += B2_re;
1045  o22_im += B2_im;
1046  o32_re -= A2_re;
1047  o32_im -= A2_im;
1048 
1049 }
1050 
1051 #ifdef MULTI_GPU
1052 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1053  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1054 #endif
1055 {
1056  // Projector P2+
1057  // 1 0 i 0
1058  // 0 1 0 -i
1059  // -i 0 1 0
1060  // 0 i 0 1
1061 
1062 #ifdef MULTI_GPU
1063  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1064  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1065 #else
1066  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1067 #endif
1068 
1069  const int ga_idx = sid;
1070 
1077 
1078 #ifdef MULTI_GPU
1079  if (kernel_type == INTERIOR_KERNEL) {
1080 #endif
1081 
1082  // read spinor from device memory
1083  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1084 
1085  // project spinor into half spinors
1086  a0_re = +i00_re-i20_im;
1087  a0_im = +i00_im+i20_re;
1088  a1_re = +i01_re-i21_im;
1089  a1_im = +i01_im+i21_re;
1090  a2_re = +i02_re-i22_im;
1091  a2_im = +i02_im+i22_re;
1092  b0_re = +i10_re+i30_im;
1093  b0_im = +i10_im-i30_re;
1094  b1_re = +i11_re+i31_im;
1095  b1_im = +i11_im-i31_re;
1096  b2_re = +i12_re+i32_im;
1097  b2_im = +i12_im-i32_re;
1098 
1099 #ifdef MULTI_GPU
1100  } else {
1101 
1102  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1103 
1104  // read half spinor from device memory
1105  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1106 
1107  a0_re = i00_re; a0_im = i00_im;
1108  a1_re = i01_re; a1_im = i01_im;
1109  a2_re = i02_re; a2_im = i02_im;
1110  b0_re = i10_re; b0_im = i10_im;
1111  b1_re = i11_re; b1_im = i11_im;
1112  b2_re = i12_re; b2_im = i12_im;
1113 
1114  }
1115 #endif // MULTI_GPU
1116 
1117  // read gauge matrix from device memory
1118  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1119 
1120  // reconstruct gauge matrix
1122 
1123  // multiply row 0
1124  spinorFloat A0_re = 0;
1125  A0_re += g00_re * a0_re;
1126  A0_re -= g00_im * a0_im;
1127  A0_re += g01_re * a1_re;
1128  A0_re -= g01_im * a1_im;
1129  A0_re += g02_re * a2_re;
1130  A0_re -= g02_im * a2_im;
1131  spinorFloat A0_im = 0;
1132  A0_im += g00_re * a0_im;
1133  A0_im += g00_im * a0_re;
1134  A0_im += g01_re * a1_im;
1135  A0_im += g01_im * a1_re;
1136  A0_im += g02_re * a2_im;
1137  A0_im += g02_im * a2_re;
1138  spinorFloat B0_re = 0;
1139  B0_re += g00_re * b0_re;
1140  B0_re -= g00_im * b0_im;
1141  B0_re += g01_re * b1_re;
1142  B0_re -= g01_im * b1_im;
1143  B0_re += g02_re * b2_re;
1144  B0_re -= g02_im * b2_im;
1145  spinorFloat B0_im = 0;
1146  B0_im += g00_re * b0_im;
1147  B0_im += g00_im * b0_re;
1148  B0_im += g01_re * b1_im;
1149  B0_im += g01_im * b1_re;
1150  B0_im += g02_re * b2_im;
1151  B0_im += g02_im * b2_re;
1152 
1153  // multiply row 1
1154  spinorFloat A1_re = 0;
1155  A1_re += g10_re * a0_re;
1156  A1_re -= g10_im * a0_im;
1157  A1_re += g11_re * a1_re;
1158  A1_re -= g11_im * a1_im;
1159  A1_re += g12_re * a2_re;
1160  A1_re -= g12_im * a2_im;
1161  spinorFloat A1_im = 0;
1162  A1_im += g10_re * a0_im;
1163  A1_im += g10_im * a0_re;
1164  A1_im += g11_re * a1_im;
1165  A1_im += g11_im * a1_re;
1166  A1_im += g12_re * a2_im;
1167  A1_im += g12_im * a2_re;
1168  spinorFloat B1_re = 0;
1169  B1_re += g10_re * b0_re;
1170  B1_re -= g10_im * b0_im;
1171  B1_re += g11_re * b1_re;
1172  B1_re -= g11_im * b1_im;
1173  B1_re += g12_re * b2_re;
1174  B1_re -= g12_im * b2_im;
1175  spinorFloat B1_im = 0;
1176  B1_im += g10_re * b0_im;
1177  B1_im += g10_im * b0_re;
1178  B1_im += g11_re * b1_im;
1179  B1_im += g11_im * b1_re;
1180  B1_im += g12_re * b2_im;
1181  B1_im += g12_im * b2_re;
1182 
1183  // multiply row 2
1184  spinorFloat A2_re = 0;
1185  A2_re += g20_re * a0_re;
1186  A2_re -= g20_im * a0_im;
1187  A2_re += g21_re * a1_re;
1188  A2_re -= g21_im * a1_im;
1189  A2_re += g22_re * a2_re;
1190  A2_re -= g22_im * a2_im;
1191  spinorFloat A2_im = 0;
1192  A2_im += g20_re * a0_im;
1193  A2_im += g20_im * a0_re;
1194  A2_im += g21_re * a1_im;
1195  A2_im += g21_im * a1_re;
1196  A2_im += g22_re * a2_im;
1197  A2_im += g22_im * a2_re;
1198  spinorFloat B2_re = 0;
1199  B2_re += g20_re * b0_re;
1200  B2_re -= g20_im * b0_im;
1201  B2_re += g21_re * b1_re;
1202  B2_re -= g21_im * b1_im;
1203  B2_re += g22_re * b2_re;
1204  B2_re -= g22_im * b2_im;
1205  spinorFloat B2_im = 0;
1206  B2_im += g20_re * b0_im;
1207  B2_im += g20_im * b0_re;
1208  B2_im += g21_re * b1_im;
1209  B2_im += g21_im * b1_re;
1210  B2_im += g22_re * b2_im;
1211  B2_im += g22_im * b2_re;
1212 
1213  o00_re += A0_re;
1214  o00_im += A0_im;
1215  o10_re += B0_re;
1216  o10_im += B0_im;
1217  o20_re += A0_im;
1218  o20_im -= A0_re;
1219  o30_re -= B0_im;
1220  o30_im += B0_re;
1221 
1222  o01_re += A1_re;
1223  o01_im += A1_im;
1224  o11_re += B1_re;
1225  o11_im += B1_im;
1226  o21_re += A1_im;
1227  o21_im -= A1_re;
1228  o31_re -= B1_im;
1229  o31_im += B1_re;
1230 
1231  o02_re += A2_re;
1232  o02_im += A2_im;
1233  o12_re += B2_re;
1234  o12_im += B2_im;
1235  o22_re += A2_im;
1236  o22_im -= A2_re;
1237  o32_re -= B2_im;
1238  o32_im += B2_re;
1239 
1240 }
1241 
1242 #ifdef MULTI_GPU
1243 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1244  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1245 #endif
1246 {
1247  // Projector P2-
1248  // 1 0 -i 0
1249  // 0 1 0 i
1250  // i 0 1 0
1251  // 0 -i 0 1
1252 
1253 #ifdef MULTI_GPU
1254  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1255  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1256 #else
1257  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1258 #endif
1259 
1260 #ifdef MULTI_GPU
1261  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1262 #else
1263  const int ga_idx = sp_idx;
1264 #endif
1265 
1272 
1273 #ifdef MULTI_GPU
1274  if (kernel_type == INTERIOR_KERNEL) {
1275 #endif
1276 
1277  // read spinor from device memory
1278  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1279 
1280  // project spinor into half spinors
1281  a0_re = +i00_re+i20_im;
1282  a0_im = +i00_im-i20_re;
1283  a1_re = +i01_re+i21_im;
1284  a1_im = +i01_im-i21_re;
1285  a2_re = +i02_re+i22_im;
1286  a2_im = +i02_im-i22_re;
1287  b0_re = +i10_re-i30_im;
1288  b0_im = +i10_im+i30_re;
1289  b1_re = +i11_re-i31_im;
1290  b1_im = +i11_im+i31_re;
1291  b2_re = +i12_re-i32_im;
1292  b2_im = +i12_im+i32_re;
1293 
1294 #ifdef MULTI_GPU
1295  } else {
1296 
1297  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1298 
1299  // read half spinor from device memory
1300  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1301 
1302  a0_re = i00_re; a0_im = i00_im;
1303  a1_re = i01_re; a1_im = i01_im;
1304  a2_re = i02_re; a2_im = i02_im;
1305  b0_re = i10_re; b0_im = i10_im;
1306  b1_re = i11_re; b1_im = i11_im;
1307  b2_re = i12_re; b2_im = i12_im;
1308 
1309  }
1310 #endif // MULTI_GPU
1311 
1312  // read gauge matrix from device memory
1313  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1314 
1315  // reconstruct gauge matrix
1317 
1318  // multiply row 0
1319  spinorFloat A0_re = 0;
1320  A0_re += gT00_re * a0_re;
1321  A0_re -= gT00_im * a0_im;
1322  A0_re += gT01_re * a1_re;
1323  A0_re -= gT01_im * a1_im;
1324  A0_re += gT02_re * a2_re;
1325  A0_re -= gT02_im * a2_im;
1326  spinorFloat A0_im = 0;
1327  A0_im += gT00_re * a0_im;
1328  A0_im += gT00_im * a0_re;
1329  A0_im += gT01_re * a1_im;
1330  A0_im += gT01_im * a1_re;
1331  A0_im += gT02_re * a2_im;
1332  A0_im += gT02_im * a2_re;
1333  spinorFloat B0_re = 0;
1334  B0_re += gT00_re * b0_re;
1335  B0_re -= gT00_im * b0_im;
1336  B0_re += gT01_re * b1_re;
1337  B0_re -= gT01_im * b1_im;
1338  B0_re += gT02_re * b2_re;
1339  B0_re -= gT02_im * b2_im;
1340  spinorFloat B0_im = 0;
1341  B0_im += gT00_re * b0_im;
1342  B0_im += gT00_im * b0_re;
1343  B0_im += gT01_re * b1_im;
1344  B0_im += gT01_im * b1_re;
1345  B0_im += gT02_re * b2_im;
1346  B0_im += gT02_im * b2_re;
1347 
1348  // multiply row 1
1349  spinorFloat A1_re = 0;
1350  A1_re += gT10_re * a0_re;
1351  A1_re -= gT10_im * a0_im;
1352  A1_re += gT11_re * a1_re;
1353  A1_re -= gT11_im * a1_im;
1354  A1_re += gT12_re * a2_re;
1355  A1_re -= gT12_im * a2_im;
1356  spinorFloat A1_im = 0;
1357  A1_im += gT10_re * a0_im;
1358  A1_im += gT10_im * a0_re;
1359  A1_im += gT11_re * a1_im;
1360  A1_im += gT11_im * a1_re;
1361  A1_im += gT12_re * a2_im;
1362  A1_im += gT12_im * a2_re;
1363  spinorFloat B1_re = 0;
1364  B1_re += gT10_re * b0_re;
1365  B1_re -= gT10_im * b0_im;
1366  B1_re += gT11_re * b1_re;
1367  B1_re -= gT11_im * b1_im;
1368  B1_re += gT12_re * b2_re;
1369  B1_re -= gT12_im * b2_im;
1370  spinorFloat B1_im = 0;
1371  B1_im += gT10_re * b0_im;
1372  B1_im += gT10_im * b0_re;
1373  B1_im += gT11_re * b1_im;
1374  B1_im += gT11_im * b1_re;
1375  B1_im += gT12_re * b2_im;
1376  B1_im += gT12_im * b2_re;
1377 
1378  // multiply row 2
1379  spinorFloat A2_re = 0;
1380  A2_re += gT20_re * a0_re;
1381  A2_re -= gT20_im * a0_im;
1382  A2_re += gT21_re * a1_re;
1383  A2_re -= gT21_im * a1_im;
1384  A2_re += gT22_re * a2_re;
1385  A2_re -= gT22_im * a2_im;
1386  spinorFloat A2_im = 0;
1387  A2_im += gT20_re * a0_im;
1388  A2_im += gT20_im * a0_re;
1389  A2_im += gT21_re * a1_im;
1390  A2_im += gT21_im * a1_re;
1391  A2_im += gT22_re * a2_im;
1392  A2_im += gT22_im * a2_re;
1393  spinorFloat B2_re = 0;
1394  B2_re += gT20_re * b0_re;
1395  B2_re -= gT20_im * b0_im;
1396  B2_re += gT21_re * b1_re;
1397  B2_re -= gT21_im * b1_im;
1398  B2_re += gT22_re * b2_re;
1399  B2_re -= gT22_im * b2_im;
1400  spinorFloat B2_im = 0;
1401  B2_im += gT20_re * b0_im;
1402  B2_im += gT20_im * b0_re;
1403  B2_im += gT21_re * b1_im;
1404  B2_im += gT21_im * b1_re;
1405  B2_im += gT22_re * b2_im;
1406  B2_im += gT22_im * b2_re;
1407 
1408  o00_re += A0_re;
1409  o00_im += A0_im;
1410  o10_re += B0_re;
1411  o10_im += B0_im;
1412  o20_re -= A0_im;
1413  o20_im += A0_re;
1414  o30_re += B0_im;
1415  o30_im -= B0_re;
1416 
1417  o01_re += A1_re;
1418  o01_im += A1_im;
1419  o11_re += B1_re;
1420  o11_im += B1_im;
1421  o21_re -= A1_im;
1422  o21_im += A1_re;
1423  o31_re += B1_im;
1424  o31_im -= B1_re;
1425 
1426  o02_re += A2_re;
1427  o02_im += A2_im;
1428  o12_re += B2_re;
1429  o12_im += B2_im;
1430  o22_re -= A2_im;
1431  o22_im += A2_re;
1432  o32_re += B2_im;
1433  o32_im -= B2_re;
1434 
1435 }
1436 
1437 #ifdef MULTI_GPU
1438 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1439  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1440 #endif
1441 {
1442  // Projector P3+
1443  // 2 0 0 0
1444  // 0 2 0 0
1445  // 0 0 0 0
1446  // 0 0 0 0
1447 
1448 #ifdef MULTI_GPU
1449  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1450  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1451 #else
1452  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1453 #endif
1454 
1455  const int ga_idx = sid;
1456 
1458  {
1465 
1466 #ifdef MULTI_GPU
1467  if (kernel_type == INTERIOR_KERNEL) {
1468 #endif
1469 
1470  // read spinor from device memory
1471  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1472 
1473  // project spinor into half spinors
1474  a0_re = +2*i00_re;
1475  a0_im = +2*i00_im;
1476  a1_re = +2*i01_re;
1477  a1_im = +2*i01_im;
1478  a2_re = +2*i02_re;
1479  a2_im = +2*i02_im;
1480  b0_re = +2*i10_re;
1481  b0_im = +2*i10_im;
1482  b1_re = +2*i11_re;
1483  b1_im = +2*i11_im;
1484  b2_re = +2*i12_re;
1485  b2_im = +2*i12_im;
1486 
1487 #ifdef MULTI_GPU
1488  } else {
1489 
1490  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1491  const int t_proj_scale = TPROJSCALE;
1492 
1493  // read half spinor from device memory
1494  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1495 
1496  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1497  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1498  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1499  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1500  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1501  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1502 
1503  }
1504 #endif // MULTI_GPU
1505 
1506  // identity gauge matrix
1513 
1514  o00_re += A0_re;
1515  o00_im += A0_im;
1516  o10_re += B0_re;
1517  o10_im += B0_im;
1518 
1519  o01_re += A1_re;
1520  o01_im += A1_im;
1521  o11_re += B1_re;
1522  o11_im += B1_im;
1523 
1524  o02_re += A2_re;
1525  o02_im += A2_im;
1526  o12_re += B2_re;
1527  o12_im += B2_im;
1528 
1529  } else {
1536 
1537 #ifdef MULTI_GPU
1538  if (kernel_type == INTERIOR_KERNEL) {
1539 #endif
1540 
1541  // read spinor from device memory
1542  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1543 
1544  // project spinor into half spinors
1545  a0_re = +2*i00_re;
1546  a0_im = +2*i00_im;
1547  a1_re = +2*i01_re;
1548  a1_im = +2*i01_im;
1549  a2_re = +2*i02_re;
1550  a2_im = +2*i02_im;
1551  b0_re = +2*i10_re;
1552  b0_im = +2*i10_im;
1553  b1_re = +2*i11_re;
1554  b1_im = +2*i11_im;
1555  b2_re = +2*i12_re;
1556  b2_im = +2*i12_im;
1557 
1558 #ifdef MULTI_GPU
1559  } else {
1560 
1561  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1562  const int t_proj_scale = TPROJSCALE;
1563 
1564  // read half spinor from device memory
1565  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1566 
1567  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1568  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1569  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1570  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1571  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1572  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1573 
1574  }
1575 #endif // MULTI_GPU
1576 
1577  // read gauge matrix from device memory
1578  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1579 
1580  // reconstruct gauge matrix
1582 
1583  // multiply row 0
1584  spinorFloat A0_re = 0;
1585  A0_re += g00_re * a0_re;
1586  A0_re -= g00_im * a0_im;
1587  A0_re += g01_re * a1_re;
1588  A0_re -= g01_im * a1_im;
1589  A0_re += g02_re * a2_re;
1590  A0_re -= g02_im * a2_im;
1591  spinorFloat A0_im = 0;
1592  A0_im += g00_re * a0_im;
1593  A0_im += g00_im * a0_re;
1594  A0_im += g01_re * a1_im;
1595  A0_im += g01_im * a1_re;
1596  A0_im += g02_re * a2_im;
1597  A0_im += g02_im * a2_re;
1598  spinorFloat B0_re = 0;
1599  B0_re += g00_re * b0_re;
1600  B0_re -= g00_im * b0_im;
1601  B0_re += g01_re * b1_re;
1602  B0_re -= g01_im * b1_im;
1603  B0_re += g02_re * b2_re;
1604  B0_re -= g02_im * b2_im;
1605  spinorFloat B0_im = 0;
1606  B0_im += g00_re * b0_im;
1607  B0_im += g00_im * b0_re;
1608  B0_im += g01_re * b1_im;
1609  B0_im += g01_im * b1_re;
1610  B0_im += g02_re * b2_im;
1611  B0_im += g02_im * b2_re;
1612 
1613  // multiply row 1
1614  spinorFloat A1_re = 0;
1615  A1_re += g10_re * a0_re;
1616  A1_re -= g10_im * a0_im;
1617  A1_re += g11_re * a1_re;
1618  A1_re -= g11_im * a1_im;
1619  A1_re += g12_re * a2_re;
1620  A1_re -= g12_im * a2_im;
1621  spinorFloat A1_im = 0;
1622  A1_im += g10_re * a0_im;
1623  A1_im += g10_im * a0_re;
1624  A1_im += g11_re * a1_im;
1625  A1_im += g11_im * a1_re;
1626  A1_im += g12_re * a2_im;
1627  A1_im += g12_im * a2_re;
1628  spinorFloat B1_re = 0;
1629  B1_re += g10_re * b0_re;
1630  B1_re -= g10_im * b0_im;
1631  B1_re += g11_re * b1_re;
1632  B1_re -= g11_im * b1_im;
1633  B1_re += g12_re * b2_re;
1634  B1_re -= g12_im * b2_im;
1635  spinorFloat B1_im = 0;
1636  B1_im += g10_re * b0_im;
1637  B1_im += g10_im * b0_re;
1638  B1_im += g11_re * b1_im;
1639  B1_im += g11_im * b1_re;
1640  B1_im += g12_re * b2_im;
1641  B1_im += g12_im * b2_re;
1642 
1643  // multiply row 2
1644  spinorFloat A2_re = 0;
1645  A2_re += g20_re * a0_re;
1646  A2_re -= g20_im * a0_im;
1647  A2_re += g21_re * a1_re;
1648  A2_re -= g21_im * a1_im;
1649  A2_re += g22_re * a2_re;
1650  A2_re -= g22_im * a2_im;
1651  spinorFloat A2_im = 0;
1652  A2_im += g20_re * a0_im;
1653  A2_im += g20_im * a0_re;
1654  A2_im += g21_re * a1_im;
1655  A2_im += g21_im * a1_re;
1656  A2_im += g22_re * a2_im;
1657  A2_im += g22_im * a2_re;
1658  spinorFloat B2_re = 0;
1659  B2_re += g20_re * b0_re;
1660  B2_re -= g20_im * b0_im;
1661  B2_re += g21_re * b1_re;
1662  B2_re -= g21_im * b1_im;
1663  B2_re += g22_re * b2_re;
1664  B2_re -= g22_im * b2_im;
1665  spinorFloat B2_im = 0;
1666  B2_im += g20_re * b0_im;
1667  B2_im += g20_im * b0_re;
1668  B2_im += g21_re * b1_im;
1669  B2_im += g21_im * b1_re;
1670  B2_im += g22_re * b2_im;
1671  B2_im += g22_im * b2_re;
1672 
1673  o00_re += A0_re;
1674  o00_im += A0_im;
1675  o10_re += B0_re;
1676  o10_im += B0_im;
1677 
1678  o01_re += A1_re;
1679  o01_im += A1_im;
1680  o11_re += B1_re;
1681  o11_im += B1_im;
1682 
1683  o02_re += A2_re;
1684  o02_im += A2_im;
1685  o12_re += B2_re;
1686  o12_im += B2_im;
1687 
1688  }
1689 }
1690 
1691 #ifdef MULTI_GPU
1692 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1693  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1694 #endif
1695 {
1696  // Projector P3-
1697  // 0 0 0 0
1698  // 0 0 0 0
1699  // 0 0 2 0
1700  // 0 0 0 2
1701 
1702 #ifdef MULTI_GPU
1703  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1704  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1705 #else
1706  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1707 #endif
1708 
1709 #ifdef MULTI_GPU
1710  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1711 #else
1712  const int ga_idx = sp_idx;
1713 #endif
1714 
1715  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1716  {
1723 
1724 #ifdef MULTI_GPU
1725  if (kernel_type == INTERIOR_KERNEL) {
1726 #endif
1727 
1728  // read spinor from device memory
1729  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1730 
1731  // project spinor into half spinors
1732  a0_re = +2*i20_re;
1733  a0_im = +2*i20_im;
1734  a1_re = +2*i21_re;
1735  a1_im = +2*i21_im;
1736  a2_re = +2*i22_re;
1737  a2_im = +2*i22_im;
1738  b0_re = +2*i30_re;
1739  b0_im = +2*i30_im;
1740  b1_re = +2*i31_re;
1741  b1_im = +2*i31_im;
1742  b2_re = +2*i32_re;
1743  b2_im = +2*i32_im;
1744 
1745 #ifdef MULTI_GPU
1746  } else {
1747 
1748  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1749  const int t_proj_scale = TPROJSCALE;
1750 
1751  // read half spinor from device memory
1752  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1753 
1754  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1755  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1756  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1757  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1758  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1759  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1760 
1761  }
1762 #endif // MULTI_GPU
1763 
1764  // identity gauge matrix
1771 
1772  o20_re += A0_re;
1773  o20_im += A0_im;
1774  o30_re += B0_re;
1775  o30_im += B0_im;
1776 
1777  o21_re += A1_re;
1778  o21_im += A1_im;
1779  o31_re += B1_re;
1780  o31_im += B1_im;
1781 
1782  o22_re += A2_re;
1783  o22_im += A2_im;
1784  o32_re += B2_re;
1785  o32_im += B2_im;
1786 
1787  } else {
1794 
1795 #ifdef MULTI_GPU
1796  if (kernel_type == INTERIOR_KERNEL) {
1797 #endif
1798 
1799  // read spinor from device memory
1800  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1801 
1802  // project spinor into half spinors
1803  a0_re = +2*i20_re;
1804  a0_im = +2*i20_im;
1805  a1_re = +2*i21_re;
1806  a1_im = +2*i21_im;
1807  a2_re = +2*i22_re;
1808  a2_im = +2*i22_im;
1809  b0_re = +2*i30_re;
1810  b0_im = +2*i30_im;
1811  b1_re = +2*i31_re;
1812  b1_im = +2*i31_im;
1813  b2_re = +2*i32_re;
1814  b2_im = +2*i32_im;
1815 
1816 #ifdef MULTI_GPU
1817  } else {
1818 
1819  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1820  const int t_proj_scale = TPROJSCALE;
1821 
1822  // read half spinor from device memory
1823  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1824 
1825  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1826  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1827  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1828  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1829  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1830  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1831 
1832  }
1833 #endif // MULTI_GPU
1834 
1835  // read gauge matrix from device memory
1836  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1837 
1838  // reconstruct gauge matrix
1840 
1841  // multiply row 0
1842  spinorFloat A0_re = 0;
1843  A0_re += gT00_re * a0_re;
1844  A0_re -= gT00_im * a0_im;
1845  A0_re += gT01_re * a1_re;
1846  A0_re -= gT01_im * a1_im;
1847  A0_re += gT02_re * a2_re;
1848  A0_re -= gT02_im * a2_im;
1849  spinorFloat A0_im = 0;
1850  A0_im += gT00_re * a0_im;
1851  A0_im += gT00_im * a0_re;
1852  A0_im += gT01_re * a1_im;
1853  A0_im += gT01_im * a1_re;
1854  A0_im += gT02_re * a2_im;
1855  A0_im += gT02_im * a2_re;
1856  spinorFloat B0_re = 0;
1857  B0_re += gT00_re * b0_re;
1858  B0_re -= gT00_im * b0_im;
1859  B0_re += gT01_re * b1_re;
1860  B0_re -= gT01_im * b1_im;
1861  B0_re += gT02_re * b2_re;
1862  B0_re -= gT02_im * b2_im;
1863  spinorFloat B0_im = 0;
1864  B0_im += gT00_re * b0_im;
1865  B0_im += gT00_im * b0_re;
1866  B0_im += gT01_re * b1_im;
1867  B0_im += gT01_im * b1_re;
1868  B0_im += gT02_re * b2_im;
1869  B0_im += gT02_im * b2_re;
1870 
1871  // multiply row 1
1872  spinorFloat A1_re = 0;
1873  A1_re += gT10_re * a0_re;
1874  A1_re -= gT10_im * a0_im;
1875  A1_re += gT11_re * a1_re;
1876  A1_re -= gT11_im * a1_im;
1877  A1_re += gT12_re * a2_re;
1878  A1_re -= gT12_im * a2_im;
1879  spinorFloat A1_im = 0;
1880  A1_im += gT10_re * a0_im;
1881  A1_im += gT10_im * a0_re;
1882  A1_im += gT11_re * a1_im;
1883  A1_im += gT11_im * a1_re;
1884  A1_im += gT12_re * a2_im;
1885  A1_im += gT12_im * a2_re;
1886  spinorFloat B1_re = 0;
1887  B1_re += gT10_re * b0_re;
1888  B1_re -= gT10_im * b0_im;
1889  B1_re += gT11_re * b1_re;
1890  B1_re -= gT11_im * b1_im;
1891  B1_re += gT12_re * b2_re;
1892  B1_re -= gT12_im * b2_im;
1893  spinorFloat B1_im = 0;
1894  B1_im += gT10_re * b0_im;
1895  B1_im += gT10_im * b0_re;
1896  B1_im += gT11_re * b1_im;
1897  B1_im += gT11_im * b1_re;
1898  B1_im += gT12_re * b2_im;
1899  B1_im += gT12_im * b2_re;
1900 
1901  // multiply row 2
1902  spinorFloat A2_re = 0;
1903  A2_re += gT20_re * a0_re;
1904  A2_re -= gT20_im * a0_im;
1905  A2_re += gT21_re * a1_re;
1906  A2_re -= gT21_im * a1_im;
1907  A2_re += gT22_re * a2_re;
1908  A2_re -= gT22_im * a2_im;
1909  spinorFloat A2_im = 0;
1910  A2_im += gT20_re * a0_im;
1911  A2_im += gT20_im * a0_re;
1912  A2_im += gT21_re * a1_im;
1913  A2_im += gT21_im * a1_re;
1914  A2_im += gT22_re * a2_im;
1915  A2_im += gT22_im * a2_re;
1916  spinorFloat B2_re = 0;
1917  B2_re += gT20_re * b0_re;
1918  B2_re -= gT20_im * b0_im;
1919  B2_re += gT21_re * b1_re;
1920  B2_re -= gT21_im * b1_im;
1921  B2_re += gT22_re * b2_re;
1922  B2_re -= gT22_im * b2_im;
1923  spinorFloat B2_im = 0;
1924  B2_im += gT20_re * b0_im;
1925  B2_im += gT20_im * b0_re;
1926  B2_im += gT21_re * b1_im;
1927  B2_im += gT21_im * b1_re;
1928  B2_im += gT22_re * b2_im;
1929  B2_im += gT22_im * b2_re;
1930 
1931  o20_re += A0_re;
1932  o20_im += A0_im;
1933  o30_re += B0_re;
1934  o30_im += B0_im;
1935 
1936  o21_re += A1_re;
1937  o21_im += A1_im;
1938  o31_re += B1_re;
1939  o31_im += B1_im;
1940 
1941  o22_re += A2_re;
1942  o22_im += A2_im;
1943  o32_re += B2_re;
1944  o32_im += B2_im;
1945 
1946  }
1947 }
1948 
1949 #ifdef MULTI_GPU
1950 
1951 int incomplete = 0; // Have all 8 contributions been computed for this site?
1952 
1953 switch(kernel_type) { // intentional fall-through
1954 case INTERIOR_KERNEL:
1955  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
1956 case EXTERIOR_KERNEL_T:
1957  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
1958 case EXTERIOR_KERNEL_Z:
1959  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
1960 case EXTERIOR_KERNEL_Y:
1961  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
1962 }
1963 
1964 if (!incomplete)
1965 #endif // MULTI_GPU
1966 {
1967  {
1968  // apply twisted mass rotation
1969  VOLATILE spinorFloat tmp00_re = +o00_re-o20_im*a;
1970  VOLATILE spinorFloat tmp00_im = +o00_im+o20_re*a;
1971  VOLATILE spinorFloat tmp01_re = +o01_re-o21_im*a;
1972  VOLATILE spinorFloat tmp01_im = +o01_im+o21_re*a;
1973  VOLATILE spinorFloat tmp02_re = +o02_re-o22_im*a;
1974  VOLATILE spinorFloat tmp02_im = +o02_im+o22_re*a;
1975 
1976  VOLATILE spinorFloat tmp10_re = +o10_re-o30_im*a;
1977  VOLATILE spinorFloat tmp10_im = +o10_im+o30_re*a;
1978  VOLATILE spinorFloat tmp11_re = +o11_re-o31_im*a;
1979  VOLATILE spinorFloat tmp11_im = +o11_im+o31_re*a;
1980  VOLATILE spinorFloat tmp12_re = +o12_re-o32_im*a;
1981  VOLATILE spinorFloat tmp12_im = +o12_im+o32_re*a;
1982 
1983  VOLATILE spinorFloat tmp20_re = -o00_im*a+o20_re;
1984  VOLATILE spinorFloat tmp20_im = +o00_re*a+o20_im;
1985  VOLATILE spinorFloat tmp21_re = -o01_im*a+o21_re;
1986  VOLATILE spinorFloat tmp21_im = +o01_re*a+o21_im;
1987  VOLATILE spinorFloat tmp22_re = -o02_im*a+o22_re;
1988  VOLATILE spinorFloat tmp22_im = +o02_re*a+o22_im;
1989 
1990  VOLATILE spinorFloat tmp30_re = -o10_im*a+o30_re;
1991  VOLATILE spinorFloat tmp30_im = +o10_re*a+o30_im;
1992  VOLATILE spinorFloat tmp31_re = -o11_im*a+o31_re;
1993  VOLATILE spinorFloat tmp31_im = +o11_re*a+o31_im;
1994  VOLATILE spinorFloat tmp32_re = -o12_im*a+o32_re;
1995  VOLATILE spinorFloat tmp32_im = +o12_re*a+o32_im;
1996 
1997 
1998 #ifndef DSLASH_XPAY
1999  //scale by b = 1/(1 + a*a)
2000  o00_re = b*tmp00_re;
2001  o00_im = b*tmp00_im;
2002  o01_re = b*tmp01_re;
2003  o01_im = b*tmp01_im;
2004  o02_re = b*tmp02_re;
2005  o02_im = b*tmp02_im;
2006  o10_re = b*tmp10_re;
2007  o10_im = b*tmp10_im;
2008  o11_re = b*tmp11_re;
2009  o11_im = b*tmp11_im;
2010  o12_re = b*tmp12_re;
2011  o12_im = b*tmp12_im;
2012  o20_re = b*tmp20_re;
2013  o20_im = b*tmp20_im;
2014  o21_re = b*tmp21_re;
2015  o21_im = b*tmp21_im;
2016  o22_re = b*tmp22_re;
2017  o22_im = b*tmp22_im;
2018  o30_re = b*tmp30_re;
2019  o30_im = b*tmp30_im;
2020  o31_re = b*tmp31_re;
2021  o31_im = b*tmp31_im;
2022  o32_re = b*tmp32_re;
2023  o32_im = b*tmp32_im;
2024 #else
2025  o00_re = tmp00_re;
2026  o00_im = tmp00_im;
2027  o01_re = tmp01_re;
2028  o01_im = tmp01_im;
2029  o02_re = tmp02_re;
2030  o02_im = tmp02_im;
2031  o10_re = tmp10_re;
2032  o10_im = tmp10_im;
2033  o11_re = tmp11_re;
2034  o11_im = tmp11_im;
2035  o12_re = tmp12_re;
2036  o12_im = tmp12_im;
2037  o20_re = tmp20_re;
2038  o20_im = tmp20_im;
2039  o21_re = tmp21_re;
2040  o21_im = tmp21_im;
2041  o22_re = tmp22_re;
2042  o22_im = tmp22_im;
2043  o30_re = tmp30_re;
2044  o30_im = tmp30_im;
2045  o31_re = tmp31_re;
2046  o31_im = tmp31_im;
2047  o32_re = tmp32_re;
2048  o32_im = tmp32_im;
2049 #endif // DSLASH_XPAY
2050 
2051  }
2052 #ifdef DSLASH_XPAY
2053 
2054  READ_ACCUM(ACCUMTEX, sp_stride)
2055 
2056  o00_re = b*o00_re+acc00_re;
2057  o00_im = b*o00_im+acc00_im;
2058  o01_re = b*o01_re+acc01_re;
2059  o01_im = b*o01_im+acc01_im;
2060  o02_re = b*o02_re+acc02_re;
2061  o02_im = b*o02_im+acc02_im;
2062  o10_re = b*o10_re+acc10_re;
2063  o10_im = b*o10_im+acc10_im;
2064  o11_re = b*o11_re+acc11_re;
2065  o11_im = b*o11_im+acc11_im;
2066  o12_re = b*o12_re+acc12_re;
2067  o12_im = b*o12_im+acc12_im;
2068  o20_re = b*o20_re+acc20_re;
2069  o20_im = b*o20_im+acc20_im;
2070  o21_re = b*o21_re+acc21_re;
2071  o21_im = b*o21_im+acc21_im;
2072  o22_re = b*o22_re+acc22_re;
2073  o22_im = b*o22_im+acc22_im;
2074  o30_re = b*o30_re+acc30_re;
2075  o30_im = b*o30_im+acc30_im;
2076  o31_re = b*o31_re+acc31_re;
2077  o31_im = b*o31_im+acc31_im;
2078  o32_re = b*o32_re+acc32_re;
2079  o32_im = b*o32_im+acc32_im;
2080 #endif // DSLASH_XPAY
2081 }
2082 
2083 // write spinor field back to device memory
2085 
2086 // undefine to prevent warning when precision is changed
2087 #undef spinorFloat
2088 #undef g00_re
2089 #undef g00_im
2090 #undef g01_re
2091 #undef g01_im
2092 #undef g02_re
2093 #undef g02_im
2094 #undef g10_re
2095 #undef g10_im
2096 #undef g11_re
2097 #undef g11_im
2098 #undef g12_re
2099 #undef g12_im
2100 #undef g20_re
2101 #undef g20_im
2102 #undef g21_re
2103 #undef g21_im
2104 #undef g22_re
2105 #undef g22_im
2106 
2107 #undef i00_re
2108 #undef i00_im
2109 #undef i01_re
2110 #undef i01_im
2111 #undef i02_re
2112 #undef i02_im
2113 #undef i10_re
2114 #undef i10_im
2115 #undef i11_re
2116 #undef i11_im
2117 #undef i12_re
2118 #undef i12_im
2119 #undef i20_re
2120 #undef i20_im
2121 #undef i21_re
2122 #undef i21_im
2123 #undef i22_re
2124 #undef i22_im
2125 #undef i30_re
2126 #undef i30_im
2127 #undef i31_re
2128 #undef i31_im
2129 #undef i32_re
2130 #undef i32_im
2131 
2132 #undef acc00_re
2133 #undef acc00_im
2134 #undef acc01_re
2135 #undef acc01_im
2136 #undef acc02_re
2137 #undef acc02_im
2138 #undef acc10_re
2139 #undef acc10_im
2140 #undef acc11_re
2141 #undef acc11_im
2142 #undef acc12_re
2143 #undef acc12_im
2144 #undef acc20_re
2145 #undef acc20_im
2146 #undef acc21_re
2147 #undef acc21_im
2148 #undef acc22_re
2149 #undef acc22_im
2150 #undef acc30_re
2151 #undef acc30_im
2152 #undef acc31_re
2153 #undef acc31_im
2154 #undef acc32_re
2155 #undef acc32_im
2156 
2157 
2158 
2159 #undef VOLATILE