QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_ndeg_dslash_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA NDEG TWISTED MASS DSLASH DAGGER ***
2 
3 // Arguments (double) mu, (double)eta and (double)delta
4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
5 #define FLAVORS 2
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #else
41 #define spinorFloat float
42 #define i00_re I0.x
43 #define i00_im I0.y
44 #define i01_re I0.z
45 #define i01_im I0.w
46 #define i02_re I1.x
47 #define i02_im I1.y
48 #define i10_re I1.z
49 #define i10_im I1.w
50 #define i11_re I2.x
51 #define i11_im I2.y
52 #define i12_re I2.z
53 #define i12_im I2.w
54 #define i20_re I3.x
55 #define i20_im I3.y
56 #define i21_re I3.z
57 #define i21_im I3.w
58 #define i22_re I4.x
59 #define i22_im I4.y
60 #define i30_re I4.z
61 #define i30_im I4.w
62 #define i31_re I5.x
63 #define i31_im I5.y
64 #define i32_re I5.z
65 #define i32_im I5.w
66 #endif // SPINOR_DOUBLE
67 
68 // gauge link
69 #ifdef GAUGE_FLOAT2
70 #define g00_re G0.x
71 #define g00_im G0.y
72 #define g01_re G1.x
73 #define g01_im G1.y
74 #define g02_re G2.x
75 #define g02_im G2.y
76 #define g10_re G3.x
77 #define g10_im G3.y
78 #define g11_re G4.x
79 #define g11_im G4.y
80 #define g12_re G5.x
81 #define g12_im G5.y
82 #define g20_re G6.x
83 #define g20_im G6.y
84 #define g21_re G7.x
85 #define g21_im G7.y
86 #define g22_re G8.x
87 #define g22_im G8.y
88 
89 #else
90 #define g00_re G0.x
91 #define g00_im G0.y
92 #define g01_re G0.z
93 #define g01_im G0.w
94 #define g02_re G1.x
95 #define g02_im G1.y
96 #define g10_re G1.z
97 #define g10_im G1.w
98 #define g11_re G2.x
99 #define g11_im G2.y
100 #define g12_re G2.z
101 #define g12_im G2.w
102 #define g20_re G3.x
103 #define g20_im G3.y
104 #define g21_re G3.z
105 #define g21_im G3.w
106 #define g22_re G4.x
107 #define g22_im G4.y
108 
109 #endif // GAUGE_DOUBLE
110 
111 // conjugated gauge link
112 #define gT00_re (+g00_re)
113 #define gT00_im (-g00_im)
114 #define gT01_re (+g10_re)
115 #define gT01_im (-g10_im)
116 #define gT02_re (+g20_re)
117 #define gT02_im (-g20_im)
118 #define gT10_re (+g01_re)
119 #define gT10_im (-g01_im)
120 #define gT11_re (+g11_re)
121 #define gT11_im (-g11_im)
122 #define gT12_re (+g21_re)
123 #define gT12_im (-g21_im)
124 #define gT20_re (+g02_re)
125 #define gT20_im (-g02_im)
126 #define gT21_re (+g12_re)
127 #define gT21_im (-g12_im)
128 #define gT22_re (+g22_re)
129 #define gT22_im (-g22_im)
130 
131 // output spinor for flavor 1
156 // output spinor for flavor 2
181 
182 #include "read_gauge.h"
183 #include "io_spinor.h"
184 
185 int x1, x2, x3, x4;
186 int X;
187 
188 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
189 int sp_norm_idx;
190 #endif // MULTI_GPU half precision
191 
192 int sid;
193 
194 #ifdef MULTI_GPU
195 int face_idx;
196 if (kernel_type == INTERIOR_KERNEL) {
197 #endif
198 
199  sid = blockIdx.x*blockDim.x + threadIdx.x;
200  if (sid >= param.threads) return;
201 
202  // Inline by hand for the moment and assume even dimensions
203  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity);
204 
205  o1_00_re = 0; o1_00_im = 0;
206  o1_01_re = 0; o1_01_im = 0;
207  o1_02_re = 0; o1_02_im = 0;
208  o1_10_re = 0; o1_10_im = 0;
209  o1_11_re = 0; o1_11_im = 0;
210  o1_12_re = 0; o1_12_im = 0;
211  o1_20_re = 0; o1_20_im = 0;
212  o1_21_re = 0; o1_21_im = 0;
213  o1_22_re = 0; o1_22_im = 0;
214  o1_30_re = 0; o1_30_im = 0;
215  o1_31_re = 0; o1_31_im = 0;
216  o1_32_re = 0; o1_32_im = 0;
217 
218  o2_00_re = 0; o2_00_im = 0;
219  o2_01_re = 0; o2_01_im = 0;
220  o2_02_re = 0; o2_02_im = 0;
221  o2_10_re = 0; o2_10_im = 0;
222  o2_11_re = 0; o2_11_im = 0;
223  o2_12_re = 0; o2_12_im = 0;
224  o2_20_re = 0; o2_20_im = 0;
225  o2_21_re = 0; o2_21_im = 0;
226  o2_22_re = 0; o2_22_im = 0;
227  o2_30_re = 0; o2_30_im = 0;
228  o2_31_re = 0; o2_31_im = 0;
229  o2_32_re = 0; o2_32_im = 0;
230 
231 #ifdef MULTI_GPU
232 } else { // exterior kernel
233 
234  sid = blockIdx.x*blockDim.x + threadIdx.x;
235  if (sid >= param.threads) return;
236 
237  const int dim = static_cast<int>(kernel_type);
238  const int face_volume = (param.threads >> 1); // volume of one face (per flavor)
239  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
240  face_idx = sid - face_num*face_volume; // index into the respective face
241 
242  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
243  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
244  //sp_idx = face_idx + param.ghostOffset[dim];
245 
246 #if (DD_PREC==2) // half precision
247  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
248 #endif
249 
250  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity);
251 
252 
253  {
267 
268 
269  }
270  {
284 
285 
286  }
287 }
288 #endif // MULTI_GPU
289 
290 
291 #ifdef MULTI_GPU
292 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
293  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
294 #endif
295 {
296  // Projector P0+
297  // 1 0 0 i
298  // 0 1 i 0
299  // 0 -i 1 0
300  // -i 0 0 1
301 
302 #ifdef MULTI_GPU
303  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
304  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
305 #else
306  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
307 #endif
308 
309  const int ga_idx = sid;
310 
317 
318  // read gauge matrix from device memory
319  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
320 
321  // reconstruct gauge matrix
323 
324  {
325 #ifdef MULTI_GPU
326  if (kernel_type == INTERIOR_KERNEL) {
327 #endif
328 
329  // read flavor 1 from device memory
330  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
331 
332  // project spinor into half spinors
333  a0_re = +i00_re-i30_im;
334  a0_im = +i00_im+i30_re;
335  a1_re = +i01_re-i31_im;
336  a1_im = +i01_im+i31_re;
337  a2_re = +i02_re-i32_im;
338  a2_im = +i02_im+i32_re;
339  b0_re = +i10_re-i20_im;
340  b0_im = +i10_im+i20_re;
341  b1_re = +i11_re-i21_im;
342  b1_im = +i11_im+i21_re;
343  b2_re = +i12_re-i22_im;
344  b2_im = +i12_im+i22_re;
345 
346 #ifdef MULTI_GPU
347  } else {
348 
349  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
350 
351  // read half spinor for the first flavor from device memory
352  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
353 
354  a0_re = i00_re; a0_im = i00_im;
355  a1_re = i01_re; a1_im = i01_im;
356  a2_re = i02_re; a2_im = i02_im;
357  b0_re = i10_re; b0_im = i10_im;
358  b1_re = i11_re; b1_im = i11_im;
359  b2_re = i12_re; b2_im = i12_im;
360 
361  }
362 #endif // MULTI_GPU
363 
364  // multiply row 0
366  A0_re += g00_re * a0_re;
367  A0_re -= g00_im * a0_im;
368  A0_re += g01_re * a1_re;
369  A0_re -= g01_im * a1_im;
370  A0_re += g02_re * a2_re;
371  A0_re -= g02_im * a2_im;
373  A0_im += g00_re * a0_im;
374  A0_im += g00_im * a0_re;
375  A0_im += g01_re * a1_im;
376  A0_im += g01_im * a1_re;
377  A0_im += g02_re * a2_im;
378  A0_im += g02_im * a2_re;
380  B0_re += g00_re * b0_re;
381  B0_re -= g00_im * b0_im;
382  B0_re += g01_re * b1_re;
383  B0_re -= g01_im * b1_im;
384  B0_re += g02_re * b2_re;
385  B0_re -= g02_im * b2_im;
387  B0_im += g00_re * b0_im;
388  B0_im += g00_im * b0_re;
389  B0_im += g01_re * b1_im;
390  B0_im += g01_im * b1_re;
391  B0_im += g02_re * b2_im;
392  B0_im += g02_im * b2_re;
393 
394  // multiply row 1
396  A1_re += g10_re * a0_re;
397  A1_re -= g10_im * a0_im;
398  A1_re += g11_re * a1_re;
399  A1_re -= g11_im * a1_im;
400  A1_re += g12_re * a2_re;
401  A1_re -= g12_im * a2_im;
403  A1_im += g10_re * a0_im;
404  A1_im += g10_im * a0_re;
405  A1_im += g11_re * a1_im;
406  A1_im += g11_im * a1_re;
407  A1_im += g12_re * a2_im;
408  A1_im += g12_im * a2_re;
410  B1_re += g10_re * b0_re;
411  B1_re -= g10_im * b0_im;
412  B1_re += g11_re * b1_re;
413  B1_re -= g11_im * b1_im;
414  B1_re += g12_re * b2_re;
415  B1_re -= g12_im * b2_im;
417  B1_im += g10_re * b0_im;
418  B1_im += g10_im * b0_re;
419  B1_im += g11_re * b1_im;
420  B1_im += g11_im * b1_re;
421  B1_im += g12_re * b2_im;
422  B1_im += g12_im * b2_re;
423 
424  // multiply row 2
426  A2_re += g20_re * a0_re;
427  A2_re -= g20_im * a0_im;
428  A2_re += g21_re * a1_re;
429  A2_re -= g21_im * a1_im;
430  A2_re += g22_re * a2_re;
431  A2_re -= g22_im * a2_im;
433  A2_im += g20_re * a0_im;
434  A2_im += g20_im * a0_re;
435  A2_im += g21_re * a1_im;
436  A2_im += g21_im * a1_re;
437  A2_im += g22_re * a2_im;
438  A2_im += g22_im * a2_re;
440  B2_re += g20_re * b0_re;
441  B2_re -= g20_im * b0_im;
442  B2_re += g21_re * b1_re;
443  B2_re -= g21_im * b1_im;
444  B2_re += g22_re * b2_re;
445  B2_re -= g22_im * b2_im;
447  B2_im += g20_re * b0_im;
448  B2_im += g20_im * b0_re;
449  B2_im += g21_re * b1_im;
450  B2_im += g21_im * b1_re;
451  B2_im += g22_re * b2_im;
452  B2_im += g22_im * b2_re;
453 
454  o1_00_re += A0_re;
455  o1_00_im += A0_im;
456  o1_10_re += B0_re;
457  o1_10_im += B0_im;
458  o1_20_re += B0_im;
459  o1_20_im -= B0_re;
460  o1_30_re += A0_im;
461  o1_30_im -= A0_re;
462 
463  o1_01_re += A1_re;
464  o1_01_im += A1_im;
465  o1_11_re += B1_re;
466  o1_11_im += B1_im;
467  o1_21_re += B1_im;
468  o1_21_im -= B1_re;
469  o1_31_re += A1_im;
470  o1_31_im -= A1_re;
471 
472  o1_02_re += A2_re;
473  o1_02_im += A2_im;
474  o1_12_re += B2_re;
475  o1_12_im += B2_im;
476  o1_22_re += B2_im;
477  o1_22_im -= B2_re;
478  o1_32_re += A2_im;
479  o1_32_im -= A2_re;
480 
481  }
482  {
483 #ifdef MULTI_GPU
484  if (kernel_type == INTERIOR_KERNEL) {
485 #endif
486 
487  // read flavor 2 from device memory
489 
490  // project spinor into half spinors
491  a0_re = +i00_re-i30_im;
492  a0_im = +i00_im+i30_re;
493  a1_re = +i01_re-i31_im;
494  a1_im = +i01_im+i31_re;
495  a2_re = +i02_re-i32_im;
496  a2_im = +i02_im+i32_re;
497  b0_re = +i10_re-i20_im;
498  b0_im = +i10_im+i20_re;
499  b1_re = +i11_re-i21_im;
500  b1_im = +i11_im+i21_re;
501  b2_re = +i12_re-i22_im;
502  b2_im = +i12_im+i22_re;
503 
504 #ifdef MULTI_GPU
505  } else {
506 
507  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
508 
509  // read half spinor for the second flavor from device memory
510  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
511  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+FLAVORS*ghostFace[static_cast<int>(kernel_type)]);
512 
513  a0_re = i00_re; a0_im = i00_im;
514  a1_re = i01_re; a1_im = i01_im;
515  a2_re = i02_re; a2_im = i02_im;
516  b0_re = i10_re; b0_im = i10_im;
517  b1_re = i11_re; b1_im = i11_im;
518  b2_re = i12_re; b2_im = i12_im;
519 
520  }
521 #endif // MULTI_GPU
522 
523  // multiply row 0
524  spinorFloat A0_re = 0;
525  A0_re += g00_re * a0_re;
526  A0_re -= g00_im * a0_im;
527  A0_re += g01_re * a1_re;
528  A0_re -= g01_im * a1_im;
529  A0_re += g02_re * a2_re;
530  A0_re -= g02_im * a2_im;
531  spinorFloat A0_im = 0;
532  A0_im += g00_re * a0_im;
533  A0_im += g00_im * a0_re;
534  A0_im += g01_re * a1_im;
535  A0_im += g01_im * a1_re;
536  A0_im += g02_re * a2_im;
537  A0_im += g02_im * a2_re;
538  spinorFloat B0_re = 0;
539  B0_re += g00_re * b0_re;
540  B0_re -= g00_im * b0_im;
541  B0_re += g01_re * b1_re;
542  B0_re -= g01_im * b1_im;
543  B0_re += g02_re * b2_re;
544  B0_re -= g02_im * b2_im;
545  spinorFloat B0_im = 0;
546  B0_im += g00_re * b0_im;
547  B0_im += g00_im * b0_re;
548  B0_im += g01_re * b1_im;
549  B0_im += g01_im * b1_re;
550  B0_im += g02_re * b2_im;
551  B0_im += g02_im * b2_re;
552 
553  // multiply row 1
554  spinorFloat A1_re = 0;
555  A1_re += g10_re * a0_re;
556  A1_re -= g10_im * a0_im;
557  A1_re += g11_re * a1_re;
558  A1_re -= g11_im * a1_im;
559  A1_re += g12_re * a2_re;
560  A1_re -= g12_im * a2_im;
561  spinorFloat A1_im = 0;
562  A1_im += g10_re * a0_im;
563  A1_im += g10_im * a0_re;
564  A1_im += g11_re * a1_im;
565  A1_im += g11_im * a1_re;
566  A1_im += g12_re * a2_im;
567  A1_im += g12_im * a2_re;
568  spinorFloat B1_re = 0;
569  B1_re += g10_re * b0_re;
570  B1_re -= g10_im * b0_im;
571  B1_re += g11_re * b1_re;
572  B1_re -= g11_im * b1_im;
573  B1_re += g12_re * b2_re;
574  B1_re -= g12_im * b2_im;
575  spinorFloat B1_im = 0;
576  B1_im += g10_re * b0_im;
577  B1_im += g10_im * b0_re;
578  B1_im += g11_re * b1_im;
579  B1_im += g11_im * b1_re;
580  B1_im += g12_re * b2_im;
581  B1_im += g12_im * b2_re;
582 
583  // multiply row 2
584  spinorFloat A2_re = 0;
585  A2_re += g20_re * a0_re;
586  A2_re -= g20_im * a0_im;
587  A2_re += g21_re * a1_re;
588  A2_re -= g21_im * a1_im;
589  A2_re += g22_re * a2_re;
590  A2_re -= g22_im * a2_im;
591  spinorFloat A2_im = 0;
592  A2_im += g20_re * a0_im;
593  A2_im += g20_im * a0_re;
594  A2_im += g21_re * a1_im;
595  A2_im += g21_im * a1_re;
596  A2_im += g22_re * a2_im;
597  A2_im += g22_im * a2_re;
598  spinorFloat B2_re = 0;
599  B2_re += g20_re * b0_re;
600  B2_re -= g20_im * b0_im;
601  B2_re += g21_re * b1_re;
602  B2_re -= g21_im * b1_im;
603  B2_re += g22_re * b2_re;
604  B2_re -= g22_im * b2_im;
605  spinorFloat B2_im = 0;
606  B2_im += g20_re * b0_im;
607  B2_im += g20_im * b0_re;
608  B2_im += g21_re * b1_im;
609  B2_im += g21_im * b1_re;
610  B2_im += g22_re * b2_im;
611  B2_im += g22_im * b2_re;
612 
613  o2_00_re += A0_re;
614  o2_00_im += A0_im;
615  o2_10_re += B0_re;
616  o2_10_im += B0_im;
617  o2_20_re += B0_im;
618  o2_20_im -= B0_re;
619  o2_30_re += A0_im;
620  o2_30_im -= A0_re;
621 
622  o2_01_re += A1_re;
623  o2_01_im += A1_im;
624  o2_11_re += B1_re;
625  o2_11_im += B1_im;
626  o2_21_re += B1_im;
627  o2_21_im -= B1_re;
628  o2_31_re += A1_im;
629  o2_31_im -= A1_re;
630 
631  o2_02_re += A2_re;
632  o2_02_im += A2_im;
633  o2_12_re += B2_re;
634  o2_12_im += B2_im;
635  o2_22_re += B2_im;
636  o2_22_im -= B2_re;
637  o2_32_re += A2_im;
638  o2_32_im -= A2_re;
639 
640  }
641 }
642 
643 #ifdef MULTI_GPU
644 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
645  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
646 #endif
647 {
648  // Projector P0-
649  // 1 0 0 -i
650  // 0 1 -i 0
651  // 0 i 1 0
652  // i 0 0 1
653 
654 #ifdef MULTI_GPU
655  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
656  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
657 #else
658  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
659 #endif
660 
661 #ifdef MULTI_GPU
662  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
663 #else
664  const int ga_idx = sp_idx;
665 #endif
666 
673 
674  // read gauge matrix from device memory
675  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
676 
677  // reconstruct gauge matrix
679 
680  {
681 #ifdef MULTI_GPU
682  if (kernel_type == INTERIOR_KERNEL) {
683 #endif
684 
685  // read flavor 1 from device memory
686  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
687 
688  // project spinor into half spinors
689  a0_re = +i00_re+i30_im;
690  a0_im = +i00_im-i30_re;
691  a1_re = +i01_re+i31_im;
692  a1_im = +i01_im-i31_re;
693  a2_re = +i02_re+i32_im;
694  a2_im = +i02_im-i32_re;
695  b0_re = +i10_re+i20_im;
696  b0_im = +i10_im-i20_re;
697  b1_re = +i11_re+i21_im;
698  b1_im = +i11_im-i21_re;
699  b2_re = +i12_re+i22_im;
700  b2_im = +i12_im-i22_re;
701 
702 #ifdef MULTI_GPU
703  } else {
704 
705  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
706 
707  // read half spinor for the first flavor from device memory
708  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
709 
710  a0_re = i00_re; a0_im = i00_im;
711  a1_re = i01_re; a1_im = i01_im;
712  a2_re = i02_re; a2_im = i02_im;
713  b0_re = i10_re; b0_im = i10_im;
714  b1_re = i11_re; b1_im = i11_im;
715  b2_re = i12_re; b2_im = i12_im;
716 
717  }
718 #endif // MULTI_GPU
719 
720  // multiply row 0
721  spinorFloat A0_re = 0;
722  A0_re += gT00_re * a0_re;
723  A0_re -= gT00_im * a0_im;
724  A0_re += gT01_re * a1_re;
725  A0_re -= gT01_im * a1_im;
726  A0_re += gT02_re * a2_re;
727  A0_re -= gT02_im * a2_im;
728  spinorFloat A0_im = 0;
729  A0_im += gT00_re * a0_im;
730  A0_im += gT00_im * a0_re;
731  A0_im += gT01_re * a1_im;
732  A0_im += gT01_im * a1_re;
733  A0_im += gT02_re * a2_im;
734  A0_im += gT02_im * a2_re;
735  spinorFloat B0_re = 0;
736  B0_re += gT00_re * b0_re;
737  B0_re -= gT00_im * b0_im;
738  B0_re += gT01_re * b1_re;
739  B0_re -= gT01_im * b1_im;
740  B0_re += gT02_re * b2_re;
741  B0_re -= gT02_im * b2_im;
742  spinorFloat B0_im = 0;
743  B0_im += gT00_re * b0_im;
744  B0_im += gT00_im * b0_re;
745  B0_im += gT01_re * b1_im;
746  B0_im += gT01_im * b1_re;
747  B0_im += gT02_re * b2_im;
748  B0_im += gT02_im * b2_re;
749 
750  // multiply row 1
751  spinorFloat A1_re = 0;
752  A1_re += gT10_re * a0_re;
753  A1_re -= gT10_im * a0_im;
754  A1_re += gT11_re * a1_re;
755  A1_re -= gT11_im * a1_im;
756  A1_re += gT12_re * a2_re;
757  A1_re -= gT12_im * a2_im;
758  spinorFloat A1_im = 0;
759  A1_im += gT10_re * a0_im;
760  A1_im += gT10_im * a0_re;
761  A1_im += gT11_re * a1_im;
762  A1_im += gT11_im * a1_re;
763  A1_im += gT12_re * a2_im;
764  A1_im += gT12_im * a2_re;
765  spinorFloat B1_re = 0;
766  B1_re += gT10_re * b0_re;
767  B1_re -= gT10_im * b0_im;
768  B1_re += gT11_re * b1_re;
769  B1_re -= gT11_im * b1_im;
770  B1_re += gT12_re * b2_re;
771  B1_re -= gT12_im * b2_im;
772  spinorFloat B1_im = 0;
773  B1_im += gT10_re * b0_im;
774  B1_im += gT10_im * b0_re;
775  B1_im += gT11_re * b1_im;
776  B1_im += gT11_im * b1_re;
777  B1_im += gT12_re * b2_im;
778  B1_im += gT12_im * b2_re;
779 
780  // multiply row 2
781  spinorFloat A2_re = 0;
782  A2_re += gT20_re * a0_re;
783  A2_re -= gT20_im * a0_im;
784  A2_re += gT21_re * a1_re;
785  A2_re -= gT21_im * a1_im;
786  A2_re += gT22_re * a2_re;
787  A2_re -= gT22_im * a2_im;
788  spinorFloat A2_im = 0;
789  A2_im += gT20_re * a0_im;
790  A2_im += gT20_im * a0_re;
791  A2_im += gT21_re * a1_im;
792  A2_im += gT21_im * a1_re;
793  A2_im += gT22_re * a2_im;
794  A2_im += gT22_im * a2_re;
795  spinorFloat B2_re = 0;
796  B2_re += gT20_re * b0_re;
797  B2_re -= gT20_im * b0_im;
798  B2_re += gT21_re * b1_re;
799  B2_re -= gT21_im * b1_im;
800  B2_re += gT22_re * b2_re;
801  B2_re -= gT22_im * b2_im;
802  spinorFloat B2_im = 0;
803  B2_im += gT20_re * b0_im;
804  B2_im += gT20_im * b0_re;
805  B2_im += gT21_re * b1_im;
806  B2_im += gT21_im * b1_re;
807  B2_im += gT22_re * b2_im;
808  B2_im += gT22_im * b2_re;
809 
810  o1_00_re += A0_re;
811  o1_00_im += A0_im;
812  o1_10_re += B0_re;
813  o1_10_im += B0_im;
814  o1_20_re -= B0_im;
815  o1_20_im += B0_re;
816  o1_30_re -= A0_im;
817  o1_30_im += A0_re;
818 
819  o1_01_re += A1_re;
820  o1_01_im += A1_im;
821  o1_11_re += B1_re;
822  o1_11_im += B1_im;
823  o1_21_re -= B1_im;
824  o1_21_im += B1_re;
825  o1_31_re -= A1_im;
826  o1_31_im += A1_re;
827 
828  o1_02_re += A2_re;
829  o1_02_im += A2_im;
830  o1_12_re += B2_re;
831  o1_12_im += B2_im;
832  o1_22_re -= B2_im;
833  o1_22_im += B2_re;
834  o1_32_re -= A2_im;
835  o1_32_im += A2_re;
836 
837  }
838  {
839 #ifdef MULTI_GPU
840  if (kernel_type == INTERIOR_KERNEL) {
841 #endif
842 
843  // read flavor 2 from device memory
845 
846  // project spinor into half spinors
847  a0_re = +i00_re+i30_im;
848  a0_im = +i00_im-i30_re;
849  a1_re = +i01_re+i31_im;
850  a1_im = +i01_im-i31_re;
851  a2_re = +i02_re+i32_im;
852  a2_im = +i02_im-i32_re;
853  b0_re = +i10_re+i20_im;
854  b0_im = +i10_im-i20_re;
855  b1_re = +i11_re+i21_im;
856  b1_im = +i11_im-i21_re;
857  b2_re = +i12_re+i22_im;
858  b2_im = +i12_im-i22_re;
859 
860 #ifdef MULTI_GPU
861  } else {
862 
863  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
864 
865  // read half spinor for the second flavor from device memory
866  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
867  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
868 
869  a0_re = i00_re; a0_im = i00_im;
870  a1_re = i01_re; a1_im = i01_im;
871  a2_re = i02_re; a2_im = i02_im;
872  b0_re = i10_re; b0_im = i10_im;
873  b1_re = i11_re; b1_im = i11_im;
874  b2_re = i12_re; b2_im = i12_im;
875 
876  }
877 #endif // MULTI_GPU
878 
879  // multiply row 0
880  spinorFloat A0_re = 0;
881  A0_re += gT00_re * a0_re;
882  A0_re -= gT00_im * a0_im;
883  A0_re += gT01_re * a1_re;
884  A0_re -= gT01_im * a1_im;
885  A0_re += gT02_re * a2_re;
886  A0_re -= gT02_im * a2_im;
887  spinorFloat A0_im = 0;
888  A0_im += gT00_re * a0_im;
889  A0_im += gT00_im * a0_re;
890  A0_im += gT01_re * a1_im;
891  A0_im += gT01_im * a1_re;
892  A0_im += gT02_re * a2_im;
893  A0_im += gT02_im * a2_re;
894  spinorFloat B0_re = 0;
895  B0_re += gT00_re * b0_re;
896  B0_re -= gT00_im * b0_im;
897  B0_re += gT01_re * b1_re;
898  B0_re -= gT01_im * b1_im;
899  B0_re += gT02_re * b2_re;
900  B0_re -= gT02_im * b2_im;
901  spinorFloat B0_im = 0;
902  B0_im += gT00_re * b0_im;
903  B0_im += gT00_im * b0_re;
904  B0_im += gT01_re * b1_im;
905  B0_im += gT01_im * b1_re;
906  B0_im += gT02_re * b2_im;
907  B0_im += gT02_im * b2_re;
908 
909  // multiply row 1
910  spinorFloat A1_re = 0;
911  A1_re += gT10_re * a0_re;
912  A1_re -= gT10_im * a0_im;
913  A1_re += gT11_re * a1_re;
914  A1_re -= gT11_im * a1_im;
915  A1_re += gT12_re * a2_re;
916  A1_re -= gT12_im * a2_im;
917  spinorFloat A1_im = 0;
918  A1_im += gT10_re * a0_im;
919  A1_im += gT10_im * a0_re;
920  A1_im += gT11_re * a1_im;
921  A1_im += gT11_im * a1_re;
922  A1_im += gT12_re * a2_im;
923  A1_im += gT12_im * a2_re;
924  spinorFloat B1_re = 0;
925  B1_re += gT10_re * b0_re;
926  B1_re -= gT10_im * b0_im;
927  B1_re += gT11_re * b1_re;
928  B1_re -= gT11_im * b1_im;
929  B1_re += gT12_re * b2_re;
930  B1_re -= gT12_im * b2_im;
931  spinorFloat B1_im = 0;
932  B1_im += gT10_re * b0_im;
933  B1_im += gT10_im * b0_re;
934  B1_im += gT11_re * b1_im;
935  B1_im += gT11_im * b1_re;
936  B1_im += gT12_re * b2_im;
937  B1_im += gT12_im * b2_re;
938 
939  // multiply row 2
940  spinorFloat A2_re = 0;
941  A2_re += gT20_re * a0_re;
942  A2_re -= gT20_im * a0_im;
943  A2_re += gT21_re * a1_re;
944  A2_re -= gT21_im * a1_im;
945  A2_re += gT22_re * a2_re;
946  A2_re -= gT22_im * a2_im;
947  spinorFloat A2_im = 0;
948  A2_im += gT20_re * a0_im;
949  A2_im += gT20_im * a0_re;
950  A2_im += gT21_re * a1_im;
951  A2_im += gT21_im * a1_re;
952  A2_im += gT22_re * a2_im;
953  A2_im += gT22_im * a2_re;
954  spinorFloat B2_re = 0;
955  B2_re += gT20_re * b0_re;
956  B2_re -= gT20_im * b0_im;
957  B2_re += gT21_re * b1_re;
958  B2_re -= gT21_im * b1_im;
959  B2_re += gT22_re * b2_re;
960  B2_re -= gT22_im * b2_im;
961  spinorFloat B2_im = 0;
962  B2_im += gT20_re * b0_im;
963  B2_im += gT20_im * b0_re;
964  B2_im += gT21_re * b1_im;
965  B2_im += gT21_im * b1_re;
966  B2_im += gT22_re * b2_im;
967  B2_im += gT22_im * b2_re;
968 
969  o2_00_re += A0_re;
970  o2_00_im += A0_im;
971  o2_10_re += B0_re;
972  o2_10_im += B0_im;
973  o2_20_re -= B0_im;
974  o2_20_im += B0_re;
975  o2_30_re -= A0_im;
976  o2_30_im += A0_re;
977 
978  o2_01_re += A1_re;
979  o2_01_im += A1_im;
980  o2_11_re += B1_re;
981  o2_11_im += B1_im;
982  o2_21_re -= B1_im;
983  o2_21_im += B1_re;
984  o2_31_re -= A1_im;
985  o2_31_im += A1_re;
986 
987  o2_02_re += A2_re;
988  o2_02_im += A2_im;
989  o2_12_re += B2_re;
990  o2_12_im += B2_im;
991  o2_22_re -= B2_im;
992  o2_22_im += B2_re;
993  o2_32_re -= A2_im;
994  o2_32_im += A2_re;
995 
996  }
997 }
998 
999 #ifdef MULTI_GPU
1000 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1001  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1002 #endif
1003 {
1004  // Projector P1+
1005  // 1 0 0 1
1006  // 0 1 -1 0
1007  // 0 -1 1 0
1008  // 1 0 0 1
1009 
1010 #ifdef MULTI_GPU
1011  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1012  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1013 #else
1014  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1015 #endif
1016 
1017  const int ga_idx = sid;
1018 
1025 
1026  // read gauge matrix from device memory
1027  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1028 
1029  // reconstruct gauge matrix
1031 
1032  {
1033 #ifdef MULTI_GPU
1034  if (kernel_type == INTERIOR_KERNEL) {
1035 #endif
1036 
1037  // read flavor 1 from device memory
1038  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1039 
1040  // project spinor into half spinors
1041  a0_re = +i00_re+i30_re;
1042  a0_im = +i00_im+i30_im;
1043  a1_re = +i01_re+i31_re;
1044  a1_im = +i01_im+i31_im;
1045  a2_re = +i02_re+i32_re;
1046  a2_im = +i02_im+i32_im;
1047  b0_re = +i10_re-i20_re;
1048  b0_im = +i10_im-i20_im;
1049  b1_re = +i11_re-i21_re;
1050  b1_im = +i11_im-i21_im;
1051  b2_re = +i12_re-i22_re;
1052  b2_im = +i12_im-i22_im;
1053 
1054 #ifdef MULTI_GPU
1055  } else {
1056 
1057  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1058 
1059  // read half spinor for the first flavor from device memory
1060  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1061 
1062  a0_re = i00_re; a0_im = i00_im;
1063  a1_re = i01_re; a1_im = i01_im;
1064  a2_re = i02_re; a2_im = i02_im;
1065  b0_re = i10_re; b0_im = i10_im;
1066  b1_re = i11_re; b1_im = i11_im;
1067  b2_re = i12_re; b2_im = i12_im;
1068 
1069  }
1070 #endif // MULTI_GPU
1071 
1072  // multiply row 0
1073  spinorFloat A0_re = 0;
1074  A0_re += g00_re * a0_re;
1075  A0_re -= g00_im * a0_im;
1076  A0_re += g01_re * a1_re;
1077  A0_re -= g01_im * a1_im;
1078  A0_re += g02_re * a2_re;
1079  A0_re -= g02_im * a2_im;
1080  spinorFloat A0_im = 0;
1081  A0_im += g00_re * a0_im;
1082  A0_im += g00_im * a0_re;
1083  A0_im += g01_re * a1_im;
1084  A0_im += g01_im * a1_re;
1085  A0_im += g02_re * a2_im;
1086  A0_im += g02_im * a2_re;
1087  spinorFloat B0_re = 0;
1088  B0_re += g00_re * b0_re;
1089  B0_re -= g00_im * b0_im;
1090  B0_re += g01_re * b1_re;
1091  B0_re -= g01_im * b1_im;
1092  B0_re += g02_re * b2_re;
1093  B0_re -= g02_im * b2_im;
1094  spinorFloat B0_im = 0;
1095  B0_im += g00_re * b0_im;
1096  B0_im += g00_im * b0_re;
1097  B0_im += g01_re * b1_im;
1098  B0_im += g01_im * b1_re;
1099  B0_im += g02_re * b2_im;
1100  B0_im += g02_im * b2_re;
1101 
1102  // multiply row 1
1103  spinorFloat A1_re = 0;
1104  A1_re += g10_re * a0_re;
1105  A1_re -= g10_im * a0_im;
1106  A1_re += g11_re * a1_re;
1107  A1_re -= g11_im * a1_im;
1108  A1_re += g12_re * a2_re;
1109  A1_re -= g12_im * a2_im;
1110  spinorFloat A1_im = 0;
1111  A1_im += g10_re * a0_im;
1112  A1_im += g10_im * a0_re;
1113  A1_im += g11_re * a1_im;
1114  A1_im += g11_im * a1_re;
1115  A1_im += g12_re * a2_im;
1116  A1_im += g12_im * a2_re;
1117  spinorFloat B1_re = 0;
1118  B1_re += g10_re * b0_re;
1119  B1_re -= g10_im * b0_im;
1120  B1_re += g11_re * b1_re;
1121  B1_re -= g11_im * b1_im;
1122  B1_re += g12_re * b2_re;
1123  B1_re -= g12_im * b2_im;
1124  spinorFloat B1_im = 0;
1125  B1_im += g10_re * b0_im;
1126  B1_im += g10_im * b0_re;
1127  B1_im += g11_re * b1_im;
1128  B1_im += g11_im * b1_re;
1129  B1_im += g12_re * b2_im;
1130  B1_im += g12_im * b2_re;
1131 
1132  // multiply row 2
1133  spinorFloat A2_re = 0;
1134  A2_re += g20_re * a0_re;
1135  A2_re -= g20_im * a0_im;
1136  A2_re += g21_re * a1_re;
1137  A2_re -= g21_im * a1_im;
1138  A2_re += g22_re * a2_re;
1139  A2_re -= g22_im * a2_im;
1140  spinorFloat A2_im = 0;
1141  A2_im += g20_re * a0_im;
1142  A2_im += g20_im * a0_re;
1143  A2_im += g21_re * a1_im;
1144  A2_im += g21_im * a1_re;
1145  A2_im += g22_re * a2_im;
1146  A2_im += g22_im * a2_re;
1147  spinorFloat B2_re = 0;
1148  B2_re += g20_re * b0_re;
1149  B2_re -= g20_im * b0_im;
1150  B2_re += g21_re * b1_re;
1151  B2_re -= g21_im * b1_im;
1152  B2_re += g22_re * b2_re;
1153  B2_re -= g22_im * b2_im;
1154  spinorFloat B2_im = 0;
1155  B2_im += g20_re * b0_im;
1156  B2_im += g20_im * b0_re;
1157  B2_im += g21_re * b1_im;
1158  B2_im += g21_im * b1_re;
1159  B2_im += g22_re * b2_im;
1160  B2_im += g22_im * b2_re;
1161 
1162  o1_00_re += A0_re;
1163  o1_00_im += A0_im;
1164  o1_10_re += B0_re;
1165  o1_10_im += B0_im;
1166  o1_20_re -= B0_re;
1167  o1_20_im -= B0_im;
1168  o1_30_re += A0_re;
1169  o1_30_im += A0_im;
1170 
1171  o1_01_re += A1_re;
1172  o1_01_im += A1_im;
1173  o1_11_re += B1_re;
1174  o1_11_im += B1_im;
1175  o1_21_re -= B1_re;
1176  o1_21_im -= B1_im;
1177  o1_31_re += A1_re;
1178  o1_31_im += A1_im;
1179 
1180  o1_02_re += A2_re;
1181  o1_02_im += A2_im;
1182  o1_12_re += B2_re;
1183  o1_12_im += B2_im;
1184  o1_22_re -= B2_re;
1185  o1_22_im -= B2_im;
1186  o1_32_re += A2_re;
1187  o1_32_im += A2_im;
1188 
1189  }
1190  {
1191 #ifdef MULTI_GPU
1192  if (kernel_type == INTERIOR_KERNEL) {
1193 #endif
1194 
1195  // read flavor 2 from device memory
1197 
1198  // project spinor into half spinors
1199  a0_re = +i00_re+i30_re;
1200  a0_im = +i00_im+i30_im;
1201  a1_re = +i01_re+i31_re;
1202  a1_im = +i01_im+i31_im;
1203  a2_re = +i02_re+i32_re;
1204  a2_im = +i02_im+i32_im;
1205  b0_re = +i10_re-i20_re;
1206  b0_im = +i10_im-i20_im;
1207  b1_re = +i11_re-i21_re;
1208  b1_im = +i11_im-i21_im;
1209  b2_re = +i12_re-i22_re;
1210  b2_im = +i12_im-i22_im;
1211 
1212 #ifdef MULTI_GPU
1213  } else {
1214 
1215  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1216 
1217  // read half spinor for the second flavor from device memory
1218  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1219  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+FLAVORS*ghostFace[static_cast<int>(kernel_type)]);
1220 
1221  a0_re = i00_re; a0_im = i00_im;
1222  a1_re = i01_re; a1_im = i01_im;
1223  a2_re = i02_re; a2_im = i02_im;
1224  b0_re = i10_re; b0_im = i10_im;
1225  b1_re = i11_re; b1_im = i11_im;
1226  b2_re = i12_re; b2_im = i12_im;
1227 
1228  }
1229 #endif // MULTI_GPU
1230 
1231  // multiply row 0
1232  spinorFloat A0_re = 0;
1233  A0_re += g00_re * a0_re;
1234  A0_re -= g00_im * a0_im;
1235  A0_re += g01_re * a1_re;
1236  A0_re -= g01_im * a1_im;
1237  A0_re += g02_re * a2_re;
1238  A0_re -= g02_im * a2_im;
1239  spinorFloat A0_im = 0;
1240  A0_im += g00_re * a0_im;
1241  A0_im += g00_im * a0_re;
1242  A0_im += g01_re * a1_im;
1243  A0_im += g01_im * a1_re;
1244  A0_im += g02_re * a2_im;
1245  A0_im += g02_im * a2_re;
1246  spinorFloat B0_re = 0;
1247  B0_re += g00_re * b0_re;
1248  B0_re -= g00_im * b0_im;
1249  B0_re += g01_re * b1_re;
1250  B0_re -= g01_im * b1_im;
1251  B0_re += g02_re * b2_re;
1252  B0_re -= g02_im * b2_im;
1253  spinorFloat B0_im = 0;
1254  B0_im += g00_re * b0_im;
1255  B0_im += g00_im * b0_re;
1256  B0_im += g01_re * b1_im;
1257  B0_im += g01_im * b1_re;
1258  B0_im += g02_re * b2_im;
1259  B0_im += g02_im * b2_re;
1260 
1261  // multiply row 1
1262  spinorFloat A1_re = 0;
1263  A1_re += g10_re * a0_re;
1264  A1_re -= g10_im * a0_im;
1265  A1_re += g11_re * a1_re;
1266  A1_re -= g11_im * a1_im;
1267  A1_re += g12_re * a2_re;
1268  A1_re -= g12_im * a2_im;
1269  spinorFloat A1_im = 0;
1270  A1_im += g10_re * a0_im;
1271  A1_im += g10_im * a0_re;
1272  A1_im += g11_re * a1_im;
1273  A1_im += g11_im * a1_re;
1274  A1_im += g12_re * a2_im;
1275  A1_im += g12_im * a2_re;
1276  spinorFloat B1_re = 0;
1277  B1_re += g10_re * b0_re;
1278  B1_re -= g10_im * b0_im;
1279  B1_re += g11_re * b1_re;
1280  B1_re -= g11_im * b1_im;
1281  B1_re += g12_re * b2_re;
1282  B1_re -= g12_im * b2_im;
1283  spinorFloat B1_im = 0;
1284  B1_im += g10_re * b0_im;
1285  B1_im += g10_im * b0_re;
1286  B1_im += g11_re * b1_im;
1287  B1_im += g11_im * b1_re;
1288  B1_im += g12_re * b2_im;
1289  B1_im += g12_im * b2_re;
1290 
1291  // multiply row 2
1292  spinorFloat A2_re = 0;
1293  A2_re += g20_re * a0_re;
1294  A2_re -= g20_im * a0_im;
1295  A2_re += g21_re * a1_re;
1296  A2_re -= g21_im * a1_im;
1297  A2_re += g22_re * a2_re;
1298  A2_re -= g22_im * a2_im;
1299  spinorFloat A2_im = 0;
1300  A2_im += g20_re * a0_im;
1301  A2_im += g20_im * a0_re;
1302  A2_im += g21_re * a1_im;
1303  A2_im += g21_im * a1_re;
1304  A2_im += g22_re * a2_im;
1305  A2_im += g22_im * a2_re;
1306  spinorFloat B2_re = 0;
1307  B2_re += g20_re * b0_re;
1308  B2_re -= g20_im * b0_im;
1309  B2_re += g21_re * b1_re;
1310  B2_re -= g21_im * b1_im;
1311  B2_re += g22_re * b2_re;
1312  B2_re -= g22_im * b2_im;
1313  spinorFloat B2_im = 0;
1314  B2_im += g20_re * b0_im;
1315  B2_im += g20_im * b0_re;
1316  B2_im += g21_re * b1_im;
1317  B2_im += g21_im * b1_re;
1318  B2_im += g22_re * b2_im;
1319  B2_im += g22_im * b2_re;
1320 
1321  o2_00_re += A0_re;
1322  o2_00_im += A0_im;
1323  o2_10_re += B0_re;
1324  o2_10_im += B0_im;
1325  o2_20_re -= B0_re;
1326  o2_20_im -= B0_im;
1327  o2_30_re += A0_re;
1328  o2_30_im += A0_im;
1329 
1330  o2_01_re += A1_re;
1331  o2_01_im += A1_im;
1332  o2_11_re += B1_re;
1333  o2_11_im += B1_im;
1334  o2_21_re -= B1_re;
1335  o2_21_im -= B1_im;
1336  o2_31_re += A1_re;
1337  o2_31_im += A1_im;
1338 
1339  o2_02_re += A2_re;
1340  o2_02_im += A2_im;
1341  o2_12_re += B2_re;
1342  o2_12_im += B2_im;
1343  o2_22_re -= B2_re;
1344  o2_22_im -= B2_im;
1345  o2_32_re += A2_re;
1346  o2_32_im += A2_im;
1347 
1348  }
1349 }
1350 
1351 #ifdef MULTI_GPU
1352 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1353  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1354 #endif
1355 {
1356  // Projector P1-
1357  // 1 0 0 -1
1358  // 0 1 1 0
1359  // 0 1 1 0
1360  // -1 0 0 1
1361 
1362 #ifdef MULTI_GPU
1363  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1364  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1365 #else
1366  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1367 #endif
1368 
1369 #ifdef MULTI_GPU
1370  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1371 #else
1372  const int ga_idx = sp_idx;
1373 #endif
1374 
1381 
1382  // read gauge matrix from device memory
1383  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1384 
1385  // reconstruct gauge matrix
1387 
1388  {
1389 #ifdef MULTI_GPU
1390  if (kernel_type == INTERIOR_KERNEL) {
1391 #endif
1392 
1393  // read flavor 1 from device memory
1394  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1395 
1396  // project spinor into half spinors
1397  a0_re = +i00_re-i30_re;
1398  a0_im = +i00_im-i30_im;
1399  a1_re = +i01_re-i31_re;
1400  a1_im = +i01_im-i31_im;
1401  a2_re = +i02_re-i32_re;
1402  a2_im = +i02_im-i32_im;
1403  b0_re = +i10_re+i20_re;
1404  b0_im = +i10_im+i20_im;
1405  b1_re = +i11_re+i21_re;
1406  b1_im = +i11_im+i21_im;
1407  b2_re = +i12_re+i22_re;
1408  b2_im = +i12_im+i22_im;
1409 
1410 #ifdef MULTI_GPU
1411  } else {
1412 
1413  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1414 
1415  // read half spinor for the first flavor from device memory
1416  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1417 
1418  a0_re = i00_re; a0_im = i00_im;
1419  a1_re = i01_re; a1_im = i01_im;
1420  a2_re = i02_re; a2_im = i02_im;
1421  b0_re = i10_re; b0_im = i10_im;
1422  b1_re = i11_re; b1_im = i11_im;
1423  b2_re = i12_re; b2_im = i12_im;
1424 
1425  }
1426 #endif // MULTI_GPU
1427 
1428  // multiply row 0
1429  spinorFloat A0_re = 0;
1430  A0_re += gT00_re * a0_re;
1431  A0_re -= gT00_im * a0_im;
1432  A0_re += gT01_re * a1_re;
1433  A0_re -= gT01_im * a1_im;
1434  A0_re += gT02_re * a2_re;
1435  A0_re -= gT02_im * a2_im;
1436  spinorFloat A0_im = 0;
1437  A0_im += gT00_re * a0_im;
1438  A0_im += gT00_im * a0_re;
1439  A0_im += gT01_re * a1_im;
1440  A0_im += gT01_im * a1_re;
1441  A0_im += gT02_re * a2_im;
1442  A0_im += gT02_im * a2_re;
1443  spinorFloat B0_re = 0;
1444  B0_re += gT00_re * b0_re;
1445  B0_re -= gT00_im * b0_im;
1446  B0_re += gT01_re * b1_re;
1447  B0_re -= gT01_im * b1_im;
1448  B0_re += gT02_re * b2_re;
1449  B0_re -= gT02_im * b2_im;
1450  spinorFloat B0_im = 0;
1451  B0_im += gT00_re * b0_im;
1452  B0_im += gT00_im * b0_re;
1453  B0_im += gT01_re * b1_im;
1454  B0_im += gT01_im * b1_re;
1455  B0_im += gT02_re * b2_im;
1456  B0_im += gT02_im * b2_re;
1457 
1458  // multiply row 1
1459  spinorFloat A1_re = 0;
1460  A1_re += gT10_re * a0_re;
1461  A1_re -= gT10_im * a0_im;
1462  A1_re += gT11_re * a1_re;
1463  A1_re -= gT11_im * a1_im;
1464  A1_re += gT12_re * a2_re;
1465  A1_re -= gT12_im * a2_im;
1466  spinorFloat A1_im = 0;
1467  A1_im += gT10_re * a0_im;
1468  A1_im += gT10_im * a0_re;
1469  A1_im += gT11_re * a1_im;
1470  A1_im += gT11_im * a1_re;
1471  A1_im += gT12_re * a2_im;
1472  A1_im += gT12_im * a2_re;
1473  spinorFloat B1_re = 0;
1474  B1_re += gT10_re * b0_re;
1475  B1_re -= gT10_im * b0_im;
1476  B1_re += gT11_re * b1_re;
1477  B1_re -= gT11_im * b1_im;
1478  B1_re += gT12_re * b2_re;
1479  B1_re -= gT12_im * b2_im;
1480  spinorFloat B1_im = 0;
1481  B1_im += gT10_re * b0_im;
1482  B1_im += gT10_im * b0_re;
1483  B1_im += gT11_re * b1_im;
1484  B1_im += gT11_im * b1_re;
1485  B1_im += gT12_re * b2_im;
1486  B1_im += gT12_im * b2_re;
1487 
1488  // multiply row 2
1489  spinorFloat A2_re = 0;
1490  A2_re += gT20_re * a0_re;
1491  A2_re -= gT20_im * a0_im;
1492  A2_re += gT21_re * a1_re;
1493  A2_re -= gT21_im * a1_im;
1494  A2_re += gT22_re * a2_re;
1495  A2_re -= gT22_im * a2_im;
1496  spinorFloat A2_im = 0;
1497  A2_im += gT20_re * a0_im;
1498  A2_im += gT20_im * a0_re;
1499  A2_im += gT21_re * a1_im;
1500  A2_im += gT21_im * a1_re;
1501  A2_im += gT22_re * a2_im;
1502  A2_im += gT22_im * a2_re;
1503  spinorFloat B2_re = 0;
1504  B2_re += gT20_re * b0_re;
1505  B2_re -= gT20_im * b0_im;
1506  B2_re += gT21_re * b1_re;
1507  B2_re -= gT21_im * b1_im;
1508  B2_re += gT22_re * b2_re;
1509  B2_re -= gT22_im * b2_im;
1510  spinorFloat B2_im = 0;
1511  B2_im += gT20_re * b0_im;
1512  B2_im += gT20_im * b0_re;
1513  B2_im += gT21_re * b1_im;
1514  B2_im += gT21_im * b1_re;
1515  B2_im += gT22_re * b2_im;
1516  B2_im += gT22_im * b2_re;
1517 
1518  o1_00_re += A0_re;
1519  o1_00_im += A0_im;
1520  o1_10_re += B0_re;
1521  o1_10_im += B0_im;
1522  o1_20_re += B0_re;
1523  o1_20_im += B0_im;
1524  o1_30_re -= A0_re;
1525  o1_30_im -= A0_im;
1526 
1527  o1_01_re += A1_re;
1528  o1_01_im += A1_im;
1529  o1_11_re += B1_re;
1530  o1_11_im += B1_im;
1531  o1_21_re += B1_re;
1532  o1_21_im += B1_im;
1533  o1_31_re -= A1_re;
1534  o1_31_im -= A1_im;
1535 
1536  o1_02_re += A2_re;
1537  o1_02_im += A2_im;
1538  o1_12_re += B2_re;
1539  o1_12_im += B2_im;
1540  o1_22_re += B2_re;
1541  o1_22_im += B2_im;
1542  o1_32_re -= A2_re;
1543  o1_32_im -= A2_im;
1544 
1545  }
1546  {
1547 #ifdef MULTI_GPU
1548  if (kernel_type == INTERIOR_KERNEL) {
1549 #endif
1550 
1551  // read flavor 2 from device memory
1553 
1554  // project spinor into half spinors
1555  a0_re = +i00_re-i30_re;
1556  a0_im = +i00_im-i30_im;
1557  a1_re = +i01_re-i31_re;
1558  a1_im = +i01_im-i31_im;
1559  a2_re = +i02_re-i32_re;
1560  a2_im = +i02_im-i32_im;
1561  b0_re = +i10_re+i20_re;
1562  b0_im = +i10_im+i20_im;
1563  b1_re = +i11_re+i21_re;
1564  b1_im = +i11_im+i21_im;
1565  b2_re = +i12_re+i22_re;
1566  b2_im = +i12_im+i22_im;
1567 
1568 #ifdef MULTI_GPU
1569  } else {
1570 
1571  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1572 
1573  // read half spinor for the second flavor from device memory
1574  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1575  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1576 
1577  a0_re = i00_re; a0_im = i00_im;
1578  a1_re = i01_re; a1_im = i01_im;
1579  a2_re = i02_re; a2_im = i02_im;
1580  b0_re = i10_re; b0_im = i10_im;
1581  b1_re = i11_re; b1_im = i11_im;
1582  b2_re = i12_re; b2_im = i12_im;
1583 
1584  }
1585 #endif // MULTI_GPU
1586 
1587  // multiply row 0
1588  spinorFloat A0_re = 0;
1589  A0_re += gT00_re * a0_re;
1590  A0_re -= gT00_im * a0_im;
1591  A0_re += gT01_re * a1_re;
1592  A0_re -= gT01_im * a1_im;
1593  A0_re += gT02_re * a2_re;
1594  A0_re -= gT02_im * a2_im;
1595  spinorFloat A0_im = 0;
1596  A0_im += gT00_re * a0_im;
1597  A0_im += gT00_im * a0_re;
1598  A0_im += gT01_re * a1_im;
1599  A0_im += gT01_im * a1_re;
1600  A0_im += gT02_re * a2_im;
1601  A0_im += gT02_im * a2_re;
1602  spinorFloat B0_re = 0;
1603  B0_re += gT00_re * b0_re;
1604  B0_re -= gT00_im * b0_im;
1605  B0_re += gT01_re * b1_re;
1606  B0_re -= gT01_im * b1_im;
1607  B0_re += gT02_re * b2_re;
1608  B0_re -= gT02_im * b2_im;
1609  spinorFloat B0_im = 0;
1610  B0_im += gT00_re * b0_im;
1611  B0_im += gT00_im * b0_re;
1612  B0_im += gT01_re * b1_im;
1613  B0_im += gT01_im * b1_re;
1614  B0_im += gT02_re * b2_im;
1615  B0_im += gT02_im * b2_re;
1616 
1617  // multiply row 1
1618  spinorFloat A1_re = 0;
1619  A1_re += gT10_re * a0_re;
1620  A1_re -= gT10_im * a0_im;
1621  A1_re += gT11_re * a1_re;
1622  A1_re -= gT11_im * a1_im;
1623  A1_re += gT12_re * a2_re;
1624  A1_re -= gT12_im * a2_im;
1625  spinorFloat A1_im = 0;
1626  A1_im += gT10_re * a0_im;
1627  A1_im += gT10_im * a0_re;
1628  A1_im += gT11_re * a1_im;
1629  A1_im += gT11_im * a1_re;
1630  A1_im += gT12_re * a2_im;
1631  A1_im += gT12_im * a2_re;
1632  spinorFloat B1_re = 0;
1633  B1_re += gT10_re * b0_re;
1634  B1_re -= gT10_im * b0_im;
1635  B1_re += gT11_re * b1_re;
1636  B1_re -= gT11_im * b1_im;
1637  B1_re += gT12_re * b2_re;
1638  B1_re -= gT12_im * b2_im;
1639  spinorFloat B1_im = 0;
1640  B1_im += gT10_re * b0_im;
1641  B1_im += gT10_im * b0_re;
1642  B1_im += gT11_re * b1_im;
1643  B1_im += gT11_im * b1_re;
1644  B1_im += gT12_re * b2_im;
1645  B1_im += gT12_im * b2_re;
1646 
1647  // multiply row 2
1648  spinorFloat A2_re = 0;
1649  A2_re += gT20_re * a0_re;
1650  A2_re -= gT20_im * a0_im;
1651  A2_re += gT21_re * a1_re;
1652  A2_re -= gT21_im * a1_im;
1653  A2_re += gT22_re * a2_re;
1654  A2_re -= gT22_im * a2_im;
1655  spinorFloat A2_im = 0;
1656  A2_im += gT20_re * a0_im;
1657  A2_im += gT20_im * a0_re;
1658  A2_im += gT21_re * a1_im;
1659  A2_im += gT21_im * a1_re;
1660  A2_im += gT22_re * a2_im;
1661  A2_im += gT22_im * a2_re;
1662  spinorFloat B2_re = 0;
1663  B2_re += gT20_re * b0_re;
1664  B2_re -= gT20_im * b0_im;
1665  B2_re += gT21_re * b1_re;
1666  B2_re -= gT21_im * b1_im;
1667  B2_re += gT22_re * b2_re;
1668  B2_re -= gT22_im * b2_im;
1669  spinorFloat B2_im = 0;
1670  B2_im += gT20_re * b0_im;
1671  B2_im += gT20_im * b0_re;
1672  B2_im += gT21_re * b1_im;
1673  B2_im += gT21_im * b1_re;
1674  B2_im += gT22_re * b2_im;
1675  B2_im += gT22_im * b2_re;
1676 
1677  o2_00_re += A0_re;
1678  o2_00_im += A0_im;
1679  o2_10_re += B0_re;
1680  o2_10_im += B0_im;
1681  o2_20_re += B0_re;
1682  o2_20_im += B0_im;
1683  o2_30_re -= A0_re;
1684  o2_30_im -= A0_im;
1685 
1686  o2_01_re += A1_re;
1687  o2_01_im += A1_im;
1688  o2_11_re += B1_re;
1689  o2_11_im += B1_im;
1690  o2_21_re += B1_re;
1691  o2_21_im += B1_im;
1692  o2_31_re -= A1_re;
1693  o2_31_im -= A1_im;
1694 
1695  o2_02_re += A2_re;
1696  o2_02_im += A2_im;
1697  o2_12_re += B2_re;
1698  o2_12_im += B2_im;
1699  o2_22_re += B2_re;
1700  o2_22_im += B2_im;
1701  o2_32_re -= A2_re;
1702  o2_32_im -= A2_im;
1703 
1704  }
1705 }
1706 
1707 #ifdef MULTI_GPU
1708 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1709  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1710 #endif
1711 {
1712  // Projector P2+
1713  // 1 0 i 0
1714  // 0 1 0 -i
1715  // -i 0 1 0
1716  // 0 i 0 1
1717 
1718 #ifdef MULTI_GPU
1719  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1720  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1721 #else
1722  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1723 #endif
1724 
1725  const int ga_idx = sid;
1726 
1733 
1734  // read gauge matrix from device memory
1735  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1736 
1737  // reconstruct gauge matrix
1739 
1740  {
1741 #ifdef MULTI_GPU
1742  if (kernel_type == INTERIOR_KERNEL) {
1743 #endif
1744 
1745  // read flavor 1 from device memory
1746  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1747 
1748  // project spinor into half spinors
1749  a0_re = +i00_re-i20_im;
1750  a0_im = +i00_im+i20_re;
1751  a1_re = +i01_re-i21_im;
1752  a1_im = +i01_im+i21_re;
1753  a2_re = +i02_re-i22_im;
1754  a2_im = +i02_im+i22_re;
1755  b0_re = +i10_re+i30_im;
1756  b0_im = +i10_im-i30_re;
1757  b1_re = +i11_re+i31_im;
1758  b1_im = +i11_im-i31_re;
1759  b2_re = +i12_re+i32_im;
1760  b2_im = +i12_im-i32_re;
1761 
1762 #ifdef MULTI_GPU
1763  } else {
1764 
1765  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1766 
1767  // read half spinor for the first flavor from device memory
1768  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1769 
1770  a0_re = i00_re; a0_im = i00_im;
1771  a1_re = i01_re; a1_im = i01_im;
1772  a2_re = i02_re; a2_im = i02_im;
1773  b0_re = i10_re; b0_im = i10_im;
1774  b1_re = i11_re; b1_im = i11_im;
1775  b2_re = i12_re; b2_im = i12_im;
1776 
1777  }
1778 #endif // MULTI_GPU
1779 
1780  // multiply row 0
1781  spinorFloat A0_re = 0;
1782  A0_re += g00_re * a0_re;
1783  A0_re -= g00_im * a0_im;
1784  A0_re += g01_re * a1_re;
1785  A0_re -= g01_im * a1_im;
1786  A0_re += g02_re * a2_re;
1787  A0_re -= g02_im * a2_im;
1788  spinorFloat A0_im = 0;
1789  A0_im += g00_re * a0_im;
1790  A0_im += g00_im * a0_re;
1791  A0_im += g01_re * a1_im;
1792  A0_im += g01_im * a1_re;
1793  A0_im += g02_re * a2_im;
1794  A0_im += g02_im * a2_re;
1795  spinorFloat B0_re = 0;
1796  B0_re += g00_re * b0_re;
1797  B0_re -= g00_im * b0_im;
1798  B0_re += g01_re * b1_re;
1799  B0_re -= g01_im * b1_im;
1800  B0_re += g02_re * b2_re;
1801  B0_re -= g02_im * b2_im;
1802  spinorFloat B0_im = 0;
1803  B0_im += g00_re * b0_im;
1804  B0_im += g00_im * b0_re;
1805  B0_im += g01_re * b1_im;
1806  B0_im += g01_im * b1_re;
1807  B0_im += g02_re * b2_im;
1808  B0_im += g02_im * b2_re;
1809 
1810  // multiply row 1
1811  spinorFloat A1_re = 0;
1812  A1_re += g10_re * a0_re;
1813  A1_re -= g10_im * a0_im;
1814  A1_re += g11_re * a1_re;
1815  A1_re -= g11_im * a1_im;
1816  A1_re += g12_re * a2_re;
1817  A1_re -= g12_im * a2_im;
1818  spinorFloat A1_im = 0;
1819  A1_im += g10_re * a0_im;
1820  A1_im += g10_im * a0_re;
1821  A1_im += g11_re * a1_im;
1822  A1_im += g11_im * a1_re;
1823  A1_im += g12_re * a2_im;
1824  A1_im += g12_im * a2_re;
1825  spinorFloat B1_re = 0;
1826  B1_re += g10_re * b0_re;
1827  B1_re -= g10_im * b0_im;
1828  B1_re += g11_re * b1_re;
1829  B1_re -= g11_im * b1_im;
1830  B1_re += g12_re * b2_re;
1831  B1_re -= g12_im * b2_im;
1832  spinorFloat B1_im = 0;
1833  B1_im += g10_re * b0_im;
1834  B1_im += g10_im * b0_re;
1835  B1_im += g11_re * b1_im;
1836  B1_im += g11_im * b1_re;
1837  B1_im += g12_re * b2_im;
1838  B1_im += g12_im * b2_re;
1839 
1840  // multiply row 2
1841  spinorFloat A2_re = 0;
1842  A2_re += g20_re * a0_re;
1843  A2_re -= g20_im * a0_im;
1844  A2_re += g21_re * a1_re;
1845  A2_re -= g21_im * a1_im;
1846  A2_re += g22_re * a2_re;
1847  A2_re -= g22_im * a2_im;
1848  spinorFloat A2_im = 0;
1849  A2_im += g20_re * a0_im;
1850  A2_im += g20_im * a0_re;
1851  A2_im += g21_re * a1_im;
1852  A2_im += g21_im * a1_re;
1853  A2_im += g22_re * a2_im;
1854  A2_im += g22_im * a2_re;
1855  spinorFloat B2_re = 0;
1856  B2_re += g20_re * b0_re;
1857  B2_re -= g20_im * b0_im;
1858  B2_re += g21_re * b1_re;
1859  B2_re -= g21_im * b1_im;
1860  B2_re += g22_re * b2_re;
1861  B2_re -= g22_im * b2_im;
1862  spinorFloat B2_im = 0;
1863  B2_im += g20_re * b0_im;
1864  B2_im += g20_im * b0_re;
1865  B2_im += g21_re * b1_im;
1866  B2_im += g21_im * b1_re;
1867  B2_im += g22_re * b2_im;
1868  B2_im += g22_im * b2_re;
1869 
1870  o1_00_re += A0_re;
1871  o1_00_im += A0_im;
1872  o1_10_re += B0_re;
1873  o1_10_im += B0_im;
1874  o1_20_re += A0_im;
1875  o1_20_im -= A0_re;
1876  o1_30_re -= B0_im;
1877  o1_30_im += B0_re;
1878 
1879  o1_01_re += A1_re;
1880  o1_01_im += A1_im;
1881  o1_11_re += B1_re;
1882  o1_11_im += B1_im;
1883  o1_21_re += A1_im;
1884  o1_21_im -= A1_re;
1885  o1_31_re -= B1_im;
1886  o1_31_im += B1_re;
1887 
1888  o1_02_re += A2_re;
1889  o1_02_im += A2_im;
1890  o1_12_re += B2_re;
1891  o1_12_im += B2_im;
1892  o1_22_re += A2_im;
1893  o1_22_im -= A2_re;
1894  o1_32_re -= B2_im;
1895  o1_32_im += B2_re;
1896 
1897  }
1898  {
1899 #ifdef MULTI_GPU
1900  if (kernel_type == INTERIOR_KERNEL) {
1901 #endif
1902 
1903  // read flavor 2 from device memory
1905 
1906  // project spinor into half spinors
1907  a0_re = +i00_re-i20_im;
1908  a0_im = +i00_im+i20_re;
1909  a1_re = +i01_re-i21_im;
1910  a1_im = +i01_im+i21_re;
1911  a2_re = +i02_re-i22_im;
1912  a2_im = +i02_im+i22_re;
1913  b0_re = +i10_re+i30_im;
1914  b0_im = +i10_im-i30_re;
1915  b1_re = +i11_re+i31_im;
1916  b1_im = +i11_im-i31_re;
1917  b2_re = +i12_re+i32_im;
1918  b2_im = +i12_im-i32_re;
1919 
1920 #ifdef MULTI_GPU
1921  } else {
1922 
1923  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1924 
1925  // read half spinor for the second flavor from device memory
1926  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1927  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+FLAVORS*ghostFace[static_cast<int>(kernel_type)]);
1928 
1929  a0_re = i00_re; a0_im = i00_im;
1930  a1_re = i01_re; a1_im = i01_im;
1931  a2_re = i02_re; a2_im = i02_im;
1932  b0_re = i10_re; b0_im = i10_im;
1933  b1_re = i11_re; b1_im = i11_im;
1934  b2_re = i12_re; b2_im = i12_im;
1935 
1936  }
1937 #endif // MULTI_GPU
1938 
1939  // multiply row 0
1940  spinorFloat A0_re = 0;
1941  A0_re += g00_re * a0_re;
1942  A0_re -= g00_im * a0_im;
1943  A0_re += g01_re * a1_re;
1944  A0_re -= g01_im * a1_im;
1945  A0_re += g02_re * a2_re;
1946  A0_re -= g02_im * a2_im;
1947  spinorFloat A0_im = 0;
1948  A0_im += g00_re * a0_im;
1949  A0_im += g00_im * a0_re;
1950  A0_im += g01_re * a1_im;
1951  A0_im += g01_im * a1_re;
1952  A0_im += g02_re * a2_im;
1953  A0_im += g02_im * a2_re;
1954  spinorFloat B0_re = 0;
1955  B0_re += g00_re * b0_re;
1956  B0_re -= g00_im * b0_im;
1957  B0_re += g01_re * b1_re;
1958  B0_re -= g01_im * b1_im;
1959  B0_re += g02_re * b2_re;
1960  B0_re -= g02_im * b2_im;
1961  spinorFloat B0_im = 0;
1962  B0_im += g00_re * b0_im;
1963  B0_im += g00_im * b0_re;
1964  B0_im += g01_re * b1_im;
1965  B0_im += g01_im * b1_re;
1966  B0_im += g02_re * b2_im;
1967  B0_im += g02_im * b2_re;
1968 
1969  // multiply row 1
1970  spinorFloat A1_re = 0;
1971  A1_re += g10_re * a0_re;
1972  A1_re -= g10_im * a0_im;
1973  A1_re += g11_re * a1_re;
1974  A1_re -= g11_im * a1_im;
1975  A1_re += g12_re * a2_re;
1976  A1_re -= g12_im * a2_im;
1977  spinorFloat A1_im = 0;
1978  A1_im += g10_re * a0_im;
1979  A1_im += g10_im * a0_re;
1980  A1_im += g11_re * a1_im;
1981  A1_im += g11_im * a1_re;
1982  A1_im += g12_re * a2_im;
1983  A1_im += g12_im * a2_re;
1984  spinorFloat B1_re = 0;
1985  B1_re += g10_re * b0_re;
1986  B1_re -= g10_im * b0_im;
1987  B1_re += g11_re * b1_re;
1988  B1_re -= g11_im * b1_im;
1989  B1_re += g12_re * b2_re;
1990  B1_re -= g12_im * b2_im;
1991  spinorFloat B1_im = 0;
1992  B1_im += g10_re * b0_im;
1993  B1_im += g10_im * b0_re;
1994  B1_im += g11_re * b1_im;
1995  B1_im += g11_im * b1_re;
1996  B1_im += g12_re * b2_im;
1997  B1_im += g12_im * b2_re;
1998 
1999  // multiply row 2
2000  spinorFloat A2_re = 0;
2001  A2_re += g20_re * a0_re;
2002  A2_re -= g20_im * a0_im;
2003  A2_re += g21_re * a1_re;
2004  A2_re -= g21_im * a1_im;
2005  A2_re += g22_re * a2_re;
2006  A2_re -= g22_im * a2_im;
2007  spinorFloat A2_im = 0;
2008  A2_im += g20_re * a0_im;
2009  A2_im += g20_im * a0_re;
2010  A2_im += g21_re * a1_im;
2011  A2_im += g21_im * a1_re;
2012  A2_im += g22_re * a2_im;
2013  A2_im += g22_im * a2_re;
2014  spinorFloat B2_re = 0;
2015  B2_re += g20_re * b0_re;
2016  B2_re -= g20_im * b0_im;
2017  B2_re += g21_re * b1_re;
2018  B2_re -= g21_im * b1_im;
2019  B2_re += g22_re * b2_re;
2020  B2_re -= g22_im * b2_im;
2021  spinorFloat B2_im = 0;
2022  B2_im += g20_re * b0_im;
2023  B2_im += g20_im * b0_re;
2024  B2_im += g21_re * b1_im;
2025  B2_im += g21_im * b1_re;
2026  B2_im += g22_re * b2_im;
2027  B2_im += g22_im * b2_re;
2028 
2029  o2_00_re += A0_re;
2030  o2_00_im += A0_im;
2031  o2_10_re += B0_re;
2032  o2_10_im += B0_im;
2033  o2_20_re += A0_im;
2034  o2_20_im -= A0_re;
2035  o2_30_re -= B0_im;
2036  o2_30_im += B0_re;
2037 
2038  o2_01_re += A1_re;
2039  o2_01_im += A1_im;
2040  o2_11_re += B1_re;
2041  o2_11_im += B1_im;
2042  o2_21_re += A1_im;
2043  o2_21_im -= A1_re;
2044  o2_31_re -= B1_im;
2045  o2_31_im += B1_re;
2046 
2047  o2_02_re += A2_re;
2048  o2_02_im += A2_im;
2049  o2_12_re += B2_re;
2050  o2_12_im += B2_im;
2051  o2_22_re += A2_im;
2052  o2_22_im -= A2_re;
2053  o2_32_re -= B2_im;
2054  o2_32_im += B2_re;
2055 
2056  }
2057 }
2058 
2059 #ifdef MULTI_GPU
2060 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
2061  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
2062 #endif
2063 {
2064  // Projector P2-
2065  // 1 0 -i 0
2066  // 0 1 0 i
2067  // i 0 1 0
2068  // 0 -i 0 1
2069 
2070 #ifdef MULTI_GPU
2071  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
2072  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2073 #else
2074  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
2075 #endif
2076 
2077 #ifdef MULTI_GPU
2078  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2079 #else
2080  const int ga_idx = sp_idx;
2081 #endif
2082 
2089 
2090  // read gauge matrix from device memory
2091  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
2092 
2093  // reconstruct gauge matrix
2095 
2096  {
2097 #ifdef MULTI_GPU
2098  if (kernel_type == INTERIOR_KERNEL) {
2099 #endif
2100 
2101  // read flavor 1 from device memory
2102  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
2103 
2104  // project spinor into half spinors
2105  a0_re = +i00_re+i20_im;
2106  a0_im = +i00_im-i20_re;
2107  a1_re = +i01_re+i21_im;
2108  a1_im = +i01_im-i21_re;
2109  a2_re = +i02_re+i22_im;
2110  a2_im = +i02_im-i22_re;
2111  b0_re = +i10_re-i30_im;
2112  b0_im = +i10_im+i30_re;
2113  b1_re = +i11_re-i31_im;
2114  b1_im = +i11_im+i31_re;
2115  b2_re = +i12_re-i32_im;
2116  b2_im = +i12_im+i32_re;
2117 
2118 #ifdef MULTI_GPU
2119  } else {
2120 
2121  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2122 
2123  // read half spinor for the first flavor from device memory
2124  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2125 
2126  a0_re = i00_re; a0_im = i00_im;
2127  a1_re = i01_re; a1_im = i01_im;
2128  a2_re = i02_re; a2_im = i02_im;
2129  b0_re = i10_re; b0_im = i10_im;
2130  b1_re = i11_re; b1_im = i11_im;
2131  b2_re = i12_re; b2_im = i12_im;
2132 
2133  }
2134 #endif // MULTI_GPU
2135 
2136  // multiply row 0
2137  spinorFloat A0_re = 0;
2138  A0_re += gT00_re * a0_re;
2139  A0_re -= gT00_im * a0_im;
2140  A0_re += gT01_re * a1_re;
2141  A0_re -= gT01_im * a1_im;
2142  A0_re += gT02_re * a2_re;
2143  A0_re -= gT02_im * a2_im;
2144  spinorFloat A0_im = 0;
2145  A0_im += gT00_re * a0_im;
2146  A0_im += gT00_im * a0_re;
2147  A0_im += gT01_re * a1_im;
2148  A0_im += gT01_im * a1_re;
2149  A0_im += gT02_re * a2_im;
2150  A0_im += gT02_im * a2_re;
2151  spinorFloat B0_re = 0;
2152  B0_re += gT00_re * b0_re;
2153  B0_re -= gT00_im * b0_im;
2154  B0_re += gT01_re * b1_re;
2155  B0_re -= gT01_im * b1_im;
2156  B0_re += gT02_re * b2_re;
2157  B0_re -= gT02_im * b2_im;
2158  spinorFloat B0_im = 0;
2159  B0_im += gT00_re * b0_im;
2160  B0_im += gT00_im * b0_re;
2161  B0_im += gT01_re * b1_im;
2162  B0_im += gT01_im * b1_re;
2163  B0_im += gT02_re * b2_im;
2164  B0_im += gT02_im * b2_re;
2165 
2166  // multiply row 1
2167  spinorFloat A1_re = 0;
2168  A1_re += gT10_re * a0_re;
2169  A1_re -= gT10_im * a0_im;
2170  A1_re += gT11_re * a1_re;
2171  A1_re -= gT11_im * a1_im;
2172  A1_re += gT12_re * a2_re;
2173  A1_re -= gT12_im * a2_im;
2174  spinorFloat A1_im = 0;
2175  A1_im += gT10_re * a0_im;
2176  A1_im += gT10_im * a0_re;
2177  A1_im += gT11_re * a1_im;
2178  A1_im += gT11_im * a1_re;
2179  A1_im += gT12_re * a2_im;
2180  A1_im += gT12_im * a2_re;
2181  spinorFloat B1_re = 0;
2182  B1_re += gT10_re * b0_re;
2183  B1_re -= gT10_im * b0_im;
2184  B1_re += gT11_re * b1_re;
2185  B1_re -= gT11_im * b1_im;
2186  B1_re += gT12_re * b2_re;
2187  B1_re -= gT12_im * b2_im;
2188  spinorFloat B1_im = 0;
2189  B1_im += gT10_re * b0_im;
2190  B1_im += gT10_im * b0_re;
2191  B1_im += gT11_re * b1_im;
2192  B1_im += gT11_im * b1_re;
2193  B1_im += gT12_re * b2_im;
2194  B1_im += gT12_im * b2_re;
2195 
2196  // multiply row 2
2197  spinorFloat A2_re = 0;
2198  A2_re += gT20_re * a0_re;
2199  A2_re -= gT20_im * a0_im;
2200  A2_re += gT21_re * a1_re;
2201  A2_re -= gT21_im * a1_im;
2202  A2_re += gT22_re * a2_re;
2203  A2_re -= gT22_im * a2_im;
2204  spinorFloat A2_im = 0;
2205  A2_im += gT20_re * a0_im;
2206  A2_im += gT20_im * a0_re;
2207  A2_im += gT21_re * a1_im;
2208  A2_im += gT21_im * a1_re;
2209  A2_im += gT22_re * a2_im;
2210  A2_im += gT22_im * a2_re;
2211  spinorFloat B2_re = 0;
2212  B2_re += gT20_re * b0_re;
2213  B2_re -= gT20_im * b0_im;
2214  B2_re += gT21_re * b1_re;
2215  B2_re -= gT21_im * b1_im;
2216  B2_re += gT22_re * b2_re;
2217  B2_re -= gT22_im * b2_im;
2218  spinorFloat B2_im = 0;
2219  B2_im += gT20_re * b0_im;
2220  B2_im += gT20_im * b0_re;
2221  B2_im += gT21_re * b1_im;
2222  B2_im += gT21_im * b1_re;
2223  B2_im += gT22_re * b2_im;
2224  B2_im += gT22_im * b2_re;
2225 
2226  o1_00_re += A0_re;
2227  o1_00_im += A0_im;
2228  o1_10_re += B0_re;
2229  o1_10_im += B0_im;
2230  o1_20_re -= A0_im;
2231  o1_20_im += A0_re;
2232  o1_30_re += B0_im;
2233  o1_30_im -= B0_re;
2234 
2235  o1_01_re += A1_re;
2236  o1_01_im += A1_im;
2237  o1_11_re += B1_re;
2238  o1_11_im += B1_im;
2239  o1_21_re -= A1_im;
2240  o1_21_im += A1_re;
2241  o1_31_re += B1_im;
2242  o1_31_im -= B1_re;
2243 
2244  o1_02_re += A2_re;
2245  o1_02_im += A2_im;
2246  o1_12_re += B2_re;
2247  o1_12_im += B2_im;
2248  o1_22_re -= A2_im;
2249  o1_22_im += A2_re;
2250  o1_32_re += B2_im;
2251  o1_32_im -= B2_re;
2252 
2253  }
2254  {
2255 #ifdef MULTI_GPU
2256  if (kernel_type == INTERIOR_KERNEL) {
2257 #endif
2258 
2259  // read flavor 2 from device memory
2261 
2262  // project spinor into half spinors
2263  a0_re = +i00_re+i20_im;
2264  a0_im = +i00_im-i20_re;
2265  a1_re = +i01_re+i21_im;
2266  a1_im = +i01_im-i21_re;
2267  a2_re = +i02_re+i22_im;
2268  a2_im = +i02_im-i22_re;
2269  b0_re = +i10_re-i30_im;
2270  b0_im = +i10_im+i30_re;
2271  b1_re = +i11_re-i31_im;
2272  b1_im = +i11_im+i31_re;
2273  b2_re = +i12_re-i32_im;
2274  b2_im = +i12_im+i32_re;
2275 
2276 #ifdef MULTI_GPU
2277  } else {
2278 
2279  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2280 
2281  // read half spinor for the second flavor from device memory
2282  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2283  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2284 
2285  a0_re = i00_re; a0_im = i00_im;
2286  a1_re = i01_re; a1_im = i01_im;
2287  a2_re = i02_re; a2_im = i02_im;
2288  b0_re = i10_re; b0_im = i10_im;
2289  b1_re = i11_re; b1_im = i11_im;
2290  b2_re = i12_re; b2_im = i12_im;
2291 
2292  }
2293 #endif // MULTI_GPU
2294 
2295  // multiply row 0
2296  spinorFloat A0_re = 0;
2297  A0_re += gT00_re * a0_re;
2298  A0_re -= gT00_im * a0_im;
2299  A0_re += gT01_re * a1_re;
2300  A0_re -= gT01_im * a1_im;
2301  A0_re += gT02_re * a2_re;
2302  A0_re -= gT02_im * a2_im;
2303  spinorFloat A0_im = 0;
2304  A0_im += gT00_re * a0_im;
2305  A0_im += gT00_im * a0_re;
2306  A0_im += gT01_re * a1_im;
2307  A0_im += gT01_im * a1_re;
2308  A0_im += gT02_re * a2_im;
2309  A0_im += gT02_im * a2_re;
2310  spinorFloat B0_re = 0;
2311  B0_re += gT00_re * b0_re;
2312  B0_re -= gT00_im * b0_im;
2313  B0_re += gT01_re * b1_re;
2314  B0_re -= gT01_im * b1_im;
2315  B0_re += gT02_re * b2_re;
2316  B0_re -= gT02_im * b2_im;
2317  spinorFloat B0_im = 0;
2318  B0_im += gT00_re * b0_im;
2319  B0_im += gT00_im * b0_re;
2320  B0_im += gT01_re * b1_im;
2321  B0_im += gT01_im * b1_re;
2322  B0_im += gT02_re * b2_im;
2323  B0_im += gT02_im * b2_re;
2324 
2325  // multiply row 1
2326  spinorFloat A1_re = 0;
2327  A1_re += gT10_re * a0_re;
2328  A1_re -= gT10_im * a0_im;
2329  A1_re += gT11_re * a1_re;
2330  A1_re -= gT11_im * a1_im;
2331  A1_re += gT12_re * a2_re;
2332  A1_re -= gT12_im * a2_im;
2333  spinorFloat A1_im = 0;
2334  A1_im += gT10_re * a0_im;
2335  A1_im += gT10_im * a0_re;
2336  A1_im += gT11_re * a1_im;
2337  A1_im += gT11_im * a1_re;
2338  A1_im += gT12_re * a2_im;
2339  A1_im += gT12_im * a2_re;
2340  spinorFloat B1_re = 0;
2341  B1_re += gT10_re * b0_re;
2342  B1_re -= gT10_im * b0_im;
2343  B1_re += gT11_re * b1_re;
2344  B1_re -= gT11_im * b1_im;
2345  B1_re += gT12_re * b2_re;
2346  B1_re -= gT12_im * b2_im;
2347  spinorFloat B1_im = 0;
2348  B1_im += gT10_re * b0_im;
2349  B1_im += gT10_im * b0_re;
2350  B1_im += gT11_re * b1_im;
2351  B1_im += gT11_im * b1_re;
2352  B1_im += gT12_re * b2_im;
2353  B1_im += gT12_im * b2_re;
2354 
2355  // multiply row 2
2356  spinorFloat A2_re = 0;
2357  A2_re += gT20_re * a0_re;
2358  A2_re -= gT20_im * a0_im;
2359  A2_re += gT21_re * a1_re;
2360  A2_re -= gT21_im * a1_im;
2361  A2_re += gT22_re * a2_re;
2362  A2_re -= gT22_im * a2_im;
2363  spinorFloat A2_im = 0;
2364  A2_im += gT20_re * a0_im;
2365  A2_im += gT20_im * a0_re;
2366  A2_im += gT21_re * a1_im;
2367  A2_im += gT21_im * a1_re;
2368  A2_im += gT22_re * a2_im;
2369  A2_im += gT22_im * a2_re;
2370  spinorFloat B2_re = 0;
2371  B2_re += gT20_re * b0_re;
2372  B2_re -= gT20_im * b0_im;
2373  B2_re += gT21_re * b1_re;
2374  B2_re -= gT21_im * b1_im;
2375  B2_re += gT22_re * b2_re;
2376  B2_re -= gT22_im * b2_im;
2377  spinorFloat B2_im = 0;
2378  B2_im += gT20_re * b0_im;
2379  B2_im += gT20_im * b0_re;
2380  B2_im += gT21_re * b1_im;
2381  B2_im += gT21_im * b1_re;
2382  B2_im += gT22_re * b2_im;
2383  B2_im += gT22_im * b2_re;
2384 
2385  o2_00_re += A0_re;
2386  o2_00_im += A0_im;
2387  o2_10_re += B0_re;
2388  o2_10_im += B0_im;
2389  o2_20_re -= A0_im;
2390  o2_20_im += A0_re;
2391  o2_30_re += B0_im;
2392  o2_30_im -= B0_re;
2393 
2394  o2_01_re += A1_re;
2395  o2_01_im += A1_im;
2396  o2_11_re += B1_re;
2397  o2_11_im += B1_im;
2398  o2_21_re -= A1_im;
2399  o2_21_im += A1_re;
2400  o2_31_re += B1_im;
2401  o2_31_im -= B1_re;
2402 
2403  o2_02_re += A2_re;
2404  o2_02_im += A2_im;
2405  o2_12_re += B2_re;
2406  o2_12_im += B2_im;
2407  o2_22_re -= A2_im;
2408  o2_22_im += A2_re;
2409  o2_32_re += B2_im;
2410  o2_32_im -= B2_re;
2411 
2412  }
2413 }
2414 
2415 #ifdef MULTI_GPU
2416 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
2417  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
2418 #endif
2419 {
2420  // Projector P3+
2421  // 2 0 0 0
2422  // 0 2 0 0
2423  // 0 0 0 0
2424  // 0 0 0 0
2425 
2426 #ifdef MULTI_GPU
2427  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
2428  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2429 #else
2430  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
2431 #endif
2432 
2433  const int ga_idx = sid;
2434 
2441 
2443  {
2444  {
2445 #ifdef MULTI_GPU
2446  if (kernel_type == INTERIOR_KERNEL) {
2447 #endif
2448 
2449  // read flavor 1 from device memory
2450  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
2451 
2452  // project spinor into half spinors
2453  a0_re = +2*i00_re;
2454  a0_im = +2*i00_im;
2455  a1_re = +2*i01_re;
2456  a1_im = +2*i01_im;
2457  a2_re = +2*i02_re;
2458  a2_im = +2*i02_im;
2459  b0_re = +2*i10_re;
2460  b0_im = +2*i10_im;
2461  b1_re = +2*i11_re;
2462  b1_im = +2*i11_im;
2463  b2_re = +2*i12_re;
2464  b2_im = +2*i12_im;
2465 
2466 #ifdef MULTI_GPU
2467  } else {
2468 
2469  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2470 
2471  // read half spinor for the first flavor from device memory
2472  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2473 
2474  a0_re = i00_re; a0_im = i00_im;
2475  a1_re = i01_re; a1_im = i01_im;
2476  a2_re = i02_re; a2_im = i02_im;
2477  b0_re = i10_re; b0_im = i10_im;
2478  b1_re = i11_re; b1_im = i11_im;
2479  b2_re = i12_re; b2_im = i12_im;
2480 
2481  }
2482 #endif // MULTI_GPU
2483 
2484  // identity gauge matrix
2491 
2492  o1_00_re += A0_re;
2493  o1_00_im += A0_im;
2494  o1_10_re += B0_re;
2495  o1_10_im += B0_im;
2496 
2497  o1_01_re += A1_re;
2498  o1_01_im += A1_im;
2499  o1_11_re += B1_re;
2500  o1_11_im += B1_im;
2501 
2502  o1_02_re += A2_re;
2503  o1_02_im += A2_im;
2504  o1_12_re += B2_re;
2505  o1_12_im += B2_im;
2506 
2507  }
2508  {
2509 #ifdef MULTI_GPU
2510  if (kernel_type == INTERIOR_KERNEL) {
2511 #endif
2512 
2513  // read flavor 2 from device memory
2515 
2516  // project spinor into half spinors
2517  a0_re = +2*i00_re;
2518  a0_im = +2*i00_im;
2519  a1_re = +2*i01_re;
2520  a1_im = +2*i01_im;
2521  a2_re = +2*i02_re;
2522  a2_im = +2*i02_im;
2523  b0_re = +2*i10_re;
2524  b0_im = +2*i10_im;
2525  b1_re = +2*i11_re;
2526  b1_im = +2*i11_im;
2527  b2_re = +2*i12_re;
2528  b2_im = +2*i12_im;
2529 
2530 #ifdef MULTI_GPU
2531  } else {
2532 
2533  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2534 
2535  // read half spinor for the second flavor from device memory
2536  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2537  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+FLAVORS*ghostFace[static_cast<int>(kernel_type)]);
2538 
2539  a0_re = i00_re; a0_im = i00_im;
2540  a1_re = i01_re; a1_im = i01_im;
2541  a2_re = i02_re; a2_im = i02_im;
2542  b0_re = i10_re; b0_im = i10_im;
2543  b1_re = i11_re; b1_im = i11_im;
2544  b2_re = i12_re; b2_im = i12_im;
2545 
2546  }
2547 #endif // MULTI_GPU
2548 
2549  // identity gauge matrix
2556 
2557  o2_00_re += A0_re;
2558  o2_00_im += A0_im;
2559  o2_10_re += B0_re;
2560  o2_10_im += B0_im;
2561 
2562  o2_01_re += A1_re;
2563  o2_01_im += A1_im;
2564  o2_11_re += B1_re;
2565  o2_11_im += B1_im;
2566 
2567  o2_02_re += A2_re;
2568  o2_02_im += A2_im;
2569  o2_12_re += B2_re;
2570  o2_12_im += B2_im;
2571 
2572  }
2573  } else {
2574  // read gauge matrix from device memory
2575  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2576 
2577  // reconstruct gauge matrix
2579 
2580  {
2581 #ifdef MULTI_GPU
2582  if (kernel_type == INTERIOR_KERNEL) {
2583 #endif
2584 
2585  // read flavor 1 from device memory
2586  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
2587 
2588  // project spinor into half spinors
2589  a0_re = +2*i00_re;
2590  a0_im = +2*i00_im;
2591  a1_re = +2*i01_re;
2592  a1_im = +2*i01_im;
2593  a2_re = +2*i02_re;
2594  a2_im = +2*i02_im;
2595  b0_re = +2*i10_re;
2596  b0_im = +2*i10_im;
2597  b1_re = +2*i11_re;
2598  b1_im = +2*i11_im;
2599  b2_re = +2*i12_re;
2600  b2_im = +2*i12_im;
2601 
2602 #ifdef MULTI_GPU
2603  } else {
2604 
2605  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2606 
2607  // read half spinor for the first flavor from device memory
2608  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2609 
2610  a0_re = i00_re; a0_im = i00_im;
2611  a1_re = i01_re; a1_im = i01_im;
2612  a2_re = i02_re; a2_im = i02_im;
2613  b0_re = i10_re; b0_im = i10_im;
2614  b1_re = i11_re; b1_im = i11_im;
2615  b2_re = i12_re; b2_im = i12_im;
2616 
2617  }
2618 #endif // MULTI_GPU
2619 
2620  // multiply row 0
2621  spinorFloat A0_re = 0;
2622  A0_re += g00_re * a0_re;
2623  A0_re -= g00_im * a0_im;
2624  A0_re += g01_re * a1_re;
2625  A0_re -= g01_im * a1_im;
2626  A0_re += g02_re * a2_re;
2627  A0_re -= g02_im * a2_im;
2628  spinorFloat A0_im = 0;
2629  A0_im += g00_re * a0_im;
2630  A0_im += g00_im * a0_re;
2631  A0_im += g01_re * a1_im;
2632  A0_im += g01_im * a1_re;
2633  A0_im += g02_re * a2_im;
2634  A0_im += g02_im * a2_re;
2635  spinorFloat B0_re = 0;
2636  B0_re += g00_re * b0_re;
2637  B0_re -= g00_im * b0_im;
2638  B0_re += g01_re * b1_re;
2639  B0_re -= g01_im * b1_im;
2640  B0_re += g02_re * b2_re;
2641  B0_re -= g02_im * b2_im;
2642  spinorFloat B0_im = 0;
2643  B0_im += g00_re * b0_im;
2644  B0_im += g00_im * b0_re;
2645  B0_im += g01_re * b1_im;
2646  B0_im += g01_im * b1_re;
2647  B0_im += g02_re * b2_im;
2648  B0_im += g02_im * b2_re;
2649 
2650  // multiply row 1
2651  spinorFloat A1_re = 0;
2652  A1_re += g10_re * a0_re;
2653  A1_re -= g10_im * a0_im;
2654  A1_re += g11_re * a1_re;
2655  A1_re -= g11_im * a1_im;
2656  A1_re += g12_re * a2_re;
2657  A1_re -= g12_im * a2_im;
2658  spinorFloat A1_im = 0;
2659  A1_im += g10_re * a0_im;
2660  A1_im += g10_im * a0_re;
2661  A1_im += g11_re * a1_im;
2662  A1_im += g11_im * a1_re;
2663  A1_im += g12_re * a2_im;
2664  A1_im += g12_im * a2_re;
2665  spinorFloat B1_re = 0;
2666  B1_re += g10_re * b0_re;
2667  B1_re -= g10_im * b0_im;
2668  B1_re += g11_re * b1_re;
2669  B1_re -= g11_im * b1_im;
2670  B1_re += g12_re * b2_re;
2671  B1_re -= g12_im * b2_im;
2672  spinorFloat B1_im = 0;
2673  B1_im += g10_re * b0_im;
2674  B1_im += g10_im * b0_re;
2675  B1_im += g11_re * b1_im;
2676  B1_im += g11_im * b1_re;
2677  B1_im += g12_re * b2_im;
2678  B1_im += g12_im * b2_re;
2679 
2680  // multiply row 2
2681  spinorFloat A2_re = 0;
2682  A2_re += g20_re * a0_re;
2683  A2_re -= g20_im * a0_im;
2684  A2_re += g21_re * a1_re;
2685  A2_re -= g21_im * a1_im;
2686  A2_re += g22_re * a2_re;
2687  A2_re -= g22_im * a2_im;
2688  spinorFloat A2_im = 0;
2689  A2_im += g20_re * a0_im;
2690  A2_im += g20_im * a0_re;
2691  A2_im += g21_re * a1_im;
2692  A2_im += g21_im * a1_re;
2693  A2_im += g22_re * a2_im;
2694  A2_im += g22_im * a2_re;
2695  spinorFloat B2_re = 0;
2696  B2_re += g20_re * b0_re;
2697  B2_re -= g20_im * b0_im;
2698  B2_re += g21_re * b1_re;
2699  B2_re -= g21_im * b1_im;
2700  B2_re += g22_re * b2_re;
2701  B2_re -= g22_im * b2_im;
2702  spinorFloat B2_im = 0;
2703  B2_im += g20_re * b0_im;
2704  B2_im += g20_im * b0_re;
2705  B2_im += g21_re * b1_im;
2706  B2_im += g21_im * b1_re;
2707  B2_im += g22_re * b2_im;
2708  B2_im += g22_im * b2_re;
2709 
2710  o1_00_re += A0_re;
2711  o1_00_im += A0_im;
2712  o1_10_re += B0_re;
2713  o1_10_im += B0_im;
2714 
2715  o1_01_re += A1_re;
2716  o1_01_im += A1_im;
2717  o1_11_re += B1_re;
2718  o1_11_im += B1_im;
2719 
2720  o1_02_re += A2_re;
2721  o1_02_im += A2_im;
2722  o1_12_re += B2_re;
2723  o1_12_im += B2_im;
2724 
2725  }
2726  {
2727 #ifdef MULTI_GPU
2728  if (kernel_type == INTERIOR_KERNEL) {
2729 #endif
2730 
2731  // read flavor 2 from device memory
2733 
2734  // project spinor into half spinors
2735  a0_re = +2*i00_re;
2736  a0_im = +2*i00_im;
2737  a1_re = +2*i01_re;
2738  a1_im = +2*i01_im;
2739  a2_re = +2*i02_re;
2740  a2_im = +2*i02_im;
2741  b0_re = +2*i10_re;
2742  b0_im = +2*i10_im;
2743  b1_re = +2*i11_re;
2744  b1_im = +2*i11_im;
2745  b2_re = +2*i12_re;
2746  b2_im = +2*i12_im;
2747 
2748 #ifdef MULTI_GPU
2749  } else {
2750 
2751  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2752 
2753  // read half spinor for the second flavor from device memory
2754  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2755  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+FLAVORS*ghostFace[static_cast<int>(kernel_type)]);
2756 
2757  a0_re = i00_re; a0_im = i00_im;
2758  a1_re = i01_re; a1_im = i01_im;
2759  a2_re = i02_re; a2_im = i02_im;
2760  b0_re = i10_re; b0_im = i10_im;
2761  b1_re = i11_re; b1_im = i11_im;
2762  b2_re = i12_re; b2_im = i12_im;
2763 
2764  }
2765 #endif // MULTI_GPU
2766 
2767  // multiply row 0
2768  spinorFloat A0_re = 0;
2769  A0_re += g00_re * a0_re;
2770  A0_re -= g00_im * a0_im;
2771  A0_re += g01_re * a1_re;
2772  A0_re -= g01_im * a1_im;
2773  A0_re += g02_re * a2_re;
2774  A0_re -= g02_im * a2_im;
2775  spinorFloat A0_im = 0;
2776  A0_im += g00_re * a0_im;
2777  A0_im += g00_im * a0_re;
2778  A0_im += g01_re * a1_im;
2779  A0_im += g01_im * a1_re;
2780  A0_im += g02_re * a2_im;
2781  A0_im += g02_im * a2_re;
2782  spinorFloat B0_re = 0;
2783  B0_re += g00_re * b0_re;
2784  B0_re -= g00_im * b0_im;
2785  B0_re += g01_re * b1_re;
2786  B0_re -= g01_im * b1_im;
2787  B0_re += g02_re * b2_re;
2788  B0_re -= g02_im * b2_im;
2789  spinorFloat B0_im = 0;
2790  B0_im += g00_re * b0_im;
2791  B0_im += g00_im * b0_re;
2792  B0_im += g01_re * b1_im;
2793  B0_im += g01_im * b1_re;
2794  B0_im += g02_re * b2_im;
2795  B0_im += g02_im * b2_re;
2796 
2797  // multiply row 1
2798  spinorFloat A1_re = 0;
2799  A1_re += g10_re * a0_re;
2800  A1_re -= g10_im * a0_im;
2801  A1_re += g11_re * a1_re;
2802  A1_re -= g11_im * a1_im;
2803  A1_re += g12_re * a2_re;
2804  A1_re -= g12_im * a2_im;
2805  spinorFloat A1_im = 0;
2806  A1_im += g10_re * a0_im;
2807  A1_im += g10_im * a0_re;
2808  A1_im += g11_re * a1_im;
2809  A1_im += g11_im * a1_re;
2810  A1_im += g12_re * a2_im;
2811  A1_im += g12_im * a2_re;
2812  spinorFloat B1_re = 0;
2813  B1_re += g10_re * b0_re;
2814  B1_re -= g10_im * b0_im;
2815  B1_re += g11_re * b1_re;
2816  B1_re -= g11_im * b1_im;
2817  B1_re += g12_re * b2_re;
2818  B1_re -= g12_im * b2_im;
2819  spinorFloat B1_im = 0;
2820  B1_im += g10_re * b0_im;
2821  B1_im += g10_im * b0_re;
2822  B1_im += g11_re * b1_im;
2823  B1_im += g11_im * b1_re;
2824  B1_im += g12_re * b2_im;
2825  B1_im += g12_im * b2_re;
2826 
2827  // multiply row 2
2828  spinorFloat A2_re = 0;
2829  A2_re += g20_re * a0_re;
2830  A2_re -= g20_im * a0_im;
2831  A2_re += g21_re * a1_re;
2832  A2_re -= g21_im * a1_im;
2833  A2_re += g22_re * a2_re;
2834  A2_re -= g22_im * a2_im;
2835  spinorFloat A2_im = 0;
2836  A2_im += g20_re * a0_im;
2837  A2_im += g20_im * a0_re;
2838  A2_im += g21_re * a1_im;
2839  A2_im += g21_im * a1_re;
2840  A2_im += g22_re * a2_im;
2841  A2_im += g22_im * a2_re;
2842  spinorFloat B2_re = 0;
2843  B2_re += g20_re * b0_re;
2844  B2_re -= g20_im * b0_im;
2845  B2_re += g21_re * b1_re;
2846  B2_re -= g21_im * b1_im;
2847  B2_re += g22_re * b2_re;
2848  B2_re -= g22_im * b2_im;
2849  spinorFloat B2_im = 0;
2850  B2_im += g20_re * b0_im;
2851  B2_im += g20_im * b0_re;
2852  B2_im += g21_re * b1_im;
2853  B2_im += g21_im * b1_re;
2854  B2_im += g22_re * b2_im;
2855  B2_im += g22_im * b2_re;
2856 
2857  o2_00_re += A0_re;
2858  o2_00_im += A0_im;
2859  o2_10_re += B0_re;
2860  o2_10_im += B0_im;
2861 
2862  o2_01_re += A1_re;
2863  o2_01_im += A1_im;
2864  o2_11_re += B1_re;
2865  o2_11_im += B1_im;
2866 
2867  o2_02_re += A2_re;
2868  o2_02_im += A2_im;
2869  o2_12_re += B2_re;
2870  o2_12_im += B2_im;
2871 
2872  }
2873  }
2874 }
2875 
2876 #ifdef MULTI_GPU
2877 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2878  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2879 #endif
2880 {
2881  // Projector P3-
2882  // 0 0 0 0
2883  // 0 0 0 0
2884  // 0 0 2 0
2885  // 0 0 0 2
2886 
2887 #ifdef MULTI_GPU
2888  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2889  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2890 #else
2891  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2892 #endif
2893 
2894 #ifdef MULTI_GPU
2895  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2896 #else
2897  const int ga_idx = sp_idx;
2898 #endif
2899 
2906 
2907  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2908  {
2909  {
2910 #ifdef MULTI_GPU
2911  if (kernel_type == INTERIOR_KERNEL) {
2912 #endif
2913 
2914  // read flavor 1 from device memory
2915  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
2916 
2917  // project spinor into half spinors
2918  a0_re = +2*i20_re;
2919  a0_im = +2*i20_im;
2920  a1_re = +2*i21_re;
2921  a1_im = +2*i21_im;
2922  a2_re = +2*i22_re;
2923  a2_im = +2*i22_im;
2924  b0_re = +2*i30_re;
2925  b0_im = +2*i30_im;
2926  b1_re = +2*i31_re;
2927  b1_im = +2*i31_im;
2928  b2_re = +2*i32_re;
2929  b2_im = +2*i32_im;
2930 
2931 #ifdef MULTI_GPU
2932  } else {
2933 
2934  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2935 
2936  // read half spinor for the first flavor from device memory
2937  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2938 
2939  a0_re = i00_re; a0_im = i00_im;
2940  a1_re = i01_re; a1_im = i01_im;
2941  a2_re = i02_re; a2_im = i02_im;
2942  b0_re = i10_re; b0_im = i10_im;
2943  b1_re = i11_re; b1_im = i11_im;
2944  b2_re = i12_re; b2_im = i12_im;
2945 
2946  }
2947 #endif // MULTI_GPU
2948 
2949  // identity gauge matrix
2956 
2957  o1_20_re += A0_re;
2958  o1_20_im += A0_im;
2959  o1_30_re += B0_re;
2960  o1_30_im += B0_im;
2961 
2962  o1_21_re += A1_re;
2963  o1_21_im += A1_im;
2964  o1_31_re += B1_re;
2965  o1_31_im += B1_im;
2966 
2967  o1_22_re += A2_re;
2968  o1_22_im += A2_im;
2969  o1_32_re += B2_re;
2970  o1_32_im += B2_im;
2971 
2972  }
2973  {
2974 #ifdef MULTI_GPU
2975  if (kernel_type == INTERIOR_KERNEL) {
2976 #endif
2977 
2978  // read flavor 2 from device memory
2980 
2981  // project spinor into half spinors
2982  a0_re = +2*i20_re;
2983  a0_im = +2*i20_im;
2984  a1_re = +2*i21_re;
2985  a1_im = +2*i21_im;
2986  a2_re = +2*i22_re;
2987  a2_im = +2*i22_im;
2988  b0_re = +2*i30_re;
2989  b0_im = +2*i30_im;
2990  b1_re = +2*i31_re;
2991  b1_im = +2*i31_im;
2992  b2_re = +2*i32_re;
2993  b2_im = +2*i32_im;
2994 
2995 #ifdef MULTI_GPU
2996  } else {
2997 
2998  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2999 
3000  // read half spinor for the second flavor from device memory
3001  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
3002  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
3003 
3004  a0_re = i00_re; a0_im = i00_im;
3005  a1_re = i01_re; a1_im = i01_im;
3006  a2_re = i02_re; a2_im = i02_im;
3007  b0_re = i10_re; b0_im = i10_im;
3008  b1_re = i11_re; b1_im = i11_im;
3009  b2_re = i12_re; b2_im = i12_im;
3010 
3011  }
3012 #endif // MULTI_GPU
3013 
3014  // identity gauge matrix
3021 
3022  o2_20_re += A0_re;
3023  o2_20_im += A0_im;
3024  o2_30_re += B0_re;
3025  o2_30_im += B0_im;
3026 
3027  o2_21_re += A1_re;
3028  o2_21_im += A1_im;
3029  o2_31_re += B1_re;
3030  o2_31_im += B1_im;
3031 
3032  o2_22_re += A2_re;
3033  o2_22_im += A2_im;
3034  o2_32_re += B2_re;
3035  o2_32_im += B2_im;
3036 
3037  }
3038  } else {
3039  // read gauge matrix from device memory
3040  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
3041 
3042  // reconstruct gauge matrix
3044 
3045  {
3046 #ifdef MULTI_GPU
3047  if (kernel_type == INTERIOR_KERNEL) {
3048 #endif
3049 
3050  // read flavor 1 from device memory
3051  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
3052 
3053  // project spinor into half spinors
3054  a0_re = +2*i20_re;
3055  a0_im = +2*i20_im;
3056  a1_re = +2*i21_re;
3057  a1_im = +2*i21_im;
3058  a2_re = +2*i22_re;
3059  a2_im = +2*i22_im;
3060  b0_re = +2*i30_re;
3061  b0_im = +2*i30_im;
3062  b1_re = +2*i31_re;
3063  b1_im = +2*i31_im;
3064  b2_re = +2*i32_re;
3065  b2_im = +2*i32_im;
3066 
3067 #ifdef MULTI_GPU
3068  } else {
3069 
3070  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
3071 
3072  // read half spinor for the first flavor from device memory
3073  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
3074 
3075  a0_re = i00_re; a0_im = i00_im;
3076  a1_re = i01_re; a1_im = i01_im;
3077  a2_re = i02_re; a2_im = i02_im;
3078  b0_re = i10_re; b0_im = i10_im;
3079  b1_re = i11_re; b1_im = i11_im;
3080  b2_re = i12_re; b2_im = i12_im;
3081 
3082  }
3083 #endif // MULTI_GPU
3084 
3085  // multiply row 0
3086  spinorFloat A0_re = 0;
3087  A0_re += gT00_re * a0_re;
3088  A0_re -= gT00_im * a0_im;
3089  A0_re += gT01_re * a1_re;
3090  A0_re -= gT01_im * a1_im;
3091  A0_re += gT02_re * a2_re;
3092  A0_re -= gT02_im * a2_im;
3093  spinorFloat A0_im = 0;
3094  A0_im += gT00_re * a0_im;
3095  A0_im += gT00_im * a0_re;
3096  A0_im += gT01_re * a1_im;
3097  A0_im += gT01_im * a1_re;
3098  A0_im += gT02_re * a2_im;
3099  A0_im += gT02_im * a2_re;
3100  spinorFloat B0_re = 0;
3101  B0_re += gT00_re * b0_re;
3102  B0_re -= gT00_im * b0_im;
3103  B0_re += gT01_re * b1_re;
3104  B0_re -= gT01_im * b1_im;
3105  B0_re += gT02_re * b2_re;
3106  B0_re -= gT02_im * b2_im;
3107  spinorFloat B0_im = 0;
3108  B0_im += gT00_re * b0_im;
3109  B0_im += gT00_im * b0_re;
3110  B0_im += gT01_re * b1_im;
3111  B0_im += gT01_im * b1_re;
3112  B0_im += gT02_re * b2_im;
3113  B0_im += gT02_im * b2_re;
3114 
3115  // multiply row 1
3116  spinorFloat A1_re = 0;
3117  A1_re += gT10_re * a0_re;
3118  A1_re -= gT10_im * a0_im;
3119  A1_re += gT11_re * a1_re;
3120  A1_re -= gT11_im * a1_im;
3121  A1_re += gT12_re * a2_re;
3122  A1_re -= gT12_im * a2_im;
3123  spinorFloat A1_im = 0;
3124  A1_im += gT10_re * a0_im;
3125  A1_im += gT10_im * a0_re;
3126  A1_im += gT11_re * a1_im;
3127  A1_im += gT11_im * a1_re;
3128  A1_im += gT12_re * a2_im;
3129  A1_im += gT12_im * a2_re;
3130  spinorFloat B1_re = 0;
3131  B1_re += gT10_re * b0_re;
3132  B1_re -= gT10_im * b0_im;
3133  B1_re += gT11_re * b1_re;
3134  B1_re -= gT11_im * b1_im;
3135  B1_re += gT12_re * b2_re;
3136  B1_re -= gT12_im * b2_im;
3137  spinorFloat B1_im = 0;
3138  B1_im += gT10_re * b0_im;
3139  B1_im += gT10_im * b0_re;
3140  B1_im += gT11_re * b1_im;
3141  B1_im += gT11_im * b1_re;
3142  B1_im += gT12_re * b2_im;
3143  B1_im += gT12_im * b2_re;
3144 
3145  // multiply row 2
3146  spinorFloat A2_re = 0;
3147  A2_re += gT20_re * a0_re;
3148  A2_re -= gT20_im * a0_im;
3149  A2_re += gT21_re * a1_re;
3150  A2_re -= gT21_im * a1_im;
3151  A2_re += gT22_re * a2_re;
3152  A2_re -= gT22_im * a2_im;
3153  spinorFloat A2_im = 0;
3154  A2_im += gT20_re * a0_im;
3155  A2_im += gT20_im * a0_re;
3156  A2_im += gT21_re * a1_im;
3157  A2_im += gT21_im * a1_re;
3158  A2_im += gT22_re * a2_im;
3159  A2_im += gT22_im * a2_re;
3160  spinorFloat B2_re = 0;
3161  B2_re += gT20_re * b0_re;
3162  B2_re -= gT20_im * b0_im;
3163  B2_re += gT21_re * b1_re;
3164  B2_re -= gT21_im * b1_im;
3165  B2_re += gT22_re * b2_re;
3166  B2_re -= gT22_im * b2_im;
3167  spinorFloat B2_im = 0;
3168  B2_im += gT20_re * b0_im;
3169  B2_im += gT20_im * b0_re;
3170  B2_im += gT21_re * b1_im;
3171  B2_im += gT21_im * b1_re;
3172  B2_im += gT22_re * b2_im;
3173  B2_im += gT22_im * b2_re;
3174 
3175  o1_20_re += A0_re;
3176  o1_20_im += A0_im;
3177  o1_30_re += B0_re;
3178  o1_30_im += B0_im;
3179 
3180  o1_21_re += A1_re;
3181  o1_21_im += A1_im;
3182  o1_31_re += B1_re;
3183  o1_31_im += B1_im;
3184 
3185  o1_22_re += A2_re;
3186  o1_22_im += A2_im;
3187  o1_32_re += B2_re;
3188  o1_32_im += B2_im;
3189 
3190  }
3191  {
3192 #ifdef MULTI_GPU
3193  if (kernel_type == INTERIOR_KERNEL) {
3194 #endif
3195 
3196  // read flavor 2 from device memory
3198 
3199  // project spinor into half spinors
3200  a0_re = +2*i20_re;
3201  a0_im = +2*i20_im;
3202  a1_re = +2*i21_re;
3203  a1_im = +2*i21_im;
3204  a2_re = +2*i22_re;
3205  a2_im = +2*i22_im;
3206  b0_re = +2*i30_re;
3207  b0_im = +2*i30_im;
3208  b1_re = +2*i31_re;
3209  b1_im = +2*i31_im;
3210  b2_re = +2*i32_re;
3211  b2_im = +2*i32_im;
3212 
3213 #ifdef MULTI_GPU
3214  } else {
3215 
3216  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
3217 
3218  // read half spinor for the second flavor from device memory
3219  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
3220  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
3221 
3222  a0_re = i00_re; a0_im = i00_im;
3223  a1_re = i01_re; a1_im = i01_im;
3224  a2_re = i02_re; a2_im = i02_im;
3225  b0_re = i10_re; b0_im = i10_im;
3226  b1_re = i11_re; b1_im = i11_im;
3227  b2_re = i12_re; b2_im = i12_im;
3228 
3229  }
3230 #endif // MULTI_GPU
3231 
3232  // multiply row 0
3233  spinorFloat A0_re = 0;
3234  A0_re += gT00_re * a0_re;
3235  A0_re -= gT00_im * a0_im;
3236  A0_re += gT01_re * a1_re;
3237  A0_re -= gT01_im * a1_im;
3238  A0_re += gT02_re * a2_re;
3239  A0_re -= gT02_im * a2_im;
3240  spinorFloat A0_im = 0;
3241  A0_im += gT00_re * a0_im;
3242  A0_im += gT00_im * a0_re;
3243  A0_im += gT01_re * a1_im;
3244  A0_im += gT01_im * a1_re;
3245  A0_im += gT02_re * a2_im;
3246  A0_im += gT02_im * a2_re;
3247  spinorFloat B0_re = 0;
3248  B0_re += gT00_re * b0_re;
3249  B0_re -= gT00_im * b0_im;
3250  B0_re += gT01_re * b1_re;
3251  B0_re -= gT01_im * b1_im;
3252  B0_re += gT02_re * b2_re;
3253  B0_re -= gT02_im * b2_im;
3254  spinorFloat B0_im = 0;
3255  B0_im += gT00_re * b0_im;
3256  B0_im += gT00_im * b0_re;
3257  B0_im += gT01_re * b1_im;
3258  B0_im += gT01_im * b1_re;
3259  B0_im += gT02_re * b2_im;
3260  B0_im += gT02_im * b2_re;
3261 
3262  // multiply row 1
3263  spinorFloat A1_re = 0;
3264  A1_re += gT10_re * a0_re;
3265  A1_re -= gT10_im * a0_im;
3266  A1_re += gT11_re * a1_re;
3267  A1_re -= gT11_im * a1_im;
3268  A1_re += gT12_re * a2_re;
3269  A1_re -= gT12_im * a2_im;
3270  spinorFloat A1_im = 0;
3271  A1_im += gT10_re * a0_im;
3272  A1_im += gT10_im * a0_re;
3273  A1_im += gT11_re * a1_im;
3274  A1_im += gT11_im * a1_re;
3275  A1_im += gT12_re * a2_im;
3276  A1_im += gT12_im * a2_re;
3277  spinorFloat B1_re = 0;
3278  B1_re += gT10_re * b0_re;
3279  B1_re -= gT10_im * b0_im;
3280  B1_re += gT11_re * b1_re;
3281  B1_re -= gT11_im * b1_im;
3282  B1_re += gT12_re * b2_re;
3283  B1_re -= gT12_im * b2_im;
3284  spinorFloat B1_im = 0;
3285  B1_im += gT10_re * b0_im;
3286  B1_im += gT10_im * b0_re;
3287  B1_im += gT11_re * b1_im;
3288  B1_im += gT11_im * b1_re;
3289  B1_im += gT12_re * b2_im;
3290  B1_im += gT12_im * b2_re;
3291 
3292  // multiply row 2
3293  spinorFloat A2_re = 0;
3294  A2_re += gT20_re * a0_re;
3295  A2_re -= gT20_im * a0_im;
3296  A2_re += gT21_re * a1_re;
3297  A2_re -= gT21_im * a1_im;
3298  A2_re += gT22_re * a2_re;
3299  A2_re -= gT22_im * a2_im;
3300  spinorFloat A2_im = 0;
3301  A2_im += gT20_re * a0_im;
3302  A2_im += gT20_im * a0_re;
3303  A2_im += gT21_re * a1_im;
3304  A2_im += gT21_im * a1_re;
3305  A2_im += gT22_re * a2_im;
3306  A2_im += gT22_im * a2_re;
3307  spinorFloat B2_re = 0;
3308  B2_re += gT20_re * b0_re;
3309  B2_re -= gT20_im * b0_im;
3310  B2_re += gT21_re * b1_re;
3311  B2_re -= gT21_im * b1_im;
3312  B2_re += gT22_re * b2_re;
3313  B2_re -= gT22_im * b2_im;
3314  spinorFloat B2_im = 0;
3315  B2_im += gT20_re * b0_im;
3316  B2_im += gT20_im * b0_re;
3317  B2_im += gT21_re * b1_im;
3318  B2_im += gT21_im * b1_re;
3319  B2_im += gT22_re * b2_im;
3320  B2_im += gT22_im * b2_re;
3321 
3322  o2_20_re += A0_re;
3323  o2_20_im += A0_im;
3324  o2_30_re += B0_re;
3325  o2_30_im += B0_im;
3326 
3327  o2_21_re += A1_re;
3328  o2_21_im += A1_im;
3329  o2_31_re += B1_re;
3330  o2_31_im += B1_im;
3331 
3332  o2_22_re += A2_re;
3333  o2_22_im += A2_im;
3334  o2_32_re += B2_re;
3335  o2_32_im += B2_im;
3336 
3337  }
3338  }
3339 }
3340 
3341 #ifdef MULTI_GPU
3342 
3343 int incomplete = 0; // Have all 8 contributions been computed for this site?
3344 
3345 switch(kernel_type) { // intentional fall-through
3346 case INTERIOR_KERNEL:
3347  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
3348 case EXTERIOR_KERNEL_T:
3349  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
3350 case EXTERIOR_KERNEL_Z:
3351  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
3352 case EXTERIOR_KERNEL_Y:
3353  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
3354 }
3355 
3356 
3357 if (!incomplete)
3358 #endif // MULTI_GPU
3359 // apply twisted mass rotation
3360 {
3361 
3362  {
3363  //Perform twist rotation first:
3364  //(1 + i*a*gamma_5 * tau_3 + b * tau_1)
3365  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3366  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3367 
3368  x1_re = 0.0, x1_im = 0.0;
3369  y1_re = 0.0, y1_im = 0.0;
3370  x2_re = 0.0, x2_im = 0.0;
3371  y2_re = 0.0, y2_im = 0.0;
3372 
3373 
3374  // using o1 regs:
3375  x1_re = o1_00_re - a *o1_20_im;
3376  x1_im = o1_00_im + a *o1_20_re;
3377  x2_re = b * o1_00_re;
3378  x2_im = b * o1_00_im;
3379 
3380  y1_re = o1_20_re - a *o1_00_im;
3381  y1_im = o1_20_im + a *o1_00_re;
3382  y2_re = b * o1_20_re;
3383  y2_im = b * o1_20_im;
3384 
3385 
3386  // using o2 regs:
3387  x2_re += o2_00_re + a *o2_20_im;
3388  x2_im += o2_00_im - a *o2_20_re;
3389  x1_re += b * o2_00_re;
3390  x1_im += b * o2_00_im;
3391 
3392  y2_re += o2_20_re + a *o2_00_im;
3393  y2_im += o2_20_im - a *o2_00_re;
3394  y1_re += b * o2_20_re;
3395  y1_im += b * o2_20_im;
3396 
3397 
3398  o1_00_re = x1_re; o1_00_im = x1_im;
3399  o1_20_re = y1_re; o1_20_im = y1_im;
3400 
3401  o2_00_re = x2_re; o2_00_im = x2_im;
3402  o2_20_re = y2_re; o2_20_im = y2_im;
3403 
3404  // using o1 regs:
3405  x1_re = o1_10_re - a *o1_30_im;
3406  x1_im = o1_10_im + a *o1_30_re;
3407  x2_re = b * o1_10_re;
3408  x2_im = b * o1_10_im;
3409 
3410  y1_re = o1_30_re - a *o1_10_im;
3411  y1_im = o1_30_im + a *o1_10_re;
3412  y2_re = b * o1_30_re;
3413  y2_im = b * o1_30_im;
3414 
3415 
3416  // using o2 regs:
3417  x2_re += o2_10_re + a *o2_30_im;
3418  x2_im += o2_10_im - a *o2_30_re;
3419  x1_re += b * o2_10_re;
3420  x1_im += b * o2_10_im;
3421 
3422  y2_re += o2_30_re + a *o2_10_im;
3423  y2_im += o2_30_im - a *o2_10_re;
3424  y1_re += b * o2_30_re;
3425  y1_im += b * o2_30_im;
3426 
3427 
3428  o1_10_re = x1_re; o1_10_im = x1_im;
3429  o1_30_re = y1_re; o1_30_im = y1_im;
3430 
3431  o2_10_re = x2_re; o2_10_im = x2_im;
3432  o2_30_re = y2_re; o2_30_im = y2_im;
3433 
3434  // using o1 regs:
3435  x1_re = o1_01_re - a *o1_21_im;
3436  x1_im = o1_01_im + a *o1_21_re;
3437  x2_re = b * o1_01_re;
3438  x2_im = b * o1_01_im;
3439 
3440  y1_re = o1_21_re - a *o1_01_im;
3441  y1_im = o1_21_im + a *o1_01_re;
3442  y2_re = b * o1_21_re;
3443  y2_im = b * o1_21_im;
3444 
3445 
3446  // using o2 regs:
3447  x2_re += o2_01_re + a *o2_21_im;
3448  x2_im += o2_01_im - a *o2_21_re;
3449  x1_re += b * o2_01_re;
3450  x1_im += b * o2_01_im;
3451 
3452  y2_re += o2_21_re + a *o2_01_im;
3453  y2_im += o2_21_im - a *o2_01_re;
3454  y1_re += b * o2_21_re;
3455  y1_im += b * o2_21_im;
3456 
3457 
3458  o1_01_re = x1_re; o1_01_im = x1_im;
3459  o1_21_re = y1_re; o1_21_im = y1_im;
3460 
3461  o2_01_re = x2_re; o2_01_im = x2_im;
3462  o2_21_re = y2_re; o2_21_im = y2_im;
3463 
3464  // using o1 regs:
3465  x1_re = o1_11_re - a *o1_31_im;
3466  x1_im = o1_11_im + a *o1_31_re;
3467  x2_re = b * o1_11_re;
3468  x2_im = b * o1_11_im;
3469 
3470  y1_re = o1_31_re - a *o1_11_im;
3471  y1_im = o1_31_im + a *o1_11_re;
3472  y2_re = b * o1_31_re;
3473  y2_im = b * o1_31_im;
3474 
3475 
3476  // using o2 regs:
3477  x2_re += o2_11_re + a *o2_31_im;
3478  x2_im += o2_11_im - a *o2_31_re;
3479  x1_re += b * o2_11_re;
3480  x1_im += b * o2_11_im;
3481 
3482  y2_re += o2_31_re + a *o2_11_im;
3483  y2_im += o2_31_im - a *o2_11_re;
3484  y1_re += b * o2_31_re;
3485  y1_im += b * o2_31_im;
3486 
3487 
3488  o1_11_re = x1_re; o1_11_im = x1_im;
3489  o1_31_re = y1_re; o1_31_im = y1_im;
3490 
3491  o2_11_re = x2_re; o2_11_im = x2_im;
3492  o2_31_re = y2_re; o2_31_im = y2_im;
3493 
3494  // using o1 regs:
3495  x1_re = o1_02_re - a *o1_22_im;
3496  x1_im = o1_02_im + a *o1_22_re;
3497  x2_re = b * o1_02_re;
3498  x2_im = b * o1_02_im;
3499 
3500  y1_re = o1_22_re - a *o1_02_im;
3501  y1_im = o1_22_im + a *o1_02_re;
3502  y2_re = b * o1_22_re;
3503  y2_im = b * o1_22_im;
3504 
3505 
3506  // using o2 regs:
3507  x2_re += o2_02_re + a *o2_22_im;
3508  x2_im += o2_02_im - a *o2_22_re;
3509  x1_re += b * o2_02_re;
3510  x1_im += b * o2_02_im;
3511 
3512  y2_re += o2_22_re + a *o2_02_im;
3513  y2_im += o2_22_im - a *o2_02_re;
3514  y1_re += b * o2_22_re;
3515  y1_im += b * o2_22_im;
3516 
3517 
3518  o1_02_re = x1_re; o1_02_im = x1_im;
3519  o1_22_re = y1_re; o1_22_im = y1_im;
3520 
3521  o2_02_re = x2_re; o2_02_im = x2_im;
3522  o2_22_re = y2_re; o2_22_im = y2_im;
3523 
3524  // using o1 regs:
3525  x1_re = o1_12_re - a *o1_32_im;
3526  x1_im = o1_12_im + a *o1_32_re;
3527  x2_re = b * o1_12_re;
3528  x2_im = b * o1_12_im;
3529 
3530  y1_re = o1_32_re - a *o1_12_im;
3531  y1_im = o1_32_im + a *o1_12_re;
3532  y2_re = b * o1_32_re;
3533  y2_im = b * o1_32_im;
3534 
3535 
3536  // using o2 regs:
3537  x2_re += o2_12_re + a *o2_32_im;
3538  x2_im += o2_12_im - a *o2_32_re;
3539  x1_re += b * o2_12_re;
3540  x1_im += b * o2_12_im;
3541 
3542  y2_re += o2_32_re + a *o2_12_im;
3543  y2_im += o2_32_im - a *o2_12_re;
3544  y1_re += b * o2_32_re;
3545  y1_im += b * o2_32_im;
3546 
3547 
3548  o1_12_re = x1_re; o1_12_im = x1_im;
3549  o1_32_re = y1_re; o1_32_im = y1_im;
3550 
3551  o2_12_re = x2_re; o2_12_im = x2_im;
3552  o2_32_re = y2_re; o2_32_im = y2_im;
3553 
3554 
3555  }
3556 
3557 #ifndef DSLASH_XPAY
3558  o1_00_re *= c;
3559  o1_00_im *= c;
3560  o1_01_re *= c;
3561  o1_01_im *= c;
3562  o1_02_re *= c;
3563  o1_02_im *= c;
3564  o1_10_re *= c;
3565  o1_10_im *= c;
3566  o1_11_re *= c;
3567  o1_11_im *= c;
3568  o1_12_re *= c;
3569  o1_12_im *= c;
3570  o1_20_re *= c;
3571  o1_20_im *= c;
3572  o1_21_re *= c;
3573  o1_21_im *= c;
3574  o1_22_re *= c;
3575  o1_22_im *= c;
3576  o1_30_re *= c;
3577  o1_30_im *= c;
3578  o1_31_re *= c;
3579  o1_31_im *= c;
3580  o1_32_re *= c;
3581  o1_32_im *= c;
3582 
3583  o2_00_re *= c;
3584  o2_00_im *= c;
3585  o2_01_re *= c;
3586  o2_01_im *= c;
3587  o2_02_re *= c;
3588  o2_02_im *= c;
3589  o2_10_re *= c;
3590  o2_10_im *= c;
3591  o2_11_re *= c;
3592  o2_11_im *= c;
3593  o2_12_re *= c;
3594  o2_12_im *= c;
3595  o2_20_re *= c;
3596  o2_20_im *= c;
3597  o2_21_re *= c;
3598  o2_21_im *= c;
3599  o2_22_re *= c;
3600  o2_22_im *= c;
3601  o2_30_re *= c;
3602  o2_30_im *= c;
3603  o2_31_re *= c;
3604  o2_31_im *= c;
3605  o2_32_re *= c;
3606  o2_32_im *= c;
3607 #else
3608  int tmp = sid;
3609  {
3610  READ_ACCUM(ACCUMTEX, sp_stride)
3611 
3612 #ifdef SPINOR_DOUBLE
3613  o1_00_re = c*o1_00_re + accum0.x;
3614  o1_00_im = c*o1_00_im + accum0.y;
3615  o1_01_re = c*o1_01_re + accum1.x;
3616  o1_01_im = c*o1_01_im + accum1.y;
3617  o1_02_re = c*o1_02_re + accum2.x;
3618  o1_02_im = c*o1_02_im + accum2.y;
3619  o1_10_re = c*o1_10_re + accum3.x;
3620  o1_10_im = c*o1_10_im + accum3.y;
3621  o1_11_re = c*o1_11_re + accum4.x;
3622  o1_11_im = c*o1_11_im + accum4.y;
3623  o1_12_re = c*o1_12_re + accum5.x;
3624  o1_12_im = c*o1_12_im + accum5.y;
3625  o1_20_re = c*o1_20_re + accum6.x;
3626  o1_20_im = c*o1_20_im + accum6.y;
3627  o1_21_re = c*o1_21_re + accum7.x;
3628  o1_21_im = c*o1_21_im + accum7.y;
3629  o1_22_re = c*o1_22_re + accum8.x;
3630  o1_22_im = c*o1_22_im + accum8.y;
3631  o1_30_re = c*o1_30_re + accum9.x;
3632  o1_30_im = c*o1_30_im + accum9.y;
3633  o1_31_re = c*o1_31_re + accum10.x;
3634  o1_31_im = c*o1_31_im + accum10.y;
3635  o1_32_re = c*o1_32_re + accum11.x;
3636  o1_32_im = c*o1_32_im + accum11.y;
3637 #else
3638  o1_00_re = c*o1_00_re + accum0.x;
3639  o1_00_im = c*o1_00_im + accum0.y;
3640  o1_01_re = c*o1_01_re + accum0.z;
3641  o1_01_im = c*o1_01_im + accum0.w;
3642  o1_02_re = c*o1_02_re + accum1.x;
3643  o1_02_im = c*o1_02_im + accum1.y;
3644  o1_10_re = c*o1_10_re + accum1.z;
3645  o1_10_im = c*o1_10_im + accum1.w;
3646  o1_11_re = c*o1_11_re + accum2.x;
3647  o1_11_im = c*o1_11_im + accum2.y;
3648  o1_12_re = c*o1_12_re + accum2.z;
3649  o1_12_im = c*o1_12_im + accum2.w;
3650  o1_20_re = c*o1_20_re + accum3.x;
3651  o1_20_im = c*o1_20_im + accum3.y;
3652  o1_21_re = c*o1_21_re + accum3.z;
3653  o1_21_im = c*o1_21_im + accum3.w;
3654  o1_22_re = c*o1_22_re + accum4.x;
3655  o1_22_im = c*o1_22_im + accum4.y;
3656  o1_30_re = c*o1_30_re + accum4.z;
3657  o1_30_im = c*o1_30_im + accum4.w;
3658  o1_31_re = c*o1_31_re + accum5.x;
3659  o1_31_im = c*o1_31_im + accum5.y;
3660  o1_32_re = c*o1_32_re + accum5.z;
3661  o1_32_im = c*o1_32_im + accum5.w;
3662 #endif // SPINOR_DOUBLE
3663 
3664  }
3665  {
3666  sid += fl_stride;
3667  READ_ACCUM(ACCUMTEX, sp_stride)
3668 
3669 #ifdef SPINOR_DOUBLE
3670  o2_00_re = c*o2_00_re + accum0.x;
3671  o2_00_im = c*o2_00_im + accum0.y;
3672  o2_01_re = c*o2_01_re + accum1.x;
3673  o2_01_im = c*o2_01_im + accum1.y;
3674  o2_02_re = c*o2_02_re + accum2.x;
3675  o2_02_im = c*o2_02_im + accum2.y;
3676  o2_10_re = c*o2_10_re + accum3.x;
3677  o2_10_im = c*o2_10_im + accum3.y;
3678  o2_11_re = c*o2_11_re + accum4.x;
3679  o2_11_im = c*o2_11_im + accum4.y;
3680  o2_12_re = c*o2_12_re + accum5.x;
3681  o2_12_im = c*o2_12_im + accum5.y;
3682  o2_20_re = c*o2_20_re + accum6.x;
3683  o2_20_im = c*o2_20_im + accum6.y;
3684  o2_21_re = c*o2_21_re + accum7.x;
3685  o2_21_im = c*o2_21_im + accum7.y;
3686  o2_22_re = c*o2_22_re + accum8.x;
3687  o2_22_im = c*o2_22_im + accum8.y;
3688  o2_30_re = c*o2_30_re + accum9.x;
3689  o2_30_im = c*o2_30_im + accum9.y;
3690  o2_31_re = c*o2_31_re + accum10.x;
3691  o2_31_im = c*o2_31_im + accum10.y;
3692  o2_32_re = c*o2_32_re + accum11.x;
3693  o2_32_im = c*o2_32_im + accum11.y;
3694 #else
3695  o2_00_re = c*o2_00_re + accum0.x;
3696  o2_00_im = c*o2_00_im + accum0.y;
3697  o2_01_re = c*o2_01_re + accum0.z;
3698  o2_01_im = c*o2_01_im + accum0.w;
3699  o2_02_re = c*o2_02_re + accum1.x;
3700  o2_02_im = c*o2_02_im + accum1.y;
3701  o2_10_re = c*o2_10_re + accum1.z;
3702  o2_10_im = c*o2_10_im + accum1.w;
3703  o2_11_re = c*o2_11_re + accum2.x;
3704  o2_11_im = c*o2_11_im + accum2.y;
3705  o2_12_re = c*o2_12_re + accum2.z;
3706  o2_12_im = c*o2_12_im + accum2.w;
3707  o2_20_re = c*o2_20_re + accum3.x;
3708  o2_20_im = c*o2_20_im + accum3.y;
3709  o2_21_re = c*o2_21_re + accum3.z;
3710  o2_21_im = c*o2_21_im + accum3.w;
3711  o2_22_re = c*o2_22_re + accum4.x;
3712  o2_22_im = c*o2_22_im + accum4.y;
3713  o2_30_re = c*o2_30_re + accum4.z;
3714  o2_30_im = c*o2_30_im + accum4.w;
3715  o2_31_re = c*o2_31_re + accum5.x;
3716  o2_31_im = c*o2_31_im + accum5.y;
3717  o2_32_re = c*o2_32_re + accum5.z;
3718  o2_32_im = c*o2_32_im + accum5.w;
3719 #endif // SPINOR_DOUBLE
3720 
3721  }
3722  sid = tmp;
3723 #endif // DSLASH_XPAY
3724 }
3725 
3726 // write spinor field back to device memory
3728 
3729 // undefine to prevent warning when precision is changed
3730 #undef spinorFloat
3731 #undef g00_re
3732 #undef g00_im
3733 #undef g01_re
3734 #undef g01_im
3735 #undef g02_re
3736 #undef g02_im
3737 #undef g10_re
3738 #undef g10_im
3739 #undef g11_re
3740 #undef g11_im
3741 #undef g12_re
3742 #undef g12_im
3743 #undef g20_re
3744 #undef g20_im
3745 #undef g21_re
3746 #undef g21_im
3747 #undef g22_re
3748 #undef g22_im
3749 
3750 #undef i00_re
3751 #undef i00_im
3752 #undef i01_re
3753 #undef i01_im
3754 #undef i02_re
3755 #undef i02_im
3756 #undef i10_re
3757 #undef i10_im
3758 #undef i11_re
3759 #undef i11_im
3760 #undef i12_re
3761 #undef i12_im
3762 #undef i20_re
3763 #undef i20_im
3764 #undef i21_re
3765 #undef i21_im
3766 #undef i22_re
3767 #undef i22_im
3768 #undef i30_re
3769 #undef i30_im
3770 #undef i31_re
3771 #undef i31_im
3772 #undef i32_re
3773 #undef i32_im
3774 
3775 
3776 #undef VOLATILE