QUDA 0.9.0 — tm_ndeg_dslash_core.h: generated CUDA kernel core for the non-degenerate (two-flavor) twisted-mass dslash. (Documentation listing; some collapsed regions of the original source are omitted.)
1 // *** CUDA NDEG TWISTED MASS DSLASH ***
2 
3 // Arguments (double) mu, (double)eta and (double)delta
// This kernel core uses no shared memory per thread.
#define SHARED_TMNDEG_FLOATS_PER_THREAD 0
// The non-degenerate twisted-mass operator acts on a two-flavor doublet.
#define FLAVORS 2


// The NVVM back end (CUDA >= 4.1 targeting sm_20+) does not require the
// 'volatile' work-around needed by the older Open64 back end.
#if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
#define VOLATILE
#else // Open64 compiler
#define VOLATILE volatile
#endif
// input spinor
//
// The 12 complex components of the loaded spinor live in the registers
// I0..I11 (double2 each, SPINOR_DOUBLE) or I0..I5 (float4 each).  These
// macros expose them under the uniform names iSC_re / iSC_im, where
// S = spin index (0-3) and C = color index (0-2).
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I1.x
#define i01_im I1.y
#define i02_re I2.x
#define i02_im I2.y
#define i10_re I3.x
#define i10_im I3.y
#define i11_re I4.x
#define i11_im I4.y
#define i12_re I5.x
#define i12_im I5.y
#define i20_re I6.x
#define i20_im I6.y
#define i21_re I7.x
#define i21_im I7.y
#define i22_re I8.x
#define i22_im I8.y
#define i30_re I9.x
#define i30_im I9.y
#define i31_re I10.x
#define i31_im I10.y
#define i32_re I11.x
#define i32_im I11.y
#else
#define spinorFloat float
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I0.z
#define i01_im I0.w
#define i02_re I1.x
#define i02_im I1.y
#define i10_re I1.z
#define i10_im I1.w
#define i11_re I2.x
#define i11_im I2.y
#define i12_re I2.z
#define i12_im I2.w
#define i20_re I3.x
#define i20_im I3.y
#define i21_re I3.z
#define i21_im I3.w
#define i22_re I4.x
#define i22_im I4.y
#define i30_re I4.z
#define i30_im I4.w
#define i31_re I5.x
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif // SPINOR_DOUBLE
67 
// gauge link
//
// The 9 complex entries of the SU(3) link matrix live in the registers
// G0..G8 (float2/double2, GAUGE_FLOAT2) or G0..G4 (float4).  These macros
// expose them as gRC_re / gRC_im, where R = row and C = column (0-2).
#ifdef GAUGE_FLOAT2
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
#define g01_im G1.y
#define g02_re G2.x
#define g02_im G2.y
#define g10_re G3.x
#define g10_im G3.y
#define g11_re G4.x
#define g11_im G4.y
#define g12_re G5.x
#define g12_im G5.y
#define g20_re G6.x
#define g20_im G6.y
#define g21_re G7.x
#define g21_im G7.y
#define g22_re G8.x
#define g22_im G8.y

#else
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G0.z
#define g01_im G0.w
#define g02_re G1.x
#define g02_im G1.y
#define g10_re G1.z
#define g10_im G1.w
#define g11_re G2.x
#define g11_im G2.y
#define g12_re G2.z
#define g12_im G2.w
#define g20_re G3.x
#define g20_im G3.y
#define g21_re G3.z
#define g21_im G3.w
#define g22_re G4.x
#define g22_im G4.y

#endif // GAUGE_FLOAT2
110 
// conjugated gauge link
//
// gTRC = conj(gCR): the Hermitian conjugate (dagger) of the link matrix,
// used for hops in the backward direction.  Expressed purely in terms of
// the gRC_re/gRC_im macros above, so no extra registers are needed.
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT22_re (+g22_re)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_im (-g22_im)
130 
131 // output spinor for flavor 1
156 // output spinor for flavor 2
181 
182 #include "read_gauge.h"
183 #include "io_spinor.h"
184 
185 int coord[5];
186 int X;
187 
188 int sid;
189 
190 #ifdef MULTI_GPU
191 int face_idx;
192 if (kernel_type == INTERIOR_KERNEL) {
193 #endif
194 
195  sid = blockIdx.x*blockDim.x + threadIdx.x;
196  if (sid >= param.threads) return;
197 
198  // Assume even dimensions
200 
201  o1_00_re = 0; o1_00_im = 0;
202  o1_01_re = 0; o1_01_im = 0;
203  o1_02_re = 0; o1_02_im = 0;
204  o1_10_re = 0; o1_10_im = 0;
205  o1_11_re = 0; o1_11_im = 0;
206  o1_12_re = 0; o1_12_im = 0;
207  o1_20_re = 0; o1_20_im = 0;
208  o1_21_re = 0; o1_21_im = 0;
209  o1_22_re = 0; o1_22_im = 0;
210  o1_30_re = 0; o1_30_im = 0;
211  o1_31_re = 0; o1_31_im = 0;
212  o1_32_re = 0; o1_32_im = 0;
213 
214  o2_00_re = 0; o2_00_im = 0;
215  o2_01_re = 0; o2_01_im = 0;
216  o2_02_re = 0; o2_02_im = 0;
217  o2_10_re = 0; o2_10_im = 0;
218  o2_11_re = 0; o2_11_im = 0;
219  o2_12_re = 0; o2_12_im = 0;
220  o2_20_re = 0; o2_20_im = 0;
221  o2_21_re = 0; o2_21_im = 0;
222  o2_22_re = 0; o2_22_im = 0;
223  o2_30_re = 0; o2_30_im = 0;
224  o2_31_re = 0; o2_31_im = 0;
225  o2_32_re = 0; o2_32_im = 0;
226 
227 #ifdef MULTI_GPU
228 } else { // exterior kernel
229 
230  sid = blockIdx.x*blockDim.x + threadIdx.x;
231  if (sid >= param.threads) return;
232 
233  const int face_volume = (param.threads >> 1); // volume of one face (per flavor)
234  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
235  face_idx = sid - face_num*face_volume; // index into the respective face
236 
237  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
238  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
239  //sp_idx = face_idx + param.ghostOffset[dim];
240 
241  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
242 
243 
244  {
258 
259 
260  }
261  {
262  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid+param.fl_stride, sid+param.fl_stride);
275 
276 
277  }
278 }
279 #endif // MULTI_GPU
280 
281 
282 #ifdef MULTI_GPU
283 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
284  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
285 #endif
286 {
287  // Projector P0-
288  // 1 0 0 -i
289  // 0 1 -i 0
290  // 0 i 1 0
291  // i 0 0 1
292 
293 #ifdef MULTI_GPU
294  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
295  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
296 #if (DD_PREC==2) // half precision
297  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
298 #endif
299 #else
300  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
301 #endif
302 
303  const int ga_idx = sid;
304 
311 
312  // read gauge matrix from device memory
313  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
314 
315  // reconstruct gauge matrix
317 
{
#ifdef MULTI_GPU
  if (kernel_type == INTERIOR_KERNEL) {
#endif

    // read flavor 1 from device memory
    READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);

    // project spinor into half spinors (forward x-direction projector P0-)
    a0_re = +i00_re+i30_im;
    a0_im = +i00_im-i30_re;
    a1_re = +i01_re+i31_im;
    a1_im = +i01_im-i31_re;
    a2_re = +i02_re+i32_im;
    a2_im = +i02_im-i32_re;
    b0_re = +i10_re+i20_im;
    b0_im = +i10_im-i20_re;
    b1_re = +i11_re+i21_im;
    b1_im = +i11_im-i21_re;
    b2_re = +i12_re+i22_im;
    b2_im = +i12_im-i22_re;

#ifdef MULTI_GPU
  } else {

    // ghost zone holds FLAVORS half spinors per face site
    const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];

    // read half spinor for the first flavor from device memory
    // (already spin projected on the sending side)
    READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);

    a0_re = i00_re; a0_im = i00_im;
    a1_re = i01_re; a1_im = i01_im;
    a2_re = i02_re; a2_im = i02_im;
    b0_re = i10_re; b0_im = i10_im;
    b1_re = i11_re; b1_im = i11_im;
    b2_re = i12_re; b2_im = i12_im;

  }
#endif // MULTI_GPU

  // multiply row 0 -- A = U*a, B = U*b.
  // NOTE(review): the 'spinorFloat X = 0;' declarations were dropped by the
  // documentation listing (embedded lines 359/366/373/...); restored here to
  // match the byte-parallel flavor-2 block below.
  spinorFloat A0_re = 0;
  A0_re += g00_re * a0_re;
  A0_re -= g00_im * a0_im;
  A0_re += g01_re * a1_re;
  A0_re -= g01_im * a1_im;
  A0_re += g02_re * a2_re;
  A0_re -= g02_im * a2_im;
  spinorFloat A0_im = 0;
  A0_im += g00_re * a0_im;
  A0_im += g00_im * a0_re;
  A0_im += g01_re * a1_im;
  A0_im += g01_im * a1_re;
  A0_im += g02_re * a2_im;
  A0_im += g02_im * a2_re;
  spinorFloat B0_re = 0;
  B0_re += g00_re * b0_re;
  B0_re -= g00_im * b0_im;
  B0_re += g01_re * b1_re;
  B0_re -= g01_im * b1_im;
  B0_re += g02_re * b2_re;
  B0_re -= g02_im * b2_im;
  spinorFloat B0_im = 0;
  B0_im += g00_re * b0_im;
  B0_im += g00_im * b0_re;
  B0_im += g01_re * b1_im;
  B0_im += g01_im * b1_re;
  B0_im += g02_re * b2_im;
  B0_im += g02_im * b2_re;

  // multiply row 1
  spinorFloat A1_re = 0;
  A1_re += g10_re * a0_re;
  A1_re -= g10_im * a0_im;
  A1_re += g11_re * a1_re;
  A1_re -= g11_im * a1_im;
  A1_re += g12_re * a2_re;
  A1_re -= g12_im * a2_im;
  spinorFloat A1_im = 0;
  A1_im += g10_re * a0_im;
  A1_im += g10_im * a0_re;
  A1_im += g11_re * a1_im;
  A1_im += g11_im * a1_re;
  A1_im += g12_re * a2_im;
  A1_im += g12_im * a2_re;
  spinorFloat B1_re = 0;
  B1_re += g10_re * b0_re;
  B1_re -= g10_im * b0_im;
  B1_re += g11_re * b1_re;
  B1_re -= g11_im * b1_im;
  B1_re += g12_re * b2_re;
  B1_re -= g12_im * b2_im;
  spinorFloat B1_im = 0;
  B1_im += g10_re * b0_im;
  B1_im += g10_im * b0_re;
  B1_im += g11_re * b1_im;
  B1_im += g11_im * b1_re;
  B1_im += g12_re * b2_im;
  B1_im += g12_im * b2_re;

  // multiply row 2
  spinorFloat A2_re = 0;
  A2_re += g20_re * a0_re;
  A2_re -= g20_im * a0_im;
  A2_re += g21_re * a1_re;
  A2_re -= g21_im * a1_im;
  A2_re += g22_re * a2_re;
  A2_re -= g22_im * a2_im;
  spinorFloat A2_im = 0;
  A2_im += g20_re * a0_im;
  A2_im += g20_im * a0_re;
  A2_im += g21_re * a1_im;
  A2_im += g21_im * a1_re;
  A2_im += g22_re * a2_im;
  A2_im += g22_im * a2_re;
  spinorFloat B2_re = 0;
  B2_re += g20_re * b0_re;
  B2_re -= g20_im * b0_im;
  B2_re += g21_re * b1_re;
  B2_re -= g21_im * b1_im;
  B2_re += g22_re * b2_re;
  B2_re -= g22_im * b2_im;
  spinorFloat B2_im = 0;
  B2_im += g20_re * b0_im;
  B2_im += g20_im * b0_re;
  B2_im += g21_re * b1_im;
  B2_im += g21_im * b1_re;
  B2_im += g22_re * b2_im;
  B2_im += g22_im * b2_re;

  // reconstruct the projected half spinor and accumulate into the
  // flavor-1 output
  o1_00_re += A0_re;
  o1_00_im += A0_im;
  o1_10_re += B0_re;
  o1_10_im += B0_im;
  o1_20_re -= B0_im;
  o1_20_im += B0_re;
  o1_30_re -= A0_im;
  o1_30_im += A0_re;

  o1_01_re += A1_re;
  o1_01_im += A1_im;
  o1_11_re += B1_re;
  o1_11_im += B1_im;
  o1_21_re -= B1_im;
  o1_21_im += B1_re;
  o1_31_re -= A1_im;
  o1_31_im += A1_re;

  o1_02_re += A2_re;
  o1_02_im += A2_im;
  o1_12_re += B2_re;
  o1_12_im += B2_im;
  o1_22_re -= B2_im;
  o1_22_im += B2_re;
  o1_32_re -= A2_im;
  o1_32_im += A2_re;

}
476  {
477 #ifdef MULTI_GPU
478  if (kernel_type == INTERIOR_KERNEL) {
479 #endif
480 
481  // read flavor 2 from device memory
482  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
483 
484  // project spinor into half spinors
485  a0_re = +i00_re+i30_im;
486  a0_im = +i00_im-i30_re;
487  a1_re = +i01_re+i31_im;
488  a1_im = +i01_im-i31_re;
489  a2_re = +i02_re+i32_im;
490  a2_im = +i02_im-i32_re;
491  b0_re = +i10_re+i20_im;
492  b0_im = +i10_im-i20_re;
493  b1_re = +i11_re+i21_im;
494  b1_im = +i11_im-i21_re;
495  b2_re = +i12_re+i22_im;
496  b2_im = +i12_im-i22_re;
497 
498 #ifdef MULTI_GPU
499  } else {
500 
501  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
502 
503  // read half spinor for the second flavor from device memory
504  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
505  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],0);
506 
507  a0_re = i00_re; a0_im = i00_im;
508  a1_re = i01_re; a1_im = i01_im;
509  a2_re = i02_re; a2_im = i02_im;
510  b0_re = i10_re; b0_im = i10_im;
511  b1_re = i11_re; b1_im = i11_im;
512  b2_re = i12_re; b2_im = i12_im;
513 
514  }
515 #endif // MULTI_GPU
516 
517  // multiply row 0
518  spinorFloat A0_re = 0;
519  A0_re += g00_re * a0_re;
520  A0_re -= g00_im * a0_im;
521  A0_re += g01_re * a1_re;
522  A0_re -= g01_im * a1_im;
523  A0_re += g02_re * a2_re;
524  A0_re -= g02_im * a2_im;
525  spinorFloat A0_im = 0;
526  A0_im += g00_re * a0_im;
527  A0_im += g00_im * a0_re;
528  A0_im += g01_re * a1_im;
529  A0_im += g01_im * a1_re;
530  A0_im += g02_re * a2_im;
531  A0_im += g02_im * a2_re;
532  spinorFloat B0_re = 0;
533  B0_re += g00_re * b0_re;
534  B0_re -= g00_im * b0_im;
535  B0_re += g01_re * b1_re;
536  B0_re -= g01_im * b1_im;
537  B0_re += g02_re * b2_re;
538  B0_re -= g02_im * b2_im;
539  spinorFloat B0_im = 0;
540  B0_im += g00_re * b0_im;
541  B0_im += g00_im * b0_re;
542  B0_im += g01_re * b1_im;
543  B0_im += g01_im * b1_re;
544  B0_im += g02_re * b2_im;
545  B0_im += g02_im * b2_re;
546 
547  // multiply row 1
548  spinorFloat A1_re = 0;
549  A1_re += g10_re * a0_re;
550  A1_re -= g10_im * a0_im;
551  A1_re += g11_re * a1_re;
552  A1_re -= g11_im * a1_im;
553  A1_re += g12_re * a2_re;
554  A1_re -= g12_im * a2_im;
555  spinorFloat A1_im = 0;
556  A1_im += g10_re * a0_im;
557  A1_im += g10_im * a0_re;
558  A1_im += g11_re * a1_im;
559  A1_im += g11_im * a1_re;
560  A1_im += g12_re * a2_im;
561  A1_im += g12_im * a2_re;
562  spinorFloat B1_re = 0;
563  B1_re += g10_re * b0_re;
564  B1_re -= g10_im * b0_im;
565  B1_re += g11_re * b1_re;
566  B1_re -= g11_im * b1_im;
567  B1_re += g12_re * b2_re;
568  B1_re -= g12_im * b2_im;
569  spinorFloat B1_im = 0;
570  B1_im += g10_re * b0_im;
571  B1_im += g10_im * b0_re;
572  B1_im += g11_re * b1_im;
573  B1_im += g11_im * b1_re;
574  B1_im += g12_re * b2_im;
575  B1_im += g12_im * b2_re;
576 
577  // multiply row 2
578  spinorFloat A2_re = 0;
579  A2_re += g20_re * a0_re;
580  A2_re -= g20_im * a0_im;
581  A2_re += g21_re * a1_re;
582  A2_re -= g21_im * a1_im;
583  A2_re += g22_re * a2_re;
584  A2_re -= g22_im * a2_im;
585  spinorFloat A2_im = 0;
586  A2_im += g20_re * a0_im;
587  A2_im += g20_im * a0_re;
588  A2_im += g21_re * a1_im;
589  A2_im += g21_im * a1_re;
590  A2_im += g22_re * a2_im;
591  A2_im += g22_im * a2_re;
592  spinorFloat B2_re = 0;
593  B2_re += g20_re * b0_re;
594  B2_re -= g20_im * b0_im;
595  B2_re += g21_re * b1_re;
596  B2_re -= g21_im * b1_im;
597  B2_re += g22_re * b2_re;
598  B2_re -= g22_im * b2_im;
599  spinorFloat B2_im = 0;
600  B2_im += g20_re * b0_im;
601  B2_im += g20_im * b0_re;
602  B2_im += g21_re * b1_im;
603  B2_im += g21_im * b1_re;
604  B2_im += g22_re * b2_im;
605  B2_im += g22_im * b2_re;
606 
607  o2_00_re += A0_re;
608  o2_00_im += A0_im;
609  o2_10_re += B0_re;
610  o2_10_im += B0_im;
611  o2_20_re -= B0_im;
612  o2_20_im += B0_re;
613  o2_30_re -= A0_im;
614  o2_30_im += A0_re;
615 
616  o2_01_re += A1_re;
617  o2_01_im += A1_im;
618  o2_11_re += B1_re;
619  o2_11_im += B1_im;
620  o2_21_re -= B1_im;
621  o2_21_im += B1_re;
622  o2_31_re -= A1_im;
623  o2_31_im += A1_re;
624 
625  o2_02_re += A2_re;
626  o2_02_im += A2_im;
627  o2_12_re += B2_re;
628  o2_12_im += B2_im;
629  o2_22_re -= B2_im;
630  o2_22_im += B2_re;
631  o2_32_re -= A2_im;
632  o2_32_im += A2_re;
633 
634  }
635 }
636 
637 #ifdef MULTI_GPU
638 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
639  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
640 #endif
641 {
642  // Projector P0+
643  // 1 0 0 i
644  // 0 1 i 0
645  // 0 -i 1 0
646  // -i 0 0 1
647 
648 #ifdef MULTI_GPU
649  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
650  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
651 #if (DD_PREC==2) // half precision
652  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
653 #endif
654 #else
655  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
656 #endif
657 
658 #ifdef MULTI_GPU
659  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
660 #else
661  const int ga_idx = sp_idx;
662 #endif
663 
670 
671  // read gauge matrix from device memory
672  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
673 
674  // reconstruct gauge matrix
676 
677  {
678 #ifdef MULTI_GPU
679  if (kernel_type == INTERIOR_KERNEL) {
680 #endif
681 
682  // read flavor 1 from device memory
683  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
684 
685  // project spinor into half spinors
686  a0_re = +i00_re-i30_im;
687  a0_im = +i00_im+i30_re;
688  a1_re = +i01_re-i31_im;
689  a1_im = +i01_im+i31_re;
690  a2_re = +i02_re-i32_im;
691  a2_im = +i02_im+i32_re;
692  b0_re = +i10_re-i20_im;
693  b0_im = +i10_im+i20_re;
694  b1_re = +i11_re-i21_im;
695  b1_im = +i11_im+i21_re;
696  b2_re = +i12_re-i22_im;
697  b2_im = +i12_im+i22_re;
698 
699 #ifdef MULTI_GPU
700  } else {
701 
702  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
703 
704  // read half spinor for the first flavor from device memory
705  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
706 
707  a0_re = i00_re; a0_im = i00_im;
708  a1_re = i01_re; a1_im = i01_im;
709  a2_re = i02_re; a2_im = i02_im;
710  b0_re = i10_re; b0_im = i10_im;
711  b1_re = i11_re; b1_im = i11_im;
712  b2_re = i12_re; b2_im = i12_im;
713 
714  }
715 #endif // MULTI_GPU
716 
717  // multiply row 0
718  spinorFloat A0_re = 0;
719  A0_re += gT00_re * a0_re;
720  A0_re -= gT00_im * a0_im;
721  A0_re += gT01_re * a1_re;
722  A0_re -= gT01_im * a1_im;
723  A0_re += gT02_re * a2_re;
724  A0_re -= gT02_im * a2_im;
725  spinorFloat A0_im = 0;
726  A0_im += gT00_re * a0_im;
727  A0_im += gT00_im * a0_re;
728  A0_im += gT01_re * a1_im;
729  A0_im += gT01_im * a1_re;
730  A0_im += gT02_re * a2_im;
731  A0_im += gT02_im * a2_re;
732  spinorFloat B0_re = 0;
733  B0_re += gT00_re * b0_re;
734  B0_re -= gT00_im * b0_im;
735  B0_re += gT01_re * b1_re;
736  B0_re -= gT01_im * b1_im;
737  B0_re += gT02_re * b2_re;
738  B0_re -= gT02_im * b2_im;
739  spinorFloat B0_im = 0;
740  B0_im += gT00_re * b0_im;
741  B0_im += gT00_im * b0_re;
742  B0_im += gT01_re * b1_im;
743  B0_im += gT01_im * b1_re;
744  B0_im += gT02_re * b2_im;
745  B0_im += gT02_im * b2_re;
746 
747  // multiply row 1
748  spinorFloat A1_re = 0;
749  A1_re += gT10_re * a0_re;
750  A1_re -= gT10_im * a0_im;
751  A1_re += gT11_re * a1_re;
752  A1_re -= gT11_im * a1_im;
753  A1_re += gT12_re * a2_re;
754  A1_re -= gT12_im * a2_im;
755  spinorFloat A1_im = 0;
756  A1_im += gT10_re * a0_im;
757  A1_im += gT10_im * a0_re;
758  A1_im += gT11_re * a1_im;
759  A1_im += gT11_im * a1_re;
760  A1_im += gT12_re * a2_im;
761  A1_im += gT12_im * a2_re;
762  spinorFloat B1_re = 0;
763  B1_re += gT10_re * b0_re;
764  B1_re -= gT10_im * b0_im;
765  B1_re += gT11_re * b1_re;
766  B1_re -= gT11_im * b1_im;
767  B1_re += gT12_re * b2_re;
768  B1_re -= gT12_im * b2_im;
769  spinorFloat B1_im = 0;
770  B1_im += gT10_re * b0_im;
771  B1_im += gT10_im * b0_re;
772  B1_im += gT11_re * b1_im;
773  B1_im += gT11_im * b1_re;
774  B1_im += gT12_re * b2_im;
775  B1_im += gT12_im * b2_re;
776 
777  // multiply row 2
778  spinorFloat A2_re = 0;
779  A2_re += gT20_re * a0_re;
780  A2_re -= gT20_im * a0_im;
781  A2_re += gT21_re * a1_re;
782  A2_re -= gT21_im * a1_im;
783  A2_re += gT22_re * a2_re;
784  A2_re -= gT22_im * a2_im;
785  spinorFloat A2_im = 0;
786  A2_im += gT20_re * a0_im;
787  A2_im += gT20_im * a0_re;
788  A2_im += gT21_re * a1_im;
789  A2_im += gT21_im * a1_re;
790  A2_im += gT22_re * a2_im;
791  A2_im += gT22_im * a2_re;
792  spinorFloat B2_re = 0;
793  B2_re += gT20_re * b0_re;
794  B2_re -= gT20_im * b0_im;
795  B2_re += gT21_re * b1_re;
796  B2_re -= gT21_im * b1_im;
797  B2_re += gT22_re * b2_re;
798  B2_re -= gT22_im * b2_im;
799  spinorFloat B2_im = 0;
800  B2_im += gT20_re * b0_im;
801  B2_im += gT20_im * b0_re;
802  B2_im += gT21_re * b1_im;
803  B2_im += gT21_im * b1_re;
804  B2_im += gT22_re * b2_im;
805  B2_im += gT22_im * b2_re;
806 
807  o1_00_re += A0_re;
808  o1_00_im += A0_im;
809  o1_10_re += B0_re;
810  o1_10_im += B0_im;
811  o1_20_re += B0_im;
812  o1_20_im -= B0_re;
813  o1_30_re += A0_im;
814  o1_30_im -= A0_re;
815 
816  o1_01_re += A1_re;
817  o1_01_im += A1_im;
818  o1_11_re += B1_re;
819  o1_11_im += B1_im;
820  o1_21_re += B1_im;
821  o1_21_im -= B1_re;
822  o1_31_re += A1_im;
823  o1_31_im -= A1_re;
824 
825  o1_02_re += A2_re;
826  o1_02_im += A2_im;
827  o1_12_re += B2_re;
828  o1_12_im += B2_im;
829  o1_22_re += B2_im;
830  o1_22_im -= B2_re;
831  o1_32_re += A2_im;
832  o1_32_im -= A2_re;
833 
834  }
835  {
836 #ifdef MULTI_GPU
837  if (kernel_type == INTERIOR_KERNEL) {
838 #endif
839 
840  // read flavor 2 from device memory
841  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
842 
843  // project spinor into half spinors
844  a0_re = +i00_re-i30_im;
845  a0_im = +i00_im+i30_re;
846  a1_re = +i01_re-i31_im;
847  a1_im = +i01_im+i31_re;
848  a2_re = +i02_re-i32_im;
849  a2_im = +i02_im+i32_re;
850  b0_re = +i10_re-i20_im;
851  b0_im = +i10_im+i20_re;
852  b1_re = +i11_re-i21_im;
853  b1_im = +i11_im+i21_re;
854  b2_re = +i12_re-i22_im;
855  b2_im = +i12_im+i22_re;
856 
857 #ifdef MULTI_GPU
858  } else {
859 
860  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
861 
862  // read half spinor for the second flavor from device memory
863  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
864  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],1);
865 
866  a0_re = i00_re; a0_im = i00_im;
867  a1_re = i01_re; a1_im = i01_im;
868  a2_re = i02_re; a2_im = i02_im;
869  b0_re = i10_re; b0_im = i10_im;
870  b1_re = i11_re; b1_im = i11_im;
871  b2_re = i12_re; b2_im = i12_im;
872 
873  }
874 #endif // MULTI_GPU
875 
876  // multiply row 0
877  spinorFloat A0_re = 0;
878  A0_re += gT00_re * a0_re;
879  A0_re -= gT00_im * a0_im;
880  A0_re += gT01_re * a1_re;
881  A0_re -= gT01_im * a1_im;
882  A0_re += gT02_re * a2_re;
883  A0_re -= gT02_im * a2_im;
884  spinorFloat A0_im = 0;
885  A0_im += gT00_re * a0_im;
886  A0_im += gT00_im * a0_re;
887  A0_im += gT01_re * a1_im;
888  A0_im += gT01_im * a1_re;
889  A0_im += gT02_re * a2_im;
890  A0_im += gT02_im * a2_re;
891  spinorFloat B0_re = 0;
892  B0_re += gT00_re * b0_re;
893  B0_re -= gT00_im * b0_im;
894  B0_re += gT01_re * b1_re;
895  B0_re -= gT01_im * b1_im;
896  B0_re += gT02_re * b2_re;
897  B0_re -= gT02_im * b2_im;
898  spinorFloat B0_im = 0;
899  B0_im += gT00_re * b0_im;
900  B0_im += gT00_im * b0_re;
901  B0_im += gT01_re * b1_im;
902  B0_im += gT01_im * b1_re;
903  B0_im += gT02_re * b2_im;
904  B0_im += gT02_im * b2_re;
905 
906  // multiply row 1
907  spinorFloat A1_re = 0;
908  A1_re += gT10_re * a0_re;
909  A1_re -= gT10_im * a0_im;
910  A1_re += gT11_re * a1_re;
911  A1_re -= gT11_im * a1_im;
912  A1_re += gT12_re * a2_re;
913  A1_re -= gT12_im * a2_im;
914  spinorFloat A1_im = 0;
915  A1_im += gT10_re * a0_im;
916  A1_im += gT10_im * a0_re;
917  A1_im += gT11_re * a1_im;
918  A1_im += gT11_im * a1_re;
919  A1_im += gT12_re * a2_im;
920  A1_im += gT12_im * a2_re;
921  spinorFloat B1_re = 0;
922  B1_re += gT10_re * b0_re;
923  B1_re -= gT10_im * b0_im;
924  B1_re += gT11_re * b1_re;
925  B1_re -= gT11_im * b1_im;
926  B1_re += gT12_re * b2_re;
927  B1_re -= gT12_im * b2_im;
928  spinorFloat B1_im = 0;
929  B1_im += gT10_re * b0_im;
930  B1_im += gT10_im * b0_re;
931  B1_im += gT11_re * b1_im;
932  B1_im += gT11_im * b1_re;
933  B1_im += gT12_re * b2_im;
934  B1_im += gT12_im * b2_re;
935 
936  // multiply row 2
937  spinorFloat A2_re = 0;
938  A2_re += gT20_re * a0_re;
939  A2_re -= gT20_im * a0_im;
940  A2_re += gT21_re * a1_re;
941  A2_re -= gT21_im * a1_im;
942  A2_re += gT22_re * a2_re;
943  A2_re -= gT22_im * a2_im;
944  spinorFloat A2_im = 0;
945  A2_im += gT20_re * a0_im;
946  A2_im += gT20_im * a0_re;
947  A2_im += gT21_re * a1_im;
948  A2_im += gT21_im * a1_re;
949  A2_im += gT22_re * a2_im;
950  A2_im += gT22_im * a2_re;
951  spinorFloat B2_re = 0;
952  B2_re += gT20_re * b0_re;
953  B2_re -= gT20_im * b0_im;
954  B2_re += gT21_re * b1_re;
955  B2_re -= gT21_im * b1_im;
956  B2_re += gT22_re * b2_re;
957  B2_re -= gT22_im * b2_im;
958  spinorFloat B2_im = 0;
959  B2_im += gT20_re * b0_im;
960  B2_im += gT20_im * b0_re;
961  B2_im += gT21_re * b1_im;
962  B2_im += gT21_im * b1_re;
963  B2_im += gT22_re * b2_im;
964  B2_im += gT22_im * b2_re;
965 
966  o2_00_re += A0_re;
967  o2_00_im += A0_im;
968  o2_10_re += B0_re;
969  o2_10_im += B0_im;
970  o2_20_re += B0_im;
971  o2_20_im -= B0_re;
972  o2_30_re += A0_im;
973  o2_30_im -= A0_re;
974 
975  o2_01_re += A1_re;
976  o2_01_im += A1_im;
977  o2_11_re += B1_re;
978  o2_11_im += B1_im;
979  o2_21_re += B1_im;
980  o2_21_im -= B1_re;
981  o2_31_re += A1_im;
982  o2_31_im -= A1_re;
983 
984  o2_02_re += A2_re;
985  o2_02_im += A2_im;
986  o2_12_re += B2_re;
987  o2_12_im += B2_im;
988  o2_22_re += B2_im;
989  o2_22_im -= B2_re;
990  o2_32_re += A2_im;
991  o2_32_im -= A2_re;
992 
993  }
994 }
995 
996 #ifdef MULTI_GPU
997 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
998  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
999 #endif
1000 {
1001  // Projector P1-
1002  // 1 0 0 -1
1003  // 0 1 1 0
1004  // 0 1 1 0
1005  // -1 0 0 1
1006 
1007 #ifdef MULTI_GPU
1008  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
1009  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1010 #if (DD_PREC==2) // half precision
1011  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1012 #endif
1013 #else
1014  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
1015 #endif
1016 
1017  const int ga_idx = sid;
1018 
1025 
1026  // read gauge matrix from device memory
1027  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
1028 
1029  // reconstruct gauge matrix
1031 
1032  {
1033 #ifdef MULTI_GPU
1034  if (kernel_type == INTERIOR_KERNEL) {
1035 #endif
1036 
1037  // read flavor 1 from device memory
1038  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1039 
1040  // project spinor into half spinors
1041  a0_re = +i00_re-i30_re;
1042  a0_im = +i00_im-i30_im;
1043  a1_re = +i01_re-i31_re;
1044  a1_im = +i01_im-i31_im;
1045  a2_re = +i02_re-i32_re;
1046  a2_im = +i02_im-i32_im;
1047  b0_re = +i10_re+i20_re;
1048  b0_im = +i10_im+i20_im;
1049  b1_re = +i11_re+i21_re;
1050  b1_im = +i11_im+i21_im;
1051  b2_re = +i12_re+i22_re;
1052  b2_im = +i12_im+i22_im;
1053 
1054 #ifdef MULTI_GPU
1055  } else {
1056 
1057  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1058 
1059  // read half spinor for the first flavor from device memory
1060  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
1061 
1062  a0_re = i00_re; a0_im = i00_im;
1063  a1_re = i01_re; a1_im = i01_im;
1064  a2_re = i02_re; a2_im = i02_im;
1065  b0_re = i10_re; b0_im = i10_im;
1066  b1_re = i11_re; b1_im = i11_im;
1067  b2_re = i12_re; b2_im = i12_im;
1068 
1069  }
1070 #endif // MULTI_GPU
1071 
1072  // multiply row 0
1073  spinorFloat A0_re = 0;
1074  A0_re += g00_re * a0_re;
1075  A0_re -= g00_im * a0_im;
1076  A0_re += g01_re * a1_re;
1077  A0_re -= g01_im * a1_im;
1078  A0_re += g02_re * a2_re;
1079  A0_re -= g02_im * a2_im;
1080  spinorFloat A0_im = 0;
1081  A0_im += g00_re * a0_im;
1082  A0_im += g00_im * a0_re;
1083  A0_im += g01_re * a1_im;
1084  A0_im += g01_im * a1_re;
1085  A0_im += g02_re * a2_im;
1086  A0_im += g02_im * a2_re;
1087  spinorFloat B0_re = 0;
1088  B0_re += g00_re * b0_re;
1089  B0_re -= g00_im * b0_im;
1090  B0_re += g01_re * b1_re;
1091  B0_re -= g01_im * b1_im;
1092  B0_re += g02_re * b2_re;
1093  B0_re -= g02_im * b2_im;
1094  spinorFloat B0_im = 0;
1095  B0_im += g00_re * b0_im;
1096  B0_im += g00_im * b0_re;
1097  B0_im += g01_re * b1_im;
1098  B0_im += g01_im * b1_re;
1099  B0_im += g02_re * b2_im;
1100  B0_im += g02_im * b2_re;
1101 
1102  // multiply row 1
1103  spinorFloat A1_re = 0;
1104  A1_re += g10_re * a0_re;
1105  A1_re -= g10_im * a0_im;
1106  A1_re += g11_re * a1_re;
1107  A1_re -= g11_im * a1_im;
1108  A1_re += g12_re * a2_re;
1109  A1_re -= g12_im * a2_im;
1110  spinorFloat A1_im = 0;
1111  A1_im += g10_re * a0_im;
1112  A1_im += g10_im * a0_re;
1113  A1_im += g11_re * a1_im;
1114  A1_im += g11_im * a1_re;
1115  A1_im += g12_re * a2_im;
1116  A1_im += g12_im * a2_re;
1117  spinorFloat B1_re = 0;
1118  B1_re += g10_re * b0_re;
1119  B1_re -= g10_im * b0_im;
1120  B1_re += g11_re * b1_re;
1121  B1_re -= g11_im * b1_im;
1122  B1_re += g12_re * b2_re;
1123  B1_re -= g12_im * b2_im;
1124  spinorFloat B1_im = 0;
1125  B1_im += g10_re * b0_im;
1126  B1_im += g10_im * b0_re;
1127  B1_im += g11_re * b1_im;
1128  B1_im += g11_im * b1_re;
1129  B1_im += g12_re * b2_im;
1130  B1_im += g12_im * b2_re;
1131 
1132  // multiply row 2
1133  spinorFloat A2_re = 0;
1134  A2_re += g20_re * a0_re;
1135  A2_re -= g20_im * a0_im;
1136  A2_re += g21_re * a1_re;
1137  A2_re -= g21_im * a1_im;
1138  A2_re += g22_re * a2_re;
1139  A2_re -= g22_im * a2_im;
1140  spinorFloat A2_im = 0;
1141  A2_im += g20_re * a0_im;
1142  A2_im += g20_im * a0_re;
1143  A2_im += g21_re * a1_im;
1144  A2_im += g21_im * a1_re;
1145  A2_im += g22_re * a2_im;
1146  A2_im += g22_im * a2_re;
1147  spinorFloat B2_re = 0;
1148  B2_re += g20_re * b0_re;
1149  B2_re -= g20_im * b0_im;
1150  B2_re += g21_re * b1_re;
1151  B2_re -= g21_im * b1_im;
1152  B2_re += g22_re * b2_re;
1153  B2_re -= g22_im * b2_im;
1154  spinorFloat B2_im = 0;
1155  B2_im += g20_re * b0_im;
1156  B2_im += g20_im * b0_re;
1157  B2_im += g21_re * b1_im;
1158  B2_im += g21_im * b1_re;
1159  B2_im += g22_re * b2_im;
1160  B2_im += g22_im * b2_re;
1161 
1162  o1_00_re += A0_re;
1163  o1_00_im += A0_im;
1164  o1_10_re += B0_re;
1165  o1_10_im += B0_im;
1166  o1_20_re += B0_re;
1167  o1_20_im += B0_im;
1168  o1_30_re -= A0_re;
1169  o1_30_im -= A0_im;
1170 
1171  o1_01_re += A1_re;
1172  o1_01_im += A1_im;
1173  o1_11_re += B1_re;
1174  o1_11_im += B1_im;
1175  o1_21_re += B1_re;
1176  o1_21_im += B1_im;
1177  o1_31_re -= A1_re;
1178  o1_31_im -= A1_im;
1179 
1180  o1_02_re += A2_re;
1181  o1_02_im += A2_im;
1182  o1_12_re += B2_re;
1183  o1_12_im += B2_im;
1184  o1_22_re += B2_re;
1185  o1_22_im += B2_im;
1186  o1_32_re -= A2_re;
1187  o1_32_im -= A2_im;
1188 
1189  }
1190  {
  // Second-flavor (flavor 2) contribution of this hop. Interior threads
  // load the full spinor at sp_idx + param.fl_stride (the flavor offset)
  // and apply the spin projection inline; MULTI_GPU exterior threads
  // instead load an already-projected half spinor from the ghost buffer
  // (the trailing 2 presumably selects this direction's ghost projector
  // data — TODO confirm against READ_SPINOR_GHOST's definition).
1191 #ifdef MULTI_GPU
1192  if (kernel_type == INTERIOR_KERNEL) {
1193 #endif
1194 
1195  // read flavor 2 from device memory
1196  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1197 
1198  // project spinor into half spinors
1199  a0_re = +i00_re-i30_re;
1200  a0_im = +i00_im-i30_im;
1201  a1_re = +i01_re-i31_re;
1202  a1_im = +i01_im-i31_im;
1203  a2_re = +i02_re-i32_re;
1204  a2_im = +i02_im-i32_im;
1205  b0_re = +i10_re+i20_re;
1206  b0_im = +i10_im+i20_im;
1207  b1_re = +i11_re+i21_re;
1208  b1_im = +i11_im+i21_im;
1209  b2_re = +i12_re+i22_re;
1210  b2_im = +i12_im+i22_im;
1211 
1212 #ifdef MULTI_GPU
1213  } else {
1214 
  // Ghost buffer holds both flavors back to back: stride is padded by
  // FLAVORS, and flavor 2 sits one ghostFace past the flavor-1 data.
1215  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1216 
1217  // read half spinor for the second flavor from device memory
1218  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
1219  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],2);
1220 
  // Ghost data is pre-projected: the two half-spinor color vectors come
  // straight from the first six complex components of the read.
1221  a0_re = i00_re; a0_im = i00_im;
1222  a1_re = i01_re; a1_im = i01_im;
1223  a2_re = i02_re; a2_im = i02_im;
1224  b0_re = i10_re; b0_im = i10_im;
1225  b1_re = i11_re; b1_im = i11_im;
1226  b2_re = i12_re; b2_im = i12_im;
1227 
1228  }
1229 #endif // MULTI_GPU
1230 
  // A = U*a and B = U*b: 3x3 complex gauge link applied to the two
  // half-spinor color vectors, fully expanded into real arithmetic.
1231  // multiply row 0
1232  spinorFloat A0_re = 0;
1233  A0_re += g00_re * a0_re;
1234  A0_re -= g00_im * a0_im;
1235  A0_re += g01_re * a1_re;
1236  A0_re -= g01_im * a1_im;
1237  A0_re += g02_re * a2_re;
1238  A0_re -= g02_im * a2_im;
1239  spinorFloat A0_im = 0;
1240  A0_im += g00_re * a0_im;
1241  A0_im += g00_im * a0_re;
1242  A0_im += g01_re * a1_im;
1243  A0_im += g01_im * a1_re;
1244  A0_im += g02_re * a2_im;
1245  A0_im += g02_im * a2_re;
1246  spinorFloat B0_re = 0;
1247  B0_re += g00_re * b0_re;
1248  B0_re -= g00_im * b0_im;
1249  B0_re += g01_re * b1_re;
1250  B0_re -= g01_im * b1_im;
1251  B0_re += g02_re * b2_re;
1252  B0_re -= g02_im * b2_im;
1253  spinorFloat B0_im = 0;
1254  B0_im += g00_re * b0_im;
1255  B0_im += g00_im * b0_re;
1256  B0_im += g01_re * b1_im;
1257  B0_im += g01_im * b1_re;
1258  B0_im += g02_re * b2_im;
1259  B0_im += g02_im * b2_re;
1260 
1261  // multiply row 1
1262  spinorFloat A1_re = 0;
1263  A1_re += g10_re * a0_re;
1264  A1_re -= g10_im * a0_im;
1265  A1_re += g11_re * a1_re;
1266  A1_re -= g11_im * a1_im;
1267  A1_re += g12_re * a2_re;
1268  A1_re -= g12_im * a2_im;
1269  spinorFloat A1_im = 0;
1270  A1_im += g10_re * a0_im;
1271  A1_im += g10_im * a0_re;
1272  A1_im += g11_re * a1_im;
1273  A1_im += g11_im * a1_re;
1274  A1_im += g12_re * a2_im;
1275  A1_im += g12_im * a2_re;
1276  spinorFloat B1_re = 0;
1277  B1_re += g10_re * b0_re;
1278  B1_re -= g10_im * b0_im;
1279  B1_re += g11_re * b1_re;
1280  B1_re -= g11_im * b1_im;
1281  B1_re += g12_re * b2_re;
1282  B1_re -= g12_im * b2_im;
1283  spinorFloat B1_im = 0;
1284  B1_im += g10_re * b0_im;
1285  B1_im += g10_im * b0_re;
1286  B1_im += g11_re * b1_im;
1287  B1_im += g11_im * b1_re;
1288  B1_im += g12_re * b2_im;
1289  B1_im += g12_im * b2_re;
1290 
1291  // multiply row 2
1292  spinorFloat A2_re = 0;
1293  A2_re += g20_re * a0_re;
1294  A2_re -= g20_im * a0_im;
1295  A2_re += g21_re * a1_re;
1296  A2_re -= g21_im * a1_im;
1297  A2_re += g22_re * a2_re;
1298  A2_re -= g22_im * a2_im;
1299  spinorFloat A2_im = 0;
1300  A2_im += g20_re * a0_im;
1301  A2_im += g20_im * a0_re;
1302  A2_im += g21_re * a1_im;
1303  A2_im += g21_im * a1_re;
1304  A2_im += g22_re * a2_im;
1305  A2_im += g22_im * a2_re;
1306  spinorFloat B2_re = 0;
1307  B2_re += g20_re * b0_re;
1308  B2_re -= g20_im * b0_im;
1309  B2_re += g21_re * b1_re;
1310  B2_re -= g21_im * b1_im;
1311  B2_re += g22_re * b2_re;
1312  B2_re -= g22_im * b2_im;
1313  spinorFloat B2_im = 0;
1314  B2_im += g20_re * b0_im;
1315  B2_im += g20_im * b0_re;
1316  B2_im += g21_re * b1_im;
1317  B2_im += g21_im * b1_re;
1318  B2_im += g22_re * b2_im;
1319  B2_im += g22_im * b2_re;
1320 
  // Reconstruct the full 4-spinor from the half spinor and accumulate
  // into the flavor-2 output: upper spin rows get (A,B); the lower rows
  // follow this projector's sign pattern (row 2 = +B, row 3 = -A).
1321  o2_00_re += A0_re;
1322  o2_00_im += A0_im;
1323  o2_10_re += B0_re;
1324  o2_10_im += B0_im;
1325  o2_20_re += B0_re;
1326  o2_20_im += B0_im;
1327  o2_30_re -= A0_re;
1328  o2_30_im -= A0_im;
1329 
1330  o2_01_re += A1_re;
1331  o2_01_im += A1_im;
1332  o2_11_re += B1_re;
1333  o2_11_im += B1_im;
1334  o2_21_re += B1_re;
1335  o2_21_im += B1_im;
1336  o2_31_re -= A1_re;
1337  o2_31_im -= A1_im;
1338 
1339  o2_02_re += A2_re;
1340  o2_02_im += A2_im;
1341  o2_12_re += B2_re;
1342  o2_12_im += B2_im;
1343  o2_22_re += B2_re;
1344  o2_22_im += B2_im;
1345  o2_32_re -= A2_re;
1346  o2_32_im -= A2_im;
1347 
1348  }
1349 }
1350 
1351 #ifdef MULTI_GPU
1352 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1353  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1354 #endif
1355 {
  // Backward hop in the Y direction. The P1+ projector below is applied
  // to the neighbor spinor at X - param.dc.X[0] (one step back in Y,
  // wrapping to X + param.dc.X2X1mX1 at coord[1]==0), and the link is
  // read from the neighbor/face site and applied daggered (gT).
1356  // Projector P1+
1357  // 1 0 0 1
1358  // 0 1 -1 0
1359  // 0 -1 1 0
1360  // 1 0 0 1
1361 
1362 #ifdef MULTI_GPU
1363  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1364  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
  // Half precision keeps per-site norms in a separate ghost buffer.
1365 #if (DD_PREC==2) // half precision
1366  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1367 #endif
1368 #else
1369  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1370 #endif
1371 
  // Link index: interior threads use the neighbor's (even/odd) site
  // index; exterior threads index the face links stored past Vh.
1372 #ifdef MULTI_GPU
1373  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1374 #else
1375  const int ga_idx = sp_idx;
1376 #endif
1377 
1384 
1385  // read gauge matrix from device memory
1386  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1387 
1388  // reconstruct gauge matrix
1390 
1391  {
  // Flavor 1: load/project, multiply by the daggered link, accumulate.
1392 #ifdef MULTI_GPU
1393  if (kernel_type == INTERIOR_KERNEL) {
1394 #endif
1395 
1396  // read flavor 1 from device memory
1397  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1398 
1399  // project spinor into half spinors
1400  a0_re = +i00_re+i30_re;
1401  a0_im = +i00_im+i30_im;
1402  a1_re = +i01_re+i31_re;
1403  a1_im = +i01_im+i31_im;
1404  a2_re = +i02_re+i32_re;
1405  a2_im = +i02_im+i32_im;
1406  b0_re = +i10_re-i20_re;
1407  b0_im = +i10_im-i20_im;
1408  b1_re = +i11_re-i21_re;
1409  b1_im = +i11_im-i21_im;
1410  b2_re = +i12_re-i22_re;
1411  b2_im = +i12_im-i22_im;
1412 
1413 #ifdef MULTI_GPU
1414  } else {
1415 
1416  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1417 
1418  // read half spinor for the first flavor from device memory
1419  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1420 
  // Ghost data is pre-projected; copy the half spinor straight through.
1421  a0_re = i00_re; a0_im = i00_im;
1422  a1_re = i01_re; a1_im = i01_im;
1423  a2_re = i02_re; a2_im = i02_im;
1424  b0_re = i10_re; b0_im = i10_im;
1425  b1_re = i11_re; b1_im = i11_im;
1426  b2_re = i12_re; b2_im = i12_im;
1427 
1428  }
1429 #endif // MULTI_GPU
1430 
  // A = U^dagger * a and B = U^dagger * b, expanded into real arithmetic
  // (gT* are the conjugate-transposed link elements).
1431  // multiply row 0
1432  spinorFloat A0_re = 0;
1433  A0_re += gT00_re * a0_re;
1434  A0_re -= gT00_im * a0_im;
1435  A0_re += gT01_re * a1_re;
1436  A0_re -= gT01_im * a1_im;
1437  A0_re += gT02_re * a2_re;
1438  A0_re -= gT02_im * a2_im;
1439  spinorFloat A0_im = 0;
1440  A0_im += gT00_re * a0_im;
1441  A0_im += gT00_im * a0_re;
1442  A0_im += gT01_re * a1_im;
1443  A0_im += gT01_im * a1_re;
1444  A0_im += gT02_re * a2_im;
1445  A0_im += gT02_im * a2_re;
1446  spinorFloat B0_re = 0;
1447  B0_re += gT00_re * b0_re;
1448  B0_re -= gT00_im * b0_im;
1449  B0_re += gT01_re * b1_re;
1450  B0_re -= gT01_im * b1_im;
1451  B0_re += gT02_re * b2_re;
1452  B0_re -= gT02_im * b2_im;
1453  spinorFloat B0_im = 0;
1454  B0_im += gT00_re * b0_im;
1455  B0_im += gT00_im * b0_re;
1456  B0_im += gT01_re * b1_im;
1457  B0_im += gT01_im * b1_re;
1458  B0_im += gT02_re * b2_im;
1459  B0_im += gT02_im * b2_re;
1460 
1461  // multiply row 1
1462  spinorFloat A1_re = 0;
1463  A1_re += gT10_re * a0_re;
1464  A1_re -= gT10_im * a0_im;
1465  A1_re += gT11_re * a1_re;
1466  A1_re -= gT11_im * a1_im;
1467  A1_re += gT12_re * a2_re;
1468  A1_re -= gT12_im * a2_im;
1469  spinorFloat A1_im = 0;
1470  A1_im += gT10_re * a0_im;
1471  A1_im += gT10_im * a0_re;
1472  A1_im += gT11_re * a1_im;
1473  A1_im += gT11_im * a1_re;
1474  A1_im += gT12_re * a2_im;
1475  A1_im += gT12_im * a2_re;
1476  spinorFloat B1_re = 0;
1477  B1_re += gT10_re * b0_re;
1478  B1_re -= gT10_im * b0_im;
1479  B1_re += gT11_re * b1_re;
1480  B1_re -= gT11_im * b1_im;
1481  B1_re += gT12_re * b2_re;
1482  B1_re -= gT12_im * b2_im;
1483  spinorFloat B1_im = 0;
1484  B1_im += gT10_re * b0_im;
1485  B1_im += gT10_im * b0_re;
1486  B1_im += gT11_re * b1_im;
1487  B1_im += gT11_im * b1_re;
1488  B1_im += gT12_re * b2_im;
1489  B1_im += gT12_im * b2_re;
1490 
1491  // multiply row 2
1492  spinorFloat A2_re = 0;
1493  A2_re += gT20_re * a0_re;
1494  A2_re -= gT20_im * a0_im;
1495  A2_re += gT21_re * a1_re;
1496  A2_re -= gT21_im * a1_im;
1497  A2_re += gT22_re * a2_re;
1498  A2_re -= gT22_im * a2_im;
1499  spinorFloat A2_im = 0;
1500  A2_im += gT20_re * a0_im;
1501  A2_im += gT20_im * a0_re;
1502  A2_im += gT21_re * a1_im;
1503  A2_im += gT21_im * a1_re;
1504  A2_im += gT22_re * a2_im;
1505  A2_im += gT22_im * a2_re;
1506  spinorFloat B2_re = 0;
1507  B2_re += gT20_re * b0_re;
1508  B2_re -= gT20_im * b0_im;
1509  B2_re += gT21_re * b1_re;
1510  B2_re -= gT21_im * b1_im;
1511  B2_re += gT22_re * b2_re;
1512  B2_re -= gT22_im * b2_im;
1513  spinorFloat B2_im = 0;
1514  B2_im += gT20_re * b0_im;
1515  B2_im += gT20_im * b0_re;
1516  B2_im += gT21_re * b1_im;
1517  B2_im += gT21_im * b1_re;
1518  B2_im += gT22_re * b2_im;
1519  B2_im += gT22_im * b2_re;
1520 
  // Accumulate into flavor-1 output; per P1+ the lower spin rows are
  // row 2 = -B, row 3 = +A.
1521  o1_00_re += A0_re;
1522  o1_00_im += A0_im;
1523  o1_10_re += B0_re;
1524  o1_10_im += B0_im;
1525  o1_20_re -= B0_re;
1526  o1_20_im -= B0_im;
1527  o1_30_re += A0_re;
1528  o1_30_im += A0_im;
1529 
1530  o1_01_re += A1_re;
1531  o1_01_im += A1_im;
1532  o1_11_re += B1_re;
1533  o1_11_im += B1_im;
1534  o1_21_re -= B1_re;
1535  o1_21_im -= B1_im;
1536  o1_31_re += A1_re;
1537  o1_31_im += A1_im;
1538 
1539  o1_02_re += A2_re;
1540  o1_02_im += A2_im;
1541  o1_12_re += B2_re;
1542  o1_12_im += B2_im;
1543  o1_22_re -= B2_re;
1544  o1_22_im -= B2_im;
1545  o1_32_re += A2_re;
1546  o1_32_im += A2_im;
1547 
1548  }
1549  {
  // Flavor 2: identical arithmetic, reading the spinor at the flavor
  // offset (interior) or one ghostFace further into the ghost buffer.
1550 #ifdef MULTI_GPU
1551  if (kernel_type == INTERIOR_KERNEL) {
1552 #endif
1553 
1554  // read flavor 2 from device memory
1555  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1556 
1557  // project spinor into half spinors
1558  a0_re = +i00_re+i30_re;
1559  a0_im = +i00_im+i30_im;
1560  a1_re = +i01_re+i31_re;
1561  a1_im = +i01_im+i31_im;
1562  a2_re = +i02_re+i32_re;
1563  a2_im = +i02_im+i32_im;
1564  b0_re = +i10_re-i20_re;
1565  b0_im = +i10_im-i20_im;
1566  b1_re = +i11_re-i21_re;
1567  b1_im = +i11_im-i21_im;
1568  b2_re = +i12_re-i22_re;
1569  b2_im = +i12_im-i22_im;
1570 
1571 #ifdef MULTI_GPU
1572  } else {
1573 
1574  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1575 
1576  // read half spinor for the second flavor from device memory
1577  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
1578  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],3);
1579 
1580  a0_re = i00_re; a0_im = i00_im;
1581  a1_re = i01_re; a1_im = i01_im;
1582  a2_re = i02_re; a2_im = i02_im;
1583  b0_re = i10_re; b0_im = i10_im;
1584  b1_re = i11_re; b1_im = i11_im;
1585  b2_re = i12_re; b2_im = i12_im;
1586 
1587  }
1588 #endif // MULTI_GPU
1589 
1590  // multiply row 0
1591  spinorFloat A0_re = 0;
1592  A0_re += gT00_re * a0_re;
1593  A0_re -= gT00_im * a0_im;
1594  A0_re += gT01_re * a1_re;
1595  A0_re -= gT01_im * a1_im;
1596  A0_re += gT02_re * a2_re;
1597  A0_re -= gT02_im * a2_im;
1598  spinorFloat A0_im = 0;
1599  A0_im += gT00_re * a0_im;
1600  A0_im += gT00_im * a0_re;
1601  A0_im += gT01_re * a1_im;
1602  A0_im += gT01_im * a1_re;
1603  A0_im += gT02_re * a2_im;
1604  A0_im += gT02_im * a2_re;
1605  spinorFloat B0_re = 0;
1606  B0_re += gT00_re * b0_re;
1607  B0_re -= gT00_im * b0_im;
1608  B0_re += gT01_re * b1_re;
1609  B0_re -= gT01_im * b1_im;
1610  B0_re += gT02_re * b2_re;
1611  B0_re -= gT02_im * b2_im;
1612  spinorFloat B0_im = 0;
1613  B0_im += gT00_re * b0_im;
1614  B0_im += gT00_im * b0_re;
1615  B0_im += gT01_re * b1_im;
1616  B0_im += gT01_im * b1_re;
1617  B0_im += gT02_re * b2_im;
1618  B0_im += gT02_im * b2_re;
1619 
1620  // multiply row 1
1621  spinorFloat A1_re = 0;
1622  A1_re += gT10_re * a0_re;
1623  A1_re -= gT10_im * a0_im;
1624  A1_re += gT11_re * a1_re;
1625  A1_re -= gT11_im * a1_im;
1626  A1_re += gT12_re * a2_re;
1627  A1_re -= gT12_im * a2_im;
1628  spinorFloat A1_im = 0;
1629  A1_im += gT10_re * a0_im;
1630  A1_im += gT10_im * a0_re;
1631  A1_im += gT11_re * a1_im;
1632  A1_im += gT11_im * a1_re;
1633  A1_im += gT12_re * a2_im;
1634  A1_im += gT12_im * a2_re;
1635  spinorFloat B1_re = 0;
1636  B1_re += gT10_re * b0_re;
1637  B1_re -= gT10_im * b0_im;
1638  B1_re += gT11_re * b1_re;
1639  B1_re -= gT11_im * b1_im;
1640  B1_re += gT12_re * b2_re;
1641  B1_re -= gT12_im * b2_im;
1642  spinorFloat B1_im = 0;
1643  B1_im += gT10_re * b0_im;
1644  B1_im += gT10_im * b0_re;
1645  B1_im += gT11_re * b1_im;
1646  B1_im += gT11_im * b1_re;
1647  B1_im += gT12_re * b2_im;
1648  B1_im += gT12_im * b2_re;
1649 
1650  // multiply row 2
1651  spinorFloat A2_re = 0;
1652  A2_re += gT20_re * a0_re;
1653  A2_re -= gT20_im * a0_im;
1654  A2_re += gT21_re * a1_re;
1655  A2_re -= gT21_im * a1_im;
1656  A2_re += gT22_re * a2_re;
1657  A2_re -= gT22_im * a2_im;
1658  spinorFloat A2_im = 0;
1659  A2_im += gT20_re * a0_im;
1660  A2_im += gT20_im * a0_re;
1661  A2_im += gT21_re * a1_im;
1662  A2_im += gT21_im * a1_re;
1663  A2_im += gT22_re * a2_im;
1664  A2_im += gT22_im * a2_re;
1665  spinorFloat B2_re = 0;
1666  B2_re += gT20_re * b0_re;
1667  B2_re -= gT20_im * b0_im;
1668  B2_re += gT21_re * b1_re;
1669  B2_re -= gT21_im * b1_im;
1670  B2_re += gT22_re * b2_re;
1671  B2_re -= gT22_im * b2_im;
1672  spinorFloat B2_im = 0;
1673  B2_im += gT20_re * b0_im;
1674  B2_im += gT20_im * b0_re;
1675  B2_im += gT21_re * b1_im;
1676  B2_im += gT21_im * b1_re;
1677  B2_im += gT22_re * b2_im;
1678  B2_im += gT22_im * b2_re;
1679 
  // Accumulate into flavor-2 output with the same P1+ sign pattern
  // (row 2 = -B, row 3 = +A).
1680  o2_00_re += A0_re;
1681  o2_00_im += A0_im;
1682  o2_10_re += B0_re;
1683  o2_10_im += B0_im;
1684  o2_20_re -= B0_re;
1685  o2_20_im -= B0_im;
1686  o2_30_re += A0_re;
1687  o2_30_im += A0_im;
1688 
1689  o2_01_re += A1_re;
1690  o2_01_im += A1_im;
1691  o2_11_re += B1_re;
1692  o2_11_im += B1_im;
1693  o2_21_re -= B1_re;
1694  o2_21_im -= B1_im;
1695  o2_31_re += A1_re;
1696  o2_31_im += A1_im;
1697 
1698  o2_02_re += A2_re;
1699  o2_02_im += A2_im;
1700  o2_12_re += B2_re;
1701  o2_12_im += B2_im;
1702  o2_22_re -= B2_re;
1703  o2_22_im -= B2_im;
1704  o2_32_re += A2_re;
1705  o2_32_im += A2_im;
1706 
1707  }
1708 }
1709 
1710 #ifdef MULTI_GPU
1711 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1712  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1713 #endif
1714 {
  // Forward hop in the Z direction. The P2- projector below is applied
  // to the neighbor spinor at X + param.dc.X2X1 (one step forward in Z,
  // wrapping to X - param.dc.X3X2X1mX2X1 at the top boundary); the link
  // lives at this site (ga_idx = sid) and is applied un-daggered.
1715  // Projector P2-
1716  // 1 0 -i 0
1717  // 0 1 0 i
1718  // i 0 1 0
1719  // 0 -i 0 1
1720 
1721 #ifdef MULTI_GPU
1722  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1723  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
  // Half precision keeps per-site norms in a separate ghost buffer.
1724 #if (DD_PREC==2) // half precision
1725  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1726 #endif
1727 #else
1728  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1729 #endif
1730 
1731  const int ga_idx = sid;
1732 
1739 
1740  // read gauge matrix from device memory
1741  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1742 
1743  // reconstruct gauge matrix
1745 
1746  {
  // Flavor 1: load/project, multiply by the link, accumulate.
1747 #ifdef MULTI_GPU
1748  if (kernel_type == INTERIOR_KERNEL) {
1749 #endif
1750 
1751  // read flavor 1 from device memory
1752  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1753 
1754  // project spinor into half spinors
1755  a0_re = +i00_re+i20_im;
1756  a0_im = +i00_im-i20_re;
1757  a1_re = +i01_re+i21_im;
1758  a1_im = +i01_im-i21_re;
1759  a2_re = +i02_re+i22_im;
1760  a2_im = +i02_im-i22_re;
1761  b0_re = +i10_re-i30_im;
1762  b0_im = +i10_im+i30_re;
1763  b1_re = +i11_re-i31_im;
1764  b1_im = +i11_im+i31_re;
1765  b2_re = +i12_re-i32_im;
1766  b2_im = +i12_im+i32_re;
1767 
1768 #ifdef MULTI_GPU
1769  } else {
1770 
1771  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1772 
1773  // read half spinor for the first flavor from device memory
1774  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1775 
  // Ghost data is pre-projected; copy the half spinor straight through.
1776  a0_re = i00_re; a0_im = i00_im;
1777  a1_re = i01_re; a1_im = i01_im;
1778  a2_re = i02_re; a2_im = i02_im;
1779  b0_re = i10_re; b0_im = i10_im;
1780  b1_re = i11_re; b1_im = i11_im;
1781  b2_re = i12_re; b2_im = i12_im;
1782 
1783  }
1784 #endif // MULTI_GPU
1785 
  // A = U * a and B = U * b, expanded into real arithmetic.
1786  // multiply row 0
1787  spinorFloat A0_re = 0;
1788  A0_re += g00_re * a0_re;
1789  A0_re -= g00_im * a0_im;
1790  A0_re += g01_re * a1_re;
1791  A0_re -= g01_im * a1_im;
1792  A0_re += g02_re * a2_re;
1793  A0_re -= g02_im * a2_im;
1794  spinorFloat A0_im = 0;
1795  A0_im += g00_re * a0_im;
1796  A0_im += g00_im * a0_re;
1797  A0_im += g01_re * a1_im;
1798  A0_im += g01_im * a1_re;
1799  A0_im += g02_re * a2_im;
1800  A0_im += g02_im * a2_re;
1801  spinorFloat B0_re = 0;
1802  B0_re += g00_re * b0_re;
1803  B0_re -= g00_im * b0_im;
1804  B0_re += g01_re * b1_re;
1805  B0_re -= g01_im * b1_im;
1806  B0_re += g02_re * b2_re;
1807  B0_re -= g02_im * b2_im;
1808  spinorFloat B0_im = 0;
1809  B0_im += g00_re * b0_im;
1810  B0_im += g00_im * b0_re;
1811  B0_im += g01_re * b1_im;
1812  B0_im += g01_im * b1_re;
1813  B0_im += g02_re * b2_im;
1814  B0_im += g02_im * b2_re;
1815 
1816  // multiply row 1
1817  spinorFloat A1_re = 0;
1818  A1_re += g10_re * a0_re;
1819  A1_re -= g10_im * a0_im;
1820  A1_re += g11_re * a1_re;
1821  A1_re -= g11_im * a1_im;
1822  A1_re += g12_re * a2_re;
1823  A1_re -= g12_im * a2_im;
1824  spinorFloat A1_im = 0;
1825  A1_im += g10_re * a0_im;
1826  A1_im += g10_im * a0_re;
1827  A1_im += g11_re * a1_im;
1828  A1_im += g11_im * a1_re;
1829  A1_im += g12_re * a2_im;
1830  A1_im += g12_im * a2_re;
1831  spinorFloat B1_re = 0;
1832  B1_re += g10_re * b0_re;
1833  B1_re -= g10_im * b0_im;
1834  B1_re += g11_re * b1_re;
1835  B1_re -= g11_im * b1_im;
1836  B1_re += g12_re * b2_re;
1837  B1_re -= g12_im * b2_im;
1838  spinorFloat B1_im = 0;
1839  B1_im += g10_re * b0_im;
1840  B1_im += g10_im * b0_re;
1841  B1_im += g11_re * b1_im;
1842  B1_im += g11_im * b1_re;
1843  B1_im += g12_re * b2_im;
1844  B1_im += g12_im * b2_re;
1845 
1846  // multiply row 2
1847  spinorFloat A2_re = 0;
1848  A2_re += g20_re * a0_re;
1849  A2_re -= g20_im * a0_im;
1850  A2_re += g21_re * a1_re;
1851  A2_re -= g21_im * a1_im;
1852  A2_re += g22_re * a2_re;
1853  A2_re -= g22_im * a2_im;
1854  spinorFloat A2_im = 0;
1855  A2_im += g20_re * a0_im;
1856  A2_im += g20_im * a0_re;
1857  A2_im += g21_re * a1_im;
1858  A2_im += g21_im * a1_re;
1859  A2_im += g22_re * a2_im;
1860  A2_im += g22_im * a2_re;
1861  spinorFloat B2_re = 0;
1862  B2_re += g20_re * b0_re;
1863  B2_re -= g20_im * b0_im;
1864  B2_re += g21_re * b1_re;
1865  B2_re -= g21_im * b1_im;
1866  B2_re += g22_re * b2_re;
1867  B2_re -= g22_im * b2_im;
1868  spinorFloat B2_im = 0;
1869  B2_im += g20_re * b0_im;
1870  B2_im += g20_im * b0_re;
1871  B2_im += g21_re * b1_im;
1872  B2_im += g21_im * b1_re;
1873  B2_im += g22_re * b2_im;
1874  B2_im += g22_im * b2_re;
1875 
  // Accumulate into flavor-1 output; per P2- the lower spin rows are
  // row 2 = +i*A and row 3 = -i*B (multiplying by i maps
  // (re,im) -> (-im,re), hence the swapped components below).
1876  o1_00_re += A0_re;
1877  o1_00_im += A0_im;
1878  o1_10_re += B0_re;
1879  o1_10_im += B0_im;
1880  o1_20_re -= A0_im;
1881  o1_20_im += A0_re;
1882  o1_30_re += B0_im;
1883  o1_30_im -= B0_re;
1884 
1885  o1_01_re += A1_re;
1886  o1_01_im += A1_im;
1887  o1_11_re += B1_re;
1888  o1_11_im += B1_im;
1889  o1_21_re -= A1_im;
1890  o1_21_im += A1_re;
1891  o1_31_re += B1_im;
1892  o1_31_im -= B1_re;
1893 
1894  o1_02_re += A2_re;
1895  o1_02_im += A2_im;
1896  o1_12_re += B2_re;
1897  o1_12_im += B2_im;
1898  o1_22_re -= A2_im;
1899  o1_22_im += A2_re;
1900  o1_32_re += B2_im;
1901  o1_32_im -= B2_re;
1902 
1903  }
1904  {
  // Flavor 2: identical arithmetic, reading the spinor at the flavor
  // offset (interior) or one ghostFace further into the ghost buffer.
1905 #ifdef MULTI_GPU
1906  if (kernel_type == INTERIOR_KERNEL) {
1907 #endif
1908 
1909  // read flavor 2 from device memory
1910  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1911 
1912  // project spinor into half spinors
1913  a0_re = +i00_re+i20_im;
1914  a0_im = +i00_im-i20_re;
1915  a1_re = +i01_re+i21_im;
1916  a1_im = +i01_im-i21_re;
1917  a2_re = +i02_re+i22_im;
1918  a2_im = +i02_im-i22_re;
1919  b0_re = +i10_re-i30_im;
1920  b0_im = +i10_im+i30_re;
1921  b1_re = +i11_re-i31_im;
1922  b1_im = +i11_im+i31_re;
1923  b2_re = +i12_re-i32_im;
1924  b2_im = +i12_im+i32_re;
1925 
1926 #ifdef MULTI_GPU
1927  } else {
1928 
1929  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
1930 
1931  // read half spinor for the second flavor from device memory
1932  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
1933  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],4);
1934 
1935  a0_re = i00_re; a0_im = i00_im;
1936  a1_re = i01_re; a1_im = i01_im;
1937  a2_re = i02_re; a2_im = i02_im;
1938  b0_re = i10_re; b0_im = i10_im;
1939  b1_re = i11_re; b1_im = i11_im;
1940  b2_re = i12_re; b2_im = i12_im;
1941 
1942  }
1943 #endif // MULTI_GPU
1944 
1945  // multiply row 0
1946  spinorFloat A0_re = 0;
1947  A0_re += g00_re * a0_re;
1948  A0_re -= g00_im * a0_im;
1949  A0_re += g01_re * a1_re;
1950  A0_re -= g01_im * a1_im;
1951  A0_re += g02_re * a2_re;
1952  A0_re -= g02_im * a2_im;
1953  spinorFloat A0_im = 0;
1954  A0_im += g00_re * a0_im;
1955  A0_im += g00_im * a0_re;
1956  A0_im += g01_re * a1_im;
1957  A0_im += g01_im * a1_re;
1958  A0_im += g02_re * a2_im;
1959  A0_im += g02_im * a2_re;
1960  spinorFloat B0_re = 0;
1961  B0_re += g00_re * b0_re;
1962  B0_re -= g00_im * b0_im;
1963  B0_re += g01_re * b1_re;
1964  B0_re -= g01_im * b1_im;
1965  B0_re += g02_re * b2_re;
1966  B0_re -= g02_im * b2_im;
1967  spinorFloat B0_im = 0;
1968  B0_im += g00_re * b0_im;
1969  B0_im += g00_im * b0_re;
1970  B0_im += g01_re * b1_im;
1971  B0_im += g01_im * b1_re;
1972  B0_im += g02_re * b2_im;
1973  B0_im += g02_im * b2_re;
1974 
1975  // multiply row 1
1976  spinorFloat A1_re = 0;
1977  A1_re += g10_re * a0_re;
1978  A1_re -= g10_im * a0_im;
1979  A1_re += g11_re * a1_re;
1980  A1_re -= g11_im * a1_im;
1981  A1_re += g12_re * a2_re;
1982  A1_re -= g12_im * a2_im;
1983  spinorFloat A1_im = 0;
1984  A1_im += g10_re * a0_im;
1985  A1_im += g10_im * a0_re;
1986  A1_im += g11_re * a1_im;
1987  A1_im += g11_im * a1_re;
1988  A1_im += g12_re * a2_im;
1989  A1_im += g12_im * a2_re;
1990  spinorFloat B1_re = 0;
1991  B1_re += g10_re * b0_re;
1992  B1_re -= g10_im * b0_im;
1993  B1_re += g11_re * b1_re;
1994  B1_re -= g11_im * b1_im;
1995  B1_re += g12_re * b2_re;
1996  B1_re -= g12_im * b2_im;
1997  spinorFloat B1_im = 0;
1998  B1_im += g10_re * b0_im;
1999  B1_im += g10_im * b0_re;
2000  B1_im += g11_re * b1_im;
2001  B1_im += g11_im * b1_re;
2002  B1_im += g12_re * b2_im;
2003  B1_im += g12_im * b2_re;
2004 
2005  // multiply row 2
2006  spinorFloat A2_re = 0;
2007  A2_re += g20_re * a0_re;
2008  A2_re -= g20_im * a0_im;
2009  A2_re += g21_re * a1_re;
2010  A2_re -= g21_im * a1_im;
2011  A2_re += g22_re * a2_re;
2012  A2_re -= g22_im * a2_im;
2013  spinorFloat A2_im = 0;
2014  A2_im += g20_re * a0_im;
2015  A2_im += g20_im * a0_re;
2016  A2_im += g21_re * a1_im;
2017  A2_im += g21_im * a1_re;
2018  A2_im += g22_re * a2_im;
2019  A2_im += g22_im * a2_re;
2020  spinorFloat B2_re = 0;
2021  B2_re += g20_re * b0_re;
2022  B2_re -= g20_im * b0_im;
2023  B2_re += g21_re * b1_re;
2024  B2_re -= g21_im * b1_im;
2025  B2_re += g22_re * b2_re;
2026  B2_re -= g22_im * b2_im;
2027  spinorFloat B2_im = 0;
2028  B2_im += g20_re * b0_im;
2029  B2_im += g20_im * b0_re;
2030  B2_im += g21_re * b1_im;
2031  B2_im += g21_im * b1_re;
2032  B2_im += g22_re * b2_im;
2033  B2_im += g22_im * b2_re;
2034 
  // Accumulate into flavor-2 output with the same P2- sign pattern
  // (row 2 = +i*A, row 3 = -i*B).
2035  o2_00_re += A0_re;
2036  o2_00_im += A0_im;
2037  o2_10_re += B0_re;
2038  o2_10_im += B0_im;
2039  o2_20_re -= A0_im;
2040  o2_20_im += A0_re;
2041  o2_30_re += B0_im;
2042  o2_30_im -= B0_re;
2043 
2044  o2_01_re += A1_re;
2045  o2_01_im += A1_im;
2046  o2_11_re += B1_re;
2047  o2_11_im += B1_im;
2048  o2_21_re -= A1_im;
2049  o2_21_im += A1_re;
2050  o2_31_re += B1_im;
2051  o2_31_im -= B1_re;
2052 
2053  o2_02_re += A2_re;
2054  o2_02_im += A2_im;
2055  o2_12_re += B2_re;
2056  o2_12_im += B2_im;
2057  o2_22_re -= A2_im;
2058  o2_22_im += A2_re;
2059  o2_32_re += B2_im;
2060  o2_32_im -= B2_re;
2061 
2062  }
2063 }
2064 
2065 #ifdef MULTI_GPU
2066 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
2067  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
2068 #endif
2069 {
  // Backward hop in the Z direction. The P2+ projector below is applied
  // to the neighbor spinor at X - param.dc.X2X1 (one step back in Z,
  // wrapping to X + param.dc.X3X2X1mX2X1 at coord[2]==0); the link is
  // read from the neighbor/face site and applied daggered (gT).
2070  // Projector P2+
2071  // 1 0 i 0
2072  // 0 1 0 -i
2073  // -i 0 1 0
2074  // 0 i 0 1
2075 
2076 #ifdef MULTI_GPU
2077  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
2078  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
  // Half precision keeps per-site norms in a separate ghost buffer.
2079 #if (DD_PREC==2) // half precision
2080  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2081 #endif
2082 #else
2083  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
2084 #endif
2085 
  // Link index: interior threads use the neighbor's (even/odd) site
  // index; exterior threads index the face links stored past Vh.
2086 #ifdef MULTI_GPU
2087  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2088 #else
2089  const int ga_idx = sp_idx;
2090 #endif
2091 
2098 
2099  // read gauge matrix from device memory
2100  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
2101 
2102  // reconstruct gauge matrix
2104 
2105  {
  // Flavor 1: load/project, multiply by the daggered link, accumulate.
2106 #ifdef MULTI_GPU
2107  if (kernel_type == INTERIOR_KERNEL) {
2108 #endif
2109 
2110  // read flavor 1 from device memory
2111  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2112 
2113  // project spinor into half spinors
2114  a0_re = +i00_re-i20_im;
2115  a0_im = +i00_im+i20_re;
2116  a1_re = +i01_re-i21_im;
2117  a1_im = +i01_im+i21_re;
2118  a2_re = +i02_re-i22_im;
2119  a2_im = +i02_im+i22_re;
2120  b0_re = +i10_re+i30_im;
2121  b0_im = +i10_im-i30_re;
2122  b1_re = +i11_re+i31_im;
2123  b1_im = +i11_im-i31_re;
2124  b2_re = +i12_re+i32_im;
2125  b2_im = +i12_im-i32_re;
2126 
2127 #ifdef MULTI_GPU
2128  } else {
2129 
2130  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2131 
2132  // read half spinor for the first flavor from device memory
2133  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
2134 
  // Ghost data is pre-projected; copy the half spinor straight through.
2135  a0_re = i00_re; a0_im = i00_im;
2136  a1_re = i01_re; a1_im = i01_im;
2137  a2_re = i02_re; a2_im = i02_im;
2138  b0_re = i10_re; b0_im = i10_im;
2139  b1_re = i11_re; b1_im = i11_im;
2140  b2_re = i12_re; b2_im = i12_im;
2141 
2142  }
2143 #endif // MULTI_GPU
2144 
  // A = U^dagger * a and B = U^dagger * b, expanded into real arithmetic
  // (gT* are the conjugate-transposed link elements).
2145  // multiply row 0
2146  spinorFloat A0_re = 0;
2147  A0_re += gT00_re * a0_re;
2148  A0_re -= gT00_im * a0_im;
2149  A0_re += gT01_re * a1_re;
2150  A0_re -= gT01_im * a1_im;
2151  A0_re += gT02_re * a2_re;
2152  A0_re -= gT02_im * a2_im;
2153  spinorFloat A0_im = 0;
2154  A0_im += gT00_re * a0_im;
2155  A0_im += gT00_im * a0_re;
2156  A0_im += gT01_re * a1_im;
2157  A0_im += gT01_im * a1_re;
2158  A0_im += gT02_re * a2_im;
2159  A0_im += gT02_im * a2_re;
2160  spinorFloat B0_re = 0;
2161  B0_re += gT00_re * b0_re;
2162  B0_re -= gT00_im * b0_im;
2163  B0_re += gT01_re * b1_re;
2164  B0_re -= gT01_im * b1_im;
2165  B0_re += gT02_re * b2_re;
2166  B0_re -= gT02_im * b2_im;
2167  spinorFloat B0_im = 0;
2168  B0_im += gT00_re * b0_im;
2169  B0_im += gT00_im * b0_re;
2170  B0_im += gT01_re * b1_im;
2171  B0_im += gT01_im * b1_re;
2172  B0_im += gT02_re * b2_im;
2173  B0_im += gT02_im * b2_re;
2174 
2175  // multiply row 1
2176  spinorFloat A1_re = 0;
2177  A1_re += gT10_re * a0_re;
2178  A1_re -= gT10_im * a0_im;
2179  A1_re += gT11_re * a1_re;
2180  A1_re -= gT11_im * a1_im;
2181  A1_re += gT12_re * a2_re;
2182  A1_re -= gT12_im * a2_im;
2183  spinorFloat A1_im = 0;
2184  A1_im += gT10_re * a0_im;
2185  A1_im += gT10_im * a0_re;
2186  A1_im += gT11_re * a1_im;
2187  A1_im += gT11_im * a1_re;
2188  A1_im += gT12_re * a2_im;
2189  A1_im += gT12_im * a2_re;
2190  spinorFloat B1_re = 0;
2191  B1_re += gT10_re * b0_re;
2192  B1_re -= gT10_im * b0_im;
2193  B1_re += gT11_re * b1_re;
2194  B1_re -= gT11_im * b1_im;
2195  B1_re += gT12_re * b2_re;
2196  B1_re -= gT12_im * b2_im;
2197  spinorFloat B1_im = 0;
2198  B1_im += gT10_re * b0_im;
2199  B1_im += gT10_im * b0_re;
2200  B1_im += gT11_re * b1_im;
2201  B1_im += gT11_im * b1_re;
2202  B1_im += gT12_re * b2_im;
2203  B1_im += gT12_im * b2_re;
2204 
2205  // multiply row 2
2206  spinorFloat A2_re = 0;
2207  A2_re += gT20_re * a0_re;
2208  A2_re -= gT20_im * a0_im;
2209  A2_re += gT21_re * a1_re;
2210  A2_re -= gT21_im * a1_im;
2211  A2_re += gT22_re * a2_re;
2212  A2_re -= gT22_im * a2_im;
2213  spinorFloat A2_im = 0;
2214  A2_im += gT20_re * a0_im;
2215  A2_im += gT20_im * a0_re;
2216  A2_im += gT21_re * a1_im;
2217  A2_im += gT21_im * a1_re;
2218  A2_im += gT22_re * a2_im;
2219  A2_im += gT22_im * a2_re;
2220  spinorFloat B2_re = 0;
2221  B2_re += gT20_re * b0_re;
2222  B2_re -= gT20_im * b0_im;
2223  B2_re += gT21_re * b1_re;
2224  B2_re -= gT21_im * b1_im;
2225  B2_re += gT22_re * b2_re;
2226  B2_re -= gT22_im * b2_im;
2227  spinorFloat B2_im = 0;
2228  B2_im += gT20_re * b0_im;
2229  B2_im += gT20_im * b0_re;
2230  B2_im += gT21_re * b1_im;
2231  B2_im += gT21_im * b1_re;
2232  B2_im += gT22_re * b2_im;
2233  B2_im += gT22_im * b2_re;
2234 
  // Accumulate into flavor-1 output; per P2+ the lower spin rows are
  // row 2 = -i*A and row 3 = +i*B (multiplying by -i maps
  // (re,im) -> (im,-re), hence the swapped components below).
2235  o1_00_re += A0_re;
2236  o1_00_im += A0_im;
2237  o1_10_re += B0_re;
2238  o1_10_im += B0_im;
2239  o1_20_re += A0_im;
2240  o1_20_im -= A0_re;
2241  o1_30_re -= B0_im;
2242  o1_30_im += B0_re;
2243 
2244  o1_01_re += A1_re;
2245  o1_01_im += A1_im;
2246  o1_11_re += B1_re;
2247  o1_11_im += B1_im;
2248  o1_21_re += A1_im;
2249  o1_21_im -= A1_re;
2250  o1_31_re -= B1_im;
2251  o1_31_im += B1_re;
2252 
2253  o1_02_re += A2_re;
2254  o1_02_im += A2_im;
2255  o1_12_re += B2_re;
2256  o1_12_im += B2_im;
2257  o1_22_re += A2_im;
2258  o1_22_im -= A2_re;
2259  o1_32_re -= B2_im;
2260  o1_32_im += B2_re;
2261 
2262  }
2263  {
  // Flavor 2: identical arithmetic, reading the spinor at the flavor
  // offset (interior) or one ghostFace further into the ghost buffer.
2264 #ifdef MULTI_GPU
2265  if (kernel_type == INTERIOR_KERNEL) {
2266 #endif
2267 
2268  // read flavor 2 from device memory
2269  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2270 
2271  // project spinor into half spinors
2272  a0_re = +i00_re-i20_im;
2273  a0_im = +i00_im+i20_re;
2274  a1_re = +i01_re-i21_im;
2275  a1_im = +i01_im+i21_re;
2276  a2_re = +i02_re-i22_im;
2277  a2_im = +i02_im+i22_re;
2278  b0_re = +i10_re+i30_im;
2279  b0_im = +i10_im-i30_re;
2280  b1_re = +i11_re+i31_im;
2281  b1_im = +i11_im-i31_re;
2282  b2_re = +i12_re+i32_im;
2283  b2_im = +i12_im-i32_re;
2284 
2285 #ifdef MULTI_GPU
2286  } else {
2287 
2288  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2289 
2290  // read half spinor for the second flavor from device memory
2291  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
2292  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],5);
2293 
2294  a0_re = i00_re; a0_im = i00_im;
2295  a1_re = i01_re; a1_im = i01_im;
2296  a2_re = i02_re; a2_im = i02_im;
2297  b0_re = i10_re; b0_im = i10_im;
2298  b1_re = i11_re; b1_im = i11_im;
2299  b2_re = i12_re; b2_im = i12_im;
2300 
2301  }
2302 #endif // MULTI_GPU
2303 
2304  // multiply row 0
2305  spinorFloat A0_re = 0;
2306  A0_re += gT00_re * a0_re;
2307  A0_re -= gT00_im * a0_im;
2308  A0_re += gT01_re * a1_re;
2309  A0_re -= gT01_im * a1_im;
2310  A0_re += gT02_re * a2_re;
2311  A0_re -= gT02_im * a2_im;
2312  spinorFloat A0_im = 0;
2313  A0_im += gT00_re * a0_im;
2314  A0_im += gT00_im * a0_re;
2315  A0_im += gT01_re * a1_im;
2316  A0_im += gT01_im * a1_re;
2317  A0_im += gT02_re * a2_im;
2318  A0_im += gT02_im * a2_re;
2319  spinorFloat B0_re = 0;
2320  B0_re += gT00_re * b0_re;
2321  B0_re -= gT00_im * b0_im;
2322  B0_re += gT01_re * b1_re;
2323  B0_re -= gT01_im * b1_im;
2324  B0_re += gT02_re * b2_re;
2325  B0_re -= gT02_im * b2_im;
2326  spinorFloat B0_im = 0;
2327  B0_im += gT00_re * b0_im;
2328  B0_im += gT00_im * b0_re;
2329  B0_im += gT01_re * b1_im;
2330  B0_im += gT01_im * b1_re;
2331  B0_im += gT02_re * b2_im;
2332  B0_im += gT02_im * b2_re;
2333 
2334  // multiply row 1
2335  spinorFloat A1_re = 0;
2336  A1_re += gT10_re * a0_re;
2337  A1_re -= gT10_im * a0_im;
2338  A1_re += gT11_re * a1_re;
2339  A1_re -= gT11_im * a1_im;
2340  A1_re += gT12_re * a2_re;
2341  A1_re -= gT12_im * a2_im;
2342  spinorFloat A1_im = 0;
2343  A1_im += gT10_re * a0_im;
2344  A1_im += gT10_im * a0_re;
2345  A1_im += gT11_re * a1_im;
2346  A1_im += gT11_im * a1_re;
2347  A1_im += gT12_re * a2_im;
2348  A1_im += gT12_im * a2_re;
2349  spinorFloat B1_re = 0;
2350  B1_re += gT10_re * b0_re;
2351  B1_re -= gT10_im * b0_im;
2352  B1_re += gT11_re * b1_re;
2353  B1_re -= gT11_im * b1_im;
2354  B1_re += gT12_re * b2_re;
2355  B1_re -= gT12_im * b2_im;
2356  spinorFloat B1_im = 0;
2357  B1_im += gT10_re * b0_im;
2358  B1_im += gT10_im * b0_re;
2359  B1_im += gT11_re * b1_im;
2360  B1_im += gT11_im * b1_re;
2361  B1_im += gT12_re * b2_im;
2362  B1_im += gT12_im * b2_re;
2363 
2364  // multiply row 2
2365  spinorFloat A2_re = 0;
2366  A2_re += gT20_re * a0_re;
2367  A2_re -= gT20_im * a0_im;
2368  A2_re += gT21_re * a1_re;
2369  A2_re -= gT21_im * a1_im;
2370  A2_re += gT22_re * a2_re;
2371  A2_re -= gT22_im * a2_im;
2372  spinorFloat A2_im = 0;
2373  A2_im += gT20_re * a0_im;
2374  A2_im += gT20_im * a0_re;
2375  A2_im += gT21_re * a1_im;
2376  A2_im += gT21_im * a1_re;
2377  A2_im += gT22_re * a2_im;
2378  A2_im += gT22_im * a2_re;
2379  spinorFloat B2_re = 0;
2380  B2_re += gT20_re * b0_re;
2381  B2_re -= gT20_im * b0_im;
2382  B2_re += gT21_re * b1_re;
2383  B2_re -= gT21_im * b1_im;
2384  B2_re += gT22_re * b2_re;
2385  B2_re -= gT22_im * b2_im;
2386  spinorFloat B2_im = 0;
2387  B2_im += gT20_re * b0_im;
2388  B2_im += gT20_im * b0_re;
2389  B2_im += gT21_re * b1_im;
2390  B2_im += gT21_im * b1_re;
2391  B2_im += gT22_re * b2_im;
2392  B2_im += gT22_im * b2_re;
2393 
  // Accumulate into flavor-2 output with the same P2+ sign pattern
  // (row 2 = -i*A, row 3 = +i*B).
2394  o2_00_re += A0_re;
2395  o2_00_im += A0_im;
2396  o2_10_re += B0_re;
2397  o2_10_im += B0_im;
2398  o2_20_re += A0_im;
2399  o2_20_im -= A0_re;
2400  o2_30_re -= B0_im;
2401  o2_30_im += B0_re;
2402 
2403  o2_01_re += A1_re;
2404  o2_01_im += A1_im;
2405  o2_11_re += B1_re;
2406  o2_11_im += B1_im;
2407  o2_21_re += A1_im;
2408  o2_21_im -= A1_re;
2409  o2_31_re -= B1_im;
2410  o2_31_im += B1_re;
2411 
2412  o2_02_re += A2_re;
2413  o2_02_im += A2_im;
2414  o2_12_re += B2_re;
2415  o2_12_im += B2_im;
2416  o2_22_re += A2_im;
2417  o2_22_im -= A2_re;
2418  o2_32_re -= B2_im;
2419  o2_32_im += B2_re;
2420 
2421  }
2422 }
2423 
// ---------------------------------------------------------------------------
// T-forward neighbor contribution for the non-degenerate twisted-mass dslash.
// Applies the spin projector P3- (doubles spin components 2 and 3, zeroes
// components 0 and 1 -- see the matrix sketch below) to BOTH flavors of the
// neighbor spinor, multiplies the projected half-spinors by the T-direction
// gauge link (skipped on the temporal-gauge-fixed interior, where the link is
// the identity), and accumulates the result into the o1_2x/o1_3x (flavor 1)
// and o2_2x/o2_3x (flavor 2) output registers.
// NOTE(review): this is a documentation listing of generated code; the
// embedded line numbers jump in several places below, so some macro
// invocations (spinor reads, half-spinor declarations, gauge reconstruction,
// identity-gauge multiplies defining A*/B*) appear to have been elided by the
// extractor. Each gap is flagged inline -- confirm against the generated file.
2424 #ifdef MULTI_GPU
2425 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
2426  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
2427 #endif
2428 {
2429  // Projector P3-
2430  // 0 0 0 0
2431  // 0 0 0 0
2432  // 0 0 2 0
2433  // 0 0 0 2
2434 
2435 #ifdef MULTI_GPU
 // Interior: neighbor site in +T (wrapping at the local boundary);
 // exterior T kernel: read from the ghost zone instead.
2436  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
2437  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
2438 #if (DD_PREC==2) // half precision
2439  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
2440 #endif
2441 #else
2442  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
2443 #endif
2444 
2445  const int ga_idx = sid;
2446 
 // NOTE(review): listing jumps 2446->2453 here; the half-spinor register
 // declarations (a0_re ... b2_im) appear to have been elided -- confirm.
2453 
 // Temporal-gauge-fixed fast path: interior T links are the identity, so the
 // SU(3) multiply is skipped and projected components are used directly.
2454  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2455  {
2456  {
2457 #ifdef MULTI_GPU
2458  if (kernel_type == INTERIOR_KERNEL) {
2459 #endif
2460 
2461  // read flavor 1 from device memory
 // NOTE(review): listing jumps 2461->2463; the READ_SPINOR macro call for
 // flavor 1 appears to have been elided by the extractor -- confirm.
2463 
2464  // project spinor into half spinors
2465  a0_re = +2*i20_re;
2466  a0_im = +2*i20_im;
2467  a1_re = +2*i21_re;
2468  a1_im = +2*i21_im;
2469  a2_re = +2*i22_re;
2470  a2_im = +2*i22_im;
2471  b0_re = +2*i30_re;
2472  b0_im = +2*i30_im;
2473  b1_re = +2*i31_re;
2474  b1_im = +2*i31_im;
2475  b2_re = +2*i32_re;
2476  b2_im = +2*i32_im;
2477 
2478 #ifdef MULTI_GPU
2479  } else {
2480 
 // Ghost spinors are stored with both flavors packed, hence the padded stride.
2481  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2482  const int t_proj_scale = TPROJSCALE;
2483 
2484  // read half spinor for the first flavor from device memory
2485  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2486 
2487  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2488  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2489  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2490  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2491  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2492  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2493 
2494  }
2495 #endif // MULTI_GPU
2496 
2497  // identity gauge matrix
 // NOTE(review): listing jumps 2497->2504; the identity-gauge assignments
 // that should define A0..B2 from a0..b2 appear to have been elided -- as
 // listed, A0_re etc. would be undeclared here. Confirm against the source.
2504 
2505  o1_20_re += A0_re;
2506  o1_20_im += A0_im;
2507  o1_30_re += B0_re;
2508  o1_30_im += B0_im;
2509 
2510  o1_21_re += A1_re;
2511  o1_21_im += A1_im;
2512  o1_31_re += B1_re;
2513  o1_31_im += B1_im;
2514 
2515  o1_22_re += A2_re;
2516  o1_22_im += A2_im;
2517  o1_32_re += B2_re;
2518  o1_32_im += B2_im;
2519 
2520  }
2521  {
2522 #ifdef MULTI_GPU
2523  if (kernel_type == INTERIOR_KERNEL) {
2524 #endif
2525 
 // Flavor 2 lives at a fixed offset (param.fl_stride) from flavor 1.
2526  // read flavor 2 from device memory
2527  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2528 
2529  // project spinor into half spinors
2530  a0_re = +2*i20_re;
2531  a0_im = +2*i20_im;
2532  a1_re = +2*i21_re;
2533  a1_im = +2*i21_im;
2534  a2_re = +2*i22_re;
2535  a2_im = +2*i22_im;
2536  b0_re = +2*i30_re;
2537  b0_im = +2*i30_im;
2538  b1_re = +2*i31_re;
2539  b1_im = +2*i31_im;
2540  b2_re = +2*i32_re;
2541  b2_im = +2*i32_im;
2542 
2543 #ifdef MULTI_GPU
2544  } else {
2545 
2546  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2547  const int t_proj_scale = TPROJSCALE;
2548 
2549  // read half spinor for the second flavor from device memory
2550  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
2551  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],6);
2552 
2553  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2554  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2555  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2556  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2557  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2558  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2559 
2560  }
2561 #endif // MULTI_GPU
2562 
2563  // identity gauge matrix
 // NOTE(review): listing jumps 2563->2570; identity-gauge assignments
 // defining A0..B2 appear elided here as well -- confirm.
2570 
2571  o2_20_re += A0_re;
2572  o2_20_im += A0_im;
2573  o2_30_re += B0_re;
2574  o2_30_im += B0_im;
2575 
2576  o2_21_re += A1_re;
2577  o2_21_im += A1_im;
2578  o2_31_re += B1_re;
2579  o2_31_im += B1_im;
2580 
2581  o2_22_re += A2_re;
2582  o2_22_im += A2_im;
2583  o2_32_re += B2_re;
2584  o2_32_im += B2_im;
2585 
2586  }
 // Generic path: load and apply the full (forward) SU(3) link.
2587  } else {
2588  // read gauge matrix from device memory
2589  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
2590 
2591  // reconstruct gauge matrix
 // NOTE(review): listing jumps 2591->2593; the RECONSTRUCT_GAUGE_MATRIX
 // macro call appears to have been elided -- confirm.
2593 
2594  {
2595 #ifdef MULTI_GPU
2596  if (kernel_type == INTERIOR_KERNEL) {
2597 #endif
2598 
2599  // read flavor 1 from device memory
 // NOTE(review): listing jumps 2599->2601; the READ_SPINOR macro call for
 // flavor 1 appears to have been elided -- confirm.
2601 
2602  // project spinor into half spinors
2603  a0_re = +2*i20_re;
2604  a0_im = +2*i20_im;
2605  a1_re = +2*i21_re;
2606  a1_im = +2*i21_im;
2607  a2_re = +2*i22_re;
2608  a2_im = +2*i22_im;
2609  b0_re = +2*i30_re;
2610  b0_im = +2*i30_im;
2611  b1_re = +2*i31_re;
2612  b1_im = +2*i31_im;
2613  b2_re = +2*i32_re;
2614  b2_im = +2*i32_im;
2615 
2616 #ifdef MULTI_GPU
2617  } else {
2618 
2619  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2620  const int t_proj_scale = TPROJSCALE;
2621 
2622  // read half spinor for the first flavor from device memory
2623  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
2624 
2625  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2626  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2627  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2628  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2629  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2630  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2631 
2632  }
2633 #endif // MULTI_GPU
2634 
 // Complex 3x3 matrix-vector products: A = g*a, B = g*b, expanded into
 // real/imaginary scalar FMAs by the code generator.
2635  // multiply row 0
2636  spinorFloat A0_re = 0;
2637  A0_re += g00_re * a0_re;
2638  A0_re -= g00_im * a0_im;
2639  A0_re += g01_re * a1_re;
2640  A0_re -= g01_im * a1_im;
2641  A0_re += g02_re * a2_re;
2642  A0_re -= g02_im * a2_im;
2643  spinorFloat A0_im = 0;
2644  A0_im += g00_re * a0_im;
2645  A0_im += g00_im * a0_re;
2646  A0_im += g01_re * a1_im;
2647  A0_im += g01_im * a1_re;
2648  A0_im += g02_re * a2_im;
2649  A0_im += g02_im * a2_re;
2650  spinorFloat B0_re = 0;
2651  B0_re += g00_re * b0_re;
2652  B0_re -= g00_im * b0_im;
2653  B0_re += g01_re * b1_re;
2654  B0_re -= g01_im * b1_im;
2655  B0_re += g02_re * b2_re;
2656  B0_re -= g02_im * b2_im;
2657  spinorFloat B0_im = 0;
2658  B0_im += g00_re * b0_im;
2659  B0_im += g00_im * b0_re;
2660  B0_im += g01_re * b1_im;
2661  B0_im += g01_im * b1_re;
2662  B0_im += g02_re * b2_im;
2663  B0_im += g02_im * b2_re;
2664 
2665  // multiply row 1
2666  spinorFloat A1_re = 0;
2667  A1_re += g10_re * a0_re;
2668  A1_re -= g10_im * a0_im;
2669  A1_re += g11_re * a1_re;
2670  A1_re -= g11_im * a1_im;
2671  A1_re += g12_re * a2_re;
2672  A1_re -= g12_im * a2_im;
2673  spinorFloat A1_im = 0;
2674  A1_im += g10_re * a0_im;
2675  A1_im += g10_im * a0_re;
2676  A1_im += g11_re * a1_im;
2677  A1_im += g11_im * a1_re;
2678  A1_im += g12_re * a2_im;
2679  A1_im += g12_im * a2_re;
2680  spinorFloat B1_re = 0;
2681  B1_re += g10_re * b0_re;
2682  B1_re -= g10_im * b0_im;
2683  B1_re += g11_re * b1_re;
2684  B1_re -= g11_im * b1_im;
2685  B1_re += g12_re * b2_re;
2686  B1_re -= g12_im * b2_im;
2687  spinorFloat B1_im = 0;
2688  B1_im += g10_re * b0_im;
2689  B1_im += g10_im * b0_re;
2690  B1_im += g11_re * b1_im;
2691  B1_im += g11_im * b1_re;
2692  B1_im += g12_re * b2_im;
2693  B1_im += g12_im * b2_re;
2694 
2695  // multiply row 2
2696  spinorFloat A2_re = 0;
2697  A2_re += g20_re * a0_re;
2698  A2_re -= g20_im * a0_im;
2699  A2_re += g21_re * a1_re;
2700  A2_re -= g21_im * a1_im;
2701  A2_re += g22_re * a2_re;
2702  A2_re -= g22_im * a2_im;
2703  spinorFloat A2_im = 0;
2704  A2_im += g20_re * a0_im;
2705  A2_im += g20_im * a0_re;
2706  A2_im += g21_re * a1_im;
2707  A2_im += g21_im * a1_re;
2708  A2_im += g22_re * a2_im;
2709  A2_im += g22_im * a2_re;
2710  spinorFloat B2_re = 0;
2711  B2_re += g20_re * b0_re;
2712  B2_re -= g20_im * b0_im;
2713  B2_re += g21_re * b1_re;
2714  B2_re -= g21_im * b1_im;
2715  B2_re += g22_re * b2_re;
2716  B2_re -= g22_im * b2_im;
2717  spinorFloat B2_im = 0;
2718  B2_im += g20_re * b0_im;
2719  B2_im += g20_im * b0_re;
2720  B2_im += g21_re * b1_im;
2721  B2_im += g21_im * b1_re;
2722  B2_im += g22_re * b2_im;
2723  B2_im += g22_im * b2_re;
2724 
 // Accumulate flavor-1 contribution into spin components 2 and 3 only
 // (components 0 and 1 are annihilated by P3-).
2725  o1_20_re += A0_re;
2726  o1_20_im += A0_im;
2727  o1_30_re += B0_re;
2728  o1_30_im += B0_im;
2729 
2730  o1_21_re += A1_re;
2731  o1_21_im += A1_im;
2732  o1_31_re += B1_re;
2733  o1_31_im += B1_im;
2734 
2735  o1_22_re += A2_re;
2736  o1_22_im += A2_im;
2737  o1_32_re += B2_re;
2738  o1_32_im += B2_im;
2739 
2740  }
2741  {
2742 #ifdef MULTI_GPU
2743  if (kernel_type == INTERIOR_KERNEL) {
2744 #endif
2745 
2746  // read flavor 2 from device memory
2747  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2748 
2749  // project spinor into half spinors
2750  a0_re = +2*i20_re;
2751  a0_im = +2*i20_im;
2752  a1_re = +2*i21_re;
2753  a1_im = +2*i21_im;
2754  a2_re = +2*i22_re;
2755  a2_im = +2*i22_im;
2756  b0_re = +2*i30_re;
2757  b0_im = +2*i30_im;
2758  b1_re = +2*i31_re;
2759  b1_im = +2*i31_im;
2760  b2_re = +2*i32_re;
2761  b2_im = +2*i32_im;
2762 
2763 #ifdef MULTI_GPU
2764  } else {
2765 
2766  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2767  const int t_proj_scale = TPROJSCALE;
2768 
2769  // read half spinor for the second flavor from device memory
2770  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
2771  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],6);
2772 
2773  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2774  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2775  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2776  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2777  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2778  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2779 
2780  }
2781 #endif // MULTI_GPU
2782 
2783  // multiply row 0
2784  spinorFloat A0_re = 0;
2785  A0_re += g00_re * a0_re;
2786  A0_re -= g00_im * a0_im;
2787  A0_re += g01_re * a1_re;
2788  A0_re -= g01_im * a1_im;
2789  A0_re += g02_re * a2_re;
2790  A0_re -= g02_im * a2_im;
2791  spinorFloat A0_im = 0;
2792  A0_im += g00_re * a0_im;
2793  A0_im += g00_im * a0_re;
2794  A0_im += g01_re * a1_im;
2795  A0_im += g01_im * a1_re;
2796  A0_im += g02_re * a2_im;
2797  A0_im += g02_im * a2_re;
2798  spinorFloat B0_re = 0;
2799  B0_re += g00_re * b0_re;
2800  B0_re -= g00_im * b0_im;
2801  B0_re += g01_re * b1_re;
2802  B0_re -= g01_im * b1_im;
2803  B0_re += g02_re * b2_re;
2804  B0_re -= g02_im * b2_im;
2805  spinorFloat B0_im = 0;
2806  B0_im += g00_re * b0_im;
2807  B0_im += g00_im * b0_re;
2808  B0_im += g01_re * b1_im;
2809  B0_im += g01_im * b1_re;
2810  B0_im += g02_re * b2_im;
2811  B0_im += g02_im * b2_re;
2812 
2813  // multiply row 1
2814  spinorFloat A1_re = 0;
2815  A1_re += g10_re * a0_re;
2816  A1_re -= g10_im * a0_im;
2817  A1_re += g11_re * a1_re;
2818  A1_re -= g11_im * a1_im;
2819  A1_re += g12_re * a2_re;
2820  A1_re -= g12_im * a2_im;
2821  spinorFloat A1_im = 0;
2822  A1_im += g10_re * a0_im;
2823  A1_im += g10_im * a0_re;
2824  A1_im += g11_re * a1_im;
2825  A1_im += g11_im * a1_re;
2826  A1_im += g12_re * a2_im;
2827  A1_im += g12_im * a2_re;
2828  spinorFloat B1_re = 0;
2829  B1_re += g10_re * b0_re;
2830  B1_re -= g10_im * b0_im;
2831  B1_re += g11_re * b1_re;
2832  B1_re -= g11_im * b1_im;
2833  B1_re += g12_re * b2_re;
2834  B1_re -= g12_im * b2_im;
2835  spinorFloat B1_im = 0;
2836  B1_im += g10_re * b0_im;
2837  B1_im += g10_im * b0_re;
2838  B1_im += g11_re * b1_im;
2839  B1_im += g11_im * b1_re;
2840  B1_im += g12_re * b2_im;
2841  B1_im += g12_im * b2_re;
2842 
2843  // multiply row 2
2844  spinorFloat A2_re = 0;
2845  A2_re += g20_re * a0_re;
2846  A2_re -= g20_im * a0_im;
2847  A2_re += g21_re * a1_re;
2848  A2_re -= g21_im * a1_im;
2849  A2_re += g22_re * a2_re;
2850  A2_re -= g22_im * a2_im;
2851  spinorFloat A2_im = 0;
2852  A2_im += g20_re * a0_im;
2853  A2_im += g20_im * a0_re;
2854  A2_im += g21_re * a1_im;
2855  A2_im += g21_im * a1_re;
2856  A2_im += g22_re * a2_im;
2857  A2_im += g22_im * a2_re;
2858  spinorFloat B2_re = 0;
2859  B2_re += g20_re * b0_re;
2860  B2_re -= g20_im * b0_im;
2861  B2_re += g21_re * b1_re;
2862  B2_re -= g21_im * b1_im;
2863  B2_re += g22_re * b2_re;
2864  B2_re -= g22_im * b2_im;
2865  spinorFloat B2_im = 0;
2866  B2_im += g20_re * b0_im;
2867  B2_im += g20_im * b0_re;
2868  B2_im += g21_re * b1_im;
2869  B2_im += g21_im * b1_re;
2870  B2_im += g22_re * b2_im;
2871  B2_im += g22_im * b2_re;
2872 
 // Accumulate flavor-2 contribution into spin components 2 and 3.
2873  o2_20_re += A0_re;
2874  o2_20_im += A0_im;
2875  o2_30_re += B0_re;
2876  o2_30_im += B0_im;
2877 
2878  o2_21_re += A1_re;
2879  o2_21_im += A1_im;
2880  o2_31_re += B1_re;
2881  o2_31_im += B1_im;
2882 
2883  o2_22_re += A2_re;
2884  o2_22_im += A2_im;
2885  o2_32_re += B2_re;
2886  o2_32_im += B2_im;
2887 
2888  }
2889  }
2890 }
2891 
// ---------------------------------------------------------------------------
// T-backward neighbor contribution for the non-degenerate twisted-mass dslash.
// Applies the spin projector P3+ (doubles spin components 0 and 1, zeroes
// components 2 and 3 -- see the matrix sketch below) to BOTH flavors of the
// neighbor spinor, multiplies the projected half-spinors by the
// conjugate-transposed gauge link (gT*, i.e. the dagger of the neighbor's
// forward T link; skipped on the temporal-gauge-fixed path), and accumulates
// into the o1_0x/o1_1x (flavor 1) and o2_0x/o2_1x (flavor 2) output registers.
// NOTE(review): documentation listing of generated code; embedded line
// numbers jump in several places where macro lines were elided by the
// extractor. Each gap is flagged inline -- confirm against the generated file.
2892 #ifdef MULTI_GPU
2893 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
2894  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
2895 #endif
2896 {
2897  // Projector P3+
2898  // 2 0 0 0
2899  // 0 2 0 0
2900  // 0 0 0 0
2901  // 0 0 0 0
2902 
2903 #ifdef MULTI_GPU
 // Interior: neighbor site in -T (wrapping at the local boundary);
 // exterior T kernel: read from the ghost zone instead.
2904  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
2905  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
2906 #if (DD_PREC==2) // half precision
2907  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
2908 #endif
2909 #else
2910  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
2911 #endif
2912 
2913 #ifdef MULTI_GPU
 // Backward hop: the link belongs to the neighbor site; exterior reads it
 // from the exchanged-link region at offset Vh.
2914  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2915 #else
2916  const int ga_idx = sp_idx;
2917 #endif
2918 
 // NOTE(review): listing jumps 2918->2925 here; the half-spinor register
 // declarations (a0_re ... b2_im) appear to have been elided -- confirm.
2925 
 // Temporal-gauge-fixed fast path: the link is the identity, so the SU(3)
 // multiply is skipped.
2926  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2927  {
2928  {
2929 #ifdef MULTI_GPU
2930  if (kernel_type == INTERIOR_KERNEL) {
2931 #endif
2932 
2933  // read flavor 1 from device memory
2934  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2935 
2936  // project spinor into half spinors
2937  a0_re = +2*i00_re;
2938  a0_im = +2*i00_im;
2939  a1_re = +2*i01_re;
2940  a1_im = +2*i01_im;
2941  a2_re = +2*i02_re;
2942  a2_im = +2*i02_im;
2943  b0_re = +2*i10_re;
2944  b0_im = +2*i10_im;
2945  b1_re = +2*i11_re;
2946  b1_im = +2*i11_im;
2947  b2_re = +2*i12_re;
2948  b2_im = +2*i12_im;
2949 
2950 #ifdef MULTI_GPU
2951  } else {
2952 
 // Ghost spinors pack both flavors, hence the padded stride.
2953  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
2954  const int t_proj_scale = TPROJSCALE;
2955 
2956  // read half spinor for the first flavor from device memory
2957  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2958 
2959  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2960  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2961  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2962  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2963  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2964  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2965 
2966  }
2967 #endif // MULTI_GPU
2968 
2969  // identity gauge matrix
 // NOTE(review): listing jumps 2969->2976; the identity-gauge assignments
 // that should define A0..B2 from a0..b2 appear to have been elided -- as
 // listed, A0_re etc. would be undeclared here. Confirm against the source.
2976 
2977  o1_00_re += A0_re;
2978  o1_00_im += A0_im;
2979  o1_10_re += B0_re;
2980  o1_10_im += B0_im;
2981 
2982  o1_01_re += A1_re;
2983  o1_01_im += A1_im;
2984  o1_11_re += B1_re;
2985  o1_11_im += B1_im;
2986 
2987  o1_02_re += A2_re;
2988  o1_02_im += A2_im;
2989  o1_12_re += B2_re;
2990  o1_12_im += B2_im;
2991 
2992  }
2993  {
2994 #ifdef MULTI_GPU
2995  if (kernel_type == INTERIOR_KERNEL) {
2996 #endif
2997 
 // Flavor 2 lives at a fixed offset (param.fl_stride) from flavor 1.
2998  // read flavor 2 from device memory
2999  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
3000 
3001  // project spinor into half spinors
3002  a0_re = +2*i00_re;
3003  a0_im = +2*i00_im;
3004  a1_re = +2*i01_re;
3005  a1_im = +2*i01_im;
3006  a2_re = +2*i02_re;
3007  a2_im = +2*i02_im;
3008  b0_re = +2*i10_re;
3009  b0_im = +2*i10_im;
3010  b1_re = +2*i11_re;
3011  b1_im = +2*i11_im;
3012  b2_re = +2*i12_re;
3013  b2_im = +2*i12_im;
3014 
3015 #ifdef MULTI_GPU
3016  } else {
3017 
3018  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
3019  const int t_proj_scale = TPROJSCALE;
3020 
3021  // read half spinor for the second flavor from device memory
3022  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
3023  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],7);
3024 
3025  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
3026  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
3027  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
3028  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
3029  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
3030  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
3031 
3032  }
3033 #endif // MULTI_GPU
3034 
3035  // identity gauge matrix
 // NOTE(review): listing jumps 3035->3042; identity-gauge assignments
 // defining A0..B2 appear elided here as well -- confirm.
3042 
3043  o2_00_re += A0_re;
3044  o2_00_im += A0_im;
3045  o2_10_re += B0_re;
3046  o2_10_im += B0_im;
3047 
3048  o2_01_re += A1_re;
3049  o2_01_im += A1_im;
3050  o2_11_re += B1_re;
3051  o2_11_im += B1_im;
3052 
3053  o2_02_re += A2_re;
3054  o2_02_im += A2_im;
3055  o2_12_re += B2_re;
3056  o2_12_im += B2_im;
3057 
3058  }
 // Generic path: load the neighbor's forward link and apply its dagger
 // (the gT* accessors below address the conjugate transpose).
3059  } else {
3060  // read gauge matrix from device memory
3061  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
3062 
3063  // reconstruct gauge matrix
 // NOTE(review): listing jumps 3063->3065; the RECONSTRUCT_GAUGE_MATRIX
 // macro call appears to have been elided -- confirm.
3065 
3066  {
3067 #ifdef MULTI_GPU
3068  if (kernel_type == INTERIOR_KERNEL) {
3069 #endif
3070 
3071  // read flavor 1 from device memory
3072  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
3073 
3074  // project spinor into half spinors
3075  a0_re = +2*i00_re;
3076  a0_im = +2*i00_im;
3077  a1_re = +2*i01_re;
3078  a1_im = +2*i01_im;
3079  a2_re = +2*i02_re;
3080  a2_im = +2*i02_im;
3081  b0_re = +2*i10_re;
3082  b0_im = +2*i10_im;
3083  b1_re = +2*i11_re;
3084  b1_im = +2*i11_im;
3085  b2_re = +2*i12_re;
3086  b2_im = +2*i12_im;
3087 
3088 #ifdef MULTI_GPU
3089  } else {
3090 
3091  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
3092  const int t_proj_scale = TPROJSCALE;
3093 
3094  // read half spinor for the first flavor from device memory
3095  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
3096 
3097  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
3098  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
3099  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
3100  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
3101  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
3102  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
3103 
3104  }
3105 #endif // MULTI_GPU
3106 
 // Complex 3x3 matrix-vector products with the daggered link:
 // A = gT*a, B = gT*b, expanded into real/imaginary scalar FMAs.
3107  // multiply row 0
3108  spinorFloat A0_re = 0;
3109  A0_re += gT00_re * a0_re;
3110  A0_re -= gT00_im * a0_im;
3111  A0_re += gT01_re * a1_re;
3112  A0_re -= gT01_im * a1_im;
3113  A0_re += gT02_re * a2_re;
3114  A0_re -= gT02_im * a2_im;
3115  spinorFloat A0_im = 0;
3116  A0_im += gT00_re * a0_im;
3117  A0_im += gT00_im * a0_re;
3118  A0_im += gT01_re * a1_im;
3119  A0_im += gT01_im * a1_re;
3120  A0_im += gT02_re * a2_im;
3121  A0_im += gT02_im * a2_re;
3122  spinorFloat B0_re = 0;
3123  B0_re += gT00_re * b0_re;
3124  B0_re -= gT00_im * b0_im;
3125  B0_re += gT01_re * b1_re;
3126  B0_re -= gT01_im * b1_im;
3127  B0_re += gT02_re * b2_re;
3128  B0_re -= gT02_im * b2_im;
3129  spinorFloat B0_im = 0;
3130  B0_im += gT00_re * b0_im;
3131  B0_im += gT00_im * b0_re;
3132  B0_im += gT01_re * b1_im;
3133  B0_im += gT01_im * b1_re;
3134  B0_im += gT02_re * b2_im;
3135  B0_im += gT02_im * b2_re;
3136 
3137  // multiply row 1
3138  spinorFloat A1_re = 0;
3139  A1_re += gT10_re * a0_re;
3140  A1_re -= gT10_im * a0_im;
3141  A1_re += gT11_re * a1_re;
3142  A1_re -= gT11_im * a1_im;
3143  A1_re += gT12_re * a2_re;
3144  A1_re -= gT12_im * a2_im;
3145  spinorFloat A1_im = 0;
3146  A1_im += gT10_re * a0_im;
3147  A1_im += gT10_im * a0_re;
3148  A1_im += gT11_re * a1_im;
3149  A1_im += gT11_im * a1_re;
3150  A1_im += gT12_re * a2_im;
3151  A1_im += gT12_im * a2_re;
3152  spinorFloat B1_re = 0;
3153  B1_re += gT10_re * b0_re;
3154  B1_re -= gT10_im * b0_im;
3155  B1_re += gT11_re * b1_re;
3156  B1_re -= gT11_im * b1_im;
3157  B1_re += gT12_re * b2_re;
3158  B1_re -= gT12_im * b2_im;
3159  spinorFloat B1_im = 0;
3160  B1_im += gT10_re * b0_im;
3161  B1_im += gT10_im * b0_re;
3162  B1_im += gT11_re * b1_im;
3163  B1_im += gT11_im * b1_re;
3164  B1_im += gT12_re * b2_im;
3165  B1_im += gT12_im * b2_re;
3166 
3167  // multiply row 2
3168  spinorFloat A2_re = 0;
3169  A2_re += gT20_re * a0_re;
3170  A2_re -= gT20_im * a0_im;
3171  A2_re += gT21_re * a1_re;
3172  A2_re -= gT21_im * a1_im;
3173  A2_re += gT22_re * a2_re;
3174  A2_re -= gT22_im * a2_im;
3175  spinorFloat A2_im = 0;
3176  A2_im += gT20_re * a0_im;
3177  A2_im += gT20_im * a0_re;
3178  A2_im += gT21_re * a1_im;
3179  A2_im += gT21_im * a1_re;
3180  A2_im += gT22_re * a2_im;
3181  A2_im += gT22_im * a2_re;
3182  spinorFloat B2_re = 0;
3183  B2_re += gT20_re * b0_re;
3184  B2_re -= gT20_im * b0_im;
3185  B2_re += gT21_re * b1_re;
3186  B2_re -= gT21_im * b1_im;
3187  B2_re += gT22_re * b2_re;
3188  B2_re -= gT22_im * b2_im;
3189  spinorFloat B2_im = 0;
3190  B2_im += gT20_re * b0_im;
3191  B2_im += gT20_im * b0_re;
3192  B2_im += gT21_re * b1_im;
3193  B2_im += gT21_im * b1_re;
3194  B2_im += gT22_re * b2_im;
3195  B2_im += gT22_im * b2_re;
3196 
 // Accumulate flavor-1 contribution into spin components 0 and 1 only
 // (components 2 and 3 are annihilated by P3+).
3197  o1_00_re += A0_re;
3198  o1_00_im += A0_im;
3199  o1_10_re += B0_re;
3200  o1_10_im += B0_im;
3201 
3202  o1_01_re += A1_re;
3203  o1_01_im += A1_im;
3204  o1_11_re += B1_re;
3205  o1_11_im += B1_im;
3206 
3207  o1_02_re += A2_re;
3208  o1_02_im += A2_im;
3209  o1_12_re += B2_re;
3210  o1_12_im += B2_im;
3211 
3212  }
3213  {
3214 #ifdef MULTI_GPU
3215  if (kernel_type == INTERIOR_KERNEL) {
3216 #endif
3217 
3218  // read flavor 2 from device memory
3219  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
3220 
3221  // project spinor into half spinors
3222  a0_re = +2*i00_re;
3223  a0_im = +2*i00_im;
3224  a1_re = +2*i01_re;
3225  a1_im = +2*i01_im;
3226  a2_re = +2*i02_re;
3227  a2_im = +2*i02_im;
3228  b0_re = +2*i10_re;
3229  b0_im = +2*i10_im;
3230  b1_re = +2*i11_re;
3231  b1_im = +2*i11_im;
3232  b2_re = +2*i12_re;
3233  b2_im = +2*i12_im;
3234 
3235 #ifdef MULTI_GPU
3236  } else {
3237 
3238  const int sp_stride_pad = FLAVORS*param.dc.ghostFace[static_cast<int>(kernel_type)];
3239  const int t_proj_scale = TPROJSCALE;
3240 
3241  // read half spinor for the second flavor from device memory
3242  const int fl_idx = sp_idx + param.dc.ghostFace[static_cast<int>(kernel_type)];
3243  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+param.dc.ghostFace[static_cast<int>(kernel_type)],7);
3244 
3245  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
3246  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
3247  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
3248  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
3249  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
3250  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
3251 
3252  }
3253 #endif // MULTI_GPU
3254 
3255  // multiply row 0
3256  spinorFloat A0_re = 0;
3257  A0_re += gT00_re * a0_re;
3258  A0_re -= gT00_im * a0_im;
3259  A0_re += gT01_re * a1_re;
3260  A0_re -= gT01_im * a1_im;
3261  A0_re += gT02_re * a2_re;
3262  A0_re -= gT02_im * a2_im;
3263  spinorFloat A0_im = 0;
3264  A0_im += gT00_re * a0_im;
3265  A0_im += gT00_im * a0_re;
3266  A0_im += gT01_re * a1_im;
3267  A0_im += gT01_im * a1_re;
3268  A0_im += gT02_re * a2_im;
3269  A0_im += gT02_im * a2_re;
3270  spinorFloat B0_re = 0;
3271  B0_re += gT00_re * b0_re;
3272  B0_re -= gT00_im * b0_im;
3273  B0_re += gT01_re * b1_re;
3274  B0_re -= gT01_im * b1_im;
3275  B0_re += gT02_re * b2_re;
3276  B0_re -= gT02_im * b2_im;
3277  spinorFloat B0_im = 0;
3278  B0_im += gT00_re * b0_im;
3279  B0_im += gT00_im * b0_re;
3280  B0_im += gT01_re * b1_im;
3281  B0_im += gT01_im * b1_re;
3282  B0_im += gT02_re * b2_im;
3283  B0_im += gT02_im * b2_re;
3284 
3285  // multiply row 1
3286  spinorFloat A1_re = 0;
3287  A1_re += gT10_re * a0_re;
3288  A1_re -= gT10_im * a0_im;
3289  A1_re += gT11_re * a1_re;
3290  A1_re -= gT11_im * a1_im;
3291  A1_re += gT12_re * a2_re;
3292  A1_re -= gT12_im * a2_im;
3293  spinorFloat A1_im = 0;
3294  A1_im += gT10_re * a0_im;
3295  A1_im += gT10_im * a0_re;
3296  A1_im += gT11_re * a1_im;
3297  A1_im += gT11_im * a1_re;
3298  A1_im += gT12_re * a2_im;
3299  A1_im += gT12_im * a2_re;
3300  spinorFloat B1_re = 0;
3301  B1_re += gT10_re * b0_re;
3302  B1_re -= gT10_im * b0_im;
3303  B1_re += gT11_re * b1_re;
3304  B1_re -= gT11_im * b1_im;
3305  B1_re += gT12_re * b2_re;
3306  B1_re -= gT12_im * b2_im;
3307  spinorFloat B1_im = 0;
3308  B1_im += gT10_re * b0_im;
3309  B1_im += gT10_im * b0_re;
3310  B1_im += gT11_re * b1_im;
3311  B1_im += gT11_im * b1_re;
3312  B1_im += gT12_re * b2_im;
3313  B1_im += gT12_im * b2_re;
3314 
3315  // multiply row 2
3316  spinorFloat A2_re = 0;
3317  A2_re += gT20_re * a0_re;
3318  A2_re -= gT20_im * a0_im;
3319  A2_re += gT21_re * a1_re;
3320  A2_re -= gT21_im * a1_im;
3321  A2_re += gT22_re * a2_re;
3322  A2_re -= gT22_im * a2_im;
3323  spinorFloat A2_im = 0;
3324  A2_im += gT20_re * a0_im;
3325  A2_im += gT20_im * a0_re;
3326  A2_im += gT21_re * a1_im;
3327  A2_im += gT21_im * a1_re;
3328  A2_im += gT22_re * a2_im;
3329  A2_im += gT22_im * a2_re;
3330  spinorFloat B2_re = 0;
3331  B2_re += gT20_re * b0_re;
3332  B2_re -= gT20_im * b0_im;
3333  B2_re += gT21_re * b1_re;
3334  B2_re -= gT21_im * b1_im;
3335  B2_re += gT22_re * b2_re;
3336  B2_re -= gT22_im * b2_im;
3337  spinorFloat B2_im = 0;
3338  B2_im += gT20_re * b0_im;
3339  B2_im += gT20_im * b0_re;
3340  B2_im += gT21_re * b1_im;
3341  B2_im += gT21_im * b1_re;
3342  B2_im += gT22_re * b2_im;
3343  B2_im += gT22_im * b2_re;
3344 
 // Accumulate flavor-2 contribution into spin components 0 and 1.
3345  o2_00_re += A0_re;
3346  o2_00_im += A0_im;
3347  o2_10_re += B0_re;
3348  o2_10_im += B0_im;
3349 
3350  o2_01_re += A1_re;
3351  o2_01_im += A1_im;
3352  o2_11_re += B1_re;
3353  o2_11_im += B1_im;
3354 
3355  o2_02_re += A2_re;
3356  o2_02_im += A2_im;
3357  o2_12_re += B2_re;
3358  o2_12_im += B2_im;
3359 
3360  }
3361  }
3362 }
3363 
3364 #ifdef MULTI_GPU
3365 
3366 int incomplete = 0; // Have all 8 contributions been computed for this site?
3367 
3368 switch(kernel_type) { // intentional fall-through
3369 case INTERIOR_KERNEL:
3370  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
3371 case EXTERIOR_KERNEL_T:
3372  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
3373 case EXTERIOR_KERNEL_Z:
3374  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
3375 case EXTERIOR_KERNEL_Y:
3376  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
3377 }
3378 
3379 
3380 if (!incomplete)
3381 #endif // MULTI_GPU
3382 // apply twisted mass rotation
3383 {
3384 
3385 #ifdef DSLASH_TWIST
3386  {
3387 #ifdef SPINOR_DOUBLE
3388  const spinorFloat a = param.a;
3389  const spinorFloat b = param.b;
3390 #else
3391  const spinorFloat a = param.a_f;
3392  const spinorFloat b = param.b_f;
3393 #endif
3394  //Perform twist rotation first:
3395  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3396  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3397  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3398 
3399  x1_re = 0.0, x1_im = 0.0;
3400  y1_re = 0.0, y1_im = 0.0;
3401  x2_re = 0.0, x2_im = 0.0;
3402  y2_re = 0.0, y2_im = 0.0;
3403 
3404 
3405  // using o1 regs:
3406  x1_re = o1_00_re + a *o1_20_im;
3407  x1_im = o1_00_im - a *o1_20_re;
3408  x2_re = b * o1_00_re;
3409  x2_im = b * o1_00_im;
3410 
3411  y1_re = o1_20_re + a *o1_00_im;
3412  y1_im = o1_20_im - a *o1_00_re;
3413  y2_re = b * o1_20_re;
3414  y2_im = b * o1_20_im;
3415 
3416 
3417  // using o2 regs:
3418  x2_re += o2_00_re - a *o2_20_im;
3419  x2_im += o2_00_im + a *o2_20_re;
3420  x1_re += b * o2_00_re;
3421  x1_im += b * o2_00_im;
3422 
3423  y2_re += o2_20_re - a *o2_00_im;
3424  y2_im += o2_20_im + a *o2_00_re;
3425  y1_re += b * o2_20_re;
3426  y1_im += b * o2_20_im;
3427 
3428 
3429  o1_00_re = x1_re; o1_00_im = x1_im;
3430  o1_20_re = y1_re; o1_20_im = y1_im;
3431 
3432  o2_00_re = x2_re; o2_00_im = x2_im;
3433  o2_20_re = y2_re; o2_20_im = y2_im;
3434 
3435  // using o1 regs:
3436  x1_re = o1_10_re + a *o1_30_im;
3437  x1_im = o1_10_im - a *o1_30_re;
3438  x2_re = b * o1_10_re;
3439  x2_im = b * o1_10_im;
3440 
3441  y1_re = o1_30_re + a *o1_10_im;
3442  y1_im = o1_30_im - a *o1_10_re;
3443  y2_re = b * o1_30_re;
3444  y2_im = b * o1_30_im;
3445 
3446 
3447  // using o2 regs:
3448  x2_re += o2_10_re - a *o2_30_im;
3449  x2_im += o2_10_im + a *o2_30_re;
3450  x1_re += b * o2_10_re;
3451  x1_im += b * o2_10_im;
3452 
3453  y2_re += o2_30_re - a *o2_10_im;
3454  y2_im += o2_30_im + a *o2_10_re;
3455  y1_re += b * o2_30_re;
3456  y1_im += b * o2_30_im;
3457 
3458 
3459  o1_10_re = x1_re; o1_10_im = x1_im;
3460  o1_30_re = y1_re; o1_30_im = y1_im;
3461 
3462  o2_10_re = x2_re; o2_10_im = x2_im;
3463  o2_30_re = y2_re; o2_30_im = y2_im;
3464 
3465  // using o1 regs:
3466  x1_re = o1_01_re + a *o1_21_im;
3467  x1_im = o1_01_im - a *o1_21_re;
3468  x2_re = b * o1_01_re;
3469  x2_im = b * o1_01_im;
3470 
3471  y1_re = o1_21_re + a *o1_01_im;
3472  y1_im = o1_21_im - a *o1_01_re;
3473  y2_re = b * o1_21_re;
3474  y2_im = b * o1_21_im;
3475 
3476 
3477  // using o2 regs:
3478  x2_re += o2_01_re - a *o2_21_im;
3479  x2_im += o2_01_im + a *o2_21_re;
3480  x1_re += b * o2_01_re;
3481  x1_im += b * o2_01_im;
3482 
3483  y2_re += o2_21_re - a *o2_01_im;
3484  y2_im += o2_21_im + a *o2_01_re;
3485  y1_re += b * o2_21_re;
3486  y1_im += b * o2_21_im;
3487 
3488 
3489  o1_01_re = x1_re; o1_01_im = x1_im;
3490  o1_21_re = y1_re; o1_21_im = y1_im;
3491 
3492  o2_01_re = x2_re; o2_01_im = x2_im;
3493  o2_21_re = y2_re; o2_21_im = y2_im;
3494 
3495  // using o1 regs:
3496  x1_re = o1_11_re + a *o1_31_im;
3497  x1_im = o1_11_im - a *o1_31_re;
3498  x2_re = b * o1_11_re;
3499  x2_im = b * o1_11_im;
3500 
3501  y1_re = o1_31_re + a *o1_11_im;
3502  y1_im = o1_31_im - a *o1_11_re;
3503  y2_re = b * o1_31_re;
3504  y2_im = b * o1_31_im;
3505 
3506 
3507  // using o2 regs:
3508  x2_re += o2_11_re - a *o2_31_im;
3509  x2_im += o2_11_im + a *o2_31_re;
3510  x1_re += b * o2_11_re;
3511  x1_im += b * o2_11_im;
3512 
3513  y2_re += o2_31_re - a *o2_11_im;
3514  y2_im += o2_31_im + a *o2_11_re;
3515  y1_re += b * o2_31_re;
3516  y1_im += b * o2_31_im;
3517 
3518 
3519  o1_11_re = x1_re; o1_11_im = x1_im;
3520  o1_31_re = y1_re; o1_31_im = y1_im;
3521 
3522  o2_11_re = x2_re; o2_11_im = x2_im;
3523  o2_31_re = y2_re; o2_31_im = y2_im;
3524 
3525  // using o1 regs:
3526  x1_re = o1_02_re + a *o1_22_im;
3527  x1_im = o1_02_im - a *o1_22_re;
3528  x2_re = b * o1_02_re;
3529  x2_im = b * o1_02_im;
3530 
3531  y1_re = o1_22_re + a *o1_02_im;
3532  y1_im = o1_22_im - a *o1_02_re;
3533  y2_re = b * o1_22_re;
3534  y2_im = b * o1_22_im;
3535 
3536 
3537  // using o2 regs:
3538  x2_re += o2_02_re - a *o2_22_im;
3539  x2_im += o2_02_im + a *o2_22_re;
3540  x1_re += b * o2_02_re;
3541  x1_im += b * o2_02_im;
3542 
3543  y2_re += o2_22_re - a *o2_02_im;
3544  y2_im += o2_22_im + a *o2_02_re;
3545  y1_re += b * o2_22_re;
3546  y1_im += b * o2_22_im;
3547 
3548 
3549  o1_02_re = x1_re; o1_02_im = x1_im;
3550  o1_22_re = y1_re; o1_22_im = y1_im;
3551 
3552  o2_02_re = x2_re; o2_02_im = x2_im;
3553  o2_22_re = y2_re; o2_22_im = y2_im;
3554 
3555  // using o1 regs:
3556  x1_re = o1_12_re + a *o1_32_im;
3557  x1_im = o1_12_im - a *o1_32_re;
3558  x2_re = b * o1_12_re;
3559  x2_im = b * o1_12_im;
3560 
3561  y1_re = o1_32_re + a *o1_12_im;
3562  y1_im = o1_32_im - a *o1_12_re;
3563  y2_re = b * o1_32_re;
3564  y2_im = b * o1_32_im;
3565 
3566 
3567  // using o2 regs:
3568  x2_re += o2_12_re - a *o2_32_im;
3569  x2_im += o2_12_im + a *o2_32_re;
3570  x1_re += b * o2_12_re;
3571  x1_im += b * o2_12_im;
3572 
3573  y2_re += o2_32_re - a *o2_12_im;
3574  y2_im += o2_32_im + a *o2_12_re;
3575  y1_re += b * o2_32_re;
3576  y1_im += b * o2_32_im;
3577 
3578 
3579  o1_12_re = x1_re; o1_12_im = x1_im;
3580  o1_32_re = y1_re; o1_32_im = y1_im;
3581 
3582  o2_12_re = x2_re; o2_12_im = x2_im;
3583  o2_32_re = y2_re; o2_32_im = y2_im;
3584 
3585  }
3586 #endif
3587 
3588 #if !defined(DSLASH_XPAY) || defined(DSLASH_TWIST)
3589 #ifdef SPINOR_DOUBLE
3590  const spinorFloat c = param.c;
3591 #else
3592  const spinorFloat c = param.c_f;
3593 #endif
3594 #endif
3595 #ifndef DSLASH_XPAY
3596  o1_00_re *= c;
3597  o1_00_im *= c;
3598  o1_01_re *= c;
3599  o1_01_im *= c;
3600  o1_02_re *= c;
3601  o1_02_im *= c;
3602  o1_10_re *= c;
3603  o1_10_im *= c;
3604  o1_11_re *= c;
3605  o1_11_im *= c;
3606  o1_12_re *= c;
3607  o1_12_im *= c;
3608  o1_20_re *= c;
3609  o1_20_im *= c;
3610  o1_21_re *= c;
3611  o1_21_im *= c;
3612  o1_22_re *= c;
3613  o1_22_im *= c;
3614  o1_30_re *= c;
3615  o1_30_im *= c;
3616  o1_31_re *= c;
3617  o1_31_im *= c;
3618  o1_32_re *= c;
3619  o1_32_im *= c;
3620 
3621  o2_00_re *= c;
3622  o2_00_im *= c;
3623  o2_01_re *= c;
3624  o2_01_im *= c;
3625  o2_02_re *= c;
3626  o2_02_im *= c;
3627  o2_10_re *= c;
3628  o2_10_im *= c;
3629  o2_11_re *= c;
3630  o2_11_im *= c;
3631  o2_12_re *= c;
3632  o2_12_im *= c;
3633  o2_20_re *= c;
3634  o2_20_im *= c;
3635  o2_21_re *= c;
3636  o2_21_im *= c;
3637  o2_22_re *= c;
3638  o2_22_im *= c;
3639  o2_30_re *= c;
3640  o2_30_im *= c;
3641  o2_31_re *= c;
3642  o2_31_im *= c;
3643  o2_32_re *= c;
3644  o2_32_im *= c;
3645 #else
3646 #ifdef DSLASH_TWIST
3647  // accum spinor
3648 #ifdef SPINOR_DOUBLE
3649 
3650 #define acc_00_re accum0.x
3651 #define acc_00_im accum0.y
3652 #define acc_01_re accum1.x
3653 #define acc_01_im accum1.y
3654 #define acc_02_re accum2.x
3655 #define acc_02_im accum2.y
3656 #define acc_10_re accum3.x
3657 #define acc_10_im accum3.y
3658 #define acc_11_re accum4.x
3659 #define acc_11_im accum4.y
3660 #define acc_12_re accum5.x
3661 #define acc_12_im accum5.y
3662 #define acc_20_re accum6.x
3663 #define acc_20_im accum6.y
3664 #define acc_21_re accum7.x
3665 #define acc_21_im accum7.y
3666 #define acc_22_re accum8.x
3667 #define acc_22_im accum8.y
3668 #define acc_30_re accum9.x
3669 #define acc_30_im accum9.y
3670 #define acc_31_re accum10.x
3671 #define acc_31_im accum10.y
3672 #define acc_32_re accum11.x
3673 #define acc_32_im accum11.y
3674 
3675 #else
3676 #define acc_00_re accum0.x
3677 #define acc_00_im accum0.y
3678 #define acc_01_re accum0.z
3679 #define acc_01_im accum0.w
3680 #define acc_02_re accum1.x
3681 #define acc_02_im accum1.y
3682 #define acc_10_re accum1.z
3683 #define acc_10_im accum1.w
3684 #define acc_11_re accum2.x
3685 #define acc_11_im accum2.y
3686 #define acc_12_re accum2.z
3687 #define acc_12_im accum2.w
3688 #define acc_20_re accum3.x
3689 #define acc_20_im accum3.y
3690 #define acc_21_re accum3.z
3691 #define acc_21_im accum3.w
3692 #define acc_22_re accum4.x
3693 #define acc_22_im accum4.y
3694 #define acc_30_re accum4.z
3695 #define acc_30_im accum4.w
3696 #define acc_31_re accum5.x
3697 #define acc_31_im accum5.y
3698 #define acc_32_re accum5.z
3699 #define acc_32_im accum5.w
3700 
3701 #endif // SPINOR_DOUBLE
3702 
3703  {
3704  READ_ACCUM(ACCUMTEX, param.sp_stride)
3705 
3706  o1_00_re = c*o1_00_re + acc_00_re;
3707  o1_00_im = c*o1_00_im + acc_00_im;
3708  o1_01_re = c*o1_01_re + acc_01_re;
3709  o1_01_im = c*o1_01_im + acc_01_im;
3710  o1_02_re = c*o1_02_re + acc_02_re;
3711  o1_02_im = c*o1_02_im + acc_02_im;
3712  o1_10_re = c*o1_10_re + acc_10_re;
3713  o1_10_im = c*o1_10_im + acc_10_im;
3714  o1_11_re = c*o1_11_re + acc_11_re;
3715  o1_11_im = c*o1_11_im + acc_11_im;
3716  o1_12_re = c*o1_12_re + acc_12_re;
3717  o1_12_im = c*o1_12_im + acc_12_im;
3718  o1_20_re = c*o1_20_re + acc_20_re;
3719  o1_20_im = c*o1_20_im + acc_20_im;
3720  o1_21_re = c*o1_21_re + acc_21_re;
3721  o1_21_im = c*o1_21_im + acc_21_im;
3722  o1_22_re = c*o1_22_re + acc_22_re;
3723  o1_22_im = c*o1_22_im + acc_22_im;
3724  o1_30_re = c*o1_30_re + acc_30_re;
3725  o1_30_im = c*o1_30_im + acc_30_im;
3726  o1_31_re = c*o1_31_re + acc_31_re;
3727  o1_31_im = c*o1_31_im + acc_31_im;
3728  o1_32_re = c*o1_32_re + acc_32_re;
3729  o1_32_im = c*o1_32_im + acc_32_im;
3730 
3731  ASSN_ACCUM(ACCUMTEX, param.sp_stride, param.fl_stride)
3732 
3733  o2_00_re = c*o2_00_re + acc_00_re;
3734  o2_00_im = c*o2_00_im + acc_00_im;
3735  o2_01_re = c*o2_01_re + acc_01_re;
3736  o2_01_im = c*o2_01_im + acc_01_im;
3737  o2_02_re = c*o2_02_re + acc_02_re;
3738  o2_02_im = c*o2_02_im + acc_02_im;
3739  o2_10_re = c*o2_10_re + acc_10_re;
3740  o2_10_im = c*o2_10_im + acc_10_im;
3741  o2_11_re = c*o2_11_re + acc_11_re;
3742  o2_11_im = c*o2_11_im + acc_11_im;
3743  o2_12_re = c*o2_12_re + acc_12_re;
3744  o2_12_im = c*o2_12_im + acc_12_im;
3745  o2_20_re = c*o2_20_re + acc_20_re;
3746  o2_20_im = c*o2_20_im + acc_20_im;
3747  o2_21_re = c*o2_21_re + acc_21_re;
3748  o2_21_im = c*o2_21_im + acc_21_im;
3749  o2_22_re = c*o2_22_re + acc_22_re;
3750  o2_22_im = c*o2_22_im + acc_22_im;
3751  o2_30_re = c*o2_30_re + acc_30_re;
3752  o2_30_im = c*o2_30_im + acc_30_im;
3753  o2_31_re = c*o2_31_re + acc_31_re;
3754  o2_31_im = c*o2_31_im + acc_31_im;
3755  o2_32_re = c*o2_32_re + acc_32_re;
3756  o2_32_im = c*o2_32_im + acc_32_im;
3757  }
3758 
3759 #undef acc_00_re
3760 #undef acc_00_im
3761 #undef acc_01_re
3762 #undef acc_01_im
3763 #undef acc_02_re
3764 #undef acc_02_im
3765 #undef acc_10_re
3766 #undef acc_10_im
3767 #undef acc_11_re
3768 #undef acc_11_im
3769 #undef acc_12_re
3770 #undef acc_12_im
3771 #undef acc_20_re
3772 #undef acc_20_im
3773 #undef acc_21_re
3774 #undef acc_21_im
3775 #undef acc_22_re
3776 #undef acc_22_im
3777 #undef acc_30_re
3778 #undef acc_30_im
3779 #undef acc_31_re
3780 #undef acc_31_im
3781 #undef acc_32_re
3782 #undef acc_32_im
3783 
3784 #else
3785  // accum spinor
3786 #ifdef SPINOR_DOUBLE
3787 
3788 #define acc1_00_re flv1_accum0.x
3789 #define acc1_00_im flv1_accum0.y
3790 #define acc1_01_re flv1_accum1.x
3791 #define acc1_01_im flv1_accum1.y
3792 #define acc1_02_re flv1_accum2.x
3793 #define acc1_02_im flv1_accum2.y
3794 #define acc1_10_re flv1_accum3.x
3795 #define acc1_10_im flv1_accum3.y
3796 #define acc1_11_re flv1_accum4.x
3797 #define acc1_11_im flv1_accum4.y
3798 #define acc1_12_re flv1_accum5.x
3799 #define acc1_12_im flv1_accum5.y
3800 #define acc1_20_re flv1_accum6.x
3801 #define acc1_20_im flv1_accum6.y
3802 #define acc1_21_re flv1_accum7.x
3803 #define acc1_21_im flv1_accum7.y
3804 #define acc1_22_re flv1_accum8.x
3805 #define acc1_22_im flv1_accum8.y
3806 #define acc1_30_re flv1_accum9.x
3807 #define acc1_30_im flv1_accum9.y
3808 #define acc1_31_re flv1_accum10.x
3809 #define acc1_31_im flv1_accum10.y
3810 #define acc1_32_re flv1_accum11.x
3811 #define acc1_32_im flv1_accum11.y
3812 
3813 #define acc2_00_re flv2_accum0.x
3814 #define acc2_00_im flv2_accum0.y
3815 #define acc2_01_re flv2_accum1.x
3816 #define acc2_01_im flv2_accum1.y
3817 #define acc2_02_re flv2_accum2.x
3818 #define acc2_02_im flv2_accum2.y
3819 #define acc2_10_re flv2_accum3.x
3820 #define acc2_10_im flv2_accum3.y
3821 #define acc2_11_re flv2_accum4.x
3822 #define acc2_11_im flv2_accum4.y
3823 #define acc2_12_re flv2_accum5.x
3824 #define acc2_12_im flv2_accum5.y
3825 #define acc2_20_re flv2_accum6.x
3826 #define acc2_20_im flv2_accum6.y
3827 #define acc2_21_re flv2_accum7.x
3828 #define acc2_21_im flv2_accum7.y
3829 #define acc2_22_re flv2_accum8.x
3830 #define acc2_22_im flv2_accum8.y
3831 #define acc2_30_re flv2_accum9.x
3832 #define acc2_30_im flv2_accum9.y
3833 #define acc2_31_re flv2_accum10.x
3834 #define acc2_31_im flv2_accum10.y
3835 #define acc2_32_re flv2_accum11.x
3836 #define acc2_32_im flv2_accum11.y
3837 
3838 #else
3839 
3840 #define acc1_00_re flv1_accum0.x
3841 #define acc1_00_im flv1_accum0.y
3842 #define acc1_01_re flv1_accum0.z
3843 #define acc1_01_im flv1_accum0.w
3844 #define acc1_02_re flv1_accum1.x
3845 #define acc1_02_im flv1_accum1.y
3846 #define acc1_10_re flv1_accum1.z
3847 #define acc1_10_im flv1_accum1.w
3848 #define acc1_11_re flv1_accum2.x
3849 #define acc1_11_im flv1_accum2.y
3850 #define acc1_12_re flv1_accum2.z
3851 #define acc1_12_im flv1_accum2.w
3852 #define acc1_20_re flv1_accum3.x
3853 #define acc1_20_im flv1_accum3.y
3854 #define acc1_21_re flv1_accum3.z
3855 #define acc1_21_im flv1_accum3.w
3856 #define acc1_22_re flv1_accum4.x
3857 #define acc1_22_im flv1_accum4.y
3858 #define acc1_30_re flv1_accum4.z
3859 #define acc1_30_im flv1_accum4.w
3860 #define acc1_31_re flv1_accum5.x
3861 #define acc1_31_im flv1_accum5.y
3862 #define acc1_32_re flv1_accum5.z
3863 #define acc1_32_im flv1_accum5.w
3864 
3865 #define acc2_00_re flv2_accum0.x
3866 #define acc2_00_im flv2_accum0.y
3867 #define acc2_01_re flv2_accum0.z
3868 #define acc2_01_im flv2_accum0.w
3869 #define acc2_02_re flv2_accum1.x
3870 #define acc2_02_im flv2_accum1.y
3871 #define acc2_10_re flv2_accum1.z
3872 #define acc2_10_im flv2_accum1.w
3873 #define acc2_11_re flv2_accum2.x
3874 #define acc2_11_im flv2_accum2.y
3875 #define acc2_12_re flv2_accum2.z
3876 #define acc2_12_im flv2_accum2.w
3877 #define acc2_20_re flv2_accum3.x
3878 #define acc2_20_im flv2_accum3.y
3879 #define acc2_21_re flv2_accum3.z
3880 #define acc2_21_im flv2_accum3.w
3881 #define acc2_22_re flv2_accum4.x
3882 #define acc2_22_im flv2_accum4.y
3883 #define acc2_30_re flv2_accum4.z
3884 #define acc2_30_im flv2_accum4.w
3885 #define acc2_31_re flv2_accum5.x
3886 #define acc2_31_im flv2_accum5.y
3887 #define acc2_32_re flv2_accum5.z
3888 #define acc2_32_im flv2_accum5.w
3889 
3890 #endif // SPINOR_DOUBLE
3891 
3892  {
3893  READ_ACCUM_FLAVOR(ACCUMTEX, param.sp_stride, param.fl_stride)
3894 
3895 #ifdef SPINOR_DOUBLE
3896  const spinorFloat a = param.a;
3897  const spinorFloat b = param.b;
3898 #else
3899  const spinorFloat a = param.a_f;
3900  const spinorFloat b = param.b_f;
3901 #endif
3902  //Perform twist rotation:
3903  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3904  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3905  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3906 
3907  x1_re = 0.0, x1_im = 0.0;
3908  y1_re = 0.0, y1_im = 0.0;
3909  x2_re = 0.0, x2_im = 0.0;
3910  y2_re = 0.0, y2_im = 0.0;
3911 
3912 
3913  // using acc1 regs:
3914  x1_re = acc1_00_re + a *acc1_20_im;
3915  x1_im = acc1_00_im - a *acc1_20_re;
3916  x2_re = b * acc1_00_re;
3917  x2_im = b * acc1_00_im;
3918 
3919  y1_re = acc1_20_re + a *acc1_00_im;
3920  y1_im = acc1_20_im - a *acc1_00_re;
3921  y2_re = b * acc1_20_re;
3922  y2_im = b * acc1_20_im;
3923 
3924 
3925  // using acc2 regs:
3926  x2_re += acc2_00_re - a *acc2_20_im;
3927  x2_im += acc2_00_im + a *acc2_20_re;
3928  x1_re += b * acc2_00_re;
3929  x1_im += b * acc2_00_im;
3930 
3931  y2_re += acc2_20_re - a *acc2_00_im;
3932  y2_im += acc2_20_im + a *acc2_00_re;
3933  y1_re += b * acc2_20_re;
3934  y1_im += b * acc2_20_im;
3935 
3936 
3937  acc1_00_re = x1_re; acc1_00_im = x1_im;
3938  acc1_20_re = y1_re; acc1_20_im = y1_im;
3939 
3940  acc2_00_re = x2_re; acc2_00_im = x2_im;
3941  acc2_20_re = y2_re; acc2_20_im = y2_im;
3942 
3943  // using acc1 regs:
3944  x1_re = acc1_10_re + a *acc1_30_im;
3945  x1_im = acc1_10_im - a *acc1_30_re;
3946  x2_re = b * acc1_10_re;
3947  x2_im = b * acc1_10_im;
3948 
3949  y1_re = acc1_30_re + a *acc1_10_im;
3950  y1_im = acc1_30_im - a *acc1_10_re;
3951  y2_re = b * acc1_30_re;
3952  y2_im = b * acc1_30_im;
3953 
3954 
3955  // using acc2 regs:
3956  x2_re += acc2_10_re - a *acc2_30_im;
3957  x2_im += acc2_10_im + a *acc2_30_re;
3958  x1_re += b * acc2_10_re;
3959  x1_im += b * acc2_10_im;
3960 
3961  y2_re += acc2_30_re - a *acc2_10_im;
3962  y2_im += acc2_30_im + a *acc2_10_re;
3963  y1_re += b * acc2_30_re;
3964  y1_im += b * acc2_30_im;
3965 
3966 
3967  acc1_10_re = x1_re; acc1_10_im = x1_im;
3968  acc1_30_re = y1_re; acc1_30_im = y1_im;
3969 
3970  acc2_10_re = x2_re; acc2_10_im = x2_im;
3971  acc2_30_re = y2_re; acc2_30_im = y2_im;
3972 
3973  // using acc1 regs:
3974  x1_re = acc1_01_re + a *acc1_21_im;
3975  x1_im = acc1_01_im - a *acc1_21_re;
3976  x2_re = b * acc1_01_re;
3977  x2_im = b * acc1_01_im;
3978 
3979  y1_re = acc1_21_re + a *acc1_01_im;
3980  y1_im = acc1_21_im - a *acc1_01_re;
3981  y2_re = b * acc1_21_re;
3982  y2_im = b * acc1_21_im;
3983 
3984 
3985  // using acc2 regs:
3986  x2_re += acc2_01_re - a *acc2_21_im;
3987  x2_im += acc2_01_im + a *acc2_21_re;
3988  x1_re += b * acc2_01_re;
3989  x1_im += b * acc2_01_im;
3990 
3991  y2_re += acc2_21_re - a *acc2_01_im;
3992  y2_im += acc2_21_im + a *acc2_01_re;
3993  y1_re += b * acc2_21_re;
3994  y1_im += b * acc2_21_im;
3995 
3996 
3997  acc1_01_re = x1_re; acc1_01_im = x1_im;
3998  acc1_21_re = y1_re; acc1_21_im = y1_im;
3999 
4000  acc2_01_re = x2_re; acc2_01_im = x2_im;
4001  acc2_21_re = y2_re; acc2_21_im = y2_im;
4002 
4003  // using acc1 regs:
4004  x1_re = acc1_11_re + a *acc1_31_im;
4005  x1_im = acc1_11_im - a *acc1_31_re;
4006  x2_re = b * acc1_11_re;
4007  x2_im = b * acc1_11_im;
4008 
4009  y1_re = acc1_31_re + a *acc1_11_im;
4010  y1_im = acc1_31_im - a *acc1_11_re;
4011  y2_re = b * acc1_31_re;
4012  y2_im = b * acc1_31_im;
4013 
4014 
4015  // using acc2 regs:
4016  x2_re += acc2_11_re - a *acc2_31_im;
4017  x2_im += acc2_11_im + a *acc2_31_re;
4018  x1_re += b * acc2_11_re;
4019  x1_im += b * acc2_11_im;
4020 
4021  y2_re += acc2_31_re - a *acc2_11_im;
4022  y2_im += acc2_31_im + a *acc2_11_re;
4023  y1_re += b * acc2_31_re;
4024  y1_im += b * acc2_31_im;
4025 
4026 
4027  acc1_11_re = x1_re; acc1_11_im = x1_im;
4028  acc1_31_re = y1_re; acc1_31_im = y1_im;
4029 
4030  acc2_11_re = x2_re; acc2_11_im = x2_im;
4031  acc2_31_re = y2_re; acc2_31_im = y2_im;
4032 
4033  // using acc1 regs:
4034  x1_re = acc1_02_re + a *acc1_22_im;
4035  x1_im = acc1_02_im - a *acc1_22_re;
4036  x2_re = b * acc1_02_re;
4037  x2_im = b * acc1_02_im;
4038 
4039  y1_re = acc1_22_re + a *acc1_02_im;
4040  y1_im = acc1_22_im - a *acc1_02_re;
4041  y2_re = b * acc1_22_re;
4042  y2_im = b * acc1_22_im;
4043 
4044 
4045  // using acc2 regs:
4046  x2_re += acc2_02_re - a *acc2_22_im;
4047  x2_im += acc2_02_im + a *acc2_22_re;
4048  x1_re += b * acc2_02_re;
4049  x1_im += b * acc2_02_im;
4050 
4051  y2_re += acc2_22_re - a *acc2_02_im;
4052  y2_im += acc2_22_im + a *acc2_02_re;
4053  y1_re += b * acc2_22_re;
4054  y1_im += b * acc2_22_im;
4055 
4056 
4057  acc1_02_re = x1_re; acc1_02_im = x1_im;
4058  acc1_22_re = y1_re; acc1_22_im = y1_im;
4059 
4060  acc2_02_re = x2_re; acc2_02_im = x2_im;
4061  acc2_22_re = y2_re; acc2_22_im = y2_im;
4062 
4063  // using acc1 regs:
4064  x1_re = acc1_12_re + a *acc1_32_im;
4065  x1_im = acc1_12_im - a *acc1_32_re;
4066  x2_re = b * acc1_12_re;
4067  x2_im = b * acc1_12_im;
4068 
4069  y1_re = acc1_32_re + a *acc1_12_im;
4070  y1_im = acc1_32_im - a *acc1_12_re;
4071  y2_re = b * acc1_32_re;
4072  y2_im = b * acc1_32_im;
4073 
4074 
4075  // using acc2 regs:
4076  x2_re += acc2_12_re - a *acc2_32_im;
4077  x2_im += acc2_12_im + a *acc2_32_re;
4078  x1_re += b * acc2_12_re;
4079  x1_im += b * acc2_12_im;
4080 
4081  y2_re += acc2_32_re - a *acc2_12_im;
4082  y2_im += acc2_32_im + a *acc2_12_re;
4083  y1_re += b * acc2_32_re;
4084  y1_im += b * acc2_32_im;
4085 
4086 
4087  acc1_12_re = x1_re; acc1_12_im = x1_im;
4088  acc1_32_re = y1_re; acc1_32_im = y1_im;
4089 
4090  acc2_12_re = x2_re; acc2_12_im = x2_im;
4091  acc2_32_re = y2_re; acc2_32_im = y2_im;
4092 
4093 #ifdef SPINOR_DOUBLE
4094  const spinorFloat k = param.d;
4095 #else
4096  const spinorFloat k = param.d_f;
4097 #endif
4098  o1_00_re = k*o1_00_re + acc1_00_re;
4099  o1_00_im = k*o1_00_im + acc1_00_im;
4100  o1_01_re = k*o1_01_re + acc1_01_re;
4101  o1_01_im = k*o1_01_im + acc1_01_im;
4102  o1_02_re = k*o1_02_re + acc1_02_re;
4103  o1_02_im = k*o1_02_im + acc1_02_im;
4104  o1_10_re = k*o1_10_re + acc1_10_re;
4105  o1_10_im = k*o1_10_im + acc1_10_im;
4106  o1_11_re = k*o1_11_re + acc1_11_re;
4107  o1_11_im = k*o1_11_im + acc1_11_im;
4108  o1_12_re = k*o1_12_re + acc1_12_re;
4109  o1_12_im = k*o1_12_im + acc1_12_im;
4110  o1_20_re = k*o1_20_re + acc1_20_re;
4111  o1_20_im = k*o1_20_im + acc1_20_im;
4112  o1_21_re = k*o1_21_re + acc1_21_re;
4113  o1_21_im = k*o1_21_im + acc1_21_im;
4114  o1_22_re = k*o1_22_re + acc1_22_re;
4115  o1_22_im = k*o1_22_im + acc1_22_im;
4116  o1_30_re = k*o1_30_re + acc1_30_re;
4117  o1_30_im = k*o1_30_im + acc1_30_im;
4118  o1_31_re = k*o1_31_re + acc1_31_re;
4119  o1_31_im = k*o1_31_im + acc1_31_im;
4120  o1_32_re = k*o1_32_re + acc1_32_re;
4121  o1_32_im = k*o1_32_im + acc1_32_im;
4122 
4123  o2_00_re = k*o2_00_re + acc2_00_re;
4124  o2_00_im = k*o2_00_im + acc2_00_im;
4125  o2_01_re = k*o2_01_re + acc2_01_re;
4126  o2_01_im = k*o2_01_im + acc2_01_im;
4127  o2_02_re = k*o2_02_re + acc2_02_re;
4128  o2_02_im = k*o2_02_im + acc2_02_im;
4129  o2_10_re = k*o2_10_re + acc2_10_re;
4130  o2_10_im = k*o2_10_im + acc2_10_im;
4131  o2_11_re = k*o2_11_re + acc2_11_re;
4132  o2_11_im = k*o2_11_im + acc2_11_im;
4133  o2_12_re = k*o2_12_re + acc2_12_re;
4134  o2_12_im = k*o2_12_im + acc2_12_im;
4135  o2_20_re = k*o2_20_re + acc2_20_re;
4136  o2_20_im = k*o2_20_im + acc2_20_im;
4137  o2_21_re = k*o2_21_re + acc2_21_re;
4138  o2_21_im = k*o2_21_im + acc2_21_im;
4139  o2_22_re = k*o2_22_re + acc2_22_re;
4140  o2_22_im = k*o2_22_im + acc2_22_im;
4141  o2_30_re = k*o2_30_re + acc2_30_re;
4142  o2_30_im = k*o2_30_im + acc2_30_im;
4143  o2_31_re = k*o2_31_re + acc2_31_re;
4144  o2_31_im = k*o2_31_im + acc2_31_im;
4145  o2_32_re = k*o2_32_re + acc2_32_re;
4146  o2_32_im = k*o2_32_im + acc2_32_im;
4147  }
4148 
4149 #undef acc1_00_re
4150 #undef acc1_00_im
4151 #undef acc1_01_re
4152 #undef acc1_01_im
4153 #undef acc1_02_re
4154 #undef acc1_02_im
4155 #undef acc1_10_re
4156 #undef acc1_10_im
4157 #undef acc1_11_re
4158 #undef acc1_11_im
4159 #undef acc1_12_re
4160 #undef acc1_12_im
4161 #undef acc1_20_re
4162 #undef acc1_20_im
4163 #undef acc1_21_re
4164 #undef acc1_21_im
4165 #undef acc1_22_re
4166 #undef acc1_22_im
4167 #undef acc1_30_re
4168 #undef acc1_30_im
4169 #undef acc1_31_re
4170 #undef acc1_31_im
4171 #undef acc1_32_re
4172 #undef acc1_32_im
4173 
4174 #undef acc2_00_re
4175 #undef acc2_00_im
4176 #undef acc2_01_re
4177 #undef acc2_01_im
4178 #undef acc2_02_re
4179 #undef acc2_02_im
4180 #undef acc2_10_re
4181 #undef acc2_10_im
4182 #undef acc2_11_re
4183 #undef acc2_11_im
4184 #undef acc2_12_re
4185 #undef acc2_12_im
4186 #undef acc2_20_re
4187 #undef acc2_20_im
4188 #undef acc2_21_re
4189 #undef acc2_21_im
4190 #undef acc2_22_re
4191 #undef acc2_22_im
4192 #undef acc2_30_re
4193 #undef acc2_30_im
4194 #undef acc2_31_re
4195 #undef acc2_31_im
4196 #undef acc2_32_re
4197 #undef acc2_32_im
4198 
4199 #endif//DSLASH_TWIST
4200 
4201 #endif // DSLASH_XPAY
4202 }
4203 
4204 // write spinor field back to device memory
4206 
4207 // undefine to prevent warning when precision is changed
4208 #undef spinorFloat
4209 #undef g00_re
4210 #undef g00_im
4211 #undef g01_re
4212 #undef g01_im
4213 #undef g02_re
4214 #undef g02_im
4215 #undef g10_re
4216 #undef g10_im
4217 #undef g11_re
4218 #undef g11_im
4219 #undef g12_re
4220 #undef g12_im
4221 #undef g20_re
4222 #undef g20_im
4223 #undef g21_re
4224 #undef g21_im
4225 #undef g22_re
4226 #undef g22_im
4227 
4228 #undef i00_re
4229 #undef i00_im
4230 #undef i01_re
4231 #undef i01_im
4232 #undef i02_re
4233 #undef i02_im
4234 #undef i10_re
4235 #undef i10_im
4236 #undef i11_re
4237 #undef i11_im
4238 #undef i12_re
4239 #undef i12_im
4240 #undef i20_re
4241 #undef i20_im
4242 #undef i21_re
4243 #undef i21_im
4244 #undef i22_re
4245 #undef i22_im
4246 #undef i30_re
4247 #undef i30_im
4248 #undef i31_re
4249 #undef i31_im
4250 #undef i32_re
4251 #undef i32_im
4252 
4253 
4254 #undef VOLATILE
RECONSTRUCT_GAUGE_MATRIX(0)
#define gT21_im
#define gT12_im
dim3 dim3 blockDim
#define gT01_im
#define i21_re
VOLATILE spinorFloat o2_01_re
spinorFloat a2_im
#define gT00_re
#define i31_im
spinorFloat B0_re
int coord[5]
VOLATILE spinorFloat o1_30_re
spinorFloat b2_re
VOLATILE spinorFloat o1_02_im
spinorFloat A0_im
#define i12_re
#define i02_re
spinorFloat b0_re
VOLATILE spinorFloat o1_32_re
VOLATILE spinorFloat o1_22_im
#define gT11_re
#define g21_re
#define g11_re
VOLATILE spinorFloat o2_32_im
#define i02_im
#define g00_im
int sid
VOLATILE spinorFloat o2_02_im
int sp_idx
#define g20_re
#define i22_re
VOLATILE spinorFloat o2_31_re
#define i01_im
#define gT20_re
spinorFloat B0_im
#define i00_re
VOLATILE spinorFloat o1_31_re
VOLATILE spinorFloat o2_32_re
spinorFloat a0_im
#define i20_re
#define gT01_re
VOLATILE spinorFloat o1_20_re
VOLATILE spinorFloat o1_02_re
#define gT11_im
VOLATILE spinorFloat o2_22_re
spinorFloat b1_im
#define g02_re
VOLATILE spinorFloat o1_10_im
#define GAUGE0TEX
spinorFloat A2_re
VOLATILE spinorFloat o2_30_re
#define FLAVORS
#define gT02_re
VOLATILE spinorFloat o2_31_im
VOLATILE spinorFloat o1_31_im
#define gT12_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define b
#define i32_re
#define i20_im
VOLATILE spinorFloat o1_22_re
VOLATILE spinorFloat o1_00_re
VOLATILE spinorFloat o1_12_re
#define g11_im
#define g02_im
#define g01_im
o1_00_re *o1_00_im *o1_01_re *o1_01_im *o1_02_re *o1_02_im *o1_10_re *o1_10_im *o1_11_re *o1_11_im *o1_12_re *o1_12_im *o1_20_re *o1_20_im *o1_21_re *o1_21_im *o1_22_re *o1_22_im *o1_30_re *o1_30_im *o1_31_re *o1_31_im *o1_32_re *o1_32_im *o2_00_re *o2_00_im *o2_01_re *o2_01_im *o2_02_re *o2_02_im *o2_10_re *o2_10_im *o2_11_re *o2_11_im *o2_12_re *o2_12_im *o2_20_re *o2_20_im *o2_21_re *o2_21_im *o2_22_re *o2_22_im *o2_30_re *o2_30_im *o2_31_re *o2_31_im *o2_32_re *o2_32_im * WRITE_FLAVOR_SPINOR()
VOLATILE spinorFloat o2_12_re
VOLATILE spinorFloat o1_11_re
VOLATILE spinorFloat o2_22_im
VOLATILE spinorFloat o2_00_im
spinorFloat A0_re
#define i22_im
#define gT22_re
VOLATILE spinorFloat o2_21_im
VOLATILE spinorFloat o1_32_im
#define gT21_re
spinorFloat B1_re
#define GAUGE1TEX
#define gT22_im
VOLATILE spinorFloat o1_21_im
#define g12_im
spinorFloat B1_im
VOLATILE spinorFloat o2_10_im
VOLATILE spinorFloat o1_01_re
#define SPINORTEX
VOLATILE spinorFloat o2_01_im
VOLATILE spinorFloat o1_01_im
#define READ_SPINOR
const int ga_idx
#define i11_re
#define READ_INTERMEDIATE_SPINOR
spinorFloat a1_im
#define spinorFloat
VOLATILE spinorFloat o1_11_im
VOLATILE spinorFloat o1_12_im
VOLATILE spinorFloat o2_10_re
#define gT02_im
VOLATILE spinorFloat o2_02_re
#define g10_im
#define i31_re
#define g00_re
int X[4]
Definition: quda.h:29
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
#define gT10_im
#define g01_re
#define i30_re
#define g22_im
#define READ_SPINOR_GHOST
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
spinorFloat A1_im
#define i32_im
VOLATILE spinorFloat o1_20_im
VOLATILE spinorFloat o2_00_re
spinorFloat b1_re
VOLATILE spinorFloat o2_11_im
#define g22_re
#define gT20_im
#define i12_im
spinorFloat a0_re
VOLATILE spinorFloat o1_10_re
spinorFloat b0_im
VOLATILE spinorFloat o1_30_im
#define g10_re
VOLATILE spinorFloat o2_20_im
spinorFloat B2_re
#define g21_im
#define i10_re
#define i10_im
spinorFloat A1_re
VOLATILE spinorFloat o2_30_im
#define i00_im
#define INTERTEX
#define g20_im
VOLATILE spinorFloat o2_12_im
#define gT10_re
int face_idx
spinorFloat A2_im
spinorFloat B2_im
const void * c
const int face_num
#define TPROJSCALE
#define i21_im
#define GHOSTSPINORTEX
#define gT00_im
#define g12_re
VOLATILE spinorFloat o2_21_re
VOLATILE spinorFloat o1_21_re
#define i30_im
spinorFloat b2_im
spinorFloat a2_re
#define a
#define i01_re
spinorFloat a1_re
#define i11_im
#define READ_SPINOR_UP
#define READ_SPINOR_DOWN
VOLATILE spinorFloat o2_20_re
#define VOLATILE
VOLATILE spinorFloat o2_11_re
VOLATILE spinorFloat o1_00_im