QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_ndeg_fused_exterior_dslash_core.h
Go to the documentation of this file.
1 // *** CUDA NDEG TWISTED MASS DSLASH ***
2 
3 // Arguments (double) mu, (double)eta and (double)delta
4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
5 #define FLAVORS 2
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
14 // input spinor
15 #ifdef SPINOR_DOUBLE
16 #define spinorFloat double
17 #define i00_re I0.x
18 #define i00_im I0.y
19 #define i01_re I1.x
20 #define i01_im I1.y
21 #define i02_re I2.x
22 #define i02_im I2.y
23 #define i10_re I3.x
24 #define i10_im I3.y
25 #define i11_re I4.x
26 #define i11_im I4.y
27 #define i12_re I5.x
28 #define i12_im I5.y
29 #define i20_re I6.x
30 #define i20_im I6.y
31 #define i21_re I7.x
32 #define i21_im I7.y
33 #define i22_re I8.x
34 #define i22_im I8.y
35 #define i30_re I9.x
36 #define i30_im I9.y
37 #define i31_re I10.x
38 #define i31_im I10.y
39 #define i32_re I11.x
40 #define i32_im I11.y
41 #else
42 #define spinorFloat float
43 #define i00_re I0.x
44 #define i00_im I0.y
45 #define i01_re I0.z
46 #define i01_im I0.w
47 #define i02_re I1.x
48 #define i02_im I1.y
49 #define i10_re I1.z
50 #define i10_im I1.w
51 #define i11_re I2.x
52 #define i11_im I2.y
53 #define i12_re I2.z
54 #define i12_im I2.w
55 #define i20_re I3.x
56 #define i20_im I3.y
57 #define i21_re I3.z
58 #define i21_im I3.w
59 #define i22_re I4.x
60 #define i22_im I4.y
61 #define i30_re I4.z
62 #define i30_im I4.w
63 #define i31_re I5.x
64 #define i31_im I5.y
65 #define i32_re I5.z
66 #define i32_im I5.w
67 #endif // SPINOR_DOUBLE
68 
69 // gauge link
70 #ifdef GAUGE_FLOAT2
71 #define g00_re G0.x
72 #define g00_im G0.y
73 #define g01_re G1.x
74 #define g01_im G1.y
75 #define g02_re G2.x
76 #define g02_im G2.y
77 #define g10_re G3.x
78 #define g10_im G3.y
79 #define g11_re G4.x
80 #define g11_im G4.y
81 #define g12_re G5.x
82 #define g12_im G5.y
83 #define g20_re G6.x
84 #define g20_im G6.y
85 #define g21_re G7.x
86 #define g21_im G7.y
87 #define g22_re G8.x
88 #define g22_im G8.y
89 
90 #else
91 #define g00_re G0.x
92 #define g00_im G0.y
93 #define g01_re G0.z
94 #define g01_im G0.w
95 #define g02_re G1.x
96 #define g02_im G1.y
97 #define g10_re G1.z
98 #define g10_im G1.w
99 #define g11_re G2.x
100 #define g11_im G2.y
101 #define g12_re G2.z
102 #define g12_im G2.w
103 #define g20_re G3.x
104 #define g20_im G3.y
105 #define g21_re G3.z
106 #define g21_im G3.w
107 #define g22_re G4.x
108 #define g22_im G4.y
109 
110 #endif // GAUGE_DOUBLE
111 
112 // conjugated gauge link
113 #define gT00_re (+g00_re)
114 #define gT00_im (-g00_im)
115 #define gT01_re (+g10_re)
116 #define gT01_im (-g10_im)
117 #define gT02_re (+g20_re)
118 #define gT02_im (-g20_im)
119 #define gT10_re (+g01_re)
120 #define gT10_im (-g01_im)
121 #define gT11_re (+g11_re)
122 #define gT11_im (-g11_im)
123 #define gT12_re (+g21_re)
124 #define gT12_im (-g21_im)
125 #define gT20_re (+g02_re)
126 #define gT20_im (-g02_im)
127 #define gT21_re (+g12_re)
128 #define gT21_im (-g12_im)
129 #define gT22_re (+g22_re)
130 #define gT22_im (-g22_im)
131 
132 // output spinor for flavor 1
157 // output spinor for flavor 2
182 
183 #include "read_gauge.h"
184 #include "io_spinor.h"
185 
186 int x1, x2, x3, x4;
187 int X;
188 
189 #if (DD_PREC==2) // half precision
190 int sp_norm_idx;
191 #endif // MULTI_GPU half precision
192 
193 int sid;
194 
195 int dim;
197 int Y[4] = {X1,X2,X3,X4};
198 int faceVolume[4];
199 faceVolume[0] = (X2*X3*X4)>>1;
200 faceVolume[1] = (X1*X3*X4)>>1;
201 faceVolume[2] = (X1*X2*X4)>>1;
202 faceVolume[3] = (X1*X2*X3)>>1;
203 
204 
205 #ifdef MULTI_GPU
207 #endif
208 
209  sid = blockIdx.x*blockDim.x + threadIdx.x;
210  if (sid >= param.threads) return;
211 
212  // Inline by hand for the moment and assume even dimensions
213  const int dims[] = {X1, X2, X3, X4};
214  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
215 
216  o1_00_re = 0; o1_00_im = 0;
217  o1_01_re = 0; o1_01_im = 0;
218  o1_02_re = 0; o1_02_im = 0;
219  o1_10_re = 0; o1_10_im = 0;
220  o1_11_re = 0; o1_11_im = 0;
221  o1_12_re = 0; o1_12_im = 0;
222  o1_20_re = 0; o1_20_im = 0;
223  o1_21_re = 0; o1_21_im = 0;
224  o1_22_re = 0; o1_22_im = 0;
225  o1_30_re = 0; o1_30_im = 0;
226  o1_31_re = 0; o1_31_im = 0;
227  o1_32_re = 0; o1_32_im = 0;
228 
229  o2_00_re = 0; o2_00_im = 0;
230  o2_01_re = 0; o2_01_im = 0;
231  o2_02_re = 0; o2_02_im = 0;
232  o2_10_re = 0; o2_10_im = 0;
233  o2_11_re = 0; o2_11_im = 0;
234  o2_12_re = 0; o2_12_im = 0;
235  o2_20_re = 0; o2_20_im = 0;
236  o2_21_re = 0; o2_21_im = 0;
237  o2_22_re = 0; o2_22_im = 0;
238  o2_30_re = 0; o2_30_im = 0;
239  o2_31_re = 0; o2_31_im = 0;
240  o2_32_re = 0; o2_32_im = 0;
241 
242 #ifdef MULTI_GPU
243 } else { // exterior kernel
244 
245  sid = blockIdx.x*blockDim.x + threadIdx.x;
246  if (sid >= param.threads) return;
247 
248  dim = dimFromFaceIndex(sid, param); // sid is also modified
249 
250  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
251  // volume of one face (per flavor)
252  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
253  face_idx = sid - face_num*face_volume; // index into the respective face
254 
255  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
256  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
257  //sp_idx = face_idx + param.ghostOffset[dim];
258 
259 
260  const int dims[] = {X1, X2, X3, X4};
261  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
262 
263  bool active = false;
264  for(int dir=0; dir<4; ++dir){
265  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
266  }
267  if(!active) return;
268 
269 
270  {
271  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
284 
285 
286  }
287  {
288  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid+param.fl_stride, sid+param.fl_stride);
301 
302 
303  }
304 }
305 #endif // MULTI_GPU
306 
307 
308 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1)
309 {
310  // Projector P0-
311  // 1 0 0 -i
312  // 0 1 -i 0
313  // 0 i 1 0
314  // i 0 0 1
315 
316  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
317  const int sp_idx = face_idx + param.ghostOffset[0];
318 #if (DD_PREC==2)
319  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
320 #endif
321  const int ga_idx = sid;
322 
329 
330  // read gauge matrix from device memory
331  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
332 
333  // reconstruct gauge matrix
335 
336  {
337 #ifdef MULTI_GPU
338  if (kernel_type == INTERIOR_KERNEL) {
339 #endif
340 
341  // read flavor 1 from device memory
342  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
343 
344  // project spinor into half spinors
345  a0_re = +i00_re+i30_im;
346  a0_im = +i00_im-i30_re;
347  a1_re = +i01_re+i31_im;
348  a1_im = +i01_im-i31_re;
349  a2_re = +i02_re+i32_im;
350  a2_im = +i02_im-i32_re;
351  b0_re = +i10_re+i20_im;
352  b0_im = +i10_im-i20_re;
353  b1_re = +i11_re+i21_im;
354  b1_im = +i11_im-i21_re;
355  b2_re = +i12_re+i22_im;
356  b2_im = +i12_im-i22_re;
357 
358 #ifdef MULTI_GPU
359  } else {
360 
361  const int sp_stride_pad = FLAVORS*ghostFace[0];
362 
363  // read half spinor for the first flavor from device memory
364  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
365 
366  a0_re = i00_re; a0_im = i00_im;
367  a1_re = i01_re; a1_im = i01_im;
368  a2_re = i02_re; a2_im = i02_im;
369  b0_re = i10_re; b0_im = i10_im;
370  b1_re = i11_re; b1_im = i11_im;
371  b2_re = i12_re; b2_im = i12_im;
372 
373  }
374 #endif // MULTI_GPU
375 
376  // multiply row 0
377  spinorFloat A0_re = 0;
378  A0_re += g00_re * a0_re;
379  A0_re -= g00_im * a0_im;
380  A0_re += g01_re * a1_re;
381  A0_re -= g01_im * a1_im;
382  A0_re += g02_re * a2_re;
383  A0_re -= g02_im * a2_im;
384  spinorFloat A0_im = 0;
385  A0_im += g00_re * a0_im;
386  A0_im += g00_im * a0_re;
387  A0_im += g01_re * a1_im;
388  A0_im += g01_im * a1_re;
389  A0_im += g02_re * a2_im;
390  A0_im += g02_im * a2_re;
391  spinorFloat B0_re = 0;
392  B0_re += g00_re * b0_re;
393  B0_re -= g00_im * b0_im;
394  B0_re += g01_re * b1_re;
395  B0_re -= g01_im * b1_im;
396  B0_re += g02_re * b2_re;
397  B0_re -= g02_im * b2_im;
398  spinorFloat B0_im = 0;
399  B0_im += g00_re * b0_im;
400  B0_im += g00_im * b0_re;
401  B0_im += g01_re * b1_im;
402  B0_im += g01_im * b1_re;
403  B0_im += g02_re * b2_im;
404  B0_im += g02_im * b2_re;
405 
406  // multiply row 1
407  spinorFloat A1_re = 0;
408  A1_re += g10_re * a0_re;
409  A1_re -= g10_im * a0_im;
410  A1_re += g11_re * a1_re;
411  A1_re -= g11_im * a1_im;
412  A1_re += g12_re * a2_re;
413  A1_re -= g12_im * a2_im;
414  spinorFloat A1_im = 0;
415  A1_im += g10_re * a0_im;
416  A1_im += g10_im * a0_re;
417  A1_im += g11_re * a1_im;
418  A1_im += g11_im * a1_re;
419  A1_im += g12_re * a2_im;
420  A1_im += g12_im * a2_re;
421  spinorFloat B1_re = 0;
422  B1_re += g10_re * b0_re;
423  B1_re -= g10_im * b0_im;
424  B1_re += g11_re * b1_re;
425  B1_re -= g11_im * b1_im;
426  B1_re += g12_re * b2_re;
427  B1_re -= g12_im * b2_im;
428  spinorFloat B1_im = 0;
429  B1_im += g10_re * b0_im;
430  B1_im += g10_im * b0_re;
431  B1_im += g11_re * b1_im;
432  B1_im += g11_im * b1_re;
433  B1_im += g12_re * b2_im;
434  B1_im += g12_im * b2_re;
435 
436  // multiply row 2
437  spinorFloat A2_re = 0;
438  A2_re += g20_re * a0_re;
439  A2_re -= g20_im * a0_im;
440  A2_re += g21_re * a1_re;
441  A2_re -= g21_im * a1_im;
442  A2_re += g22_re * a2_re;
443  A2_re -= g22_im * a2_im;
444  spinorFloat A2_im = 0;
445  A2_im += g20_re * a0_im;
446  A2_im += g20_im * a0_re;
447  A2_im += g21_re * a1_im;
448  A2_im += g21_im * a1_re;
449  A2_im += g22_re * a2_im;
450  A2_im += g22_im * a2_re;
451  spinorFloat B2_re = 0;
452  B2_re += g20_re * b0_re;
453  B2_re -= g20_im * b0_im;
454  B2_re += g21_re * b1_re;
455  B2_re -= g21_im * b1_im;
456  B2_re += g22_re * b2_re;
457  B2_re -= g22_im * b2_im;
458  spinorFloat B2_im = 0;
459  B2_im += g20_re * b0_im;
460  B2_im += g20_im * b0_re;
461  B2_im += g21_re * b1_im;
462  B2_im += g21_im * b1_re;
463  B2_im += g22_re * b2_im;
464  B2_im += g22_im * b2_re;
465 
466  o1_00_re += A0_re;
467  o1_00_im += A0_im;
468  o1_10_re += B0_re;
469  o1_10_im += B0_im;
470  o1_20_re -= B0_im;
471  o1_20_im += B0_re;
472  o1_30_re -= A0_im;
473  o1_30_im += A0_re;
474 
475  o1_01_re += A1_re;
476  o1_01_im += A1_im;
477  o1_11_re += B1_re;
478  o1_11_im += B1_im;
479  o1_21_re -= B1_im;
480  o1_21_im += B1_re;
481  o1_31_re -= A1_im;
482  o1_31_im += A1_re;
483 
484  o1_02_re += A2_re;
485  o1_02_im += A2_im;
486  o1_12_re += B2_re;
487  o1_12_im += B2_im;
488  o1_22_re -= B2_im;
489  o1_22_im += B2_re;
490  o1_32_re -= A2_im;
491  o1_32_im += A2_re;
492 
493  }
494  {
495 #ifdef MULTI_GPU
496  if (kernel_type == INTERIOR_KERNEL) {
497 #endif
498 
499  // read flavor 2 from device memory
500  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
501 
502  // project spinor into half spinors
503  a0_re = +i00_re+i30_im;
504  a0_im = +i00_im-i30_re;
505  a1_re = +i01_re+i31_im;
506  a1_im = +i01_im-i31_re;
507  a2_re = +i02_re+i32_im;
508  a2_im = +i02_im-i32_re;
509  b0_re = +i10_re+i20_im;
510  b0_im = +i10_im-i20_re;
511  b1_re = +i11_re+i21_im;
512  b1_im = +i11_im-i21_re;
513  b2_re = +i12_re+i22_im;
514  b2_im = +i12_im-i22_re;
515 
516 #ifdef MULTI_GPU
517  } else {
518 
519  const int sp_stride_pad = FLAVORS*ghostFace[0];
520 
521  // read half spinor for the second flavor from device memory
522  const int fl_idx = sp_idx + ghostFace[0];
523  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[0]);
524 
525  a0_re = i00_re; a0_im = i00_im;
526  a1_re = i01_re; a1_im = i01_im;
527  a2_re = i02_re; a2_im = i02_im;
528  b0_re = i10_re; b0_im = i10_im;
529  b1_re = i11_re; b1_im = i11_im;
530  b2_re = i12_re; b2_im = i12_im;
531 
532  }
533 #endif // MULTI_GPU
534 
535  // multiply row 0
536  spinorFloat A0_re = 0;
537  A0_re += g00_re * a0_re;
538  A0_re -= g00_im * a0_im;
539  A0_re += g01_re * a1_re;
540  A0_re -= g01_im * a1_im;
541  A0_re += g02_re * a2_re;
542  A0_re -= g02_im * a2_im;
543  spinorFloat A0_im = 0;
544  A0_im += g00_re * a0_im;
545  A0_im += g00_im * a0_re;
546  A0_im += g01_re * a1_im;
547  A0_im += g01_im * a1_re;
548  A0_im += g02_re * a2_im;
549  A0_im += g02_im * a2_re;
550  spinorFloat B0_re = 0;
551  B0_re += g00_re * b0_re;
552  B0_re -= g00_im * b0_im;
553  B0_re += g01_re * b1_re;
554  B0_re -= g01_im * b1_im;
555  B0_re += g02_re * b2_re;
556  B0_re -= g02_im * b2_im;
557  spinorFloat B0_im = 0;
558  B0_im += g00_re * b0_im;
559  B0_im += g00_im * b0_re;
560  B0_im += g01_re * b1_im;
561  B0_im += g01_im * b1_re;
562  B0_im += g02_re * b2_im;
563  B0_im += g02_im * b2_re;
564 
565  // multiply row 1
566  spinorFloat A1_re = 0;
567  A1_re += g10_re * a0_re;
568  A1_re -= g10_im * a0_im;
569  A1_re += g11_re * a1_re;
570  A1_re -= g11_im * a1_im;
571  A1_re += g12_re * a2_re;
572  A1_re -= g12_im * a2_im;
573  spinorFloat A1_im = 0;
574  A1_im += g10_re * a0_im;
575  A1_im += g10_im * a0_re;
576  A1_im += g11_re * a1_im;
577  A1_im += g11_im * a1_re;
578  A1_im += g12_re * a2_im;
579  A1_im += g12_im * a2_re;
580  spinorFloat B1_re = 0;
581  B1_re += g10_re * b0_re;
582  B1_re -= g10_im * b0_im;
583  B1_re += g11_re * b1_re;
584  B1_re -= g11_im * b1_im;
585  B1_re += g12_re * b2_re;
586  B1_re -= g12_im * b2_im;
587  spinorFloat B1_im = 0;
588  B1_im += g10_re * b0_im;
589  B1_im += g10_im * b0_re;
590  B1_im += g11_re * b1_im;
591  B1_im += g11_im * b1_re;
592  B1_im += g12_re * b2_im;
593  B1_im += g12_im * b2_re;
594 
595  // multiply row 2
596  spinorFloat A2_re = 0;
597  A2_re += g20_re * a0_re;
598  A2_re -= g20_im * a0_im;
599  A2_re += g21_re * a1_re;
600  A2_re -= g21_im * a1_im;
601  A2_re += g22_re * a2_re;
602  A2_re -= g22_im * a2_im;
603  spinorFloat A2_im = 0;
604  A2_im += g20_re * a0_im;
605  A2_im += g20_im * a0_re;
606  A2_im += g21_re * a1_im;
607  A2_im += g21_im * a1_re;
608  A2_im += g22_re * a2_im;
609  A2_im += g22_im * a2_re;
610  spinorFloat B2_re = 0;
611  B2_re += g20_re * b0_re;
612  B2_re -= g20_im * b0_im;
613  B2_re += g21_re * b1_re;
614  B2_re -= g21_im * b1_im;
615  B2_re += g22_re * b2_re;
616  B2_re -= g22_im * b2_im;
617  spinorFloat B2_im = 0;
618  B2_im += g20_re * b0_im;
619  B2_im += g20_im * b0_re;
620  B2_im += g21_re * b1_im;
621  B2_im += g21_im * b1_re;
622  B2_im += g22_re * b2_im;
623  B2_im += g22_im * b2_re;
624 
625  o2_00_re += A0_re;
626  o2_00_im += A0_im;
627  o2_10_re += B0_re;
628  o2_10_im += B0_im;
629  o2_20_re -= B0_im;
630  o2_20_im += B0_re;
631  o2_30_re -= A0_im;
632  o2_30_im += A0_re;
633 
634  o2_01_re += A1_re;
635  o2_01_im += A1_im;
636  o2_11_re += B1_re;
637  o2_11_im += B1_im;
638  o2_21_re -= B1_im;
639  o2_21_im += B1_re;
640  o2_31_re -= A1_im;
641  o2_31_im += A1_re;
642 
643  o2_02_re += A2_re;
644  o2_02_im += A2_im;
645  o2_12_re += B2_re;
646  o2_12_im += B2_im;
647  o2_22_re -= B2_im;
648  o2_22_im += B2_re;
649  o2_32_re -= A2_im;
650  o2_32_im += A2_re;
651 
652  }
653 }
654 
655 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0)
656 {
657  // Projector P0+
658  // 1 0 0 i
659  // 0 1 i 0
660  // 0 -i 1 0
661  // -i 0 0 1
662 
663  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
664  const int sp_idx = face_idx + param.ghostOffset[0];
665 #if (DD_PREC==2)
666  sp_norm_idx = face_idx + param.ghostNormOffset[0];
667 #endif
668  const int ga_idx = Vh+face_idx;
669 
676 
677  // read gauge matrix from device memory
678  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
679 
680  // reconstruct gauge matrix
682 
683  {
684 #ifdef MULTI_GPU
685  if (kernel_type == INTERIOR_KERNEL) {
686 #endif
687 
688  // read flavor 1 from device memory
689  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
690 
691  // project spinor into half spinors
692  a0_re = +i00_re-i30_im;
693  a0_im = +i00_im+i30_re;
694  a1_re = +i01_re-i31_im;
695  a1_im = +i01_im+i31_re;
696  a2_re = +i02_re-i32_im;
697  a2_im = +i02_im+i32_re;
698  b0_re = +i10_re-i20_im;
699  b0_im = +i10_im+i20_re;
700  b1_re = +i11_re-i21_im;
701  b1_im = +i11_im+i21_re;
702  b2_re = +i12_re-i22_im;
703  b2_im = +i12_im+i22_re;
704 
705 #ifdef MULTI_GPU
706  } else {
707 
708  const int sp_stride_pad = FLAVORS*ghostFace[0];
709 
710  // read half spinor for the first flavor from device memory
711  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
712 
713  a0_re = i00_re; a0_im = i00_im;
714  a1_re = i01_re; a1_im = i01_im;
715  a2_re = i02_re; a2_im = i02_im;
716  b0_re = i10_re; b0_im = i10_im;
717  b1_re = i11_re; b1_im = i11_im;
718  b2_re = i12_re; b2_im = i12_im;
719 
720  }
721 #endif // MULTI_GPU
722 
723  // multiply row 0
724  spinorFloat A0_re = 0;
725  A0_re += gT00_re * a0_re;
726  A0_re -= gT00_im * a0_im;
727  A0_re += gT01_re * a1_re;
728  A0_re -= gT01_im * a1_im;
729  A0_re += gT02_re * a2_re;
730  A0_re -= gT02_im * a2_im;
731  spinorFloat A0_im = 0;
732  A0_im += gT00_re * a0_im;
733  A0_im += gT00_im * a0_re;
734  A0_im += gT01_re * a1_im;
735  A0_im += gT01_im * a1_re;
736  A0_im += gT02_re * a2_im;
737  A0_im += gT02_im * a2_re;
738  spinorFloat B0_re = 0;
739  B0_re += gT00_re * b0_re;
740  B0_re -= gT00_im * b0_im;
741  B0_re += gT01_re * b1_re;
742  B0_re -= gT01_im * b1_im;
743  B0_re += gT02_re * b2_re;
744  B0_re -= gT02_im * b2_im;
745  spinorFloat B0_im = 0;
746  B0_im += gT00_re * b0_im;
747  B0_im += gT00_im * b0_re;
748  B0_im += gT01_re * b1_im;
749  B0_im += gT01_im * b1_re;
750  B0_im += gT02_re * b2_im;
751  B0_im += gT02_im * b2_re;
752 
753  // multiply row 1
754  spinorFloat A1_re = 0;
755  A1_re += gT10_re * a0_re;
756  A1_re -= gT10_im * a0_im;
757  A1_re += gT11_re * a1_re;
758  A1_re -= gT11_im * a1_im;
759  A1_re += gT12_re * a2_re;
760  A1_re -= gT12_im * a2_im;
761  spinorFloat A1_im = 0;
762  A1_im += gT10_re * a0_im;
763  A1_im += gT10_im * a0_re;
764  A1_im += gT11_re * a1_im;
765  A1_im += gT11_im * a1_re;
766  A1_im += gT12_re * a2_im;
767  A1_im += gT12_im * a2_re;
768  spinorFloat B1_re = 0;
769  B1_re += gT10_re * b0_re;
770  B1_re -= gT10_im * b0_im;
771  B1_re += gT11_re * b1_re;
772  B1_re -= gT11_im * b1_im;
773  B1_re += gT12_re * b2_re;
774  B1_re -= gT12_im * b2_im;
775  spinorFloat B1_im = 0;
776  B1_im += gT10_re * b0_im;
777  B1_im += gT10_im * b0_re;
778  B1_im += gT11_re * b1_im;
779  B1_im += gT11_im * b1_re;
780  B1_im += gT12_re * b2_im;
781  B1_im += gT12_im * b2_re;
782 
783  // multiply row 2
784  spinorFloat A2_re = 0;
785  A2_re += gT20_re * a0_re;
786  A2_re -= gT20_im * a0_im;
787  A2_re += gT21_re * a1_re;
788  A2_re -= gT21_im * a1_im;
789  A2_re += gT22_re * a2_re;
790  A2_re -= gT22_im * a2_im;
791  spinorFloat A2_im = 0;
792  A2_im += gT20_re * a0_im;
793  A2_im += gT20_im * a0_re;
794  A2_im += gT21_re * a1_im;
795  A2_im += gT21_im * a1_re;
796  A2_im += gT22_re * a2_im;
797  A2_im += gT22_im * a2_re;
798  spinorFloat B2_re = 0;
799  B2_re += gT20_re * b0_re;
800  B2_re -= gT20_im * b0_im;
801  B2_re += gT21_re * b1_re;
802  B2_re -= gT21_im * b1_im;
803  B2_re += gT22_re * b2_re;
804  B2_re -= gT22_im * b2_im;
805  spinorFloat B2_im = 0;
806  B2_im += gT20_re * b0_im;
807  B2_im += gT20_im * b0_re;
808  B2_im += gT21_re * b1_im;
809  B2_im += gT21_im * b1_re;
810  B2_im += gT22_re * b2_im;
811  B2_im += gT22_im * b2_re;
812 
813  o1_00_re += A0_re;
814  o1_00_im += A0_im;
815  o1_10_re += B0_re;
816  o1_10_im += B0_im;
817  o1_20_re += B0_im;
818  o1_20_im -= B0_re;
819  o1_30_re += A0_im;
820  o1_30_im -= A0_re;
821 
822  o1_01_re += A1_re;
823  o1_01_im += A1_im;
824  o1_11_re += B1_re;
825  o1_11_im += B1_im;
826  o1_21_re += B1_im;
827  o1_21_im -= B1_re;
828  o1_31_re += A1_im;
829  o1_31_im -= A1_re;
830 
831  o1_02_re += A2_re;
832  o1_02_im += A2_im;
833  o1_12_re += B2_re;
834  o1_12_im += B2_im;
835  o1_22_re += B2_im;
836  o1_22_im -= B2_re;
837  o1_32_re += A2_im;
838  o1_32_im -= A2_re;
839 
840  }
841  {
842 #ifdef MULTI_GPU
843  if (kernel_type == INTERIOR_KERNEL) {
844 #endif
845 
846  // read flavor 2 from device memory
847  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
848 
849  // project spinor into half spinors
850  a0_re = +i00_re-i30_im;
851  a0_im = +i00_im+i30_re;
852  a1_re = +i01_re-i31_im;
853  a1_im = +i01_im+i31_re;
854  a2_re = +i02_re-i32_im;
855  a2_im = +i02_im+i32_re;
856  b0_re = +i10_re-i20_im;
857  b0_im = +i10_im+i20_re;
858  b1_re = +i11_re-i21_im;
859  b1_im = +i11_im+i21_re;
860  b2_re = +i12_re-i22_im;
861  b2_im = +i12_im+i22_re;
862 
863 #ifdef MULTI_GPU
864  } else {
865 
866  const int sp_stride_pad = FLAVORS*ghostFace[0];
867 
868  // read half spinor for the second flavor from device memory
869  const int fl_idx = sp_idx + ghostFace[0];
870  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[0]);
871 
872  a0_re = i00_re; a0_im = i00_im;
873  a1_re = i01_re; a1_im = i01_im;
874  a2_re = i02_re; a2_im = i02_im;
875  b0_re = i10_re; b0_im = i10_im;
876  b1_re = i11_re; b1_im = i11_im;
877  b2_re = i12_re; b2_im = i12_im;
878 
879  }
880 #endif // MULTI_GPU
881 
882  // multiply row 0
883  spinorFloat A0_re = 0;
884  A0_re += gT00_re * a0_re;
885  A0_re -= gT00_im * a0_im;
886  A0_re += gT01_re * a1_re;
887  A0_re -= gT01_im * a1_im;
888  A0_re += gT02_re * a2_re;
889  A0_re -= gT02_im * a2_im;
890  spinorFloat A0_im = 0;
891  A0_im += gT00_re * a0_im;
892  A0_im += gT00_im * a0_re;
893  A0_im += gT01_re * a1_im;
894  A0_im += gT01_im * a1_re;
895  A0_im += gT02_re * a2_im;
896  A0_im += gT02_im * a2_re;
897  spinorFloat B0_re = 0;
898  B0_re += gT00_re * b0_re;
899  B0_re -= gT00_im * b0_im;
900  B0_re += gT01_re * b1_re;
901  B0_re -= gT01_im * b1_im;
902  B0_re += gT02_re * b2_re;
903  B0_re -= gT02_im * b2_im;
904  spinorFloat B0_im = 0;
905  B0_im += gT00_re * b0_im;
906  B0_im += gT00_im * b0_re;
907  B0_im += gT01_re * b1_im;
908  B0_im += gT01_im * b1_re;
909  B0_im += gT02_re * b2_im;
910  B0_im += gT02_im * b2_re;
911 
912  // multiply row 1
913  spinorFloat A1_re = 0;
914  A1_re += gT10_re * a0_re;
915  A1_re -= gT10_im * a0_im;
916  A1_re += gT11_re * a1_re;
917  A1_re -= gT11_im * a1_im;
918  A1_re += gT12_re * a2_re;
919  A1_re -= gT12_im * a2_im;
920  spinorFloat A1_im = 0;
921  A1_im += gT10_re * a0_im;
922  A1_im += gT10_im * a0_re;
923  A1_im += gT11_re * a1_im;
924  A1_im += gT11_im * a1_re;
925  A1_im += gT12_re * a2_im;
926  A1_im += gT12_im * a2_re;
927  spinorFloat B1_re = 0;
928  B1_re += gT10_re * b0_re;
929  B1_re -= gT10_im * b0_im;
930  B1_re += gT11_re * b1_re;
931  B1_re -= gT11_im * b1_im;
932  B1_re += gT12_re * b2_re;
933  B1_re -= gT12_im * b2_im;
934  spinorFloat B1_im = 0;
935  B1_im += gT10_re * b0_im;
936  B1_im += gT10_im * b0_re;
937  B1_im += gT11_re * b1_im;
938  B1_im += gT11_im * b1_re;
939  B1_im += gT12_re * b2_im;
940  B1_im += gT12_im * b2_re;
941 
942  // multiply row 2
943  spinorFloat A2_re = 0;
944  A2_re += gT20_re * a0_re;
945  A2_re -= gT20_im * a0_im;
946  A2_re += gT21_re * a1_re;
947  A2_re -= gT21_im * a1_im;
948  A2_re += gT22_re * a2_re;
949  A2_re -= gT22_im * a2_im;
950  spinorFloat A2_im = 0;
951  A2_im += gT20_re * a0_im;
952  A2_im += gT20_im * a0_re;
953  A2_im += gT21_re * a1_im;
954  A2_im += gT21_im * a1_re;
955  A2_im += gT22_re * a2_im;
956  A2_im += gT22_im * a2_re;
957  spinorFloat B2_re = 0;
958  B2_re += gT20_re * b0_re;
959  B2_re -= gT20_im * b0_im;
960  B2_re += gT21_re * b1_re;
961  B2_re -= gT21_im * b1_im;
962  B2_re += gT22_re * b2_re;
963  B2_re -= gT22_im * b2_im;
964  spinorFloat B2_im = 0;
965  B2_im += gT20_re * b0_im;
966  B2_im += gT20_im * b0_re;
967  B2_im += gT21_re * b1_im;
968  B2_im += gT21_im * b1_re;
969  B2_im += gT22_re * b2_im;
970  B2_im += gT22_im * b2_re;
971 
972  o2_00_re += A0_re;
973  o2_00_im += A0_im;
974  o2_10_re += B0_re;
975  o2_10_im += B0_im;
976  o2_20_re += B0_im;
977  o2_20_im -= B0_re;
978  o2_30_re += A0_im;
979  o2_30_im -= A0_re;
980 
981  o2_01_re += A1_re;
982  o2_01_im += A1_im;
983  o2_11_re += B1_re;
984  o2_11_im += B1_im;
985  o2_21_re += B1_im;
986  o2_21_im -= B1_re;
987  o2_31_re += A1_im;
988  o2_31_im -= A1_re;
989 
990  o2_02_re += A2_re;
991  o2_02_im += A2_im;
992  o2_12_re += B2_re;
993  o2_12_im += B2_im;
994  o2_22_re += B2_im;
995  o2_22_im -= B2_re;
996  o2_32_re += A2_im;
997  o2_32_im -= A2_re;
998 
999  }
1000 }
1001 
1002 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1)
1003 {
1004  // Projector P1-
1005  // 1 0 0 -1
1006  // 0 1 1 0
1007  // 0 1 1 0
1008  // -1 0 0 1
1009 
1010  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
1011  const int sp_idx = face_idx + param.ghostOffset[1];
1012 #if (DD_PREC==2)
1013  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
1014 #endif
1015  const int ga_idx = sid;
1016 
1023 
1024  // read gauge matrix from device memory
1025  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1026 
1027  // reconstruct gauge matrix
1029 
1030  {
1031 #ifdef MULTI_GPU
1032  if (kernel_type == INTERIOR_KERNEL) {
1033 #endif
1034 
1035  // read flavor 1 from device memory
1036  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1037 
1038  // project spinor into half spinors
1039  a0_re = +i00_re-i30_re;
1040  a0_im = +i00_im-i30_im;
1041  a1_re = +i01_re-i31_re;
1042  a1_im = +i01_im-i31_im;
1043  a2_re = +i02_re-i32_re;
1044  a2_im = +i02_im-i32_im;
1045  b0_re = +i10_re+i20_re;
1046  b0_im = +i10_im+i20_im;
1047  b1_re = +i11_re+i21_re;
1048  b1_im = +i11_im+i21_im;
1049  b2_re = +i12_re+i22_re;
1050  b2_im = +i12_im+i22_im;
1051 
1052 #ifdef MULTI_GPU
1053  } else {
1054 
1055  const int sp_stride_pad = FLAVORS*ghostFace[1];
1056 
1057  // read half spinor for the first flavor from device memory
1058  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1059 
1060  a0_re = i00_re; a0_im = i00_im;
1061  a1_re = i01_re; a1_im = i01_im;
1062  a2_re = i02_re; a2_im = i02_im;
1063  b0_re = i10_re; b0_im = i10_im;
1064  b1_re = i11_re; b1_im = i11_im;
1065  b2_re = i12_re; b2_im = i12_im;
1066 
1067  }
1068 #endif // MULTI_GPU
1069 
1070  // multiply row 0
1071  spinorFloat A0_re = 0;
1072  A0_re += g00_re * a0_re;
1073  A0_re -= g00_im * a0_im;
1074  A0_re += g01_re * a1_re;
1075  A0_re -= g01_im * a1_im;
1076  A0_re += g02_re * a2_re;
1077  A0_re -= g02_im * a2_im;
1078  spinorFloat A0_im = 0;
1079  A0_im += g00_re * a0_im;
1080  A0_im += g00_im * a0_re;
1081  A0_im += g01_re * a1_im;
1082  A0_im += g01_im * a1_re;
1083  A0_im += g02_re * a2_im;
1084  A0_im += g02_im * a2_re;
1085  spinorFloat B0_re = 0;
1086  B0_re += g00_re * b0_re;
1087  B0_re -= g00_im * b0_im;
1088  B0_re += g01_re * b1_re;
1089  B0_re -= g01_im * b1_im;
1090  B0_re += g02_re * b2_re;
1091  B0_re -= g02_im * b2_im;
1092  spinorFloat B0_im = 0;
1093  B0_im += g00_re * b0_im;
1094  B0_im += g00_im * b0_re;
1095  B0_im += g01_re * b1_im;
1096  B0_im += g01_im * b1_re;
1097  B0_im += g02_re * b2_im;
1098  B0_im += g02_im * b2_re;
1099 
1100  // multiply row 1
1101  spinorFloat A1_re = 0;
1102  A1_re += g10_re * a0_re;
1103  A1_re -= g10_im * a0_im;
1104  A1_re += g11_re * a1_re;
1105  A1_re -= g11_im * a1_im;
1106  A1_re += g12_re * a2_re;
1107  A1_re -= g12_im * a2_im;
1108  spinorFloat A1_im = 0;
1109  A1_im += g10_re * a0_im;
1110  A1_im += g10_im * a0_re;
1111  A1_im += g11_re * a1_im;
1112  A1_im += g11_im * a1_re;
1113  A1_im += g12_re * a2_im;
1114  A1_im += g12_im * a2_re;
1115  spinorFloat B1_re = 0;
1116  B1_re += g10_re * b0_re;
1117  B1_re -= g10_im * b0_im;
1118  B1_re += g11_re * b1_re;
1119  B1_re -= g11_im * b1_im;
1120  B1_re += g12_re * b2_re;
1121  B1_re -= g12_im * b2_im;
1122  spinorFloat B1_im = 0;
1123  B1_im += g10_re * b0_im;
1124  B1_im += g10_im * b0_re;
1125  B1_im += g11_re * b1_im;
1126  B1_im += g11_im * b1_re;
1127  B1_im += g12_re * b2_im;
1128  B1_im += g12_im * b2_re;
1129 
1130  // multiply row 2
1131  spinorFloat A2_re = 0;
1132  A2_re += g20_re * a0_re;
1133  A2_re -= g20_im * a0_im;
1134  A2_re += g21_re * a1_re;
1135  A2_re -= g21_im * a1_im;
1136  A2_re += g22_re * a2_re;
1137  A2_re -= g22_im * a2_im;
1138  spinorFloat A2_im = 0;
1139  A2_im += g20_re * a0_im;
1140  A2_im += g20_im * a0_re;
1141  A2_im += g21_re * a1_im;
1142  A2_im += g21_im * a1_re;
1143  A2_im += g22_re * a2_im;
1144  A2_im += g22_im * a2_re;
1145  spinorFloat B2_re = 0;
1146  B2_re += g20_re * b0_re;
1147  B2_re -= g20_im * b0_im;
1148  B2_re += g21_re * b1_re;
1149  B2_re -= g21_im * b1_im;
1150  B2_re += g22_re * b2_re;
1151  B2_re -= g22_im * b2_im;
1152  spinorFloat B2_im = 0;
1153  B2_im += g20_re * b0_im;
1154  B2_im += g20_im * b0_re;
1155  B2_im += g21_re * b1_im;
1156  B2_im += g21_im * b1_re;
1157  B2_im += g22_re * b2_im;
1158  B2_im += g22_im * b2_re;
1159 
1160  o1_00_re += A0_re;
1161  o1_00_im += A0_im;
1162  o1_10_re += B0_re;
1163  o1_10_im += B0_im;
1164  o1_20_re += B0_re;
1165  o1_20_im += B0_im;
1166  o1_30_re -= A0_re;
1167  o1_30_im -= A0_im;
1168 
1169  o1_01_re += A1_re;
1170  o1_01_im += A1_im;
1171  o1_11_re += B1_re;
1172  o1_11_im += B1_im;
1173  o1_21_re += B1_re;
1174  o1_21_im += B1_im;
1175  o1_31_re -= A1_re;
1176  o1_31_im -= A1_im;
1177 
1178  o1_02_re += A2_re;
1179  o1_02_im += A2_im;
1180  o1_12_re += B2_re;
1181  o1_12_im += B2_im;
1182  o1_22_re += B2_re;
1183  o1_22_im += B2_im;
1184  o1_32_re -= A2_re;
1185  o1_32_im -= A2_im;
1186 
1187  }
1188  {
1189 #ifdef MULTI_GPU
1190  if (kernel_type == INTERIOR_KERNEL) {
1191 #endif
1192 
1193  // read flavor 2 from device memory
1194  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1195 
1196  // project spinor into half spinors
1197  a0_re = +i00_re-i30_re;
1198  a0_im = +i00_im-i30_im;
1199  a1_re = +i01_re-i31_re;
1200  a1_im = +i01_im-i31_im;
1201  a2_re = +i02_re-i32_re;
1202  a2_im = +i02_im-i32_im;
1203  b0_re = +i10_re+i20_re;
1204  b0_im = +i10_im+i20_im;
1205  b1_re = +i11_re+i21_re;
1206  b1_im = +i11_im+i21_im;
1207  b2_re = +i12_re+i22_re;
1208  b2_im = +i12_im+i22_im;
1209 
1210 #ifdef MULTI_GPU
1211  } else {
1212 
1213  const int sp_stride_pad = FLAVORS*ghostFace[1];
1214 
1215  // read half spinor for the second flavor from device memory
1216  const int fl_idx = sp_idx + ghostFace[1];
1217  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[1]);
1218 
1219  a0_re = i00_re; a0_im = i00_im;
1220  a1_re = i01_re; a1_im = i01_im;
1221  a2_re = i02_re; a2_im = i02_im;
1222  b0_re = i10_re; b0_im = i10_im;
1223  b1_re = i11_re; b1_im = i11_im;
1224  b2_re = i12_re; b2_im = i12_im;
1225 
1226  }
1227 #endif // MULTI_GPU
1228 
1229  // multiply row 0
1230  spinorFloat A0_re = 0;
1231  A0_re += g00_re * a0_re;
1232  A0_re -= g00_im * a0_im;
1233  A0_re += g01_re * a1_re;
1234  A0_re -= g01_im * a1_im;
1235  A0_re += g02_re * a2_re;
1236  A0_re -= g02_im * a2_im;
1237  spinorFloat A0_im = 0;
1238  A0_im += g00_re * a0_im;
1239  A0_im += g00_im * a0_re;
1240  A0_im += g01_re * a1_im;
1241  A0_im += g01_im * a1_re;
1242  A0_im += g02_re * a2_im;
1243  A0_im += g02_im * a2_re;
1244  spinorFloat B0_re = 0;
1245  B0_re += g00_re * b0_re;
1246  B0_re -= g00_im * b0_im;
1247  B0_re += g01_re * b1_re;
1248  B0_re -= g01_im * b1_im;
1249  B0_re += g02_re * b2_re;
1250  B0_re -= g02_im * b2_im;
1251  spinorFloat B0_im = 0;
1252  B0_im += g00_re * b0_im;
1253  B0_im += g00_im * b0_re;
1254  B0_im += g01_re * b1_im;
1255  B0_im += g01_im * b1_re;
1256  B0_im += g02_re * b2_im;
1257  B0_im += g02_im * b2_re;
1258 
1259  // multiply row 1
1260  spinorFloat A1_re = 0;
1261  A1_re += g10_re * a0_re;
1262  A1_re -= g10_im * a0_im;
1263  A1_re += g11_re * a1_re;
1264  A1_re -= g11_im * a1_im;
1265  A1_re += g12_re * a2_re;
1266  A1_re -= g12_im * a2_im;
1267  spinorFloat A1_im = 0;
1268  A1_im += g10_re * a0_im;
1269  A1_im += g10_im * a0_re;
1270  A1_im += g11_re * a1_im;
1271  A1_im += g11_im * a1_re;
1272  A1_im += g12_re * a2_im;
1273  A1_im += g12_im * a2_re;
1274  spinorFloat B1_re = 0;
1275  B1_re += g10_re * b0_re;
1276  B1_re -= g10_im * b0_im;
1277  B1_re += g11_re * b1_re;
1278  B1_re -= g11_im * b1_im;
1279  B1_re += g12_re * b2_re;
1280  B1_re -= g12_im * b2_im;
1281  spinorFloat B1_im = 0;
1282  B1_im += g10_re * b0_im;
1283  B1_im += g10_im * b0_re;
1284  B1_im += g11_re * b1_im;
1285  B1_im += g11_im * b1_re;
1286  B1_im += g12_re * b2_im;
1287  B1_im += g12_im * b2_re;
1288 
1289  // multiply row 2
1290  spinorFloat A2_re = 0;
1291  A2_re += g20_re * a0_re;
1292  A2_re -= g20_im * a0_im;
1293  A2_re += g21_re * a1_re;
1294  A2_re -= g21_im * a1_im;
1295  A2_re += g22_re * a2_re;
1296  A2_re -= g22_im * a2_im;
1297  spinorFloat A2_im = 0;
1298  A2_im += g20_re * a0_im;
1299  A2_im += g20_im * a0_re;
1300  A2_im += g21_re * a1_im;
1301  A2_im += g21_im * a1_re;
1302  A2_im += g22_re * a2_im;
1303  A2_im += g22_im * a2_re;
1304  spinorFloat B2_re = 0;
1305  B2_re += g20_re * b0_re;
1306  B2_re -= g20_im * b0_im;
1307  B2_re += g21_re * b1_re;
1308  B2_re -= g21_im * b1_im;
1309  B2_re += g22_re * b2_re;
1310  B2_re -= g22_im * b2_im;
1311  spinorFloat B2_im = 0;
1312  B2_im += g20_re * b0_im;
1313  B2_im += g20_im * b0_re;
1314  B2_im += g21_re * b1_im;
1315  B2_im += g21_im * b1_re;
1316  B2_im += g22_re * b2_im;
1317  B2_im += g22_im * b2_re;
1318 
1319  o2_00_re += A0_re;
1320  o2_00_im += A0_im;
1321  o2_10_re += B0_re;
1322  o2_10_im += B0_im;
1323  o2_20_re += B0_re;
1324  o2_20_im += B0_im;
1325  o2_30_re -= A0_re;
1326  o2_30_im -= A0_im;
1327 
1328  o2_01_re += A1_re;
1329  o2_01_im += A1_im;
1330  o2_11_re += B1_re;
1331  o2_11_im += B1_im;
1332  o2_21_re += B1_re;
1333  o2_21_im += B1_im;
1334  o2_31_re -= A1_re;
1335  o2_31_im -= A1_im;
1336 
1337  o2_02_re += A2_re;
1338  o2_02_im += A2_im;
1339  o2_12_re += B2_re;
1340  o2_12_im += B2_im;
1341  o2_22_re += B2_re;
1342  o2_22_im += B2_im;
1343  o2_32_re -= A2_re;
1344  o2_32_im -= A2_im;
1345 
1346  }
1347 }
1348 
1349 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0)
1350 {
1351  // Projector P1+
1352  // 1 0 0 1
1353  // 0 1 -1 0
1354  // 0 -1 1 0
1355  // 1 0 0 1
1356 
1357  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
1358  const int sp_idx = face_idx + param.ghostOffset[1];
1359 #if (DD_PREC==2)
1360  sp_norm_idx = face_idx + param.ghostNormOffset[1];
1361 #endif
1362  const int ga_idx = Vh+face_idx;
1363 
1370 
1371  // read gauge matrix from device memory
1372  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1373 
1374  // reconstruct gauge matrix
1376 
1377  {
1378 #ifdef MULTI_GPU
1379  if (kernel_type == INTERIOR_KERNEL) {
1380 #endif
1381 
1382  // read flavor 1 from device memory
1383  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1384 
1385  // project spinor into half spinors
1386  a0_re = +i00_re+i30_re;
1387  a0_im = +i00_im+i30_im;
1388  a1_re = +i01_re+i31_re;
1389  a1_im = +i01_im+i31_im;
1390  a2_re = +i02_re+i32_re;
1391  a2_im = +i02_im+i32_im;
1392  b0_re = +i10_re-i20_re;
1393  b0_im = +i10_im-i20_im;
1394  b1_re = +i11_re-i21_re;
1395  b1_im = +i11_im-i21_im;
1396  b2_re = +i12_re-i22_re;
1397  b2_im = +i12_im-i22_im;
1398 
1399 #ifdef MULTI_GPU
1400  } else {
1401 
1402  const int sp_stride_pad = FLAVORS*ghostFace[1];
1403 
1404  // read half spinor for the first flavor from device memory
1405  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1406 
1407  a0_re = i00_re; a0_im = i00_im;
1408  a1_re = i01_re; a1_im = i01_im;
1409  a2_re = i02_re; a2_im = i02_im;
1410  b0_re = i10_re; b0_im = i10_im;
1411  b1_re = i11_re; b1_im = i11_im;
1412  b2_re = i12_re; b2_im = i12_im;
1413 
1414  }
1415 #endif // MULTI_GPU
1416 
1417  // multiply row 0
1418  spinorFloat A0_re = 0;
1419  A0_re += gT00_re * a0_re;
1420  A0_re -= gT00_im * a0_im;
1421  A0_re += gT01_re * a1_re;
1422  A0_re -= gT01_im * a1_im;
1423  A0_re += gT02_re * a2_re;
1424  A0_re -= gT02_im * a2_im;
1425  spinorFloat A0_im = 0;
1426  A0_im += gT00_re * a0_im;
1427  A0_im += gT00_im * a0_re;
1428  A0_im += gT01_re * a1_im;
1429  A0_im += gT01_im * a1_re;
1430  A0_im += gT02_re * a2_im;
1431  A0_im += gT02_im * a2_re;
1432  spinorFloat B0_re = 0;
1433  B0_re += gT00_re * b0_re;
1434  B0_re -= gT00_im * b0_im;
1435  B0_re += gT01_re * b1_re;
1436  B0_re -= gT01_im * b1_im;
1437  B0_re += gT02_re * b2_re;
1438  B0_re -= gT02_im * b2_im;
1439  spinorFloat B0_im = 0;
1440  B0_im += gT00_re * b0_im;
1441  B0_im += gT00_im * b0_re;
1442  B0_im += gT01_re * b1_im;
1443  B0_im += gT01_im * b1_re;
1444  B0_im += gT02_re * b2_im;
1445  B0_im += gT02_im * b2_re;
1446 
1447  // multiply row 1
1448  spinorFloat A1_re = 0;
1449  A1_re += gT10_re * a0_re;
1450  A1_re -= gT10_im * a0_im;
1451  A1_re += gT11_re * a1_re;
1452  A1_re -= gT11_im * a1_im;
1453  A1_re += gT12_re * a2_re;
1454  A1_re -= gT12_im * a2_im;
1455  spinorFloat A1_im = 0;
1456  A1_im += gT10_re * a0_im;
1457  A1_im += gT10_im * a0_re;
1458  A1_im += gT11_re * a1_im;
1459  A1_im += gT11_im * a1_re;
1460  A1_im += gT12_re * a2_im;
1461  A1_im += gT12_im * a2_re;
1462  spinorFloat B1_re = 0;
1463  B1_re += gT10_re * b0_re;
1464  B1_re -= gT10_im * b0_im;
1465  B1_re += gT11_re * b1_re;
1466  B1_re -= gT11_im * b1_im;
1467  B1_re += gT12_re * b2_re;
1468  B1_re -= gT12_im * b2_im;
1469  spinorFloat B1_im = 0;
1470  B1_im += gT10_re * b0_im;
1471  B1_im += gT10_im * b0_re;
1472  B1_im += gT11_re * b1_im;
1473  B1_im += gT11_im * b1_re;
1474  B1_im += gT12_re * b2_im;
1475  B1_im += gT12_im * b2_re;
1476 
1477  // multiply row 2
1478  spinorFloat A2_re = 0;
1479  A2_re += gT20_re * a0_re;
1480  A2_re -= gT20_im * a0_im;
1481  A2_re += gT21_re * a1_re;
1482  A2_re -= gT21_im * a1_im;
1483  A2_re += gT22_re * a2_re;
1484  A2_re -= gT22_im * a2_im;
1485  spinorFloat A2_im = 0;
1486  A2_im += gT20_re * a0_im;
1487  A2_im += gT20_im * a0_re;
1488  A2_im += gT21_re * a1_im;
1489  A2_im += gT21_im * a1_re;
1490  A2_im += gT22_re * a2_im;
1491  A2_im += gT22_im * a2_re;
1492  spinorFloat B2_re = 0;
1493  B2_re += gT20_re * b0_re;
1494  B2_re -= gT20_im * b0_im;
1495  B2_re += gT21_re * b1_re;
1496  B2_re -= gT21_im * b1_im;
1497  B2_re += gT22_re * b2_re;
1498  B2_re -= gT22_im * b2_im;
1499  spinorFloat B2_im = 0;
1500  B2_im += gT20_re * b0_im;
1501  B2_im += gT20_im * b0_re;
1502  B2_im += gT21_re * b1_im;
1503  B2_im += gT21_im * b1_re;
1504  B2_im += gT22_re * b2_im;
1505  B2_im += gT22_im * b2_re;
1506 
1507  o1_00_re += A0_re;
1508  o1_00_im += A0_im;
1509  o1_10_re += B0_re;
1510  o1_10_im += B0_im;
1511  o1_20_re -= B0_re;
1512  o1_20_im -= B0_im;
1513  o1_30_re += A0_re;
1514  o1_30_im += A0_im;
1515 
1516  o1_01_re += A1_re;
1517  o1_01_im += A1_im;
1518  o1_11_re += B1_re;
1519  o1_11_im += B1_im;
1520  o1_21_re -= B1_re;
1521  o1_21_im -= B1_im;
1522  o1_31_re += A1_re;
1523  o1_31_im += A1_im;
1524 
1525  o1_02_re += A2_re;
1526  o1_02_im += A2_im;
1527  o1_12_re += B2_re;
1528  o1_12_im += B2_im;
1529  o1_22_re -= B2_re;
1530  o1_22_im -= B2_im;
1531  o1_32_re += A2_re;
1532  o1_32_im += A2_im;
1533 
1534  }
1535  {
1536 #ifdef MULTI_GPU
1537  if (kernel_type == INTERIOR_KERNEL) {
1538 #endif
1539 
1540  // read flavor 2 from device memory
1541  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1542 
1543  // project spinor into half spinors
1544  a0_re = +i00_re+i30_re;
1545  a0_im = +i00_im+i30_im;
1546  a1_re = +i01_re+i31_re;
1547  a1_im = +i01_im+i31_im;
1548  a2_re = +i02_re+i32_re;
1549  a2_im = +i02_im+i32_im;
1550  b0_re = +i10_re-i20_re;
1551  b0_im = +i10_im-i20_im;
1552  b1_re = +i11_re-i21_re;
1553  b1_im = +i11_im-i21_im;
1554  b2_re = +i12_re-i22_re;
1555  b2_im = +i12_im-i22_im;
1556 
1557 #ifdef MULTI_GPU
1558  } else {
1559 
1560  const int sp_stride_pad = FLAVORS*ghostFace[1];
1561 
1562  // read half spinor for the second flavor from device memory
1563  const int fl_idx = sp_idx + ghostFace[1];
1564  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[1]);
1565 
1566  a0_re = i00_re; a0_im = i00_im;
1567  a1_re = i01_re; a1_im = i01_im;
1568  a2_re = i02_re; a2_im = i02_im;
1569  b0_re = i10_re; b0_im = i10_im;
1570  b1_re = i11_re; b1_im = i11_im;
1571  b2_re = i12_re; b2_im = i12_im;
1572 
1573  }
1574 #endif // MULTI_GPU
1575 
1576  // multiply row 0
1577  spinorFloat A0_re = 0;
1578  A0_re += gT00_re * a0_re;
1579  A0_re -= gT00_im * a0_im;
1580  A0_re += gT01_re * a1_re;
1581  A0_re -= gT01_im * a1_im;
1582  A0_re += gT02_re * a2_re;
1583  A0_re -= gT02_im * a2_im;
1584  spinorFloat A0_im = 0;
1585  A0_im += gT00_re * a0_im;
1586  A0_im += gT00_im * a0_re;
1587  A0_im += gT01_re * a1_im;
1588  A0_im += gT01_im * a1_re;
1589  A0_im += gT02_re * a2_im;
1590  A0_im += gT02_im * a2_re;
1591  spinorFloat B0_re = 0;
1592  B0_re += gT00_re * b0_re;
1593  B0_re -= gT00_im * b0_im;
1594  B0_re += gT01_re * b1_re;
1595  B0_re -= gT01_im * b1_im;
1596  B0_re += gT02_re * b2_re;
1597  B0_re -= gT02_im * b2_im;
1598  spinorFloat B0_im = 0;
1599  B0_im += gT00_re * b0_im;
1600  B0_im += gT00_im * b0_re;
1601  B0_im += gT01_re * b1_im;
1602  B0_im += gT01_im * b1_re;
1603  B0_im += gT02_re * b2_im;
1604  B0_im += gT02_im * b2_re;
1605 
1606  // multiply row 1
1607  spinorFloat A1_re = 0;
1608  A1_re += gT10_re * a0_re;
1609  A1_re -= gT10_im * a0_im;
1610  A1_re += gT11_re * a1_re;
1611  A1_re -= gT11_im * a1_im;
1612  A1_re += gT12_re * a2_re;
1613  A1_re -= gT12_im * a2_im;
1614  spinorFloat A1_im = 0;
1615  A1_im += gT10_re * a0_im;
1616  A1_im += gT10_im * a0_re;
1617  A1_im += gT11_re * a1_im;
1618  A1_im += gT11_im * a1_re;
1619  A1_im += gT12_re * a2_im;
1620  A1_im += gT12_im * a2_re;
1621  spinorFloat B1_re = 0;
1622  B1_re += gT10_re * b0_re;
1623  B1_re -= gT10_im * b0_im;
1624  B1_re += gT11_re * b1_re;
1625  B1_re -= gT11_im * b1_im;
1626  B1_re += gT12_re * b2_re;
1627  B1_re -= gT12_im * b2_im;
1628  spinorFloat B1_im = 0;
1629  B1_im += gT10_re * b0_im;
1630  B1_im += gT10_im * b0_re;
1631  B1_im += gT11_re * b1_im;
1632  B1_im += gT11_im * b1_re;
1633  B1_im += gT12_re * b2_im;
1634  B1_im += gT12_im * b2_re;
1635 
1636  // multiply row 2
1637  spinorFloat A2_re = 0;
1638  A2_re += gT20_re * a0_re;
1639  A2_re -= gT20_im * a0_im;
1640  A2_re += gT21_re * a1_re;
1641  A2_re -= gT21_im * a1_im;
1642  A2_re += gT22_re * a2_re;
1643  A2_re -= gT22_im * a2_im;
1644  spinorFloat A2_im = 0;
1645  A2_im += gT20_re * a0_im;
1646  A2_im += gT20_im * a0_re;
1647  A2_im += gT21_re * a1_im;
1648  A2_im += gT21_im * a1_re;
1649  A2_im += gT22_re * a2_im;
1650  A2_im += gT22_im * a2_re;
1651  spinorFloat B2_re = 0;
1652  B2_re += gT20_re * b0_re;
1653  B2_re -= gT20_im * b0_im;
1654  B2_re += gT21_re * b1_re;
1655  B2_re -= gT21_im * b1_im;
1656  B2_re += gT22_re * b2_re;
1657  B2_re -= gT22_im * b2_im;
1658  spinorFloat B2_im = 0;
1659  B2_im += gT20_re * b0_im;
1660  B2_im += gT20_im * b0_re;
1661  B2_im += gT21_re * b1_im;
1662  B2_im += gT21_im * b1_re;
1663  B2_im += gT22_re * b2_im;
1664  B2_im += gT22_im * b2_re;
1665 
1666  o2_00_re += A0_re;
1667  o2_00_im += A0_im;
1668  o2_10_re += B0_re;
1669  o2_10_im += B0_im;
1670  o2_20_re -= B0_re;
1671  o2_20_im -= B0_im;
1672  o2_30_re += A0_re;
1673  o2_30_im += A0_im;
1674 
1675  o2_01_re += A1_re;
1676  o2_01_im += A1_im;
1677  o2_11_re += B1_re;
1678  o2_11_im += B1_im;
1679  o2_21_re -= B1_re;
1680  o2_21_im -= B1_im;
1681  o2_31_re += A1_re;
1682  o2_31_im += A1_im;
1683 
1684  o2_02_re += A2_re;
1685  o2_02_im += A2_im;
1686  o2_12_re += B2_re;
1687  o2_12_im += B2_im;
1688  o2_22_re -= B2_re;
1689  o2_22_im -= B2_im;
1690  o2_32_re += A2_re;
1691  o2_32_im += A2_im;
1692 
1693  }
1694 }
1695 
1696 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1)
1697 {
1698  // Projector P2-
1699  // 1 0 -i 0
1700  // 0 1 0 i
1701  // i 0 1 0
1702  // 0 -i 0 1
1703 
1704  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1705  const int sp_idx = face_idx + param.ghostOffset[2];
1706 #if (DD_PREC==2)
1707  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
1708 #endif
1709  const int ga_idx = sid;
1710 
1717 
1718  // read gauge matrix from device memory
1719  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1720 
1721  // reconstruct gauge matrix
1723 
1724  {
1725 #ifdef MULTI_GPU
1726  if (kernel_type == INTERIOR_KERNEL) {
1727 #endif
1728 
1729  // read flavor 1 from device memory
1730  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1731 
1732  // project spinor into half spinors
1733  a0_re = +i00_re+i20_im;
1734  a0_im = +i00_im-i20_re;
1735  a1_re = +i01_re+i21_im;
1736  a1_im = +i01_im-i21_re;
1737  a2_re = +i02_re+i22_im;
1738  a2_im = +i02_im-i22_re;
1739  b0_re = +i10_re-i30_im;
1740  b0_im = +i10_im+i30_re;
1741  b1_re = +i11_re-i31_im;
1742  b1_im = +i11_im+i31_re;
1743  b2_re = +i12_re-i32_im;
1744  b2_im = +i12_im+i32_re;
1745 
1746 #ifdef MULTI_GPU
1747  } else {
1748 
1749  const int sp_stride_pad = FLAVORS*ghostFace[2];
1750 
1751  // read half spinor for the first flavor from device memory
1752  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1753 
1754  a0_re = i00_re; a0_im = i00_im;
1755  a1_re = i01_re; a1_im = i01_im;
1756  a2_re = i02_re; a2_im = i02_im;
1757  b0_re = i10_re; b0_im = i10_im;
1758  b1_re = i11_re; b1_im = i11_im;
1759  b2_re = i12_re; b2_im = i12_im;
1760 
1761  }
1762 #endif // MULTI_GPU
1763 
1764  // multiply row 0
1765  spinorFloat A0_re = 0;
1766  A0_re += g00_re * a0_re;
1767  A0_re -= g00_im * a0_im;
1768  A0_re += g01_re * a1_re;
1769  A0_re -= g01_im * a1_im;
1770  A0_re += g02_re * a2_re;
1771  A0_re -= g02_im * a2_im;
1772  spinorFloat A0_im = 0;
1773  A0_im += g00_re * a0_im;
1774  A0_im += g00_im * a0_re;
1775  A0_im += g01_re * a1_im;
1776  A0_im += g01_im * a1_re;
1777  A0_im += g02_re * a2_im;
1778  A0_im += g02_im * a2_re;
1779  spinorFloat B0_re = 0;
1780  B0_re += g00_re * b0_re;
1781  B0_re -= g00_im * b0_im;
1782  B0_re += g01_re * b1_re;
1783  B0_re -= g01_im * b1_im;
1784  B0_re += g02_re * b2_re;
1785  B0_re -= g02_im * b2_im;
1786  spinorFloat B0_im = 0;
1787  B0_im += g00_re * b0_im;
1788  B0_im += g00_im * b0_re;
1789  B0_im += g01_re * b1_im;
1790  B0_im += g01_im * b1_re;
1791  B0_im += g02_re * b2_im;
1792  B0_im += g02_im * b2_re;
1793 
1794  // multiply row 1
1795  spinorFloat A1_re = 0;
1796  A1_re += g10_re * a0_re;
1797  A1_re -= g10_im * a0_im;
1798  A1_re += g11_re * a1_re;
1799  A1_re -= g11_im * a1_im;
1800  A1_re += g12_re * a2_re;
1801  A1_re -= g12_im * a2_im;
1802  spinorFloat A1_im = 0;
1803  A1_im += g10_re * a0_im;
1804  A1_im += g10_im * a0_re;
1805  A1_im += g11_re * a1_im;
1806  A1_im += g11_im * a1_re;
1807  A1_im += g12_re * a2_im;
1808  A1_im += g12_im * a2_re;
1809  spinorFloat B1_re = 0;
1810  B1_re += g10_re * b0_re;
1811  B1_re -= g10_im * b0_im;
1812  B1_re += g11_re * b1_re;
1813  B1_re -= g11_im * b1_im;
1814  B1_re += g12_re * b2_re;
1815  B1_re -= g12_im * b2_im;
1816  spinorFloat B1_im = 0;
1817  B1_im += g10_re * b0_im;
1818  B1_im += g10_im * b0_re;
1819  B1_im += g11_re * b1_im;
1820  B1_im += g11_im * b1_re;
1821  B1_im += g12_re * b2_im;
1822  B1_im += g12_im * b2_re;
1823 
1824  // multiply row 2
1825  spinorFloat A2_re = 0;
1826  A2_re += g20_re * a0_re;
1827  A2_re -= g20_im * a0_im;
1828  A2_re += g21_re * a1_re;
1829  A2_re -= g21_im * a1_im;
1830  A2_re += g22_re * a2_re;
1831  A2_re -= g22_im * a2_im;
1832  spinorFloat A2_im = 0;
1833  A2_im += g20_re * a0_im;
1834  A2_im += g20_im * a0_re;
1835  A2_im += g21_re * a1_im;
1836  A2_im += g21_im * a1_re;
1837  A2_im += g22_re * a2_im;
1838  A2_im += g22_im * a2_re;
1839  spinorFloat B2_re = 0;
1840  B2_re += g20_re * b0_re;
1841  B2_re -= g20_im * b0_im;
1842  B2_re += g21_re * b1_re;
1843  B2_re -= g21_im * b1_im;
1844  B2_re += g22_re * b2_re;
1845  B2_re -= g22_im * b2_im;
1846  spinorFloat B2_im = 0;
1847  B2_im += g20_re * b0_im;
1848  B2_im += g20_im * b0_re;
1849  B2_im += g21_re * b1_im;
1850  B2_im += g21_im * b1_re;
1851  B2_im += g22_re * b2_im;
1852  B2_im += g22_im * b2_re;
1853 
1854  o1_00_re += A0_re;
1855  o1_00_im += A0_im;
1856  o1_10_re += B0_re;
1857  o1_10_im += B0_im;
1858  o1_20_re -= A0_im;
1859  o1_20_im += A0_re;
1860  o1_30_re += B0_im;
1861  o1_30_im -= B0_re;
1862 
1863  o1_01_re += A1_re;
1864  o1_01_im += A1_im;
1865  o1_11_re += B1_re;
1866  o1_11_im += B1_im;
1867  o1_21_re -= A1_im;
1868  o1_21_im += A1_re;
1869  o1_31_re += B1_im;
1870  o1_31_im -= B1_re;
1871 
1872  o1_02_re += A2_re;
1873  o1_02_im += A2_im;
1874  o1_12_re += B2_re;
1875  o1_12_im += B2_im;
1876  o1_22_re -= A2_im;
1877  o1_22_im += A2_re;
1878  o1_32_re += B2_im;
1879  o1_32_im -= B2_re;
1880 
1881  }
1882  {
1883 #ifdef MULTI_GPU
1884  if (kernel_type == INTERIOR_KERNEL) {
1885 #endif
1886 
1887  // read flavor 2 from device memory
1888  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1889 
1890  // project spinor into half spinors
1891  a0_re = +i00_re+i20_im;
1892  a0_im = +i00_im-i20_re;
1893  a1_re = +i01_re+i21_im;
1894  a1_im = +i01_im-i21_re;
1895  a2_re = +i02_re+i22_im;
1896  a2_im = +i02_im-i22_re;
1897  b0_re = +i10_re-i30_im;
1898  b0_im = +i10_im+i30_re;
1899  b1_re = +i11_re-i31_im;
1900  b1_im = +i11_im+i31_re;
1901  b2_re = +i12_re-i32_im;
1902  b2_im = +i12_im+i32_re;
1903 
1904 #ifdef MULTI_GPU
1905  } else {
1906 
1907  const int sp_stride_pad = FLAVORS*ghostFace[2];
1908 
1909  // read half spinor for the second flavor from device memory
1910  const int fl_idx = sp_idx + ghostFace[2];
1911  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[2]);
1912 
1913  a0_re = i00_re; a0_im = i00_im;
1914  a1_re = i01_re; a1_im = i01_im;
1915  a2_re = i02_re; a2_im = i02_im;
1916  b0_re = i10_re; b0_im = i10_im;
1917  b1_re = i11_re; b1_im = i11_im;
1918  b2_re = i12_re; b2_im = i12_im;
1919 
1920  }
1921 #endif // MULTI_GPU
1922 
1923  // multiply row 0
1924  spinorFloat A0_re = 0;
1925  A0_re += g00_re * a0_re;
1926  A0_re -= g00_im * a0_im;
1927  A0_re += g01_re * a1_re;
1928  A0_re -= g01_im * a1_im;
1929  A0_re += g02_re * a2_re;
1930  A0_re -= g02_im * a2_im;
1931  spinorFloat A0_im = 0;
1932  A0_im += g00_re * a0_im;
1933  A0_im += g00_im * a0_re;
1934  A0_im += g01_re * a1_im;
1935  A0_im += g01_im * a1_re;
1936  A0_im += g02_re * a2_im;
1937  A0_im += g02_im * a2_re;
1938  spinorFloat B0_re = 0;
1939  B0_re += g00_re * b0_re;
1940  B0_re -= g00_im * b0_im;
1941  B0_re += g01_re * b1_re;
1942  B0_re -= g01_im * b1_im;
1943  B0_re += g02_re * b2_re;
1944  B0_re -= g02_im * b2_im;
1945  spinorFloat B0_im = 0;
1946  B0_im += g00_re * b0_im;
1947  B0_im += g00_im * b0_re;
1948  B0_im += g01_re * b1_im;
1949  B0_im += g01_im * b1_re;
1950  B0_im += g02_re * b2_im;
1951  B0_im += g02_im * b2_re;
1952 
1953  // multiply row 1
1954  spinorFloat A1_re = 0;
1955  A1_re += g10_re * a0_re;
1956  A1_re -= g10_im * a0_im;
1957  A1_re += g11_re * a1_re;
1958  A1_re -= g11_im * a1_im;
1959  A1_re += g12_re * a2_re;
1960  A1_re -= g12_im * a2_im;
1961  spinorFloat A1_im = 0;
1962  A1_im += g10_re * a0_im;
1963  A1_im += g10_im * a0_re;
1964  A1_im += g11_re * a1_im;
1965  A1_im += g11_im * a1_re;
1966  A1_im += g12_re * a2_im;
1967  A1_im += g12_im * a2_re;
1968  spinorFloat B1_re = 0;
1969  B1_re += g10_re * b0_re;
1970  B1_re -= g10_im * b0_im;
1971  B1_re += g11_re * b1_re;
1972  B1_re -= g11_im * b1_im;
1973  B1_re += g12_re * b2_re;
1974  B1_re -= g12_im * b2_im;
1975  spinorFloat B1_im = 0;
1976  B1_im += g10_re * b0_im;
1977  B1_im += g10_im * b0_re;
1978  B1_im += g11_re * b1_im;
1979  B1_im += g11_im * b1_re;
1980  B1_im += g12_re * b2_im;
1981  B1_im += g12_im * b2_re;
1982 
1983  // multiply row 2
1984  spinorFloat A2_re = 0;
1985  A2_re += g20_re * a0_re;
1986  A2_re -= g20_im * a0_im;
1987  A2_re += g21_re * a1_re;
1988  A2_re -= g21_im * a1_im;
1989  A2_re += g22_re * a2_re;
1990  A2_re -= g22_im * a2_im;
1991  spinorFloat A2_im = 0;
1992  A2_im += g20_re * a0_im;
1993  A2_im += g20_im * a0_re;
1994  A2_im += g21_re * a1_im;
1995  A2_im += g21_im * a1_re;
1996  A2_im += g22_re * a2_im;
1997  A2_im += g22_im * a2_re;
1998  spinorFloat B2_re = 0;
1999  B2_re += g20_re * b0_re;
2000  B2_re -= g20_im * b0_im;
2001  B2_re += g21_re * b1_re;
2002  B2_re -= g21_im * b1_im;
2003  B2_re += g22_re * b2_re;
2004  B2_re -= g22_im * b2_im;
2005  spinorFloat B2_im = 0;
2006  B2_im += g20_re * b0_im;
2007  B2_im += g20_im * b0_re;
2008  B2_im += g21_re * b1_im;
2009  B2_im += g21_im * b1_re;
2010  B2_im += g22_re * b2_im;
2011  B2_im += g22_im * b2_re;
2012 
2013  o2_00_re += A0_re;
2014  o2_00_im += A0_im;
2015  o2_10_re += B0_re;
2016  o2_10_im += B0_im;
2017  o2_20_re -= A0_im;
2018  o2_20_im += A0_re;
2019  o2_30_re += B0_im;
2020  o2_30_im -= B0_re;
2021 
2022  o2_01_re += A1_re;
2023  o2_01_im += A1_im;
2024  o2_11_re += B1_re;
2025  o2_11_im += B1_im;
2026  o2_21_re -= A1_im;
2027  o2_21_im += A1_re;
2028  o2_31_re += B1_im;
2029  o2_31_im -= B1_re;
2030 
2031  o2_02_re += A2_re;
2032  o2_02_im += A2_im;
2033  o2_12_re += B2_re;
2034  o2_12_im += B2_im;
2035  o2_22_re -= A2_im;
2036  o2_22_im += A2_re;
2037  o2_32_re += B2_im;
2038  o2_32_im -= B2_re;
2039 
2040  }
2041 }
2042 
2043 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0)
2044 {
2045  // Projector P2+
2046  // 1 0 i 0
2047  // 0 1 0 -i
2048  // -i 0 1 0
2049  // 0 i 0 1
2050 
2051  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
2052  const int sp_idx = face_idx + param.ghostOffset[2];
2053 #if (DD_PREC==2)
2054  sp_norm_idx = face_idx + param.ghostNormOffset[2];
2055 #endif
2056  const int ga_idx = Vh+face_idx;
2057 
2064 
2065  // read gauge matrix from device memory
2066  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
2067 
2068  // reconstruct gauge matrix
2070 
2071  {
2072 #ifdef MULTI_GPU
2073  if (kernel_type == INTERIOR_KERNEL) {
2074 #endif
2075 
2076  // read flavor 1 from device memory
2077  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2078 
2079  // project spinor into half spinors
2080  a0_re = +i00_re-i20_im;
2081  a0_im = +i00_im+i20_re;
2082  a1_re = +i01_re-i21_im;
2083  a1_im = +i01_im+i21_re;
2084  a2_re = +i02_re-i22_im;
2085  a2_im = +i02_im+i22_re;
2086  b0_re = +i10_re+i30_im;
2087  b0_im = +i10_im-i30_re;
2088  b1_re = +i11_re+i31_im;
2089  b1_im = +i11_im-i31_re;
2090  b2_re = +i12_re+i32_im;
2091  b2_im = +i12_im-i32_re;
2092 
2093 #ifdef MULTI_GPU
2094  } else {
2095 
2096  const int sp_stride_pad = FLAVORS*ghostFace[2];
2097 
2098  // read half spinor for the first flavor from device memory
2099  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2100 
2101  a0_re = i00_re; a0_im = i00_im;
2102  a1_re = i01_re; a1_im = i01_im;
2103  a2_re = i02_re; a2_im = i02_im;
2104  b0_re = i10_re; b0_im = i10_im;
2105  b1_re = i11_re; b1_im = i11_im;
2106  b2_re = i12_re; b2_im = i12_im;
2107 
2108  }
2109 #endif // MULTI_GPU
2110 
2111  // multiply row 0
2112  spinorFloat A0_re = 0;
2113  A0_re += gT00_re * a0_re;
2114  A0_re -= gT00_im * a0_im;
2115  A0_re += gT01_re * a1_re;
2116  A0_re -= gT01_im * a1_im;
2117  A0_re += gT02_re * a2_re;
2118  A0_re -= gT02_im * a2_im;
2119  spinorFloat A0_im = 0;
2120  A0_im += gT00_re * a0_im;
2121  A0_im += gT00_im * a0_re;
2122  A0_im += gT01_re * a1_im;
2123  A0_im += gT01_im * a1_re;
2124  A0_im += gT02_re * a2_im;
2125  A0_im += gT02_im * a2_re;
2126  spinorFloat B0_re = 0;
2127  B0_re += gT00_re * b0_re;
2128  B0_re -= gT00_im * b0_im;
2129  B0_re += gT01_re * b1_re;
2130  B0_re -= gT01_im * b1_im;
2131  B0_re += gT02_re * b2_re;
2132  B0_re -= gT02_im * b2_im;
2133  spinorFloat B0_im = 0;
2134  B0_im += gT00_re * b0_im;
2135  B0_im += gT00_im * b0_re;
2136  B0_im += gT01_re * b1_im;
2137  B0_im += gT01_im * b1_re;
2138  B0_im += gT02_re * b2_im;
2139  B0_im += gT02_im * b2_re;
2140 
2141  // multiply row 1
2142  spinorFloat A1_re = 0;
2143  A1_re += gT10_re * a0_re;
2144  A1_re -= gT10_im * a0_im;
2145  A1_re += gT11_re * a1_re;
2146  A1_re -= gT11_im * a1_im;
2147  A1_re += gT12_re * a2_re;
2148  A1_re -= gT12_im * a2_im;
2149  spinorFloat A1_im = 0;
2150  A1_im += gT10_re * a0_im;
2151  A1_im += gT10_im * a0_re;
2152  A1_im += gT11_re * a1_im;
2153  A1_im += gT11_im * a1_re;
2154  A1_im += gT12_re * a2_im;
2155  A1_im += gT12_im * a2_re;
2156  spinorFloat B1_re = 0;
2157  B1_re += gT10_re * b0_re;
2158  B1_re -= gT10_im * b0_im;
2159  B1_re += gT11_re * b1_re;
2160  B1_re -= gT11_im * b1_im;
2161  B1_re += gT12_re * b2_re;
2162  B1_re -= gT12_im * b2_im;
2163  spinorFloat B1_im = 0;
2164  B1_im += gT10_re * b0_im;
2165  B1_im += gT10_im * b0_re;
2166  B1_im += gT11_re * b1_im;
2167  B1_im += gT11_im * b1_re;
2168  B1_im += gT12_re * b2_im;
2169  B1_im += gT12_im * b2_re;
2170 
2171  // multiply row 2
2172  spinorFloat A2_re = 0;
2173  A2_re += gT20_re * a0_re;
2174  A2_re -= gT20_im * a0_im;
2175  A2_re += gT21_re * a1_re;
2176  A2_re -= gT21_im * a1_im;
2177  A2_re += gT22_re * a2_re;
2178  A2_re -= gT22_im * a2_im;
2179  spinorFloat A2_im = 0;
2180  A2_im += gT20_re * a0_im;
2181  A2_im += gT20_im * a0_re;
2182  A2_im += gT21_re * a1_im;
2183  A2_im += gT21_im * a1_re;
2184  A2_im += gT22_re * a2_im;
2185  A2_im += gT22_im * a2_re;
2186  spinorFloat B2_re = 0;
2187  B2_re += gT20_re * b0_re;
2188  B2_re -= gT20_im * b0_im;
2189  B2_re += gT21_re * b1_re;
2190  B2_re -= gT21_im * b1_im;
2191  B2_re += gT22_re * b2_re;
2192  B2_re -= gT22_im * b2_im;
2193  spinorFloat B2_im = 0;
2194  B2_im += gT20_re * b0_im;
2195  B2_im += gT20_im * b0_re;
2196  B2_im += gT21_re * b1_im;
2197  B2_im += gT21_im * b1_re;
2198  B2_im += gT22_re * b2_im;
2199  B2_im += gT22_im * b2_re;
2200 
2201  o1_00_re += A0_re;
2202  o1_00_im += A0_im;
2203  o1_10_re += B0_re;
2204  o1_10_im += B0_im;
2205  o1_20_re += A0_im;
2206  o1_20_im -= A0_re;
2207  o1_30_re -= B0_im;
2208  o1_30_im += B0_re;
2209 
2210  o1_01_re += A1_re;
2211  o1_01_im += A1_im;
2212  o1_11_re += B1_re;
2213  o1_11_im += B1_im;
2214  o1_21_re += A1_im;
2215  o1_21_im -= A1_re;
2216  o1_31_re -= B1_im;
2217  o1_31_im += B1_re;
2218 
2219  o1_02_re += A2_re;
2220  o1_02_im += A2_im;
2221  o1_12_re += B2_re;
2222  o1_12_im += B2_im;
2223  o1_22_re += A2_im;
2224  o1_22_im -= A2_re;
2225  o1_32_re -= B2_im;
2226  o1_32_im += B2_re;
2227 
2228  }
2229  {
2230 #ifdef MULTI_GPU
2231  if (kernel_type == INTERIOR_KERNEL) {
2232 #endif
2233 
2234  // read flavor 2 from device memory
2235  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2236 
2237  // project spinor into half spinors
2238  a0_re = +i00_re-i20_im;
2239  a0_im = +i00_im+i20_re;
2240  a1_re = +i01_re-i21_im;
2241  a1_im = +i01_im+i21_re;
2242  a2_re = +i02_re-i22_im;
2243  a2_im = +i02_im+i22_re;
2244  b0_re = +i10_re+i30_im;
2245  b0_im = +i10_im-i30_re;
2246  b1_re = +i11_re+i31_im;
2247  b1_im = +i11_im-i31_re;
2248  b2_re = +i12_re+i32_im;
2249  b2_im = +i12_im-i32_re;
2250 
2251 #ifdef MULTI_GPU
2252  } else {
2253 
2254  const int sp_stride_pad = FLAVORS*ghostFace[2];
2255 
2256  // read half spinor for the second flavor from device memory
2257  const int fl_idx = sp_idx + ghostFace[2];
2258  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[2]);
2259 
2260  a0_re = i00_re; a0_im = i00_im;
2261  a1_re = i01_re; a1_im = i01_im;
2262  a2_re = i02_re; a2_im = i02_im;
2263  b0_re = i10_re; b0_im = i10_im;
2264  b1_re = i11_re; b1_im = i11_im;
2265  b2_re = i12_re; b2_im = i12_im;
2266 
2267  }
2268 #endif // MULTI_GPU
2269 
2270  // multiply row 0
2271  spinorFloat A0_re = 0;
2272  A0_re += gT00_re * a0_re;
2273  A0_re -= gT00_im * a0_im;
2274  A0_re += gT01_re * a1_re;
2275  A0_re -= gT01_im * a1_im;
2276  A0_re += gT02_re * a2_re;
2277  A0_re -= gT02_im * a2_im;
2278  spinorFloat A0_im = 0;
2279  A0_im += gT00_re * a0_im;
2280  A0_im += gT00_im * a0_re;
2281  A0_im += gT01_re * a1_im;
2282  A0_im += gT01_im * a1_re;
2283  A0_im += gT02_re * a2_im;
2284  A0_im += gT02_im * a2_re;
2285  spinorFloat B0_re = 0;
2286  B0_re += gT00_re * b0_re;
2287  B0_re -= gT00_im * b0_im;
2288  B0_re += gT01_re * b1_re;
2289  B0_re -= gT01_im * b1_im;
2290  B0_re += gT02_re * b2_re;
2291  B0_re -= gT02_im * b2_im;
2292  spinorFloat B0_im = 0;
2293  B0_im += gT00_re * b0_im;
2294  B0_im += gT00_im * b0_re;
2295  B0_im += gT01_re * b1_im;
2296  B0_im += gT01_im * b1_re;
2297  B0_im += gT02_re * b2_im;
2298  B0_im += gT02_im * b2_re;
2299 
2300  // multiply row 1
2301  spinorFloat A1_re = 0;
2302  A1_re += gT10_re * a0_re;
2303  A1_re -= gT10_im * a0_im;
2304  A1_re += gT11_re * a1_re;
2305  A1_re -= gT11_im * a1_im;
2306  A1_re += gT12_re * a2_re;
2307  A1_re -= gT12_im * a2_im;
2308  spinorFloat A1_im = 0;
2309  A1_im += gT10_re * a0_im;
2310  A1_im += gT10_im * a0_re;
2311  A1_im += gT11_re * a1_im;
2312  A1_im += gT11_im * a1_re;
2313  A1_im += gT12_re * a2_im;
2314  A1_im += gT12_im * a2_re;
2315  spinorFloat B1_re = 0;
2316  B1_re += gT10_re * b0_re;
2317  B1_re -= gT10_im * b0_im;
2318  B1_re += gT11_re * b1_re;
2319  B1_re -= gT11_im * b1_im;
2320  B1_re += gT12_re * b2_re;
2321  B1_re -= gT12_im * b2_im;
2322  spinorFloat B1_im = 0;
2323  B1_im += gT10_re * b0_im;
2324  B1_im += gT10_im * b0_re;
2325  B1_im += gT11_re * b1_im;
2326  B1_im += gT11_im * b1_re;
2327  B1_im += gT12_re * b2_im;
2328  B1_im += gT12_im * b2_re;
2329 
2330  // multiply row 2
2331  spinorFloat A2_re = 0;
2332  A2_re += gT20_re * a0_re;
2333  A2_re -= gT20_im * a0_im;
2334  A2_re += gT21_re * a1_re;
2335  A2_re -= gT21_im * a1_im;
2336  A2_re += gT22_re * a2_re;
2337  A2_re -= gT22_im * a2_im;
2338  spinorFloat A2_im = 0;
2339  A2_im += gT20_re * a0_im;
2340  A2_im += gT20_im * a0_re;
2341  A2_im += gT21_re * a1_im;
2342  A2_im += gT21_im * a1_re;
2343  A2_im += gT22_re * a2_im;
2344  A2_im += gT22_im * a2_re;
2345  spinorFloat B2_re = 0;
2346  B2_re += gT20_re * b0_re;
2347  B2_re -= gT20_im * b0_im;
2348  B2_re += gT21_re * b1_re;
2349  B2_re -= gT21_im * b1_im;
2350  B2_re += gT22_re * b2_re;
2351  B2_re -= gT22_im * b2_im;
2352  spinorFloat B2_im = 0;
2353  B2_im += gT20_re * b0_im;
2354  B2_im += gT20_im * b0_re;
2355  B2_im += gT21_re * b1_im;
2356  B2_im += gT21_im * b1_re;
2357  B2_im += gT22_re * b2_im;
2358  B2_im += gT22_im * b2_re;
2359 
2360  o2_00_re += A0_re;
2361  o2_00_im += A0_im;
2362  o2_10_re += B0_re;
2363  o2_10_im += B0_im;
2364  o2_20_re += A0_im;
2365  o2_20_im -= A0_re;
2366  o2_30_re -= B0_im;
2367  o2_30_im += B0_re;
2368 
2369  o2_01_re += A1_re;
2370  o2_01_im += A1_im;
2371  o2_11_re += B1_re;
2372  o2_11_im += B1_im;
2373  o2_21_re += A1_im;
2374  o2_21_im -= A1_re;
2375  o2_31_re -= B1_im;
2376  o2_31_im += B1_re;
2377 
2378  o2_02_re += A2_re;
2379  o2_02_im += A2_im;
2380  o2_12_re += B2_re;
2381  o2_12_im += B2_im;
2382  o2_22_re += A2_im;
2383  o2_22_im -= A2_re;
2384  o2_32_re -= B2_im;
2385  o2_32_im += B2_re;
2386 
2387  }
2388 }
2389 
2390 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1)
2391 {
2392  // Projector P3-
2393  // 0 0 0 0
2394  // 0 0 0 0
2395  // 0 0 2 0
2396  // 0 0 0 2
2397 
2398  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
2399  const int sp_idx = face_idx + param.ghostOffset[3];
2400 #if (DD_PREC==2)
2401  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
2402 #endif
2403  const int ga_idx = sid;
2404 
2411 
2412  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2413  {
2414  {
2415 #ifdef MULTI_GPU
2416  if (kernel_type == INTERIOR_KERNEL) {
2417 #endif
2418 
2419  // read flavor 1 from device memory
2420  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2421 
2422  // project spinor into half spinors
2423  a0_re = +2*i20_re;
2424  a0_im = +2*i20_im;
2425  a1_re = +2*i21_re;
2426  a1_im = +2*i21_im;
2427  a2_re = +2*i22_re;
2428  a2_im = +2*i22_im;
2429  b0_re = +2*i30_re;
2430  b0_im = +2*i30_im;
2431  b1_re = +2*i31_re;
2432  b1_im = +2*i31_im;
2433  b2_re = +2*i32_re;
2434  b2_im = +2*i32_im;
2435 
2436 #ifdef MULTI_GPU
2437  } else {
2438 
2439  const int sp_stride_pad = FLAVORS*ghostFace[3];
2440 
2441  // read half spinor for the first flavor from device memory
2442  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2443 
2444  a0_re = 2*i00_re; a0_im = 2*i00_im;
2445  a1_re = 2*i01_re; a1_im = 2*i01_im;
2446  a2_re = 2*i02_re; a2_im = 2*i02_im;
2447  b0_re = 2*i10_re; b0_im = 2*i10_im;
2448  b1_re = 2*i11_re; b1_im = 2*i11_im;
2449  b2_re = 2*i12_re; b2_im = 2*i12_im;
2450 
2451  }
2452 #endif // MULTI_GPU
2453 
2454  // identity gauge matrix
2461 
2462  o1_20_re += A0_re;
2463  o1_20_im += A0_im;
2464  o1_30_re += B0_re;
2465  o1_30_im += B0_im;
2466 
2467  o1_21_re += A1_re;
2468  o1_21_im += A1_im;
2469  o1_31_re += B1_re;
2470  o1_31_im += B1_im;
2471 
2472  o1_22_re += A2_re;
2473  o1_22_im += A2_im;
2474  o1_32_re += B2_re;
2475  o1_32_im += B2_im;
2476 
2477  }
2478  {
2479 #ifdef MULTI_GPU
2480  if (kernel_type == INTERIOR_KERNEL) {
2481 #endif
2482 
2483  // read flavor 2 from device memory
2484  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2485 
2486  // project spinor into half spinors
2487  a0_re = +2*i20_re;
2488  a0_im = +2*i20_im;
2489  a1_re = +2*i21_re;
2490  a1_im = +2*i21_im;
2491  a2_re = +2*i22_re;
2492  a2_im = +2*i22_im;
2493  b0_re = +2*i30_re;
2494  b0_im = +2*i30_im;
2495  b1_re = +2*i31_re;
2496  b1_im = +2*i31_im;
2497  b2_re = +2*i32_re;
2498  b2_im = +2*i32_im;
2499 
2500 #ifdef MULTI_GPU
2501  } else {
2502 
2503  const int sp_stride_pad = FLAVORS*ghostFace[3];
2504 
2505  // read half spinor for the second flavor from device memory
2506  const int fl_idx = sp_idx + ghostFace[3];
2507  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[3]);
2508 
2509  a0_re = 2*i00_re; a0_im = 2*i00_im;
2510  a1_re = 2*i01_re; a1_im = 2*i01_im;
2511  a2_re = 2*i02_re; a2_im = 2*i02_im;
2512  b0_re = 2*i10_re; b0_im = 2*i10_im;
2513  b1_re = 2*i11_re; b1_im = 2*i11_im;
2514  b2_re = 2*i12_re; b2_im = 2*i12_im;
2515 
2516  }
2517 #endif // MULTI_GPU
2518 
2519  // identity gauge matrix
2526 
2527  o2_20_re += A0_re;
2528  o2_20_im += A0_im;
2529  o2_30_re += B0_re;
2530  o2_30_im += B0_im;
2531 
2532  o2_21_re += A1_re;
2533  o2_21_im += A1_im;
2534  o2_31_re += B1_re;
2535  o2_31_im += B1_im;
2536 
2537  o2_22_re += A2_re;
2538  o2_22_im += A2_im;
2539  o2_32_re += B2_re;
2540  o2_32_im += B2_im;
2541 
2542  }
2543  } else {
2544  // read gauge matrix from device memory
2545  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2546 
2547  // reconstruct gauge matrix
2549 
2550  {
2551 #ifdef MULTI_GPU
2552  if (kernel_type == INTERIOR_KERNEL) {
2553 #endif
2554 
2555  // read flavor 1 from device memory
2556  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2557 
2558  // project spinor into half spinors
2559  a0_re = +2*i20_re;
2560  a0_im = +2*i20_im;
2561  a1_re = +2*i21_re;
2562  a1_im = +2*i21_im;
2563  a2_re = +2*i22_re;
2564  a2_im = +2*i22_im;
2565  b0_re = +2*i30_re;
2566  b0_im = +2*i30_im;
2567  b1_re = +2*i31_re;
2568  b1_im = +2*i31_im;
2569  b2_re = +2*i32_re;
2570  b2_im = +2*i32_im;
2571 
2572 #ifdef MULTI_GPU
2573  } else {
2574 
2575  const int sp_stride_pad = FLAVORS*ghostFace[3];
2576 
2577  // read half spinor for the first flavor from device memory
2578  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2579 
2580  a0_re = 2*i00_re; a0_im = 2*i00_im;
2581  a1_re = 2*i01_re; a1_im = 2*i01_im;
2582  a2_re = 2*i02_re; a2_im = 2*i02_im;
2583  b0_re = 2*i10_re; b0_im = 2*i10_im;
2584  b1_re = 2*i11_re; b1_im = 2*i11_im;
2585  b2_re = 2*i12_re; b2_im = 2*i12_im;
2586 
2587  }
2588 #endif // MULTI_GPU
2589 
2590  // multiply row 0
2591  spinorFloat A0_re = 0;
2592  A0_re += g00_re * a0_re;
2593  A0_re -= g00_im * a0_im;
2594  A0_re += g01_re * a1_re;
2595  A0_re -= g01_im * a1_im;
2596  A0_re += g02_re * a2_re;
2597  A0_re -= g02_im * a2_im;
2598  spinorFloat A0_im = 0;
2599  A0_im += g00_re * a0_im;
2600  A0_im += g00_im * a0_re;
2601  A0_im += g01_re * a1_im;
2602  A0_im += g01_im * a1_re;
2603  A0_im += g02_re * a2_im;
2604  A0_im += g02_im * a2_re;
2605  spinorFloat B0_re = 0;
2606  B0_re += g00_re * b0_re;
2607  B0_re -= g00_im * b0_im;
2608  B0_re += g01_re * b1_re;
2609  B0_re -= g01_im * b1_im;
2610  B0_re += g02_re * b2_re;
2611  B0_re -= g02_im * b2_im;
2612  spinorFloat B0_im = 0;
2613  B0_im += g00_re * b0_im;
2614  B0_im += g00_im * b0_re;
2615  B0_im += g01_re * b1_im;
2616  B0_im += g01_im * b1_re;
2617  B0_im += g02_re * b2_im;
2618  B0_im += g02_im * b2_re;
2619 
2620  // multiply row 1
2621  spinorFloat A1_re = 0;
2622  A1_re += g10_re * a0_re;
2623  A1_re -= g10_im * a0_im;
2624  A1_re += g11_re * a1_re;
2625  A1_re -= g11_im * a1_im;
2626  A1_re += g12_re * a2_re;
2627  A1_re -= g12_im * a2_im;
2628  spinorFloat A1_im = 0;
2629  A1_im += g10_re * a0_im;
2630  A1_im += g10_im * a0_re;
2631  A1_im += g11_re * a1_im;
2632  A1_im += g11_im * a1_re;
2633  A1_im += g12_re * a2_im;
2634  A1_im += g12_im * a2_re;
2635  spinorFloat B1_re = 0;
2636  B1_re += g10_re * b0_re;
2637  B1_re -= g10_im * b0_im;
2638  B1_re += g11_re * b1_re;
2639  B1_re -= g11_im * b1_im;
2640  B1_re += g12_re * b2_re;
2641  B1_re -= g12_im * b2_im;
2642  spinorFloat B1_im = 0;
2643  B1_im += g10_re * b0_im;
2644  B1_im += g10_im * b0_re;
2645  B1_im += g11_re * b1_im;
2646  B1_im += g11_im * b1_re;
2647  B1_im += g12_re * b2_im;
2648  B1_im += g12_im * b2_re;
2649 
2650  // multiply row 2
2651  spinorFloat A2_re = 0;
2652  A2_re += g20_re * a0_re;
2653  A2_re -= g20_im * a0_im;
2654  A2_re += g21_re * a1_re;
2655  A2_re -= g21_im * a1_im;
2656  A2_re += g22_re * a2_re;
2657  A2_re -= g22_im * a2_im;
2658  spinorFloat A2_im = 0;
2659  A2_im += g20_re * a0_im;
2660  A2_im += g20_im * a0_re;
2661  A2_im += g21_re * a1_im;
2662  A2_im += g21_im * a1_re;
2663  A2_im += g22_re * a2_im;
2664  A2_im += g22_im * a2_re;
2665  spinorFloat B2_re = 0;
2666  B2_re += g20_re * b0_re;
2667  B2_re -= g20_im * b0_im;
2668  B2_re += g21_re * b1_re;
2669  B2_re -= g21_im * b1_im;
2670  B2_re += g22_re * b2_re;
2671  B2_re -= g22_im * b2_im;
2672  spinorFloat B2_im = 0;
2673  B2_im += g20_re * b0_im;
2674  B2_im += g20_im * b0_re;
2675  B2_im += g21_re * b1_im;
2676  B2_im += g21_im * b1_re;
2677  B2_im += g22_re * b2_im;
2678  B2_im += g22_im * b2_re;
2679 
2680  o1_20_re += A0_re;
2681  o1_20_im += A0_im;
2682  o1_30_re += B0_re;
2683  o1_30_im += B0_im;
2684 
2685  o1_21_re += A1_re;
2686  o1_21_im += A1_im;
2687  o1_31_re += B1_re;
2688  o1_31_im += B1_im;
2689 
2690  o1_22_re += A2_re;
2691  o1_22_im += A2_im;
2692  o1_32_re += B2_re;
2693  o1_32_im += B2_im;
2694 
2695  }
2696  {
2697 #ifdef MULTI_GPU
2698  if (kernel_type == INTERIOR_KERNEL) {
2699 #endif
2700 
2701  // read flavor 2 from device memory
2702  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2703 
2704  // project spinor into half spinors
2705  a0_re = +2*i20_re;
2706  a0_im = +2*i20_im;
2707  a1_re = +2*i21_re;
2708  a1_im = +2*i21_im;
2709  a2_re = +2*i22_re;
2710  a2_im = +2*i22_im;
2711  b0_re = +2*i30_re;
2712  b0_im = +2*i30_im;
2713  b1_re = +2*i31_re;
2714  b1_im = +2*i31_im;
2715  b2_re = +2*i32_re;
2716  b2_im = +2*i32_im;
2717 
2718 #ifdef MULTI_GPU
2719  } else {
2720 
2721  const int sp_stride_pad = FLAVORS*ghostFace[3];
2722 
2723  // read half spinor for the second flavor from device memory
2724  const int fl_idx = sp_idx + ghostFace[3];
2725  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[3]);
2726 
2727  a0_re = 2*i00_re; a0_im = 2*i00_im;
2728  a1_re = 2*i01_re; a1_im = 2*i01_im;
2729  a2_re = 2*i02_re; a2_im = 2*i02_im;
2730  b0_re = 2*i10_re; b0_im = 2*i10_im;
2731  b1_re = 2*i11_re; b1_im = 2*i11_im;
2732  b2_re = 2*i12_re; b2_im = 2*i12_im;
2733 
2734  }
2735 #endif // MULTI_GPU
2736 
2737  // multiply row 0
2738  spinorFloat A0_re = 0;
2739  A0_re += g00_re * a0_re;
2740  A0_re -= g00_im * a0_im;
2741  A0_re += g01_re * a1_re;
2742  A0_re -= g01_im * a1_im;
2743  A0_re += g02_re * a2_re;
2744  A0_re -= g02_im * a2_im;
2745  spinorFloat A0_im = 0;
2746  A0_im += g00_re * a0_im;
2747  A0_im += g00_im * a0_re;
2748  A0_im += g01_re * a1_im;
2749  A0_im += g01_im * a1_re;
2750  A0_im += g02_re * a2_im;
2751  A0_im += g02_im * a2_re;
2752  spinorFloat B0_re = 0;
2753  B0_re += g00_re * b0_re;
2754  B0_re -= g00_im * b0_im;
2755  B0_re += g01_re * b1_re;
2756  B0_re -= g01_im * b1_im;
2757  B0_re += g02_re * b2_re;
2758  B0_re -= g02_im * b2_im;
2759  spinorFloat B0_im = 0;
2760  B0_im += g00_re * b0_im;
2761  B0_im += g00_im * b0_re;
2762  B0_im += g01_re * b1_im;
2763  B0_im += g01_im * b1_re;
2764  B0_im += g02_re * b2_im;
2765  B0_im += g02_im * b2_re;
2766 
2767  // multiply row 1
2768  spinorFloat A1_re = 0;
2769  A1_re += g10_re * a0_re;
2770  A1_re -= g10_im * a0_im;
2771  A1_re += g11_re * a1_re;
2772  A1_re -= g11_im * a1_im;
2773  A1_re += g12_re * a2_re;
2774  A1_re -= g12_im * a2_im;
2775  spinorFloat A1_im = 0;
2776  A1_im += g10_re * a0_im;
2777  A1_im += g10_im * a0_re;
2778  A1_im += g11_re * a1_im;
2779  A1_im += g11_im * a1_re;
2780  A1_im += g12_re * a2_im;
2781  A1_im += g12_im * a2_re;
2782  spinorFloat B1_re = 0;
2783  B1_re += g10_re * b0_re;
2784  B1_re -= g10_im * b0_im;
2785  B1_re += g11_re * b1_re;
2786  B1_re -= g11_im * b1_im;
2787  B1_re += g12_re * b2_re;
2788  B1_re -= g12_im * b2_im;
2789  spinorFloat B1_im = 0;
2790  B1_im += g10_re * b0_im;
2791  B1_im += g10_im * b0_re;
2792  B1_im += g11_re * b1_im;
2793  B1_im += g11_im * b1_re;
2794  B1_im += g12_re * b2_im;
2795  B1_im += g12_im * b2_re;
2796 
2797  // multiply row 2
2798  spinorFloat A2_re = 0;
2799  A2_re += g20_re * a0_re;
2800  A2_re -= g20_im * a0_im;
2801  A2_re += g21_re * a1_re;
2802  A2_re -= g21_im * a1_im;
2803  A2_re += g22_re * a2_re;
2804  A2_re -= g22_im * a2_im;
2805  spinorFloat A2_im = 0;
2806  A2_im += g20_re * a0_im;
2807  A2_im += g20_im * a0_re;
2808  A2_im += g21_re * a1_im;
2809  A2_im += g21_im * a1_re;
2810  A2_im += g22_re * a2_im;
2811  A2_im += g22_im * a2_re;
2812  spinorFloat B2_re = 0;
2813  B2_re += g20_re * b0_re;
2814  B2_re -= g20_im * b0_im;
2815  B2_re += g21_re * b1_re;
2816  B2_re -= g21_im * b1_im;
2817  B2_re += g22_re * b2_re;
2818  B2_re -= g22_im * b2_im;
2819  spinorFloat B2_im = 0;
2820  B2_im += g20_re * b0_im;
2821  B2_im += g20_im * b0_re;
2822  B2_im += g21_re * b1_im;
2823  B2_im += g21_im * b1_re;
2824  B2_im += g22_re * b2_im;
2825  B2_im += g22_im * b2_re;
2826 
2827  o2_20_re += A0_re;
2828  o2_20_im += A0_im;
2829  o2_30_re += B0_re;
2830  o2_30_im += B0_im;
2831 
2832  o2_21_re += A1_re;
2833  o2_21_im += A1_im;
2834  o2_31_re += B1_re;
2835  o2_31_im += B1_im;
2836 
2837  o2_22_re += A2_re;
2838  o2_22_im += A2_im;
2839  o2_32_re += B2_re;
2840  o2_32_im += B2_im;
2841 
2842  }
2843  }
2844 }
2845 
2846 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0)
2847 {
2848  // Projector P3+
2849  // 2 0 0 0
2850  // 0 2 0 0
2851  // 0 0 0 0
2852  // 0 0 0 0
2853 
2854  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
2855  const int sp_idx = face_idx + param.ghostOffset[3];
2856 #if (DD_PREC==2)
2857  sp_norm_idx = face_idx + param.ghostNormOffset[3];
2858 #endif
2859  const int ga_idx = Vh+face_idx;
2860 
2867 
2868  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2869  {
2870  {
2871 #ifdef MULTI_GPU
2872  if (kernel_type == INTERIOR_KERNEL) {
2873 #endif
2874 
2875  // read flavor 1 from device memory
2876  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2877 
2878  // project spinor into half spinors
2879  a0_re = +2*i00_re;
2880  a0_im = +2*i00_im;
2881  a1_re = +2*i01_re;
2882  a1_im = +2*i01_im;
2883  a2_re = +2*i02_re;
2884  a2_im = +2*i02_im;
2885  b0_re = +2*i10_re;
2886  b0_im = +2*i10_im;
2887  b1_re = +2*i11_re;
2888  b1_im = +2*i11_im;
2889  b2_re = +2*i12_re;
2890  b2_im = +2*i12_im;
2891 
2892 #ifdef MULTI_GPU
2893  } else {
2894 
2895  const int sp_stride_pad = FLAVORS*ghostFace[3];
2896 
2897  // read half spinor for the first flavor from device memory
2898  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2899 
2900  a0_re = 2*i00_re; a0_im = 2*i00_im;
2901  a1_re = 2*i01_re; a1_im = 2*i01_im;
2902  a2_re = 2*i02_re; a2_im = 2*i02_im;
2903  b0_re = 2*i10_re; b0_im = 2*i10_im;
2904  b1_re = 2*i11_re; b1_im = 2*i11_im;
2905  b2_re = 2*i12_re; b2_im = 2*i12_im;
2906 
2907  }
2908 #endif // MULTI_GPU
2909 
2910  // identity gauge matrix
2917 
2918  o1_00_re += A0_re;
2919  o1_00_im += A0_im;
2920  o1_10_re += B0_re;
2921  o1_10_im += B0_im;
2922 
2923  o1_01_re += A1_re;
2924  o1_01_im += A1_im;
2925  o1_11_re += B1_re;
2926  o1_11_im += B1_im;
2927 
2928  o1_02_re += A2_re;
2929  o1_02_im += A2_im;
2930  o1_12_re += B2_re;
2931  o1_12_im += B2_im;
2932 
2933  }
2934  {
2935 #ifdef MULTI_GPU
2936  if (kernel_type == INTERIOR_KERNEL) {
2937 #endif
2938 
2939  // read flavor 2 from device memory
2940  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2941 
2942  // project spinor into half spinors
2943  a0_re = +2*i00_re;
2944  a0_im = +2*i00_im;
2945  a1_re = +2*i01_re;
2946  a1_im = +2*i01_im;
2947  a2_re = +2*i02_re;
2948  a2_im = +2*i02_im;
2949  b0_re = +2*i10_re;
2950  b0_im = +2*i10_im;
2951  b1_re = +2*i11_re;
2952  b1_im = +2*i11_im;
2953  b2_re = +2*i12_re;
2954  b2_im = +2*i12_im;
2955 
2956 #ifdef MULTI_GPU
2957  } else {
2958 
2959  const int sp_stride_pad = FLAVORS*ghostFace[3];
2960 
2961  // read half spinor for the second flavor from device memory
2962  const int fl_idx = sp_idx + ghostFace[3];
2963  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[3]);
2964 
2965  a0_re = 2*i00_re; a0_im = 2*i00_im;
2966  a1_re = 2*i01_re; a1_im = 2*i01_im;
2967  a2_re = 2*i02_re; a2_im = 2*i02_im;
2968  b0_re = 2*i10_re; b0_im = 2*i10_im;
2969  b1_re = 2*i11_re; b1_im = 2*i11_im;
2970  b2_re = 2*i12_re; b2_im = 2*i12_im;
2971 
2972  }
2973 #endif // MULTI_GPU
2974 
2975  // identity gauge matrix
2982 
2983  o2_00_re += A0_re;
2984  o2_00_im += A0_im;
2985  o2_10_re += B0_re;
2986  o2_10_im += B0_im;
2987 
2988  o2_01_re += A1_re;
2989  o2_01_im += A1_im;
2990  o2_11_re += B1_re;
2991  o2_11_im += B1_im;
2992 
2993  o2_02_re += A2_re;
2994  o2_02_im += A2_im;
2995  o2_12_re += B2_re;
2996  o2_12_im += B2_im;
2997 
2998  }
2999  } else {
3000  // read gauge matrix from device memory
3001  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
3002 
3003  // reconstruct gauge matrix
3005 
3006  {
3007 #ifdef MULTI_GPU
3008  if (kernel_type == INTERIOR_KERNEL) {
3009 #endif
3010 
3011  // read flavor 1 from device memory
3012  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
3013 
3014  // project spinor into half spinors
3015  a0_re = +2*i00_re;
3016  a0_im = +2*i00_im;
3017  a1_re = +2*i01_re;
3018  a1_im = +2*i01_im;
3019  a2_re = +2*i02_re;
3020  a2_im = +2*i02_im;
3021  b0_re = +2*i10_re;
3022  b0_im = +2*i10_im;
3023  b1_re = +2*i11_re;
3024  b1_im = +2*i11_im;
3025  b2_re = +2*i12_re;
3026  b2_im = +2*i12_im;
3027 
3028 #ifdef MULTI_GPU
3029  } else {
3030 
3031  const int sp_stride_pad = FLAVORS*ghostFace[3];
3032 
3033  // read half spinor for the first flavor from device memory
3034  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
3035 
3036  a0_re = 2*i00_re; a0_im = 2*i00_im;
3037  a1_re = 2*i01_re; a1_im = 2*i01_im;
3038  a2_re = 2*i02_re; a2_im = 2*i02_im;
3039  b0_re = 2*i10_re; b0_im = 2*i10_im;
3040  b1_re = 2*i11_re; b1_im = 2*i11_im;
3041  b2_re = 2*i12_re; b2_im = 2*i12_im;
3042 
3043  }
3044 #endif // MULTI_GPU
3045 
3046  // multiply row 0
3047  spinorFloat A0_re = 0;
3048  A0_re += gT00_re * a0_re;
3049  A0_re -= gT00_im * a0_im;
3050  A0_re += gT01_re * a1_re;
3051  A0_re -= gT01_im * a1_im;
3052  A0_re += gT02_re * a2_re;
3053  A0_re -= gT02_im * a2_im;
3054  spinorFloat A0_im = 0;
3055  A0_im += gT00_re * a0_im;
3056  A0_im += gT00_im * a0_re;
3057  A0_im += gT01_re * a1_im;
3058  A0_im += gT01_im * a1_re;
3059  A0_im += gT02_re * a2_im;
3060  A0_im += gT02_im * a2_re;
3061  spinorFloat B0_re = 0;
3062  B0_re += gT00_re * b0_re;
3063  B0_re -= gT00_im * b0_im;
3064  B0_re += gT01_re * b1_re;
3065  B0_re -= gT01_im * b1_im;
3066  B0_re += gT02_re * b2_re;
3067  B0_re -= gT02_im * b2_im;
3068  spinorFloat B0_im = 0;
3069  B0_im += gT00_re * b0_im;
3070  B0_im += gT00_im * b0_re;
3071  B0_im += gT01_re * b1_im;
3072  B0_im += gT01_im * b1_re;
3073  B0_im += gT02_re * b2_im;
3074  B0_im += gT02_im * b2_re;
3075 
3076  // multiply row 1
3077  spinorFloat A1_re = 0;
3078  A1_re += gT10_re * a0_re;
3079  A1_re -= gT10_im * a0_im;
3080  A1_re += gT11_re * a1_re;
3081  A1_re -= gT11_im * a1_im;
3082  A1_re += gT12_re * a2_re;
3083  A1_re -= gT12_im * a2_im;
3084  spinorFloat A1_im = 0;
3085  A1_im += gT10_re * a0_im;
3086  A1_im += gT10_im * a0_re;
3087  A1_im += gT11_re * a1_im;
3088  A1_im += gT11_im * a1_re;
3089  A1_im += gT12_re * a2_im;
3090  A1_im += gT12_im * a2_re;
3091  spinorFloat B1_re = 0;
3092  B1_re += gT10_re * b0_re;
3093  B1_re -= gT10_im * b0_im;
3094  B1_re += gT11_re * b1_re;
3095  B1_re -= gT11_im * b1_im;
3096  B1_re += gT12_re * b2_re;
3097  B1_re -= gT12_im * b2_im;
3098  spinorFloat B1_im = 0;
3099  B1_im += gT10_re * b0_im;
3100  B1_im += gT10_im * b0_re;
3101  B1_im += gT11_re * b1_im;
3102  B1_im += gT11_im * b1_re;
3103  B1_im += gT12_re * b2_im;
3104  B1_im += gT12_im * b2_re;
3105 
3106  // multiply row 2
3107  spinorFloat A2_re = 0;
3108  A2_re += gT20_re * a0_re;
3109  A2_re -= gT20_im * a0_im;
3110  A2_re += gT21_re * a1_re;
3111  A2_re -= gT21_im * a1_im;
3112  A2_re += gT22_re * a2_re;
3113  A2_re -= gT22_im * a2_im;
3114  spinorFloat A2_im = 0;
3115  A2_im += gT20_re * a0_im;
3116  A2_im += gT20_im * a0_re;
3117  A2_im += gT21_re * a1_im;
3118  A2_im += gT21_im * a1_re;
3119  A2_im += gT22_re * a2_im;
3120  A2_im += gT22_im * a2_re;
3121  spinorFloat B2_re = 0;
3122  B2_re += gT20_re * b0_re;
3123  B2_re -= gT20_im * b0_im;
3124  B2_re += gT21_re * b1_re;
3125  B2_re -= gT21_im * b1_im;
3126  B2_re += gT22_re * b2_re;
3127  B2_re -= gT22_im * b2_im;
3128  spinorFloat B2_im = 0;
3129  B2_im += gT20_re * b0_im;
3130  B2_im += gT20_im * b0_re;
3131  B2_im += gT21_re * b1_im;
3132  B2_im += gT21_im * b1_re;
3133  B2_im += gT22_re * b2_im;
3134  B2_im += gT22_im * b2_re;
3135 
3136  o1_00_re += A0_re;
3137  o1_00_im += A0_im;
3138  o1_10_re += B0_re;
3139  o1_10_im += B0_im;
3140 
3141  o1_01_re += A1_re;
3142  o1_01_im += A1_im;
3143  o1_11_re += B1_re;
3144  o1_11_im += B1_im;
3145 
3146  o1_02_re += A2_re;
3147  o1_02_im += A2_im;
3148  o1_12_re += B2_re;
3149  o1_12_im += B2_im;
3150 
3151  }
3152  {
3153 #ifdef MULTI_GPU
3154  if (kernel_type == INTERIOR_KERNEL) {
3155 #endif
3156 
3157  // read flavor 2 from device memory
3158  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
3159 
3160  // project spinor into half spinors
3161  a0_re = +2*i00_re;
3162  a0_im = +2*i00_im;
3163  a1_re = +2*i01_re;
3164  a1_im = +2*i01_im;
3165  a2_re = +2*i02_re;
3166  a2_im = +2*i02_im;
3167  b0_re = +2*i10_re;
3168  b0_im = +2*i10_im;
3169  b1_re = +2*i11_re;
3170  b1_im = +2*i11_im;
3171  b2_re = +2*i12_re;
3172  b2_im = +2*i12_im;
3173 
3174 #ifdef MULTI_GPU
3175  } else {
3176 
3177  const int sp_stride_pad = FLAVORS*ghostFace[3];
3178 
3179  // read half spinor for the second flavor from device memory
3180  const int fl_idx = sp_idx + ghostFace[3];
3181  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[3]);
3182 
3183  a0_re = 2*i00_re; a0_im = 2*i00_im;
3184  a1_re = 2*i01_re; a1_im = 2*i01_im;
3185  a2_re = 2*i02_re; a2_im = 2*i02_im;
3186  b0_re = 2*i10_re; b0_im = 2*i10_im;
3187  b1_re = 2*i11_re; b1_im = 2*i11_im;
3188  b2_re = 2*i12_re; b2_im = 2*i12_im;
3189 
3190  }
3191 #endif // MULTI_GPU
3192 
3193  // multiply row 0
3194  spinorFloat A0_re = 0;
3195  A0_re += gT00_re * a0_re;
3196  A0_re -= gT00_im * a0_im;
3197  A0_re += gT01_re * a1_re;
3198  A0_re -= gT01_im * a1_im;
3199  A0_re += gT02_re * a2_re;
3200  A0_re -= gT02_im * a2_im;
3201  spinorFloat A0_im = 0;
3202  A0_im += gT00_re * a0_im;
3203  A0_im += gT00_im * a0_re;
3204  A0_im += gT01_re * a1_im;
3205  A0_im += gT01_im * a1_re;
3206  A0_im += gT02_re * a2_im;
3207  A0_im += gT02_im * a2_re;
3208  spinorFloat B0_re = 0;
3209  B0_re += gT00_re * b0_re;
3210  B0_re -= gT00_im * b0_im;
3211  B0_re += gT01_re * b1_re;
3212  B0_re -= gT01_im * b1_im;
3213  B0_re += gT02_re * b2_re;
3214  B0_re -= gT02_im * b2_im;
3215  spinorFloat B0_im = 0;
3216  B0_im += gT00_re * b0_im;
3217  B0_im += gT00_im * b0_re;
3218  B0_im += gT01_re * b1_im;
3219  B0_im += gT01_im * b1_re;
3220  B0_im += gT02_re * b2_im;
3221  B0_im += gT02_im * b2_re;
3222 
3223  // multiply row 1
3224  spinorFloat A1_re = 0;
3225  A1_re += gT10_re * a0_re;
3226  A1_re -= gT10_im * a0_im;
3227  A1_re += gT11_re * a1_re;
3228  A1_re -= gT11_im * a1_im;
3229  A1_re += gT12_re * a2_re;
3230  A1_re -= gT12_im * a2_im;
3231  spinorFloat A1_im = 0;
3232  A1_im += gT10_re * a0_im;
3233  A1_im += gT10_im * a0_re;
3234  A1_im += gT11_re * a1_im;
3235  A1_im += gT11_im * a1_re;
3236  A1_im += gT12_re * a2_im;
3237  A1_im += gT12_im * a2_re;
3238  spinorFloat B1_re = 0;
3239  B1_re += gT10_re * b0_re;
3240  B1_re -= gT10_im * b0_im;
3241  B1_re += gT11_re * b1_re;
3242  B1_re -= gT11_im * b1_im;
3243  B1_re += gT12_re * b2_re;
3244  B1_re -= gT12_im * b2_im;
3245  spinorFloat B1_im = 0;
3246  B1_im += gT10_re * b0_im;
3247  B1_im += gT10_im * b0_re;
3248  B1_im += gT11_re * b1_im;
3249  B1_im += gT11_im * b1_re;
3250  B1_im += gT12_re * b2_im;
3251  B1_im += gT12_im * b2_re;
3252 
3253  // multiply row 2
3254  spinorFloat A2_re = 0;
3255  A2_re += gT20_re * a0_re;
3256  A2_re -= gT20_im * a0_im;
3257  A2_re += gT21_re * a1_re;
3258  A2_re -= gT21_im * a1_im;
3259  A2_re += gT22_re * a2_re;
3260  A2_re -= gT22_im * a2_im;
3261  spinorFloat A2_im = 0;
3262  A2_im += gT20_re * a0_im;
3263  A2_im += gT20_im * a0_re;
3264  A2_im += gT21_re * a1_im;
3265  A2_im += gT21_im * a1_re;
3266  A2_im += gT22_re * a2_im;
3267  A2_im += gT22_im * a2_re;
3268  spinorFloat B2_re = 0;
3269  B2_re += gT20_re * b0_re;
3270  B2_re -= gT20_im * b0_im;
3271  B2_re += gT21_re * b1_re;
3272  B2_re -= gT21_im * b1_im;
3273  B2_re += gT22_re * b2_re;
3274  B2_re -= gT22_im * b2_im;
3275  spinorFloat B2_im = 0;
3276  B2_im += gT20_re * b0_im;
3277  B2_im += gT20_im * b0_re;
3278  B2_im += gT21_re * b1_im;
3279  B2_im += gT21_im * b1_re;
3280  B2_im += gT22_re * b2_im;
3281  B2_im += gT22_im * b2_re;
3282 
3283  o2_00_re += A0_re;
3284  o2_00_im += A0_im;
3285  o2_10_re += B0_re;
3286  o2_10_im += B0_im;
3287 
3288  o2_01_re += A1_re;
3289  o2_01_im += A1_im;
3290  o2_11_re += B1_re;
3291  o2_11_im += B1_im;
3292 
3293  o2_02_re += A2_re;
3294  o2_02_im += A2_im;
3295  o2_12_re += B2_re;
3296  o2_12_im += B2_im;
3297 
3298  }
3299  }
3300 }
3301 
3302 #ifdef MULTI_GPU
3303 
3304 int incomplete = 0; // Have all 8 contributions been computed for this site?
3305 
3306 switch(kernel_type) { // intentional fall-through
3307 case INTERIOR_KERNEL:
3308  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
3309 case EXTERIOR_KERNEL_T:
3310  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
3311 case EXTERIOR_KERNEL_Z:
3312  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
3313 case EXTERIOR_KERNEL_Y:
3314  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
3315 }
3316 
3317 
3318 if (!incomplete)
3319 #endif // MULTI_GPU
3320 // apply twisted mass rotation
3321 {
3322 
3323 #ifdef DSLASH_TWIST
3324  {
3325  //Perform twist rotation first:
3326  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3327  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3328  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3329 
3330  x1_re = 0.0, x1_im = 0.0;
3331  y1_re = 0.0, y1_im = 0.0;
3332  x2_re = 0.0, x2_im = 0.0;
3333  y2_re = 0.0, y2_im = 0.0;
3334 
3335 
3336  // using o1 regs:
3337  x1_re = o1_00_re + a *o1_20_im;
3338  x1_im = o1_00_im - a *o1_20_re;
3339  x2_re = b * o1_00_re;
3340  x2_im = b * o1_00_im;
3341 
3342  y1_re = o1_20_re + a *o1_00_im;
3343  y1_im = o1_20_im - a *o1_00_re;
3344  y2_re = b * o1_20_re;
3345  y2_im = b * o1_20_im;
3346 
3347 
3348  // using o2 regs:
3349  x2_re += o2_00_re - a *o2_20_im;
3350  x2_im += o2_00_im + a *o2_20_re;
3351  x1_re += b * o2_00_re;
3352  x1_im += b * o2_00_im;
3353 
3354  y2_re += o2_20_re - a *o2_00_im;
3355  y2_im += o2_20_im + a *o2_00_re;
3356  y1_re += b * o2_20_re;
3357  y1_im += b * o2_20_im;
3358 
3359 
3360  o1_00_re = x1_re; o1_00_im = x1_im;
3361  o1_20_re = y1_re; o1_20_im = y1_im;
3362 
3363  o2_00_re = x2_re; o2_00_im = x2_im;
3364  o2_20_re = y2_re; o2_20_im = y2_im;
3365 
3366  // using o1 regs:
3367  x1_re = o1_10_re + a *o1_30_im;
3368  x1_im = o1_10_im - a *o1_30_re;
3369  x2_re = b * o1_10_re;
3370  x2_im = b * o1_10_im;
3371 
3372  y1_re = o1_30_re + a *o1_10_im;
3373  y1_im = o1_30_im - a *o1_10_re;
3374  y2_re = b * o1_30_re;
3375  y2_im = b * o1_30_im;
3376 
3377 
3378  // using o2 regs:
3379  x2_re += o2_10_re - a *o2_30_im;
3380  x2_im += o2_10_im + a *o2_30_re;
3381  x1_re += b * o2_10_re;
3382  x1_im += b * o2_10_im;
3383 
3384  y2_re += o2_30_re - a *o2_10_im;
3385  y2_im += o2_30_im + a *o2_10_re;
3386  y1_re += b * o2_30_re;
3387  y1_im += b * o2_30_im;
3388 
3389 
3390  o1_10_re = x1_re; o1_10_im = x1_im;
3391  o1_30_re = y1_re; o1_30_im = y1_im;
3392 
3393  o2_10_re = x2_re; o2_10_im = x2_im;
3394  o2_30_re = y2_re; o2_30_im = y2_im;
3395 
3396  // using o1 regs:
3397  x1_re = o1_01_re + a *o1_21_im;
3398  x1_im = o1_01_im - a *o1_21_re;
3399  x2_re = b * o1_01_re;
3400  x2_im = b * o1_01_im;
3401 
3402  y1_re = o1_21_re + a *o1_01_im;
3403  y1_im = o1_21_im - a *o1_01_re;
3404  y2_re = b * o1_21_re;
3405  y2_im = b * o1_21_im;
3406 
3407 
3408  // using o2 regs:
3409  x2_re += o2_01_re - a *o2_21_im;
3410  x2_im += o2_01_im + a *o2_21_re;
3411  x1_re += b * o2_01_re;
3412  x1_im += b * o2_01_im;
3413 
3414  y2_re += o2_21_re - a *o2_01_im;
3415  y2_im += o2_21_im + a *o2_01_re;
3416  y1_re += b * o2_21_re;
3417  y1_im += b * o2_21_im;
3418 
3419 
3420  o1_01_re = x1_re; o1_01_im = x1_im;
3421  o1_21_re = y1_re; o1_21_im = y1_im;
3422 
3423  o2_01_re = x2_re; o2_01_im = x2_im;
3424  o2_21_re = y2_re; o2_21_im = y2_im;
3425 
3426  // using o1 regs:
3427  x1_re = o1_11_re + a *o1_31_im;
3428  x1_im = o1_11_im - a *o1_31_re;
3429  x2_re = b * o1_11_re;
3430  x2_im = b * o1_11_im;
3431 
3432  y1_re = o1_31_re + a *o1_11_im;
3433  y1_im = o1_31_im - a *o1_11_re;
3434  y2_re = b * o1_31_re;
3435  y2_im = b * o1_31_im;
3436 
3437 
3438  // using o2 regs:
3439  x2_re += o2_11_re - a *o2_31_im;
3440  x2_im += o2_11_im + a *o2_31_re;
3441  x1_re += b * o2_11_re;
3442  x1_im += b * o2_11_im;
3443 
3444  y2_re += o2_31_re - a *o2_11_im;
3445  y2_im += o2_31_im + a *o2_11_re;
3446  y1_re += b * o2_31_re;
3447  y1_im += b * o2_31_im;
3448 
3449 
3450  o1_11_re = x1_re; o1_11_im = x1_im;
3451  o1_31_re = y1_re; o1_31_im = y1_im;
3452 
3453  o2_11_re = x2_re; o2_11_im = x2_im;
3454  o2_31_re = y2_re; o2_31_im = y2_im;
3455 
3456  // using o1 regs:
3457  x1_re = o1_02_re + a *o1_22_im;
3458  x1_im = o1_02_im - a *o1_22_re;
3459  x2_re = b * o1_02_re;
3460  x2_im = b * o1_02_im;
3461 
3462  y1_re = o1_22_re + a *o1_02_im;
3463  y1_im = o1_22_im - a *o1_02_re;
3464  y2_re = b * o1_22_re;
3465  y2_im = b * o1_22_im;
3466 
3467 
3468  // using o2 regs:
3469  x2_re += o2_02_re - a *o2_22_im;
3470  x2_im += o2_02_im + a *o2_22_re;
3471  x1_re += b * o2_02_re;
3472  x1_im += b * o2_02_im;
3473 
3474  y2_re += o2_22_re - a *o2_02_im;
3475  y2_im += o2_22_im + a *o2_02_re;
3476  y1_re += b * o2_22_re;
3477  y1_im += b * o2_22_im;
3478 
3479 
3480  o1_02_re = x1_re; o1_02_im = x1_im;
3481  o1_22_re = y1_re; o1_22_im = y1_im;
3482 
3483  o2_02_re = x2_re; o2_02_im = x2_im;
3484  o2_22_re = y2_re; o2_22_im = y2_im;
3485 
3486  // using o1 regs:
3487  x1_re = o1_12_re + a *o1_32_im;
3488  x1_im = o1_12_im - a *o1_32_re;
3489  x2_re = b * o1_12_re;
3490  x2_im = b * o1_12_im;
3491 
3492  y1_re = o1_32_re + a *o1_12_im;
3493  y1_im = o1_32_im - a *o1_12_re;
3494  y2_re = b * o1_32_re;
3495  y2_im = b * o1_32_im;
3496 
3497 
3498  // using o2 regs:
3499  x2_re += o2_12_re - a *o2_32_im;
3500  x2_im += o2_12_im + a *o2_32_re;
3501  x1_re += b * o2_12_re;
3502  x1_im += b * o2_12_im;
3503 
3504  y2_re += o2_32_re - a *o2_12_im;
3505  y2_im += o2_32_im + a *o2_12_re;
3506  y1_re += b * o2_32_re;
3507  y1_im += b * o2_32_im;
3508 
3509 
3510  o1_12_re = x1_re; o1_12_im = x1_im;
3511  o1_32_re = y1_re; o1_32_im = y1_im;
3512 
3513  o2_12_re = x2_re; o2_12_im = x2_im;
3514  o2_32_re = y2_re; o2_32_im = y2_im;
3515 
3516  }
3517 #endif
3518 
3519 #ifndef DSLASH_XPAY
3520  o1_00_re *= c;
3521  o1_00_im *= c;
3522  o1_01_re *= c;
3523  o1_01_im *= c;
3524  o1_02_re *= c;
3525  o1_02_im *= c;
3526  o1_10_re *= c;
3527  o1_10_im *= c;
3528  o1_11_re *= c;
3529  o1_11_im *= c;
3530  o1_12_re *= c;
3531  o1_12_im *= c;
3532  o1_20_re *= c;
3533  o1_20_im *= c;
3534  o1_21_re *= c;
3535  o1_21_im *= c;
3536  o1_22_re *= c;
3537  o1_22_im *= c;
3538  o1_30_re *= c;
3539  o1_30_im *= c;
3540  o1_31_re *= c;
3541  o1_31_im *= c;
3542  o1_32_re *= c;
3543  o1_32_im *= c;
3544 
3545  o2_00_re *= c;
3546  o2_00_im *= c;
3547  o2_01_re *= c;
3548  o2_01_im *= c;
3549  o2_02_re *= c;
3550  o2_02_im *= c;
3551  o2_10_re *= c;
3552  o2_10_im *= c;
3553  o2_11_re *= c;
3554  o2_11_im *= c;
3555  o2_12_re *= c;
3556  o2_12_im *= c;
3557  o2_20_re *= c;
3558  o2_20_im *= c;
3559  o2_21_re *= c;
3560  o2_21_im *= c;
3561  o2_22_re *= c;
3562  o2_22_im *= c;
3563  o2_30_re *= c;
3564  o2_30_im *= c;
3565  o2_31_re *= c;
3566  o2_31_im *= c;
3567  o2_32_re *= c;
3568  o2_32_im *= c;
3569 #else
3570 #ifdef DSLASH_TWIST
3571  // accum spinor
3572 #ifdef SPINOR_DOUBLE
3573 
3574 #define acc_00_re accum0.x
3575 #define acc_00_im accum0.y
3576 #define acc_01_re accum1.x
3577 #define acc_01_im accum1.y
3578 #define acc_02_re accum2.x
3579 #define acc_02_im accum2.y
3580 #define acc_10_re accum3.x
3581 #define acc_10_im accum3.y
3582 #define acc_11_re accum4.x
3583 #define acc_11_im accum4.y
3584 #define acc_12_re accum5.x
3585 #define acc_12_im accum5.y
3586 #define acc_20_re accum6.x
3587 #define acc_20_im accum6.y
3588 #define acc_21_re accum7.x
3589 #define acc_21_im accum7.y
3590 #define acc_22_re accum8.x
3591 #define acc_22_im accum8.y
3592 #define acc_30_re accum9.x
3593 #define acc_30_im accum9.y
3594 #define acc_31_re accum10.x
3595 #define acc_31_im accum10.y
3596 #define acc_32_re accum11.x
3597 #define acc_32_im accum11.y
3598 
3599 #else
3600 #define acc_00_re accum0.x
3601 #define acc_00_im accum0.y
3602 #define acc_01_re accum0.z
3603 #define acc_01_im accum0.w
3604 #define acc_02_re accum1.x
3605 #define acc_02_im accum1.y
3606 #define acc_10_re accum1.z
3607 #define acc_10_im accum1.w
3608 #define acc_11_re accum2.x
3609 #define acc_11_im accum2.y
3610 #define acc_12_re accum2.z
3611 #define acc_12_im accum2.w
3612 #define acc_20_re accum3.x
3613 #define acc_20_im accum3.y
3614 #define acc_21_re accum3.z
3615 #define acc_21_im accum3.w
3616 #define acc_22_re accum4.x
3617 #define acc_22_im accum4.y
3618 #define acc_30_re accum4.z
3619 #define acc_30_im accum4.w
3620 #define acc_31_re accum5.x
3621 #define acc_31_im accum5.y
3622 #define acc_32_re accum5.z
3623 #define acc_32_im accum5.w
3624 
3625 #endif // SPINOR_DOUBLE
3626 
3627  {
3628  READ_ACCUM(ACCUMTEX, param.sp_stride)
3629 
3630  o1_00_re = c*o1_00_re + acc_00_re;
3631  o1_00_im = c*o1_00_im + acc_00_im;
3632  o1_01_re = c*o1_01_re + acc_01_re;
3633  o1_01_im = c*o1_01_im + acc_01_im;
3634  o1_02_re = c*o1_02_re + acc_02_re;
3635  o1_02_im = c*o1_02_im + acc_02_im;
3636  o1_10_re = c*o1_10_re + acc_10_re;
3637  o1_10_im = c*o1_10_im + acc_10_im;
3638  o1_11_re = c*o1_11_re + acc_11_re;
3639  o1_11_im = c*o1_11_im + acc_11_im;
3640  o1_12_re = c*o1_12_re + acc_12_re;
3641  o1_12_im = c*o1_12_im + acc_12_im;
3642  o1_20_re = c*o1_20_re + acc_20_re;
3643  o1_20_im = c*o1_20_im + acc_20_im;
3644  o1_21_re = c*o1_21_re + acc_21_re;
3645  o1_21_im = c*o1_21_im + acc_21_im;
3646  o1_22_re = c*o1_22_re + acc_22_re;
3647  o1_22_im = c*o1_22_im + acc_22_im;
3648  o1_30_re = c*o1_30_re + acc_30_re;
3649  o1_30_im = c*o1_30_im + acc_30_im;
3650  o1_31_re = c*o1_31_re + acc_31_re;
3651  o1_31_im = c*o1_31_im + acc_31_im;
3652  o1_32_re = c*o1_32_re + acc_32_re;
3653  o1_32_im = c*o1_32_im + acc_32_im;
3654 
3655  ASSN_ACCUM(ACCUMTEX, param.sp_stride, param.fl_stride)
3656 
3657  o2_00_re = c*o2_00_re + acc_00_re;
3658  o2_00_im = c*o2_00_im + acc_00_im;
3659  o2_01_re = c*o2_01_re + acc_01_re;
3660  o2_01_im = c*o2_01_im + acc_01_im;
3661  o2_02_re = c*o2_02_re + acc_02_re;
3662  o2_02_im = c*o2_02_im + acc_02_im;
3663  o2_10_re = c*o2_10_re + acc_10_re;
3664  o2_10_im = c*o2_10_im + acc_10_im;
3665  o2_11_re = c*o2_11_re + acc_11_re;
3666  o2_11_im = c*o2_11_im + acc_11_im;
3667  o2_12_re = c*o2_12_re + acc_12_re;
3668  o2_12_im = c*o2_12_im + acc_12_im;
3669  o2_20_re = c*o2_20_re + acc_20_re;
3670  o2_20_im = c*o2_20_im + acc_20_im;
3671  o2_21_re = c*o2_21_re + acc_21_re;
3672  o2_21_im = c*o2_21_im + acc_21_im;
3673  o2_22_re = c*o2_22_re + acc_22_re;
3674  o2_22_im = c*o2_22_im + acc_22_im;
3675  o2_30_re = c*o2_30_re + acc_30_re;
3676  o2_30_im = c*o2_30_im + acc_30_im;
3677  o2_31_re = c*o2_31_re + acc_31_re;
3678  o2_31_im = c*o2_31_im + acc_31_im;
3679  o2_32_re = c*o2_32_re + acc_32_re;
3680  o2_32_im = c*o2_32_im + acc_32_im;
3681  }
3682 
3683 #undef acc_00_re
3684 #undef acc_00_im
3685 #undef acc_01_re
3686 #undef acc_01_im
3687 #undef acc_02_re
3688 #undef acc_02_im
3689 #undef acc_10_re
3690 #undef acc_10_im
3691 #undef acc_11_re
3692 #undef acc_11_im
3693 #undef acc_12_re
3694 #undef acc_12_im
3695 #undef acc_20_re
3696 #undef acc_20_im
3697 #undef acc_21_re
3698 #undef acc_21_im
3699 #undef acc_22_re
3700 #undef acc_22_im
3701 #undef acc_30_re
3702 #undef acc_30_im
3703 #undef acc_31_re
3704 #undef acc_31_im
3705 #undef acc_32_re
3706 #undef acc_32_im
3707 
3708 #else
3709  // accum spinor
3710 #ifdef SPINOR_DOUBLE
3711 
3712 #define acc1_00_re flv1_accum0.x
3713 #define acc1_00_im flv1_accum0.y
3714 #define acc1_01_re flv1_accum1.x
3715 #define acc1_01_im flv1_accum1.y
3716 #define acc1_02_re flv1_accum2.x
3717 #define acc1_02_im flv1_accum2.y
3718 #define acc1_10_re flv1_accum3.x
3719 #define acc1_10_im flv1_accum3.y
3720 #define acc1_11_re flv1_accum4.x
3721 #define acc1_11_im flv1_accum4.y
3722 #define acc1_12_re flv1_accum5.x
3723 #define acc1_12_im flv1_accum5.y
3724 #define acc1_20_re flv1_accum6.x
3725 #define acc1_20_im flv1_accum6.y
3726 #define acc1_21_re flv1_accum7.x
3727 #define acc1_21_im flv1_accum7.y
3728 #define acc1_22_re flv1_accum8.x
3729 #define acc1_22_im flv1_accum8.y
3730 #define acc1_30_re flv1_accum9.x
3731 #define acc1_30_im flv1_accum9.y
3732 #define acc1_31_re flv1_accum10.x
3733 #define acc1_31_im flv1_accum10.y
3734 #define acc1_32_re flv1_accum11.x
3735 #define acc1_32_im flv1_accum11.y
3736 
3737 #define acc2_00_re flv2_accum0.x
3738 #define acc2_00_im flv2_accum0.y
3739 #define acc2_01_re flv2_accum1.x
3740 #define acc2_01_im flv2_accum1.y
3741 #define acc2_02_re flv2_accum2.x
3742 #define acc2_02_im flv2_accum2.y
3743 #define acc2_10_re flv2_accum3.x
3744 #define acc2_10_im flv2_accum3.y
3745 #define acc2_11_re flv2_accum4.x
3746 #define acc2_11_im flv2_accum4.y
3747 #define acc2_12_re flv2_accum5.x
3748 #define acc2_12_im flv2_accum5.y
3749 #define acc2_20_re flv2_accum6.x
3750 #define acc2_20_im flv2_accum6.y
3751 #define acc2_21_re flv2_accum7.x
3752 #define acc2_21_im flv2_accum7.y
3753 #define acc2_22_re flv2_accum8.x
3754 #define acc2_22_im flv2_accum8.y
3755 #define acc2_30_re flv2_accum9.x
3756 #define acc2_30_im flv2_accum9.y
3757 #define acc2_31_re flv2_accum10.x
3758 #define acc2_31_im flv2_accum10.y
3759 #define acc2_32_re flv2_accum11.x
3760 #define acc2_32_im flv2_accum11.y
3761 
3762 #else
3763 
3764 #define acc1_00_re flv1_accum0.x
3765 #define acc1_00_im flv1_accum0.y
3766 #define acc1_01_re flv1_accum0.z
3767 #define acc1_01_im flv1_accum0.w
3768 #define acc1_02_re flv1_accum1.x
3769 #define acc1_02_im flv1_accum1.y
3770 #define acc1_10_re flv1_accum1.z
3771 #define acc1_10_im flv1_accum1.w
3772 #define acc1_11_re flv1_accum2.x
3773 #define acc1_11_im flv1_accum2.y
3774 #define acc1_12_re flv1_accum2.z
3775 #define acc1_12_im flv1_accum2.w
3776 #define acc1_20_re flv1_accum3.x
3777 #define acc1_20_im flv1_accum3.y
3778 #define acc1_21_re flv1_accum3.z
3779 #define acc1_21_im flv1_accum3.w
3780 #define acc1_22_re flv1_accum4.x
3781 #define acc1_22_im flv1_accum4.y
3782 #define acc1_30_re flv1_accum4.z
3783 #define acc1_30_im flv1_accum4.w
3784 #define acc1_31_re flv1_accum5.x
3785 #define acc1_31_im flv1_accum5.y
3786 #define acc1_32_re flv1_accum5.z
3787 #define acc1_32_im flv1_accum5.w
3788 
3789 #define acc2_00_re flv2_accum0.x
3790 #define acc2_00_im flv2_accum0.y
3791 #define acc2_01_re flv2_accum0.z
3792 #define acc2_01_im flv2_accum0.w
3793 #define acc2_02_re flv2_accum1.x
3794 #define acc2_02_im flv2_accum1.y
3795 #define acc2_10_re flv2_accum1.z
3796 #define acc2_10_im flv2_accum1.w
3797 #define acc2_11_re flv2_accum2.x
3798 #define acc2_11_im flv2_accum2.y
3799 #define acc2_12_re flv2_accum2.z
3800 #define acc2_12_im flv2_accum2.w
3801 #define acc2_20_re flv2_accum3.x
3802 #define acc2_20_im flv2_accum3.y
3803 #define acc2_21_re flv2_accum3.z
3804 #define acc2_21_im flv2_accum3.w
3805 #define acc2_22_re flv2_accum4.x
3806 #define acc2_22_im flv2_accum4.y
3807 #define acc2_30_re flv2_accum4.z
3808 #define acc2_30_im flv2_accum4.w
3809 #define acc2_31_re flv2_accum5.x
3810 #define acc2_31_im flv2_accum5.y
3811 #define acc2_32_re flv2_accum5.z
3812 #define acc2_32_im flv2_accum5.w
3813 
3814 #endif // SPINOR_DOUBLE
3815 
3816  {
3817  READ_ACCUM_FLAVOR(ACCUMTEX, param.sp_stride, param.fl_stride)
3818 
3819  //Perform twist rotation:
3820  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3821  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3822  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3823 
3824  x1_re = 0.0, x1_im = 0.0;
3825  y1_re = 0.0, y1_im = 0.0;
3826  x2_re = 0.0, x2_im = 0.0;
3827  y2_re = 0.0, y2_im = 0.0;
3828 
3829 
3830  // using acc1 regs:
3831  x1_re = acc1_00_re + a *acc1_20_im;
3832  x1_im = acc1_00_im - a *acc1_20_re;
3833  x2_re = b * acc1_00_re;
3834  x2_im = b * acc1_00_im;
3835 
3836  y1_re = acc1_20_re + a *acc1_00_im;
3837  y1_im = acc1_20_im - a *acc1_00_re;
3838  y2_re = b * acc1_20_re;
3839  y2_im = b * acc1_20_im;
3840 
3841 
3842  // using acc2 regs:
3843  x2_re += acc2_00_re - a *acc2_20_im;
3844  x2_im += acc2_00_im + a *acc2_20_re;
3845  x1_re += b * acc2_00_re;
3846  x1_im += b * acc2_00_im;
3847 
3848  y2_re += acc2_20_re - a *acc2_00_im;
3849  y2_im += acc2_20_im + a *acc2_00_re;
3850  y1_re += b * acc2_20_re;
3851  y1_im += b * acc2_20_im;
3852 
3853 
3854  acc1_00_re = x1_re; acc1_00_im = x1_im;
3855  acc1_20_re = y1_re; acc1_20_im = y1_im;
3856 
3857  acc2_00_re = x2_re; acc2_00_im = x2_im;
3858  acc2_20_re = y2_re; acc2_20_im = y2_im;
3859 
3860  // using acc1 regs:
3861  x1_re = acc1_10_re + a *acc1_30_im;
3862  x1_im = acc1_10_im - a *acc1_30_re;
3863  x2_re = b * acc1_10_re;
3864  x2_im = b * acc1_10_im;
3865 
3866  y1_re = acc1_30_re + a *acc1_10_im;
3867  y1_im = acc1_30_im - a *acc1_10_re;
3868  y2_re = b * acc1_30_re;
3869  y2_im = b * acc1_30_im;
3870 
3871 
3872  // using acc2 regs:
3873  x2_re += acc2_10_re - a *acc2_30_im;
3874  x2_im += acc2_10_im + a *acc2_30_re;
3875  x1_re += b * acc2_10_re;
3876  x1_im += b * acc2_10_im;
3877 
3878  y2_re += acc2_30_re - a *acc2_10_im;
3879  y2_im += acc2_30_im + a *acc2_10_re;
3880  y1_re += b * acc2_30_re;
3881  y1_im += b * acc2_30_im;
3882 
3883 
3884  acc1_10_re = x1_re; acc1_10_im = x1_im;
3885  acc1_30_re = y1_re; acc1_30_im = y1_im;
3886 
3887  acc2_10_re = x2_re; acc2_10_im = x2_im;
3888  acc2_30_re = y2_re; acc2_30_im = y2_im;
3889 
3890  // using acc1 regs:
3891  x1_re = acc1_01_re + a *acc1_21_im;
3892  x1_im = acc1_01_im - a *acc1_21_re;
3893  x2_re = b * acc1_01_re;
3894  x2_im = b * acc1_01_im;
3895 
3896  y1_re = acc1_21_re + a *acc1_01_im;
3897  y1_im = acc1_21_im - a *acc1_01_re;
3898  y2_re = b * acc1_21_re;
3899  y2_im = b * acc1_21_im;
3900 
3901 
3902  // using acc2 regs:
3903  x2_re += acc2_01_re - a *acc2_21_im;
3904  x2_im += acc2_01_im + a *acc2_21_re;
3905  x1_re += b * acc2_01_re;
3906  x1_im += b * acc2_01_im;
3907 
3908  y2_re += acc2_21_re - a *acc2_01_im;
3909  y2_im += acc2_21_im + a *acc2_01_re;
3910  y1_re += b * acc2_21_re;
3911  y1_im += b * acc2_21_im;
3912 
3913 
3914  acc1_01_re = x1_re; acc1_01_im = x1_im;
3915  acc1_21_re = y1_re; acc1_21_im = y1_im;
3916 
3917  acc2_01_re = x2_re; acc2_01_im = x2_im;
3918  acc2_21_re = y2_re; acc2_21_im = y2_im;
3919 
3920  // using acc1 regs:
3921  x1_re = acc1_11_re + a *acc1_31_im;
3922  x1_im = acc1_11_im - a *acc1_31_re;
3923  x2_re = b * acc1_11_re;
3924  x2_im = b * acc1_11_im;
3925 
3926  y1_re = acc1_31_re + a *acc1_11_im;
3927  y1_im = acc1_31_im - a *acc1_11_re;
3928  y2_re = b * acc1_31_re;
3929  y2_im = b * acc1_31_im;
3930 
3931 
3932  // using acc2 regs:
3933  x2_re += acc2_11_re - a *acc2_31_im;
3934  x2_im += acc2_11_im + a *acc2_31_re;
3935  x1_re += b * acc2_11_re;
3936  x1_im += b * acc2_11_im;
3937 
3938  y2_re += acc2_31_re - a *acc2_11_im;
3939  y2_im += acc2_31_im + a *acc2_11_re;
3940  y1_re += b * acc2_31_re;
3941  y1_im += b * acc2_31_im;
3942 
3943 
3944  acc1_11_re = x1_re; acc1_11_im = x1_im;
3945  acc1_31_re = y1_re; acc1_31_im = y1_im;
3946 
3947  acc2_11_re = x2_re; acc2_11_im = x2_im;
3948  acc2_31_re = y2_re; acc2_31_im = y2_im;
3949 
3950  // using acc1 regs:
3951  x1_re = acc1_02_re + a *acc1_22_im;
3952  x1_im = acc1_02_im - a *acc1_22_re;
3953  x2_re = b * acc1_02_re;
3954  x2_im = b * acc1_02_im;
3955 
3956  y1_re = acc1_22_re + a *acc1_02_im;
3957  y1_im = acc1_22_im - a *acc1_02_re;
3958  y2_re = b * acc1_22_re;
3959  y2_im = b * acc1_22_im;
3960 
3961 
3962  // using acc2 regs:
3963  x2_re += acc2_02_re - a *acc2_22_im;
3964  x2_im += acc2_02_im + a *acc2_22_re;
3965  x1_re += b * acc2_02_re;
3966  x1_im += b * acc2_02_im;
3967 
3968  y2_re += acc2_22_re - a *acc2_02_im;
3969  y2_im += acc2_22_im + a *acc2_02_re;
3970  y1_re += b * acc2_22_re;
3971  y1_im += b * acc2_22_im;
3972 
3973 
3974  acc1_02_re = x1_re; acc1_02_im = x1_im;
3975  acc1_22_re = y1_re; acc1_22_im = y1_im;
3976 
3977  acc2_02_re = x2_re; acc2_02_im = x2_im;
3978  acc2_22_re = y2_re; acc2_22_im = y2_im;
3979 
3980  // using acc1 regs:
3981  x1_re = acc1_12_re + a *acc1_32_im;
3982  x1_im = acc1_12_im - a *acc1_32_re;
3983  x2_re = b * acc1_12_re;
3984  x2_im = b * acc1_12_im;
3985 
3986  y1_re = acc1_32_re + a *acc1_12_im;
3987  y1_im = acc1_32_im - a *acc1_12_re;
3988  y2_re = b * acc1_32_re;
3989  y2_im = b * acc1_32_im;
3990 
3991 
3992  // using acc2 regs:
3993  x2_re += acc2_12_re - a *acc2_32_im;
3994  x2_im += acc2_12_im + a *acc2_32_re;
3995  x1_re += b * acc2_12_re;
3996  x1_im += b * acc2_12_im;
3997 
3998  y2_re += acc2_32_re - a *acc2_12_im;
3999  y2_im += acc2_32_im + a *acc2_12_re;
4000  y1_re += b * acc2_32_re;
4001  y1_im += b * acc2_32_im;
4002 
4003 
4004  acc1_12_re = x1_re; acc1_12_im = x1_im;
4005  acc1_32_re = y1_re; acc1_32_im = y1_im;
4006 
4007  acc2_12_re = x2_re; acc2_12_im = x2_im;
4008  acc2_32_re = y2_re; acc2_32_im = y2_im;
4009 
4010  o1_00_re = k*o1_00_re + acc1_00_re;
4011  o1_00_im = k*o1_00_im + acc1_00_im;
4012  o1_01_re = k*o1_01_re + acc1_01_re;
4013  o1_01_im = k*o1_01_im + acc1_01_im;
4014  o1_02_re = k*o1_02_re + acc1_02_re;
4015  o1_02_im = k*o1_02_im + acc1_02_im;
4016  o1_10_re = k*o1_10_re + acc1_10_re;
4017  o1_10_im = k*o1_10_im + acc1_10_im;
4018  o1_11_re = k*o1_11_re + acc1_11_re;
4019  o1_11_im = k*o1_11_im + acc1_11_im;
4020  o1_12_re = k*o1_12_re + acc1_12_re;
4021  o1_12_im = k*o1_12_im + acc1_12_im;
4022  o1_20_re = k*o1_20_re + acc1_20_re;
4023  o1_20_im = k*o1_20_im + acc1_20_im;
4024  o1_21_re = k*o1_21_re + acc1_21_re;
4025  o1_21_im = k*o1_21_im + acc1_21_im;
4026  o1_22_re = k*o1_22_re + acc1_22_re;
4027  o1_22_im = k*o1_22_im + acc1_22_im;
4028  o1_30_re = k*o1_30_re + acc1_30_re;
4029  o1_30_im = k*o1_30_im + acc1_30_im;
4030  o1_31_re = k*o1_31_re + acc1_31_re;
4031  o1_31_im = k*o1_31_im + acc1_31_im;
4032  o1_32_re = k*o1_32_re + acc1_32_re;
4033  o1_32_im = k*o1_32_im + acc1_32_im;
4034 
4035  o2_00_re = k*o2_00_re + acc2_00_re;
4036  o2_00_im = k*o2_00_im + acc2_00_im;
4037  o2_01_re = k*o2_01_re + acc2_01_re;
4038  o2_01_im = k*o2_01_im + acc2_01_im;
4039  o2_02_re = k*o2_02_re + acc2_02_re;
4040  o2_02_im = k*o2_02_im + acc2_02_im;
4041  o2_10_re = k*o2_10_re + acc2_10_re;
4042  o2_10_im = k*o2_10_im + acc2_10_im;
4043  o2_11_re = k*o2_11_re + acc2_11_re;
4044  o2_11_im = k*o2_11_im + acc2_11_im;
4045  o2_12_re = k*o2_12_re + acc2_12_re;
4046  o2_12_im = k*o2_12_im + acc2_12_im;
4047  o2_20_re = k*o2_20_re + acc2_20_re;
4048  o2_20_im = k*o2_20_im + acc2_20_im;
4049  o2_21_re = k*o2_21_re + acc2_21_re;
4050  o2_21_im = k*o2_21_im + acc2_21_im;
4051  o2_22_re = k*o2_22_re + acc2_22_re;
4052  o2_22_im = k*o2_22_im + acc2_22_im;
4053  o2_30_re = k*o2_30_re + acc2_30_re;
4054  o2_30_im = k*o2_30_im + acc2_30_im;
4055  o2_31_re = k*o2_31_re + acc2_31_re;
4056  o2_31_im = k*o2_31_im + acc2_31_im;
4057  o2_32_re = k*o2_32_re + acc2_32_re;
4058  o2_32_im = k*o2_32_im + acc2_32_im;
4059  }
4060 
4061 #undef acc1_00_re
4062 #undef acc1_00_im
4063 #undef acc1_01_re
4064 #undef acc1_01_im
4065 #undef acc1_02_re
4066 #undef acc1_02_im
4067 #undef acc1_10_re
4068 #undef acc1_10_im
4069 #undef acc1_11_re
4070 #undef acc1_11_im
4071 #undef acc1_12_re
4072 #undef acc1_12_im
4073 #undef acc1_20_re
4074 #undef acc1_20_im
4075 #undef acc1_21_re
4076 #undef acc1_21_im
4077 #undef acc1_22_re
4078 #undef acc1_22_im
4079 #undef acc1_30_re
4080 #undef acc1_30_im
4081 #undef acc1_31_re
4082 #undef acc1_31_im
4083 #undef acc1_32_re
4084 #undef acc1_32_im
4085 
4086 #undef acc2_00_re
4087 #undef acc2_00_im
4088 #undef acc2_01_re
4089 #undef acc2_01_im
4090 #undef acc2_02_re
4091 #undef acc2_02_im
4092 #undef acc2_10_re
4093 #undef acc2_10_im
4094 #undef acc2_11_re
4095 #undef acc2_11_im
4096 #undef acc2_12_re
4097 #undef acc2_12_im
4098 #undef acc2_20_re
4099 #undef acc2_20_im
4100 #undef acc2_21_re
4101 #undef acc2_21_im
4102 #undef acc2_22_re
4103 #undef acc2_22_im
4104 #undef acc2_30_re
4105 #undef acc2_30_im
4106 #undef acc2_31_re
4107 #undef acc2_31_im
4108 #undef acc2_32_re
4109 #undef acc2_32_im
4110 
4111 #endif//DSLASH_TWIST
4112 
4113 #endif // DSLASH_XPAY
4114 }
4115 
4116 // write spinor field back to device memory
4118 
4119 // undefine to prevent warning when precision is changed
4120 #undef spinorFloat
4121 #undef g00_re
4122 #undef g00_im
4123 #undef g01_re
4124 #undef g01_im
4125 #undef g02_re
4126 #undef g02_im
4127 #undef g10_re
4128 #undef g10_im
4129 #undef g11_re
4130 #undef g11_im
4131 #undef g12_re
4132 #undef g12_im
4133 #undef g20_re
4134 #undef g20_im
4135 #undef g21_re
4136 #undef g21_im
4137 #undef g22_re
4138 #undef g22_im
4139 
4140 #undef i00_re
4141 #undef i00_im
4142 #undef i01_re
4143 #undef i01_im
4144 #undef i02_re
4145 #undef i02_im
4146 #undef i10_re
4147 #undef i10_im
4148 #undef i11_re
4149 #undef i11_im
4150 #undef i12_re
4151 #undef i12_im
4152 #undef i20_re
4153 #undef i20_im
4154 #undef i21_re
4155 #undef i21_im
4156 #undef i22_re
4157 #undef i22_im
4158 #undef i30_re
4159 #undef i30_im
4160 #undef i31_re
4161 #undef i31_im
4162 #undef i32_re
4163 #undef i32_im
4164 
4165 
4166 #undef VOLATILE
__constant__ int Vh
__constant__ int X2
VOLATILE spinorFloat o2_30_im
VOLATILE spinorFloat o2_01_im
VOLATILE spinorFloat o1_12_im
VOLATILE spinorFloat o2_30_re
VOLATILE spinorFloat o2_32_im
VOLATILE spinorFloat o1_20_re
VOLATILE spinorFloat o1_32_im
VOLATILE spinorFloat o2_00_re
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
VOLATILE spinorFloat o1_10_im
int sp_idx
VOLATILE spinorFloat o1_11_im
VOLATILE spinorFloat o2_21_re
VOLATILE spinorFloat o2_10_re
VOLATILE spinorFloat o1_12_re
VOLATILE spinorFloat o2_22_re
o1_00_im *o1_01_re *o1_01_im *o1_02_re *o1_02_im *o1_10_re *o1_10_im *o1_11_re *o1_11_im *o1_12_re *o1_12_im *o1_20_re *o1_20_im *o1_21_re *o1_21_im *o1_22_re *o1_22_im *o1_30_re *o1_30_im *o1_31_re *o1_31_im *o1_32_re *o1_32_im *o2_00_re *o2_00_im *o2_01_re *o2_01_im *o2_02_re *o2_02_im *o2_10_re *o2_10_im *o2_11_re *o2_11_im *o2_12_re *o2_12_im *o2_20_re *o2_20_im *o2_21_re *o2_21_im *o2_22_re *o2_22_im *o2_30_re *o2_30_im *o2_31_re *o2_31_im *o2_32_re *o2_32_im * WRITE_FLAVOR_SPINOR()
KernelType
VOLATILE spinorFloat o2_10_im
VOLATILE spinorFloat o2_02_re
QudaGaugeParam param
Definition: pack_test.cpp:17
VOLATILE spinorFloat o2_32_re
VOLATILE spinorFloat o1_10_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define READ_SPINOR_UP
Definition: covDev.h:130
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
VOLATILE spinorFloat o1_01_im
VOLATILE spinorFloat o2_11_re
VOLATILE spinorFloat o1_00_re
#define GAUGE0TEX
Definition: covDev.h:112
VOLATILE spinorFloat o2_20_im
VOLATILE spinorFloat o2_12_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o2_31_im
VOLATILE spinorFloat o1_02_im
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
VOLATILE spinorFloat o2_11_im
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
VOLATILE spinorFloat o1_01_re
VOLATILE spinorFloat o1_11_re
VOLATILE spinorFloat o1_21_im
VOLATILE spinorFloat o2_02_im
VOLATILE spinorFloat o1_31_im
#define SPINOR_HOP
Definition: covDev.h:158
VOLATILE spinorFloat o2_31_re
__constant__ int ga_stride
VOLATILE spinorFloat o2_22_im
VOLATILE spinorFloat o1_22_im
__constant__ int X1m1
__constant__ int X3
VOLATILE spinorFloat o2_00_im
VOLATILE spinorFloat o1_30_re
VOLATILE spinorFloat o1_00_im
VOLATILE spinorFloat o2_20_re
VOLATILE spinorFloat o1_21_re
VOLATILE spinorFloat o1_02_re
VOLATILE spinorFloat o1_30_im
VOLATILE spinorFloat o2_12_im
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
VOLATILE spinorFloat o1_32_re
__constant__ int X4m1
#define READ_SPINOR
Definition: clover_def.h:36
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define READ_SPINOR_DOWN
Definition: covDev.h:131
__constant__ int X4X3X2X1hmX3X2X1h
VOLATILE spinorFloat o1_20_im
KernelType kernel_type
VOLATILE spinorFloat o1_22_re
__constant__ int X4
__constant__ int X3m1
VOLATILE spinorFloat o2_21_im
VOLATILE spinorFloat o2_01_re
VOLATILE spinorFloat o1_31_re