QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_ndeg_dslash_core.h
Go to the documentation of this file.
1 // *** CUDA NDEG TWISTED MASS DSLASH ***
2 
3 // Arguments (double) mu, (double)eta and (double)delta
4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
5 #define FLAVORS 2
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #else
41 #define spinorFloat float
42 #define i00_re I0.x
43 #define i00_im I0.y
44 #define i01_re I0.z
45 #define i01_im I0.w
46 #define i02_re I1.x
47 #define i02_im I1.y
48 #define i10_re I1.z
49 #define i10_im I1.w
50 #define i11_re I2.x
51 #define i11_im I2.y
52 #define i12_re I2.z
53 #define i12_im I2.w
54 #define i20_re I3.x
55 #define i20_im I3.y
56 #define i21_re I3.z
57 #define i21_im I3.w
58 #define i22_re I4.x
59 #define i22_im I4.y
60 #define i30_re I4.z
61 #define i30_im I4.w
62 #define i31_re I5.x
63 #define i31_im I5.y
64 #define i32_re I5.z
65 #define i32_im I5.w
66 #endif // SPINOR_DOUBLE
67 
68 // gauge link
69 #ifdef GAUGE_FLOAT2
70 #define g00_re G0.x
71 #define g00_im G0.y
72 #define g01_re G1.x
73 #define g01_im G1.y
74 #define g02_re G2.x
75 #define g02_im G2.y
76 #define g10_re G3.x
77 #define g10_im G3.y
78 #define g11_re G4.x
79 #define g11_im G4.y
80 #define g12_re G5.x
81 #define g12_im G5.y
82 #define g20_re G6.x
83 #define g20_im G6.y
84 #define g21_re G7.x
85 #define g21_im G7.y
86 #define g22_re G8.x
87 #define g22_im G8.y
88 
89 #else
90 #define g00_re G0.x
91 #define g00_im G0.y
92 #define g01_re G0.z
93 #define g01_im G0.w
94 #define g02_re G1.x
95 #define g02_im G1.y
96 #define g10_re G1.z
97 #define g10_im G1.w
98 #define g11_re G2.x
99 #define g11_im G2.y
100 #define g12_re G2.z
101 #define g12_im G2.w
102 #define g20_re G3.x
103 #define g20_im G3.y
104 #define g21_re G3.z
105 #define g21_im G3.w
106 #define g22_re G4.x
107 #define g22_im G4.y
108 
109 #endif // GAUGE_DOUBLE
110 
111 // conjugated gauge link
112 #define gT00_re (+g00_re)
113 #define gT00_im (-g00_im)
114 #define gT01_re (+g10_re)
115 #define gT01_im (-g10_im)
116 #define gT02_re (+g20_re)
117 #define gT02_im (-g20_im)
118 #define gT10_re (+g01_re)
119 #define gT10_im (-g01_im)
120 #define gT11_re (+g11_re)
121 #define gT11_im (-g11_im)
122 #define gT12_re (+g21_re)
123 #define gT12_im (-g21_im)
124 #define gT20_re (+g02_re)
125 #define gT20_im (-g02_im)
126 #define gT21_re (+g12_re)
127 #define gT21_im (-g12_im)
128 #define gT22_re (+g22_re)
129 #define gT22_im (-g22_im)
130 
131 // output spinor for flavor 1
156 // output spinor for flavor 2
181 
182 #include "read_gauge.h"
183 #include "io_spinor.h"
184 
185 int x1, x2, x3, x4;
186 int X;
187 
188 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
189 int sp_norm_idx;
190 #endif // MULTI_GPU half precision
191 
192 int sid;
193 
194 #ifdef MULTI_GPU
195 int face_idx;
197 #endif
198 
199  sid = blockIdx.x*blockDim.x + threadIdx.x;
200  if (sid >= param.threads) return;
201 
202  // Inline by hand for the moment and assume even dimensions
203  const int dims[] = {X1, X2, X3, X4};
204  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
205 
206  o1_00_re = 0; o1_00_im = 0;
207  o1_01_re = 0; o1_01_im = 0;
208  o1_02_re = 0; o1_02_im = 0;
209  o1_10_re = 0; o1_10_im = 0;
210  o1_11_re = 0; o1_11_im = 0;
211  o1_12_re = 0; o1_12_im = 0;
212  o1_20_re = 0; o1_20_im = 0;
213  o1_21_re = 0; o1_21_im = 0;
214  o1_22_re = 0; o1_22_im = 0;
215  o1_30_re = 0; o1_30_im = 0;
216  o1_31_re = 0; o1_31_im = 0;
217  o1_32_re = 0; o1_32_im = 0;
218 
219  o2_00_re = 0; o2_00_im = 0;
220  o2_01_re = 0; o2_01_im = 0;
221  o2_02_re = 0; o2_02_im = 0;
222  o2_10_re = 0; o2_10_im = 0;
223  o2_11_re = 0; o2_11_im = 0;
224  o2_12_re = 0; o2_12_im = 0;
225  o2_20_re = 0; o2_20_im = 0;
226  o2_21_re = 0; o2_21_im = 0;
227  o2_22_re = 0; o2_22_im = 0;
228  o2_30_re = 0; o2_30_im = 0;
229  o2_31_re = 0; o2_31_im = 0;
230  o2_32_re = 0; o2_32_im = 0;
231 
232 #ifdef MULTI_GPU
233 } else { // exterior kernel
234 
235  sid = blockIdx.x*blockDim.x + threadIdx.x;
236  if (sid >= param.threads) return;
237 
238  const int dim = static_cast<int>(kernel_type);
239  const int face_volume = (param.threads >> 1); // volume of one face (per flavor)
240  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
241  face_idx = sid - face_num*face_volume; // index into the respective face
242 
243  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
244  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
245  //sp_idx = face_idx + param.ghostOffset[dim];
246 
247 #if (DD_PREC==2) // half precision
248  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)] + face_num*ghostFace[static_cast<int>(kernel_type)];
249 #endif
250 
251  const int dims[] = {X1, X2, X3, X4};
252  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
253 
254 
255  {
256  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
269 
270 
271  }
272  {
273  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid+param.fl_stride, sid+param.fl_stride);
286 
287 
288  }
289 }
290 #endif // MULTI_GPU
291 
292 
293 #ifdef MULTI_GPU
294 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
296 #endif
297 {
298  // Projector P0-
299  // 1 0 0 -i
300  // 0 1 -i 0
301  // 0 i 1 0
302  // i 0 0 1
303 
304 #ifdef MULTI_GPU
305  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
306  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
307 #else
308  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
309 #endif
310 
311  const int ga_idx = sid;
312 
319 
320  // read gauge matrix from device memory
321  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
322 
323  // reconstruct gauge matrix
325 
326  {
327 #ifdef MULTI_GPU
328  if (kernel_type == INTERIOR_KERNEL) {
329 #endif
330 
331  // read flavor 1 from device memory
332  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
333 
334  // project spinor into half spinors
335  a0_re = +i00_re+i30_im;
336  a0_im = +i00_im-i30_re;
337  a1_re = +i01_re+i31_im;
338  a1_im = +i01_im-i31_re;
339  a2_re = +i02_re+i32_im;
340  a2_im = +i02_im-i32_re;
341  b0_re = +i10_re+i20_im;
342  b0_im = +i10_im-i20_re;
343  b1_re = +i11_re+i21_im;
344  b1_im = +i11_im-i21_re;
345  b2_re = +i12_re+i22_im;
346  b2_im = +i12_im-i22_re;
347 
348 #ifdef MULTI_GPU
349  } else {
350 
351  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
352 
353  // read half spinor for the first flavor from device memory
354  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
355 
356  a0_re = i00_re; a0_im = i00_im;
357  a1_re = i01_re; a1_im = i01_im;
358  a2_re = i02_re; a2_im = i02_im;
359  b0_re = i10_re; b0_im = i10_im;
360  b1_re = i11_re; b1_im = i11_im;
361  b2_re = i12_re; b2_im = i12_im;
362 
363  }
364 #endif // MULTI_GPU
365 
366  // multiply row 0
368  A0_re += g00_re * a0_re;
369  A0_re -= g00_im * a0_im;
370  A0_re += g01_re * a1_re;
371  A0_re -= g01_im * a1_im;
372  A0_re += g02_re * a2_re;
373  A0_re -= g02_im * a2_im;
375  A0_im += g00_re * a0_im;
376  A0_im += g00_im * a0_re;
377  A0_im += g01_re * a1_im;
378  A0_im += g01_im * a1_re;
379  A0_im += g02_re * a2_im;
380  A0_im += g02_im * a2_re;
382  B0_re += g00_re * b0_re;
383  B0_re -= g00_im * b0_im;
384  B0_re += g01_re * b1_re;
385  B0_re -= g01_im * b1_im;
386  B0_re += g02_re * b2_re;
387  B0_re -= g02_im * b2_im;
389  B0_im += g00_re * b0_im;
390  B0_im += g00_im * b0_re;
391  B0_im += g01_re * b1_im;
392  B0_im += g01_im * b1_re;
393  B0_im += g02_re * b2_im;
394  B0_im += g02_im * b2_re;
395 
396  // multiply row 1
398  A1_re += g10_re * a0_re;
399  A1_re -= g10_im * a0_im;
400  A1_re += g11_re * a1_re;
401  A1_re -= g11_im * a1_im;
402  A1_re += g12_re * a2_re;
403  A1_re -= g12_im * a2_im;
405  A1_im += g10_re * a0_im;
406  A1_im += g10_im * a0_re;
407  A1_im += g11_re * a1_im;
408  A1_im += g11_im * a1_re;
409  A1_im += g12_re * a2_im;
410  A1_im += g12_im * a2_re;
412  B1_re += g10_re * b0_re;
413  B1_re -= g10_im * b0_im;
414  B1_re += g11_re * b1_re;
415  B1_re -= g11_im * b1_im;
416  B1_re += g12_re * b2_re;
417  B1_re -= g12_im * b2_im;
419  B1_im += g10_re * b0_im;
420  B1_im += g10_im * b0_re;
421  B1_im += g11_re * b1_im;
422  B1_im += g11_im * b1_re;
423  B1_im += g12_re * b2_im;
424  B1_im += g12_im * b2_re;
425 
426  // multiply row 2
428  A2_re += g20_re * a0_re;
429  A2_re -= g20_im * a0_im;
430  A2_re += g21_re * a1_re;
431  A2_re -= g21_im * a1_im;
432  A2_re += g22_re * a2_re;
433  A2_re -= g22_im * a2_im;
435  A2_im += g20_re * a0_im;
436  A2_im += g20_im * a0_re;
437  A2_im += g21_re * a1_im;
438  A2_im += g21_im * a1_re;
439  A2_im += g22_re * a2_im;
440  A2_im += g22_im * a2_re;
442  B2_re += g20_re * b0_re;
443  B2_re -= g20_im * b0_im;
444  B2_re += g21_re * b1_re;
445  B2_re -= g21_im * b1_im;
446  B2_re += g22_re * b2_re;
447  B2_re -= g22_im * b2_im;
449  B2_im += g20_re * b0_im;
450  B2_im += g20_im * b0_re;
451  B2_im += g21_re * b1_im;
452  B2_im += g21_im * b1_re;
453  B2_im += g22_re * b2_im;
454  B2_im += g22_im * b2_re;
455 
456  o1_00_re += A0_re;
457  o1_00_im += A0_im;
458  o1_10_re += B0_re;
459  o1_10_im += B0_im;
460  o1_20_re -= B0_im;
461  o1_20_im += B0_re;
462  o1_30_re -= A0_im;
463  o1_30_im += A0_re;
464 
465  o1_01_re += A1_re;
466  o1_01_im += A1_im;
467  o1_11_re += B1_re;
468  o1_11_im += B1_im;
469  o1_21_re -= B1_im;
470  o1_21_im += B1_re;
471  o1_31_re -= A1_im;
472  o1_31_im += A1_re;
473 
474  o1_02_re += A2_re;
475  o1_02_im += A2_im;
476  o1_12_re += B2_re;
477  o1_12_im += B2_im;
478  o1_22_re -= B2_im;
479  o1_22_im += B2_re;
480  o1_32_re -= A2_im;
481  o1_32_im += A2_re;
482 
483  }
484  {
485 #ifdef MULTI_GPU
486  if (kernel_type == INTERIOR_KERNEL) {
487 #endif
488 
489  // read flavor 2 from device memory
490  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
491 
492  // project spinor into half spinors
493  a0_re = +i00_re+i30_im;
494  a0_im = +i00_im-i30_re;
495  a1_re = +i01_re+i31_im;
496  a1_im = +i01_im-i31_re;
497  a2_re = +i02_re+i32_im;
498  a2_im = +i02_im-i32_re;
499  b0_re = +i10_re+i20_im;
500  b0_im = +i10_im-i20_re;
501  b1_re = +i11_re+i21_im;
502  b1_im = +i11_im-i21_re;
503  b2_re = +i12_re+i22_im;
504  b2_im = +i12_im-i22_re;
505 
506 #ifdef MULTI_GPU
507  } else {
508 
509  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
510 
511  // read half spinor for the second flavor from device memory
512  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
513  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
514 
515  a0_re = i00_re; a0_im = i00_im;
516  a1_re = i01_re; a1_im = i01_im;
517  a2_re = i02_re; a2_im = i02_im;
518  b0_re = i10_re; b0_im = i10_im;
519  b1_re = i11_re; b1_im = i11_im;
520  b2_re = i12_re; b2_im = i12_im;
521 
522  }
523 #endif // MULTI_GPU
524 
525  // multiply row 0
526  spinorFloat A0_re = 0;
527  A0_re += g00_re * a0_re;
528  A0_re -= g00_im * a0_im;
529  A0_re += g01_re * a1_re;
530  A0_re -= g01_im * a1_im;
531  A0_re += g02_re * a2_re;
532  A0_re -= g02_im * a2_im;
533  spinorFloat A0_im = 0;
534  A0_im += g00_re * a0_im;
535  A0_im += g00_im * a0_re;
536  A0_im += g01_re * a1_im;
537  A0_im += g01_im * a1_re;
538  A0_im += g02_re * a2_im;
539  A0_im += g02_im * a2_re;
540  spinorFloat B0_re = 0;
541  B0_re += g00_re * b0_re;
542  B0_re -= g00_im * b0_im;
543  B0_re += g01_re * b1_re;
544  B0_re -= g01_im * b1_im;
545  B0_re += g02_re * b2_re;
546  B0_re -= g02_im * b2_im;
547  spinorFloat B0_im = 0;
548  B0_im += g00_re * b0_im;
549  B0_im += g00_im * b0_re;
550  B0_im += g01_re * b1_im;
551  B0_im += g01_im * b1_re;
552  B0_im += g02_re * b2_im;
553  B0_im += g02_im * b2_re;
554 
555  // multiply row 1
556  spinorFloat A1_re = 0;
557  A1_re += g10_re * a0_re;
558  A1_re -= g10_im * a0_im;
559  A1_re += g11_re * a1_re;
560  A1_re -= g11_im * a1_im;
561  A1_re += g12_re * a2_re;
562  A1_re -= g12_im * a2_im;
563  spinorFloat A1_im = 0;
564  A1_im += g10_re * a0_im;
565  A1_im += g10_im * a0_re;
566  A1_im += g11_re * a1_im;
567  A1_im += g11_im * a1_re;
568  A1_im += g12_re * a2_im;
569  A1_im += g12_im * a2_re;
570  spinorFloat B1_re = 0;
571  B1_re += g10_re * b0_re;
572  B1_re -= g10_im * b0_im;
573  B1_re += g11_re * b1_re;
574  B1_re -= g11_im * b1_im;
575  B1_re += g12_re * b2_re;
576  B1_re -= g12_im * b2_im;
577  spinorFloat B1_im = 0;
578  B1_im += g10_re * b0_im;
579  B1_im += g10_im * b0_re;
580  B1_im += g11_re * b1_im;
581  B1_im += g11_im * b1_re;
582  B1_im += g12_re * b2_im;
583  B1_im += g12_im * b2_re;
584 
585  // multiply row 2
586  spinorFloat A2_re = 0;
587  A2_re += g20_re * a0_re;
588  A2_re -= g20_im * a0_im;
589  A2_re += g21_re * a1_re;
590  A2_re -= g21_im * a1_im;
591  A2_re += g22_re * a2_re;
592  A2_re -= g22_im * a2_im;
593  spinorFloat A2_im = 0;
594  A2_im += g20_re * a0_im;
595  A2_im += g20_im * a0_re;
596  A2_im += g21_re * a1_im;
597  A2_im += g21_im * a1_re;
598  A2_im += g22_re * a2_im;
599  A2_im += g22_im * a2_re;
600  spinorFloat B2_re = 0;
601  B2_re += g20_re * b0_re;
602  B2_re -= g20_im * b0_im;
603  B2_re += g21_re * b1_re;
604  B2_re -= g21_im * b1_im;
605  B2_re += g22_re * b2_re;
606  B2_re -= g22_im * b2_im;
607  spinorFloat B2_im = 0;
608  B2_im += g20_re * b0_im;
609  B2_im += g20_im * b0_re;
610  B2_im += g21_re * b1_im;
611  B2_im += g21_im * b1_re;
612  B2_im += g22_re * b2_im;
613  B2_im += g22_im * b2_re;
614 
615  o2_00_re += A0_re;
616  o2_00_im += A0_im;
617  o2_10_re += B0_re;
618  o2_10_im += B0_im;
619  o2_20_re -= B0_im;
620  o2_20_im += B0_re;
621  o2_30_re -= A0_im;
622  o2_30_im += A0_re;
623 
624  o2_01_re += A1_re;
625  o2_01_im += A1_im;
626  o2_11_re += B1_re;
627  o2_11_im += B1_im;
628  o2_21_re -= B1_im;
629  o2_21_im += B1_re;
630  o2_31_re -= A1_im;
631  o2_31_im += A1_re;
632 
633  o2_02_re += A2_re;
634  o2_02_im += A2_im;
635  o2_12_re += B2_re;
636  o2_12_im += B2_im;
637  o2_22_re -= B2_im;
638  o2_22_im += B2_re;
639  o2_32_re -= A2_im;
640  o2_32_im += A2_re;
641 
642  }
643 }
644 
645 #ifdef MULTI_GPU
646 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
647  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
648 #endif
649 {
650  // Projector P0+
651  // 1 0 0 i
652  // 0 1 i 0
653  // 0 -i 1 0
654  // -i 0 0 1
655 
656 #ifdef MULTI_GPU
657  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
658  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
659 #else
660  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
661 #endif
662 
663 #ifdef MULTI_GPU
664  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
665 #else
666  const int ga_idx = sp_idx;
667 #endif
668 
675 
676  // read gauge matrix from device memory
677  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
678 
679  // reconstruct gauge matrix
681 
682  {
683 #ifdef MULTI_GPU
684  if (kernel_type == INTERIOR_KERNEL) {
685 #endif
686 
687  // read flavor 1 from device memory
688  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
689 
690  // project spinor into half spinors
691  a0_re = +i00_re-i30_im;
692  a0_im = +i00_im+i30_re;
693  a1_re = +i01_re-i31_im;
694  a1_im = +i01_im+i31_re;
695  a2_re = +i02_re-i32_im;
696  a2_im = +i02_im+i32_re;
697  b0_re = +i10_re-i20_im;
698  b0_im = +i10_im+i20_re;
699  b1_re = +i11_re-i21_im;
700  b1_im = +i11_im+i21_re;
701  b2_re = +i12_re-i22_im;
702  b2_im = +i12_im+i22_re;
703 
704 #ifdef MULTI_GPU
705  } else {
706 
707  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
708 
709  // read half spinor for the first flavor from device memory
710  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
711 
712  a0_re = i00_re; a0_im = i00_im;
713  a1_re = i01_re; a1_im = i01_im;
714  a2_re = i02_re; a2_im = i02_im;
715  b0_re = i10_re; b0_im = i10_im;
716  b1_re = i11_re; b1_im = i11_im;
717  b2_re = i12_re; b2_im = i12_im;
718 
719  }
720 #endif // MULTI_GPU
721 
722  // multiply row 0
723  spinorFloat A0_re = 0;
724  A0_re += gT00_re * a0_re;
725  A0_re -= gT00_im * a0_im;
726  A0_re += gT01_re * a1_re;
727  A0_re -= gT01_im * a1_im;
728  A0_re += gT02_re * a2_re;
729  A0_re -= gT02_im * a2_im;
730  spinorFloat A0_im = 0;
731  A0_im += gT00_re * a0_im;
732  A0_im += gT00_im * a0_re;
733  A0_im += gT01_re * a1_im;
734  A0_im += gT01_im * a1_re;
735  A0_im += gT02_re * a2_im;
736  A0_im += gT02_im * a2_re;
737  spinorFloat B0_re = 0;
738  B0_re += gT00_re * b0_re;
739  B0_re -= gT00_im * b0_im;
740  B0_re += gT01_re * b1_re;
741  B0_re -= gT01_im * b1_im;
742  B0_re += gT02_re * b2_re;
743  B0_re -= gT02_im * b2_im;
744  spinorFloat B0_im = 0;
745  B0_im += gT00_re * b0_im;
746  B0_im += gT00_im * b0_re;
747  B0_im += gT01_re * b1_im;
748  B0_im += gT01_im * b1_re;
749  B0_im += gT02_re * b2_im;
750  B0_im += gT02_im * b2_re;
751 
752  // multiply row 1
753  spinorFloat A1_re = 0;
754  A1_re += gT10_re * a0_re;
755  A1_re -= gT10_im * a0_im;
756  A1_re += gT11_re * a1_re;
757  A1_re -= gT11_im * a1_im;
758  A1_re += gT12_re * a2_re;
759  A1_re -= gT12_im * a2_im;
760  spinorFloat A1_im = 0;
761  A1_im += gT10_re * a0_im;
762  A1_im += gT10_im * a0_re;
763  A1_im += gT11_re * a1_im;
764  A1_im += gT11_im * a1_re;
765  A1_im += gT12_re * a2_im;
766  A1_im += gT12_im * a2_re;
767  spinorFloat B1_re = 0;
768  B1_re += gT10_re * b0_re;
769  B1_re -= gT10_im * b0_im;
770  B1_re += gT11_re * b1_re;
771  B1_re -= gT11_im * b1_im;
772  B1_re += gT12_re * b2_re;
773  B1_re -= gT12_im * b2_im;
774  spinorFloat B1_im = 0;
775  B1_im += gT10_re * b0_im;
776  B1_im += gT10_im * b0_re;
777  B1_im += gT11_re * b1_im;
778  B1_im += gT11_im * b1_re;
779  B1_im += gT12_re * b2_im;
780  B1_im += gT12_im * b2_re;
781 
782  // multiply row 2
783  spinorFloat A2_re = 0;
784  A2_re += gT20_re * a0_re;
785  A2_re -= gT20_im * a0_im;
786  A2_re += gT21_re * a1_re;
787  A2_re -= gT21_im * a1_im;
788  A2_re += gT22_re * a2_re;
789  A2_re -= gT22_im * a2_im;
790  spinorFloat A2_im = 0;
791  A2_im += gT20_re * a0_im;
792  A2_im += gT20_im * a0_re;
793  A2_im += gT21_re * a1_im;
794  A2_im += gT21_im * a1_re;
795  A2_im += gT22_re * a2_im;
796  A2_im += gT22_im * a2_re;
797  spinorFloat B2_re = 0;
798  B2_re += gT20_re * b0_re;
799  B2_re -= gT20_im * b0_im;
800  B2_re += gT21_re * b1_re;
801  B2_re -= gT21_im * b1_im;
802  B2_re += gT22_re * b2_re;
803  B2_re -= gT22_im * b2_im;
804  spinorFloat B2_im = 0;
805  B2_im += gT20_re * b0_im;
806  B2_im += gT20_im * b0_re;
807  B2_im += gT21_re * b1_im;
808  B2_im += gT21_im * b1_re;
809  B2_im += gT22_re * b2_im;
810  B2_im += gT22_im * b2_re;
811 
812  o1_00_re += A0_re;
813  o1_00_im += A0_im;
814  o1_10_re += B0_re;
815  o1_10_im += B0_im;
816  o1_20_re += B0_im;
817  o1_20_im -= B0_re;
818  o1_30_re += A0_im;
819  o1_30_im -= A0_re;
820 
821  o1_01_re += A1_re;
822  o1_01_im += A1_im;
823  o1_11_re += B1_re;
824  o1_11_im += B1_im;
825  o1_21_re += B1_im;
826  o1_21_im -= B1_re;
827  o1_31_re += A1_im;
828  o1_31_im -= A1_re;
829 
830  o1_02_re += A2_re;
831  o1_02_im += A2_im;
832  o1_12_re += B2_re;
833  o1_12_im += B2_im;
834  o1_22_re += B2_im;
835  o1_22_im -= B2_re;
836  o1_32_re += A2_im;
837  o1_32_im -= A2_re;
838 
839  }
840  {
841 #ifdef MULTI_GPU
842  if (kernel_type == INTERIOR_KERNEL) {
843 #endif
844 
845  // read flavor 2 from device memory
846  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
847 
848  // project spinor into half spinors
849  a0_re = +i00_re-i30_im;
850  a0_im = +i00_im+i30_re;
851  a1_re = +i01_re-i31_im;
852  a1_im = +i01_im+i31_re;
853  a2_re = +i02_re-i32_im;
854  a2_im = +i02_im+i32_re;
855  b0_re = +i10_re-i20_im;
856  b0_im = +i10_im+i20_re;
857  b1_re = +i11_re-i21_im;
858  b1_im = +i11_im+i21_re;
859  b2_re = +i12_re-i22_im;
860  b2_im = +i12_im+i22_re;
861 
862 #ifdef MULTI_GPU
863  } else {
864 
865  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
866 
867  // read half spinor for the second flavor from device memory
868  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
869  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
870 
871  a0_re = i00_re; a0_im = i00_im;
872  a1_re = i01_re; a1_im = i01_im;
873  a2_re = i02_re; a2_im = i02_im;
874  b0_re = i10_re; b0_im = i10_im;
875  b1_re = i11_re; b1_im = i11_im;
876  b2_re = i12_re; b2_im = i12_im;
877 
878  }
879 #endif // MULTI_GPU
880 
881  // multiply row 0
882  spinorFloat A0_re = 0;
883  A0_re += gT00_re * a0_re;
884  A0_re -= gT00_im * a0_im;
885  A0_re += gT01_re * a1_re;
886  A0_re -= gT01_im * a1_im;
887  A0_re += gT02_re * a2_re;
888  A0_re -= gT02_im * a2_im;
889  spinorFloat A0_im = 0;
890  A0_im += gT00_re * a0_im;
891  A0_im += gT00_im * a0_re;
892  A0_im += gT01_re * a1_im;
893  A0_im += gT01_im * a1_re;
894  A0_im += gT02_re * a2_im;
895  A0_im += gT02_im * a2_re;
896  spinorFloat B0_re = 0;
897  B0_re += gT00_re * b0_re;
898  B0_re -= gT00_im * b0_im;
899  B0_re += gT01_re * b1_re;
900  B0_re -= gT01_im * b1_im;
901  B0_re += gT02_re * b2_re;
902  B0_re -= gT02_im * b2_im;
903  spinorFloat B0_im = 0;
904  B0_im += gT00_re * b0_im;
905  B0_im += gT00_im * b0_re;
906  B0_im += gT01_re * b1_im;
907  B0_im += gT01_im * b1_re;
908  B0_im += gT02_re * b2_im;
909  B0_im += gT02_im * b2_re;
910 
911  // multiply row 1
912  spinorFloat A1_re = 0;
913  A1_re += gT10_re * a0_re;
914  A1_re -= gT10_im * a0_im;
915  A1_re += gT11_re * a1_re;
916  A1_re -= gT11_im * a1_im;
917  A1_re += gT12_re * a2_re;
918  A1_re -= gT12_im * a2_im;
919  spinorFloat A1_im = 0;
920  A1_im += gT10_re * a0_im;
921  A1_im += gT10_im * a0_re;
922  A1_im += gT11_re * a1_im;
923  A1_im += gT11_im * a1_re;
924  A1_im += gT12_re * a2_im;
925  A1_im += gT12_im * a2_re;
926  spinorFloat B1_re = 0;
927  B1_re += gT10_re * b0_re;
928  B1_re -= gT10_im * b0_im;
929  B1_re += gT11_re * b1_re;
930  B1_re -= gT11_im * b1_im;
931  B1_re += gT12_re * b2_re;
932  B1_re -= gT12_im * b2_im;
933  spinorFloat B1_im = 0;
934  B1_im += gT10_re * b0_im;
935  B1_im += gT10_im * b0_re;
936  B1_im += gT11_re * b1_im;
937  B1_im += gT11_im * b1_re;
938  B1_im += gT12_re * b2_im;
939  B1_im += gT12_im * b2_re;
940 
941  // multiply row 2
942  spinorFloat A2_re = 0;
943  A2_re += gT20_re * a0_re;
944  A2_re -= gT20_im * a0_im;
945  A2_re += gT21_re * a1_re;
946  A2_re -= gT21_im * a1_im;
947  A2_re += gT22_re * a2_re;
948  A2_re -= gT22_im * a2_im;
949  spinorFloat A2_im = 0;
950  A2_im += gT20_re * a0_im;
951  A2_im += gT20_im * a0_re;
952  A2_im += gT21_re * a1_im;
953  A2_im += gT21_im * a1_re;
954  A2_im += gT22_re * a2_im;
955  A2_im += gT22_im * a2_re;
956  spinorFloat B2_re = 0;
957  B2_re += gT20_re * b0_re;
958  B2_re -= gT20_im * b0_im;
959  B2_re += gT21_re * b1_re;
960  B2_re -= gT21_im * b1_im;
961  B2_re += gT22_re * b2_re;
962  B2_re -= gT22_im * b2_im;
963  spinorFloat B2_im = 0;
964  B2_im += gT20_re * b0_im;
965  B2_im += gT20_im * b0_re;
966  B2_im += gT21_re * b1_im;
967  B2_im += gT21_im * b1_re;
968  B2_im += gT22_re * b2_im;
969  B2_im += gT22_im * b2_re;
970 
971  o2_00_re += A0_re;
972  o2_00_im += A0_im;
973  o2_10_re += B0_re;
974  o2_10_im += B0_im;
975  o2_20_re += B0_im;
976  o2_20_im -= B0_re;
977  o2_30_re += A0_im;
978  o2_30_im -= A0_re;
979 
980  o2_01_re += A1_re;
981  o2_01_im += A1_im;
982  o2_11_re += B1_re;
983  o2_11_im += B1_im;
984  o2_21_re += B1_im;
985  o2_21_im -= B1_re;
986  o2_31_re += A1_im;
987  o2_31_im -= A1_re;
988 
989  o2_02_re += A2_re;
990  o2_02_im += A2_im;
991  o2_12_re += B2_re;
992  o2_12_im += B2_im;
993  o2_22_re += B2_im;
994  o2_22_im -= B2_re;
995  o2_32_re += A2_im;
996  o2_32_im -= A2_re;
997 
998  }
999 }
1000 
1001 #ifdef MULTI_GPU
1002 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1004 #endif
1005 {
1006  // Projector P1-
1007  // 1 0 0 -1
1008  // 0 1 1 0
1009  // 0 1 1 0
1010  // -1 0 0 1
1011 
1012 #ifdef MULTI_GPU
1013  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1014  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1015 #else
1016  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1017 #endif
1018 
1019  const int ga_idx = sid;
1020 
1027 
1028  // read gauge matrix from device memory
1029  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1030 
1031  // reconstruct gauge matrix
1033 
1034  {
1035 #ifdef MULTI_GPU
1036  if (kernel_type == INTERIOR_KERNEL) {
1037 #endif
1038 
1039  // read flavor 1 from device memory
1040  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1041 
1042  // project spinor into half spinors
1043  a0_re = +i00_re-i30_re;
1044  a0_im = +i00_im-i30_im;
1045  a1_re = +i01_re-i31_re;
1046  a1_im = +i01_im-i31_im;
1047  a2_re = +i02_re-i32_re;
1048  a2_im = +i02_im-i32_im;
1049  b0_re = +i10_re+i20_re;
1050  b0_im = +i10_im+i20_im;
1051  b1_re = +i11_re+i21_re;
1052  b1_im = +i11_im+i21_im;
1053  b2_re = +i12_re+i22_re;
1054  b2_im = +i12_im+i22_im;
1055 
1056 #ifdef MULTI_GPU
1057  } else {
1058 
1059  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1060 
1061  // read half spinor for the first flavor from device memory
1062  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1063 
1064  a0_re = i00_re; a0_im = i00_im;
1065  a1_re = i01_re; a1_im = i01_im;
1066  a2_re = i02_re; a2_im = i02_im;
1067  b0_re = i10_re; b0_im = i10_im;
1068  b1_re = i11_re; b1_im = i11_im;
1069  b2_re = i12_re; b2_im = i12_im;
1070 
1071  }
1072 #endif // MULTI_GPU
1073 
1074  // multiply row 0
1075  spinorFloat A0_re = 0;
1076  A0_re += g00_re * a0_re;
1077  A0_re -= g00_im * a0_im;
1078  A0_re += g01_re * a1_re;
1079  A0_re -= g01_im * a1_im;
1080  A0_re += g02_re * a2_re;
1081  A0_re -= g02_im * a2_im;
1082  spinorFloat A0_im = 0;
1083  A0_im += g00_re * a0_im;
1084  A0_im += g00_im * a0_re;
1085  A0_im += g01_re * a1_im;
1086  A0_im += g01_im * a1_re;
1087  A0_im += g02_re * a2_im;
1088  A0_im += g02_im * a2_re;
1089  spinorFloat B0_re = 0;
1090  B0_re += g00_re * b0_re;
1091  B0_re -= g00_im * b0_im;
1092  B0_re += g01_re * b1_re;
1093  B0_re -= g01_im * b1_im;
1094  B0_re += g02_re * b2_re;
1095  B0_re -= g02_im * b2_im;
1096  spinorFloat B0_im = 0;
1097  B0_im += g00_re * b0_im;
1098  B0_im += g00_im * b0_re;
1099  B0_im += g01_re * b1_im;
1100  B0_im += g01_im * b1_re;
1101  B0_im += g02_re * b2_im;
1102  B0_im += g02_im * b2_re;
1103 
1104  // multiply row 1
1105  spinorFloat A1_re = 0;
1106  A1_re += g10_re * a0_re;
1107  A1_re -= g10_im * a0_im;
1108  A1_re += g11_re * a1_re;
1109  A1_re -= g11_im * a1_im;
1110  A1_re += g12_re * a2_re;
1111  A1_re -= g12_im * a2_im;
1112  spinorFloat A1_im = 0;
1113  A1_im += g10_re * a0_im;
1114  A1_im += g10_im * a0_re;
1115  A1_im += g11_re * a1_im;
1116  A1_im += g11_im * a1_re;
1117  A1_im += g12_re * a2_im;
1118  A1_im += g12_im * a2_re;
1119  spinorFloat B1_re = 0;
1120  B1_re += g10_re * b0_re;
1121  B1_re -= g10_im * b0_im;
1122  B1_re += g11_re * b1_re;
1123  B1_re -= g11_im * b1_im;
1124  B1_re += g12_re * b2_re;
1125  B1_re -= g12_im * b2_im;
1126  spinorFloat B1_im = 0;
1127  B1_im += g10_re * b0_im;
1128  B1_im += g10_im * b0_re;
1129  B1_im += g11_re * b1_im;
1130  B1_im += g11_im * b1_re;
1131  B1_im += g12_re * b2_im;
1132  B1_im += g12_im * b2_re;
1133 
1134  // multiply row 2
1135  spinorFloat A2_re = 0;
1136  A2_re += g20_re * a0_re;
1137  A2_re -= g20_im * a0_im;
1138  A2_re += g21_re * a1_re;
1139  A2_re -= g21_im * a1_im;
1140  A2_re += g22_re * a2_re;
1141  A2_re -= g22_im * a2_im;
1142  spinorFloat A2_im = 0;
1143  A2_im += g20_re * a0_im;
1144  A2_im += g20_im * a0_re;
1145  A2_im += g21_re * a1_im;
1146  A2_im += g21_im * a1_re;
1147  A2_im += g22_re * a2_im;
1148  A2_im += g22_im * a2_re;
1149  spinorFloat B2_re = 0;
1150  B2_re += g20_re * b0_re;
1151  B2_re -= g20_im * b0_im;
1152  B2_re += g21_re * b1_re;
1153  B2_re -= g21_im * b1_im;
1154  B2_re += g22_re * b2_re;
1155  B2_re -= g22_im * b2_im;
1156  spinorFloat B2_im = 0;
1157  B2_im += g20_re * b0_im;
1158  B2_im += g20_im * b0_re;
1159  B2_im += g21_re * b1_im;
1160  B2_im += g21_im * b1_re;
1161  B2_im += g22_re * b2_im;
1162  B2_im += g22_im * b2_re;
1163 
1164  o1_00_re += A0_re;
1165  o1_00_im += A0_im;
1166  o1_10_re += B0_re;
1167  o1_10_im += B0_im;
1168  o1_20_re += B0_re;
1169  o1_20_im += B0_im;
1170  o1_30_re -= A0_re;
1171  o1_30_im -= A0_im;
1172 
1173  o1_01_re += A1_re;
1174  o1_01_im += A1_im;
1175  o1_11_re += B1_re;
1176  o1_11_im += B1_im;
1177  o1_21_re += B1_re;
1178  o1_21_im += B1_im;
1179  o1_31_re -= A1_re;
1180  o1_31_im -= A1_im;
1181 
1182  o1_02_re += A2_re;
1183  o1_02_im += A2_im;
1184  o1_12_re += B2_re;
1185  o1_12_im += B2_im;
1186  o1_22_re += B2_re;
1187  o1_22_im += B2_im;
1188  o1_32_re -= A2_re;
1189  o1_32_im -= A2_im;
1190 
1191  }
1192  {
1193 #ifdef MULTI_GPU
1194  if (kernel_type == INTERIOR_KERNEL) {
1195 #endif
1196 
1197  // read flavor 2 from device memory
1198  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1199 
1200  // project spinor into half spinors
1201  a0_re = +i00_re-i30_re;
1202  a0_im = +i00_im-i30_im;
1203  a1_re = +i01_re-i31_re;
1204  a1_im = +i01_im-i31_im;
1205  a2_re = +i02_re-i32_re;
1206  a2_im = +i02_im-i32_im;
1207  b0_re = +i10_re+i20_re;
1208  b0_im = +i10_im+i20_im;
1209  b1_re = +i11_re+i21_re;
1210  b1_im = +i11_im+i21_im;
1211  b2_re = +i12_re+i22_re;
1212  b2_im = +i12_im+i22_im;
1213 
1214 #ifdef MULTI_GPU
1215  } else {
1216 
1217  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1218 
1219  // read half spinor for the second flavor from device memory
1220  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1221  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1222 
1223  a0_re = i00_re; a0_im = i00_im;
1224  a1_re = i01_re; a1_im = i01_im;
1225  a2_re = i02_re; a2_im = i02_im;
1226  b0_re = i10_re; b0_im = i10_im;
1227  b1_re = i11_re; b1_im = i11_im;
1228  b2_re = i12_re; b2_im = i12_im;
1229 
1230  }
1231 #endif // MULTI_GPU
1232 
1233  // multiply row 0
1234  spinorFloat A0_re = 0;
1235  A0_re += g00_re * a0_re;
1236  A0_re -= g00_im * a0_im;
1237  A0_re += g01_re * a1_re;
1238  A0_re -= g01_im * a1_im;
1239  A0_re += g02_re * a2_re;
1240  A0_re -= g02_im * a2_im;
1241  spinorFloat A0_im = 0;
1242  A0_im += g00_re * a0_im;
1243  A0_im += g00_im * a0_re;
1244  A0_im += g01_re * a1_im;
1245  A0_im += g01_im * a1_re;
1246  A0_im += g02_re * a2_im;
1247  A0_im += g02_im * a2_re;
1248  spinorFloat B0_re = 0;
1249  B0_re += g00_re * b0_re;
1250  B0_re -= g00_im * b0_im;
1251  B0_re += g01_re * b1_re;
1252  B0_re -= g01_im * b1_im;
1253  B0_re += g02_re * b2_re;
1254  B0_re -= g02_im * b2_im;
1255  spinorFloat B0_im = 0;
1256  B0_im += g00_re * b0_im;
1257  B0_im += g00_im * b0_re;
1258  B0_im += g01_re * b1_im;
1259  B0_im += g01_im * b1_re;
1260  B0_im += g02_re * b2_im;
1261  B0_im += g02_im * b2_re;
1262 
1263  // multiply row 1
1264  spinorFloat A1_re = 0;
1265  A1_re += g10_re * a0_re;
1266  A1_re -= g10_im * a0_im;
1267  A1_re += g11_re * a1_re;
1268  A1_re -= g11_im * a1_im;
1269  A1_re += g12_re * a2_re;
1270  A1_re -= g12_im * a2_im;
1271  spinorFloat A1_im = 0;
1272  A1_im += g10_re * a0_im;
1273  A1_im += g10_im * a0_re;
1274  A1_im += g11_re * a1_im;
1275  A1_im += g11_im * a1_re;
1276  A1_im += g12_re * a2_im;
1277  A1_im += g12_im * a2_re;
1278  spinorFloat B1_re = 0;
1279  B1_re += g10_re * b0_re;
1280  B1_re -= g10_im * b0_im;
1281  B1_re += g11_re * b1_re;
1282  B1_re -= g11_im * b1_im;
1283  B1_re += g12_re * b2_re;
1284  B1_re -= g12_im * b2_im;
1285  spinorFloat B1_im = 0;
1286  B1_im += g10_re * b0_im;
1287  B1_im += g10_im * b0_re;
1288  B1_im += g11_re * b1_im;
1289  B1_im += g11_im * b1_re;
1290  B1_im += g12_re * b2_im;
1291  B1_im += g12_im * b2_re;
1292 
1293  // multiply row 2
1294  spinorFloat A2_re = 0;
1295  A2_re += g20_re * a0_re;
1296  A2_re -= g20_im * a0_im;
1297  A2_re += g21_re * a1_re;
1298  A2_re -= g21_im * a1_im;
1299  A2_re += g22_re * a2_re;
1300  A2_re -= g22_im * a2_im;
1301  spinorFloat A2_im = 0;
1302  A2_im += g20_re * a0_im;
1303  A2_im += g20_im * a0_re;
1304  A2_im += g21_re * a1_im;
1305  A2_im += g21_im * a1_re;
1306  A2_im += g22_re * a2_im;
1307  A2_im += g22_im * a2_re;
1308  spinorFloat B2_re = 0;
1309  B2_re += g20_re * b0_re;
1310  B2_re -= g20_im * b0_im;
1311  B2_re += g21_re * b1_re;
1312  B2_re -= g21_im * b1_im;
1313  B2_re += g22_re * b2_re;
1314  B2_re -= g22_im * b2_im;
1315  spinorFloat B2_im = 0;
1316  B2_im += g20_re * b0_im;
1317  B2_im += g20_im * b0_re;
1318  B2_im += g21_re * b1_im;
1319  B2_im += g21_im * b1_re;
1320  B2_im += g22_re * b2_im;
1321  B2_im += g22_im * b2_re;
1322 
1323  o2_00_re += A0_re;
1324  o2_00_im += A0_im;
1325  o2_10_re += B0_re;
1326  o2_10_im += B0_im;
1327  o2_20_re += B0_re;
1328  o2_20_im += B0_im;
1329  o2_30_re -= A0_re;
1330  o2_30_im -= A0_im;
1331 
1332  o2_01_re += A1_re;
1333  o2_01_im += A1_im;
1334  o2_11_re += B1_re;
1335  o2_11_im += B1_im;
1336  o2_21_re += B1_re;
1337  o2_21_im += B1_im;
1338  o2_31_re -= A1_re;
1339  o2_31_im -= A1_im;
1340 
1341  o2_02_re += A2_re;
1342  o2_02_im += A2_im;
1343  o2_12_re += B2_re;
1344  o2_12_im += B2_im;
1345  o2_22_re += B2_re;
1346  o2_22_im += B2_im;
1347  o2_32_re -= A2_re;
1348  o2_32_im -= A2_im;
1349 
1350  }
1351 }
1352 
1353 #ifdef MULTI_GPU
1354 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1355  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1356 #endif
1357 {
1358  // Projector P1+
1359  // 1 0 0 1
1360  // 0 1 -1 0
1361  // 0 -1 1 0
1362  // 1 0 0 1
1363 
1364 #ifdef MULTI_GPU
1365  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1366  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1367 #else
1368  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1369 #endif
1370 
1371 #ifdef MULTI_GPU
1372  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1373 #else
1374  const int ga_idx = sp_idx;
1375 #endif
1376 
1383 
1384  // read gauge matrix from device memory
1385  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1386 
1387  // reconstruct gauge matrix
1389 
1390  {
1391 #ifdef MULTI_GPU
1392  if (kernel_type == INTERIOR_KERNEL) {
1393 #endif
1394 
1395  // read flavor 1 from device memory
1396  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1397 
1398  // project spinor into half spinors
1399  a0_re = +i00_re+i30_re;
1400  a0_im = +i00_im+i30_im;
1401  a1_re = +i01_re+i31_re;
1402  a1_im = +i01_im+i31_im;
1403  a2_re = +i02_re+i32_re;
1404  a2_im = +i02_im+i32_im;
1405  b0_re = +i10_re-i20_re;
1406  b0_im = +i10_im-i20_im;
1407  b1_re = +i11_re-i21_re;
1408  b1_im = +i11_im-i21_im;
1409  b2_re = +i12_re-i22_re;
1410  b2_im = +i12_im-i22_im;
1411 
1412 #ifdef MULTI_GPU
1413  } else {
1414 
1415  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1416 
1417  // read half spinor for the first flavor from device memory
1418  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1419 
1420  a0_re = i00_re; a0_im = i00_im;
1421  a1_re = i01_re; a1_im = i01_im;
1422  a2_re = i02_re; a2_im = i02_im;
1423  b0_re = i10_re; b0_im = i10_im;
1424  b1_re = i11_re; b1_im = i11_im;
1425  b2_re = i12_re; b2_im = i12_im;
1426 
1427  }
1428 #endif // MULTI_GPU
1429 
1430  // multiply row 0
1431  spinorFloat A0_re = 0;
1432  A0_re += gT00_re * a0_re;
1433  A0_re -= gT00_im * a0_im;
1434  A0_re += gT01_re * a1_re;
1435  A0_re -= gT01_im * a1_im;
1436  A0_re += gT02_re * a2_re;
1437  A0_re -= gT02_im * a2_im;
1438  spinorFloat A0_im = 0;
1439  A0_im += gT00_re * a0_im;
1440  A0_im += gT00_im * a0_re;
1441  A0_im += gT01_re * a1_im;
1442  A0_im += gT01_im * a1_re;
1443  A0_im += gT02_re * a2_im;
1444  A0_im += gT02_im * a2_re;
1445  spinorFloat B0_re = 0;
1446  B0_re += gT00_re * b0_re;
1447  B0_re -= gT00_im * b0_im;
1448  B0_re += gT01_re * b1_re;
1449  B0_re -= gT01_im * b1_im;
1450  B0_re += gT02_re * b2_re;
1451  B0_re -= gT02_im * b2_im;
1452  spinorFloat B0_im = 0;
1453  B0_im += gT00_re * b0_im;
1454  B0_im += gT00_im * b0_re;
1455  B0_im += gT01_re * b1_im;
1456  B0_im += gT01_im * b1_re;
1457  B0_im += gT02_re * b2_im;
1458  B0_im += gT02_im * b2_re;
1459 
1460  // multiply row 1
1461  spinorFloat A1_re = 0;
1462  A1_re += gT10_re * a0_re;
1463  A1_re -= gT10_im * a0_im;
1464  A1_re += gT11_re * a1_re;
1465  A1_re -= gT11_im * a1_im;
1466  A1_re += gT12_re * a2_re;
1467  A1_re -= gT12_im * a2_im;
1468  spinorFloat A1_im = 0;
1469  A1_im += gT10_re * a0_im;
1470  A1_im += gT10_im * a0_re;
1471  A1_im += gT11_re * a1_im;
1472  A1_im += gT11_im * a1_re;
1473  A1_im += gT12_re * a2_im;
1474  A1_im += gT12_im * a2_re;
1475  spinorFloat B1_re = 0;
1476  B1_re += gT10_re * b0_re;
1477  B1_re -= gT10_im * b0_im;
1478  B1_re += gT11_re * b1_re;
1479  B1_re -= gT11_im * b1_im;
1480  B1_re += gT12_re * b2_re;
1481  B1_re -= gT12_im * b2_im;
1482  spinorFloat B1_im = 0;
1483  B1_im += gT10_re * b0_im;
1484  B1_im += gT10_im * b0_re;
1485  B1_im += gT11_re * b1_im;
1486  B1_im += gT11_im * b1_re;
1487  B1_im += gT12_re * b2_im;
1488  B1_im += gT12_im * b2_re;
1489 
1490  // multiply row 2
1491  spinorFloat A2_re = 0;
1492  A2_re += gT20_re * a0_re;
1493  A2_re -= gT20_im * a0_im;
1494  A2_re += gT21_re * a1_re;
1495  A2_re -= gT21_im * a1_im;
1496  A2_re += gT22_re * a2_re;
1497  A2_re -= gT22_im * a2_im;
1498  spinorFloat A2_im = 0;
1499  A2_im += gT20_re * a0_im;
1500  A2_im += gT20_im * a0_re;
1501  A2_im += gT21_re * a1_im;
1502  A2_im += gT21_im * a1_re;
1503  A2_im += gT22_re * a2_im;
1504  A2_im += gT22_im * a2_re;
1505  spinorFloat B2_re = 0;
1506  B2_re += gT20_re * b0_re;
1507  B2_re -= gT20_im * b0_im;
1508  B2_re += gT21_re * b1_re;
1509  B2_re -= gT21_im * b1_im;
1510  B2_re += gT22_re * b2_re;
1511  B2_re -= gT22_im * b2_im;
1512  spinorFloat B2_im = 0;
1513  B2_im += gT20_re * b0_im;
1514  B2_im += gT20_im * b0_re;
1515  B2_im += gT21_re * b1_im;
1516  B2_im += gT21_im * b1_re;
1517  B2_im += gT22_re * b2_im;
1518  B2_im += gT22_im * b2_re;
1519 
1520  o1_00_re += A0_re;
1521  o1_00_im += A0_im;
1522  o1_10_re += B0_re;
1523  o1_10_im += B0_im;
1524  o1_20_re -= B0_re;
1525  o1_20_im -= B0_im;
1526  o1_30_re += A0_re;
1527  o1_30_im += A0_im;
1528 
1529  o1_01_re += A1_re;
1530  o1_01_im += A1_im;
1531  o1_11_re += B1_re;
1532  o1_11_im += B1_im;
1533  o1_21_re -= B1_re;
1534  o1_21_im -= B1_im;
1535  o1_31_re += A1_re;
1536  o1_31_im += A1_im;
1537 
1538  o1_02_re += A2_re;
1539  o1_02_im += A2_im;
1540  o1_12_re += B2_re;
1541  o1_12_im += B2_im;
1542  o1_22_re -= B2_re;
1543  o1_22_im -= B2_im;
1544  o1_32_re += A2_re;
1545  o1_32_im += A2_im;
1546 
1547  }
1548  {
1549 #ifdef MULTI_GPU
1550  if (kernel_type == INTERIOR_KERNEL) {
1551 #endif
1552 
1553  // read flavor 2 from device memory
1554  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1555 
1556  // project spinor into half spinors
1557  a0_re = +i00_re+i30_re;
1558  a0_im = +i00_im+i30_im;
1559  a1_re = +i01_re+i31_re;
1560  a1_im = +i01_im+i31_im;
1561  a2_re = +i02_re+i32_re;
1562  a2_im = +i02_im+i32_im;
1563  b0_re = +i10_re-i20_re;
1564  b0_im = +i10_im-i20_im;
1565  b1_re = +i11_re-i21_re;
1566  b1_im = +i11_im-i21_im;
1567  b2_re = +i12_re-i22_re;
1568  b2_im = +i12_im-i22_im;
1569 
1570 #ifdef MULTI_GPU
1571  } else {
1572 
1573  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1574 
1575  // read half spinor for the second flavor from device memory
1576  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1577  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1578 
1579  a0_re = i00_re; a0_im = i00_im;
1580  a1_re = i01_re; a1_im = i01_im;
1581  a2_re = i02_re; a2_im = i02_im;
1582  b0_re = i10_re; b0_im = i10_im;
1583  b1_re = i11_re; b1_im = i11_im;
1584  b2_re = i12_re; b2_im = i12_im;
1585 
1586  }
1587 #endif // MULTI_GPU
1588 
1589  // multiply row 0
1590  spinorFloat A0_re = 0;
1591  A0_re += gT00_re * a0_re;
1592  A0_re -= gT00_im * a0_im;
1593  A0_re += gT01_re * a1_re;
1594  A0_re -= gT01_im * a1_im;
1595  A0_re += gT02_re * a2_re;
1596  A0_re -= gT02_im * a2_im;
1597  spinorFloat A0_im = 0;
1598  A0_im += gT00_re * a0_im;
1599  A0_im += gT00_im * a0_re;
1600  A0_im += gT01_re * a1_im;
1601  A0_im += gT01_im * a1_re;
1602  A0_im += gT02_re * a2_im;
1603  A0_im += gT02_im * a2_re;
1604  spinorFloat B0_re = 0;
1605  B0_re += gT00_re * b0_re;
1606  B0_re -= gT00_im * b0_im;
1607  B0_re += gT01_re * b1_re;
1608  B0_re -= gT01_im * b1_im;
1609  B0_re += gT02_re * b2_re;
1610  B0_re -= gT02_im * b2_im;
1611  spinorFloat B0_im = 0;
1612  B0_im += gT00_re * b0_im;
1613  B0_im += gT00_im * b0_re;
1614  B0_im += gT01_re * b1_im;
1615  B0_im += gT01_im * b1_re;
1616  B0_im += gT02_re * b2_im;
1617  B0_im += gT02_im * b2_re;
1618 
1619  // multiply row 1
1620  spinorFloat A1_re = 0;
1621  A1_re += gT10_re * a0_re;
1622  A1_re -= gT10_im * a0_im;
1623  A1_re += gT11_re * a1_re;
1624  A1_re -= gT11_im * a1_im;
1625  A1_re += gT12_re * a2_re;
1626  A1_re -= gT12_im * a2_im;
1627  spinorFloat A1_im = 0;
1628  A1_im += gT10_re * a0_im;
1629  A1_im += gT10_im * a0_re;
1630  A1_im += gT11_re * a1_im;
1631  A1_im += gT11_im * a1_re;
1632  A1_im += gT12_re * a2_im;
1633  A1_im += gT12_im * a2_re;
1634  spinorFloat B1_re = 0;
1635  B1_re += gT10_re * b0_re;
1636  B1_re -= gT10_im * b0_im;
1637  B1_re += gT11_re * b1_re;
1638  B1_re -= gT11_im * b1_im;
1639  B1_re += gT12_re * b2_re;
1640  B1_re -= gT12_im * b2_im;
1641  spinorFloat B1_im = 0;
1642  B1_im += gT10_re * b0_im;
1643  B1_im += gT10_im * b0_re;
1644  B1_im += gT11_re * b1_im;
1645  B1_im += gT11_im * b1_re;
1646  B1_im += gT12_re * b2_im;
1647  B1_im += gT12_im * b2_re;
1648 
1649  // multiply row 2
1650  spinorFloat A2_re = 0;
1651  A2_re += gT20_re * a0_re;
1652  A2_re -= gT20_im * a0_im;
1653  A2_re += gT21_re * a1_re;
1654  A2_re -= gT21_im * a1_im;
1655  A2_re += gT22_re * a2_re;
1656  A2_re -= gT22_im * a2_im;
1657  spinorFloat A2_im = 0;
1658  A2_im += gT20_re * a0_im;
1659  A2_im += gT20_im * a0_re;
1660  A2_im += gT21_re * a1_im;
1661  A2_im += gT21_im * a1_re;
1662  A2_im += gT22_re * a2_im;
1663  A2_im += gT22_im * a2_re;
1664  spinorFloat B2_re = 0;
1665  B2_re += gT20_re * b0_re;
1666  B2_re -= gT20_im * b0_im;
1667  B2_re += gT21_re * b1_re;
1668  B2_re -= gT21_im * b1_im;
1669  B2_re += gT22_re * b2_re;
1670  B2_re -= gT22_im * b2_im;
1671  spinorFloat B2_im = 0;
1672  B2_im += gT20_re * b0_im;
1673  B2_im += gT20_im * b0_re;
1674  B2_im += gT21_re * b1_im;
1675  B2_im += gT21_im * b1_re;
1676  B2_im += gT22_re * b2_im;
1677  B2_im += gT22_im * b2_re;
1678 
1679  o2_00_re += A0_re;
1680  o2_00_im += A0_im;
1681  o2_10_re += B0_re;
1682  o2_10_im += B0_im;
1683  o2_20_re -= B0_re;
1684  o2_20_im -= B0_im;
1685  o2_30_re += A0_re;
1686  o2_30_im += A0_im;
1687 
1688  o2_01_re += A1_re;
1689  o2_01_im += A1_im;
1690  o2_11_re += B1_re;
1691  o2_11_im += B1_im;
1692  o2_21_re -= B1_re;
1693  o2_21_im -= B1_im;
1694  o2_31_re += A1_re;
1695  o2_31_im += A1_im;
1696 
1697  o2_02_re += A2_re;
1698  o2_02_im += A2_im;
1699  o2_12_re += B2_re;
1700  o2_12_im += B2_im;
1701  o2_22_re -= B2_re;
1702  o2_22_im -= B2_im;
1703  o2_32_re += A2_re;
1704  o2_32_im += A2_im;
1705 
1706  }
1707 }
1708 
1709 #ifdef MULTI_GPU
1710 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1712 #endif
1713 {
1714  // Projector P2-
1715  // 1 0 -i 0
1716  // 0 1 0 i
1717  // i 0 1 0
1718  // 0 -i 0 1
1719 
1720 #ifdef MULTI_GPU
1721  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1722  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1723 #else
1724  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1725 #endif
1726 
1727  const int ga_idx = sid;
1728 
1735 
1736  // read gauge matrix from device memory
1737  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1738 
1739  // reconstruct gauge matrix
1741 
1742  {
1743 #ifdef MULTI_GPU
1744  if (kernel_type == INTERIOR_KERNEL) {
1745 #endif
1746 
1747  // read flavor 1 from device memory
1748  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1749 
1750  // project spinor into half spinors
1751  a0_re = +i00_re+i20_im;
1752  a0_im = +i00_im-i20_re;
1753  a1_re = +i01_re+i21_im;
1754  a1_im = +i01_im-i21_re;
1755  a2_re = +i02_re+i22_im;
1756  a2_im = +i02_im-i22_re;
1757  b0_re = +i10_re-i30_im;
1758  b0_im = +i10_im+i30_re;
1759  b1_re = +i11_re-i31_im;
1760  b1_im = +i11_im+i31_re;
1761  b2_re = +i12_re-i32_im;
1762  b2_im = +i12_im+i32_re;
1763 
1764 #ifdef MULTI_GPU
1765  } else {
1766 
1767  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1768 
1769  // read half spinor for the first flavor from device memory
1770  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1771 
1772  a0_re = i00_re; a0_im = i00_im;
1773  a1_re = i01_re; a1_im = i01_im;
1774  a2_re = i02_re; a2_im = i02_im;
1775  b0_re = i10_re; b0_im = i10_im;
1776  b1_re = i11_re; b1_im = i11_im;
1777  b2_re = i12_re; b2_im = i12_im;
1778 
1779  }
1780 #endif // MULTI_GPU
1781 
1782  // multiply row 0
1783  spinorFloat A0_re = 0;
1784  A0_re += g00_re * a0_re;
1785  A0_re -= g00_im * a0_im;
1786  A0_re += g01_re * a1_re;
1787  A0_re -= g01_im * a1_im;
1788  A0_re += g02_re * a2_re;
1789  A0_re -= g02_im * a2_im;
1790  spinorFloat A0_im = 0;
1791  A0_im += g00_re * a0_im;
1792  A0_im += g00_im * a0_re;
1793  A0_im += g01_re * a1_im;
1794  A0_im += g01_im * a1_re;
1795  A0_im += g02_re * a2_im;
1796  A0_im += g02_im * a2_re;
1797  spinorFloat B0_re = 0;
1798  B0_re += g00_re * b0_re;
1799  B0_re -= g00_im * b0_im;
1800  B0_re += g01_re * b1_re;
1801  B0_re -= g01_im * b1_im;
1802  B0_re += g02_re * b2_re;
1803  B0_re -= g02_im * b2_im;
1804  spinorFloat B0_im = 0;
1805  B0_im += g00_re * b0_im;
1806  B0_im += g00_im * b0_re;
1807  B0_im += g01_re * b1_im;
1808  B0_im += g01_im * b1_re;
1809  B0_im += g02_re * b2_im;
1810  B0_im += g02_im * b2_re;
1811 
1812  // multiply row 1
1813  spinorFloat A1_re = 0;
1814  A1_re += g10_re * a0_re;
1815  A1_re -= g10_im * a0_im;
1816  A1_re += g11_re * a1_re;
1817  A1_re -= g11_im * a1_im;
1818  A1_re += g12_re * a2_re;
1819  A1_re -= g12_im * a2_im;
1820  spinorFloat A1_im = 0;
1821  A1_im += g10_re * a0_im;
1822  A1_im += g10_im * a0_re;
1823  A1_im += g11_re * a1_im;
1824  A1_im += g11_im * a1_re;
1825  A1_im += g12_re * a2_im;
1826  A1_im += g12_im * a2_re;
1827  spinorFloat B1_re = 0;
1828  B1_re += g10_re * b0_re;
1829  B1_re -= g10_im * b0_im;
1830  B1_re += g11_re * b1_re;
1831  B1_re -= g11_im * b1_im;
1832  B1_re += g12_re * b2_re;
1833  B1_re -= g12_im * b2_im;
1834  spinorFloat B1_im = 0;
1835  B1_im += g10_re * b0_im;
1836  B1_im += g10_im * b0_re;
1837  B1_im += g11_re * b1_im;
1838  B1_im += g11_im * b1_re;
1839  B1_im += g12_re * b2_im;
1840  B1_im += g12_im * b2_re;
1841 
1842  // multiply row 2
1843  spinorFloat A2_re = 0;
1844  A2_re += g20_re * a0_re;
1845  A2_re -= g20_im * a0_im;
1846  A2_re += g21_re * a1_re;
1847  A2_re -= g21_im * a1_im;
1848  A2_re += g22_re * a2_re;
1849  A2_re -= g22_im * a2_im;
1850  spinorFloat A2_im = 0;
1851  A2_im += g20_re * a0_im;
1852  A2_im += g20_im * a0_re;
1853  A2_im += g21_re * a1_im;
1854  A2_im += g21_im * a1_re;
1855  A2_im += g22_re * a2_im;
1856  A2_im += g22_im * a2_re;
1857  spinorFloat B2_re = 0;
1858  B2_re += g20_re * b0_re;
1859  B2_re -= g20_im * b0_im;
1860  B2_re += g21_re * b1_re;
1861  B2_re -= g21_im * b1_im;
1862  B2_re += g22_re * b2_re;
1863  B2_re -= g22_im * b2_im;
1864  spinorFloat B2_im = 0;
1865  B2_im += g20_re * b0_im;
1866  B2_im += g20_im * b0_re;
1867  B2_im += g21_re * b1_im;
1868  B2_im += g21_im * b1_re;
1869  B2_im += g22_re * b2_im;
1870  B2_im += g22_im * b2_re;
1871 
1872  o1_00_re += A0_re;
1873  o1_00_im += A0_im;
1874  o1_10_re += B0_re;
1875  o1_10_im += B0_im;
1876  o1_20_re -= A0_im;
1877  o1_20_im += A0_re;
1878  o1_30_re += B0_im;
1879  o1_30_im -= B0_re;
1880 
1881  o1_01_re += A1_re;
1882  o1_01_im += A1_im;
1883  o1_11_re += B1_re;
1884  o1_11_im += B1_im;
1885  o1_21_re -= A1_im;
1886  o1_21_im += A1_re;
1887  o1_31_re += B1_im;
1888  o1_31_im -= B1_re;
1889 
1890  o1_02_re += A2_re;
1891  o1_02_im += A2_im;
1892  o1_12_re += B2_re;
1893  o1_12_im += B2_im;
1894  o1_22_re -= A2_im;
1895  o1_22_im += A2_re;
1896  o1_32_re += B2_im;
1897  o1_32_im -= B2_re;
1898 
1899  }
1900  {
1901 #ifdef MULTI_GPU
1902  if (kernel_type == INTERIOR_KERNEL) {
1903 #endif
1904 
1905  // read flavor 2 from device memory
1906  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
1907 
1908  // project spinor into half spinors
1909  a0_re = +i00_re+i20_im;
1910  a0_im = +i00_im-i20_re;
1911  a1_re = +i01_re+i21_im;
1912  a1_im = +i01_im-i21_re;
1913  a2_re = +i02_re+i22_im;
1914  a2_im = +i02_im-i22_re;
1915  b0_re = +i10_re-i30_im;
1916  b0_im = +i10_im+i30_re;
1917  b1_re = +i11_re-i31_im;
1918  b1_im = +i11_im+i31_re;
1919  b2_re = +i12_re-i32_im;
1920  b2_im = +i12_im+i32_re;
1921 
1922 #ifdef MULTI_GPU
1923  } else {
1924 
1925  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
1926 
1927  // read half spinor for the second flavor from device memory
1928  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
1929  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
1930 
1931  a0_re = i00_re; a0_im = i00_im;
1932  a1_re = i01_re; a1_im = i01_im;
1933  a2_re = i02_re; a2_im = i02_im;
1934  b0_re = i10_re; b0_im = i10_im;
1935  b1_re = i11_re; b1_im = i11_im;
1936  b2_re = i12_re; b2_im = i12_im;
1937 
1938  }
1939 #endif // MULTI_GPU
1940 
1941  // multiply row 0
1942  spinorFloat A0_re = 0;
1943  A0_re += g00_re * a0_re;
1944  A0_re -= g00_im * a0_im;
1945  A0_re += g01_re * a1_re;
1946  A0_re -= g01_im * a1_im;
1947  A0_re += g02_re * a2_re;
1948  A0_re -= g02_im * a2_im;
1949  spinorFloat A0_im = 0;
1950  A0_im += g00_re * a0_im;
1951  A0_im += g00_im * a0_re;
1952  A0_im += g01_re * a1_im;
1953  A0_im += g01_im * a1_re;
1954  A0_im += g02_re * a2_im;
1955  A0_im += g02_im * a2_re;
1956  spinorFloat B0_re = 0;
1957  B0_re += g00_re * b0_re;
1958  B0_re -= g00_im * b0_im;
1959  B0_re += g01_re * b1_re;
1960  B0_re -= g01_im * b1_im;
1961  B0_re += g02_re * b2_re;
1962  B0_re -= g02_im * b2_im;
1963  spinorFloat B0_im = 0;
1964  B0_im += g00_re * b0_im;
1965  B0_im += g00_im * b0_re;
1966  B0_im += g01_re * b1_im;
1967  B0_im += g01_im * b1_re;
1968  B0_im += g02_re * b2_im;
1969  B0_im += g02_im * b2_re;
1970 
1971  // multiply row 1
1972  spinorFloat A1_re = 0;
1973  A1_re += g10_re * a0_re;
1974  A1_re -= g10_im * a0_im;
1975  A1_re += g11_re * a1_re;
1976  A1_re -= g11_im * a1_im;
1977  A1_re += g12_re * a2_re;
1978  A1_re -= g12_im * a2_im;
1979  spinorFloat A1_im = 0;
1980  A1_im += g10_re * a0_im;
1981  A1_im += g10_im * a0_re;
1982  A1_im += g11_re * a1_im;
1983  A1_im += g11_im * a1_re;
1984  A1_im += g12_re * a2_im;
1985  A1_im += g12_im * a2_re;
1986  spinorFloat B1_re = 0;
1987  B1_re += g10_re * b0_re;
1988  B1_re -= g10_im * b0_im;
1989  B1_re += g11_re * b1_re;
1990  B1_re -= g11_im * b1_im;
1991  B1_re += g12_re * b2_re;
1992  B1_re -= g12_im * b2_im;
1993  spinorFloat B1_im = 0;
1994  B1_im += g10_re * b0_im;
1995  B1_im += g10_im * b0_re;
1996  B1_im += g11_re * b1_im;
1997  B1_im += g11_im * b1_re;
1998  B1_im += g12_re * b2_im;
1999  B1_im += g12_im * b2_re;
2000 
2001  // multiply row 2
2002  spinorFloat A2_re = 0;
2003  A2_re += g20_re * a0_re;
2004  A2_re -= g20_im * a0_im;
2005  A2_re += g21_re * a1_re;
2006  A2_re -= g21_im * a1_im;
2007  A2_re += g22_re * a2_re;
2008  A2_re -= g22_im * a2_im;
2009  spinorFloat A2_im = 0;
2010  A2_im += g20_re * a0_im;
2011  A2_im += g20_im * a0_re;
2012  A2_im += g21_re * a1_im;
2013  A2_im += g21_im * a1_re;
2014  A2_im += g22_re * a2_im;
2015  A2_im += g22_im * a2_re;
2016  spinorFloat B2_re = 0;
2017  B2_re += g20_re * b0_re;
2018  B2_re -= g20_im * b0_im;
2019  B2_re += g21_re * b1_re;
2020  B2_re -= g21_im * b1_im;
2021  B2_re += g22_re * b2_re;
2022  B2_re -= g22_im * b2_im;
2023  spinorFloat B2_im = 0;
2024  B2_im += g20_re * b0_im;
2025  B2_im += g20_im * b0_re;
2026  B2_im += g21_re * b1_im;
2027  B2_im += g21_im * b1_re;
2028  B2_im += g22_re * b2_im;
2029  B2_im += g22_im * b2_re;
2030 
2031  o2_00_re += A0_re;
2032  o2_00_im += A0_im;
2033  o2_10_re += B0_re;
2034  o2_10_im += B0_im;
2035  o2_20_re -= A0_im;
2036  o2_20_im += A0_re;
2037  o2_30_re += B0_im;
2038  o2_30_im -= B0_re;
2039 
2040  o2_01_re += A1_re;
2041  o2_01_im += A1_im;
2042  o2_11_re += B1_re;
2043  o2_11_im += B1_im;
2044  o2_21_re -= A1_im;
2045  o2_21_im += A1_re;
2046  o2_31_re += B1_im;
2047  o2_31_im -= B1_re;
2048 
2049  o2_02_re += A2_re;
2050  o2_02_im += A2_im;
2051  o2_12_re += B2_re;
2052  o2_12_im += B2_im;
2053  o2_22_re -= A2_im;
2054  o2_22_im += A2_re;
2055  o2_32_re += B2_im;
2056  o2_32_im -= B2_re;
2057 
2058  }
2059 }
2060 
2061 #ifdef MULTI_GPU
2062 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
2063  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
2064 #endif
2065 {
2066  // Projector P2+
2067  // 1 0 i 0
2068  // 0 1 0 -i
2069  // -i 0 1 0
2070  // 0 i 0 1
2071 
2072 #ifdef MULTI_GPU
2073  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
2074  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2075 #else
2076  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
2077 #endif
2078 
2079 #ifdef MULTI_GPU
2080  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2081 #else
2082  const int ga_idx = sp_idx;
2083 #endif
2084 
2091 
2092  // read gauge matrix from device memory
2093  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
2094 
2095  // reconstruct gauge matrix
2097 
2098  {
2099 #ifdef MULTI_GPU
2100  if (kernel_type == INTERIOR_KERNEL) {
2101 #endif
2102 
2103  // read flavor 1 from device memory
2104  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2105 
2106  // project spinor into half spinors
2107  a0_re = +i00_re-i20_im;
2108  a0_im = +i00_im+i20_re;
2109  a1_re = +i01_re-i21_im;
2110  a1_im = +i01_im+i21_re;
2111  a2_re = +i02_re-i22_im;
2112  a2_im = +i02_im+i22_re;
2113  b0_re = +i10_re+i30_im;
2114  b0_im = +i10_im-i30_re;
2115  b1_re = +i11_re+i31_im;
2116  b1_im = +i11_im-i31_re;
2117  b2_re = +i12_re+i32_im;
2118  b2_im = +i12_im-i32_re;
2119 
2120 #ifdef MULTI_GPU
2121  } else {
2122 
2123  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2124 
2125  // read half spinor for the first flavor from device memory
2126  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2127 
2128  a0_re = i00_re; a0_im = i00_im;
2129  a1_re = i01_re; a1_im = i01_im;
2130  a2_re = i02_re; a2_im = i02_im;
2131  b0_re = i10_re; b0_im = i10_im;
2132  b1_re = i11_re; b1_im = i11_im;
2133  b2_re = i12_re; b2_im = i12_im;
2134 
2135  }
2136 #endif // MULTI_GPU
2137 
2138  // multiply row 0
2139  spinorFloat A0_re = 0;
2140  A0_re += gT00_re * a0_re;
2141  A0_re -= gT00_im * a0_im;
2142  A0_re += gT01_re * a1_re;
2143  A0_re -= gT01_im * a1_im;
2144  A0_re += gT02_re * a2_re;
2145  A0_re -= gT02_im * a2_im;
2146  spinorFloat A0_im = 0;
2147  A0_im += gT00_re * a0_im;
2148  A0_im += gT00_im * a0_re;
2149  A0_im += gT01_re * a1_im;
2150  A0_im += gT01_im * a1_re;
2151  A0_im += gT02_re * a2_im;
2152  A0_im += gT02_im * a2_re;
2153  spinorFloat B0_re = 0;
2154  B0_re += gT00_re * b0_re;
2155  B0_re -= gT00_im * b0_im;
2156  B0_re += gT01_re * b1_re;
2157  B0_re -= gT01_im * b1_im;
2158  B0_re += gT02_re * b2_re;
2159  B0_re -= gT02_im * b2_im;
2160  spinorFloat B0_im = 0;
2161  B0_im += gT00_re * b0_im;
2162  B0_im += gT00_im * b0_re;
2163  B0_im += gT01_re * b1_im;
2164  B0_im += gT01_im * b1_re;
2165  B0_im += gT02_re * b2_im;
2166  B0_im += gT02_im * b2_re;
2167 
2168  // multiply row 1
2169  spinorFloat A1_re = 0;
2170  A1_re += gT10_re * a0_re;
2171  A1_re -= gT10_im * a0_im;
2172  A1_re += gT11_re * a1_re;
2173  A1_re -= gT11_im * a1_im;
2174  A1_re += gT12_re * a2_re;
2175  A1_re -= gT12_im * a2_im;
2176  spinorFloat A1_im = 0;
2177  A1_im += gT10_re * a0_im;
2178  A1_im += gT10_im * a0_re;
2179  A1_im += gT11_re * a1_im;
2180  A1_im += gT11_im * a1_re;
2181  A1_im += gT12_re * a2_im;
2182  A1_im += gT12_im * a2_re;
2183  spinorFloat B1_re = 0;
2184  B1_re += gT10_re * b0_re;
2185  B1_re -= gT10_im * b0_im;
2186  B1_re += gT11_re * b1_re;
2187  B1_re -= gT11_im * b1_im;
2188  B1_re += gT12_re * b2_re;
2189  B1_re -= gT12_im * b2_im;
2190  spinorFloat B1_im = 0;
2191  B1_im += gT10_re * b0_im;
2192  B1_im += gT10_im * b0_re;
2193  B1_im += gT11_re * b1_im;
2194  B1_im += gT11_im * b1_re;
2195  B1_im += gT12_re * b2_im;
2196  B1_im += gT12_im * b2_re;
2197 
2198  // multiply row 2
2199  spinorFloat A2_re = 0;
2200  A2_re += gT20_re * a0_re;
2201  A2_re -= gT20_im * a0_im;
2202  A2_re += gT21_re * a1_re;
2203  A2_re -= gT21_im * a1_im;
2204  A2_re += gT22_re * a2_re;
2205  A2_re -= gT22_im * a2_im;
2206  spinorFloat A2_im = 0;
2207  A2_im += gT20_re * a0_im;
2208  A2_im += gT20_im * a0_re;
2209  A2_im += gT21_re * a1_im;
2210  A2_im += gT21_im * a1_re;
2211  A2_im += gT22_re * a2_im;
2212  A2_im += gT22_im * a2_re;
2213  spinorFloat B2_re = 0;
2214  B2_re += gT20_re * b0_re;
2215  B2_re -= gT20_im * b0_im;
2216  B2_re += gT21_re * b1_re;
2217  B2_re -= gT21_im * b1_im;
2218  B2_re += gT22_re * b2_re;
2219  B2_re -= gT22_im * b2_im;
2220  spinorFloat B2_im = 0;
2221  B2_im += gT20_re * b0_im;
2222  B2_im += gT20_im * b0_re;
2223  B2_im += gT21_re * b1_im;
2224  B2_im += gT21_im * b1_re;
2225  B2_im += gT22_re * b2_im;
2226  B2_im += gT22_im * b2_re;
2227 
2228  o1_00_re += A0_re;
2229  o1_00_im += A0_im;
2230  o1_10_re += B0_re;
2231  o1_10_im += B0_im;
2232  o1_20_re += A0_im;
2233  o1_20_im -= A0_re;
2234  o1_30_re -= B0_im;
2235  o1_30_im += B0_re;
2236 
2237  o1_01_re += A1_re;
2238  o1_01_im += A1_im;
2239  o1_11_re += B1_re;
2240  o1_11_im += B1_im;
2241  o1_21_re += A1_im;
2242  o1_21_im -= A1_re;
2243  o1_31_re -= B1_im;
2244  o1_31_im += B1_re;
2245 
2246  o1_02_re += A2_re;
2247  o1_02_im += A2_im;
2248  o1_12_re += B2_re;
2249  o1_12_im += B2_im;
2250  o1_22_re += A2_im;
2251  o1_22_im -= A2_re;
2252  o1_32_re -= B2_im;
2253  o1_32_im += B2_re;
2254 
2255  }
2256  {
2257 #ifdef MULTI_GPU
2258  if (kernel_type == INTERIOR_KERNEL) {
2259 #endif
2260 
2261  // read flavor 2 from device memory
2262  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2263 
2264  // project spinor into half spinors
2265  a0_re = +i00_re-i20_im;
2266  a0_im = +i00_im+i20_re;
2267  a1_re = +i01_re-i21_im;
2268  a1_im = +i01_im+i21_re;
2269  a2_re = +i02_re-i22_im;
2270  a2_im = +i02_im+i22_re;
2271  b0_re = +i10_re+i30_im;
2272  b0_im = +i10_im-i30_re;
2273  b1_re = +i11_re+i31_im;
2274  b1_im = +i11_im-i31_re;
2275  b2_re = +i12_re+i32_im;
2276  b2_im = +i12_im-i32_re;
2277 
2278 #ifdef MULTI_GPU
2279  } else {
2280 
2281  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2282 
2283  // read half spinor for the second flavor from device memory
2284  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2285  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2286 
2287  a0_re = i00_re; a0_im = i00_im;
2288  a1_re = i01_re; a1_im = i01_im;
2289  a2_re = i02_re; a2_im = i02_im;
2290  b0_re = i10_re; b0_im = i10_im;
2291  b1_re = i11_re; b1_im = i11_im;
2292  b2_re = i12_re; b2_im = i12_im;
2293 
2294  }
2295 #endif // MULTI_GPU
2296 
2297  // multiply row 0
2298  spinorFloat A0_re = 0;
2299  A0_re += gT00_re * a0_re;
2300  A0_re -= gT00_im * a0_im;
2301  A0_re += gT01_re * a1_re;
2302  A0_re -= gT01_im * a1_im;
2303  A0_re += gT02_re * a2_re;
2304  A0_re -= gT02_im * a2_im;
2305  spinorFloat A0_im = 0;
2306  A0_im += gT00_re * a0_im;
2307  A0_im += gT00_im * a0_re;
2308  A0_im += gT01_re * a1_im;
2309  A0_im += gT01_im * a1_re;
2310  A0_im += gT02_re * a2_im;
2311  A0_im += gT02_im * a2_re;
2312  spinorFloat B0_re = 0;
2313  B0_re += gT00_re * b0_re;
2314  B0_re -= gT00_im * b0_im;
2315  B0_re += gT01_re * b1_re;
2316  B0_re -= gT01_im * b1_im;
2317  B0_re += gT02_re * b2_re;
2318  B0_re -= gT02_im * b2_im;
2319  spinorFloat B0_im = 0;
2320  B0_im += gT00_re * b0_im;
2321  B0_im += gT00_im * b0_re;
2322  B0_im += gT01_re * b1_im;
2323  B0_im += gT01_im * b1_re;
2324  B0_im += gT02_re * b2_im;
2325  B0_im += gT02_im * b2_re;
2326 
2327  // multiply row 1
2328  spinorFloat A1_re = 0;
2329  A1_re += gT10_re * a0_re;
2330  A1_re -= gT10_im * a0_im;
2331  A1_re += gT11_re * a1_re;
2332  A1_re -= gT11_im * a1_im;
2333  A1_re += gT12_re * a2_re;
2334  A1_re -= gT12_im * a2_im;
2335  spinorFloat A1_im = 0;
2336  A1_im += gT10_re * a0_im;
2337  A1_im += gT10_im * a0_re;
2338  A1_im += gT11_re * a1_im;
2339  A1_im += gT11_im * a1_re;
2340  A1_im += gT12_re * a2_im;
2341  A1_im += gT12_im * a2_re;
2342  spinorFloat B1_re = 0;
2343  B1_re += gT10_re * b0_re;
2344  B1_re -= gT10_im * b0_im;
2345  B1_re += gT11_re * b1_re;
2346  B1_re -= gT11_im * b1_im;
2347  B1_re += gT12_re * b2_re;
2348  B1_re -= gT12_im * b2_im;
2349  spinorFloat B1_im = 0;
2350  B1_im += gT10_re * b0_im;
2351  B1_im += gT10_im * b0_re;
2352  B1_im += gT11_re * b1_im;
2353  B1_im += gT11_im * b1_re;
2354  B1_im += gT12_re * b2_im;
2355  B1_im += gT12_im * b2_re;
2356 
2357  // multiply row 2
2358  spinorFloat A2_re = 0;
2359  A2_re += gT20_re * a0_re;
2360  A2_re -= gT20_im * a0_im;
2361  A2_re += gT21_re * a1_re;
2362  A2_re -= gT21_im * a1_im;
2363  A2_re += gT22_re * a2_re;
2364  A2_re -= gT22_im * a2_im;
2365  spinorFloat A2_im = 0;
2366  A2_im += gT20_re * a0_im;
2367  A2_im += gT20_im * a0_re;
2368  A2_im += gT21_re * a1_im;
2369  A2_im += gT21_im * a1_re;
2370  A2_im += gT22_re * a2_im;
2371  A2_im += gT22_im * a2_re;
2372  spinorFloat B2_re = 0;
2373  B2_re += gT20_re * b0_re;
2374  B2_re -= gT20_im * b0_im;
2375  B2_re += gT21_re * b1_re;
2376  B2_re -= gT21_im * b1_im;
2377  B2_re += gT22_re * b2_re;
2378  B2_re -= gT22_im * b2_im;
2379  spinorFloat B2_im = 0;
2380  B2_im += gT20_re * b0_im;
2381  B2_im += gT20_im * b0_re;
2382  B2_im += gT21_re * b1_im;
2383  B2_im += gT21_im * b1_re;
2384  B2_im += gT22_re * b2_im;
2385  B2_im += gT22_im * b2_re;
2386 
2387  o2_00_re += A0_re;
2388  o2_00_im += A0_im;
2389  o2_10_re += B0_re;
2390  o2_10_im += B0_im;
2391  o2_20_re += A0_im;
2392  o2_20_im -= A0_re;
2393  o2_30_re -= B0_im;
2394  o2_30_im += B0_re;
2395 
2396  o2_01_re += A1_re;
2397  o2_01_im += A1_im;
2398  o2_11_re += B1_re;
2399  o2_11_im += B1_im;
2400  o2_21_re += A1_im;
2401  o2_21_im -= A1_re;
2402  o2_31_re -= B1_im;
2403  o2_31_im += B1_re;
2404 
2405  o2_02_re += A2_re;
2406  o2_02_im += A2_im;
2407  o2_12_re += B2_re;
2408  o2_12_im += B2_im;
2409  o2_22_re += A2_im;
2410  o2_22_im -= A2_re;
2411  o2_32_re -= B2_im;
2412  o2_32_im += B2_re;
2413 
2414  }
2415 }
2416 
2417 #ifdef MULTI_GPU
2418 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
2420 #endif
2421 {
2422  // Projector P3-
2423  // 0 0 0 0
2424  // 0 0 0 0
2425  // 0 0 2 0
2426  // 0 0 0 2
2427 
2428 #ifdef MULTI_GPU
2429  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
2430  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2431 #else
2432  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
2433 #endif
2434 
2435  const int ga_idx = sid;
2436 
2443 
2445  {
2446  {
2447 #ifdef MULTI_GPU
2448  if (kernel_type == INTERIOR_KERNEL) {
2449 #endif
2450 
2451  // read flavor 1 from device memory
2452  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2453 
2454  // project spinor into half spinors
2455  a0_re = +2*i20_re;
2456  a0_im = +2*i20_im;
2457  a1_re = +2*i21_re;
2458  a1_im = +2*i21_im;
2459  a2_re = +2*i22_re;
2460  a2_im = +2*i22_im;
2461  b0_re = +2*i30_re;
2462  b0_im = +2*i30_im;
2463  b1_re = +2*i31_re;
2464  b1_im = +2*i31_im;
2465  b2_re = +2*i32_re;
2466  b2_im = +2*i32_im;
2467 
2468 #ifdef MULTI_GPU
2469  } else {
2470 
2471  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2472 
2473  // read half spinor for the first flavor from device memory
2474  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2475 
2476  a0_re = 2*i00_re; a0_im = 2*i00_im;
2477  a1_re = 2*i01_re; a1_im = 2*i01_im;
2478  a2_re = 2*i02_re; a2_im = 2*i02_im;
2479  b0_re = 2*i10_re; b0_im = 2*i10_im;
2480  b1_re = 2*i11_re; b1_im = 2*i11_im;
2481  b2_re = 2*i12_re; b2_im = 2*i12_im;
2482 
2483  }
2484 #endif // MULTI_GPU
2485 
2486  // identity gauge matrix
2493 
2494  o1_20_re += A0_re;
2495  o1_20_im += A0_im;
2496  o1_30_re += B0_re;
2497  o1_30_im += B0_im;
2498 
2499  o1_21_re += A1_re;
2500  o1_21_im += A1_im;
2501  o1_31_re += B1_re;
2502  o1_31_im += B1_im;
2503 
2504  o1_22_re += A2_re;
2505  o1_22_im += A2_im;
2506  o1_32_re += B2_re;
2507  o1_32_im += B2_im;
2508 
2509  }
2510  {
2511 #ifdef MULTI_GPU
2512  if (kernel_type == INTERIOR_KERNEL) {
2513 #endif
2514 
2515  // read flavor 2 from device memory
2516  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2517 
2518  // project spinor into half spinors
2519  a0_re = +2*i20_re;
2520  a0_im = +2*i20_im;
2521  a1_re = +2*i21_re;
2522  a1_im = +2*i21_im;
2523  a2_re = +2*i22_re;
2524  a2_im = +2*i22_im;
2525  b0_re = +2*i30_re;
2526  b0_im = +2*i30_im;
2527  b1_re = +2*i31_re;
2528  b1_im = +2*i31_im;
2529  b2_re = +2*i32_re;
2530  b2_im = +2*i32_im;
2531 
2532 #ifdef MULTI_GPU
2533  } else {
2534 
2535  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2536 
2537  // read half spinor for the second flavor from device memory
2538  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2539  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2540 
2541  a0_re = 2*i00_re; a0_im = 2*i00_im;
2542  a1_re = 2*i01_re; a1_im = 2*i01_im;
2543  a2_re = 2*i02_re; a2_im = 2*i02_im;
2544  b0_re = 2*i10_re; b0_im = 2*i10_im;
2545  b1_re = 2*i11_re; b1_im = 2*i11_im;
2546  b2_re = 2*i12_re; b2_im = 2*i12_im;
2547 
2548  }
2549 #endif // MULTI_GPU
2550 
2551  // identity gauge matrix
2558 
2559  o2_20_re += A0_re;
2560  o2_20_im += A0_im;
2561  o2_30_re += B0_re;
2562  o2_30_im += B0_im;
2563 
2564  o2_21_re += A1_re;
2565  o2_21_im += A1_im;
2566  o2_31_re += B1_re;
2567  o2_31_im += B1_im;
2568 
2569  o2_22_re += A2_re;
2570  o2_22_im += A2_im;
2571  o2_32_re += B2_re;
2572  o2_32_im += B2_im;
2573 
2574  }
2575  } else {
2576  // read gauge matrix from device memory
2577  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2578 
2579  // reconstruct gauge matrix
2581 
2582  {
2583 #ifdef MULTI_GPU
2584  if (kernel_type == INTERIOR_KERNEL) {
2585 #endif
2586 
2587  // read flavor 1 from device memory
2588  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2589 
2590  // project spinor into half spinors
2591  a0_re = +2*i20_re;
2592  a0_im = +2*i20_im;
2593  a1_re = +2*i21_re;
2594  a1_im = +2*i21_im;
2595  a2_re = +2*i22_re;
2596  a2_im = +2*i22_im;
2597  b0_re = +2*i30_re;
2598  b0_im = +2*i30_im;
2599  b1_re = +2*i31_re;
2600  b1_im = +2*i31_im;
2601  b2_re = +2*i32_re;
2602  b2_im = +2*i32_im;
2603 
2604 #ifdef MULTI_GPU
2605  } else {
2606 
2607  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2608 
2609  // read half spinor for the first flavor from device memory
2610  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2611 
2612  a0_re = 2*i00_re; a0_im = 2*i00_im;
2613  a1_re = 2*i01_re; a1_im = 2*i01_im;
2614  a2_re = 2*i02_re; a2_im = 2*i02_im;
2615  b0_re = 2*i10_re; b0_im = 2*i10_im;
2616  b1_re = 2*i11_re; b1_im = 2*i11_im;
2617  b2_re = 2*i12_re; b2_im = 2*i12_im;
2618 
2619  }
2620 #endif // MULTI_GPU
2621 
2622  // multiply row 0
2623  spinorFloat A0_re = 0;
2624  A0_re += g00_re * a0_re;
2625  A0_re -= g00_im * a0_im;
2626  A0_re += g01_re * a1_re;
2627  A0_re -= g01_im * a1_im;
2628  A0_re += g02_re * a2_re;
2629  A0_re -= g02_im * a2_im;
2630  spinorFloat A0_im = 0;
2631  A0_im += g00_re * a0_im;
2632  A0_im += g00_im * a0_re;
2633  A0_im += g01_re * a1_im;
2634  A0_im += g01_im * a1_re;
2635  A0_im += g02_re * a2_im;
2636  A0_im += g02_im * a2_re;
2637  spinorFloat B0_re = 0;
2638  B0_re += g00_re * b0_re;
2639  B0_re -= g00_im * b0_im;
2640  B0_re += g01_re * b1_re;
2641  B0_re -= g01_im * b1_im;
2642  B0_re += g02_re * b2_re;
2643  B0_re -= g02_im * b2_im;
2644  spinorFloat B0_im = 0;
2645  B0_im += g00_re * b0_im;
2646  B0_im += g00_im * b0_re;
2647  B0_im += g01_re * b1_im;
2648  B0_im += g01_im * b1_re;
2649  B0_im += g02_re * b2_im;
2650  B0_im += g02_im * b2_re;
2651 
2652  // multiply row 1
2653  spinorFloat A1_re = 0;
2654  A1_re += g10_re * a0_re;
2655  A1_re -= g10_im * a0_im;
2656  A1_re += g11_re * a1_re;
2657  A1_re -= g11_im * a1_im;
2658  A1_re += g12_re * a2_re;
2659  A1_re -= g12_im * a2_im;
2660  spinorFloat A1_im = 0;
2661  A1_im += g10_re * a0_im;
2662  A1_im += g10_im * a0_re;
2663  A1_im += g11_re * a1_im;
2664  A1_im += g11_im * a1_re;
2665  A1_im += g12_re * a2_im;
2666  A1_im += g12_im * a2_re;
2667  spinorFloat B1_re = 0;
2668  B1_re += g10_re * b0_re;
2669  B1_re -= g10_im * b0_im;
2670  B1_re += g11_re * b1_re;
2671  B1_re -= g11_im * b1_im;
2672  B1_re += g12_re * b2_re;
2673  B1_re -= g12_im * b2_im;
2674  spinorFloat B1_im = 0;
2675  B1_im += g10_re * b0_im;
2676  B1_im += g10_im * b0_re;
2677  B1_im += g11_re * b1_im;
2678  B1_im += g11_im * b1_re;
2679  B1_im += g12_re * b2_im;
2680  B1_im += g12_im * b2_re;
2681 
2682  // multiply row 2
2683  spinorFloat A2_re = 0;
2684  A2_re += g20_re * a0_re;
2685  A2_re -= g20_im * a0_im;
2686  A2_re += g21_re * a1_re;
2687  A2_re -= g21_im * a1_im;
2688  A2_re += g22_re * a2_re;
2689  A2_re -= g22_im * a2_im;
2690  spinorFloat A2_im = 0;
2691  A2_im += g20_re * a0_im;
2692  A2_im += g20_im * a0_re;
2693  A2_im += g21_re * a1_im;
2694  A2_im += g21_im * a1_re;
2695  A2_im += g22_re * a2_im;
2696  A2_im += g22_im * a2_re;
2697  spinorFloat B2_re = 0;
2698  B2_re += g20_re * b0_re;
2699  B2_re -= g20_im * b0_im;
2700  B2_re += g21_re * b1_re;
2701  B2_re -= g21_im * b1_im;
2702  B2_re += g22_re * b2_re;
2703  B2_re -= g22_im * b2_im;
2704  spinorFloat B2_im = 0;
2705  B2_im += g20_re * b0_im;
2706  B2_im += g20_im * b0_re;
2707  B2_im += g21_re * b1_im;
2708  B2_im += g21_im * b1_re;
2709  B2_im += g22_re * b2_im;
2710  B2_im += g22_im * b2_re;
2711 
2712  o1_20_re += A0_re;
2713  o1_20_im += A0_im;
2714  o1_30_re += B0_re;
2715  o1_30_im += B0_im;
2716 
2717  o1_21_re += A1_re;
2718  o1_21_im += A1_im;
2719  o1_31_re += B1_re;
2720  o1_31_im += B1_im;
2721 
2722  o1_22_re += A2_re;
2723  o1_22_im += A2_im;
2724  o1_32_re += B2_re;
2725  o1_32_im += B2_im;
2726 
2727  }
2728  {
2729 #ifdef MULTI_GPU
2730  if (kernel_type == INTERIOR_KERNEL) {
2731 #endif
2732 
2733  // read flavor 2 from device memory
2734  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2735 
2736  // project spinor into half spinors
2737  a0_re = +2*i20_re;
2738  a0_im = +2*i20_im;
2739  a1_re = +2*i21_re;
2740  a1_im = +2*i21_im;
2741  a2_re = +2*i22_re;
2742  a2_im = +2*i22_im;
2743  b0_re = +2*i30_re;
2744  b0_im = +2*i30_im;
2745  b1_re = +2*i31_re;
2746  b1_im = +2*i31_im;
2747  b2_re = +2*i32_re;
2748  b2_im = +2*i32_im;
2749 
2750 #ifdef MULTI_GPU
2751  } else {
2752 
2753  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2754 
2755  // read half spinor for the second flavor from device memory
2756  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
2757  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
2758 
2759  a0_re = 2*i00_re; a0_im = 2*i00_im;
2760  a1_re = 2*i01_re; a1_im = 2*i01_im;
2761  a2_re = 2*i02_re; a2_im = 2*i02_im;
2762  b0_re = 2*i10_re; b0_im = 2*i10_im;
2763  b1_re = 2*i11_re; b1_im = 2*i11_im;
2764  b2_re = 2*i12_re; b2_im = 2*i12_im;
2765 
2766  }
2767 #endif // MULTI_GPU
2768 
2769  // multiply row 0
2770  spinorFloat A0_re = 0;
2771  A0_re += g00_re * a0_re;
2772  A0_re -= g00_im * a0_im;
2773  A0_re += g01_re * a1_re;
2774  A0_re -= g01_im * a1_im;
2775  A0_re += g02_re * a2_re;
2776  A0_re -= g02_im * a2_im;
2777  spinorFloat A0_im = 0;
2778  A0_im += g00_re * a0_im;
2779  A0_im += g00_im * a0_re;
2780  A0_im += g01_re * a1_im;
2781  A0_im += g01_im * a1_re;
2782  A0_im += g02_re * a2_im;
2783  A0_im += g02_im * a2_re;
2784  spinorFloat B0_re = 0;
2785  B0_re += g00_re * b0_re;
2786  B0_re -= g00_im * b0_im;
2787  B0_re += g01_re * b1_re;
2788  B0_re -= g01_im * b1_im;
2789  B0_re += g02_re * b2_re;
2790  B0_re -= g02_im * b2_im;
2791  spinorFloat B0_im = 0;
2792  B0_im += g00_re * b0_im;
2793  B0_im += g00_im * b0_re;
2794  B0_im += g01_re * b1_im;
2795  B0_im += g01_im * b1_re;
2796  B0_im += g02_re * b2_im;
2797  B0_im += g02_im * b2_re;
2798 
2799  // multiply row 1
2800  spinorFloat A1_re = 0;
2801  A1_re += g10_re * a0_re;
2802  A1_re -= g10_im * a0_im;
2803  A1_re += g11_re * a1_re;
2804  A1_re -= g11_im * a1_im;
2805  A1_re += g12_re * a2_re;
2806  A1_re -= g12_im * a2_im;
2807  spinorFloat A1_im = 0;
2808  A1_im += g10_re * a0_im;
2809  A1_im += g10_im * a0_re;
2810  A1_im += g11_re * a1_im;
2811  A1_im += g11_im * a1_re;
2812  A1_im += g12_re * a2_im;
2813  A1_im += g12_im * a2_re;
2814  spinorFloat B1_re = 0;
2815  B1_re += g10_re * b0_re;
2816  B1_re -= g10_im * b0_im;
2817  B1_re += g11_re * b1_re;
2818  B1_re -= g11_im * b1_im;
2819  B1_re += g12_re * b2_re;
2820  B1_re -= g12_im * b2_im;
2821  spinorFloat B1_im = 0;
2822  B1_im += g10_re * b0_im;
2823  B1_im += g10_im * b0_re;
2824  B1_im += g11_re * b1_im;
2825  B1_im += g11_im * b1_re;
2826  B1_im += g12_re * b2_im;
2827  B1_im += g12_im * b2_re;
2828 
2829  // multiply row 2
2830  spinorFloat A2_re = 0;
2831  A2_re += g20_re * a0_re;
2832  A2_re -= g20_im * a0_im;
2833  A2_re += g21_re * a1_re;
2834  A2_re -= g21_im * a1_im;
2835  A2_re += g22_re * a2_re;
2836  A2_re -= g22_im * a2_im;
2837  spinorFloat A2_im = 0;
2838  A2_im += g20_re * a0_im;
2839  A2_im += g20_im * a0_re;
2840  A2_im += g21_re * a1_im;
2841  A2_im += g21_im * a1_re;
2842  A2_im += g22_re * a2_im;
2843  A2_im += g22_im * a2_re;
2844  spinorFloat B2_re = 0;
2845  B2_re += g20_re * b0_re;
2846  B2_re -= g20_im * b0_im;
2847  B2_re += g21_re * b1_re;
2848  B2_re -= g21_im * b1_im;
2849  B2_re += g22_re * b2_re;
2850  B2_re -= g22_im * b2_im;
2851  spinorFloat B2_im = 0;
2852  B2_im += g20_re * b0_im;
2853  B2_im += g20_im * b0_re;
2854  B2_im += g21_re * b1_im;
2855  B2_im += g21_im * b1_re;
2856  B2_im += g22_re * b2_im;
2857  B2_im += g22_im * b2_re;
2858 
2859  o2_20_re += A0_re;
2860  o2_20_im += A0_im;
2861  o2_30_re += B0_re;
2862  o2_30_im += B0_im;
2863 
2864  o2_21_re += A1_re;
2865  o2_21_im += A1_im;
2866  o2_31_re += B1_re;
2867  o2_31_im += B1_im;
2868 
2869  o2_22_re += A2_re;
2870  o2_22_im += A2_im;
2871  o2_32_re += B2_re;
2872  o2_32_im += B2_im;
2873 
2874  }
2875  }
2876 }
2877 
2878 #ifdef MULTI_GPU
2879 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2880  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2881 #endif
2882 {
2883  // Projector P3+
2884  // 2 0 0 0
2885  // 0 2 0 0
2886  // 0 0 0 0
2887  // 0 0 0 0
2888 
2889 #ifdef MULTI_GPU
2890  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2891  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2892 #else
2893  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2894 #endif
2895 
2896 #ifdef MULTI_GPU
2897  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2898 #else
2899  const int ga_idx = sp_idx;
2900 #endif
2901 
2908 
2909  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2910  {
2911  {
2912 #ifdef MULTI_GPU
2913  if (kernel_type == INTERIOR_KERNEL) {
2914 #endif
2915 
2916  // read flavor 1 from device memory
2917  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2918 
2919  // project spinor into half spinors
2920  a0_re = +2*i00_re;
2921  a0_im = +2*i00_im;
2922  a1_re = +2*i01_re;
2923  a1_im = +2*i01_im;
2924  a2_re = +2*i02_re;
2925  a2_im = +2*i02_im;
2926  b0_re = +2*i10_re;
2927  b0_im = +2*i10_im;
2928  b1_re = +2*i11_re;
2929  b1_im = +2*i11_im;
2930  b2_re = +2*i12_re;
2931  b2_im = +2*i12_im;
2932 
2933 #ifdef MULTI_GPU
2934  } else {
2935 
2936  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
2937 
2938  // read half spinor for the first flavor from device memory
2939  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2940 
2941  a0_re = 2*i00_re; a0_im = 2*i00_im;
2942  a1_re = 2*i01_re; a1_im = 2*i01_im;
2943  a2_re = 2*i02_re; a2_im = 2*i02_im;
2944  b0_re = 2*i10_re; b0_im = 2*i10_im;
2945  b1_re = 2*i11_re; b1_im = 2*i11_im;
2946  b2_re = 2*i12_re; b2_im = 2*i12_im;
2947 
2948  }
2949 #endif // MULTI_GPU
2950 
2951  // identity gauge matrix
2958 
2959  o1_00_re += A0_re;
2960  o1_00_im += A0_im;
2961  o1_10_re += B0_re;
2962  o1_10_im += B0_im;
2963 
2964  o1_01_re += A1_re;
2965  o1_01_im += A1_im;
2966  o1_11_re += B1_re;
2967  o1_11_im += B1_im;
2968 
2969  o1_02_re += A2_re;
2970  o1_02_im += A2_im;
2971  o1_12_re += B2_re;
2972  o1_12_im += B2_im;
2973 
2974  }
2975  {
2976 #ifdef MULTI_GPU
2977  if (kernel_type == INTERIOR_KERNEL) {
2978 #endif
2979 
2980  // read flavor 2 from device memory
2981  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
2982 
2983  // project spinor into half spinors
2984  a0_re = +2*i00_re;
2985  a0_im = +2*i00_im;
2986  a1_re = +2*i01_re;
2987  a1_im = +2*i01_im;
2988  a2_re = +2*i02_re;
2989  a2_im = +2*i02_im;
2990  b0_re = +2*i10_re;
2991  b0_im = +2*i10_im;
2992  b1_re = +2*i11_re;
2993  b1_im = +2*i11_im;
2994  b2_re = +2*i12_re;
2995  b2_im = +2*i12_im;
2996 
2997 #ifdef MULTI_GPU
2998  } else {
2999 
3000  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
3001 
3002  // read half spinor for the second flavor from device memory
3003  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
3004  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
3005 
3006  a0_re = 2*i00_re; a0_im = 2*i00_im;
3007  a1_re = 2*i01_re; a1_im = 2*i01_im;
3008  a2_re = 2*i02_re; a2_im = 2*i02_im;
3009  b0_re = 2*i10_re; b0_im = 2*i10_im;
3010  b1_re = 2*i11_re; b1_im = 2*i11_im;
3011  b2_re = 2*i12_re; b2_im = 2*i12_im;
3012 
3013  }
3014 #endif // MULTI_GPU
3015 
3016  // identity gauge matrix
3023 
3024  o2_00_re += A0_re;
3025  o2_00_im += A0_im;
3026  o2_10_re += B0_re;
3027  o2_10_im += B0_im;
3028 
3029  o2_01_re += A1_re;
3030  o2_01_im += A1_im;
3031  o2_11_re += B1_re;
3032  o2_11_im += B1_im;
3033 
3034  o2_02_re += A2_re;
3035  o2_02_im += A2_im;
3036  o2_12_re += B2_re;
3037  o2_12_im += B2_im;
3038 
3039  }
3040  } else {
3041  // read gauge matrix from device memory
3042  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
3043 
3044  // reconstruct gauge matrix
3046 
3047  {
3048 #ifdef MULTI_GPU
3049  if (kernel_type == INTERIOR_KERNEL) {
3050 #endif
3051 
3052  // read flavor 1 from device memory
3053  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
3054 
3055  // project spinor into half spinors
3056  a0_re = +2*i00_re;
3057  a0_im = +2*i00_im;
3058  a1_re = +2*i01_re;
3059  a1_im = +2*i01_im;
3060  a2_re = +2*i02_re;
3061  a2_im = +2*i02_im;
3062  b0_re = +2*i10_re;
3063  b0_im = +2*i10_im;
3064  b1_re = +2*i11_re;
3065  b1_im = +2*i11_im;
3066  b2_re = +2*i12_re;
3067  b2_im = +2*i12_im;
3068 
3069 #ifdef MULTI_GPU
3070  } else {
3071 
3072  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
3073 
3074  // read half spinor for the first flavor from device memory
3075  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
3076 
3077  a0_re = 2*i00_re; a0_im = 2*i00_im;
3078  a1_re = 2*i01_re; a1_im = 2*i01_im;
3079  a2_re = 2*i02_re; a2_im = 2*i02_im;
3080  b0_re = 2*i10_re; b0_im = 2*i10_im;
3081  b1_re = 2*i11_re; b1_im = 2*i11_im;
3082  b2_re = 2*i12_re; b2_im = 2*i12_im;
3083 
3084  }
3085 #endif // MULTI_GPU
3086 
3087  // multiply row 0
3088  spinorFloat A0_re = 0;
3089  A0_re += gT00_re * a0_re;
3090  A0_re -= gT00_im * a0_im;
3091  A0_re += gT01_re * a1_re;
3092  A0_re -= gT01_im * a1_im;
3093  A0_re += gT02_re * a2_re;
3094  A0_re -= gT02_im * a2_im;
3095  spinorFloat A0_im = 0;
3096  A0_im += gT00_re * a0_im;
3097  A0_im += gT00_im * a0_re;
3098  A0_im += gT01_re * a1_im;
3099  A0_im += gT01_im * a1_re;
3100  A0_im += gT02_re * a2_im;
3101  A0_im += gT02_im * a2_re;
3102  spinorFloat B0_re = 0;
3103  B0_re += gT00_re * b0_re;
3104  B0_re -= gT00_im * b0_im;
3105  B0_re += gT01_re * b1_re;
3106  B0_re -= gT01_im * b1_im;
3107  B0_re += gT02_re * b2_re;
3108  B0_re -= gT02_im * b2_im;
3109  spinorFloat B0_im = 0;
3110  B0_im += gT00_re * b0_im;
3111  B0_im += gT00_im * b0_re;
3112  B0_im += gT01_re * b1_im;
3113  B0_im += gT01_im * b1_re;
3114  B0_im += gT02_re * b2_im;
3115  B0_im += gT02_im * b2_re;
3116 
3117  // multiply row 1
3118  spinorFloat A1_re = 0;
3119  A1_re += gT10_re * a0_re;
3120  A1_re -= gT10_im * a0_im;
3121  A1_re += gT11_re * a1_re;
3122  A1_re -= gT11_im * a1_im;
3123  A1_re += gT12_re * a2_re;
3124  A1_re -= gT12_im * a2_im;
3125  spinorFloat A1_im = 0;
3126  A1_im += gT10_re * a0_im;
3127  A1_im += gT10_im * a0_re;
3128  A1_im += gT11_re * a1_im;
3129  A1_im += gT11_im * a1_re;
3130  A1_im += gT12_re * a2_im;
3131  A1_im += gT12_im * a2_re;
3132  spinorFloat B1_re = 0;
3133  B1_re += gT10_re * b0_re;
3134  B1_re -= gT10_im * b0_im;
3135  B1_re += gT11_re * b1_re;
3136  B1_re -= gT11_im * b1_im;
3137  B1_re += gT12_re * b2_re;
3138  B1_re -= gT12_im * b2_im;
3139  spinorFloat B1_im = 0;
3140  B1_im += gT10_re * b0_im;
3141  B1_im += gT10_im * b0_re;
3142  B1_im += gT11_re * b1_im;
3143  B1_im += gT11_im * b1_re;
3144  B1_im += gT12_re * b2_im;
3145  B1_im += gT12_im * b2_re;
3146 
3147  // multiply row 2
3148  spinorFloat A2_re = 0;
3149  A2_re += gT20_re * a0_re;
3150  A2_re -= gT20_im * a0_im;
3151  A2_re += gT21_re * a1_re;
3152  A2_re -= gT21_im * a1_im;
3153  A2_re += gT22_re * a2_re;
3154  A2_re -= gT22_im * a2_im;
3155  spinorFloat A2_im = 0;
3156  A2_im += gT20_re * a0_im;
3157  A2_im += gT20_im * a0_re;
3158  A2_im += gT21_re * a1_im;
3159  A2_im += gT21_im * a1_re;
3160  A2_im += gT22_re * a2_im;
3161  A2_im += gT22_im * a2_re;
3162  spinorFloat B2_re = 0;
3163  B2_re += gT20_re * b0_re;
3164  B2_re -= gT20_im * b0_im;
3165  B2_re += gT21_re * b1_re;
3166  B2_re -= gT21_im * b1_im;
3167  B2_re += gT22_re * b2_re;
3168  B2_re -= gT22_im * b2_im;
3169  spinorFloat B2_im = 0;
3170  B2_im += gT20_re * b0_im;
3171  B2_im += gT20_im * b0_re;
3172  B2_im += gT21_re * b1_im;
3173  B2_im += gT21_im * b1_re;
3174  B2_im += gT22_re * b2_im;
3175  B2_im += gT22_im * b2_re;
3176 
3177  o1_00_re += A0_re;
3178  o1_00_im += A0_im;
3179  o1_10_re += B0_re;
3180  o1_10_im += B0_im;
3181 
3182  o1_01_re += A1_re;
3183  o1_01_im += A1_im;
3184  o1_11_re += B1_re;
3185  o1_11_im += B1_im;
3186 
3187  o1_02_re += A2_re;
3188  o1_02_im += A2_im;
3189  o1_12_re += B2_re;
3190  o1_12_im += B2_im;
3191 
3192  }
3193  {
3194 #ifdef MULTI_GPU
3195  if (kernel_type == INTERIOR_KERNEL) {
3196 #endif
3197 
3198  // read flavor 2 from device memory
3199  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx+param.fl_stride, sp_idx+param.fl_stride);
3200 
3201  // project spinor into half spinors
3202  a0_re = +2*i00_re;
3203  a0_im = +2*i00_im;
3204  a1_re = +2*i01_re;
3205  a1_im = +2*i01_im;
3206  a2_re = +2*i02_re;
3207  a2_im = +2*i02_im;
3208  b0_re = +2*i10_re;
3209  b0_im = +2*i10_im;
3210  b1_re = +2*i11_re;
3211  b1_im = +2*i11_im;
3212  b2_re = +2*i12_re;
3213  b2_im = +2*i12_im;
3214 
3215 #ifdef MULTI_GPU
3216  } else {
3217 
3218  const int sp_stride_pad = FLAVORS*ghostFace[static_cast<int>(kernel_type)];
3219 
3220  // read half spinor for the second flavor from device memory
3221  const int fl_idx = sp_idx + ghostFace[static_cast<int>(kernel_type)];
3222  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, fl_idx, sp_norm_idx+ghostFace[static_cast<int>(kernel_type)]);
3223 
3224  a0_re = 2*i00_re; a0_im = 2*i00_im;
3225  a1_re = 2*i01_re; a1_im = 2*i01_im;
3226  a2_re = 2*i02_re; a2_im = 2*i02_im;
3227  b0_re = 2*i10_re; b0_im = 2*i10_im;
3228  b1_re = 2*i11_re; b1_im = 2*i11_im;
3229  b2_re = 2*i12_re; b2_im = 2*i12_im;
3230 
3231  }
3232 #endif // MULTI_GPU
3233 
3234  // multiply row 0
3235  spinorFloat A0_re = 0;
3236  A0_re += gT00_re * a0_re;
3237  A0_re -= gT00_im * a0_im;
3238  A0_re += gT01_re * a1_re;
3239  A0_re -= gT01_im * a1_im;
3240  A0_re += gT02_re * a2_re;
3241  A0_re -= gT02_im * a2_im;
3242  spinorFloat A0_im = 0;
3243  A0_im += gT00_re * a0_im;
3244  A0_im += gT00_im * a0_re;
3245  A0_im += gT01_re * a1_im;
3246  A0_im += gT01_im * a1_re;
3247  A0_im += gT02_re * a2_im;
3248  A0_im += gT02_im * a2_re;
3249  spinorFloat B0_re = 0;
3250  B0_re += gT00_re * b0_re;
3251  B0_re -= gT00_im * b0_im;
3252  B0_re += gT01_re * b1_re;
3253  B0_re -= gT01_im * b1_im;
3254  B0_re += gT02_re * b2_re;
3255  B0_re -= gT02_im * b2_im;
3256  spinorFloat B0_im = 0;
3257  B0_im += gT00_re * b0_im;
3258  B0_im += gT00_im * b0_re;
3259  B0_im += gT01_re * b1_im;
3260  B0_im += gT01_im * b1_re;
3261  B0_im += gT02_re * b2_im;
3262  B0_im += gT02_im * b2_re;
3263 
3264  // multiply row 1
3265  spinorFloat A1_re = 0;
3266  A1_re += gT10_re * a0_re;
3267  A1_re -= gT10_im * a0_im;
3268  A1_re += gT11_re * a1_re;
3269  A1_re -= gT11_im * a1_im;
3270  A1_re += gT12_re * a2_re;
3271  A1_re -= gT12_im * a2_im;
3272  spinorFloat A1_im = 0;
3273  A1_im += gT10_re * a0_im;
3274  A1_im += gT10_im * a0_re;
3275  A1_im += gT11_re * a1_im;
3276  A1_im += gT11_im * a1_re;
3277  A1_im += gT12_re * a2_im;
3278  A1_im += gT12_im * a2_re;
3279  spinorFloat B1_re = 0;
3280  B1_re += gT10_re * b0_re;
3281  B1_re -= gT10_im * b0_im;
3282  B1_re += gT11_re * b1_re;
3283  B1_re -= gT11_im * b1_im;
3284  B1_re += gT12_re * b2_re;
3285  B1_re -= gT12_im * b2_im;
3286  spinorFloat B1_im = 0;
3287  B1_im += gT10_re * b0_im;
3288  B1_im += gT10_im * b0_re;
3289  B1_im += gT11_re * b1_im;
3290  B1_im += gT11_im * b1_re;
3291  B1_im += gT12_re * b2_im;
3292  B1_im += gT12_im * b2_re;
3293 
3294  // multiply row 2
3295  spinorFloat A2_re = 0;
3296  A2_re += gT20_re * a0_re;
3297  A2_re -= gT20_im * a0_im;
3298  A2_re += gT21_re * a1_re;
3299  A2_re -= gT21_im * a1_im;
3300  A2_re += gT22_re * a2_re;
3301  A2_re -= gT22_im * a2_im;
3302  spinorFloat A2_im = 0;
3303  A2_im += gT20_re * a0_im;
3304  A2_im += gT20_im * a0_re;
3305  A2_im += gT21_re * a1_im;
3306  A2_im += gT21_im * a1_re;
3307  A2_im += gT22_re * a2_im;
3308  A2_im += gT22_im * a2_re;
3309  spinorFloat B2_re = 0;
3310  B2_re += gT20_re * b0_re;
3311  B2_re -= gT20_im * b0_im;
3312  B2_re += gT21_re * b1_re;
3313  B2_re -= gT21_im * b1_im;
3314  B2_re += gT22_re * b2_re;
3315  B2_re -= gT22_im * b2_im;
3316  spinorFloat B2_im = 0;
3317  B2_im += gT20_re * b0_im;
3318  B2_im += gT20_im * b0_re;
3319  B2_im += gT21_re * b1_im;
3320  B2_im += gT21_im * b1_re;
3321  B2_im += gT22_re * b2_im;
3322  B2_im += gT22_im * b2_re;
3323 
3324  o2_00_re += A0_re;
3325  o2_00_im += A0_im;
3326  o2_10_re += B0_re;
3327  o2_10_im += B0_im;
3328 
3329  o2_01_re += A1_re;
3330  o2_01_im += A1_im;
3331  o2_11_re += B1_re;
3332  o2_11_im += B1_im;
3333 
3334  o2_02_re += A2_re;
3335  o2_02_im += A2_im;
3336  o2_12_re += B2_re;
3337  o2_12_im += B2_im;
3338 
3339  }
3340  }
3341 }
3342 
3343 #ifdef MULTI_GPU
3344 
3345 int incomplete = 0; // Have all 8 contributions been computed for this site?
3346 
3347 switch(kernel_type) { // intentional fall-through
3348 case INTERIOR_KERNEL:
3349  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
3350 case EXTERIOR_KERNEL_T:
3351  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
3352 case EXTERIOR_KERNEL_Z:
3353  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
3354 case EXTERIOR_KERNEL_Y:
3355  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
3356 }
3357 
3358 
3359 if (!incomplete)
3360 #endif // MULTI_GPU
3361 // apply twisted mass rotation
3362 {
3363 
3364 #ifdef DSLASH_TWIST
3365  {
3366  //Perform twist rotation first:
3367  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3368  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3369  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3370 
3371  x1_re = 0.0, x1_im = 0.0;
3372  y1_re = 0.0, y1_im = 0.0;
3373  x2_re = 0.0, x2_im = 0.0;
3374  y2_re = 0.0, y2_im = 0.0;
3375 
3376 
3377  // using o1 regs:
3378  x1_re = o1_00_re + a *o1_20_im;
3379  x1_im = o1_00_im - a *o1_20_re;
3380  x2_re = b * o1_00_re;
3381  x2_im = b * o1_00_im;
3382 
3383  y1_re = o1_20_re + a *o1_00_im;
3384  y1_im = o1_20_im - a *o1_00_re;
3385  y2_re = b * o1_20_re;
3386  y2_im = b * o1_20_im;
3387 
3388 
3389  // using o2 regs:
3390  x2_re += o2_00_re - a *o2_20_im;
3391  x2_im += o2_00_im + a *o2_20_re;
3392  x1_re += b * o2_00_re;
3393  x1_im += b * o2_00_im;
3394 
3395  y2_re += o2_20_re - a *o2_00_im;
3396  y2_im += o2_20_im + a *o2_00_re;
3397  y1_re += b * o2_20_re;
3398  y1_im += b * o2_20_im;
3399 
3400 
3401  o1_00_re = x1_re; o1_00_im = x1_im;
3402  o1_20_re = y1_re; o1_20_im = y1_im;
3403 
3404  o2_00_re = x2_re; o2_00_im = x2_im;
3405  o2_20_re = y2_re; o2_20_im = y2_im;
3406 
3407  // using o1 regs:
3408  x1_re = o1_10_re + a *o1_30_im;
3409  x1_im = o1_10_im - a *o1_30_re;
3410  x2_re = b * o1_10_re;
3411  x2_im = b * o1_10_im;
3412 
3413  y1_re = o1_30_re + a *o1_10_im;
3414  y1_im = o1_30_im - a *o1_10_re;
3415  y2_re = b * o1_30_re;
3416  y2_im = b * o1_30_im;
3417 
3418 
3419  // using o2 regs:
3420  x2_re += o2_10_re - a *o2_30_im;
3421  x2_im += o2_10_im + a *o2_30_re;
3422  x1_re += b * o2_10_re;
3423  x1_im += b * o2_10_im;
3424 
3425  y2_re += o2_30_re - a *o2_10_im;
3426  y2_im += o2_30_im + a *o2_10_re;
3427  y1_re += b * o2_30_re;
3428  y1_im += b * o2_30_im;
3429 
3430 
3431  o1_10_re = x1_re; o1_10_im = x1_im;
3432  o1_30_re = y1_re; o1_30_im = y1_im;
3433 
3434  o2_10_re = x2_re; o2_10_im = x2_im;
3435  o2_30_re = y2_re; o2_30_im = y2_im;
3436 
3437  // using o1 regs:
3438  x1_re = o1_01_re + a *o1_21_im;
3439  x1_im = o1_01_im - a *o1_21_re;
3440  x2_re = b * o1_01_re;
3441  x2_im = b * o1_01_im;
3442 
3443  y1_re = o1_21_re + a *o1_01_im;
3444  y1_im = o1_21_im - a *o1_01_re;
3445  y2_re = b * o1_21_re;
3446  y2_im = b * o1_21_im;
3447 
3448 
3449  // using o2 regs:
3450  x2_re += o2_01_re - a *o2_21_im;
3451  x2_im += o2_01_im + a *o2_21_re;
3452  x1_re += b * o2_01_re;
3453  x1_im += b * o2_01_im;
3454 
3455  y2_re += o2_21_re - a *o2_01_im;
3456  y2_im += o2_21_im + a *o2_01_re;
3457  y1_re += b * o2_21_re;
3458  y1_im += b * o2_21_im;
3459 
3460 
3461  o1_01_re = x1_re; o1_01_im = x1_im;
3462  o1_21_re = y1_re; o1_21_im = y1_im;
3463 
3464  o2_01_re = x2_re; o2_01_im = x2_im;
3465  o2_21_re = y2_re; o2_21_im = y2_im;
3466 
3467  // using o1 regs:
3468  x1_re = o1_11_re + a *o1_31_im;
3469  x1_im = o1_11_im - a *o1_31_re;
3470  x2_re = b * o1_11_re;
3471  x2_im = b * o1_11_im;
3472 
3473  y1_re = o1_31_re + a *o1_11_im;
3474  y1_im = o1_31_im - a *o1_11_re;
3475  y2_re = b * o1_31_re;
3476  y2_im = b * o1_31_im;
3477 
3478 
3479  // using o2 regs:
3480  x2_re += o2_11_re - a *o2_31_im;
3481  x2_im += o2_11_im + a *o2_31_re;
3482  x1_re += b * o2_11_re;
3483  x1_im += b * o2_11_im;
3484 
3485  y2_re += o2_31_re - a *o2_11_im;
3486  y2_im += o2_31_im + a *o2_11_re;
3487  y1_re += b * o2_31_re;
3488  y1_im += b * o2_31_im;
3489 
3490 
3491  o1_11_re = x1_re; o1_11_im = x1_im;
3492  o1_31_re = y1_re; o1_31_im = y1_im;
3493 
3494  o2_11_re = x2_re; o2_11_im = x2_im;
3495  o2_31_re = y2_re; o2_31_im = y2_im;
3496 
3497  // using o1 regs:
3498  x1_re = o1_02_re + a *o1_22_im;
3499  x1_im = o1_02_im - a *o1_22_re;
3500  x2_re = b * o1_02_re;
3501  x2_im = b * o1_02_im;
3502 
3503  y1_re = o1_22_re + a *o1_02_im;
3504  y1_im = o1_22_im - a *o1_02_re;
3505  y2_re = b * o1_22_re;
3506  y2_im = b * o1_22_im;
3507 
3508 
3509  // using o2 regs:
3510  x2_re += o2_02_re - a *o2_22_im;
3511  x2_im += o2_02_im + a *o2_22_re;
3512  x1_re += b * o2_02_re;
3513  x1_im += b * o2_02_im;
3514 
3515  y2_re += o2_22_re - a *o2_02_im;
3516  y2_im += o2_22_im + a *o2_02_re;
3517  y1_re += b * o2_22_re;
3518  y1_im += b * o2_22_im;
3519 
3520 
3521  o1_02_re = x1_re; o1_02_im = x1_im;
3522  o1_22_re = y1_re; o1_22_im = y1_im;
3523 
3524  o2_02_re = x2_re; o2_02_im = x2_im;
3525  o2_22_re = y2_re; o2_22_im = y2_im;
3526 
3527  // using o1 regs:
3528  x1_re = o1_12_re + a *o1_32_im;
3529  x1_im = o1_12_im - a *o1_32_re;
3530  x2_re = b * o1_12_re;
3531  x2_im = b * o1_12_im;
3532 
3533  y1_re = o1_32_re + a *o1_12_im;
3534  y1_im = o1_32_im - a *o1_12_re;
3535  y2_re = b * o1_32_re;
3536  y2_im = b * o1_32_im;
3537 
3538 
3539  // using o2 regs:
3540  x2_re += o2_12_re - a *o2_32_im;
3541  x2_im += o2_12_im + a *o2_32_re;
3542  x1_re += b * o2_12_re;
3543  x1_im += b * o2_12_im;
3544 
3545  y2_re += o2_32_re - a *o2_12_im;
3546  y2_im += o2_32_im + a *o2_12_re;
3547  y1_re += b * o2_32_re;
3548  y1_im += b * o2_32_im;
3549 
3550 
3551  o1_12_re = x1_re; o1_12_im = x1_im;
3552  o1_32_re = y1_re; o1_32_im = y1_im;
3553 
3554  o2_12_re = x2_re; o2_12_im = x2_im;
3555  o2_32_re = y2_re; o2_32_im = y2_im;
3556 
3557  }
3558 #endif
3559 
3560 #ifndef DSLASH_XPAY
3561  o1_00_re *= c;
3562  o1_00_im *= c;
3563  o1_01_re *= c;
3564  o1_01_im *= c;
3565  o1_02_re *= c;
3566  o1_02_im *= c;
3567  o1_10_re *= c;
3568  o1_10_im *= c;
3569  o1_11_re *= c;
3570  o1_11_im *= c;
3571  o1_12_re *= c;
3572  o1_12_im *= c;
3573  o1_20_re *= c;
3574  o1_20_im *= c;
3575  o1_21_re *= c;
3576  o1_21_im *= c;
3577  o1_22_re *= c;
3578  o1_22_im *= c;
3579  o1_30_re *= c;
3580  o1_30_im *= c;
3581  o1_31_re *= c;
3582  o1_31_im *= c;
3583  o1_32_re *= c;
3584  o1_32_im *= c;
3585 
3586  o2_00_re *= c;
3587  o2_00_im *= c;
3588  o2_01_re *= c;
3589  o2_01_im *= c;
3590  o2_02_re *= c;
3591  o2_02_im *= c;
3592  o2_10_re *= c;
3593  o2_10_im *= c;
3594  o2_11_re *= c;
3595  o2_11_im *= c;
3596  o2_12_re *= c;
3597  o2_12_im *= c;
3598  o2_20_re *= c;
3599  o2_20_im *= c;
3600  o2_21_re *= c;
3601  o2_21_im *= c;
3602  o2_22_re *= c;
3603  o2_22_im *= c;
3604  o2_30_re *= c;
3605  o2_30_im *= c;
3606  o2_31_re *= c;
3607  o2_31_im *= c;
3608  o2_32_re *= c;
3609  o2_32_im *= c;
3610 #else
3611 #ifdef DSLASH_TWIST
3612  // accum spinor
3613 #ifdef SPINOR_DOUBLE
3614 
3615 #define acc_00_re accum0.x
3616 #define acc_00_im accum0.y
3617 #define acc_01_re accum1.x
3618 #define acc_01_im accum1.y
3619 #define acc_02_re accum2.x
3620 #define acc_02_im accum2.y
3621 #define acc_10_re accum3.x
3622 #define acc_10_im accum3.y
3623 #define acc_11_re accum4.x
3624 #define acc_11_im accum4.y
3625 #define acc_12_re accum5.x
3626 #define acc_12_im accum5.y
3627 #define acc_20_re accum6.x
3628 #define acc_20_im accum6.y
3629 #define acc_21_re accum7.x
3630 #define acc_21_im accum7.y
3631 #define acc_22_re accum8.x
3632 #define acc_22_im accum8.y
3633 #define acc_30_re accum9.x
3634 #define acc_30_im accum9.y
3635 #define acc_31_re accum10.x
3636 #define acc_31_im accum10.y
3637 #define acc_32_re accum11.x
3638 #define acc_32_im accum11.y
3639 
3640 #else
3641 #define acc_00_re accum0.x
3642 #define acc_00_im accum0.y
3643 #define acc_01_re accum0.z
3644 #define acc_01_im accum0.w
3645 #define acc_02_re accum1.x
3646 #define acc_02_im accum1.y
3647 #define acc_10_re accum1.z
3648 #define acc_10_im accum1.w
3649 #define acc_11_re accum2.x
3650 #define acc_11_im accum2.y
3651 #define acc_12_re accum2.z
3652 #define acc_12_im accum2.w
3653 #define acc_20_re accum3.x
3654 #define acc_20_im accum3.y
3655 #define acc_21_re accum3.z
3656 #define acc_21_im accum3.w
3657 #define acc_22_re accum4.x
3658 #define acc_22_im accum4.y
3659 #define acc_30_re accum4.z
3660 #define acc_30_im accum4.w
3661 #define acc_31_re accum5.x
3662 #define acc_31_im accum5.y
3663 #define acc_32_re accum5.z
3664 #define acc_32_im accum5.w
3665 
3666 #endif // SPINOR_DOUBLE
3667 
3668  {
3669  READ_ACCUM(ACCUMTEX, param.sp_stride)
3670 
3671  o1_00_re = c*o1_00_re + acc_00_re;
3672  o1_00_im = c*o1_00_im + acc_00_im;
3673  o1_01_re = c*o1_01_re + acc_01_re;
3674  o1_01_im = c*o1_01_im + acc_01_im;
3675  o1_02_re = c*o1_02_re + acc_02_re;
3676  o1_02_im = c*o1_02_im + acc_02_im;
3677  o1_10_re = c*o1_10_re + acc_10_re;
3678  o1_10_im = c*o1_10_im + acc_10_im;
3679  o1_11_re = c*o1_11_re + acc_11_re;
3680  o1_11_im = c*o1_11_im + acc_11_im;
3681  o1_12_re = c*o1_12_re + acc_12_re;
3682  o1_12_im = c*o1_12_im + acc_12_im;
3683  o1_20_re = c*o1_20_re + acc_20_re;
3684  o1_20_im = c*o1_20_im + acc_20_im;
3685  o1_21_re = c*o1_21_re + acc_21_re;
3686  o1_21_im = c*o1_21_im + acc_21_im;
3687  o1_22_re = c*o1_22_re + acc_22_re;
3688  o1_22_im = c*o1_22_im + acc_22_im;
3689  o1_30_re = c*o1_30_re + acc_30_re;
3690  o1_30_im = c*o1_30_im + acc_30_im;
3691  o1_31_re = c*o1_31_re + acc_31_re;
3692  o1_31_im = c*o1_31_im + acc_31_im;
3693  o1_32_re = c*o1_32_re + acc_32_re;
3694  o1_32_im = c*o1_32_im + acc_32_im;
3695 
3696  ASSN_ACCUM(ACCUMTEX, param.sp_stride, param.fl_stride)
3697 
3698  o2_00_re = c*o2_00_re + acc_00_re;
3699  o2_00_im = c*o2_00_im + acc_00_im;
3700  o2_01_re = c*o2_01_re + acc_01_re;
3701  o2_01_im = c*o2_01_im + acc_01_im;
3702  o2_02_re = c*o2_02_re + acc_02_re;
3703  o2_02_im = c*o2_02_im + acc_02_im;
3704  o2_10_re = c*o2_10_re + acc_10_re;
3705  o2_10_im = c*o2_10_im + acc_10_im;
3706  o2_11_re = c*o2_11_re + acc_11_re;
3707  o2_11_im = c*o2_11_im + acc_11_im;
3708  o2_12_re = c*o2_12_re + acc_12_re;
3709  o2_12_im = c*o2_12_im + acc_12_im;
3710  o2_20_re = c*o2_20_re + acc_20_re;
3711  o2_20_im = c*o2_20_im + acc_20_im;
3712  o2_21_re = c*o2_21_re + acc_21_re;
3713  o2_21_im = c*o2_21_im + acc_21_im;
3714  o2_22_re = c*o2_22_re + acc_22_re;
3715  o2_22_im = c*o2_22_im + acc_22_im;
3716  o2_30_re = c*o2_30_re + acc_30_re;
3717  o2_30_im = c*o2_30_im + acc_30_im;
3718  o2_31_re = c*o2_31_re + acc_31_re;
3719  o2_31_im = c*o2_31_im + acc_31_im;
3720  o2_32_re = c*o2_32_re + acc_32_re;
3721  o2_32_im = c*o2_32_im + acc_32_im;
3722  }
3723 
3724 #undef acc_00_re
3725 #undef acc_00_im
3726 #undef acc_01_re
3727 #undef acc_01_im
3728 #undef acc_02_re
3729 #undef acc_02_im
3730 #undef acc_10_re
3731 #undef acc_10_im
3732 #undef acc_11_re
3733 #undef acc_11_im
3734 #undef acc_12_re
3735 #undef acc_12_im
3736 #undef acc_20_re
3737 #undef acc_20_im
3738 #undef acc_21_re
3739 #undef acc_21_im
3740 #undef acc_22_re
3741 #undef acc_22_im
3742 #undef acc_30_re
3743 #undef acc_30_im
3744 #undef acc_31_re
3745 #undef acc_31_im
3746 #undef acc_32_re
3747 #undef acc_32_im
3748 
3749 #else
3750  // accum spinor
3751 #ifdef SPINOR_DOUBLE
3752 
3753 #define acc1_00_re flv1_accum0.x
3754 #define acc1_00_im flv1_accum0.y
3755 #define acc1_01_re flv1_accum1.x
3756 #define acc1_01_im flv1_accum1.y
3757 #define acc1_02_re flv1_accum2.x
3758 #define acc1_02_im flv1_accum2.y
3759 #define acc1_10_re flv1_accum3.x
3760 #define acc1_10_im flv1_accum3.y
3761 #define acc1_11_re flv1_accum4.x
3762 #define acc1_11_im flv1_accum4.y
3763 #define acc1_12_re flv1_accum5.x
3764 #define acc1_12_im flv1_accum5.y
3765 #define acc1_20_re flv1_accum6.x
3766 #define acc1_20_im flv1_accum6.y
3767 #define acc1_21_re flv1_accum7.x
3768 #define acc1_21_im flv1_accum7.y
3769 #define acc1_22_re flv1_accum8.x
3770 #define acc1_22_im flv1_accum8.y
3771 #define acc1_30_re flv1_accum9.x
3772 #define acc1_30_im flv1_accum9.y
3773 #define acc1_31_re flv1_accum10.x
3774 #define acc1_31_im flv1_accum10.y
3775 #define acc1_32_re flv1_accum11.x
3776 #define acc1_32_im flv1_accum11.y
3777 
3778 #define acc2_00_re flv2_accum0.x
3779 #define acc2_00_im flv2_accum0.y
3780 #define acc2_01_re flv2_accum1.x
3781 #define acc2_01_im flv2_accum1.y
3782 #define acc2_02_re flv2_accum2.x
3783 #define acc2_02_im flv2_accum2.y
3784 #define acc2_10_re flv2_accum3.x
3785 #define acc2_10_im flv2_accum3.y
3786 #define acc2_11_re flv2_accum4.x
3787 #define acc2_11_im flv2_accum4.y
3788 #define acc2_12_re flv2_accum5.x
3789 #define acc2_12_im flv2_accum5.y
3790 #define acc2_20_re flv2_accum6.x
3791 #define acc2_20_im flv2_accum6.y
3792 #define acc2_21_re flv2_accum7.x
3793 #define acc2_21_im flv2_accum7.y
3794 #define acc2_22_re flv2_accum8.x
3795 #define acc2_22_im flv2_accum8.y
3796 #define acc2_30_re flv2_accum9.x
3797 #define acc2_30_im flv2_accum9.y
3798 #define acc2_31_re flv2_accum10.x
3799 #define acc2_31_im flv2_accum10.y
3800 #define acc2_32_re flv2_accum11.x
3801 #define acc2_32_im flv2_accum11.y
3802 
3803 #else
3804 
3805 #define acc1_00_re flv1_accum0.x
3806 #define acc1_00_im flv1_accum0.y
3807 #define acc1_01_re flv1_accum0.z
3808 #define acc1_01_im flv1_accum0.w
3809 #define acc1_02_re flv1_accum1.x
3810 #define acc1_02_im flv1_accum1.y
3811 #define acc1_10_re flv1_accum1.z
3812 #define acc1_10_im flv1_accum1.w
3813 #define acc1_11_re flv1_accum2.x
3814 #define acc1_11_im flv1_accum2.y
3815 #define acc1_12_re flv1_accum2.z
3816 #define acc1_12_im flv1_accum2.w
3817 #define acc1_20_re flv1_accum3.x
3818 #define acc1_20_im flv1_accum3.y
3819 #define acc1_21_re flv1_accum3.z
3820 #define acc1_21_im flv1_accum3.w
3821 #define acc1_22_re flv1_accum4.x
3822 #define acc1_22_im flv1_accum4.y
3823 #define acc1_30_re flv1_accum4.z
3824 #define acc1_30_im flv1_accum4.w
3825 #define acc1_31_re flv1_accum5.x
3826 #define acc1_31_im flv1_accum5.y
3827 #define acc1_32_re flv1_accum5.z
3828 #define acc1_32_im flv1_accum5.w
3829 
3830 #define acc2_00_re flv2_accum0.x
3831 #define acc2_00_im flv2_accum0.y
3832 #define acc2_01_re flv2_accum0.z
3833 #define acc2_01_im flv2_accum0.w
3834 #define acc2_02_re flv2_accum1.x
3835 #define acc2_02_im flv2_accum1.y
3836 #define acc2_10_re flv2_accum1.z
3837 #define acc2_10_im flv2_accum1.w
3838 #define acc2_11_re flv2_accum2.x
3839 #define acc2_11_im flv2_accum2.y
3840 #define acc2_12_re flv2_accum2.z
3841 #define acc2_12_im flv2_accum2.w
3842 #define acc2_20_re flv2_accum3.x
3843 #define acc2_20_im flv2_accum3.y
3844 #define acc2_21_re flv2_accum3.z
3845 #define acc2_21_im flv2_accum3.w
3846 #define acc2_22_re flv2_accum4.x
3847 #define acc2_22_im flv2_accum4.y
3848 #define acc2_30_re flv2_accum4.z
3849 #define acc2_30_im flv2_accum4.w
3850 #define acc2_31_re flv2_accum5.x
3851 #define acc2_31_im flv2_accum5.y
3852 #define acc2_32_re flv2_accum5.z
3853 #define acc2_32_im flv2_accum5.w
3854 
3855 #endif // SPINOR_DOUBLE
3856 
3857  {
3858  READ_ACCUM_FLAVOR(ACCUMTEX, param.sp_stride, param.fl_stride)
3859 
3860  //Perform twist rotation:
3861  //(1 - i*a*gamma_5 * tau_3 + b * tau_1)
3862  volatile spinorFloat x1_re, x1_im, y1_re, y1_im;
3863  volatile spinorFloat x2_re, x2_im, y2_re, y2_im;
3864 
3865  x1_re = 0.0, x1_im = 0.0;
3866  y1_re = 0.0, y1_im = 0.0;
3867  x2_re = 0.0, x2_im = 0.0;
3868  y2_re = 0.0, y2_im = 0.0;
3869 
3870 
3871  // using acc1 regs:
3872  x1_re = acc1_00_re + a *acc1_20_im;
3873  x1_im = acc1_00_im - a *acc1_20_re;
3874  x2_re = b * acc1_00_re;
3875  x2_im = b * acc1_00_im;
3876 
3877  y1_re = acc1_20_re + a *acc1_00_im;
3878  y1_im = acc1_20_im - a *acc1_00_re;
3879  y2_re = b * acc1_20_re;
3880  y2_im = b * acc1_20_im;
3881 
3882 
3883  // using acc2 regs:
3884  x2_re += acc2_00_re - a *acc2_20_im;
3885  x2_im += acc2_00_im + a *acc2_20_re;
3886  x1_re += b * acc2_00_re;
3887  x1_im += b * acc2_00_im;
3888 
3889  y2_re += acc2_20_re - a *acc2_00_im;
3890  y2_im += acc2_20_im + a *acc2_00_re;
3891  y1_re += b * acc2_20_re;
3892  y1_im += b * acc2_20_im;
3893 
3894 
3895  acc1_00_re = x1_re; acc1_00_im = x1_im;
3896  acc1_20_re = y1_re; acc1_20_im = y1_im;
3897 
3898  acc2_00_re = x2_re; acc2_00_im = x2_im;
3899  acc2_20_re = y2_re; acc2_20_im = y2_im;
3900 
3901  // using acc1 regs:
3902  x1_re = acc1_10_re + a *acc1_30_im;
3903  x1_im = acc1_10_im - a *acc1_30_re;
3904  x2_re = b * acc1_10_re;
3905  x2_im = b * acc1_10_im;
3906 
3907  y1_re = acc1_30_re + a *acc1_10_im;
3908  y1_im = acc1_30_im - a *acc1_10_re;
3909  y2_re = b * acc1_30_re;
3910  y2_im = b * acc1_30_im;
3911 
3912 
3913  // using acc2 regs:
3914  x2_re += acc2_10_re - a *acc2_30_im;
3915  x2_im += acc2_10_im + a *acc2_30_re;
3916  x1_re += b * acc2_10_re;
3917  x1_im += b * acc2_10_im;
3918 
3919  y2_re += acc2_30_re - a *acc2_10_im;
3920  y2_im += acc2_30_im + a *acc2_10_re;
3921  y1_re += b * acc2_30_re;
3922  y1_im += b * acc2_30_im;
3923 
3924 
3925  acc1_10_re = x1_re; acc1_10_im = x1_im;
3926  acc1_30_re = y1_re; acc1_30_im = y1_im;
3927 
3928  acc2_10_re = x2_re; acc2_10_im = x2_im;
3929  acc2_30_re = y2_re; acc2_30_im = y2_im;
3930 
3931  // using acc1 regs:
3932  x1_re = acc1_01_re + a *acc1_21_im;
3933  x1_im = acc1_01_im - a *acc1_21_re;
3934  x2_re = b * acc1_01_re;
3935  x2_im = b * acc1_01_im;
3936 
3937  y1_re = acc1_21_re + a *acc1_01_im;
3938  y1_im = acc1_21_im - a *acc1_01_re;
3939  y2_re = b * acc1_21_re;
3940  y2_im = b * acc1_21_im;
3941 
3942 
3943  // using acc2 regs:
3944  x2_re += acc2_01_re - a *acc2_21_im;
3945  x2_im += acc2_01_im + a *acc2_21_re;
3946  x1_re += b * acc2_01_re;
3947  x1_im += b * acc2_01_im;
3948 
3949  y2_re += acc2_21_re - a *acc2_01_im;
3950  y2_im += acc2_21_im + a *acc2_01_re;
3951  y1_re += b * acc2_21_re;
3952  y1_im += b * acc2_21_im;
3953 
3954 
3955  acc1_01_re = x1_re; acc1_01_im = x1_im;
3956  acc1_21_re = y1_re; acc1_21_im = y1_im;
3957 
3958  acc2_01_re = x2_re; acc2_01_im = x2_im;
3959  acc2_21_re = y2_re; acc2_21_im = y2_im;
3960 
3961  // using acc1 regs:
3962  x1_re = acc1_11_re + a *acc1_31_im;
3963  x1_im = acc1_11_im - a *acc1_31_re;
3964  x2_re = b * acc1_11_re;
3965  x2_im = b * acc1_11_im;
3966 
3967  y1_re = acc1_31_re + a *acc1_11_im;
3968  y1_im = acc1_31_im - a *acc1_11_re;
3969  y2_re = b * acc1_31_re;
3970  y2_im = b * acc1_31_im;
3971 
3972 
3973  // using acc2 regs:
3974  x2_re += acc2_11_re - a *acc2_31_im;
3975  x2_im += acc2_11_im + a *acc2_31_re;
3976  x1_re += b * acc2_11_re;
3977  x1_im += b * acc2_11_im;
3978 
3979  y2_re += acc2_31_re - a *acc2_11_im;
3980  y2_im += acc2_31_im + a *acc2_11_re;
3981  y1_re += b * acc2_31_re;
3982  y1_im += b * acc2_31_im;
3983 
3984 
3985  acc1_11_re = x1_re; acc1_11_im = x1_im;
3986  acc1_31_re = y1_re; acc1_31_im = y1_im;
3987 
3988  acc2_11_re = x2_re; acc2_11_im = x2_im;
3989  acc2_31_re = y2_re; acc2_31_im = y2_im;
3990 
3991  // using acc1 regs:
3992  x1_re = acc1_02_re + a *acc1_22_im;
3993  x1_im = acc1_02_im - a *acc1_22_re;
3994  x2_re = b * acc1_02_re;
3995  x2_im = b * acc1_02_im;
3996 
3997  y1_re = acc1_22_re + a *acc1_02_im;
3998  y1_im = acc1_22_im - a *acc1_02_re;
3999  y2_re = b * acc1_22_re;
4000  y2_im = b * acc1_22_im;
4001 
4002 
4003  // using acc2 regs:
4004  x2_re += acc2_02_re - a *acc2_22_im;
4005  x2_im += acc2_02_im + a *acc2_22_re;
4006  x1_re += b * acc2_02_re;
4007  x1_im += b * acc2_02_im;
4008 
4009  y2_re += acc2_22_re - a *acc2_02_im;
4010  y2_im += acc2_22_im + a *acc2_02_re;
4011  y1_re += b * acc2_22_re;
4012  y1_im += b * acc2_22_im;
4013 
4014 
4015  acc1_02_re = x1_re; acc1_02_im = x1_im;
4016  acc1_22_re = y1_re; acc1_22_im = y1_im;
4017 
4018  acc2_02_re = x2_re; acc2_02_im = x2_im;
4019  acc2_22_re = y2_re; acc2_22_im = y2_im;
4020 
4021  // using acc1 regs:
4022  x1_re = acc1_12_re + a *acc1_32_im;
4023  x1_im = acc1_12_im - a *acc1_32_re;
4024  x2_re = b * acc1_12_re;
4025  x2_im = b * acc1_12_im;
4026 
4027  y1_re = acc1_32_re + a *acc1_12_im;
4028  y1_im = acc1_32_im - a *acc1_12_re;
4029  y2_re = b * acc1_32_re;
4030  y2_im = b * acc1_32_im;
4031 
4032 
4033  // using acc2 regs:
4034  x2_re += acc2_12_re - a *acc2_32_im;
4035  x2_im += acc2_12_im + a *acc2_32_re;
4036  x1_re += b * acc2_12_re;
4037  x1_im += b * acc2_12_im;
4038 
4039  y2_re += acc2_32_re - a *acc2_12_im;
4040  y2_im += acc2_32_im + a *acc2_12_re;
4041  y1_re += b * acc2_32_re;
4042  y1_im += b * acc2_32_im;
4043 
4044 
4045  acc1_12_re = x1_re; acc1_12_im = x1_im;
4046  acc1_32_re = y1_re; acc1_32_im = y1_im;
4047 
4048  acc2_12_re = x2_re; acc2_12_im = x2_im;
4049  acc2_32_re = y2_re; acc2_32_im = y2_im;
4050 
4051  o1_00_re = k*o1_00_re + acc1_00_re;
4052  o1_00_im = k*o1_00_im + acc1_00_im;
4053  o1_01_re = k*o1_01_re + acc1_01_re;
4054  o1_01_im = k*o1_01_im + acc1_01_im;
4055  o1_02_re = k*o1_02_re + acc1_02_re;
4056  o1_02_im = k*o1_02_im + acc1_02_im;
4057  o1_10_re = k*o1_10_re + acc1_10_re;
4058  o1_10_im = k*o1_10_im + acc1_10_im;
4059  o1_11_re = k*o1_11_re + acc1_11_re;
4060  o1_11_im = k*o1_11_im + acc1_11_im;
4061  o1_12_re = k*o1_12_re + acc1_12_re;
4062  o1_12_im = k*o1_12_im + acc1_12_im;
4063  o1_20_re = k*o1_20_re + acc1_20_re;
4064  o1_20_im = k*o1_20_im + acc1_20_im;
4065  o1_21_re = k*o1_21_re + acc1_21_re;
4066  o1_21_im = k*o1_21_im + acc1_21_im;
4067  o1_22_re = k*o1_22_re + acc1_22_re;
4068  o1_22_im = k*o1_22_im + acc1_22_im;
4069  o1_30_re = k*o1_30_re + acc1_30_re;
4070  o1_30_im = k*o1_30_im + acc1_30_im;
4071  o1_31_re = k*o1_31_re + acc1_31_re;
4072  o1_31_im = k*o1_31_im + acc1_31_im;
4073  o1_32_re = k*o1_32_re + acc1_32_re;
4074  o1_32_im = k*o1_32_im + acc1_32_im;
4075 
4076  o2_00_re = k*o2_00_re + acc2_00_re;
4077  o2_00_im = k*o2_00_im + acc2_00_im;
4078  o2_01_re = k*o2_01_re + acc2_01_re;
4079  o2_01_im = k*o2_01_im + acc2_01_im;
4080  o2_02_re = k*o2_02_re + acc2_02_re;
4081  o2_02_im = k*o2_02_im + acc2_02_im;
4082  o2_10_re = k*o2_10_re + acc2_10_re;
4083  o2_10_im = k*o2_10_im + acc2_10_im;
4084  o2_11_re = k*o2_11_re + acc2_11_re;
4085  o2_11_im = k*o2_11_im + acc2_11_im;
4086  o2_12_re = k*o2_12_re + acc2_12_re;
4087  o2_12_im = k*o2_12_im + acc2_12_im;
4088  o2_20_re = k*o2_20_re + acc2_20_re;
4089  o2_20_im = k*o2_20_im + acc2_20_im;
4090  o2_21_re = k*o2_21_re + acc2_21_re;
4091  o2_21_im = k*o2_21_im + acc2_21_im;
4092  o2_22_re = k*o2_22_re + acc2_22_re;
4093  o2_22_im = k*o2_22_im + acc2_22_im;
4094  o2_30_re = k*o2_30_re + acc2_30_re;
4095  o2_30_im = k*o2_30_im + acc2_30_im;
4096  o2_31_re = k*o2_31_re + acc2_31_re;
4097  o2_31_im = k*o2_31_im + acc2_31_im;
4098  o2_32_re = k*o2_32_re + acc2_32_re;
4099  o2_32_im = k*o2_32_im + acc2_32_im;
4100  }
4101 
4102 #undef acc1_00_re
4103 #undef acc1_00_im
4104 #undef acc1_01_re
4105 #undef acc1_01_im
4106 #undef acc1_02_re
4107 #undef acc1_02_im
4108 #undef acc1_10_re
4109 #undef acc1_10_im
4110 #undef acc1_11_re
4111 #undef acc1_11_im
4112 #undef acc1_12_re
4113 #undef acc1_12_im
4114 #undef acc1_20_re
4115 #undef acc1_20_im
4116 #undef acc1_21_re
4117 #undef acc1_21_im
4118 #undef acc1_22_re
4119 #undef acc1_22_im
4120 #undef acc1_30_re
4121 #undef acc1_30_im
4122 #undef acc1_31_re
4123 #undef acc1_31_im
4124 #undef acc1_32_re
4125 #undef acc1_32_im
4126 
4127 #undef acc2_00_re
4128 #undef acc2_00_im
4129 #undef acc2_01_re
4130 #undef acc2_01_im
4131 #undef acc2_02_re
4132 #undef acc2_02_im
4133 #undef acc2_10_re
4134 #undef acc2_10_im
4135 #undef acc2_11_re
4136 #undef acc2_11_im
4137 #undef acc2_12_re
4138 #undef acc2_12_im
4139 #undef acc2_20_re
4140 #undef acc2_20_im
4141 #undef acc2_21_re
4142 #undef acc2_21_im
4143 #undef acc2_22_re
4144 #undef acc2_22_im
4145 #undef acc2_30_re
4146 #undef acc2_30_im
4147 #undef acc2_31_re
4148 #undef acc2_31_im
4149 #undef acc2_32_re
4150 #undef acc2_32_im
4151 
4152 #endif//DSLASH_TWIST
4153 
4154 #endif // DSLASH_XPAY
4155 }
4156 
4157 // write spinor field back to device memory
4159 
4160 // undefine to prevent warning when precision is changed
4161 #undef spinorFloat
4162 #undef g00_re
4163 #undef g00_im
4164 #undef g01_re
4165 #undef g01_im
4166 #undef g02_re
4167 #undef g02_im
4168 #undef g10_re
4169 #undef g10_im
4170 #undef g11_re
4171 #undef g11_im
4172 #undef g12_re
4173 #undef g12_im
4174 #undef g20_re
4175 #undef g20_im
4176 #undef g21_re
4177 #undef g21_im
4178 #undef g22_re
4179 #undef g22_im
4180 
4181 #undef i00_re
4182 #undef i00_im
4183 #undef i01_re
4184 #undef i01_im
4185 #undef i02_re
4186 #undef i02_im
4187 #undef i10_re
4188 #undef i10_im
4189 #undef i11_re
4190 #undef i11_im
4191 #undef i12_re
4192 #undef i12_im
4193 #undef i20_re
4194 #undef i20_im
4195 #undef i21_re
4196 #undef i21_im
4197 #undef i22_re
4198 #undef i22_im
4199 #undef i30_re
4200 #undef i30_im
4201 #undef i31_re
4202 #undef i31_im
4203 #undef i32_re
4204 #undef i32_im
4205 
4206 
4207 #undef VOLATILE
RECONSTRUCT_GAUGE_MATRIX(0)
__constant__ int Vh
#define gT21_im
#define gT12_im
#define gT01_im
#define i21_re
VOLATILE spinorFloat o2_01_re
spinorFloat a2_im
#define gT00_re
__constant__ int X2
#define i31_im
spinorFloat B0_re
VOLATILE spinorFloat o1_30_re
spinorFloat b2_re
__constant__ int X2X1mX1
VOLATILE spinorFloat o1_02_im
spinorFloat A0_im
#define i12_re
#define i02_re
spinorFloat b0_re
VOLATILE spinorFloat o1_32_re
VOLATILE spinorFloat o1_22_im
#define gT11_re
__constant__ int X3X2X1mX2X1
#define g21_re
const int dims[]
#define g11_re
VOLATILE spinorFloat o2_32_im
#define i02_im
#define g00_im
int sid
__constant__ int X1
VOLATILE spinorFloat o2_02_im
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define g20_re
#define i22_re
VOLATILE spinorFloat o2_31_re
#define i01_im
#define gT20_re
spinorFloat B0_im
#define i00_re
VOLATILE spinorFloat o1_31_re
VOLATILE spinorFloat o2_32_re
spinorFloat a0_im
__constant__ int X3X2X1
#define i20_re
#define gT01_re
VOLATILE spinorFloat o1_20_re
VOLATILE spinorFloat o1_02_re
#define gT11_im
VOLATILE spinorFloat o2_22_re
spinorFloat b1_im
#define g02_re
o1_00_im *o1_01_re *o1_01_im *o1_02_re *o1_02_im *o1_10_re *o1_10_im *o1_11_re *o1_11_im *o1_12_re *o1_12_im *o1_20_re *o1_20_im *o1_21_re *o1_21_im *o1_22_re *o1_22_im *o1_30_re *o1_30_im *o1_31_re *o1_31_im *o1_32_re *o1_32_im *o2_00_re *o2_00_im *o2_01_re *o2_01_im *o2_02_re *o2_02_im *o2_10_re *o2_10_im *o2_11_re *o2_11_im *o2_12_re *o2_12_im *o2_20_re *o2_20_im *o2_21_re *o2_21_im *o2_22_re *o2_22_im *o2_30_re *o2_30_im *o2_31_re *o2_31_im *o2_32_re *o2_32_im * WRITE_FLAVOR_SPINOR()
VOLATILE spinorFloat o1_10_im
spinorFloat A2_re
VOLATILE spinorFloat o2_30_re
#define FLAVORS
#define gT02_re
int x2
VOLATILE spinorFloat o2_31_im
VOLATILE spinorFloat o1_31_im
#define gT12_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i32_re
#define i20_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
VOLATILE spinorFloat o1_22_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o1_00_re
VOLATILE spinorFloat o1_12_re
#define READ_SPINOR_UP
Definition: covDev.h:130
#define g11_im
#define g02_im
#define g01_im
VOLATILE spinorFloat o2_12_re
int x1
VOLATILE spinorFloat o1_11_re
VOLATILE spinorFloat o2_22_im
VOLATILE spinorFloat o2_00_im
spinorFloat A0_re
#define i22_im
#define gT22_re
VOLATILE spinorFloat o2_21_im
VOLATILE spinorFloat o1_32_im
#define gT21_re
spinorFloat B1_re
#define GAUGE0TEX
Definition: covDev.h:112
#define gT22_im
VOLATILE spinorFloat o1_21_im
#define g12_im
spinorFloat B1_im
VOLATILE spinorFloat o2_10_im
VOLATILE spinorFloat o1_01_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
VOLATILE spinorFloat o2_01_im
VOLATILE spinorFloat o1_01_im
const int ga_idx
#define i11_re
spinorFloat a1_im
__constant__ int X2m1
#define spinorFloat
VOLATILE spinorFloat o1_11_im
VOLATILE spinorFloat o1_12_im
VOLATILE spinorFloat o2_10_re
#define gT02_im
VOLATILE spinorFloat o2_02_re
#define g10_im
#define SPINORTEX
Definition: clover_def.h:40
#define i31_re
#define g00_re
__constant__ int gauge_fixed
#define gT10_im
#define g01_re
#define i30_re
__constant__ int X4X3X2X1mX3X2X1
#define g22_im
#define SPINOR_HOP
Definition: covDev.h:158
spinorFloat A1_im
#define i32_im
VOLATILE spinorFloat o1_20_im
int x3
VOLATILE spinorFloat o2_00_re
spinorFloat b1_re
VOLATILE spinorFloat o2_11_im
__constant__ int ga_stride
#define g22_re
#define gT20_im
#define i12_im
spinorFloat a0_re
VOLATILE spinorFloat o1_10_re
spinorFloat b0_im
VOLATILE spinorFloat o1_30_im
__constant__ int X1m1
#define g10_re
VOLATILE spinorFloat o2_20_im
__constant__ int X3
spinorFloat B2_re
#define g21_im
#define i10_re
#define i10_im
spinorFloat A1_re
VOLATILE spinorFloat o2_30_im
#define i00_im
#define g20_im
VOLATILE spinorFloat o2_12_im
#define gT10_re
#define GAUGE1TEX
Definition: covDev.h:113
spinorFloat A2_im
spinorFloat B2_im
#define i21_im
__constant__ int X4m1
#define gT00_im
#define g12_re
#define READ_SPINOR
Definition: clover_def.h:36
VOLATILE spinorFloat o2_21_re
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define READ_SPINOR_DOWN
Definition: covDev.h:131
VOLATILE spinorFloat o1_21_re
#define i30_im
__constant__ int X4X3X2X1hmX3X2X1h
spinorFloat b2_im
spinorFloat a2_re
#define i01_re
spinorFloat a1_re
#define i11_im
KernelType kernel_type
__constant__ int X4
__constant__ int X3m1
VOLATILE spinorFloat o2_20_re
#define VOLATILE
VOLATILE spinorFloat o2_11_re
__constant__ int X2X1
VOLATILE spinorFloat o1_00_im
int x4