QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_dslash_dagger_gt200_core.h
Go to the documentation of this file.
// *** CUDA DSLASH DAGGER ***

// This kernel stages no spinor components in shared memory.
#define DSLASH_SHARED_FLOATS_PER_THREAD 0


// The NVVM back-end (CUDA >= 4.1 targeting sm_20+) keeps register
// variables live correctly without "volatile"; the older Open64
// back-end needs the qualifier to prevent over-aggressive register
// reuse in this generated code.
#if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
#define VOLATILE
#else // Open64 compiler
#define VOLATILE volatile
#endif
// input spinor
// Map the named components i{spin}{color}_{re,im} of the input spinor
// (4 spins x 3 colors, complex) onto the raw load registers I0..In.
// In double precision each double2 register holds one complex number
// (12 registers); in single precision each float4 register holds two
// complex numbers (6 registers).  The acc* macros give the same layout
// for the accumulator spinor registers accum0..accumN.
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I1.x
#define i01_im I1.y
#define i02_re I2.x
#define i02_im I2.y
#define i10_re I3.x
#define i10_im I3.y
#define i11_re I4.x
#define i11_im I4.y
#define i12_re I5.x
#define i12_im I5.y
#define i20_re I6.x
#define i20_im I6.y
#define i21_re I7.x
#define i21_im I7.y
#define i22_re I8.x
#define i22_im I8.y
#define i30_re I9.x
#define i30_im I9.y
#define i31_re I10.x
#define i31_im I10.y
#define i32_re I11.x
#define i32_im I11.y
#define acc00_re accum0.x
#define acc00_im accum0.y
#define acc01_re accum1.x
#define acc01_im accum1.y
#define acc02_re accum2.x
#define acc02_im accum2.y
#define acc10_re accum3.x
#define acc10_im accum3.y
#define acc11_re accum4.x
#define acc11_im accum4.y
#define acc12_re accum5.x
#define acc12_im accum5.y
#define acc20_re accum6.x
#define acc20_im accum6.y
#define acc21_re accum7.x
#define acc21_im accum7.y
#define acc22_re accum8.x
#define acc22_im accum8.y
#define acc30_re accum9.x
#define acc30_im accum9.y
#define acc31_re accum10.x
#define acc31_im accum10.y
#define acc32_re accum11.x
#define acc32_im accum11.y
#else
// Single precision: two complex numbers packed per float4 register.
#define spinorFloat float
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I0.z
#define i01_im I0.w
#define i02_re I1.x
#define i02_im I1.y
#define i10_re I1.z
#define i10_im I1.w
#define i11_re I2.x
#define i11_im I2.y
#define i12_re I2.z
#define i12_im I2.w
#define i20_re I3.x
#define i20_im I3.y
#define i21_re I3.z
#define i21_im I3.w
#define i22_re I4.x
#define i22_im I4.y
#define i30_re I4.z
#define i30_im I4.w
#define i31_re I5.x
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#define acc00_re accum0.x
#define acc00_im accum0.y
#define acc01_re accum0.z
#define acc01_im accum0.w
#define acc02_re accum1.x
#define acc02_im accum1.y
#define acc10_re accum1.z
#define acc10_im accum1.w
#define acc11_re accum2.x
#define acc11_im accum2.y
#define acc12_re accum2.z
#define acc12_im accum2.w
#define acc20_re accum3.x
#define acc20_im accum3.y
#define acc21_re accum3.z
#define acc21_im accum3.w
#define acc22_re accum4.x
#define acc22_im accum4.y
#define acc30_re accum4.z
#define acc30_im accum4.w
#define acc31_re accum5.x
#define acc31_im accum5.y
#define acc32_re accum5.z
#define acc32_im accum5.w
#endif // SPINOR_DOUBLE
113 
// gauge link
// Map the named components g{row}{col}_{re,im} of the 3x3 SU(3) gauge
// link onto the raw load registers G0..Gn.  With GAUGE_FLOAT2 each
// float2/double2 register holds one complex entry (9 registers);
// otherwise two complex entries are packed per float4 register.
#ifdef GAUGE_FLOAT2
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
#define g01_im G1.y
#define g02_re G2.x
#define g02_im G2.y
#define g10_re G3.x
#define g10_im G3.y
#define g11_re G4.x
#define g11_im G4.y
#define g12_re G5.x
#define g12_im G5.y
#define g20_re G6.x
#define g20_im G6.y
#define g21_re G7.x
#define g21_im G7.y
#define g22_re G8.x
#define g22_im G8.y

#else
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G0.z
#define g01_im G0.w
#define g02_re G1.x
#define g02_im G1.y
#define g10_re G1.z
#define g10_im G1.w
#define g11_re G2.x
#define g11_im G2.y
#define g12_re G2.z
#define g12_im G2.w
#define g20_re G3.x
#define g20_im G3.y
#define g21_re G3.z
#define g21_im G3.w
#define g22_re G4.x
#define g22_im G4.y

#endif // GAUGE_FLOAT2
156 
// conjugated gauge link
// gT{row}{col} = Hermitian conjugate of the link: transpose of the
// g{row}{col} entries with the imaginary parts negated.  Used for the
// backward (negative-direction) hops, which multiply by U^dagger.
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
177 // output spinor
202 
#include "read_gauge.h"
#include "io_spinor.h"

// Lattice coordinates of the site handled by this thread, and its
// full (non-checkerboarded) linear index X.
int x1, x2, x3, x4;
int X;

#if (defined MULTI_GPU) && (DD_PREC==2) // half precision
// Index into the ghost-zone norm array for half-precision spinors.
int sp_norm_idx;
#endif // MULTI_GPU half precision

// Checkerboard (single-parity) site index of this thread.
int sid;

#ifdef MULTI_GPU
// Index of this site within the boundary face (exterior kernels only).
int face_idx;
#endif
219 
 // One thread per checkerboard site; excess threads in the last block exit.
 sid = blockIdx.x*blockDim.x + threadIdx.x;
 if (sid >= param.threads) return;

 // Inline by hand for the moment and assume even dimensions
 const int dims[] = {X1, X2, X3, X4};
 coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);

 // Interior kernel: start the output spinor accumulator from zero;
 // each of the eight hop terms below adds its contribution.
 o00_re = 0; o00_im = 0;
 o01_re = 0; o01_im = 0;
 o02_re = 0; o02_im = 0;
 o10_re = 0; o10_im = 0;
 o11_re = 0; o11_im = 0;
 o12_re = 0; o12_im = 0;
 o20_re = 0; o20_im = 0;
 o21_re = 0; o21_im = 0;
 o22_re = 0; o22_im = 0;
 o30_re = 0; o30_im = 0;
 o31_re = 0; o31_im = 0;
 o32_re = 0; o32_im = 0;
239 
#ifdef MULTI_GPU
// NOTE(review): the matching "if (kernel_type == INTERIOR_KERNEL) {"
// that this "} else" closes lies in a span dropped by extraction
// (generated lines ~177-201); confirm against the generated source.
} else { // exterior kernel

 sid = blockIdx.x*blockDim.x + threadIdx.x;
 if (sid >= param.threads) return;

 // Exterior kernel: threads cover both faces of one dimension; split
 // sid into which face (face_num) and the index within it (face_idx).
 const int dim = static_cast<int>(kernel_type);
 const int face_volume = (param.threads >> 1); // volume of one face
 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
 face_idx = sid - face_num*face_volume; // index into the respective face

 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
 //sp_idx = face_idx + param.ghostOffset[dim];

#if (DD_PREC==2) // half precision
 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
#endif

 const int dims[] = {X1, X2, X3, X4};
 coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);

 // Seed the accumulator with the partial result already written by the
 // interior kernel for this site, then add the face contributions.
 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);

 o00_re = i00_re; o00_im = i00_im;
 o01_re = i01_re; o01_im = i01_im;
 o02_re = i02_re; o02_im = i02_im;
 o10_re = i10_re; o10_im = i10_im;
 o11_re = i11_re; o11_im = i11_im;
 o12_re = i12_re; o12_im = i12_im;
 o20_re = i20_re; o20_im = i20_im;
 o21_re = i21_re; o21_im = i21_im;
 o22_re = i22_re; o22_im = i22_im;
 o30_re = i30_re; o30_im = i30_im;
 o31_re = i31_re; o31_im = i31_im;
 o32_re = i32_re; o32_im = i32_im;
}
#endif // MULTI_GPU
278 
279 
#ifdef MULTI_GPU
// Forward hop in X: interior sites not on the +X boundary, or the +X
// face when running the X exterior kernel.
// NOTE(review): the second clause of this guard (the EXTERIOR_KERNEL_X
// && x1==X1m1 line, generated line 282) appears to have been lost in
// extraction -- compare the complete guards of the other directions.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
#endif
{
 // Projector P0+
 // 1 0 0 i
 // 0 1 i 0
 // 0 -i 1 0
 // -i 0 0 1

 // Neighbor spinor index: +1 hop in X with periodic wrap; >>1 converts
 // the full lattice index to a checkerboard index.  Exterior kernels
 // read from the ghost zone instead.
#ifdef MULTI_GPU
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
#endif

 // Forward hops use the link attached to this site.
 const int ga_idx = sid;

 // NOTE(review): the spinorFloat a0_re..b2_im declarations (generated
 // lines 299-305) were dropped in extraction; confirm against the
 // generated source.

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
#ifdef TWIST_INV_DSLASH
 APPLY_TWIST_INV(-a, b, i);
#endif

 // project spinor into half spinors
 a0_re = +i00_re-i30_im;
 a0_im = +i00_im+i30_re;
 a1_re = +i01_re-i31_im;
 a1_im = +i01_im+i31_re;
 a2_re = +i02_re-i32_im;
 a2_im = +i02_im+i32_re;
 b0_re = +i10_re-i20_im;
 b0_im = +i10_im+i20_re;
 b1_re = +i11_re-i21_im;
 b1_im = +i11_im+i21_re;
 b2_re = +i12_re-i22_im;
 b2_im = +i12_im+i22_re;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory
 // Ghost spinors arrive already spin-projected; the upper-half offset
 // (SPINOR_HOP/2)*sp_stride_pad selects the forward-hop projection.
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory
 READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);

 // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX line (generated line 353)
 // and the "spinorFloat A0_re = 0;"-style declarations (generated lines
 // 356, 363, 370, ...) were dropped in extraction; confirm against the
 // generated source.

 // multiply row 0
 // (A,B) = U * (a,b): complex 3x3 matrix times the two half-spinor
 // color vectors, expanded into real multiply-adds.
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // Reconstruct the full spinor from the half spinor: the lower spin
 // components are the P0+ images of the upper ones (factors of +/-i).
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += B0_im;
 o20_im -= B0_re;
 o30_re += A0_im;
 o30_im -= A0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += B1_im;
 o21_im -= B1_re;
 o31_re += A1_im;
 o31_im -= A1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += B2_im;
 o22_im -= B2_re;
 o32_re += A2_im;
 o32_im -= A2_re;

}
473 
#ifdef MULTI_GPU
// Backward hop in X: interior sites not on the -X boundary, or the -X
// face when running the X exterior kernel.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
 (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
#endif
{
 // Projector P0-
 // 1 0 0 -i
 // 0 1 -i 0
 // 0 i 1 0
 // i 0 0 1

 // Neighbor spinor index: -1 hop in X with periodic wrap (checkerboarded).
#ifdef MULTI_GPU
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
#endif

 // Backward hops use the link attached to the neighbor site; for
 // exterior kernels that link lives in the gauge ghost zone (Vh offset).
#ifdef MULTI_GPU
 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
#else
 const int ga_idx = sp_idx;
#endif

 // NOTE(review): the spinorFloat a0_re..b2_im declarations (generated
 // lines 498-503) were dropped in extraction; confirm against the
 // generated source.

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
#ifdef TWIST_INV_DSLASH
 APPLY_TWIST_INV(-a, b, i);
#endif

 // project spinor into half spinors
 a0_re = +i00_re+i30_im;
 a0_im = +i00_im-i30_re;
 a1_re = +i01_re+i31_im;
 a1_im = +i01_im-i31_re;
 a2_re = +i02_re+i32_im;
 a2_im = +i02_im-i32_re;
 b0_re = +i10_re+i20_im;
 b0_im = +i10_im-i20_re;
 b1_re = +i11_re+i21_im;
 b1_im = +i11_im-i21_re;
 b2_re = +i12_re+i22_im;
 b2_im = +i12_im-i22_re;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory
 // Ghost spinors arrive already spin-projected; no offset selects the
 // backward-hop projection.
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory
 READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);

 // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX line (generated line 551)
 // was dropped in extraction; confirm against the generated source.

 // multiply row 0
 // (A,B) = U^dagger * (a,b): backward hop multiplies by the conjugated
 // link (gT macros).
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // Reconstruct the full spinor: lower spin components are the P0-
 // images of the upper ones (signs opposite to the P0+ block above).
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= B0_im;
 o20_im += B0_re;
 o30_re -= A0_im;
 o30_im += A0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= B1_im;
 o21_im += B1_re;
 o31_re -= A1_im;
 o31_im += A1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= B2_im;
 o22_im += B2_re;
 o32_re -= A2_im;
 o32_im += A2_re;

}
671 
#ifdef MULTI_GPU
// Forward hop in Y: interior sites not on the +Y boundary, or the +Y
// face when running the Y exterior kernel.
// NOTE(review): the second clause of this guard (the EXTERIOR_KERNEL_Y
// && x2==X2m1 line, generated line 674) appears to have been lost in
// extraction -- compare the complete guard of the x2>0 block below.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
#endif
{
 // Projector P1+
 // 1 0 0 1
 // 0 1 -1 0
 // 0 -1 1 0
 // 1 0 0 1

 // Neighbor spinor index: +1 hop in Y (stride X1) with periodic wrap.
#ifdef MULTI_GPU
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
#endif

 // Forward hops use the link attached to this site.
 const int ga_idx = sid;

 // NOTE(review): the spinorFloat a0_re..b2_im declarations (generated
 // lines 692-697) were dropped in extraction; confirm against the
 // generated source.

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
#ifdef TWIST_INV_DSLASH
 APPLY_TWIST_INV(-a, b, i);
#endif

 // project spinor into half spinors
 a0_re = +i00_re+i30_re;
 a0_im = +i00_im+i30_im;
 a1_re = +i01_re+i31_re;
 a1_im = +i01_im+i31_im;
 a2_re = +i02_re+i32_re;
 a2_im = +i02_im+i32_im;
 b0_re = +i10_re-i20_re;
 b0_im = +i10_im-i20_im;
 b1_re = +i11_re-i21_re;
 b1_im = +i11_im-i21_im;
 b2_re = +i12_re-i22_re;
 b2_im = +i12_im-i22_im;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory
 // Upper-half ghost offset selects the forward-hop projection.
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory
 READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);

 // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX line (generated line 745)
 // was dropped in extraction; confirm against the generated source.

 // multiply row 0
 // (A,B) = U * (a,b): forward hop multiplies by the un-conjugated link.
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // Reconstruct the full spinor: lower spin components are the real
 // (+/-1) P1+ images of the upper ones.
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= B0_re;
 o20_im -= B0_im;
 o30_re += A0_re;
 o30_im += A0_im;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= B1_re;
 o21_im -= B1_im;
 o31_re += A1_re;
 o31_im += A1_im;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= B2_re;
 o22_im -= B2_im;
 o32_re += A2_re;
 o32_im += A2_im;

}
865 
#ifdef MULTI_GPU
// Backward hop in Y: interior sites not on the -Y boundary, or the -Y
// face when running the Y exterior kernel.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
 (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
#endif
{
 // Projector P1-
 // 1 0 0 -1
 // 0 1 1 0
 // 0 1 1 0
 // -1 0 0 1

 // Neighbor spinor index: -1 hop in Y (stride X1) with periodic wrap.
#ifdef MULTI_GPU
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
#endif

 // Backward hops use the neighbor's link; ghost-zone link for exterior.
#ifdef MULTI_GPU
 const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
#else
 const int ga_idx = sp_idx;
#endif

 // NOTE(review): the spinorFloat a0_re..b2_im declarations (generated
 // lines 890-895) were dropped in extraction; confirm against the
 // generated source.

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
#ifdef TWIST_INV_DSLASH
 APPLY_TWIST_INV(-a, b, i);
#endif

 // project spinor into half spinors
 a0_re = +i00_re-i30_re;
 a0_im = +i00_im-i30_im;
 a1_re = +i01_re-i31_re;
 a1_im = +i01_im-i31_im;
 a2_re = +i02_re-i32_re;
 a2_im = +i02_im-i32_im;
 b0_re = +i10_re+i20_re;
 b0_im = +i10_im+i20_im;
 b1_re = +i11_re+i21_re;
 b1_im = +i11_im+i21_im;
 b2_re = +i12_re+i22_re;
 b2_im = +i12_im+i22_im;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory
 // No offset: backward-hop projection of the ghost spinor.
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory
 READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);

 // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX line (generated line 943)
 // was dropped in extraction; confirm against the generated source.

 // multiply row 0
 // (A,B) = U^dagger * (a,b): backward hop uses the conjugated link.
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // Reconstruct the full spinor: lower spin components are the real
 // (+/-1) P1- images of the upper ones (signs opposite to P1+ above).
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += B0_re;
 o20_im += B0_im;
 o30_re -= A0_re;
 o30_im -= A0_im;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += B1_re;
 o21_im += B1_im;
 o31_re -= A1_re;
 o31_im -= A1_im;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += B2_re;
 o22_im += B2_im;
 o32_re -= A2_re;
 o32_im -= A2_im;

}
1063 
#ifdef MULTI_GPU
// Forward hop in Z: interior sites not on the +Z boundary, or the +Z
// face when running the Z exterior kernel.
// NOTE(review): the second clause of this guard (the EXTERIOR_KERNEL_Z
// && x3==X3m1 line, generated line 1066) appears to have been lost in
// extraction -- compare the complete guard of the x3>0 block below.
if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
#endif
{
 // Projector P2+
 // 1 0 i 0
 // 0 1 0 -i
 // -i 0 1 0
 // 0 i 0 1

 // Neighbor spinor index: +1 hop in Z (stride X2*X1) with periodic wrap.
#ifdef MULTI_GPU
 const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
 face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
#else
 const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
#endif

 // Forward hops use the link attached to this site.
 const int ga_idx = sid;

 // NOTE(review): the spinorFloat a0_re..b2_im declarations (generated
 // lines 1084-1089) were dropped in extraction; confirm against the
 // generated source.

#ifdef MULTI_GPU
 if (kernel_type == INTERIOR_KERNEL) {
#endif

 // read spinor from device memory
 READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
#ifdef TWIST_INV_DSLASH
 APPLY_TWIST_INV(-a, b, i);
#endif

 // project spinor into half spinors
 a0_re = +i00_re-i20_im;
 a0_im = +i00_im+i20_re;
 a1_re = +i01_re-i21_im;
 a1_im = +i01_im+i21_re;
 a2_re = +i02_re-i22_im;
 a2_im = +i02_im+i22_re;
 b0_re = +i10_re+i30_im;
 b0_im = +i10_im-i30_re;
 b1_re = +i11_re+i31_im;
 b1_im = +i11_im-i31_re;
 b2_re = +i12_re+i32_im;
 b2_im = +i12_im-i32_re;

#ifdef MULTI_GPU
 } else {

 const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];

 // read half spinor from device memory
 // Upper-half ghost offset selects the forward-hop projection.
 READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);

 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }
#endif // MULTI_GPU

 // read gauge matrix from device memory
 READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);

 // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX line (generated line
 // 1137) was dropped in extraction; confirm against the generated
 // source.

 // multiply row 0
 // (A,B) = U * (a,b): forward hop multiplies by the un-conjugated link.
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // Reconstruct the full spinor: lower spin components are the +/-i
 // P2+ images of the upper ones.
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += A0_im;
 o20_im -= A0_re;
 o30_re -= B0_im;
 o30_im += B0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += A1_im;
 o21_im -= A1_re;
 o31_re -= B1_im;
 o31_im += B1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += A2_im;
 o22_im -= A2_re;
 o32_re -= B2_im;
 o32_im += B2_re;

}
1257 
1258 #ifdef MULTI_GPU
1259 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1260  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1261 #endif
1262 {
 // Contribution from the backward Z neighbor: the gathered spinor is
 // projected onto two color-triplet half spinors (a, b) with the P2-
 // projector shown below, multiplied by the gT?? link entries
 // (presumably the conjugate transpose of the Z-link -- the gT macros
 // are defined elsewhere in this file; TODO confirm), and accumulated
 // into the output registers o??_re / o??_im.
1263  // Projector P2-
1264  // 1 0 -i 0
1265  // 0 1 0 i
1266  // i 0 1 0
1267  // 0 -i 0 1
1268 
 // Neighbor-site (checkerboard) index: x3==0 wraps around to the far Z
 // boundary; in the multi-GPU exterior kernel the data instead comes
 // from this dimension's ghost-zone buffer at ghostOffset.
1269 #ifdef MULTI_GPU
1270  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1271  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1272 #else
1273  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1274 #endif
1275 
 // Gauge-link index: a backward hop uses the link stored at the
 // neighbor site; halo links live past Vh, indexed by face_idx.
1276 #ifdef MULTI_GPU
1277  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1278 #else
1279  const int ga_idx = sp_idx;
1280 #endif
1281 
1288 
1289 #ifdef MULTI_GPU
1290  if (kernel_type == INTERIOR_KERNEL) {
1291 #endif
1292 
1293  // read spinor from device memory
1294  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1295 #ifdef TWIST_INV_DSLASH
 // Fold the inverse twist into the gathered spinor before projecting
 // (APPLY_TWIST_INV is a macro defined elsewhere, see io_spinor.h).
1296  APPLY_TWIST_INV(-a, b, i);
1297 #endif
1298 
 // P2- projection (complex arithmetic spelled out per component):
 // a = psi_0 - i*psi_2, b = psi_1 + i*psi_3.
1299  // project spinor into half spinors
1300  a0_re = +i00_re+i20_im;
1301  a0_im = +i00_im-i20_re;
1302  a1_re = +i01_re+i21_im;
1303  a1_im = +i01_im-i21_re;
1304  a2_re = +i02_re+i22_im;
1305  a2_im = +i02_im-i22_re;
1306  b0_re = +i10_re-i30_im;
1307  b0_im = +i10_im+i30_re;
1308  b1_re = +i11_re-i31_im;
1309  b1_im = +i11_im+i31_re;
1310  b2_re = +i12_re-i32_im;
1311  b2_im = +i12_im+i32_re;
1312 
1313 #ifdef MULTI_GPU
1314  } else {
1315 
1316  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1317 
 // Exterior kernel: the projection was already performed when the face
 // was packed on the neighboring rank, so the ghost half spinor is
 // loaded and used as-is (no projection arithmetic here).
1318  // read half spinor from device memory
1319  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1320 
1321  a0_re = i00_re; a0_im = i00_im;
1322  a1_re = i01_re; a1_im = i01_im;
1323  a2_re = i02_re; a2_im = i02_im;
1324  b0_re = i10_re; b0_im = i10_im;
1325  b1_re = i11_re; b1_im = i11_im;
1326  b2_re = i12_re; b2_im = i12_im;
1327 
1328  }
1329 #endif // MULTI_GPU
1330 
1331  // read gauge matrix from device memory
1332  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1333 
1334  // reconstruct gauge matrix
1336 
 // Fully unrolled complex 3x3 matrix-vector products, A = gT * a and
 // B = gT * b, one row at a time with separate real/imaginary
 // accumulation (generated code; statement order is deliberate).
1337  // multiply row 0
1338  spinorFloat A0_re = 0;
1339  A0_re += gT00_re * a0_re;
1340  A0_re -= gT00_im * a0_im;
1341  A0_re += gT01_re * a1_re;
1342  A0_re -= gT01_im * a1_im;
1343  A0_re += gT02_re * a2_re;
1344  A0_re -= gT02_im * a2_im;
1345  spinorFloat A0_im = 0;
1346  A0_im += gT00_re * a0_im;
1347  A0_im += gT00_im * a0_re;
1348  A0_im += gT01_re * a1_im;
1349  A0_im += gT01_im * a1_re;
1350  A0_im += gT02_re * a2_im;
1351  A0_im += gT02_im * a2_re;
1352  spinorFloat B0_re = 0;
1353  B0_re += gT00_re * b0_re;
1354  B0_re -= gT00_im * b0_im;
1355  B0_re += gT01_re * b1_re;
1356  B0_re -= gT01_im * b1_im;
1357  B0_re += gT02_re * b2_re;
1358  B0_re -= gT02_im * b2_im;
1359  spinorFloat B0_im = 0;
1360  B0_im += gT00_re * b0_im;
1361  B0_im += gT00_im * b0_re;
1362  B0_im += gT01_re * b1_im;
1363  B0_im += gT01_im * b1_re;
1364  B0_im += gT02_re * b2_im;
1365  B0_im += gT02_im * b2_re;
1366 
1367  // multiply row 1
1368  spinorFloat A1_re = 0;
1369  A1_re += gT10_re * a0_re;
1370  A1_re -= gT10_im * a0_im;
1371  A1_re += gT11_re * a1_re;
1372  A1_re -= gT11_im * a1_im;
1373  A1_re += gT12_re * a2_re;
1374  A1_re -= gT12_im * a2_im;
1375  spinorFloat A1_im = 0;
1376  A1_im += gT10_re * a0_im;
1377  A1_im += gT10_im * a0_re;
1378  A1_im += gT11_re * a1_im;
1379  A1_im += gT11_im * a1_re;
1380  A1_im += gT12_re * a2_im;
1381  A1_im += gT12_im * a2_re;
1382  spinorFloat B1_re = 0;
1383  B1_re += gT10_re * b0_re;
1384  B1_re -= gT10_im * b0_im;
1385  B1_re += gT11_re * b1_re;
1386  B1_re -= gT11_im * b1_im;
1387  B1_re += gT12_re * b2_re;
1388  B1_re -= gT12_im * b2_im;
1389  spinorFloat B1_im = 0;
1390  B1_im += gT10_re * b0_im;
1391  B1_im += gT10_im * b0_re;
1392  B1_im += gT11_re * b1_im;
1393  B1_im += gT11_im * b1_re;
1394  B1_im += gT12_re * b2_im;
1395  B1_im += gT12_im * b2_re;
1396 
1397  // multiply row 2
1398  spinorFloat A2_re = 0;
1399  A2_re += gT20_re * a0_re;
1400  A2_re -= gT20_im * a0_im;
1401  A2_re += gT21_re * a1_re;
1402  A2_re -= gT21_im * a1_im;
1403  A2_re += gT22_re * a2_re;
1404  A2_re -= gT22_im * a2_im;
1405  spinorFloat A2_im = 0;
1406  A2_im += gT20_re * a0_im;
1407  A2_im += gT20_im * a0_re;
1408  A2_im += gT21_re * a1_im;
1409  A2_im += gT21_im * a1_re;
1410  A2_im += gT22_re * a2_im;
1411  A2_im += gT22_im * a2_re;
1412  spinorFloat B2_re = 0;
1413  B2_re += gT20_re * b0_re;
1414  B2_re -= gT20_im * b0_im;
1415  B2_re += gT21_re * b1_re;
1416  B2_re -= gT21_im * b1_im;
1417  B2_re += gT22_re * b2_re;
1418  B2_re -= gT22_im * b2_im;
1419  spinorFloat B2_im = 0;
1420  B2_im += gT20_re * b0_im;
1421  B2_im += gT20_im * b0_re;
1422  B2_im += gT21_re * b1_im;
1423  B2_im += gT21_im * b1_re;
1424  B2_im += gT22_re * b2_im;
1425  B2_im += gT22_im * b2_re;
1426 
 // Reconstruct and accumulate the full spinor.  Matching the P2-
 // matrix above, spin components 2 and 3 receive +i*A and -i*B:
 // (re -= im, im += re) encodes multiplication by +i, and
 // (re += im, im -= re) encodes multiplication by -i.
1427  o00_re += A0_re;
1428  o00_im += A0_im;
1429  o10_re += B0_re;
1430  o10_im += B0_im;
1431  o20_re -= A0_im;
1432  o20_im += A0_re;
1433  o30_re += B0_im;
1434  o30_im -= B0_re;
1435 
1436  o01_re += A1_re;
1437  o01_im += A1_im;
1438  o11_re += B1_re;
1439  o11_im += B1_im;
1440  o21_re -= A1_im;
1441  o21_im += A1_re;
1442  o31_re += B1_im;
1443  o31_im -= B1_re;
1444 
1445  o02_re += A2_re;
1446  o02_im += A2_im;
1447  o12_re += B2_re;
1448  o12_im += B2_im;
1449  o22_re -= A2_im;
1450  o22_im += A2_re;
1451  o32_re += B2_im;
1452  o32_im -= B2_re;
1453 
1454 }
1455 
1456 #ifdef MULTI_GPU
1457 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1459 #endif
1460 {
1461  // Projector P3+
1462  // 2 0 0 0
1463  // 0 2 0 0
1464  // 0 0 0 0
1465  // 0 0 0 0
1466 
1467 #ifdef MULTI_GPU
1468  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1469  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1470 #else
1471  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1472 #endif
1473 
1474  const int ga_idx = sid;
1475 
1477  {
1484 
1485 #ifdef MULTI_GPU
1486  if (kernel_type == INTERIOR_KERNEL) {
1487 #endif
1488 
1489  // read spinor from device memory
1490 #ifndef TWIST_INV_DSLASH
1491  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1492 #else
1493  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1494  APPLY_TWIST_INV(-a, b, i);
1495 #endif
1496 
1497  // project spinor into half spinors
1498  a0_re = +2*i00_re;
1499  a0_im = +2*i00_im;
1500  a1_re = +2*i01_re;
1501  a1_im = +2*i01_im;
1502  a2_re = +2*i02_re;
1503  a2_im = +2*i02_im;
1504  b0_re = +2*i10_re;
1505  b0_im = +2*i10_im;
1506  b1_re = +2*i11_re;
1507  b1_im = +2*i11_im;
1508  b2_re = +2*i12_re;
1509  b2_im = +2*i12_im;
1510 
1511 #ifdef MULTI_GPU
1512  } else {
1513 
1514  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1515  //const int t_proj_scale = TPROJSCALE;
1516 
1517  // read half spinor from device memory
1518  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1519 
1520 #ifdef TWIST_INV_DSLASH
1521  a0_re = i00_re; a0_im = i00_im;
1522  a1_re = i01_re; a1_im = i01_im;
1523  a2_re = i02_re; a2_im = i02_im;
1524  b0_re = i10_re; b0_im = i10_im;
1525  b1_re = i11_re; b1_im = i11_im;
1526  b2_re = i12_re; b2_im = i12_im;
1527 #else
1528  a0_re = 2*i00_re; a0_im = 2*i00_im;
1529  a1_re = 2*i01_re; a1_im = 2*i01_im;
1530  a2_re = 2*i02_re; a2_im = 2*i02_im;
1531  b0_re = 2*i10_re; b0_im = 2*i10_im;
1532  b1_re = 2*i11_re; b1_im = 2*i11_im;
1533  b2_re = 2*i12_re; b2_im = 2*i12_im;
1534 #endif
1535 
1536  }
1537 #endif // MULTI_GPU
1538 
1539  // identity gauge matrix
1546 
1547  o00_re += A0_re;
1548  o00_im += A0_im;
1549  o10_re += B0_re;
1550  o10_im += B0_im;
1551 
1552  o01_re += A1_re;
1553  o01_im += A1_im;
1554  o11_re += B1_re;
1555  o11_im += B1_im;
1556 
1557  o02_re += A2_re;
1558  o02_im += A2_im;
1559  o12_re += B2_re;
1560  o12_im += B2_im;
1561 
1562  } else {
1569 
1570 #ifdef MULTI_GPU
1571  if (kernel_type == INTERIOR_KERNEL) {
1572 #endif
1573 
1574  // read spinor from device memory
1575 #ifndef TWIST_INV_DSLASH
1576  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1577 #else
1578  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1579  APPLY_TWIST_INV(-a, b, i);
1580 #endif
1581 
1582  // project spinor into half spinors
1583  a0_re = +2*i00_re;
1584  a0_im = +2*i00_im;
1585  a1_re = +2*i01_re;
1586  a1_im = +2*i01_im;
1587  a2_re = +2*i02_re;
1588  a2_im = +2*i02_im;
1589  b0_re = +2*i10_re;
1590  b0_im = +2*i10_im;
1591  b1_re = +2*i11_re;
1592  b1_im = +2*i11_im;
1593  b2_re = +2*i12_re;
1594  b2_im = +2*i12_im;
1595 
1596 #ifdef MULTI_GPU
1597  } else {
1598 
1599  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1600  //const int t_proj_scale = TPROJSCALE;
1601 
1602  // read half spinor from device memory
1603  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1604 
1605 #ifdef TWIST_INV_DSLASH
1606  a0_re = i00_re; a0_im = i00_im;
1607  a1_re = i01_re; a1_im = i01_im;
1608  a2_re = i02_re; a2_im = i02_im;
1609  b0_re = i10_re; b0_im = i10_im;
1610  b1_re = i11_re; b1_im = i11_im;
1611  b2_re = i12_re; b2_im = i12_im;
1612 #else
1613  a0_re = 2*i00_re; a0_im = 2*i00_im;
1614  a1_re = 2*i01_re; a1_im = 2*i01_im;
1615  a2_re = 2*i02_re; a2_im = 2*i02_im;
1616  b0_re = 2*i10_re; b0_im = 2*i10_im;
1617  b1_re = 2*i11_re; b1_im = 2*i11_im;
1618  b2_re = 2*i12_re; b2_im = 2*i12_im;
1619 #endif
1620 
1621  }
1622 #endif // MULTI_GPU
1623 
1624  // read gauge matrix from device memory
1625  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1626 
1627  // reconstruct gauge matrix
1629 
1630  // multiply row 0
1631  spinorFloat A0_re = 0;
1632  A0_re += g00_re * a0_re;
1633  A0_re -= g00_im * a0_im;
1634  A0_re += g01_re * a1_re;
1635  A0_re -= g01_im * a1_im;
1636  A0_re += g02_re * a2_re;
1637  A0_re -= g02_im * a2_im;
1638  spinorFloat A0_im = 0;
1639  A0_im += g00_re * a0_im;
1640  A0_im += g00_im * a0_re;
1641  A0_im += g01_re * a1_im;
1642  A0_im += g01_im * a1_re;
1643  A0_im += g02_re * a2_im;
1644  A0_im += g02_im * a2_re;
1645  spinorFloat B0_re = 0;
1646  B0_re += g00_re * b0_re;
1647  B0_re -= g00_im * b0_im;
1648  B0_re += g01_re * b1_re;
1649  B0_re -= g01_im * b1_im;
1650  B0_re += g02_re * b2_re;
1651  B0_re -= g02_im * b2_im;
1652  spinorFloat B0_im = 0;
1653  B0_im += g00_re * b0_im;
1654  B0_im += g00_im * b0_re;
1655  B0_im += g01_re * b1_im;
1656  B0_im += g01_im * b1_re;
1657  B0_im += g02_re * b2_im;
1658  B0_im += g02_im * b2_re;
1659 
1660  // multiply row 1
1661  spinorFloat A1_re = 0;
1662  A1_re += g10_re * a0_re;
1663  A1_re -= g10_im * a0_im;
1664  A1_re += g11_re * a1_re;
1665  A1_re -= g11_im * a1_im;
1666  A1_re += g12_re * a2_re;
1667  A1_re -= g12_im * a2_im;
1668  spinorFloat A1_im = 0;
1669  A1_im += g10_re * a0_im;
1670  A1_im += g10_im * a0_re;
1671  A1_im += g11_re * a1_im;
1672  A1_im += g11_im * a1_re;
1673  A1_im += g12_re * a2_im;
1674  A1_im += g12_im * a2_re;
1675  spinorFloat B1_re = 0;
1676  B1_re += g10_re * b0_re;
1677  B1_re -= g10_im * b0_im;
1678  B1_re += g11_re * b1_re;
1679  B1_re -= g11_im * b1_im;
1680  B1_re += g12_re * b2_re;
1681  B1_re -= g12_im * b2_im;
1682  spinorFloat B1_im = 0;
1683  B1_im += g10_re * b0_im;
1684  B1_im += g10_im * b0_re;
1685  B1_im += g11_re * b1_im;
1686  B1_im += g11_im * b1_re;
1687  B1_im += g12_re * b2_im;
1688  B1_im += g12_im * b2_re;
1689 
1690  // multiply row 2
1691  spinorFloat A2_re = 0;
1692  A2_re += g20_re * a0_re;
1693  A2_re -= g20_im * a0_im;
1694  A2_re += g21_re * a1_re;
1695  A2_re -= g21_im * a1_im;
1696  A2_re += g22_re * a2_re;
1697  A2_re -= g22_im * a2_im;
1698  spinorFloat A2_im = 0;
1699  A2_im += g20_re * a0_im;
1700  A2_im += g20_im * a0_re;
1701  A2_im += g21_re * a1_im;
1702  A2_im += g21_im * a1_re;
1703  A2_im += g22_re * a2_im;
1704  A2_im += g22_im * a2_re;
1705  spinorFloat B2_re = 0;
1706  B2_re += g20_re * b0_re;
1707  B2_re -= g20_im * b0_im;
1708  B2_re += g21_re * b1_re;
1709  B2_re -= g21_im * b1_im;
1710  B2_re += g22_re * b2_re;
1711  B2_re -= g22_im * b2_im;
1712  spinorFloat B2_im = 0;
1713  B2_im += g20_re * b0_im;
1714  B2_im += g20_im * b0_re;
1715  B2_im += g21_re * b1_im;
1716  B2_im += g21_im * b1_re;
1717  B2_im += g22_re * b2_im;
1718  B2_im += g22_im * b2_re;
1719 
1720  o00_re += A0_re;
1721  o00_im += A0_im;
1722  o10_re += B0_re;
1723  o10_im += B0_im;
1724 
1725  o01_re += A1_re;
1726  o01_im += A1_im;
1727  o11_re += B1_re;
1728  o11_im += B1_im;
1729 
1730  o02_re += A2_re;
1731  o02_im += A2_im;
1732  o12_re += B2_re;
1733  o12_im += B2_im;
1734 
1735  }
1736 }
1737 
1738 #ifdef MULTI_GPU
1739 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1740  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1741 #endif
1742 {
1743  // Projector P3-
1744  // 0 0 0 0
1745  // 0 0 0 0
1746  // 0 0 2 0
1747  // 0 0 0 2
1748 
1749 #ifdef MULTI_GPU
1750  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1751  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1752 #else
1753  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1754 #endif
1755 
1756 #ifdef MULTI_GPU
1757  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1758 #else
1759  const int ga_idx = sp_idx;
1760 #endif
1761 
1762  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1763  {
1770 
1771 #ifdef MULTI_GPU
1772  if (kernel_type == INTERIOR_KERNEL) {
1773 #endif
1774 
1775  // read spinor from device memory
1776 #ifndef TWIST_INV_DSLASH
1777  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1778 #else
1779  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1780  APPLY_TWIST_INV(-a, b, i);
1781 #endif
1782 
1783  // project spinor into half spinors
1784  a0_re = +2*i20_re;
1785  a0_im = +2*i20_im;
1786  a1_re = +2*i21_re;
1787  a1_im = +2*i21_im;
1788  a2_re = +2*i22_re;
1789  a2_im = +2*i22_im;
1790  b0_re = +2*i30_re;
1791  b0_im = +2*i30_im;
1792  b1_re = +2*i31_re;
1793  b1_im = +2*i31_im;
1794  b2_re = +2*i32_re;
1795  b2_im = +2*i32_im;
1796 
1797 #ifdef MULTI_GPU
1798  } else {
1799 
1800  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1801  //const int t_proj_scale = TPROJSCALE;
1802 
1803  // read half spinor from device memory
1804  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1805 
1806 #ifdef TWIST_INV_DSLASH
1807  a0_re = i00_re; a0_im = i00_im;
1808  a1_re = i01_re; a1_im = i01_im;
1809  a2_re = i02_re; a2_im = i02_im;
1810  b0_re = i10_re; b0_im = i10_im;
1811  b1_re = i11_re; b1_im = i11_im;
1812  b2_re = i12_re; b2_im = i12_im;
1813 #else
1814  a0_re = 2*i00_re; a0_im = 2*i00_im;
1815  a1_re = 2*i01_re; a1_im = 2*i01_im;
1816  a2_re = 2*i02_re; a2_im = 2*i02_im;
1817  b0_re = 2*i10_re; b0_im = 2*i10_im;
1818  b1_re = 2*i11_re; b1_im = 2*i11_im;
1819  b2_re = 2*i12_re; b2_im = 2*i12_im;
1820 #endif
1821 
1822  }
1823 #endif // MULTI_GPU
1824 
1825  // identity gauge matrix
1832 
1833  o20_re += A0_re;
1834  o20_im += A0_im;
1835  o30_re += B0_re;
1836  o30_im += B0_im;
1837 
1838  o21_re += A1_re;
1839  o21_im += A1_im;
1840  o31_re += B1_re;
1841  o31_im += B1_im;
1842 
1843  o22_re += A2_re;
1844  o22_im += A2_im;
1845  o32_re += B2_re;
1846  o32_im += B2_im;
1847 
1848  } else {
1855 
1856 #ifdef MULTI_GPU
1857  if (kernel_type == INTERIOR_KERNEL) {
1858 #endif
1859 
1860  // read spinor from device memory
1861 #ifndef TWIST_INV_DSLASH
1862  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1863 #else
1864  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1865  APPLY_TWIST_INV(-a, b, i);
1866 #endif
1867 
1868  // project spinor into half spinors
1869  a0_re = +2*i20_re;
1870  a0_im = +2*i20_im;
1871  a1_re = +2*i21_re;
1872  a1_im = +2*i21_im;
1873  a2_re = +2*i22_re;
1874  a2_im = +2*i22_im;
1875  b0_re = +2*i30_re;
1876  b0_im = +2*i30_im;
1877  b1_re = +2*i31_re;
1878  b1_im = +2*i31_im;
1879  b2_re = +2*i32_re;
1880  b2_im = +2*i32_im;
1881 
1882 #ifdef MULTI_GPU
1883  } else {
1884 
1885  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1886  //const int t_proj_scale = TPROJSCALE;
1887 
1888  // read half spinor from device memory
1889  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1890 
1891 #ifdef TWIST_INV_DSLASH
1892  a0_re = i00_re; a0_im = i00_im;
1893  a1_re = i01_re; a1_im = i01_im;
1894  a2_re = i02_re; a2_im = i02_im;
1895  b0_re = i10_re; b0_im = i10_im;
1896  b1_re = i11_re; b1_im = i11_im;
1897  b2_re = i12_re; b2_im = i12_im;
1898 #else
1899  a0_re = 2*i00_re; a0_im = 2*i00_im;
1900  a1_re = 2*i01_re; a1_im = 2*i01_im;
1901  a2_re = 2*i02_re; a2_im = 2*i02_im;
1902  b0_re = 2*i10_re; b0_im = 2*i10_im;
1903  b1_re = 2*i11_re; b1_im = 2*i11_im;
1904  b2_re = 2*i12_re; b2_im = 2*i12_im;
1905 #endif
1906 
1907  }
1908 #endif // MULTI_GPU
1909 
1910  // read gauge matrix from device memory
1911  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1912 
1913  // reconstruct gauge matrix
1915 
1916  // multiply row 0
1917  spinorFloat A0_re = 0;
1918  A0_re += gT00_re * a0_re;
1919  A0_re -= gT00_im * a0_im;
1920  A0_re += gT01_re * a1_re;
1921  A0_re -= gT01_im * a1_im;
1922  A0_re += gT02_re * a2_re;
1923  A0_re -= gT02_im * a2_im;
1924  spinorFloat A0_im = 0;
1925  A0_im += gT00_re * a0_im;
1926  A0_im += gT00_im * a0_re;
1927  A0_im += gT01_re * a1_im;
1928  A0_im += gT01_im * a1_re;
1929  A0_im += gT02_re * a2_im;
1930  A0_im += gT02_im * a2_re;
1931  spinorFloat B0_re = 0;
1932  B0_re += gT00_re * b0_re;
1933  B0_re -= gT00_im * b0_im;
1934  B0_re += gT01_re * b1_re;
1935  B0_re -= gT01_im * b1_im;
1936  B0_re += gT02_re * b2_re;
1937  B0_re -= gT02_im * b2_im;
1938  spinorFloat B0_im = 0;
1939  B0_im += gT00_re * b0_im;
1940  B0_im += gT00_im * b0_re;
1941  B0_im += gT01_re * b1_im;
1942  B0_im += gT01_im * b1_re;
1943  B0_im += gT02_re * b2_im;
1944  B0_im += gT02_im * b2_re;
1945 
1946  // multiply row 1
1947  spinorFloat A1_re = 0;
1948  A1_re += gT10_re * a0_re;
1949  A1_re -= gT10_im * a0_im;
1950  A1_re += gT11_re * a1_re;
1951  A1_re -= gT11_im * a1_im;
1952  A1_re += gT12_re * a2_re;
1953  A1_re -= gT12_im * a2_im;
1954  spinorFloat A1_im = 0;
1955  A1_im += gT10_re * a0_im;
1956  A1_im += gT10_im * a0_re;
1957  A1_im += gT11_re * a1_im;
1958  A1_im += gT11_im * a1_re;
1959  A1_im += gT12_re * a2_im;
1960  A1_im += gT12_im * a2_re;
1961  spinorFloat B1_re = 0;
1962  B1_re += gT10_re * b0_re;
1963  B1_re -= gT10_im * b0_im;
1964  B1_re += gT11_re * b1_re;
1965  B1_re -= gT11_im * b1_im;
1966  B1_re += gT12_re * b2_re;
1967  B1_re -= gT12_im * b2_im;
1968  spinorFloat B1_im = 0;
1969  B1_im += gT10_re * b0_im;
1970  B1_im += gT10_im * b0_re;
1971  B1_im += gT11_re * b1_im;
1972  B1_im += gT11_im * b1_re;
1973  B1_im += gT12_re * b2_im;
1974  B1_im += gT12_im * b2_re;
1975 
1976  // multiply row 2
1977  spinorFloat A2_re = 0;
1978  A2_re += gT20_re * a0_re;
1979  A2_re -= gT20_im * a0_im;
1980  A2_re += gT21_re * a1_re;
1981  A2_re -= gT21_im * a1_im;
1982  A2_re += gT22_re * a2_re;
1983  A2_re -= gT22_im * a2_im;
1984  spinorFloat A2_im = 0;
1985  A2_im += gT20_re * a0_im;
1986  A2_im += gT20_im * a0_re;
1987  A2_im += gT21_re * a1_im;
1988  A2_im += gT21_im * a1_re;
1989  A2_im += gT22_re * a2_im;
1990  A2_im += gT22_im * a2_re;
1991  spinorFloat B2_re = 0;
1992  B2_re += gT20_re * b0_re;
1993  B2_re -= gT20_im * b0_im;
1994  B2_re += gT21_re * b1_re;
1995  B2_re -= gT21_im * b1_im;
1996  B2_re += gT22_re * b2_re;
1997  B2_re -= gT22_im * b2_im;
1998  spinorFloat B2_im = 0;
1999  B2_im += gT20_re * b0_im;
2000  B2_im += gT20_im * b0_re;
2001  B2_im += gT21_re * b1_im;
2002  B2_im += gT21_im * b1_re;
2003  B2_im += gT22_re * b2_im;
2004  B2_im += gT22_im * b2_re;
2005 
2006  o20_re += A0_re;
2007  o20_im += A0_im;
2008  o30_re += B0_re;
2009  o30_im += B0_im;
2010 
2011  o21_re += A1_re;
2012  o21_im += A1_im;
2013  o31_re += B1_re;
2014  o31_im += B1_im;
2015 
2016  o22_re += A2_re;
2017  o22_im += A2_im;
2018  o32_re += B2_re;
2019  o32_im += B2_im;
2020 
2021  }
2022 }
2023 
2024 #ifdef MULTI_GPU
2025 
 // Completeness check: a boundary site may only be finalized and written
 // once every directional contribution (interior plus any exterior halo
 // kernels for partitioned dimensions) has been accumulated into the
 // o??_re / o??_im registers.
2026 int incomplete = 0; // Have all 8 contributions been computed for this site?
2027 
 // Intentional switch fall-through: each kernel type also tests every
 // dimension handled by a later exterior kernel, so INTERIOR_KERNEL
 // checks T,Z,Y,X boundaries, EXTERIOR_KERNEL_T checks Z,Y,X, and so on.
 // A dimension only counts when it is actually partitioned (commDim).
2028 switch(kernel_type) { // intentional fall-through
2029 case INTERIOR_KERNEL:
2030  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2031 case EXTERIOR_KERNEL_T:
2032  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2033 case EXTERIOR_KERNEL_Z:
2034  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2035 case EXTERIOR_KERNEL_Y:
2036  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2037 }
2038 
2039 if (!incomplete)
2040 #endif // MULTI_GPU
2041 {
 // Finalization: apply the (inverse) twist and/or the xpay accumulation
 // to the completed output spinor, depending on compile-time variant.
2042 #ifdef DSLASH_XPAY
 // Load the accumulator spinor (the "x" of xpay) into the acc?? registers.
2043  READ_ACCUM(ACCUMTEX, param.sp_stride)
2044 
2045 #ifndef TWIST_XPAY
2046 #ifndef TWIST_INV_DSLASH
 // The inverse twist was not folded in per-direction, so apply it to the
 // whole output spinor here before accumulating (macro defined elsewhere).
2047  //perform invert twist first:
2048  APPLY_TWIST_INV(-a, b, o);
2049 #endif
 // Plain xpay: o += acc, component by component.
2050  o00_re += acc00_re;
2051  o00_im += acc00_im;
2052  o01_re += acc01_re;
2053  o01_im += acc01_im;
2054  o02_re += acc02_re;
2055  o02_im += acc02_im;
2056  o10_re += acc10_re;
2057  o10_im += acc10_im;
2058  o11_re += acc11_re;
2059  o11_im += acc11_im;
2060  o12_re += acc12_re;
2061  o12_im += acc12_im;
2062  o20_re += acc20_re;
2063  o20_im += acc20_im;
2064  o21_re += acc21_re;
2065  o21_im += acc21_im;
2066  o22_re += acc22_re;
2067  o22_im += acc22_im;
2068  o30_re += acc30_re;
2069  o30_im += acc30_im;
2070  o31_re += acc31_re;
2071  o31_im += acc31_im;
2072  o32_re += acc32_re;
2073  o32_im += acc32_im;
2074 #else
 // Twisted xpay: twist the accumulator, then o = b*o + acc, where b is
 // the scale factor of this variant (NOT the twisted-mass parameter --
 // see the generated warning below).
2075  APPLY_TWIST(-a, acc);
2076  //warning! b is unrelated to the twisted mass parameter in this case!
2077 
2078  o00_re = b*o00_re+acc00_re;
2079  o00_im = b*o00_im+acc00_im;
2080  o01_re = b*o01_re+acc01_re;
2081  o01_im = b*o01_im+acc01_im;
2082  o02_re = b*o02_re+acc02_re;
2083  o02_im = b*o02_im+acc02_im;
2084  o10_re = b*o10_re+acc10_re;
2085  o10_im = b*o10_im+acc10_im;
2086  o11_re = b*o11_re+acc11_re;
2087  o11_im = b*o11_im+acc11_im;
2088  o12_re = b*o12_re+acc12_re;
2089  o12_im = b*o12_im+acc12_im;
2090  o20_re = b*o20_re+acc20_re;
2091  o20_im = b*o20_im+acc20_im;
2092  o21_re = b*o21_re+acc21_re;
2093  o21_im = b*o21_im+acc21_im;
2094  o22_re = b*o22_re+acc22_re;
2095  o22_im = b*o22_im+acc22_im;
2096  o30_re = b*o30_re+acc30_re;
2097  o30_im = b*o30_im+acc30_im;
2098  o31_re = b*o31_re+acc31_re;
2099  o31_im = b*o31_im+acc31_im;
2100  o32_re = b*o32_re+acc32_re;
2101  o32_im = b*o32_im+acc32_im;
2102 #endif//TWIST_XPAY
2103 #else //no XPAY
2104 #ifndef TWIST_INV_DSLASH
 // No xpay: only the whole-spinor inverse twist remains to be applied
 // (skipped when TWIST_INV_DSLASH already applied it per-direction).
2105  APPLY_TWIST_INV(-a, b, o);
2106 #endif
2107 #endif
2108 }
2109 
2110 // write spinor field back to device memory
2111 WRITE_SPINOR(param.sp_stride);
2112 
2113 // undefine to prevent warning when precision is changed
2114 #undef spinorFloat
 // gauge-link component aliases (g??_re / g??_im)
2115 #undef g00_re
2116 #undef g00_im
2117 #undef g01_re
2118 #undef g01_im
2119 #undef g02_re
2120 #undef g02_im
2121 #undef g10_re
2122 #undef g10_im
2123 #undef g11_re
2124 #undef g11_im
2125 #undef g12_re
2126 #undef g12_im
2127 #undef g20_re
2128 #undef g20_im
2129 #undef g21_re
2130 #undef g21_im
2131 #undef g22_re
2132 #undef g22_im
2133 
 // input-spinor component aliases (i??_re / i??_im)
2134 #undef i00_re
2135 #undef i00_im
2136 #undef i01_re
2137 #undef i01_im
2138 #undef i02_re
2139 #undef i02_im
2140 #undef i10_re
2141 #undef i10_im
2142 #undef i11_re
2143 #undef i11_im
2144 #undef i12_re
2145 #undef i12_im
2146 #undef i20_re
2147 #undef i20_im
2148 #undef i21_re
2149 #undef i21_im
2150 #undef i22_re
2151 #undef i22_im
2152 #undef i30_re
2153 #undef i30_im
2154 #undef i31_re
2155 #undef i31_im
2156 #undef i32_re
2157 #undef i32_im
2158 
 // accumulator-spinor component aliases (acc??_re / acc??_im)
2159 #undef acc00_re
2160 #undef acc00_im
2161 #undef acc01_re
2162 #undef acc01_im
2163 #undef acc02_re
2164 #undef acc02_im
2165 #undef acc10_re
2166 #undef acc10_im
2167 #undef acc11_re
2168 #undef acc11_im
2169 #undef acc12_re
2170 #undef acc12_im
2171 #undef acc20_re
2172 #undef acc20_im
2173 #undef acc21_re
2174 #undef acc21_im
2175 #undef acc22_re
2176 #undef acc22_im
2177 #undef acc30_re
2178 #undef acc30_im
2179 #undef acc31_re
2180 #undef acc31_im
2181 #undef acc32_re
2182 #undef acc32_im
2183 
2184 
2185 
2186 #undef VOLATILE
#define g21_im
VOLATILE spinorFloat o32_re
#define g01_re
#define g11_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
__constant__ int Vh
#define acc02_re
#define gT01_re
#define acc12_im
#define gT12_re
__constant__ int X2
spinorFloat B1_re
#define gT00_re
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
#define i12_re
spinorFloat b0_re
#define i01_re
RECONSTRUCT_GAUGE_MATRIX(0)
#define i30_im
spinorFloat A2_im
#define acc02_im
spinorFloat A2_re
__constant__ int X2X1mX1
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o30_re
spinorFloat B1_im
#define g01_im
#define g12_re
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o12_im
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
#define i01_im
__constant__ int X3X2X1mX2X1
#define acc01_im
#define acc22_re
__constant__ int X1
#define acc22_im
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
spinorFloat b1_re
#define i10_re
#define gT02_re
VOLATILE spinorFloat o21_im
#define acc11_re
__constant__ int X3X2X1
#define acc30_im
#define g20_re
#define i31_im
#define g20_im
spinorFloat b1_im
#define i21_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o02_im
#define acc32_im
#define acc20_im
WRITE_SPINOR(param.sp_stride)
#define i12_im
#define gT22_im
#define gT00_im
#define gT12_im
const int dims[]
QudaGaugeParam param
Definition: pack_test.cpp:17
#define g02_re
#define acc12_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define i11_im
#define g10_re
#define i20_re
spinorFloat a0_re
VOLATILE spinorFloat o01_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat B0_im
#define i22_re
#define acc10_im
#define gT21_re
spinorFloat A1_re
#define GAUGE0TEX
Definition: covDev.h:112
spinorFloat b0_im
VOLATILE spinorFloat o00_im
#define acc00_re
#define acc31_re
#define g22_re
#define acc20_re
VOLATILE spinorFloat o31_re
#define acc01_re
#define gT11_im
#define i32_im
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
spinorFloat B0_re
__constant__ int X2m1
#define gT11_re
VOLATILE spinorFloat o00_re
#define i10_im
#define SPINORTEX
Definition: clover_def.h:40
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o11_im
__constant__ int gauge_fixed
#define gT21_im
#define i22_im
spinorFloat b2_im
#define gT01_im
#define gT22_re
spinorFloat a0_im
__constant__ int X4X3X2X1mX3X2X1
#define g10_im
VOLATILE spinorFloat o12_re
#define SPINOR_HOP
Definition: covDev.h:158
#define gT20_re
VOLATILE spinorFloat o11_re
#define i11_re
#define g00_re
spinorFloat A1_im
VOLATILE spinorFloat o02_re
__constant__ int ga_stride
#define g22_im
spinorFloat a2_im
#define i00_re
VOLATILE spinorFloat o10_re
spinorFloat B2_re
#define acc21_im
spinorFloat b2_re
#define gT10_im
VOLATILE spinorFloat o32_im
__constant__ int X1m1
#define gT02_im
#define g12_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o21_re
__constant__ int X3
#define i00_im
#define spinorFloat
#define acc31_im
#define g00_im
#define acc30_re
VOLATILE spinorFloat o01_im
#define i31_re
#define GAUGE1TEX
Definition: covDev.h:113
const int ga_idx
VOLATILE spinorFloat o22_im
#define i32_re
#define acc32_re
#define g02_im
VOLATILE spinorFloat o20_re
__constant__ int X4m1
spinorFloat A0_re
spinorFloat a1_im
#define g21_re
spinorFloat a1_re
#define acc21_re
#define acc00_im
#define VOLATILE
spinorFloat A0_im
spinorFloat a2_re
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define i21_im
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
#define gT10_re
#define acc10_re
#define i30_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i02_im
VOLATILE spinorFloat o30_im
KernelType kernel_type
#define i20_im
#define gT20_im
#define g11_re
__constant__ int X4
__constant__ int X3m1
#define acc11_im
spinorFloat B2_im
#define i02_re
__constant__ int X2X1