QUDA  0.9.0
tm_dslash_dagger_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
// NOTE(review): this is generated code that is #include'd into the body of a
// dslash kernel elsewhere; identifiers such as param, kernel_type, I0.., G0..,
// accum0.., o00_re.. and the READ_* macros come from the including file.
2 
// This kernel variant stages nothing in shared memory (0 floats per thread).
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// With the NVVM back end (CUDA >= 4.1 targeting sm_20+) 'volatile' is not
// needed to suppress harmful register optimizations; the older Open64 back
// end requires it, hence the conditional definition.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Accessor macros aliasing the spinor registers filled by READ_SPINOR.
// i<S><C>_re/_im is the real/imaginary part of spin component S (0..3),
// color C (0..2); acc<S><C>_* aliases the accumulator spinor (accum*
// registers) with the identical layout.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
// Double precision: 12 double2 registers I0..I11, one complex number each.
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
// Single precision: 6 float4 registers I0..I5, two complex numbers each.
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Accessor macros for the 3x3 complex gauge link loaded by READ_GAUGE_MATRIX.
// g<row><col>_re/_im address the matrix element; the register layout depends
// on whether the gauge field is stored as float2/double2 pairs (GAUGE_FLOAT2)
// or packed into float4 registers.
115 #ifdef GAUGE_FLOAT2
// Two-component registers: G0..G8, one complex matrix element per register.
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
// float4 registers: G0..G4, two complex matrix elements per register.
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// gT is the conjugate transpose (dagger) of g: gT[i][j] = conj(g[j][i]).
// Used below for the negative-direction hops, which multiply by U^dagger.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // output spinor
// NOTE(review): original lines 178-201 are missing from this listing (the
// documentation extractor dropped them); judging by the o00_re.. writes
// below they presumably declare the output-spinor registers — confirm
// against the repository source.
202 
203 #include "read_gauge.h"
204 #include "io_spinor.h"
205 
// Per-thread site bookkeeping: coord[] holds the lattice coordinates of the
// site this thread updates, X its full-lattice linear index, sid the thread's
// checkerboarded site index.
206 int coord[5];
207 int X;
208 
209 int sid;
210 
211 #ifdef MULTI_GPU
212 int face_idx;
// Interior kernel: update a bulk site, starting the output at zero.
213 if (kernel_type == INTERIOR_KERNEL) {
214 #endif
215 
216  sid = blockIdx.x*blockDim.x + threadIdx.x;
217  if (sid >= param.threads) return;
218 
219  // Assume even dimensions
// NOTE(review): original line 220 is missing from this listing — presumably
// the coordinate computation from sid; confirm against the repository source.
221 
222  o00_re = 0; o00_im = 0;
223  o01_re = 0; o01_im = 0;
224  o02_re = 0; o02_im = 0;
225  o10_re = 0; o10_im = 0;
226  o11_re = 0; o11_im = 0;
227  o12_re = 0; o12_im = 0;
228  o20_re = 0; o20_im = 0;
229  o21_re = 0; o21_im = 0;
230  o22_re = 0; o22_im = 0;
231  o30_re = 0; o30_im = 0;
232  o31_re = 0; o31_im = 0;
233  o32_re = 0; o32_im = 0;
234 
235 #ifdef MULTI_GPU
// Exterior kernel: each thread handles one face site and folds the boundary
// contribution into the partial result already computed by the interior pass.
236 } else { // exterior kernel
237 
238  sid = blockIdx.x*blockDim.x + threadIdx.x;
239  if (sid >= param.threads) return;
240 
241  const int face_volume = (param.threads >> 1); // volume of one face
242  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
243  face_idx = sid - face_num*face_volume; // index into the respective face
244 
245  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
246  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
247  //sp_idx = face_idx + param.ghostOffset[dim];
248 
249  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
250 
// NOTE(review): original lines 250-251 are missing from this listing —
// presumably the load that fills the i* registers read below; confirm
// against the repository source.
252 
// Seed the output with the interior kernel's partial result for this site.
253  o00_re = i00_re; o00_im = i00_im;
254  o01_re = i01_re; o01_im = i01_im;
255  o02_re = i02_re; o02_im = i02_im;
256  o10_re = i10_re; o10_im = i10_im;
257  o11_re = i11_re; o11_im = i11_im;
258  o12_re = i12_re; o12_im = i12_im;
259  o20_re = i20_re; o20_im = i20_im;
260  o21_re = i21_re; o21_im = i21_im;
261  o22_re = i22_re; o22_im = i22_im;
262  o30_re = i30_re; o30_im = i30_im;
263  o31_re = i31_re; o31_im = i31_im;
264  o32_re = i32_re; o32_im = i32_im;
265 }
266 #endif // MULTI_GPU
267 
268 
// +X hop: project the forward-neighbor spinor onto two color half-vectors
// (a*, b*) with the spin projector shown below, multiply each by the gauge
// link U_x at this site, and accumulate the result into the output spinor.
// In MULTI_GPU builds the interior kernel skips the hop when the neighbor
// lives on another rank; the EXTERIOR_KERNEL_X pass picks it up from the
// ghost zone instead.
269 #ifdef MULTI_GPU
270 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
271  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
272 #endif
273 {
274  // Projector P0+
275  // 1 0 0 i
276  // 0 1 i 0
277  // 0 -i 1 0
278  // -i 0 0 1
279 
// Neighbor index: wrap in x when at the boundary; >>1 converts the
// full-lattice index to a checkerboard index. Exterior reads come from the
// ghost buffer at ghostOffset (index [1] = forward face).
280 #ifdef MULTI_GPU
281  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
282  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
283 #if (DD_PREC==2) // half precision
284  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
285 #endif
286 #else
287  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
288 #endif
289 
290  const int ga_idx = sid;
291 
// NOTE(review): original lines 292-297 are missing from this listing —
// presumably the declarations of the half-spinor registers a0..b2 written
// below; confirm against the repository source.
298 
299 #ifdef MULTI_GPU
300  if (kernel_type == INTERIOR_KERNEL) {
301 #endif
302 
303  // read spinor from device memory
304 #ifdef TWIST_INV_DSLASH
305 #ifdef SPINOR_DOUBLE
306  const spinorFloat a = param.a;
307  const spinorFloat b = param.b;
308 #else
309  const spinorFloat a = param.a_f;
310  const spinorFloat b = param.b_f;
311 #endif
312 #endif
313  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
314 #ifdef TWIST_INV_DSLASH
315  APPLY_TWIST_INV(-a, b, i);
316 #endif
317 
318  // project spinor into half spinors
319  a0_re = +i00_re-i30_im;
320  a0_im = +i00_im+i30_re;
321  a1_re = +i01_re-i31_im;
322  a1_im = +i01_im+i31_re;
323  a2_re = +i02_re-i32_im;
324  a2_im = +i02_im+i32_re;
325  b0_re = +i10_re-i20_im;
326  b0_im = +i10_im+i20_re;
327  b1_re = +i11_re-i21_im;
328  b1_im = +i11_im+i21_re;
329  b2_re = +i12_re-i22_im;
330  b2_im = +i12_im+i22_re;
331 
332 #ifdef MULTI_GPU
// Exterior: the neighbor spinor arrives already spin-projected, so just
// copy the two ghost half-spinor rows into a* and b*.
333  } else {
334 
335  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
336 
337  // read half spinor from device memory
338  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
339 
340  a0_re = i00_re; a0_im = i00_im;
341  a1_re = i01_re; a1_im = i01_im;
342  a2_re = i02_re; a2_im = i02_im;
343  b0_re = i10_re; b0_im = i10_im;
344  b1_re = i11_re; b1_im = i11_im;
345  b2_re = i12_re; b2_im = i12_im;
346 
347  }
348 #endif // MULTI_GPU
349 
350  // read gauge matrix from device memory
351  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
352 
353  // reconstruct gauge matrix
// NOTE(review): original line 354 (the gauge reconstruction macro) and the
// 'spinorFloat A0_re = 0;'-style declarations at lines 357, 364, 371, 378,
// 387, 394, 401, 408, 417, 424, 431 and 438 are missing from this listing;
// compare the complete P0- block below. Confirm against the repository
// source.
355 
// Complex 3x3 matrix-vector products: A = U * a, B = U * b.
356  // multiply row 0
358  A0_re += g00_re * a0_re;
359  A0_re -= g00_im * a0_im;
360  A0_re += g01_re * a1_re;
361  A0_re -= g01_im * a1_im;
362  A0_re += g02_re * a2_re;
363  A0_re -= g02_im * a2_im;
365  A0_im += g00_re * a0_im;
366  A0_im += g00_im * a0_re;
367  A0_im += g01_re * a1_im;
368  A0_im += g01_im * a1_re;
369  A0_im += g02_re * a2_im;
370  A0_im += g02_im * a2_re;
372  B0_re += g00_re * b0_re;
373  B0_re -= g00_im * b0_im;
374  B0_re += g01_re * b1_re;
375  B0_re -= g01_im * b1_im;
376  B0_re += g02_re * b2_re;
377  B0_re -= g02_im * b2_im;
379  B0_im += g00_re * b0_im;
380  B0_im += g00_im * b0_re;
381  B0_im += g01_re * b1_im;
382  B0_im += g01_im * b1_re;
383  B0_im += g02_re * b2_im;
384  B0_im += g02_im * b2_re;
385 
386  // multiply row 1
388  A1_re += g10_re * a0_re;
389  A1_re -= g10_im * a0_im;
390  A1_re += g11_re * a1_re;
391  A1_re -= g11_im * a1_im;
392  A1_re += g12_re * a2_re;
393  A1_re -= g12_im * a2_im;
395  A1_im += g10_re * a0_im;
396  A1_im += g10_im * a0_re;
397  A1_im += g11_re * a1_im;
398  A1_im += g11_im * a1_re;
399  A1_im += g12_re * a2_im;
400  A1_im += g12_im * a2_re;
402  B1_re += g10_re * b0_re;
403  B1_re -= g10_im * b0_im;
404  B1_re += g11_re * b1_re;
405  B1_re -= g11_im * b1_im;
406  B1_re += g12_re * b2_re;
407  B1_re -= g12_im * b2_im;
409  B1_im += g10_re * b0_im;
410  B1_im += g10_im * b0_re;
411  B1_im += g11_re * b1_im;
412  B1_im += g11_im * b1_re;
413  B1_im += g12_re * b2_im;
414  B1_im += g12_im * b2_re;
415 
416  // multiply row 2
418  A2_re += g20_re * a0_re;
419  A2_re -= g20_im * a0_im;
420  A2_re += g21_re * a1_re;
421  A2_re -= g21_im * a1_im;
422  A2_re += g22_re * a2_re;
423  A2_re -= g22_im * a2_im;
425  A2_im += g20_re * a0_im;
426  A2_im += g20_im * a0_re;
427  A2_im += g21_re * a1_im;
428  A2_im += g21_im * a1_re;
429  A2_im += g22_re * a2_im;
430  A2_im += g22_im * a2_re;
432  B2_re += g20_re * b0_re;
433  B2_re -= g20_im * b0_im;
434  B2_re += g21_re * b1_re;
435  B2_re -= g21_im * b1_im;
436  B2_re += g22_re * b2_re;
437  B2_re -= g22_im * b2_im;
439  B2_im += g20_re * b0_im;
440  B2_im += g20_im * b0_re;
441  B2_im += g21_re * b1_im;
442  B2_im += g21_im * b1_re;
443  B2_im += g22_re * b2_im;
444  B2_im += g22_im * b2_re;
445 
// Reconstruct the full spinor from the projected halves and accumulate:
// spin rows 2 and 3 receive -/+i times the A/B halves per the P0+ matrix.
446  o00_re += A0_re;
447  o00_im += A0_im;
448  o10_re += B0_re;
449  o10_im += B0_im;
450  o20_re += B0_im;
451  o20_im -= B0_re;
452  o30_re += A0_im;
453  o30_im -= A0_re;
454 
455  o01_re += A1_re;
456  o01_im += A1_im;
457  o11_re += B1_re;
458  o11_im += B1_im;
459  o21_re += B1_im;
460  o21_im -= B1_re;
461  o31_re += A1_im;
462  o31_im -= A1_re;
463 
464  o02_re += A2_re;
465  o02_im += A2_im;
466  o12_re += B2_re;
467  o12_im += B2_im;
468  o22_re += B2_im;
469  o22_im -= B2_re;
470  o32_re += A2_im;
471  o32_im -= A2_re;
472 
473 }
474 
// -X hop: same structure as the +X hop, but the backward neighbor is
// gathered, the projector is P0-, and the link is the backward link applied
// as U^dagger (the gT* conjugate-transpose macros).
475 #ifdef MULTI_GPU
476 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
477  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
478 #endif
479 {
480  // Projector P0-
481  // 1 0 0 -i
482  // 0 1 -i 0
483  // 0 i 1 0
484  // i 0 0 1
485 
// Backward-neighbor index (wrap at x==0); ghost index [0] = backward face.
486 #ifdef MULTI_GPU
487  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
488  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
489 #if (DD_PREC==2) // half precision
490  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
491 #endif
492 #else
493  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
494 #endif
495 
// The backward link is stored at the neighbor site; for the exterior pass it
// comes from the exchanged link buffer at offset Vh+face_idx.
496 #ifdef MULTI_GPU
497  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
498 #else
499  const int ga_idx = sp_idx;
500 #endif
501 
// NOTE(review): original lines 502-507 are missing from this listing —
// presumably the declarations of the half-spinor registers a0..b2 written
// below; confirm against the repository source.
508 
509 #ifdef MULTI_GPU
510  if (kernel_type == INTERIOR_KERNEL) {
511 #endif
512 
513  // read spinor from device memory
514 #ifdef TWIST_INV_DSLASH
515 #ifdef SPINOR_DOUBLE
516  const spinorFloat a = param.a;
517  const spinorFloat b = param.b;
518 #else
519  const spinorFloat a = param.a_f;
520  const spinorFloat b = param.b_f;
521 #endif
522 #endif
523  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
524 #ifdef TWIST_INV_DSLASH
525  APPLY_TWIST_INV(-a, b, i);
526 #endif
527 
528  // project spinor into half spinors
529  a0_re = +i00_re+i30_im;
530  a0_im = +i00_im-i30_re;
531  a1_re = +i01_re+i31_im;
532  a1_im = +i01_im-i31_re;
533  a2_re = +i02_re+i32_im;
534  a2_im = +i02_im-i32_re;
535  b0_re = +i10_re+i20_im;
536  b0_im = +i10_im-i20_re;
537  b1_re = +i11_re+i21_im;
538  b1_im = +i11_im-i21_re;
539  b2_re = +i12_re+i22_im;
540  b2_im = +i12_im-i22_re;
541 
542 #ifdef MULTI_GPU
// Exterior: ghost data is already spin-projected; copy it straight in.
543  } else {
544 
545  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
546 
547  // read half spinor from device memory
548  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
549 
550  a0_re = i00_re; a0_im = i00_im;
551  a1_re = i01_re; a1_im = i01_im;
552  a2_re = i02_re; a2_im = i02_im;
553  b0_re = i10_re; b0_im = i10_im;
554  b1_re = i11_re; b1_im = i11_im;
555  b2_re = i12_re; b2_im = i12_im;
556 
557  }
558 #endif // MULTI_GPU
559 
560  // read gauge matrix from device memory
561  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
562 
563  // reconstruct gauge matrix
// NOTE(review): original line 564 (the gauge reconstruction macro) is
// missing from this listing; confirm against the repository source.
565 
// Complex 3x3 products with the daggered link: A = U^dag * a, B = U^dag * b.
566  // multiply row 0
567  spinorFloat A0_re = 0;
568  A0_re += gT00_re * a0_re;
569  A0_re -= gT00_im * a0_im;
570  A0_re += gT01_re * a1_re;
571  A0_re -= gT01_im * a1_im;
572  A0_re += gT02_re * a2_re;
573  A0_re -= gT02_im * a2_im;
574  spinorFloat A0_im = 0;
575  A0_im += gT00_re * a0_im;
576  A0_im += gT00_im * a0_re;
577  A0_im += gT01_re * a1_im;
578  A0_im += gT01_im * a1_re;
579  A0_im += gT02_re * a2_im;
580  A0_im += gT02_im * a2_re;
581  spinorFloat B0_re = 0;
582  B0_re += gT00_re * b0_re;
583  B0_re -= gT00_im * b0_im;
584  B0_re += gT01_re * b1_re;
585  B0_re -= gT01_im * b1_im;
586  B0_re += gT02_re * b2_re;
587  B0_re -= gT02_im * b2_im;
588  spinorFloat B0_im = 0;
589  B0_im += gT00_re * b0_im;
590  B0_im += gT00_im * b0_re;
591  B0_im += gT01_re * b1_im;
592  B0_im += gT01_im * b1_re;
593  B0_im += gT02_re * b2_im;
594  B0_im += gT02_im * b2_re;
595 
596  // multiply row 1
597  spinorFloat A1_re = 0;
598  A1_re += gT10_re * a0_re;
599  A1_re -= gT10_im * a0_im;
600  A1_re += gT11_re * a1_re;
601  A1_re -= gT11_im * a1_im;
602  A1_re += gT12_re * a2_re;
603  A1_re -= gT12_im * a2_im;
604  spinorFloat A1_im = 0;
605  A1_im += gT10_re * a0_im;
606  A1_im += gT10_im * a0_re;
607  A1_im += gT11_re * a1_im;
608  A1_im += gT11_im * a1_re;
609  A1_im += gT12_re * a2_im;
610  A1_im += gT12_im * a2_re;
611  spinorFloat B1_re = 0;
612  B1_re += gT10_re * b0_re;
613  B1_re -= gT10_im * b0_im;
614  B1_re += gT11_re * b1_re;
615  B1_re -= gT11_im * b1_im;
616  B1_re += gT12_re * b2_re;
617  B1_re -= gT12_im * b2_im;
618  spinorFloat B1_im = 0;
619  B1_im += gT10_re * b0_im;
620  B1_im += gT10_im * b0_re;
621  B1_im += gT11_re * b1_im;
622  B1_im += gT11_im * b1_re;
623  B1_im += gT12_re * b2_im;
624  B1_im += gT12_im * b2_re;
625 
626  // multiply row 2
627  spinorFloat A2_re = 0;
628  A2_re += gT20_re * a0_re;
629  A2_re -= gT20_im * a0_im;
630  A2_re += gT21_re * a1_re;
631  A2_re -= gT21_im * a1_im;
632  A2_re += gT22_re * a2_re;
633  A2_re -= gT22_im * a2_im;
634  spinorFloat A2_im = 0;
635  A2_im += gT20_re * a0_im;
636  A2_im += gT20_im * a0_re;
637  A2_im += gT21_re * a1_im;
638  A2_im += gT21_im * a1_re;
639  A2_im += gT22_re * a2_im;
640  A2_im += gT22_im * a2_re;
641  spinorFloat B2_re = 0;
642  B2_re += gT20_re * b0_re;
643  B2_re -= gT20_im * b0_im;
644  B2_re += gT21_re * b1_re;
645  B2_re -= gT21_im * b1_im;
646  B2_re += gT22_re * b2_re;
647  B2_re -= gT22_im * b2_im;
648  spinorFloat B2_im = 0;
649  B2_im += gT20_re * b0_im;
650  B2_im += gT20_im * b0_re;
651  B2_im += gT21_re * b1_im;
652  B2_im += gT21_im * b1_re;
653  B2_im += gT22_re * b2_im;
654  B2_im += gT22_im * b2_re;
655 
// Reconstruct per the P0- matrix: rows 2/3 get +/-i times the B/A halves
// (signs opposite to the P0+ block above).
656  o00_re += A0_re;
657  o00_im += A0_im;
658  o10_re += B0_re;
659  o10_im += B0_im;
660  o20_re -= B0_im;
661  o20_im += B0_re;
662  o30_re -= A0_im;
663  o30_im += A0_re;
664 
665  o01_re += A1_re;
666  o01_im += A1_im;
667  o11_re += B1_re;
668  o11_im += B1_im;
669  o21_re -= B1_im;
670  o21_im += B1_re;
671  o31_re -= A1_im;
672  o31_im += A1_re;
673 
674  o02_re += A2_re;
675  o02_im += A2_im;
676  o12_re += B2_re;
677  o12_im += B2_im;
678  o22_re -= B2_im;
679  o22_im += B2_re;
680  o32_re -= A2_im;
681  o32_im += A2_re;
682 
683 }
684 
// +Y hop: forward neighbor in y, projector P1+, forward link U_y applied
// directly (g* macros). Structure parallels the +X hop above.
685 #ifdef MULTI_GPU
686 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
687  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
688 #endif
689 {
690  // Projector P1+
691  // 1 0 0 1
692  // 0 1 -1 0
693  // 0 -1 1 0
694  // 1 0 0 1
695 
// Forward y-neighbor: stride is X[0] sites; wrap uses X2X1mX1 = X2*X1 - X1.
696 #ifdef MULTI_GPU
697  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
698  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
699 #if (DD_PREC==2) // half precision
700  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
701 #endif
702 #else
703  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
704 #endif
705 
706  const int ga_idx = sid;
707 
// NOTE(review): original lines 708-713 are missing from this listing —
// presumably the declarations of the half-spinor registers a0..b2 written
// below; confirm against the repository source.
714 
715 #ifdef MULTI_GPU
716  if (kernel_type == INTERIOR_KERNEL) {
717 #endif
718 
719  // read spinor from device memory
720 #ifdef TWIST_INV_DSLASH
721 #ifdef SPINOR_DOUBLE
722  const spinorFloat a = param.a;
723  const spinorFloat b = param.b;
724 #else
725  const spinorFloat a = param.a_f;
726  const spinorFloat b = param.b_f;
727 #endif
728 #endif
729  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
730 #ifdef TWIST_INV_DSLASH
731  APPLY_TWIST_INV(-a, b, i);
732 #endif
733 
734  // project spinor into half spinors
735  a0_re = +i00_re+i30_re;
736  a0_im = +i00_im+i30_im;
737  a1_re = +i01_re+i31_re;
738  a1_im = +i01_im+i31_im;
739  a2_re = +i02_re+i32_re;
740  a2_im = +i02_im+i32_im;
741  b0_re = +i10_re-i20_re;
742  b0_im = +i10_im-i20_im;
743  b1_re = +i11_re-i21_re;
744  b1_im = +i11_im-i21_im;
745  b2_re = +i12_re-i22_re;
746  b2_im = +i12_im-i22_im;
747 
748 #ifdef MULTI_GPU
// Exterior: ghost data is already spin-projected; copy it straight in.
749  } else {
750 
751  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
752 
753  // read half spinor from device memory
754  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
755 
756  a0_re = i00_re; a0_im = i00_im;
757  a1_re = i01_re; a1_im = i01_im;
758  a2_re = i02_re; a2_im = i02_im;
759  b0_re = i10_re; b0_im = i10_im;
760  b1_re = i11_re; b1_im = i11_im;
761  b2_re = i12_re; b2_im = i12_im;
762 
763  }
764 #endif // MULTI_GPU
765 
766  // read gauge matrix from device memory
767  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
768 
769  // reconstruct gauge matrix
// NOTE(review): original line 770 (the gauge reconstruction macro) is
// missing from this listing; confirm against the repository source.
771 
// Complex 3x3 matrix-vector products: A = U * a, B = U * b.
772  // multiply row 0
773  spinorFloat A0_re = 0;
774  A0_re += g00_re * a0_re;
775  A0_re -= g00_im * a0_im;
776  A0_re += g01_re * a1_re;
777  A0_re -= g01_im * a1_im;
778  A0_re += g02_re * a2_re;
779  A0_re -= g02_im * a2_im;
780  spinorFloat A0_im = 0;
781  A0_im += g00_re * a0_im;
782  A0_im += g00_im * a0_re;
783  A0_im += g01_re * a1_im;
784  A0_im += g01_im * a1_re;
785  A0_im += g02_re * a2_im;
786  A0_im += g02_im * a2_re;
787  spinorFloat B0_re = 0;
788  B0_re += g00_re * b0_re;
789  B0_re -= g00_im * b0_im;
790  B0_re += g01_re * b1_re;
791  B0_re -= g01_im * b1_im;
792  B0_re += g02_re * b2_re;
793  B0_re -= g02_im * b2_im;
794  spinorFloat B0_im = 0;
795  B0_im += g00_re * b0_im;
796  B0_im += g00_im * b0_re;
797  B0_im += g01_re * b1_im;
798  B0_im += g01_im * b1_re;
799  B0_im += g02_re * b2_im;
800  B0_im += g02_im * b2_re;
801 
802  // multiply row 1
803  spinorFloat A1_re = 0;
804  A1_re += g10_re * a0_re;
805  A1_re -= g10_im * a0_im;
806  A1_re += g11_re * a1_re;
807  A1_re -= g11_im * a1_im;
808  A1_re += g12_re * a2_re;
809  A1_re -= g12_im * a2_im;
810  spinorFloat A1_im = 0;
811  A1_im += g10_re * a0_im;
812  A1_im += g10_im * a0_re;
813  A1_im += g11_re * a1_im;
814  A1_im += g11_im * a1_re;
815  A1_im += g12_re * a2_im;
816  A1_im += g12_im * a2_re;
817  spinorFloat B1_re = 0;
818  B1_re += g10_re * b0_re;
819  B1_re -= g10_im * b0_im;
820  B1_re += g11_re * b1_re;
821  B1_re -= g11_im * b1_im;
822  B1_re += g12_re * b2_re;
823  B1_re -= g12_im * b2_im;
824  spinorFloat B1_im = 0;
825  B1_im += g10_re * b0_im;
826  B1_im += g10_im * b0_re;
827  B1_im += g11_re * b1_im;
828  B1_im += g11_im * b1_re;
829  B1_im += g12_re * b2_im;
830  B1_im += g12_im * b2_re;
831 
832  // multiply row 2
833  spinorFloat A2_re = 0;
834  A2_re += g20_re * a0_re;
835  A2_re -= g20_im * a0_im;
836  A2_re += g21_re * a1_re;
837  A2_re -= g21_im * a1_im;
838  A2_re += g22_re * a2_re;
839  A2_re -= g22_im * a2_im;
840  spinorFloat A2_im = 0;
841  A2_im += g20_re * a0_im;
842  A2_im += g20_im * a0_re;
843  A2_im += g21_re * a1_im;
844  A2_im += g21_im * a1_re;
845  A2_im += g22_re * a2_im;
846  A2_im += g22_im * a2_re;
847  spinorFloat B2_re = 0;
848  B2_re += g20_re * b0_re;
849  B2_re -= g20_im * b0_im;
850  B2_re += g21_re * b1_re;
851  B2_re -= g21_im * b1_im;
852  B2_re += g22_re * b2_re;
853  B2_re -= g22_im * b2_im;
854  spinorFloat B2_im = 0;
855  B2_im += g20_re * b0_im;
856  B2_im += g20_im * b0_re;
857  B2_im += g21_re * b1_im;
858  B2_im += g21_im * b1_re;
859  B2_im += g22_re * b2_im;
860  B2_im += g22_im * b2_re;
861 
// Reconstruct per the P1+ matrix (real +/-1 couplings, no factors of i).
862  o00_re += A0_re;
863  o00_im += A0_im;
864  o10_re += B0_re;
865  o10_im += B0_im;
866  o20_re -= B0_re;
867  o20_im -= B0_im;
868  o30_re += A0_re;
869  o30_im += A0_im;
870 
871  o01_re += A1_re;
872  o01_im += A1_im;
873  o11_re += B1_re;
874  o11_im += B1_im;
875  o21_re -= B1_re;
876  o21_im -= B1_im;
877  o31_re += A1_re;
878  o31_im += A1_im;
879 
880  o02_re += A2_re;
881  o02_im += A2_im;
882  o12_re += B2_re;
883  o12_im += B2_im;
884  o22_re -= B2_re;
885  o22_im -= B2_im;
886  o32_re += A2_re;
887  o32_im += A2_im;
888 
889 }
890 
// -Y hop: backward neighbor in y, projector P1-, backward link applied as
// U^dagger (gT* macros). Structure parallels the -X hop above.
891 #ifdef MULTI_GPU
892 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
893  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
894 #endif
895 {
896  // Projector P1-
897  // 1 0 0 -1
898  // 0 1 1 0
899  // 0 1 1 0
900  // -1 0 0 1
901 
// Backward y-neighbor (wrap at y==0); ghost index [0] = backward face.
902 #ifdef MULTI_GPU
903  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
904  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
905 #if (DD_PREC==2) // half precision
906  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
907 #endif
908 #else
909  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
910 #endif
911 
// Backward link lives at the neighbor site; exterior reads it from the
// exchanged link buffer at offset Vh+face_idx.
912 #ifdef MULTI_GPU
913  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
914 #else
915  const int ga_idx = sp_idx;
916 #endif
917 
// NOTE(review): original lines 918-923 are missing from this listing —
// presumably the declarations of the half-spinor registers a0..b2 written
// below; confirm against the repository source.
924 
925 #ifdef MULTI_GPU
926  if (kernel_type == INTERIOR_KERNEL) {
927 #endif
928 
929  // read spinor from device memory
930 #ifdef TWIST_INV_DSLASH
931 #ifdef SPINOR_DOUBLE
932  const spinorFloat a = param.a;
933  const spinorFloat b = param.b;
934 #else
935  const spinorFloat a = param.a_f;
936  const spinorFloat b = param.b_f;
937 #endif
938 #endif
939  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
940 #ifdef TWIST_INV_DSLASH
941  APPLY_TWIST_INV(-a, b, i);
942 #endif
943 
944  // project spinor into half spinors
945  a0_re = +i00_re-i30_re;
946  a0_im = +i00_im-i30_im;
947  a1_re = +i01_re-i31_re;
948  a1_im = +i01_im-i31_im;
949  a2_re = +i02_re-i32_re;
950  a2_im = +i02_im-i32_im;
951  b0_re = +i10_re+i20_re;
952  b0_im = +i10_im+i20_im;
953  b1_re = +i11_re+i21_re;
954  b1_im = +i11_im+i21_im;
955  b2_re = +i12_re+i22_re;
956  b2_im = +i12_im+i22_im;
957 
958 #ifdef MULTI_GPU
// Exterior: ghost data is already spin-projected; copy it straight in.
959  } else {
960 
961  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
962 
963  // read half spinor from device memory
964  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
965 
966  a0_re = i00_re; a0_im = i00_im;
967  a1_re = i01_re; a1_im = i01_im;
968  a2_re = i02_re; a2_im = i02_im;
969  b0_re = i10_re; b0_im = i10_im;
970  b1_re = i11_re; b1_im = i11_im;
971  b2_re = i12_re; b2_im = i12_im;
972 
973  }
974 #endif // MULTI_GPU
975 
976  // read gauge matrix from device memory
977  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
978 
979  // reconstruct gauge matrix
// NOTE(review): original line 980 (the gauge reconstruction macro) is
// missing from this listing; confirm against the repository source.
981 
// Complex 3x3 products with the daggered link: A = U^dag * a, B = U^dag * b.
982  // multiply row 0
983  spinorFloat A0_re = 0;
984  A0_re += gT00_re * a0_re;
985  A0_re -= gT00_im * a0_im;
986  A0_re += gT01_re * a1_re;
987  A0_re -= gT01_im * a1_im;
988  A0_re += gT02_re * a2_re;
989  A0_re -= gT02_im * a2_im;
990  spinorFloat A0_im = 0;
991  A0_im += gT00_re * a0_im;
992  A0_im += gT00_im * a0_re;
993  A0_im += gT01_re * a1_im;
994  A0_im += gT01_im * a1_re;
995  A0_im += gT02_re * a2_im;
996  A0_im += gT02_im * a2_re;
997  spinorFloat B0_re = 0;
998  B0_re += gT00_re * b0_re;
999  B0_re -= gT00_im * b0_im;
1000  B0_re += gT01_re * b1_re;
1001  B0_re -= gT01_im * b1_im;
1002  B0_re += gT02_re * b2_re;
1003  B0_re -= gT02_im * b2_im;
1004  spinorFloat B0_im = 0;
1005  B0_im += gT00_re * b0_im;
1006  B0_im += gT00_im * b0_re;
1007  B0_im += gT01_re * b1_im;
1008  B0_im += gT01_im * b1_re;
1009  B0_im += gT02_re * b2_im;
1010  B0_im += gT02_im * b2_re;
1011 
1012  // multiply row 1
1013  spinorFloat A1_re = 0;
1014  A1_re += gT10_re * a0_re;
1015  A1_re -= gT10_im * a0_im;
1016  A1_re += gT11_re * a1_re;
1017  A1_re -= gT11_im * a1_im;
1018  A1_re += gT12_re * a2_re;
1019  A1_re -= gT12_im * a2_im;
1020  spinorFloat A1_im = 0;
1021  A1_im += gT10_re * a0_im;
1022  A1_im += gT10_im * a0_re;
1023  A1_im += gT11_re * a1_im;
1024  A1_im += gT11_im * a1_re;
1025  A1_im += gT12_re * a2_im;
1026  A1_im += gT12_im * a2_re;
1027  spinorFloat B1_re = 0;
1028  B1_re += gT10_re * b0_re;
1029  B1_re -= gT10_im * b0_im;
1030  B1_re += gT11_re * b1_re;
1031  B1_re -= gT11_im * b1_im;
1032  B1_re += gT12_re * b2_re;
1033  B1_re -= gT12_im * b2_im;
1034  spinorFloat B1_im = 0;
1035  B1_im += gT10_re * b0_im;
1036  B1_im += gT10_im * b0_re;
1037  B1_im += gT11_re * b1_im;
1038  B1_im += gT11_im * b1_re;
1039  B1_im += gT12_re * b2_im;
1040  B1_im += gT12_im * b2_re;
1041 
1042  // multiply row 2
1043  spinorFloat A2_re = 0;
1044  A2_re += gT20_re * a0_re;
1045  A2_re -= gT20_im * a0_im;
1046  A2_re += gT21_re * a1_re;
1047  A2_re -= gT21_im * a1_im;
1048  A2_re += gT22_re * a2_re;
1049  A2_re -= gT22_im * a2_im;
1050  spinorFloat A2_im = 0;
1051  A2_im += gT20_re * a0_im;
1052  A2_im += gT20_im * a0_re;
1053  A2_im += gT21_re * a1_im;
1054  A2_im += gT21_im * a1_re;
1055  A2_im += gT22_re * a2_im;
1056  A2_im += gT22_im * a2_re;
1057  spinorFloat B2_re = 0;
1058  B2_re += gT20_re * b0_re;
1059  B2_re -= gT20_im * b0_im;
1060  B2_re += gT21_re * b1_re;
1061  B2_re -= gT21_im * b1_im;
1062  B2_re += gT22_re * b2_re;
1063  B2_re -= gT22_im * b2_im;
1064  spinorFloat B2_im = 0;
1065  B2_im += gT20_re * b0_im;
1066  B2_im += gT20_im * b0_re;
1067  B2_im += gT21_re * b1_im;
1068  B2_im += gT21_im * b1_re;
1069  B2_im += gT22_re * b2_im;
1070  B2_im += gT22_im * b2_re;
1071 
// Reconstruct per the P1- matrix (real couplings, signs opposite to P1+).
1072  o00_re += A0_re;
1073  o00_im += A0_im;
1074  o10_re += B0_re;
1075  o10_im += B0_im;
1076  o20_re += B0_re;
1077  o20_im += B0_im;
1078  o30_re -= A0_re;
1079  o30_im -= A0_im;
1080 
1081  o01_re += A1_re;
1082  o01_im += A1_im;
1083  o11_re += B1_re;
1084  o11_im += B1_im;
1085  o21_re += B1_re;
1086  o21_im += B1_im;
1087  o31_re -= A1_re;
1088  o31_im -= A1_im;
1089 
1090  o02_re += A2_re;
1091  o02_im += A2_im;
1092  o12_re += B2_re;
1093  o12_im += B2_im;
1094  o22_re += B2_re;
1095  o22_im += B2_im;
1096  o32_re -= A2_re;
1097  o32_im -= A2_im;
1098 
1099 }
1100 
1101 #ifdef MULTI_GPU
1102 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1103  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1104 #endif
1105 {
1106  // Projector P2+
1107  // 1 0 i 0
1108  // 0 1 0 -i
1109  // -i 0 1 0
1110  // 0 i 0 1
1111 
1112 #ifdef MULTI_GPU
1113  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1114  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1115 #if (DD_PREC==2) // half precision
1116  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1117 #endif
1118 #else
1119  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1120 #endif
1121 
1122  const int ga_idx = sid;
1123 
1130 
1131 #ifdef MULTI_GPU
1132  if (kernel_type == INTERIOR_KERNEL) {
1133 #endif
1134 
1135  // read spinor from device memory
1136 #ifdef TWIST_INV_DSLASH
1137 #ifdef SPINOR_DOUBLE
1138  const spinorFloat a = param.a;
1139  const spinorFloat b = param.b;
1140 #else
1141  const spinorFloat a = param.a_f;
1142  const spinorFloat b = param.b_f;
1143 #endif
1144 #endif
1145  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1146 #ifdef TWIST_INV_DSLASH
1147  APPLY_TWIST_INV(-a, b, i);
// ---------------------------------------------------------------------------
// NOTE(review): tail end of the Z-forward (gauge direction 4) projector
// contribution; its opening lines precede this chunk, so only comments are
// added here. The leading decimal on each line is residue of the
// documentation extraction this chunk came from, not original source text.
// Jumps in that numbering (e.g. 1185 -> 1187 below) mark lines the
// extraction dropped.
// ---------------------------------------------------------------------------
1148 #endif
1149 
1150  // project spinor into half spinors
// Interior path: fold the four spin components into two half spinors
// (a = rows 0, b = rows 1) using the +/- i cross terms of this direction's
// spin projector, as shown by the i20/i30 admixtures below.
1151  a0_re = +i00_re-i20_im;
1152  a0_im = +i00_im+i20_re;
1153  a1_re = +i01_re-i21_im;
1154  a1_im = +i01_im+i21_re;
1155  a2_re = +i02_re-i22_im;
1156  a2_im = +i02_im+i22_re;
1157  b0_re = +i10_re+i30_im;
1158  b0_im = +i10_im-i30_re;
1159  b1_re = +i11_re+i31_im;
1160  b1_im = +i11_im-i31_re;
1161  b2_re = +i12_re+i32_im;
1162  b2_im = +i12_im-i32_re;
1163 
1164 #ifdef MULTI_GPU
// Exterior path: the ghost buffer is copied straight into the half spinors
// with no projection arithmetic — presumably it was projected on the
// neighbouring rank before communication (TODO confirm against pack code).
1165  } else {
1166 
1167  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1168 
1169  // read half spinor from device memory
1170  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1171 
1172  a0_re = i00_re; a0_im = i00_im;
1173  a1_re = i01_re; a1_im = i01_im;
1174  a2_re = i02_re; a2_im = i02_im;
1175  b0_re = i10_re; b0_im = i10_im;
1176  b1_re = i11_re; b1_im = i11_im;
1177  b2_re = i12_re; b2_im = i12_im;
1178 
1179  }
1180 #endif // MULTI_GPU
1181 
1182  // read gauge matrix from device memory
1183  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1184 
1185  // reconstruct gauge matrix
// NOTE(review): the embedded numbering jumps 1185 -> 1187: the
// RECONSTRUCT_GAUGE_MATRIX(4) invocation that normally follows this comment
// appears to have been dropped by the extraction (the macro is referenced in
// this file's cross-reference index) — confirm against the original source.
1187 
1188  // multiply row 0
// Complex 3x3 colour-matrix multiply of both half spinors: A = U * a,
// B = U * b, expanded into real/imaginary fused multiply-adds per row.
1189  spinorFloat A0_re = 0;
1190  A0_re += g00_re * a0_re;
1191  A0_re -= g00_im * a0_im;
1192  A0_re += g01_re * a1_re;
1193  A0_re -= g01_im * a1_im;
1194  A0_re += g02_re * a2_re;
1195  A0_re -= g02_im * a2_im;
1196  spinorFloat A0_im = 0;
1197  A0_im += g00_re * a0_im;
1198  A0_im += g00_im * a0_re;
1199  A0_im += g01_re * a1_im;
1200  A0_im += g01_im * a1_re;
1201  A0_im += g02_re * a2_im;
1202  A0_im += g02_im * a2_re;
1203  spinorFloat B0_re = 0;
1204  B0_re += g00_re * b0_re;
1205  B0_re -= g00_im * b0_im;
1206  B0_re += g01_re * b1_re;
1207  B0_re -= g01_im * b1_im;
1208  B0_re += g02_re * b2_re;
1209  B0_re -= g02_im * b2_im;
1210  spinorFloat B0_im = 0;
1211  B0_im += g00_re * b0_im;
1212  B0_im += g00_im * b0_re;
1213  B0_im += g01_re * b1_im;
1214  B0_im += g01_im * b1_re;
1215  B0_im += g02_re * b2_im;
1216  B0_im += g02_im * b2_re;
1217 
1218  // multiply row 1
1219  spinorFloat A1_re = 0;
1220  A1_re += g10_re * a0_re;
1221  A1_re -= g10_im * a0_im;
1222  A1_re += g11_re * a1_re;
1223  A1_re -= g11_im * a1_im;
1224  A1_re += g12_re * a2_re;
1225  A1_re -= g12_im * a2_im;
1226  spinorFloat A1_im = 0;
1227  A1_im += g10_re * a0_im;
1228  A1_im += g10_im * a0_re;
1229  A1_im += g11_re * a1_im;
1230  A1_im += g11_im * a1_re;
1231  A1_im += g12_re * a2_im;
1232  A1_im += g12_im * a2_re;
1233  spinorFloat B1_re = 0;
1234  B1_re += g10_re * b0_re;
1235  B1_re -= g10_im * b0_im;
1236  B1_re += g11_re * b1_re;
1237  B1_re -= g11_im * b1_im;
1238  B1_re += g12_re * b2_re;
1239  B1_re -= g12_im * b2_im;
1240  spinorFloat B1_im = 0;
1241  B1_im += g10_re * b0_im;
1242  B1_im += g10_im * b0_re;
1243  B1_im += g11_re * b1_im;
1244  B1_im += g11_im * b1_re;
1245  B1_im += g12_re * b2_im;
1246  B1_im += g12_im * b2_re;
1247 
1248  // multiply row 2
1249  spinorFloat A2_re = 0;
1250  A2_re += g20_re * a0_re;
1251  A2_re -= g20_im * a0_im;
1252  A2_re += g21_re * a1_re;
1253  A2_re -= g21_im * a1_im;
1254  A2_re += g22_re * a2_re;
1255  A2_re -= g22_im * a2_im;
1256  spinorFloat A2_im = 0;
1257  A2_im += g20_re * a0_im;
1258  A2_im += g20_im * a0_re;
1259  A2_im += g21_re * a1_im;
1260  A2_im += g21_im * a1_re;
1261  A2_im += g22_re * a2_im;
1262  A2_im += g22_im * a2_re;
1263  spinorFloat B2_re = 0;
1264  B2_re += g20_re * b0_re;
1265  B2_re -= g20_im * b0_im;
1266  B2_re += g21_re * b1_re;
1267  B2_re -= g21_im * b1_im;
1268  B2_re += g22_re * b2_re;
1269  B2_re -= g22_im * b2_im;
1270  spinorFloat B2_im = 0;
1271  B2_im += g20_re * b0_im;
1272  B2_im += g20_im * b0_re;
1273  B2_im += g21_re * b1_im;
1274  B2_im += g21_im * b1_re;
1275  B2_im += g22_re * b2_im;
1276  B2_im += g22_im * b2_re;
1277 
// Accumulate into the output spinor: rows 0/1 take A/B directly; rows 2/3
// take them rotated by +/- i (the (im, -re) / (-im, re) swaps below),
// reconstructing the full spinor from the half-spinor result.
1278  o00_re += A0_re;
1279  o00_im += A0_im;
1280  o10_re += B0_re;
1281  o10_im += B0_im;
1282  o20_re += A0_im;
1283  o20_im -= A0_re;
1284  o30_re -= B0_im;
1285  o30_im += B0_re;
1286 
1287  o01_re += A1_re;
1288  o01_im += A1_im;
1289  o11_re += B1_re;
1290  o11_im += B1_im;
1291  o21_re += A1_im;
1292  o21_im -= A1_re;
1293  o31_re -= B1_im;
1294  o31_im += B1_re;
1295 
1296  o02_re += A2_re;
1297  o02_im += A2_im;
1298  o12_re += B2_re;
1299  o12_im += B2_im;
1300  o22_re += A2_im;
1301  o22_im -= A2_re;
1302  o32_re -= B2_im;
1303  o32_im += B2_re;
1304 
1305 }
1306 
// ---------------------------------------------------------------------------
// Z-backward hopping term (gauge direction 5): gathers the neighbour at
// z-1 (with periodic wrap at coord[2]==0), applies the P2- spin projector,
// multiplies by the conjugate-transposed link U^dagger (gT** accessors,
// GAUGE1TEX), and accumulates into the output spinor o.
// Embedded numbering gaps (1333->1340, 1395->1397) mark lines dropped by the
// documentation extraction — presumably the half-spinor declarations and the
// RECONSTRUCT_GAUGE_MATRIX(5) call; confirm against the original source.
// ---------------------------------------------------------------------------
1307 #ifdef MULTI_GPU
// Guard: interior kernel handles this site unless the neighbour lives on
// another rank (boundary face); the exterior-Z kernel handles exactly those.
1308 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1309  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1310 #endif
1311 {
1312  // Projector P2-
1313  // 1 0 -i 0
1314  // 0 1 0 i
1315  // i 0 1 0
1316  // 0 -i 0 1
1317 
1318 #ifdef MULTI_GPU
// Neighbour site index: wrap around the Z extent at the boundary for the
// interior kernel, otherwise index into the ghost zone for the face.
1319  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1320  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1321 #if (DD_PREC==2) // half precision
1322  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1323 #endif
1324 #else
1325  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1326 #endif
1327 
1328 #ifdef MULTI_GPU
// Backward hop uses the link stored at the neighbour site (or in the
// ghost-link region Vh+face_idx for exterior kernels).
1329  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1330 #else
1331  const int ga_idx = sp_idx;
1332 #endif
1333 
1340 
1341 #ifdef MULTI_GPU
1342  if (kernel_type == INTERIOR_KERNEL) {
1343 #endif
1344 
1345  // read spinor from device memory
1346 #ifdef TWIST_INV_DSLASH
// Twist parameters (mu-term coefficients); precision-matched copies of the
// kernel parameters, only needed when the inverse twist is fused in.
1347 #ifdef SPINOR_DOUBLE
1348  const spinorFloat a = param.a;
1349  const spinorFloat b = param.b;
1350 #else
1351  const spinorFloat a = param.a_f;
1352  const spinorFloat b = param.b_f;
1353 #endif
1354 #endif
1355  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1356 #ifdef TWIST_INV_DSLASH
// Dagger kernel: apply the inverse twist with negated twist angle (-a).
1357  APPLY_TWIST_INV(-a, b, i);
1358 #endif
1359 
1360  // project spinor into half spinors
// P2- projection: sign pattern is opposite to the forward (P2+) direction.
1361  a0_re = +i00_re+i20_im;
1362  a0_im = +i00_im-i20_re;
1363  a1_re = +i01_re+i21_im;
1364  a1_im = +i01_im-i21_re;
1365  a2_re = +i02_re+i22_im;
1366  a2_im = +i02_im-i22_re;
1367  b0_re = +i10_re-i30_im;
1368  b0_im = +i10_im+i30_re;
1369  b1_re = +i11_re-i31_im;
1370  b1_im = +i11_im+i31_re;
1371  b2_re = +i12_re-i32_im;
1372  b2_im = +i12_im+i32_re;
1373 
1374 #ifdef MULTI_GPU
// Exterior path: ghost data is copied through unprojected here — presumably
// already projected on the sending rank (TODO confirm against pack code).
1375  } else {
1376 
1377  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1378 
1379  // read half spinor from device memory
1380  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1381 
1382  a0_re = i00_re; a0_im = i00_im;
1383  a1_re = i01_re; a1_im = i01_im;
1384  a2_re = i02_re; a2_im = i02_im;
1385  b0_re = i10_re; b0_im = i10_im;
1386  b1_re = i11_re; b1_im = i11_im;
1387  b2_re = i12_re; b2_im = i12_im;
1388 
1389  }
1390 #endif // MULTI_GPU
1391 
1392  // read gauge matrix from device memory
1393  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1394 
1395  // reconstruct gauge matrix
1397 
1398  // multiply row 0
// Backward hop: multiply by U^dagger — note the gT** (transposed-conjugate)
// accessors instead of g**.
1399  spinorFloat A0_re = 0;
1400  A0_re += gT00_re * a0_re;
1401  A0_re -= gT00_im * a0_im;
1402  A0_re += gT01_re * a1_re;
1403  A0_re -= gT01_im * a1_im;
1404  A0_re += gT02_re * a2_re;
1405  A0_re -= gT02_im * a2_im;
1406  spinorFloat A0_im = 0;
1407  A0_im += gT00_re * a0_im;
1408  A0_im += gT00_im * a0_re;
1409  A0_im += gT01_re * a1_im;
1410  A0_im += gT01_im * a1_re;
1411  A0_im += gT02_re * a2_im;
1412  A0_im += gT02_im * a2_re;
1413  spinorFloat B0_re = 0;
1414  B0_re += gT00_re * b0_re;
1415  B0_re -= gT00_im * b0_im;
1416  B0_re += gT01_re * b1_re;
1417  B0_re -= gT01_im * b1_im;
1418  B0_re += gT02_re * b2_re;
1419  B0_re -= gT02_im * b2_im;
1420  spinorFloat B0_im = 0;
1421  B0_im += gT00_re * b0_im;
1422  B0_im += gT00_im * b0_re;
1423  B0_im += gT01_re * b1_im;
1424  B0_im += gT01_im * b1_re;
1425  B0_im += gT02_re * b2_im;
1426  B0_im += gT02_im * b2_re;
1427 
1428  // multiply row 1
1429  spinorFloat A1_re = 0;
1430  A1_re += gT10_re * a0_re;
1431  A1_re -= gT10_im * a0_im;
1432  A1_re += gT11_re * a1_re;
1433  A1_re -= gT11_im * a1_im;
1434  A1_re += gT12_re * a2_re;
1435  A1_re -= gT12_im * a2_im;
1436  spinorFloat A1_im = 0;
1437  A1_im += gT10_re * a0_im;
1438  A1_im += gT10_im * a0_re;
1439  A1_im += gT11_re * a1_im;
1440  A1_im += gT11_im * a1_re;
1441  A1_im += gT12_re * a2_im;
1442  A1_im += gT12_im * a2_re;
1443  spinorFloat B1_re = 0;
1444  B1_re += gT10_re * b0_re;
1445  B1_re -= gT10_im * b0_im;
1446  B1_re += gT11_re * b1_re;
1447  B1_re -= gT11_im * b1_im;
1448  B1_re += gT12_re * b2_re;
1449  B1_re -= gT12_im * b2_im;
1450  spinorFloat B1_im = 0;
1451  B1_im += gT10_re * b0_im;
1452  B1_im += gT10_im * b0_re;
1453  B1_im += gT11_re * b1_im;
1454  B1_im += gT11_im * b1_re;
1455  B1_im += gT12_re * b2_im;
1456  B1_im += gT12_im * b2_re;
1457 
1458  // multiply row 2
1459  spinorFloat A2_re = 0;
1460  A2_re += gT20_re * a0_re;
1461  A2_re -= gT20_im * a0_im;
1462  A2_re += gT21_re * a1_re;
1463  A2_re -= gT21_im * a1_im;
1464  A2_re += gT22_re * a2_re;
1465  A2_re -= gT22_im * a2_im;
1466  spinorFloat A2_im = 0;
1467  A2_im += gT20_re * a0_im;
1468  A2_im += gT20_im * a0_re;
1469  A2_im += gT21_re * a1_im;
1470  A2_im += gT21_im * a1_re;
1471  A2_im += gT22_re * a2_im;
1472  A2_im += gT22_im * a2_re;
1473  spinorFloat B2_re = 0;
1474  B2_re += gT20_re * b0_re;
1475  B2_re -= gT20_im * b0_im;
1476  B2_re += gT21_re * b1_re;
1477  B2_re -= gT21_im * b1_im;
1478  B2_re += gT22_re * b2_re;
1479  B2_re -= gT22_im * b2_im;
1480  spinorFloat B2_im = 0;
1481  B2_im += gT20_re * b0_im;
1482  B2_im += gT20_im * b0_re;
1483  B2_im += gT21_re * b1_im;
1484  B2_im += gT21_im * b1_re;
1485  B2_im += gT22_re * b2_im;
1486  B2_im += gT22_im * b2_re;
1487 
// Accumulate: rows 2/3 pick up the opposite +/- i rotation relative to the
// forward direction (signs mirror the P2- projector above).
1488  o00_re += A0_re;
1489  o00_im += A0_im;
1490  o10_re += B0_re;
1491  o10_im += B0_im;
1492  o20_re -= A0_im;
1493  o20_im += A0_re;
1494  o30_re += B0_im;
1495  o30_im -= B0_re;
1496 
1497  o01_re += A1_re;
1498  o01_im += A1_im;
1499  o11_re += B1_re;
1500  o11_im += B1_im;
1501  o21_re -= A1_im;
1502  o21_im += A1_re;
1503  o31_re += B1_im;
1504  o31_im -= B1_re;
1505 
1506  o02_re += A2_re;
1507  o02_im += A2_im;
1508  o12_re += B2_re;
1509  o12_im += B2_im;
1510  o22_re -= A2_im;
1511  o22_im += A2_re;
1512  o32_re += B2_im;
1513  o32_im -= B2_re;
1514 
1515 }
1516 
// ---------------------------------------------------------------------------
// T-forward hopping term (gauge direction 6): gathers the neighbour at t+1
// (with wrap at the temporal boundary), applies the P3+ projector (which
// keeps only the upper two spin rows, scaled by 2), multiplies by the link
// U, and accumulates into rows 0/1 of the output spinor only.
// A temporal-gauge-fixed fast path skips the link multiply entirely when the
// link is the identity. Embedded numbering gaps (1541->1548, 1603->1610,
// 1626->1633, 1691->1693) mark lines dropped by the documentation
// extraction — presumably the A*/B* declarations, the identity-gauge
// pass-through assignments, and RECONSTRUCT_GAUGE_MATRIX(6); confirm
// against the original source.
// ---------------------------------------------------------------------------
1517 #ifdef MULTI_GPU
1518 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1519  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1520 #endif
1521 {
1522  // Projector P3+
1523  // 2 0 0 0
1524  // 0 2 0 0
1525  // 0 0 0 0
1526  // 0 0 0 0
1527 
1528 #ifdef MULTI_GPU
1529  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1530  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1531 #if (DD_PREC==2) // half precision
1532  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1533 #endif
1534 #else
1535  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1536 #endif
1537 
// Forward hop uses the link at the current site.
1538  const int ga_idx = sid;
1539 
// Fast path: with temporal gauge fixing all T links away from the last
// timeslice are identity, so the matrix multiply can be skipped.
1540  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1541  {
1548 
1549 #ifdef MULTI_GPU
1550  if (kernel_type == INTERIOR_KERNEL) {
1551 #endif
1552 
1553  // read spinor from device memory
1554 #ifdef TWIST_INV_DSLASH
1555 #ifdef SPINOR_DOUBLE
1556  const spinorFloat a = param.a;
1557  const spinorFloat b = param.b;
1558 #else
1559  const spinorFloat a = param.a_f;
1560  const spinorFloat b = param.b_f;
1561 #endif
1562 #endif
// Only the upper spin components survive P3+, so without the fused twist the
// lower half of the spinor need not be loaded at all.
1563 #ifndef TWIST_INV_DSLASH
1564  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1565 #else
1566  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1567  APPLY_TWIST_INV(-a, b, i);
1568 #endif
1569 
1570  // project spinor into half spinors
1571  a0_re = +2*i00_re;
1572  a0_im = +2*i00_im;
1573  a1_re = +2*i01_re;
1574  a1_im = +2*i01_im;
1575  a2_re = +2*i02_re;
1576  a2_im = +2*i02_im;
1577  b0_re = +2*i10_re;
1578  b0_im = +2*i10_im;
1579  b1_re = +2*i11_re;
1580  b1_im = +2*i11_im;
1581  b2_re = +2*i12_re;
1582  b2_im = +2*i12_im;
1583 
1584 #ifdef MULTI_GPU
1585  } else {
1586 
1587  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
// TPROJSCALE carries the factor-of-2 projector normalisation for ghost data.
1588  const int t_proj_scale = TPROJSCALE;
1589 
1590  // read half spinor from device memory
1591  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1592 
1593  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1594  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1595  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1596  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1597  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1598  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1599 
1600  }
1601 #endif // MULTI_GPU
1602 
1603  // identity gauge matrix
// NOTE(review): numbering jumps 1603 -> 1610 — the identity-link
// pass-through (presumably A* = a*, B* = b* copies) was dropped by the
// extraction; the accumulation below relies on those values.
1610 
// P3+ feeds only the upper spin rows; rows 2/3 receive no contribution.
1611  o00_re += A0_re;
1612  o00_im += A0_im;
1613  o10_re += B0_re;
1614  o10_im += B0_im;
1615 
1616  o01_re += A1_re;
1617  o01_im += A1_im;
1618  o11_re += B1_re;
1619  o11_im += B1_im;
1620 
1621  o02_re += A2_re;
1622  o02_im += A2_im;
1623  o12_re += B2_re;
1624  o12_im += B2_im;
1625 
// Generic path: non-identity link, full U * half-spinor multiply.
1626  } else {
1633 
1634 #ifdef MULTI_GPU
1635  if (kernel_type == INTERIOR_KERNEL) {
1636 #endif
1637 
1638  // read spinor from device memory
1639 #ifdef TWIST_INV_DSLASH
1640 #ifdef SPINOR_DOUBLE
1641  const spinorFloat a = param.a;
1642  const spinorFloat b = param.b;
1643 #else
1644  const spinorFloat a = param.a_f;
1645  const spinorFloat b = param.b_f;
1646 #endif
1647 #endif
1648 #ifndef TWIST_INV_DSLASH
1649  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1650 #else
1651  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1652  APPLY_TWIST_INV(-a, b, i);
1653 #endif
1654 
1655  // project spinor into half spinors
1656  a0_re = +2*i00_re;
1657  a0_im = +2*i00_im;
1658  a1_re = +2*i01_re;
1659  a1_im = +2*i01_im;
1660  a2_re = +2*i02_re;
1661  a2_im = +2*i02_im;
1662  b0_re = +2*i10_re;
1663  b0_im = +2*i10_im;
1664  b1_re = +2*i11_re;
1665  b1_im = +2*i11_im;
1666  b2_re = +2*i12_re;
1667  b2_im = +2*i12_im;
1668 
1669 #ifdef MULTI_GPU
1670  } else {
1671 
1672  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1673  const int t_proj_scale = TPROJSCALE;
1674 
1675  // read half spinor from device memory
1676  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1677 
1678  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1679  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1680  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1681  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1682  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1683  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1684 
1685  }
1686 #endif // MULTI_GPU
1687 
1688  // read gauge matrix from device memory
1689  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1690 
1691  // reconstruct gauge matrix
1693 
1694  // multiply row 0
// Complex 3x3 multiply A = U * a, B = U * b (forward hop uses g**, not gT**).
1695  spinorFloat A0_re = 0;
1696  A0_re += g00_re * a0_re;
1697  A0_re -= g00_im * a0_im;
1698  A0_re += g01_re * a1_re;
1699  A0_re -= g01_im * a1_im;
1700  A0_re += g02_re * a2_re;
1701  A0_re -= g02_im * a2_im;
1702  spinorFloat A0_im = 0;
1703  A0_im += g00_re * a0_im;
1704  A0_im += g00_im * a0_re;
1705  A0_im += g01_re * a1_im;
1706  A0_im += g01_im * a1_re;
1707  A0_im += g02_re * a2_im;
1708  A0_im += g02_im * a2_re;
1709  spinorFloat B0_re = 0;
1710  B0_re += g00_re * b0_re;
1711  B0_re -= g00_im * b0_im;
1712  B0_re += g01_re * b1_re;
1713  B0_re -= g01_im * b1_im;
1714  B0_re += g02_re * b2_re;
1715  B0_re -= g02_im * b2_im;
1716  spinorFloat B0_im = 0;
1717  B0_im += g00_re * b0_im;
1718  B0_im += g00_im * b0_re;
1719  B0_im += g01_re * b1_im;
1720  B0_im += g01_im * b1_re;
1721  B0_im += g02_re * b2_im;
1722  B0_im += g02_im * b2_re;
1723 
1724  // multiply row 1
1725  spinorFloat A1_re = 0;
1726  A1_re += g10_re * a0_re;
1727  A1_re -= g10_im * a0_im;
1728  A1_re += g11_re * a1_re;
1729  A1_re -= g11_im * a1_im;
1730  A1_re += g12_re * a2_re;
1731  A1_re -= g12_im * a2_im;
1732  spinorFloat A1_im = 0;
1733  A1_im += g10_re * a0_im;
1734  A1_im += g10_im * a0_re;
1735  A1_im += g11_re * a1_im;
1736  A1_im += g11_im * a1_re;
1737  A1_im += g12_re * a2_im;
1738  A1_im += g12_im * a2_re;
1739  spinorFloat B1_re = 0;
1740  B1_re += g10_re * b0_re;
1741  B1_re -= g10_im * b0_im;
1742  B1_re += g11_re * b1_re;
1743  B1_re -= g11_im * b1_im;
1744  B1_re += g12_re * b2_re;
1745  B1_re -= g12_im * b2_im;
1746  spinorFloat B1_im = 0;
1747  B1_im += g10_re * b0_im;
1748  B1_im += g10_im * b0_re;
1749  B1_im += g11_re * b1_im;
1750  B1_im += g11_im * b1_re;
1751  B1_im += g12_re * b2_im;
1752  B1_im += g12_im * b2_re;
1753 
1754  // multiply row 2
1755  spinorFloat A2_re = 0;
1756  A2_re += g20_re * a0_re;
1757  A2_re -= g20_im * a0_im;
1758  A2_re += g21_re * a1_re;
1759  A2_re -= g21_im * a1_im;
1760  A2_re += g22_re * a2_re;
1761  A2_re -= g22_im * a2_im;
1762  spinorFloat A2_im = 0;
1763  A2_im += g20_re * a0_im;
1764  A2_im += g20_im * a0_re;
1765  A2_im += g21_re * a1_im;
1766  A2_im += g21_im * a1_re;
1767  A2_im += g22_re * a2_im;
1768  A2_im += g22_im * a2_re;
1769  spinorFloat B2_re = 0;
1770  B2_re += g20_re * b0_re;
1771  B2_re -= g20_im * b0_im;
1772  B2_re += g21_re * b1_re;
1773  B2_re -= g21_im * b1_im;
1774  B2_re += g22_re * b2_re;
1775  B2_re -= g22_im * b2_im;
1776  spinorFloat B2_im = 0;
1777  B2_im += g20_re * b0_im;
1778  B2_im += g20_im * b0_re;
1779  B2_im += g21_re * b1_im;
1780  B2_im += g21_im * b1_re;
1781  B2_im += g22_re * b2_im;
1782  B2_im += g22_im * b2_re;
1783 
// P3+ contributes to the upper spin rows only.
1784  o00_re += A0_re;
1785  o00_im += A0_im;
1786  o10_re += B0_re;
1787  o10_im += B0_im;
1788 
1789  o01_re += A1_re;
1790  o01_im += A1_im;
1791  o11_re += B1_re;
1792  o11_im += B1_im;
1793 
1794  o02_re += A2_re;
1795  o02_im += A2_im;
1796  o12_re += B2_re;
1797  o12_im += B2_im;
1798 
1799  }
1800 }
1801 
// ---------------------------------------------------------------------------
// T-backward hopping term (gauge direction 7): gathers the neighbour at t-1
// (with wrap at coord[3]==0), applies the P3- projector (keeps only the
// lower two spin rows, scaled by 2), multiplies by U^dagger (gT**,
// GAUGE1TEX), and accumulates into rows 2/3 of the output spinor only.
// Embedded numbering gaps (1830->1837, 1852->1854, 1892->1899, 1915->1922,
// 1937->1939, 1980->1982) mark lines dropped by the documentation
// extraction — the 1853/1938 gaps presumably held the
// READ_SPINOR_DOWN(...) calls (that macro appears in this file's
// cross-reference index), and 1981 presumably RECONSTRUCT_GAUGE_MATRIX(7);
// confirm against the original source.
// ---------------------------------------------------------------------------
1802 #ifdef MULTI_GPU
1803 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1804  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1805 #endif
1806 {
1807  // Projector P3-
1808  // 0 0 0 0
1809  // 0 0 0 0
1810  // 0 0 2 0
1811  // 0 0 0 2
1812 
1813 #ifdef MULTI_GPU
1814  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1815  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1816 #if (DD_PREC==2) // half precision
1817  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1818 #endif
1819 #else
1820  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1821 #endif
1822 
1823 #ifdef MULTI_GPU
// Backward hop: link stored at the neighbour site (ghost links at Vh+face_idx
// for exterior kernels).
1824  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1825 #else
1826  const int ga_idx = sp_idx;
1827 #endif
1828 
// Temporal-gauge-fixed fast path: identity link, skip the matrix multiply.
1829  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1830  {
1837 
1838 #ifdef MULTI_GPU
1839  if (kernel_type == INTERIOR_KERNEL) {
1840 #endif
1841 
1842  // read spinor from device memory
1843 #ifdef TWIST_INV_DSLASH
1844 #ifdef SPINOR_DOUBLE
1845  const spinorFloat a = param.a;
1846  const spinorFloat b = param.b;
1847 #else
1848  const spinorFloat a = param.a_f;
1849  const spinorFloat b = param.b_f;
1850 #endif
1851 #endif
// Without the fused twist only the lower spin components are needed (the
// dropped line 1853 presumably read them via READ_SPINOR_DOWN).
1852 #ifndef TWIST_INV_DSLASH
1854 #else
1855  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1856  APPLY_TWIST_INV(-a, b, i);
1857 #endif
1858 
1859  // project spinor into half spinors
// P3- keeps rows 2/3 (i2*, i3*) scaled by 2.
1860  a0_re = +2*i20_re;
1861  a0_im = +2*i20_im;
1862  a1_re = +2*i21_re;
1863  a1_im = +2*i21_im;
1864  a2_re = +2*i22_re;
1865  a2_im = +2*i22_im;
1866  b0_re = +2*i30_re;
1867  b0_im = +2*i30_im;
1868  b1_re = +2*i31_re;
1869  b1_im = +2*i31_im;
1870  b2_re = +2*i32_re;
1871  b2_im = +2*i32_im;
1872 
1873 #ifdef MULTI_GPU
1874  } else {
1875 
1876  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1877  const int t_proj_scale = TPROJSCALE;
1878 
1879  // read half spinor from device memory
1880  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1881 
1882  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1883  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1884  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1885  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1886  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1887  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1888 
1889  }
1890 #endif // MULTI_GPU
1891 
1892  // identity gauge matrix
// NOTE(review): numbering jumps 1892 -> 1899 — the identity-link
// pass-through assignments were dropped by the extraction; the accumulation
// below relies on them.
1899 
// P3- feeds only the lower spin rows; rows 0/1 receive no contribution.
1900  o20_re += A0_re;
1901  o20_im += A0_im;
1902  o30_re += B0_re;
1903  o30_im += B0_im;
1904 
1905  o21_re += A1_re;
1906  o21_im += A1_im;
1907  o31_re += B1_re;
1908  o31_im += B1_im;
1909 
1910  o22_re += A2_re;
1911  o22_im += A2_im;
1912  o32_re += B2_re;
1913  o32_im += B2_im;
1914 
// Generic path: non-identity link, full U^dagger * half-spinor multiply.
1915  } else {
1922 
1923 #ifdef MULTI_GPU
1924  if (kernel_type == INTERIOR_KERNEL) {
1925 #endif
1926 
1927  // read spinor from device memory
1928 #ifdef TWIST_INV_DSLASH
1929 #ifdef SPINOR_DOUBLE
1930  const spinorFloat a = param.a;
1931  const spinorFloat b = param.b;
1932 #else
1933  const spinorFloat a = param.a_f;
1934  const spinorFloat b = param.b_f;
1935 #endif
1936 #endif
// Dropped line 1938 presumably held the READ_SPINOR_DOWN(...) call here too.
1937 #ifndef TWIST_INV_DSLASH
1939 #else
1940  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1941  APPLY_TWIST_INV(-a, b, i);
1942 #endif
1943 
1944  // project spinor into half spinors
1945  a0_re = +2*i20_re;
1946  a0_im = +2*i20_im;
1947  a1_re = +2*i21_re;
1948  a1_im = +2*i21_im;
1949  a2_re = +2*i22_re;
1950  a2_im = +2*i22_im;
1951  b0_re = +2*i30_re;
1952  b0_im = +2*i30_im;
1953  b1_re = +2*i31_re;
1954  b1_im = +2*i31_im;
1955  b2_re = +2*i32_re;
1956  b2_im = +2*i32_im;
1957 
1958 #ifdef MULTI_GPU
1959  } else {
1960 
1961  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1962  const int t_proj_scale = TPROJSCALE;
1963 
1964  // read half spinor from device memory
1965  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1966 
1967  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1968  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1969  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1970  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1971  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1972  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1973 
1974  }
1975 #endif // MULTI_GPU
1976 
1977  // read gauge matrix from device memory
1978  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
1979 
1980  // reconstruct gauge matrix
1982 
1983  // multiply row 0
// Backward hop: multiply by U^dagger via the gT** (conjugate-transpose)
// accessors.
1984  spinorFloat A0_re = 0;
1985  A0_re += gT00_re * a0_re;
1986  A0_re -= gT00_im * a0_im;
1987  A0_re += gT01_re * a1_re;
1988  A0_re -= gT01_im * a1_im;
1989  A0_re += gT02_re * a2_re;
1990  A0_re -= gT02_im * a2_im;
1991  spinorFloat A0_im = 0;
1992  A0_im += gT00_re * a0_im;
1993  A0_im += gT00_im * a0_re;
1994  A0_im += gT01_re * a1_im;
1995  A0_im += gT01_im * a1_re;
1996  A0_im += gT02_re * a2_im;
1997  A0_im += gT02_im * a2_re;
1998  spinorFloat B0_re = 0;
1999  B0_re += gT00_re * b0_re;
2000  B0_re -= gT00_im * b0_im;
2001  B0_re += gT01_re * b1_re;
2002  B0_re -= gT01_im * b1_im;
2003  B0_re += gT02_re * b2_re;
2004  B0_re -= gT02_im * b2_im;
2005  spinorFloat B0_im = 0;
2006  B0_im += gT00_re * b0_im;
2007  B0_im += gT00_im * b0_re;
2008  B0_im += gT01_re * b1_im;
2009  B0_im += gT01_im * b1_re;
2010  B0_im += gT02_re * b2_im;
2011  B0_im += gT02_im * b2_re;
2012 
2013  // multiply row 1
2014  spinorFloat A1_re = 0;
2015  A1_re += gT10_re * a0_re;
2016  A1_re -= gT10_im * a0_im;
2017  A1_re += gT11_re * a1_re;
2018  A1_re -= gT11_im * a1_im;
2019  A1_re += gT12_re * a2_re;
2020  A1_re -= gT12_im * a2_im;
2021  spinorFloat A1_im = 0;
2022  A1_im += gT10_re * a0_im;
2023  A1_im += gT10_im * a0_re;
2024  A1_im += gT11_re * a1_im;
2025  A1_im += gT11_im * a1_re;
2026  A1_im += gT12_re * a2_im;
2027  A1_im += gT12_im * a2_re;
2028  spinorFloat B1_re = 0;
2029  B1_re += gT10_re * b0_re;
2030  B1_re -= gT10_im * b0_im;
2031  B1_re += gT11_re * b1_re;
2032  B1_re -= gT11_im * b1_im;
2033  B1_re += gT12_re * b2_re;
2034  B1_re -= gT12_im * b2_im;
2035  spinorFloat B1_im = 0;
2036  B1_im += gT10_re * b0_im;
2037  B1_im += gT10_im * b0_re;
2038  B1_im += gT11_re * b1_im;
2039  B1_im += gT11_im * b1_re;
2040  B1_im += gT12_re * b2_im;
2041  B1_im += gT12_im * b2_re;
2042 
2043  // multiply row 2
2044  spinorFloat A2_re = 0;
2045  A2_re += gT20_re * a0_re;
2046  A2_re -= gT20_im * a0_im;
2047  A2_re += gT21_re * a1_re;
2048  A2_re -= gT21_im * a1_im;
2049  A2_re += gT22_re * a2_re;
2050  A2_re -= gT22_im * a2_im;
2051  spinorFloat A2_im = 0;
2052  A2_im += gT20_re * a0_im;
2053  A2_im += gT20_im * a0_re;
2054  A2_im += gT21_re * a1_im;
2055  A2_im += gT21_im * a1_re;
2056  A2_im += gT22_re * a2_im;
2057  A2_im += gT22_im * a2_re;
2058  spinorFloat B2_re = 0;
2059  B2_re += gT20_re * b0_re;
2060  B2_re -= gT20_im * b0_im;
2061  B2_re += gT21_re * b1_re;
2062  B2_re -= gT21_im * b1_im;
2063  B2_re += gT22_re * b2_re;
2064  B2_re -= gT22_im * b2_im;
2065  spinorFloat B2_im = 0;
2066  B2_im += gT20_re * b0_im;
2067  B2_im += gT20_im * b0_re;
2068  B2_im += gT21_re * b1_im;
2069  B2_im += gT21_im * b1_re;
2070  B2_im += gT22_re * b2_im;
2071  B2_im += gT22_im * b2_re;
2072 
// P3- contributes to the lower spin rows only.
2073  o20_re += A0_re;
2074  o20_im += A0_im;
2075  o30_re += B0_re;
2076  o30_im += B0_im;
2077 
2078  o21_re += A1_re;
2079  o21_im += A1_im;
2080  o31_re += B1_re;
2081  o31_im += B1_im;
2082 
2083  o22_re += A2_re;
2084  o22_im += A2_im;
2085  o32_re += B2_re;
2086  o32_im += B2_im;
2087 
2088  }
2089 }
2090 
// ---------------------------------------------------------------------------
// Epilogue: decide whether this kernel invocation saw all eight hopping
// contributions for the site; if so, apply the (inverse) twist and/or the
// XPAY axpy against the accumulator field, then write the result back and
// clean up the per-precision macro definitions.
// ---------------------------------------------------------------------------
2091 #ifdef MULTI_GPU
2092 
2093 int incomplete = 0; // Have all 8 contributions been computed for this site?
2094 
// Each case deliberately falls through: an exterior kernel for one
// direction must also account for boundaries in all lower-priority
// directions that are still pending (handled by later exterior kernels).
2095 switch(kernel_type) { // intentional fall-through
2096 case INTERIOR_KERNEL:
2097  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2098 case EXTERIOR_KERNEL_T:
2099  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2100 case EXTERIOR_KERNEL_Z:
2101  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2102 case EXTERIOR_KERNEL_Y:
2103  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2104 }
2105 
// Only the kernel that completes the last contribution performs the twist /
// xpay step, so it is applied exactly once per site.
2106 if (!incomplete)
2107 #endif // MULTI_GPU
2108 {
// Twist coefficients; when TWIST_INV_DSLASH is defined these were already
// declared next to each spinor load, so they are only needed here otherwise.
2109 #ifndef TWIST_INV_DSLASH
2110 #ifdef SPINOR_DOUBLE
2111  const spinorFloat a = param.a;
2112  const spinorFloat b = param.b;
2113 #else
2114  const spinorFloat a = param.a_f;
2115  const spinorFloat b = param.b_f;
2116 #endif
2117 #endif
2118 #ifdef DSLASH_XPAY
2119  READ_ACCUM(ACCUMTEX, param.sp_stride)
2120 
2121 #ifndef TWIST_XPAY
// Plain xpay: if the inverse twist was not fused into the spinor loads,
// apply it to the accumulated result now, then add the accumulator field.
2122 #ifndef TWIST_INV_DSLASH
2123  //perform invert twist first:
2124  APPLY_TWIST_INV(-a, b, o);
2125 #endif
2126  o00_re += acc00_re;
2127  o00_im += acc00_im;
2128  o01_re += acc01_re;
2129  o01_im += acc01_im;
2130  o02_re += acc02_re;
2131  o02_im += acc02_im;
2132  o10_re += acc10_re;
2133  o10_im += acc10_im;
2134  o11_re += acc11_re;
2135  o11_im += acc11_im;
2136  o12_re += acc12_re;
2137  o12_im += acc12_im;
2138  o20_re += acc20_re;
2139  o20_im += acc20_im;
2140  o21_re += acc21_re;
2141  o21_im += acc21_im;
2142  o22_re += acc22_re;
2143  o22_im += acc22_im;
2144  o30_re += acc30_re;
2145  o30_im += acc30_im;
2146  o31_re += acc31_re;
2147  o31_im += acc31_im;
2148  o32_re += acc32_re;
2149  o32_im += acc32_im;
2150 #else
// Twisted xpay: twist the accumulator instead, then o = b*o + acc.
2151  APPLY_TWIST(-a, acc);
2152  //warning! b is unrelated to the twisted mass parameter in this case!
2153 
2154  o00_re = b*o00_re+acc00_re;
2155  o00_im = b*o00_im+acc00_im;
2156  o01_re = b*o01_re+acc01_re;
2157  o01_im = b*o01_im+acc01_im;
2158  o02_re = b*o02_re+acc02_re;
2159  o02_im = b*o02_im+acc02_im;
2160  o10_re = b*o10_re+acc10_re;
2161  o10_im = b*o10_im+acc10_im;
2162  o11_re = b*o11_re+acc11_re;
2163  o11_im = b*o11_im+acc11_im;
2164  o12_re = b*o12_re+acc12_re;
2165  o12_im = b*o12_im+acc12_im;
2166  o20_re = b*o20_re+acc20_re;
2167  o20_im = b*o20_im+acc20_im;
2168  o21_re = b*o21_re+acc21_re;
2169  o21_im = b*o21_im+acc21_im;
2170  o22_re = b*o22_re+acc22_re;
2171  o22_im = b*o22_im+acc22_im;
2172  o30_re = b*o30_re+acc30_re;
2173  o30_im = b*o30_im+acc30_im;
2174  o31_re = b*o31_re+acc31_re;
2175  o31_im = b*o31_im+acc31_im;
2176  o32_re = b*o32_re+acc32_re;
2177  o32_im = b*o32_im+acc32_im;
2178 #endif//TWIST_XPAY
2179 #else //no XPAY
// No accumulator: just apply the inverse twist if it was not already fused.
2180 #ifndef TWIST_INV_DSLASH
2181  APPLY_TWIST_INV(-a, b, o);
2182 #endif
2183 #endif
2184 }
2185 
2186 // write spinor field back to device memory
2187 WRITE_SPINOR(param.sp_stride);
2188 
// Tear down the precision-dependent register aliases so this core can be
// re-included with a different precision without macro-redefinition warnings.
2189 // undefine to prevent warning when precision is changed
2190 #undef spinorFloat
2191 #undef g00_re
2192 #undef g00_im
2193 #undef g01_re
2194 #undef g01_im
2195 #undef g02_re
2196 #undef g02_im
2197 #undef g10_re
2198 #undef g10_im
2199 #undef g11_re
2200 #undef g11_im
2201 #undef g12_re
2202 #undef g12_im
2203 #undef g20_re
2204 #undef g20_im
2205 #undef g21_re
2206 #undef g21_im
2207 #undef g22_re
2208 #undef g22_im
2209 
2210 #undef i00_re
2211 #undef i00_im
2212 #undef i01_re
2213 #undef i01_im
2214 #undef i02_re
2215 #undef i02_im
2216 #undef i10_re
2217 #undef i10_im
2218 #undef i11_re
2219 #undef i11_im
2220 #undef i12_re
2221 #undef i12_im
2222 #undef i20_re
2223 #undef i20_im
2224 #undef i21_re
2225 #undef i21_im
2226 #undef i22_re
2227 #undef i22_im
2228 #undef i30_re
2229 #undef i30_im
2230 #undef i31_re
2231 #undef i31_im
2232 #undef i32_re
2233 #undef i32_im
2234 
2235 #undef acc00_re
2236 #undef acc00_im
2237 #undef acc01_re
2238 #undef acc01_im
2239 #undef acc02_re
2240 #undef acc02_im
2241 #undef acc10_re
2242 #undef acc10_im
2243 #undef acc11_re
2244 #undef acc11_im
2245 #undef acc12_re
2246 #undef acc12_im
2247 #undef acc20_re
2248 #undef acc20_im
2249 #undef acc21_re
2250 #undef acc21_im
2251 #undef acc22_re
2252 #undef acc22_im
2253 #undef acc30_re
2254 #undef acc30_im
2255 #undef acc31_re
2256 #undef acc31_im
2257 #undef acc32_re
2258 #undef acc32_im
2259 
2260 
2261 
2262 #undef VOLATILE
#define g21_im
VOLATILE spinorFloat o32_re
#define g01_re
#define g11_im
dim3 dim3 blockDim
#define acc02_re
#define gT01_re
#define acc12_im
#define gT12_re
spinorFloat B1_re
#define gT00_re
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1379
#define i12_re
spinorFloat b0_re
#define i01_re
RECONSTRUCT_GAUGE_MATRIX(0)
#define i30_im
spinorFloat A2_im
#define acc02_im
spinorFloat A2_re
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o30_re
spinorFloat B1_im
#define g01_im
#define g12_re
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o12_im
#define i01_im
APPLY_TWIST_INV(-a, b, o)
#define acc01_im
#define acc22_re
#define acc22_im
int sp_idx
spinorFloat b1_re
#define i10_re
#define gT02_re
VOLATILE spinorFloat o21_im
#define acc11_re
#define acc30_im
#define g20_re
#define i31_im
#define g20_im
spinorFloat b1_im
#define i21_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o02_im
#define acc32_im
#define acc20_im
#define GAUGE0TEX
WRITE_SPINOR(param.sp_stride)
#define i12_im
#define gT22_im
#define gT00_im
#define gT12_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define g02_re
#define acc12_re
#define i11_im
#define g10_re
#define i20_re
spinorFloat a0_re
VOLATILE spinorFloat o01_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat B0_im
#define i22_re
#define acc10_im
#define gT21_re
spinorFloat A1_re
#define GAUGE1TEX
spinorFloat b0_im
VOLATILE spinorFloat o00_im
#define acc00_re
#define SPINORTEX
#define acc31_re
#define g22_re
#define acc20_re
VOLATILE spinorFloat o31_re
#define acc01_re
#define READ_INTERMEDIATE_SPINOR
#define gT11_im
#define i32_im
spinorFloat B0_re
#define gT11_re
VOLATILE spinorFloat o00_re
#define i10_im
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o11_im
int X[4]
Definition: quda.h:29
#define gT21_im
#define i22_im
spinorFloat b2_im
#define gT01_im
#define gT22_re
spinorFloat a0_im
#define g10_im
VOLATILE spinorFloat o12_re
#define gT20_re
#define READ_SPINOR_GHOST
VOLATILE spinorFloat o11_re
#define i11_re
#define g00_re
spinorFloat A1_im
VOLATILE spinorFloat o02_re
#define g22_im
spinorFloat a2_im
#define i00_re
VOLATILE spinorFloat o10_re
spinorFloat B2_re
#define acc21_im
spinorFloat b2_re
#define gT10_im
VOLATILE spinorFloat o32_im
#define gT02_im
#define g12_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o21_re
#define i00_im
#define spinorFloat
#define acc31_im
#define INTERTEX
#define g00_im
#define acc30_re
VOLATILE spinorFloat o01_im
#define i31_re
int face_idx
const int ga_idx
VOLATILE spinorFloat o22_im
#define i32_re
#define acc32_re
#define g02_im
const int face_num
#define TPROJSCALE
VOLATILE spinorFloat o20_re
#define GHOSTSPINORTEX
spinorFloat A0_re
spinorFloat a1_im
#define g21_re
spinorFloat a1_re
#define acc21_re
#define acc00_im
#define VOLATILE
spinorFloat A0_im
spinorFloat a2_re
#define i21_im
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
#define gT10_re
#define acc10_re
#define i30_re
#define a
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i02_im
VOLATILE spinorFloat o30_im
#define i20_im
const spinorFloat b
#define gT20_im
#define g11_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
#define acc11_im
spinorFloat B2_im
#define i02_re