QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_dslash_dagger_g80_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
// Number of output-spinor reals each thread keeps in shared memory:
// the o00_re..o30_re macros below index 19 SHARED_STRIDE-strided slots.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
4 
5 
// The NVVM back end (CUDA >= 4.1 targeting sm_20+) does not require the
// shared-memory output accesses to be marked volatile; the older Open64
// back end does, hence the conditional qualifier.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Double precision: the 24 real spinor components arrive in twelve double2
// registers I0..I11 (.x = real part, .y = imaginary part). The alias name
// encodes spin then colour: i<spin><colour>_re / _im.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
// Accumulator spinor aliases over the accum0..accum11 double2 registers,
// same spin/colour layout as the i* aliases above.
// NOTE(review): accum* is not referenced in the visible portion of this
// fragment; presumably read by an xpay/accumulate path further down the
// generated file -- confirm against the full QUDA source.
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
// Single/half precision: the 24 real components are packed four-per-register
// into six float4 registers I0..I5 (.x/.y/.z/.w), same spin/colour ordering.
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
// Accumulator spinor aliases, float4 packing over accum0..accum5.
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Aliases mapping the 3x3 SU(3) link-matrix entries g<row><col>_re/_im onto
// the registers filled by READ_GAUGE_MATRIX.
// FLOAT2/double2 storage: one complex entry per register, G0..G8.
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
// float4 storage: entries packed four-reals-per-register into G0..G4.
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// Hermitian conjugate of the link matrix: gT[i][j] = conj(g[j][i]).
// Used when hopping in the negative direction; expands to the g* aliases
// above with transposed indices and negated imaginary parts.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // output spinor
// Output spinor components live in shared memory, one real per
// SHARED_STRIDE-strided slot of the per-thread pointer s (19 slots,
// matching DSLASH_SHARED_FLOATS_PER_THREAD).
178 #define o00_re s[0*SHARED_STRIDE]
179 #define o00_im s[1*SHARED_STRIDE]
180 #define o01_re s[2*SHARED_STRIDE]
181 #define o01_im s[3*SHARED_STRIDE]
182 #define o02_re s[4*SHARED_STRIDE]
183 #define o02_im s[5*SHARED_STRIDE]
184 #define o10_re s[6*SHARED_STRIDE]
185 #define o10_im s[7*SHARED_STRIDE]
186 #define o11_re s[8*SHARED_STRIDE]
187 #define o11_im s[9*SHARED_STRIDE]
188 #define o12_re s[10*SHARED_STRIDE]
189 #define o12_im s[11*SHARED_STRIDE]
190 #define o20_re s[12*SHARED_STRIDE]
191 #define o20_im s[13*SHARED_STRIDE]
192 #define o21_re s[14*SHARED_STRIDE]
193 #define o21_im s[15*SHARED_STRIDE]
194 #define o22_re s[16*SHARED_STRIDE]
195 #define o22_im s[17*SHARED_STRIDE]
196 #define o30_re s[18*SHARED_STRIDE]
// NOTE(review): generated lines 197-201 are missing from this extract;
// presumably they define o30_im, o31_re/_im and o32_re/_im (likely as
// register variables, since only 19 reals per thread fit in shared
// memory) -- restore from the original QUDA source, do not guess here.
202 
// Stride between consecutive components of one thread's output spinor:
// 8 doubles or 16 floats, i.e. 64 bytes either way, chosen so warps do
// not generate shared-memory bank conflicts on G80/GT200.
203 #ifdef SPINOR_DOUBLE
204 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
205 #else
206 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
207 #endif
208 
// Dynamically-sized shared-memory arena holding every thread's output spinor.
209 extern __shared__ char s_data[];
210 
// NOTE(review): generated lines 210-211 are missing from this extract; the
// orphaned "+ (threadIdx.x % SHARED_STRIDE);" below is the tail of the
// per-thread pointer initialisation (presumably
//   VOLATILE spinorFloat *s = (spinorFloat*)s_data
//     + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
// ) -- confirm and restore from the original QUDA source.
212  + (threadIdx.x % SHARED_STRIDE);
213 
214 #include "read_gauge.h"
215 #include "io_spinor.h"
216 
217 int x1, x2, x3, x4;
218 int X;
219 
220 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
221 int sp_norm_idx;
222 #endif // MULTI_GPU half precision
223 
224 int sid;
225 
226 #ifdef MULTI_GPU
227 int face_idx;
229 #endif
230 
231  sid = blockIdx.x*blockDim.x + threadIdx.x;
232  if (sid >= param.threads) return;
233 
234  // Inline by hand for the moment and assume even dimensions
235  const int dims[] = {X1, X2, X3, X4};
236  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
237 
238  o00_re = 0; o00_im = 0;
239  o01_re = 0; o01_im = 0;
240  o02_re = 0; o02_im = 0;
241  o10_re = 0; o10_im = 0;
242  o11_re = 0; o11_im = 0;
243  o12_re = 0; o12_im = 0;
244  o20_re = 0; o20_im = 0;
245  o21_re = 0; o21_im = 0;
246  o22_re = 0; o22_im = 0;
247  o30_re = 0; o30_im = 0;
248  o31_re = 0; o31_im = 0;
249  o32_re = 0; o32_im = 0;
250 
251 #ifdef MULTI_GPU
252 } else { // exterior kernel
253 
254  sid = blockIdx.x*blockDim.x + threadIdx.x;
255  if (sid >= param.threads) return;
256 
257  const int dim = static_cast<int>(kernel_type);
258  const int face_volume = (param.threads >> 1); // volume of one face
259  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
260  face_idx = sid - face_num*face_volume; // index into the respective face
261 
262  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
263  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
264  //sp_idx = face_idx + param.ghostOffset[dim];
265 
266 #if (DD_PREC==2) // half precision
267  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
268 #endif
269 
270  const int dims[] = {X1, X2, X3, X4};
271  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
272 
273  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
274 
275  o00_re = i00_re; o00_im = i00_im;
276  o01_re = i01_re; o01_im = i01_im;
277  o02_re = i02_re; o02_im = i02_im;
278  o10_re = i10_re; o10_im = i10_im;
279  o11_re = i11_re; o11_im = i11_im;
280  o12_re = i12_re; o12_im = i12_im;
281  o20_re = i20_re; o20_im = i20_im;
282  o21_re = i21_re; o21_im = i21_im;
283  o22_re = i22_re; o22_im = i22_im;
284  o30_re = i30_re; o30_im = i30_im;
285  o31_re = i31_re; o31_im = i31_im;
286  o32_re = i32_re; o32_im = i32_im;
287 }
288 #endif // MULTI_GPU
289 
290 
291 #ifdef MULTI_GPU
292 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
293  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
294 #endif
295 {
296  // Projector P0+
297  // 1 0 0 i
298  // 0 1 i 0
299  // 0 -i 1 0
300  // -i 0 0 1
301 
302 #ifdef MULTI_GPU
303  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
304  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
305 #else
306  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
307 #endif
308 
309  const int ga_idx = sid;
310 
317 
318 #ifdef MULTI_GPU
319  if (kernel_type == INTERIOR_KERNEL) {
320 #endif
321 
322  // read spinor from device memory
323  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
324 #ifdef TWIST_INV_DSLASH
325  APPLY_TWIST_INV(-a, b, i);
326 #endif
327 
328  // project spinor into half spinors
329  a0_re = +i00_re-i30_im;
330  a0_im = +i00_im+i30_re;
331  a1_re = +i01_re-i31_im;
332  a1_im = +i01_im+i31_re;
333  a2_re = +i02_re-i32_im;
334  a2_im = +i02_im+i32_re;
335  b0_re = +i10_re-i20_im;
336  b0_im = +i10_im+i20_re;
337  b1_re = +i11_re-i21_im;
338  b1_im = +i11_im+i21_re;
339  b2_re = +i12_re-i22_im;
340  b2_im = +i12_im+i22_re;
341 
342 #ifdef MULTI_GPU
343  } else {
344 
345  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
346 
347  // read half spinor from device memory
348  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
349 
350  a0_re = i00_re; a0_im = i00_im;
351  a1_re = i01_re; a1_im = i01_im;
352  a2_re = i02_re; a2_im = i02_im;
353  b0_re = i10_re; b0_im = i10_im;
354  b1_re = i11_re; b1_im = i11_im;
355  b2_re = i12_re; b2_im = i12_im;
356 
357  }
358 #endif // MULTI_GPU
359 
360  // read gauge matrix from device memory
361  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
362 
363  // reconstruct gauge matrix
365 
366  // multiply row 0
368  A0_re += g00_re * a0_re;
369  A0_re -= g00_im * a0_im;
370  A0_re += g01_re * a1_re;
371  A0_re -= g01_im * a1_im;
372  A0_re += g02_re * a2_re;
373  A0_re -= g02_im * a2_im;
375  A0_im += g00_re * a0_im;
376  A0_im += g00_im * a0_re;
377  A0_im += g01_re * a1_im;
378  A0_im += g01_im * a1_re;
379  A0_im += g02_re * a2_im;
380  A0_im += g02_im * a2_re;
382  B0_re += g00_re * b0_re;
383  B0_re -= g00_im * b0_im;
384  B0_re += g01_re * b1_re;
385  B0_re -= g01_im * b1_im;
386  B0_re += g02_re * b2_re;
387  B0_re -= g02_im * b2_im;
389  B0_im += g00_re * b0_im;
390  B0_im += g00_im * b0_re;
391  B0_im += g01_re * b1_im;
392  B0_im += g01_im * b1_re;
393  B0_im += g02_re * b2_im;
394  B0_im += g02_im * b2_re;
395 
396  // multiply row 1
398  A1_re += g10_re * a0_re;
399  A1_re -= g10_im * a0_im;
400  A1_re += g11_re * a1_re;
401  A1_re -= g11_im * a1_im;
402  A1_re += g12_re * a2_re;
403  A1_re -= g12_im * a2_im;
405  A1_im += g10_re * a0_im;
406  A1_im += g10_im * a0_re;
407  A1_im += g11_re * a1_im;
408  A1_im += g11_im * a1_re;
409  A1_im += g12_re * a2_im;
410  A1_im += g12_im * a2_re;
412  B1_re += g10_re * b0_re;
413  B1_re -= g10_im * b0_im;
414  B1_re += g11_re * b1_re;
415  B1_re -= g11_im * b1_im;
416  B1_re += g12_re * b2_re;
417  B1_re -= g12_im * b2_im;
419  B1_im += g10_re * b0_im;
420  B1_im += g10_im * b0_re;
421  B1_im += g11_re * b1_im;
422  B1_im += g11_im * b1_re;
423  B1_im += g12_re * b2_im;
424  B1_im += g12_im * b2_re;
425 
426  // multiply row 2
428  A2_re += g20_re * a0_re;
429  A2_re -= g20_im * a0_im;
430  A2_re += g21_re * a1_re;
431  A2_re -= g21_im * a1_im;
432  A2_re += g22_re * a2_re;
433  A2_re -= g22_im * a2_im;
435  A2_im += g20_re * a0_im;
436  A2_im += g20_im * a0_re;
437  A2_im += g21_re * a1_im;
438  A2_im += g21_im * a1_re;
439  A2_im += g22_re * a2_im;
440  A2_im += g22_im * a2_re;
442  B2_re += g20_re * b0_re;
443  B2_re -= g20_im * b0_im;
444  B2_re += g21_re * b1_re;
445  B2_re -= g21_im * b1_im;
446  B2_re += g22_re * b2_re;
447  B2_re -= g22_im * b2_im;
449  B2_im += g20_re * b0_im;
450  B2_im += g20_im * b0_re;
451  B2_im += g21_re * b1_im;
452  B2_im += g21_im * b1_re;
453  B2_im += g22_re * b2_im;
454  B2_im += g22_im * b2_re;
455 
456  o00_re += A0_re;
457  o00_im += A0_im;
458  o10_re += B0_re;
459  o10_im += B0_im;
460  o20_re += B0_im;
461  o20_im -= B0_re;
462  o30_re += A0_im;
463  o30_im -= A0_re;
464 
465  o01_re += A1_re;
466  o01_im += A1_im;
467  o11_re += B1_re;
468  o11_im += B1_im;
469  o21_re += B1_im;
470  o21_im -= B1_re;
471  o31_re += A1_im;
472  o31_im -= A1_re;
473 
474  o02_re += A2_re;
475  o02_im += A2_im;
476  o12_re += B2_re;
477  o12_im += B2_im;
478  o22_re += B2_im;
479  o22_im -= B2_re;
480  o32_re += A2_im;
481  o32_im -= A2_re;
482 
483 }
484 
485 #ifdef MULTI_GPU
486 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
487  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
488 #endif
489 {
490  // Projector P0-
491  // 1 0 0 -i
492  // 0 1 -i 0
493  // 0 i 1 0
494  // i 0 0 1
495 
496 #ifdef MULTI_GPU
497  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
498  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
499 #else
500  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
501 #endif
502 
503 #ifdef MULTI_GPU
504  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
505 #else
506  const int ga_idx = sp_idx;
507 #endif
508 
515 
516 #ifdef MULTI_GPU
517  if (kernel_type == INTERIOR_KERNEL) {
518 #endif
519 
520  // read spinor from device memory
521  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
522 #ifdef TWIST_INV_DSLASH
523  APPLY_TWIST_INV(-a, b, i);
524 #endif
525 
526  // project spinor into half spinors
527  a0_re = +i00_re+i30_im;
528  a0_im = +i00_im-i30_re;
529  a1_re = +i01_re+i31_im;
530  a1_im = +i01_im-i31_re;
531  a2_re = +i02_re+i32_im;
532  a2_im = +i02_im-i32_re;
533  b0_re = +i10_re+i20_im;
534  b0_im = +i10_im-i20_re;
535  b1_re = +i11_re+i21_im;
536  b1_im = +i11_im-i21_re;
537  b2_re = +i12_re+i22_im;
538  b2_im = +i12_im-i22_re;
539 
540 #ifdef MULTI_GPU
541  } else {
542 
543  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
544 
545  // read half spinor from device memory
546  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
547 
548  a0_re = i00_re; a0_im = i00_im;
549  a1_re = i01_re; a1_im = i01_im;
550  a2_re = i02_re; a2_im = i02_im;
551  b0_re = i10_re; b0_im = i10_im;
552  b1_re = i11_re; b1_im = i11_im;
553  b2_re = i12_re; b2_im = i12_im;
554 
555  }
556 #endif // MULTI_GPU
557 
558  // read gauge matrix from device memory
559  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
560 
561  // reconstruct gauge matrix
563 
564  // multiply row 0
565  spinorFloat A0_re = 0;
566  A0_re += gT00_re * a0_re;
567  A0_re -= gT00_im * a0_im;
568  A0_re += gT01_re * a1_re;
569  A0_re -= gT01_im * a1_im;
570  A0_re += gT02_re * a2_re;
571  A0_re -= gT02_im * a2_im;
572  spinorFloat A0_im = 0;
573  A0_im += gT00_re * a0_im;
574  A0_im += gT00_im * a0_re;
575  A0_im += gT01_re * a1_im;
576  A0_im += gT01_im * a1_re;
577  A0_im += gT02_re * a2_im;
578  A0_im += gT02_im * a2_re;
579  spinorFloat B0_re = 0;
580  B0_re += gT00_re * b0_re;
581  B0_re -= gT00_im * b0_im;
582  B0_re += gT01_re * b1_re;
583  B0_re -= gT01_im * b1_im;
584  B0_re += gT02_re * b2_re;
585  B0_re -= gT02_im * b2_im;
586  spinorFloat B0_im = 0;
587  B0_im += gT00_re * b0_im;
588  B0_im += gT00_im * b0_re;
589  B0_im += gT01_re * b1_im;
590  B0_im += gT01_im * b1_re;
591  B0_im += gT02_re * b2_im;
592  B0_im += gT02_im * b2_re;
593 
594  // multiply row 1
595  spinorFloat A1_re = 0;
596  A1_re += gT10_re * a0_re;
597  A1_re -= gT10_im * a0_im;
598  A1_re += gT11_re * a1_re;
599  A1_re -= gT11_im * a1_im;
600  A1_re += gT12_re * a2_re;
601  A1_re -= gT12_im * a2_im;
602  spinorFloat A1_im = 0;
603  A1_im += gT10_re * a0_im;
604  A1_im += gT10_im * a0_re;
605  A1_im += gT11_re * a1_im;
606  A1_im += gT11_im * a1_re;
607  A1_im += gT12_re * a2_im;
608  A1_im += gT12_im * a2_re;
609  spinorFloat B1_re = 0;
610  B1_re += gT10_re * b0_re;
611  B1_re -= gT10_im * b0_im;
612  B1_re += gT11_re * b1_re;
613  B1_re -= gT11_im * b1_im;
614  B1_re += gT12_re * b2_re;
615  B1_re -= gT12_im * b2_im;
616  spinorFloat B1_im = 0;
617  B1_im += gT10_re * b0_im;
618  B1_im += gT10_im * b0_re;
619  B1_im += gT11_re * b1_im;
620  B1_im += gT11_im * b1_re;
621  B1_im += gT12_re * b2_im;
622  B1_im += gT12_im * b2_re;
623 
624  // multiply row 2
625  spinorFloat A2_re = 0;
626  A2_re += gT20_re * a0_re;
627  A2_re -= gT20_im * a0_im;
628  A2_re += gT21_re * a1_re;
629  A2_re -= gT21_im * a1_im;
630  A2_re += gT22_re * a2_re;
631  A2_re -= gT22_im * a2_im;
632  spinorFloat A2_im = 0;
633  A2_im += gT20_re * a0_im;
634  A2_im += gT20_im * a0_re;
635  A2_im += gT21_re * a1_im;
636  A2_im += gT21_im * a1_re;
637  A2_im += gT22_re * a2_im;
638  A2_im += gT22_im * a2_re;
639  spinorFloat B2_re = 0;
640  B2_re += gT20_re * b0_re;
641  B2_re -= gT20_im * b0_im;
642  B2_re += gT21_re * b1_re;
643  B2_re -= gT21_im * b1_im;
644  B2_re += gT22_re * b2_re;
645  B2_re -= gT22_im * b2_im;
646  spinorFloat B2_im = 0;
647  B2_im += gT20_re * b0_im;
648  B2_im += gT20_im * b0_re;
649  B2_im += gT21_re * b1_im;
650  B2_im += gT21_im * b1_re;
651  B2_im += gT22_re * b2_im;
652  B2_im += gT22_im * b2_re;
653 
654  o00_re += A0_re;
655  o00_im += A0_im;
656  o10_re += B0_re;
657  o10_im += B0_im;
658  o20_re -= B0_im;
659  o20_im += B0_re;
660  o30_re -= A0_im;
661  o30_im += A0_re;
662 
663  o01_re += A1_re;
664  o01_im += A1_im;
665  o11_re += B1_re;
666  o11_im += B1_im;
667  o21_re -= B1_im;
668  o21_im += B1_re;
669  o31_re -= A1_im;
670  o31_im += A1_re;
671 
672  o02_re += A2_re;
673  o02_im += A2_im;
674  o12_re += B2_re;
675  o12_im += B2_im;
676  o22_re -= B2_im;
677  o22_im += B2_re;
678  o32_re -= A2_im;
679  o32_im += A2_re;
680 
681 }
682 
683 #ifdef MULTI_GPU
684 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
685  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
686 #endif
687 {
688  // Projector P1+
689  // 1 0 0 1
690  // 0 1 -1 0
691  // 0 -1 1 0
692  // 1 0 0 1
693 
694 #ifdef MULTI_GPU
695  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
696  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
697 #else
698  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
699 #endif
700 
701  const int ga_idx = sid;
702 
709 
710 #ifdef MULTI_GPU
711  if (kernel_type == INTERIOR_KERNEL) {
712 #endif
713 
714  // read spinor from device memory
715  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
716 #ifdef TWIST_INV_DSLASH
717  APPLY_TWIST_INV(-a, b, i);
718 #endif
719 
720  // project spinor into half spinors
721  a0_re = +i00_re+i30_re;
722  a0_im = +i00_im+i30_im;
723  a1_re = +i01_re+i31_re;
724  a1_im = +i01_im+i31_im;
725  a2_re = +i02_re+i32_re;
726  a2_im = +i02_im+i32_im;
727  b0_re = +i10_re-i20_re;
728  b0_im = +i10_im-i20_im;
729  b1_re = +i11_re-i21_re;
730  b1_im = +i11_im-i21_im;
731  b2_re = +i12_re-i22_re;
732  b2_im = +i12_im-i22_im;
733 
734 #ifdef MULTI_GPU
735  } else {
736 
737  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
738 
739  // read half spinor from device memory
740  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
741 
742  a0_re = i00_re; a0_im = i00_im;
743  a1_re = i01_re; a1_im = i01_im;
744  a2_re = i02_re; a2_im = i02_im;
745  b0_re = i10_re; b0_im = i10_im;
746  b1_re = i11_re; b1_im = i11_im;
747  b2_re = i12_re; b2_im = i12_im;
748 
749  }
750 #endif // MULTI_GPU
751 
752  // read gauge matrix from device memory
753  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
754 
755  // reconstruct gauge matrix
757 
758  // multiply row 0
759  spinorFloat A0_re = 0;
760  A0_re += g00_re * a0_re;
761  A0_re -= g00_im * a0_im;
762  A0_re += g01_re * a1_re;
763  A0_re -= g01_im * a1_im;
764  A0_re += g02_re * a2_re;
765  A0_re -= g02_im * a2_im;
766  spinorFloat A0_im = 0;
767  A0_im += g00_re * a0_im;
768  A0_im += g00_im * a0_re;
769  A0_im += g01_re * a1_im;
770  A0_im += g01_im * a1_re;
771  A0_im += g02_re * a2_im;
772  A0_im += g02_im * a2_re;
773  spinorFloat B0_re = 0;
774  B0_re += g00_re * b0_re;
775  B0_re -= g00_im * b0_im;
776  B0_re += g01_re * b1_re;
777  B0_re -= g01_im * b1_im;
778  B0_re += g02_re * b2_re;
779  B0_re -= g02_im * b2_im;
780  spinorFloat B0_im = 0;
781  B0_im += g00_re * b0_im;
782  B0_im += g00_im * b0_re;
783  B0_im += g01_re * b1_im;
784  B0_im += g01_im * b1_re;
785  B0_im += g02_re * b2_im;
786  B0_im += g02_im * b2_re;
787 
788  // multiply row 1
789  spinorFloat A1_re = 0;
790  A1_re += g10_re * a0_re;
791  A1_re -= g10_im * a0_im;
792  A1_re += g11_re * a1_re;
793  A1_re -= g11_im * a1_im;
794  A1_re += g12_re * a2_re;
795  A1_re -= g12_im * a2_im;
796  spinorFloat A1_im = 0;
797  A1_im += g10_re * a0_im;
798  A1_im += g10_im * a0_re;
799  A1_im += g11_re * a1_im;
800  A1_im += g11_im * a1_re;
801  A1_im += g12_re * a2_im;
802  A1_im += g12_im * a2_re;
803  spinorFloat B1_re = 0;
804  B1_re += g10_re * b0_re;
805  B1_re -= g10_im * b0_im;
806  B1_re += g11_re * b1_re;
807  B1_re -= g11_im * b1_im;
808  B1_re += g12_re * b2_re;
809  B1_re -= g12_im * b2_im;
810  spinorFloat B1_im = 0;
811  B1_im += g10_re * b0_im;
812  B1_im += g10_im * b0_re;
813  B1_im += g11_re * b1_im;
814  B1_im += g11_im * b1_re;
815  B1_im += g12_re * b2_im;
816  B1_im += g12_im * b2_re;
817 
818  // multiply row 2
819  spinorFloat A2_re = 0;
820  A2_re += g20_re * a0_re;
821  A2_re -= g20_im * a0_im;
822  A2_re += g21_re * a1_re;
823  A2_re -= g21_im * a1_im;
824  A2_re += g22_re * a2_re;
825  A2_re -= g22_im * a2_im;
826  spinorFloat A2_im = 0;
827  A2_im += g20_re * a0_im;
828  A2_im += g20_im * a0_re;
829  A2_im += g21_re * a1_im;
830  A2_im += g21_im * a1_re;
831  A2_im += g22_re * a2_im;
832  A2_im += g22_im * a2_re;
833  spinorFloat B2_re = 0;
834  B2_re += g20_re * b0_re;
835  B2_re -= g20_im * b0_im;
836  B2_re += g21_re * b1_re;
837  B2_re -= g21_im * b1_im;
838  B2_re += g22_re * b2_re;
839  B2_re -= g22_im * b2_im;
840  spinorFloat B2_im = 0;
841  B2_im += g20_re * b0_im;
842  B2_im += g20_im * b0_re;
843  B2_im += g21_re * b1_im;
844  B2_im += g21_im * b1_re;
845  B2_im += g22_re * b2_im;
846  B2_im += g22_im * b2_re;
847 
848  o00_re += A0_re;
849  o00_im += A0_im;
850  o10_re += B0_re;
851  o10_im += B0_im;
852  o20_re -= B0_re;
853  o20_im -= B0_im;
854  o30_re += A0_re;
855  o30_im += A0_im;
856 
857  o01_re += A1_re;
858  o01_im += A1_im;
859  o11_re += B1_re;
860  o11_im += B1_im;
861  o21_re -= B1_re;
862  o21_im -= B1_im;
863  o31_re += A1_re;
864  o31_im += A1_im;
865 
866  o02_re += A2_re;
867  o02_im += A2_im;
868  o12_re += B2_re;
869  o12_im += B2_im;
870  o22_re -= B2_re;
871  o22_im -= B2_im;
872  o32_re += A2_re;
873  o32_im += A2_im;
874 
875 }
876 
877 #ifdef MULTI_GPU
878 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
879  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
880 #endif
881 {
882  // Projector P1-
883  // 1 0 0 -1
884  // 0 1 1 0
885  // 0 1 1 0
886  // -1 0 0 1
887 
888 #ifdef MULTI_GPU
889  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
890  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
891 #else
892  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
893 #endif
894 
895 #ifdef MULTI_GPU
896  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
897 #else
898  const int ga_idx = sp_idx;
899 #endif
900 
907 
908 #ifdef MULTI_GPU
909  if (kernel_type == INTERIOR_KERNEL) {
910 #endif
911 
912  // read spinor from device memory
913  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
914 #ifdef TWIST_INV_DSLASH
915  APPLY_TWIST_INV(-a, b, i);
916 #endif
917 
918  // project spinor into half spinors
919  a0_re = +i00_re-i30_re;
920  a0_im = +i00_im-i30_im;
921  a1_re = +i01_re-i31_re;
922  a1_im = +i01_im-i31_im;
923  a2_re = +i02_re-i32_re;
924  a2_im = +i02_im-i32_im;
925  b0_re = +i10_re+i20_re;
926  b0_im = +i10_im+i20_im;
927  b1_re = +i11_re+i21_re;
928  b1_im = +i11_im+i21_im;
929  b2_re = +i12_re+i22_re;
930  b2_im = +i12_im+i22_im;
931 
932 #ifdef MULTI_GPU
933  } else {
934 
935  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
936 
937  // read half spinor from device memory
938  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
939 
940  a0_re = i00_re; a0_im = i00_im;
941  a1_re = i01_re; a1_im = i01_im;
942  a2_re = i02_re; a2_im = i02_im;
943  b0_re = i10_re; b0_im = i10_im;
944  b1_re = i11_re; b1_im = i11_im;
945  b2_re = i12_re; b2_im = i12_im;
946 
947  }
948 #endif // MULTI_GPU
949 
950  // read gauge matrix from device memory
951  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
952 
953  // reconstruct gauge matrix
955 
956  // multiply row 0
957  spinorFloat A0_re = 0;
958  A0_re += gT00_re * a0_re;
959  A0_re -= gT00_im * a0_im;
960  A0_re += gT01_re * a1_re;
961  A0_re -= gT01_im * a1_im;
962  A0_re += gT02_re * a2_re;
963  A0_re -= gT02_im * a2_im;
964  spinorFloat A0_im = 0;
965  A0_im += gT00_re * a0_im;
966  A0_im += gT00_im * a0_re;
967  A0_im += gT01_re * a1_im;
968  A0_im += gT01_im * a1_re;
969  A0_im += gT02_re * a2_im;
970  A0_im += gT02_im * a2_re;
971  spinorFloat B0_re = 0;
972  B0_re += gT00_re * b0_re;
973  B0_re -= gT00_im * b0_im;
974  B0_re += gT01_re * b1_re;
975  B0_re -= gT01_im * b1_im;
976  B0_re += gT02_re * b2_re;
977  B0_re -= gT02_im * b2_im;
978  spinorFloat B0_im = 0;
979  B0_im += gT00_re * b0_im;
980  B0_im += gT00_im * b0_re;
981  B0_im += gT01_re * b1_im;
982  B0_im += gT01_im * b1_re;
983  B0_im += gT02_re * b2_im;
984  B0_im += gT02_im * b2_re;
985 
986  // multiply row 1
987  spinorFloat A1_re = 0;
988  A1_re += gT10_re * a0_re;
989  A1_re -= gT10_im * a0_im;
990  A1_re += gT11_re * a1_re;
991  A1_re -= gT11_im * a1_im;
992  A1_re += gT12_re * a2_re;
993  A1_re -= gT12_im * a2_im;
994  spinorFloat A1_im = 0;
995  A1_im += gT10_re * a0_im;
996  A1_im += gT10_im * a0_re;
997  A1_im += gT11_re * a1_im;
998  A1_im += gT11_im * a1_re;
999  A1_im += gT12_re * a2_im;
1000  A1_im += gT12_im * a2_re;
1001  spinorFloat B1_re = 0;
1002  B1_re += gT10_re * b0_re;
1003  B1_re -= gT10_im * b0_im;
1004  B1_re += gT11_re * b1_re;
1005  B1_re -= gT11_im * b1_im;
1006  B1_re += gT12_re * b2_re;
1007  B1_re -= gT12_im * b2_im;
1008  spinorFloat B1_im = 0;
1009  B1_im += gT10_re * b0_im;
1010  B1_im += gT10_im * b0_re;
1011  B1_im += gT11_re * b1_im;
1012  B1_im += gT11_im * b1_re;
1013  B1_im += gT12_re * b2_im;
1014  B1_im += gT12_im * b2_re;
1015 
1016  // multiply row 2
1017  spinorFloat A2_re = 0;
1018  A2_re += gT20_re * a0_re;
1019  A2_re -= gT20_im * a0_im;
1020  A2_re += gT21_re * a1_re;
1021  A2_re -= gT21_im * a1_im;
1022  A2_re += gT22_re * a2_re;
1023  A2_re -= gT22_im * a2_im;
1024  spinorFloat A2_im = 0;
1025  A2_im += gT20_re * a0_im;
1026  A2_im += gT20_im * a0_re;
1027  A2_im += gT21_re * a1_im;
1028  A2_im += gT21_im * a1_re;
1029  A2_im += gT22_re * a2_im;
1030  A2_im += gT22_im * a2_re;
1031  spinorFloat B2_re = 0;
1032  B2_re += gT20_re * b0_re;
1033  B2_re -= gT20_im * b0_im;
1034  B2_re += gT21_re * b1_re;
1035  B2_re -= gT21_im * b1_im;
1036  B2_re += gT22_re * b2_re;
1037  B2_re -= gT22_im * b2_im;
1038  spinorFloat B2_im = 0;
1039  B2_im += gT20_re * b0_im;
1040  B2_im += gT20_im * b0_re;
1041  B2_im += gT21_re * b1_im;
1042  B2_im += gT21_im * b1_re;
1043  B2_im += gT22_re * b2_im;
1044  B2_im += gT22_im * b2_re;
1045 
1046  o00_re += A0_re;
1047  o00_im += A0_im;
1048  o10_re += B0_re;
1049  o10_im += B0_im;
1050  o20_re += B0_re;
1051  o20_im += B0_im;
1052  o30_re -= A0_re;
1053  o30_im -= A0_im;
1054 
1055  o01_re += A1_re;
1056  o01_im += A1_im;
1057  o11_re += B1_re;
1058  o11_im += B1_im;
1059  o21_re += B1_re;
1060  o21_im += B1_im;
1061  o31_re -= A1_re;
1062  o31_im -= A1_im;
1063 
1064  o02_re += A2_re;
1065  o02_im += A2_im;
1066  o12_re += B2_re;
1067  o12_im += B2_im;
1068  o22_re += B2_re;
1069  o22_im += B2_im;
1070  o32_re -= A2_re;
1071  o32_im -= A2_im;
1072 
1073 }
1074 
1075 #ifdef MULTI_GPU
1076 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1077  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1078 #endif
1079 {
1080  // Projector P2+
1081  // 1 0 i 0
1082  // 0 1 0 -i
1083  // -i 0 1 0
1084  // 0 i 0 1
1085 
1086 #ifdef MULTI_GPU
1087  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1088  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1089 #else
1090  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1091 #endif
1092 
1093  const int ga_idx = sid;
1094 
1101 
1102 #ifdef MULTI_GPU
1103  if (kernel_type == INTERIOR_KERNEL) {
1104 #endif
1105 
1106  // read spinor from device memory
1107  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1108 #ifdef TWIST_INV_DSLASH
1109  APPLY_TWIST_INV(-a, b, i);
1110 #endif
1111 
1112  // project spinor into half spinors
1113  a0_re = +i00_re-i20_im;
1114  a0_im = +i00_im+i20_re;
1115  a1_re = +i01_re-i21_im;
1116  a1_im = +i01_im+i21_re;
1117  a2_re = +i02_re-i22_im;
1118  a2_im = +i02_im+i22_re;
1119  b0_re = +i10_re+i30_im;
1120  b0_im = +i10_im-i30_re;
1121  b1_re = +i11_re+i31_im;
1122  b1_im = +i11_im-i31_re;
1123  b2_re = +i12_re+i32_im;
1124  b2_im = +i12_im-i32_re;
1125 
1126 #ifdef MULTI_GPU
1127  } else {
1128 
1129  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1130 
1131  // read half spinor from device memory
1132  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1133 
1134  a0_re = i00_re; a0_im = i00_im;
// --- Tail of the Z-direction (dim 2) forward hop: the opening brace and
// --- projector comment for this neighbor are above the visible chunk.
// --- Exterior-kernel path: the ghost half spinor was already projected on
// --- the sending rank, so it is copied straight into (a,b).
1135  a1_re = i01_re; a1_im = i01_im;
1136  a2_re = i02_re; a2_im = i02_im;
1137  b0_re = i10_re; b0_im = i10_im;
1138  b1_re = i11_re; b1_im = i11_im;
1139  b2_re = i12_re; b2_im = i12_im;
1140 
1141  }
1142 #endif // MULTI_GPU
1143 
1144  // read gauge matrix from device memory
1145  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1146 
1147  // reconstruct gauge matrix
// NOTE(review): listing line 1148 (presumably RECONSTRUCT_GAUGE_MATRIX(4))
// was dropped by the HTML extraction — restore from the generated source.
1149 
// Complex 3x3 gauge-matrix multiply of both half-spinor columns:
// A = G * a, B = G * b (row by row, real/imag parts interleaved).
1150  // multiply row 0
1151  spinorFloat A0_re = 0;
1152  A0_re += g00_re * a0_re;
1153  A0_re -= g00_im * a0_im;
1154  A0_re += g01_re * a1_re;
1155  A0_re -= g01_im * a1_im;
1156  A0_re += g02_re * a2_re;
1157  A0_re -= g02_im * a2_im;
1158  spinorFloat A0_im = 0;
1159  A0_im += g00_re * a0_im;
1160  A0_im += g00_im * a0_re;
1161  A0_im += g01_re * a1_im;
1162  A0_im += g01_im * a1_re;
1163  A0_im += g02_re * a2_im;
1164  A0_im += g02_im * a2_re;
1165  spinorFloat B0_re = 0;
1166  B0_re += g00_re * b0_re;
1167  B0_re -= g00_im * b0_im;
1168  B0_re += g01_re * b1_re;
1169  B0_re -= g01_im * b1_im;
1170  B0_re += g02_re * b2_re;
1171  B0_re -= g02_im * b2_im;
1172  spinorFloat B0_im = 0;
1173  B0_im += g00_re * b0_im;
1174  B0_im += g00_im * b0_re;
1175  B0_im += g01_re * b1_im;
1176  B0_im += g01_im * b1_re;
1177  B0_im += g02_re * b2_im;
1178  B0_im += g02_im * b2_re;
1179 
1180  // multiply row 1
1181  spinorFloat A1_re = 0;
1182  A1_re += g10_re * a0_re;
1183  A1_re -= g10_im * a0_im;
1184  A1_re += g11_re * a1_re;
1185  A1_re -= g11_im * a1_im;
1186  A1_re += g12_re * a2_re;
1187  A1_re -= g12_im * a2_im;
1188  spinorFloat A1_im = 0;
1189  A1_im += g10_re * a0_im;
1190  A1_im += g10_im * a0_re;
1191  A1_im += g11_re * a1_im;
1192  A1_im += g11_im * a1_re;
1193  A1_im += g12_re * a2_im;
1194  A1_im += g12_im * a2_re;
1195  spinorFloat B1_re = 0;
1196  B1_re += g10_re * b0_re;
1197  B1_re -= g10_im * b0_im;
1198  B1_re += g11_re * b1_re;
1199  B1_re -= g11_im * b1_im;
1200  B1_re += g12_re * b2_re;
1201  B1_re -= g12_im * b2_im;
1202  spinorFloat B1_im = 0;
1203  B1_im += g10_re * b0_im;
1204  B1_im += g10_im * b0_re;
1205  B1_im += g11_re * b1_im;
1206  B1_im += g11_im * b1_re;
1207  B1_im += g12_re * b2_im;
1208  B1_im += g12_im * b2_re;
1209 
1210  // multiply row 2
1211  spinorFloat A2_re = 0;
1212  A2_re += g20_re * a0_re;
1213  A2_re -= g20_im * a0_im;
1214  A2_re += g21_re * a1_re;
1215  A2_re -= g21_im * a1_im;
1216  A2_re += g22_re * a2_re;
1217  A2_re -= g22_im * a2_im;
1218  spinorFloat A2_im = 0;
1219  A2_im += g20_re * a0_im;
1220  A2_im += g20_im * a0_re;
1221  A2_im += g21_re * a1_im;
1222  A2_im += g21_im * a1_re;
1223  A2_im += g22_re * a2_im;
1224  A2_im += g22_im * a2_re;
1225  spinorFloat B2_re = 0;
1226  B2_re += g20_re * b0_re;
1227  B2_re -= g20_im * b0_im;
1228  B2_re += g21_re * b1_re;
1229  B2_re -= g21_im * b1_im;
1230  B2_re += g22_re * b2_re;
1231  B2_re -= g22_im * b2_im;
1232  spinorFloat B2_im = 0;
1233  B2_im += g20_re * b0_im;
1234  B2_im += g20_im * b0_re;
1235  B2_im += g21_re * b1_im;
1236  B2_im += g21_im * b1_re;
1237  B2_im += g22_re * b2_im;
1238  B2_im += g22_im * b2_re;
1239 
// Accumulate into the output spinor: the upper components get A/B directly,
// and the lower components get +/- i times them (the signs below implement
// the multiplication by +/-i for this projector's reconstruction).
1240  o00_re += A0_re;
1241  o00_im += A0_im;
1242  o10_re += B0_re;
1243  o10_im += B0_im;
1244  o20_re += A0_im;
1245  o20_im -= A0_re;
1246  o30_re -= B0_im;
1247  o30_im += B0_re;
1248 
1249  o01_re += A1_re;
1250  o01_im += A1_im;
1251  o11_re += B1_re;
1252  o11_im += B1_im;
1253  o21_re += A1_im;
1254  o21_im -= A1_re;
1255  o31_re -= B1_im;
1256  o31_im += B1_re;
1257 
1258  o02_re += A2_re;
1259  o02_im += A2_im;
1260  o12_re += B2_re;
1261  o12_im += B2_im;
1262  o22_re += A2_im;
1263  o22_im -= A2_re;
1264  o32_re -= B2_im;
1265  o32_im += B2_re;
1266 
1267 }
1268 
1268 
// --- Z-direction (dim 2) backward hop: gather the neighbor at x3-1
// --- (with periodic wrap x3==0 -> x3==X3-1), apply projector P2- to the
// --- spinor, multiply by the conjugate-transposed link U^dagger (gT*),
// --- and accumulate into the output spinor.
1269 #ifdef MULTI_GPU
1270 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1271  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1272 #endif
1273 {
1274  // Projector P2-
1275  // 1 0 -i 0
1276  // 0 1 0 i
1277  // i 0 1 0
1278  // 0 -i 0 1
1279 
// Checkerboarded neighbor index: wrap around the lattice when x3==0,
// otherwise step back one x3-slice (X2X1 sites); >>1 converts to even/odd.
1280 #ifdef MULTI_GPU
1281  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1282  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1283 #else
1284  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1285 #endif
1286 
// Backward hop uses the link stored at the neighbor site (exterior kernels
// read it from the ghost-link region at Vh+face_idx).
1287 #ifdef MULTI_GPU
1288  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1289 #else
1290  const int ga_idx = sp_idx;
1291 #endif
1292 
// NOTE(review): listing lines 1293-1298 were dropped by the HTML
// extraction — check the generated source for what belongs here.
1299 
1300 #ifdef MULTI_GPU
1301  if (kernel_type == INTERIOR_KERNEL) {
1302 #endif
1303 
1304  // read spinor from device memory
1305  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1306 #ifdef TWIST_INV_DSLASH
1307  APPLY_TWIST_INV(-a, b, i);
1308 #endif
1309 
1310  // project spinor into half spinors
1311  a0_re = +i00_re+i20_im;
1312  a0_im = +i00_im-i20_re;
1313  a1_re = +i01_re+i21_im;
1314  a1_im = +i01_im-i21_re;
1315  a2_re = +i02_re+i22_im;
1316  a2_im = +i02_im-i22_re;
1317  b0_re = +i10_re-i30_im;
1318  b0_im = +i10_im+i30_re;
1319  b1_re = +i11_re-i31_im;
1320  b1_im = +i11_im+i31_re;
1321  b2_re = +i12_re-i32_im;
1322  b2_im = +i12_im+i32_re;
1323 
1324 #ifdef MULTI_GPU
1325  } else {
1326 
// Exterior kernel: the neighbor's projected half spinor arrives
// pre-packed in the ghost zone, so no projection is needed here.
1327  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1328 
1329  // read half spinor from device memory
1330  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1331 
1332  a0_re = i00_re; a0_im = i00_im;
1333  a1_re = i01_re; a1_im = i01_im;
1334  a2_re = i02_re; a2_im = i02_im;
1335  b0_re = i10_re; b0_im = i10_im;
1336  b1_re = i11_re; b1_im = i11_im;
1337  b2_re = i12_re; b2_im = i12_im;
1338 
1339  }
1340 #endif // MULTI_GPU
1341 
1342  // read gauge matrix from device memory
1343  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1344 
1345  // reconstruct gauge matrix
// NOTE(review): listing line 1346 (presumably RECONSTRUCT_GAUGE_MATRIX(5))
// was dropped by the HTML extraction.
1347 
// Multiply both half-spinor columns by U^dagger: A = gT * a, B = gT * b.
1348  // multiply row 0
1349  spinorFloat A0_re = 0;
1350  A0_re += gT00_re * a0_re;
1351  A0_re -= gT00_im * a0_im;
1352  A0_re += gT01_re * a1_re;
1353  A0_re -= gT01_im * a1_im;
1354  A0_re += gT02_re * a2_re;
1355  A0_re -= gT02_im * a2_im;
1356  spinorFloat A0_im = 0;
1357  A0_im += gT00_re * a0_im;
1358  A0_im += gT00_im * a0_re;
1359  A0_im += gT01_re * a1_im;
1360  A0_im += gT01_im * a1_re;
1361  A0_im += gT02_re * a2_im;
1362  A0_im += gT02_im * a2_re;
1363  spinorFloat B0_re = 0;
1364  B0_re += gT00_re * b0_re;
1365  B0_re -= gT00_im * b0_im;
1366  B0_re += gT01_re * b1_re;
1367  B0_re -= gT01_im * b1_im;
1368  B0_re += gT02_re * b2_re;
1369  B0_re -= gT02_im * b2_im;
1370  spinorFloat B0_im = 0;
1371  B0_im += gT00_re * b0_im;
1372  B0_im += gT00_im * b0_re;
1373  B0_im += gT01_re * b1_im;
1374  B0_im += gT01_im * b1_re;
1375  B0_im += gT02_re * b2_im;
1376  B0_im += gT02_im * b2_re;
1377 
1378  // multiply row 1
1379  spinorFloat A1_re = 0;
1380  A1_re += gT10_re * a0_re;
1381  A1_re -= gT10_im * a0_im;
1382  A1_re += gT11_re * a1_re;
1383  A1_re -= gT11_im * a1_im;
1384  A1_re += gT12_re * a2_re;
1385  A1_re -= gT12_im * a2_im;
1386  spinorFloat A1_im = 0;
1387  A1_im += gT10_re * a0_im;
1388  A1_im += gT10_im * a0_re;
1389  A1_im += gT11_re * a1_im;
1390  A1_im += gT11_im * a1_re;
1391  A1_im += gT12_re * a2_im;
1392  A1_im += gT12_im * a2_re;
1393  spinorFloat B1_re = 0;
1394  B1_re += gT10_re * b0_re;
1395  B1_re -= gT10_im * b0_im;
1396  B1_re += gT11_re * b1_re;
1397  B1_re -= gT11_im * b1_im;
1398  B1_re += gT12_re * b2_re;
1399  B1_re -= gT12_im * b2_im;
1400  spinorFloat B1_im = 0;
1401  B1_im += gT10_re * b0_im;
1402  B1_im += gT10_im * b0_re;
1403  B1_im += gT11_re * b1_im;
1404  B1_im += gT11_im * b1_re;
1405  B1_im += gT12_re * b2_im;
1406  B1_im += gT12_im * b2_re;
1407 
1408  // multiply row 2
1409  spinorFloat A2_re = 0;
1410  A2_re += gT20_re * a0_re;
1411  A2_re -= gT20_im * a0_im;
1412  A2_re += gT21_re * a1_re;
1413  A2_re -= gT21_im * a1_im;
1414  A2_re += gT22_re * a2_re;
1415  A2_re -= gT22_im * a2_im;
1416  spinorFloat A2_im = 0;
1417  A2_im += gT20_re * a0_im;
1418  A2_im += gT20_im * a0_re;
1419  A2_im += gT21_re * a1_im;
1420  A2_im += gT21_im * a1_re;
1421  A2_im += gT22_re * a2_im;
1422  A2_im += gT22_im * a2_re;
1423  spinorFloat B2_re = 0;
1424  B2_re += gT20_re * b0_re;
1425  B2_re -= gT20_im * b0_im;
1426  B2_re += gT21_re * b1_re;
1427  B2_re -= gT21_im * b1_im;
1428  B2_re += gT22_re * b2_re;
1429  B2_re -= gT22_im * b2_im;
1430  spinorFloat B2_im = 0;
1431  B2_im += gT20_re * b0_im;
1432  B2_im += gT20_im * b0_re;
1433  B2_im += gT21_re * b1_im;
1434  B2_im += gT21_im * b1_re;
1435  B2_im += gT22_re * b2_im;
1436  B2_im += gT22_im * b2_re;
1437 
// Accumulate: upper components add A/B; lower components use the opposite
// signs from the forward hop (multiplication by -/+ i for P2-).
1438  o00_re += A0_re;
1439  o00_im += A0_im;
1440  o10_re += B0_re;
1441  o10_im += B0_im;
1442  o20_re -= A0_im;
1443  o20_im += A0_re;
1444  o30_re += B0_im;
1445  o30_im -= B0_re;
1446 
1447  o01_re += A1_re;
1448  o01_im += A1_im;
1449  o11_re += B1_re;
1450  o11_im += B1_im;
1451  o21_re -= A1_im;
1452  o21_im += A1_re;
1453  o31_re += B1_im;
1454  o31_im -= B1_re;
1455 
1456  o02_re += A2_re;
1457  o02_im += A2_im;
1458  o12_re += B2_re;
1459  o12_im += B2_im;
1460  o22_re -= A2_im;
1461  o22_im += A2_re;
1462  o32_re += B2_im;
1463  o32_im -= B2_re;
1464 
1465 }
1466 
1466 
// --- T-direction (dim 3) forward hop with projector P3+ = diag(2,2,0,0):
// --- only the upper two spin components survive (scaled by 2), so only the
// --- upper output components are updated.  Two paths: an identity-gauge
// --- fast path (temporal gauge fixing, away from the boundary) and the
// --- generic path that reads and applies the gauge link.
1467 #ifdef MULTI_GPU
1468 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1469  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1470 #endif
1471 {
1472  // Projector P3+
1473  // 2 0 0 0
1474  // 0 2 0 0
1475  // 0 0 0 0
1476  // 0 0 0 0
1477 
// Neighbor at x4+1 (wrap to x4==0 at the top boundary); >>1 for parity.
1478 #ifdef MULTI_GPU
1479  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1480  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1481 #else
1482  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1483 #endif
1484 
1485  const int ga_idx = sid;
1486 
// NOTE(review): listing line 1487 was dropped by the HTML extraction;
// by symmetry with the backward T hop below it is presumably
// "if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)" — the temporal
// gauge-fixed fast path guarding this first branch.
1488  {
// NOTE(review): listing lines 1489-1494 also dropped by the extraction.
1495 
1496 #ifdef MULTI_GPU
1497  if (kernel_type == INTERIOR_KERNEL) {
1498 #endif
1499 
// With twist inversion the full spinor must be read so the twist can be
// applied before projection; otherwise only the upper half is needed.
1500  // read spinor from device memory
1501 #ifndef TWIST_INV_DSLASH
1502  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1503 #else
1504  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1505  APPLY_TWIST_INV(-a, b, i);
1506 #endif
1507 
1508  // project spinor into half spinors
1509  a0_re = +2*i00_re;
1510  a0_im = +2*i00_im;
1511  a1_re = +2*i01_re;
1512  a1_im = +2*i01_im;
1513  a2_re = +2*i02_re;
1514  a2_im = +2*i02_im;
1515  b0_re = +2*i10_re;
1516  b0_im = +2*i10_im;
1517  b1_re = +2*i11_re;
1518  b1_im = +2*i11_im;
1519  b2_re = +2*i12_re;
1520  b2_im = +2*i12_im;
1521 
1522 #ifdef MULTI_GPU
1523  } else {
1524 
1525  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1526  //const int t_proj_scale = TPROJSCALE;
1527 
1528  // read half spinor from device memory
1529  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1530 
// Ghost data: with TWIST_INV the sender packs the already-scaled
// projection, otherwise the factor of 2 is applied here.
1531 #ifdef TWIST_INV_DSLASH
1532  a0_re = i00_re; a0_im = i00_im;
1533  a1_re = i01_re; a1_im = i01_im;
1534  a2_re = i02_re; a2_im = i02_im;
1535  b0_re = i10_re; b0_im = i10_im;
1536  b1_re = i11_re; b1_im = i11_im;
1537  b2_re = i12_re; b2_im = i12_im;
1538 #else
1539  a0_re = 2*i00_re; a0_im = 2*i00_im;
1540  a1_re = 2*i01_re; a1_im = 2*i01_im;
1541  a2_re = 2*i02_re; a2_im = 2*i02_im;
1542  b0_re = 2*i10_re; b0_im = 2*i10_im;
1543  b1_re = 2*i11_re; b1_im = 2*i11_im;
1544  b2_re = 2*i12_re; b2_im = 2*i12_im;
1545 #endif
1546 
1547  }
1548 #endif // MULTI_GPU
1549 
1550  // identity gauge matrix
// NOTE(review): listing lines 1551-1556 were dropped by the extraction —
// presumably the direct A=a/B=b assignments for the identity-link case.
1557 
1558  o00_re += A0_re;
1559  o00_im += A0_im;
1560  o10_re += B0_re;
1561  o10_im += B0_im;
1562 
1563  o01_re += A1_re;
1564  o01_im += A1_im;
1565  o11_re += B1_re;
1566  o11_im += B1_im;
1567 
1568  o02_re += A2_re;
1569  o02_im += A2_im;
1570  o12_re += B2_re;
1571  o12_im += B2_im;
1572 
// Generic path: same projection, followed by a real gauge-link multiply.
1573  } else {
// NOTE(review): listing lines 1574-1579 were dropped by the extraction.
1580 
1581 #ifdef MULTI_GPU
1582  if (kernel_type == INTERIOR_KERNEL) {
1583 #endif
1584 
1585  // read spinor from device memory
1586 #ifndef TWIST_INV_DSLASH
1587  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1588 #else
1589  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1590  APPLY_TWIST_INV(-a, b, i);
1591 #endif
1592 
1593  // project spinor into half spinors
1594  a0_re = +2*i00_re;
1595  a0_im = +2*i00_im;
1596  a1_re = +2*i01_re;
1597  a1_im = +2*i01_im;
1598  a2_re = +2*i02_re;
1599  a2_im = +2*i02_im;
1600  b0_re = +2*i10_re;
1601  b0_im = +2*i10_im;
1602  b1_re = +2*i11_re;
1603  b1_im = +2*i11_im;
1604  b2_re = +2*i12_re;
1605  b2_im = +2*i12_im;
1606 
1607 #ifdef MULTI_GPU
1608  } else {
1609 
1610  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1611  //const int t_proj_scale = TPROJSCALE;
1612 
1613  // read half spinor from device memory
1614  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1615 
1616 #ifdef TWIST_INV_DSLASH
1617  a0_re = i00_re; a0_im = i00_im;
1618  a1_re = i01_re; a1_im = i01_im;
1619  a2_re = i02_re; a2_im = i02_im;
1620  b0_re = i10_re; b0_im = i10_im;
1621  b1_re = i11_re; b1_im = i11_im;
1622  b2_re = i12_re; b2_im = i12_im;
1623 #else
1624  a0_re = 2*i00_re; a0_im = 2*i00_im;
1625  a1_re = 2*i01_re; a1_im = 2*i01_im;
1626  a2_re = 2*i02_re; a2_im = 2*i02_im;
1627  b0_re = 2*i10_re; b0_im = 2*i10_im;
1628  b1_re = 2*i11_re; b1_im = 2*i11_im;
1629  b2_re = 2*i12_re; b2_im = 2*i12_im;
1630 #endif
1631 
1632  }
1633 #endif // MULTI_GPU
1634 
1635  // read gauge matrix from device memory
1636  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1637 
1638  // reconstruct gauge matrix
// NOTE(review): listing line 1639 (presumably RECONSTRUCT_GAUGE_MATRIX(6))
// was dropped by the HTML extraction.
1640 
// A = G * a, B = G * b (complex 3x3 times two color vectors).
1641  // multiply row 0
1642  spinorFloat A0_re = 0;
1643  A0_re += g00_re * a0_re;
1644  A0_re -= g00_im * a0_im;
1645  A0_re += g01_re * a1_re;
1646  A0_re -= g01_im * a1_im;
1647  A0_re += g02_re * a2_re;
1648  A0_re -= g02_im * a2_im;
1649  spinorFloat A0_im = 0;
1650  A0_im += g00_re * a0_im;
1651  A0_im += g00_im * a0_re;
1652  A0_im += g01_re * a1_im;
1653  A0_im += g01_im * a1_re;
1654  A0_im += g02_re * a2_im;
1655  A0_im += g02_im * a2_re;
1656  spinorFloat B0_re = 0;
1657  B0_re += g00_re * b0_re;
1658  B0_re -= g00_im * b0_im;
1659  B0_re += g01_re * b1_re;
1660  B0_re -= g01_im * b1_im;
1661  B0_re += g02_re * b2_re;
1662  B0_re -= g02_im * b2_im;
1663  spinorFloat B0_im = 0;
1664  B0_im += g00_re * b0_im;
1665  B0_im += g00_im * b0_re;
1666  B0_im += g01_re * b1_im;
1667  B0_im += g01_im * b1_re;
1668  B0_im += g02_re * b2_im;
1669  B0_im += g02_im * b2_re;
1670 
1671  // multiply row 1
1672  spinorFloat A1_re = 0;
1673  A1_re += g10_re * a0_re;
1674  A1_re -= g10_im * a0_im;
1675  A1_re += g11_re * a1_re;
1676  A1_re -= g11_im * a1_im;
1677  A1_re += g12_re * a2_re;
1678  A1_re -= g12_im * a2_im;
1679  spinorFloat A1_im = 0;
1680  A1_im += g10_re * a0_im;
1681  A1_im += g10_im * a0_re;
1682  A1_im += g11_re * a1_im;
1683  A1_im += g11_im * a1_re;
1684  A1_im += g12_re * a2_im;
1685  A1_im += g12_im * a2_re;
1686  spinorFloat B1_re = 0;
1687  B1_re += g10_re * b0_re;
1688  B1_re -= g10_im * b0_im;
1689  B1_re += g11_re * b1_re;
1690  B1_re -= g11_im * b1_im;
1691  B1_re += g12_re * b2_re;
1692  B1_re -= g12_im * b2_im;
1693  spinorFloat B1_im = 0;
1694  B1_im += g10_re * b0_im;
1695  B1_im += g10_im * b0_re;
1696  B1_im += g11_re * b1_im;
1697  B1_im += g11_im * b1_re;
1698  B1_im += g12_re * b2_im;
1699  B1_im += g12_im * b2_re;
1700 
1701  // multiply row 2
1702  spinorFloat A2_re = 0;
1703  A2_re += g20_re * a0_re;
1704  A2_re -= g20_im * a0_im;
1705  A2_re += g21_re * a1_re;
1706  A2_re -= g21_im * a1_im;
1707  A2_re += g22_re * a2_re;
1708  A2_re -= g22_im * a2_im;
1709  spinorFloat A2_im = 0;
1710  A2_im += g20_re * a0_im;
1711  A2_im += g20_im * a0_re;
1712  A2_im += g21_re * a1_im;
1713  A2_im += g21_im * a1_re;
1714  A2_im += g22_re * a2_im;
1715  A2_im += g22_im * a2_re;
1716  spinorFloat B2_re = 0;
1717  B2_re += g20_re * b0_re;
1718  B2_re -= g20_im * b0_im;
1719  B2_re += g21_re * b1_re;
1720  B2_re -= g21_im * b1_im;
1721  B2_re += g22_re * b2_re;
1722  B2_re -= g22_im * b2_im;
1723  spinorFloat B2_im = 0;
1724  B2_im += g20_re * b0_im;
1725  B2_im += g20_im * b0_re;
1726  B2_im += g21_re * b1_im;
1727  B2_im += g21_im * b1_re;
1728  B2_im += g22_re * b2_im;
1729  B2_im += g22_im * b2_re;
1730 
// P3+ only feeds the upper spin components; lower ones are untouched.
1731  o00_re += A0_re;
1732  o00_im += A0_im;
1733  o10_re += B0_re;
1734  o10_im += B0_im;
1735 
1736  o01_re += A1_re;
1737  o01_im += A1_im;
1738  o11_re += B1_re;
1739  o11_im += B1_im;
1740 
1741  o02_re += A2_re;
1742  o02_im += A2_im;
1743  o12_re += B2_re;
1744  o12_im += B2_im;
1745 
1746  }
1747 }
1748 
1748 
// --- T-direction (dim 3) backward hop with projector P3- = diag(0,0,2,2):
// --- only the lower two spin components survive (scaled by 2), so only the
// --- lower output components are updated.  The link is U^dagger at the
// --- neighbor site; again a gauge-fixed identity-link fast path and a
// --- generic link-multiply path.
1749 #ifdef MULTI_GPU
1750 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1751  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1752 #endif
1753 {
1754  // Projector P3-
1755  // 0 0 0 0
1756  // 0 0 0 0
1757  // 0 0 2 0
1758  // 0 0 0 2
1759 
// Neighbor at x4-1 (wrap to x4==X4-1 at the bottom boundary).
1760 #ifdef MULTI_GPU
1761  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1762  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1763 #else
1764  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1765 #endif
1766 
1767 #ifdef MULTI_GPU
1768  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1769 #else
1770  const int ga_idx = sp_idx;
1771 #endif
1772 
// Temporal gauge fixing: links away from the last time slice are the
// identity, so the matrix multiply can be skipped.
1773  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1774  {
// NOTE(review): listing lines 1775-1780 were dropped by the HTML extraction.
1781 
1782 #ifdef MULTI_GPU
1783  if (kernel_type == INTERIOR_KERNEL) {
1784 #endif
1785 
1786  // read spinor from device memory
1787 #ifndef TWIST_INV_DSLASH
1788  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1789 #else
1790  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1791  APPLY_TWIST_INV(-a, b, i);
1792 #endif
1793 
1794  // project spinor into half spinors
1795  a0_re = +2*i20_re;
1796  a0_im = +2*i20_im;
1797  a1_re = +2*i21_re;
1798  a1_im = +2*i21_im;
1799  a2_re = +2*i22_re;
1800  a2_im = +2*i22_im;
1801  b0_re = +2*i30_re;
1802  b0_im = +2*i30_im;
1803  b1_re = +2*i31_re;
1804  b1_im = +2*i31_im;
1805  b2_re = +2*i32_re;
1806  b2_im = +2*i32_im;
1807 
1808 #ifdef MULTI_GPU
1809  } else {
1810 
1811  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1812  //const int t_proj_scale = TPROJSCALE;
1813 
1814  // read half spinor from device memory
1815  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1816 
// Ghost half spinor: pre-scaled when TWIST_INV packed it, else scale by 2.
1817 #ifdef TWIST_INV_DSLASH
1818  a0_re = i00_re; a0_im = i00_im;
1819  a1_re = i01_re; a1_im = i01_im;
1820  a2_re = i02_re; a2_im = i02_im;
1821  b0_re = i10_re; b0_im = i10_im;
1822  b1_re = i11_re; b1_im = i11_im;
1823  b2_re = i12_re; b2_im = i12_im;
1824 #else
1825  a0_re = 2*i00_re; a0_im = 2*i00_im;
1826  a1_re = 2*i01_re; a1_im = 2*i01_im;
1827  a2_re = 2*i02_re; a2_im = 2*i02_im;
1828  b0_re = 2*i10_re; b0_im = 2*i10_im;
1829  b1_re = 2*i11_re; b1_im = 2*i11_im;
1830  b2_re = 2*i12_re; b2_im = 2*i12_im;
1831 #endif
1832 
1833  }
1834 #endif // MULTI_GPU
1835 
1836  // identity gauge matrix
// NOTE(review): listing lines 1837-1842 were dropped by the extraction —
// presumably the direct A=a/B=b assignments for the identity-link case.
1843 
1844  o20_re += A0_re;
1845  o20_im += A0_im;
1846  o30_re += B0_re;
1847  o30_im += B0_im;
1848 
1849  o21_re += A1_re;
1850  o21_im += A1_im;
1851  o31_re += B1_re;
1852  o31_im += B1_im;
1853 
1854  o22_re += A2_re;
1855  o22_im += A2_im;
1856  o32_re += B2_re;
1857  o32_im += B2_im;
1858 
// Generic path: same projection, then multiply by U^dagger (gT*).
1859  } else {
// NOTE(review): listing lines 1860-1865 were dropped by the extraction.
1866 
1867 #ifdef MULTI_GPU
1868  if (kernel_type == INTERIOR_KERNEL) {
1869 #endif
1870 
1871  // read spinor from device memory
1872 #ifndef TWIST_INV_DSLASH
1873  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1874 #else
1875  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1876  APPLY_TWIST_INV(-a, b, i);
1877 #endif
1878 
1879  // project spinor into half spinors
1880  a0_re = +2*i20_re;
1881  a0_im = +2*i20_im;
1882  a1_re = +2*i21_re;
1883  a1_im = +2*i21_im;
1884  a2_re = +2*i22_re;
1885  a2_im = +2*i22_im;
1886  b0_re = +2*i30_re;
1887  b0_im = +2*i30_im;
1888  b1_re = +2*i31_re;
1889  b1_im = +2*i31_im;
1890  b2_re = +2*i32_re;
1891  b2_im = +2*i32_im;
1892 
1893 #ifdef MULTI_GPU
1894  } else {
1895 
1896  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1897  //const int t_proj_scale = TPROJSCALE;
1898 
1899  // read half spinor from device memory
1900  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1901 
1902 #ifdef TWIST_INV_DSLASH
1903  a0_re = i00_re; a0_im = i00_im;
1904  a1_re = i01_re; a1_im = i01_im;
1905  a2_re = i02_re; a2_im = i02_im;
1906  b0_re = i10_re; b0_im = i10_im;
1907  b1_re = i11_re; b1_im = i11_im;
1908  b2_re = i12_re; b2_im = i12_im;
1909 #else
1910  a0_re = 2*i00_re; a0_im = 2*i00_im;
1911  a1_re = 2*i01_re; a1_im = 2*i01_im;
1912  a2_re = 2*i02_re; a2_im = 2*i02_im;
1913  b0_re = 2*i10_re; b0_im = 2*i10_im;
1914  b1_re = 2*i11_re; b1_im = 2*i11_im;
1915  b2_re = 2*i12_re; b2_im = 2*i12_im;
1916 #endif
1917 
1918  }
1919 #endif // MULTI_GPU
1920 
1921  // read gauge matrix from device memory
1922  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1923 
1924  // reconstruct gauge matrix
// NOTE(review): listing line 1925 (presumably RECONSTRUCT_GAUGE_MATRIX(7))
// was dropped by the HTML extraction.
1926 
// A = gT * a, B = gT * b.
1927  // multiply row 0
1928  spinorFloat A0_re = 0;
1929  A0_re += gT00_re * a0_re;
1930  A0_re -= gT00_im * a0_im;
1931  A0_re += gT01_re * a1_re;
1932  A0_re -= gT01_im * a1_im;
1933  A0_re += gT02_re * a2_re;
1934  A0_re -= gT02_im * a2_im;
1935  spinorFloat A0_im = 0;
1936  A0_im += gT00_re * a0_im;
1937  A0_im += gT00_im * a0_re;
1938  A0_im += gT01_re * a1_im;
1939  A0_im += gT01_im * a1_re;
1940  A0_im += gT02_re * a2_im;
1941  A0_im += gT02_im * a2_re;
1942  spinorFloat B0_re = 0;
1943  B0_re += gT00_re * b0_re;
1944  B0_re -= gT00_im * b0_im;
1945  B0_re += gT01_re * b1_re;
1946  B0_re -= gT01_im * b1_im;
1947  B0_re += gT02_re * b2_re;
1948  B0_re -= gT02_im * b2_im;
1949  spinorFloat B0_im = 0;
1950  B0_im += gT00_re * b0_im;
1951  B0_im += gT00_im * b0_re;
1952  B0_im += gT01_re * b1_im;
1953  B0_im += gT01_im * b1_re;
1954  B0_im += gT02_re * b2_im;
1955  B0_im += gT02_im * b2_re;
1956 
1957  // multiply row 1
1958  spinorFloat A1_re = 0;
1959  A1_re += gT10_re * a0_re;
1960  A1_re -= gT10_im * a0_im;
1961  A1_re += gT11_re * a1_re;
1962  A1_re -= gT11_im * a1_im;
1963  A1_re += gT12_re * a2_re;
1964  A1_re -= gT12_im * a2_im;
1965  spinorFloat A1_im = 0;
1966  A1_im += gT10_re * a0_im;
1967  A1_im += gT10_im * a0_re;
1968  A1_im += gT11_re * a1_im;
1969  A1_im += gT11_im * a1_re;
1970  A1_im += gT12_re * a2_im;
1971  A1_im += gT12_im * a2_re;
1972  spinorFloat B1_re = 0;
1973  B1_re += gT10_re * b0_re;
1974  B1_re -= gT10_im * b0_im;
1975  B1_re += gT11_re * b1_re;
1976  B1_re -= gT11_im * b1_im;
1977  B1_re += gT12_re * b2_re;
1978  B1_re -= gT12_im * b2_im;
1979  spinorFloat B1_im = 0;
1980  B1_im += gT10_re * b0_im;
1981  B1_im += gT10_im * b0_re;
1982  B1_im += gT11_re * b1_im;
1983  B1_im += gT11_im * b1_re;
1984  B1_im += gT12_re * b2_im;
1985  B1_im += gT12_im * b2_re;
1986 
1987  // multiply row 2
1988  spinorFloat A2_re = 0;
1989  A2_re += gT20_re * a0_re;
1990  A2_re -= gT20_im * a0_im;
1991  A2_re += gT21_re * a1_re;
1992  A2_re -= gT21_im * a1_im;
1993  A2_re += gT22_re * a2_re;
1994  A2_re -= gT22_im * a2_im;
1995  spinorFloat A2_im = 0;
1996  A2_im += gT20_re * a0_im;
1997  A2_im += gT20_im * a0_re;
1998  A2_im += gT21_re * a1_im;
1999  A2_im += gT21_im * a1_re;
2000  A2_im += gT22_re * a2_im;
2001  A2_im += gT22_im * a2_re;
2002  spinorFloat B2_re = 0;
2003  B2_re += gT20_re * b0_re;
2004  B2_re -= gT20_im * b0_im;
2005  B2_re += gT21_re * b1_re;
2006  B2_re -= gT21_im * b1_im;
2007  B2_re += gT22_re * b2_re;
2008  B2_re -= gT22_im * b2_im;
2009  spinorFloat B2_im = 0;
2010  B2_im += gT20_re * b0_im;
2011  B2_im += gT20_im * b0_re;
2012  B2_im += gT21_re * b1_im;
2013  B2_im += gT21_im * b1_re;
2014  B2_im += gT22_re * b2_im;
2015  B2_im += gT22_im * b2_re;
2016 
// P3- only feeds the lower spin components; upper ones are untouched.
2017  o20_re += A0_re;
2018  o20_im += A0_im;
2019  o30_re += B0_re;
2020  o30_im += B0_im;
2021 
2022  o21_re += A1_re;
2023  o21_im += A1_im;
2024  o31_re += B1_re;
2025  o31_im += B1_im;
2026 
2027  o22_re += A2_re;
2028  o22_im += A2_im;
2029  o32_re += B2_re;
2030  o32_im += B2_im;
2031 
2032  }
2033 }
2034 
2034 
// --- Finalization: decide whether this site has received all 8 hop
// --- contributions (multi-GPU boundary sites have not until the matching
// --- exterior kernel runs), then apply the twist / xpay combination and
// --- write the result back to global memory.
2035 #ifdef MULTI_GPU
2036 
2037 int incomplete = 0; // Have all 8 contributions been computed for this site?
2038 
// Deliberate fall-through: each kernel stage also inherits the
// incompleteness conditions of every lower-priority dimension below it.
2039 switch(kernel_type) { // intentional fall-through
2040 case INTERIOR_KERNEL:
2041  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2042 case EXTERIOR_KERNEL_T:
2043  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2044 case EXTERIOR_KERNEL_Z:
2045  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2046 case EXTERIOR_KERNEL_Y:
2047  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2048 }
2049 
// The twist/xpay postprocessing must only run once, when the site's
// hop sum is complete.
2050 if (!incomplete)
2051 #endif // MULTI_GPU
2052 {
2053 #ifdef DSLASH_XPAY
2054  READ_ACCUM(ACCUMTEX, param.sp_stride)
2055 
// Plain xpay: (optionally) apply the inverse twist to the hop sum, then
// add the accumulator spinor componentwise.
2056 #ifndef TWIST_XPAY
2057 #ifndef TWIST_INV_DSLASH
2058  //perform invert twist first:
2059  APPLY_TWIST_INV(-a, b, o);
2060 #endif
2061  o00_re += acc00_re;
2062  o00_im += acc00_im;
2063  o01_re += acc01_re;
2064  o01_im += acc01_im;
2065  o02_re += acc02_re;
2066  o02_im += acc02_im;
2067  o10_re += acc10_re;
2068  o10_im += acc10_im;
2069  o11_re += acc11_re;
2070  o11_im += acc11_im;
2071  o12_re += acc12_re;
2072  o12_im += acc12_im;
2073  o20_re += acc20_re;
2074  o20_im += acc20_im;
2075  o21_re += acc21_re;
2076  o21_im += acc21_im;
2077  o22_re += acc22_re;
2078  o22_im += acc22_im;
2079  o30_re += acc30_re;
2080  o30_im += acc30_im;
2081  o31_re += acc31_re;
2082  o31_im += acc31_im;
2083  o32_re += acc32_re;
2084  o32_im += acc32_im;
2085 #else
// Twisted xpay: twist the accumulator, then o = b*o + acc, where b here
// is the xpay scale factor (see warning below), not the twist parameter.
2086  APPLY_TWIST(-a, acc);
2087  //warning! b is unrelated to the twisted mass parameter in this case!
2088 
2089  o00_re = b*o00_re+acc00_re;
2090  o00_im = b*o00_im+acc00_im;
2091  o01_re = b*o01_re+acc01_re;
2092  o01_im = b*o01_im+acc01_im;
2093  o02_re = b*o02_re+acc02_re;
2094  o02_im = b*o02_im+acc02_im;
2095  o10_re = b*o10_re+acc10_re;
2096  o10_im = b*o10_im+acc10_im;
2097  o11_re = b*o11_re+acc11_re;
2098  o11_im = b*o11_im+acc11_im;
2099  o12_re = b*o12_re+acc12_re;
2100  o12_im = b*o12_im+acc12_im;
2101  o20_re = b*o20_re+acc20_re;
2102  o20_im = b*o20_im+acc20_im;
2103  o21_re = b*o21_re+acc21_re;
2104  o21_im = b*o21_im+acc21_im;
2105  o22_re = b*o22_re+acc22_re;
2106  o22_im = b*o22_im+acc22_im;
2107  o30_re = b*o30_re+acc30_re;
2108  o30_im = b*o30_im+acc30_im;
2109  o31_re = b*o31_re+acc31_re;
2110  o31_im = b*o31_im+acc31_im;
2111  o32_re = b*o32_re+acc32_re;
2112  o32_im = b*o32_im+acc32_im;
2113 #endif//TWIST_XPAY
2114 #else //no XPAY
// No accumulator: only the inverse twist (if not already folded into the
// per-hop reads) is applied to the output.
2115 #ifndef TWIST_INV_DSLASH
2116  APPLY_TWIST_INV(-a, b, o);
2117 #endif
2118 #endif
2119 }
2120 
2121 // write spinor field back to device memory
2122 WRITE_SPINOR(param.sp_stride);
2123 
2123 
// --- Macro cleanup so this core file can be re-included at a different
// --- precision without redefinition warnings.
2124 // undefine to prevent warning when precision is changed
2125 #undef spinorFloat
2126 #undef SHARED_STRIDE
2127 
2128 #undef g00_re
2129 #undef g00_im
2130 #undef g01_re
2131 #undef g01_im
2132 #undef g02_re
2133 #undef g02_im
2134 #undef g10_re
2135 #undef g10_im
2136 #undef g11_re
2137 #undef g11_im
2138 #undef g12_re
2139 #undef g12_im
2140 #undef g20_re
2141 #undef g20_im
2142 #undef g21_re
2143 #undef g21_im
2144 #undef g22_re
2145 #undef g22_im
2146 
2147 #undef i00_re
2148 #undef i00_im
2149 #undef i01_re
2150 #undef i01_im
2151 #undef i02_re
2152 #undef i02_im
2153 #undef i10_re
2154 #undef i10_im
2155 #undef i11_re
2156 #undef i11_im
2157 #undef i12_re
2158 #undef i12_im
2159 #undef i20_re
2160 #undef i20_im
2161 #undef i21_re
2162 #undef i21_im
2163 #undef i22_re
2164 #undef i22_im
2165 #undef i30_re
2166 #undef i30_im
2167 #undef i31_re
2168 #undef i31_im
2169 #undef i32_re
2170 #undef i32_im
2171 
2172 #undef acc00_re
2173 #undef acc00_im
2174 #undef acc01_re
2175 #undef acc01_im
2176 #undef acc02_re
2177 #undef acc02_im
2178 #undef acc10_re
2179 #undef acc10_im
2180 #undef acc11_re
2181 #undef acc11_im
2182 #undef acc12_re
2183 #undef acc12_im
2184 #undef acc20_re
2185 #undef acc20_im
2186 #undef acc21_re
2187 #undef acc21_im
2188 #undef acc22_re
2189 #undef acc22_im
2190 #undef acc30_re
2191 #undef acc30_im
2192 #undef acc31_re
2193 #undef acc31_im
2194 #undef acc32_re
2195 #undef acc32_im
2196 
2197 
// Only macro-defined output components are undefined here; the remaining
// components (o30_im, o31_*, o32_*) appear to be declared as VOLATILE
// spinorFloat variables rather than macros (see the symbol index appended
// by the documentation extraction), so they need no #undef.
// NOTE(review): confirm against the generated source.
2198 #undef o00_re
2199 #undef o00_im
2200 #undef o01_re
2201 #undef o01_im
2202 #undef o02_re
2203 #undef o02_im
2204 #undef o10_re
2205 #undef o10_im
2206 #undef o11_re
2207 #undef o11_im
2208 #undef o12_re
2209 #undef o12_im
2210 #undef o20_re
2211 #undef o20_im
2212 #undef o21_re
2213 #undef o21_im
2214 #undef o22_re
2215 #undef o22_im
2216 #undef o30_re
2217 
2218 #undef VOLATILE
spinorFloat B1_re
#define gT01_im
#define gT21_re
__constant__ int Vh
WRITE_SPINOR(param.sp_stride)
#define acc12_re
#define gT10_re
#define acc22_re
RECONSTRUCT_GAUGE_MATRIX(0)
__constant__ int X2
VOLATILE spinorFloat * s
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
#define i32_re
#define i11_re
#define gT12_im
__constant__ int X2X1mX1
#define acc02_im
#define gT20_im
VOLATILE spinorFloat o32_re
#define g00_im
spinorFloat B0_im
#define acc01_re
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
__constant__ int X3X2X1mX2X1
spinorFloat A1_re
#define acc30_re
#define o22_re
__constant__ int X1
spinorFloat A1_im
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
spinorFloat B2_re
int sp_idx
#define g21_im
#define gT11_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat a2_im
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define o11_re
#define o20_re
#define acc20_re
spinorFloat A2_im
#define gT20_re
#define i21_im
spinorFloat A0_im
__constant__ int X3X2X1
#define i12_im
#define gT02_im
spinorFloat B2_im
#define i22_im
const int dims[]
#define i02_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define g00_re
spinorFloat a0_im
#define gT10_im
#define i10_im
#define acc01_im
#define acc31_im
#define g01_re
#define i20_im
#define gT21_im
spinorFloat b1_re
#define i30_re
#define g01_im
#define i01_im
spinorFloat a1_im
#define acc10_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define acc00_im
VOLATILE spinorFloat o32_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
spinorFloat b1_im
#define gT02_re
spinorFloat b0_re
#define gT01_re
spinorFloat b0_im
#define g10_im
#define o00_re
spinorFloat b2_im
#define i10_re
spinorFloat a0_re
#define acc22_im
#define DSLASH_SHARED_FLOATS_PER_THREAD
#define o11_im
#define i22_re
#define i00_im
#define GAUGE0TEX
Definition: covDev.h:112
#define g20_re
#define acc11_re
#define i31_re
spinorFloat B1_im
#define gT00_im
#define o21_im
#define i32_im
#define acc11_im
#define g12_re
#define acc21_re
__constant__ int X2m1
#define o02_re
#define gT11_re
#define acc12_im
#define acc21_im
#define SPINORTEX
Definition: clover_def.h:40
#define o10_im
__constant__ int gauge_fixed
#define i01_re
#define g22_im
#define g22_re
#define acc02_re
__constant__ int X4X3X2X1mX3X2X1
#define g12_im
__shared__ char s_data[]
#define SPINOR_HOP
Definition: covDev.h:158
#define o01_re
#define acc32_re
spinorFloat A2_re
#define spinorFloat
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define gT12_re
__constant__ int ga_stride
#define g10_re
spinorFloat b2_re
#define o02_im
#define acc31_re
__constant__ int X1m1
VOLATILE spinorFloat o31_im
#define acc20_im
#define gT22_im
__constant__ int X3
#define o22_im
#define o12_re
#define i12_re
#define o12_im
#define i21_re
#define i31_im
#define acc30_im
const int ga_idx
spinorFloat A0_re
#define i20_re
#define o01_im
#define GAUGE1TEX
Definition: covDev.h:113
VOLATILE spinorFloat o30_im
#define g02_im
#define acc00_re
#define g02_re
__constant__ int X4m1
#define gT22_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define VOLATILE
#define gT00_re
#define o00_im
#define g20_im
#define i11_im
#define g11_re
#define g11_im
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define o30_re
spinorFloat B0_re
__constant__ int X4X3X2X1hmX3X2X1h
#define o21_re
spinorFloat a2_re
spinorFloat a1_re
#define o10_re
#define g21_re
VOLATILE spinorFloat o31_re
KernelType kernel_type
#define i00_re
#define o20_im
__constant__ int X4
#define i02_re
__constant__ int X3m1
#define i30_im
#define SHARED_STRIDE
__constant__ int X2X1
#define acc32_im
#define acc10_im