QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_dslash_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24 // floats of shared memory consumed per thread (see WRITE_SPINOR_SHARED below)
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler: keep 'volatile' to suppress register optimizations that break this kernel
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x // iSC_re/_im = real/imag part of input spinor component (spin S, color C)
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x // accSC_re/_im = accumulator spinor, same (spin, color) layout as iSC
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else // single precision: the 24 reals are packed across six float4 vectors I0..I5
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x // gRC_re/_im = real/imag part of 3x3 gauge link element (row R, col C); one double2/float2 per element
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else // float4 layout: the 18 reals are packed across G0..G4
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link: gT[r][c] = conj(g[c][r]), i.e. the Hermitian conjugate of the link above
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
181 // output spinor (oSC_re/_im accumulator defines; NOTE: listing lines 182-205 are elided in this scrape)
206 
207 #ifdef SPINOR_DOUBLE
208 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
209 #else
210 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
211 #endif
212 
213 #include "read_gauge.h"
214 #include "io_spinor.h"
215 
216 int x1, x2, x3, x4;
217 int X;
218 
219 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
220 int sp_norm_idx;
221 #endif // MULTI_GPU half precision
222 
223 int sid;
224 
225 #ifdef MULTI_GPU
226 int face_idx;
228 #endif
229 
230  // Inline by hand for the moment and assume even dimensions
231  const int dims[] = {X1, X2, X3, X4};
232  coordsFromIndex3D<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
233 
234  // only need to check Y and Z dims currently since X and T set to match exactly
235  if (x2 >= X2) return;
236  if (x3 >= X3) return;
237 
238  o00_re = 0; o00_im = 0;
239  o01_re = 0; o01_im = 0;
240  o02_re = 0; o02_im = 0;
241  o10_re = 0; o10_im = 0;
242  o11_re = 0; o11_im = 0;
243  o12_re = 0; o12_im = 0;
244  o20_re = 0; o20_im = 0;
245  o21_re = 0; o21_im = 0;
246  o22_re = 0; o22_im = 0;
247  o30_re = 0; o30_im = 0;
248  o31_re = 0; o31_im = 0;
249  o32_re = 0; o32_im = 0;
250 
251 #ifdef MULTI_GPU
252 } else { // exterior kernel
253 
254  sid = blockIdx.x*blockDim.x + threadIdx.x;
255  if (sid >= param.threads) return;
256 
257  const int dim = static_cast<int>(kernel_type);
258  const int face_volume = (param.threads >> 1); // volume of one face
259  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
260  face_idx = sid - face_num*face_volume; // index into the respective face
261 
262  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
263  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
264  //sp_idx = face_idx + param.ghostOffset[dim];
265 
266 #if (DD_PREC==2) // half precision
267  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
268 #endif
269 
270  const int dims[] = {X1, X2, X3, X4};
271  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
272 
273  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
274 
275  o00_re = i00_re; o00_im = i00_im;
276  o01_re = i01_re; o01_im = i01_im;
277  o02_re = i02_re; o02_im = i02_im;
278  o10_re = i10_re; o10_im = i10_im;
279  o11_re = i11_re; o11_im = i11_im;
280  o12_re = i12_re; o12_im = i12_im;
281  o20_re = i20_re; o20_im = i20_im;
282  o21_re = i21_re; o21_im = i21_im;
283  o22_re = i22_re; o22_im = i22_im;
284  o30_re = i30_re; o30_im = i30_im;
285  o31_re = i31_re; o31_im = i31_im;
286  o32_re = i32_re; o32_im = i32_im;
287 }
288 #endif // MULTI_GPU
289 
290 
291 #ifdef MULTI_GPU
292 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
293  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
294 #endif
295 {
296  // Projector P0-
297  // 1 0 0 -i
298  // 0 1 -i 0
299  // 0 i 1 0
300  // i 0 0 1
301 
302 #ifdef MULTI_GPU
303  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
304  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
305 #else
306  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
307 #endif
308 
309  const int ga_idx = sid;
310 
317 
318 #ifdef MULTI_GPU
319  if (kernel_type == INTERIOR_KERNEL) {
320 #endif
321 
322  // read spinor from device memory
323  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
324 #ifdef TWIST_INV_DSLASH
325  APPLY_TWIST_INV( a, b, i);
326 #endif
327 
328  // store spinor into shared memory
329  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
330 
331  // project spinor into half spinors
332  a0_re = +i00_re+i30_im;
333  a0_im = +i00_im-i30_re;
334  a1_re = +i01_re+i31_im;
335  a1_im = +i01_im-i31_re;
336  a2_re = +i02_re+i32_im;
337  a2_im = +i02_im-i32_re;
338  b0_re = +i10_re+i20_im;
339  b0_im = +i10_im-i20_re;
340  b1_re = +i11_re+i21_im;
341  b1_im = +i11_im-i21_re;
342  b2_re = +i12_re+i22_im;
343  b2_im = +i12_im-i22_re;
344 
345 #ifdef MULTI_GPU
346  } else {
347 
348  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
349 
350  // read half spinor from device memory
351  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
352 
353  a0_re = i00_re; a0_im = i00_im;
354  a1_re = i01_re; a1_im = i01_im;
355  a2_re = i02_re; a2_im = i02_im;
356  b0_re = i10_re; b0_im = i10_im;
357  b1_re = i11_re; b1_im = i11_im;
358  b2_re = i12_re; b2_im = i12_im;
359 
360  }
361 #endif // MULTI_GPU
362 
363  // read gauge matrix from device memory
364  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
365 
366  // reconstruct gauge matrix
368 
369  // multiply row 0
371  A0_re += g00_re * a0_re;
372  A0_re -= g00_im * a0_im;
373  A0_re += g01_re * a1_re;
374  A0_re -= g01_im * a1_im;
375  A0_re += g02_re * a2_re;
376  A0_re -= g02_im * a2_im;
378  A0_im += g00_re * a0_im;
379  A0_im += g00_im * a0_re;
380  A0_im += g01_re * a1_im;
381  A0_im += g01_im * a1_re;
382  A0_im += g02_re * a2_im;
383  A0_im += g02_im * a2_re;
385  B0_re += g00_re * b0_re;
386  B0_re -= g00_im * b0_im;
387  B0_re += g01_re * b1_re;
388  B0_re -= g01_im * b1_im;
389  B0_re += g02_re * b2_re;
390  B0_re -= g02_im * b2_im;
392  B0_im += g00_re * b0_im;
393  B0_im += g00_im * b0_re;
394  B0_im += g01_re * b1_im;
395  B0_im += g01_im * b1_re;
396  B0_im += g02_re * b2_im;
397  B0_im += g02_im * b2_re;
398 
399  // multiply row 1
401  A1_re += g10_re * a0_re;
402  A1_re -= g10_im * a0_im;
403  A1_re += g11_re * a1_re;
404  A1_re -= g11_im * a1_im;
405  A1_re += g12_re * a2_re;
406  A1_re -= g12_im * a2_im;
408  A1_im += g10_re * a0_im;
409  A1_im += g10_im * a0_re;
410  A1_im += g11_re * a1_im;
411  A1_im += g11_im * a1_re;
412  A1_im += g12_re * a2_im;
413  A1_im += g12_im * a2_re;
415  B1_re += g10_re * b0_re;
416  B1_re -= g10_im * b0_im;
417  B1_re += g11_re * b1_re;
418  B1_re -= g11_im * b1_im;
419  B1_re += g12_re * b2_re;
420  B1_re -= g12_im * b2_im;
422  B1_im += g10_re * b0_im;
423  B1_im += g10_im * b0_re;
424  B1_im += g11_re * b1_im;
425  B1_im += g11_im * b1_re;
426  B1_im += g12_re * b2_im;
427  B1_im += g12_im * b2_re;
428 
429  // multiply row 2
431  A2_re += g20_re * a0_re;
432  A2_re -= g20_im * a0_im;
433  A2_re += g21_re * a1_re;
434  A2_re -= g21_im * a1_im;
435  A2_re += g22_re * a2_re;
436  A2_re -= g22_im * a2_im;
438  A2_im += g20_re * a0_im;
439  A2_im += g20_im * a0_re;
440  A2_im += g21_re * a1_im;
441  A2_im += g21_im * a1_re;
442  A2_im += g22_re * a2_im;
443  A2_im += g22_im * a2_re;
445  B2_re += g20_re * b0_re;
446  B2_re -= g20_im * b0_im;
447  B2_re += g21_re * b1_re;
448  B2_re -= g21_im * b1_im;
449  B2_re += g22_re * b2_re;
450  B2_re -= g22_im * b2_im;
452  B2_im += g20_re * b0_im;
453  B2_im += g20_im * b0_re;
454  B2_im += g21_re * b1_im;
455  B2_im += g21_im * b1_re;
456  B2_im += g22_re * b2_im;
457  B2_im += g22_im * b2_re;
458 
459  o00_re += A0_re;
460  o00_im += A0_im;
461  o10_re += B0_re;
462  o10_im += B0_im;
463  o20_re -= B0_im;
464  o20_im += B0_re;
465  o30_re -= A0_im;
466  o30_im += A0_re;
467 
468  o01_re += A1_re;
469  o01_im += A1_im;
470  o11_re += B1_re;
471  o11_im += B1_im;
472  o21_re -= B1_im;
473  o21_im += B1_re;
474  o31_re -= A1_im;
475  o31_im += A1_re;
476 
477  o02_re += A2_re;
478  o02_im += A2_im;
479  o12_re += B2_re;
480  o12_im += B2_im;
481  o22_re -= B2_im;
482  o22_im += B2_re;
483  o32_re -= A2_im;
484  o32_im += A2_re;
485 
486 }
487 
488 #ifdef MULTI_GPU
489 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
490  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
491 #endif
492 {
493  // Projector P0+
494  // 1 0 0 i
495  // 0 1 i 0
496  // 0 -i 1 0
497  // -i 0 0 1
498 
499 #ifdef MULTI_GPU
500  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
501  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
502 #else
503  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
504 #endif
505 
506 #ifdef MULTI_GPU
507  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
508 #else
509  const int ga_idx = sp_idx;
510 #endif
511 
518 
519 #ifdef MULTI_GPU
520  if (kernel_type == INTERIOR_KERNEL) {
521 #endif
522 
523  // load spinor from shared memory
524  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
525  __syncthreads();
526  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
527 
528  // project spinor into half spinors
529  a0_re = +i00_re-i30_im;
530  a0_im = +i00_im+i30_re;
531  a1_re = +i01_re-i31_im;
532  a1_im = +i01_im+i31_re;
533  a2_re = +i02_re-i32_im;
534  a2_im = +i02_im+i32_re;
535  b0_re = +i10_re-i20_im;
536  b0_im = +i10_im+i20_re;
537  b1_re = +i11_re-i21_im;
538  b1_im = +i11_im+i21_re;
539  b2_re = +i12_re-i22_im;
540  b2_im = +i12_im+i22_re;
541 
542 #ifdef MULTI_GPU
543  } else {
544 
545  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
546 
547  // read half spinor from device memory
548  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
549 
550  a0_re = i00_re; a0_im = i00_im;
551  a1_re = i01_re; a1_im = i01_im;
552  a2_re = i02_re; a2_im = i02_im;
553  b0_re = i10_re; b0_im = i10_im;
554  b1_re = i11_re; b1_im = i11_im;
555  b2_re = i12_re; b2_im = i12_im;
556 
557  }
558 #endif // MULTI_GPU
559 
560  // read gauge matrix from device memory
561  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
562 
563  // reconstruct gauge matrix
565 
566  // multiply row 0
567  spinorFloat A0_re = 0;
568  A0_re += gT00_re * a0_re;
569  A0_re -= gT00_im * a0_im;
570  A0_re += gT01_re * a1_re;
571  A0_re -= gT01_im * a1_im;
572  A0_re += gT02_re * a2_re;
573  A0_re -= gT02_im * a2_im;
574  spinorFloat A0_im = 0;
575  A0_im += gT00_re * a0_im;
576  A0_im += gT00_im * a0_re;
577  A0_im += gT01_re * a1_im;
578  A0_im += gT01_im * a1_re;
579  A0_im += gT02_re * a2_im;
580  A0_im += gT02_im * a2_re;
581  spinorFloat B0_re = 0;
582  B0_re += gT00_re * b0_re;
583  B0_re -= gT00_im * b0_im;
584  B0_re += gT01_re * b1_re;
585  B0_re -= gT01_im * b1_im;
586  B0_re += gT02_re * b2_re;
587  B0_re -= gT02_im * b2_im;
588  spinorFloat B0_im = 0;
589  B0_im += gT00_re * b0_im;
590  B0_im += gT00_im * b0_re;
591  B0_im += gT01_re * b1_im;
592  B0_im += gT01_im * b1_re;
593  B0_im += gT02_re * b2_im;
594  B0_im += gT02_im * b2_re;
595 
596  // multiply row 1
597  spinorFloat A1_re = 0;
598  A1_re += gT10_re * a0_re;
599  A1_re -= gT10_im * a0_im;
600  A1_re += gT11_re * a1_re;
601  A1_re -= gT11_im * a1_im;
602  A1_re += gT12_re * a2_re;
603  A1_re -= gT12_im * a2_im;
604  spinorFloat A1_im = 0;
605  A1_im += gT10_re * a0_im;
606  A1_im += gT10_im * a0_re;
607  A1_im += gT11_re * a1_im;
608  A1_im += gT11_im * a1_re;
609  A1_im += gT12_re * a2_im;
610  A1_im += gT12_im * a2_re;
611  spinorFloat B1_re = 0;
612  B1_re += gT10_re * b0_re;
613  B1_re -= gT10_im * b0_im;
614  B1_re += gT11_re * b1_re;
615  B1_re -= gT11_im * b1_im;
616  B1_re += gT12_re * b2_re;
617  B1_re -= gT12_im * b2_im;
618  spinorFloat B1_im = 0;
619  B1_im += gT10_re * b0_im;
620  B1_im += gT10_im * b0_re;
621  B1_im += gT11_re * b1_im;
622  B1_im += gT11_im * b1_re;
623  B1_im += gT12_re * b2_im;
624  B1_im += gT12_im * b2_re;
625 
626  // multiply row 2
627  spinorFloat A2_re = 0;
628  A2_re += gT20_re * a0_re;
629  A2_re -= gT20_im * a0_im;
630  A2_re += gT21_re * a1_re;
631  A2_re -= gT21_im * a1_im;
632  A2_re += gT22_re * a2_re;
633  A2_re -= gT22_im * a2_im;
634  spinorFloat A2_im = 0;
635  A2_im += gT20_re * a0_im;
636  A2_im += gT20_im * a0_re;
637  A2_im += gT21_re * a1_im;
638  A2_im += gT21_im * a1_re;
639  A2_im += gT22_re * a2_im;
640  A2_im += gT22_im * a2_re;
641  spinorFloat B2_re = 0;
642  B2_re += gT20_re * b0_re;
643  B2_re -= gT20_im * b0_im;
644  B2_re += gT21_re * b1_re;
645  B2_re -= gT21_im * b1_im;
646  B2_re += gT22_re * b2_re;
647  B2_re -= gT22_im * b2_im;
648  spinorFloat B2_im = 0;
649  B2_im += gT20_re * b0_im;
650  B2_im += gT20_im * b0_re;
651  B2_im += gT21_re * b1_im;
652  B2_im += gT21_im * b1_re;
653  B2_im += gT22_re * b2_im;
654  B2_im += gT22_im * b2_re;
655 
656  o00_re += A0_re;
657  o00_im += A0_im;
658  o10_re += B0_re;
659  o10_im += B0_im;
660  o20_re += B0_im;
661  o20_im -= B0_re;
662  o30_re += A0_im;
663  o30_im -= A0_re;
664 
665  o01_re += A1_re;
666  o01_im += A1_im;
667  o11_re += B1_re;
668  o11_im += B1_im;
669  o21_re += B1_im;
670  o21_im -= B1_re;
671  o31_re += A1_im;
672  o31_im -= A1_re;
673 
674  o02_re += A2_re;
675  o02_im += A2_im;
676  o12_re += B2_re;
677  o12_im += B2_im;
678  o22_re += B2_im;
679  o22_im -= B2_re;
680  o32_re += A2_im;
681  o32_im -= A2_re;
682 
683 }
684 
685 #ifdef MULTI_GPU
686 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
687  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
688 #endif
689 {
690  // Projector P1-
691  // 1 0 0 -1
692  // 0 1 1 0
693  // 0 1 1 0
694  // -1 0 0 1
695 
696 #ifdef MULTI_GPU
697  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
698  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
699 #else
700  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
701 #endif
702 
703  const int ga_idx = sid;
704 
711 
712 #ifdef MULTI_GPU
713  if (kernel_type == INTERIOR_KERNEL) {
714 #endif
715 
716  if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
717  // read spinor from device memory
718  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
719 #ifdef TWIST_INV_DSLASH
720  APPLY_TWIST_INV( a, b, i);
721 #endif
722 
723  // project spinor into half spinors
724  a0_re = +i00_re-i30_re;
725  a0_im = +i00_im-i30_im;
726  a1_re = +i01_re-i31_re;
727  a1_im = +i01_im-i31_im;
728  a2_re = +i02_re-i32_re;
729  a2_im = +i02_im-i32_im;
730  b0_re = +i10_re+i20_re;
731  b0_im = +i10_im+i20_im;
732  b1_re = +i11_re+i21_re;
733  b1_im = +i11_im+i21_im;
734  b2_re = +i12_re+i22_re;
735  b2_im = +i12_im+i22_im;
736  } else {
737  // load spinor from shared memory
738  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
739  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
740  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
741 
742  // project spinor into half spinors
743  a0_re = +i00_re-i30_re;
744  a0_im = +i00_im-i30_im;
745  a1_re = +i01_re-i31_re;
746  a1_im = +i01_im-i31_im;
747  a2_re = +i02_re-i32_re;
748  a2_im = +i02_im-i32_im;
749  b0_re = +i10_re+i20_re;
750  b0_im = +i10_im+i20_im;
751  b1_re = +i11_re+i21_re;
752  b1_im = +i11_im+i21_im;
753  b2_re = +i12_re+i22_re;
754  b2_im = +i12_im+i22_im;
755  }
756 
757 #ifdef MULTI_GPU
758  } else {
759 
760  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
761 
762  // read half spinor from device memory
763  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
764 
765  a0_re = i00_re; a0_im = i00_im;
766  a1_re = i01_re; a1_im = i01_im;
767  a2_re = i02_re; a2_im = i02_im;
768  b0_re = i10_re; b0_im = i10_im;
769  b1_re = i11_re; b1_im = i11_im;
770  b2_re = i12_re; b2_im = i12_im;
771 
772  }
773 #endif // MULTI_GPU
774 
775  // read gauge matrix from device memory
776  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
777 
778  // reconstruct gauge matrix
780 
781  // multiply row 0
782  spinorFloat A0_re = 0;
783  A0_re += g00_re * a0_re;
784  A0_re -= g00_im * a0_im;
785  A0_re += g01_re * a1_re;
786  A0_re -= g01_im * a1_im;
787  A0_re += g02_re * a2_re;
788  A0_re -= g02_im * a2_im;
789  spinorFloat A0_im = 0;
790  A0_im += g00_re * a0_im;
791  A0_im += g00_im * a0_re;
792  A0_im += g01_re * a1_im;
793  A0_im += g01_im * a1_re;
794  A0_im += g02_re * a2_im;
795  A0_im += g02_im * a2_re;
796  spinorFloat B0_re = 0;
797  B0_re += g00_re * b0_re;
798  B0_re -= g00_im * b0_im;
799  B0_re += g01_re * b1_re;
800  B0_re -= g01_im * b1_im;
801  B0_re += g02_re * b2_re;
802  B0_re -= g02_im * b2_im;
803  spinorFloat B0_im = 0;
804  B0_im += g00_re * b0_im;
805  B0_im += g00_im * b0_re;
806  B0_im += g01_re * b1_im;
807  B0_im += g01_im * b1_re;
808  B0_im += g02_re * b2_im;
809  B0_im += g02_im * b2_re;
810 
811  // multiply row 1
812  spinorFloat A1_re = 0;
813  A1_re += g10_re * a0_re;
814  A1_re -= g10_im * a0_im;
815  A1_re += g11_re * a1_re;
816  A1_re -= g11_im * a1_im;
817  A1_re += g12_re * a2_re;
818  A1_re -= g12_im * a2_im;
819  spinorFloat A1_im = 0;
820  A1_im += g10_re * a0_im;
821  A1_im += g10_im * a0_re;
822  A1_im += g11_re * a1_im;
823  A1_im += g11_im * a1_re;
824  A1_im += g12_re * a2_im;
825  A1_im += g12_im * a2_re;
826  spinorFloat B1_re = 0;
827  B1_re += g10_re * b0_re;
828  B1_re -= g10_im * b0_im;
829  B1_re += g11_re * b1_re;
830  B1_re -= g11_im * b1_im;
831  B1_re += g12_re * b2_re;
832  B1_re -= g12_im * b2_im;
833  spinorFloat B1_im = 0;
834  B1_im += g10_re * b0_im;
835  B1_im += g10_im * b0_re;
836  B1_im += g11_re * b1_im;
837  B1_im += g11_im * b1_re;
838  B1_im += g12_re * b2_im;
839  B1_im += g12_im * b2_re;
840 
841  // multiply row 2
842  spinorFloat A2_re = 0;
843  A2_re += g20_re * a0_re;
844  A2_re -= g20_im * a0_im;
845  A2_re += g21_re * a1_re;
846  A2_re -= g21_im * a1_im;
847  A2_re += g22_re * a2_re;
848  A2_re -= g22_im * a2_im;
849  spinorFloat A2_im = 0;
850  A2_im += g20_re * a0_im;
851  A2_im += g20_im * a0_re;
852  A2_im += g21_re * a1_im;
853  A2_im += g21_im * a1_re;
854  A2_im += g22_re * a2_im;
855  A2_im += g22_im * a2_re;
856  spinorFloat B2_re = 0;
857  B2_re += g20_re * b0_re;
858  B2_re -= g20_im * b0_im;
859  B2_re += g21_re * b1_re;
860  B2_re -= g21_im * b1_im;
861  B2_re += g22_re * b2_re;
862  B2_re -= g22_im * b2_im;
863  spinorFloat B2_im = 0;
864  B2_im += g20_re * b0_im;
865  B2_im += g20_im * b0_re;
866  B2_im += g21_re * b1_im;
867  B2_im += g21_im * b1_re;
868  B2_im += g22_re * b2_im;
869  B2_im += g22_im * b2_re;
870 
871  o00_re += A0_re;
872  o00_im += A0_im;
873  o10_re += B0_re;
874  o10_im += B0_im;
875  o20_re += B0_re;
876  o20_im += B0_im;
877  o30_re -= A0_re;
878  o30_im -= A0_im;
879 
880  o01_re += A1_re;
881  o01_im += A1_im;
882  o11_re += B1_re;
883  o11_im += B1_im;
884  o21_re += B1_re;
885  o21_im += B1_im;
886  o31_re -= A1_re;
887  o31_im -= A1_im;
888 
889  o02_re += A2_re;
890  o02_im += A2_im;
891  o12_re += B2_re;
892  o12_im += B2_im;
893  o22_re += B2_re;
894  o22_im += B2_im;
895  o32_re -= A2_re;
896  o32_im -= A2_im;
897 
898 }
899 
900 #ifdef MULTI_GPU
901 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
902  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
903 #endif
904 {
905  // Projector P1+
906  // 1 0 0 1
907  // 0 1 -1 0
908  // 0 -1 1 0
909  // 1 0 0 1
910 
911 #ifdef MULTI_GPU
912  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
913  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
914 #else
915  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
916 #endif
917 
918 #ifdef MULTI_GPU
919  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
920 #else
921  const int ga_idx = sp_idx;
922 #endif
923 
930 
931 #ifdef MULTI_GPU
932  if (kernel_type == INTERIOR_KERNEL) {
933 #endif
934 
935  if (threadIdx.y == 0 && blockDim.y < X2) {
936  // read spinor from device memory
937  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
938 #ifdef TWIST_INV_DSLASH
939  APPLY_TWIST_INV( a, b, i);
940 #endif
941 
942  // project spinor into half spinors
943  a0_re = +i00_re+i30_re;
944  a0_im = +i00_im+i30_im;
945  a1_re = +i01_re+i31_re;
946  a1_im = +i01_im+i31_im;
947  a2_re = +i02_re+i32_re;
948  a2_im = +i02_im+i32_im;
949  b0_re = +i10_re-i20_re;
950  b0_im = +i10_im-i20_im;
951  b1_re = +i11_re-i21_re;
952  b1_im = +i11_im-i21_im;
953  b2_re = +i12_re-i22_re;
954  b2_im = +i12_im-i22_im;
955  } else {
956  // load spinor from shared memory
957  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
958  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
959  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
960 
961  // project spinor into half spinors
962  a0_re = +i00_re+i30_re;
963  a0_im = +i00_im+i30_im;
964  a1_re = +i01_re+i31_re;
965  a1_im = +i01_im+i31_im;
966  a2_re = +i02_re+i32_re;
967  a2_im = +i02_im+i32_im;
968  b0_re = +i10_re-i20_re;
969  b0_im = +i10_im-i20_im;
970  b1_re = +i11_re-i21_re;
971  b1_im = +i11_im-i21_im;
972  b2_re = +i12_re-i22_re;
973  b2_im = +i12_im-i22_im;
974  }
975 
976 #ifdef MULTI_GPU
977  } else {
978 
979  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
980 
981  // read half spinor from device memory
982  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
983 
984  a0_re = i00_re; a0_im = i00_im;
985  a1_re = i01_re; a1_im = i01_im;
986  a2_re = i02_re; a2_im = i02_im;
987  b0_re = i10_re; b0_im = i10_im;
988  b1_re = i11_re; b1_im = i11_im;
989  b2_re = i12_re; b2_im = i12_im;
990 
991  }
992 #endif // MULTI_GPU
993 
994  // read gauge matrix from device memory
995  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
996 
997  // reconstruct gauge matrix
999 
1000  // multiply row 0
1001  spinorFloat A0_re = 0;
1002  A0_re += gT00_re * a0_re;
1003  A0_re -= gT00_im * a0_im;
1004  A0_re += gT01_re * a1_re;
1005  A0_re -= gT01_im * a1_im;
1006  A0_re += gT02_re * a2_re;
1007  A0_re -= gT02_im * a2_im;
1008  spinorFloat A0_im = 0;
1009  A0_im += gT00_re * a0_im;
1010  A0_im += gT00_im * a0_re;
1011  A0_im += gT01_re * a1_im;
1012  A0_im += gT01_im * a1_re;
1013  A0_im += gT02_re * a2_im;
1014  A0_im += gT02_im * a2_re;
1015  spinorFloat B0_re = 0;
1016  B0_re += gT00_re * b0_re;
1017  B0_re -= gT00_im * b0_im;
1018  B0_re += gT01_re * b1_re;
1019  B0_re -= gT01_im * b1_im;
1020  B0_re += gT02_re * b2_re;
1021  B0_re -= gT02_im * b2_im;
1022  spinorFloat B0_im = 0;
1023  B0_im += gT00_re * b0_im;
1024  B0_im += gT00_im * b0_re;
1025  B0_im += gT01_re * b1_im;
1026  B0_im += gT01_im * b1_re;
1027  B0_im += gT02_re * b2_im;
1028  B0_im += gT02_im * b2_re;
1029 
1030  // multiply row 1
1031  spinorFloat A1_re = 0;
1032  A1_re += gT10_re * a0_re;
1033  A1_re -= gT10_im * a0_im;
1034  A1_re += gT11_re * a1_re;
1035  A1_re -= gT11_im * a1_im;
1036  A1_re += gT12_re * a2_re;
1037  A1_re -= gT12_im * a2_im;
1038  spinorFloat A1_im = 0;
1039  A1_im += gT10_re * a0_im;
1040  A1_im += gT10_im * a0_re;
1041  A1_im += gT11_re * a1_im;
1042  A1_im += gT11_im * a1_re;
1043  A1_im += gT12_re * a2_im;
1044  A1_im += gT12_im * a2_re;
1045  spinorFloat B1_re = 0;
1046  B1_re += gT10_re * b0_re;
1047  B1_re -= gT10_im * b0_im;
1048  B1_re += gT11_re * b1_re;
1049  B1_re -= gT11_im * b1_im;
1050  B1_re += gT12_re * b2_re;
1051  B1_re -= gT12_im * b2_im;
1052  spinorFloat B1_im = 0;
1053  B1_im += gT10_re * b0_im;
1054  B1_im += gT10_im * b0_re;
1055  B1_im += gT11_re * b1_im;
1056  B1_im += gT11_im * b1_re;
1057  B1_im += gT12_re * b2_im;
1058  B1_im += gT12_im * b2_re;
1059 
1060  // multiply row 2
1061  spinorFloat A2_re = 0;
1062  A2_re += gT20_re * a0_re;
1063  A2_re -= gT20_im * a0_im;
1064  A2_re += gT21_re * a1_re;
1065  A2_re -= gT21_im * a1_im;
1066  A2_re += gT22_re * a2_re;
1067  A2_re -= gT22_im * a2_im;
1068  spinorFloat A2_im = 0;
1069  A2_im += gT20_re * a0_im;
1070  A2_im += gT20_im * a0_re;
1071  A2_im += gT21_re * a1_im;
1072  A2_im += gT21_im * a1_re;
1073  A2_im += gT22_re * a2_im;
1074  A2_im += gT22_im * a2_re;
1075  spinorFloat B2_re = 0;
1076  B2_re += gT20_re * b0_re;
1077  B2_re -= gT20_im * b0_im;
1078  B2_re += gT21_re * b1_re;
1079  B2_re -= gT21_im * b1_im;
1080  B2_re += gT22_re * b2_re;
1081  B2_re -= gT22_im * b2_im;
1082  spinorFloat B2_im = 0;
1083  B2_im += gT20_re * b0_im;
1084  B2_im += gT20_im * b0_re;
1085  B2_im += gT21_re * b1_im;
1086  B2_im += gT21_im * b1_re;
1087  B2_im += gT22_re * b2_im;
1088  B2_im += gT22_im * b2_re;
1089 
1090  o00_re += A0_re;
1091  o00_im += A0_im;
1092  o10_re += B0_re;
1093  o10_im += B0_im;
1094  o20_re -= B0_re;
1095  o20_im -= B0_im;
1096  o30_re += A0_re;
1097  o30_im += A0_im;
1098 
1099  o01_re += A1_re;
1100  o01_im += A1_im;
1101  o11_re += B1_re;
1102  o11_im += B1_im;
1103  o21_re -= B1_re;
1104  o21_im -= B1_im;
1105  o31_re += A1_re;
1106  o31_im += A1_im;
1107 
1108  o02_re += A2_re;
1109  o02_im += A2_im;
1110  o12_re += B2_re;
1111  o12_im += B2_im;
1112  o22_re -= B2_re;
1113  o22_im -= B2_im;
1114  o32_re += A2_re;
1115  o32_im += A2_im;
1116 
1117 }
1118 
1119 #ifdef MULTI_GPU
1120 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1121  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1122 #endif
1123 {
1124  // Projector P2-
1125  // 1 0 -i 0
1126  // 0 1 0 i
1127  // i 0 1 0
1128  // 0 -i 0 1
1129 
1130 #ifdef MULTI_GPU
1131  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1132  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1133 #else
1134  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1135 #endif
1136 
1137  const int ga_idx = sid;
1138 
1145 
1146 #ifdef MULTI_GPU
1147  if (kernel_type == INTERIOR_KERNEL) {
1148 #endif
1149 
1150  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1151  // read spinor from device memory
1152  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1153 #ifdef TWIST_INV_DSLASH
1154  APPLY_TWIST_INV( a, b, i);
1155 #endif
1156 
1157  // project spinor into half spinors
1158  a0_re = +i00_re+i20_im;
1159  a0_im = +i00_im-i20_re;
1160  a1_re = +i01_re+i21_im;
1161  a1_im = +i01_im-i21_re;
1162  a2_re = +i02_re+i22_im;
1163  a2_im = +i02_im-i22_re;
1164  b0_re = +i10_re-i30_im;
1165  b0_im = +i10_im+i30_re;
1166  b1_re = +i11_re-i31_im;
1167  b1_im = +i11_im+i31_re;
1168  b2_re = +i12_re-i32_im;
1169  b2_im = +i12_im+i32_re;
1170  } else {
1171  // load spinor from shared memory
1172  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1173  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1174  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1175 
1176  // project spinor into half spinors
1177  a0_re = +i00_re+i20_im;
1178  a0_im = +i00_im-i20_re;
1179  a1_re = +i01_re+i21_im;
1180  a1_im = +i01_im-i21_re;
1181  a2_re = +i02_re+i22_im;
1182  a2_im = +i02_im-i22_re;
1183  b0_re = +i10_re-i30_im;
1184  b0_im = +i10_im+i30_re;
1185  b1_re = +i11_re-i31_im;
1186  b1_im = +i11_im+i31_re;
1187  b2_re = +i12_re-i32_im;
1188  b2_im = +i12_im+i32_re;
1189  }
1190 
1191 #ifdef MULTI_GPU
1192  } else {
1193 
1194  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1195 
1196  // read half spinor from device memory
1197  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1198 
1199  a0_re = i00_re; a0_im = i00_im;
1200  a1_re = i01_re; a1_im = i01_im;
1201  a2_re = i02_re; a2_im = i02_im;
1202  b0_re = i10_re; b0_im = i10_im;
1203  b1_re = i11_re; b1_im = i11_im;
1204  b2_re = i12_re; b2_im = i12_im;
1205 
1206  }
1207 #endif // MULTI_GPU
1208 
1209  // read gauge matrix from device memory
1210  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1211 
1212  // reconstruct gauge matrix
1214 
1215  // multiply row 0
1216  spinorFloat A0_re = 0;
1217  A0_re += g00_re * a0_re;
1218  A0_re -= g00_im * a0_im;
1219  A0_re += g01_re * a1_re;
1220  A0_re -= g01_im * a1_im;
1221  A0_re += g02_re * a2_re;
1222  A0_re -= g02_im * a2_im;
1223  spinorFloat A0_im = 0;
1224  A0_im += g00_re * a0_im;
1225  A0_im += g00_im * a0_re;
1226  A0_im += g01_re * a1_im;
1227  A0_im += g01_im * a1_re;
1228  A0_im += g02_re * a2_im;
1229  A0_im += g02_im * a2_re;
1230  spinorFloat B0_re = 0;
1231  B0_re += g00_re * b0_re;
1232  B0_re -= g00_im * b0_im;
1233  B0_re += g01_re * b1_re;
1234  B0_re -= g01_im * b1_im;
1235  B0_re += g02_re * b2_re;
1236  B0_re -= g02_im * b2_im;
1237  spinorFloat B0_im = 0;
1238  B0_im += g00_re * b0_im;
1239  B0_im += g00_im * b0_re;
1240  B0_im += g01_re * b1_im;
1241  B0_im += g01_im * b1_re;
1242  B0_im += g02_re * b2_im;
1243  B0_im += g02_im * b2_re;
1244 
1245  // multiply row 1
1246  spinorFloat A1_re = 0;
1247  A1_re += g10_re * a0_re;
1248  A1_re -= g10_im * a0_im;
1249  A1_re += g11_re * a1_re;
1250  A1_re -= g11_im * a1_im;
1251  A1_re += g12_re * a2_re;
1252  A1_re -= g12_im * a2_im;
1253  spinorFloat A1_im = 0;
1254  A1_im += g10_re * a0_im;
1255  A1_im += g10_im * a0_re;
1256  A1_im += g11_re * a1_im;
1257  A1_im += g11_im * a1_re;
1258  A1_im += g12_re * a2_im;
1259  A1_im += g12_im * a2_re;
1260  spinorFloat B1_re = 0;
1261  B1_re += g10_re * b0_re;
1262  B1_re -= g10_im * b0_im;
1263  B1_re += g11_re * b1_re;
1264  B1_re -= g11_im * b1_im;
1265  B1_re += g12_re * b2_re;
1266  B1_re -= g12_im * b2_im;
1267  spinorFloat B1_im = 0;
1268  B1_im += g10_re * b0_im;
1269  B1_im += g10_im * b0_re;
1270  B1_im += g11_re * b1_im;
1271  B1_im += g11_im * b1_re;
1272  B1_im += g12_re * b2_im;
1273  B1_im += g12_im * b2_re;
1274 
1275  // multiply row 2
1276  spinorFloat A2_re = 0;
1277  A2_re += g20_re * a0_re;
1278  A2_re -= g20_im * a0_im;
1279  A2_re += g21_re * a1_re;
1280  A2_re -= g21_im * a1_im;
1281  A2_re += g22_re * a2_re;
1282  A2_re -= g22_im * a2_im;
1283  spinorFloat A2_im = 0;
1284  A2_im += g20_re * a0_im;
1285  A2_im += g20_im * a0_re;
1286  A2_im += g21_re * a1_im;
1287  A2_im += g21_im * a1_re;
1288  A2_im += g22_re * a2_im;
1289  A2_im += g22_im * a2_re;
1290  spinorFloat B2_re = 0;
1291  B2_re += g20_re * b0_re;
1292  B2_re -= g20_im * b0_im;
1293  B2_re += g21_re * b1_re;
1294  B2_re -= g21_im * b1_im;
1295  B2_re += g22_re * b2_re;
1296  B2_re -= g22_im * b2_im;
1297  spinorFloat B2_im = 0;
1298  B2_im += g20_re * b0_im;
1299  B2_im += g20_im * b0_re;
1300  B2_im += g21_re * b1_im;
1301  B2_im += g21_im * b1_re;
1302  B2_im += g22_re * b2_im;
1303  B2_im += g22_im * b2_re;
1304 
1305  o00_re += A0_re;
1306  o00_im += A0_im;
1307  o10_re += B0_re;
1308  o10_im += B0_im;
1309  o20_re -= A0_im;
1310  o20_im += A0_re;
1311  o30_re += B0_im;
1312  o30_im -= B0_re;
1313 
1314  o01_re += A1_re;
1315  o01_im += A1_im;
1316  o11_re += B1_re;
1317  o11_im += B1_im;
1318  o21_re -= A1_im;
1319  o21_im += A1_re;
1320  o31_re += B1_im;
1321  o31_im -= B1_re;
1322 
1323  o02_re += A2_re;
1324  o02_im += A2_im;
1325  o12_re += B2_re;
1326  o12_im += B2_im;
1327  o22_re -= A2_im;
1328  o22_im += A2_re;
1329  o32_re += B2_im;
1330  o32_im -= B2_re;
1331 
1332 }
1333 
1334 #ifdef MULTI_GPU
1335 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1336  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1337 #endif
1338 {
1339  // Projector P2+
1340  // 1 0 i 0
1341  // 0 1 0 -i
1342  // -i 0 1 0
1343  // 0 i 0 1
1344 
1345 #ifdef MULTI_GPU
1346  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1347  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1348 #else
1349  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1350 #endif
1351 
1352 #ifdef MULTI_GPU
1353  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1354 #else
1355  const int ga_idx = sp_idx;
1356 #endif
1357 
1364 
1365 #ifdef MULTI_GPU
1366  if (kernel_type == INTERIOR_KERNEL) {
1367 #endif
1368 
1369  if (threadIdx.z == 0 && blockDim.z < X3) {
1370  // read spinor from device memory
1371  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1372 #ifdef TWIST_INV_DSLASH
1373  APPLY_TWIST_INV( a, b, i);
1374 #endif
1375 
1376  // project spinor into half spinors
1377  a0_re = +i00_re-i20_im;
1378  a0_im = +i00_im+i20_re;
1379  a1_re = +i01_re-i21_im;
1380  a1_im = +i01_im+i21_re;
1381  a2_re = +i02_re-i22_im;
1382  a2_im = +i02_im+i22_re;
1383  b0_re = +i10_re+i30_im;
1384  b0_im = +i10_im-i30_re;
1385  b1_re = +i11_re+i31_im;
1386  b1_im = +i11_im-i31_re;
1387  b2_re = +i12_re+i32_im;
1388  b2_im = +i12_im-i32_re;
1389  } else {
1390  // load spinor from shared memory
1391  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1392  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1393  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1394 
1395  // project spinor into half spinors
1396  a0_re = +i00_re-i20_im;
1397  a0_im = +i00_im+i20_re;
1398  a1_re = +i01_re-i21_im;
1399  a1_im = +i01_im+i21_re;
1400  a2_re = +i02_re-i22_im;
1401  a2_im = +i02_im+i22_re;
1402  b0_re = +i10_re+i30_im;
1403  b0_im = +i10_im-i30_re;
1404  b1_re = +i11_re+i31_im;
1405  b1_im = +i11_im-i31_re;
1406  b2_re = +i12_re+i32_im;
1407  b2_im = +i12_im-i32_re;
1408  }
1409 
1410 #ifdef MULTI_GPU
1411  } else {
1412 
1413  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1414 
1415  // read half spinor from device memory
1416  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1417 
1418  a0_re = i00_re; a0_im = i00_im;
1419  a1_re = i01_re; a1_im = i01_im;
1420  a2_re = i02_re; a2_im = i02_im;
1421  b0_re = i10_re; b0_im = i10_im;
1422  b1_re = i11_re; b1_im = i11_im;
1423  b2_re = i12_re; b2_im = i12_im;
1424 
1425  }
1426 #endif // MULTI_GPU
1427 
1428  // read gauge matrix from device memory
1429  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1430 
1431  // reconstruct gauge matrix
1433 
1434  // multiply row 0
1435  spinorFloat A0_re = 0;
1436  A0_re += gT00_re * a0_re;
1437  A0_re -= gT00_im * a0_im;
1438  A0_re += gT01_re * a1_re;
1439  A0_re -= gT01_im * a1_im;
1440  A0_re += gT02_re * a2_re;
1441  A0_re -= gT02_im * a2_im;
1442  spinorFloat A0_im = 0;
1443  A0_im += gT00_re * a0_im;
1444  A0_im += gT00_im * a0_re;
1445  A0_im += gT01_re * a1_im;
1446  A0_im += gT01_im * a1_re;
1447  A0_im += gT02_re * a2_im;
1448  A0_im += gT02_im * a2_re;
1449  spinorFloat B0_re = 0;
1450  B0_re += gT00_re * b0_re;
1451  B0_re -= gT00_im * b0_im;
1452  B0_re += gT01_re * b1_re;
1453  B0_re -= gT01_im * b1_im;
1454  B0_re += gT02_re * b2_re;
1455  B0_re -= gT02_im * b2_im;
1456  spinorFloat B0_im = 0;
1457  B0_im += gT00_re * b0_im;
1458  B0_im += gT00_im * b0_re;
1459  B0_im += gT01_re * b1_im;
1460  B0_im += gT01_im * b1_re;
1461  B0_im += gT02_re * b2_im;
1462  B0_im += gT02_im * b2_re;
1463 
1464  // multiply row 1
1465  spinorFloat A1_re = 0;
1466  A1_re += gT10_re * a0_re;
1467  A1_re -= gT10_im * a0_im;
1468  A1_re += gT11_re * a1_re;
1469  A1_re -= gT11_im * a1_im;
1470  A1_re += gT12_re * a2_re;
1471  A1_re -= gT12_im * a2_im;
1472  spinorFloat A1_im = 0;
1473  A1_im += gT10_re * a0_im;
1474  A1_im += gT10_im * a0_re;
1475  A1_im += gT11_re * a1_im;
1476  A1_im += gT11_im * a1_re;
1477  A1_im += gT12_re * a2_im;
1478  A1_im += gT12_im * a2_re;
1479  spinorFloat B1_re = 0;
1480  B1_re += gT10_re * b0_re;
1481  B1_re -= gT10_im * b0_im;
1482  B1_re += gT11_re * b1_re;
1483  B1_re -= gT11_im * b1_im;
1484  B1_re += gT12_re * b2_re;
1485  B1_re -= gT12_im * b2_im;
1486  spinorFloat B1_im = 0;
1487  B1_im += gT10_re * b0_im;
1488  B1_im += gT10_im * b0_re;
1489  B1_im += gT11_re * b1_im;
1490  B1_im += gT11_im * b1_re;
1491  B1_im += gT12_re * b2_im;
1492  B1_im += gT12_im * b2_re;
1493 
1494  // multiply row 2
1495  spinorFloat A2_re = 0;
1496  A2_re += gT20_re * a0_re;
1497  A2_re -= gT20_im * a0_im;
1498  A2_re += gT21_re * a1_re;
1499  A2_re -= gT21_im * a1_im;
1500  A2_re += gT22_re * a2_re;
1501  A2_re -= gT22_im * a2_im;
1502  spinorFloat A2_im = 0;
1503  A2_im += gT20_re * a0_im;
1504  A2_im += gT20_im * a0_re;
1505  A2_im += gT21_re * a1_im;
1506  A2_im += gT21_im * a1_re;
1507  A2_im += gT22_re * a2_im;
1508  A2_im += gT22_im * a2_re;
1509  spinorFloat B2_re = 0;
1510  B2_re += gT20_re * b0_re;
1511  B2_re -= gT20_im * b0_im;
1512  B2_re += gT21_re * b1_re;
1513  B2_re -= gT21_im * b1_im;
1514  B2_re += gT22_re * b2_re;
1515  B2_re -= gT22_im * b2_im;
1516  spinorFloat B2_im = 0;
1517  B2_im += gT20_re * b0_im;
1518  B2_im += gT20_im * b0_re;
1519  B2_im += gT21_re * b1_im;
1520  B2_im += gT21_im * b1_re;
1521  B2_im += gT22_re * b2_im;
1522  B2_im += gT22_im * b2_re;
1523 
1524  o00_re += A0_re;
1525  o00_im += A0_im;
1526  o10_re += B0_re;
1527  o10_im += B0_im;
1528  o20_re += A0_im;
1529  o20_im -= A0_re;
1530  o30_re -= B0_im;
1531  o30_im += B0_re;
1532 
1533  o01_re += A1_re;
1534  o01_im += A1_im;
1535  o11_re += B1_re;
1536  o11_im += B1_im;
1537  o21_re += A1_im;
1538  o21_im -= A1_re;
1539  o31_re -= B1_im;
1540  o31_im += B1_re;
1541 
1542  o02_re += A2_re;
1543  o02_im += A2_im;
1544  o12_re += B2_re;
1545  o12_im += B2_im;
1546  o22_re += A2_im;
1547  o22_im -= A2_re;
1548  o32_re -= B2_im;
1549  o32_im += B2_re;
1550 
1551 }
1552 
1553 #ifdef MULTI_GPU
1554 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1555  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1556 #endif
1557 {
1558  // Projector P3-
1559  // 0 0 0 0
1560  // 0 0 0 0
1561  // 0 0 2 0
1562  // 0 0 0 2
1563 
1564 #ifdef MULTI_GPU
1565  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1566  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1567 #else
1568  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1569 #endif
1570 
1571  const int ga_idx = sid;
1572 
1573  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1574  {
1581 
1582 #ifdef MULTI_GPU
1583  if (kernel_type == INTERIOR_KERNEL) {
1584 #endif
1585 
1586  // read spinor from device memory
1587 #ifndef TWIST_INV_DSLASH
1588  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1589 #else
1590  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1591  APPLY_TWIST_INV( a, b, i);
1592 #endif
1593 
1594  // project spinor into half spinors
1595  a0_re = +2*i20_re;
1596  a0_im = +2*i20_im;
1597  a1_re = +2*i21_re;
1598  a1_im = +2*i21_im;
1599  a2_re = +2*i22_re;
1600  a2_im = +2*i22_im;
1601  b0_re = +2*i30_re;
1602  b0_im = +2*i30_im;
1603  b1_re = +2*i31_re;
1604  b1_im = +2*i31_im;
1605  b2_re = +2*i32_re;
1606  b2_im = +2*i32_im;
1607 
1608 #ifdef MULTI_GPU
1609  } else {
1610 
1611  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1612  //const int t_proj_scale = TPROJSCALE;
1613 
1614  // read half spinor from device memory
1615  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1616 
1617 #ifdef TWIST_INV_DSLASH
1618  a0_re = i00_re; a0_im = i00_im;
1619  a1_re = i01_re; a1_im = i01_im;
1620  a2_re = i02_re; a2_im = i02_im;
1621  b0_re = i10_re; b0_im = i10_im;
1622  b1_re = i11_re; b1_im = i11_im;
1623  b2_re = i12_re; b2_im = i12_im;
1624 #else
1625  a0_re = 2*i00_re; a0_im = 2*i00_im;
1626  a1_re = 2*i01_re; a1_im = 2*i01_im;
1627  a2_re = 2*i02_re; a2_im = 2*i02_im;
1628  b0_re = 2*i10_re; b0_im = 2*i10_im;
1629  b1_re = 2*i11_re; b1_im = 2*i11_im;
1630  b2_re = 2*i12_re; b2_im = 2*i12_im;
1631 #endif
1632 
1633  }
1634 #endif // MULTI_GPU
1635 
1636  // identity gauge matrix
1643 
1644  o20_re += A0_re;
1645  o20_im += A0_im;
1646  o30_re += B0_re;
1647  o30_im += B0_im;
1648 
1649  o21_re += A1_re;
1650  o21_im += A1_im;
1651  o31_re += B1_re;
1652  o31_im += B1_im;
1653 
1654  o22_re += A2_re;
1655  o22_im += A2_im;
1656  o32_re += B2_re;
1657  o32_im += B2_im;
1658 
1659  } else {
1666 
1667 #ifdef MULTI_GPU
1668  if (kernel_type == INTERIOR_KERNEL) {
1669 #endif
1670 
1671  // read spinor from device memory
1672 #ifndef TWIST_INV_DSLASH
1673  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1674 #else
1675  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1676  APPLY_TWIST_INV( a, b, i);
1677 #endif
1678 
1679  // project spinor into half spinors
1680  a0_re = +2*i20_re;
1681  a0_im = +2*i20_im;
1682  a1_re = +2*i21_re;
1683  a1_im = +2*i21_im;
1684  a2_re = +2*i22_re;
1685  a2_im = +2*i22_im;
1686  b0_re = +2*i30_re;
1687  b0_im = +2*i30_im;
1688  b1_re = +2*i31_re;
1689  b1_im = +2*i31_im;
1690  b2_re = +2*i32_re;
1691  b2_im = +2*i32_im;
1692 
1693 #ifdef MULTI_GPU
1694  } else {
1695 
1696  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1697  //const int t_proj_scale = TPROJSCALE;
1698 
1699  // read half spinor from device memory
1700  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1701 
1702 #ifdef TWIST_INV_DSLASH
1703  a0_re = i00_re; a0_im = i00_im;
1704  a1_re = i01_re; a1_im = i01_im;
1705  a2_re = i02_re; a2_im = i02_im;
1706  b0_re = i10_re; b0_im = i10_im;
1707  b1_re = i11_re; b1_im = i11_im;
1708  b2_re = i12_re; b2_im = i12_im;
1709 #else
1710  a0_re = 2*i00_re; a0_im = 2*i00_im;
1711  a1_re = 2*i01_re; a1_im = 2*i01_im;
1712  a2_re = 2*i02_re; a2_im = 2*i02_im;
1713  b0_re = 2*i10_re; b0_im = 2*i10_im;
1714  b1_re = 2*i11_re; b1_im = 2*i11_im;
1715  b2_re = 2*i12_re; b2_im = 2*i12_im;
1716 #endif
1717 
1718  }
1719 #endif // MULTI_GPU
1720 
1721  // read gauge matrix from device memory
1722  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1723 
1724  // reconstruct gauge matrix
1726 
1727  // multiply row 0
1728  spinorFloat A0_re = 0;
1729  A0_re += g00_re * a0_re;
1730  A0_re -= g00_im * a0_im;
1731  A0_re += g01_re * a1_re;
1732  A0_re -= g01_im * a1_im;
1733  A0_re += g02_re * a2_re;
1734  A0_re -= g02_im * a2_im;
1735  spinorFloat A0_im = 0;
1736  A0_im += g00_re * a0_im;
1737  A0_im += g00_im * a0_re;
1738  A0_im += g01_re * a1_im;
1739  A0_im += g01_im * a1_re;
1740  A0_im += g02_re * a2_im;
1741  A0_im += g02_im * a2_re;
1742  spinorFloat B0_re = 0;
1743  B0_re += g00_re * b0_re;
1744  B0_re -= g00_im * b0_im;
1745  B0_re += g01_re * b1_re;
1746  B0_re -= g01_im * b1_im;
1747  B0_re += g02_re * b2_re;
1748  B0_re -= g02_im * b2_im;
1749  spinorFloat B0_im = 0;
1750  B0_im += g00_re * b0_im;
1751  B0_im += g00_im * b0_re;
1752  B0_im += g01_re * b1_im;
1753  B0_im += g01_im * b1_re;
1754  B0_im += g02_re * b2_im;
1755  B0_im += g02_im * b2_re;
1756 
1757  // multiply row 1
1758  spinorFloat A1_re = 0;
1759  A1_re += g10_re * a0_re;
1760  A1_re -= g10_im * a0_im;
1761  A1_re += g11_re * a1_re;
1762  A1_re -= g11_im * a1_im;
1763  A1_re += g12_re * a2_re;
1764  A1_re -= g12_im * a2_im;
1765  spinorFloat A1_im = 0;
1766  A1_im += g10_re * a0_im;
1767  A1_im += g10_im * a0_re;
1768  A1_im += g11_re * a1_im;
1769  A1_im += g11_im * a1_re;
1770  A1_im += g12_re * a2_im;
1771  A1_im += g12_im * a2_re;
1772  spinorFloat B1_re = 0;
1773  B1_re += g10_re * b0_re;
1774  B1_re -= g10_im * b0_im;
1775  B1_re += g11_re * b1_re;
1776  B1_re -= g11_im * b1_im;
1777  B1_re += g12_re * b2_re;
1778  B1_re -= g12_im * b2_im;
1779  spinorFloat B1_im = 0;
1780  B1_im += g10_re * b0_im;
1781  B1_im += g10_im * b0_re;
1782  B1_im += g11_re * b1_im;
1783  B1_im += g11_im * b1_re;
1784  B1_im += g12_re * b2_im;
1785  B1_im += g12_im * b2_re;
1786 
1787  // multiply row 2
1788  spinorFloat A2_re = 0;
1789  A2_re += g20_re * a0_re;
1790  A2_re -= g20_im * a0_im;
1791  A2_re += g21_re * a1_re;
1792  A2_re -= g21_im * a1_im;
1793  A2_re += g22_re * a2_re;
1794  A2_re -= g22_im * a2_im;
1795  spinorFloat A2_im = 0;
1796  A2_im += g20_re * a0_im;
1797  A2_im += g20_im * a0_re;
1798  A2_im += g21_re * a1_im;
1799  A2_im += g21_im * a1_re;
1800  A2_im += g22_re * a2_im;
1801  A2_im += g22_im * a2_re;
1802  spinorFloat B2_re = 0;
1803  B2_re += g20_re * b0_re;
1804  B2_re -= g20_im * b0_im;
1805  B2_re += g21_re * b1_re;
1806  B2_re -= g21_im * b1_im;
1807  B2_re += g22_re * b2_re;
1808  B2_re -= g22_im * b2_im;
1809  spinorFloat B2_im = 0;
1810  B2_im += g20_re * b0_im;
1811  B2_im += g20_im * b0_re;
1812  B2_im += g21_re * b1_im;
1813  B2_im += g21_im * b1_re;
1814  B2_im += g22_re * b2_im;
1815  B2_im += g22_im * b2_re;
1816 
1817  o20_re += A0_re;
1818  o20_im += A0_im;
1819  o30_re += B0_re;
1820  o30_im += B0_im;
1821 
1822  o21_re += A1_re;
1823  o21_im += A1_im;
1824  o31_re += B1_re;
1825  o31_im += B1_im;
1826 
1827  o22_re += A2_re;
1828  o22_im += A2_im;
1829  o32_re += B2_re;
1830  o32_im += B2_im;
1831 
1832  }
1833 }
1834 
1835 #ifdef MULTI_GPU
1836 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1837  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1838 #endif
1839 {
1840  // Projector P3+
1841  // 2 0 0 0
1842  // 0 2 0 0
1843  // 0 0 0 0
1844  // 0 0 0 0
1845 
1846 #ifdef MULTI_GPU
1847  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1848  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1849 #else
1850  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1851 #endif
1852 
1853 #ifdef MULTI_GPU
1854  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1855 #else
1856  const int ga_idx = sp_idx;
1857 #endif
1858 
1859  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1860  {
1867 
1868 #ifdef MULTI_GPU
1869  if (kernel_type == INTERIOR_KERNEL) {
1870 #endif
1871 
1872  // read spinor from device memory
1873 #ifndef TWIST_INV_DSLASH
1874  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1875 #else
1876  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1877  APPLY_TWIST_INV( a, b, i);
1878 #endif
1879 
1880  // project spinor into half spinors
1881  a0_re = +2*i00_re;
1882  a0_im = +2*i00_im;
1883  a1_re = +2*i01_re;
1884  a1_im = +2*i01_im;
1885  a2_re = +2*i02_re;
1886  a2_im = +2*i02_im;
1887  b0_re = +2*i10_re;
1888  b0_im = +2*i10_im;
1889  b1_re = +2*i11_re;
1890  b1_im = +2*i11_im;
1891  b2_re = +2*i12_re;
1892  b2_im = +2*i12_im;
1893 
1894 #ifdef MULTI_GPU
1895  } else {
1896 
1897  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1898  //const int t_proj_scale = TPROJSCALE;
1899 
1900  // read half spinor from device memory
1901  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1902 
1903 #ifdef TWIST_INV_DSLASH
1904  a0_re = i00_re; a0_im = i00_im;
1905  a1_re = i01_re; a1_im = i01_im;
1906  a2_re = i02_re; a2_im = i02_im;
1907  b0_re = i10_re; b0_im = i10_im;
1908  b1_re = i11_re; b1_im = i11_im;
1909  b2_re = i12_re; b2_im = i12_im;
1910 #else
1911  a0_re = 2*i00_re; a0_im = 2*i00_im;
1912  a1_re = 2*i01_re; a1_im = 2*i01_im;
1913  a2_re = 2*i02_re; a2_im = 2*i02_im;
1914  b0_re = 2*i10_re; b0_im = 2*i10_im;
1915  b1_re = 2*i11_re; b1_im = 2*i11_im;
1916  b2_re = 2*i12_re; b2_im = 2*i12_im;
1917 #endif
1918 
1919  }
1920 #endif // MULTI_GPU
1921 
1922  // identity gauge matrix
1929 
1930  o00_re += A0_re;
1931  o00_im += A0_im;
1932  o10_re += B0_re;
1933  o10_im += B0_im;
1934 
1935  o01_re += A1_re;
1936  o01_im += A1_im;
1937  o11_re += B1_re;
1938  o11_im += B1_im;
1939 
1940  o02_re += A2_re;
1941  o02_im += A2_im;
1942  o12_re += B2_re;
1943  o12_im += B2_im;
1944 
1945  } else {
1952 
1953 #ifdef MULTI_GPU
1954  if (kernel_type == INTERIOR_KERNEL) {
1955 #endif
1956 
1957  // read spinor from device memory
1958 #ifndef TWIST_INV_DSLASH
1959  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1960 #else
1961  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1962  APPLY_TWIST_INV( a, b, i);
1963 #endif
1964 
1965  // project spinor into half spinors
1966  a0_re = +2*i00_re;
1967  a0_im = +2*i00_im;
1968  a1_re = +2*i01_re;
1969  a1_im = +2*i01_im;
1970  a2_re = +2*i02_re;
1971  a2_im = +2*i02_im;
1972  b0_re = +2*i10_re;
1973  b0_im = +2*i10_im;
1974  b1_re = +2*i11_re;
1975  b1_im = +2*i11_im;
1976  b2_re = +2*i12_re;
1977  b2_im = +2*i12_im;
1978 
1979 #ifdef MULTI_GPU
1980  } else {
1981 
1982  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1983  //const int t_proj_scale = TPROJSCALE;
1984 
1985  // read half spinor from device memory
1986  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1987 
1988 #ifdef TWIST_INV_DSLASH
1989  a0_re = i00_re; a0_im = i00_im;
1990  a1_re = i01_re; a1_im = i01_im;
1991  a2_re = i02_re; a2_im = i02_im;
1992  b0_re = i10_re; b0_im = i10_im;
1993  b1_re = i11_re; b1_im = i11_im;
1994  b2_re = i12_re; b2_im = i12_im;
1995 #else
1996  a0_re = 2*i00_re; a0_im = 2*i00_im;
1997  a1_re = 2*i01_re; a1_im = 2*i01_im;
1998  a2_re = 2*i02_re; a2_im = 2*i02_im;
1999  b0_re = 2*i10_re; b0_im = 2*i10_im;
2000  b1_re = 2*i11_re; b1_im = 2*i11_im;
2001  b2_re = 2*i12_re; b2_im = 2*i12_im;
2002 #endif
2003 
2004  }
2005 #endif // MULTI_GPU
2006 
2007  // read gauge matrix from device memory
2008  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2009 
2010  // reconstruct gauge matrix
2012 
2013  // multiply row 0
2014  spinorFloat A0_re = 0;
2015  A0_re += gT00_re * a0_re;
2016  A0_re -= gT00_im * a0_im;
2017  A0_re += gT01_re * a1_re;
2018  A0_re -= gT01_im * a1_im;
2019  A0_re += gT02_re * a2_re;
2020  A0_re -= gT02_im * a2_im;
2021  spinorFloat A0_im = 0;
2022  A0_im += gT00_re * a0_im;
2023  A0_im += gT00_im * a0_re;
2024  A0_im += gT01_re * a1_im;
2025  A0_im += gT01_im * a1_re;
2026  A0_im += gT02_re * a2_im;
2027  A0_im += gT02_im * a2_re;
2028  spinorFloat B0_re = 0;
2029  B0_re += gT00_re * b0_re;
2030  B0_re -= gT00_im * b0_im;
2031  B0_re += gT01_re * b1_re;
2032  B0_re -= gT01_im * b1_im;
2033  B0_re += gT02_re * b2_re;
2034  B0_re -= gT02_im * b2_im;
2035  spinorFloat B0_im = 0;
2036  B0_im += gT00_re * b0_im;
2037  B0_im += gT00_im * b0_re;
2038  B0_im += gT01_re * b1_im;
2039  B0_im += gT01_im * b1_re;
2040  B0_im += gT02_re * b2_im;
2041  B0_im += gT02_im * b2_re;
2042 
2043  // multiply row 1
2044  spinorFloat A1_re = 0;
2045  A1_re += gT10_re * a0_re;
2046  A1_re -= gT10_im * a0_im;
2047  A1_re += gT11_re * a1_re;
2048  A1_re -= gT11_im * a1_im;
2049  A1_re += gT12_re * a2_re;
2050  A1_re -= gT12_im * a2_im;
2051  spinorFloat A1_im = 0;
2052  A1_im += gT10_re * a0_im;
2053  A1_im += gT10_im * a0_re;
2054  A1_im += gT11_re * a1_im;
2055  A1_im += gT11_im * a1_re;
2056  A1_im += gT12_re * a2_im;
2057  A1_im += gT12_im * a2_re;
2058  spinorFloat B1_re = 0;
2059  B1_re += gT10_re * b0_re;
2060  B1_re -= gT10_im * b0_im;
2061  B1_re += gT11_re * b1_re;
2062  B1_re -= gT11_im * b1_im;
2063  B1_re += gT12_re * b2_re;
2064  B1_re -= gT12_im * b2_im;
2065  spinorFloat B1_im = 0;
2066  B1_im += gT10_re * b0_im;
2067  B1_im += gT10_im * b0_re;
2068  B1_im += gT11_re * b1_im;
2069  B1_im += gT11_im * b1_re;
2070  B1_im += gT12_re * b2_im;
2071  B1_im += gT12_im * b2_re;
2072 
2073  // multiply row 2
2074  spinorFloat A2_re = 0;
2075  A2_re += gT20_re * a0_re;
2076  A2_re -= gT20_im * a0_im;
2077  A2_re += gT21_re * a1_re;
2078  A2_re -= gT21_im * a1_im;
2079  A2_re += gT22_re * a2_re;
2080  A2_re -= gT22_im * a2_im;
2081  spinorFloat A2_im = 0;
2082  A2_im += gT20_re * a0_im;
2083  A2_im += gT20_im * a0_re;
2084  A2_im += gT21_re * a1_im;
2085  A2_im += gT21_im * a1_re;
2086  A2_im += gT22_re * a2_im;
2087  A2_im += gT22_im * a2_re;
2088  spinorFloat B2_re = 0;
2089  B2_re += gT20_re * b0_re;
2090  B2_re -= gT20_im * b0_im;
2091  B2_re += gT21_re * b1_re;
2092  B2_re -= gT21_im * b1_im;
2093  B2_re += gT22_re * b2_re;
2094  B2_re -= gT22_im * b2_im;
2095  spinorFloat B2_im = 0;
2096  B2_im += gT20_re * b0_im;
2097  B2_im += gT20_im * b0_re;
2098  B2_im += gT21_re * b1_im;
2099  B2_im += gT21_im * b1_re;
2100  B2_im += gT22_re * b2_im;
2101  B2_im += gT22_im * b2_re;
2102 
2103  o00_re += A0_re;
2104  o00_im += A0_im;
2105  o10_re += B0_re;
2106  o10_im += B0_im;
2107 
2108  o01_re += A1_re;
2109  o01_im += A1_im;
2110  o11_re += B1_re;
2111  o11_im += B1_im;
2112 
2113  o02_re += A2_re;
2114  o02_im += A2_im;
2115  o12_re += B2_re;
2116  o12_im += B2_im;
2117 
2118  }
2119 }
2120 
2121 #ifdef MULTI_GPU
2122 
2123 int incomplete = 0; // Have all 8 contributions been computed for this site?
2124 
2125 switch(kernel_type) { // intentional fall-through
2126 case INTERIOR_KERNEL:
2127  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2128 case EXTERIOR_KERNEL_T:
2129  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2130 case EXTERIOR_KERNEL_Z:
2131  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2132 case EXTERIOR_KERNEL_Y:
2133  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2134 }
2135 
2136 if (!incomplete)
2137 #endif // MULTI_GPU
2138 {
2139 #ifdef DSLASH_XPAY
2140  READ_ACCUM(ACCUMTEX, param.sp_stride)
2141 
2142 #ifndef TWIST_XPAY
2143 #ifndef TWIST_INV_DSLASH
2144  //perform invert twist first:
2145  APPLY_TWIST_INV( a, b, o);
2146 #endif
2147  o00_re += acc00_re;
2148  o00_im += acc00_im;
2149  o01_re += acc01_re;
2150  o01_im += acc01_im;
2151  o02_re += acc02_re;
2152  o02_im += acc02_im;
2153  o10_re += acc10_re;
2154  o10_im += acc10_im;
2155  o11_re += acc11_re;
2156  o11_im += acc11_im;
2157  o12_re += acc12_re;
2158  o12_im += acc12_im;
2159  o20_re += acc20_re;
2160  o20_im += acc20_im;
2161  o21_re += acc21_re;
2162  o21_im += acc21_im;
2163  o22_re += acc22_re;
2164  o22_im += acc22_im;
2165  o30_re += acc30_re;
2166  o30_im += acc30_im;
2167  o31_re += acc31_re;
2168  o31_im += acc31_im;
2169  o32_re += acc32_re;
2170  o32_im += acc32_im;
2171 #else
2172  APPLY_TWIST( a, acc);
2173  //warning! b is unrelated to the twisted mass parameter in this case!
2174 
2175  o00_re = b*o00_re+acc00_re;
2176  o00_im = b*o00_im+acc00_im;
2177  o01_re = b*o01_re+acc01_re;
2178  o01_im = b*o01_im+acc01_im;
2179  o02_re = b*o02_re+acc02_re;
2180  o02_im = b*o02_im+acc02_im;
2181  o10_re = b*o10_re+acc10_re;
2182  o10_im = b*o10_im+acc10_im;
2183  o11_re = b*o11_re+acc11_re;
2184  o11_im = b*o11_im+acc11_im;
2185  o12_re = b*o12_re+acc12_re;
2186  o12_im = b*o12_im+acc12_im;
2187  o20_re = b*o20_re+acc20_re;
2188  o20_im = b*o20_im+acc20_im;
2189  o21_re = b*o21_re+acc21_re;
2190  o21_im = b*o21_im+acc21_im;
2191  o22_re = b*o22_re+acc22_re;
2192  o22_im = b*o22_im+acc22_im;
2193  o30_re = b*o30_re+acc30_re;
2194  o30_im = b*o30_im+acc30_im;
2195  o31_re = b*o31_re+acc31_re;
2196  o31_im = b*o31_im+acc31_im;
2197  o32_re = b*o32_re+acc32_re;
2198  o32_im = b*o32_im+acc32_im;
2199 #endif//TWIST_XPAY
2200 #else //no XPAY
2201 #ifndef TWIST_INV_DSLASH
2202  APPLY_TWIST_INV( a, b, o);
2203 #endif
2204 #endif
2205 }
2206 
2207 // write spinor field back to device memory
2208 WRITE_SPINOR(param.sp_stride);
2209 
2210 // undefine to prevent warning when precision is changed
2211 #undef spinorFloat
2212 #undef WRITE_SPINOR_SHARED
2213 #undef READ_SPINOR_SHARED
2214 #undef SHARED_STRIDE
2215 
2216 #undef g00_re
2217 #undef g00_im
2218 #undef g01_re
2219 #undef g01_im
2220 #undef g02_re
2221 #undef g02_im
2222 #undef g10_re
2223 #undef g10_im
2224 #undef g11_re
2225 #undef g11_im
2226 #undef g12_re
2227 #undef g12_im
2228 #undef g20_re
2229 #undef g20_im
2230 #undef g21_re
2231 #undef g21_im
2232 #undef g22_re
2233 #undef g22_im
2234 
2235 #undef i00_re
2236 #undef i00_im
2237 #undef i01_re
2238 #undef i01_im
2239 #undef i02_re
2240 #undef i02_im
2241 #undef i10_re
2242 #undef i10_im
2243 #undef i11_re
2244 #undef i11_im
2245 #undef i12_re
2246 #undef i12_im
2247 #undef i20_re
2248 #undef i20_im
2249 #undef i21_re
2250 #undef i21_im
2251 #undef i22_re
2252 #undef i22_im
2253 #undef i30_re
2254 #undef i30_im
2255 #undef i31_re
2256 #undef i31_im
2257 #undef i32_re
2258 #undef i32_im
2259 
2260 #undef acc00_re
2261 #undef acc00_im
2262 #undef acc01_re
2263 #undef acc01_im
2264 #undef acc02_re
2265 #undef acc02_im
2266 #undef acc10_re
2267 #undef acc10_im
2268 #undef acc11_re
2269 #undef acc11_im
2270 #undef acc12_re
2271 #undef acc12_im
2272 #undef acc20_re
2273 #undef acc20_im
2274 #undef acc21_re
2275 #undef acc21_im
2276 #undef acc22_re
2277 #undef acc22_im
2278 #undef acc30_re
2279 #undef acc30_im
2280 #undef acc31_re
2281 #undef acc31_im
2282 #undef acc32_re
2283 #undef acc32_im
2284 
2285 
2286 #undef o00_re
2287 #undef o00_im
2288 #undef o01_re
2289 #undef o01_im
2290 #undef o02_re
2291 #undef o02_im
2292 #undef o10_re
2293 #undef o10_im
2294 #undef o11_re
2295 #undef o11_im
2296 #undef o12_re
2297 #undef o12_im
2298 #undef o20_re
2299 #undef o20_im
2300 #undef o21_re
2301 #undef o21_im
2302 #undef o22_re
2303 #undef o22_im
2304 #undef o30_re
2305 #undef o30_im
2306 #undef o31_re
2307 #undef o31_im
2308 #undef o32_re
2309 #undef o32_im
2310 
2311 #undef VOLATILE
coordsFromIndex3D< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define g21_im
#define i20_im
#define g21_re
spinorFloat A2_re
__constant__ int Vh
#define g02_im
#define g10_im
#define g02_re
#define i11_re
spinorFloat A2_im
__constant__ int X2
#define gT12_re
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
VOLATILE spinorFloat o01_im
#define i02_re
#define spinorFloat
__constant__ int X2X1mX1
#define gT22_im
VOLATILE spinorFloat o30_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
__constant__ int X3X2X1mX2X1
spinorFloat B1_re
__constant__ int X1
spinorFloat A1_im
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define g20_im
#define acc32_re
VOLATILE spinorFloat o12_im
#define gT11_im
#define gT02_im
spinorFloat a1_re
__constant__ int X3X2X1
#define gT20_re
#define acc30_re
spinorFloat B2_im
spinorFloat A1_re
VOLATILE spinorFloat o00_re
#define gT21_im
VOLATILE spinorFloat o02_im
#define i31_re
#define g00_im
#define gT10_re
#define g11_re
VOLATILE spinorFloat o31_im
#define i12_re
#define gT10_im
#define i11_im
#define acc00_im
#define i01_re
#define g00_re
spinorFloat A0_re
#define g01_re
spinorFloat a0_im
#define i00_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define i12_im
#define g12_im
VOLATILE spinorFloat o21_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
spinorFloat A0_im
#define i01_im
WRITE_SPINOR(param.sp_stride)
VOLATILE spinorFloat o32_im
#define acc01_re
VOLATILE spinorFloat o21_re
#define gT22_re
RECONSTRUCT_GAUGE_MATRIX(0)
#define i00_re
#define i30_re
#define gT12_im
#define acc12_re
#define i32_im
VOLATILE spinorFloat o30_im
spinorFloat a2_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define acc00_re
VOLATILE spinorFloat o20_re
#define READ_SPINOR_SHARED
#define VOLATILE
#define acc02_im
__syncthreads()
spinorFloat B1_im
#define GAUGE0TEX
Definition: covDev.h:112
spinorFloat B0_im
#define acc20_re
#define gT00_im
#define g01_im
#define acc10_im
#define i10_re
#define i21_im
#define acc11_re
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define g20_re
#define gT21_re
#define g11_im
#define i32_re
__constant__ int X2m1
#define g22_re
#define WRITE_SPINOR_SHARED
const int ga_idx
#define SPINORTEX
Definition: clover_def.h:40
#define g12_re
#define acc31_im
__constant__ int gauge_fixed
VOLATILE spinorFloat o00_im
VOLATILE spinorFloat o11_im
__constant__ int X4X3X2X1mX3X2X1
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o20_im
spinorFloat a2_im
#define SPINOR_HOP
Definition: covDev.h:158
spinorFloat b2_im
#define i31_im
__constant__ int ga_stride
#define gT01_im
#define g22_im
#define i02_im
#define acc32_im
#define gT00_re
__constant__ int X1m1
spinorFloat b0_re
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o02_re
__constant__ int X3
#define i30_im
#define acc22_re
#define i21_re
#define acc31_re
#define acc01_im
#define acc22_im
#define gT20_im
#define acc11_im
VOLATILE spinorFloat o10_im
VOLATILE spinorFloat o22_re
spinorFloat B0_re
VOLATILE spinorFloat o11_re
spinorFloat b1_im
#define acc21_im
#define GAUGE1TEX
Definition: covDev.h:113
#define i10_im
#define i22_re
spinorFloat a0_re
#define acc20_im
#define acc12_im
#define gT02_re
#define g10_re
const int dims[]
#define i22_im
__constant__ int X4m1
spinorFloat b2_re
#define acc10_re
spinorFloat B2_re
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o22_im
spinorFloat a1_im
spinorFloat b0_im
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
VOLATILE spinorFloat o31_re
#define acc02_re
VOLATILE spinorFloat o32_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
KernelType kernel_type
#define gT11_re
#define i20_re
#define acc30_im
#define acc21_re
__constant__ int X4
__constant__ int X3m1
#define gT01_re
spinorFloat b1_re
__constant__ int X2X1