QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_dslash_dagger_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_DOUBLE
160 
161 // conjugated gauge link
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // output spinor
206 
207 #ifdef SPINOR_DOUBLE
208 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
209 #else
210 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
211 #endif
212 
213 #include "read_gauge.h"
214 #include "read_clover.h"
215 #include "io_spinor.h"
216 
217 int x1, x2, x3, x4;
218 int X;
219 
220 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
221 int sp_norm_idx;
222 #endif // MULTI_GPU half precision
223 
224 int sid;
225 
226 #ifdef MULTI_GPU
227 int face_idx;
228 if (kernel_type == INTERIOR_KERNEL) {
229 #endif
230 
231  // Inline by hand for the moment and assume even dimensions
232  coordsFromIndex3D<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity);
233 
234  // only need to check Y and Z dims currently since X and T set to match exactly
235  if (x2 >= X2) return;
236  if (x3 >= X3) return;
237 
238  o00_re = 0; o00_im = 0;
239  o01_re = 0; o01_im = 0;
240  o02_re = 0; o02_im = 0;
241  o10_re = 0; o10_im = 0;
242  o11_re = 0; o11_im = 0;
243  o12_re = 0; o12_im = 0;
244  o20_re = 0; o20_im = 0;
245  o21_re = 0; o21_im = 0;
246  o22_re = 0; o22_im = 0;
247  o30_re = 0; o30_im = 0;
248  o31_re = 0; o31_im = 0;
249  o32_re = 0; o32_im = 0;
250 
251 #ifdef MULTI_GPU
252 } else { // exterior kernel
253 
254  sid = blockIdx.x*blockDim.x + threadIdx.x;
255  if (sid >= param.threads) return;
256 
257  const int dim = static_cast<int>(kernel_type);
258  const int face_volume = (param.threads >> 1); // volume of one face
259  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
260  face_idx = sid - face_num*face_volume; // index into the respective face
261 
262  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
263  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
264  //sp_idx = face_idx + param.ghostOffset[dim];
265 
266 #if (DD_PREC==2) // half precision
267  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
268 #endif
269 
270  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity);
271 
273 
274  o00_re = i00_re; o00_im = i00_im;
275  o01_re = i01_re; o01_im = i01_im;
276  o02_re = i02_re; o02_im = i02_im;
277  o10_re = i10_re; o10_im = i10_im;
278  o11_re = i11_re; o11_im = i11_im;
279  o12_re = i12_re; o12_im = i12_im;
280  o20_re = i20_re; o20_im = i20_im;
281  o21_re = i21_re; o21_im = i21_im;
282  o22_re = i22_re; o22_im = i22_im;
283  o30_re = i30_re; o30_im = i30_im;
284  o31_re = i31_re; o31_im = i31_im;
285  o32_re = i32_re; o32_im = i32_im;
286 }
287 #endif // MULTI_GPU
288 
289 
290 #ifdef MULTI_GPU
291 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
292  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
293 #endif
294 {
295  // Projector P0+
296  // 1 0 0 i
297  // 0 1 i 0
298  // 0 -i 1 0
299  // -i 0 0 1
300 
301 #ifdef MULTI_GPU
302  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
303  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
304 #else
305  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
306 #endif
307 
308  const int ga_idx = sid;
309 
316 
317 #ifdef MULTI_GPU
318  if (kernel_type == INTERIOR_KERNEL) {
319 #endif
320 
321  // read spinor from device memory
322  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
323 
324  // store spinor into shared memory
325  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
326 
327  // project spinor into half spinors
328  a0_re = +i00_re-i30_im;
329  a0_im = +i00_im+i30_re;
330  a1_re = +i01_re-i31_im;
331  a1_im = +i01_im+i31_re;
332  a2_re = +i02_re-i32_im;
333  a2_im = +i02_im+i32_re;
334  b0_re = +i10_re-i20_im;
335  b0_im = +i10_im+i20_re;
336  b1_re = +i11_re-i21_im;
337  b1_im = +i11_im+i21_re;
338  b2_re = +i12_re-i22_im;
339  b2_im = +i12_im+i22_re;
340 
341 #ifdef MULTI_GPU
342  } else {
343 
344  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
345 
346  // read half spinor from device memory
347  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
348 
349  a0_re = i00_re; a0_im = i00_im;
350  a1_re = i01_re; a1_im = i01_im;
351  a2_re = i02_re; a2_im = i02_im;
352  b0_re = i10_re; b0_im = i10_im;
353  b1_re = i11_re; b1_im = i11_im;
354  b2_re = i12_re; b2_im = i12_im;
355 
356  }
357 #endif // MULTI_GPU
358 
359  // read gauge matrix from device memory
360  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
361 
362  // reconstruct gauge matrix
364 
365  // multiply row 0
367  A0_re += g00_re * a0_re;
368  A0_re -= g00_im * a0_im;
369  A0_re += g01_re * a1_re;
370  A0_re -= g01_im * a1_im;
371  A0_re += g02_re * a2_re;
372  A0_re -= g02_im * a2_im;
374  A0_im += g00_re * a0_im;
375  A0_im += g00_im * a0_re;
376  A0_im += g01_re * a1_im;
377  A0_im += g01_im * a1_re;
378  A0_im += g02_re * a2_im;
379  A0_im += g02_im * a2_re;
381  B0_re += g00_re * b0_re;
382  B0_re -= g00_im * b0_im;
383  B0_re += g01_re * b1_re;
384  B0_re -= g01_im * b1_im;
385  B0_re += g02_re * b2_re;
386  B0_re -= g02_im * b2_im;
388  B0_im += g00_re * b0_im;
389  B0_im += g00_im * b0_re;
390  B0_im += g01_re * b1_im;
391  B0_im += g01_im * b1_re;
392  B0_im += g02_re * b2_im;
393  B0_im += g02_im * b2_re;
394 
395  // multiply row 1
397  A1_re += g10_re * a0_re;
398  A1_re -= g10_im * a0_im;
399  A1_re += g11_re * a1_re;
400  A1_re -= g11_im * a1_im;
401  A1_re += g12_re * a2_re;
402  A1_re -= g12_im * a2_im;
404  A1_im += g10_re * a0_im;
405  A1_im += g10_im * a0_re;
406  A1_im += g11_re * a1_im;
407  A1_im += g11_im * a1_re;
408  A1_im += g12_re * a2_im;
409  A1_im += g12_im * a2_re;
411  B1_re += g10_re * b0_re;
412  B1_re -= g10_im * b0_im;
413  B1_re += g11_re * b1_re;
414  B1_re -= g11_im * b1_im;
415  B1_re += g12_re * b2_re;
416  B1_re -= g12_im * b2_im;
418  B1_im += g10_re * b0_im;
419  B1_im += g10_im * b0_re;
420  B1_im += g11_re * b1_im;
421  B1_im += g11_im * b1_re;
422  B1_im += g12_re * b2_im;
423  B1_im += g12_im * b2_re;
424 
425  // multiply row 2
427  A2_re += g20_re * a0_re;
428  A2_re -= g20_im * a0_im;
429  A2_re += g21_re * a1_re;
430  A2_re -= g21_im * a1_im;
431  A2_re += g22_re * a2_re;
432  A2_re -= g22_im * a2_im;
434  A2_im += g20_re * a0_im;
435  A2_im += g20_im * a0_re;
436  A2_im += g21_re * a1_im;
437  A2_im += g21_im * a1_re;
438  A2_im += g22_re * a2_im;
439  A2_im += g22_im * a2_re;
441  B2_re += g20_re * b0_re;
442  B2_re -= g20_im * b0_im;
443  B2_re += g21_re * b1_re;
444  B2_re -= g21_im * b1_im;
445  B2_re += g22_re * b2_re;
446  B2_re -= g22_im * b2_im;
448  B2_im += g20_re * b0_im;
449  B2_im += g20_im * b0_re;
450  B2_im += g21_re * b1_im;
451  B2_im += g21_im * b1_re;
452  B2_im += g22_re * b2_im;
453  B2_im += g22_im * b2_re;
454 
455  o00_re += A0_re;
456  o00_im += A0_im;
457  o10_re += B0_re;
458  o10_im += B0_im;
459  o20_re += B0_im;
460  o20_im -= B0_re;
461  o30_re += A0_im;
462  o30_im -= A0_re;
463 
464  o01_re += A1_re;
465  o01_im += A1_im;
466  o11_re += B1_re;
467  o11_im += B1_im;
468  o21_re += B1_im;
469  o21_im -= B1_re;
470  o31_re += A1_im;
471  o31_im -= A1_re;
472 
473  o02_re += A2_re;
474  o02_im += A2_im;
475  o12_re += B2_re;
476  o12_im += B2_im;
477  o22_re += B2_im;
478  o22_im -= B2_re;
479  o32_re += A2_im;
480  o32_im -= A2_re;
481 
482 }
483 
484 #ifdef MULTI_GPU
485 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
486  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
487 #endif
488 {
489  // Projector P0-
490  // 1 0 0 -i
491  // 0 1 -i 0
492  // 0 i 1 0
493  // i 0 0 1
494 
495 #ifdef MULTI_GPU
496  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
497  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
498 #else
499  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
500 #endif
501 
502 #ifdef MULTI_GPU
503  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
504 #else
505  const int ga_idx = sp_idx;
506 #endif
507 
514 
515 #ifdef MULTI_GPU
516  if (kernel_type == INTERIOR_KERNEL) {
517 #endif
518 
519  // load spinor from shared memory
520  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
521  __syncthreads();
522  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
523 
524  // project spinor into half spinors
525  a0_re = +i00_re+i30_im;
526  a0_im = +i00_im-i30_re;
527  a1_re = +i01_re+i31_im;
528  a1_im = +i01_im-i31_re;
529  a2_re = +i02_re+i32_im;
530  a2_im = +i02_im-i32_re;
531  b0_re = +i10_re+i20_im;
532  b0_im = +i10_im-i20_re;
533  b1_re = +i11_re+i21_im;
534  b1_im = +i11_im-i21_re;
535  b2_re = +i12_re+i22_im;
536  b2_im = +i12_im-i22_re;
537 
538 #ifdef MULTI_GPU
539  } else {
540 
541  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
542 
543  // read half spinor from device memory
544  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
545 
546  a0_re = i00_re; a0_im = i00_im;
547  a1_re = i01_re; a1_im = i01_im;
548  a2_re = i02_re; a2_im = i02_im;
549  b0_re = i10_re; b0_im = i10_im;
550  b1_re = i11_re; b1_im = i11_im;
551  b2_re = i12_re; b2_im = i12_im;
552 
553  }
554 #endif // MULTI_GPU
555 
556  // read gauge matrix from device memory
557  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
558 
559  // reconstruct gauge matrix
561 
562  // multiply row 0
563  spinorFloat A0_re = 0;
564  A0_re += gT00_re * a0_re;
565  A0_re -= gT00_im * a0_im;
566  A0_re += gT01_re * a1_re;
567  A0_re -= gT01_im * a1_im;
568  A0_re += gT02_re * a2_re;
569  A0_re -= gT02_im * a2_im;
570  spinorFloat A0_im = 0;
571  A0_im += gT00_re * a0_im;
572  A0_im += gT00_im * a0_re;
573  A0_im += gT01_re * a1_im;
574  A0_im += gT01_im * a1_re;
575  A0_im += gT02_re * a2_im;
576  A0_im += gT02_im * a2_re;
577  spinorFloat B0_re = 0;
578  B0_re += gT00_re * b0_re;
579  B0_re -= gT00_im * b0_im;
580  B0_re += gT01_re * b1_re;
581  B0_re -= gT01_im * b1_im;
582  B0_re += gT02_re * b2_re;
583  B0_re -= gT02_im * b2_im;
584  spinorFloat B0_im = 0;
585  B0_im += gT00_re * b0_im;
586  B0_im += gT00_im * b0_re;
587  B0_im += gT01_re * b1_im;
588  B0_im += gT01_im * b1_re;
589  B0_im += gT02_re * b2_im;
590  B0_im += gT02_im * b2_re;
591 
592  // multiply row 1
593  spinorFloat A1_re = 0;
594  A1_re += gT10_re * a0_re;
595  A1_re -= gT10_im * a0_im;
596  A1_re += gT11_re * a1_re;
597  A1_re -= gT11_im * a1_im;
598  A1_re += gT12_re * a2_re;
599  A1_re -= gT12_im * a2_im;
600  spinorFloat A1_im = 0;
601  A1_im += gT10_re * a0_im;
602  A1_im += gT10_im * a0_re;
603  A1_im += gT11_re * a1_im;
604  A1_im += gT11_im * a1_re;
605  A1_im += gT12_re * a2_im;
606  A1_im += gT12_im * a2_re;
607  spinorFloat B1_re = 0;
608  B1_re += gT10_re * b0_re;
609  B1_re -= gT10_im * b0_im;
610  B1_re += gT11_re * b1_re;
611  B1_re -= gT11_im * b1_im;
612  B1_re += gT12_re * b2_re;
613  B1_re -= gT12_im * b2_im;
614  spinorFloat B1_im = 0;
615  B1_im += gT10_re * b0_im;
616  B1_im += gT10_im * b0_re;
617  B1_im += gT11_re * b1_im;
618  B1_im += gT11_im * b1_re;
619  B1_im += gT12_re * b2_im;
620  B1_im += gT12_im * b2_re;
621 
622  // multiply row 2
623  spinorFloat A2_re = 0;
624  A2_re += gT20_re * a0_re;
625  A2_re -= gT20_im * a0_im;
626  A2_re += gT21_re * a1_re;
627  A2_re -= gT21_im * a1_im;
628  A2_re += gT22_re * a2_re;
629  A2_re -= gT22_im * a2_im;
630  spinorFloat A2_im = 0;
631  A2_im += gT20_re * a0_im;
632  A2_im += gT20_im * a0_re;
633  A2_im += gT21_re * a1_im;
634  A2_im += gT21_im * a1_re;
635  A2_im += gT22_re * a2_im;
636  A2_im += gT22_im * a2_re;
637  spinorFloat B2_re = 0;
638  B2_re += gT20_re * b0_re;
639  B2_re -= gT20_im * b0_im;
640  B2_re += gT21_re * b1_re;
641  B2_re -= gT21_im * b1_im;
642  B2_re += gT22_re * b2_re;
643  B2_re -= gT22_im * b2_im;
644  spinorFloat B2_im = 0;
645  B2_im += gT20_re * b0_im;
646  B2_im += gT20_im * b0_re;
647  B2_im += gT21_re * b1_im;
648  B2_im += gT21_im * b1_re;
649  B2_im += gT22_re * b2_im;
650  B2_im += gT22_im * b2_re;
651 
652  o00_re += A0_re;
653  o00_im += A0_im;
654  o10_re += B0_re;
655  o10_im += B0_im;
656  o20_re -= B0_im;
657  o20_im += B0_re;
658  o30_re -= A0_im;
659  o30_im += A0_re;
660 
661  o01_re += A1_re;
662  o01_im += A1_im;
663  o11_re += B1_re;
664  o11_im += B1_im;
665  o21_re -= B1_im;
666  o21_im += B1_re;
667  o31_re -= A1_im;
668  o31_im += A1_re;
669 
670  o02_re += A2_re;
671  o02_im += A2_im;
672  o12_re += B2_re;
673  o12_im += B2_im;
674  o22_re -= B2_im;
675  o22_im += B2_re;
676  o32_re -= A2_im;
677  o32_im += A2_re;
678 
679 }
680 
681 #ifdef MULTI_GPU
682 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
683  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
684 #endif
685 {
686  // Projector P1+
687  // 1 0 0 1
688  // 0 1 -1 0
689  // 0 -1 1 0
690  // 1 0 0 1
691 
692 #ifdef MULTI_GPU
693  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
694  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
695 #else
696  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
697 #endif
698 
699  const int ga_idx = sid;
700 
707 
708 #ifdef MULTI_GPU
709  if (kernel_type == INTERIOR_KERNEL) {
710 #endif
711 
712  if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
713  // read spinor from device memory
714  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
715 
716  // project spinor into half spinors
717  a0_re = +i00_re+i30_re;
718  a0_im = +i00_im+i30_im;
719  a1_re = +i01_re+i31_re;
720  a1_im = +i01_im+i31_im;
721  a2_re = +i02_re+i32_re;
722  a2_im = +i02_im+i32_im;
723  b0_re = +i10_re-i20_re;
724  b0_im = +i10_im-i20_im;
725  b1_re = +i11_re-i21_re;
726  b1_im = +i11_im-i21_im;
727  b2_re = +i12_re-i22_re;
728  b2_im = +i12_im-i22_im;
729  } else {
730  // load spinor from shared memory
731  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
732  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
733  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
734 
735  // project spinor into half spinors
736  a0_re = +i00_re+i30_re;
737  a0_im = +i00_im+i30_im;
738  a1_re = +i01_re+i31_re;
739  a1_im = +i01_im+i31_im;
740  a2_re = +i02_re+i32_re;
741  a2_im = +i02_im+i32_im;
742  b0_re = +i10_re-i20_re;
743  b0_im = +i10_im-i20_im;
744  b1_re = +i11_re-i21_re;
745  b1_im = +i11_im-i21_im;
746  b2_re = +i12_re-i22_re;
747  b2_im = +i12_im-i22_im;
748  }
749 
750 #ifdef MULTI_GPU
751  } else {
752 
753  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
754 
755  // read half spinor from device memory
756  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
757 
758  a0_re = i00_re; a0_im = i00_im;
759  a1_re = i01_re; a1_im = i01_im;
760  a2_re = i02_re; a2_im = i02_im;
761  b0_re = i10_re; b0_im = i10_im;
762  b1_re = i11_re; b1_im = i11_im;
763  b2_re = i12_re; b2_im = i12_im;
764 
765  }
766 #endif // MULTI_GPU
767 
768  // read gauge matrix from device memory
769  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
770 
771  // reconstruct gauge matrix
773 
774  // multiply row 0
775  spinorFloat A0_re = 0;
776  A0_re += g00_re * a0_re;
777  A0_re -= g00_im * a0_im;
778  A0_re += g01_re * a1_re;
779  A0_re -= g01_im * a1_im;
780  A0_re += g02_re * a2_re;
781  A0_re -= g02_im * a2_im;
782  spinorFloat A0_im = 0;
783  A0_im += g00_re * a0_im;
784  A0_im += g00_im * a0_re;
785  A0_im += g01_re * a1_im;
786  A0_im += g01_im * a1_re;
787  A0_im += g02_re * a2_im;
788  A0_im += g02_im * a2_re;
789  spinorFloat B0_re = 0;
790  B0_re += g00_re * b0_re;
791  B0_re -= g00_im * b0_im;
792  B0_re += g01_re * b1_re;
793  B0_re -= g01_im * b1_im;
794  B0_re += g02_re * b2_re;
795  B0_re -= g02_im * b2_im;
796  spinorFloat B0_im = 0;
797  B0_im += g00_re * b0_im;
798  B0_im += g00_im * b0_re;
799  B0_im += g01_re * b1_im;
800  B0_im += g01_im * b1_re;
801  B0_im += g02_re * b2_im;
802  B0_im += g02_im * b2_re;
803 
804  // multiply row 1
805  spinorFloat A1_re = 0;
806  A1_re += g10_re * a0_re;
807  A1_re -= g10_im * a0_im;
808  A1_re += g11_re * a1_re;
809  A1_re -= g11_im * a1_im;
810  A1_re += g12_re * a2_re;
811  A1_re -= g12_im * a2_im;
812  spinorFloat A1_im = 0;
813  A1_im += g10_re * a0_im;
814  A1_im += g10_im * a0_re;
815  A1_im += g11_re * a1_im;
816  A1_im += g11_im * a1_re;
817  A1_im += g12_re * a2_im;
818  A1_im += g12_im * a2_re;
819  spinorFloat B1_re = 0;
820  B1_re += g10_re * b0_re;
821  B1_re -= g10_im * b0_im;
822  B1_re += g11_re * b1_re;
823  B1_re -= g11_im * b1_im;
824  B1_re += g12_re * b2_re;
825  B1_re -= g12_im * b2_im;
826  spinorFloat B1_im = 0;
827  B1_im += g10_re * b0_im;
828  B1_im += g10_im * b0_re;
829  B1_im += g11_re * b1_im;
830  B1_im += g11_im * b1_re;
831  B1_im += g12_re * b2_im;
832  B1_im += g12_im * b2_re;
833 
834  // multiply row 2
835  spinorFloat A2_re = 0;
836  A2_re += g20_re * a0_re;
837  A2_re -= g20_im * a0_im;
838  A2_re += g21_re * a1_re;
839  A2_re -= g21_im * a1_im;
840  A2_re += g22_re * a2_re;
841  A2_re -= g22_im * a2_im;
842  spinorFloat A2_im = 0;
843  A2_im += g20_re * a0_im;
844  A2_im += g20_im * a0_re;
845  A2_im += g21_re * a1_im;
846  A2_im += g21_im * a1_re;
847  A2_im += g22_re * a2_im;
848  A2_im += g22_im * a2_re;
849  spinorFloat B2_re = 0;
850  B2_re += g20_re * b0_re;
851  B2_re -= g20_im * b0_im;
852  B2_re += g21_re * b1_re;
853  B2_re -= g21_im * b1_im;
854  B2_re += g22_re * b2_re;
855  B2_re -= g22_im * b2_im;
856  spinorFloat B2_im = 0;
857  B2_im += g20_re * b0_im;
858  B2_im += g20_im * b0_re;
859  B2_im += g21_re * b1_im;
860  B2_im += g21_im * b1_re;
861  B2_im += g22_re * b2_im;
862  B2_im += g22_im * b2_re;
863 
864  o00_re += A0_re;
865  o00_im += A0_im;
866  o10_re += B0_re;
867  o10_im += B0_im;
868  o20_re -= B0_re;
869  o20_im -= B0_im;
870  o30_re += A0_re;
871  o30_im += A0_im;
872 
873  o01_re += A1_re;
874  o01_im += A1_im;
875  o11_re += B1_re;
876  o11_im += B1_im;
877  o21_re -= B1_re;
878  o21_im -= B1_im;
879  o31_re += A1_re;
880  o31_im += A1_im;
881 
882  o02_re += A2_re;
883  o02_im += A2_im;
884  o12_re += B2_re;
885  o12_im += B2_im;
886  o22_re -= B2_re;
887  o22_im -= B2_im;
888  o32_re += A2_re;
889  o32_im += A2_im;
890 
891 }
892 
893 #ifdef MULTI_GPU
894 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
895  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
896 #endif
897 {
898  // Projector P1-
899  // 1 0 0 -1
900  // 0 1 1 0
901  // 0 1 1 0
902  // -1 0 0 1
903 
904 #ifdef MULTI_GPU
905  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
906  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
907 #else
908  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
909 #endif
910 
911 #ifdef MULTI_GPU
912  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
913 #else
914  const int ga_idx = sp_idx;
915 #endif
916 
923 
924 #ifdef MULTI_GPU
925  if (kernel_type == INTERIOR_KERNEL) {
926 #endif
927 
928  if (threadIdx.y == 0 && blockDim.y < X2) {
929  // read spinor from device memory
930  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
931 
932  // project spinor into half spinors
933  a0_re = +i00_re-i30_re;
934  a0_im = +i00_im-i30_im;
935  a1_re = +i01_re-i31_re;
936  a1_im = +i01_im-i31_im;
937  a2_re = +i02_re-i32_re;
938  a2_im = +i02_im-i32_im;
939  b0_re = +i10_re+i20_re;
940  b0_im = +i10_im+i20_im;
941  b1_re = +i11_re+i21_re;
942  b1_im = +i11_im+i21_im;
943  b2_re = +i12_re+i22_re;
944  b2_im = +i12_im+i22_im;
945  } else {
946  // load spinor from shared memory
947  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
948  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
949  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
950 
951  // project spinor into half spinors
952  a0_re = +i00_re-i30_re;
953  a0_im = +i00_im-i30_im;
954  a1_re = +i01_re-i31_re;
955  a1_im = +i01_im-i31_im;
956  a2_re = +i02_re-i32_re;
957  a2_im = +i02_im-i32_im;
958  b0_re = +i10_re+i20_re;
959  b0_im = +i10_im+i20_im;
960  b1_re = +i11_re+i21_re;
961  b1_im = +i11_im+i21_im;
962  b2_re = +i12_re+i22_re;
963  b2_im = +i12_im+i22_im;
964  }
965 
966 #ifdef MULTI_GPU
967  } else {
968 
969  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
970 
971  // read half spinor from device memory
972  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
973 
974  a0_re = i00_re; a0_im = i00_im;
975  a1_re = i01_re; a1_im = i01_im;
976  a2_re = i02_re; a2_im = i02_im;
977  b0_re = i10_re; b0_im = i10_im;
978  b1_re = i11_re; b1_im = i11_im;
979  b2_re = i12_re; b2_im = i12_im;
980 
981  }
982 #endif // MULTI_GPU
983 
984  // read gauge matrix from device memory
985  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
986 
987  // reconstruct gauge matrix
989 
990  // multiply row 0
991  spinorFloat A0_re = 0;
992  A0_re += gT00_re * a0_re;
993  A0_re -= gT00_im * a0_im;
994  A0_re += gT01_re * a1_re;
995  A0_re -= gT01_im * a1_im;
996  A0_re += gT02_re * a2_re;
997  A0_re -= gT02_im * a2_im;
998  spinorFloat A0_im = 0;
999  A0_im += gT00_re * a0_im;
1000  A0_im += gT00_im * a0_re;
1001  A0_im += gT01_re * a1_im;
1002  A0_im += gT01_im * a1_re;
1003  A0_im += gT02_re * a2_im;
1004  A0_im += gT02_im * a2_re;
1005  spinorFloat B0_re = 0;
1006  B0_re += gT00_re * b0_re;
1007  B0_re -= gT00_im * b0_im;
1008  B0_re += gT01_re * b1_re;
1009  B0_re -= gT01_im * b1_im;
1010  B0_re += gT02_re * b2_re;
1011  B0_re -= gT02_im * b2_im;
1012  spinorFloat B0_im = 0;
1013  B0_im += gT00_re * b0_im;
1014  B0_im += gT00_im * b0_re;
1015  B0_im += gT01_re * b1_im;
1016  B0_im += gT01_im * b1_re;
1017  B0_im += gT02_re * b2_im;
1018  B0_im += gT02_im * b2_re;
1019 
1020  // multiply row 1
1021  spinorFloat A1_re = 0;
1022  A1_re += gT10_re * a0_re;
1023  A1_re -= gT10_im * a0_im;
1024  A1_re += gT11_re * a1_re;
1025  A1_re -= gT11_im * a1_im;
1026  A1_re += gT12_re * a2_re;
1027  A1_re -= gT12_im * a2_im;
1028  spinorFloat A1_im = 0;
1029  A1_im += gT10_re * a0_im;
1030  A1_im += gT10_im * a0_re;
1031  A1_im += gT11_re * a1_im;
1032  A1_im += gT11_im * a1_re;
1033  A1_im += gT12_re * a2_im;
1034  A1_im += gT12_im * a2_re;
1035  spinorFloat B1_re = 0;
1036  B1_re += gT10_re * b0_re;
1037  B1_re -= gT10_im * b0_im;
1038  B1_re += gT11_re * b1_re;
1039  B1_re -= gT11_im * b1_im;
1040  B1_re += gT12_re * b2_re;
1041  B1_re -= gT12_im * b2_im;
1042  spinorFloat B1_im = 0;
1043  B1_im += gT10_re * b0_im;
1044  B1_im += gT10_im * b0_re;
1045  B1_im += gT11_re * b1_im;
1046  B1_im += gT11_im * b1_re;
1047  B1_im += gT12_re * b2_im;
1048  B1_im += gT12_im * b2_re;
1049 
1050  // multiply row 2
1051  spinorFloat A2_re = 0;
1052  A2_re += gT20_re * a0_re;
1053  A2_re -= gT20_im * a0_im;
1054  A2_re += gT21_re * a1_re;
1055  A2_re -= gT21_im * a1_im;
1056  A2_re += gT22_re * a2_re;
1057  A2_re -= gT22_im * a2_im;
1058  spinorFloat A2_im = 0;
1059  A2_im += gT20_re * a0_im;
1060  A2_im += gT20_im * a0_re;
1061  A2_im += gT21_re * a1_im;
1062  A2_im += gT21_im * a1_re;
1063  A2_im += gT22_re * a2_im;
1064  A2_im += gT22_im * a2_re;
1065  spinorFloat B2_re = 0;
1066  B2_re += gT20_re * b0_re;
1067  B2_re -= gT20_im * b0_im;
1068  B2_re += gT21_re * b1_re;
1069  B2_re -= gT21_im * b1_im;
1070  B2_re += gT22_re * b2_re;
1071  B2_re -= gT22_im * b2_im;
1072  spinorFloat B2_im = 0;
1073  B2_im += gT20_re * b0_im;
1074  B2_im += gT20_im * b0_re;
1075  B2_im += gT21_re * b1_im;
1076  B2_im += gT21_im * b1_re;
1077  B2_im += gT22_re * b2_im;
1078  B2_im += gT22_im * b2_re;
1079 
1080  o00_re += A0_re;
1081  o00_im += A0_im;
1082  o10_re += B0_re;
1083  o10_im += B0_im;
1084  o20_re += B0_re;
1085  o20_im += B0_im;
1086  o30_re -= A0_re;
1087  o30_im -= A0_im;
1088 
1089  o01_re += A1_re;
1090  o01_im += A1_im;
1091  o11_re += B1_re;
1092  o11_im += B1_im;
1093  o21_re += B1_re;
1094  o21_im += B1_im;
1095  o31_re -= A1_re;
1096  o31_im -= A1_im;
1097 
1098  o02_re += A2_re;
1099  o02_im += A2_im;
1100  o12_re += B2_re;
1101  o12_im += B2_im;
1102  o22_re += B2_re;
1103  o22_im += B2_im;
1104  o32_re -= A2_re;
1105  o32_im -= A2_im;
1106 
1107 }
1108 
1109 #ifdef MULTI_GPU
1110 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1111  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1112 #endif
1113 {
1114  // Projector P2+
1115  // 1 0 i 0
1116  // 0 1 0 -i
1117  // -i 0 1 0
1118  // 0 i 0 1
1119 
1120 #ifdef MULTI_GPU
1121  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1122  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1123 #else
1124  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1125 #endif
1126 
1127  const int ga_idx = sid;
1128 
1135 
1136 #ifdef MULTI_GPU
1137  if (kernel_type == INTERIOR_KERNEL) {
1138 #endif
1139 
1140  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1141  // read spinor from device memory
1142  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1143 
1144  // project spinor into half spinors
1145  a0_re = +i00_re-i20_im;
1146  a0_im = +i00_im+i20_re;
1147  a1_re = +i01_re-i21_im;
1148  a1_im = +i01_im+i21_re;
1149  a2_re = +i02_re-i22_im;
1150  a2_im = +i02_im+i22_re;
1151  b0_re = +i10_re+i30_im;
1152  b0_im = +i10_im-i30_re;
1153  b1_re = +i11_re+i31_im;
1154  b1_im = +i11_im-i31_re;
1155  b2_re = +i12_re+i32_im;
1156  b2_im = +i12_im-i32_re;
1157  } else {
1158  // load spinor from shared memory
1159  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1160  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1161  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1162 
1163  // project spinor into half spinors
1164  a0_re = +i00_re-i20_im;
1165  a0_im = +i00_im+i20_re;
1166  a1_re = +i01_re-i21_im;
1167  a1_im = +i01_im+i21_re;
1168  a2_re = +i02_re-i22_im;
1169  a2_im = +i02_im+i22_re;
1170  b0_re = +i10_re+i30_im;
1171  b0_im = +i10_im-i30_re;
1172  b1_re = +i11_re+i31_im;
1173  b1_im = +i11_im-i31_re;
1174  b2_re = +i12_re+i32_im;
1175  b2_im = +i12_im-i32_re;
1176  }
1177 
1178 #ifdef MULTI_GPU
1179  } else {
1180 
1181  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1182 
1183  // read half spinor from device memory
1184  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1185 
1186  a0_re = i00_re; a0_im = i00_im;
1187  a1_re = i01_re; a1_im = i01_im;
1188  a2_re = i02_re; a2_im = i02_im;
1189  b0_re = i10_re; b0_im = i10_im;
1190  b1_re = i11_re; b1_im = i11_im;
1191  b2_re = i12_re; b2_im = i12_im;
1192 
1193  }
1194 #endif // MULTI_GPU
1195 
1196  // read gauge matrix from device memory
1197  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1198 
1199  // reconstruct gauge matrix
1201 
1202  // multiply row 0
1203  spinorFloat A0_re = 0;
1204  A0_re += g00_re * a0_re;
1205  A0_re -= g00_im * a0_im;
1206  A0_re += g01_re * a1_re;
1207  A0_re -= g01_im * a1_im;
1208  A0_re += g02_re * a2_re;
1209  A0_re -= g02_im * a2_im;
1210  spinorFloat A0_im = 0;
1211  A0_im += g00_re * a0_im;
1212  A0_im += g00_im * a0_re;
1213  A0_im += g01_re * a1_im;
1214  A0_im += g01_im * a1_re;
1215  A0_im += g02_re * a2_im;
1216  A0_im += g02_im * a2_re;
1217  spinorFloat B0_re = 0;
1218  B0_re += g00_re * b0_re;
1219  B0_re -= g00_im * b0_im;
1220  B0_re += g01_re * b1_re;
1221  B0_re -= g01_im * b1_im;
1222  B0_re += g02_re * b2_re;
1223  B0_re -= g02_im * b2_im;
1224  spinorFloat B0_im = 0;
1225  B0_im += g00_re * b0_im;
1226  B0_im += g00_im * b0_re;
1227  B0_im += g01_re * b1_im;
1228  B0_im += g01_im * b1_re;
1229  B0_im += g02_re * b2_im;
1230  B0_im += g02_im * b2_re;
1231 
1232  // multiply row 1
1233  spinorFloat A1_re = 0;
1234  A1_re += g10_re * a0_re;
1235  A1_re -= g10_im * a0_im;
1236  A1_re += g11_re * a1_re;
1237  A1_re -= g11_im * a1_im;
1238  A1_re += g12_re * a2_re;
1239  A1_re -= g12_im * a2_im;
1240  spinorFloat A1_im = 0;
1241  A1_im += g10_re * a0_im;
1242  A1_im += g10_im * a0_re;
1243  A1_im += g11_re * a1_im;
1244  A1_im += g11_im * a1_re;
1245  A1_im += g12_re * a2_im;
1246  A1_im += g12_im * a2_re;
1247  spinorFloat B1_re = 0;
1248  B1_re += g10_re * b0_re;
1249  B1_re -= g10_im * b0_im;
1250  B1_re += g11_re * b1_re;
1251  B1_re -= g11_im * b1_im;
1252  B1_re += g12_re * b2_re;
1253  B1_re -= g12_im * b2_im;
1254  spinorFloat B1_im = 0;
1255  B1_im += g10_re * b0_im;
1256  B1_im += g10_im * b0_re;
1257  B1_im += g11_re * b1_im;
1258  B1_im += g11_im * b1_re;
1259  B1_im += g12_re * b2_im;
1260  B1_im += g12_im * b2_re;
1261 
1262  // multiply row 2
1263  spinorFloat A2_re = 0;
1264  A2_re += g20_re * a0_re;
1265  A2_re -= g20_im * a0_im;
1266  A2_re += g21_re * a1_re;
1267  A2_re -= g21_im * a1_im;
1268  A2_re += g22_re * a2_re;
1269  A2_re -= g22_im * a2_im;
1270  spinorFloat A2_im = 0;
1271  A2_im += g20_re * a0_im;
1272  A2_im += g20_im * a0_re;
1273  A2_im += g21_re * a1_im;
1274  A2_im += g21_im * a1_re;
1275  A2_im += g22_re * a2_im;
1276  A2_im += g22_im * a2_re;
1277  spinorFloat B2_re = 0;
1278  B2_re += g20_re * b0_re;
1279  B2_re -= g20_im * b0_im;
1280  B2_re += g21_re * b1_re;
1281  B2_re -= g21_im * b1_im;
1282  B2_re += g22_re * b2_re;
1283  B2_re -= g22_im * b2_im;
1284  spinorFloat B2_im = 0;
1285  B2_im += g20_re * b0_im;
1286  B2_im += g20_im * b0_re;
1287  B2_im += g21_re * b1_im;
1288  B2_im += g21_im * b1_re;
1289  B2_im += g22_re * b2_im;
1290  B2_im += g22_im * b2_re;
1291 
1292  o00_re += A0_re;
1293  o00_im += A0_im;
1294  o10_re += B0_re;
1295  o10_im += B0_im;
1296  o20_re += A0_im;
1297  o20_im -= A0_re;
1298  o30_re -= B0_im;
1299  o30_im += B0_re;
1300 
1301  o01_re += A1_re;
1302  o01_im += A1_im;
1303  o11_re += B1_re;
1304  o11_im += B1_im;
1305  o21_re += A1_im;
1306  o21_im -= A1_re;
1307  o31_re -= B1_im;
1308  o31_im += B1_re;
1309 
1310  o02_re += A2_re;
1311  o02_im += A2_im;
1312  o12_re += B2_re;
1313  o12_im += B2_im;
1314  o22_re += A2_im;
1315  o22_im -= A2_re;
1316  o32_re -= B2_im;
1317  o32_im += B2_re;
1318 
1319 }
1320 
1321 #ifdef MULTI_GPU
1322 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1323  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1324 #endif
1325 {
1326  // Projector P2-
1327  // 1 0 -i 0
1328  // 0 1 0 i
1329  // i 0 1 0
1330  // 0 -i 0 1
1331 
1332 #ifdef MULTI_GPU
1333  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1334  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1335 #else
1336  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1337 #endif
1338 
1339 #ifdef MULTI_GPU
1340  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1341 #else
1342  const int ga_idx = sp_idx;
1343 #endif
1344 
1351 
1352 #ifdef MULTI_GPU
1353  if (kernel_type == INTERIOR_KERNEL) {
1354 #endif
1355 
1356  if (threadIdx.z == 0 && blockDim.z < X3) {
1357  // read spinor from device memory
1358  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1359 
1360  // project spinor into half spinors
1361  a0_re = +i00_re+i20_im;
1362  a0_im = +i00_im-i20_re;
1363  a1_re = +i01_re+i21_im;
1364  a1_im = +i01_im-i21_re;
1365  a2_re = +i02_re+i22_im;
1366  a2_im = +i02_im-i22_re;
1367  b0_re = +i10_re-i30_im;
1368  b0_im = +i10_im+i30_re;
1369  b1_re = +i11_re-i31_im;
1370  b1_im = +i11_im+i31_re;
1371  b2_re = +i12_re-i32_im;
1372  b2_im = +i12_im+i32_re;
1373  } else {
1374  // load spinor from shared memory
1375  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1376  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1377  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1378 
1379  // project spinor into half spinors
1380  a0_re = +i00_re+i20_im;
1381  a0_im = +i00_im-i20_re;
1382  a1_re = +i01_re+i21_im;
1383  a1_im = +i01_im-i21_re;
1384  a2_re = +i02_re+i22_im;
1385  a2_im = +i02_im-i22_re;
1386  b0_re = +i10_re-i30_im;
1387  b0_im = +i10_im+i30_re;
1388  b1_re = +i11_re-i31_im;
1389  b1_im = +i11_im+i31_re;
1390  b2_re = +i12_re-i32_im;
1391  b2_im = +i12_im+i32_re;
1392  }
1393 
1394 #ifdef MULTI_GPU
1395  } else {
1396 
1397  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1398 
1399  // read half spinor from device memory
1400  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1401 
1402  a0_re = i00_re; a0_im = i00_im;
1403  a1_re = i01_re; a1_im = i01_im;
1404  a2_re = i02_re; a2_im = i02_im;
1405  b0_re = i10_re; b0_im = i10_im;
1406  b1_re = i11_re; b1_im = i11_im;
1407  b2_re = i12_re; b2_im = i12_im;
1408 
1409  }
1410 #endif // MULTI_GPU
1411 
1412  // read gauge matrix from device memory
1413  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1414 
1415  // reconstruct gauge matrix
1417 
1418  // multiply row 0
1419  spinorFloat A0_re = 0;
1420  A0_re += gT00_re * a0_re;
1421  A0_re -= gT00_im * a0_im;
1422  A0_re += gT01_re * a1_re;
1423  A0_re -= gT01_im * a1_im;
1424  A0_re += gT02_re * a2_re;
1425  A0_re -= gT02_im * a2_im;
1426  spinorFloat A0_im = 0;
1427  A0_im += gT00_re * a0_im;
1428  A0_im += gT00_im * a0_re;
1429  A0_im += gT01_re * a1_im;
1430  A0_im += gT01_im * a1_re;
1431  A0_im += gT02_re * a2_im;
1432  A0_im += gT02_im * a2_re;
1433  spinorFloat B0_re = 0;
1434  B0_re += gT00_re * b0_re;
1435  B0_re -= gT00_im * b0_im;
1436  B0_re += gT01_re * b1_re;
1437  B0_re -= gT01_im * b1_im;
1438  B0_re += gT02_re * b2_re;
1439  B0_re -= gT02_im * b2_im;
1440  spinorFloat B0_im = 0;
1441  B0_im += gT00_re * b0_im;
1442  B0_im += gT00_im * b0_re;
1443  B0_im += gT01_re * b1_im;
1444  B0_im += gT01_im * b1_re;
1445  B0_im += gT02_re * b2_im;
1446  B0_im += gT02_im * b2_re;
1447 
1448  // multiply row 1
1449  spinorFloat A1_re = 0;
1450  A1_re += gT10_re * a0_re;
1451  A1_re -= gT10_im * a0_im;
1452  A1_re += gT11_re * a1_re;
1453  A1_re -= gT11_im * a1_im;
1454  A1_re += gT12_re * a2_re;
1455  A1_re -= gT12_im * a2_im;
1456  spinorFloat A1_im = 0;
1457  A1_im += gT10_re * a0_im;
1458  A1_im += gT10_im * a0_re;
1459  A1_im += gT11_re * a1_im;
1460  A1_im += gT11_im * a1_re;
1461  A1_im += gT12_re * a2_im;
1462  A1_im += gT12_im * a2_re;
1463  spinorFloat B1_re = 0;
1464  B1_re += gT10_re * b0_re;
1465  B1_re -= gT10_im * b0_im;
1466  B1_re += gT11_re * b1_re;
1467  B1_re -= gT11_im * b1_im;
1468  B1_re += gT12_re * b2_re;
1469  B1_re -= gT12_im * b2_im;
1470  spinorFloat B1_im = 0;
1471  B1_im += gT10_re * b0_im;
1472  B1_im += gT10_im * b0_re;
1473  B1_im += gT11_re * b1_im;
1474  B1_im += gT11_im * b1_re;
1475  B1_im += gT12_re * b2_im;
1476  B1_im += gT12_im * b2_re;
1477 
1478  // multiply row 2
1479  spinorFloat A2_re = 0;
1480  A2_re += gT20_re * a0_re;
1481  A2_re -= gT20_im * a0_im;
1482  A2_re += gT21_re * a1_re;
1483  A2_re -= gT21_im * a1_im;
1484  A2_re += gT22_re * a2_re;
1485  A2_re -= gT22_im * a2_im;
1486  spinorFloat A2_im = 0;
1487  A2_im += gT20_re * a0_im;
1488  A2_im += gT20_im * a0_re;
1489  A2_im += gT21_re * a1_im;
1490  A2_im += gT21_im * a1_re;
1491  A2_im += gT22_re * a2_im;
1492  A2_im += gT22_im * a2_re;
1493  spinorFloat B2_re = 0;
1494  B2_re += gT20_re * b0_re;
1495  B2_re -= gT20_im * b0_im;
1496  B2_re += gT21_re * b1_re;
1497  B2_re -= gT21_im * b1_im;
1498  B2_re += gT22_re * b2_re;
1499  B2_re -= gT22_im * b2_im;
1500  spinorFloat B2_im = 0;
1501  B2_im += gT20_re * b0_im;
1502  B2_im += gT20_im * b0_re;
1503  B2_im += gT21_re * b1_im;
1504  B2_im += gT21_im * b1_re;
1505  B2_im += gT22_re * b2_im;
1506  B2_im += gT22_im * b2_re;
1507 
1508  o00_re += A0_re;
1509  o00_im += A0_im;
1510  o10_re += B0_re;
1511  o10_im += B0_im;
1512  o20_re -= A0_im;
1513  o20_im += A0_re;
1514  o30_re += B0_im;
1515  o30_im -= B0_re;
1516 
1517  o01_re += A1_re;
1518  o01_im += A1_im;
1519  o11_re += B1_re;
1520  o11_im += B1_im;
1521  o21_re -= A1_im;
1522  o21_im += A1_re;
1523  o31_re += B1_im;
1524  o31_im -= B1_re;
1525 
1526  o02_re += A2_re;
1527  o02_im += A2_im;
1528  o12_re += B2_re;
1529  o12_im += B2_im;
1530  o22_re -= A2_im;
1531  o22_im += A2_re;
1532  o32_re += B2_im;
1533  o32_im -= B2_re;
1534 
1535 }
1536 
1537 #ifdef MULTI_GPU
1538 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1539  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1540 #endif
1541 {
1542  // Projector P3+
1543  // 2 0 0 0
1544  // 0 2 0 0
1545  // 0 0 0 0
1546  // 0 0 0 0
1547 
1548 #ifdef MULTI_GPU
1549  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1550  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1551 #else
1552  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1553 #endif
1554 
1555  const int ga_idx = sid;
1556 
1557  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1558  {
1565 
1566 #ifdef MULTI_GPU
1567  if (kernel_type == INTERIOR_KERNEL) {
1568 #endif
1569 
1570  // read spinor from device memory
1571  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1572 
1573  // project spinor into half spinors
1574  a0_re = +2*i00_re;
1575  a0_im = +2*i00_im;
1576  a1_re = +2*i01_re;
1577  a1_im = +2*i01_im;
1578  a2_re = +2*i02_re;
1579  a2_im = +2*i02_im;
1580  b0_re = +2*i10_re;
1581  b0_im = +2*i10_im;
1582  b1_re = +2*i11_re;
1583  b1_im = +2*i11_im;
1584  b2_re = +2*i12_re;
1585  b2_im = +2*i12_im;
1586 
1587 #ifdef MULTI_GPU
1588  } else {
1589 
1590  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1591  const int t_proj_scale = TPROJSCALE;
1592 
1593  // read half spinor from device memory
1594  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1595 
1596  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1597  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1598  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1599  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1600  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1601  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1602 
1603  }
1604 #endif // MULTI_GPU
1605 
1606  // identity gauge matrix
1613 
1614  o00_re += A0_re;
1615  o00_im += A0_im;
1616  o10_re += B0_re;
1617  o10_im += B0_im;
1618 
1619  o01_re += A1_re;
1620  o01_im += A1_im;
1621  o11_re += B1_re;
1622  o11_im += B1_im;
1623 
1624  o02_re += A2_re;
1625  o02_im += A2_im;
1626  o12_re += B2_re;
1627  o12_im += B2_im;
1628 
1629  } else {
1636 
1637 #ifdef MULTI_GPU
1638  if (kernel_type == INTERIOR_KERNEL) {
1639 #endif
1640 
1641  // read spinor from device memory
1642  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1643 
1644  // project spinor into half spinors
1645  a0_re = +2*i00_re;
1646  a0_im = +2*i00_im;
1647  a1_re = +2*i01_re;
1648  a1_im = +2*i01_im;
1649  a2_re = +2*i02_re;
1650  a2_im = +2*i02_im;
1651  b0_re = +2*i10_re;
1652  b0_im = +2*i10_im;
1653  b1_re = +2*i11_re;
1654  b1_im = +2*i11_im;
1655  b2_re = +2*i12_re;
1656  b2_im = +2*i12_im;
1657 
1658 #ifdef MULTI_GPU
1659  } else {
1660 
1661  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1662  const int t_proj_scale = TPROJSCALE;
1663 
1664  // read half spinor from device memory
1665  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1666 
1667  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1668  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1669  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1670  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1671  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1672  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1673 
1674  }
1675 #endif // MULTI_GPU
1676 
1677  // read gauge matrix from device memory
1678  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1679 
1680  // reconstruct gauge matrix
1682 
1683  // multiply row 0
1684  spinorFloat A0_re = 0;
1685  A0_re += g00_re * a0_re;
1686  A0_re -= g00_im * a0_im;
1687  A0_re += g01_re * a1_re;
1688  A0_re -= g01_im * a1_im;
1689  A0_re += g02_re * a2_re;
1690  A0_re -= g02_im * a2_im;
1691  spinorFloat A0_im = 0;
1692  A0_im += g00_re * a0_im;
1693  A0_im += g00_im * a0_re;
1694  A0_im += g01_re * a1_im;
1695  A0_im += g01_im * a1_re;
1696  A0_im += g02_re * a2_im;
1697  A0_im += g02_im * a2_re;
1698  spinorFloat B0_re = 0;
1699  B0_re += g00_re * b0_re;
1700  B0_re -= g00_im * b0_im;
1701  B0_re += g01_re * b1_re;
1702  B0_re -= g01_im * b1_im;
1703  B0_re += g02_re * b2_re;
1704  B0_re -= g02_im * b2_im;
1705  spinorFloat B0_im = 0;
1706  B0_im += g00_re * b0_im;
1707  B0_im += g00_im * b0_re;
1708  B0_im += g01_re * b1_im;
1709  B0_im += g01_im * b1_re;
1710  B0_im += g02_re * b2_im;
1711  B0_im += g02_im * b2_re;
1712 
1713  // multiply row 1
1714  spinorFloat A1_re = 0;
1715  A1_re += g10_re * a0_re;
1716  A1_re -= g10_im * a0_im;
1717  A1_re += g11_re * a1_re;
1718  A1_re -= g11_im * a1_im;
1719  A1_re += g12_re * a2_re;
1720  A1_re -= g12_im * a2_im;
1721  spinorFloat A1_im = 0;
1722  A1_im += g10_re * a0_im;
1723  A1_im += g10_im * a0_re;
1724  A1_im += g11_re * a1_im;
1725  A1_im += g11_im * a1_re;
1726  A1_im += g12_re * a2_im;
1727  A1_im += g12_im * a2_re;
1728  spinorFloat B1_re = 0;
1729  B1_re += g10_re * b0_re;
1730  B1_re -= g10_im * b0_im;
1731  B1_re += g11_re * b1_re;
1732  B1_re -= g11_im * b1_im;
1733  B1_re += g12_re * b2_re;
1734  B1_re -= g12_im * b2_im;
1735  spinorFloat B1_im = 0;
1736  B1_im += g10_re * b0_im;
1737  B1_im += g10_im * b0_re;
1738  B1_im += g11_re * b1_im;
1739  B1_im += g11_im * b1_re;
1740  B1_im += g12_re * b2_im;
1741  B1_im += g12_im * b2_re;
1742 
1743  // multiply row 2
1744  spinorFloat A2_re = 0;
1745  A2_re += g20_re * a0_re;
1746  A2_re -= g20_im * a0_im;
1747  A2_re += g21_re * a1_re;
1748  A2_re -= g21_im * a1_im;
1749  A2_re += g22_re * a2_re;
1750  A2_re -= g22_im * a2_im;
1751  spinorFloat A2_im = 0;
1752  A2_im += g20_re * a0_im;
1753  A2_im += g20_im * a0_re;
1754  A2_im += g21_re * a1_im;
1755  A2_im += g21_im * a1_re;
1756  A2_im += g22_re * a2_im;
1757  A2_im += g22_im * a2_re;
1758  spinorFloat B2_re = 0;
1759  B2_re += g20_re * b0_re;
1760  B2_re -= g20_im * b0_im;
1761  B2_re += g21_re * b1_re;
1762  B2_re -= g21_im * b1_im;
1763  B2_re += g22_re * b2_re;
1764  B2_re -= g22_im * b2_im;
1765  spinorFloat B2_im = 0;
1766  B2_im += g20_re * b0_im;
1767  B2_im += g20_im * b0_re;
1768  B2_im += g21_re * b1_im;
1769  B2_im += g21_im * b1_re;
1770  B2_im += g22_re * b2_im;
1771  B2_im += g22_im * b2_re;
1772 
1773  o00_re += A0_re;
1774  o00_im += A0_im;
1775  o10_re += B0_re;
1776  o10_im += B0_im;
1777 
1778  o01_re += A1_re;
1779  o01_im += A1_im;
1780  o11_re += B1_re;
1781  o11_im += B1_im;
1782 
1783  o02_re += A2_re;
1784  o02_im += A2_im;
1785  o12_re += B2_re;
1786  o12_im += B2_im;
1787 
1788  }
1789 }
1790 
1791 #ifdef MULTI_GPU
1792 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1793  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1794 #endif
1795 {
1796  // Projector P3-
1797  // 0 0 0 0
1798  // 0 0 0 0
1799  // 0 0 2 0
1800  // 0 0 0 2
1801 
1802 #ifdef MULTI_GPU
1803  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1804  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1805 #else
1806  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1807 #endif
1808 
1809 #ifdef MULTI_GPU
1810  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1811 #else
1812  const int ga_idx = sp_idx;
1813 #endif
1814 
1815  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1816  {
1823 
1824 #ifdef MULTI_GPU
1825  if (kernel_type == INTERIOR_KERNEL) {
1826 #endif
1827 
1828  // read spinor from device memory
1829  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1830 
1831  // project spinor into half spinors
1832  a0_re = +2*i20_re;
1833  a0_im = +2*i20_im;
1834  a1_re = +2*i21_re;
1835  a1_im = +2*i21_im;
1836  a2_re = +2*i22_re;
1837  a2_im = +2*i22_im;
1838  b0_re = +2*i30_re;
1839  b0_im = +2*i30_im;
1840  b1_re = +2*i31_re;
1841  b1_im = +2*i31_im;
1842  b2_re = +2*i32_re;
1843  b2_im = +2*i32_im;
1844 
1845 #ifdef MULTI_GPU
1846  } else {
1847 
1848  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1849  const int t_proj_scale = TPROJSCALE;
1850 
1851  // read half spinor from device memory
1852  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1853 
1854  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1855  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1856  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1857  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1858  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1859  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1860 
1861  }
1862 #endif // MULTI_GPU
1863 
1864  // identity gauge matrix
1871 
1872  o20_re += A0_re;
1873  o20_im += A0_im;
1874  o30_re += B0_re;
1875  o30_im += B0_im;
1876 
1877  o21_re += A1_re;
1878  o21_im += A1_im;
1879  o31_re += B1_re;
1880  o31_im += B1_im;
1881 
1882  o22_re += A2_re;
1883  o22_im += A2_im;
1884  o32_re += B2_re;
1885  o32_im += B2_im;
1886 
1887  } else {
1894 
1895 #ifdef MULTI_GPU
1896  if (kernel_type == INTERIOR_KERNEL) {
1897 #endif
1898 
1899  // read spinor from device memory
1900  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1901 
1902  // project spinor into half spinors
1903  a0_re = +2*i20_re;
1904  a0_im = +2*i20_im;
1905  a1_re = +2*i21_re;
1906  a1_im = +2*i21_im;
1907  a2_re = +2*i22_re;
1908  a2_im = +2*i22_im;
1909  b0_re = +2*i30_re;
1910  b0_im = +2*i30_im;
1911  b1_re = +2*i31_re;
1912  b1_im = +2*i31_im;
1913  b2_re = +2*i32_re;
1914  b2_im = +2*i32_im;
1915 
1916 #ifdef MULTI_GPU
1917  } else {
1918 
1919  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1920  const int t_proj_scale = TPROJSCALE;
1921 
1922  // read half spinor from device memory
1923  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1924 
1925  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1926  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1927  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1928  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1929  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1930  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1931 
1932  }
1933 #endif // MULTI_GPU
1934 
1935  // read gauge matrix from device memory
1936  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1937 
1938  // reconstruct gauge matrix
1940 
1941  // multiply row 0
1942  spinorFloat A0_re = 0;
1943  A0_re += gT00_re * a0_re;
1944  A0_re -= gT00_im * a0_im;
1945  A0_re += gT01_re * a1_re;
1946  A0_re -= gT01_im * a1_im;
1947  A0_re += gT02_re * a2_re;
1948  A0_re -= gT02_im * a2_im;
1949  spinorFloat A0_im = 0;
1950  A0_im += gT00_re * a0_im;
1951  A0_im += gT00_im * a0_re;
1952  A0_im += gT01_re * a1_im;
1953  A0_im += gT01_im * a1_re;
1954  A0_im += gT02_re * a2_im;
1955  A0_im += gT02_im * a2_re;
1956  spinorFloat B0_re = 0;
1957  B0_re += gT00_re * b0_re;
1958  B0_re -= gT00_im * b0_im;
1959  B0_re += gT01_re * b1_re;
1960  B0_re -= gT01_im * b1_im;
1961  B0_re += gT02_re * b2_re;
1962  B0_re -= gT02_im * b2_im;
1963  spinorFloat B0_im = 0;
1964  B0_im += gT00_re * b0_im;
1965  B0_im += gT00_im * b0_re;
1966  B0_im += gT01_re * b1_im;
1967  B0_im += gT01_im * b1_re;
1968  B0_im += gT02_re * b2_im;
1969  B0_im += gT02_im * b2_re;
1970 
1971  // multiply row 1
1972  spinorFloat A1_re = 0;
1973  A1_re += gT10_re * a0_re;
1974  A1_re -= gT10_im * a0_im;
1975  A1_re += gT11_re * a1_re;
1976  A1_re -= gT11_im * a1_im;
1977  A1_re += gT12_re * a2_re;
1978  A1_re -= gT12_im * a2_im;
1979  spinorFloat A1_im = 0;
1980  A1_im += gT10_re * a0_im;
1981  A1_im += gT10_im * a0_re;
1982  A1_im += gT11_re * a1_im;
1983  A1_im += gT11_im * a1_re;
1984  A1_im += gT12_re * a2_im;
1985  A1_im += gT12_im * a2_re;
1986  spinorFloat B1_re = 0;
1987  B1_re += gT10_re * b0_re;
1988  B1_re -= gT10_im * b0_im;
1989  B1_re += gT11_re * b1_re;
1990  B1_re -= gT11_im * b1_im;
1991  B1_re += gT12_re * b2_re;
1992  B1_re -= gT12_im * b2_im;
1993  spinorFloat B1_im = 0;
1994  B1_im += gT10_re * b0_im;
1995  B1_im += gT10_im * b0_re;
1996  B1_im += gT11_re * b1_im;
1997  B1_im += gT11_im * b1_re;
1998  B1_im += gT12_re * b2_im;
1999  B1_im += gT12_im * b2_re;
2000 
2001  // multiply row 2
2002  spinorFloat A2_re = 0;
2003  A2_re += gT20_re * a0_re;
2004  A2_re -= gT20_im * a0_im;
2005  A2_re += gT21_re * a1_re;
2006  A2_re -= gT21_im * a1_im;
2007  A2_re += gT22_re * a2_re;
2008  A2_re -= gT22_im * a2_im;
2009  spinorFloat A2_im = 0;
2010  A2_im += gT20_re * a0_im;
2011  A2_im += gT20_im * a0_re;
2012  A2_im += gT21_re * a1_im;
2013  A2_im += gT21_im * a1_re;
2014  A2_im += gT22_re * a2_im;
2015  A2_im += gT22_im * a2_re;
2016  spinorFloat B2_re = 0;
2017  B2_re += gT20_re * b0_re;
2018  B2_re -= gT20_im * b0_im;
2019  B2_re += gT21_re * b1_re;
2020  B2_re -= gT21_im * b1_im;
2021  B2_re += gT22_re * b2_re;
2022  B2_re -= gT22_im * b2_im;
2023  spinorFloat B2_im = 0;
2024  B2_im += gT20_re * b0_im;
2025  B2_im += gT20_im * b0_re;
2026  B2_im += gT21_re * b1_im;
2027  B2_im += gT21_im * b1_re;
2028  B2_im += gT22_re * b2_im;
2029  B2_im += gT22_im * b2_re;
2030 
2031  o20_re += A0_re;
2032  o20_im += A0_im;
2033  o30_re += B0_re;
2034  o30_im += B0_im;
2035 
2036  o21_re += A1_re;
2037  o21_im += A1_im;
2038  o31_re += B1_re;
2039  o31_im += B1_im;
2040 
2041  o22_re += A2_re;
2042  o22_im += A2_im;
2043  o32_re += B2_re;
2044  o32_im += B2_im;
2045 
2046  }
2047 }
2048 
2049 #ifdef MULTI_GPU
2050 
2051 int incomplete = 0; // Have all 8 contributions been computed for this site?
2052 
2053 switch(kernel_type) { // intentional fall-through
2054 case INTERIOR_KERNEL:
2055  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2056 case EXTERIOR_KERNEL_T:
2057  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2058 case EXTERIOR_KERNEL_Z:
2059  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2060 case EXTERIOR_KERNEL_Y:
2061  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2062 }
2063 
2064 if (!incomplete)
2065 #endif // MULTI_GPU
2066 {
2067  {
2068  // apply twisted mass rotation
2069  VOLATILE spinorFloat tmp00_re = +o00_re-o20_im*a;
2070  VOLATILE spinorFloat tmp00_im = +o00_im+o20_re*a;
2071  VOLATILE spinorFloat tmp01_re = +o01_re-o21_im*a;
2072  VOLATILE spinorFloat tmp01_im = +o01_im+o21_re*a;
2073  VOLATILE spinorFloat tmp02_re = +o02_re-o22_im*a;
2074  VOLATILE spinorFloat tmp02_im = +o02_im+o22_re*a;
2075 
2076  VOLATILE spinorFloat tmp10_re = +o10_re-o30_im*a;
2077  VOLATILE spinorFloat tmp10_im = +o10_im+o30_re*a;
2078  VOLATILE spinorFloat tmp11_re = +o11_re-o31_im*a;
2079  VOLATILE spinorFloat tmp11_im = +o11_im+o31_re*a;
2080  VOLATILE spinorFloat tmp12_re = +o12_re-o32_im*a;
2081  VOLATILE spinorFloat tmp12_im = +o12_im+o32_re*a;
2082 
2083  VOLATILE spinorFloat tmp20_re = -o00_im*a+o20_re;
2084  VOLATILE spinorFloat tmp20_im = +o00_re*a+o20_im;
2085  VOLATILE spinorFloat tmp21_re = -o01_im*a+o21_re;
2086  VOLATILE spinorFloat tmp21_im = +o01_re*a+o21_im;
2087  VOLATILE spinorFloat tmp22_re = -o02_im*a+o22_re;
2088  VOLATILE spinorFloat tmp22_im = +o02_re*a+o22_im;
2089 
2090  VOLATILE spinorFloat tmp30_re = -o10_im*a+o30_re;
2091  VOLATILE spinorFloat tmp30_im = +o10_re*a+o30_im;
2092  VOLATILE spinorFloat tmp31_re = -o11_im*a+o31_re;
2093  VOLATILE spinorFloat tmp31_im = +o11_re*a+o31_im;
2094  VOLATILE spinorFloat tmp32_re = -o12_im*a+o32_re;
2095  VOLATILE spinorFloat tmp32_im = +o12_re*a+o32_im;
2096 
2097 
2098 #ifndef DSLASH_XPAY
2099  //scale by b = 1/(1 + a*a)
2100  o00_re = b*tmp00_re;
2101  o00_im = b*tmp00_im;
2102  o01_re = b*tmp01_re;
2103  o01_im = b*tmp01_im;
2104  o02_re = b*tmp02_re;
2105  o02_im = b*tmp02_im;
2106  o10_re = b*tmp10_re;
2107  o10_im = b*tmp10_im;
2108  o11_re = b*tmp11_re;
2109  o11_im = b*tmp11_im;
2110  o12_re = b*tmp12_re;
2111  o12_im = b*tmp12_im;
2112  o20_re = b*tmp20_re;
2113  o20_im = b*tmp20_im;
2114  o21_re = b*tmp21_re;
2115  o21_im = b*tmp21_im;
2116  o22_re = b*tmp22_re;
2117  o22_im = b*tmp22_im;
2118  o30_re = b*tmp30_re;
2119  o30_im = b*tmp30_im;
2120  o31_re = b*tmp31_re;
2121  o31_im = b*tmp31_im;
2122  o32_re = b*tmp32_re;
2123  o32_im = b*tmp32_im;
2124 #else
2125  o00_re = tmp00_re;
2126  o00_im = tmp00_im;
2127  o01_re = tmp01_re;
2128  o01_im = tmp01_im;
2129  o02_re = tmp02_re;
2130  o02_im = tmp02_im;
2131  o10_re = tmp10_re;
2132  o10_im = tmp10_im;
2133  o11_re = tmp11_re;
2134  o11_im = tmp11_im;
2135  o12_re = tmp12_re;
2136  o12_im = tmp12_im;
2137  o20_re = tmp20_re;
2138  o20_im = tmp20_im;
2139  o21_re = tmp21_re;
2140  o21_im = tmp21_im;
2141  o22_re = tmp22_re;
2142  o22_im = tmp22_im;
2143  o30_re = tmp30_re;
2144  o30_im = tmp30_im;
2145  o31_re = tmp31_re;
2146  o31_im = tmp31_im;
2147  o32_re = tmp32_re;
2148  o32_im = tmp32_im;
2149 #endif // DSLASH_XPAY
2150 
2151  }
2152 #ifdef DSLASH_XPAY
2153 
2154  READ_ACCUM(ACCUMTEX, sp_stride)
2155 
2156  o00_re = b*o00_re+acc00_re;
2157  o00_im = b*o00_im+acc00_im;
2158  o01_re = b*o01_re+acc01_re;
2159  o01_im = b*o01_im+acc01_im;
2160  o02_re = b*o02_re+acc02_re;
2161  o02_im = b*o02_im+acc02_im;
2162  o10_re = b*o10_re+acc10_re;
2163  o10_im = b*o10_im+acc10_im;
2164  o11_re = b*o11_re+acc11_re;
2165  o11_im = b*o11_im+acc11_im;
2166  o12_re = b*o12_re+acc12_re;
2167  o12_im = b*o12_im+acc12_im;
2168  o20_re = b*o20_re+acc20_re;
2169  o20_im = b*o20_im+acc20_im;
2170  o21_re = b*o21_re+acc21_re;
2171  o21_im = b*o21_im+acc21_im;
2172  o22_re = b*o22_re+acc22_re;
2173  o22_im = b*o22_im+acc22_im;
2174  o30_re = b*o30_re+acc30_re;
2175  o30_im = b*o30_im+acc30_im;
2176  o31_re = b*o31_re+acc31_re;
2177  o31_im = b*o31_im+acc31_im;
2178  o32_re = b*o32_re+acc32_re;
2179  o32_im = b*o32_im+acc32_im;
2180 #endif // DSLASH_XPAY
2181 }
2182 
2183 // write spinor field back to device memory
2185 
2186 // undefine to prevent warning when precision is changed
2187 #undef spinorFloat
2188 #undef WRITE_SPINOR_SHARED
2189 #undef READ_SPINOR_SHARED
2190 #undef SHARED_STRIDE
2191 
2192 #undef g00_re
2193 #undef g00_im
2194 #undef g01_re
2195 #undef g01_im
2196 #undef g02_re
2197 #undef g02_im
2198 #undef g10_re
2199 #undef g10_im
2200 #undef g11_re
2201 #undef g11_im
2202 #undef g12_re
2203 #undef g12_im
2204 #undef g20_re
2205 #undef g20_im
2206 #undef g21_re
2207 #undef g21_im
2208 #undef g22_re
2209 #undef g22_im
2210 
2211 #undef i00_re
2212 #undef i00_im
2213 #undef i01_re
2214 #undef i01_im
2215 #undef i02_re
2216 #undef i02_im
2217 #undef i10_re
2218 #undef i10_im
2219 #undef i11_re
2220 #undef i11_im
2221 #undef i12_re
2222 #undef i12_im
2223 #undef i20_re
2224 #undef i20_im
2225 #undef i21_re
2226 #undef i21_im
2227 #undef i22_re
2228 #undef i22_im
2229 #undef i30_re
2230 #undef i30_im
2231 #undef i31_re
2232 #undef i31_im
2233 #undef i32_re
2234 #undef i32_im
2235 
2236 #undef acc00_re
2237 #undef acc00_im
2238 #undef acc01_re
2239 #undef acc01_im
2240 #undef acc02_re
2241 #undef acc02_im
2242 #undef acc10_re
2243 #undef acc10_im
2244 #undef acc11_re
2245 #undef acc11_im
2246 #undef acc12_re
2247 #undef acc12_im
2248 #undef acc20_re
2249 #undef acc20_im
2250 #undef acc21_re
2251 #undef acc21_im
2252 #undef acc22_re
2253 #undef acc22_im
2254 #undef acc30_re
2255 #undef acc30_im
2256 #undef acc31_re
2257 #undef acc31_im
2258 #undef acc32_re
2259 #undef acc32_im
2260 
2261 
2262 #undef o00_re
2263 #undef o00_im
2264 #undef o01_re
2265 #undef o01_im
2266 #undef o02_re
2267 #undef o02_im
2268 #undef o10_re
2269 #undef o10_im
2270 #undef o11_re
2271 #undef o11_im
2272 #undef o12_re
2273 #undef o12_im
2274 #undef o20_re
2275 #undef o20_im
2276 #undef o21_re
2277 #undef o21_im
2278 #undef o22_re
2279 #undef o22_im
2280 #undef o30_re
2281 #undef o30_im
2282 #undef o31_re
2283 #undef o31_im
2284 #undef o32_re
2285 #undef o32_im
2286 
2287 #undef VOLATILE