QUDA 0.9.0
tm_dslash_dagger_fermi_core.h
Source listing of the auto-generated twisted-mass dslash (dagger) kernel core for Fermi-generation GPUs.
1 // *** CUDA DSLASH DAGGER ***
2 
// Shared-memory budget per thread: 24 reals = one full spinor
// (4 spins x 3 colors x {re,im}) staged per site by WRITE_SPINOR_SHARED below.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// The NVVM back end (CUDA >= 4.1 targeting sm_20+) gets an empty VOLATILE;
// the legacy Open64 back end keeps 'volatile' — presumably to inhibit
// register caching of shared-memory spinor values (TODO confirm against
// QUDA build notes).
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Map scalar names i{spin}{color}_{re,im} onto the vector registers the
// loaders fill: double2 I0..I11 (two reals per register) in double
// precision, float4 I0..I5 (four reals per register) otherwise.  The
// acc{spin}{color}_{re,im} names alias the accumulator registers accum*
// in the same layout.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
// single/half precision: four reals packed per float4 register
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
// Map g{row}{col}_{re,im} of the 3x3 SU(3) link matrix onto the gauge
// registers: float2/double2 G0..G8 (one complex entry per register) for
// the FLOAT2 layout, float4 G0..G4 otherwise.
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link
// gT{row}{col} is the conjugate transpose (dagger) of the g matrix above:
// gT[r][c] = conj(g[c][r]).  Used for the backward (negative-direction)
// hops, which multiply by U^dagger.
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // output spinor
// NOTE(review): this listing elides internal lines 182-205, which
// presumably declare the output-spinor registers o00_re .. o32_im used
// throughout this file — confirm against the original source.
206 
// Shared-memory row stride chosen so a warp's accesses avoid bank
// conflicts on Fermi: 16 double2 (= 128 B) or 32 float4 rows.
207 #ifdef SPINOR_DOUBLE
208 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
209 #else
210 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
211 #endif
212 
213 #include "read_gauge.h"
214 #include "io_spinor.h"
215 
// coord: 5d lattice coordinates of this thread's site; X: full-lattice
// linear site index; sid: checkerboarded site/thread id.
216 int coord[5];
217 int X;
218 
219 int sid;
220 
// Per-thread setup: interior threads compute their site coordinates and
// zero the output spinor; exterior (face) threads recover coordinates
// from the face index and seed the output with the accumulator spinor.
221 #ifdef MULTI_GPU
222 int face_idx;
223 if (kernel_type == INTERIOR_KERNEL) {
224 #endif
225 
226  // Assume even dimensions
// NOTE(review): the listing elides internal line 227 here, which
// presumably derives sid/X/coord for the interior kernel before the
// bounds checks below — confirm against the original source.
228 
229  // only need to check Y and Z dims currently since X and T set to match exactly
230  if (coord[1] >= param.dc.X[1]) return;
 // Fixed defect: was 'cpprd[2]' (undeclared identifier); the Z-dim bound
 // check must read the same coord[] array as the Y-dim check above.
231  if (coord[2] >= param.dc.X[2]) return;
232 
 // zero-initialize the output spinor accumulator
233  o00_re = 0; o00_im = 0;
234  o01_re = 0; o01_im = 0;
235  o02_re = 0; o02_im = 0;
236  o10_re = 0; o10_im = 0;
237  o11_re = 0; o11_im = 0;
238  o12_re = 0; o12_im = 0;
239  o20_re = 0; o20_im = 0;
240  o21_re = 0; o21_im = 0;
241  o22_re = 0; o22_im = 0;
242  o30_re = 0; o30_im = 0;
243  o31_re = 0; o31_im = 0;
244  o32_re = 0; o32_im = 0;
245 
246 #ifdef MULTI_GPU
247 } else { // exterior kernel
248 
249  sid = blockIdx.x*blockDim.x + threadIdx.x;
250  if (sid >= param.threads) return;
251 
252  const int face_volume = (param.threads >> 1); // volume of one face
253  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
254  face_idx = sid - face_num*face_volume; // index into the respective face
255 
256  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
257  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
258  //sp_idx = face_idx + param.ghostOffset[dim];
259 
260  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
261 
// NOTE(review): internal line 262 is elided by this listing — presumably
// the intermediate-spinor read that fills i00..i32 before the copy below.
263 
 // seed the output with the previously accumulated spinor
264  o00_re = i00_re; o00_im = i00_im;
265  o01_re = i01_re; o01_im = i01_im;
266  o02_re = i02_re; o02_im = i02_im;
267  o10_re = i10_re; o10_im = i10_im;
268  o11_re = i11_re; o11_im = i11_im;
269  o12_re = i12_re; o12_im = i12_im;
270  o20_re = i20_re; o20_im = i20_im;
271  o21_re = i21_re; o21_im = i21_im;
272  o22_re = i22_re; o22_im = i22_im;
273  o30_re = i30_re; o30_im = i30_im;
274  o31_re = i31_re; o31_im = i31_im;
275  o32_re = i32_re; o32_im = i32_im;
276 }
277 #endif // MULTI_GPU
278 
279 
// Forward hop in the X direction: spin-project the x+1 neighbor with the
// dagger projector P0+ (matrix below), multiply the two half spinors by
// the gauge link U_x(x), and accumulate into the output spinor.
// NOTE(review): this listing elides internal lines 303-308 and the
// declarations at 368, 371, 378, 385, ... — presumably the half-spinor
// registers a0..b2, the RECONSTRUCT_GAUGE_MATRIX call, and the
// 'spinorFloat A0_re = 0;'-style declarations present in the other
// direction blocks below — confirm against the original source.
280 #ifdef MULTI_GPU
281 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
282  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
283 #endif
284 {
285  // Projector P0+
286  // 1 0 0 i
287  // 0 1 i 0
288  // 0 -i 1 0
289  // -i 0 0 1
290 
 // neighbor site index: wrap at the X boundary, or ghost-zone offset for
 // the exterior kernel (>>1 converts full-lattice to checkerboard index)
291 #ifdef MULTI_GPU
292  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
293  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
294 #if (DD_PREC==2) // half precision
295  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
296 #endif
297 #else
298  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
299 #endif
300 
301  const int ga_idx = sid;
302 
309 
310 #ifdef MULTI_GPU
311  if (kernel_type == INTERIOR_KERNEL) {
312 #endif
313 
314  // read spinor from device memory
315 #ifdef TWIST_INV_DSLASH
316 #ifdef SPINOR_DOUBLE
317  const spinorFloat a = param.a;
318  const spinorFloat b = param.b;
319 #else
320  const spinorFloat a = param.a_f;
321  const spinorFloat b = param.b_f;
322 #endif
323 #endif
324  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
325 #ifdef TWIST_INV_DSLASH
326  APPLY_TWIST_INV(-a, b, i);
327 #endif
328 
 // stage the (already twist-rotated) neighbor spinor so the other
 // direction blocks can re-read it from shared memory
329  // store spinor into shared memory
330  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
331 
332  // project spinor into half spinors
333  a0_re = +i00_re-i30_im;
334  a0_im = +i00_im+i30_re;
335  a1_re = +i01_re-i31_im;
336  a1_im = +i01_im+i31_re;
337  a2_re = +i02_re-i32_im;
338  a2_im = +i02_im+i32_re;
339  b0_re = +i10_re-i20_im;
340  b0_im = +i10_im+i20_re;
341  b1_re = +i11_re-i21_im;
342  b1_im = +i11_im+i21_re;
343  b2_re = +i12_re-i22_im;
344  b2_im = +i12_im+i22_re;
345 
346 #ifdef MULTI_GPU
347  } else {
348 
349  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
350 
 // ghost spinors arrive already spin-projected, so no projection here
351  // read half spinor from device memory
352  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
353 
354  a0_re = i00_re; a0_im = i00_im;
355  a1_re = i01_re; a1_im = i01_im;
356  a2_re = i02_re; a2_im = i02_im;
357  b0_re = i10_re; b0_im = i10_im;
358  b1_re = i11_re; b1_im = i11_im;
359  b2_re = i12_re; b2_im = i12_im;
360 
361  }
362 #endif // MULTI_GPU
363 
364  // read gauge matrix from device memory
365  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
366 
367  // reconstruct gauge matrix
369 
 // complex 3x3 * 3-vector: A = U*a, B = U*b, expanded term by term
370  // multiply row 0
372  A0_re += g00_re * a0_re;
373  A0_re -= g00_im * a0_im;
374  A0_re += g01_re * a1_re;
375  A0_re -= g01_im * a1_im;
376  A0_re += g02_re * a2_re;
377  A0_re -= g02_im * a2_im;
379  A0_im += g00_re * a0_im;
380  A0_im += g00_im * a0_re;
381  A0_im += g01_re * a1_im;
382  A0_im += g01_im * a1_re;
383  A0_im += g02_re * a2_im;
384  A0_im += g02_im * a2_re;
386  B0_re += g00_re * b0_re;
387  B0_re -= g00_im * b0_im;
388  B0_re += g01_re * b1_re;
389  B0_re -= g01_im * b1_im;
390  B0_re += g02_re * b2_re;
391  B0_re -= g02_im * b2_im;
393  B0_im += g00_re * b0_im;
394  B0_im += g00_im * b0_re;
395  B0_im += g01_re * b1_im;
396  B0_im += g01_im * b1_re;
397  B0_im += g02_re * b2_im;
398  B0_im += g02_im * b2_re;
399 
400  // multiply row 1
402  A1_re += g10_re * a0_re;
403  A1_re -= g10_im * a0_im;
404  A1_re += g11_re * a1_re;
405  A1_re -= g11_im * a1_im;
406  A1_re += g12_re * a2_re;
407  A1_re -= g12_im * a2_im;
409  A1_im += g10_re * a0_im;
410  A1_im += g10_im * a0_re;
411  A1_im += g11_re * a1_im;
412  A1_im += g11_im * a1_re;
413  A1_im += g12_re * a2_im;
414  A1_im += g12_im * a2_re;
416  B1_re += g10_re * b0_re;
417  B1_re -= g10_im * b0_im;
418  B1_re += g11_re * b1_re;
419  B1_re -= g11_im * b1_im;
420  B1_re += g12_re * b2_re;
421  B1_re -= g12_im * b2_im;
423  B1_im += g10_re * b0_im;
424  B1_im += g10_im * b0_re;
425  B1_im += g11_re * b1_im;
426  B1_im += g11_im * b1_re;
427  B1_im += g12_re * b2_im;
428  B1_im += g12_im * b2_re;
429 
430  // multiply row 2
432  A2_re += g20_re * a0_re;
433  A2_re -= g20_im * a0_im;
434  A2_re += g21_re * a1_re;
435  A2_re -= g21_im * a1_im;
436  A2_re += g22_re * a2_re;
437  A2_re -= g22_im * a2_im;
439  A2_im += g20_re * a0_im;
440  A2_im += g20_im * a0_re;
441  A2_im += g21_re * a1_im;
442  A2_im += g21_im * a1_re;
443  A2_im += g22_re * a2_im;
444  A2_im += g22_im * a2_re;
446  B2_re += g20_re * b0_re;
447  B2_re -= g20_im * b0_im;
448  B2_re += g21_re * b1_re;
449  B2_re -= g21_im * b1_im;
450  B2_re += g22_re * b2_re;
451  B2_re -= g22_im * b2_im;
453  B2_im += g20_re * b0_im;
454  B2_im += g20_im * b0_re;
455  B2_im += g21_re * b1_im;
456  B2_im += g21_im * b1_re;
457  B2_im += g22_re * b2_im;
458  B2_im += g22_im * b2_re;
459 
 // reconstruct the full spinor from the half spinors using the P0+
 // rows above: lower components pick up -i * (A,B) contributions
460  o00_re += A0_re;
461  o00_im += A0_im;
462  o10_re += B0_re;
463  o10_im += B0_im;
464  o20_re += B0_im;
465  o20_im -= B0_re;
466  o30_re += A0_im;
467  o30_im -= A0_re;
468 
469  o01_re += A1_re;
470  o01_im += A1_im;
471  o11_re += B1_re;
472  o11_im += B1_im;
473  o21_re += B1_im;
474  o21_im -= B1_re;
475  o31_re += A1_im;
476  o31_im -= A1_re;
477 
478  o02_re += A2_re;
479  o02_im += A2_im;
480  o12_re += B2_re;
481  o12_im += B2_im;
482  o22_re += B2_im;
483  o22_im -= B2_re;
484  o32_re += A2_im;
485  o32_im -= A2_re;
486 
487 }
488 
// Backward hop in the X direction: spin-project the x-1 neighbor with the
// dagger projector P0- (matrix below), multiply by the conjugate-transposed
// link U_x(x-1)^dagger (the gT macros), and accumulate into the output.
// NOTE(review): internal lines 516-521 and 568 are elided by this listing —
// presumably the half-spinor register declarations and the
// RECONSTRUCT_GAUGE_MATRIX call — confirm against the original source.
489 #ifdef MULTI_GPU
490 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
491  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
492 #endif
493 {
494  // Projector P0-
495  // 1 0 0 -i
496  // 0 1 -i 0
497  // 0 i 1 0
498  // i 0 0 1
499 
500 #ifdef MULTI_GPU
501  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
502  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
503 #if (DD_PREC==2) // half precision
504  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
505 #endif
506 #else
507  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
508 #endif
509 
 // backward hop uses the link owned by the neighbor site (or the padded
 // ghost-link region Vh+face_idx in the exterior kernel)
510 #ifdef MULTI_GPU
511  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
512 #else
513  const int ga_idx = sp_idx;
514 #endif
515 
522 
523 #ifdef MULTI_GPU
524  if (kernel_type == INTERIOR_KERNEL) {
525 #endif
526 
 // the +x pass above staged each thread's forward-neighbor spinor in
 // shared memory; the barrier orders those writes before this read
527  // load spinor from shared memory
528  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
529  __syncthreads();
530  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
531 
532  // project spinor into half spinors
533  a0_re = +i00_re+i30_im;
534  a0_im = +i00_im-i30_re;
535  a1_re = +i01_re+i31_im;
536  a1_im = +i01_im-i31_re;
537  a2_re = +i02_re+i32_im;
538  a2_im = +i02_im-i32_re;
539  b0_re = +i10_re+i20_im;
540  b0_im = +i10_im-i20_re;
541  b1_re = +i11_re+i21_im;
542  b1_im = +i11_im-i21_re;
543  b2_re = +i12_re+i22_im;
544  b2_im = +i12_im-i22_re;
545 
546 #ifdef MULTI_GPU
547  } else {
548 
549  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
550 
551  // read half spinor from device memory
552  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
553 
554  a0_re = i00_re; a0_im = i00_im;
555  a1_re = i01_re; a1_im = i01_im;
556  a2_re = i02_re; a2_im = i02_im;
557  b0_re = i10_re; b0_im = i10_im;
558  b1_re = i11_re; b1_im = i11_im;
559  b2_re = i12_re; b2_im = i12_im;
560 
561  }
562 #endif // MULTI_GPU
563 
564  // read gauge matrix from device memory
565  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
566 
567  // reconstruct gauge matrix
569 
 // complex 3x3 * 3-vector with U^dagger: A = U^dag*a, B = U^dag*b
570  // multiply row 0
571  spinorFloat A0_re = 0;
572  A0_re += gT00_re * a0_re;
573  A0_re -= gT00_im * a0_im;
574  A0_re += gT01_re * a1_re;
575  A0_re -= gT01_im * a1_im;
576  A0_re += gT02_re * a2_re;
577  A0_re -= gT02_im * a2_im;
578  spinorFloat A0_im = 0;
579  A0_im += gT00_re * a0_im;
580  A0_im += gT00_im * a0_re;
581  A0_im += gT01_re * a1_im;
582  A0_im += gT01_im * a1_re;
583  A0_im += gT02_re * a2_im;
584  A0_im += gT02_im * a2_re;
585  spinorFloat B0_re = 0;
586  B0_re += gT00_re * b0_re;
587  B0_re -= gT00_im * b0_im;
588  B0_re += gT01_re * b1_re;
589  B0_re -= gT01_im * b1_im;
590  B0_re += gT02_re * b2_re;
591  B0_re -= gT02_im * b2_im;
592  spinorFloat B0_im = 0;
593  B0_im += gT00_re * b0_im;
594  B0_im += gT00_im * b0_re;
595  B0_im += gT01_re * b1_im;
596  B0_im += gT01_im * b1_re;
597  B0_im += gT02_re * b2_im;
598  B0_im += gT02_im * b2_re;
599 
600  // multiply row 1
601  spinorFloat A1_re = 0;
602  A1_re += gT10_re * a0_re;
603  A1_re -= gT10_im * a0_im;
604  A1_re += gT11_re * a1_re;
605  A1_re -= gT11_im * a1_im;
606  A1_re += gT12_re * a2_re;
607  A1_re -= gT12_im * a2_im;
608  spinorFloat A1_im = 0;
609  A1_im += gT10_re * a0_im;
610  A1_im += gT10_im * a0_re;
611  A1_im += gT11_re * a1_im;
612  A1_im += gT11_im * a1_re;
613  A1_im += gT12_re * a2_im;
614  A1_im += gT12_im * a2_re;
615  spinorFloat B1_re = 0;
616  B1_re += gT10_re * b0_re;
617  B1_re -= gT10_im * b0_im;
618  B1_re += gT11_re * b1_re;
619  B1_re -= gT11_im * b1_im;
620  B1_re += gT12_re * b2_re;
621  B1_re -= gT12_im * b2_im;
622  spinorFloat B1_im = 0;
623  B1_im += gT10_re * b0_im;
624  B1_im += gT10_im * b0_re;
625  B1_im += gT11_re * b1_im;
626  B1_im += gT11_im * b1_re;
627  B1_im += gT12_re * b2_im;
628  B1_im += gT12_im * b2_re;
629 
630  // multiply row 2
631  spinorFloat A2_re = 0;
632  A2_re += gT20_re * a0_re;
633  A2_re -= gT20_im * a0_im;
634  A2_re += gT21_re * a1_re;
635  A2_re -= gT21_im * a1_im;
636  A2_re += gT22_re * a2_re;
637  A2_re -= gT22_im * a2_im;
638  spinorFloat A2_im = 0;
639  A2_im += gT20_re * a0_im;
640  A2_im += gT20_im * a0_re;
641  A2_im += gT21_re * a1_im;
642  A2_im += gT21_im * a1_re;
643  A2_im += gT22_re * a2_im;
644  A2_im += gT22_im * a2_re;
645  spinorFloat B2_re = 0;
646  B2_re += gT20_re * b0_re;
647  B2_re -= gT20_im * b0_im;
648  B2_re += gT21_re * b1_re;
649  B2_re -= gT21_im * b1_im;
650  B2_re += gT22_re * b2_re;
651  B2_re -= gT22_im * b2_im;
652  spinorFloat B2_im = 0;
653  B2_im += gT20_re * b0_im;
654  B2_im += gT20_im * b0_re;
655  B2_im += gT21_re * b1_im;
656  B2_im += gT21_im * b1_re;
657  B2_im += gT22_re * b2_im;
658  B2_im += gT22_im * b2_re;
659 
 // reconstruct via P0- rows: lower components pick up +i * (A,B)
660  o00_re += A0_re;
661  o00_im += A0_im;
662  o10_re += B0_re;
663  o10_im += B0_im;
664  o20_re -= B0_im;
665  o20_im += B0_re;
666  o30_re -= A0_im;
667  o30_im += A0_re;
668 
669  o01_re += A1_re;
670  o01_im += A1_im;
671  o11_re += B1_re;
672  o11_im += B1_im;
673  o21_re -= B1_im;
674  o21_im += B1_re;
675  o31_re -= A1_im;
676  o31_im += A1_re;
677 
678  o02_re += A2_re;
679  o02_im += A2_im;
680  o12_re += B2_re;
681  o12_im += B2_im;
682  o22_re -= B2_im;
683  o22_im += B2_re;
684  o32_re -= A2_im;
685  o32_im += A2_re;
686 
687 }
688 
// Forward hop in the Y direction: spin-project the y+1 neighbor with the
// dagger projector P1+ (matrix below), multiply by U_y(x), and accumulate.
// Threads on the block's Y edge re-read the neighbor from global memory;
// all others reuse the spinor staged in shared memory by the +x pass.
// NOTE(review): internal lines 712-717 and 795 are elided by this listing —
// presumably half-spinor declarations and the RECONSTRUCT_GAUGE_MATRIX call.
689 #ifdef MULTI_GPU
690 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
691  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
692 #endif
693 {
694  // Projector P1+
695  // 1 0 0 1
696  // 0 1 -1 0
697  // 0 -1 1 0
698  // 1 0 0 1
699 
700 #ifdef MULTI_GPU
701  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
702  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
703 #if (DD_PREC==2) // half precision
704  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
705 #endif
706 #else
707  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
708 #endif
709 
710  const int ga_idx = sid;
711 
718 
719 #ifdef MULTI_GPU
720  if (kernel_type == INTERIOR_KERNEL) {
721 #endif
722 
 // block-edge threads: y+1 neighbor lives outside this thread block's
 // shared-memory tile, so fetch it from device memory
723  if (threadIdx.y == blockDim.y-1 && blockDim.y < param.dc.X[1] ) {
724  // read spinor from device memory
725 #ifdef TWIST_INV_DSLASH
726 #ifdef SPINOR_DOUBLE
727  const spinorFloat a = param.a;
728  const spinorFloat b = param.b;
729 #else
730  const spinorFloat a = param.a_f;
731  const spinorFloat b = param.b_f;
732 #endif
733 #endif
734  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
735 #ifdef TWIST_INV_DSLASH
736  APPLY_TWIST_INV(-a, b, i);
737 #endif
738 
739  // project spinor into half spinors
740  a0_re = +i00_re+i30_re;
741  a0_im = +i00_im+i30_im;
742  a1_re = +i01_re+i31_re;
743  a1_im = +i01_im+i31_im;
744  a2_re = +i02_re+i32_re;
745  a2_im = +i02_im+i32_im;
746  b0_re = +i10_re-i20_re;
747  b0_im = +i10_im-i20_im;
748  b1_re = +i11_re-i21_re;
749  b1_im = +i11_im-i21_im;
750  b2_re = +i12_re-i22_re;
751  b2_im = +i12_im-i22_im;
752  } else {
 // interior-of-block threads: shared memory holds the +x-shifted
 // spinor, so compensate in tx and step ty to the y+1 row
753  // load spinor from shared memory
754  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
755  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
756  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
757 
758  // project spinor into half spinors
759  a0_re = +i00_re+i30_re;
760  a0_im = +i00_im+i30_im;
761  a1_re = +i01_re+i31_re;
762  a1_im = +i01_im+i31_im;
763  a2_re = +i02_re+i32_re;
764  a2_im = +i02_im+i32_im;
765  b0_re = +i10_re-i20_re;
766  b0_im = +i10_im-i20_im;
767  b1_re = +i11_re-i21_re;
768  b1_im = +i11_im-i21_im;
769  b2_re = +i12_re-i22_re;
770  b2_im = +i12_im-i22_im;
771  }
772 
773 #ifdef MULTI_GPU
774  } else {
775 
776  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
777 
778  // read half spinor from device memory
779  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
780 
781  a0_re = i00_re; a0_im = i00_im;
782  a1_re = i01_re; a1_im = i01_im;
783  a2_re = i02_re; a2_im = i02_im;
784  b0_re = i10_re; b0_im = i10_im;
785  b1_re = i11_re; b1_im = i11_im;
786  b2_re = i12_re; b2_im = i12_im;
787 
788  }
789 #endif // MULTI_GPU
790 
791  // read gauge matrix from device memory
792  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
793 
794  // reconstruct gauge matrix
796 
 // complex 3x3 * 3-vector: A = U*a, B = U*b
797  // multiply row 0
798  spinorFloat A0_re = 0;
799  A0_re += g00_re * a0_re;
800  A0_re -= g00_im * a0_im;
801  A0_re += g01_re * a1_re;
802  A0_re -= g01_im * a1_im;
803  A0_re += g02_re * a2_re;
804  A0_re -= g02_im * a2_im;
805  spinorFloat A0_im = 0;
806  A0_im += g00_re * a0_im;
807  A0_im += g00_im * a0_re;
808  A0_im += g01_re * a1_im;
809  A0_im += g01_im * a1_re;
810  A0_im += g02_re * a2_im;
811  A0_im += g02_im * a2_re;
812  spinorFloat B0_re = 0;
813  B0_re += g00_re * b0_re;
814  B0_re -= g00_im * b0_im;
815  B0_re += g01_re * b1_re;
816  B0_re -= g01_im * b1_im;
817  B0_re += g02_re * b2_re;
818  B0_re -= g02_im * b2_im;
819  spinorFloat B0_im = 0;
820  B0_im += g00_re * b0_im;
821  B0_im += g00_im * b0_re;
822  B0_im += g01_re * b1_im;
823  B0_im += g01_im * b1_re;
824  B0_im += g02_re * b2_im;
825  B0_im += g02_im * b2_re;
826 
827  // multiply row 1
828  spinorFloat A1_re = 0;
829  A1_re += g10_re * a0_re;
830  A1_re -= g10_im * a0_im;
831  A1_re += g11_re * a1_re;
832  A1_re -= g11_im * a1_im;
833  A1_re += g12_re * a2_re;
834  A1_re -= g12_im * a2_im;
835  spinorFloat A1_im = 0;
836  A1_im += g10_re * a0_im;
837  A1_im += g10_im * a0_re;
838  A1_im += g11_re * a1_im;
839  A1_im += g11_im * a1_re;
840  A1_im += g12_re * a2_im;
841  A1_im += g12_im * a2_re;
842  spinorFloat B1_re = 0;
843  B1_re += g10_re * b0_re;
844  B1_re -= g10_im * b0_im;
845  B1_re += g11_re * b1_re;
846  B1_re -= g11_im * b1_im;
847  B1_re += g12_re * b2_re;
848  B1_re -= g12_im * b2_im;
849  spinorFloat B1_im = 0;
850  B1_im += g10_re * b0_im;
851  B1_im += g10_im * b0_re;
852  B1_im += g11_re * b1_im;
853  B1_im += g11_im * b1_re;
854  B1_im += g12_re * b2_im;
855  B1_im += g12_im * b2_re;
856 
857  // multiply row 2
858  spinorFloat A2_re = 0;
859  A2_re += g20_re * a0_re;
860  A2_re -= g20_im * a0_im;
861  A2_re += g21_re * a1_re;
862  A2_re -= g21_im * a1_im;
863  A2_re += g22_re * a2_re;
864  A2_re -= g22_im * a2_im;
865  spinorFloat A2_im = 0;
866  A2_im += g20_re * a0_im;
867  A2_im += g20_im * a0_re;
868  A2_im += g21_re * a1_im;
869  A2_im += g21_im * a1_re;
870  A2_im += g22_re * a2_im;
871  A2_im += g22_im * a2_re;
872  spinorFloat B2_re = 0;
873  B2_re += g20_re * b0_re;
874  B2_re -= g20_im * b0_im;
875  B2_re += g21_re * b1_re;
876  B2_re -= g21_im * b1_im;
877  B2_re += g22_re * b2_re;
878  B2_re -= g22_im * b2_im;
879  spinorFloat B2_im = 0;
880  B2_im += g20_re * b0_im;
881  B2_im += g20_im * b0_re;
882  B2_im += g21_re * b1_im;
883  B2_im += g21_im * b1_re;
884  B2_im += g22_re * b2_im;
885  B2_im += g22_im * b2_re;
886 
 // reconstruct via P1+ rows: spin 2 gets -B, spin 3 gets +A
887  o00_re += A0_re;
888  o00_im += A0_im;
889  o10_re += B0_re;
890  o10_im += B0_im;
891  o20_re -= B0_re;
892  o20_im -= B0_im;
893  o30_re += A0_re;
894  o30_im += A0_im;
895 
896  o01_re += A1_re;
897  o01_im += A1_im;
898  o11_re += B1_re;
899  o11_im += B1_im;
900  o21_re -= B1_re;
901  o21_im -= B1_im;
902  o31_re += A1_re;
903  o31_im += A1_im;
904 
905  o02_re += A2_re;
906  o02_im += A2_im;
907  o12_re += B2_re;
908  o12_im += B2_im;
909  o22_re -= B2_re;
910  o22_im -= B2_im;
911  o32_re += A2_re;
912  o32_im += A2_im;
913 
914 }
915 
// Backward hop in the Y direction: spin-project the y-1 neighbor with the
// dagger projector P1- (matrix below), multiply by U_y(x-ŷ)^dagger (the gT
// macros), and accumulate.  Threads on the block's Y edge re-read the
// neighbor from global memory; others reuse the shared-memory tile.
// NOTE(review): internal lines 943-948 and 1026 are elided by this listing —
// presumably half-spinor declarations and the RECONSTRUCT_GAUGE_MATRIX call.
916 #ifdef MULTI_GPU
917 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
918  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
919 #endif
920 {
921  // Projector P1-
922  // 1 0 0 -1
923  // 0 1 1 0
924  // 0 1 1 0
925  // -1 0 0 1
926 
927 #ifdef MULTI_GPU
928  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
929  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
930 #if (DD_PREC==2) // half precision
931  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
932 #endif
933 #else
934  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
935 #endif
936 
 // backward hop uses the neighbor's link (ghost-link pad in exterior)
937 #ifdef MULTI_GPU
938  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
939 #else
940  const int ga_idx = sp_idx;
941 #endif
942 
949 
950 #ifdef MULTI_GPU
951  if (kernel_type == INTERIOR_KERNEL) {
952 #endif
953 
 // block-edge threads (y == 0 row): neighbor is outside the tile
954  if (threadIdx.y == 0 && blockDim.y < param.dc.X[1]) {
955  // read spinor from device memory
956 #ifdef TWIST_INV_DSLASH
957 #ifdef SPINOR_DOUBLE
958  const spinorFloat a = param.a;
959  const spinorFloat b = param.b;
960 #else
961  const spinorFloat a = param.a_f;
962  const spinorFloat b = param.b_f;
963 #endif
964 #endif
965  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
966 #ifdef TWIST_INV_DSLASH
967  APPLY_TWIST_INV(-a, b, i);
968 #endif
969 
970  // project spinor into half spinors
971  a0_re = +i00_re-i30_re;
972  a0_im = +i00_im-i30_im;
973  a1_re = +i01_re-i31_re;
974  a1_im = +i01_im-i31_im;
975  a2_re = +i02_re-i32_re;
976  a2_im = +i02_im-i32_im;
977  b0_re = +i10_re+i20_re;
978  b0_im = +i10_im+i20_im;
979  b1_re = +i11_re+i21_re;
980  b1_im = +i11_im+i21_im;
981  b2_re = +i12_re+i22_re;
982  b2_im = +i12_im+i22_im;
983  } else {
 // shared memory holds the +x-shifted spinor; compensate in tx and
 // step ty to the y-1 row
984  // load spinor from shared memory
985  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
986  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
987  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
988 
989  // project spinor into half spinors
990  a0_re = +i00_re-i30_re;
991  a0_im = +i00_im-i30_im;
992  a1_re = +i01_re-i31_re;
993  a1_im = +i01_im-i31_im;
994  a2_re = +i02_re-i32_re;
995  a2_im = +i02_im-i32_im;
996  b0_re = +i10_re+i20_re;
997  b0_im = +i10_im+i20_im;
998  b1_re = +i11_re+i21_re;
999  b1_im = +i11_im+i21_im;
1000  b2_re = +i12_re+i22_re;
1001  b2_im = +i12_im+i22_im;
1002  }
1003 
1004 #ifdef MULTI_GPU
1005  } else {
1006 
1007  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1008 
1009  // read half spinor from device memory
1010  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1011 
1012  a0_re = i00_re; a0_im = i00_im;
1013  a1_re = i01_re; a1_im = i01_im;
1014  a2_re = i02_re; a2_im = i02_im;
1015  b0_re = i10_re; b0_im = i10_im;
1016  b1_re = i11_re; b1_im = i11_im;
1017  b2_re = i12_re; b2_im = i12_im;
1018 
1019  }
1020 #endif // MULTI_GPU
1021 
1022  // read gauge matrix from device memory
1023  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1024 
1025  // reconstruct gauge matrix
1027 
 // complex 3x3 * 3-vector with U^dagger: A = U^dag*a, B = U^dag*b
1028  // multiply row 0
1029  spinorFloat A0_re = 0;
1030  A0_re += gT00_re * a0_re;
1031  A0_re -= gT00_im * a0_im;
1032  A0_re += gT01_re * a1_re;
1033  A0_re -= gT01_im * a1_im;
1034  A0_re += gT02_re * a2_re;
1035  A0_re -= gT02_im * a2_im;
1036  spinorFloat A0_im = 0;
1037  A0_im += gT00_re * a0_im;
1038  A0_im += gT00_im * a0_re;
1039  A0_im += gT01_re * a1_im;
1040  A0_im += gT01_im * a1_re;
1041  A0_im += gT02_re * a2_im;
1042  A0_im += gT02_im * a2_re;
1043  spinorFloat B0_re = 0;
1044  B0_re += gT00_re * b0_re;
1045  B0_re -= gT00_im * b0_im;
1046  B0_re += gT01_re * b1_re;
1047  B0_re -= gT01_im * b1_im;
1048  B0_re += gT02_re * b2_re;
1049  B0_re -= gT02_im * b2_im;
1050  spinorFloat B0_im = 0;
1051  B0_im += gT00_re * b0_im;
1052  B0_im += gT00_im * b0_re;
1053  B0_im += gT01_re * b1_im;
1054  B0_im += gT01_im * b1_re;
1055  B0_im += gT02_re * b2_im;
1056  B0_im += gT02_im * b2_re;
1057 
1058  // multiply row 1
1059  spinorFloat A1_re = 0;
1060  A1_re += gT10_re * a0_re;
1061  A1_re -= gT10_im * a0_im;
1062  A1_re += gT11_re * a1_re;
1063  A1_re -= gT11_im * a1_im;
1064  A1_re += gT12_re * a2_re;
1065  A1_re -= gT12_im * a2_im;
1066  spinorFloat A1_im = 0;
1067  A1_im += gT10_re * a0_im;
1068  A1_im += gT10_im * a0_re;
1069  A1_im += gT11_re * a1_im;
1070  A1_im += gT11_im * a1_re;
1071  A1_im += gT12_re * a2_im;
1072  A1_im += gT12_im * a2_re;
1073  spinorFloat B1_re = 0;
1074  B1_re += gT10_re * b0_re;
1075  B1_re -= gT10_im * b0_im;
1076  B1_re += gT11_re * b1_re;
1077  B1_re -= gT11_im * b1_im;
1078  B1_re += gT12_re * b2_re;
1079  B1_re -= gT12_im * b2_im;
1080  spinorFloat B1_im = 0;
1081  B1_im += gT10_re * b0_im;
1082  B1_im += gT10_im * b0_re;
1083  B1_im += gT11_re * b1_im;
1084  B1_im += gT11_im * b1_re;
1085  B1_im += gT12_re * b2_im;
1086  B1_im += gT12_im * b2_re;
1087 
1088  // multiply row 2
1089  spinorFloat A2_re = 0;
1090  A2_re += gT20_re * a0_re;
1091  A2_re -= gT20_im * a0_im;
1092  A2_re += gT21_re * a1_re;
1093  A2_re -= gT21_im * a1_im;
1094  A2_re += gT22_re * a2_re;
1095  A2_re -= gT22_im * a2_im;
1096  spinorFloat A2_im = 0;
1097  A2_im += gT20_re * a0_im;
1098  A2_im += gT20_im * a0_re;
1099  A2_im += gT21_re * a1_im;
1100  A2_im += gT21_im * a1_re;
1101  A2_im += gT22_re * a2_im;
1102  A2_im += gT22_im * a2_re;
1103  spinorFloat B2_re = 0;
1104  B2_re += gT20_re * b0_re;
1105  B2_re -= gT20_im * b0_im;
1106  B2_re += gT21_re * b1_re;
1107  B2_re -= gT21_im * b1_im;
1108  B2_re += gT22_re * b2_re;
1109  B2_re -= gT22_im * b2_im;
1110  spinorFloat B2_im = 0;
1111  B2_im += gT20_re * b0_im;
1112  B2_im += gT20_im * b0_re;
1113  B2_im += gT21_re * b1_im;
1114  B2_im += gT21_im * b1_re;
1115  B2_im += gT22_re * b2_im;
1116  B2_im += gT22_im * b2_re;
1117 
 // reconstruct via P1- rows: spin 2 gets +B, spin 3 gets -A
1118  o00_re += A0_re;
1119  o00_im += A0_im;
1120  o10_re += B0_re;
1121  o10_im += B0_im;
1122  o20_re += B0_re;
1123  o20_im += B0_im;
1124  o30_re -= A0_re;
1125  o30_im -= A0_im;
1126 
1127  o01_re += A1_re;
1128  o01_im += A1_im;
1129  o11_re += B1_re;
1130  o11_im += B1_im;
1131  o21_re += B1_re;
1132  o21_im += B1_im;
1133  o31_re -= A1_re;
1134  o31_im -= A1_im;
1135 
1136  o02_re += A2_re;
1137  o02_im += A2_im;
1138  o12_re += B2_re;
1139  o12_im += B2_im;
1140  o22_re += B2_re;
1141  o22_im += B2_im;
1142  o32_re -= A2_re;
1143  o32_im -= A2_im;
1144 
1145 }
1146 
1147 #ifdef MULTI_GPU
1148 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1149  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1150 #endif
1151 {
 // +Z forward hop (dagger kernel): gather the neighbor spinor in the +z
 // direction, project it onto a two-component half spinor with P2+ (matrix
 // below), multiply the half spinor by the gauge link, and accumulate the
 // reconstructed full spinor into the output registers o*.
1152  // Projector P2+
1153  // 1 0 i 0
1154  // 0 1 0 -i
1155  // -i 0 1 0
1156  // 0 i 0 1
1157 
1158 #ifdef MULTI_GPU
 // Interior: neighbor-site index with periodic wrap at the local +z boundary;
 // exterior: index into the ghost zone received from the +z neighbor.
1159  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1160  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1161 #if (DD_PREC==2) // half precision
1162  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1163 #endif
1164 #else
1165  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1166 #endif
1167 
1168  const int ga_idx = sid;
1169 
 // NOTE(review): this listing was recovered from a rendering that drops
 // macro-expanded lines; original lines 1170-1175 (presumably the half-spinor
 // a0..b2 declarations) are missing here -- confirm against the generated file.
1176 
1177 #ifdef MULTI_GPU
1178  if (kernel_type == INTERIOR_KERNEL) {
1179 #endif
1180 
 // Threads on the top z-face of the block fetch the neighbor from global
 // memory; all other threads reuse the spinor already staged in shared
 // memory by the thread one z-slot above (tz = threadIdx.z + 1).
1181  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1182  // read spinor from device memory
1183 #ifdef TWIST_INV_DSLASH
1184 #ifdef SPINOR_DOUBLE
1185  const spinorFloat a = param.a;
1186  const spinorFloat b = param.b;
1187 #else
1188  const spinorFloat a = param.a_f;
1189  const spinorFloat b = param.b_f;
1190 #endif
1191 #endif
1192  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1193 #ifdef TWIST_INV_DSLASH
1194  APPLY_TWIST_INV(-a, b, i);
1195 #endif
1196 
1197  // project spinor into half spinors
1198  a0_re = +i00_re-i20_im;
1199  a0_im = +i00_im+i20_re;
1200  a1_re = +i01_re-i21_im;
1201  a1_im = +i01_im+i21_re;
1202  a2_re = +i02_re-i22_im;
1203  a2_im = +i02_im+i22_re;
1204  b0_re = +i10_re+i30_im;
1205  b0_im = +i10_im-i30_re;
1206  b1_re = +i11_re+i31_im;
1207  b1_im = +i11_im-i31_re;
1208  b2_re = +i12_re+i32_im;
1209  b2_im = +i12_im-i32_re;
1210  } else {
1211  // load spinor from shared memory
 // tx compensates for the even/odd (checkerboard) x-offset of the
 // neighboring site within the block.
1212  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1213  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1214  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1215 
1216  // project spinor into half spinors
1217  a0_re = +i00_re-i20_im;
1218  a0_im = +i00_im+i20_re;
1219  a1_re = +i01_re-i21_im;
1220  a1_im = +i01_im+i21_re;
1221  a2_re = +i02_re-i22_im;
1222  a2_im = +i02_im+i22_re;
1223  b0_re = +i10_re+i30_im;
1224  b0_im = +i10_im-i30_re;
1225  b1_re = +i11_re+i31_im;
1226  b1_im = +i11_im-i31_re;
1227  b2_re = +i12_re+i32_im;
1228  b2_im = +i12_im-i32_re;
1229  }
1230 
1231 #ifdef MULTI_GPU
1232  } else {
1233 
1234  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1235 
 // Ghost spinors arrive already projected (half spinors), so no
 // projection is applied here.
1236  // read half spinor from device memory
1237  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1238 
1239  a0_re = i00_re; a0_im = i00_im;
1240  a1_re = i01_re; a1_im = i01_im;
1241  a2_re = i02_re; a2_im = i02_im;
1242  b0_re = i10_re; b0_im = i10_im;
1243  b1_re = i11_re; b1_im = i11_im;
1244  b2_re = i12_re; b2_im = i12_im;
1245 
1246  }
1247 #endif // MULTI_GPU
1248 
1249  // read gauge matrix from device memory
1250  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1251 
1252  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (original line 1253)
 // was lost in extraction -- confirm against the generated file.
1254 
 // Complex 3x3 link times both half-spinor columns: A = U*a, B = U*b.
1255  // multiply row 0
1256  spinorFloat A0_re = 0;
1257  A0_re += g00_re * a0_re;
1258  A0_re -= g00_im * a0_im;
1259  A0_re += g01_re * a1_re;
1260  A0_re -= g01_im * a1_im;
1261  A0_re += g02_re * a2_re;
1262  A0_re -= g02_im * a2_im;
1263  spinorFloat A0_im = 0;
1264  A0_im += g00_re * a0_im;
1265  A0_im += g00_im * a0_re;
1266  A0_im += g01_re * a1_im;
1267  A0_im += g01_im * a1_re;
1268  A0_im += g02_re * a2_im;
1269  A0_im += g02_im * a2_re;
1270  spinorFloat B0_re = 0;
1271  B0_re += g00_re * b0_re;
1272  B0_re -= g00_im * b0_im;
1273  B0_re += g01_re * b1_re;
1274  B0_re -= g01_im * b1_im;
1275  B0_re += g02_re * b2_re;
1276  B0_re -= g02_im * b2_im;
1277  spinorFloat B0_im = 0;
1278  B0_im += g00_re * b0_im;
1279  B0_im += g00_im * b0_re;
1280  B0_im += g01_re * b1_im;
1281  B0_im += g01_im * b1_re;
1282  B0_im += g02_re * b2_im;
1283  B0_im += g02_im * b2_re;
1284 
1285  // multiply row 1
1286  spinorFloat A1_re = 0;
1287  A1_re += g10_re * a0_re;
1288  A1_re -= g10_im * a0_im;
1289  A1_re += g11_re * a1_re;
1290  A1_re -= g11_im * a1_im;
1291  A1_re += g12_re * a2_re;
1292  A1_re -= g12_im * a2_im;
1293  spinorFloat A1_im = 0;
1294  A1_im += g10_re * a0_im;
1295  A1_im += g10_im * a0_re;
1296  A1_im += g11_re * a1_im;
1297  A1_im += g11_im * a1_re;
1298  A1_im += g12_re * a2_im;
1299  A1_im += g12_im * a2_re;
1300  spinorFloat B1_re = 0;
1301  B1_re += g10_re * b0_re;
1302  B1_re -= g10_im * b0_im;
1303  B1_re += g11_re * b1_re;
1304  B1_re -= g11_im * b1_im;
1305  B1_re += g12_re * b2_re;
1306  B1_re -= g12_im * b2_im;
1307  spinorFloat B1_im = 0;
1308  B1_im += g10_re * b0_im;
1309  B1_im += g10_im * b0_re;
1310  B1_im += g11_re * b1_im;
1311  B1_im += g11_im * b1_re;
1312  B1_im += g12_re * b2_im;
1313  B1_im += g12_im * b2_re;
1314 
1315  // multiply row 2
1316  spinorFloat A2_re = 0;
1317  A2_re += g20_re * a0_re;
1318  A2_re -= g20_im * a0_im;
1319  A2_re += g21_re * a1_re;
1320  A2_re -= g21_im * a1_im;
1321  A2_re += g22_re * a2_re;
1322  A2_re -= g22_im * a2_im;
1323  spinorFloat A2_im = 0;
1324  A2_im += g20_re * a0_im;
1325  A2_im += g20_im * a0_re;
1326  A2_im += g21_re * a1_im;
1327  A2_im += g21_im * a1_re;
1328  A2_im += g22_re * a2_im;
1329  A2_im += g22_im * a2_re;
1330  spinorFloat B2_re = 0;
1331  B2_re += g20_re * b0_re;
1332  B2_re -= g20_im * b0_im;
1333  B2_re += g21_re * b1_re;
1334  B2_re -= g21_im * b1_im;
1335  B2_re += g22_re * b2_re;
1336  B2_re -= g22_im * b2_im;
1337  spinorFloat B2_im = 0;
1338  B2_im += g20_re * b0_im;
1339  B2_im += g20_im * b0_re;
1340  B2_im += g21_re * b1_im;
1341  B2_im += g21_im * b1_re;
1342  B2_im += g22_re * b2_im;
1343  B2_im += g22_im * b2_re;
1344 
 // Reconstruct the full spinor from the half spinor: the upper components
 // accumulate A and B directly, the lower ones accumulate -i*A (o2) and
 // +i*B (o3), matching the P2+ projector structure above.
1345  o00_re += A0_re;
1346  o00_im += A0_im;
1347  o10_re += B0_re;
1348  o10_im += B0_im;
1349  o20_re += A0_im;
1350  o20_im -= A0_re;
1351  o30_re -= B0_im;
1352  o30_im += B0_re;
1353 
1354  o01_re += A1_re;
1355  o01_im += A1_im;
1356  o11_re += B1_re;
1357  o11_im += B1_im;
1358  o21_re += A1_im;
1359  o21_im -= A1_re;
1360  o31_re -= B1_im;
1361  o31_im += B1_re;
1362 
1363  o02_re += A2_re;
1364  o02_im += A2_im;
1365  o12_re += B2_re;
1366  o12_im += B2_im;
1367  o22_re += A2_im;
1368  o22_im -= A2_re;
1369  o32_re -= B2_im;
1370  o32_im += B2_re;
1371 
1372 }
1373 
1374 #ifdef MULTI_GPU
1375 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1376  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1377 #endif
1378 {
 // -Z backward hop (dagger kernel): gather the neighbor spinor in the -z
 // direction, project with P2- (matrix below), multiply by the conjugate
 // transpose of the link that lives on the neighbor site (gT*), and
 // accumulate the reconstructed full spinor into o*.
1379  // Projector P2-
1380  // 1 0 -i 0
1381  // 0 1 0 i
1382  // i 0 1 0
1383  // 0 -i 0 1
1384 
1385 #ifdef MULTI_GPU
 // Interior: neighbor-site index with periodic wrap at the local z=0
 // boundary; exterior: index into the ghost zone from the -z neighbor.
1386  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1387  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1388 #if (DD_PREC==2) // half precision
1389  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1390 #endif
1391 #else
1392  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1393 #endif
1394 
1395 #ifdef MULTI_GPU
 // Backward hop uses the link stored at the neighbor site; exterior links
 // are packed after the body gauge field (offset Vh).
1396  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1397 #else
1398  const int ga_idx = sp_idx;
1399 #endif
1400 
 // NOTE(review): original lines 1401-1406 (presumably the half-spinor
 // declarations) were dropped by the rendering this was recovered from.
1407 
1408 #ifdef MULTI_GPU
1409  if (kernel_type == INTERIOR_KERNEL) {
1410 #endif
1411 
 // Threads on the bottom z-face of the block fetch the neighbor from
 // global memory; the rest reuse the shared-memory stage of the thread
 // one z-slot below (tz = threadIdx.z - 1).
1412  if (threadIdx.z == 0 && blockDim.z < X3) {
1413  // read spinor from device memory
1414 #ifdef TWIST_INV_DSLASH
1415 #ifdef SPINOR_DOUBLE
1416  const spinorFloat a = param.a;
1417  const spinorFloat b = param.b;
1418 #else
1419  const spinorFloat a = param.a_f;
1420  const spinorFloat b = param.b_f;
1421 #endif
1422 #endif
1423  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1424 #ifdef TWIST_INV_DSLASH
1425  APPLY_TWIST_INV(-a, b, i);
1426 #endif
1427 
1428  // project spinor into half spinors
1429  a0_re = +i00_re+i20_im;
1430  a0_im = +i00_im-i20_re;
1431  a1_re = +i01_re+i21_im;
1432  a1_im = +i01_im-i21_re;
1433  a2_re = +i02_re+i22_im;
1434  a2_im = +i02_im-i22_re;
1435  b0_re = +i10_re-i30_im;
1436  b0_im = +i10_im+i30_re;
1437  b1_re = +i11_re-i31_im;
1438  b1_im = +i11_im+i31_re;
1439  b2_re = +i12_re-i32_im;
1440  b2_im = +i12_im+i32_re;
1441  } else {
1442  // load spinor from shared memory
1443  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1444  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1445  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1446 
1447  // project spinor into half spinors
1448  a0_re = +i00_re+i20_im;
1449  a0_im = +i00_im-i20_re;
1450  a1_re = +i01_re+i21_im;
1451  a1_im = +i01_im-i21_re;
1452  a2_re = +i02_re+i22_im;
1453  a2_im = +i02_im-i22_re;
1454  b0_re = +i10_re-i30_im;
1455  b0_im = +i10_im+i30_re;
1456  b1_re = +i11_re-i31_im;
1457  b1_im = +i11_im+i31_re;
1458  b2_re = +i12_re-i32_im;
1459  b2_im = +i12_im+i32_re;
1460  }
1461 
1462 #ifdef MULTI_GPU
1463  } else {
1464 
1465  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1466 
 // Ghost spinors arrive already projected; copy straight through.
1467  // read half spinor from device memory
1468  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1469 
1470  a0_re = i00_re; a0_im = i00_im;
1471  a1_re = i01_re; a1_im = i01_im;
1472  a2_re = i02_re; a2_im = i02_im;
1473  b0_re = i10_re; b0_im = i10_im;
1474  b1_re = i11_re; b1_im = i11_im;
1475  b2_re = i12_re; b2_im = i12_im;
1476 
1477  }
1478 #endif // MULTI_GPU
1479 
1480  // read gauge matrix from device memory
1481  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1482 
1483  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (original line 1484)
 // was lost in extraction -- confirm against the generated file.
1485 
 // Complex 3x3 dagger-link times both half-spinor columns: A = U^dag*a,
 // B = U^dag*b (gT* aliases the transposed-conjugated link elements).
1486  // multiply row 0
1487  spinorFloat A0_re = 0;
1488  A0_re += gT00_re * a0_re;
1489  A0_re -= gT00_im * a0_im;
1490  A0_re += gT01_re * a1_re;
1491  A0_re -= gT01_im * a1_im;
1492  A0_re += gT02_re * a2_re;
1493  A0_re -= gT02_im * a2_im;
1494  spinorFloat A0_im = 0;
1495  A0_im += gT00_re * a0_im;
1496  A0_im += gT00_im * a0_re;
1497  A0_im += gT01_re * a1_im;
1498  A0_im += gT01_im * a1_re;
1499  A0_im += gT02_re * a2_im;
1500  A0_im += gT02_im * a2_re;
1501  spinorFloat B0_re = 0;
1502  B0_re += gT00_re * b0_re;
1503  B0_re -= gT00_im * b0_im;
1504  B0_re += gT01_re * b1_re;
1505  B0_re -= gT01_im * b1_im;
1506  B0_re += gT02_re * b2_re;
1507  B0_re -= gT02_im * b2_im;
1508  spinorFloat B0_im = 0;
1509  B0_im += gT00_re * b0_im;
1510  B0_im += gT00_im * b0_re;
1511  B0_im += gT01_re * b1_im;
1512  B0_im += gT01_im * b1_re;
1513  B0_im += gT02_re * b2_im;
1514  B0_im += gT02_im * b2_re;
1515 
1516  // multiply row 1
1517  spinorFloat A1_re = 0;
1518  A1_re += gT10_re * a0_re;
1519  A1_re -= gT10_im * a0_im;
1520  A1_re += gT11_re * a1_re;
1521  A1_re -= gT11_im * a1_im;
1522  A1_re += gT12_re * a2_re;
1523  A1_re -= gT12_im * a2_im;
1524  spinorFloat A1_im = 0;
1525  A1_im += gT10_re * a0_im;
1526  A1_im += gT10_im * a0_re;
1527  A1_im += gT11_re * a1_im;
1528  A1_im += gT11_im * a1_re;
1529  A1_im += gT12_re * a2_im;
1530  A1_im += gT12_im * a2_re;
1531  spinorFloat B1_re = 0;
1532  B1_re += gT10_re * b0_re;
1533  B1_re -= gT10_im * b0_im;
1534  B1_re += gT11_re * b1_re;
1535  B1_re -= gT11_im * b1_im;
1536  B1_re += gT12_re * b2_re;
1537  B1_re -= gT12_im * b2_im;
1538  spinorFloat B1_im = 0;
1539  B1_im += gT10_re * b0_im;
1540  B1_im += gT10_im * b0_re;
1541  B1_im += gT11_re * b1_im;
1542  B1_im += gT11_im * b1_re;
1543  B1_im += gT12_re * b2_im;
1544  B1_im += gT12_im * b2_re;
1545 
1546  // multiply row 2
1547  spinorFloat A2_re = 0;
1548  A2_re += gT20_re * a0_re;
1549  A2_re -= gT20_im * a0_im;
1550  A2_re += gT21_re * a1_re;
1551  A2_re -= gT21_im * a1_im;
1552  A2_re += gT22_re * a2_re;
1553  A2_re -= gT22_im * a2_im;
1554  spinorFloat A2_im = 0;
1555  A2_im += gT20_re * a0_im;
1556  A2_im += gT20_im * a0_re;
1557  A2_im += gT21_re * a1_im;
1558  A2_im += gT21_im * a1_re;
1559  A2_im += gT22_re * a2_im;
1560  A2_im += gT22_im * a2_re;
1561  spinorFloat B2_re = 0;
1562  B2_re += gT20_re * b0_re;
1563  B2_re -= gT20_im * b0_im;
1564  B2_re += gT21_re * b1_re;
1565  B2_re -= gT21_im * b1_im;
1566  B2_re += gT22_re * b2_re;
1567  B2_re -= gT22_im * b2_im;
1568  spinorFloat B2_im = 0;
1569  B2_im += gT20_re * b0_im;
1570  B2_im += gT20_im * b0_re;
1571  B2_im += gT21_re * b1_im;
1572  B2_im += gT21_im * b1_re;
1573  B2_im += gT22_re * b2_im;
1574  B2_im += gT22_im * b2_re;
1575 
 // Reconstruct the full spinor: lower components accumulate +i*A (o2)
 // and -i*B (o3) -- the opposite signs of the +Z hop, per P2-.
1576  o00_re += A0_re;
1577  o00_im += A0_im;
1578  o10_re += B0_re;
1579  o10_im += B0_im;
1580  o20_re -= A0_im;
1581  o20_im += A0_re;
1582  o30_re += B0_im;
1583  o30_im -= B0_re;
1584 
1585  o01_re += A1_re;
1586  o01_im += A1_im;
1587  o11_re += B1_re;
1588  o11_im += B1_im;
1589  o21_re -= A1_im;
1590  o21_im += A1_re;
1591  o31_re += B1_im;
1592  o31_im -= B1_re;
1593 
1594  o02_re += A2_re;
1595  o02_im += A2_im;
1596  o12_re += B2_re;
1597  o12_im += B2_im;
1598  o22_re -= A2_im;
1599  o22_im += A2_re;
1600  o32_re += B2_im;
1601  o32_im -= B2_re;
1602 
1603 }
1604 
1605 #ifdef MULTI_GPU
1606 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1607  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1608 #endif
1609 {
 // +T forward hop (dagger kernel): with P3+ = diag(2,2,0,0) only the two
 // upper spin components survive (scaled by 2), so only a half spinor is
 // gathered and multiplied. With temporal gauge fixing the link is the
 // identity away from the last timeslice, so the matrix multiply is skipped.
1610  // Projector P3+
1611  // 2 0 0 0
1612  // 0 2 0 0
1613  // 0 0 0 0
1614  // 0 0 0 0
1615 
1616 #ifdef MULTI_GPU
1617  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1618  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1619 #if (DD_PREC==2) // half precision
1620  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1621 #endif
1622 #else
1623  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1624 #endif
1625 
1626  const int ga_idx = sid;
1627 
 // Gauge-fixed fast path: identity link for all sites below the final
 // timeslice (ga_idx < X4X3X2X1hmX3X2X1h).
1628  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1629  {
 // NOTE(review): original lines 1630-1635 (presumably half-spinor
 // declarations) were dropped by the rendering this was recovered from.
1636 
1637 #ifdef MULTI_GPU
1638  if (kernel_type == INTERIOR_KERNEL) {
1639 #endif
1640 
 // Only the upper half of the spinor is needed unless the inverse twist
 // must be applied first (which mixes components), hence READ_SPINOR_UP
 // vs. full READ_SPINOR below.
1641  // read spinor from device memory
1642 #ifdef TWIST_INV_DSLASH
1643 #ifdef SPINOR_DOUBLE
1644  const spinorFloat a = param.a;
1645  const spinorFloat b = param.b;
1646 #else
1647  const spinorFloat a = param.a_f;
1648  const spinorFloat b = param.b_f;
1649 #endif
1650 #endif
1651 #ifndef TWIST_INV_DSLASH
1652  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1653 #else
1654  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1655  APPLY_TWIST_INV(-a, b, i);
1656 #endif
1657 
1658  // project spinor into half spinors
1659  a0_re = +2*i00_re;
1660  a0_im = +2*i00_im;
1661  a1_re = +2*i01_re;
1662  a1_im = +2*i01_im;
1663  a2_re = +2*i02_re;
1664  a2_im = +2*i02_im;
1665  b0_re = +2*i10_re;
1666  b0_im = +2*i10_im;
1667  b1_re = +2*i11_re;
1668  b1_im = +2*i11_im;
1669  b2_re = +2*i12_re;
1670  b2_im = +2*i12_im;
1671 
1672 #ifdef MULTI_GPU
1673  } else {
1674 
1675  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
 // TPROJSCALE restores the factor of 2 that the sender may have folded
 // out when packing the temporal face.
1676  const int t_proj_scale = TPROJSCALE;
1677 
1678  // read half spinor from device memory
1679  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1680 
1681  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1682  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1683  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1684  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1685  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1686  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1687 
1688  }
1689 #endif // MULTI_GPU
1690 
1691  // identity gauge matrix
 // NOTE(review): original lines 1692-1697 (presumably the trivial
 // A*=a*, B*=b* assignments for the identity link) are missing here.
1698 
 // P3+ contributes only to the upper spin components.
1699  o00_re += A0_re;
1700  o00_im += A0_im;
1701  o10_re += B0_re;
1702  o10_im += B0_im;
1703 
1704  o01_re += A1_re;
1705  o01_im += A1_im;
1706  o11_re += B1_re;
1707  o11_im += B1_im;
1708 
1709  o02_re += A2_re;
1710  o02_im += A2_im;
1711  o12_re += B2_re;
1712  o12_im += B2_im;
1713 
1714  } else {
 // General path: non-trivial temporal link, full SU(3) multiply.
 // NOTE(review): original lines 1715-1720 were dropped by the rendering.
1721 
1722 #ifdef MULTI_GPU
1723  if (kernel_type == INTERIOR_KERNEL) {
1724 #endif
1725 
1726  // read spinor from device memory
1727 #ifdef TWIST_INV_DSLASH
1728 #ifdef SPINOR_DOUBLE
1729  const spinorFloat a = param.a;
1730  const spinorFloat b = param.b;
1731 #else
1732  const spinorFloat a = param.a_f;
1733  const spinorFloat b = param.b_f;
1734 #endif
1735 #endif
1736 #ifndef TWIST_INV_DSLASH
1737  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1738 #else
1739  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1740  APPLY_TWIST_INV(-a, b, i);
1741 #endif
1742 
1743  // project spinor into half spinors
1744  a0_re = +2*i00_re;
1745  a0_im = +2*i00_im;
1746  a1_re = +2*i01_re;
1747  a1_im = +2*i01_im;
1748  a2_re = +2*i02_re;
1749  a2_im = +2*i02_im;
1750  b0_re = +2*i10_re;
1751  b0_im = +2*i10_im;
1752  b1_re = +2*i11_re;
1753  b1_im = +2*i11_im;
1754  b2_re = +2*i12_re;
1755  b2_im = +2*i12_im;
1756 
1757 #ifdef MULTI_GPU
1758  } else {
1759 
1760  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1761  const int t_proj_scale = TPROJSCALE;
1762 
1763  // read half spinor from device memory
1764  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1765 
1766  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1767  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1768  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1769  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1770  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1771  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1772 
1773  }
1774 #endif // MULTI_GPU
1775 
1776  // read gauge matrix from device memory
1777  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1778 
1779  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (original line 1780)
 // was lost in extraction -- confirm against the generated file.
1781 
1782  // multiply row 0
1783  spinorFloat A0_re = 0;
1784  A0_re += g00_re * a0_re;
1785  A0_re -= g00_im * a0_im;
1786  A0_re += g01_re * a1_re;
1787  A0_re -= g01_im * a1_im;
1788  A0_re += g02_re * a2_re;
1789  A0_re -= g02_im * a2_im;
1790  spinorFloat A0_im = 0;
1791  A0_im += g00_re * a0_im;
1792  A0_im += g00_im * a0_re;
1793  A0_im += g01_re * a1_im;
1794  A0_im += g01_im * a1_re;
1795  A0_im += g02_re * a2_im;
1796  A0_im += g02_im * a2_re;
1797  spinorFloat B0_re = 0;
1798  B0_re += g00_re * b0_re;
1799  B0_re -= g00_im * b0_im;
1800  B0_re += g01_re * b1_re;
1801  B0_re -= g01_im * b1_im;
1802  B0_re += g02_re * b2_re;
1803  B0_re -= g02_im * b2_im;
1804  spinorFloat B0_im = 0;
1805  B0_im += g00_re * b0_im;
1806  B0_im += g00_im * b0_re;
1807  B0_im += g01_re * b1_im;
1808  B0_im += g01_im * b1_re;
1809  B0_im += g02_re * b2_im;
1810  B0_im += g02_im * b2_re;
1811 
1812  // multiply row 1
1813  spinorFloat A1_re = 0;
1814  A1_re += g10_re * a0_re;
1815  A1_re -= g10_im * a0_im;
1816  A1_re += g11_re * a1_re;
1817  A1_re -= g11_im * a1_im;
1818  A1_re += g12_re * a2_re;
1819  A1_re -= g12_im * a2_im;
1820  spinorFloat A1_im = 0;
1821  A1_im += g10_re * a0_im;
1822  A1_im += g10_im * a0_re;
1823  A1_im += g11_re * a1_im;
1824  A1_im += g11_im * a1_re;
1825  A1_im += g12_re * a2_im;
1826  A1_im += g12_im * a2_re;
1827  spinorFloat B1_re = 0;
1828  B1_re += g10_re * b0_re;
1829  B1_re -= g10_im * b0_im;
1830  B1_re += g11_re * b1_re;
1831  B1_re -= g11_im * b1_im;
1832  B1_re += g12_re * b2_re;
1833  B1_re -= g12_im * b2_im;
1834  spinorFloat B1_im = 0;
1835  B1_im += g10_re * b0_im;
1836  B1_im += g10_im * b0_re;
1837  B1_im += g11_re * b1_im;
1838  B1_im += g11_im * b1_re;
1839  B1_im += g12_re * b2_im;
1840  B1_im += g12_im * b2_re;
1841 
1842  // multiply row 2
1843  spinorFloat A2_re = 0;
1844  A2_re += g20_re * a0_re;
1845  A2_re -= g20_im * a0_im;
1846  A2_re += g21_re * a1_re;
1847  A2_re -= g21_im * a1_im;
1848  A2_re += g22_re * a2_re;
1849  A2_re -= g22_im * a2_im;
1850  spinorFloat A2_im = 0;
1851  A2_im += g20_re * a0_im;
1852  A2_im += g20_im * a0_re;
1853  A2_im += g21_re * a1_im;
1854  A2_im += g21_im * a1_re;
1855  A2_im += g22_re * a2_im;
1856  A2_im += g22_im * a2_re;
1857  spinorFloat B2_re = 0;
1858  B2_re += g20_re * b0_re;
1859  B2_re -= g20_im * b0_im;
1860  B2_re += g21_re * b1_re;
1861  B2_re -= g21_im * b1_im;
1862  B2_re += g22_re * b2_re;
1863  B2_re -= g22_im * b2_im;
1864  spinorFloat B2_im = 0;
1865  B2_im += g20_re * b0_im;
1866  B2_im += g20_im * b0_re;
1867  B2_im += g21_re * b1_im;
1868  B2_im += g21_im * b1_re;
1869  B2_im += g22_re * b2_im;
1870  B2_im += g22_im * b2_re;
1871 
 // P3+ contributes only to the upper spin components.
1872  o00_re += A0_re;
1873  o00_im += A0_im;
1874  o10_re += B0_re;
1875  o10_im += B0_im;
1876 
1877  o01_re += A1_re;
1878  o01_im += A1_im;
1879  o11_re += B1_re;
1880  o11_im += B1_im;
1881 
1882  o02_re += A2_re;
1883  o02_im += A2_im;
1884  o12_re += B2_re;
1885  o12_im += B2_im;
1886 
1887  }
1888 }
1889 
1890 #ifdef MULTI_GPU
1891 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1892  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1893 #endif
1894 {
 // -T backward hop (dagger kernel): with P3- = diag(0,0,2,2) only the two
 // lower spin components survive (scaled by 2). The conjugate-transposed
 // link at the neighbor site is used; with temporal gauge fixing the link
 // is the identity away from the last timeslice.
1895  // Projector P3-
1896  // 0 0 0 0
1897  // 0 0 0 0
1898  // 0 0 2 0
1899  // 0 0 0 2
1900 
1901 #ifdef MULTI_GPU
1902  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1903  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1904 #if (DD_PREC==2) // half precision
1905  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1906 #endif
1907 #else
1908  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1909 #endif
1910 
1911 #ifdef MULTI_GPU
 // Backward hop uses the link stored at the neighbor site; exterior links
 // are packed after the body gauge field (offset Vh).
1912  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1913 #else
1914  const int ga_idx = sp_idx;
1915 #endif
1916 
1917  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1918  {
 // NOTE(review): original lines 1919-1924 (presumably half-spinor
 // declarations) were dropped by the rendering this was recovered from.
1925 
1926 #ifdef MULTI_GPU
1927  if (kernel_type == INTERIOR_KERNEL) {
1928 #endif
1929 
1930  // read spinor from device memory
1931 #ifdef TWIST_INV_DSLASH
1932 #ifdef SPINOR_DOUBLE
1933  const spinorFloat a = param.a;
1934  const spinorFloat b = param.b;
1935 #else
1936  const spinorFloat a = param.a_f;
1937  const spinorFloat b = param.b_f;
1938 #endif
1939 #endif
1940 #ifndef TWIST_INV_DSLASH
 // NOTE(review): original line 1941 -- presumably
 // READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx); --
 // was lost in extraction; confirm against the generated file.
1942 #else
1943  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1944  APPLY_TWIST_INV(-a, b, i);
1945 #endif
1946 
1947  // project spinor into half spinors
1948  a0_re = +2*i20_re;
1949  a0_im = +2*i20_im;
1950  a1_re = +2*i21_re;
1951  a1_im = +2*i21_im;
1952  a2_re = +2*i22_re;
1953  a2_im = +2*i22_im;
1954  b0_re = +2*i30_re;
1955  b0_im = +2*i30_im;
1956  b1_re = +2*i31_re;
1957  b1_im = +2*i31_im;
1958  b2_re = +2*i32_re;
1959  b2_im = +2*i32_im;
1960 
1961 #ifdef MULTI_GPU
1962  } else {
1963 
1964  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1965  const int t_proj_scale = TPROJSCALE;
1966 
 // Ghost half spinors are packed into the first two spin slots (i0*/i1*)
 // regardless of direction, hence the index names below.
1967  // read half spinor from device memory
1968  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1969 
1970  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1971  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1972  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1973  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1974  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1975  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1976 
1977  }
1978 #endif // MULTI_GPU
1979 
1980  // identity gauge matrix
 // NOTE(review): original lines 1981-1986 (presumably the trivial
 // A*=a*, B*=b* assignments for the identity link) are missing here.
1987 
 // P3- contributes only to the lower spin components.
1988  o20_re += A0_re;
1989  o20_im += A0_im;
1990  o30_re += B0_re;
1991  o30_im += B0_im;
1992 
1993  o21_re += A1_re;
1994  o21_im += A1_im;
1995  o31_re += B1_re;
1996  o31_im += B1_im;
1997 
1998  o22_re += A2_re;
1999  o22_im += A2_im;
2000  o32_re += B2_re;
2001  o32_im += B2_im;
2002 
2003  } else {
 // General path: non-trivial temporal link, full SU(3)^dagger multiply.
 // NOTE(review): original lines 2004-2009 were dropped by the rendering.
2010 
2011 #ifdef MULTI_GPU
2012  if (kernel_type == INTERIOR_KERNEL) {
2013 #endif
2014 
2015  // read spinor from device memory
2016 #ifdef TWIST_INV_DSLASH
2017 #ifdef SPINOR_DOUBLE
2018  const spinorFloat a = param.a;
2019  const spinorFloat b = param.b;
2020 #else
2021  const spinorFloat a = param.a_f;
2022  const spinorFloat b = param.b_f;
2023 #endif
2024 #endif
2025 #ifndef TWIST_INV_DSLASH
 // NOTE(review): original line 2026 -- presumably
 // READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx); --
 // was lost in extraction; confirm against the generated file.
2027 #else
2028  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2029  APPLY_TWIST_INV(-a, b, i);
2030 #endif
2031 
2032  // project spinor into half spinors
2033  a0_re = +2*i20_re;
2034  a0_im = +2*i20_im;
2035  a1_re = +2*i21_re;
2036  a1_im = +2*i21_im;
2037  a2_re = +2*i22_re;
2038  a2_im = +2*i22_im;
2039  b0_re = +2*i30_re;
2040  b0_im = +2*i30_im;
2041  b1_re = +2*i31_re;
2042  b1_im = +2*i31_im;
2043  b2_re = +2*i32_re;
2044  b2_im = +2*i32_im;
2045 
2046 #ifdef MULTI_GPU
2047  } else {
2048 
2049  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2050  const int t_proj_scale = TPROJSCALE;
2051 
2052  // read half spinor from device memory
2053  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2054 
2055  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2056  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2057  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2058  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2059  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2060  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2061 
2062  }
2063 #endif // MULTI_GPU
2064 
2065  // read gauge matrix from device memory
2066  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2067 
2068  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (original line 2069)
 // was lost in extraction -- confirm against the generated file.
2070 
2071  // multiply row 0
2072  spinorFloat A0_re = 0;
2073  A0_re += gT00_re * a0_re;
2074  A0_re -= gT00_im * a0_im;
2075  A0_re += gT01_re * a1_re;
2076  A0_re -= gT01_im * a1_im;
2077  A0_re += gT02_re * a2_re;
2078  A0_re -= gT02_im * a2_im;
2079  spinorFloat A0_im = 0;
2080  A0_im += gT00_re * a0_im;
2081  A0_im += gT00_im * a0_re;
2082  A0_im += gT01_re * a1_im;
2083  A0_im += gT01_im * a1_re;
2084  A0_im += gT02_re * a2_im;
2085  A0_im += gT02_im * a2_re;
2086  spinorFloat B0_re = 0;
2087  B0_re += gT00_re * b0_re;
2088  B0_re -= gT00_im * b0_im;
2089  B0_re += gT01_re * b1_re;
2090  B0_re -= gT01_im * b1_im;
2091  B0_re += gT02_re * b2_re;
2092  B0_re -= gT02_im * b2_im;
2093  spinorFloat B0_im = 0;
2094  B0_im += gT00_re * b0_im;
2095  B0_im += gT00_im * b0_re;
2096  B0_im += gT01_re * b1_im;
2097  B0_im += gT01_im * b1_re;
2098  B0_im += gT02_re * b2_im;
2099  B0_im += gT02_im * b2_re;
2100 
2101  // multiply row 1
2102  spinorFloat A1_re = 0;
2103  A1_re += gT10_re * a0_re;
2104  A1_re -= gT10_im * a0_im;
2105  A1_re += gT11_re * a1_re;
2106  A1_re -= gT11_im * a1_im;
2107  A1_re += gT12_re * a2_re;
2108  A1_re -= gT12_im * a2_im;
2109  spinorFloat A1_im = 0;
2110  A1_im += gT10_re * a0_im;
2111  A1_im += gT10_im * a0_re;
2112  A1_im += gT11_re * a1_im;
2113  A1_im += gT11_im * a1_re;
2114  A1_im += gT12_re * a2_im;
2115  A1_im += gT12_im * a2_re;
2116  spinorFloat B1_re = 0;
2117  B1_re += gT10_re * b0_re;
2118  B1_re -= gT10_im * b0_im;
2119  B1_re += gT11_re * b1_re;
2120  B1_re -= gT11_im * b1_im;
2121  B1_re += gT12_re * b2_re;
2122  B1_re -= gT12_im * b2_im;
2123  spinorFloat B1_im = 0;
2124  B1_im += gT10_re * b0_im;
2125  B1_im += gT10_im * b0_re;
2126  B1_im += gT11_re * b1_im;
2127  B1_im += gT11_im * b1_re;
2128  B1_im += gT12_re * b2_im;
2129  B1_im += gT12_im * b2_re;
2130 
2131  // multiply row 2
2132  spinorFloat A2_re = 0;
2133  A2_re += gT20_re * a0_re;
2134  A2_re -= gT20_im * a0_im;
2135  A2_re += gT21_re * a1_re;
2136  A2_re -= gT21_im * a1_im;
2137  A2_re += gT22_re * a2_re;
2138  A2_re -= gT22_im * a2_im;
2139  spinorFloat A2_im = 0;
2140  A2_im += gT20_re * a0_im;
2141  A2_im += gT20_im * a0_re;
2142  A2_im += gT21_re * a1_im;
2143  A2_im += gT21_im * a1_re;
2144  A2_im += gT22_re * a2_im;
2145  A2_im += gT22_im * a2_re;
2146  spinorFloat B2_re = 0;
2147  B2_re += gT20_re * b0_re;
2148  B2_re -= gT20_im * b0_im;
2149  B2_re += gT21_re * b1_re;
2150  B2_re -= gT21_im * b1_im;
2151  B2_re += gT22_re * b2_re;
2152  B2_re -= gT22_im * b2_im;
2153  spinorFloat B2_im = 0;
2154  B2_im += gT20_re * b0_im;
2155  B2_im += gT20_im * b0_re;
2156  B2_im += gT21_re * b1_im;
2157  B2_im += gT21_im * b1_re;
2158  B2_im += gT22_re * b2_im;
2159  B2_im += gT22_im * b2_re;
2160 
 // P3- contributes only to the lower spin components.
2161  o20_re += A0_re;
2162  o20_im += A0_im;
2163  o30_re += B0_re;
2164  o30_im += B0_im;
2165 
2166  o21_re += A1_re;
2167  o21_im += A1_im;
2168  o31_re += B1_re;
2169  o31_im += B1_im;
2170 
2171  o22_re += A2_re;
2172  o22_im += A2_im;
2173  o32_re += B2_re;
2174  o32_im += B2_im;
2175 
2176  }
2177 }
2178 
2179 #ifdef MULTI_GPU
2180 
 // Finalization: only the kernel that completes the last boundary
 // contribution for a site may apply the twist/xpay and write the result.
 // The switch falls through so each kernel type also checks the boundaries
 // handled by the kernels that run after it.
2181 int incomplete = 0; // Have all 8 contributions been computed for this site?
2182 
2183 switch(kernel_type) { // intentional fall-through
2184 case INTERIOR_KERNEL:
2185  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2186 case EXTERIOR_KERNEL_T:
2187  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2188 case EXTERIOR_KERNEL_Z:
2189  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2190 case EXTERIOR_KERNEL_Y:
2191  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2192 }
2193 
2194 if (!incomplete)
2195 #endif // MULTI_GPU
2196 {
 // Twist parameters: a is the twisted-mass rotation coefficient, b its
 // normalization (see the TWIST_XPAY caveat below for the xpay case).
2197 #ifndef TWIST_INV_DSLASH
2198 #ifdef SPINOR_DOUBLE
2199  const spinorFloat a = param.a;
2200  const spinorFloat b = param.b;
2201 #else
2202  const spinorFloat a = param.a_f;
2203  const spinorFloat b = param.b_f;
2204 #endif
2205 #endif
2206 #ifdef DSLASH_XPAY
2207  READ_ACCUM(ACCUMTEX, param.sp_stride)
2208 
2209 #ifndef TWIST_XPAY
2210 #ifndef TWIST_INV_DSLASH
2211  //perform invert twist first:
2212  APPLY_TWIST_INV(-a, b, o);
2213 #endif
 // xpay: accumulate the previously-read spinor onto the dslash result.
2214  o00_re += acc00_re;
2215  o00_im += acc00_im;
2216  o01_re += acc01_re;
2217  o01_im += acc01_im;
2218  o02_re += acc02_re;
2219  o02_im += acc02_im;
2220  o10_re += acc10_re;
2221  o10_im += acc10_im;
2222  o11_re += acc11_re;
2223  o11_im += acc11_im;
2224  o12_re += acc12_re;
2225  o12_im += acc12_im;
2226  o20_re += acc20_re;
2227  o20_im += acc20_im;
2228  o21_re += acc21_re;
2229  o21_im += acc21_im;
2230  o22_re += acc22_re;
2231  o22_im += acc22_im;
2232  o30_re += acc30_re;
2233  o30_im += acc30_im;
2234  o31_re += acc31_re;
2235  o31_im += acc31_im;
2236  o32_re += acc32_re;
2237  o32_im += acc32_im;
2238 #else
 // TWIST_XPAY: twist the accumulator instead, then o = b*o + acc.
2239  APPLY_TWIST(-a, acc);
2240  //warning! b is unrelated to the twisted mass parameter in this case!
2241 
2242  o00_re = b*o00_re+acc00_re;
2243  o00_im = b*o00_im+acc00_im;
2244  o01_re = b*o01_re+acc01_re;
2245  o01_im = b*o01_im+acc01_im;
2246  o02_re = b*o02_re+acc02_re;
2247  o02_im = b*o02_im+acc02_im;
2248  o10_re = b*o10_re+acc10_re;
2249  o10_im = b*o10_im+acc10_im;
2250  o11_re = b*o11_re+acc11_re;
2251  o11_im = b*o11_im+acc11_im;
2252  o12_re = b*o12_re+acc12_re;
2253  o12_im = b*o12_im+acc12_im;
2254  o20_re = b*o20_re+acc20_re;
2255  o20_im = b*o20_im+acc20_im;
2256  o21_re = b*o21_re+acc21_re;
2257  o21_im = b*o21_im+acc21_im;
2258  o22_re = b*o22_re+acc22_re;
2259  o22_im = b*o22_im+acc22_im;
2260  o30_re = b*o30_re+acc30_re;
2261  o30_im = b*o30_im+acc30_im;
2262  o31_re = b*o31_re+acc31_re;
2263  o31_im = b*o31_im+acc31_im;
2264  o32_re = b*o32_re+acc32_re;
2265  o32_im = b*o32_im+acc32_im;
2266 #endif//TWIST_XPAY
2267 #else //no XPAY
 // Plain dslash: the inverse twist is applied to the output unless it was
 // already folded into each neighbor read (TWIST_INV_DSLASH).
2268 #ifndef TWIST_INV_DSLASH
2269  APPLY_TWIST_INV(-a, b, o);
2270 #endif
2271 #endif
2272 }
2273 
2274 // write spinor field back to device memory
2275 WRITE_SPINOR(param.sp_stride);
2276 
2277 // undefine to prevent warning when precision is changed
2278 #undef spinorFloat
2279 #undef WRITE_SPINOR_SHARED
2280 #undef READ_SPINOR_SHARED
2281 #undef SHARED_STRIDE
2282 
2283 #undef g00_re
2284 #undef g00_im
2285 #undef g01_re
2286 #undef g01_im
2287 #undef g02_re
2288 #undef g02_im
2289 #undef g10_re
2290 #undef g10_im
2291 #undef g11_re
2292 #undef g11_im
2293 #undef g12_re
2294 #undef g12_im
2295 #undef g20_re
2296 #undef g20_im
2297 #undef g21_re
2298 #undef g21_im
2299 #undef g22_re
2300 #undef g22_im
2301 
2302 #undef i00_re
2303 #undef i00_im
2304 #undef i01_re
2305 #undef i01_im
2306 #undef i02_re
2307 #undef i02_im
2308 #undef i10_re
2309 #undef i10_im
2310 #undef i11_re
2311 #undef i11_im
2312 #undef i12_re
2313 #undef i12_im
2314 #undef i20_re
2315 #undef i20_im
2316 #undef i21_re
2317 #undef i21_im
2318 #undef i22_re
2319 #undef i22_im
2320 #undef i30_re
2321 #undef i30_im
2322 #undef i31_re
2323 #undef i31_im
2324 #undef i32_re
2325 #undef i32_im
2326 
2327 #undef acc00_re
2328 #undef acc00_im
2329 #undef acc01_re
2330 #undef acc01_im
2331 #undef acc02_re
2332 #undef acc02_im
2333 #undef acc10_re
2334 #undef acc10_im
2335 #undef acc11_re
2336 #undef acc11_im
2337 #undef acc12_re
2338 #undef acc12_im
2339 #undef acc20_re
2340 #undef acc20_im
2341 #undef acc21_re
2342 #undef acc21_im
2343 #undef acc22_re
2344 #undef acc22_im
2345 #undef acc30_re
2346 #undef acc30_im
2347 #undef acc31_re
2348 #undef acc31_im
2349 #undef acc32_re
2350 #undef acc32_im
2351 
2352 
2353 #undef o00_re
2354 #undef o00_im
2355 #undef o01_re
2356 #undef o01_im
2357 #undef o02_re
2358 #undef o02_im
2359 #undef o10_re
2360 #undef o10_im
2361 #undef o11_re
2362 #undef o11_im
2363 #undef o12_re
2364 #undef o12_im
2365 #undef o20_re
2366 #undef o20_im
2367 #undef o21_re
2368 #undef o21_im
2369 #undef o22_re
2370 #undef o22_im
2371 #undef o30_re
2372 #undef o30_im
2373 #undef o31_re
2374 #undef o31_im
2375 #undef o32_re
2376 #undef o32_im
2377 
2378 #undef VOLATILE
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o30_re
#define acc32_im
dim3 dim3 blockDim
#define acc30_im
spinorFloat a0_im
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o31_re
VOLATILE spinorFloat o21_im
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1379
#define READ_SPINOR_SHARED
__syncthreads()
#define gT01_im
#define acc01_im
#define i10_im
#define i22_re
#define g02_re
#define gT10_im
#define gT10_re
#define i21_re
#define i02_re
#define acc20_im
VOLATILE spinorFloat o00_re
int sp_idx
#define i11_im
#define acc00_im
spinorFloat A1_re
VOLATILE spinorFloat o01_im
#define acc20_re
#define acc22_im
#define gT12_re
VOLATILE spinorFloat o32_re
spinorFloat A0_im
spinorFloat A1_im
VOLATILE spinorFloat o21_re
#define gT12_im
#define acc32_re
#define i32_im
spinorFloat b0_im
#define gT01_re
#define gT02_im
#define gT00_re
spinorFloat B2_re
VOLATILE spinorFloat o01_re
#define GAUGE0TEX
#define i01_im
VOLATILE spinorFloat o20_re
spinorFloat B1_im
#define gT11_re
#define i30_re
#define acc12_re
#define gT11_im
#define g10_im
QudaGaugeParam param
Definition: pack_test.cpp:17
#define acc30_re
#define i20_im
#define acc11_im
spinorFloat b1_re
#define i21_im
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o10_im
#define acc21_im
spinorFloat B0_re
#define acc10_im
#define g12_im
spinorFloat B1_re
#define gT21_re
spinorFloat A2_im
#define GAUGE1TEX
#define g01_re
VOLATILE spinorFloat o32_im
#define SPINORTEX
spinorFloat b2_im
VOLATILE spinorFloat o20_im
spinorFloat a1_re
spinorFloat a2_re
#define gT22_re
#define READ_INTERMEDIATE_SPINOR
#define acc21_re
#define g20_im
#define i02_im
int X[4]
Definition: quda.h:29
#define VOLATILE
VOLATILE spinorFloat o10_re
#define g21_re
#define g11_re
spinorFloat B0_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define acc02_re
#define gT20_im
const spinorFloat b
#define i32_re
#define i31_re
#define gT00_im
#define READ_SPINOR_GHOST
#define gT20_re
VOLATILE spinorFloat o02_im
#define g12_re
#define g00_im
#define g01_im
#define acc31_re
#define i30_im
#define acc11_re
#define g11_im
#define g21_im
VOLATILE spinorFloat o11_im
#define WRITE_SPINOR_SHARED
spinorFloat A2_re
VOLATILE spinorFloat o11_re
spinorFloat b1_im
#define g10_re
#define i12_im
const int ga_idx
spinorFloat a0_re
#define i00_re
spinorFloat b2_re
#define gT22_im
WRITE_SPINOR(param.sp_stride)
#define spinorFloat
#define i22_im
#define acc00_re
#define gT02_re
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i00_im
#define INTERTEX
coordsFromIndex3D< EVEN_X >(X, coord, sid, param)
spinorFloat a1_im
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o12_re
spinorFloat b0_re
#define g22_im
int face_idx
#define i31_im
RECONSTRUCT_GAUGE_MATRIX(0)
#define acc01_re
const int face_num
#define TPROJSCALE
#define acc02_im
spinorFloat B2_im
#define GHOSTSPINORTEX
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o22_re
#define g22_re
APPLY_TWIST_INV(-a, b, o)
#define acc22_re
#define i20_re
#define i10_re
#define i12_re
#define acc31_im
#define g20_re
#define g00_re
#define i01_re
#define a
#define i11_re
#define g02_im
spinorFloat a2_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
#define acc10_re
#define gT21_im
spinorFloat A0_re
#define acc12_im
VOLATILE spinorFloat o00_im