QUDA  0.9.0
wilson_dslash_dagger_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
// One thread stages one full spinor in shared memory: 4 spins x 3 colors
// x 2 (re,im) = 24 real components (see the i00..i32 macros below).
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// Shared-memory accesses are qualified volatile only under the legacy
// Open64 backend -- presumably to stop it caching shared values in
// registers (TODO confirm); the NVVM backend (CUDA >= 4.1, sm_20+)
// gets an empty expansion.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Map the generic component names i{spin}{color}_{re,im} (input spinor)
// and acc{spin}{color}_{re,im} (accumulator spinor) onto the packed
// vector registers loaded by the spinor readers.
//   SPINOR_DOUBLE: one complex number per double2 -> I0..I11, accum0..accum11.
//   otherwise:     two complex numbers per float4 -> I0..I5,  accum0..accum5.
// The shared-memory writer/reader macros are selected to match the layout.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
// Single precision: each float4 holds two complex components (.x/.y and .z/.w).
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
// Map the 3x3 gauge-link entries g{row}{col}_{re,im} onto the packed
// registers filled by READ_GAUGE_MATRIX:
//   GAUGE_FLOAT2: one complex entry per float2/double2 -> G0..G8.
//   otherwise:    two complex entries per float4       -> G0..G4.
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link
// gT is the Hermitian conjugate (dagger) of the link g:
// gT{i}{j} = conj(g{j}{i}), i.e. transpose plus sign flip on the
// imaginary parts.  Used when hopping in the negative direction.
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // first chiral block of inverted clover term
// The 6x6 chiral block is Hermitian, so only the 6 real diagonal entries
// (c{sc}_{sc}_re) and the 15 complex lower-triangle entries
// (c{sc}_{s'c'}_{re,im}) are stored.  Index pairs are {spin}{color}.
//   CLOVER_DOUBLE: one real pair per double2 -> C0..C17.
//   otherwise:     packed into float4        -> C0..C8.
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
// Upper-triangle entries of the Hermitian chiral block, derived from the
// stored lower triangle: c{a}_{b} = conj(c{b}_{a}) (same real part,
// negated imaginary part).
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
289 // second chiral block of inverted clover term (reuses C0,...,C9)
// Aliases for the second chiral block (spins 2,3): each c{2x}/{3x} entry
// maps onto the corresponding first-block (spins 0,1) name, so the same
// C* registers are reloaded with the second block's data before use.
// NOTE(review): the "(reuses C0,...,C9)" range above looks stale -- the
// first-block macros span C0..C17 (double) / C0..C8 (single); confirm
// against the generator script.
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
357 // output spinor
382 
// Element stride between successive shared-memory spinor components:
// 16 doubles or 32 floats, i.e. 128 bytes either way -- matching the
// 32 x 4-byte bank width of Fermi shared memory so component accesses
// stay conflict-free.
383 #ifdef SPINOR_DOUBLE
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
385 #else
386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
387 #endif
388 
389 #include "read_gauge.h"
390 #include "read_clover.h"
391 #include "io_spinor.h"
392 
393 int coord[5];
394 int X;
395 
396 int sid;
397 
398 #ifdef MULTI_GPU
399 int face_idx;
400 if (kernel_type == INTERIOR_KERNEL) {
401 #endif
402 
403  // Assume even dimensions
404  coordsFromIndex3D<EVEN_X>(X, x, sid, param.parity, param.dc.X);
405 
406  // only need to check Y and Z dims currently since X and T set to match exactly
407  if (coord[1] >= param.dc.X[1]) return;
408  if (coord[2] >= param.dc.X[2]) return;
409 
410  o00_re = 0; o00_im = 0;
411  o01_re = 0; o01_im = 0;
412  o02_re = 0; o02_im = 0;
413  o10_re = 0; o10_im = 0;
414  o11_re = 0; o11_im = 0;
415  o12_re = 0; o12_im = 0;
416  o20_re = 0; o20_im = 0;
417  o21_re = 0; o21_im = 0;
418  o22_re = 0; o22_im = 0;
419  o30_re = 0; o30_im = 0;
420  o31_re = 0; o31_im = 0;
421  o32_re = 0; o32_im = 0;
422 
423 #ifdef MULTI_GPU
424 } else { // exterior kernel
425 
426  sid = blockIdx.x*blockDim.x + threadIdx.x;
427  if (sid >= param.threads) return;
428 
429  const int face_volume = (param.threads >> 1); // volume of one face
430  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
431  face_idx = sid - face_num*face_volume; // index into the respective face
432 
433  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
434  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
435  //sp_idx = face_idx + param.ghostOffset[dim];
436 
437  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
438 
440 
441  o00_re = i00_re; o00_im = i00_im;
442  o01_re = i01_re; o01_im = i01_im;
443  o02_re = i02_re; o02_im = i02_im;
444  o10_re = i10_re; o10_im = i10_im;
445  o11_re = i11_re; o11_im = i11_im;
446  o12_re = i12_re; o12_im = i12_im;
447  o20_re = i20_re; o20_im = i20_im;
448  o21_re = i21_re; o21_im = i21_im;
449  o22_re = i22_re; o22_im = i22_im;
450  o30_re = i30_re; o30_im = i30_im;
451  o31_re = i31_re; o31_im = i31_im;
452  o32_re = i32_re; o32_im = i32_im;
453 }
454 #endif // MULTI_GPU
455 
456 
457 #ifdef MULTI_GPU
458 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
459  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
460 #endif
461 {
462  // Projector P0+
463  // 1 0 0 i
464  // 0 1 i 0
465  // 0 -i 1 0
466  // -i 0 0 1
467 
468 #ifdef MULTI_GPU
469  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
470  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
471 #if (DD_PREC==2) // half precision
472  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
473 #endif
474 #else
475  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
476 #endif
477 
478  const int ga_idx = sid;
479 
486 
487 #ifdef MULTI_GPU
488  if (kernel_type == INTERIOR_KERNEL) {
489 #endif
490 
491  // read spinor from device memory
492  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
493 
494  // store spinor into shared memory
495  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
496 
497  // project spinor into half spinors
498  a0_re = +i00_re-i30_im;
499  a0_im = +i00_im+i30_re;
500  a1_re = +i01_re-i31_im;
501  a1_im = +i01_im+i31_re;
502  a2_re = +i02_re-i32_im;
503  a2_im = +i02_im+i32_re;
504  b0_re = +i10_re-i20_im;
505  b0_im = +i10_im+i20_re;
506  b1_re = +i11_re-i21_im;
507  b1_im = +i11_im+i21_re;
508  b2_re = +i12_re-i22_im;
509  b2_im = +i12_im+i22_re;
510 
511 #ifdef MULTI_GPU
512  } else {
513 
514  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
515 
516  // read half spinor from device memory
517  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
518 
519  a0_re = i00_re; a0_im = i00_im;
520  a1_re = i01_re; a1_im = i01_im;
521  a2_re = i02_re; a2_im = i02_im;
522  b0_re = i10_re; b0_im = i10_im;
523  b1_re = i11_re; b1_im = i11_im;
524  b2_re = i12_re; b2_im = i12_im;
525 
526  }
527 #endif // MULTI_GPU
528 
529  // read gauge matrix from device memory
530  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
531 
532  // reconstruct gauge matrix
534 
535  // multiply row 0
537  A0_re += g00_re * a0_re;
538  A0_re -= g00_im * a0_im;
539  A0_re += g01_re * a1_re;
540  A0_re -= g01_im * a1_im;
541  A0_re += g02_re * a2_re;
542  A0_re -= g02_im * a2_im;
544  A0_im += g00_re * a0_im;
545  A0_im += g00_im * a0_re;
546  A0_im += g01_re * a1_im;
547  A0_im += g01_im * a1_re;
548  A0_im += g02_re * a2_im;
549  A0_im += g02_im * a2_re;
551  B0_re += g00_re * b0_re;
552  B0_re -= g00_im * b0_im;
553  B0_re += g01_re * b1_re;
554  B0_re -= g01_im * b1_im;
555  B0_re += g02_re * b2_re;
556  B0_re -= g02_im * b2_im;
558  B0_im += g00_re * b0_im;
559  B0_im += g00_im * b0_re;
560  B0_im += g01_re * b1_im;
561  B0_im += g01_im * b1_re;
562  B0_im += g02_re * b2_im;
563  B0_im += g02_im * b2_re;
564 
565  // multiply row 1
567  A1_re += g10_re * a0_re;
568  A1_re -= g10_im * a0_im;
569  A1_re += g11_re * a1_re;
570  A1_re -= g11_im * a1_im;
571  A1_re += g12_re * a2_re;
572  A1_re -= g12_im * a2_im;
574  A1_im += g10_re * a0_im;
575  A1_im += g10_im * a0_re;
576  A1_im += g11_re * a1_im;
577  A1_im += g11_im * a1_re;
578  A1_im += g12_re * a2_im;
579  A1_im += g12_im * a2_re;
581  B1_re += g10_re * b0_re;
582  B1_re -= g10_im * b0_im;
583  B1_re += g11_re * b1_re;
584  B1_re -= g11_im * b1_im;
585  B1_re += g12_re * b2_re;
586  B1_re -= g12_im * b2_im;
588  B1_im += g10_re * b0_im;
589  B1_im += g10_im * b0_re;
590  B1_im += g11_re * b1_im;
591  B1_im += g11_im * b1_re;
592  B1_im += g12_re * b2_im;
593  B1_im += g12_im * b2_re;
594 
595  // multiply row 2
597  A2_re += g20_re * a0_re;
598  A2_re -= g20_im * a0_im;
599  A2_re += g21_re * a1_re;
600  A2_re -= g21_im * a1_im;
601  A2_re += g22_re * a2_re;
602  A2_re -= g22_im * a2_im;
604  A2_im += g20_re * a0_im;
605  A2_im += g20_im * a0_re;
606  A2_im += g21_re * a1_im;
607  A2_im += g21_im * a1_re;
608  A2_im += g22_re * a2_im;
609  A2_im += g22_im * a2_re;
611  B2_re += g20_re * b0_re;
612  B2_re -= g20_im * b0_im;
613  B2_re += g21_re * b1_re;
614  B2_re -= g21_im * b1_im;
615  B2_re += g22_re * b2_re;
616  B2_re -= g22_im * b2_im;
618  B2_im += g20_re * b0_im;
619  B2_im += g20_im * b0_re;
620  B2_im += g21_re * b1_im;
621  B2_im += g21_im * b1_re;
622  B2_im += g22_re * b2_im;
623  B2_im += g22_im * b2_re;
624 
625  o00_re += A0_re;
626  o00_im += A0_im;
627  o10_re += B0_re;
628  o10_im += B0_im;
629  o20_re += B0_im;
630  o20_im -= B0_re;
631  o30_re += A0_im;
632  o30_im -= A0_re;
633 
634  o01_re += A1_re;
635  o01_im += A1_im;
636  o11_re += B1_re;
637  o11_im += B1_im;
638  o21_re += B1_im;
639  o21_im -= B1_re;
640  o31_re += A1_im;
641  o31_im -= A1_re;
642 
643  o02_re += A2_re;
644  o02_im += A2_im;
645  o12_re += B2_re;
646  o12_im += B2_im;
647  o22_re += B2_im;
648  o22_im -= B2_re;
649  o32_re += A2_im;
650  o32_im -= A2_re;
651 
652 }
653 
654 #ifdef MULTI_GPU
655 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
656  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
657 #endif
658 {
659  // Projector P0-
660  // 1 0 0 -i
661  // 0 1 -i 0
662  // 0 i 1 0
663  // i 0 0 1
664 
665 #ifdef MULTI_GPU
666  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
667  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
668 #if (DD_PREC==2) // half precision
669  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
670 #endif
671 #else
672  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
673 #endif
674 
675 #ifdef MULTI_GPU
676  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
677 #else
678  const int ga_idx = sp_idx;
679 #endif
680 
687 
688 #ifdef MULTI_GPU
689  if (kernel_type == INTERIOR_KERNEL) {
690 #endif
691 
692  // load spinor from shared memory
693  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
694  __syncthreads();
695  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
696 
697  // project spinor into half spinors
698  a0_re = +i00_re+i30_im;
699  a0_im = +i00_im-i30_re;
700  a1_re = +i01_re+i31_im;
701  a1_im = +i01_im-i31_re;
702  a2_re = +i02_re+i32_im;
703  a2_im = +i02_im-i32_re;
704  b0_re = +i10_re+i20_im;
705  b0_im = +i10_im-i20_re;
706  b1_re = +i11_re+i21_im;
707  b1_im = +i11_im-i21_re;
708  b2_re = +i12_re+i22_im;
709  b2_im = +i12_im-i22_re;
710 
711 #ifdef MULTI_GPU
712  } else {
713 
714  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
715 
716  // read half spinor from device memory
717  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
718 
719  a0_re = i00_re; a0_im = i00_im;
720  a1_re = i01_re; a1_im = i01_im;
721  a2_re = i02_re; a2_im = i02_im;
722  b0_re = i10_re; b0_im = i10_im;
723  b1_re = i11_re; b1_im = i11_im;
724  b2_re = i12_re; b2_im = i12_im;
725 
726  }
727 #endif // MULTI_GPU
728 
729  // read gauge matrix from device memory
730  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
731 
732  // reconstruct gauge matrix
734 
735  // multiply row 0
736  spinorFloat A0_re = 0;
737  A0_re += gT00_re * a0_re;
738  A0_re -= gT00_im * a0_im;
739  A0_re += gT01_re * a1_re;
740  A0_re -= gT01_im * a1_im;
741  A0_re += gT02_re * a2_re;
742  A0_re -= gT02_im * a2_im;
743  spinorFloat A0_im = 0;
744  A0_im += gT00_re * a0_im;
745  A0_im += gT00_im * a0_re;
746  A0_im += gT01_re * a1_im;
747  A0_im += gT01_im * a1_re;
748  A0_im += gT02_re * a2_im;
749  A0_im += gT02_im * a2_re;
750  spinorFloat B0_re = 0;
751  B0_re += gT00_re * b0_re;
752  B0_re -= gT00_im * b0_im;
753  B0_re += gT01_re * b1_re;
754  B0_re -= gT01_im * b1_im;
755  B0_re += gT02_re * b2_re;
756  B0_re -= gT02_im * b2_im;
757  spinorFloat B0_im = 0;
758  B0_im += gT00_re * b0_im;
759  B0_im += gT00_im * b0_re;
760  B0_im += gT01_re * b1_im;
761  B0_im += gT01_im * b1_re;
762  B0_im += gT02_re * b2_im;
763  B0_im += gT02_im * b2_re;
764 
765  // multiply row 1
766  spinorFloat A1_re = 0;
767  A1_re += gT10_re * a0_re;
768  A1_re -= gT10_im * a0_im;
769  A1_re += gT11_re * a1_re;
770  A1_re -= gT11_im * a1_im;
771  A1_re += gT12_re * a2_re;
772  A1_re -= gT12_im * a2_im;
773  spinorFloat A1_im = 0;
774  A1_im += gT10_re * a0_im;
775  A1_im += gT10_im * a0_re;
776  A1_im += gT11_re * a1_im;
777  A1_im += gT11_im * a1_re;
778  A1_im += gT12_re * a2_im;
779  A1_im += gT12_im * a2_re;
780  spinorFloat B1_re = 0;
781  B1_re += gT10_re * b0_re;
782  B1_re -= gT10_im * b0_im;
783  B1_re += gT11_re * b1_re;
784  B1_re -= gT11_im * b1_im;
785  B1_re += gT12_re * b2_re;
786  B1_re -= gT12_im * b2_im;
787  spinorFloat B1_im = 0;
788  B1_im += gT10_re * b0_im;
789  B1_im += gT10_im * b0_re;
790  B1_im += gT11_re * b1_im;
791  B1_im += gT11_im * b1_re;
792  B1_im += gT12_re * b2_im;
793  B1_im += gT12_im * b2_re;
794 
795  // multiply row 2
796  spinorFloat A2_re = 0;
797  A2_re += gT20_re * a0_re;
798  A2_re -= gT20_im * a0_im;
799  A2_re += gT21_re * a1_re;
800  A2_re -= gT21_im * a1_im;
801  A2_re += gT22_re * a2_re;
802  A2_re -= gT22_im * a2_im;
803  spinorFloat A2_im = 0;
804  A2_im += gT20_re * a0_im;
805  A2_im += gT20_im * a0_re;
806  A2_im += gT21_re * a1_im;
807  A2_im += gT21_im * a1_re;
808  A2_im += gT22_re * a2_im;
809  A2_im += gT22_im * a2_re;
810  spinorFloat B2_re = 0;
811  B2_re += gT20_re * b0_re;
812  B2_re -= gT20_im * b0_im;
813  B2_re += gT21_re * b1_re;
814  B2_re -= gT21_im * b1_im;
815  B2_re += gT22_re * b2_re;
816  B2_re -= gT22_im * b2_im;
817  spinorFloat B2_im = 0;
818  B2_im += gT20_re * b0_im;
819  B2_im += gT20_im * b0_re;
820  B2_im += gT21_re * b1_im;
821  B2_im += gT21_im * b1_re;
822  B2_im += gT22_re * b2_im;
823  B2_im += gT22_im * b2_re;
824 
825  o00_re += A0_re;
826  o00_im += A0_im;
827  o10_re += B0_re;
828  o10_im += B0_im;
829  o20_re -= B0_im;
830  o20_im += B0_re;
831  o30_re -= A0_im;
832  o30_im += A0_re;
833 
834  o01_re += A1_re;
835  o01_im += A1_im;
836  o11_re += B1_re;
837  o11_im += B1_im;
838  o21_re -= B1_im;
839  o21_im += B1_re;
840  o31_re -= A1_im;
841  o31_im += A1_re;
842 
843  o02_re += A2_re;
844  o02_im += A2_im;
845  o12_re += B2_re;
846  o12_im += B2_im;
847  o22_re -= B2_im;
848  o22_im += B2_re;
849  o32_re -= A2_im;
850  o32_im += A2_re;
851 
852 }
853 
854 #ifdef MULTI_GPU
855 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
856  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
857 #endif
858 {
859  // Projector P1+
860  // 1 0 0 1
861  // 0 1 -1 0
862  // 0 -1 1 0
863  // 1 0 0 1
864 
865 #ifdef MULTI_GPU
866  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
867  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
868 #if (DD_PREC==2) // half precision
869  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
870 #endif
871 #else
872  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
873 #endif
874 
875  const int ga_idx = sid;
876 
883 
884 #ifdef MULTI_GPU
885  if (kernel_type == INTERIOR_KERNEL) {
886 #endif
887 
888  if (threadIdx.y == blockDim.y-1 && blockDim.y < param.dc.X[1] ) {
889  // read spinor from device memory
890  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
891 
892  // project spinor into half spinors
893  a0_re = +i00_re+i30_re;
894  a0_im = +i00_im+i30_im;
895  a1_re = +i01_re+i31_re;
896  a1_im = +i01_im+i31_im;
897  a2_re = +i02_re+i32_re;
898  a2_im = +i02_im+i32_im;
899  b0_re = +i10_re-i20_re;
900  b0_im = +i10_im-i20_im;
901  b1_re = +i11_re-i21_re;
902  b1_im = +i11_im-i21_im;
903  b2_re = +i12_re-i22_re;
904  b2_im = +i12_im-i22_im;
905  } else {
906  // load spinor from shared memory
907  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
908  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
909  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
910 
911  // project spinor into half spinors
912  a0_re = +i00_re+i30_re;
913  a0_im = +i00_im+i30_im;
914  a1_re = +i01_re+i31_re;
915  a1_im = +i01_im+i31_im;
916  a2_re = +i02_re+i32_re;
917  a2_im = +i02_im+i32_im;
918  b0_re = +i10_re-i20_re;
919  b0_im = +i10_im-i20_im;
920  b1_re = +i11_re-i21_re;
921  b1_im = +i11_im-i21_im;
922  b2_re = +i12_re-i22_re;
923  b2_im = +i12_im-i22_im;
924  }
925 
926 #ifdef MULTI_GPU
927  } else {
928 
929  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
930 
931  // read half spinor from device memory
932  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
933 
934  a0_re = i00_re; a0_im = i00_im;
935  a1_re = i01_re; a1_im = i01_im;
936  a2_re = i02_re; a2_im = i02_im;
937  b0_re = i10_re; b0_im = i10_im;
938  b1_re = i11_re; b1_im = i11_im;
939  b2_re = i12_re; b2_im = i12_im;
940 
941  }
942 #endif // MULTI_GPU
943 
944  // read gauge matrix from device memory
945  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
946 
947  // reconstruct gauge matrix
949 
950  // multiply row 0
951  spinorFloat A0_re = 0;
952  A0_re += g00_re * a0_re;
953  A0_re -= g00_im * a0_im;
954  A0_re += g01_re * a1_re;
955  A0_re -= g01_im * a1_im;
956  A0_re += g02_re * a2_re;
957  A0_re -= g02_im * a2_im;
958  spinorFloat A0_im = 0;
959  A0_im += g00_re * a0_im;
960  A0_im += g00_im * a0_re;
961  A0_im += g01_re * a1_im;
962  A0_im += g01_im * a1_re;
963  A0_im += g02_re * a2_im;
964  A0_im += g02_im * a2_re;
965  spinorFloat B0_re = 0;
966  B0_re += g00_re * b0_re;
967  B0_re -= g00_im * b0_im;
968  B0_re += g01_re * b1_re;
969  B0_re -= g01_im * b1_im;
970  B0_re += g02_re * b2_re;
971  B0_re -= g02_im * b2_im;
972  spinorFloat B0_im = 0;
973  B0_im += g00_re * b0_im;
974  B0_im += g00_im * b0_re;
975  B0_im += g01_re * b1_im;
976  B0_im += g01_im * b1_re;
977  B0_im += g02_re * b2_im;
978  B0_im += g02_im * b2_re;
979 
980  // multiply row 1
981  spinorFloat A1_re = 0;
982  A1_re += g10_re * a0_re;
983  A1_re -= g10_im * a0_im;
984  A1_re += g11_re * a1_re;
985  A1_re -= g11_im * a1_im;
986  A1_re += g12_re * a2_re;
987  A1_re -= g12_im * a2_im;
988  spinorFloat A1_im = 0;
989  A1_im += g10_re * a0_im;
990  A1_im += g10_im * a0_re;
991  A1_im += g11_re * a1_im;
992  A1_im += g11_im * a1_re;
993  A1_im += g12_re * a2_im;
994  A1_im += g12_im * a2_re;
995  spinorFloat B1_re = 0;
996  B1_re += g10_re * b0_re;
997  B1_re -= g10_im * b0_im;
998  B1_re += g11_re * b1_re;
999  B1_re -= g11_im * b1_im;
1000  B1_re += g12_re * b2_re;
1001  B1_re -= g12_im * b2_im;
1002  spinorFloat B1_im = 0;
1003  B1_im += g10_re * b0_im;
1004  B1_im += g10_im * b0_re;
1005  B1_im += g11_re * b1_im;
1006  B1_im += g11_im * b1_re;
1007  B1_im += g12_re * b2_im;
1008  B1_im += g12_im * b2_re;
1009 
1010  // multiply row 2
1011  spinorFloat A2_re = 0;
1012  A2_re += g20_re * a0_re;
1013  A2_re -= g20_im * a0_im;
1014  A2_re += g21_re * a1_re;
1015  A2_re -= g21_im * a1_im;
1016  A2_re += g22_re * a2_re;
1017  A2_re -= g22_im * a2_im;
1018  spinorFloat A2_im = 0;
1019  A2_im += g20_re * a0_im;
1020  A2_im += g20_im * a0_re;
1021  A2_im += g21_re * a1_im;
1022  A2_im += g21_im * a1_re;
1023  A2_im += g22_re * a2_im;
1024  A2_im += g22_im * a2_re;
1025  spinorFloat B2_re = 0;
1026  B2_re += g20_re * b0_re;
1027  B2_re -= g20_im * b0_im;
1028  B2_re += g21_re * b1_re;
1029  B2_re -= g21_im * b1_im;
1030  B2_re += g22_re * b2_re;
1031  B2_re -= g22_im * b2_im;
1032  spinorFloat B2_im = 0;
1033  B2_im += g20_re * b0_im;
1034  B2_im += g20_im * b0_re;
1035  B2_im += g21_re * b1_im;
1036  B2_im += g21_im * b1_re;
1037  B2_im += g22_re * b2_im;
1038  B2_im += g22_im * b2_re;
1039 
1040  o00_re += A0_re;
1041  o00_im += A0_im;
1042  o10_re += B0_re;
1043  o10_im += B0_im;
1044  o20_re -= B0_re;
1045  o20_im -= B0_im;
1046  o30_re += A0_re;
1047  o30_im += A0_im;
1048 
1049  o01_re += A1_re;
1050  o01_im += A1_im;
1051  o11_re += B1_re;
1052  o11_im += B1_im;
1053  o21_re -= B1_re;
1054  o21_im -= B1_im;
1055  o31_re += A1_re;
1056  o31_im += A1_im;
1057 
1058  o02_re += A2_re;
1059  o02_im += A2_im;
1060  o12_re += B2_re;
1061  o12_im += B2_im;
1062  o22_re -= B2_re;
1063  o22_im -= B2_im;
1064  o32_re += A2_re;
1065  o32_im += A2_im;
1066 
1067 }
1068 
1069 #ifdef MULTI_GPU
1070 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1071  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1072 #endif
1073 {
1074  // Projector P1-
1075  // 1 0 0 -1
1076  // 0 1 1 0
1077  // 0 1 1 0
1078  // -1 0 0 1
1079 
1080 #ifdef MULTI_GPU
1081  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1082  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1083 #if (DD_PREC==2) // half precision
1084  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1085 #endif
1086 #else
1087  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1088 #endif
1089 
1090 #ifdef MULTI_GPU
1091  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1092 #else
1093  const int ga_idx = sp_idx;
1094 #endif
1095 
1102 
1103 #ifdef MULTI_GPU
1104  if (kernel_type == INTERIOR_KERNEL) {
1105 #endif
1106 
1107  if (threadIdx.y == 0 && blockDim.y < param.dc.X[1]) {
1108  // read spinor from device memory
1109  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1110 
1111  // project spinor into half spinors
1112  a0_re = +i00_re-i30_re;
1113  a0_im = +i00_im-i30_im;
1114  a1_re = +i01_re-i31_re;
1115  a1_im = +i01_im-i31_im;
1116  a2_re = +i02_re-i32_re;
1117  a2_im = +i02_im-i32_im;
1118  b0_re = +i10_re+i20_re;
1119  b0_im = +i10_im+i20_im;
1120  b1_re = +i11_re+i21_re;
1121  b1_im = +i11_im+i21_im;
1122  b2_re = +i12_re+i22_re;
1123  b2_im = +i12_im+i22_im;
1124  } else {
1125  // load spinor from shared memory
1126  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1127  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1128  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1129 
1130  // project spinor into half spinors
1131  a0_re = +i00_re-i30_re;
1132  a0_im = +i00_im-i30_im;
1133  a1_re = +i01_re-i31_re;
1134  a1_im = +i01_im-i31_im;
1135  a2_re = +i02_re-i32_re;
1136  a2_im = +i02_im-i32_im;
1137  b0_re = +i10_re+i20_re;
1138  b0_im = +i10_im+i20_im;
1139  b1_re = +i11_re+i21_re;
1140  b1_im = +i11_im+i21_im;
1141  b2_re = +i12_re+i22_re;
1142  b2_im = +i12_im+i22_im;
1143  }
1144 
1145 #ifdef MULTI_GPU
1146  } else {
1147 
1148  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1149 
1150  // read half spinor from device memory
1151  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1152 
1153  a0_re = i00_re; a0_im = i00_im;
1154  a1_re = i01_re; a1_im = i01_im;
1155  a2_re = i02_re; a2_im = i02_im;
1156  b0_re = i10_re; b0_im = i10_im;
1157  b1_re = i11_re; b1_im = i11_im;
1158  b2_re = i12_re; b2_im = i12_im;
1159 
1160  }
1161 #endif // MULTI_GPU
1162 
1163  // read gauge matrix from device memory
1164  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1165 
1166  // reconstruct gauge matrix
1168 
1169  // multiply row 0
1170  spinorFloat A0_re = 0;
1171  A0_re += gT00_re * a0_re;
1172  A0_re -= gT00_im * a0_im;
1173  A0_re += gT01_re * a1_re;
1174  A0_re -= gT01_im * a1_im;
1175  A0_re += gT02_re * a2_re;
1176  A0_re -= gT02_im * a2_im;
1177  spinorFloat A0_im = 0;
1178  A0_im += gT00_re * a0_im;
1179  A0_im += gT00_im * a0_re;
1180  A0_im += gT01_re * a1_im;
1181  A0_im += gT01_im * a1_re;
1182  A0_im += gT02_re * a2_im;
1183  A0_im += gT02_im * a2_re;
1184  spinorFloat B0_re = 0;
1185  B0_re += gT00_re * b0_re;
1186  B0_re -= gT00_im * b0_im;
1187  B0_re += gT01_re * b1_re;
1188  B0_re -= gT01_im * b1_im;
1189  B0_re += gT02_re * b2_re;
1190  B0_re -= gT02_im * b2_im;
1191  spinorFloat B0_im = 0;
1192  B0_im += gT00_re * b0_im;
1193  B0_im += gT00_im * b0_re;
1194  B0_im += gT01_re * b1_im;
1195  B0_im += gT01_im * b1_re;
1196  B0_im += gT02_re * b2_im;
1197  B0_im += gT02_im * b2_re;
1198 
1199  // multiply row 1
1200  spinorFloat A1_re = 0;
1201  A1_re += gT10_re * a0_re;
1202  A1_re -= gT10_im * a0_im;
1203  A1_re += gT11_re * a1_re;
1204  A1_re -= gT11_im * a1_im;
1205  A1_re += gT12_re * a2_re;
1206  A1_re -= gT12_im * a2_im;
1207  spinorFloat A1_im = 0;
1208  A1_im += gT10_re * a0_im;
1209  A1_im += gT10_im * a0_re;
1210  A1_im += gT11_re * a1_im;
1211  A1_im += gT11_im * a1_re;
1212  A1_im += gT12_re * a2_im;
1213  A1_im += gT12_im * a2_re;
1214  spinorFloat B1_re = 0;
1215  B1_re += gT10_re * b0_re;
1216  B1_re -= gT10_im * b0_im;
1217  B1_re += gT11_re * b1_re;
1218  B1_re -= gT11_im * b1_im;
1219  B1_re += gT12_re * b2_re;
1220  B1_re -= gT12_im * b2_im;
1221  spinorFloat B1_im = 0;
1222  B1_im += gT10_re * b0_im;
1223  B1_im += gT10_im * b0_re;
1224  B1_im += gT11_re * b1_im;
1225  B1_im += gT11_im * b1_re;
1226  B1_im += gT12_re * b2_im;
1227  B1_im += gT12_im * b2_re;
1228 
1229  // multiply row 2
1230  spinorFloat A2_re = 0;
1231  A2_re += gT20_re * a0_re;
1232  A2_re -= gT20_im * a0_im;
1233  A2_re += gT21_re * a1_re;
1234  A2_re -= gT21_im * a1_im;
1235  A2_re += gT22_re * a2_re;
1236  A2_re -= gT22_im * a2_im;
1237  spinorFloat A2_im = 0;
1238  A2_im += gT20_re * a0_im;
1239  A2_im += gT20_im * a0_re;
1240  A2_im += gT21_re * a1_im;
1241  A2_im += gT21_im * a1_re;
1242  A2_im += gT22_re * a2_im;
1243  A2_im += gT22_im * a2_re;
1244  spinorFloat B2_re = 0;
1245  B2_re += gT20_re * b0_re;
1246  B2_re -= gT20_im * b0_im;
1247  B2_re += gT21_re * b1_re;
1248  B2_re -= gT21_im * b1_im;
1249  B2_re += gT22_re * b2_re;
1250  B2_re -= gT22_im * b2_im;
1251  spinorFloat B2_im = 0;
1252  B2_im += gT20_re * b0_im;
1253  B2_im += gT20_im * b0_re;
1254  B2_im += gT21_re * b1_im;
1255  B2_im += gT21_im * b1_re;
1256  B2_im += gT22_re * b2_im;
1257  B2_im += gT22_im * b2_re;
1258 
1259  o00_re += A0_re;
1260  o00_im += A0_im;
1261  o10_re += B0_re;
1262  o10_im += B0_im;
1263  o20_re += B0_re;
1264  o20_im += B0_im;
1265  o30_re -= A0_re;
1266  o30_im -= A0_im;
1267 
1268  o01_re += A1_re;
1269  o01_im += A1_im;
1270  o11_re += B1_re;
1271  o11_im += B1_im;
1272  o21_re += B1_re;
1273  o21_im += B1_im;
1274  o31_re -= A1_re;
1275  o31_im -= A1_im;
1276 
1277  o02_re += A2_re;
1278  o02_im += A2_im;
1279  o12_re += B2_re;
1280  o12_im += B2_im;
1281  o22_re += B2_re;
1282  o22_im += B2_im;
1283  o32_re -= A2_re;
1284  o32_im -= A2_im;
1285 
1286 }
1287 
// ---------------------------------------------------------------------------
// Forward Z-direction (+z) neighbour contribution for the dagger dslash.
// Loads the spinor at site z+1 (or the +z ghost face in the MULTI_GPU
// exterior-Z kernel), applies the dagger projector P2+ shown below, multiplies
// the two resulting half-spinors (a*, b*) by the forward gauge link in
// direction index 4 (GAUGE0TEX, un-conjugated g?? elements), and accumulates
// into the output registers o??_re/o??_im.
// NOTE(review): this file is machine-generated; the source listing elides a
// few generated lines (numbering jumps such as 1310 -> 1317 and 1381 -> 1383
// below), which is where the half-spinor register declarations and the
// RECONSTRUCT_GAUGE_MATRIX step presumably live — confirm against the
// generator output.
// ---------------------------------------------------------------------------
1288 #ifdef MULTI_GPU
1289 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1290  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1291 #endif
1292 {
1293  // Projector P2+
1294  // 1 0 i 0
1295  // 0 1 0 -i
1296  // -i 0 1 0
1297  // 0 i 0 1
1298
// Neighbour index: wrap to the -z edge when this site is on the +z boundary
// (single GPU), otherwise step to z+1; exterior kernels index the ghost zone.
1299 #ifdef MULTI_GPU
1300  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1301  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1302 #if (DD_PREC==2) // half precision
1303  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1304 #endif
1305 #else
1306  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1307 #endif
1308
// Forward link lives at this site, so the gauge index is the site index.
1309  const int ga_idx = sid;
1310
1317
1318 #ifdef MULTI_GPU
1319  if (kernel_type == INTERIOR_KERNEL) {
1320 #endif
1321
// The z+1 neighbour of the last thread-layer in z is outside this block's
// shared-memory tile, so only that layer reads global memory; all other
// layers reuse the spinor another thread already staged in shared memory.
1322  if (threadIdx.z == blockDim.z-1 && blockDim.z < param.dc.X[2]) {
1323  // read spinor from device memory
1324  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1325
1326  // project spinor into half spinors
1327  a0_re = +i00_re-i20_im;
1328  a0_im = +i00_im+i20_re;
1329  a1_re = +i01_re-i21_im;
1330  a1_im = +i01_im+i21_re;
1331  a2_re = +i02_re-i22_im;
1332  a2_im = +i02_im+i22_re;
1333  b0_re = +i10_re+i30_im;
1334  b0_im = +i10_im-i30_re;
1335  b1_re = +i11_re+i31_im;
1336  b1_im = +i11_im-i31_re;
1337  b2_re = +i12_re+i32_im;
1338  b2_im = +i12_im-i32_re;
1339  } else {
1340  // load spinor from shared memory
1341  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1) ) % blockDim.x;
1342  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1343  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1344
1345  // project spinor into half spinors
1346  a0_re = +i00_re-i20_im;
1347  a0_im = +i00_im+i20_re;
1348  a1_re = +i01_re-i21_im;
1349  a1_im = +i01_im+i21_re;
1350  a2_re = +i02_re-i22_im;
1351  a2_im = +i02_im+i22_re;
1352  b0_re = +i10_re+i30_im;
1353  b0_im = +i10_im-i30_re;
1354  b1_re = +i11_re+i31_im;
1355  b1_im = +i11_im-i31_re;
1356  b2_re = +i12_re+i32_im;
1357  b2_im = +i12_im-i32_re;
1358  }
1359
1360 #ifdef MULTI_GPU
1361  } else {
1362
1363  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1364
// Ghost spinors are stored pre-projected (half spinors), so no projection
// is applied here — just copy the loaded components.
1365  // read half spinor from device memory
1366  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1367
1368  a0_re = i00_re; a0_im = i00_im;
1369  a1_re = i01_re; a1_im = i01_im;
1370  a2_re = i02_re; a2_im = i02_im;
1371  b0_re = i10_re; b0_im = i10_im;
1372  b1_re = i11_re; b1_im = i11_im;
1373  b2_re = i12_re; b2_im = i12_im;
1374
1375  }
1376 #endif // MULTI_GPU
1377
1378  // read gauge matrix from device memory
1379  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1380
1381  // reconstruct gauge matrix
1383
// SU(3) matrix-vector products: A = U * a, B = U * b, fully unrolled
// into real/imaginary fused accumulations.
1384  // multiply row 0
1385  spinorFloat A0_re = 0;
1386  A0_re += g00_re * a0_re;
1387  A0_re -= g00_im * a0_im;
1388  A0_re += g01_re * a1_re;
1389  A0_re -= g01_im * a1_im;
1390  A0_re += g02_re * a2_re;
1391  A0_re -= g02_im * a2_im;
1392  spinorFloat A0_im = 0;
1393  A0_im += g00_re * a0_im;
1394  A0_im += g00_im * a0_re;
1395  A0_im += g01_re * a1_im;
1396  A0_im += g01_im * a1_re;
1397  A0_im += g02_re * a2_im;
1398  A0_im += g02_im * a2_re;
1399  spinorFloat B0_re = 0;
1400  B0_re += g00_re * b0_re;
1401  B0_re -= g00_im * b0_im;
1402  B0_re += g01_re * b1_re;
1403  B0_re -= g01_im * b1_im;
1404  B0_re += g02_re * b2_re;
1405  B0_re -= g02_im * b2_im;
1406  spinorFloat B0_im = 0;
1407  B0_im += g00_re * b0_im;
1408  B0_im += g00_im * b0_re;
1409  B0_im += g01_re * b1_im;
1410  B0_im += g01_im * b1_re;
1411  B0_im += g02_re * b2_im;
1412  B0_im += g02_im * b2_re;
1413
1414  // multiply row 1
1415  spinorFloat A1_re = 0;
1416  A1_re += g10_re * a0_re;
1417  A1_re -= g10_im * a0_im;
1418  A1_re += g11_re * a1_re;
1419  A1_re -= g11_im * a1_im;
1420  A1_re += g12_re * a2_re;
1421  A1_re -= g12_im * a2_im;
1422  spinorFloat A1_im = 0;
1423  A1_im += g10_re * a0_im;
1424  A1_im += g10_im * a0_re;
1425  A1_im += g11_re * a1_im;
1426  A1_im += g11_im * a1_re;
1427  A1_im += g12_re * a2_im;
1428  A1_im += g12_im * a2_re;
1429  spinorFloat B1_re = 0;
1430  B1_re += g10_re * b0_re;
1431  B1_re -= g10_im * b0_im;
1432  B1_re += g11_re * b1_re;
1433  B1_re -= g11_im * b1_im;
1434  B1_re += g12_re * b2_re;
1435  B1_re -= g12_im * b2_im;
1436  spinorFloat B1_im = 0;
1437  B1_im += g10_re * b0_im;
1438  B1_im += g10_im * b0_re;
1439  B1_im += g11_re * b1_im;
1440  B1_im += g11_im * b1_re;
1441  B1_im += g12_re * b2_im;
1442  B1_im += g12_im * b2_re;
1443
1444  // multiply row 2
1445  spinorFloat A2_re = 0;
1446  A2_re += g20_re * a0_re;
1447  A2_re -= g20_im * a0_im;
1448  A2_re += g21_re * a1_re;
1449  A2_re -= g21_im * a1_im;
1450  A2_re += g22_re * a2_re;
1451  A2_re -= g22_im * a2_im;
1452  spinorFloat A2_im = 0;
1453  A2_im += g20_re * a0_im;
1454  A2_im += g20_im * a0_re;
1455  A2_im += g21_re * a1_im;
1456  A2_im += g21_im * a1_re;
1457  A2_im += g22_re * a2_im;
1458  A2_im += g22_im * a2_re;
1459  spinorFloat B2_re = 0;
1460  B2_re += g20_re * b0_re;
1461  B2_re -= g20_im * b0_im;
1462  B2_re += g21_re * b1_re;
1463  B2_re -= g21_im * b1_im;
1464  B2_re += g22_re * b2_re;
1465  B2_re -= g22_im * b2_im;
1466  spinorFloat B2_im = 0;
1467  B2_im += g20_re * b0_im;
1468  B2_im += g20_im * b0_re;
1469  B2_im += g21_re * b1_im;
1470  B2_im += g21_im * b1_re;
1471  B2_im += g22_re * b2_im;
1472  B2_im += g22_im * b2_re;
1473
// Reconstruct the full spinor from the half spinors per the P2+ rows above:
// upper components get A/B directly; the third component gets -i*A
// (+= A_im / -= A_re) and the fourth +i*B (-= B_im / += B_re).
1474  o00_re += A0_re;
1475  o00_im += A0_im;
1476  o10_re += B0_re;
1477  o10_im += B0_im;
1478  o20_re += A0_im;
1479  o20_im -= A0_re;
1480  o30_re -= B0_im;
1481  o30_im += B0_re;
1482
1483  o01_re += A1_re;
1484  o01_im += A1_im;
1485  o11_re += B1_re;
1486  o11_im += B1_im;
1487  o21_re += A1_im;
1488  o21_im -= A1_re;
1489  o31_re -= B1_im;
1490  o31_im += B1_re;
1491
1492  o02_re += A2_re;
1493  o02_im += A2_im;
1494  o12_re += B2_re;
1495  o12_im += B2_im;
1496  o22_re += A2_im;
1497  o22_im -= A2_re;
1498  o32_re -= B2_im;
1499  o32_im += B2_re;
1500
1501 }
1502 
// ---------------------------------------------------------------------------
// Backward Z-direction (-z) neighbour contribution for the dagger dslash.
// Loads the spinor at site z-1 (or the -z ghost face in the MULTI_GPU
// exterior-Z kernel), applies the dagger projector P2- shown below, multiplies
// the half-spinors by the backward link via its conjugate-transpose elements
// (gT??, direction index 5, GAUGE1TEX), and accumulates into o??_re/o??_im.
// NOTE(review): generated listing elides lines at the jumps 1529 -> 1536 and
// 1600 -> 1602 (half-spinor declarations / gauge reconstruction) — confirm
// against the generator output.
// ---------------------------------------------------------------------------
1503 #ifdef MULTI_GPU
1504 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1505  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1506 #endif
1507 {
1508  // Projector P2-
1509  // 1 0 -i 0
1510  // 0 1 0 i
1511  // i 0 1 0
1512  // 0 -i 0 1
1513
// Neighbour index: wrap to the +z edge when on the z==0 boundary (single
// GPU), otherwise step to z-1; exterior kernels index the ghost zone.
1514 #ifdef MULTI_GPU
1515  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1516  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1517 #if (DD_PREC==2) // half precision
1518  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1519 #endif
1520 #else
1521  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1522 #endif
1523
// Backward link is stored at the neighbour site; for exterior kernels the
// boundary links are staged past the local volume at Vh+face_idx.
1524 #ifdef MULTI_GPU
1525  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1526 #else
1527  const int ga_idx = sp_idx;
1528 #endif
1529
1536
1537 #ifdef MULTI_GPU
1538  if (kernel_type == INTERIOR_KERNEL) {
1539 #endif
1540
// Mirror of the +z case: only the first thread-layer in z must fetch its
// z-1 neighbour from global memory; everyone else reads the tile staged
// in shared memory.
1541  if (threadIdx.z == 0 && blockDim.z < param.dc.X[2]) {
1542  // read spinor from device memory
1543  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1544
1545  // project spinor into half spinors
1546  a0_re = +i00_re+i20_im;
1547  a0_im = +i00_im-i20_re;
1548  a1_re = +i01_re+i21_im;
1549  a1_im = +i01_im-i21_re;
1550  a2_re = +i02_re+i22_im;
1551  a2_im = +i02_im-i22_re;
1552  b0_re = +i10_re-i30_im;
1553  b0_im = +i10_im+i30_re;
1554  b1_re = +i11_re-i31_im;
1555  b1_im = +i11_im+i31_re;
1556  b2_re = +i12_re-i32_im;
1557  b2_im = +i12_im+i32_re;
1558  } else {
1559  // load spinor from shared memory
1560  int tx = (threadIdx.x + blockDim.x - ((coord[0]+1)&1)) % blockDim.x;
1561  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1562  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1563
1564  // project spinor into half spinors
1565  a0_re = +i00_re+i20_im;
1566  a0_im = +i00_im-i20_re;
1567  a1_re = +i01_re+i21_im;
1568  a1_im = +i01_im-i21_re;
1569  a2_re = +i02_re+i22_im;
1570  a2_im = +i02_im-i22_re;
1571  b0_re = +i10_re-i30_im;
1572  b0_im = +i10_im+i30_re;
1573  b1_re = +i11_re-i31_im;
1574  b1_im = +i11_im+i31_re;
1575  b2_re = +i12_re-i32_im;
1576  b2_im = +i12_im+i32_re;
1577  }
1578
1579 #ifdef MULTI_GPU
1580  } else {
1581
1582  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1583
// Ghost spinors arrive pre-projected — copy components directly.
1584  // read half spinor from device memory
1585  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1586
1587  a0_re = i00_re; a0_im = i00_im;
1588  a1_re = i01_re; a1_im = i01_im;
1589  a2_re = i02_re; a2_im = i02_im;
1590  b0_re = i10_re; b0_im = i10_im;
1591  b1_re = i11_re; b1_im = i11_im;
1592  b2_re = i12_re; b2_im = i12_im;
1593
1594  }
1595 #endif // MULTI_GPU
1596
1597  // read gauge matrix from device memory
1598  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1599
1600  // reconstruct gauge matrix
1602
// SU(3) products with the conjugate-transposed link: A = U^dag * a,
// B = U^dag * b (gT?? elements).
1603  // multiply row 0
1604  spinorFloat A0_re = 0;
1605  A0_re += gT00_re * a0_re;
1606  A0_re -= gT00_im * a0_im;
1607  A0_re += gT01_re * a1_re;
1608  A0_re -= gT01_im * a1_im;
1609  A0_re += gT02_re * a2_re;
1610  A0_re -= gT02_im * a2_im;
1611  spinorFloat A0_im = 0;
1612  A0_im += gT00_re * a0_im;
1613  A0_im += gT00_im * a0_re;
1614  A0_im += gT01_re * a1_im;
1615  A0_im += gT01_im * a1_re;
1616  A0_im += gT02_re * a2_im;
1617  A0_im += gT02_im * a2_re;
1618  spinorFloat B0_re = 0;
1619  B0_re += gT00_re * b0_re;
1620  B0_re -= gT00_im * b0_im;
1621  B0_re += gT01_re * b1_re;
1622  B0_re -= gT01_im * b1_im;
1623  B0_re += gT02_re * b2_re;
1624  B0_re -= gT02_im * b2_im;
1625  spinorFloat B0_im = 0;
1626  B0_im += gT00_re * b0_im;
1627  B0_im += gT00_im * b0_re;
1628  B0_im += gT01_re * b1_im;
1629  B0_im += gT01_im * b1_re;
1630  B0_im += gT02_re * b2_im;
1631  B0_im += gT02_im * b2_re;
1632
1633  // multiply row 1
1634  spinorFloat A1_re = 0;
1635  A1_re += gT10_re * a0_re;
1636  A1_re -= gT10_im * a0_im;
1637  A1_re += gT11_re * a1_re;
1638  A1_re -= gT11_im * a1_im;
1639  A1_re += gT12_re * a2_re;
1640  A1_re -= gT12_im * a2_im;
1641  spinorFloat A1_im = 0;
1642  A1_im += gT10_re * a0_im;
1643  A1_im += gT10_im * a0_re;
1644  A1_im += gT11_re * a1_im;
1645  A1_im += gT11_im * a1_re;
1646  A1_im += gT12_re * a2_im;
1647  A1_im += gT12_im * a2_re;
1648  spinorFloat B1_re = 0;
1649  B1_re += gT10_re * b0_re;
1650  B1_re -= gT10_im * b0_im;
1651  B1_re += gT11_re * b1_re;
1652  B1_re -= gT11_im * b1_im;
1653  B1_re += gT12_re * b2_re;
1654  B1_re -= gT12_im * b2_im;
1655  spinorFloat B1_im = 0;
1656  B1_im += gT10_re * b0_im;
1657  B1_im += gT10_im * b0_re;
1658  B1_im += gT11_re * b1_im;
1659  B1_im += gT11_im * b1_re;
1660  B1_im += gT12_re * b2_im;
1661  B1_im += gT12_im * b2_re;
1662
1663  // multiply row 2
1664  spinorFloat A2_re = 0;
1665  A2_re += gT20_re * a0_re;
1666  A2_re -= gT20_im * a0_im;
1667  A2_re += gT21_re * a1_re;
1668  A2_re -= gT21_im * a1_im;
1669  A2_re += gT22_re * a2_re;
1670  A2_re -= gT22_im * a2_im;
1671  spinorFloat A2_im = 0;
1672  A2_im += gT20_re * a0_im;
1673  A2_im += gT20_im * a0_re;
1674  A2_im += gT21_re * a1_im;
1675  A2_im += gT21_im * a1_re;
1676  A2_im += gT22_re * a2_im;
1677  A2_im += gT22_im * a2_re;
1678  spinorFloat B2_re = 0;
1679  B2_re += gT20_re * b0_re;
1680  B2_re -= gT20_im * b0_im;
1681  B2_re += gT21_re * b1_re;
1682  B2_re -= gT21_im * b1_im;
1683  B2_re += gT22_re * b2_re;
1684  B2_re -= gT22_im * b2_im;
1685  spinorFloat B2_im = 0;
1686  B2_im += gT20_re * b0_im;
1687  B2_im += gT20_im * b0_re;
1688  B2_im += gT21_re * b1_im;
1689  B2_im += gT21_im * b1_re;
1690  B2_im += gT22_re * b2_im;
1691  B2_im += gT22_im * b2_re;
1692
// Reconstruct full spinor per the P2- rows: third component gets +i*A
// (-= A_im / += A_re) and the fourth -i*B (+= B_im / -= B_re) — opposite
// signs to the +z block above.
1693  o00_re += A0_re;
1694  o00_im += A0_im;
1695  o10_re += B0_re;
1696  o10_im += B0_im;
1697  o20_re -= A0_im;
1698  o20_im += A0_re;
1699  o30_re += B0_im;
1700  o30_im -= B0_re;
1701
1702  o01_re += A1_re;
1703  o01_im += A1_im;
1704  o11_re += B1_re;
1705  o11_im += B1_im;
1706  o21_re -= A1_im;
1707  o21_im += A1_re;
1708  o31_re += B1_im;
1709  o31_im -= B1_re;
1710
1711  o02_re += A2_re;
1712  o02_im += A2_im;
1713  o12_re += B2_re;
1714  o12_im += B2_im;
1715  o22_re -= A2_im;
1716  o22_im += A2_re;
1717  o32_re += B2_im;
1718  o32_im -= B2_re;
1719
1720 }
1721 
// ---------------------------------------------------------------------------
// Forward T-direction (+t) neighbour contribution for the dagger dslash.
// The dagger projector P3+ = diag(2,2,0,0) keeps only the upper two spin
// components (scaled by 2), so READ_SPINOR_UP suffices and only o0x/o1x are
// updated. Two code paths:
//   * gauge_fixed fast path: in the temporal-gauge-fixed interior
//     (ga_idx < X4X3X2X1hmX3X2X1h) the time link is the identity, so the
//     SU(3) multiply is skipped entirely;
//   * general path: multiply by the forward time link (direction index 6,
//     GAUGE0TEX, un-conjugated g??).
// For exterior-T kernels the ghost half spinor is rescaled by TPROJSCALE.
// NOTE(review): the listing elides generated lines at the numbering jumps
// (1746 -> 1753, 1794 -> 1801, 1817 -> 1824, 1868 -> 1870); the identity-path
// A?/B? definitions in particular are among the elided lines — confirm
// against the generator output.
// ---------------------------------------------------------------------------
1722 #ifdef MULTI_GPU
1723 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1724  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1725 #endif
1726 {
1727  // Projector P3+
1728  // 2 0 0 0
1729  // 0 2 0 0
1730  // 0 0 0 0
1731  // 0 0 0 0
1732
// Neighbour index: wrap to t==0 when on the +t boundary (single GPU),
// otherwise step to t+1; exterior kernels index the ghost zone.
1733 #ifdef MULTI_GPU
1734  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1735  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1736 #if (DD_PREC==2) // half precision
1737  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1738 #endif
1739 #else
1740  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1741 #endif
1742
1743  const int ga_idx = sid;
1744
1745  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1746  {
1753
1754 #ifdef MULTI_GPU
1755  if (kernel_type == INTERIOR_KERNEL) {
1756 #endif
1757
1758  // read spinor from device memory
1759  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1760
1761  // project spinor into half spinors
1762  a0_re = +2*i00_re;
1763  a0_im = +2*i00_im;
1764  a1_re = +2*i01_re;
1765  a1_im = +2*i01_im;
1766  a2_re = +2*i02_re;
1767  a2_im = +2*i02_im;
1768  b0_re = +2*i10_re;
1769  b0_im = +2*i10_im;
1770  b1_re = +2*i11_re;
1771  b1_im = +2*i11_im;
1772  b2_re = +2*i12_re;
1773  b2_im = +2*i12_im;
1774
1775 #ifdef MULTI_GPU
1776  } else {
1777
1778  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1779  const int t_proj_scale = TPROJSCALE;
1780
1781  // read half spinor from device memory
1782  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1783
1784  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1785  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1786  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1787  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1788  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1789  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1790
1791  }
1792 #endif // MULTI_GPU
1793
// Link is the identity here; the A?/B? used below are presumably set to
// a?/b? in the elided lines (1795-1800) — TODO confirm.
1794  // identity gauge matrix
1801
1802  o00_re += A0_re;
1803  o00_im += A0_im;
1804  o10_re += B0_re;
1805  o10_im += B0_im;
1806
1807  o01_re += A1_re;
1808  o01_im += A1_im;
1809  o11_re += B1_re;
1810  o11_im += B1_im;
1811
1812  o02_re += A2_re;
1813  o02_im += A2_im;
1814  o12_re += B2_re;
1815  o12_im += B2_im;
1816
1817  } else {
1824
1825 #ifdef MULTI_GPU
1826  if (kernel_type == INTERIOR_KERNEL) {
1827 #endif
1828
1829  // read spinor from device memory
1830  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1831
1832  // project spinor into half spinors
1833  a0_re = +2*i00_re;
1834  a0_im = +2*i00_im;
1835  a1_re = +2*i01_re;
1836  a1_im = +2*i01_im;
1837  a2_re = +2*i02_re;
1838  a2_im = +2*i02_im;
1839  b0_re = +2*i10_re;
1840  b0_im = +2*i10_im;
1841  b1_re = +2*i11_re;
1842  b1_im = +2*i11_im;
1843  b2_re = +2*i12_re;
1844  b2_im = +2*i12_im;
1845
1846 #ifdef MULTI_GPU
1847  } else {
1848
1849  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1850  const int t_proj_scale = TPROJSCALE;
1851
1852  // read half spinor from device memory
1853  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1854
1855  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1856  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1857  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1858  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1859  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1860  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1861
1862  }
1863 #endif // MULTI_GPU
1864
1865  // read gauge matrix from device memory
1866  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1867
1868  // reconstruct gauge matrix
1870
// SU(3) matrix-vector products A = U * a, B = U * b.
1871  // multiply row 0
1872  spinorFloat A0_re = 0;
1873  A0_re += g00_re * a0_re;
1874  A0_re -= g00_im * a0_im;
1875  A0_re += g01_re * a1_re;
1876  A0_re -= g01_im * a1_im;
1877  A0_re += g02_re * a2_re;
1878  A0_re -= g02_im * a2_im;
1879  spinorFloat A0_im = 0;
1880  A0_im += g00_re * a0_im;
1881  A0_im += g00_im * a0_re;
1882  A0_im += g01_re * a1_im;
1883  A0_im += g01_im * a1_re;
1884  A0_im += g02_re * a2_im;
1885  A0_im += g02_im * a2_re;
1886  spinorFloat B0_re = 0;
1887  B0_re += g00_re * b0_re;
1888  B0_re -= g00_im * b0_im;
1889  B0_re += g01_re * b1_re;
1890  B0_re -= g01_im * b1_im;
1891  B0_re += g02_re * b2_re;
1892  B0_re -= g02_im * b2_im;
1893  spinorFloat B0_im = 0;
1894  B0_im += g00_re * b0_im;
1895  B0_im += g00_im * b0_re;
1896  B0_im += g01_re * b1_im;
1897  B0_im += g01_im * b1_re;
1898  B0_im += g02_re * b2_im;
1899  B0_im += g02_im * b2_re;
1900
1901  // multiply row 1
1902  spinorFloat A1_re = 0;
1903  A1_re += g10_re * a0_re;
1904  A1_re -= g10_im * a0_im;
1905  A1_re += g11_re * a1_re;
1906  A1_re -= g11_im * a1_im;
1907  A1_re += g12_re * a2_re;
1908  A1_re -= g12_im * a2_im;
1909  spinorFloat A1_im = 0;
1910  A1_im += g10_re * a0_im;
1911  A1_im += g10_im * a0_re;
1912  A1_im += g11_re * a1_im;
1913  A1_im += g11_im * a1_re;
1914  A1_im += g12_re * a2_im;
1915  A1_im += g12_im * a2_re;
1916  spinorFloat B1_re = 0;
1917  B1_re += g10_re * b0_re;
1918  B1_re -= g10_im * b0_im;
1919  B1_re += g11_re * b1_re;
1920  B1_re -= g11_im * b1_im;
1921  B1_re += g12_re * b2_re;
1922  B1_re -= g12_im * b2_im;
1923  spinorFloat B1_im = 0;
1924  B1_im += g10_re * b0_im;
1925  B1_im += g10_im * b0_re;
1926  B1_im += g11_re * b1_im;
1927  B1_im += g11_im * b1_re;
1928  B1_im += g12_re * b2_im;
1929  B1_im += g12_im * b2_re;
1930
1931  // multiply row 2
1932  spinorFloat A2_re = 0;
1933  A2_re += g20_re * a0_re;
1934  A2_re -= g20_im * a0_im;
1935  A2_re += g21_re * a1_re;
1936  A2_re -= g21_im * a1_im;
1937  A2_re += g22_re * a2_re;
1938  A2_re -= g22_im * a2_im;
1939  spinorFloat A2_im = 0;
1940  A2_im += g20_re * a0_im;
1941  A2_im += g20_im * a0_re;
1942  A2_im += g21_re * a1_im;
1943  A2_im += g21_im * a1_re;
1944  A2_im += g22_re * a2_im;
1945  A2_im += g22_im * a2_re;
1946  spinorFloat B2_re = 0;
1947  B2_re += g20_re * b0_re;
1948  B2_re -= g20_im * b0_im;
1949  B2_re += g21_re * b1_re;
1950  B2_re -= g21_im * b1_im;
1951  B2_re += g22_re * b2_re;
1952  B2_re -= g22_im * b2_im;
1953  spinorFloat B2_im = 0;
1954  B2_im += g20_re * b0_im;
1955  B2_im += g20_im * b0_re;
1956  B2_im += g21_re * b1_im;
1957  B2_im += g21_im * b1_re;
1958  B2_im += g22_re * b2_im;
1959  B2_im += g22_im * b2_re;
1960
// Only upper spin components receive contributions (lower rows of P3+ are
// zero).
1961  o00_re += A0_re;
1962  o00_im += A0_im;
1963  o10_re += B0_re;
1964  o10_im += B0_im;
1965
1966  o01_re += A1_re;
1967  o01_im += A1_im;
1968  o11_re += B1_re;
1969  o11_im += B1_im;
1970
1971  o02_re += A2_re;
1972  o02_im += A2_im;
1973  o12_re += B2_re;
1974  o12_im += B2_im;
1975
1976  }
1977 }
1978 
// ---------------------------------------------------------------------------
// Backward T-direction (-t) neighbour contribution for the dagger dslash.
// The dagger projector P3- = diag(0,0,2,2) keeps only the lower two spin
// components (scaled by 2); only o2x/o3x are updated. Same two-path structure
// as the +t block: a temporal-gauge-fixed identity-link fast path, and a
// general path multiplying by the conjugate-transposed backward time link
// (gT??, direction index 7, GAUGE1TEX). Ghost half spinors are rescaled by
// TPROJSCALE.
// NOTE(review): the listing elides generated lines at the numbering jumps
// (2007 -> 2014, 2019 -> 2021, 2055 -> 2062, 2090 -> 2092, 2129 -> 2131);
// the READ_SPINOR_DOWN calls and identity-path A?/B? definitions appear to
// be among the elided lines — confirm against the generator output.
// ---------------------------------------------------------------------------
1979 #ifdef MULTI_GPU
1980 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1981  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1982 #endif
1983 {
1984  // Projector P3-
1985  // 0 0 0 0
1986  // 0 0 0 0
1987  // 0 0 2 0
1988  // 0 0 0 2
1989
// Neighbour index: wrap to the +t edge when on the t==0 boundary (single
// GPU), otherwise step to t-1; exterior kernels index the ghost zone.
1990 #ifdef MULTI_GPU
1991  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1992  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1993 #if (DD_PREC==2) // half precision
1994  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1995 #endif
1996 #else
1997  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1998 #endif
1999
// Backward link is stored at the neighbour site; exterior kernels use the
// staged boundary links at Vh+face_idx.
2000 #ifdef MULTI_GPU
2001  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
2002 #else
2003  const int ga_idx = sp_idx;
2004 #endif
2005
2006  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
2007  {
2014
2015 #ifdef MULTI_GPU
2016  if (kernel_type == INTERIOR_KERNEL) {
2017 #endif
2018
2019  // read spinor from device memory
2021
2022  // project spinor into half spinors
2023  a0_re = +2*i20_re;
2024  a0_im = +2*i20_im;
2025  a1_re = +2*i21_re;
2026  a1_im = +2*i21_im;
2027  a2_re = +2*i22_re;
2028  a2_im = +2*i22_im;
2029  b0_re = +2*i30_re;
2030  b0_im = +2*i30_im;
2031  b1_re = +2*i31_re;
2032  b1_im = +2*i31_im;
2033  b2_re = +2*i32_re;
2034  b2_im = +2*i32_im;
2035
2036 #ifdef MULTI_GPU
2037  } else {
2038
2039  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2040  const int t_proj_scale = TPROJSCALE;
2041
2042  // read half spinor from device memory
2043  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2044
2045  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2046  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2047  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2048  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2049  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2050  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2051
2052  }
2053 #endif // MULTI_GPU
2054
// Link is the identity here; A?/B? presumably defined from a?/b? in the
// elided lines (2056-2061) — TODO confirm.
2055  // identity gauge matrix
2062
2063  o20_re += A0_re;
2064  o20_im += A0_im;
2065  o30_re += B0_re;
2066  o30_im += B0_im;
2067
2068  o21_re += A1_re;
2069  o21_im += A1_im;
2070  o31_re += B1_re;
2071  o31_im += B1_im;
2072
2073  o22_re += A2_re;
2074  o22_im += A2_im;
2075  o32_re += B2_re;
2076  o32_im += B2_im;
2077
2078  } else {
2085
2086 #ifdef MULTI_GPU
2087  if (kernel_type == INTERIOR_KERNEL) {
2088 #endif
2089
2090  // read spinor from device memory
2092
2093  // project spinor into half spinors
2094  a0_re = +2*i20_re;
2095  a0_im = +2*i20_im;
2096  a1_re = +2*i21_re;
2097  a1_im = +2*i21_im;
2098  a2_re = +2*i22_re;
2099  a2_im = +2*i22_im;
2100  b0_re = +2*i30_re;
2101  b0_im = +2*i30_im;
2102  b1_re = +2*i31_re;
2103  b1_im = +2*i31_im;
2104  b2_re = +2*i32_re;
2105  b2_im = +2*i32_im;
2106
2107 #ifdef MULTI_GPU
2108  } else {
2109
2110  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2111  const int t_proj_scale = TPROJSCALE;
2112
2113  // read half spinor from device memory
2114  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2115
2116  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2117  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2118  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2119  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2120  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2121  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2122
2123  }
2124 #endif // MULTI_GPU
2125
2126  // read gauge matrix from device memory
2127  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2128
2129  // reconstruct gauge matrix
2131
// SU(3) products with the conjugate-transposed link: A = U^dag * a,
// B = U^dag * b.
2132  // multiply row 0
2133  spinorFloat A0_re = 0;
2134  A0_re += gT00_re * a0_re;
2135  A0_re -= gT00_im * a0_im;
2136  A0_re += gT01_re * a1_re;
2137  A0_re -= gT01_im * a1_im;
2138  A0_re += gT02_re * a2_re;
2139  A0_re -= gT02_im * a2_im;
2140  spinorFloat A0_im = 0;
2141  A0_im += gT00_re * a0_im;
2142  A0_im += gT00_im * a0_re;
2143  A0_im += gT01_re * a1_im;
2144  A0_im += gT01_im * a1_re;
2145  A0_im += gT02_re * a2_im;
2146  A0_im += gT02_im * a2_re;
2147  spinorFloat B0_re = 0;
2148  B0_re += gT00_re * b0_re;
2149  B0_re -= gT00_im * b0_im;
2150  B0_re += gT01_re * b1_re;
2151  B0_re -= gT01_im * b1_im;
2152  B0_re += gT02_re * b2_re;
2153  B0_re -= gT02_im * b2_im;
2154  spinorFloat B0_im = 0;
2155  B0_im += gT00_re * b0_im;
2156  B0_im += gT00_im * b0_re;
2157  B0_im += gT01_re * b1_im;
2158  B0_im += gT01_im * b1_re;
2159  B0_im += gT02_re * b2_im;
2160  B0_im += gT02_im * b2_re;
2161
2162  // multiply row 1
2163  spinorFloat A1_re = 0;
2164  A1_re += gT10_re * a0_re;
2165  A1_re -= gT10_im * a0_im;
2166  A1_re += gT11_re * a1_re;
2167  A1_re -= gT11_im * a1_im;
2168  A1_re += gT12_re * a2_re;
2169  A1_re -= gT12_im * a2_im;
2170  spinorFloat A1_im = 0;
2171  A1_im += gT10_re * a0_im;
2172  A1_im += gT10_im * a0_re;
2173  A1_im += gT11_re * a1_im;
2174  A1_im += gT11_im * a1_re;
2175  A1_im += gT12_re * a2_im;
2176  A1_im += gT12_im * a2_re;
2177  spinorFloat B1_re = 0;
2178  B1_re += gT10_re * b0_re;
2179  B1_re -= gT10_im * b0_im;
2180  B1_re += gT11_re * b1_re;
2181  B1_re -= gT11_im * b1_im;
2182  B1_re += gT12_re * b2_re;
2183  B1_re -= gT12_im * b2_im;
2184  spinorFloat B1_im = 0;
2185  B1_im += gT10_re * b0_im;
2186  B1_im += gT10_im * b0_re;
2187  B1_im += gT11_re * b1_im;
2188  B1_im += gT11_im * b1_re;
2189  B1_im += gT12_re * b2_im;
2190  B1_im += gT12_im * b2_re;
2191
2192  // multiply row 2
2193  spinorFloat A2_re = 0;
2194  A2_re += gT20_re * a0_re;
2195  A2_re -= gT20_im * a0_im;
2196  A2_re += gT21_re * a1_re;
2197  A2_re -= gT21_im * a1_im;
2198  A2_re += gT22_re * a2_re;
2199  A2_re -= gT22_im * a2_im;
2200  spinorFloat A2_im = 0;
2201  A2_im += gT20_re * a0_im;
2202  A2_im += gT20_im * a0_re;
2203  A2_im += gT21_re * a1_im;
2204  A2_im += gT21_im * a1_re;
2205  A2_im += gT22_re * a2_im;
2206  A2_im += gT22_im * a2_re;
2207  spinorFloat B2_re = 0;
2208  B2_re += gT20_re * b0_re;
2209  B2_re -= gT20_im * b0_im;
2210  B2_re += gT21_re * b1_re;
2211  B2_re -= gT21_im * b1_im;
2212  B2_re += gT22_re * b2_re;
2213  B2_re -= gT22_im * b2_im;
2214  spinorFloat B2_im = 0;
2215  B2_im += gT20_re * b0_im;
2216  B2_im += gT20_im * b0_re;
2217  B2_im += gT21_re * b1_im;
2218  B2_im += gT21_im * b1_re;
2219  B2_im += gT22_re * b2_im;
2220  B2_im += gT22_im * b2_re;
2221
// Only lower spin components receive contributions (upper rows of P3- are
// zero).
2222  o20_re += A0_re;
2223  o20_im += A0_im;
2224  o30_re += B0_re;
2225  o30_im += B0_im;
2226
2227  o21_re += A1_re;
2228  o21_im += A1_im;
2229  o31_re += B1_re;
2230  o31_im += B1_im;
2231
2232  o22_re += A2_re;
2233  o22_im += A2_im;
2234  o32_re += B2_re;
2235  o32_im += B2_im;
2236
2237  }
2238 }
2239 
// ---------------------------------------------------------------------------
// MULTI_GPU xpay/clover gating: when this lattice site touches a partitioned
// boundary, some of its 8 neighbour contributions come from a later exterior
// kernel, so the final xpay/clover stage must be deferred until the last
// kernel that completes the site. The switch deliberately falls through so
// that each kernel also re-checks all dimensions handled after it in the
// T -> Z -> Y (-> X) kernel ordering.
// ---------------------------------------------------------------------------
2240 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2241
2242 int incomplete = 0; // Have all 8 contributions been computed for this site?
2243
2244 switch(kernel_type) { // intentional fall-through
2245 case INTERIOR_KERNEL:
2246  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2247 case EXTERIOR_KERNEL_T:
2248  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2249 case EXTERIOR_KERNEL_Z:
2250  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2251 case EXTERIOR_KERNEL_Y:
2252  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2253 }
2254
// Only finalize (xpay/clover) sites whose contributions are all in.
2255 if (!incomplete)
2256 #endif // MULTI_GPU
2257 {
2258 #ifdef DSLASH_CLOVER
2259 
2260  // change to chiral basis
// Rotate the spin components of the accumulated result (o{spin}{color}) into
// the basis in which the clover term is block-diagonal in chirality — the two
// 6x6 multiplies that follow act only on spins (0,1) and (2,3) respectively.
// One scope per color component (0,1,2); temporaries avoid aliasing since
// every new spin component mixes two old ones.
2261  {
// color 0: (a0,a1,a2,a3) = (-o1-o3, o0+o2, -o1+o3, o0-o2)
2262  spinorFloat a00_re = -o10_re - o30_re;
2263  spinorFloat a00_im = -o10_im - o30_im;
2264  spinorFloat a10_re = o00_re + o20_re;
2265  spinorFloat a10_im = o00_im + o20_im;
2266  spinorFloat a20_re = -o10_re + o30_re;
2267  spinorFloat a20_im = -o10_im + o30_im;
2268  spinorFloat a30_re = o00_re - o20_re;
2269  spinorFloat a30_im = o00_im - o20_im;
2270 
2271  o00_re = a00_re; o00_im = a00_im;
2272  o10_re = a10_re; o10_im = a10_im;
2273  o20_re = a20_re; o20_im = a20_im;
2274  o30_re = a30_re; o30_im = a30_im;
2275  }
2276 
// color 1: identical rotation
2277  {
2278  spinorFloat a01_re = -o11_re - o31_re;
2279  spinorFloat a01_im = -o11_im - o31_im;
2280  spinorFloat a11_re = o01_re + o21_re;
2281  spinorFloat a11_im = o01_im + o21_im;
2282  spinorFloat a21_re = -o11_re + o31_re;
2283  spinorFloat a21_im = -o11_im + o31_im;
2284  spinorFloat a31_re = o01_re - o21_re;
2285  spinorFloat a31_im = o01_im - o21_im;
2286 
2287  o01_re = a01_re; o01_im = a01_im;
2288  o11_re = a11_re; o11_im = a11_im;
2289  o21_re = a21_re; o21_im = a21_im;
2290  o31_re = a31_re; o31_im = a31_im;
2291  }
2292 
// color 2: identical rotation
2293  {
2294  spinorFloat a02_re = -o12_re - o32_re;
2295  spinorFloat a02_im = -o12_im - o32_im;
2296  spinorFloat a12_re = o02_re + o22_re;
2297  spinorFloat a12_im = o02_im + o22_im;
2298  spinorFloat a22_re = -o12_re + o32_re;
2299  spinorFloat a22_im = -o12_im + o32_im;
2300  spinorFloat a32_re = o02_re - o22_re;
2301  spinorFloat a32_im = o02_im - o22_im;
2302 
2303  o02_re = a02_re; o02_im = a02_im;
2304  o12_re = a12_re; o12_im = a12_im;
2305  o22_re = a22_re; o22_im = a22_im;
2306  o32_re = a32_re; o32_im = a32_im;
2307  }
2308 
2309  // apply first chiral block
// Dense complex 6x6 multiply of the upper-chirality components
// (spin 0,1 x color 0..2) by the first clover block: a = C * o, with
// c{si}{ci}_{sj}{cj}_{re,im} the matrix elements.  Diagonal entries use only
// a *_re term (e.g. c00_00_re), i.e. they are stored as real — consistent
// with a Hermitian clover block, though that property is not visible here.
// NOTE(review): Doxygen numbering jumps 2310 -> 2312; one original line
// (presumably the READ_CLOVER load for this block) is missing from this
// rendering — confirm against the repository source.
2310  {
2312 
// accumulators for the transformed upper-chirality components
2313  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
2314  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
2315  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
2316  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
2317  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
2318  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
2319 
// row (spin 0, color 0): complex multiply-accumulate across all six columns
2320  a00_re += c00_00_re * o00_re;
2321  a00_im += c00_00_re * o00_im;
2322  a00_re += c00_01_re * o01_re;
2323  a00_re -= c00_01_im * o01_im;
2324  a00_im += c00_01_re * o01_im;
2325  a00_im += c00_01_im * o01_re;
2326  a00_re += c00_02_re * o02_re;
2327  a00_re -= c00_02_im * o02_im;
2328  a00_im += c00_02_re * o02_im;
2329  a00_im += c00_02_im * o02_re;
2330  a00_re += c00_10_re * o10_re;
2331  a00_re -= c00_10_im * o10_im;
2332  a00_im += c00_10_re * o10_im;
2333  a00_im += c00_10_im * o10_re;
2334  a00_re += c00_11_re * o11_re;
2335  a00_re -= c00_11_im * o11_im;
2336  a00_im += c00_11_re * o11_im;
2337  a00_im += c00_11_im * o11_re;
2338  a00_re += c00_12_re * o12_re;
2339  a00_re -= c00_12_im * o12_im;
2340  a00_im += c00_12_re * o12_im;
2341  a00_im += c00_12_im * o12_re;
2342 
// row (spin 0, color 1)
2343  a01_re += c01_00_re * o00_re;
2344  a01_re -= c01_00_im * o00_im;
2345  a01_im += c01_00_re * o00_im;
2346  a01_im += c01_00_im * o00_re;
2347  a01_re += c01_01_re * o01_re;
2348  a01_im += c01_01_re * o01_im;
2349  a01_re += c01_02_re * o02_re;
2350  a01_re -= c01_02_im * o02_im;
2351  a01_im += c01_02_re * o02_im;
2352  a01_im += c01_02_im * o02_re;
2353  a01_re += c01_10_re * o10_re;
2354  a01_re -= c01_10_im * o10_im;
2355  a01_im += c01_10_re * o10_im;
2356  a01_im += c01_10_im * o10_re;
2357  a01_re += c01_11_re * o11_re;
2358  a01_re -= c01_11_im * o11_im;
2359  a01_im += c01_11_re * o11_im;
2360  a01_im += c01_11_im * o11_re;
2361  a01_re += c01_12_re * o12_re;
2362  a01_re -= c01_12_im * o12_im;
2363  a01_im += c01_12_re * o12_im;
2364  a01_im += c01_12_im * o12_re;
2365 
// row (spin 0, color 2)
2366  a02_re += c02_00_re * o00_re;
2367  a02_re -= c02_00_im * o00_im;
2368  a02_im += c02_00_re * o00_im;
2369  a02_im += c02_00_im * o00_re;
2370  a02_re += c02_01_re * o01_re;
2371  a02_re -= c02_01_im * o01_im;
2372  a02_im += c02_01_re * o01_im;
2373  a02_im += c02_01_im * o01_re;
2374  a02_re += c02_02_re * o02_re;
2375  a02_im += c02_02_re * o02_im;
2376  a02_re += c02_10_re * o10_re;
2377  a02_re -= c02_10_im * o10_im;
2378  a02_im += c02_10_re * o10_im;
2379  a02_im += c02_10_im * o10_re;
2380  a02_re += c02_11_re * o11_re;
2381  a02_re -= c02_11_im * o11_im;
2382  a02_im += c02_11_re * o11_im;
2383  a02_im += c02_11_im * o11_re;
2384  a02_re += c02_12_re * o12_re;
2385  a02_re -= c02_12_im * o12_im;
2386  a02_im += c02_12_re * o12_im;
2387  a02_im += c02_12_im * o12_re;
2388 
// row (spin 1, color 0)
2389  a10_re += c10_00_re * o00_re;
2390  a10_re -= c10_00_im * o00_im;
2391  a10_im += c10_00_re * o00_im;
2392  a10_im += c10_00_im * o00_re;
2393  a10_re += c10_01_re * o01_re;
2394  a10_re -= c10_01_im * o01_im;
2395  a10_im += c10_01_re * o01_im;
2396  a10_im += c10_01_im * o01_re;
2397  a10_re += c10_02_re * o02_re;
2398  a10_re -= c10_02_im * o02_im;
2399  a10_im += c10_02_re * o02_im;
2400  a10_im += c10_02_im * o02_re;
2401  a10_re += c10_10_re * o10_re;
2402  a10_im += c10_10_re * o10_im;
2403  a10_re += c10_11_re * o11_re;
2404  a10_re -= c10_11_im * o11_im;
2405  a10_im += c10_11_re * o11_im;
2406  a10_im += c10_11_im * o11_re;
2407  a10_re += c10_12_re * o12_re;
2408  a10_re -= c10_12_im * o12_im;
2409  a10_im += c10_12_re * o12_im;
2410  a10_im += c10_12_im * o12_re;
2411 
// row (spin 1, color 1)
2412  a11_re += c11_00_re * o00_re;
2413  a11_re -= c11_00_im * o00_im;
2414  a11_im += c11_00_re * o00_im;
2415  a11_im += c11_00_im * o00_re;
2416  a11_re += c11_01_re * o01_re;
2417  a11_re -= c11_01_im * o01_im;
2418  a11_im += c11_01_re * o01_im;
2419  a11_im += c11_01_im * o01_re;
2420  a11_re += c11_02_re * o02_re;
2421  a11_re -= c11_02_im * o02_im;
2422  a11_im += c11_02_re * o02_im;
2423  a11_im += c11_02_im * o02_re;
2424  a11_re += c11_10_re * o10_re;
2425  a11_re -= c11_10_im * o10_im;
2426  a11_im += c11_10_re * o10_im;
2427  a11_im += c11_10_im * o10_re;
2428  a11_re += c11_11_re * o11_re;
2429  a11_im += c11_11_re * o11_im;
2430  a11_re += c11_12_re * o12_re;
2431  a11_re -= c11_12_im * o12_im;
2432  a11_im += c11_12_re * o12_im;
2433  a11_im += c11_12_im * o12_re;
2434 
// row (spin 1, color 2)
2435  a12_re += c12_00_re * o00_re;
2436  a12_re -= c12_00_im * o00_im;
2437  a12_im += c12_00_re * o00_im;
2438  a12_im += c12_00_im * o00_re;
2439  a12_re += c12_01_re * o01_re;
2440  a12_re -= c12_01_im * o01_im;
2441  a12_im += c12_01_re * o01_im;
2442  a12_im += c12_01_im * o01_re;
2443  a12_re += c12_02_re * o02_re;
2444  a12_re -= c12_02_im * o02_im;
2445  a12_im += c12_02_re * o02_im;
2446  a12_im += c12_02_im * o02_re;
2447  a12_re += c12_10_re * o10_re;
2448  a12_re -= c12_10_im * o10_im;
2449  a12_im += c12_10_re * o10_im;
2450  a12_im += c12_10_im * o10_re;
2451  a12_re += c12_11_re * o11_re;
2452  a12_re -= c12_11_im * o11_im;
2453  a12_im += c12_11_re * o11_im;
2454  a12_im += c12_11_im * o11_re;
2455  a12_re += c12_12_re * o12_re;
2456  a12_im += c12_12_re * o12_im;
2457 
// commit the transformed upper-chirality components back into the o registers
2458  o00_re = a00_re; o00_im = a00_im;
2459  o01_re = a01_re; o01_im = a01_im;
2460  o02_re = a02_re; o02_im = a02_im;
2461  o10_re = a10_re; o10_im = a10_im;
2462  o11_re = a11_re; o11_im = a11_im;
2463  o12_re = a12_re; o12_im = a12_im;
2464 
2465  }
2466 
2467  // apply second chiral block
// Same dense complex 6x6 multiply as the first block, now acting on the
// lower-chirality components (spin 2,3 x color 0..2).  Diagonal entries again
// use only a *_re term, i.e. they are stored as real.
// NOTE(review): Doxygen numbering jumps 2468 -> 2470; one original line
// (presumably the READ_CLOVER load for this block) is missing from this
// rendering — confirm against the repository source.
2468  {
2470 
// accumulators for the transformed lower-chirality components
2471  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
2472  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
2473  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
2474  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2475  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2476  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2477 
// row (spin 2, color 0): complex multiply-accumulate across all six columns
2478  a20_re += c20_20_re * o20_re;
2479  a20_im += c20_20_re * o20_im;
2480  a20_re += c20_21_re * o21_re;
2481  a20_re -= c20_21_im * o21_im;
2482  a20_im += c20_21_re * o21_im;
2483  a20_im += c20_21_im * o21_re;
2484  a20_re += c20_22_re * o22_re;
2485  a20_re -= c20_22_im * o22_im;
2486  a20_im += c20_22_re * o22_im;
2487  a20_im += c20_22_im * o22_re;
2488  a20_re += c20_30_re * o30_re;
2489  a20_re -= c20_30_im * o30_im;
2490  a20_im += c20_30_re * o30_im;
2491  a20_im += c20_30_im * o30_re;
2492  a20_re += c20_31_re * o31_re;
2493  a20_re -= c20_31_im * o31_im;
2494  a20_im += c20_31_re * o31_im;
2495  a20_im += c20_31_im * o31_re;
2496  a20_re += c20_32_re * o32_re;
2497  a20_re -= c20_32_im * o32_im;
2498  a20_im += c20_32_re * o32_im;
2499  a20_im += c20_32_im * o32_re;
2500 
// row (spin 2, color 1)
2501  a21_re += c21_20_re * o20_re;
2502  a21_re -= c21_20_im * o20_im;
2503  a21_im += c21_20_re * o20_im;
2504  a21_im += c21_20_im * o20_re;
2505  a21_re += c21_21_re * o21_re;
2506  a21_im += c21_21_re * o21_im;
2507  a21_re += c21_22_re * o22_re;
2508  a21_re -= c21_22_im * o22_im;
2509  a21_im += c21_22_re * o22_im;
2510  a21_im += c21_22_im * o22_re;
2511  a21_re += c21_30_re * o30_re;
2512  a21_re -= c21_30_im * o30_im;
2513  a21_im += c21_30_re * o30_im;
2514  a21_im += c21_30_im * o30_re;
2515  a21_re += c21_31_re * o31_re;
2516  a21_re -= c21_31_im * o31_im;
2517  a21_im += c21_31_re * o31_im;
2518  a21_im += c21_31_im * o31_re;
2519  a21_re += c21_32_re * o32_re;
2520  a21_re -= c21_32_im * o32_im;
2521  a21_im += c21_32_re * o32_im;
2522  a21_im += c21_32_im * o32_re;
2523 
// row (spin 2, color 2)
2524  a22_re += c22_20_re * o20_re;
2525  a22_re -= c22_20_im * o20_im;
2526  a22_im += c22_20_re * o20_im;
2527  a22_im += c22_20_im * o20_re;
2528  a22_re += c22_21_re * o21_re;
2529  a22_re -= c22_21_im * o21_im;
2530  a22_im += c22_21_re * o21_im;
2531  a22_im += c22_21_im * o21_re;
2532  a22_re += c22_22_re * o22_re;
2533  a22_im += c22_22_re * o22_im;
2534  a22_re += c22_30_re * o30_re;
2535  a22_re -= c22_30_im * o30_im;
2536  a22_im += c22_30_re * o30_im;
2537  a22_im += c22_30_im * o30_re;
2538  a22_re += c22_31_re * o31_re;
2539  a22_re -= c22_31_im * o31_im;
2540  a22_im += c22_31_re * o31_im;
2541  a22_im += c22_31_im * o31_re;
2542  a22_re += c22_32_re * o32_re;
2543  a22_re -= c22_32_im * o32_im;
2544  a22_im += c22_32_re * o32_im;
2545  a22_im += c22_32_im * o32_re;
2546 
// row (spin 3, color 0)
2547  a30_re += c30_20_re * o20_re;
2548  a30_re -= c30_20_im * o20_im;
2549  a30_im += c30_20_re * o20_im;
2550  a30_im += c30_20_im * o20_re;
2551  a30_re += c30_21_re * o21_re;
2552  a30_re -= c30_21_im * o21_im;
2553  a30_im += c30_21_re * o21_im;
2554  a30_im += c30_21_im * o21_re;
2555  a30_re += c30_22_re * o22_re;
2556  a30_re -= c30_22_im * o22_im;
2557  a30_im += c30_22_re * o22_im;
2558  a30_im += c30_22_im * o22_re;
2559  a30_re += c30_30_re * o30_re;
2560  a30_im += c30_30_re * o30_im;
2561  a30_re += c30_31_re * o31_re;
2562  a30_re -= c30_31_im * o31_im;
2563  a30_im += c30_31_re * o31_im;
2564  a30_im += c30_31_im * o31_re;
2565  a30_re += c30_32_re * o32_re;
2566  a30_re -= c30_32_im * o32_im;
2567  a30_im += c30_32_re * o32_im;
2568  a30_im += c30_32_im * o32_re;
2569 
// row (spin 3, color 1)
2570  a31_re += c31_20_re * o20_re;
2571  a31_re -= c31_20_im * o20_im;
2572  a31_im += c31_20_re * o20_im;
2573  a31_im += c31_20_im * o20_re;
2574  a31_re += c31_21_re * o21_re;
2575  a31_re -= c31_21_im * o21_im;
2576  a31_im += c31_21_re * o21_im;
2577  a31_im += c31_21_im * o21_re;
2578  a31_re += c31_22_re * o22_re;
2579  a31_re -= c31_22_im * o22_im;
2580  a31_im += c31_22_re * o22_im;
2581  a31_im += c31_22_im * o22_re;
2582  a31_re += c31_30_re * o30_re;
2583  a31_re -= c31_30_im * o30_im;
2584  a31_im += c31_30_re * o30_im;
2585  a31_im += c31_30_im * o30_re;
2586  a31_re += c31_31_re * o31_re;
2587  a31_im += c31_31_re * o31_im;
2588  a31_re += c31_32_re * o32_re;
2589  a31_re -= c31_32_im * o32_im;
2590  a31_im += c31_32_re * o32_im;
2591  a31_im += c31_32_im * o32_re;
2592 
// row (spin 3, color 2)
2593  a32_re += c32_20_re * o20_re;
2594  a32_re -= c32_20_im * o20_im;
2595  a32_im += c32_20_re * o20_im;
2596  a32_im += c32_20_im * o20_re;
2597  a32_re += c32_21_re * o21_re;
2598  a32_re -= c32_21_im * o21_im;
2599  a32_im += c32_21_re * o21_im;
2600  a32_im += c32_21_im * o21_re;
2601  a32_re += c32_22_re * o22_re;
2602  a32_re -= c32_22_im * o22_im;
2603  a32_im += c32_22_re * o22_im;
2604  a32_im += c32_22_im * o22_re;
2605  a32_re += c32_30_re * o30_re;
2606  a32_re -= c32_30_im * o30_im;
2607  a32_im += c32_30_re * o30_im;
2608  a32_im += c32_30_im * o30_re;
2609  a32_re += c32_31_re * o31_re;
2610  a32_re -= c32_31_im * o31_im;
2611  a32_im += c32_31_re * o31_im;
2612  a32_im += c32_31_im * o31_re;
2613  a32_re += c32_32_re * o32_re;
2614  a32_im += c32_32_re * o32_im;
2615 
// commit the transformed lower-chirality components back into the o registers
2616  o20_re = a20_re; o20_im = a20_im;
2617  o21_re = a21_re; o21_im = a21_im;
2618  o22_re = a22_re; o22_im = a22_im;
2619  o30_re = a30_re; o30_im = a30_im;
2620  o31_re = a31_re; o31_im = a31_im;
2621  o32_re = a32_re; o32_im = a32_im;
2622 
2623  }
2624 
2625  // change back from chiral basis
2626  // (note: required factor of 1/2 is included in clover term normalization)
// Inverse of the spin rotation applied before the clover multiply, restoring
// the kernel's working basis.  Again one scope per color component, with
// temporaries to avoid aliasing between mixed spin components.
2627  {
// color 0: (a0,a1,a2,a3) = (o1+o3, -o0-o2, o1-o3, -o0+o2)
2628  spinorFloat a00_re = o10_re + o30_re;
2629  spinorFloat a00_im = o10_im + o30_im;
2630  spinorFloat a10_re = -o00_re - o20_re;
2631  spinorFloat a10_im = -o00_im - o20_im;
2632  spinorFloat a20_re = o10_re - o30_re;
2633  spinorFloat a20_im = o10_im - o30_im;
2634  spinorFloat a30_re = -o00_re + o20_re;
2635  spinorFloat a30_im = -o00_im + o20_im;
2636 
2637  o00_re = a00_re; o00_im = a00_im;
2638  o10_re = a10_re; o10_im = a10_im;
2639  o20_re = a20_re; o20_im = a20_im;
2640  o30_re = a30_re; o30_im = a30_im;
2641  }
2642 
// color 1: identical rotation
2643  {
2644  spinorFloat a01_re = o11_re + o31_re;
2645  spinorFloat a01_im = o11_im + o31_im;
2646  spinorFloat a11_re = -o01_re - o21_re;
2647  spinorFloat a11_im = -o01_im - o21_im;
2648  spinorFloat a21_re = o11_re - o31_re;
2649  spinorFloat a21_im = o11_im - o31_im;
2650  spinorFloat a31_re = -o01_re + o21_re;
2651  spinorFloat a31_im = -o01_im + o21_im;
2652 
2653  o01_re = a01_re; o01_im = a01_im;
2654  o11_re = a11_re; o11_im = a11_im;
2655  o21_re = a21_re; o21_im = a21_im;
2656  o31_re = a31_re; o31_im = a31_im;
2657  }
2658 
// color 2: identical rotation
2659  {
2660  spinorFloat a02_re = o12_re + o32_re;
2661  spinorFloat a02_im = o12_im + o32_im;
2662  spinorFloat a12_re = -o02_re - o22_re;
2663  spinorFloat a12_im = -o02_im - o22_im;
2664  spinorFloat a22_re = o12_re - o32_re;
2665  spinorFloat a22_im = o12_im - o32_im;
2666  spinorFloat a32_re = -o02_re + o22_re;
2667  spinorFloat a32_im = -o02_im + o22_im;
2668 
2669  o02_re = a02_re; o02_im = a02_im;
2670  o12_re = a12_re; o12_im = a12_im;
2671  o22_re = a22_re; o22_im = a22_im;
2672  o32_re = a32_re; o32_im = a32_im;
2673  }
2674 
2675 #endif // DSLASH_CLOVER
2676 
2677 #ifdef DSLASH_XPAY
2678 
// Load the accumulator spinor (acc{spin}{color}_{re,im}) for the xpay update.
2679  READ_ACCUM(ACCUMTEX, param.sp_stride)
2680 
// Scale coefficient: double- and single-precision builds carry it in
// separate param fields.
2681 #ifdef SPINOR_DOUBLE
2682  spinorFloat a = param.a;
2683 #else
2684  spinorFloat a = param.a_f;
2685 #endif
// xpay: o = a*o + acc for all 12 complex spinor components
2686  o00_re = a*o00_re+acc00_re;
2687  o00_im = a*o00_im+acc00_im;
2688  o01_re = a*o01_re+acc01_re;
2689  o01_im = a*o01_im+acc01_im;
2690  o02_re = a*o02_re+acc02_re;
2691  o02_im = a*o02_im+acc02_im;
2692  o10_re = a*o10_re+acc10_re;
2693  o10_im = a*o10_im+acc10_im;
2694  o11_re = a*o11_re+acc11_re;
2695  o11_im = a*o11_im+acc11_im;
2696  o12_re = a*o12_re+acc12_re;
2697  o12_im = a*o12_im+acc12_im;
2698  o20_re = a*o20_re+acc20_re;
2699  o20_im = a*o20_im+acc20_im;
2700  o21_re = a*o21_re+acc21_re;
2701  o21_im = a*o21_im+acc21_im;
2702  o22_re = a*o22_re+acc22_re;
2703  o22_im = a*o22_im+acc22_im;
2704  o30_re = a*o30_re+acc30_re;
2705  o30_im = a*o30_im+acc30_im;
2706  o31_re = a*o31_re+acc31_re;
2707  o31_im = a*o31_im+acc31_im;
2708  o32_re = a*o32_re+acc32_re;
2709  o32_im = a*o32_im+acc32_im;
2710 #endif // DSLASH_XPAY
// closes the if (!incomplete) { ... } region opened above
2711 }
2712 
2713 // write spinor field back to device memory
// Note: the writeback is unconditional — incomplete exterior sites still
// write their (partial) result; the clover/xpay stage above was skipped.
2714 WRITE_SPINOR(param.sp_stride);
2715 
2716 // undefine to prevent warning when precision is changed
2717 #undef spinorFloat
2718 #undef WRITE_SPINOR_SHARED
2719 #undef READ_SPINOR_SHARED
2720 #undef SHARED_STRIDE
2721 
2722 #undef g00_re
2723 #undef g00_im
2724 #undef g01_re
2725 #undef g01_im
2726 #undef g02_re
2727 #undef g02_im
2728 #undef g10_re
2729 #undef g10_im
2730 #undef g11_re
2731 #undef g11_im
2732 #undef g12_re
2733 #undef g12_im
2734 #undef g20_re
2735 #undef g20_im
2736 #undef g21_re
2737 #undef g21_im
2738 #undef g22_re
2739 #undef g22_im
2740 
2741 #undef i00_re
2742 #undef i00_im
2743 #undef i01_re
2744 #undef i01_im
2745 #undef i02_re
2746 #undef i02_im
2747 #undef i10_re
2748 #undef i10_im
2749 #undef i11_re
2750 #undef i11_im
2751 #undef i12_re
2752 #undef i12_im
2753 #undef i20_re
2754 #undef i20_im
2755 #undef i21_re
2756 #undef i21_im
2757 #undef i22_re
2758 #undef i22_im
2759 #undef i30_re
2760 #undef i30_im
2761 #undef i31_re
2762 #undef i31_im
2763 #undef i32_re
2764 #undef i32_im
2765 
2766 #undef acc00_re
2767 #undef acc00_im
2768 #undef acc01_re
2769 #undef acc01_im
2770 #undef acc02_re
2771 #undef acc02_im
2772 #undef acc10_re
2773 #undef acc10_im
2774 #undef acc11_re
2775 #undef acc11_im
2776 #undef acc12_re
2777 #undef acc12_im
2778 #undef acc20_re
2779 #undef acc20_im
2780 #undef acc21_re
2781 #undef acc21_im
2782 #undef acc22_re
2783 #undef acc22_im
2784 #undef acc30_re
2785 #undef acc30_im
2786 #undef acc31_re
2787 #undef acc31_im
2788 #undef acc32_re
2789 #undef acc32_im
2790 
2791 #undef c00_00_re
2792 #undef c01_01_re
2793 #undef c02_02_re
2794 #undef c10_10_re
2795 #undef c11_11_re
2796 #undef c12_12_re
2797 #undef c01_00_re
2798 #undef c01_00_im
2799 #undef c02_00_re
2800 #undef c02_00_im
2801 #undef c10_00_re
2802 #undef c10_00_im
2803 #undef c11_00_re
2804 #undef c11_00_im
2805 #undef c12_00_re
2806 #undef c12_00_im
2807 #undef c02_01_re
2808 #undef c02_01_im
2809 #undef c10_01_re
2810 #undef c10_01_im
2811 #undef c11_01_re
2812 #undef c11_01_im
2813 #undef c12_01_re
2814 #undef c12_01_im
2815 #undef c10_02_re
2816 #undef c10_02_im
2817 #undef c11_02_re
2818 #undef c11_02_im
2819 #undef c12_02_re
2820 #undef c12_02_im
2821 #undef c11_10_re
2822 #undef c11_10_im
2823 #undef c12_10_re
2824 #undef c12_10_im
2825 #undef c12_11_re
2826 #undef c12_11_im
2827 
2828 #undef o00_re
2829 #undef o00_im
2830 #undef o01_re
2831 #undef o01_im
2832 #undef o02_re
2833 #undef o02_im
2834 #undef o10_re
2835 #undef o10_im
2836 #undef o11_re
2837 #undef o11_im
2838 #undef o12_re
2839 #undef o12_im
2840 #undef o20_re
2841 #undef o20_im
2842 #undef o21_re
2843 #undef o21_im
2844 #undef o22_re
2845 #undef o22_im
2846 #undef o30_re
2847 #undef o30_im
2848 #undef o31_re
2849 #undef o31_im
2850 #undef o32_re
2851 #undef o32_im
2852 
2853 #undef VOLATILE
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o20_re
dim3 dim3 blockDim
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o32_im
spinorFloat B0_im
spinorFloat b2_re
spinorFloat A2_im
#define WRITE_SPINOR
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o00_re
spinorFloat A0_re
spinorFloat a0_im
int sp_idx
VOLATILE spinorFloat o30_im
spinorFloat B0_re
spinorFloat b0_im
spinorFloat b1_im
#define GAUGE0TEX
spinorFloat a1_re
spinorFloat B2_re
QudaGaugeParam param
Definition: pack_test.cpp:17
spinorFloat b2_im
spinorFloat a2_im
spinorFloat B1_re
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o01_re
#define GAUGE1TEX
VOLATILE spinorFloat o10_im
#define SPINORTEX
spinorFloat a0_re
VOLATILE spinorFloat o11_re
coordsFromIndex3D< EVEN_X >(X, x, sid, param.parity, param.dc.X)
VOLATILE spinorFloat o00_im
#define READ_INTERMEDIATE_SPINOR
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o30_re
spinorFloat b1_re
#define READ_SPINOR_GHOST
VOLATILE spinorFloat o31_re
__syncthreads()
VOLATILE spinorFloat o21_re
spinorFloat a2_re
spinorFloat B1_im
spinorFloat b0_re
#define READ_SPINOR_SHARED
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define CLOVERTEX
spinorFloat B2_im
spinorFloat a1_im
#define INTERTEX
int face_idx
spinorFloat A2_re
VOLATILE spinorFloat o12_im
#define WRITE_SPINOR_SHARED
const int face_num
#define TPROJSCALE
spinorFloat A0_im
spinorFloat A1_re
#define GHOSTSPINORTEX
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define READ_CLOVER
VOLATILE spinorFloat o22_re
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o12_re
spinorFloat A1_im
#define a
VOLATILE spinorFloat o20_im
VOLATILE spinorFloat o32_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)