QUDA  0.9.0
wilson_dslash_dagger_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
// Auto-generated body of the daggered Wilson dslash kernel (GT200 target).
// This header is textually #included into a kernel function: every "variable"
// below is a register declared either by the including kernel or by the
// macros in this file.  No shared memory is used by this variant:
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// VOLATILE expands to nothing under the NVVM back end (CUDA >= 4.1 and
// sm_20+), but to `volatile` for the legacy Open64 back end.
// NOTE(review): presumably the qualifier works around Open64 optimizer issues
// with large register-resident spinors -- confirm against QUDA history.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// i<s><c>_re / i<s><c>_im name the real/imaginary parts of the input-spinor
// component with spin index s (0-3) and color index c (0-2).  They alias the
// raw load registers I0.. that READ_SPINOR / READ_SPINOR_GHOST fill, so the
// aliasing pattern depends on the storage precision selected below.
12 #ifdef SPINOR_DOUBLE
// double precision: one double2 register (I0..I11) per complex component
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
// acc<s><c>: identical layout for the accumulator spinor (registers accum0..)
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
// single precision: float4 registers (I0..I5 / accum0..accum5) pack two
// complex components per register
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// g<r><c>_re / g<r><c>_im name element (row r, column c) of the 3x3 gauge
// link matrix, aliased onto the load registers G0.. that READ_GAUGE_MATRIX
// fills.  Packing depends on whether links are stored as float2/double2
// (GAUGE_FLOAT2) or as float4 pairs.
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
// float4 packing: two complex matrix elements per register
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// gT is the Hermitian conjugate of g (transpose + complex conjugation:
// gT<r><c> = conj(g<c><r>)), applied when hopping in the negative direction.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of inverted clover term
// Packed Hermitian 6x6 block: the 6 real diagonal entries (c<i>_<i>_re)
// come first, followed by the 15 complex lower-triangle entries c<i>_<j>
// (i > j).  Register aliasing depends on the clover storage precision.
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// upper triangle recovered from Hermiticity: c_ij = conj(c_ji)
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of inverted clover term (reuses C0,...,C9)
// The lower-right 6x6 block is loaded into the same C registers, so its
// spin-2/3 entries are pure aliases of the spin-0/1 names above.
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
378 
379 #include "read_gauge.h"
380 #include "read_clover.h"
381 #include "io_spinor.h"
382 
// Per-thread state: coord = site coordinates (5 slots; presumably 4d site
// plus an extra slot used by the indexing helpers -- TODO confirm), X = full
// lattice index of this site, sid = this thread's site/face id.
// NOTE(review): original lines 354-377 (the VOLATILE spinorFloat o.. output
// declarations) were dropped by the documentation extraction; the o*_re/_im
// registers assigned below are declared there in the generated file.
383 int coord[5];
384 int X;
385 
386 int sid;
387 
388 #ifdef MULTI_GPU
389 int face_idx;
390 if (kernel_type == INTERIOR_KERNEL) {
391 #endif
392 
393  sid = blockIdx.x*blockDim.x + threadIdx.x;
394  if (sid >= param.threads) return;
395 
396  // Assume even dimensions
// NOTE(review): original line 397 (presumably the coordsFromIndex call that
// fills X and coord from sid) is missing from this extract -- confirm.
398 
// interior kernel: start the output spinor from zero and accumulate hops
399  o00_re = 0; o00_im = 0;
400  o01_re = 0; o01_im = 0;
401  o02_re = 0; o02_im = 0;
402  o10_re = 0; o10_im = 0;
403  o11_re = 0; o11_im = 0;
404  o12_re = 0; o12_im = 0;
405  o20_re = 0; o20_im = 0;
406  o21_re = 0; o21_im = 0;
407  o22_re = 0; o22_im = 0;
408  o30_re = 0; o30_im = 0;
409  o31_re = 0; o31_im = 0;
410  o32_re = 0; o32_im = 0;
411 
412 #ifdef MULTI_GPU
413 } else { // exterior kernel
414 
415  sid = blockIdx.x*blockDim.x + threadIdx.x;
416  if (sid >= param.threads) return;
417 
418  const int face_volume = (param.threads >> 1); // volume of one face
419  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
420  face_idx = sid - face_num*face_volume; // index into the respective face
421 
422  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
423  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
424  //sp_idx = face_idx + param.ghostOffset[dim];
425 
426  coordsFromFaceIndex<4,QUDA_4D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
427 
// NOTE(review): original line 428 (presumably the intermediate-spinor read
// that fills i00..i32 with the interior kernel's partial result) is missing
// from this extract -- confirm against the generated original.
429 
// exterior kernel: seed the output with the values just loaded into i..
430  o00_re = i00_re; o00_im = i00_im;
431  o01_re = i01_re; o01_im = i01_im;
432  o02_re = i02_re; o02_im = i02_im;
433  o10_re = i10_re; o10_im = i10_im;
434  o11_re = i11_re; o11_im = i11_im;
435  o12_re = i12_re; o12_im = i12_im;
436  o20_re = i20_re; o20_im = i20_im;
437  o21_re = i21_re; o21_im = i21_im;
438  o22_re = i22_re; o22_im = i22_im;
439  o30_re = i30_re; o30_im = i30_im;
440  o31_re = i31_re; o31_im = i31_im;
441  o32_re = i32_re; o32_im = i32_im;
442 }
443 #endif // MULTI_GPU
444 
445 
// +X-direction hop: read the spinor at x+1 (wrapping at the lattice edge, or
// from the ghost face in the exterior kernel), project with P0+ (matrix
// below), multiply by the link stored at this site (ga_idx = sid), and
// accumulate into the output spinor.
// NOTE(review): the documentation extraction dropped original lines 469-474
// (the spinorFloat a0..b2 half-spinor declarations), 519
// (RECONSTRUCT_GAUGE_MATRIX for this direction), and 522, 529, 536, ... (the
// `spinorFloat A0_re = 0;`-style accumulator declarations) -- the numbering
// gaps below; they are present in the generated original. TODO confirm.
446 #ifdef MULTI_GPU
447 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
448  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
449 #endif
450 {
451  // Projector P0+
452  // 1 0 0 i
453  // 0 1 i 0
454  // 0 -i 1 0
455  // -i 0 0 1
456 
457 #ifdef MULTI_GPU
458  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
459  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
460 #if (DD_PREC==2) // half precision
461  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
462 #endif
463 #else
464  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
465 #endif
466 
467  const int ga_idx = sid;
468 
475 
476 #ifdef MULTI_GPU
477  if (kernel_type == INTERIOR_KERNEL) {
478 #endif
479 
480  // read spinor from device memory
481  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
482 
483  // project spinor into half spinors
484  a0_re = +i00_re-i30_im;
485  a0_im = +i00_im+i30_re;
486  a1_re = +i01_re-i31_im;
487  a1_im = +i01_im+i31_re;
488  a2_re = +i02_re-i32_im;
489  a2_im = +i02_im+i32_re;
490  b0_re = +i10_re-i20_im;
491  b0_im = +i10_im+i20_re;
492  b1_re = +i11_re-i21_im;
493  b1_im = +i11_im+i21_re;
494  b2_re = +i12_re-i22_im;
495  b2_im = +i12_im+i22_re;
496 
497 #ifdef MULTI_GPU
498  } else {
499 
500  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
501 
502  // read half spinor from device memory
// ghost faces are stored already spin-projected, so the two half spinors
// are copied straight into a/b with no further projection
503  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
504 
505  a0_re = i00_re; a0_im = i00_im;
506  a1_re = i01_re; a1_im = i01_im;
507  a2_re = i02_re; a2_im = i02_im;
508  b0_re = i10_re; b0_im = i10_im;
509  b1_re = i11_re; b1_im = i11_im;
510  b2_re = i12_re; b2_im = i12_im;
511 
512  }
513 #endif // MULTI_GPU
514 
515  // read gauge matrix from device memory
516  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
517 
518  // reconstruct gauge matrix
520 
// complex 3x3 link times half spinors: A = U*a, B = U*b, expanded into
// real/imaginary fused multiply-adds
521  // multiply row 0
523  A0_re += g00_re * a0_re;
524  A0_re -= g00_im * a0_im;
525  A0_re += g01_re * a1_re;
526  A0_re -= g01_im * a1_im;
527  A0_re += g02_re * a2_re;
528  A0_re -= g02_im * a2_im;
530  A0_im += g00_re * a0_im;
531  A0_im += g00_im * a0_re;
532  A0_im += g01_re * a1_im;
533  A0_im += g01_im * a1_re;
534  A0_im += g02_re * a2_im;
535  A0_im += g02_im * a2_re;
537  B0_re += g00_re * b0_re;
538  B0_re -= g00_im * b0_im;
539  B0_re += g01_re * b1_re;
540  B0_re -= g01_im * b1_im;
541  B0_re += g02_re * b2_re;
542  B0_re -= g02_im * b2_im;
544  B0_im += g00_re * b0_im;
545  B0_im += g00_im * b0_re;
546  B0_im += g01_re * b1_im;
547  B0_im += g01_im * b1_re;
548  B0_im += g02_re * b2_im;
549  B0_im += g02_im * b2_re;
550 
551  // multiply row 1
553  A1_re += g10_re * a0_re;
554  A1_re -= g10_im * a0_im;
555  A1_re += g11_re * a1_re;
556  A1_re -= g11_im * a1_im;
557  A1_re += g12_re * a2_re;
558  A1_re -= g12_im * a2_im;
560  A1_im += g10_re * a0_im;
561  A1_im += g10_im * a0_re;
562  A1_im += g11_re * a1_im;
563  A1_im += g11_im * a1_re;
564  A1_im += g12_re * a2_im;
565  A1_im += g12_im * a2_re;
567  B1_re += g10_re * b0_re;
568  B1_re -= g10_im * b0_im;
569  B1_re += g11_re * b1_re;
570  B1_re -= g11_im * b1_im;
571  B1_re += g12_re * b2_re;
572  B1_re -= g12_im * b2_im;
574  B1_im += g10_re * b0_im;
575  B1_im += g10_im * b0_re;
576  B1_im += g11_re * b1_im;
577  B1_im += g11_im * b1_re;
578  B1_im += g12_re * b2_im;
579  B1_im += g12_im * b2_re;
580 
581  // multiply row 2
583  A2_re += g20_re * a0_re;
584  A2_re -= g20_im * a0_im;
585  A2_re += g21_re * a1_re;
586  A2_re -= g21_im * a1_im;
587  A2_re += g22_re * a2_re;
588  A2_re -= g22_im * a2_im;
590  A2_im += g20_re * a0_im;
591  A2_im += g20_im * a0_re;
592  A2_im += g21_re * a1_im;
593  A2_im += g21_im * a1_re;
594  A2_im += g22_re * a2_im;
595  A2_im += g22_im * a2_re;
597  B2_re += g20_re * b0_re;
598  B2_re -= g20_im * b0_im;
599  B2_re += g21_re * b1_re;
600  B2_re -= g21_im * b1_im;
601  B2_re += g22_re * b2_re;
602  B2_re -= g22_im * b2_im;
604  B2_im += g20_re * b0_im;
605  B2_im += g20_im * b0_re;
606  B2_im += g21_re * b1_im;
607  B2_im += g21_im * b1_re;
608  B2_im += g22_re * b2_im;
609  B2_im += g22_im * b2_re;
610 
// accumulate into the output: upper spins get A/B directly, lower spins are
// reconstructed via the P0+ sign/imag pattern shown in the matrix above
611  o00_re += A0_re;
612  o00_im += A0_im;
613  o10_re += B0_re;
614  o10_im += B0_im;
615  o20_re += B0_im;
616  o20_im -= B0_re;
617  o30_re += A0_im;
618  o30_im -= A0_re;
619 
620  o01_re += A1_re;
621  o01_im += A1_im;
622  o11_re += B1_re;
623  o11_im += B1_im;
624  o21_re += B1_im;
625  o21_im -= B1_re;
626  o31_re += A1_im;
627  o31_im -= A1_re;
628 
629  o02_re += A2_re;
630  o02_im += A2_im;
631  o12_re += B2_re;
632  o12_im += B2_im;
633  o22_re += B2_im;
634  o22_im -= B2_re;
635  o32_re += A2_im;
636  o32_im -= A2_re;
637 
638 }
639 
// -X-direction hop: read the spinor at x-1 (or the ghost face), project with
// P0-, and multiply by the Hermitian conjugate gT of the link stored at the
// neighbour site (ga_idx = sp_idx in the interior; Vh+face_idx for ghosts).
// NOTE(review): original lines 667-672 (spinorFloat a0..b2 declarations) and
// 717 (RECONSTRUCT_GAUGE_MATRIX) are missing from this extract -- numbering
// gaps below; present in the generated original. TODO confirm.
640 #ifdef MULTI_GPU
641 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
642  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
643 #endif
644 {
645  // Projector P0-
646  // 1 0 0 -i
647  // 0 1 -i 0
648  // 0 i 1 0
649  // i 0 0 1
650 
651 #ifdef MULTI_GPU
652  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
653  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
654 #if (DD_PREC==2) // half precision
655  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
656 #endif
657 #else
658  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
659 #endif
660 
661 #ifdef MULTI_GPU
662  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
663 #else
664  const int ga_idx = sp_idx;
665 #endif
666 
673 
674 #ifdef MULTI_GPU
675  if (kernel_type == INTERIOR_KERNEL) {
676 #endif
677 
678  // read spinor from device memory
679  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
680 
681  // project spinor into half spinors
682  a0_re = +i00_re+i30_im;
683  a0_im = +i00_im-i30_re;
684  a1_re = +i01_re+i31_im;
685  a1_im = +i01_im-i31_re;
686  a2_re = +i02_re+i32_im;
687  a2_im = +i02_im-i32_re;
688  b0_re = +i10_re+i20_im;
689  b0_im = +i10_im-i20_re;
690  b1_re = +i11_re+i21_im;
691  b1_im = +i11_im-i21_re;
692  b2_re = +i12_re+i22_im;
693  b2_im = +i12_im-i22_re;
694 
695 #ifdef MULTI_GPU
696  } else {
697 
698  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
699 
700  // read half spinor from device memory
// ghost data arrives pre-projected; copy straight into the half spinors
701  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
702 
703  a0_re = i00_re; a0_im = i00_im;
704  a1_re = i01_re; a1_im = i01_im;
705  a2_re = i02_re; a2_im = i02_im;
706  b0_re = i10_re; b0_im = i10_im;
707  b1_re = i11_re; b1_im = i11_im;
708  b2_re = i12_re; b2_im = i12_im;
709 
710  }
711 #endif // MULTI_GPU
712 
713  // read gauge matrix from device memory
714  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
715 
716  // reconstruct gauge matrix
718 
// A = U^dagger * a, B = U^dagger * b (gT macros supply the conjugate)
719  // multiply row 0
720  spinorFloat A0_re = 0;
721  A0_re += gT00_re * a0_re;
722  A0_re -= gT00_im * a0_im;
723  A0_re += gT01_re * a1_re;
724  A0_re -= gT01_im * a1_im;
725  A0_re += gT02_re * a2_re;
726  A0_re -= gT02_im * a2_im;
727  spinorFloat A0_im = 0;
728  A0_im += gT00_re * a0_im;
729  A0_im += gT00_im * a0_re;
730  A0_im += gT01_re * a1_im;
731  A0_im += gT01_im * a1_re;
732  A0_im += gT02_re * a2_im;
733  A0_im += gT02_im * a2_re;
734  spinorFloat B0_re = 0;
735  B0_re += gT00_re * b0_re;
736  B0_re -= gT00_im * b0_im;
737  B0_re += gT01_re * b1_re;
738  B0_re -= gT01_im * b1_im;
739  B0_re += gT02_re * b2_re;
740  B0_re -= gT02_im * b2_im;
741  spinorFloat B0_im = 0;
742  B0_im += gT00_re * b0_im;
743  B0_im += gT00_im * b0_re;
744  B0_im += gT01_re * b1_im;
745  B0_im += gT01_im * b1_re;
746  B0_im += gT02_re * b2_im;
747  B0_im += gT02_im * b2_re;
748 
749  // multiply row 1
750  spinorFloat A1_re = 0;
751  A1_re += gT10_re * a0_re;
752  A1_re -= gT10_im * a0_im;
753  A1_re += gT11_re * a1_re;
754  A1_re -= gT11_im * a1_im;
755  A1_re += gT12_re * a2_re;
756  A1_re -= gT12_im * a2_im;
757  spinorFloat A1_im = 0;
758  A1_im += gT10_re * a0_im;
759  A1_im += gT10_im * a0_re;
760  A1_im += gT11_re * a1_im;
761  A1_im += gT11_im * a1_re;
762  A1_im += gT12_re * a2_im;
763  A1_im += gT12_im * a2_re;
764  spinorFloat B1_re = 0;
765  B1_re += gT10_re * b0_re;
766  B1_re -= gT10_im * b0_im;
767  B1_re += gT11_re * b1_re;
768  B1_re -= gT11_im * b1_im;
769  B1_re += gT12_re * b2_re;
770  B1_re -= gT12_im * b2_im;
771  spinorFloat B1_im = 0;
772  B1_im += gT10_re * b0_im;
773  B1_im += gT10_im * b0_re;
774  B1_im += gT11_re * b1_im;
775  B1_im += gT11_im * b1_re;
776  B1_im += gT12_re * b2_im;
777  B1_im += gT12_im * b2_re;
778 
779  // multiply row 2
780  spinorFloat A2_re = 0;
781  A2_re += gT20_re * a0_re;
782  A2_re -= gT20_im * a0_im;
783  A2_re += gT21_re * a1_re;
784  A2_re -= gT21_im * a1_im;
785  A2_re += gT22_re * a2_re;
786  A2_re -= gT22_im * a2_im;
787  spinorFloat A2_im = 0;
788  A2_im += gT20_re * a0_im;
789  A2_im += gT20_im * a0_re;
790  A2_im += gT21_re * a1_im;
791  A2_im += gT21_im * a1_re;
792  A2_im += gT22_re * a2_im;
793  A2_im += gT22_im * a2_re;
794  spinorFloat B2_re = 0;
795  B2_re += gT20_re * b0_re;
796  B2_re -= gT20_im * b0_im;
797  B2_re += gT21_re * b1_re;
798  B2_re -= gT21_im * b1_im;
799  B2_re += gT22_re * b2_re;
800  B2_re -= gT22_im * b2_im;
801  spinorFloat B2_im = 0;
802  B2_im += gT20_re * b0_im;
803  B2_im += gT20_im * b0_re;
804  B2_im += gT21_re * b1_im;
805  B2_im += gT21_im * b1_re;
806  B2_im += gT22_re * b2_im;
807  B2_im += gT22_im * b2_re;
808 
// accumulate with the P0- sign pattern (opposite imaginary signs to P0+)
809  o00_re += A0_re;
810  o00_im += A0_im;
811  o10_re += B0_re;
812  o10_im += B0_im;
813  o20_re -= B0_im;
814  o20_im += B0_re;
815  o30_re -= A0_im;
816  o30_im += A0_re;
817 
818  o01_re += A1_re;
819  o01_im += A1_im;
820  o11_re += B1_re;
821  o11_im += B1_im;
822  o21_re -= B1_im;
823  o21_im += B1_re;
824  o31_re -= A1_im;
825  o31_im += A1_re;
826 
827  o02_re += A2_re;
828  o02_im += A2_im;
829  o12_re += B2_re;
830  o12_im += B2_im;
831  o22_re -= B2_im;
832  o22_im += B2_re;
833  o32_re -= A2_im;
834  o32_im += A2_re;
835 
836 }
837 
// +Y-direction hop: read the spinor at y+1 (or the ghost face), project with
// P1+, multiply by the link stored at this site (ga_idx = sid), accumulate.
// NOTE(review): original lines 861-866 (spinorFloat a0..b2 declarations) and
// 911 (RECONSTRUCT_GAUGE_MATRIX) are missing from this extract -- numbering
// gaps below; present in the generated original. TODO confirm.
838 #ifdef MULTI_GPU
839 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
840  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
841 #endif
842 {
843  // Projector P1+
844  // 1 0 0 1
845  // 0 1 -1 0
846  // 0 -1 1 0
847  // 1 0 0 1
848 
849 #ifdef MULTI_GPU
850  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
851  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
852 #if (DD_PREC==2) // half precision
853  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
854 #endif
855 #else
856  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
857 #endif
858 
859  const int ga_idx = sid;
860 
867 
868 #ifdef MULTI_GPU
869  if (kernel_type == INTERIOR_KERNEL) {
870 #endif
871 
872  // read spinor from device memory
873  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
874 
875  // project spinor into half spinors
876  a0_re = +i00_re+i30_re;
877  a0_im = +i00_im+i30_im;
878  a1_re = +i01_re+i31_re;
879  a1_im = +i01_im+i31_im;
880  a2_re = +i02_re+i32_re;
881  a2_im = +i02_im+i32_im;
882  b0_re = +i10_re-i20_re;
883  b0_im = +i10_im-i20_im;
884  b1_re = +i11_re-i21_re;
885  b1_im = +i11_im-i21_im;
886  b2_re = +i12_re-i22_re;
887  b2_im = +i12_im-i22_im;
888 
889 #ifdef MULTI_GPU
890  } else {
891 
892  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
893 
894  // read half spinor from device memory
// ghost data arrives pre-projected; copy straight into the half spinors
895  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
896 
897  a0_re = i00_re; a0_im = i00_im;
898  a1_re = i01_re; a1_im = i01_im;
899  a2_re = i02_re; a2_im = i02_im;
900  b0_re = i10_re; b0_im = i10_im;
901  b1_re = i11_re; b1_im = i11_im;
902  b2_re = i12_re; b2_im = i12_im;
903 
904  }
905 #endif // MULTI_GPU
906 
907  // read gauge matrix from device memory
908  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
909 
910  // reconstruct gauge matrix
912 
// A = U*a, B = U*b
913  // multiply row 0
914  spinorFloat A0_re = 0;
915  A0_re += g00_re * a0_re;
916  A0_re -= g00_im * a0_im;
917  A0_re += g01_re * a1_re;
918  A0_re -= g01_im * a1_im;
919  A0_re += g02_re * a2_re;
920  A0_re -= g02_im * a2_im;
921  spinorFloat A0_im = 0;
922  A0_im += g00_re * a0_im;
923  A0_im += g00_im * a0_re;
924  A0_im += g01_re * a1_im;
925  A0_im += g01_im * a1_re;
926  A0_im += g02_re * a2_im;
927  A0_im += g02_im * a2_re;
928  spinorFloat B0_re = 0;
929  B0_re += g00_re * b0_re;
930  B0_re -= g00_im * b0_im;
931  B0_re += g01_re * b1_re;
932  B0_re -= g01_im * b1_im;
933  B0_re += g02_re * b2_re;
934  B0_re -= g02_im * b2_im;
935  spinorFloat B0_im = 0;
936  B0_im += g00_re * b0_im;
937  B0_im += g00_im * b0_re;
938  B0_im += g01_re * b1_im;
939  B0_im += g01_im * b1_re;
940  B0_im += g02_re * b2_im;
941  B0_im += g02_im * b2_re;
942 
943  // multiply row 1
944  spinorFloat A1_re = 0;
945  A1_re += g10_re * a0_re;
946  A1_re -= g10_im * a0_im;
947  A1_re += g11_re * a1_re;
948  A1_re -= g11_im * a1_im;
949  A1_re += g12_re * a2_re;
950  A1_re -= g12_im * a2_im;
951  spinorFloat A1_im = 0;
952  A1_im += g10_re * a0_im;
953  A1_im += g10_im * a0_re;
954  A1_im += g11_re * a1_im;
955  A1_im += g11_im * a1_re;
956  A1_im += g12_re * a2_im;
957  A1_im += g12_im * a2_re;
958  spinorFloat B1_re = 0;
959  B1_re += g10_re * b0_re;
960  B1_re -= g10_im * b0_im;
961  B1_re += g11_re * b1_re;
962  B1_re -= g11_im * b1_im;
963  B1_re += g12_re * b2_re;
964  B1_re -= g12_im * b2_im;
965  spinorFloat B1_im = 0;
966  B1_im += g10_re * b0_im;
967  B1_im += g10_im * b0_re;
968  B1_im += g11_re * b1_im;
969  B1_im += g11_im * b1_re;
970  B1_im += g12_re * b2_im;
971  B1_im += g12_im * b2_re;
972 
973  // multiply row 2
974  spinorFloat A2_re = 0;
975  A2_re += g20_re * a0_re;
976  A2_re -= g20_im * a0_im;
977  A2_re += g21_re * a1_re;
978  A2_re -= g21_im * a1_im;
979  A2_re += g22_re * a2_re;
980  A2_re -= g22_im * a2_im;
981  spinorFloat A2_im = 0;
982  A2_im += g20_re * a0_im;
983  A2_im += g20_im * a0_re;
984  A2_im += g21_re * a1_im;
985  A2_im += g21_im * a1_re;
986  A2_im += g22_re * a2_im;
987  A2_im += g22_im * a2_re;
988  spinorFloat B2_re = 0;
989  B2_re += g20_re * b0_re;
990  B2_re -= g20_im * b0_im;
991  B2_re += g21_re * b1_re;
992  B2_re -= g21_im * b1_im;
993  B2_re += g22_re * b2_re;
994  B2_re -= g22_im * b2_im;
995  spinorFloat B2_im = 0;
996  B2_im += g20_re * b0_im;
997  B2_im += g20_im * b0_re;
998  B2_im += g21_re * b1_im;
999  B2_im += g21_im * b1_re;
1000  B2_im += g22_re * b2_im;
1001  B2_im += g22_im * b2_re;
1002 
// accumulate with the real-valued P1+ pattern (no imaginary cross terms)
1003  o00_re += A0_re;
1004  o00_im += A0_im;
1005  o10_re += B0_re;
1006  o10_im += B0_im;
1007  o20_re -= B0_re;
1008  o20_im -= B0_im;
1009  o30_re += A0_re;
1010  o30_im += A0_im;
1011 
1012  o01_re += A1_re;
1013  o01_im += A1_im;
1014  o11_re += B1_re;
1015  o11_im += B1_im;
1016  o21_re -= B1_re;
1017  o21_im -= B1_im;
1018  o31_re += A1_re;
1019  o31_im += A1_im;
1020 
1021  o02_re += A2_re;
1022  o02_im += A2_im;
1023  o12_re += B2_re;
1024  o12_im += B2_im;
1025  o22_re -= B2_re;
1026  o22_im -= B2_im;
1027  o32_re += A2_re;
1028  o32_im += A2_im;
1029 
1030 }
1031 
// -Y-direction hop: read the spinor at y-1 (or the ghost face), project with
// P1-, multiply by the Hermitian conjugate gT of the neighbour-site link
// (ga_idx = sp_idx in the interior; Vh+face_idx for ghosts), accumulate.
// NOTE(review): original lines 1059-1064 (spinorFloat a0..b2 declarations)
// and 1109 (RECONSTRUCT_GAUGE_MATRIX) are missing from this extract --
// numbering gaps below; present in the generated original. TODO confirm.
1032 #ifdef MULTI_GPU
1033 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
1034  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
1035 #endif
1036 {
1037  // Projector P1-
1038  // 1 0 0 -1
1039  // 0 1 1 0
1040  // 0 1 1 0
1041  // -1 0 0 1
1042 
1043 #ifdef MULTI_GPU
1044  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
1045  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1046 #if (DD_PREC==2) // half precision
1047  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1048 #endif
1049 #else
1050  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
1051 #endif
1052 
1053 #ifdef MULTI_GPU
1054  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1055 #else
1056  const int ga_idx = sp_idx;
1057 #endif
1058 
1065 
1066 #ifdef MULTI_GPU
1067  if (kernel_type == INTERIOR_KERNEL) {
1068 #endif
1069 
1070  // read spinor from device memory
1071  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1072 
1073  // project spinor into half spinors
1074  a0_re = +i00_re-i30_re;
1075  a0_im = +i00_im-i30_im;
1076  a1_re = +i01_re-i31_re;
1077  a1_im = +i01_im-i31_im;
1078  a2_re = +i02_re-i32_re;
1079  a2_im = +i02_im-i32_im;
1080  b0_re = +i10_re+i20_re;
1081  b0_im = +i10_im+i20_im;
1082  b1_re = +i11_re+i21_re;
1083  b1_im = +i11_im+i21_im;
1084  b2_re = +i12_re+i22_re;
1085  b2_im = +i12_im+i22_im;
1086 
1087 #ifdef MULTI_GPU
1088  } else {
1089 
1090  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1091 
1092  // read half spinor from device memory
// ghost data arrives pre-projected; copy straight into the half spinors
1093  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
1094 
1095  a0_re = i00_re; a0_im = i00_im;
1096  a1_re = i01_re; a1_im = i01_im;
1097  a2_re = i02_re; a2_im = i02_im;
1098  b0_re = i10_re; b0_im = i10_im;
1099  b1_re = i11_re; b1_im = i11_im;
1100  b2_re = i12_re; b2_im = i12_im;
1101 
1102  }
1103 #endif // MULTI_GPU
1104 
1105  // read gauge matrix from device memory
1106  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
1107 
1108  // reconstruct gauge matrix
1110 
// A = U^dagger * a, B = U^dagger * b (gT macros supply the conjugate)
1111  // multiply row 0
1112  spinorFloat A0_re = 0;
1113  A0_re += gT00_re * a0_re;
1114  A0_re -= gT00_im * a0_im;
1115  A0_re += gT01_re * a1_re;
1116  A0_re -= gT01_im * a1_im;
1117  A0_re += gT02_re * a2_re;
1118  A0_re -= gT02_im * a2_im;
1119  spinorFloat A0_im = 0;
1120  A0_im += gT00_re * a0_im;
1121  A0_im += gT00_im * a0_re;
1122  A0_im += gT01_re * a1_im;
1123  A0_im += gT01_im * a1_re;
1124  A0_im += gT02_re * a2_im;
1125  A0_im += gT02_im * a2_re;
1126  spinorFloat B0_re = 0;
1127  B0_re += gT00_re * b0_re;
1128  B0_re -= gT00_im * b0_im;
1129  B0_re += gT01_re * b1_re;
1130  B0_re -= gT01_im * b1_im;
1131  B0_re += gT02_re * b2_re;
1132  B0_re -= gT02_im * b2_im;
1133  spinorFloat B0_im = 0;
1134  B0_im += gT00_re * b0_im;
1135  B0_im += gT00_im * b0_re;
1136  B0_im += gT01_re * b1_im;
1137  B0_im += gT01_im * b1_re;
1138  B0_im += gT02_re * b2_im;
1139  B0_im += gT02_im * b2_re;
1140 
1141  // multiply row 1
1142  spinorFloat A1_re = 0;
1143  A1_re += gT10_re * a0_re;
1144  A1_re -= gT10_im * a0_im;
1145  A1_re += gT11_re * a1_re;
1146  A1_re -= gT11_im * a1_im;
1147  A1_re += gT12_re * a2_re;
1148  A1_re -= gT12_im * a2_im;
1149  spinorFloat A1_im = 0;
1150  A1_im += gT10_re * a0_im;
1151  A1_im += gT10_im * a0_re;
1152  A1_im += gT11_re * a1_im;
1153  A1_im += gT11_im * a1_re;
1154  A1_im += gT12_re * a2_im;
1155  A1_im += gT12_im * a2_re;
1156  spinorFloat B1_re = 0;
1157  B1_re += gT10_re * b0_re;
1158  B1_re -= gT10_im * b0_im;
1159  B1_re += gT11_re * b1_re;
1160  B1_re -= gT11_im * b1_im;
1161  B1_re += gT12_re * b2_re;
1162  B1_re -= gT12_im * b2_im;
1163  spinorFloat B1_im = 0;
1164  B1_im += gT10_re * b0_im;
1165  B1_im += gT10_im * b0_re;
1166  B1_im += gT11_re * b1_im;
1167  B1_im += gT11_im * b1_re;
1168  B1_im += gT12_re * b2_im;
1169  B1_im += gT12_im * b2_re;
1170 
1171  // multiply row 2
1172  spinorFloat A2_re = 0;
1173  A2_re += gT20_re * a0_re;
1174  A2_re -= gT20_im * a0_im;
1175  A2_re += gT21_re * a1_re;
1176  A2_re -= gT21_im * a1_im;
1177  A2_re += gT22_re * a2_re;
1178  A2_re -= gT22_im * a2_im;
1179  spinorFloat A2_im = 0;
1180  A2_im += gT20_re * a0_im;
1181  A2_im += gT20_im * a0_re;
1182  A2_im += gT21_re * a1_im;
1183  A2_im += gT21_im * a1_re;
1184  A2_im += gT22_re * a2_im;
1185  A2_im += gT22_im * a2_re;
1186  spinorFloat B2_re = 0;
1187  B2_re += gT20_re * b0_re;
1188  B2_re -= gT20_im * b0_im;
1189  B2_re += gT21_re * b1_re;
1190  B2_re -= gT21_im * b1_im;
1191  B2_re += gT22_re * b2_re;
1192  B2_re -= gT22_im * b2_im;
1193  spinorFloat B2_im = 0;
1194  B2_im += gT20_re * b0_im;
1195  B2_im += gT20_im * b0_re;
1196  B2_im += gT21_re * b1_im;
1197  B2_im += gT21_im * b1_re;
1198  B2_im += gT22_re * b2_im;
1199  B2_im += gT22_im * b2_re;
1200 
// accumulate with the P1- pattern (signs opposite to P1+)
1201  o00_re += A0_re;
1202  o00_im += A0_im;
1203  o10_re += B0_re;
1204  o10_im += B0_im;
1205  o20_re += B0_re;
1206  o20_im += B0_im;
1207  o30_re -= A0_re;
1208  o30_im -= A0_im;
1209 
1210  o01_re += A1_re;
1211  o01_im += A1_im;
1212  o11_re += B1_re;
1213  o11_im += B1_im;
1214  o21_re += B1_re;
1215  o21_im += B1_im;
1216  o31_re -= A1_re;
1217  o31_im -= A1_im;
1218 
1219  o02_re += A2_re;
1220  o02_im += A2_im;
1221  o12_re += B2_re;
1222  o12_im += B2_im;
1223  o22_re += B2_re;
1224  o22_im += B2_im;
1225  o32_re -= A2_re;
1226  o32_im -= A2_im;
1227 
1228 }
1229 
1230 #ifdef MULTI_GPU
1231 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1232  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1233 #endif
1234 {
1235  // Projector P2+
1236  // 1 0 i 0
1237  // 0 1 0 -i
1238  // -i 0 1 0
1239  // 0 i 0 1
1240 
1241 #ifdef MULTI_GPU
1242  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1243  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1244 #if (DD_PREC==2) // half precision
1245  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1246 #endif
1247 #else
1248  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1249 #endif
1250 
1251  const int ga_idx = sid;
1252 
1259 
1260 #ifdef MULTI_GPU
1261  if (kernel_type == INTERIOR_KERNEL) {
1262 #endif
1263 
1264  // read spinor from device memory
1265  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1266 
1267  // project spinor into half spinors
1268  a0_re = +i00_re-i20_im;
1269  a0_im = +i00_im+i20_re;
1270  a1_re = +i01_re-i21_im;
1271  a1_im = +i01_im+i21_re;
1272  a2_re = +i02_re-i22_im;
1273  a2_im = +i02_im+i22_re;
1274  b0_re = +i10_re+i30_im;
1275  b0_im = +i10_im-i30_re;
1276  b1_re = +i11_re+i31_im;
1277  b1_im = +i11_im-i31_re;
1278  b2_re = +i12_re+i32_im;
1279  b2_im = +i12_im-i32_re;
1280 
1281 #ifdef MULTI_GPU
1282  } else {
1283 
1284  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1285 
1286  // read half spinor from device memory
1287  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1288 
1289  a0_re = i00_re; a0_im = i00_im;
1290  a1_re = i01_re; a1_im = i01_im;
1291  a2_re = i02_re; a2_im = i02_im;
1292  b0_re = i10_re; b0_im = i10_im;
1293  b1_re = i11_re; b1_im = i11_im;
1294  b2_re = i12_re; b2_im = i12_im;
1295 
1296  }
1297 #endif // MULTI_GPU
1298 
1299  // read gauge matrix from device memory
1300  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1301 
1302  // reconstruct gauge matrix
1304 
1305  // multiply row 0
1306  spinorFloat A0_re = 0;
1307  A0_re += g00_re * a0_re;
1308  A0_re -= g00_im * a0_im;
1309  A0_re += g01_re * a1_re;
1310  A0_re -= g01_im * a1_im;
1311  A0_re += g02_re * a2_re;
1312  A0_re -= g02_im * a2_im;
1313  spinorFloat A0_im = 0;
1314  A0_im += g00_re * a0_im;
1315  A0_im += g00_im * a0_re;
1316  A0_im += g01_re * a1_im;
1317  A0_im += g01_im * a1_re;
1318  A0_im += g02_re * a2_im;
1319  A0_im += g02_im * a2_re;
1320  spinorFloat B0_re = 0;
1321  B0_re += g00_re * b0_re;
1322  B0_re -= g00_im * b0_im;
1323  B0_re += g01_re * b1_re;
1324  B0_re -= g01_im * b1_im;
1325  B0_re += g02_re * b2_re;
1326  B0_re -= g02_im * b2_im;
1327  spinorFloat B0_im = 0;
1328  B0_im += g00_re * b0_im;
1329  B0_im += g00_im * b0_re;
1330  B0_im += g01_re * b1_im;
1331  B0_im += g01_im * b1_re;
1332  B0_im += g02_re * b2_im;
1333  B0_im += g02_im * b2_re;
1334 
1335  // multiply row 1
1336  spinorFloat A1_re = 0;
1337  A1_re += g10_re * a0_re;
1338  A1_re -= g10_im * a0_im;
1339  A1_re += g11_re * a1_re;
1340  A1_re -= g11_im * a1_im;
1341  A1_re += g12_re * a2_re;
1342  A1_re -= g12_im * a2_im;
1343  spinorFloat A1_im = 0;
1344  A1_im += g10_re * a0_im;
1345  A1_im += g10_im * a0_re;
1346  A1_im += g11_re * a1_im;
1347  A1_im += g11_im * a1_re;
1348  A1_im += g12_re * a2_im;
1349  A1_im += g12_im * a2_re;
1350  spinorFloat B1_re = 0;
1351  B1_re += g10_re * b0_re;
1352  B1_re -= g10_im * b0_im;
1353  B1_re += g11_re * b1_re;
1354  B1_re -= g11_im * b1_im;
1355  B1_re += g12_re * b2_re;
1356  B1_re -= g12_im * b2_im;
1357  spinorFloat B1_im = 0;
1358  B1_im += g10_re * b0_im;
1359  B1_im += g10_im * b0_re;
1360  B1_im += g11_re * b1_im;
1361  B1_im += g11_im * b1_re;
1362  B1_im += g12_re * b2_im;
1363  B1_im += g12_im * b2_re;
1364 
1365  // multiply row 2
1366  spinorFloat A2_re = 0;
1367  A2_re += g20_re * a0_re;
1368  A2_re -= g20_im * a0_im;
1369  A2_re += g21_re * a1_re;
1370  A2_re -= g21_im * a1_im;
1371  A2_re += g22_re * a2_re;
1372  A2_re -= g22_im * a2_im;
1373  spinorFloat A2_im = 0;
1374  A2_im += g20_re * a0_im;
1375  A2_im += g20_im * a0_re;
1376  A2_im += g21_re * a1_im;
1377  A2_im += g21_im * a1_re;
1378  A2_im += g22_re * a2_im;
1379  A2_im += g22_im * a2_re;
1380  spinorFloat B2_re = 0;
1381  B2_re += g20_re * b0_re;
1382  B2_re -= g20_im * b0_im;
1383  B2_re += g21_re * b1_re;
1384  B2_re -= g21_im * b1_im;
1385  B2_re += g22_re * b2_re;
1386  B2_re -= g22_im * b2_im;
1387  spinorFloat B2_im = 0;
1388  B2_im += g20_re * b0_im;
1389  B2_im += g20_im * b0_re;
1390  B2_im += g21_re * b1_im;
1391  B2_im += g21_im * b1_re;
1392  B2_im += g22_re * b2_im;
1393  B2_im += g22_im * b2_re;
1394 
1395  o00_re += A0_re;
1396  o00_im += A0_im;
1397  o10_re += B0_re;
1398  o10_im += B0_im;
1399  o20_re += A0_im;
1400  o20_im -= A0_re;
1401  o30_re -= B0_im;
1402  o30_im += B0_re;
1403 
1404  o01_re += A1_re;
1405  o01_im += A1_im;
1406  o11_re += B1_re;
1407  o11_im += B1_im;
1408  o21_re += A1_im;
1409  o21_im -= A1_re;
1410  o31_re -= B1_im;
1411  o31_im += B1_re;
1412 
1413  o02_re += A2_re;
1414  o02_im += A2_im;
1415  o12_re += B2_re;
1416  o12_im += B2_im;
1417  o22_re += A2_im;
1418  o22_im -= A2_re;
1419  o32_re -= B2_im;
1420  o32_im += B2_re;
1421 
1422 }
1423 
1424 #ifdef MULTI_GPU
1425 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1426  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1427 #endif
1428 {
1429  // Projector P2-
1430  // 1 0 -i 0
1431  // 0 1 0 i
1432  // i 0 1 0
1433  // 0 -i 0 1
1434 
1435 #ifdef MULTI_GPU
1436  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1437  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1438 #if (DD_PREC==2) // half precision
1439  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1440 #endif
1441 #else
1442  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1443 #endif
1444 
1445 #ifdef MULTI_GPU
1446  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1447 #else
1448  const int ga_idx = sp_idx;
1449 #endif
1450 
1457 
1458 #ifdef MULTI_GPU
1459  if (kernel_type == INTERIOR_KERNEL) {
1460 #endif
1461 
1462  // read spinor from device memory
1463  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1464 
1465  // project spinor into half spinors
1466  a0_re = +i00_re+i20_im;
1467  a0_im = +i00_im-i20_re;
1468  a1_re = +i01_re+i21_im;
1469  a1_im = +i01_im-i21_re;
1470  a2_re = +i02_re+i22_im;
1471  a2_im = +i02_im-i22_re;
1472  b0_re = +i10_re-i30_im;
1473  b0_im = +i10_im+i30_re;
1474  b1_re = +i11_re-i31_im;
1475  b1_im = +i11_im+i31_re;
1476  b2_re = +i12_re-i32_im;
1477  b2_im = +i12_im+i32_re;
1478 
1479 #ifdef MULTI_GPU
1480  } else {
1481 
1482  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1483 
1484  // read half spinor from device memory
1485  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1486 
1487  a0_re = i00_re; a0_im = i00_im;
1488  a1_re = i01_re; a1_im = i01_im;
1489  a2_re = i02_re; a2_im = i02_im;
1490  b0_re = i10_re; b0_im = i10_im;
1491  b1_re = i11_re; b1_im = i11_im;
1492  b2_re = i12_re; b2_im = i12_im;
1493 
1494  }
1495 #endif // MULTI_GPU
1496 
1497  // read gauge matrix from device memory
1498  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1499 
1500  // reconstruct gauge matrix
1502 
1503  // multiply row 0
1504  spinorFloat A0_re = 0;
1505  A0_re += gT00_re * a0_re;
1506  A0_re -= gT00_im * a0_im;
1507  A0_re += gT01_re * a1_re;
1508  A0_re -= gT01_im * a1_im;
1509  A0_re += gT02_re * a2_re;
1510  A0_re -= gT02_im * a2_im;
1511  spinorFloat A0_im = 0;
1512  A0_im += gT00_re * a0_im;
1513  A0_im += gT00_im * a0_re;
1514  A0_im += gT01_re * a1_im;
1515  A0_im += gT01_im * a1_re;
1516  A0_im += gT02_re * a2_im;
1517  A0_im += gT02_im * a2_re;
1518  spinorFloat B0_re = 0;
1519  B0_re += gT00_re * b0_re;
1520  B0_re -= gT00_im * b0_im;
1521  B0_re += gT01_re * b1_re;
1522  B0_re -= gT01_im * b1_im;
1523  B0_re += gT02_re * b2_re;
1524  B0_re -= gT02_im * b2_im;
1525  spinorFloat B0_im = 0;
1526  B0_im += gT00_re * b0_im;
1527  B0_im += gT00_im * b0_re;
1528  B0_im += gT01_re * b1_im;
1529  B0_im += gT01_im * b1_re;
1530  B0_im += gT02_re * b2_im;
1531  B0_im += gT02_im * b2_re;
1532 
1533  // multiply row 1
1534  spinorFloat A1_re = 0;
1535  A1_re += gT10_re * a0_re;
1536  A1_re -= gT10_im * a0_im;
1537  A1_re += gT11_re * a1_re;
1538  A1_re -= gT11_im * a1_im;
1539  A1_re += gT12_re * a2_re;
1540  A1_re -= gT12_im * a2_im;
1541  spinorFloat A1_im = 0;
1542  A1_im += gT10_re * a0_im;
1543  A1_im += gT10_im * a0_re;
1544  A1_im += gT11_re * a1_im;
1545  A1_im += gT11_im * a1_re;
1546  A1_im += gT12_re * a2_im;
1547  A1_im += gT12_im * a2_re;
1548  spinorFloat B1_re = 0;
1549  B1_re += gT10_re * b0_re;
1550  B1_re -= gT10_im * b0_im;
1551  B1_re += gT11_re * b1_re;
1552  B1_re -= gT11_im * b1_im;
1553  B1_re += gT12_re * b2_re;
1554  B1_re -= gT12_im * b2_im;
1555  spinorFloat B1_im = 0;
1556  B1_im += gT10_re * b0_im;
1557  B1_im += gT10_im * b0_re;
1558  B1_im += gT11_re * b1_im;
1559  B1_im += gT11_im * b1_re;
1560  B1_im += gT12_re * b2_im;
1561  B1_im += gT12_im * b2_re;
1562 
1563  // multiply row 2
1564  spinorFloat A2_re = 0;
1565  A2_re += gT20_re * a0_re;
1566  A2_re -= gT20_im * a0_im;
1567  A2_re += gT21_re * a1_re;
1568  A2_re -= gT21_im * a1_im;
1569  A2_re += gT22_re * a2_re;
1570  A2_re -= gT22_im * a2_im;
1571  spinorFloat A2_im = 0;
1572  A2_im += gT20_re * a0_im;
1573  A2_im += gT20_im * a0_re;
1574  A2_im += gT21_re * a1_im;
1575  A2_im += gT21_im * a1_re;
1576  A2_im += gT22_re * a2_im;
1577  A2_im += gT22_im * a2_re;
1578  spinorFloat B2_re = 0;
1579  B2_re += gT20_re * b0_re;
1580  B2_re -= gT20_im * b0_im;
1581  B2_re += gT21_re * b1_re;
1582  B2_re -= gT21_im * b1_im;
1583  B2_re += gT22_re * b2_re;
1584  B2_re -= gT22_im * b2_im;
1585  spinorFloat B2_im = 0;
1586  B2_im += gT20_re * b0_im;
1587  B2_im += gT20_im * b0_re;
1588  B2_im += gT21_re * b1_im;
1589  B2_im += gT21_im * b1_re;
1590  B2_im += gT22_re * b2_im;
1591  B2_im += gT22_im * b2_re;
1592 
1593  o00_re += A0_re;
1594  o00_im += A0_im;
1595  o10_re += B0_re;
1596  o10_im += B0_im;
1597  o20_re -= A0_im;
1598  o20_im += A0_re;
1599  o30_re += B0_im;
1600  o30_im -= B0_re;
1601 
1602  o01_re += A1_re;
1603  o01_im += A1_im;
1604  o11_re += B1_re;
1605  o11_im += B1_im;
1606  o21_re -= A1_im;
1607  o21_im += A1_re;
1608  o31_re += B1_im;
1609  o31_im -= B1_re;
1610 
1611  o02_re += A2_re;
1612  o02_im += A2_im;
1613  o12_re += B2_re;
1614  o12_im += B2_im;
1615  o22_re -= A2_im;
1616  o22_im += A2_re;
1617  o32_re += B2_im;
1618  o32_im -= B2_re;
1619 
1620 }
1621 
1622 #ifdef MULTI_GPU
1623 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1624  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1625 #endif
1626 {
1627  // Projector P3+
1628  // 2 0 0 0
1629  // 0 2 0 0
1630  // 0 0 0 0
1631  // 0 0 0 0
1632 
1633 #ifdef MULTI_GPU
1634  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1635  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1636 #if (DD_PREC==2) // half precision
1637  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1638 #endif
1639 #else
1640  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1641 #endif
1642 
1643  const int ga_idx = sid;
1644 
1645  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1646  {
1653 
1654 #ifdef MULTI_GPU
1655  if (kernel_type == INTERIOR_KERNEL) {
1656 #endif
1657 
1658  // read spinor from device memory
1659  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1660 
1661  // project spinor into half spinors
1662  a0_re = +2*i00_re;
1663  a0_im = +2*i00_im;
1664  a1_re = +2*i01_re;
1665  a1_im = +2*i01_im;
1666  a2_re = +2*i02_re;
1667  a2_im = +2*i02_im;
1668  b0_re = +2*i10_re;
1669  b0_im = +2*i10_im;
1670  b1_re = +2*i11_re;
1671  b1_im = +2*i11_im;
1672  b2_re = +2*i12_re;
1673  b2_im = +2*i12_im;
1674 
1675 #ifdef MULTI_GPU
1676  } else {
1677 
1678  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1679  const int t_proj_scale = TPROJSCALE;
1680 
1681  // read half spinor from device memory
1682  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1683 
1684  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1685  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1686  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1687  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1688  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1689  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1690 
1691  }
1692 #endif // MULTI_GPU
1693 
1694  // identity gauge matrix
1701 
1702  o00_re += A0_re;
1703  o00_im += A0_im;
1704  o10_re += B0_re;
1705  o10_im += B0_im;
1706 
1707  o01_re += A1_re;
1708  o01_im += A1_im;
1709  o11_re += B1_re;
1710  o11_im += B1_im;
1711 
1712  o02_re += A2_re;
1713  o02_im += A2_im;
1714  o12_re += B2_re;
1715  o12_im += B2_im;
1716 
1717  } else {
1724 
1725 #ifdef MULTI_GPU
1726  if (kernel_type == INTERIOR_KERNEL) {
1727 #endif
1728 
1729  // read spinor from device memory
1730  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1731 
1732  // project spinor into half spinors
1733  a0_re = +2*i00_re;
1734  a0_im = +2*i00_im;
1735  a1_re = +2*i01_re;
1736  a1_im = +2*i01_im;
1737  a2_re = +2*i02_re;
1738  a2_im = +2*i02_im;
1739  b0_re = +2*i10_re;
1740  b0_im = +2*i10_im;
1741  b1_re = +2*i11_re;
1742  b1_im = +2*i11_im;
1743  b2_re = +2*i12_re;
1744  b2_im = +2*i12_im;
1745 
1746 #ifdef MULTI_GPU
1747  } else {
1748 
1749  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1750  const int t_proj_scale = TPROJSCALE;
1751 
1752  // read half spinor from device memory
1753  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1754 
1755  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1756  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1757  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1758  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1759  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1760  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1761 
1762  }
1763 #endif // MULTI_GPU
1764 
1765  // read gauge matrix from device memory
1766  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1767 
1768  // reconstruct gauge matrix
1770 
1771  // multiply row 0
1772  spinorFloat A0_re = 0;
1773  A0_re += g00_re * a0_re;
1774  A0_re -= g00_im * a0_im;
1775  A0_re += g01_re * a1_re;
1776  A0_re -= g01_im * a1_im;
1777  A0_re += g02_re * a2_re;
1778  A0_re -= g02_im * a2_im;
1779  spinorFloat A0_im = 0;
1780  A0_im += g00_re * a0_im;
1781  A0_im += g00_im * a0_re;
1782  A0_im += g01_re * a1_im;
1783  A0_im += g01_im * a1_re;
1784  A0_im += g02_re * a2_im;
1785  A0_im += g02_im * a2_re;
1786  spinorFloat B0_re = 0;
1787  B0_re += g00_re * b0_re;
1788  B0_re -= g00_im * b0_im;
1789  B0_re += g01_re * b1_re;
1790  B0_re -= g01_im * b1_im;
1791  B0_re += g02_re * b2_re;
1792  B0_re -= g02_im * b2_im;
1793  spinorFloat B0_im = 0;
1794  B0_im += g00_re * b0_im;
1795  B0_im += g00_im * b0_re;
1796  B0_im += g01_re * b1_im;
1797  B0_im += g01_im * b1_re;
1798  B0_im += g02_re * b2_im;
1799  B0_im += g02_im * b2_re;
1800 
1801  // multiply row 1
1802  spinorFloat A1_re = 0;
1803  A1_re += g10_re * a0_re;
1804  A1_re -= g10_im * a0_im;
1805  A1_re += g11_re * a1_re;
1806  A1_re -= g11_im * a1_im;
1807  A1_re += g12_re * a2_re;
1808  A1_re -= g12_im * a2_im;
1809  spinorFloat A1_im = 0;
1810  A1_im += g10_re * a0_im;
1811  A1_im += g10_im * a0_re;
1812  A1_im += g11_re * a1_im;
1813  A1_im += g11_im * a1_re;
1814  A1_im += g12_re * a2_im;
1815  A1_im += g12_im * a2_re;
1816  spinorFloat B1_re = 0;
1817  B1_re += g10_re * b0_re;
1818  B1_re -= g10_im * b0_im;
1819  B1_re += g11_re * b1_re;
1820  B1_re -= g11_im * b1_im;
1821  B1_re += g12_re * b2_re;
1822  B1_re -= g12_im * b2_im;
1823  spinorFloat B1_im = 0;
1824  B1_im += g10_re * b0_im;
1825  B1_im += g10_im * b0_re;
1826  B1_im += g11_re * b1_im;
1827  B1_im += g11_im * b1_re;
1828  B1_im += g12_re * b2_im;
1829  B1_im += g12_im * b2_re;
1830 
1831  // multiply row 2
1832  spinorFloat A2_re = 0;
1833  A2_re += g20_re * a0_re;
1834  A2_re -= g20_im * a0_im;
1835  A2_re += g21_re * a1_re;
1836  A2_re -= g21_im * a1_im;
1837  A2_re += g22_re * a2_re;
1838  A2_re -= g22_im * a2_im;
1839  spinorFloat A2_im = 0;
1840  A2_im += g20_re * a0_im;
1841  A2_im += g20_im * a0_re;
1842  A2_im += g21_re * a1_im;
1843  A2_im += g21_im * a1_re;
1844  A2_im += g22_re * a2_im;
1845  A2_im += g22_im * a2_re;
1846  spinorFloat B2_re = 0;
1847  B2_re += g20_re * b0_re;
1848  B2_re -= g20_im * b0_im;
1849  B2_re += g21_re * b1_re;
1850  B2_re -= g21_im * b1_im;
1851  B2_re += g22_re * b2_re;
1852  B2_re -= g22_im * b2_im;
1853  spinorFloat B2_im = 0;
1854  B2_im += g20_re * b0_im;
1855  B2_im += g20_im * b0_re;
1856  B2_im += g21_re * b1_im;
1857  B2_im += g21_im * b1_re;
1858  B2_im += g22_re * b2_im;
1859  B2_im += g22_im * b2_re;
1860 
1861  o00_re += A0_re;
1862  o00_im += A0_im;
1863  o10_re += B0_re;
1864  o10_im += B0_im;
1865 
1866  o01_re += A1_re;
1867  o01_im += A1_im;
1868  o11_re += B1_re;
1869  o11_im += B1_im;
1870 
1871  o02_re += A2_re;
1872  o02_im += A2_im;
1873  o12_re += B2_re;
1874  o12_im += B2_im;
1875 
1876  }
1877 }
1878 
1879 #ifdef MULTI_GPU
1880 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1881  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1882 #endif
1883 {
1884  // Projector P3-
1885  // 0 0 0 0
1886  // 0 0 0 0
1887  // 0 0 2 0
1888  // 0 0 0 2
1889 
1890 #ifdef MULTI_GPU
1891  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1892  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1893 #if (DD_PREC==2) // half precision
1894  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1895 #endif
1896 #else
1897  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1898 #endif
1899 
1900 #ifdef MULTI_GPU
1901  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : param.dc.Vh+face_idx);
1902 #else
1903  const int ga_idx = sp_idx;
1904 #endif
1905 
1906  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1907  {
1914 
1915 #ifdef MULTI_GPU
1916  if (kernel_type == INTERIOR_KERNEL) {
1917 #endif
1918 
1919  // read spinor from device memory
1921 
1922  // project spinor into half spinors
1923  a0_re = +2*i20_re;
1924  a0_im = +2*i20_im;
1925  a1_re = +2*i21_re;
1926  a1_im = +2*i21_im;
1927  a2_re = +2*i22_re;
1928  a2_im = +2*i22_im;
1929  b0_re = +2*i30_re;
1930  b0_im = +2*i30_im;
1931  b1_re = +2*i31_re;
1932  b1_im = +2*i31_im;
1933  b2_re = +2*i32_re;
1934  b2_im = +2*i32_im;
1935 
1936 #ifdef MULTI_GPU
1937  } else {
1938 
1939  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
1940  const int t_proj_scale = TPROJSCALE;
1941 
1942  // read half spinor from device memory
1943  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1944 
1945  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1946  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1947  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1948  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1949  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1950  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1951 
1952  }
1953 #endif // MULTI_GPU
1954 
1955  // identity gauge matrix
1962 
1963  o20_re += A0_re;
1964  o20_im += A0_im;
1965  o30_re += B0_re;
1966  o30_im += B0_im;
1967 
1968  o21_re += A1_re;
1969  o21_im += A1_im;
1970  o31_re += B1_re;
1971  o31_im += B1_im;
1972 
1973  o22_re += A2_re;
1974  o22_im += A2_im;
1975  o32_re += B2_re;
1976  o32_im += B2_im;
1977 
1978  } else {
1985 
1986 #ifdef MULTI_GPU
1987  if (kernel_type == INTERIOR_KERNEL) {
1988 #endif
1989 
1990  // read spinor from device memory
1992 
1993  // project spinor into half spinors
1994  a0_re = +2*i20_re;
1995  a0_im = +2*i20_im;
1996  a1_re = +2*i21_re;
1997  a1_im = +2*i21_im;
1998  a2_re = +2*i22_re;
1999  a2_im = +2*i22_im;
2000  b0_re = +2*i30_re;
2001  b0_im = +2*i30_im;
2002  b1_re = +2*i31_re;
2003  b1_im = +2*i31_im;
2004  b2_re = +2*i32_re;
2005  b2_im = +2*i32_im;
2006 
2007 #ifdef MULTI_GPU
2008  } else {
2009 
2010  const int sp_stride_pad = param.dc.ghostFace[static_cast<int>(kernel_type)];
2011  const int t_proj_scale = TPROJSCALE;
2012 
2013  // read half spinor from device memory
2014  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
2015 
2016  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2017  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2018  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2019  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2020  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2021  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2022 
2023  }
2024 #endif // MULTI_GPU
2025 
2026  // read gauge matrix from device memory
2027  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
2028 
2029  // reconstruct gauge matrix
2031 
2032  // multiply row 0
2033  spinorFloat A0_re = 0;
2034  A0_re += gT00_re * a0_re;
2035  A0_re -= gT00_im * a0_im;
2036  A0_re += gT01_re * a1_re;
2037  A0_re -= gT01_im * a1_im;
2038  A0_re += gT02_re * a2_re;
2039  A0_re -= gT02_im * a2_im;
2040  spinorFloat A0_im = 0;
2041  A0_im += gT00_re * a0_im;
2042  A0_im += gT00_im * a0_re;
2043  A0_im += gT01_re * a1_im;
2044  A0_im += gT01_im * a1_re;
2045  A0_im += gT02_re * a2_im;
2046  A0_im += gT02_im * a2_re;
2047  spinorFloat B0_re = 0;
2048  B0_re += gT00_re * b0_re;
2049  B0_re -= gT00_im * b0_im;
2050  B0_re += gT01_re * b1_re;
2051  B0_re -= gT01_im * b1_im;
2052  B0_re += gT02_re * b2_re;
2053  B0_re -= gT02_im * b2_im;
2054  spinorFloat B0_im = 0;
2055  B0_im += gT00_re * b0_im;
2056  B0_im += gT00_im * b0_re;
2057  B0_im += gT01_re * b1_im;
2058  B0_im += gT01_im * b1_re;
2059  B0_im += gT02_re * b2_im;
2060  B0_im += gT02_im * b2_re;
2061 
2062  // multiply row 1
2063  spinorFloat A1_re = 0;
2064  A1_re += gT10_re * a0_re;
2065  A1_re -= gT10_im * a0_im;
2066  A1_re += gT11_re * a1_re;
2067  A1_re -= gT11_im * a1_im;
2068  A1_re += gT12_re * a2_re;
2069  A1_re -= gT12_im * a2_im;
2070  spinorFloat A1_im = 0;
2071  A1_im += gT10_re * a0_im;
2072  A1_im += gT10_im * a0_re;
2073  A1_im += gT11_re * a1_im;
2074  A1_im += gT11_im * a1_re;
2075  A1_im += gT12_re * a2_im;
2076  A1_im += gT12_im * a2_re;
2077  spinorFloat B1_re = 0;
2078  B1_re += gT10_re * b0_re;
2079  B1_re -= gT10_im * b0_im;
2080  B1_re += gT11_re * b1_re;
2081  B1_re -= gT11_im * b1_im;
2082  B1_re += gT12_re * b2_re;
2083  B1_re -= gT12_im * b2_im;
2084  spinorFloat B1_im = 0;
2085  B1_im += gT10_re * b0_im;
2086  B1_im += gT10_im * b0_re;
2087  B1_im += gT11_re * b1_im;
2088  B1_im += gT11_im * b1_re;
2089  B1_im += gT12_re * b2_im;
2090  B1_im += gT12_im * b2_re;
2091 
2092  // multiply row 2
2093  spinorFloat A2_re = 0;
2094  A2_re += gT20_re * a0_re;
2095  A2_re -= gT20_im * a0_im;
2096  A2_re += gT21_re * a1_re;
2097  A2_re -= gT21_im * a1_im;
2098  A2_re += gT22_re * a2_re;
2099  A2_re -= gT22_im * a2_im;
2100  spinorFloat A2_im = 0;
2101  A2_im += gT20_re * a0_im;
2102  A2_im += gT20_im * a0_re;
2103  A2_im += gT21_re * a1_im;
2104  A2_im += gT21_im * a1_re;
2105  A2_im += gT22_re * a2_im;
2106  A2_im += gT22_im * a2_re;
2107  spinorFloat B2_re = 0;
2108  B2_re += gT20_re * b0_re;
2109  B2_re -= gT20_im * b0_im;
2110  B2_re += gT21_re * b1_re;
2111  B2_re -= gT21_im * b1_im;
2112  B2_re += gT22_re * b2_re;
2113  B2_re -= gT22_im * b2_im;
2114  spinorFloat B2_im = 0;
2115  B2_im += gT20_re * b0_im;
2116  B2_im += gT20_im * b0_re;
2117  B2_im += gT21_re * b1_im;
2118  B2_im += gT21_im * b1_re;
2119  B2_im += gT22_re * b2_im;
2120  B2_im += gT22_im * b2_re;
2121 
2122  o20_re += A0_re;
2123  o20_im += A0_im;
2124  o30_re += B0_re;
2125  o30_im += B0_im;
2126 
2127  o21_re += A1_re;
2128  o21_im += A1_im;
2129  o31_re += B1_re;
2130  o31_im += B1_im;
2131 
2132  o22_re += A2_re;
2133  o22_im += A2_im;
2134  o32_re += B2_re;
2135  o32_im += B2_im;
2136 
2137  }
2138 }
2139 
2140 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2141 
2142 int incomplete = 0; // Have all 8 contributions been computed for this site?
2143 
2144 switch(kernel_type) { // intentional fall-through
2145 case INTERIOR_KERNEL:
2146  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2147 case EXTERIOR_KERNEL_T:
2148  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2149 case EXTERIOR_KERNEL_Z:
2150  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2151 case EXTERIOR_KERNEL_Y:
2152  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2153 }
2154 
2155 if (!incomplete)
2156 #endif // MULTI_GPU
2157 {
2158 #ifdef DSLASH_CLOVER
2159 
2160  // change to chiral basis
2161  {
2162  spinorFloat a00_re = -o10_re - o30_re;
2163  spinorFloat a00_im = -o10_im - o30_im;
2164  spinorFloat a10_re = o00_re + o20_re;
2165  spinorFloat a10_im = o00_im + o20_im;
2166  spinorFloat a20_re = -o10_re + o30_re;
2167  spinorFloat a20_im = -o10_im + o30_im;
2168  spinorFloat a30_re = o00_re - o20_re;
2169  spinorFloat a30_im = o00_im - o20_im;
2170 
2171  o00_re = a00_re; o00_im = a00_im;
2172  o10_re = a10_re; o10_im = a10_im;
2173  o20_re = a20_re; o20_im = a20_im;
2174  o30_re = a30_re; o30_im = a30_im;
2175  }
2176 
2177  {
2178  spinorFloat a01_re = -o11_re - o31_re;
2179  spinorFloat a01_im = -o11_im - o31_im;
2180  spinorFloat a11_re = o01_re + o21_re;
2181  spinorFloat a11_im = o01_im + o21_im;
2182  spinorFloat a21_re = -o11_re + o31_re;
2183  spinorFloat a21_im = -o11_im + o31_im;
2184  spinorFloat a31_re = o01_re - o21_re;
2185  spinorFloat a31_im = o01_im - o21_im;
2186 
2187  o01_re = a01_re; o01_im = a01_im;
2188  o11_re = a11_re; o11_im = a11_im;
2189  o21_re = a21_re; o21_im = a21_im;
2190  o31_re = a31_re; o31_im = a31_im;
2191  }
2192 
2193  {
2194  spinorFloat a02_re = -o12_re - o32_re;
2195  spinorFloat a02_im = -o12_im - o32_im;
2196  spinorFloat a12_re = o02_re + o22_re;
2197  spinorFloat a12_im = o02_im + o22_im;
2198  spinorFloat a22_re = -o12_re + o32_re;
2199  spinorFloat a22_im = -o12_im + o32_im;
2200  spinorFloat a32_re = o02_re - o22_re;
2201  spinorFloat a32_im = o02_im - o22_im;
2202 
2203  o02_re = a02_re; o02_im = a02_im;
2204  o12_re = a12_re; o12_im = a12_im;
2205  o22_re = a22_re; o22_im = a22_im;
2206  o32_re = a32_re; o32_im = a32_im;
2207  }
2208 
2209  // apply first chiral block
2210  {
2212 
2213  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
2214  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
2215  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
2216  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
2217  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
2218  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
2219 
2220  a00_re += c00_00_re * o00_re;
2221  a00_im += c00_00_re * o00_im;
2222  a00_re += c00_01_re * o01_re;
2223  a00_re -= c00_01_im * o01_im;
2224  a00_im += c00_01_re * o01_im;
2225  a00_im += c00_01_im * o01_re;
2226  a00_re += c00_02_re * o02_re;
2227  a00_re -= c00_02_im * o02_im;
2228  a00_im += c00_02_re * o02_im;
2229  a00_im += c00_02_im * o02_re;
2230  a00_re += c00_10_re * o10_re;
2231  a00_re -= c00_10_im * o10_im;
2232  a00_im += c00_10_re * o10_im;
2233  a00_im += c00_10_im * o10_re;
2234  a00_re += c00_11_re * o11_re;
2235  a00_re -= c00_11_im * o11_im;
2236  a00_im += c00_11_re * o11_im;
2237  a00_im += c00_11_im * o11_re;
2238  a00_re += c00_12_re * o12_re;
2239  a00_re -= c00_12_im * o12_im;
2240  a00_im += c00_12_re * o12_im;
2241  a00_im += c00_12_im * o12_re;
2242 
2243  a01_re += c01_00_re * o00_re;
2244  a01_re -= c01_00_im * o00_im;
2245  a01_im += c01_00_re * o00_im;
2246  a01_im += c01_00_im * o00_re;
2247  a01_re += c01_01_re * o01_re;
2248  a01_im += c01_01_re * o01_im;
2249  a01_re += c01_02_re * o02_re;
2250  a01_re -= c01_02_im * o02_im;
2251  a01_im += c01_02_re * o02_im;
2252  a01_im += c01_02_im * o02_re;
2253  a01_re += c01_10_re * o10_re;
2254  a01_re -= c01_10_im * o10_im;
2255  a01_im += c01_10_re * o10_im;
2256  a01_im += c01_10_im * o10_re;
2257  a01_re += c01_11_re * o11_re;
2258  a01_re -= c01_11_im * o11_im;
2259  a01_im += c01_11_re * o11_im;
2260  a01_im += c01_11_im * o11_re;
2261  a01_re += c01_12_re * o12_re;
2262  a01_re -= c01_12_im * o12_im;
2263  a01_im += c01_12_re * o12_im;
2264  a01_im += c01_12_im * o12_re;
2265 
2266  a02_re += c02_00_re * o00_re;
2267  a02_re -= c02_00_im * o00_im;
2268  a02_im += c02_00_re * o00_im;
2269  a02_im += c02_00_im * o00_re;
2270  a02_re += c02_01_re * o01_re;
2271  a02_re -= c02_01_im * o01_im;
2272  a02_im += c02_01_re * o01_im;
2273  a02_im += c02_01_im * o01_re;
2274  a02_re += c02_02_re * o02_re;
2275  a02_im += c02_02_re * o02_im;
2276  a02_re += c02_10_re * o10_re;
2277  a02_re -= c02_10_im * o10_im;
2278  a02_im += c02_10_re * o10_im;
2279  a02_im += c02_10_im * o10_re;
2280  a02_re += c02_11_re * o11_re;
2281  a02_re -= c02_11_im * o11_im;
2282  a02_im += c02_11_re * o11_im;
2283  a02_im += c02_11_im * o11_re;
2284  a02_re += c02_12_re * o12_re;
2285  a02_re -= c02_12_im * o12_im;
2286  a02_im += c02_12_re * o12_im;
2287  a02_im += c02_12_im * o12_re;
2288 
2289  a10_re += c10_00_re * o00_re;
2290  a10_re -= c10_00_im * o00_im;
2291  a10_im += c10_00_re * o00_im;
2292  a10_im += c10_00_im * o00_re;
2293  a10_re += c10_01_re * o01_re;
2294  a10_re -= c10_01_im * o01_im;
2295  a10_im += c10_01_re * o01_im;
2296  a10_im += c10_01_im * o01_re;
2297  a10_re += c10_02_re * o02_re;
2298  a10_re -= c10_02_im * o02_im;
2299  a10_im += c10_02_re * o02_im;
2300  a10_im += c10_02_im * o02_re;
2301  a10_re += c10_10_re * o10_re;
2302  a10_im += c10_10_re * o10_im;
2303  a10_re += c10_11_re * o11_re;
2304  a10_re -= c10_11_im * o11_im;
2305  a10_im += c10_11_re * o11_im;
2306  a10_im += c10_11_im * o11_re;
2307  a10_re += c10_12_re * o12_re;
2308  a10_re -= c10_12_im * o12_im;
2309  a10_im += c10_12_re * o12_im;
2310  a10_im += c10_12_im * o12_re;
2311 
2312  a11_re += c11_00_re * o00_re;
2313  a11_re -= c11_00_im * o00_im;
2314  a11_im += c11_00_re * o00_im;
2315  a11_im += c11_00_im * o00_re;
2316  a11_re += c11_01_re * o01_re;
2317  a11_re -= c11_01_im * o01_im;
2318  a11_im += c11_01_re * o01_im;
2319  a11_im += c11_01_im * o01_re;
2320  a11_re += c11_02_re * o02_re;
2321  a11_re -= c11_02_im * o02_im;
2322  a11_im += c11_02_re * o02_im;
2323  a11_im += c11_02_im * o02_re;
2324  a11_re += c11_10_re * o10_re;
2325  a11_re -= c11_10_im * o10_im;
2326  a11_im += c11_10_re * o10_im;
2327  a11_im += c11_10_im * o10_re;
2328  a11_re += c11_11_re * o11_re;
2329  a11_im += c11_11_re * o11_im;
2330  a11_re += c11_12_re * o12_re;
2331  a11_re -= c11_12_im * o12_im;
2332  a11_im += c11_12_re * o12_im;
2333  a11_im += c11_12_im * o12_re;
2334 
2335  a12_re += c12_00_re * o00_re;
2336  a12_re -= c12_00_im * o00_im;
2337  a12_im += c12_00_re * o00_im;
2338  a12_im += c12_00_im * o00_re;
2339  a12_re += c12_01_re * o01_re;
2340  a12_re -= c12_01_im * o01_im;
2341  a12_im += c12_01_re * o01_im;
2342  a12_im += c12_01_im * o01_re;
2343  a12_re += c12_02_re * o02_re;
2344  a12_re -= c12_02_im * o02_im;
2345  a12_im += c12_02_re * o02_im;
2346  a12_im += c12_02_im * o02_re;
2347  a12_re += c12_10_re * o10_re;
2348  a12_re -= c12_10_im * o10_im;
2349  a12_im += c12_10_re * o10_im;
2350  a12_im += c12_10_im * o10_re;
2351  a12_re += c12_11_re * o11_re;
2352  a12_re -= c12_11_im * o11_im;
2353  a12_im += c12_11_re * o11_im;
2354  a12_im += c12_11_im * o11_re;
2355  a12_re += c12_12_re * o12_re;
2356  a12_im += c12_12_re * o12_im;
2357 
2358  o00_re = a00_re; o00_im = a00_im;
2359  o01_re = a01_re; o01_im = a01_im;
2360  o02_re = a02_re; o02_im = a02_im;
2361  o10_re = a10_re; o10_im = a10_im;
2362  o11_re = a11_re; o11_im = a11_im;
2363  o12_re = a12_re; o12_im = a12_im;
2364 
2365  }
2366 
2367  // apply second chiral block
2368  {
// Multiply the lower chirality components of the output spinor
// (spin indices 2,3 x color indices 0,1,2 -> o20..o32) by the
// coefficients c{sc}_{s'c'} of the second clover sub-matrix,
// accumulating each complex product term by term into a2x/a3x.
//
// NOTE(review): the diagonal coefficients (c20_20, c21_21, c22_22,
// c30_30, c31_31, c32_32) contribute only through their _re parts --
// no _im diagonal products appear below -- consistent with a Hermitian
// clover block whose diagonal entries are real (confirm against the
// clover-field packing used by READ_CLOVER).
2370 
// Per-component accumulators for the matrix-vector product; the o
// variables cannot be overwritten in place while they are still being
// read by later rows.
2371  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
2372  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
2373  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
2374  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2375  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2376  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2377 
// Row (2,0): a20 = sum over (s',c') of c20_{s'c'} * o{s'c'}
// (complex multiply expanded into real/imag fused multiply-adds).
2378  a20_re += c20_20_re * o20_re;
2379  a20_im += c20_20_re * o20_im;
2380  a20_re += c20_21_re * o21_re;
2381  a20_re -= c20_21_im * o21_im;
2382  a20_im += c20_21_re * o21_im;
2383  a20_im += c20_21_im * o21_re;
2384  a20_re += c20_22_re * o22_re;
2385  a20_re -= c20_22_im * o22_im;
2386  a20_im += c20_22_re * o22_im;
2387  a20_im += c20_22_im * o22_re;
2388  a20_re += c20_30_re * o30_re;
2389  a20_re -= c20_30_im * o30_im;
2390  a20_im += c20_30_re * o30_im;
2391  a20_im += c20_30_im * o30_re;
2392  a20_re += c20_31_re * o31_re;
2393  a20_re -= c20_31_im * o31_im;
2394  a20_im += c20_31_re * o31_im;
2395  a20_im += c20_31_im * o31_re;
2396  a20_re += c20_32_re * o32_re;
2397  a20_re -= c20_32_im * o32_im;
2398  a20_im += c20_32_re * o32_im;
2399  a20_im += c20_32_im * o32_re;
2400 
// Row (2,1).
2401  a21_re += c21_20_re * o20_re;
2402  a21_re -= c21_20_im * o20_im;
2403  a21_im += c21_20_re * o20_im;
2404  a21_im += c21_20_im * o20_re;
2405  a21_re += c21_21_re * o21_re;
2406  a21_im += c21_21_re * o21_im;
2407  a21_re += c21_22_re * o22_re;
2408  a21_re -= c21_22_im * o22_im;
2409  a21_im += c21_22_re * o22_im;
2410  a21_im += c21_22_im * o22_re;
2411  a21_re += c21_30_re * o30_re;
2412  a21_re -= c21_30_im * o30_im;
2413  a21_im += c21_30_re * o30_im;
2414  a21_im += c21_30_im * o30_re;
2415  a21_re += c21_31_re * o31_re;
2416  a21_re -= c21_31_im * o31_im;
2417  a21_im += c21_31_re * o31_im;
2418  a21_im += c21_31_im * o31_re;
2419  a21_re += c21_32_re * o32_re;
2420  a21_re -= c21_32_im * o32_im;
2421  a21_im += c21_32_re * o32_im;
2422  a21_im += c21_32_im * o32_re;
2423 
// Row (2,2).
2424  a22_re += c22_20_re * o20_re;
2425  a22_re -= c22_20_im * o20_im;
2426  a22_im += c22_20_re * o20_im;
2427  a22_im += c22_20_im * o20_re;
2428  a22_re += c22_21_re * o21_re;
2429  a22_re -= c22_21_im * o21_im;
2430  a22_im += c22_21_re * o21_im;
2431  a22_im += c22_21_im * o21_re;
2432  a22_re += c22_22_re * o22_re;
2433  a22_im += c22_22_re * o22_im;
2434  a22_re += c22_30_re * o30_re;
2435  a22_re -= c22_30_im * o30_im;
2436  a22_im += c22_30_re * o30_im;
2437  a22_im += c22_30_im * o30_re;
2438  a22_re += c22_31_re * o31_re;
2439  a22_re -= c22_31_im * o31_im;
2440  a22_im += c22_31_re * o31_im;
2441  a22_im += c22_31_im * o31_re;
2442  a22_re += c22_32_re * o32_re;
2443  a22_re -= c22_32_im * o32_im;
2444  a22_im += c22_32_re * o32_im;
2445  a22_im += c22_32_im * o32_re;
2446 
// Row (3,0).
2447  a30_re += c30_20_re * o20_re;
2448  a30_re -= c30_20_im * o20_im;
2449  a30_im += c30_20_re * o20_im;
2450  a30_im += c30_20_im * o20_re;
2451  a30_re += c30_21_re * o21_re;
2452  a30_re -= c30_21_im * o21_im;
2453  a30_im += c30_21_re * o21_im;
2454  a30_im += c30_21_im * o21_re;
2455  a30_re += c30_22_re * o22_re;
2456  a30_re -= c30_22_im * o22_im;
2457  a30_im += c30_22_re * o22_im;
2458  a30_im += c30_22_im * o22_re;
2459  a30_re += c30_30_re * o30_re;
2460  a30_im += c30_30_re * o30_im;
2461  a30_re += c30_31_re * o31_re;
2462  a30_re -= c30_31_im * o31_im;
2463  a30_im += c30_31_re * o31_im;
2464  a30_im += c30_31_im * o31_re;
2465  a30_re += c30_32_re * o32_re;
2466  a30_re -= c30_32_im * o32_im;
2467  a30_im += c30_32_re * o32_im;
2468  a30_im += c30_32_im * o32_re;
2469 
// Row (3,1).
2470  a31_re += c31_20_re * o20_re;
2471  a31_re -= c31_20_im * o20_im;
2472  a31_im += c31_20_re * o20_im;
2473  a31_im += c31_20_im * o20_re;
2474  a31_re += c31_21_re * o21_re;
2475  a31_re -= c31_21_im * o21_im;
2476  a31_im += c31_21_re * o21_im;
2477  a31_im += c31_21_im * o21_re;
2478  a31_re += c31_22_re * o22_re;
2479  a31_re -= c31_22_im * o22_im;
2480  a31_im += c31_22_re * o22_im;
2481  a31_im += c31_22_im * o22_re;
2482  a31_re += c31_30_re * o30_re;
2483  a31_re -= c31_30_im * o30_im;
2484  a31_im += c31_30_re * o30_im;
2485  a31_im += c31_30_im * o30_re;
2486  a31_re += c31_31_re * o31_re;
2487  a31_im += c31_31_re * o31_im;
2488  a31_re += c31_32_re * o32_re;
2489  a31_re -= c31_32_im * o32_im;
2490  a31_im += c31_32_re * o32_im;
2491  a31_im += c31_32_im * o32_re;
2492 
// Row (3,2).
2493  a32_re += c32_20_re * o20_re;
2494  a32_re -= c32_20_im * o20_im;
2495  a32_im += c32_20_re * o20_im;
2496  a32_im += c32_20_im * o20_re;
2497  a32_re += c32_21_re * o21_re;
2498  a32_re -= c32_21_im * o21_im;
2499  a32_im += c32_21_re * o21_im;
2500  a32_im += c32_21_im * o21_re;
2501  a32_re += c32_22_re * o22_re;
2502  a32_re -= c32_22_im * o22_im;
2503  a32_im += c32_22_re * o22_im;
2504  a32_im += c32_22_im * o22_re;
2505  a32_re += c32_30_re * o30_re;
2506  a32_re -= c32_30_im * o30_im;
2507  a32_im += c32_30_re * o30_im;
2508  a32_im += c32_30_im * o30_re;
2509  a32_re += c32_31_re * o31_re;
2510  a32_re -= c32_31_im * o31_im;
2511  a32_im += c32_31_re * o31_im;
2512  a32_im += c32_31_im * o31_re;
2513  a32_re += c32_32_re * o32_re;
2514  a32_im += c32_32_re * o32_im;
2515 
// Write the product back over the lower-chirality output components;
// all reads of the o variables are complete at this point.
2516  o20_re = a20_re; o20_im = a20_im;
2517  o21_re = a21_re; o21_im = a21_im;
2518  o22_re = a22_re; o22_im = a22_im;
2519  o30_re = a30_re; o30_im = a30_im;
2520  o31_re = a31_re; o31_im = a31_im;
2521  o32_re = a32_re; o32_im = a32_im;
2522 
2523  }
2524 
2525  // change back from chiral basis
2526  // (note: required factor of 1/2 is included in clover term normalization)
2527  {
// Color component 0: rotate the four spin components out of the chiral
// basis.  Per complex part the transform applied below is
//   (o00, o10, o20, o30) -> (o10+o30, -o00-o20, o10-o30, -o00+o20).
// Temporaries are required because each new component mixes two old ones.
2528  spinorFloat a00_re = o10_re + o30_re;
2529  spinorFloat a00_im = o10_im + o30_im;
2530  spinorFloat a10_re = -o00_re - o20_re;
2531  spinorFloat a10_im = -o00_im - o20_im;
2532  spinorFloat a20_re = o10_re - o30_re;
2533  spinorFloat a20_im = o10_im - o30_im;
2534  spinorFloat a30_re = -o00_re + o20_re;
2535  spinorFloat a30_im = -o00_im + o20_im;
2536 
2537  o00_re = a00_re; o00_im = a00_im;
2538  o10_re = a10_re; o10_im = a10_im;
2539  o20_re = a20_re; o20_im = a20_im;
2540  o30_re = a30_re; o30_im = a30_im;
2541  }
2542 
2543  {
// Color component 1: same spin rotation out of the chiral basis as the
// color-0 block immediately before it --
//   (o01, o11, o21, o31) -> (o11+o31, -o01-o21, o11-o31, -o01+o21)
// applied independently to the real and imaginary parts.
2544  spinorFloat a01_re = o11_re + o31_re;
2545  spinorFloat a01_im = o11_im + o31_im;
2546  spinorFloat a11_re = -o01_re - o21_re;
2547  spinorFloat a11_im = -o01_im - o21_im;
2548  spinorFloat a21_re = o11_re - o31_re;
2549  spinorFloat a21_im = o11_im - o31_im;
2550  spinorFloat a31_re = -o01_re + o21_re;
2551  spinorFloat a31_im = -o01_im + o21_im;
2552 
2553  o01_re = a01_re; o01_im = a01_im;
2554  o11_re = a11_re; o11_im = a11_im;
2555  o21_re = a21_re; o21_im = a21_im;
2556  o31_re = a31_re; o31_im = a31_im;
2557  }
2558 
2559  {
// Color component 2: final copy of the spin rotation out of the chiral
// basis --
//   (o02, o12, o22, o32) -> (o12+o32, -o02-o22, o12-o32, -o02+o22)
// applied independently to the real and imaginary parts.
2560  spinorFloat a02_re = o12_re + o32_re;
2561  spinorFloat a02_im = o12_im + o32_im;
2562  spinorFloat a12_re = -o02_re - o22_re;
2563  spinorFloat a12_im = -o02_im - o22_im;
2564  spinorFloat a22_re = o12_re - o32_re;
2565  spinorFloat a22_im = o12_im - o32_im;
2566  spinorFloat a32_re = -o02_re + o22_re;
2567  spinorFloat a32_im = -o02_im + o22_im;
2568 
2569  o02_re = a02_re; o02_im = a02_im;
2570  o12_re = a12_re; o12_im = a12_im;
2571  o22_re = a22_re; o22_im = a22_im;
2572  o32_re = a32_re; o32_im = a32_im;
2573  }
2574 
2575 #endif // DSLASH_CLOVER
2576 
2577 #ifdef DSLASH_XPAY
2578 
2579  READ_ACCUM(ACCUMTEX, param.sp_stride)
2580 
2581 #ifdef SPINOR_DOUBLE
2582  spinorFloat a = param.a;
2583 #else
2584  spinorFloat a = param.a_f;
2585 #endif
2586  o00_re = a*o00_re+acc00_re;
2587  o00_im = a*o00_im+acc00_im;
2588  o01_re = a*o01_re+acc01_re;
2589  o01_im = a*o01_im+acc01_im;
2590  o02_re = a*o02_re+acc02_re;
2591  o02_im = a*o02_im+acc02_im;
2592  o10_re = a*o10_re+acc10_re;
2593  o10_im = a*o10_im+acc10_im;
2594  o11_re = a*o11_re+acc11_re;
2595  o11_im = a*o11_im+acc11_im;
2596  o12_re = a*o12_re+acc12_re;
2597  o12_im = a*o12_im+acc12_im;
2598  o20_re = a*o20_re+acc20_re;
2599  o20_im = a*o20_im+acc20_im;
2600  o21_re = a*o21_re+acc21_re;
2601  o21_im = a*o21_im+acc21_im;
2602  o22_re = a*o22_re+acc22_re;
2603  o22_im = a*o22_im+acc22_im;
2604  o30_re = a*o30_re+acc30_re;
2605  o30_im = a*o30_im+acc30_im;
2606  o31_re = a*o31_re+acc31_re;
2607  o31_im = a*o31_im+acc31_im;
2608  o32_re = a*o32_re+acc32_re;
2609  o32_im = a*o32_im+acc32_im;
2610 #endif // DSLASH_XPAY
2611 }
2612 
2613 // write spinor field back to device memory
2614 WRITE_SPINOR(param.sp_stride);
2615 
2616 // undefine to prevent warning when precision is changed
2617 #undef spinorFloat
2618 #undef g00_re
2619 #undef g00_im
2620 #undef g01_re
2621 #undef g01_im
2622 #undef g02_re
2623 #undef g02_im
2624 #undef g10_re
2625 #undef g10_im
2626 #undef g11_re
2627 #undef g11_im
2628 #undef g12_re
2629 #undef g12_im
2630 #undef g20_re
2631 #undef g20_im
2632 #undef g21_re
2633 #undef g21_im
2634 #undef g22_re
2635 #undef g22_im
2636 
2637 #undef i00_re
2638 #undef i00_im
2639 #undef i01_re
2640 #undef i01_im
2641 #undef i02_re
2642 #undef i02_im
2643 #undef i10_re
2644 #undef i10_im
2645 #undef i11_re
2646 #undef i11_im
2647 #undef i12_re
2648 #undef i12_im
2649 #undef i20_re
2650 #undef i20_im
2651 #undef i21_re
2652 #undef i21_im
2653 #undef i22_re
2654 #undef i22_im
2655 #undef i30_re
2656 #undef i30_im
2657 #undef i31_re
2658 #undef i31_im
2659 #undef i32_re
2660 #undef i32_im
2661 
2662 #undef acc00_re
2663 #undef acc00_im
2664 #undef acc01_re
2665 #undef acc01_im
2666 #undef acc02_re
2667 #undef acc02_im
2668 #undef acc10_re
2669 #undef acc10_im
2670 #undef acc11_re
2671 #undef acc11_im
2672 #undef acc12_re
2673 #undef acc12_im
2674 #undef acc20_re
2675 #undef acc20_im
2676 #undef acc21_re
2677 #undef acc21_im
2678 #undef acc22_re
2679 #undef acc22_im
2680 #undef acc30_re
2681 #undef acc30_im
2682 #undef acc31_re
2683 #undef acc31_im
2684 #undef acc32_re
2685 #undef acc32_im
2686 
2687 #undef c00_00_re
2688 #undef c01_01_re
2689 #undef c02_02_re
2690 #undef c10_10_re
2691 #undef c11_11_re
2692 #undef c12_12_re
2693 #undef c01_00_re
2694 #undef c01_00_im
2695 #undef c02_00_re
2696 #undef c02_00_im
2697 #undef c10_00_re
2698 #undef c10_00_im
2699 #undef c11_00_re
2700 #undef c11_00_im
2701 #undef c12_00_re
2702 #undef c12_00_im
2703 #undef c02_01_re
2704 #undef c02_01_im
2705 #undef c10_01_re
2706 #undef c10_01_im
2707 #undef c11_01_re
2708 #undef c11_01_im
2709 #undef c12_01_re
2710 #undef c12_01_im
2711 #undef c10_02_re
2712 #undef c10_02_im
2713 #undef c11_02_re
2714 #undef c11_02_im
2715 #undef c12_02_re
2716 #undef c12_02_im
2717 #undef c11_10_re
2718 #undef c11_10_im
2719 #undef c12_10_re
2720 #undef c12_10_im
2721 #undef c12_11_re
2722 #undef c12_11_im
2723 
2724 
2725 #undef VOLATILE
VOLATILE spinorFloat o30_re
dim3 dim3 blockDim
spinorFloat a1_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
spinorFloat A2_im
#define WRITE_SPINOR
spinorFloat A1_im
RECONSTRUCT_GAUGE_MATRIX(0)
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
VOLATILE spinorFloat o31_im
VOLATILE spinorFloat o00_re
spinorFloat B1_re
int sp_idx
spinorFloat A2_re
spinorFloat b2_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o21_re
coordsFromIndex< 4, QUDA_4D_PC, EVEN_X >(X, coord, sid, param)
#define GAUGE0TEX
spinorFloat a0_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
QudaGaugeParam param
Definition: pack_test.cpp:17
spinorFloat a2_im
spinorFloat B2_re
spinorFloat a0_re
VOLATILE spinorFloat o01_im
VOLATILE spinorFloat o02_im
VOLATILE spinorFloat o12_im
#define GAUGE1TEX
VOLATILE spinorFloat o20_im
#define SPINORTEX
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o10_im
spinorFloat b1_im
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o21_im
spinorFloat a2_re
spinorFloat b0_re
VOLATILE spinorFloat o10_re
#define READ_SPINOR_GHOST
spinorFloat A1_re
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o31_re
spinorFloat B2_im
spinorFloat B0_im
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o32_re
spinorFloat B0_re
#define CLOVERTEX
spinorFloat b1_re
spinorFloat A0_re
#define INTERTEX
spinorFloat b2_im
VOLATILE spinorFloat o32_im
spinorFloat A0_im
int face_idx
spinorFloat B1_im
const int face_num
#define TPROJSCALE
spinorFloat a1_im
spinorFloat b0_im
VOLATILE spinorFloat o30_im
#define GHOSTSPINORTEX
#define READ_CLOVER
VOLATILE spinorFloat o00_im
#define a
VOLATILE spinorFloat o02_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o11_re
VOLATILE spinorFloat o20_re