QUDA v0.7.0
A library for QCD on GPUs
wilson_dslash_gt200_core.h
1 // *** CUDA DSLASH ***
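// This header holds the body of a Wilson(-clover) dslash CUDA kernel for GT200-class
// GPUs (as the file name indicates): each thread applies the hopping term to one
// lattice site. The #define blocks below map named spinor, gauge-link and clover
// components onto the FloatN registers (double2/float4) filled by the load macros,
// so the same arithmetic serves every precision and layout selected by the code that
// includes this file. (Description inferred from the macros and indexing in this listing.)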
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
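// The 24 real components of the input spinor arrive in twelve double2 registers
// (I0..I11) in double precision, or packed four to a register in six float4s (I0..I5)
// in single precision; the i??_re/_im macros above hide that difference. The accum*
// registers, aliased to acc??_* here, are packed the same way and presumably hold the
// accumulate (xpay) input spinor.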
113 
114 // gauge link
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
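// gT??_* is the Hermitian conjugate of the link defined above: row and column indices
// are swapped and the imaginary parts negated. The backward-hop blocks below use these
// daggered entries instead of loading a separate matrix.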
176 
177 // first chiral block of inverted clover term
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
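// Each chiral block of the clover term is Hermitian, so only its real diagonal and
// lower-triangle entries are stored; the defines above recover the upper triangle by
// complex conjugation.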
284 
285 // second chiral block of inverted clover term (reuses C0,...,C9)
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
354 VOLATILE spinorFloat o00_re;
355 VOLATILE spinorFloat o00_im;
356 VOLATILE spinorFloat o01_re;
357 VOLATILE spinorFloat o01_im;
358 VOLATILE spinorFloat o02_re;
359 VOLATILE spinorFloat o02_im;
360 VOLATILE spinorFloat o10_re;
361 VOLATILE spinorFloat o10_im;
362 VOLATILE spinorFloat o11_re;
363 VOLATILE spinorFloat o11_im;
364 VOLATILE spinorFloat o12_re;
365 VOLATILE spinorFloat o12_im;
366 VOLATILE spinorFloat o20_re;
367 VOLATILE spinorFloat o20_im;
368 VOLATILE spinorFloat o21_re;
369 VOLATILE spinorFloat o21_im;
370 VOLATILE spinorFloat o22_re;
371 VOLATILE spinorFloat o22_im;
372 VOLATILE spinorFloat o30_re;
373 VOLATILE spinorFloat o30_im;
374 VOLATILE spinorFloat o31_re;
375 VOLATILE spinorFloat o31_im;
376 VOLATILE spinorFloat o32_re;
377 VOLATILE spinorFloat o32_im;
378 
379 #include "read_gauge.h"
380 #include "read_clover.h"
381 #include "io_spinor.h"
382 
383 int x1, x2, x3, x4;
384 int X;
385 
386 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
387 int sp_norm_idx;
388 #endif // MULTI_GPU half precision
389 
390 int sid;
391 
392 #ifdef MULTI_GPU
393 int face_idx;
394 if (kernel_type == INTERIOR_KERNEL) {
395 #endif
396 
397  sid = blockIdx.x*blockDim.x + threadIdx.x;
398  if (sid >= param.threads) return;
399 
400  // Inline by hand for the moment and assume even dimensions
401  const int dims[] = {X1, X2, X3, X4};
402  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
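 // sid enumerates the sites of one checkerboard parity; coordsFromIndex recovers the
 // full lexicographic index X and the coordinates (x1,x2,x3,x4), which the hop code
 // below needs to detect lattice boundaries when forming neighbour indices.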
403 
404  o00_re = 0; o00_im = 0;
405  o01_re = 0; o01_im = 0;
406  o02_re = 0; o02_im = 0;
407  o10_re = 0; o10_im = 0;
408  o11_re = 0; o11_im = 0;
409  o12_re = 0; o12_im = 0;
410  o20_re = 0; o20_im = 0;
411  o21_re = 0; o21_im = 0;
412  o22_re = 0; o22_im = 0;
413  o30_re = 0; o30_im = 0;
414  o31_re = 0; o31_im = 0;
415  o32_re = 0; o32_im = 0;
416 
417 #ifdef MULTI_GPU
418 } else { // exterior kernel
419 
420  sid = blockIdx.x*blockDim.x + threadIdx.x;
421  if (sid >= param.threads) return;
422 
423  const int dim = static_cast<int>(kernel_type);
424  const int face_volume = (param.threads >> 1); // volume of one face
425  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
426  face_idx = sid - face_num*face_volume; // index into the respective face
427 
428  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
429  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
430  //sp_idx = face_idx + param.ghostOffset[dim];
431 
432 #if (DD_PREC==2) // half precision
433  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
434 #endif
435 
436  const int dims[] = {X1, X2, X3, X4};
437  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
438 
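 // Exterior (boundary) pass: each thread updates one face site. The partial result
 // written by the interior kernel is re-read here via READ_INTERMEDIATE_SPINOR so the
 // ghost-zone contributions computed below accumulate on top of it.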
439  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
440 
441  o00_re = i00_re; o00_im = i00_im;
442  o01_re = i01_re; o01_im = i01_im;
443  o02_re = i02_re; o02_im = i02_im;
444  o10_re = i10_re; o10_im = i10_im;
445  o11_re = i11_re; o11_im = i11_im;
446  o12_re = i12_re; o12_im = i12_im;
447  o20_re = i20_re; o20_im = i20_im;
448  o21_re = i21_re; o21_im = i21_im;
449  o22_re = i22_re; o22_im = i22_im;
450  o30_re = i30_re; o30_im = i30_im;
451  o31_re = i31_re; o31_im = i31_im;
452  o32_re = i32_re; o32_im = i32_im;
453 }
454 #endif // MULTI_GPU
455 
456 
457 #ifdef MULTI_GPU
458 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
459  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
460 #endif
461 {
462  // Projector P0-
463  // 1 0 0 -i
464  // 0 1 -i 0
465  // 0 i 1 0
466  // i 0 0 1
467 
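 // Spin projection: (1 +/- gamma_mu) has rank two, so only two colour triplets have to
 // be multiplied by the SU(3) link. For this hop a = psi_0 - i*psi_3 and b = psi_1 - i*psi_2
 // (matching the matrix above); the lower two spin components of the result are rebuilt
 // below as i*B and i*A respectively.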
468 #ifdef MULTI_GPU
469  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
470  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
471 #else
472  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
473 #endif
474 
475  const int ga_idx = sid;
476 
477  spinorFloat a0_re, a0_im;
478  spinorFloat a1_re, a1_im;
479  spinorFloat a2_re, a2_im;
480  spinorFloat b0_re, b0_im;
481  spinorFloat b1_re, b1_im;
482  spinorFloat b2_re, b2_im;
483 
484 #ifdef MULTI_GPU
485  if (kernel_type == INTERIOR_KERNEL) {
486 #endif
487 
488  // read spinor from device memory
489  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
490 
491  // project spinor into half spinors
492  a0_re = +i00_re+i30_im;
493  a0_im = +i00_im-i30_re;
494  a1_re = +i01_re+i31_im;
495  a1_im = +i01_im-i31_re;
496  a2_re = +i02_re+i32_im;
497  a2_im = +i02_im-i32_re;
498  b0_re = +i10_re+i20_im;
499  b0_im = +i10_im-i20_re;
500  b1_re = +i11_re+i21_im;
501  b1_im = +i11_im-i21_re;
502  b2_re = +i12_re+i22_im;
503  b2_im = +i12_im-i22_re;
504 
505 #ifdef MULTI_GPU
506  } else {
507 
508  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
509 
510  // read half spinor from device memory
511  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
512 
513  a0_re = i00_re; a0_im = i00_im;
514  a1_re = i01_re; a1_im = i01_im;
515  a2_re = i02_re; a2_im = i02_im;
516  b0_re = i10_re; b0_im = i10_im;
517  b1_re = i11_re; b1_im = i11_im;
518  b2_re = i12_re; b2_im = i12_im;
519 
520  }
521 #endif // MULTI_GPU
522 
523  // read gauge matrix from device memory
524  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
525 
526  // reconstruct gauge matrix
528 
529  // multiply row 0
530  spinorFloat A0_re = 0;
531  A0_re += g00_re * a0_re;
532  A0_re -= g00_im * a0_im;
533  A0_re += g01_re * a1_re;
534  A0_re -= g01_im * a1_im;
535  A0_re += g02_re * a2_re;
536  A0_re -= g02_im * a2_im;
537  spinorFloat A0_im = 0;
538  A0_im += g00_re * a0_im;
539  A0_im += g00_im * a0_re;
540  A0_im += g01_re * a1_im;
541  A0_im += g01_im * a1_re;
542  A0_im += g02_re * a2_im;
543  A0_im += g02_im * a2_re;
544  spinorFloat B0_re = 0;
545  B0_re += g00_re * b0_re;
546  B0_re -= g00_im * b0_im;
547  B0_re += g01_re * b1_re;
548  B0_re -= g01_im * b1_im;
549  B0_re += g02_re * b2_re;
550  B0_re -= g02_im * b2_im;
551  spinorFloat B0_im = 0;
552  B0_im += g00_re * b0_im;
553  B0_im += g00_im * b0_re;
554  B0_im += g01_re * b1_im;
555  B0_im += g01_im * b1_re;
556  B0_im += g02_re * b2_im;
557  B0_im += g02_im * b2_re;
558 
559  // multiply row 1
560  spinorFloat A1_re = 0;
561  A1_re += g10_re * a0_re;
562  A1_re -= g10_im * a0_im;
563  A1_re += g11_re * a1_re;
564  A1_re -= g11_im * a1_im;
565  A1_re += g12_re * a2_re;
566  A1_re -= g12_im * a2_im;
567  spinorFloat A1_im = 0;
568  A1_im += g10_re * a0_im;
569  A1_im += g10_im * a0_re;
570  A1_im += g11_re * a1_im;
571  A1_im += g11_im * a1_re;
572  A1_im += g12_re * a2_im;
573  A1_im += g12_im * a2_re;
574  spinorFloat B1_re = 0;
575  B1_re += g10_re * b0_re;
576  B1_re -= g10_im * b0_im;
577  B1_re += g11_re * b1_re;
578  B1_re -= g11_im * b1_im;
579  B1_re += g12_re * b2_re;
580  B1_re -= g12_im * b2_im;
581  spinorFloat B1_im = 0;
582  B1_im += g10_re * b0_im;
583  B1_im += g10_im * b0_re;
584  B1_im += g11_re * b1_im;
585  B1_im += g11_im * b1_re;
586  B1_im += g12_re * b2_im;
587  B1_im += g12_im * b2_re;
588 
589  // multiply row 2
590  spinorFloat A2_re = 0;
591  A2_re += g20_re * a0_re;
592  A2_re -= g20_im * a0_im;
593  A2_re += g21_re * a1_re;
594  A2_re -= g21_im * a1_im;
595  A2_re += g22_re * a2_re;
596  A2_re -= g22_im * a2_im;
597  spinorFloat A2_im = 0;
598  A2_im += g20_re * a0_im;
599  A2_im += g20_im * a0_re;
600  A2_im += g21_re * a1_im;
601  A2_im += g21_im * a1_re;
602  A2_im += g22_re * a2_im;
603  A2_im += g22_im * a2_re;
604  spinorFloat B2_re = 0;
605  B2_re += g20_re * b0_re;
606  B2_re -= g20_im * b0_im;
607  B2_re += g21_re * b1_re;
608  B2_re -= g21_im * b1_im;
609  B2_re += g22_re * b2_re;
610  B2_re -= g22_im * b2_im;
611  spinorFloat B2_im = 0;
612  B2_im += g20_re * b0_im;
613  B2_im += g20_im * b0_re;
614  B2_im += g21_re * b1_im;
615  B2_im += g21_im * b1_re;
616  B2_im += g22_re * b2_im;
617  B2_im += g22_im * b2_re;
618 
619  o00_re += A0_re;
620  o00_im += A0_im;
621  o10_re += B0_re;
622  o10_im += B0_im;
623  o20_re -= B0_im;
624  o20_im += B0_re;
625  o30_re -= A0_im;
626  o30_im += A0_re;
627 
628  o01_re += A1_re;
629  o01_im += A1_im;
630  o11_re += B1_re;
631  o11_im += B1_im;
632  o21_re -= B1_im;
633  o21_im += B1_re;
634  o31_re -= A1_im;
635  o31_im += A1_re;
636 
637  o02_re += A2_re;
638  o02_im += A2_im;
639  o12_re += B2_re;
640  o12_im += B2_im;
641  o22_re -= B2_im;
642  o22_im += B2_re;
643  o32_re -= A2_im;
644  o32_im += A2_re;
645 
646 }
647 
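// The remaining seven hops repeat the same pattern: project with the appropriate
// (1 +/- gamma_mu), multiply by the forward link or by the daggered backward link
// (gT??_*), and accumulate into the output spinor with the projector-specific signs.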
648 #ifdef MULTI_GPU
649 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
650  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
651 #endif
652 {
653  // Projector P0+
654  // 1 0 0 i
655  // 0 1 i 0
656  // 0 -i 1 0
657  // -i 0 0 1
658 
659 #ifdef MULTI_GPU
660  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
661  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
662 #else
663  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
664 #endif
665 
666 #ifdef MULTI_GPU
667  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
668 #else
669  const int ga_idx = sp_idx;
670 #endif
671 
672  spinorFloat a0_re, a0_im;
673  spinorFloat a1_re, a1_im;
674  spinorFloat a2_re, a2_im;
675  spinorFloat b0_re, b0_im;
676  spinorFloat b1_re, b1_im;
677  spinorFloat b2_re, b2_im;
678 
679 #ifdef MULTI_GPU
680  if (kernel_type == INTERIOR_KERNEL) {
681 #endif
682 
683  // read spinor from device memory
684  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
685 
686  // project spinor into half spinors
687  a0_re = +i00_re-i30_im;
688  a0_im = +i00_im+i30_re;
689  a1_re = +i01_re-i31_im;
690  a1_im = +i01_im+i31_re;
691  a2_re = +i02_re-i32_im;
692  a2_im = +i02_im+i32_re;
693  b0_re = +i10_re-i20_im;
694  b0_im = +i10_im+i20_re;
695  b1_re = +i11_re-i21_im;
696  b1_im = +i11_im+i21_re;
697  b2_re = +i12_re-i22_im;
698  b2_im = +i12_im+i22_re;
699 
700 #ifdef MULTI_GPU
701  } else {
702 
703  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
704 
705  // read half spinor from device memory
706  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
707 
708  a0_re = i00_re; a0_im = i00_im;
709  a1_re = i01_re; a1_im = i01_im;
710  a2_re = i02_re; a2_im = i02_im;
711  b0_re = i10_re; b0_im = i10_im;
712  b1_re = i11_re; b1_im = i11_im;
713  b2_re = i12_re; b2_im = i12_im;
714 
715  }
716 #endif // MULTI_GPU
717 
718  // read gauge matrix from device memory
719  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
720 
721  // reconstruct gauge matrix
723 
724  // multiply row 0
725  spinorFloat A0_re = 0;
726  A0_re += gT00_re * a0_re;
727  A0_re -= gT00_im * a0_im;
728  A0_re += gT01_re * a1_re;
729  A0_re -= gT01_im * a1_im;
730  A0_re += gT02_re * a2_re;
731  A0_re -= gT02_im * a2_im;
732  spinorFloat A0_im = 0;
733  A0_im += gT00_re * a0_im;
734  A0_im += gT00_im * a0_re;
735  A0_im += gT01_re * a1_im;
736  A0_im += gT01_im * a1_re;
737  A0_im += gT02_re * a2_im;
738  A0_im += gT02_im * a2_re;
739  spinorFloat B0_re = 0;
740  B0_re += gT00_re * b0_re;
741  B0_re -= gT00_im * b0_im;
742  B0_re += gT01_re * b1_re;
743  B0_re -= gT01_im * b1_im;
744  B0_re += gT02_re * b2_re;
745  B0_re -= gT02_im * b2_im;
746  spinorFloat B0_im = 0;
747  B0_im += gT00_re * b0_im;
748  B0_im += gT00_im * b0_re;
749  B0_im += gT01_re * b1_im;
750  B0_im += gT01_im * b1_re;
751  B0_im += gT02_re * b2_im;
752  B0_im += gT02_im * b2_re;
753 
754  // multiply row 1
755  spinorFloat A1_re = 0;
756  A1_re += gT10_re * a0_re;
757  A1_re -= gT10_im * a0_im;
758  A1_re += gT11_re * a1_re;
759  A1_re -= gT11_im * a1_im;
760  A1_re += gT12_re * a2_re;
761  A1_re -= gT12_im * a2_im;
762  spinorFloat A1_im = 0;
763  A1_im += gT10_re * a0_im;
764  A1_im += gT10_im * a0_re;
765  A1_im += gT11_re * a1_im;
766  A1_im += gT11_im * a1_re;
767  A1_im += gT12_re * a2_im;
768  A1_im += gT12_im * a2_re;
769  spinorFloat B1_re = 0;
770  B1_re += gT10_re * b0_re;
771  B1_re -= gT10_im * b0_im;
772  B1_re += gT11_re * b1_re;
773  B1_re -= gT11_im * b1_im;
774  B1_re += gT12_re * b2_re;
775  B1_re -= gT12_im * b2_im;
776  spinorFloat B1_im = 0;
777  B1_im += gT10_re * b0_im;
778  B1_im += gT10_im * b0_re;
779  B1_im += gT11_re * b1_im;
780  B1_im += gT11_im * b1_re;
781  B1_im += gT12_re * b2_im;
782  B1_im += gT12_im * b2_re;
783 
784  // multiply row 2
785  spinorFloat A2_re = 0;
786  A2_re += gT20_re * a0_re;
787  A2_re -= gT20_im * a0_im;
788  A2_re += gT21_re * a1_re;
789  A2_re -= gT21_im * a1_im;
790  A2_re += gT22_re * a2_re;
791  A2_re -= gT22_im * a2_im;
792  spinorFloat A2_im = 0;
793  A2_im += gT20_re * a0_im;
794  A2_im += gT20_im * a0_re;
795  A2_im += gT21_re * a1_im;
796  A2_im += gT21_im * a1_re;
797  A2_im += gT22_re * a2_im;
798  A2_im += gT22_im * a2_re;
799  spinorFloat B2_re = 0;
800  B2_re += gT20_re * b0_re;
801  B2_re -= gT20_im * b0_im;
802  B2_re += gT21_re * b1_re;
803  B2_re -= gT21_im * b1_im;
804  B2_re += gT22_re * b2_re;
805  B2_re -= gT22_im * b2_im;
806  spinorFloat B2_im = 0;
807  B2_im += gT20_re * b0_im;
808  B2_im += gT20_im * b0_re;
809  B2_im += gT21_re * b1_im;
810  B2_im += gT21_im * b1_re;
811  B2_im += gT22_re * b2_im;
812  B2_im += gT22_im * b2_re;
813 
814  o00_re += A0_re;
815  o00_im += A0_im;
816  o10_re += B0_re;
817  o10_im += B0_im;
818  o20_re += B0_im;
819  o20_im -= B0_re;
820  o30_re += A0_im;
821  o30_im -= A0_re;
822 
823  o01_re += A1_re;
824  o01_im += A1_im;
825  o11_re += B1_re;
826  o11_im += B1_im;
827  o21_re += B1_im;
828  o21_im -= B1_re;
829  o31_re += A1_im;
830  o31_im -= A1_re;
831 
832  o02_re += A2_re;
833  o02_im += A2_im;
834  o12_re += B2_re;
835  o12_im += B2_im;
836  o22_re += B2_im;
837  o22_im -= B2_re;
838  o32_re += A2_im;
839  o32_im -= A2_re;
840 
841 }
842 
843 #ifdef MULTI_GPU
844 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
845  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
846 #endif
847 {
848  // Projector P1-
849  // 1 0 0 -1
850  // 0 1 1 0
851  // 0 1 1 0
852  // -1 0 0 1
853 
854 #ifdef MULTI_GPU
855  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
856  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
857 #else
858  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
859 #endif
860 
861  const int ga_idx = sid;
862 
863  spinorFloat a0_re, a0_im;
864  spinorFloat a1_re, a1_im;
865  spinorFloat a2_re, a2_im;
866  spinorFloat b0_re, b0_im;
867  spinorFloat b1_re, b1_im;
868  spinorFloat b2_re, b2_im;
869 
870 #ifdef MULTI_GPU
871  if (kernel_type == INTERIOR_KERNEL) {
872 #endif
873 
874  // read spinor from device memory
875  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
876 
877  // project spinor into half spinors
878  a0_re = +i00_re-i30_re;
879  a0_im = +i00_im-i30_im;
880  a1_re = +i01_re-i31_re;
881  a1_im = +i01_im-i31_im;
882  a2_re = +i02_re-i32_re;
883  a2_im = +i02_im-i32_im;
884  b0_re = +i10_re+i20_re;
885  b0_im = +i10_im+i20_im;
886  b1_re = +i11_re+i21_re;
887  b1_im = +i11_im+i21_im;
888  b2_re = +i12_re+i22_re;
889  b2_im = +i12_im+i22_im;
890 
891 #ifdef MULTI_GPU
892  } else {
893 
894  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
895 
896  // read half spinor from device memory
897  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
898 
899  a0_re = i00_re; a0_im = i00_im;
900  a1_re = i01_re; a1_im = i01_im;
901  a2_re = i02_re; a2_im = i02_im;
902  b0_re = i10_re; b0_im = i10_im;
903  b1_re = i11_re; b1_im = i11_im;
904  b2_re = i12_re; b2_im = i12_im;
905 
906  }
907 #endif // MULTI_GPU
908 
909  // read gauge matrix from device memory
910  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
911 
912  // reconstruct gauge matrix
914 
915  // multiply row 0
916  spinorFloat A0_re = 0;
917  A0_re += g00_re * a0_re;
918  A0_re -= g00_im * a0_im;
919  A0_re += g01_re * a1_re;
920  A0_re -= g01_im * a1_im;
921  A0_re += g02_re * a2_re;
922  A0_re -= g02_im * a2_im;
923  spinorFloat A0_im = 0;
924  A0_im += g00_re * a0_im;
925  A0_im += g00_im * a0_re;
926  A0_im += g01_re * a1_im;
927  A0_im += g01_im * a1_re;
928  A0_im += g02_re * a2_im;
929  A0_im += g02_im * a2_re;
930  spinorFloat B0_re = 0;
931  B0_re += g00_re * b0_re;
932  B0_re -= g00_im * b0_im;
933  B0_re += g01_re * b1_re;
934  B0_re -= g01_im * b1_im;
935  B0_re += g02_re * b2_re;
936  B0_re -= g02_im * b2_im;
937  spinorFloat B0_im = 0;
938  B0_im += g00_re * b0_im;
939  B0_im += g00_im * b0_re;
940  B0_im += g01_re * b1_im;
941  B0_im += g01_im * b1_re;
942  B0_im += g02_re * b2_im;
943  B0_im += g02_im * b2_re;
944 
945  // multiply row 1
946  spinorFloat A1_re = 0;
947  A1_re += g10_re * a0_re;
948  A1_re -= g10_im * a0_im;
949  A1_re += g11_re * a1_re;
950  A1_re -= g11_im * a1_im;
951  A1_re += g12_re * a2_re;
952  A1_re -= g12_im * a2_im;
953  spinorFloat A1_im = 0;
954  A1_im += g10_re * a0_im;
955  A1_im += g10_im * a0_re;
956  A1_im += g11_re * a1_im;
957  A1_im += g11_im * a1_re;
958  A1_im += g12_re * a2_im;
959  A1_im += g12_im * a2_re;
960  spinorFloat B1_re = 0;
961  B1_re += g10_re * b0_re;
962  B1_re -= g10_im * b0_im;
963  B1_re += g11_re * b1_re;
964  B1_re -= g11_im * b1_im;
965  B1_re += g12_re * b2_re;
966  B1_re -= g12_im * b2_im;
967  spinorFloat B1_im = 0;
968  B1_im += g10_re * b0_im;
969  B1_im += g10_im * b0_re;
970  B1_im += g11_re * b1_im;
971  B1_im += g11_im * b1_re;
972  B1_im += g12_re * b2_im;
973  B1_im += g12_im * b2_re;
974 
975  // multiply row 2
976  spinorFloat A2_re = 0;
977  A2_re += g20_re * a0_re;
978  A2_re -= g20_im * a0_im;
979  A2_re += g21_re * a1_re;
980  A2_re -= g21_im * a1_im;
981  A2_re += g22_re * a2_re;
982  A2_re -= g22_im * a2_im;
983  spinorFloat A2_im = 0;
984  A2_im += g20_re * a0_im;
985  A2_im += g20_im * a0_re;
986  A2_im += g21_re * a1_im;
987  A2_im += g21_im * a1_re;
988  A2_im += g22_re * a2_im;
989  A2_im += g22_im * a2_re;
990  spinorFloat B2_re = 0;
991  B2_re += g20_re * b0_re;
992  B2_re -= g20_im * b0_im;
993  B2_re += g21_re * b1_re;
994  B2_re -= g21_im * b1_im;
995  B2_re += g22_re * b2_re;
996  B2_re -= g22_im * b2_im;
997  spinorFloat B2_im = 0;
998  B2_im += g20_re * b0_im;
999  B2_im += g20_im * b0_re;
1000  B2_im += g21_re * b1_im;
1001  B2_im += g21_im * b1_re;
1002  B2_im += g22_re * b2_im;
1003  B2_im += g22_im * b2_re;
1004 
1005  o00_re += A0_re;
1006  o00_im += A0_im;
1007  o10_re += B0_re;
1008  o10_im += B0_im;
1009  o20_re += B0_re;
1010  o20_im += B0_im;
1011  o30_re -= A0_re;
1012  o30_im -= A0_im;
1013 
1014  o01_re += A1_re;
1015  o01_im += A1_im;
1016  o11_re += B1_re;
1017  o11_im += B1_im;
1018  o21_re += B1_re;
1019  o21_im += B1_im;
1020  o31_re -= A1_re;
1021  o31_im -= A1_im;
1022 
1023  o02_re += A2_re;
1024  o02_im += A2_im;
1025  o12_re += B2_re;
1026  o12_im += B2_im;
1027  o22_re += B2_re;
1028  o22_im += B2_im;
1029  o32_re -= A2_re;
1030  o32_im -= A2_im;
1031 
1032 }
1033 
1034 #ifdef MULTI_GPU
1035 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1036  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1037 #endif
1038 {
1039  // Projector P1+
1040  // 1 0 0 1
1041  // 0 1 -1 0
1042  // 0 -1 1 0
1043  // 1 0 0 1
1044 
1045 #ifdef MULTI_GPU
1046  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1047  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1048 #else
1049  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1050 #endif
1051 
1052 #ifdef MULTI_GPU
1053  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1054 #else
1055  const int ga_idx = sp_idx;
1056 #endif
1057 
1058  spinorFloat a0_re, a0_im;
1059  spinorFloat a1_re, a1_im;
1060  spinorFloat a2_re, a2_im;
1061  spinorFloat b0_re, b0_im;
1062  spinorFloat b1_re, b1_im;
1063  spinorFloat b2_re, b2_im;
1064 
1065 #ifdef MULTI_GPU
1066  if (kernel_type == INTERIOR_KERNEL) {
1067 #endif
1068 
1069  // read spinor from device memory
1070  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1071 
1072  // project spinor into half spinors
1073  a0_re = +i00_re+i30_re;
1074  a0_im = +i00_im+i30_im;
1075  a1_re = +i01_re+i31_re;
1076  a1_im = +i01_im+i31_im;
1077  a2_re = +i02_re+i32_re;
1078  a2_im = +i02_im+i32_im;
1079  b0_re = +i10_re-i20_re;
1080  b0_im = +i10_im-i20_im;
1081  b1_re = +i11_re-i21_re;
1082  b1_im = +i11_im-i21_im;
1083  b2_re = +i12_re-i22_re;
1084  b2_im = +i12_im-i22_im;
1085 
1086 #ifdef MULTI_GPU
1087  } else {
1088 
1089  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1090 
1091  // read half spinor from device memory
1092  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1093 
1094  a0_re = i00_re; a0_im = i00_im;
1095  a1_re = i01_re; a1_im = i01_im;
1096  a2_re = i02_re; a2_im = i02_im;
1097  b0_re = i10_re; b0_im = i10_im;
1098  b1_re = i11_re; b1_im = i11_im;
1099  b2_re = i12_re; b2_im = i12_im;
1100 
1101  }
1102 #endif // MULTI_GPU
1103 
1104  // read gauge matrix from device memory
1105  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1106 
1107  // reconstruct gauge matrix
1109 
1110  // multiply row 0
1111  spinorFloat A0_re = 0;
1112  A0_re += gT00_re * a0_re;
1113  A0_re -= gT00_im * a0_im;
1114  A0_re += gT01_re * a1_re;
1115  A0_re -= gT01_im * a1_im;
1116  A0_re += gT02_re * a2_re;
1117  A0_re -= gT02_im * a2_im;
1118  spinorFloat A0_im = 0;
1119  A0_im += gT00_re * a0_im;
1120  A0_im += gT00_im * a0_re;
1121  A0_im += gT01_re * a1_im;
1122  A0_im += gT01_im * a1_re;
1123  A0_im += gT02_re * a2_im;
1124  A0_im += gT02_im * a2_re;
1125  spinorFloat B0_re = 0;
1126  B0_re += gT00_re * b0_re;
1127  B0_re -= gT00_im * b0_im;
1128  B0_re += gT01_re * b1_re;
1129  B0_re -= gT01_im * b1_im;
1130  B0_re += gT02_re * b2_re;
1131  B0_re -= gT02_im * b2_im;
1132  spinorFloat B0_im = 0;
1133  B0_im += gT00_re * b0_im;
1134  B0_im += gT00_im * b0_re;
1135  B0_im += gT01_re * b1_im;
1136  B0_im += gT01_im * b1_re;
1137  B0_im += gT02_re * b2_im;
1138  B0_im += gT02_im * b2_re;
1139 
1140  // multiply row 1
1141  spinorFloat A1_re = 0;
1142  A1_re += gT10_re * a0_re;
1143  A1_re -= gT10_im * a0_im;
1144  A1_re += gT11_re * a1_re;
1145  A1_re -= gT11_im * a1_im;
1146  A1_re += gT12_re * a2_re;
1147  A1_re -= gT12_im * a2_im;
1148  spinorFloat A1_im = 0;
1149  A1_im += gT10_re * a0_im;
1150  A1_im += gT10_im * a0_re;
1151  A1_im += gT11_re * a1_im;
1152  A1_im += gT11_im * a1_re;
1153  A1_im += gT12_re * a2_im;
1154  A1_im += gT12_im * a2_re;
1155  spinorFloat B1_re = 0;
1156  B1_re += gT10_re * b0_re;
1157  B1_re -= gT10_im * b0_im;
1158  B1_re += gT11_re * b1_re;
1159  B1_re -= gT11_im * b1_im;
1160  B1_re += gT12_re * b2_re;
1161  B1_re -= gT12_im * b2_im;
1162  spinorFloat B1_im = 0;
1163  B1_im += gT10_re * b0_im;
1164  B1_im += gT10_im * b0_re;
1165  B1_im += gT11_re * b1_im;
1166  B1_im += gT11_im * b1_re;
1167  B1_im += gT12_re * b2_im;
1168  B1_im += gT12_im * b2_re;
1169 
1170  // multiply row 2
1171  spinorFloat A2_re = 0;
1172  A2_re += gT20_re * a0_re;
1173  A2_re -= gT20_im * a0_im;
1174  A2_re += gT21_re * a1_re;
1175  A2_re -= gT21_im * a1_im;
1176  A2_re += gT22_re * a2_re;
1177  A2_re -= gT22_im * a2_im;
1178  spinorFloat A2_im = 0;
1179  A2_im += gT20_re * a0_im;
1180  A2_im += gT20_im * a0_re;
1181  A2_im += gT21_re * a1_im;
1182  A2_im += gT21_im * a1_re;
1183  A2_im += gT22_re * a2_im;
1184  A2_im += gT22_im * a2_re;
1185  spinorFloat B2_re = 0;
1186  B2_re += gT20_re * b0_re;
1187  B2_re -= gT20_im * b0_im;
1188  B2_re += gT21_re * b1_re;
1189  B2_re -= gT21_im * b1_im;
1190  B2_re += gT22_re * b2_re;
1191  B2_re -= gT22_im * b2_im;
1192  spinorFloat B2_im = 0;
1193  B2_im += gT20_re * b0_im;
1194  B2_im += gT20_im * b0_re;
1195  B2_im += gT21_re * b1_im;
1196  B2_im += gT21_im * b1_re;
1197  B2_im += gT22_re * b2_im;
1198  B2_im += gT22_im * b2_re;
1199 
1200  o00_re += A0_re;
1201  o00_im += A0_im;
1202  o10_re += B0_re;
1203  o10_im += B0_im;
1204  o20_re -= B0_re;
1205  o20_im -= B0_im;
1206  o30_re += A0_re;
1207  o30_im += A0_im;
1208 
1209  o01_re += A1_re;
1210  o01_im += A1_im;
1211  o11_re += B1_re;
1212  o11_im += B1_im;
1213  o21_re -= B1_re;
1214  o21_im -= B1_im;
1215  o31_re += A1_re;
1216  o31_im += A1_im;
1217 
1218  o02_re += A2_re;
1219  o02_im += A2_im;
1220  o12_re += B2_re;
1221  o12_im += B2_im;
1222  o22_re -= B2_re;
1223  o22_im -= B2_im;
1224  o32_re += A2_re;
1225  o32_im += A2_im;
1226 
1227 }
1228 
1229 #ifdef MULTI_GPU
1230 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1231  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1232 #endif
1233 {
1234  // Projector P2-
1235  // 1 0 -i 0
1236  // 0 1 0 i
1237  // i 0 1 0
1238  // 0 -i 0 1
1239 
1240 #ifdef MULTI_GPU
1241  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1242  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1243 #else
1244  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1245 #endif
1246 
1247  const int ga_idx = sid;
1248 
1249  spinorFloat a0_re, a0_im;
1250  spinorFloat a1_re, a1_im;
1251  spinorFloat a2_re, a2_im;
1252  spinorFloat b0_re, b0_im;
1253  spinorFloat b1_re, b1_im;
1254  spinorFloat b2_re, b2_im;
1255 
1256 #ifdef MULTI_GPU
1257  if (kernel_type == INTERIOR_KERNEL) {
1258 #endif
1259 
1260  // read spinor from device memory
1261  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1262 
1263  // project spinor into half spinors
1264  a0_re = +i00_re+i20_im;
1265  a0_im = +i00_im-i20_re;
1266  a1_re = +i01_re+i21_im;
1267  a1_im = +i01_im-i21_re;
1268  a2_re = +i02_re+i22_im;
1269  a2_im = +i02_im-i22_re;
1270  b0_re = +i10_re-i30_im;
1271  b0_im = +i10_im+i30_re;
1272  b1_re = +i11_re-i31_im;
1273  b1_im = +i11_im+i31_re;
1274  b2_re = +i12_re-i32_im;
1275  b2_im = +i12_im+i32_re;
1276 
1277 #ifdef MULTI_GPU
1278  } else {
1279 
1280  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1281 
1282  // read half spinor from device memory
1283  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1284 
1285  a0_re = i00_re; a0_im = i00_im;
1286  a1_re = i01_re; a1_im = i01_im;
1287  a2_re = i02_re; a2_im = i02_im;
1288  b0_re = i10_re; b0_im = i10_im;
1289  b1_re = i11_re; b1_im = i11_im;
1290  b2_re = i12_re; b2_im = i12_im;
1291 
1292  }
1293 #endif // MULTI_GPU
1294 
1295  // read gauge matrix from device memory
1296  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1297 
1298  // reconstruct gauge matrix
1300 
1301  // multiply row 0
1302  spinorFloat A0_re = 0;
1303  A0_re += g00_re * a0_re;
1304  A0_re -= g00_im * a0_im;
1305  A0_re += g01_re * a1_re;
1306  A0_re -= g01_im * a1_im;
1307  A0_re += g02_re * a2_re;
1308  A0_re -= g02_im * a2_im;
1309  spinorFloat A0_im = 0;
1310  A0_im += g00_re * a0_im;
1311  A0_im += g00_im * a0_re;
1312  A0_im += g01_re * a1_im;
1313  A0_im += g01_im * a1_re;
1314  A0_im += g02_re * a2_im;
1315  A0_im += g02_im * a2_re;
1316  spinorFloat B0_re = 0;
1317  B0_re += g00_re * b0_re;
1318  B0_re -= g00_im * b0_im;
1319  B0_re += g01_re * b1_re;
1320  B0_re -= g01_im * b1_im;
1321  B0_re += g02_re * b2_re;
1322  B0_re -= g02_im * b2_im;
1323  spinorFloat B0_im = 0;
1324  B0_im += g00_re * b0_im;
1325  B0_im += g00_im * b0_re;
1326  B0_im += g01_re * b1_im;
1327  B0_im += g01_im * b1_re;
1328  B0_im += g02_re * b2_im;
1329  B0_im += g02_im * b2_re;
1330 
1331  // multiply row 1
1332  spinorFloat A1_re = 0;
1333  A1_re += g10_re * a0_re;
1334  A1_re -= g10_im * a0_im;
1335  A1_re += g11_re * a1_re;
1336  A1_re -= g11_im * a1_im;
1337  A1_re += g12_re * a2_re;
1338  A1_re -= g12_im * a2_im;
1339  spinorFloat A1_im = 0;
1340  A1_im += g10_re * a0_im;
1341  A1_im += g10_im * a0_re;
1342  A1_im += g11_re * a1_im;
1343  A1_im += g11_im * a1_re;
1344  A1_im += g12_re * a2_im;
1345  A1_im += g12_im * a2_re;
1346  spinorFloat B1_re = 0;
1347  B1_re += g10_re * b0_re;
1348  B1_re -= g10_im * b0_im;
1349  B1_re += g11_re * b1_re;
1350  B1_re -= g11_im * b1_im;
1351  B1_re += g12_re * b2_re;
1352  B1_re -= g12_im * b2_im;
1353  spinorFloat B1_im = 0;
1354  B1_im += g10_re * b0_im;
1355  B1_im += g10_im * b0_re;
1356  B1_im += g11_re * b1_im;
1357  B1_im += g11_im * b1_re;
1358  B1_im += g12_re * b2_im;
1359  B1_im += g12_im * b2_re;
1360 
1361  // multiply row 2
1362  spinorFloat A2_re = 0;
1363  A2_re += g20_re * a0_re;
1364  A2_re -= g20_im * a0_im;
1365  A2_re += g21_re * a1_re;
1366  A2_re -= g21_im * a1_im;
1367  A2_re += g22_re * a2_re;
1368  A2_re -= g22_im * a2_im;
1369  spinorFloat A2_im = 0;
1370  A2_im += g20_re * a0_im;
1371  A2_im += g20_im * a0_re;
1372  A2_im += g21_re * a1_im;
1373  A2_im += g21_im * a1_re;
1374  A2_im += g22_re * a2_im;
1375  A2_im += g22_im * a2_re;
1376  spinorFloat B2_re = 0;
1377  B2_re += g20_re * b0_re;
1378  B2_re -= g20_im * b0_im;
1379  B2_re += g21_re * b1_re;
1380  B2_re -= g21_im * b1_im;
1381  B2_re += g22_re * b2_re;
1382  B2_re -= g22_im * b2_im;
1383  spinorFloat B2_im = 0;
1384  B2_im += g20_re * b0_im;
1385  B2_im += g20_im * b0_re;
1386  B2_im += g21_re * b1_im;
1387  B2_im += g21_im * b1_re;
1388  B2_im += g22_re * b2_im;
1389  B2_im += g22_im * b2_re;
1390 
1391  o00_re += A0_re;
1392  o00_im += A0_im;
1393  o10_re += B0_re;
1394  o10_im += B0_im;
1395  o20_re -= A0_im;
1396  o20_im += A0_re;
1397  o30_re += B0_im;
1398  o30_im -= B0_re;
1399 
1400  o01_re += A1_re;
1401  o01_im += A1_im;
1402  o11_re += B1_re;
1403  o11_im += B1_im;
1404  o21_re -= A1_im;
1405  o21_im += A1_re;
1406  o31_re += B1_im;
1407  o31_im -= B1_re;
1408 
1409  o02_re += A2_re;
1410  o02_im += A2_im;
1411  o12_re += B2_re;
1412  o12_im += B2_im;
1413  o22_re -= A2_im;
1414  o22_im += A2_re;
1415  o32_re += B2_im;
1416  o32_im -= B2_re;
1417 
1418 }
1419 
1420 #ifdef MULTI_GPU
1421 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1422  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1423 #endif
1424 {
1425  // Projector P2+
1426  // 1 0 i 0
1427  // 0 1 0 -i
1428  // -i 0 1 0
1429  // 0 i 0 1
1430 
1431 #ifdef MULTI_GPU
1432  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1433  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1434 #else
1435  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1436 #endif
1437 
1438 #ifdef MULTI_GPU
1439  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1440 #else
1441  const int ga_idx = sp_idx;
1442 #endif
1443 
1444  spinorFloat a0_re, a0_im;
1445  spinorFloat a1_re, a1_im;
1446  spinorFloat a2_re, a2_im;
1447  spinorFloat b0_re, b0_im;
1448  spinorFloat b1_re, b1_im;
1449  spinorFloat b2_re, b2_im;
1450 
1451 #ifdef MULTI_GPU
1452  if (kernel_type == INTERIOR_KERNEL) {
1453 #endif
1454 
1455  // read spinor from device memory
1456  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1457 
1458  // project spinor into half spinors
1459  a0_re = +i00_re-i20_im;
1460  a0_im = +i00_im+i20_re;
1461  a1_re = +i01_re-i21_im;
1462  a1_im = +i01_im+i21_re;
1463  a2_re = +i02_re-i22_im;
1464  a2_im = +i02_im+i22_re;
1465  b0_re = +i10_re+i30_im;
1466  b0_im = +i10_im-i30_re;
1467  b1_re = +i11_re+i31_im;
1468  b1_im = +i11_im-i31_re;
1469  b2_re = +i12_re+i32_im;
1470  b2_im = +i12_im-i32_re;
1471 
1472 #ifdef MULTI_GPU
1473  } else {
1474 
1475  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1476 
1477  // read half spinor from device memory
1478  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1479 
1480  a0_re = i00_re; a0_im = i00_im;
1481  a1_re = i01_re; a1_im = i01_im;
1482  a2_re = i02_re; a2_im = i02_im;
1483  b0_re = i10_re; b0_im = i10_im;
1484  b1_re = i11_re; b1_im = i11_im;
1485  b2_re = i12_re; b2_im = i12_im;
1486 
1487  }
1488 #endif // MULTI_GPU
1489 
1490  // read gauge matrix from device memory
1491  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1492 
1493  // reconstruct gauge matrix
1495 
1496  // multiply row 0
1497  spinorFloat A0_re = 0;
1498  A0_re += gT00_re * a0_re;
1499  A0_re -= gT00_im * a0_im;
1500  A0_re += gT01_re * a1_re;
1501  A0_re -= gT01_im * a1_im;
1502  A0_re += gT02_re * a2_re;
1503  A0_re -= gT02_im * a2_im;
1504  spinorFloat A0_im = 0;
1505  A0_im += gT00_re * a0_im;
1506  A0_im += gT00_im * a0_re;
1507  A0_im += gT01_re * a1_im;
1508  A0_im += gT01_im * a1_re;
1509  A0_im += gT02_re * a2_im;
1510  A0_im += gT02_im * a2_re;
1511  spinorFloat B0_re = 0;
1512  B0_re += gT00_re * b0_re;
1513  B0_re -= gT00_im * b0_im;
1514  B0_re += gT01_re * b1_re;
1515  B0_re -= gT01_im * b1_im;
1516  B0_re += gT02_re * b2_re;
1517  B0_re -= gT02_im * b2_im;
1518  spinorFloat B0_im = 0;
1519  B0_im += gT00_re * b0_im;
1520  B0_im += gT00_im * b0_re;
1521  B0_im += gT01_re * b1_im;
1522  B0_im += gT01_im * b1_re;
1523  B0_im += gT02_re * b2_im;
1524  B0_im += gT02_im * b2_re;
1525 
1526  // multiply row 1
1527  spinorFloat A1_re = 0;
1528  A1_re += gT10_re * a0_re;
1529  A1_re -= gT10_im * a0_im;
1530  A1_re += gT11_re * a1_re;
1531  A1_re -= gT11_im * a1_im;
1532  A1_re += gT12_re * a2_re;
1533  A1_re -= gT12_im * a2_im;
1534  spinorFloat A1_im = 0;
1535  A1_im += gT10_re * a0_im;
1536  A1_im += gT10_im * a0_re;
1537  A1_im += gT11_re * a1_im;
1538  A1_im += gT11_im * a1_re;
1539  A1_im += gT12_re * a2_im;
1540  A1_im += gT12_im * a2_re;
1541  spinorFloat B1_re = 0;
1542  B1_re += gT10_re * b0_re;
1543  B1_re -= gT10_im * b0_im;
1544  B1_re += gT11_re * b1_re;
1545  B1_re -= gT11_im * b1_im;
1546  B1_re += gT12_re * b2_re;
1547  B1_re -= gT12_im * b2_im;
1548  spinorFloat B1_im = 0;
1549  B1_im += gT10_re * b0_im;
1550  B1_im += gT10_im * b0_re;
1551  B1_im += gT11_re * b1_im;
1552  B1_im += gT11_im * b1_re;
1553  B1_im += gT12_re * b2_im;
1554  B1_im += gT12_im * b2_re;
1555 
1556  // multiply row 2
1557  spinorFloat A2_re = 0;
1558  A2_re += gT20_re * a0_re;
1559  A2_re -= gT20_im * a0_im;
1560  A2_re += gT21_re * a1_re;
1561  A2_re -= gT21_im * a1_im;
1562  A2_re += gT22_re * a2_re;
1563  A2_re -= gT22_im * a2_im;
1564  spinorFloat A2_im = 0;
1565  A2_im += gT20_re * a0_im;
1566  A2_im += gT20_im * a0_re;
1567  A2_im += gT21_re * a1_im;
1568  A2_im += gT21_im * a1_re;
1569  A2_im += gT22_re * a2_im;
1570  A2_im += gT22_im * a2_re;
1571  spinorFloat B2_re = 0;
1572  B2_re += gT20_re * b0_re;
1573  B2_re -= gT20_im * b0_im;
1574  B2_re += gT21_re * b1_re;
1575  B2_re -= gT21_im * b1_im;
1576  B2_re += gT22_re * b2_re;
1577  B2_re -= gT22_im * b2_im;
1578  spinorFloat B2_im = 0;
1579  B2_im += gT20_re * b0_im;
1580  B2_im += gT20_im * b0_re;
1581  B2_im += gT21_re * b1_im;
1582  B2_im += gT21_im * b1_re;
1583  B2_im += gT22_re * b2_im;
1584  B2_im += gT22_im * b2_re;
1585 
1586  o00_re += A0_re;
1587  o00_im += A0_im;
1588  o10_re += B0_re;
1589  o10_im += B0_im;
1590  o20_re += A0_im;
1591  o20_im -= A0_re;
1592  o30_re -= B0_im;
1593  o30_im += B0_re;
1594 
1595  o01_re += A1_re;
1596  o01_im += A1_im;
1597  o11_re += B1_re;
1598  o11_im += B1_im;
1599  o21_re += A1_im;
1600  o21_im -= A1_re;
1601  o31_re -= B1_im;
1602  o31_im += B1_re;
1603 
1604  o02_re += A2_re;
1605  o02_im += A2_im;
1606  o12_re += B2_re;
1607  o12_im += B2_im;
1608  o22_re += A2_im;
1609  o22_im -= A2_re;
1610  o32_re -= B2_im;
1611  o32_im += B2_re;
1612 
1613 }
1614 
1615 #ifdef MULTI_GPU
1616 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1617  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1618 #endif
1619 {
1620  // Projector P3-
1621  // 0 0 0 0
1622  // 0 0 0 0
1623  // 0 0 2 0
1624  // 0 0 0 2
1625 
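 // The temporal projectors are diagonal: P3- keeps only the lower two spin components
 // (with weight 2), so no spin reconstruction is needed after the colour multiply. For
 // ghost-zone reads the overall scale is instead applied through t_proj_scale = TPROJSCALE,
 // which is defined outside this listing.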
1626 #ifdef MULTI_GPU
1627  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1628  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1629 #else
1630  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1631 #endif
1632 
1633  const int ga_idx = sid;
1634 
1635  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1636  {
1637  spinorFloat a0_re, a0_im;
1638  spinorFloat a1_re, a1_im;
1639  spinorFloat a2_re, a2_im;
1640  spinorFloat b0_re, b0_im;
1641  spinorFloat b1_re, b1_im;
1642  spinorFloat b2_re, b2_im;
1643 
1644 #ifdef MULTI_GPU
1645  if (kernel_type == INTERIOR_KERNEL) {
1646 #endif
1647 
1648  // read spinor from device memory
1649  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1650 
1651  // project spinor into half spinors
1652  a0_re = +2*i20_re;
1653  a0_im = +2*i20_im;
1654  a1_re = +2*i21_re;
1655  a1_im = +2*i21_im;
1656  a2_re = +2*i22_re;
1657  a2_im = +2*i22_im;
1658  b0_re = +2*i30_re;
1659  b0_im = +2*i30_im;
1660  b1_re = +2*i31_re;
1661  b1_im = +2*i31_im;
1662  b2_re = +2*i32_re;
1663  b2_im = +2*i32_im;
1664 
1665 #ifdef MULTI_GPU
1666  } else {
1667 
1668  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1669  const int t_proj_scale = TPROJSCALE;
1670 
1671  // read half spinor from device memory
1672  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1673 
1674  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1675  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1676  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1677  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1678  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1679  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1680 
1681  }
1682 #endif // MULTI_GPU
1683 
1684  // identity gauge matrix
1685  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
1686  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
1687  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
1688  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
1689  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
1690  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
1691 
1692  o20_re += A0_re;
1693  o20_im += A0_im;
1694  o30_re += B0_re;
1695  o30_im += B0_im;
1696 
1697  o21_re += A1_re;
1698  o21_im += A1_im;
1699  o31_re += B1_re;
1700  o31_im += B1_im;
1701 
1702  o22_re += A2_re;
1703  o22_im += A2_im;
1704  o32_re += B2_re;
1705  o32_im += B2_im;
1706 
1707  } else {
1708  spinorFloat a0_re, a0_im;
1709  spinorFloat a1_re, a1_im;
1710  spinorFloat a2_re, a2_im;
1711  spinorFloat b0_re, b0_im;
1712  spinorFloat b1_re, b1_im;
1713  spinorFloat b2_re, b2_im;
1714 
1715 #ifdef MULTI_GPU
1716  if (kernel_type == INTERIOR_KERNEL) {
1717 #endif
1718 
1719  // read spinor from device memory
1720  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1721 
1722  // project spinor into half spinors
1723  a0_re = +2*i20_re;
1724  a0_im = +2*i20_im;
1725  a1_re = +2*i21_re;
1726  a1_im = +2*i21_im;
1727  a2_re = +2*i22_re;
1728  a2_im = +2*i22_im;
1729  b0_re = +2*i30_re;
1730  b0_im = +2*i30_im;
1731  b1_re = +2*i31_re;
1732  b1_im = +2*i31_im;
1733  b2_re = +2*i32_re;
1734  b2_im = +2*i32_im;
1735 
1736 #ifdef MULTI_GPU
1737  } else {
1738 
1739  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1740  const int t_proj_scale = TPROJSCALE;
1741 
1742  // read half spinor from device memory
1743  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1744 
1745  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1746  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1747  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1748  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1749  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1750  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1751 
1752  }
1753 #endif // MULTI_GPU
1754 
1755  // read gauge matrix from device memory
1756  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1757 
1758  // reconstruct gauge matrix
1760 
1761  // multiply row 0
1762  spinorFloat A0_re = 0;
1763  A0_re += g00_re * a0_re;
1764  A0_re -= g00_im * a0_im;
1765  A0_re += g01_re * a1_re;
1766  A0_re -= g01_im * a1_im;
1767  A0_re += g02_re * a2_re;
1768  A0_re -= g02_im * a2_im;
1769  spinorFloat A0_im = 0;
1770  A0_im += g00_re * a0_im;
1771  A0_im += g00_im * a0_re;
1772  A0_im += g01_re * a1_im;
1773  A0_im += g01_im * a1_re;
1774  A0_im += g02_re * a2_im;
1775  A0_im += g02_im * a2_re;
1776  spinorFloat B0_re = 0;
1777  B0_re += g00_re * b0_re;
1778  B0_re -= g00_im * b0_im;
1779  B0_re += g01_re * b1_re;
1780  B0_re -= g01_im * b1_im;
1781  B0_re += g02_re * b2_re;
1782  B0_re -= g02_im * b2_im;
1783  spinorFloat B0_im = 0;
1784  B0_im += g00_re * b0_im;
1785  B0_im += g00_im * b0_re;
1786  B0_im += g01_re * b1_im;
1787  B0_im += g01_im * b1_re;
1788  B0_im += g02_re * b2_im;
1789  B0_im += g02_im * b2_re;
1790 
1791  // multiply row 1
1792  spinorFloat A1_re = 0;
1793  A1_re += g10_re * a0_re;
1794  A1_re -= g10_im * a0_im;
1795  A1_re += g11_re * a1_re;
1796  A1_re -= g11_im * a1_im;
1797  A1_re += g12_re * a2_re;
1798  A1_re -= g12_im * a2_im;
1799  spinorFloat A1_im = 0;
1800  A1_im += g10_re * a0_im;
1801  A1_im += g10_im * a0_re;
1802  A1_im += g11_re * a1_im;
1803  A1_im += g11_im * a1_re;
1804  A1_im += g12_re * a2_im;
1805  A1_im += g12_im * a2_re;
1806  spinorFloat B1_re = 0;
1807  B1_re += g10_re * b0_re;
1808  B1_re -= g10_im * b0_im;
1809  B1_re += g11_re * b1_re;
1810  B1_re -= g11_im * b1_im;
1811  B1_re += g12_re * b2_re;
1812  B1_re -= g12_im * b2_im;
1813  spinorFloat B1_im = 0;
1814  B1_im += g10_re * b0_im;
1815  B1_im += g10_im * b0_re;
1816  B1_im += g11_re * b1_im;
1817  B1_im += g11_im * b1_re;
1818  B1_im += g12_re * b2_im;
1819  B1_im += g12_im * b2_re;
1820 
1821  // multiply row 2
1822  spinorFloat A2_re = 0;
1823  A2_re += g20_re * a0_re;
1824  A2_re -= g20_im * a0_im;
1825  A2_re += g21_re * a1_re;
1826  A2_re -= g21_im * a1_im;
1827  A2_re += g22_re * a2_re;
1828  A2_re -= g22_im * a2_im;
1829  spinorFloat A2_im = 0;
1830  A2_im += g20_re * a0_im;
1831  A2_im += g20_im * a0_re;
1832  A2_im += g21_re * a1_im;
1833  A2_im += g21_im * a1_re;
1834  A2_im += g22_re * a2_im;
1835  A2_im += g22_im * a2_re;
1836  spinorFloat B2_re = 0;
1837  B2_re += g20_re * b0_re;
1838  B2_re -= g20_im * b0_im;
1839  B2_re += g21_re * b1_re;
1840  B2_re -= g21_im * b1_im;
1841  B2_re += g22_re * b2_re;
1842  B2_re -= g22_im * b2_im;
1843  spinorFloat B2_im = 0;
1844  B2_im += g20_re * b0_im;
1845  B2_im += g20_im * b0_re;
1846  B2_im += g21_re * b1_im;
1847  B2_im += g21_im * b1_re;
1848  B2_im += g22_re * b2_im;
1849  B2_im += g22_im * b2_re;
1850 
1851  o20_re += A0_re;
1852  o20_im += A0_im;
1853  o30_re += B0_re;
1854  o30_im += B0_im;
1855 
1856  o21_re += A1_re;
1857  o21_im += A1_im;
1858  o31_re += B1_re;
1859  o31_im += B1_im;
1860 
1861  o22_re += A2_re;
1862  o22_im += A2_im;
1863  o32_re += B2_re;
1864  o32_im += B2_im;
1865 
1866  }
1867 }
1868 
1869 #ifdef MULTI_GPU
1870 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1871  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1872 #endif
1873 {
1874  // Projector P3+
1875  // 2 0 0 0
1876  // 0 2 0 0
1877  // 0 0 0 0
1878  // 0 0 0 0
1879 
1880 #ifdef MULTI_GPU
1881  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1882  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1883 #else
1884  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1885 #endif
1886 
1887 #ifdef MULTI_GPU
1888  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1889 #else
1890  const int ga_idx = sp_idx;
1891 #endif
1892 
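 // When gauge_fixed is set, time-direction links away from the boundary slice are
 // treated as the identity (see the "identity gauge matrix" branch below), so the
 // SU(3) multiply is skipped; the bound X4X3X2X1hmX3X2X1h presumably excludes the
 // time slice on which the link is kept explicit.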
1893  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1894  {
1895  spinorFloat a0_re, a0_im;
1896  spinorFloat a1_re, a1_im;
1897  spinorFloat a2_re, a2_im;
1898  spinorFloat b0_re, b0_im;
1899  spinorFloat b1_re, b1_im;
1900  spinorFloat b2_re, b2_im;
1901 
1902 #ifdef MULTI_GPU
1903  if (kernel_type == INTERIOR_KERNEL) {
1904 #endif
1905 
1906  // read spinor from device memory
1907  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1908 
1909  // project spinor into half spinors
1910  a0_re = +2*i00_re;
1911  a0_im = +2*i00_im;
1912  a1_re = +2*i01_re;
1913  a1_im = +2*i01_im;
1914  a2_re = +2*i02_re;
1915  a2_im = +2*i02_im;
1916  b0_re = +2*i10_re;
1917  b0_im = +2*i10_im;
1918  b1_re = +2*i11_re;
1919  b1_im = +2*i11_im;
1920  b2_re = +2*i12_re;
1921  b2_im = +2*i12_im;
1922 
1923 #ifdef MULTI_GPU
1924  } else {
1925 
1926  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1927  const int t_proj_scale = TPROJSCALE;
1928 
1929  // read half spinor from device memory
1930  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1931 
1932  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1933  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1934  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1935  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1936  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1937  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1938 
1939  }
1940 #endif // MULTI_GPU
1941 
1942  // identity gauge matrix
1943  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
1944  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
1945  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
1946  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
1947  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
1948  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
1949 
1950  o00_re += A0_re;
1951  o00_im += A0_im;
1952  o10_re += B0_re;
1953  o10_im += B0_im;
1954 
1955  o01_re += A1_re;
1956  o01_im += A1_im;
1957  o11_re += B1_re;
1958  o11_im += B1_im;
1959 
1960  o02_re += A2_re;
1961  o02_im += A2_im;
1962  o12_re += B2_re;
1963  o12_im += B2_im;
1964 
1965  } else {
1966  spinorFloat a0_re, a0_im;
1967  spinorFloat a1_re, a1_im;
1968  spinorFloat a2_re, a2_im;
1969  spinorFloat b0_re, b0_im;
1970  spinorFloat b1_re, b1_im;
1971  spinorFloat b2_re, b2_im;
1972 
1973 #ifdef MULTI_GPU
1974  if (kernel_type == INTERIOR_KERNEL) {
1975 #endif
1976 
1977  // read spinor from device memory
1978  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1979 
1980  // project spinor into half spinors
1981  a0_re = +2*i00_re;
1982  a0_im = +2*i00_im;
1983  a1_re = +2*i01_re;
1984  a1_im = +2*i01_im;
1985  a2_re = +2*i02_re;
1986  a2_im = +2*i02_im;
1987  b0_re = +2*i10_re;
1988  b0_im = +2*i10_im;
1989  b1_re = +2*i11_re;
1990  b1_im = +2*i11_im;
1991  b2_re = +2*i12_re;
1992  b2_im = +2*i12_im;
1993 
1994 #ifdef MULTI_GPU
1995  } else {
1996 
1997  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1998  const int t_proj_scale = TPROJSCALE;
1999 
2000  // read half spinor from device memory
2001  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2002 
2003  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2004  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2005  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2006  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2007  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2008  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2009 
2010  }
2011 #endif // MULTI_GPU
2012 
2013  // read gauge matrix from device memory
2014  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2015 
2016  // reconstruct gauge matrix
2018 
2019  // multiply row 0
2020  spinorFloat A0_re = 0;
2021  A0_re += gT00_re * a0_re;
2022  A0_re -= gT00_im * a0_im;
2023  A0_re += gT01_re * a1_re;
2024  A0_re -= gT01_im * a1_im;
2025  A0_re += gT02_re * a2_re;
2026  A0_re -= gT02_im * a2_im;
2027  spinorFloat A0_im = 0;
2028  A0_im += gT00_re * a0_im;
2029  A0_im += gT00_im * a0_re;
2030  A0_im += gT01_re * a1_im;
2031  A0_im += gT01_im * a1_re;
2032  A0_im += gT02_re * a2_im;
2033  A0_im += gT02_im * a2_re;
2034  spinorFloat B0_re = 0;
2035  B0_re += gT00_re * b0_re;
2036  B0_re -= gT00_im * b0_im;
2037  B0_re += gT01_re * b1_re;
2038  B0_re -= gT01_im * b1_im;
2039  B0_re += gT02_re * b2_re;
2040  B0_re -= gT02_im * b2_im;
2041  spinorFloat B0_im = 0;
2042  B0_im += gT00_re * b0_im;
2043  B0_im += gT00_im * b0_re;
2044  B0_im += gT01_re * b1_im;
2045  B0_im += gT01_im * b1_re;
2046  B0_im += gT02_re * b2_im;
2047  B0_im += gT02_im * b2_re;
2048 
2049  // multiply row 1
2050  spinorFloat A1_re = 0;
2051  A1_re += gT10_re * a0_re;
2052  A1_re -= gT10_im * a0_im;
2053  A1_re += gT11_re * a1_re;
2054  A1_re -= gT11_im * a1_im;
2055  A1_re += gT12_re * a2_re;
2056  A1_re -= gT12_im * a2_im;
2057  spinorFloat A1_im = 0;
2058  A1_im += gT10_re * a0_im;
2059  A1_im += gT10_im * a0_re;
2060  A1_im += gT11_re * a1_im;
2061  A1_im += gT11_im * a1_re;
2062  A1_im += gT12_re * a2_im;
2063  A1_im += gT12_im * a2_re;
2064  spinorFloat B1_re = 0;
2065  B1_re += gT10_re * b0_re;
2066  B1_re -= gT10_im * b0_im;
2067  B1_re += gT11_re * b1_re;
2068  B1_re -= gT11_im * b1_im;
2069  B1_re += gT12_re * b2_re;
2070  B1_re -= gT12_im * b2_im;
2071  spinorFloat B1_im = 0;
2072  B1_im += gT10_re * b0_im;
2073  B1_im += gT10_im * b0_re;
2074  B1_im += gT11_re * b1_im;
2075  B1_im += gT11_im * b1_re;
2076  B1_im += gT12_re * b2_im;
2077  B1_im += gT12_im * b2_re;
2078 
2079  // multiply row 2
2080  spinorFloat A2_re = 0;
2081  A2_re += gT20_re * a0_re;
2082  A2_re -= gT20_im * a0_im;
2083  A2_re += gT21_re * a1_re;
2084  A2_re -= gT21_im * a1_im;
2085  A2_re += gT22_re * a2_re;
2086  A2_re -= gT22_im * a2_im;
2087  spinorFloat A2_im = 0;
2088  A2_im += gT20_re * a0_im;
2089  A2_im += gT20_im * a0_re;
2090  A2_im += gT21_re * a1_im;
2091  A2_im += gT21_im * a1_re;
2092  A2_im += gT22_re * a2_im;
2093  A2_im += gT22_im * a2_re;
2094  spinorFloat B2_re = 0;
2095  B2_re += gT20_re * b0_re;
2096  B2_re -= gT20_im * b0_im;
2097  B2_re += gT21_re * b1_re;
2098  B2_re -= gT21_im * b1_im;
2099  B2_re += gT22_re * b2_re;
2100  B2_re -= gT22_im * b2_im;
2101  spinorFloat B2_im = 0;
2102  B2_im += gT20_re * b0_im;
2103  B2_im += gT20_im * b0_re;
2104  B2_im += gT21_re * b1_im;
2105  B2_im += gT21_im * b1_re;
2106  B2_im += gT22_re * b2_im;
2107  B2_im += gT22_im * b2_re;
2108 
2109  o00_re += A0_re;
2110  o00_im += A0_im;
2111  o10_re += B0_re;
2112  o10_im += B0_im;
2113 
2114  o01_re += A1_re;
2115  o01_im += A1_im;
2116  o11_re += B1_re;
2117  o11_im += B1_im;
2118 
2119  o02_re += A2_re;
2120  o02_im += A2_im;
2121  o12_re += B2_re;
2122  o12_im += B2_im;
2123 
2124  }
2125 }
2126 
2127 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2128 
2129 int incomplete = 0; // Have all 8 contributions been computed for this site?
2130 
2131 switch(kernel_type) { // intentional fall-through
2132 case INTERIOR_KERNEL:
2133  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2134 case EXTERIOR_KERNEL_T:
2135  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2136 case EXTERIOR_KERNEL_Z:
2137  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2138 case EXTERIOR_KERNEL_Y:
2139  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2140 }
2141 
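// A site is flagged incomplete when a dimension handled by a later exterior kernel is
// partitioned and the site sits on that boundary; the clover/xpay epilogue below is
// skipped here and only applied once all eight neighbor contributions are in.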
2142 if (!incomplete)
2143 #endif // MULTI_GPU
2144 {
2145 #ifdef DSLASH_CLOVER
2146 
2147  // change to chiral basis
2148  {
2149  spinorFloat a00_re = -o10_re - o30_re;
2150  spinorFloat a00_im = -o10_im - o30_im;
2151  spinorFloat a10_re = o00_re + o20_re;
2152  spinorFloat a10_im = o00_im + o20_im;
2153  spinorFloat a20_re = -o10_re + o30_re;
2154  spinorFloat a20_im = -o10_im + o30_im;
2155  spinorFloat a30_re = o00_re - o20_re;
2156  spinorFloat a30_im = o00_im - o20_im;
2157 
2158  o00_re = a00_re; o00_im = a00_im;
2159  o10_re = a10_re; o10_im = a10_im;
2160  o20_re = a20_re; o20_im = a20_im;
2161  o30_re = a30_re; o30_im = a30_im;
2162  }
2163 
2164  {
2165  spinorFloat a01_re = -o11_re - o31_re;
2166  spinorFloat a01_im = -o11_im - o31_im;
2167  spinorFloat a11_re = o01_re + o21_re;
2168  spinorFloat a11_im = o01_im + o21_im;
2169  spinorFloat a21_re = -o11_re + o31_re;
2170  spinorFloat a21_im = -o11_im + o31_im;
2171  spinorFloat a31_re = o01_re - o21_re;
2172  spinorFloat a31_im = o01_im - o21_im;
2173 
2174  o01_re = a01_re; o01_im = a01_im;
2175  o11_re = a11_re; o11_im = a11_im;
2176  o21_re = a21_re; o21_im = a21_im;
2177  o31_re = a31_re; o31_im = a31_im;
2178  }
2179 
2180  {
2181  spinorFloat a02_re = -o12_re - o32_re;
2182  spinorFloat a02_im = -o12_im - o32_im;
2183  spinorFloat a12_re = o02_re + o22_re;
2184  spinorFloat a12_im = o02_im + o22_im;
2185  spinorFloat a22_re = -o12_re + o32_re;
2186  spinorFloat a22_im = -o12_im + o32_im;
2187  spinorFloat a32_re = o02_re - o22_re;
2188  spinorFloat a32_im = o02_im - o22_im;
2189 
2190  o02_re = a02_re; o02_im = a02_im;
2191  o12_re = a12_re; o12_im = a12_im;
2192  o22_re = a22_re; o22_im = a22_im;
2193  o32_re = a32_re; o32_im = a32_im;
2194  }
2195 
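// The three blocks above rotate the output spinor, color component by color component,
// into the chiral basis, where the clover term is block-diagonal: one 6x6 block acts on
// spin components 0,1 (applied next) and one on spin components 2,3.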
2196  // apply first chiral block
2197  {
2198  READ_CLOVER(CLOVERTEX, 0)
2199 
2200  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
2201  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
2202  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
2203  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
2204  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
2205  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
2206 
2207  a00_re += c00_00_re * o00_re;
2208  a00_im += c00_00_re * o00_im;
2209  a00_re += c00_01_re * o01_re;
2210  a00_re -= c00_01_im * o01_im;
2211  a00_im += c00_01_re * o01_im;
2212  a00_im += c00_01_im * o01_re;
2213  a00_re += c00_02_re * o02_re;
2214  a00_re -= c00_02_im * o02_im;
2215  a00_im += c00_02_re * o02_im;
2216  a00_im += c00_02_im * o02_re;
2217  a00_re += c00_10_re * o10_re;
2218  a00_re -= c00_10_im * o10_im;
2219  a00_im += c00_10_re * o10_im;
2220  a00_im += c00_10_im * o10_re;
2221  a00_re += c00_11_re * o11_re;
2222  a00_re -= c00_11_im * o11_im;
2223  a00_im += c00_11_re * o11_im;
2224  a00_im += c00_11_im * o11_re;
2225  a00_re += c00_12_re * o12_re;
2226  a00_re -= c00_12_im * o12_im;
2227  a00_im += c00_12_re * o12_im;
2228  a00_im += c00_12_im * o12_re;
2229 
2230  a01_re += c01_00_re * o00_re;
2231  a01_re -= c01_00_im * o00_im;
2232  a01_im += c01_00_re * o00_im;
2233  a01_im += c01_00_im * o00_re;
2234  a01_re += c01_01_re * o01_re;
2235  a01_im += c01_01_re * o01_im;
2236  a01_re += c01_02_re * o02_re;
2237  a01_re -= c01_02_im * o02_im;
2238  a01_im += c01_02_re * o02_im;
2239  a01_im += c01_02_im * o02_re;
2240  a01_re += c01_10_re * o10_re;
2241  a01_re -= c01_10_im * o10_im;
2242  a01_im += c01_10_re * o10_im;
2243  a01_im += c01_10_im * o10_re;
2244  a01_re += c01_11_re * o11_re;
2245  a01_re -= c01_11_im * o11_im;
2246  a01_im += c01_11_re * o11_im;
2247  a01_im += c01_11_im * o11_re;
2248  a01_re += c01_12_re * o12_re;
2249  a01_re -= c01_12_im * o12_im;
2250  a01_im += c01_12_re * o12_im;
2251  a01_im += c01_12_im * o12_re;
2252 
2253  a02_re += c02_00_re * o00_re;
2254  a02_re -= c02_00_im * o00_im;
2255  a02_im += c02_00_re * o00_im;
2256  a02_im += c02_00_im * o00_re;
2257  a02_re += c02_01_re * o01_re;
2258  a02_re -= c02_01_im * o01_im;
2259  a02_im += c02_01_re * o01_im;
2260  a02_im += c02_01_im * o01_re;
2261  a02_re += c02_02_re * o02_re;
2262  a02_im += c02_02_re * o02_im;
2263  a02_re += c02_10_re * o10_re;
2264  a02_re -= c02_10_im * o10_im;
2265  a02_im += c02_10_re * o10_im;
2266  a02_im += c02_10_im * o10_re;
2267  a02_re += c02_11_re * o11_re;
2268  a02_re -= c02_11_im * o11_im;
2269  a02_im += c02_11_re * o11_im;
2270  a02_im += c02_11_im * o11_re;
2271  a02_re += c02_12_re * o12_re;
2272  a02_re -= c02_12_im * o12_im;
2273  a02_im += c02_12_re * o12_im;
2274  a02_im += c02_12_im * o12_re;
2275 
2276  a10_re += c10_00_re * o00_re;
2277  a10_re -= c10_00_im * o00_im;
2278  a10_im += c10_00_re * o00_im;
2279  a10_im += c10_00_im * o00_re;
2280  a10_re += c10_01_re * o01_re;
2281  a10_re -= c10_01_im * o01_im;
2282  a10_im += c10_01_re * o01_im;
2283  a10_im += c10_01_im * o01_re;
2284  a10_re += c10_02_re * o02_re;
2285  a10_re -= c10_02_im * o02_im;
2286  a10_im += c10_02_re * o02_im;
2287  a10_im += c10_02_im * o02_re;
2288  a10_re += c10_10_re * o10_re;
2289  a10_im += c10_10_re * o10_im;
2290  a10_re += c10_11_re * o11_re;
2291  a10_re -= c10_11_im * o11_im;
2292  a10_im += c10_11_re * o11_im;
2293  a10_im += c10_11_im * o11_re;
2294  a10_re += c10_12_re * o12_re;
2295  a10_re -= c10_12_im * o12_im;
2296  a10_im += c10_12_re * o12_im;
2297  a10_im += c10_12_im * o12_re;
2298 
2299  a11_re += c11_00_re * o00_re;
2300  a11_re -= c11_00_im * o00_im;
2301  a11_im += c11_00_re * o00_im;
2302  a11_im += c11_00_im * o00_re;
2303  a11_re += c11_01_re * o01_re;
2304  a11_re -= c11_01_im * o01_im;
2305  a11_im += c11_01_re * o01_im;
2306  a11_im += c11_01_im * o01_re;
2307  a11_re += c11_02_re * o02_re;
2308  a11_re -= c11_02_im * o02_im;
2309  a11_im += c11_02_re * o02_im;
2310  a11_im += c11_02_im * o02_re;
2311  a11_re += c11_10_re * o10_re;
2312  a11_re -= c11_10_im * o10_im;
2313  a11_im += c11_10_re * o10_im;
2314  a11_im += c11_10_im * o10_re;
2315  a11_re += c11_11_re * o11_re;
2316  a11_im += c11_11_re * o11_im;
2317  a11_re += c11_12_re * o12_re;
2318  a11_re -= c11_12_im * o12_im;
2319  a11_im += c11_12_re * o12_im;
2320  a11_im += c11_12_im * o12_re;
2321 
2322  a12_re += c12_00_re * o00_re;
2323  a12_re -= c12_00_im * o00_im;
2324  a12_im += c12_00_re * o00_im;
2325  a12_im += c12_00_im * o00_re;
2326  a12_re += c12_01_re * o01_re;
2327  a12_re -= c12_01_im * o01_im;
2328  a12_im += c12_01_re * o01_im;
2329  a12_im += c12_01_im * o01_re;
2330  a12_re += c12_02_re * o02_re;
2331  a12_re -= c12_02_im * o02_im;
2332  a12_im += c12_02_re * o02_im;
2333  a12_im += c12_02_im * o02_re;
2334  a12_re += c12_10_re * o10_re;
2335  a12_re -= c12_10_im * o10_im;
2336  a12_im += c12_10_re * o10_im;
2337  a12_im += c12_10_im * o10_re;
2338  a12_re += c12_11_re * o11_re;
2339  a12_re -= c12_11_im * o11_im;
2340  a12_im += c12_11_re * o11_im;
2341  a12_im += c12_11_im * o11_re;
2342  a12_re += c12_12_re * o12_re;
2343  a12_im += c12_12_re * o12_im;
2344 
2345  o00_re = a00_re; o00_im = a00_im;
2346  o01_re = a01_re; o01_im = a01_im;
2347  o02_re = a02_re; o02_im = a02_im;
2348  o10_re = a10_re; o10_im = a10_im;
2349  o11_re = a11_re; o11_im = a11_im;
2350  o12_re = a12_re; o12_im = a12_im;
2351 
2352  }
2353 
2354  // apply second chiral block
2355  {
2356  READ_CLOVER(CLOVERTEX, 1)
2357 
2358  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
2359  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
2360  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
2361  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2362  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2363  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2364 
2365  a20_re += c20_20_re * o20_re;
2366  a20_im += c20_20_re * o20_im;
2367  a20_re += c20_21_re * o21_re;
2368  a20_re -= c20_21_im * o21_im;
2369  a20_im += c20_21_re * o21_im;
2370  a20_im += c20_21_im * o21_re;
2371  a20_re += c20_22_re * o22_re;
2372  a20_re -= c20_22_im * o22_im;
2373  a20_im += c20_22_re * o22_im;
2374  a20_im += c20_22_im * o22_re;
2375  a20_re += c20_30_re * o30_re;
2376  a20_re -= c20_30_im * o30_im;
2377  a20_im += c20_30_re * o30_im;
2378  a20_im += c20_30_im * o30_re;
2379  a20_re += c20_31_re * o31_re;
2380  a20_re -= c20_31_im * o31_im;
2381  a20_im += c20_31_re * o31_im;
2382  a20_im += c20_31_im * o31_re;
2383  a20_re += c20_32_re * o32_re;
2384  a20_re -= c20_32_im * o32_im;
2385  a20_im += c20_32_re * o32_im;
2386  a20_im += c20_32_im * o32_re;
2387 
2388  a21_re += c21_20_re * o20_re;
2389  a21_re -= c21_20_im * o20_im;
2390  a21_im += c21_20_re * o20_im;
2391  a21_im += c21_20_im * o20_re;
2392  a21_re += c21_21_re * o21_re;
2393  a21_im += c21_21_re * o21_im;
2394  a21_re += c21_22_re * o22_re;
2395  a21_re -= c21_22_im * o22_im;
2396  a21_im += c21_22_re * o22_im;
2397  a21_im += c21_22_im * o22_re;
2398  a21_re += c21_30_re * o30_re;
2399  a21_re -= c21_30_im * o30_im;
2400  a21_im += c21_30_re * o30_im;
2401  a21_im += c21_30_im * o30_re;
2402  a21_re += c21_31_re * o31_re;
2403  a21_re -= c21_31_im * o31_im;
2404  a21_im += c21_31_re * o31_im;
2405  a21_im += c21_31_im * o31_re;
2406  a21_re += c21_32_re * o32_re;
2407  a21_re -= c21_32_im * o32_im;
2408  a21_im += c21_32_re * o32_im;
2409  a21_im += c21_32_im * o32_re;
2410 
2411  a22_re += c22_20_re * o20_re;
2412  a22_re -= c22_20_im * o20_im;
2413  a22_im += c22_20_re * o20_im;
2414  a22_im += c22_20_im * o20_re;
2415  a22_re += c22_21_re * o21_re;
2416  a22_re -= c22_21_im * o21_im;
2417  a22_im += c22_21_re * o21_im;
2418  a22_im += c22_21_im * o21_re;
2419  a22_re += c22_22_re * o22_re;
2420  a22_im += c22_22_re * o22_im;
2421  a22_re += c22_30_re * o30_re;
2422  a22_re -= c22_30_im * o30_im;
2423  a22_im += c22_30_re * o30_im;
2424  a22_im += c22_30_im * o30_re;
2425  a22_re += c22_31_re * o31_re;
2426  a22_re -= c22_31_im * o31_im;
2427  a22_im += c22_31_re * o31_im;
2428  a22_im += c22_31_im * o31_re;
2429  a22_re += c22_32_re * o32_re;
2430  a22_re -= c22_32_im * o32_im;
2431  a22_im += c22_32_re * o32_im;
2432  a22_im += c22_32_im * o32_re;
2433 
2434  a30_re += c30_20_re * o20_re;
2435  a30_re -= c30_20_im * o20_im;
2436  a30_im += c30_20_re * o20_im;
2437  a30_im += c30_20_im * o20_re;
2438  a30_re += c30_21_re * o21_re;
2439  a30_re -= c30_21_im * o21_im;
2440  a30_im += c30_21_re * o21_im;
2441  a30_im += c30_21_im * o21_re;
2442  a30_re += c30_22_re * o22_re;
2443  a30_re -= c30_22_im * o22_im;
2444  a30_im += c30_22_re * o22_im;
2445  a30_im += c30_22_im * o22_re;
2446  a30_re += c30_30_re * o30_re;
2447  a30_im += c30_30_re * o30_im;
2448  a30_re += c30_31_re * o31_re;
2449  a30_re -= c30_31_im * o31_im;
2450  a30_im += c30_31_re * o31_im;
2451  a30_im += c30_31_im * o31_re;
2452  a30_re += c30_32_re * o32_re;
2453  a30_re -= c30_32_im * o32_im;
2454  a30_im += c30_32_re * o32_im;
2455  a30_im += c30_32_im * o32_re;
2456 
2457  a31_re += c31_20_re * o20_re;
2458  a31_re -= c31_20_im * o20_im;
2459  a31_im += c31_20_re * o20_im;
2460  a31_im += c31_20_im * o20_re;
2461  a31_re += c31_21_re * o21_re;
2462  a31_re -= c31_21_im * o21_im;
2463  a31_im += c31_21_re * o21_im;
2464  a31_im += c31_21_im * o21_re;
2465  a31_re += c31_22_re * o22_re;
2466  a31_re -= c31_22_im * o22_im;
2467  a31_im += c31_22_re * o22_im;
2468  a31_im += c31_22_im * o22_re;
2469  a31_re += c31_30_re * o30_re;
2470  a31_re -= c31_30_im * o30_im;
2471  a31_im += c31_30_re * o30_im;
2472  a31_im += c31_30_im * o30_re;
2473  a31_re += c31_31_re * o31_re;
2474  a31_im += c31_31_re * o31_im;
2475  a31_re += c31_32_re * o32_re;
2476  a31_re -= c31_32_im * o32_im;
2477  a31_im += c31_32_re * o32_im;
2478  a31_im += c31_32_im * o32_re;
2479 
2480  a32_re += c32_20_re * o20_re;
2481  a32_re -= c32_20_im * o20_im;
2482  a32_im += c32_20_re * o20_im;
2483  a32_im += c32_20_im * o20_re;
2484  a32_re += c32_21_re * o21_re;
2485  a32_re -= c32_21_im * o21_im;
2486  a32_im += c32_21_re * o21_im;
2487  a32_im += c32_21_im * o21_re;
2488  a32_re += c32_22_re * o22_re;
2489  a32_re -= c32_22_im * o22_im;
2490  a32_im += c32_22_re * o22_im;
2491  a32_im += c32_22_im * o22_re;
2492  a32_re += c32_30_re * o30_re;
2493  a32_re -= c32_30_im * o30_im;
2494  a32_im += c32_30_re * o30_im;
2495  a32_im += c32_30_im * o30_re;
2496  a32_re += c32_31_re * o31_re;
2497  a32_re -= c32_31_im * o31_im;
2498  a32_im += c32_31_re * o31_im;
2499  a32_im += c32_31_im * o31_re;
2500  a32_re += c32_32_re * o32_re;
2501  a32_im += c32_32_re * o32_im;
2502 
2503  o20_re = a20_re; o20_im = a20_im;
2504  o21_re = a21_re; o21_im = a21_im;
2505  o22_re = a22_re; o22_im = a22_im;
2506  o30_re = a30_re; o30_im = a30_im;
2507  o31_re = a31_re; o31_im = a31_im;
2508  o32_re = a32_re; o32_im = a32_im;
2509 
2510  }
2511 
2512  // change back from chiral basis
2513  // (note: required factor of 1/2 is included in clover term normalization)
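// (the forward and backward basis rotations used here compose to twice the identity,
//  which is where that factor of 1/2 comes from)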
2514  {
2515  spinorFloat a00_re = o10_re + o30_re;
2516  spinorFloat a00_im = o10_im + o30_im;
2517  spinorFloat a10_re = -o00_re - o20_re;
2518  spinorFloat a10_im = -o00_im - o20_im;
2519  spinorFloat a20_re = o10_re - o30_re;
2520  spinorFloat a20_im = o10_im - o30_im;
2521  spinorFloat a30_re = -o00_re + o20_re;
2522  spinorFloat a30_im = -o00_im + o20_im;
2523 
2524  o00_re = a00_re; o00_im = a00_im;
2525  o10_re = a10_re; o10_im = a10_im;
2526  o20_re = a20_re; o20_im = a20_im;
2527  o30_re = a30_re; o30_im = a30_im;
2528  }
2529 
2530  {
2531  spinorFloat a01_re = o11_re + o31_re;
2532  spinorFloat a01_im = o11_im + o31_im;
2533  spinorFloat a11_re = -o01_re - o21_re;
2534  spinorFloat a11_im = -o01_im - o21_im;
2535  spinorFloat a21_re = o11_re - o31_re;
2536  spinorFloat a21_im = o11_im - o31_im;
2537  spinorFloat a31_re = -o01_re + o21_re;
2538  spinorFloat a31_im = -o01_im + o21_im;
2539 
2540  o01_re = a01_re; o01_im = a01_im;
2541  o11_re = a11_re; o11_im = a11_im;
2542  o21_re = a21_re; o21_im = a21_im;
2543  o31_re = a31_re; o31_im = a31_im;
2544  }
2545 
2546  {
2547  spinorFloat a02_re = o12_re + o32_re;
2548  spinorFloat a02_im = o12_im + o32_im;
2549  spinorFloat a12_re = -o02_re - o22_re;
2550  spinorFloat a12_im = -o02_im - o22_im;
2551  spinorFloat a22_re = o12_re - o32_re;
2552  spinorFloat a22_im = o12_im - o32_im;
2553  spinorFloat a32_re = -o02_re + o22_re;
2554  spinorFloat a32_im = -o02_im + o22_im;
2555 
2556  o02_re = a02_re; o02_im = a02_im;
2557  o12_re = a12_re; o12_im = a12_im;
2558  o22_re = a22_re; o22_im = a22_im;
2559  o32_re = a32_re; o32_im = a32_im;
2560  }
2561 
2562 #endif // DSLASH_CLOVER
2563 
2564 #ifdef DSLASH_XPAY
2565 
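// xpay: rescale the result accumulated so far by a and add the spinor read from
// ACCUMTEX, i.e. out = a*out + acc, component by component.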
2566  READ_ACCUM(ACCUMTEX, param.sp_stride)
2567 
2568  o00_re = a*o00_re+acc00_re;
2569  o00_im = a*o00_im+acc00_im;
2570  o01_re = a*o01_re+acc01_re;
2571  o01_im = a*o01_im+acc01_im;
2572  o02_re = a*o02_re+acc02_re;
2573  o02_im = a*o02_im+acc02_im;
2574  o10_re = a*o10_re+acc10_re;
2575  o10_im = a*o10_im+acc10_im;
2576  o11_re = a*o11_re+acc11_re;
2577  o11_im = a*o11_im+acc11_im;
2578  o12_re = a*o12_re+acc12_re;
2579  o12_im = a*o12_im+acc12_im;
2580  o20_re = a*o20_re+acc20_re;
2581  o20_im = a*o20_im+acc20_im;
2582  o21_re = a*o21_re+acc21_re;
2583  o21_im = a*o21_im+acc21_im;
2584  o22_re = a*o22_re+acc22_re;
2585  o22_im = a*o22_im+acc22_im;
2586  o30_re = a*o30_re+acc30_re;
2587  o30_im = a*o30_im+acc30_im;
2588  o31_re = a*o31_re+acc31_re;
2589  o31_im = a*o31_im+acc31_im;
2590  o32_re = a*o32_re+acc32_re;
2591  o32_im = a*o32_im+acc32_im;
2592 #endif // DSLASH_XPAY
2593 }
2594 
2595 // write spinor field back to device memory
2596 WRITE_SPINOR(param.sp_stride);
2597 
2598 // undefine to prevent warning when precision is changed
2599 #undef spinorFloat
2600 #undef g00_re
2601 #undef g00_im
2602 #undef g01_re
2603 #undef g01_im
2604 #undef g02_re
2605 #undef g02_im
2606 #undef g10_re
2607 #undef g10_im
2608 #undef g11_re
2609 #undef g11_im
2610 #undef g12_re
2611 #undef g12_im
2612 #undef g20_re
2613 #undef g20_im
2614 #undef g21_re
2615 #undef g21_im
2616 #undef g22_re
2617 #undef g22_im
2618 
2619 #undef i00_re
2620 #undef i00_im
2621 #undef i01_re
2622 #undef i01_im
2623 #undef i02_re
2624 #undef i02_im
2625 #undef i10_re
2626 #undef i10_im
2627 #undef i11_re
2628 #undef i11_im
2629 #undef i12_re
2630 #undef i12_im
2631 #undef i20_re
2632 #undef i20_im
2633 #undef i21_re
2634 #undef i21_im
2635 #undef i22_re
2636 #undef i22_im
2637 #undef i30_re
2638 #undef i30_im
2639 #undef i31_re
2640 #undef i31_im
2641 #undef i32_re
2642 #undef i32_im
2643 
2644 #undef acc00_re
2645 #undef acc00_im
2646 #undef acc01_re
2647 #undef acc01_im
2648 #undef acc02_re
2649 #undef acc02_im
2650 #undef acc10_re
2651 #undef acc10_im
2652 #undef acc11_re
2653 #undef acc11_im
2654 #undef acc12_re
2655 #undef acc12_im
2656 #undef acc20_re
2657 #undef acc20_im
2658 #undef acc21_re
2659 #undef acc21_im
2660 #undef acc22_re
2661 #undef acc22_im
2662 #undef acc30_re
2663 #undef acc30_im
2664 #undef acc31_re
2665 #undef acc31_im
2666 #undef acc32_re
2667 #undef acc32_im
2668 
2669 #undef c00_00_re
2670 #undef c01_01_re
2671 #undef c02_02_re
2672 #undef c10_10_re
2673 #undef c11_11_re
2674 #undef c12_12_re
2675 #undef c01_00_re
2676 #undef c01_00_im
2677 #undef c02_00_re
2678 #undef c02_00_im
2679 #undef c10_00_re
2680 #undef c10_00_im
2681 #undef c11_00_re
2682 #undef c11_00_im
2683 #undef c12_00_re
2684 #undef c12_00_im
2685 #undef c02_01_re
2686 #undef c02_01_im
2687 #undef c10_01_re
2688 #undef c10_01_im
2689 #undef c11_01_re
2690 #undef c11_01_im
2691 #undef c12_01_re
2692 #undef c12_01_im
2693 #undef c10_02_re
2694 #undef c10_02_im
2695 #undef c11_02_re
2696 #undef c11_02_im
2697 #undef c12_02_re
2698 #undef c12_02_im
2699 #undef c11_10_re
2700 #undef c11_10_im
2701 #undef c12_10_re
2702 #undef c12_10_im
2703 #undef c12_11_re
2704 #undef c12_11_im
2705 
2706 
2707 #undef VOLATILE