QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
wilson_dslash_g80_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// Shared-memory budget per thread for the output spinor accumulator: slots
// s[0*SHARED_STRIDE] .. s[18*SHARED_STRIDE] defined below are 19 floats.
// NOTE(review): only 19 of the 24 spinor components appear in the visible
// o??_* defines; presumably the remaining components live in registers --
// confirm against the (partially missing) output-spinor section below.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
4 
5 
// The NVVM back end (CUDA >= 4.1, sm_20+) does not need the shared-memory
// pointer declared volatile; the older Open64 back end does.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Maps the named spinor components i{spin}{color}_{re,im} (spin 0-3,
// color 0-2) onto the vector registers filled by READ_SPINOR.
// Double precision: one double2 register per complex component (I0..I11).
// Single precision: float4 registers pack two complex components each
// (I0..I5).  The acc{spin}{color}_{re,im} names map an accumulator spinor
// (accum0..) the same way.
// NOTE(review): accum is presumably the DSLASH_XPAY/clover input spinor --
// it is not read in the visible portion of this file; confirm upstream.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define i00_re I0.x
15 #define i00_im I0.y
16 #define i01_re I1.x
17 #define i01_im I1.y
18 #define i02_re I2.x
19 #define i02_im I2.y
20 #define i10_re I3.x
21 #define i10_im I3.y
22 #define i11_re I4.x
23 #define i11_im I4.y
24 #define i12_re I5.x
25 #define i12_im I5.y
26 #define i20_re I6.x
27 #define i20_im I6.y
28 #define i21_re I7.x
29 #define i21_im I7.y
30 #define i22_re I8.x
31 #define i22_im I8.y
32 #define i30_re I9.x
33 #define i30_im I9.y
34 #define i31_re I10.x
35 #define i31_im I10.y
36 #define i32_re I11.x
37 #define i32_im I11.y
38 #define acc00_re accum0.x
39 #define acc00_im accum0.y
40 #define acc01_re accum1.x
41 #define acc01_im accum1.y
42 #define acc02_re accum2.x
43 #define acc02_im accum2.y
44 #define acc10_re accum3.x
45 #define acc10_im accum3.y
46 #define acc11_re accum4.x
47 #define acc11_im accum4.y
48 #define acc12_re accum5.x
49 #define acc12_im accum5.y
50 #define acc20_re accum6.x
51 #define acc20_im accum6.y
52 #define acc21_re accum7.x
53 #define acc21_im accum7.y
54 #define acc22_re accum8.x
55 #define acc22_im accum8.y
56 #define acc30_re accum9.x
57 #define acc30_im accum9.y
58 #define acc31_re accum10.x
59 #define acc31_im accum10.y
60 #define acc32_re accum11.x
61 #define acc32_im accum11.y
62 #else
63 #define spinorFloat float
64 #define i00_re I0.x
65 #define i00_im I0.y
66 #define i01_re I0.z
67 #define i01_im I0.w
68 #define i02_re I1.x
69 #define i02_im I1.y
70 #define i10_re I1.z
71 #define i10_im I1.w
72 #define i11_re I2.x
73 #define i11_im I2.y
74 #define i12_re I2.z
75 #define i12_im I2.w
76 #define i20_re I3.x
77 #define i20_im I3.y
78 #define i21_re I3.z
79 #define i21_im I3.w
80 #define i22_re I4.x
81 #define i22_im I4.y
82 #define i30_re I4.z
83 #define i30_im I4.w
84 #define i31_re I5.x
85 #define i31_im I5.y
86 #define i32_re I5.z
87 #define i32_im I5.w
88 #define acc00_re accum0.x
89 #define acc00_im accum0.y
90 #define acc01_re accum0.z
91 #define acc01_im accum0.w
92 #define acc02_re accum1.x
93 #define acc02_im accum1.y
94 #define acc10_re accum1.z
95 #define acc10_im accum1.w
96 #define acc11_re accum2.x
97 #define acc11_im accum2.y
98 #define acc12_re accum2.z
99 #define acc12_im accum2.w
100 #define acc20_re accum3.x
101 #define acc20_im accum3.y
102 #define acc21_re accum3.z
103 #define acc21_im accum3.w
104 #define acc22_re accum4.x
105 #define acc22_im accum4.y
106 #define acc30_re accum4.z
107 #define acc30_im accum4.w
108 #define acc31_re accum5.x
109 #define acc31_im accum5.y
110 #define acc32_re accum5.z
111 #define acc32_im accum5.w
112 #endif // SPINOR_DOUBLE
113 
114 // gauge link
// Maps the SU(3) gauge-link entries g{row}{col}_{re,im} onto the registers
// filled by READ_GAUGE_MATRIX.
// FLOAT2 layout: one float2/double2 per complex entry, nine registers G0..G8.
// FLOAT4 layout: each float4 packs two complex entries, so the nine complex
// entries occupy G0..G4 (G4.z/G4.w unused by these defines).
115 #ifdef GAUGE_FLOAT2
116 #define g00_re G0.x
117 #define g00_im G0.y
118 #define g01_re G1.x
119 #define g01_im G1.y
120 #define g02_re G2.x
121 #define g02_im G2.y
122 #define g10_re G3.x
123 #define g10_im G3.y
124 #define g11_re G4.x
125 #define g11_im G4.y
126 #define g12_re G5.x
127 #define g12_im G5.y
128 #define g20_re G6.x
129 #define g20_im G6.y
130 #define g21_re G7.x
131 #define g21_im G7.y
132 #define g22_re G8.x
133 #define g22_im G8.y
134 
135 #else
136 #define g00_re G0.x
137 #define g00_im G0.y
138 #define g01_re G0.z
139 #define g01_im G0.w
140 #define g02_re G1.x
141 #define g02_im G1.y
142 #define g10_re G1.z
143 #define g10_im G1.w
144 #define g11_re G2.x
145 #define g11_im G2.y
146 #define g12_re G2.z
147 #define g12_im G2.w
148 #define g20_re G3.x
149 #define g20_im G3.y
150 #define g21_re G3.z
151 #define g21_im G3.w
152 #define g22_re G4.x
153 #define g22_im G4.y
154 
155 #endif // GAUGE_FLOAT2
156 
157 // conjugated gauge link
// gT = Hermitian conjugate (dagger) of g: indices transposed, imaginary
// parts negated.  Used for the backward (negative-direction) hops below,
// where the link matrix must be applied daggered.
158 #define gT00_re (+g00_re)
159 #define gT00_im (-g00_im)
160 #define gT01_re (+g10_re)
161 #define gT01_im (-g10_im)
162 #define gT02_re (+g20_re)
163 #define gT02_im (-g20_im)
164 #define gT10_re (+g01_re)
165 #define gT10_im (-g01_im)
166 #define gT11_re (+g11_re)
167 #define gT11_im (-g11_im)
168 #define gT12_re (+g21_re)
169 #define gT12_im (-g21_im)
170 #define gT20_re (+g02_re)
171 #define gT20_im (-g02_im)
172 #define gT21_re (+g12_re)
173 #define gT21_im (-g12_im)
174 #define gT22_re (+g22_re)
175 #define gT22_im (-g22_im)
176 
177 // first chiral block of inverted clover term
// c{s1}{c1}_{s2}{c2} names the element of the 6x6 chiral block whose row is
// (spin s1, color c1) and column is (spin s2, color c2).  Only the diagonal
// (real: no _im define) and the lower triangle are stored; the upper
// triangle is derived from Hermiticity below.  That is 6 reals + 15 complex
// = 36 reals: C0..C17 as double2, or C0..C8 as float4.
178 #ifdef CLOVER_DOUBLE
179 #define c00_00_re C0.x
180 #define c01_01_re C0.y
181 #define c02_02_re C1.x
182 #define c10_10_re C1.y
183 #define c11_11_re C2.x
184 #define c12_12_re C2.y
185 #define c01_00_re C3.x
186 #define c01_00_im C3.y
187 #define c02_00_re C4.x
188 #define c02_00_im C4.y
189 #define c10_00_re C5.x
190 #define c10_00_im C5.y
191 #define c11_00_re C6.x
192 #define c11_00_im C6.y
193 #define c12_00_re C7.x
194 #define c12_00_im C7.y
195 #define c02_01_re C8.x
196 #define c02_01_im C8.y
197 #define c10_01_re C9.x
198 #define c10_01_im C9.y
199 #define c11_01_re C10.x
200 #define c11_01_im C10.y
201 #define c12_01_re C11.x
202 #define c12_01_im C11.y
203 #define c10_02_re C12.x
204 #define c10_02_im C12.y
205 #define c11_02_re C13.x
206 #define c11_02_im C13.y
207 #define c12_02_re C14.x
208 #define c12_02_im C14.y
209 #define c11_10_re C15.x
210 #define c11_10_im C15.y
211 #define c12_10_re C16.x
212 #define c12_10_im C16.y
213 #define c12_11_re C17.x
214 #define c12_11_im C17.y
215 #else
216 #define c00_00_re C0.x
217 #define c01_01_re C0.y
218 #define c02_02_re C0.z
219 #define c10_10_re C0.w
220 #define c11_11_re C1.x
221 #define c12_12_re C1.y
222 #define c01_00_re C1.z
223 #define c01_00_im C1.w
224 #define c02_00_re C2.x
225 #define c02_00_im C2.y
226 #define c10_00_re C2.z
227 #define c10_00_im C2.w
228 #define c11_00_re C3.x
229 #define c11_00_im C3.y
230 #define c12_00_re C3.z
231 #define c12_00_im C3.w
232 #define c02_01_re C4.x
233 #define c02_01_im C4.y
234 #define c10_01_re C4.z
235 #define c10_01_im C4.w
236 #define c11_01_re C5.x
237 #define c11_01_im C5.y
238 #define c12_01_re C5.z
239 #define c12_01_im C5.w
240 #define c10_02_re C6.x
241 #define c10_02_im C6.y
242 #define c11_02_re C6.z
243 #define c11_02_im C6.w
244 #define c12_02_re C7.x
245 #define c12_02_im C7.y
246 #define c11_10_re C7.z
247 #define c11_10_im C7.w
248 #define c12_10_re C8.x
249 #define c12_10_im C8.y
250 #define c12_11_re C8.z
251 #define c12_11_im C8.w
252 #endif // CLOVER_DOUBLE
253 
// Upper triangle of the (Hermitian) chiral clover block, derived from the
// stored lower triangle by complex conjugation: c(i,j) = conj(c(j,i)).
254 #define c00_01_re (+c01_00_re)
255 #define c00_01_im (-c01_00_im)
256 #define c00_02_re (+c02_00_re)
257 #define c00_02_im (-c02_00_im)
258 #define c01_02_re (+c02_01_re)
259 #define c01_02_im (-c02_01_im)
260 #define c00_10_re (+c10_00_re)
261 #define c00_10_im (-c10_00_im)
262 #define c01_10_re (+c10_01_re)
263 #define c01_10_im (-c10_01_im)
264 #define c02_10_re (+c10_02_re)
265 #define c02_10_im (-c10_02_im)
266 #define c00_11_re (+c11_00_re)
267 #define c00_11_im (-c11_00_im)
268 #define c01_11_re (+c11_01_re)
269 #define c01_11_im (-c11_01_im)
270 #define c02_11_re (+c11_02_re)
271 #define c02_11_im (-c11_02_im)
272 #define c10_11_re (+c11_10_re)
273 #define c10_11_im (-c11_10_im)
274 #define c00_12_re (+c12_00_re)
275 #define c00_12_im (-c12_00_im)
276 #define c01_12_re (+c12_01_re)
277 #define c01_12_im (-c12_01_im)
278 #define c02_12_re (+c12_02_re)
279 #define c02_12_im (-c12_02_im)
280 #define c10_12_re (+c12_10_re)
281 #define c10_12_im (-c12_10_im)
282 #define c11_12_re (+c12_11_re)
283 #define c11_12_im (-c12_11_im)
284 
285 // second chiral block of inverted clover term (reuses C0,...,C9)
// Aliases the spin-2/3 block elements onto the spin-0/1 names above, so the
// same register set serves both chiral blocks.
// NOTE(review): this is only valid if the C* registers are reloaded with the
// second block's data before these names are used -- presumably done by
// READ_CLOVER in the (not visible) clover section; confirm upstream.
286 #define c20_20_re c00_00_re
287 #define c21_20_re c01_00_re
288 #define c21_20_im c01_00_im
289 #define c22_20_re c02_00_re
290 #define c22_20_im c02_00_im
291 #define c30_20_re c10_00_re
292 #define c30_20_im c10_00_im
293 #define c31_20_re c11_00_re
294 #define c31_20_im c11_00_im
295 #define c32_20_re c12_00_re
296 #define c32_20_im c12_00_im
297 #define c20_21_re c00_01_re
298 #define c20_21_im c00_01_im
299 #define c21_21_re c01_01_re
300 #define c22_21_re c02_01_re
301 #define c22_21_im c02_01_im
302 #define c30_21_re c10_01_re
303 #define c30_21_im c10_01_im
304 #define c31_21_re c11_01_re
305 #define c31_21_im c11_01_im
306 #define c32_21_re c12_01_re
307 #define c32_21_im c12_01_im
308 #define c20_22_re c00_02_re
309 #define c20_22_im c00_02_im
310 #define c21_22_re c01_02_re
311 #define c21_22_im c01_02_im
312 #define c22_22_re c02_02_re
313 #define c30_22_re c10_02_re
314 #define c30_22_im c10_02_im
315 #define c31_22_re c11_02_re
316 #define c31_22_im c11_02_im
317 #define c32_22_re c12_02_re
318 #define c32_22_im c12_02_im
319 #define c20_30_re c00_10_re
320 #define c20_30_im c00_10_im
321 #define c21_30_re c01_10_re
322 #define c21_30_im c01_10_im
323 #define c22_30_re c02_10_re
324 #define c22_30_im c02_10_im
325 #define c30_30_re c10_10_re
326 #define c31_30_re c11_10_re
327 #define c31_30_im c11_10_im
328 #define c32_30_re c12_10_re
329 #define c32_30_im c12_10_im
330 #define c20_31_re c00_11_re
331 #define c20_31_im c00_11_im
332 #define c21_31_re c01_11_re
333 #define c21_31_im c01_11_im
334 #define c22_31_re c02_11_re
335 #define c22_31_im c02_11_im
336 #define c30_31_re c10_11_re
337 #define c30_31_im c10_11_im
338 #define c31_31_re c11_11_re
339 #define c32_31_re c12_11_re
340 #define c32_31_im c12_11_im
341 #define c20_32_re c00_12_re
342 #define c20_32_im c00_12_im
343 #define c21_32_re c01_12_re
344 #define c21_32_im c01_12_im
345 #define c22_32_re c02_12_re
346 #define c22_32_im c02_12_im
347 #define c30_32_re c10_12_re
348 #define c30_32_im c10_12_im
349 #define c31_32_re c11_12_re
350 #define c31_32_im c11_12_im
351 #define c32_32_re c12_12_re
352 
353 // output spinor
// Output spinor components live in strided shared memory: each thread owns
// one slot per SHARED_STRIDE-spaced index, so consecutive threads within a
// stride group touch consecutive addresses (the per-thread base pointer is
// set up further below).
354 #define o00_re s[0*SHARED_STRIDE]
355 #define o00_im s[1*SHARED_STRIDE]
356 #define o01_re s[2*SHARED_STRIDE]
357 #define o01_im s[3*SHARED_STRIDE]
358 #define o02_re s[4*SHARED_STRIDE]
359 #define o02_im s[5*SHARED_STRIDE]
360 #define o10_re s[6*SHARED_STRIDE]
361 #define o10_im s[7*SHARED_STRIDE]
362 #define o11_re s[8*SHARED_STRIDE]
363 #define o11_im s[9*SHARED_STRIDE]
364 #define o12_re s[10*SHARED_STRIDE]
365 #define o12_im s[11*SHARED_STRIDE]
366 #define o20_re s[12*SHARED_STRIDE]
367 #define o20_im s[13*SHARED_STRIDE]
368 #define o21_re s[14*SHARED_STRIDE]
369 #define o21_im s[15*SHARED_STRIDE]
370 #define o22_re s[16*SHARED_STRIDE]
371 #define o22_im s[17*SHARED_STRIDE]
372 #define o30_re s[18*SHARED_STRIDE]
// NOTE(review): original source lines 373-377 (the o30_im .. o32_im
// definitions) are missing from this extraction.  Since only 19 shared
// floats per thread are budgeted (indices 0-18 above), those five
// components presumably live in registers -- restore from the upstream
// generated file before compiling.
378 
// Stride between a thread's successive shared-memory slots, chosen to avoid
// bank conflicts on G80/GT200 (8 doubles or 16 floats span one 64-byte
// bank-group row).
379 #ifdef SPINOR_DOUBLE
380 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
381 #else
382 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
383 #endif
384 
// Dynamically sized shared-memory arena for the per-thread output spinor.
385 extern __shared__ char s_data[];
386 
// NOTE(review): original line 387 is missing from this extraction; it
// presumably declared the per-thread shared-memory base pointer, e.g.
//   VOLATILE spinorFloat *s = (spinorFloat *)s_data
//     + DSLASH_SHARED_FLOATS_PER_THREAD*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
// leaving the dangling continuation below.  Restore from the upstream
// generated file before compiling.
388  + (threadIdx.x % SHARED_STRIDE);
389 
// Helper macros for gauge/clover/spinor loads (READ_GAUGE_MATRIX,
// READ_CLOVER, READ_SPINOR, READ_HALF_SPINOR, ...) used by the kernel body.
390 #include "read_gauge.h"
391 #include "read_clover.h"
392 #include "io_spinor.h"
393 
394 int x1, x2, x3, x4;
395 int X;
396 
397 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
398 int sp_norm_idx;
399 #endif // MULTI_GPU half precision
400 
401 int sid;
402 
403 #ifdef MULTI_GPU
404 int face_idx;
406 #endif
407 
408  sid = blockIdx.x*blockDim.x + threadIdx.x;
409  if (sid >= param.threads) return;
410 
411  // Inline by hand for the moment and assume even dimensions
412  const int dims[] = {X1, X2, X3, X4};
413  coordsFromIndex<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
414 
415  o00_re = 0; o00_im = 0;
416  o01_re = 0; o01_im = 0;
417  o02_re = 0; o02_im = 0;
418  o10_re = 0; o10_im = 0;
419  o11_re = 0; o11_im = 0;
420  o12_re = 0; o12_im = 0;
421  o20_re = 0; o20_im = 0;
422  o21_re = 0; o21_im = 0;
423  o22_re = 0; o22_im = 0;
424  o30_re = 0; o30_im = 0;
425  o31_re = 0; o31_im = 0;
426  o32_re = 0; o32_im = 0;
427 
428 #ifdef MULTI_GPU
429 } else { // exterior kernel
430 
431  sid = blockIdx.x*blockDim.x + threadIdx.x;
432  if (sid >= param.threads) return;
433 
434  const int dim = static_cast<int>(kernel_type);
435  const int face_volume = (param.threads >> 1); // volume of one face
436  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
437  face_idx = sid - face_num*face_volume; // index into the respective face
438 
439  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
440  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
441  //sp_idx = face_idx + param.ghostOffset[dim];
442 
443 #if (DD_PREC==2) // half precision
444  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
445 #endif
446 
447  const int dims[] = {X1, X2, X3, X4};
448  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
449 
450  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
451 
452  o00_re = i00_re; o00_im = i00_im;
453  o01_re = i01_re; o01_im = i01_im;
454  o02_re = i02_re; o02_im = i02_im;
455  o10_re = i10_re; o10_im = i10_im;
456  o11_re = i11_re; o11_im = i11_im;
457  o12_re = i12_re; o12_im = i12_im;
458  o20_re = i20_re; o20_im = i20_im;
459  o21_re = i21_re; o21_im = i21_im;
460  o22_re = i22_re; o22_im = i22_im;
461  o30_re = i30_re; o30_im = i30_im;
462  o31_re = i31_re; o31_im = i31_im;
463  o32_re = i32_re; o32_im = i32_im;
464 }
465 #endif // MULTI_GPU
466 
467 
468 #ifdef MULTI_GPU
469 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
470  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
471 #endif
472 {
473  // Projector P0-
474  // 1 0 0 -i
475  // 0 1 -i 0
476  // 0 i 1 0
477  // i 0 0 1
478 
479 #ifdef MULTI_GPU
480  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
481  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
482 #else
483  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
484 #endif
485 
486  const int ga_idx = sid;
487 
494 
495 #ifdef MULTI_GPU
496  if (kernel_type == INTERIOR_KERNEL) {
497 #endif
498 
499  // read spinor from device memory
500  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
501 
502  // project spinor into half spinors
503  a0_re = +i00_re+i30_im;
504  a0_im = +i00_im-i30_re;
505  a1_re = +i01_re+i31_im;
506  a1_im = +i01_im-i31_re;
507  a2_re = +i02_re+i32_im;
508  a2_im = +i02_im-i32_re;
509  b0_re = +i10_re+i20_im;
510  b0_im = +i10_im-i20_re;
511  b1_re = +i11_re+i21_im;
512  b1_im = +i11_im-i21_re;
513  b2_re = +i12_re+i22_im;
514  b2_im = +i12_im-i22_re;
515 
516 #ifdef MULTI_GPU
517  } else {
518 
519  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
520 
521  // read half spinor from device memory
522  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
523 
524  a0_re = i00_re; a0_im = i00_im;
525  a1_re = i01_re; a1_im = i01_im;
526  a2_re = i02_re; a2_im = i02_im;
527  b0_re = i10_re; b0_im = i10_im;
528  b1_re = i11_re; b1_im = i11_im;
529  b2_re = i12_re; b2_im = i12_im;
530 
531  }
532 #endif // MULTI_GPU
533 
534  // read gauge matrix from device memory
535  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
536 
537  // reconstruct gauge matrix
539 
540  // multiply row 0
542  A0_re += g00_re * a0_re;
543  A0_re -= g00_im * a0_im;
544  A0_re += g01_re * a1_re;
545  A0_re -= g01_im * a1_im;
546  A0_re += g02_re * a2_re;
547  A0_re -= g02_im * a2_im;
549  A0_im += g00_re * a0_im;
550  A0_im += g00_im * a0_re;
551  A0_im += g01_re * a1_im;
552  A0_im += g01_im * a1_re;
553  A0_im += g02_re * a2_im;
554  A0_im += g02_im * a2_re;
556  B0_re += g00_re * b0_re;
557  B0_re -= g00_im * b0_im;
558  B0_re += g01_re * b1_re;
559  B0_re -= g01_im * b1_im;
560  B0_re += g02_re * b2_re;
561  B0_re -= g02_im * b2_im;
563  B0_im += g00_re * b0_im;
564  B0_im += g00_im * b0_re;
565  B0_im += g01_re * b1_im;
566  B0_im += g01_im * b1_re;
567  B0_im += g02_re * b2_im;
568  B0_im += g02_im * b2_re;
569 
570  // multiply row 1
572  A1_re += g10_re * a0_re;
573  A1_re -= g10_im * a0_im;
574  A1_re += g11_re * a1_re;
575  A1_re -= g11_im * a1_im;
576  A1_re += g12_re * a2_re;
577  A1_re -= g12_im * a2_im;
579  A1_im += g10_re * a0_im;
580  A1_im += g10_im * a0_re;
581  A1_im += g11_re * a1_im;
582  A1_im += g11_im * a1_re;
583  A1_im += g12_re * a2_im;
584  A1_im += g12_im * a2_re;
586  B1_re += g10_re * b0_re;
587  B1_re -= g10_im * b0_im;
588  B1_re += g11_re * b1_re;
589  B1_re -= g11_im * b1_im;
590  B1_re += g12_re * b2_re;
591  B1_re -= g12_im * b2_im;
593  B1_im += g10_re * b0_im;
594  B1_im += g10_im * b0_re;
595  B1_im += g11_re * b1_im;
596  B1_im += g11_im * b1_re;
597  B1_im += g12_re * b2_im;
598  B1_im += g12_im * b2_re;
599 
600  // multiply row 2
602  A2_re += g20_re * a0_re;
603  A2_re -= g20_im * a0_im;
604  A2_re += g21_re * a1_re;
605  A2_re -= g21_im * a1_im;
606  A2_re += g22_re * a2_re;
607  A2_re -= g22_im * a2_im;
609  A2_im += g20_re * a0_im;
610  A2_im += g20_im * a0_re;
611  A2_im += g21_re * a1_im;
612  A2_im += g21_im * a1_re;
613  A2_im += g22_re * a2_im;
614  A2_im += g22_im * a2_re;
616  B2_re += g20_re * b0_re;
617  B2_re -= g20_im * b0_im;
618  B2_re += g21_re * b1_re;
619  B2_re -= g21_im * b1_im;
620  B2_re += g22_re * b2_re;
621  B2_re -= g22_im * b2_im;
623  B2_im += g20_re * b0_im;
624  B2_im += g20_im * b0_re;
625  B2_im += g21_re * b1_im;
626  B2_im += g21_im * b1_re;
627  B2_im += g22_re * b2_im;
628  B2_im += g22_im * b2_re;
629 
630  o00_re += A0_re;
631  o00_im += A0_im;
632  o10_re += B0_re;
633  o10_im += B0_im;
634  o20_re -= B0_im;
635  o20_im += B0_re;
636  o30_re -= A0_im;
637  o30_im += A0_re;
638 
639  o01_re += A1_re;
640  o01_im += A1_im;
641  o11_re += B1_re;
642  o11_im += B1_im;
643  o21_re -= B1_im;
644  o21_im += B1_re;
645  o31_re -= A1_im;
646  o31_im += A1_re;
647 
648  o02_re += A2_re;
649  o02_im += A2_im;
650  o12_re += B2_re;
651  o12_im += B2_im;
652  o22_re -= B2_im;
653  o22_im += B2_re;
654  o32_re -= A2_im;
655  o32_im += A2_re;
656 
657 }
658 
659 #ifdef MULTI_GPU
660 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
661  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
662 #endif
663 {
664  // Projector P0+
665  // 1 0 0 i
666  // 0 1 i 0
667  // 0 -i 1 0
668  // -i 0 0 1
669 
670 #ifdef MULTI_GPU
671  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
672  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
673 #else
674  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
675 #endif
676 
677 #ifdef MULTI_GPU
678  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
679 #else
680  const int ga_idx = sp_idx;
681 #endif
682 
689 
690 #ifdef MULTI_GPU
691  if (kernel_type == INTERIOR_KERNEL) {
692 #endif
693 
694  // read spinor from device memory
695  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
696 
697  // project spinor into half spinors
698  a0_re = +i00_re-i30_im;
699  a0_im = +i00_im+i30_re;
700  a1_re = +i01_re-i31_im;
701  a1_im = +i01_im+i31_re;
702  a2_re = +i02_re-i32_im;
703  a2_im = +i02_im+i32_re;
704  b0_re = +i10_re-i20_im;
705  b0_im = +i10_im+i20_re;
706  b1_re = +i11_re-i21_im;
707  b1_im = +i11_im+i21_re;
708  b2_re = +i12_re-i22_im;
709  b2_im = +i12_im+i22_re;
710 
711 #ifdef MULTI_GPU
712  } else {
713 
714  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
715 
716  // read half spinor from device memory
717  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
718 
719  a0_re = i00_re; a0_im = i00_im;
720  a1_re = i01_re; a1_im = i01_im;
721  a2_re = i02_re; a2_im = i02_im;
722  b0_re = i10_re; b0_im = i10_im;
723  b1_re = i11_re; b1_im = i11_im;
724  b2_re = i12_re; b2_im = i12_im;
725 
726  }
727 #endif // MULTI_GPU
728 
729  // read gauge matrix from device memory
730  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
731 
732  // reconstruct gauge matrix
734 
735  // multiply row 0
736  spinorFloat A0_re = 0;
737  A0_re += gT00_re * a0_re;
738  A0_re -= gT00_im * a0_im;
739  A0_re += gT01_re * a1_re;
740  A0_re -= gT01_im * a1_im;
741  A0_re += gT02_re * a2_re;
742  A0_re -= gT02_im * a2_im;
743  spinorFloat A0_im = 0;
744  A0_im += gT00_re * a0_im;
745  A0_im += gT00_im * a0_re;
746  A0_im += gT01_re * a1_im;
747  A0_im += gT01_im * a1_re;
748  A0_im += gT02_re * a2_im;
749  A0_im += gT02_im * a2_re;
750  spinorFloat B0_re = 0;
751  B0_re += gT00_re * b0_re;
752  B0_re -= gT00_im * b0_im;
753  B0_re += gT01_re * b1_re;
754  B0_re -= gT01_im * b1_im;
755  B0_re += gT02_re * b2_re;
756  B0_re -= gT02_im * b2_im;
757  spinorFloat B0_im = 0;
758  B0_im += gT00_re * b0_im;
759  B0_im += gT00_im * b0_re;
760  B0_im += gT01_re * b1_im;
761  B0_im += gT01_im * b1_re;
762  B0_im += gT02_re * b2_im;
763  B0_im += gT02_im * b2_re;
764 
765  // multiply row 1
766  spinorFloat A1_re = 0;
767  A1_re += gT10_re * a0_re;
768  A1_re -= gT10_im * a0_im;
769  A1_re += gT11_re * a1_re;
770  A1_re -= gT11_im * a1_im;
771  A1_re += gT12_re * a2_re;
772  A1_re -= gT12_im * a2_im;
773  spinorFloat A1_im = 0;
774  A1_im += gT10_re * a0_im;
775  A1_im += gT10_im * a0_re;
776  A1_im += gT11_re * a1_im;
777  A1_im += gT11_im * a1_re;
778  A1_im += gT12_re * a2_im;
779  A1_im += gT12_im * a2_re;
780  spinorFloat B1_re = 0;
781  B1_re += gT10_re * b0_re;
782  B1_re -= gT10_im * b0_im;
783  B1_re += gT11_re * b1_re;
784  B1_re -= gT11_im * b1_im;
785  B1_re += gT12_re * b2_re;
786  B1_re -= gT12_im * b2_im;
787  spinorFloat B1_im = 0;
788  B1_im += gT10_re * b0_im;
789  B1_im += gT10_im * b0_re;
790  B1_im += gT11_re * b1_im;
791  B1_im += gT11_im * b1_re;
792  B1_im += gT12_re * b2_im;
793  B1_im += gT12_im * b2_re;
794 
795  // multiply row 2
796  spinorFloat A2_re = 0;
797  A2_re += gT20_re * a0_re;
798  A2_re -= gT20_im * a0_im;
799  A2_re += gT21_re * a1_re;
800  A2_re -= gT21_im * a1_im;
801  A2_re += gT22_re * a2_re;
802  A2_re -= gT22_im * a2_im;
803  spinorFloat A2_im = 0;
804  A2_im += gT20_re * a0_im;
805  A2_im += gT20_im * a0_re;
806  A2_im += gT21_re * a1_im;
807  A2_im += gT21_im * a1_re;
808  A2_im += gT22_re * a2_im;
809  A2_im += gT22_im * a2_re;
810  spinorFloat B2_re = 0;
811  B2_re += gT20_re * b0_re;
812  B2_re -= gT20_im * b0_im;
813  B2_re += gT21_re * b1_re;
814  B2_re -= gT21_im * b1_im;
815  B2_re += gT22_re * b2_re;
816  B2_re -= gT22_im * b2_im;
817  spinorFloat B2_im = 0;
818  B2_im += gT20_re * b0_im;
819  B2_im += gT20_im * b0_re;
820  B2_im += gT21_re * b1_im;
821  B2_im += gT21_im * b1_re;
822  B2_im += gT22_re * b2_im;
823  B2_im += gT22_im * b2_re;
824 
825  o00_re += A0_re;
826  o00_im += A0_im;
827  o10_re += B0_re;
828  o10_im += B0_im;
829  o20_re += B0_im;
830  o20_im -= B0_re;
831  o30_re += A0_im;
832  o30_im -= A0_re;
833 
834  o01_re += A1_re;
835  o01_im += A1_im;
836  o11_re += B1_re;
837  o11_im += B1_im;
838  o21_re += B1_im;
839  o21_im -= B1_re;
840  o31_re += A1_im;
841  o31_im -= A1_re;
842 
843  o02_re += A2_re;
844  o02_im += A2_im;
845  o12_re += B2_re;
846  o12_im += B2_im;
847  o22_re += B2_im;
848  o22_im -= B2_re;
849  o32_re += A2_im;
850  o32_im -= A2_re;
851 
852 }
853 
854 #ifdef MULTI_GPU
855 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
856  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
857 #endif
858 {
859  // Projector P1-
860  // 1 0 0 -1
861  // 0 1 1 0
862  // 0 1 1 0
863  // -1 0 0 1
864 
865 #ifdef MULTI_GPU
866  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
867  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
868 #else
869  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
870 #endif
871 
872  const int ga_idx = sid;
873 
880 
881 #ifdef MULTI_GPU
882  if (kernel_type == INTERIOR_KERNEL) {
883 #endif
884 
885  // read spinor from device memory
886  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
887 
888  // project spinor into half spinors
889  a0_re = +i00_re-i30_re;
890  a0_im = +i00_im-i30_im;
891  a1_re = +i01_re-i31_re;
892  a1_im = +i01_im-i31_im;
893  a2_re = +i02_re-i32_re;
894  a2_im = +i02_im-i32_im;
895  b0_re = +i10_re+i20_re;
896  b0_im = +i10_im+i20_im;
897  b1_re = +i11_re+i21_re;
898  b1_im = +i11_im+i21_im;
899  b2_re = +i12_re+i22_re;
900  b2_im = +i12_im+i22_im;
901 
902 #ifdef MULTI_GPU
903  } else {
904 
905  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
906 
907  // read half spinor from device memory
908  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
909 
910  a0_re = i00_re; a0_im = i00_im;
911  a1_re = i01_re; a1_im = i01_im;
912  a2_re = i02_re; a2_im = i02_im;
913  b0_re = i10_re; b0_im = i10_im;
914  b1_re = i11_re; b1_im = i11_im;
915  b2_re = i12_re; b2_im = i12_im;
916 
917  }
918 #endif // MULTI_GPU
919 
920  // read gauge matrix from device memory
921  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
922 
923  // reconstruct gauge matrix
925 
926  // multiply row 0
927  spinorFloat A0_re = 0;
928  A0_re += g00_re * a0_re;
929  A0_re -= g00_im * a0_im;
930  A0_re += g01_re * a1_re;
931  A0_re -= g01_im * a1_im;
932  A0_re += g02_re * a2_re;
933  A0_re -= g02_im * a2_im;
934  spinorFloat A0_im = 0;
935  A0_im += g00_re * a0_im;
936  A0_im += g00_im * a0_re;
937  A0_im += g01_re * a1_im;
938  A0_im += g01_im * a1_re;
939  A0_im += g02_re * a2_im;
940  A0_im += g02_im * a2_re;
941  spinorFloat B0_re = 0;
942  B0_re += g00_re * b0_re;
943  B0_re -= g00_im * b0_im;
944  B0_re += g01_re * b1_re;
945  B0_re -= g01_im * b1_im;
946  B0_re += g02_re * b2_re;
947  B0_re -= g02_im * b2_im;
948  spinorFloat B0_im = 0;
949  B0_im += g00_re * b0_im;
950  B0_im += g00_im * b0_re;
951  B0_im += g01_re * b1_im;
952  B0_im += g01_im * b1_re;
953  B0_im += g02_re * b2_im;
954  B0_im += g02_im * b2_re;
955 
956  // multiply row 1
957  spinorFloat A1_re = 0;
958  A1_re += g10_re * a0_re;
959  A1_re -= g10_im * a0_im;
960  A1_re += g11_re * a1_re;
961  A1_re -= g11_im * a1_im;
962  A1_re += g12_re * a2_re;
963  A1_re -= g12_im * a2_im;
964  spinorFloat A1_im = 0;
965  A1_im += g10_re * a0_im;
966  A1_im += g10_im * a0_re;
967  A1_im += g11_re * a1_im;
968  A1_im += g11_im * a1_re;
969  A1_im += g12_re * a2_im;
970  A1_im += g12_im * a2_re;
971  spinorFloat B1_re = 0;
972  B1_re += g10_re * b0_re;
973  B1_re -= g10_im * b0_im;
974  B1_re += g11_re * b1_re;
975  B1_re -= g11_im * b1_im;
976  B1_re += g12_re * b2_re;
977  B1_re -= g12_im * b2_im;
978  spinorFloat B1_im = 0;
979  B1_im += g10_re * b0_im;
980  B1_im += g10_im * b0_re;
981  B1_im += g11_re * b1_im;
982  B1_im += g11_im * b1_re;
983  B1_im += g12_re * b2_im;
984  B1_im += g12_im * b2_re;
985 
986  // multiply row 2
987  spinorFloat A2_re = 0;
988  A2_re += g20_re * a0_re;
989  A2_re -= g20_im * a0_im;
990  A2_re += g21_re * a1_re;
991  A2_re -= g21_im * a1_im;
992  A2_re += g22_re * a2_re;
993  A2_re -= g22_im * a2_im;
994  spinorFloat A2_im = 0;
995  A2_im += g20_re * a0_im;
996  A2_im += g20_im * a0_re;
997  A2_im += g21_re * a1_im;
998  A2_im += g21_im * a1_re;
999  A2_im += g22_re * a2_im;
1000  A2_im += g22_im * a2_re;
1001  spinorFloat B2_re = 0;
1002  B2_re += g20_re * b0_re;
1003  B2_re -= g20_im * b0_im;
1004  B2_re += g21_re * b1_re;
1005  B2_re -= g21_im * b1_im;
1006  B2_re += g22_re * b2_re;
1007  B2_re -= g22_im * b2_im;
1008  spinorFloat B2_im = 0;
1009  B2_im += g20_re * b0_im;
1010  B2_im += g20_im * b0_re;
1011  B2_im += g21_re * b1_im;
1012  B2_im += g21_im * b1_re;
1013  B2_im += g22_re * b2_im;
1014  B2_im += g22_im * b2_re;
1015 
1016  o00_re += A0_re;
1017  o00_im += A0_im;
1018  o10_re += B0_re;
1019  o10_im += B0_im;
1020  o20_re += B0_re;
1021  o20_im += B0_im;
1022  o30_re -= A0_re;
1023  o30_im -= A0_im;
1024 
1025  o01_re += A1_re;
1026  o01_im += A1_im;
1027  o11_re += B1_re;
1028  o11_im += B1_im;
1029  o21_re += B1_re;
1030  o21_im += B1_im;
1031  o31_re -= A1_re;
1032  o31_im -= A1_im;
1033 
1034  o02_re += A2_re;
1035  o02_im += A2_im;
1036  o12_re += B2_re;
1037  o12_im += B2_im;
1038  o22_re += B2_re;
1039  o22_im += B2_im;
1040  o32_re -= A2_re;
1041  o32_im -= A2_im;
1042 
1043 }
1044 
1045 #ifdef MULTI_GPU
1046 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1047  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1048 #endif
1049 {
1050  // Projector P1+
1051  // 1 0 0 1
1052  // 0 1 -1 0
1053  // 0 -1 1 0
1054  // 1 0 0 1
1055 
1056 #ifdef MULTI_GPU
1057  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1058  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1059 #else
1060  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1061 #endif
1062 
1063 #ifdef MULTI_GPU
1064  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1065 #else
1066  const int ga_idx = sp_idx;
1067 #endif
1068 
1075 
1076 #ifdef MULTI_GPU
1077  if (kernel_type == INTERIOR_KERNEL) {
1078 #endif
1079 
1080  // read spinor from device memory
1081  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1082 
1083  // project spinor into half spinors
1084  a0_re = +i00_re+i30_re;
1085  a0_im = +i00_im+i30_im;
1086  a1_re = +i01_re+i31_re;
1087  a1_im = +i01_im+i31_im;
1088  a2_re = +i02_re+i32_re;
1089  a2_im = +i02_im+i32_im;
1090  b0_re = +i10_re-i20_re;
1091  b0_im = +i10_im-i20_im;
1092  b1_re = +i11_re-i21_re;
1093  b1_im = +i11_im-i21_im;
1094  b2_re = +i12_re-i22_re;
1095  b2_im = +i12_im-i22_im;
1096 
1097 #ifdef MULTI_GPU
1098  } else {
1099 
1100  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1101 
1102  // read half spinor from device memory
1103  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1104 
1105  a0_re = i00_re; a0_im = i00_im;
1106  a1_re = i01_re; a1_im = i01_im;
1107  a2_re = i02_re; a2_im = i02_im;
1108  b0_re = i10_re; b0_im = i10_im;
1109  b1_re = i11_re; b1_im = i11_im;
1110  b2_re = i12_re; b2_im = i12_im;
1111 
1112  }
1113 #endif // MULTI_GPU
1114 
1115  // read gauge matrix from device memory
1116  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1117 
1118  // reconstruct gauge matrix
1120 
1121  // multiply row 0
1122  spinorFloat A0_re = 0;
1123  A0_re += gT00_re * a0_re;
1124  A0_re -= gT00_im * a0_im;
1125  A0_re += gT01_re * a1_re;
1126  A0_re -= gT01_im * a1_im;
1127  A0_re += gT02_re * a2_re;
1128  A0_re -= gT02_im * a2_im;
1129  spinorFloat A0_im = 0;
1130  A0_im += gT00_re * a0_im;
1131  A0_im += gT00_im * a0_re;
1132  A0_im += gT01_re * a1_im;
1133  A0_im += gT01_im * a1_re;
1134  A0_im += gT02_re * a2_im;
1135  A0_im += gT02_im * a2_re;
1136  spinorFloat B0_re = 0;
1137  B0_re += gT00_re * b0_re;
1138  B0_re -= gT00_im * b0_im;
1139  B0_re += gT01_re * b1_re;
1140  B0_re -= gT01_im * b1_im;
1141  B0_re += gT02_re * b2_re;
1142  B0_re -= gT02_im * b2_im;
1143  spinorFloat B0_im = 0;
1144  B0_im += gT00_re * b0_im;
1145  B0_im += gT00_im * b0_re;
1146  B0_im += gT01_re * b1_im;
1147  B0_im += gT01_im * b1_re;
1148  B0_im += gT02_re * b2_im;
1149  B0_im += gT02_im * b2_re;
1150 
1151  // multiply row 1
1152  spinorFloat A1_re = 0;
1153  A1_re += gT10_re * a0_re;
1154  A1_re -= gT10_im * a0_im;
1155  A1_re += gT11_re * a1_re;
1156  A1_re -= gT11_im * a1_im;
1157  A1_re += gT12_re * a2_re;
1158  A1_re -= gT12_im * a2_im;
1159  spinorFloat A1_im = 0;
1160  A1_im += gT10_re * a0_im;
1161  A1_im += gT10_im * a0_re;
1162  A1_im += gT11_re * a1_im;
1163  A1_im += gT11_im * a1_re;
1164  A1_im += gT12_re * a2_im;
1165  A1_im += gT12_im * a2_re;
1166  spinorFloat B1_re = 0;
1167  B1_re += gT10_re * b0_re;
1168  B1_re -= gT10_im * b0_im;
1169  B1_re += gT11_re * b1_re;
1170  B1_re -= gT11_im * b1_im;
1171  B1_re += gT12_re * b2_re;
1172  B1_re -= gT12_im * b2_im;
1173  spinorFloat B1_im = 0;
1174  B1_im += gT10_re * b0_im;
1175  B1_im += gT10_im * b0_re;
1176  B1_im += gT11_re * b1_im;
1177  B1_im += gT11_im * b1_re;
1178  B1_im += gT12_re * b2_im;
1179  B1_im += gT12_im * b2_re;
1180 
1181  // multiply row 2
1182  spinorFloat A2_re = 0;
1183  A2_re += gT20_re * a0_re;
1184  A2_re -= gT20_im * a0_im;
1185  A2_re += gT21_re * a1_re;
1186  A2_re -= gT21_im * a1_im;
1187  A2_re += gT22_re * a2_re;
1188  A2_re -= gT22_im * a2_im;
1189  spinorFloat A2_im = 0;
1190  A2_im += gT20_re * a0_im;
1191  A2_im += gT20_im * a0_re;
1192  A2_im += gT21_re * a1_im;
1193  A2_im += gT21_im * a1_re;
1194  A2_im += gT22_re * a2_im;
1195  A2_im += gT22_im * a2_re;
1196  spinorFloat B2_re = 0;
1197  B2_re += gT20_re * b0_re;
1198  B2_re -= gT20_im * b0_im;
1199  B2_re += gT21_re * b1_re;
1200  B2_re -= gT21_im * b1_im;
1201  B2_re += gT22_re * b2_re;
1202  B2_re -= gT22_im * b2_im;
1203  spinorFloat B2_im = 0;
1204  B2_im += gT20_re * b0_im;
1205  B2_im += gT20_im * b0_re;
1206  B2_im += gT21_re * b1_im;
1207  B2_im += gT21_im * b1_re;
1208  B2_im += gT22_re * b2_im;
1209  B2_im += gT22_im * b2_re;
1210 
1211  o00_re += A0_re;
1212  o00_im += A0_im;
1213  o10_re += B0_re;
1214  o10_im += B0_im;
1215  o20_re -= B0_re;
1216  o20_im -= B0_im;
1217  o30_re += A0_re;
1218  o30_im += A0_im;
1219 
1220  o01_re += A1_re;
1221  o01_im += A1_im;
1222  o11_re += B1_re;
1223  o11_im += B1_im;
1224  o21_re -= B1_re;
1225  o21_im -= B1_im;
1226  o31_re += A1_re;
1227  o31_im += A1_im;
1228 
1229  o02_re += A2_re;
1230  o02_im += A2_im;
1231  o12_re += B2_re;
1232  o12_im += B2_im;
1233  o22_re -= B2_re;
1234  o22_im -= B2_im;
1235  o32_re += A2_re;
1236  o32_im += A2_im;
1237 
1238 }
1239 
// --- Forward z-neighbor contribution, spin projector P2- ---
// Gathers psi(x+zhat), projects onto a two-row half spinor (a,b), multiplies
// by the link at the current site (gauge direction index 4, non-daggered),
// and accumulates into the output spinor, rebuilding spin rows 2/3 from the
// projected rows via the projector structure shown below.
1240 #ifdef MULTI_GPU
// Interior pass handles this hop unless the site sits on the +z process
// boundary with comms enabled; that boundary site is finished by the
// EXTERIOR_KERNEL_Z pass using ghost-zone data instead.
1241 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1242  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1243 #endif
1244 {
1245  // Projector P2-
1246  // 1 0 -i 0
1247  // 0 1 0 i
1248  // i 0 1 0
1249  // 0 -i 0 1
1250 
1251 #ifdef MULTI_GPU
 // Neighbor site index: X+X2X1 steps x3 by +1 (wrapping via X-X3X2X1mX2X1 at
 // the lattice edge); ">> 1" converts the full index to a checkerboard index.
 // The exterior pass instead reads from the ghost buffer at this dimension's
 // offset.
1252  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1253  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1254 #else
1255  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1256 #endif
1257 
 // Forward hop: the gauge link is stored at the current site.
1258  const int ga_idx = sid;
1259 
 // NOTE(review): the doxygen extraction dropped original lines 1260-1265 here
 // (presumably the half-spinor a0..b2 declarations) — consult the generated
 // header for the missing text.
1266 
1267 #ifdef MULTI_GPU
1268  if (kernel_type == INTERIOR_KERNEL) {
1269 #endif
1270 
1271  // read spinor from device memory
1272  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1273 
1274  // project spinor into half spinors
 // a = row 0 of P2- * psi, b = row 1; rows 2/3 follow from these (see the
 // accumulation at the bottom of this block).
1275  a0_re = +i00_re+i20_im;
1276  a0_im = +i00_im-i20_re;
1277  a1_re = +i01_re+i21_im;
1278  a1_im = +i01_im-i21_re;
1279  a2_re = +i02_re+i22_im;
1280  a2_im = +i02_im-i22_re;
1281  b0_re = +i10_re-i30_im;
1282  b0_im = +i10_im+i30_re;
1283  b1_re = +i11_re-i31_im;
1284  b1_im = +i11_im+i31_re;
1285  b2_re = +i12_re-i32_im;
1286  b2_im = +i12_im+i32_re;
1287 
1288 #ifdef MULTI_GPU
1289  } else {
 // Exterior pass: the neighbor was already projected on the sending rank,
 // so the ghost buffer holds a ready-made half spinor.
1290 
1291  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1292 
1293  // read half spinor from device memory
1294  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1295 
1296  a0_re = i00_re; a0_im = i00_im;
1297  a1_re = i01_re; a1_im = i01_im;
1298  a2_re = i02_re; a2_im = i02_im;
1299  b0_re = i10_re; b0_im = i10_im;
1300  b1_re = i11_re; b1_im = i11_im;
1301  b2_re = i12_re; b2_im = i12_im;
1302 
1303  }
1304 #endif // MULTI_GPU
1305 
1306  // read gauge matrix from device memory
1307  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1308 
1309  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (orig. line 1310)
 // was dropped by the doxygen extraction.
1311 
 // Fully unrolled complex 3x3 matrix-vector products: A = U*a, B = U*b.
1312  // multiply row 0
1313  spinorFloat A0_re = 0;
1314  A0_re += g00_re * a0_re;
1315  A0_re -= g00_im * a0_im;
1316  A0_re += g01_re * a1_re;
1317  A0_re -= g01_im * a1_im;
1318  A0_re += g02_re * a2_re;
1319  A0_re -= g02_im * a2_im;
1320  spinorFloat A0_im = 0;
1321  A0_im += g00_re * a0_im;
1322  A0_im += g00_im * a0_re;
1323  A0_im += g01_re * a1_im;
1324  A0_im += g01_im * a1_re;
1325  A0_im += g02_re * a2_im;
1326  A0_im += g02_im * a2_re;
1327  spinorFloat B0_re = 0;
1328  B0_re += g00_re * b0_re;
1329  B0_re -= g00_im * b0_im;
1330  B0_re += g01_re * b1_re;
1331  B0_re -= g01_im * b1_im;
1332  B0_re += g02_re * b2_re;
1333  B0_re -= g02_im * b2_im;
1334  spinorFloat B0_im = 0;
1335  B0_im += g00_re * b0_im;
1336  B0_im += g00_im * b0_re;
1337  B0_im += g01_re * b1_im;
1338  B0_im += g01_im * b1_re;
1339  B0_im += g02_re * b2_im;
1340  B0_im += g02_im * b2_re;
1341 
1342  // multiply row 1
1343  spinorFloat A1_re = 0;
1344  A1_re += g10_re * a0_re;
1345  A1_re -= g10_im * a0_im;
1346  A1_re += g11_re * a1_re;
1347  A1_re -= g11_im * a1_im;
1348  A1_re += g12_re * a2_re;
1349  A1_re -= g12_im * a2_im;
1350  spinorFloat A1_im = 0;
1351  A1_im += g10_re * a0_im;
1352  A1_im += g10_im * a0_re;
1353  A1_im += g11_re * a1_im;
1354  A1_im += g11_im * a1_re;
1355  A1_im += g12_re * a2_im;
1356  A1_im += g12_im * a2_re;
1357  spinorFloat B1_re = 0;
1358  B1_re += g10_re * b0_re;
1359  B1_re -= g10_im * b0_im;
1360  B1_re += g11_re * b1_re;
1361  B1_re -= g11_im * b1_im;
1362  B1_re += g12_re * b2_re;
1363  B1_re -= g12_im * b2_im;
1364  spinorFloat B1_im = 0;
1365  B1_im += g10_re * b0_im;
1366  B1_im += g10_im * b0_re;
1367  B1_im += g11_re * b1_im;
1368  B1_im += g11_im * b1_re;
1369  B1_im += g12_re * b2_im;
1370  B1_im += g12_im * b2_re;
1371 
1372  // multiply row 2
1373  spinorFloat A2_re = 0;
1374  A2_re += g20_re * a0_re;
1375  A2_re -= g20_im * a0_im;
1376  A2_re += g21_re * a1_re;
1377  A2_re -= g21_im * a1_im;
1378  A2_re += g22_re * a2_re;
1379  A2_re -= g22_im * a2_im;
1380  spinorFloat A2_im = 0;
1381  A2_im += g20_re * a0_im;
1382  A2_im += g20_im * a0_re;
1383  A2_im += g21_re * a1_im;
1384  A2_im += g21_im * a1_re;
1385  A2_im += g22_re * a2_im;
1386  A2_im += g22_im * a2_re;
1387  spinorFloat B2_re = 0;
1388  B2_re += g20_re * b0_re;
1389  B2_re -= g20_im * b0_im;
1390  B2_re += g21_re * b1_re;
1391  B2_re -= g21_im * b1_im;
1392  B2_re += g22_re * b2_re;
1393  B2_re -= g22_im * b2_im;
1394  spinorFloat B2_im = 0;
1395  B2_im += g20_re * b0_im;
1396  B2_im += g20_im * b0_re;
1397  B2_im += g21_re * b1_im;
1398  B2_im += g21_im * b1_re;
1399  B2_im += g22_re * b2_im;
1400  B2_im += g22_im * b2_re;
1401 
 // Accumulate into the output spinor.  Rows 0/1 add A/B directly; rows 2/3
 // receive +i*A and -i*B respectively, matching the lower rows of P2-
 // (o2_re -= A_im, o2_im += A_re is "+= i*A"; the o3 signs are "-= i*B").
1402  o00_re += A0_re;
1403  o00_im += A0_im;
1404  o10_re += B0_re;
1405  o10_im += B0_im;
1406  o20_re -= A0_im;
1407  o20_im += A0_re;
1408  o30_re += B0_im;
1409  o30_im -= B0_re;
1410 
1411  o01_re += A1_re;
1412  o01_im += A1_im;
1413  o11_re += B1_re;
1414  o11_im += B1_im;
1415  o21_re -= A1_im;
1416  o21_im += A1_re;
1417  o31_re += B1_im;
1418  o31_im -= B1_re;
1419 
1420  o02_re += A2_re;
1421  o02_im += A2_im;
1422  o12_re += B2_re;
1423  o12_im += B2_im;
1424  o22_re -= A2_im;
1425  o22_im += A2_re;
1426  o32_re += B2_im;
1427  o32_im -= B2_re;
1428 
1429 }
1430 
// --- Backward z-neighbor contribution, spin projector P2+ ---
// Gathers psi(x-zhat), projects it onto a half spinor, multiplies by the
// conjugate-transposed link stored at the neighbor site (gauge direction
// index 5, gT coefficients), and accumulates into the output spinor.
1431 #ifdef MULTI_GPU
1432 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1433  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1434 #endif
1435 {
1436  // Projector P2+
1437  // 1 0 i 0
1438  // 0 1 0 -i
1439  // -i 0 1 0
1440  // 0 i 0 1
1441 
1442 #ifdef MULTI_GPU
 // Neighbor index: X-X2X1 steps x3 by -1, wrapping forward at x3==0.
1443  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1444  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1445 #else
1446  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1447 #endif
1448 
1449 #ifdef MULTI_GPU
 // Backward hop: the link lives at the neighbor site; on the exterior pass
 // the exchanged boundary link is stored past the local volume at
 // Vh+face_idx.
1450  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1451 #else
1452  const int ga_idx = sp_idx;
1453 #endif
1454 
 // NOTE(review): the doxygen extraction dropped original lines 1455-1460 here
 // (presumably the half-spinor a0..b2 declarations).
1461 
1462 #ifdef MULTI_GPU
1463  if (kernel_type == INTERIOR_KERNEL) {
1464 #endif
1465 
1466  // read spinor from device memory
1467  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1468 
1469  // project spinor into half spinors
1470  a0_re = +i00_re-i20_im;
1471  a0_im = +i00_im+i20_re;
1472  a1_re = +i01_re-i21_im;
1473  a1_im = +i01_im+i21_re;
1474  a2_re = +i02_re-i22_im;
1475  a2_im = +i02_im+i22_re;
1476  b0_re = +i10_re+i30_im;
1477  b0_im = +i10_im-i30_re;
1478  b1_re = +i11_re+i31_im;
1479  b1_im = +i11_im-i31_re;
1480  b2_re = +i12_re+i32_im;
1481  b2_im = +i12_im-i32_re;
1482 
1483 #ifdef MULTI_GPU
1484  } else {
1485 
1486  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1487 
 // Pre-projected half spinor from the ghost zone; unlike the forward hop,
 // no (SPINOR_HOP/2)*stride offset is added — this reads the other half of
 // the ghost buffer.
1488  // read half spinor from device memory
1489  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1490 
1491  a0_re = i00_re; a0_im = i00_im;
1492  a1_re = i01_re; a1_im = i01_im;
1493  a2_re = i02_re; a2_im = i02_im;
1494  b0_re = i10_re; b0_im = i10_im;
1495  b1_re = i11_re; b1_im = i11_im;
1496  b2_re = i12_re; b2_im = i12_im;
1497 
1498  }
1499 #endif // MULTI_GPU
1500 
1501  // read gauge matrix from device memory
1502  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1503 
1504  // reconstruct gauge matrix
 // NOTE(review): the RECONSTRUCT_GAUGE_MATRIX invocation (orig. line 1505)
 // was dropped by the doxygen extraction.
1506 
 // Unrolled complex 3x3 products with the daggered link: A = U^dag*a,
 // B = U^dag*b (gTij = conj(Uji)).
1507  // multiply row 0
1508  spinorFloat A0_re = 0;
1509  A0_re += gT00_re * a0_re;
1510  A0_re -= gT00_im * a0_im;
1511  A0_re += gT01_re * a1_re;
1512  A0_re -= gT01_im * a1_im;
1513  A0_re += gT02_re * a2_re;
1514  A0_re -= gT02_im * a2_im;
1515  spinorFloat A0_im = 0;
1516  A0_im += gT00_re * a0_im;
1517  A0_im += gT00_im * a0_re;
1518  A0_im += gT01_re * a1_im;
1519  A0_im += gT01_im * a1_re;
1520  A0_im += gT02_re * a2_im;
1521  A0_im += gT02_im * a2_re;
1522  spinorFloat B0_re = 0;
1523  B0_re += gT00_re * b0_re;
1524  B0_re -= gT00_im * b0_im;
1525  B0_re += gT01_re * b1_re;
1526  B0_re -= gT01_im * b1_im;
1527  B0_re += gT02_re * b2_re;
1528  B0_re -= gT02_im * b2_im;
1529  spinorFloat B0_im = 0;
1530  B0_im += gT00_re * b0_im;
1531  B0_im += gT00_im * b0_re;
1532  B0_im += gT01_re * b1_im;
1533  B0_im += gT01_im * b1_re;
1534  B0_im += gT02_re * b2_im;
1535  B0_im += gT02_im * b2_re;
1536 
1537  // multiply row 1
1538  spinorFloat A1_re = 0;
1539  A1_re += gT10_re * a0_re;
1540  A1_re -= gT10_im * a0_im;
1541  A1_re += gT11_re * a1_re;
1542  A1_re -= gT11_im * a1_im;
1543  A1_re += gT12_re * a2_re;
1544  A1_re -= gT12_im * a2_im;
1545  spinorFloat A1_im = 0;
1546  A1_im += gT10_re * a0_im;
1547  A1_im += gT10_im * a0_re;
1548  A1_im += gT11_re * a1_im;
1549  A1_im += gT11_im * a1_re;
1550  A1_im += gT12_re * a2_im;
1551  A1_im += gT12_im * a2_re;
1552  spinorFloat B1_re = 0;
1553  B1_re += gT10_re * b0_re;
1554  B1_re -= gT10_im * b0_im;
1555  B1_re += gT11_re * b1_re;
1556  B1_re -= gT11_im * b1_im;
1557  B1_re += gT12_re * b2_re;
1558  B1_re -= gT12_im * b2_im;
1559  spinorFloat B1_im = 0;
1560  B1_im += gT10_re * b0_im;
1561  B1_im += gT10_im * b0_re;
1562  B1_im += gT11_re * b1_im;
1563  B1_im += gT11_im * b1_re;
1564  B1_im += gT12_re * b2_im;
1565  B1_im += gT12_im * b2_re;
1566 
1567  // multiply row 2
1568  spinorFloat A2_re = 0;
1569  A2_re += gT20_re * a0_re;
1570  A2_re -= gT20_im * a0_im;
1571  A2_re += gT21_re * a1_re;
1572  A2_re -= gT21_im * a1_im;
1573  A2_re += gT22_re * a2_re;
1574  A2_re -= gT22_im * a2_im;
1575  spinorFloat A2_im = 0;
1576  A2_im += gT20_re * a0_im;
1577  A2_im += gT20_im * a0_re;
1578  A2_im += gT21_re * a1_im;
1579  A2_im += gT21_im * a1_re;
1580  A2_im += gT22_re * a2_im;
1581  A2_im += gT22_im * a2_re;
1582  spinorFloat B2_re = 0;
1583  B2_re += gT20_re * b0_re;
1584  B2_re -= gT20_im * b0_im;
1585  B2_re += gT21_re * b1_re;
1586  B2_re -= gT21_im * b1_im;
1587  B2_re += gT22_re * b2_re;
1588  B2_re -= gT22_im * b2_im;
1589  spinorFloat B2_im = 0;
1590  B2_im += gT20_re * b0_im;
1591  B2_im += gT20_im * b0_re;
1592  B2_im += gT21_re * b1_im;
1593  B2_im += gT21_im * b1_re;
1594  B2_im += gT22_re * b2_im;
1595  B2_im += gT22_im * b2_re;
1596 
 // Accumulate: rows 2/3 receive -i*A and +i*B, the lower rows of P2+
 // (opposite signs to the forward-z block).
1597  o00_re += A0_re;
1598  o00_im += A0_im;
1599  o10_re += B0_re;
1600  o10_im += B0_im;
1601  o20_re += A0_im;
1602  o20_im -= A0_re;
1603  o30_re -= B0_im;
1604  o30_im += B0_re;
1605 
1606  o01_re += A1_re;
1607  o01_im += A1_im;
1608  o11_re += B1_re;
1609  o11_im += B1_im;
1610  o21_re += A1_im;
1611  o21_im -= A1_re;
1612  o31_re -= B1_im;
1613  o31_im += B1_re;
1614 
1615  o02_re += A2_re;
1616  o02_im += A2_im;
1617  o12_re += B2_re;
1618  o12_im += B2_im;
1619  o22_re += A2_im;
1620  o22_im -= A2_re;
1621  o32_re -= B2_im;
1622  o32_im += B2_re;
1623 
1624 }
1625 
// --- Forward t-neighbor contribution, spin projector P3- ---
// P3- = diag(0,0,2,2): only the lower two spin rows survive, so only those
// components are gathered (READ_SPINOR_DOWN) and only o2*/o3* are updated.
// Has a fast path that skips the SU(3) multiply when the link is the
// identity (temporal gauge fixing).
1626 #ifdef MULTI_GPU
1627 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1628  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1629 #endif
1630 {
1631  // Projector P3-
1632  // 0 0 0 0
1633  // 0 0 0 0
1634  // 0 0 2 0
1635  // 0 0 0 2
1636 
1637 #ifdef MULTI_GPU
 // Neighbor index: X+X3X2X1 steps x4 by +1, wrapping at the lattice edge.
1638  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1639  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1640 #else
1641  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1642 #endif
1643 
1644  const int ga_idx = sid;
1645 
 // NOTE(review): the doxygen extraction dropped original line 1646 here —
 // presumably `if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)` as in the
 // matching backward-t block further down.
1647  {
 // NOTE(review): orig. lines 1648-1653 (declarations) also dropped.
1654 
1655 #ifdef MULTI_GPU
1656  if (kernel_type == INTERIOR_KERNEL) {
1657 #endif
1658 
1659  // read spinor from device memory
1660  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1661 
1662  // project spinor into half spinors
1663  a0_re = +2*i20_re;
1664  a0_im = +2*i20_im;
1665  a1_re = +2*i21_re;
1666  a1_im = +2*i21_im;
1667  a2_re = +2*i22_re;
1668  a2_im = +2*i22_im;
1669  b0_re = +2*i30_re;
1670  b0_im = +2*i30_im;
1671  b1_re = +2*i31_re;
1672  b1_im = +2*i31_im;
1673  b2_re = +2*i32_re;
1674  b2_im = +2*i32_im;
1675 
1676 #ifdef MULTI_GPU
1677  } else {
1678 
1679  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
 // Scale applied to t-direction ghost half spinors; value of TPROJSCALE is
 // defined elsewhere — presumably compensating for the factor of 2 in P3.
1680  const int t_proj_scale = TPROJSCALE;
1681 
1682  // read half spinor from device memory
1683  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1684 
1685  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1686  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1687  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1688  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1689  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1690  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1691 
1692  }
1693 #endif // MULTI_GPU
1694 
 // Identity-link fast path: no matrix multiply needed.
1695  // identity gauge matrix
 // NOTE(review): orig. lines 1696-1701 (presumably A*/B* = a*/b* copies)
 // were dropped by the extraction; A0_re etc. used below come from there.
1702 
1703  o20_re += A0_re;
1704  o20_im += A0_im;
1705  o30_re += B0_re;
1706  o30_im += B0_im;
1707 
1708  o21_re += A1_re;
1709  o21_im += A1_im;
1710  o31_re += B1_re;
1711  o31_im += B1_im;
1712 
1713  o22_re += A2_re;
1714  o22_im += A2_im;
1715  o32_re += B2_re;
1716  o32_im += B2_im;
1717 
 // General path: same gather, followed by the full SU(3) multiply.
1718  } else {
 // NOTE(review): orig. lines 1719-1724 (declarations) dropped by extraction.
1725 
1726 #ifdef MULTI_GPU
1727  if (kernel_type == INTERIOR_KERNEL) {
1728 #endif
1729 
1730  // read spinor from device memory
1731  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1732 
1733  // project spinor into half spinors
1734  a0_re = +2*i20_re;
1735  a0_im = +2*i20_im;
1736  a1_re = +2*i21_re;
1737  a1_im = +2*i21_im;
1738  a2_re = +2*i22_re;
1739  a2_im = +2*i22_im;
1740  b0_re = +2*i30_re;
1741  b0_im = +2*i30_im;
1742  b1_re = +2*i31_re;
1743  b1_im = +2*i31_im;
1744  b2_re = +2*i32_re;
1745  b2_im = +2*i32_im;
1746 
1747 #ifdef MULTI_GPU
1748  } else {
1749 
1750  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1751  const int t_proj_scale = TPROJSCALE;
1752 
1753  // read half spinor from device memory
1754  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1755 
1756  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1757  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1758  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1759  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1760  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1761  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1762 
1763  }
1764 #endif // MULTI_GPU
1765 
1766  // read gauge matrix from device memory
1767  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1768 
1769  // reconstruct gauge matrix
 // NOTE(review): RECONSTRUCT_GAUGE_MATRIX (orig. line 1770) dropped by the
 // extraction.
1771 
 // Unrolled complex 3x3 products: A = U*a, B = U*b.
1772  // multiply row 0
1773  spinorFloat A0_re = 0;
1774  A0_re += g00_re * a0_re;
1775  A0_re -= g00_im * a0_im;
1776  A0_re += g01_re * a1_re;
1777  A0_re -= g01_im * a1_im;
1778  A0_re += g02_re * a2_re;
1779  A0_re -= g02_im * a2_im;
1780  spinorFloat A0_im = 0;
1781  A0_im += g00_re * a0_im;
1782  A0_im += g00_im * a0_re;
1783  A0_im += g01_re * a1_im;
1784  A0_im += g01_im * a1_re;
1785  A0_im += g02_re * a2_im;
1786  A0_im += g02_im * a2_re;
1787  spinorFloat B0_re = 0;
1788  B0_re += g00_re * b0_re;
1789  B0_re -= g00_im * b0_im;
1790  B0_re += g01_re * b1_re;
1791  B0_re -= g01_im * b1_im;
1792  B0_re += g02_re * b2_re;
1793  B0_re -= g02_im * b2_im;
1794  spinorFloat B0_im = 0;
1795  B0_im += g00_re * b0_im;
1796  B0_im += g00_im * b0_re;
1797  B0_im += g01_re * b1_im;
1798  B0_im += g01_im * b1_re;
1799  B0_im += g02_re * b2_im;
1800  B0_im += g02_im * b2_re;
1801 
1802  // multiply row 1
1803  spinorFloat A1_re = 0;
1804  A1_re += g10_re * a0_re;
1805  A1_re -= g10_im * a0_im;
1806  A1_re += g11_re * a1_re;
1807  A1_re -= g11_im * a1_im;
1808  A1_re += g12_re * a2_re;
1809  A1_re -= g12_im * a2_im;
1810  spinorFloat A1_im = 0;
1811  A1_im += g10_re * a0_im;
1812  A1_im += g10_im * a0_re;
1813  A1_im += g11_re * a1_im;
1814  A1_im += g11_im * a1_re;
1815  A1_im += g12_re * a2_im;
1816  A1_im += g12_im * a2_re;
1817  spinorFloat B1_re = 0;
1818  B1_re += g10_re * b0_re;
1819  B1_re -= g10_im * b0_im;
1820  B1_re += g11_re * b1_re;
1821  B1_re -= g11_im * b1_im;
1822  B1_re += g12_re * b2_re;
1823  B1_re -= g12_im * b2_im;
1824  spinorFloat B1_im = 0;
1825  B1_im += g10_re * b0_im;
1826  B1_im += g10_im * b0_re;
1827  B1_im += g11_re * b1_im;
1828  B1_im += g11_im * b1_re;
1829  B1_im += g12_re * b2_im;
1830  B1_im += g12_im * b2_re;
1831 
1832  // multiply row 2
1833  spinorFloat A2_re = 0;
1834  A2_re += g20_re * a0_re;
1835  A2_re -= g20_im * a0_im;
1836  A2_re += g21_re * a1_re;
1837  A2_re -= g21_im * a1_im;
1838  A2_re += g22_re * a2_re;
1839  A2_re -= g22_im * a2_im;
1840  spinorFloat A2_im = 0;
1841  A2_im += g20_re * a0_im;
1842  A2_im += g20_im * a0_re;
1843  A2_im += g21_re * a1_im;
1844  A2_im += g21_im * a1_re;
1845  A2_im += g22_re * a2_im;
1846  A2_im += g22_im * a2_re;
1847  spinorFloat B2_re = 0;
1848  B2_re += g20_re * b0_re;
1849  B2_re -= g20_im * b0_im;
1850  B2_re += g21_re * b1_re;
1851  B2_re -= g21_im * b1_im;
1852  B2_re += g22_re * b2_re;
1853  B2_re -= g22_im * b2_im;
1854  spinorFloat B2_im = 0;
1855  B2_im += g20_re * b0_im;
1856  B2_im += g20_im * b0_re;
1857  B2_im += g21_re * b1_im;
1858  B2_im += g21_im * b1_re;
1859  B2_im += g22_re * b2_im;
1860  B2_im += g22_im * b2_re;
1861 
 // P3- only feeds the lower spin rows.
1862  o20_re += A0_re;
1863  o20_im += A0_im;
1864  o30_re += B0_re;
1865  o30_im += B0_im;
1866 
1867  o21_re += A1_re;
1868  o21_im += A1_im;
1869  o31_re += B1_re;
1870  o31_im += B1_im;
1871 
1872  o22_re += A2_re;
1873  o22_im += A2_im;
1874  o32_re += B2_re;
1875  o32_im += B2_im;
1876 
1877  }
1878 }
1879 
// --- Backward t-neighbor contribution, spin projector P3+ ---
// P3+ = diag(2,2,0,0): only the upper two spin rows survive
// (READ_SPINOR_UP), and only o0*/o1* are updated.  Uses the daggered link at
// the neighbor site (gauge direction index 7), with an identity-link fast
// path when temporal gauge fixing applies away from the time boundary.
1880 #ifdef MULTI_GPU
1881 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1882  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1883 #endif
1884 {
1885  // Projector P3+
1886  // 2 0 0 0
1887  // 0 2 0 0
1888  // 0 0 0 0
1889  // 0 0 0 0
1890 
1891 #ifdef MULTI_GPU
 // Neighbor index: X-X3X2X1 steps x4 by -1, wrapping forward at x4==0.
1892  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1893  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1894 #else
1895  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1896 #endif
1897 
1898 #ifdef MULTI_GPU
 // Backward hop: link stored at the neighbor; exchanged boundary links live
 // past the local volume at Vh+face_idx on the exterior pass.
1899  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1900 #else
1901  const int ga_idx = sp_idx;
1902 #endif
1903 
 // Gauge-fixed fast path: with temporal gauge fixing the t-links are the
 // identity except on the final time slice (the ga_idx bound excludes it).
1904  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1905  {
 // NOTE(review): orig. lines 1906-1911 (declarations) were dropped by the
 // doxygen extraction.
1912 
1913 #ifdef MULTI_GPU
1914  if (kernel_type == INTERIOR_KERNEL) {
1915 #endif
1916 
1917  // read spinor from device memory
1918  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1919 
1920  // project spinor into half spinors
1921  a0_re = +2*i00_re;
1922  a0_im = +2*i00_im;
1923  a1_re = +2*i01_re;
1924  a1_im = +2*i01_im;
1925  a2_re = +2*i02_re;
1926  a2_im = +2*i02_im;
1927  b0_re = +2*i10_re;
1928  b0_im = +2*i10_im;
1929  b1_re = +2*i11_re;
1930  b1_im = +2*i11_im;
1931  b2_re = +2*i12_re;
1932  b2_im = +2*i12_im;
1933 
1934 #ifdef MULTI_GPU
1935  } else {
1936 
1937  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1938  const int t_proj_scale = TPROJSCALE;
1939 
 // Backward-t ghost read: no (SPINOR_HOP/2)*stride offset — the other half
 // of the ghost buffer from the forward-t hop.
1940  // read half spinor from device memory
1941  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1942 
1943  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1944  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1945  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1946  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1947  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1948  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1949 
1950  }
1951 #endif // MULTI_GPU
1952 
1953  // identity gauge matrix
 // NOTE(review): orig. lines 1954-1959 (presumably A*/B* = a*/b* copies)
 // were dropped by the extraction; A0_re etc. used below come from there.
1960 
1961  o00_re += A0_re;
1962  o00_im += A0_im;
1963  o10_re += B0_re;
1964  o10_im += B0_im;
1965 
1966  o01_re += A1_re;
1967  o01_im += A1_im;
1968  o11_re += B1_re;
1969  o11_im += B1_im;
1970 
1971  o02_re += A2_re;
1972  o02_im += A2_im;
1973  o12_re += B2_re;
1974  o12_im += B2_im;
1975 
 // General path: full daggered SU(3) multiply.
1976  } else {
 // NOTE(review): orig. lines 1977-1982 (declarations) dropped by extraction.
1983 
1984 #ifdef MULTI_GPU
1985  if (kernel_type == INTERIOR_KERNEL) {
1986 #endif
1987 
1988  // read spinor from device memory
1989  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1990 
1991  // project spinor into half spinors
1992  a0_re = +2*i00_re;
1993  a0_im = +2*i00_im;
1994  a1_re = +2*i01_re;
1995  a1_im = +2*i01_im;
1996  a2_re = +2*i02_re;
1997  a2_im = +2*i02_im;
1998  b0_re = +2*i10_re;
1999  b0_im = +2*i10_im;
2000  b1_re = +2*i11_re;
2001  b1_im = +2*i11_im;
2002  b2_re = +2*i12_re;
2003  b2_im = +2*i12_im;
2004 
2005 #ifdef MULTI_GPU
2006  } else {
2007 
2008  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2009  const int t_proj_scale = TPROJSCALE;
2010 
2011  // read half spinor from device memory
2012  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2013 
2014  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2015  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2016  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2017  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2018  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2019  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2020 
2021  }
2022 #endif // MULTI_GPU
2023 
2024  // read gauge matrix from device memory
2025  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2026 
2027  // reconstruct gauge matrix
 // NOTE(review): RECONSTRUCT_GAUGE_MATRIX (orig. line 2028) dropped by the
 // extraction.
2029 
 // Unrolled complex 3x3 products with the daggered link: A = U^dag*a,
 // B = U^dag*b.
2030  // multiply row 0
2031  spinorFloat A0_re = 0;
2032  A0_re += gT00_re * a0_re;
2033  A0_re -= gT00_im * a0_im;
2034  A0_re += gT01_re * a1_re;
2035  A0_re -= gT01_im * a1_im;
2036  A0_re += gT02_re * a2_re;
2037  A0_re -= gT02_im * a2_im;
2038  spinorFloat A0_im = 0;
2039  A0_im += gT00_re * a0_im;
2040  A0_im += gT00_im * a0_re;
2041  A0_im += gT01_re * a1_im;
2042  A0_im += gT01_im * a1_re;
2043  A0_im += gT02_re * a2_im;
2044  A0_im += gT02_im * a2_re;
2045  spinorFloat B0_re = 0;
2046  B0_re += gT00_re * b0_re;
2047  B0_re -= gT00_im * b0_im;
2048  B0_re += gT01_re * b1_re;
2049  B0_re -= gT01_im * b1_im;
2050  B0_re += gT02_re * b2_re;
2051  B0_re -= gT02_im * b2_im;
2052  spinorFloat B0_im = 0;
2053  B0_im += gT00_re * b0_im;
2054  B0_im += gT00_im * b0_re;
2055  B0_im += gT01_re * b1_im;
2056  B0_im += gT01_im * b1_re;
2057  B0_im += gT02_re * b2_im;
2058  B0_im += gT02_im * b2_re;
2059 
2060  // multiply row 1
2061  spinorFloat A1_re = 0;
2062  A1_re += gT10_re * a0_re;
2063  A1_re -= gT10_im * a0_im;
2064  A1_re += gT11_re * a1_re;
2065  A1_re -= gT11_im * a1_im;
2066  A1_re += gT12_re * a2_re;
2067  A1_re -= gT12_im * a2_im;
2068  spinorFloat A1_im = 0;
2069  A1_im += gT10_re * a0_im;
2070  A1_im += gT10_im * a0_re;
2071  A1_im += gT11_re * a1_im;
2072  A1_im += gT11_im * a1_re;
2073  A1_im += gT12_re * a2_im;
2074  A1_im += gT12_im * a2_re;
2075  spinorFloat B1_re = 0;
2076  B1_re += gT10_re * b0_re;
2077  B1_re -= gT10_im * b0_im;
2078  B1_re += gT11_re * b1_re;
2079  B1_re -= gT11_im * b1_im;
2080  B1_re += gT12_re * b2_re;
2081  B1_re -= gT12_im * b2_im;
2082  spinorFloat B1_im = 0;
2083  B1_im += gT10_re * b0_im;
2084  B1_im += gT10_im * b0_re;
2085  B1_im += gT11_re * b1_im;
2086  B1_im += gT11_im * b1_re;
2087  B1_im += gT12_re * b2_im;
2088  B1_im += gT12_im * b2_re;
2089 
2090  // multiply row 2
2091  spinorFloat A2_re = 0;
2092  A2_re += gT20_re * a0_re;
2093  A2_re -= gT20_im * a0_im;
2094  A2_re += gT21_re * a1_re;
2095  A2_re -= gT21_im * a1_im;
2096  A2_re += gT22_re * a2_re;
2097  A2_re -= gT22_im * a2_im;
2098  spinorFloat A2_im = 0;
2099  A2_im += gT20_re * a0_im;
2100  A2_im += gT20_im * a0_re;
2101  A2_im += gT21_re * a1_im;
2102  A2_im += gT21_im * a1_re;
2103  A2_im += gT22_re * a2_im;
2104  A2_im += gT22_im * a2_re;
2105  spinorFloat B2_re = 0;
2106  B2_re += gT20_re * b0_re;
2107  B2_re -= gT20_im * b0_im;
2108  B2_re += gT21_re * b1_re;
2109  B2_re -= gT21_im * b1_im;
2110  B2_re += gT22_re * b2_re;
2111  B2_re -= gT22_im * b2_im;
2112  spinorFloat B2_im = 0;
2113  B2_im += gT20_re * b0_im;
2114  B2_im += gT20_im * b0_re;
2115  B2_im += gT21_re * b1_im;
2116  B2_im += gT21_im * b1_re;
2117  B2_im += gT22_re * b2_im;
2118  B2_im += gT22_im * b2_re;
2119 
 // P3+ only feeds the upper spin rows.
2120  o00_re += A0_re;
2121  o00_im += A0_im;
2122  o10_re += B0_re;
2123  o10_im += B0_im;
2124 
2125  o01_re += A1_re;
2126  o01_im += A1_im;
2127  o11_re += B1_re;
2128  o11_im += B1_im;
2129 
2130  o02_re += A2_re;
2131  o02_im += A2_im;
2132  o12_re += B2_re;
2133  o12_im += B2_im;
2134 
2135  }
2136 }
2137 
// --- Multi-GPU completeness gate for XPAY/clover post-processing ---
// With split dimensions, a boundary site receives contributions from both
// the interior kernel and one or more exterior kernels.  The xpay/clover
// step that follows must run exactly once per site — only in the pass that
// supplies the site's final contribution.  The switch falls through on
// purpose: each kernel type checks its own boundary plus those of every
// exterior pass that runs after it.
2138 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2139 
2140 int incomplete = 0; // Have all 8 contributions been computed for this site?
2141 
2142 switch(kernel_type) { // intentional fall-through
2143 case INTERIOR_KERNEL:
2144  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2145 case EXTERIOR_KERNEL_T:
2146  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2147 case EXTERIOR_KERNEL_Z:
2148  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2149 case EXTERIOR_KERNEL_Y:
2150  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2151 }
2152 
// Guard the block that follows; in single-GPU builds it runs unconditionally.
2153 if (!incomplete)
2154 #endif // MULTI_GPU
2155 {
2156 #ifdef DSLASH_CLOVER
2157 
2158  // change to chiral basis
2159  {
2166  spinorFloat a30_re = o00_re - o20_re;
2167  spinorFloat a30_im = o00_im - o20_im;
2168 
2169  o00_re = a00_re; o00_im = a00_im;
2170  o10_re = a10_re; o10_im = a10_im;
2171  o20_re = a20_re; o20_im = a20_im;
2172  o30_re = a30_re; o30_im = a30_im;
2173  }
2174 
2175  {
2182  spinorFloat a31_re = o01_re - o21_re;
2183  spinorFloat a31_im = o01_im - o21_im;
2184 
2185  o01_re = a01_re; o01_im = a01_im;
2186  o11_re = a11_re; o11_im = a11_im;
2187  o21_re = a21_re; o21_im = a21_im;
2188  o31_re = a31_re; o31_im = a31_im;
2189  }
2190 
2191  {
2198  spinorFloat a32_re = o02_re - o22_re;
2199  spinorFloat a32_im = o02_im - o22_im;
2200 
2201  o02_re = a02_re; o02_im = a02_im;
2202  o12_re = a12_re; o12_im = a12_im;
2203  o22_re = a22_re; o22_im = a22_im;
2204  o32_re = a32_re; o32_im = a32_im;
2205  }
2206 
2207  // apply first chiral block
2208  {
2210 
2217 
2218  a00_re += c00_00_re * o00_re;
2219  a00_im += c00_00_re * o00_im;
2220  a00_re += c00_01_re * o01_re;
2221  a00_re -= c00_01_im * o01_im;
2222  a00_im += c00_01_re * o01_im;
2223  a00_im += c00_01_im * o01_re;
2224  a00_re += c00_02_re * o02_re;
2225  a00_re -= c00_02_im * o02_im;
2226  a00_im += c00_02_re * o02_im;
2227  a00_im += c00_02_im * o02_re;
2228  a00_re += c00_10_re * o10_re;
2229  a00_re -= c00_10_im * o10_im;
2230  a00_im += c00_10_re * o10_im;
2231  a00_im += c00_10_im * o10_re;
2232  a00_re += c00_11_re * o11_re;
2233  a00_re -= c00_11_im * o11_im;
2234  a00_im += c00_11_re * o11_im;
2235  a00_im += c00_11_im * o11_re;
2236  a00_re += c00_12_re * o12_re;
2237  a00_re -= c00_12_im * o12_im;
2238  a00_im += c00_12_re * o12_im;
2239  a00_im += c00_12_im * o12_re;
2240 
2241  a01_re += c01_00_re * o00_re;
2242  a01_re -= c01_00_im * o00_im;
2243  a01_im += c01_00_re * o00_im;
2244  a01_im += c01_00_im * o00_re;
2245  a01_re += c01_01_re * o01_re;
2246  a01_im += c01_01_re * o01_im;
2247  a01_re += c01_02_re * o02_re;
2248  a01_re -= c01_02_im * o02_im;
2249  a01_im += c01_02_re * o02_im;
2250  a01_im += c01_02_im * o02_re;
2251  a01_re += c01_10_re * o10_re;
2252  a01_re -= c01_10_im * o10_im;
2253  a01_im += c01_10_re * o10_im;
2254  a01_im += c01_10_im * o10_re;
2255  a01_re += c01_11_re * o11_re;
2256  a01_re -= c01_11_im * o11_im;
2257  a01_im += c01_11_re * o11_im;
2258  a01_im += c01_11_im * o11_re;
2259  a01_re += c01_12_re * o12_re;
2260  a01_re -= c01_12_im * o12_im;
2261  a01_im += c01_12_re * o12_im;
2262  a01_im += c01_12_im * o12_re;
2263 
2264  a02_re += c02_00_re * o00_re;
2265  a02_re -= c02_00_im * o00_im;
2266  a02_im += c02_00_re * o00_im;
2267  a02_im += c02_00_im * o00_re;
2268  a02_re += c02_01_re * o01_re;
2269  a02_re -= c02_01_im * o01_im;
2270  a02_im += c02_01_re * o01_im;
2271  a02_im += c02_01_im * o01_re;
2272  a02_re += c02_02_re * o02_re;
2273  a02_im += c02_02_re * o02_im;
2274  a02_re += c02_10_re * o10_re;
2275  a02_re -= c02_10_im * o10_im;
2276  a02_im += c02_10_re * o10_im;
2277  a02_im += c02_10_im * o10_re;
2278  a02_re += c02_11_re * o11_re;
2279  a02_re -= c02_11_im * o11_im;
2280  a02_im += c02_11_re * o11_im;
2281  a02_im += c02_11_im * o11_re;
2282  a02_re += c02_12_re * o12_re;
2283  a02_re -= c02_12_im * o12_im;
2284  a02_im += c02_12_re * o12_im;
2285  a02_im += c02_12_im * o12_re;
2286 
2287  a10_re += c10_00_re * o00_re;
2288  a10_re -= c10_00_im * o00_im;
2289  a10_im += c10_00_re * o00_im;
2290  a10_im += c10_00_im * o00_re;
2291  a10_re += c10_01_re * o01_re;
2292  a10_re -= c10_01_im * o01_im;
2293  a10_im += c10_01_re * o01_im;
2294  a10_im += c10_01_im * o01_re;
2295  a10_re += c10_02_re * o02_re;
2296  a10_re -= c10_02_im * o02_im;
2297  a10_im += c10_02_re * o02_im;
2298  a10_im += c10_02_im * o02_re;
2299  a10_re += c10_10_re * o10_re;
2300  a10_im += c10_10_re * o10_im;
2301  a10_re += c10_11_re * o11_re;
2302  a10_re -= c10_11_im * o11_im;
2303  a10_im += c10_11_re * o11_im;
2304  a10_im += c10_11_im * o11_re;
2305  a10_re += c10_12_re * o12_re;
2306  a10_re -= c10_12_im * o12_im;
2307  a10_im += c10_12_re * o12_im;
2308  a10_im += c10_12_im * o12_re;
2309 
2310  a11_re += c11_00_re * o00_re;
2311  a11_re -= c11_00_im * o00_im;
2312  a11_im += c11_00_re * o00_im;
2313  a11_im += c11_00_im * o00_re;
2314  a11_re += c11_01_re * o01_re;
2315  a11_re -= c11_01_im * o01_im;
2316  a11_im += c11_01_re * o01_im;
2317  a11_im += c11_01_im * o01_re;
2318  a11_re += c11_02_re * o02_re;
2319  a11_re -= c11_02_im * o02_im;
2320  a11_im += c11_02_re * o02_im;
2321  a11_im += c11_02_im * o02_re;
2322  a11_re += c11_10_re * o10_re;
2323  a11_re -= c11_10_im * o10_im;
2324  a11_im += c11_10_re * o10_im;
2325  a11_im += c11_10_im * o10_re;
2326  a11_re += c11_11_re * o11_re;
2327  a11_im += c11_11_re * o11_im;
2328  a11_re += c11_12_re * o12_re;
2329  a11_re -= c11_12_im * o12_im;
2330  a11_im += c11_12_re * o12_im;
2331  a11_im += c11_12_im * o12_re;
2332 
2333  a12_re += c12_00_re * o00_re;
2334  a12_re -= c12_00_im * o00_im;
2335  a12_im += c12_00_re * o00_im;
2336  a12_im += c12_00_im * o00_re;
2337  a12_re += c12_01_re * o01_re;
2338  a12_re -= c12_01_im * o01_im;
2339  a12_im += c12_01_re * o01_im;
2340  a12_im += c12_01_im * o01_re;
2341  a12_re += c12_02_re * o02_re;
2342  a12_re -= c12_02_im * o02_im;
2343  a12_im += c12_02_re * o02_im;
2344  a12_im += c12_02_im * o02_re;
2345  a12_re += c12_10_re * o10_re;
2346  a12_re -= c12_10_im * o10_im;
2347  a12_im += c12_10_re * o10_im;
2348  a12_im += c12_10_im * o10_re;
2349  a12_re += c12_11_re * o11_re;
2350  a12_re -= c12_11_im * o11_im;
2351  a12_im += c12_11_re * o11_im;
2352  a12_im += c12_11_im * o11_re;
2353  a12_re += c12_12_re * o12_re;
2354  a12_im += c12_12_re * o12_im;
2355 
2356  o00_re = a00_re; o00_im = a00_im;
2357  o01_re = a01_re; o01_im = a01_im;
2358  o02_re = a02_re; o02_im = a02_im;
2359  o10_re = a10_re; o10_im = a10_im;
2360  o11_re = a11_re; o11_im = a11_im;
2361  o12_re = a12_re; o12_im = a12_im;
2362 
2363  }
2364 
2365  // apply second chiral block
2366  {
2368 
2372  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2373  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2374  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2375 
2376  a20_re += c20_20_re * o20_re;
2377  a20_im += c20_20_re * o20_im;
2378  a20_re += c20_21_re * o21_re;
2379  a20_re -= c20_21_im * o21_im;
2380  a20_im += c20_21_re * o21_im;
2381  a20_im += c20_21_im * o21_re;
2382  a20_re += c20_22_re * o22_re;
2383  a20_re -= c20_22_im * o22_im;
2384  a20_im += c20_22_re * o22_im;
2385  a20_im += c20_22_im * o22_re;
2386  a20_re += c20_30_re * o30_re;
2387  a20_re -= c20_30_im * o30_im;
2388  a20_im += c20_30_re * o30_im;
2389  a20_im += c20_30_im * o30_re;
2390  a20_re += c20_31_re * o31_re;
2391  a20_re -= c20_31_im * o31_im;
2392  a20_im += c20_31_re * o31_im;
2393  a20_im += c20_31_im * o31_re;
2394  a20_re += c20_32_re * o32_re;
2395  a20_re -= c20_32_im * o32_im;
2396  a20_im += c20_32_re * o32_im;
2397  a20_im += c20_32_im * o32_re;
2398 
2399  a21_re += c21_20_re * o20_re;
2400  a21_re -= c21_20_im * o20_im;
2401  a21_im += c21_20_re * o20_im;
2402  a21_im += c21_20_im * o20_re;
2403  a21_re += c21_21_re * o21_re;
2404  a21_im += c21_21_re * o21_im;
2405  a21_re += c21_22_re * o22_re;
2406  a21_re -= c21_22_im * o22_im;
2407  a21_im += c21_22_re * o22_im;
2408  a21_im += c21_22_im * o22_re;
2409  a21_re += c21_30_re * o30_re;
2410  a21_re -= c21_30_im * o30_im;
2411  a21_im += c21_30_re * o30_im;
2412  a21_im += c21_30_im * o30_re;
2413  a21_re += c21_31_re * o31_re;
2414  a21_re -= c21_31_im * o31_im;
2415  a21_im += c21_31_re * o31_im;
2416  a21_im += c21_31_im * o31_re;
2417  a21_re += c21_32_re * o32_re;
2418  a21_re -= c21_32_im * o32_im;
2419  a21_im += c21_32_re * o32_im;
2420  a21_im += c21_32_im * o32_re;
2421 
2422  a22_re += c22_20_re * o20_re;
2423  a22_re -= c22_20_im * o20_im;
2424  a22_im += c22_20_re * o20_im;
2425  a22_im += c22_20_im * o20_re;
2426  a22_re += c22_21_re * o21_re;
2427  a22_re -= c22_21_im * o21_im;
2428  a22_im += c22_21_re * o21_im;
2429  a22_im += c22_21_im * o21_re;
2430  a22_re += c22_22_re * o22_re;
2431  a22_im += c22_22_re * o22_im;
2432  a22_re += c22_30_re * o30_re;
2433  a22_re -= c22_30_im * o30_im;
2434  a22_im += c22_30_re * o30_im;
2435  a22_im += c22_30_im * o30_re;
2436  a22_re += c22_31_re * o31_re;
2437  a22_re -= c22_31_im * o31_im;
2438  a22_im += c22_31_re * o31_im;
2439  a22_im += c22_31_im * o31_re;
2440  a22_re += c22_32_re * o32_re;
2441  a22_re -= c22_32_im * o32_im;
2442  a22_im += c22_32_re * o32_im;
2443  a22_im += c22_32_im * o32_re;
2444 
2445  a30_re += c30_20_re * o20_re;
2446  a30_re -= c30_20_im * o20_im;
2447  a30_im += c30_20_re * o20_im;
2448  a30_im += c30_20_im * o20_re;
2449  a30_re += c30_21_re * o21_re;
2450  a30_re -= c30_21_im * o21_im;
2451  a30_im += c30_21_re * o21_im;
2452  a30_im += c30_21_im * o21_re;
2453  a30_re += c30_22_re * o22_re;
2454  a30_re -= c30_22_im * o22_im;
2455  a30_im += c30_22_re * o22_im;
2456  a30_im += c30_22_im * o22_re;
2457  a30_re += c30_30_re * o30_re;
2458  a30_im += c30_30_re * o30_im;
2459  a30_re += c30_31_re * o31_re;
2460  a30_re -= c30_31_im * o31_im;
2461  a30_im += c30_31_re * o31_im;
2462  a30_im += c30_31_im * o31_re;
2463  a30_re += c30_32_re * o32_re;
2464  a30_re -= c30_32_im * o32_im;
2465  a30_im += c30_32_re * o32_im;
2466  a30_im += c30_32_im * o32_re;
2467 
2468  a31_re += c31_20_re * o20_re;
2469  a31_re -= c31_20_im * o20_im;
2470  a31_im += c31_20_re * o20_im;
2471  a31_im += c31_20_im * o20_re;
2472  a31_re += c31_21_re * o21_re;
2473  a31_re -= c31_21_im * o21_im;
2474  a31_im += c31_21_re * o21_im;
2475  a31_im += c31_21_im * o21_re;
2476  a31_re += c31_22_re * o22_re;
2477  a31_re -= c31_22_im * o22_im;
2478  a31_im += c31_22_re * o22_im;
2479  a31_im += c31_22_im * o22_re;
2480  a31_re += c31_30_re * o30_re;
2481  a31_re -= c31_30_im * o30_im;
2482  a31_im += c31_30_re * o30_im;
2483  a31_im += c31_30_im * o30_re;
2484  a31_re += c31_31_re * o31_re;
2485  a31_im += c31_31_re * o31_im;
2486  a31_re += c31_32_re * o32_re;
2487  a31_re -= c31_32_im * o32_im;
2488  a31_im += c31_32_re * o32_im;
2489  a31_im += c31_32_im * o32_re;
2490 
2491  a32_re += c32_20_re * o20_re;
2492  a32_re -= c32_20_im * o20_im;
2493  a32_im += c32_20_re * o20_im;
2494  a32_im += c32_20_im * o20_re;
2495  a32_re += c32_21_re * o21_re;
2496  a32_re -= c32_21_im * o21_im;
2497  a32_im += c32_21_re * o21_im;
2498  a32_im += c32_21_im * o21_re;
2499  a32_re += c32_22_re * o22_re;
2500  a32_re -= c32_22_im * o22_im;
2501  a32_im += c32_22_re * o22_im;
2502  a32_im += c32_22_im * o22_re;
2503  a32_re += c32_30_re * o30_re;
2504  a32_re -= c32_30_im * o30_im;
2505  a32_im += c32_30_re * o30_im;
2506  a32_im += c32_30_im * o30_re;
2507  a32_re += c32_31_re * o31_re;
2508  a32_re -= c32_31_im * o31_im;
2509  a32_im += c32_31_re * o31_im;
2510  a32_im += c32_31_im * o31_re;
2511  a32_re += c32_32_re * o32_re;
2512  a32_im += c32_32_re * o32_im;
2513 
2514  o20_re = a20_re; o20_im = a20_im;
2515  o21_re = a21_re; o21_im = a21_im;
2516  o22_re = a22_re; o22_im = a22_im;
2517  o30_re = a30_re; o30_im = a30_im;
2518  o31_re = a31_re; o31_im = a31_im;
2519  o32_re = a32_re; o32_im = a32_im;
2520 
2521  }
2522 
2523  // change back from chiral basis
2524  // (note: required factor of 1/2 is included in clover term normalization)
2525  {
2526  spinorFloat a00_re = o10_re + o30_re;
2527  spinorFloat a00_im = o10_im + o30_im;
2528  spinorFloat a10_re = -o00_re - o20_re;
2529  spinorFloat a10_im = -o00_im - o20_im;
2530  spinorFloat a20_re = o10_re - o30_re;
2531  spinorFloat a20_im = o10_im - o30_im;
2532  spinorFloat a30_re = -o00_re + o20_re;
2533  spinorFloat a30_im = -o00_im + o20_im;
2534 
2535  o00_re = a00_re; o00_im = a00_im;
2536  o10_re = a10_re; o10_im = a10_im;
2537  o20_re = a20_re; o20_im = a20_im;
2538  o30_re = a30_re; o30_im = a30_im;
2539  }
2540 
2541  {
2542  spinorFloat a01_re = o11_re + o31_re;
2543  spinorFloat a01_im = o11_im + o31_im;
2544  spinorFloat a11_re = -o01_re - o21_re;
2545  spinorFloat a11_im = -o01_im - o21_im;
2546  spinorFloat a21_re = o11_re - o31_re;
2547  spinorFloat a21_im = o11_im - o31_im;
2548  spinorFloat a31_re = -o01_re + o21_re;
2549  spinorFloat a31_im = -o01_im + o21_im;
2550 
2551  o01_re = a01_re; o01_im = a01_im;
2552  o11_re = a11_re; o11_im = a11_im;
2553  o21_re = a21_re; o21_im = a21_im;
2554  o31_re = a31_re; o31_im = a31_im;
2555  }
2556 
2557  {
2558  spinorFloat a02_re = o12_re + o32_re;
2559  spinorFloat a02_im = o12_im + o32_im;
2560  spinorFloat a12_re = -o02_re - o22_re;
2561  spinorFloat a12_im = -o02_im - o22_im;
2562  spinorFloat a22_re = o12_re - o32_re;
2563  spinorFloat a22_im = o12_im - o32_im;
2564  spinorFloat a32_re = -o02_re + o22_re;
2565  spinorFloat a32_im = -o02_im + o22_im;
2566 
2567  o02_re = a02_re; o02_im = a02_im;
2568  o12_re = a12_re; o12_im = a12_im;
2569  o22_re = a22_re; o22_im = a22_im;
2570  o32_re = a32_re; o32_im = a32_im;
2571  }
2572 
2573 #endif // DSLASH_CLOVER
2574 
2575 #ifdef DSLASH_XPAY
2576 
2577  READ_ACCUM(ACCUMTEX, param.sp_stride)
2578 
2579  o00_re = a*o00_re+acc00_re;
2580  o00_im = a*o00_im+acc00_im;
2581  o01_re = a*o01_re+acc01_re;
2582  o01_im = a*o01_im+acc01_im;
2583  o02_re = a*o02_re+acc02_re;
2584  o02_im = a*o02_im+acc02_im;
2585  o10_re = a*o10_re+acc10_re;
2586  o10_im = a*o10_im+acc10_im;
2587  o11_re = a*o11_re+acc11_re;
2588  o11_im = a*o11_im+acc11_im;
2589  o12_re = a*o12_re+acc12_re;
2590  o12_im = a*o12_im+acc12_im;
2591  o20_re = a*o20_re+acc20_re;
2592  o20_im = a*o20_im+acc20_im;
2593  o21_re = a*o21_re+acc21_re;
2594  o21_im = a*o21_im+acc21_im;
2595  o22_re = a*o22_re+acc22_re;
2596  o22_im = a*o22_im+acc22_im;
2597  o30_re = a*o30_re+acc30_re;
2598  o30_im = a*o30_im+acc30_im;
2599  o31_re = a*o31_re+acc31_re;
2600  o31_im = a*o31_im+acc31_im;
2601  o32_re = a*o32_re+acc32_re;
2602  o32_im = a*o32_im+acc32_im;
2603 #endif // DSLASH_XPAY
2604 }
2605 
2606 // write spinor field back to device memory
2607 WRITE_SPINOR(param.sp_stride);
2608 
2609 // undefine to prevent warning when precision is changed
2610 #undef spinorFloat
2611 #undef SHARED_STRIDE
2612 
2613 #undef g00_re
2614 #undef g00_im
2615 #undef g01_re
2616 #undef g01_im
2617 #undef g02_re
2618 #undef g02_im
2619 #undef g10_re
2620 #undef g10_im
2621 #undef g11_re
2622 #undef g11_im
2623 #undef g12_re
2624 #undef g12_im
2625 #undef g20_re
2626 #undef g20_im
2627 #undef g21_re
2628 #undef g21_im
2629 #undef g22_re
2630 #undef g22_im
2631 
2632 #undef i00_re
2633 #undef i00_im
2634 #undef i01_re
2635 #undef i01_im
2636 #undef i02_re
2637 #undef i02_im
2638 #undef i10_re
2639 #undef i10_im
2640 #undef i11_re
2641 #undef i11_im
2642 #undef i12_re
2643 #undef i12_im
2644 #undef i20_re
2645 #undef i20_im
2646 #undef i21_re
2647 #undef i21_im
2648 #undef i22_re
2649 #undef i22_im
2650 #undef i30_re
2651 #undef i30_im
2652 #undef i31_re
2653 #undef i31_im
2654 #undef i32_re
2655 #undef i32_im
2656 
2657 #undef acc00_re
2658 #undef acc00_im
2659 #undef acc01_re
2660 #undef acc01_im
2661 #undef acc02_re
2662 #undef acc02_im
2663 #undef acc10_re
2664 #undef acc10_im
2665 #undef acc11_re
2666 #undef acc11_im
2667 #undef acc12_re
2668 #undef acc12_im
2669 #undef acc20_re
2670 #undef acc20_im
2671 #undef acc21_re
2672 #undef acc21_im
2673 #undef acc22_re
2674 #undef acc22_im
2675 #undef acc30_re
2676 #undef acc30_im
2677 #undef acc31_re
2678 #undef acc31_im
2679 #undef acc32_re
2680 #undef acc32_im
2681 
2682 #undef c00_00_re
2683 #undef c01_01_re
2684 #undef c02_02_re
2685 #undef c10_10_re
2686 #undef c11_11_re
2687 #undef c12_12_re
2688 #undef c01_00_re
2689 #undef c01_00_im
2690 #undef c02_00_re
2691 #undef c02_00_im
2692 #undef c10_00_re
2693 #undef c10_00_im
2694 #undef c11_00_re
2695 #undef c11_00_im
2696 #undef c12_00_re
2697 #undef c12_00_im
2698 #undef c02_01_re
2699 #undef c02_01_im
2700 #undef c10_01_re
2701 #undef c10_01_im
2702 #undef c11_01_re
2703 #undef c11_01_im
2704 #undef c12_01_re
2705 #undef c12_01_im
2706 #undef c10_02_re
2707 #undef c10_02_im
2708 #undef c11_02_re
2709 #undef c11_02_im
2710 #undef c12_02_re
2711 #undef c12_02_im
2712 #undef c11_10_re
2713 #undef c11_10_im
2714 #undef c12_10_re
2715 #undef c12_10_im
2716 #undef c12_11_re
2717 #undef c12_11_im
2718 
2719 #undef o00_re
2720 #undef o00_im
2721 #undef o01_re
2722 #undef o01_im
2723 #undef o02_re
2724 #undef o02_im
2725 #undef o10_re
2726 #undef o10_im
2727 #undef o11_re
2728 #undef o11_im
2729 #undef o12_re
2730 #undef o12_im
2731 #undef o20_re
2732 #undef o20_im
2733 #undef o21_re
2734 #undef o21_im
2735 #undef o22_re
2736 #undef o22_im
2737 #undef o30_re
2738 
2739 #undef VOLATILE
#define c10_12_im
#define c02_10_im
#define c30_32_im
#define g02_re
spinorFloat A2_im
const int dims[]
#define acc11_im
spinorFloat b1_im
#define i11_re
#define c32_30_im
__constant__ int Vh
#define c11_02_im
#define a22_re
Definition: llfat_core.h:131
#define c12_02_im
#define c00_12_re
#define o22_im
#define g01_re
#define c30_22_im
#define c01_11_im
#define c11_10_im
VOLATILE spinorFloat o32_re
#define c01_12_re
#define c02_01_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define c30_31_re
__constant__ int X2
#define c11_00_re
#define c21_31_im
#define acc21_re
#define g10_im
#define g01_im
#define c02_12_re
#define c20_32_re
#define c12_00_re
#define o02_re
__constant__ int X2X1mX1
#define i20_re
#define acc31_re
#define c00_11_re
#define gT21_im
#define c21_22_re
spinorFloat b2_re
#define i30_im
#define CLOVERTEX
Definition: clover_def.h:101
#define c20_22_re
#define acc11_re
#define c12_11_re
#define a02_im
Definition: llfat_core.h:120
__constant__ int X3X2X1mX2X1
#define c02_00_re
#define c20_31_im
#define c12_12_re
spinorFloat A1_im
__constant__ int X1
#define c10_11_re
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
#define o02_im
#define c00_10_im
#define gT01_im
int sp_idx
#define a22_im
Definition: llfat_core.h:132
#define acc32_re
#define i22_re
#define acc02_im
#define c01_11_re
#define acc12_re
#define DSLASH_SHARED_FLOATS_PER_THREAD
#define c32_21_re
#define i12_re
#define g22_im
#define c01_00_re
spinorFloat a1_im
#define c10_11_im
#define c01_00_im
#define c30_20_im
__constant__ int X3X2X1
#define i00_im
spinorFloat a0_re
#define acc20_re
#define acc22_re
#define c31_30_im
#define o10_re
#define c11_01_im
#define i30_re
#define c31_30_re
#define g20_re
#define a01_re
Definition: llfat_core.h:117
#define c22_22_re
#define c20_21_im
#define c00_00_re
#define o00_re
#define i32_re
#define i02_im
#define c22_21_re
#define g22_re
#define i02_re
VOLATILE spinorFloat o30_im
#define c12_01_im
#define gT10_im
#define c20_30_im
#define c22_32_im
#define c11_10_re
#define c21_22_im
#define i01_re
#define a02_re
Definition: llfat_core.h:119
#define c20_20_re
#define a20_re
Definition: llfat_core.h:127
spinorFloat B0_re
#define c31_32_im
#define c11_02_re
#define a12_im
Definition: llfat_core.h:126
#define c21_31_re
#define acc01_im
#define o30_re
#define a20_im
Definition: llfat_core.h:128
#define i21_re
#define acc00_re
#define c02_12_im
#define c32_22_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define c31_32_re
#define g10_re
#define o21_re
#define c31_22_im
#define acc02_re
#define c01_12_im
#define gT22_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define c10_12_re
spinorFloat b0_re
spinorFloat b0_im
#define c12_00_im
VOLATILE spinorFloat o31_im
#define c32_32_re
#define i10_im
#define c01_10_im
#define c20_31_re
#define c21_32_re
#define c00_11_im
#define c11_12_re
#define acc21_im
spinorFloat b2_im
#define acc20_im
#define o12_im
#define o01_re
#define c22_30_im
#define c02_00_im
spinorFloat B2_im
#define a01_im
Definition: llfat_core.h:118
#define c10_00_re
#define c20_30_re
#define c00_01_im
#define a12_re
Definition: llfat_core.h:125
spinorFloat A2_re
#define GAUGE0TEX
Definition: covDev.h:112
#define gT10_re
#define c10_00_im
#define a11_re
Definition: llfat_core.h:123
#define c12_10_im
#define c10_01_im
#define c22_31_im
#define gT02_im
#define c00_10_re
#define g11_im
#define o00_im
#define c02_11_re
#define c32_20_re
#define c21_30_re
#define c32_31_im
#define o01_im
#define c31_21_im
#define i00_re
#define acc10_re
__constant__ int X2m1
#define c22_31_re
#define c22_20_im
VOLATILE spinorFloat o31_re
spinorFloat A1_re
#define gT20_re
#define c12_11_im
#define i21_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define SPINORTEX
Definition: clover_def.h:40
spinorFloat B0_im
#define gT00_re
__constant__ int gauge_fixed
#define g12_im
#define c31_22_re
#define i32_im
#define c11_12_im
__shared__ char s_data[]
#define o21_im
#define c31_21_re
#define c32_21_im
#define c31_20_im
__constant__ int X4X3X2X1mX3X2X1
#define gT12_re
#define gT21_re
#define c10_10_re
#define c21_32_im
#define SPINOR_HOP
Definition: covDev.h:158
spinorFloat a0_im
#define c12_01_re
#define i31_im
#define VOLATILE
#define c20_32_im
#define c10_02_im
#define c32_31_re
__constant__ int ga_stride
spinorFloat b1_re
#define c11_01_re
#define acc30_im
#define c01_01_re
#define o20_re
#define c00_02_im
#define gT00_im
#define i31_re
#define spinorFloat
#define c11_11_re
#define a00_re
Definition: llfat_core.h:115
#define c12_10_re
#define c30_20_re
#define c22_20_re
#define acc30_re
VOLATILE spinorFloat * s
#define c00_01_re
#define c30_21_im
#define c01_02_im
#define c31_31_re
__constant__ int X1m1
#define gT02_re
#define c32_30_re
#define c20_22_im
#define gT11_re
__constant__ int X3
#define c30_22_re
#define o11_im
spinorFloat B1_im
#define acc12_im
#define gT12_im
#define i01_im
#define g00_im
#define g21_im
#define g20_im
spinorFloat B1_re
#define c21_20_re
#define c02_10_re
#define a11_im
Definition: llfat_core.h:124
#define i20_im
#define SHARED_STRIDE
#define acc00_im
VOLATILE spinorFloat o32_im
#define acc10_im
#define a10_re
Definition: llfat_core.h:121
#define acc32_im
spinorFloat A0_re
#define c31_20_re
#define o22_re
#define c21_20_im
#define acc22_im
#define g12_re
#define c02_02_re
#define gT22_im
#define c22_21_im
#define c01_02_re
spinorFloat A0_im
#define GAUGE1TEX
Definition: covDev.h:113
#define c12_02_re
const int ga_idx
#define c32_20_im
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define c20_21_re
#define c22_32_re
#define i10_re
#define a10_im
Definition: llfat_core.h:122
__constant__ int X4m1
#define g21_re
#define a21_re
Definition: llfat_core.h:129
RECONSTRUCT_GAUGE_MATRIX(0)
#define i11_im
#define c10_02_re
#define WRITE_SPINOR
Definition: clover_def.h:48
#define c30_21_re
#define c10_01_re
spinorFloat a1_re
#define o11_re
spinorFloat a2_re
#define c30_31_im
#define o10_im
#define g02_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define o12_re
#define g11_re
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define c30_32_re
#define INTERTEX
Definition: covDev.h:149
#define c00_02_re
#define gT11_im
#define c02_11_im
#define c01_10_re
__constant__ int X4X3X2X1hmX3X2X1h
#define g00_re
#define c02_01_im
#define c11_00_im
#define acc01_re
#define c21_21_re
#define c21_30_im
#define a21_im
Definition: llfat_core.h:130
#define c22_30_re
spinorFloat a2_im
#define o20_im
KernelType kernel_type
#define READ_CLOVER
Definition: clover_def.h:103
#define gT20_im
#define i22_im
#define gT01_re
#define i12_im
#define c30_30_re
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
#define acc31_im
spinorFloat B2_re
#define a00_im
Definition: llfat_core.h:116
#define c00_12_im
__constant__ int X2X1
#define c32_22_im