QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
wilson_dslash_fermi_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// 24 floats per thread = 12 complex spinor components (4 spins x 3 colors)
// staged in shared memory (see the i00..i32 accessor macros below).
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
// VOLATILE expands to nothing under the NVVM backend (CUDA >= 4.1 on
// sm_20+) and to 'volatile' under the older Open64 device compiler.
// NOTE(review): presumably Open64 required 'volatile' to prevent unsafe
// optimization of shared-memory accesses -- confirm against QUDA history.
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Accessor macros mapping spinor components iSC_{re,im} (S = spin index
// 0..3, C = color index 0..2) onto the packed load registers: double2
// registers I0..I11 in double precision, float4 registers I0..I5
// otherwise.  The accSC_{re,im} macros give the identical mapping for the
// accumulator registers accum*.
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
// accumulator spinor: one complex component per double2 register accum0..accum11
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
// accumulator spinor: two complex components per float4 register accum0..accum5
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
118 // gauge link
// Accessor macros gRC_{re,im} for the 3x3 SU(3) link matrix (row R,
// column C, both 0..2).  The GAUGE_FLOAT2 layout packs one complex entry
// per two-component register G0..G8; the default layout packs two complex
// entries per float4 register G0..G4.
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
161 // conjugated gauge link
// gT is the Hermitian conjugate of g: gT[r][c] = conj(g[c][r]).  It is
// used below when multiplying by the dagger of the link matrix
// (negative-direction hops).
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
181 // first chiral block of inverted clover term
// The 6x6 chiral block (compound index SC, S = spin 0..1, C = color 0..2)
// is Hermitian, so it is stored as 6 real diagonal entries (cSC_SC_re)
// plus 15 complex lower-triangle entries (cAB_re/cAB_im with A > B).
// Double precision uses double2 registers C0..C17; otherwise float4
// registers C0..C8.
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
// Upper-triangle entries reconstructed from Hermiticity of the chiral
// block: cA_B = conj(cB_A), i.e. same real part, negated imaginary part.
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
289 // second chiral block of inverted clover term (reuses C0,...,C9)
// The spin-2/3 chiral block is loaded into the same C* registers as the
// first block, so each entry cS'C_* (S' = spin 2..3) simply aliases the
// corresponding spin-0/1 name (spin index shifted down by 2).
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
357 // output spinor
// NOTE(review): documentation lines 358-381 -- presumably the o??_re /
// o??_im output-spinor register declarations used throughout the kernel
// body below -- are elided from this rendered view.
382 
383 #ifdef SPINOR_DOUBLE
// 16 doubles = 128 bytes per shared-memory row
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
385 #else
// 32 floats = 128 bytes per shared-memory row
386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
387 #endif
388 
389 #include "read_gauge.h"
390 #include "read_clover.h"
391 #include "io_spinor.h"
392 
393 int x1, x2, x3, x4;
394 int X;
395 
396 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
397 int sp_norm_idx;
398 #endif // MULTI_GPU half precision
399 
400 int sid;
401 
402 #ifdef MULTI_GPU
403 int face_idx;
405 #endif
406 
407  // Inline by hand for the moment and assume even dimensions
408  const int dims[] = {X1, X2, X3, X4};
409  coordsFromIndex3D<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
410 
411  // only need to check Y and Z dims currently since X and T set to match exactly
412  if (x2 >= X2) return;
413  if (x3 >= X3) return;
414 
415  o00_re = 0; o00_im = 0;
416  o01_re = 0; o01_im = 0;
417  o02_re = 0; o02_im = 0;
418  o10_re = 0; o10_im = 0;
419  o11_re = 0; o11_im = 0;
420  o12_re = 0; o12_im = 0;
421  o20_re = 0; o20_im = 0;
422  o21_re = 0; o21_im = 0;
423  o22_re = 0; o22_im = 0;
424  o30_re = 0; o30_im = 0;
425  o31_re = 0; o31_im = 0;
426  o32_re = 0; o32_im = 0;
427 
428 #ifdef MULTI_GPU
429 } else { // exterior kernel
430 
431  sid = blockIdx.x*blockDim.x + threadIdx.x;
432  if (sid >= param.threads) return;
433 
434  const int dim = static_cast<int>(kernel_type);
435  const int face_volume = (param.threads >> 1); // volume of one face
436  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
437  face_idx = sid - face_num*face_volume; // index into the respective face
438 
439  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
440  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
441  //sp_idx = face_idx + param.ghostOffset[dim];
442 
443 #if (DD_PREC==2) // half precision
444  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
445 #endif
446 
447  const int dims[] = {X1, X2, X3, X4};
448  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
449 
450  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
451 
452  o00_re = i00_re; o00_im = i00_im;
453  o01_re = i01_re; o01_im = i01_im;
454  o02_re = i02_re; o02_im = i02_im;
455  o10_re = i10_re; o10_im = i10_im;
456  o11_re = i11_re; o11_im = i11_im;
457  o12_re = i12_re; o12_im = i12_im;
458  o20_re = i20_re; o20_im = i20_im;
459  o21_re = i21_re; o21_im = i21_im;
460  o22_re = i22_re; o22_im = i22_im;
461  o30_re = i30_re; o30_im = i30_im;
462  o31_re = i31_re; o31_im = i31_im;
463  o32_re = i32_re; o32_im = i32_im;
464 }
465 #endif // MULTI_GPU
466 
467 
468 #ifdef MULTI_GPU
469 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
470  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
471 #endif
472 {
473  // Projector P0-
474  // 1 0 0 -i
475  // 0 1 -i 0
476  // 0 i 1 0
477  // i 0 0 1
478 
479 #ifdef MULTI_GPU
480  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
481  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
482 #else
483  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
484 #endif
485 
486  const int ga_idx = sid;
487 
494 
495 #ifdef MULTI_GPU
496  if (kernel_type == INTERIOR_KERNEL) {
497 #endif
498 
499  // read spinor from device memory
500  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
501 
502  // store spinor into shared memory
503  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
504 
505  // project spinor into half spinors
506  a0_re = +i00_re+i30_im;
507  a0_im = +i00_im-i30_re;
508  a1_re = +i01_re+i31_im;
509  a1_im = +i01_im-i31_re;
510  a2_re = +i02_re+i32_im;
511  a2_im = +i02_im-i32_re;
512  b0_re = +i10_re+i20_im;
513  b0_im = +i10_im-i20_re;
514  b1_re = +i11_re+i21_im;
515  b1_im = +i11_im-i21_re;
516  b2_re = +i12_re+i22_im;
517  b2_im = +i12_im-i22_re;
518 
519 #ifdef MULTI_GPU
520  } else {
521 
522  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
523 
524  // read half spinor from device memory
525  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
526 
527  a0_re = i00_re; a0_im = i00_im;
528  a1_re = i01_re; a1_im = i01_im;
529  a2_re = i02_re; a2_im = i02_im;
530  b0_re = i10_re; b0_im = i10_im;
531  b1_re = i11_re; b1_im = i11_im;
532  b2_re = i12_re; b2_im = i12_im;
533 
534  }
535 #endif // MULTI_GPU
536 
537  // read gauge matrix from device memory
538  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
539 
540  // reconstruct gauge matrix
542 
543  // multiply row 0
545  A0_re += g00_re * a0_re;
546  A0_re -= g00_im * a0_im;
547  A0_re += g01_re * a1_re;
548  A0_re -= g01_im * a1_im;
549  A0_re += g02_re * a2_re;
550  A0_re -= g02_im * a2_im;
552  A0_im += g00_re * a0_im;
553  A0_im += g00_im * a0_re;
554  A0_im += g01_re * a1_im;
555  A0_im += g01_im * a1_re;
556  A0_im += g02_re * a2_im;
557  A0_im += g02_im * a2_re;
559  B0_re += g00_re * b0_re;
560  B0_re -= g00_im * b0_im;
561  B0_re += g01_re * b1_re;
562  B0_re -= g01_im * b1_im;
563  B0_re += g02_re * b2_re;
564  B0_re -= g02_im * b2_im;
566  B0_im += g00_re * b0_im;
567  B0_im += g00_im * b0_re;
568  B0_im += g01_re * b1_im;
569  B0_im += g01_im * b1_re;
570  B0_im += g02_re * b2_im;
571  B0_im += g02_im * b2_re;
572 
573  // multiply row 1
575  A1_re += g10_re * a0_re;
576  A1_re -= g10_im * a0_im;
577  A1_re += g11_re * a1_re;
578  A1_re -= g11_im * a1_im;
579  A1_re += g12_re * a2_re;
580  A1_re -= g12_im * a2_im;
582  A1_im += g10_re * a0_im;
583  A1_im += g10_im * a0_re;
584  A1_im += g11_re * a1_im;
585  A1_im += g11_im * a1_re;
586  A1_im += g12_re * a2_im;
587  A1_im += g12_im * a2_re;
589  B1_re += g10_re * b0_re;
590  B1_re -= g10_im * b0_im;
591  B1_re += g11_re * b1_re;
592  B1_re -= g11_im * b1_im;
593  B1_re += g12_re * b2_re;
594  B1_re -= g12_im * b2_im;
596  B1_im += g10_re * b0_im;
597  B1_im += g10_im * b0_re;
598  B1_im += g11_re * b1_im;
599  B1_im += g11_im * b1_re;
600  B1_im += g12_re * b2_im;
601  B1_im += g12_im * b2_re;
602 
603  // multiply row 2
605  A2_re += g20_re * a0_re;
606  A2_re -= g20_im * a0_im;
607  A2_re += g21_re * a1_re;
608  A2_re -= g21_im * a1_im;
609  A2_re += g22_re * a2_re;
610  A2_re -= g22_im * a2_im;
612  A2_im += g20_re * a0_im;
613  A2_im += g20_im * a0_re;
614  A2_im += g21_re * a1_im;
615  A2_im += g21_im * a1_re;
616  A2_im += g22_re * a2_im;
617  A2_im += g22_im * a2_re;
619  B2_re += g20_re * b0_re;
620  B2_re -= g20_im * b0_im;
621  B2_re += g21_re * b1_re;
622  B2_re -= g21_im * b1_im;
623  B2_re += g22_re * b2_re;
624  B2_re -= g22_im * b2_im;
626  B2_im += g20_re * b0_im;
627  B2_im += g20_im * b0_re;
628  B2_im += g21_re * b1_im;
629  B2_im += g21_im * b1_re;
630  B2_im += g22_re * b2_im;
631  B2_im += g22_im * b2_re;
632 
633  o00_re += A0_re;
634  o00_im += A0_im;
635  o10_re += B0_re;
636  o10_im += B0_im;
637  o20_re -= B0_im;
638  o20_im += B0_re;
639  o30_re -= A0_im;
640  o30_im += A0_re;
641 
642  o01_re += A1_re;
643  o01_im += A1_im;
644  o11_re += B1_re;
645  o11_im += B1_im;
646  o21_re -= B1_im;
647  o21_im += B1_re;
648  o31_re -= A1_im;
649  o31_im += A1_re;
650 
651  o02_re += A2_re;
652  o02_im += A2_im;
653  o12_re += B2_re;
654  o12_im += B2_im;
655  o22_re -= B2_im;
656  o22_im += B2_re;
657  o32_re -= A2_im;
658  o32_im += A2_re;
659 
660 }
661 
662 #ifdef MULTI_GPU
663 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
664  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
665 #endif
666 {
667  // Projector P0+
668  // 1 0 0 i
669  // 0 1 i 0
670  // 0 -i 1 0
671  // -i 0 0 1
672 
673 #ifdef MULTI_GPU
674  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
675  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
676 #else
677  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
678 #endif
679 
680 #ifdef MULTI_GPU
681  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
682 #else
683  const int ga_idx = sp_idx;
684 #endif
685 
692 
693 #ifdef MULTI_GPU
694  if (kernel_type == INTERIOR_KERNEL) {
695 #endif
696 
697  // load spinor from shared memory
698  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
699  __syncthreads();
700  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
701 
702  // project spinor into half spinors
703  a0_re = +i00_re-i30_im;
704  a0_im = +i00_im+i30_re;
705  a1_re = +i01_re-i31_im;
706  a1_im = +i01_im+i31_re;
707  a2_re = +i02_re-i32_im;
708  a2_im = +i02_im+i32_re;
709  b0_re = +i10_re-i20_im;
710  b0_im = +i10_im+i20_re;
711  b1_re = +i11_re-i21_im;
712  b1_im = +i11_im+i21_re;
713  b2_re = +i12_re-i22_im;
714  b2_im = +i12_im+i22_re;
715 
716 #ifdef MULTI_GPU
717  } else {
718 
719  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
720 
721  // read half spinor from device memory
722  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
723 
724  a0_re = i00_re; a0_im = i00_im;
725  a1_re = i01_re; a1_im = i01_im;
726  a2_re = i02_re; a2_im = i02_im;
727  b0_re = i10_re; b0_im = i10_im;
728  b1_re = i11_re; b1_im = i11_im;
729  b2_re = i12_re; b2_im = i12_im;
730 
731  }
732 #endif // MULTI_GPU
733 
734  // read gauge matrix from device memory
735  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
736 
737  // reconstruct gauge matrix
739 
740  // multiply row 0
741  spinorFloat A0_re = 0;
742  A0_re += gT00_re * a0_re;
743  A0_re -= gT00_im * a0_im;
744  A0_re += gT01_re * a1_re;
745  A0_re -= gT01_im * a1_im;
746  A0_re += gT02_re * a2_re;
747  A0_re -= gT02_im * a2_im;
748  spinorFloat A0_im = 0;
749  A0_im += gT00_re * a0_im;
750  A0_im += gT00_im * a0_re;
751  A0_im += gT01_re * a1_im;
752  A0_im += gT01_im * a1_re;
753  A0_im += gT02_re * a2_im;
754  A0_im += gT02_im * a2_re;
755  spinorFloat B0_re = 0;
756  B0_re += gT00_re * b0_re;
757  B0_re -= gT00_im * b0_im;
758  B0_re += gT01_re * b1_re;
759  B0_re -= gT01_im * b1_im;
760  B0_re += gT02_re * b2_re;
761  B0_re -= gT02_im * b2_im;
762  spinorFloat B0_im = 0;
763  B0_im += gT00_re * b0_im;
764  B0_im += gT00_im * b0_re;
765  B0_im += gT01_re * b1_im;
766  B0_im += gT01_im * b1_re;
767  B0_im += gT02_re * b2_im;
768  B0_im += gT02_im * b2_re;
769 
770  // multiply row 1
771  spinorFloat A1_re = 0;
772  A1_re += gT10_re * a0_re;
773  A1_re -= gT10_im * a0_im;
774  A1_re += gT11_re * a1_re;
775  A1_re -= gT11_im * a1_im;
776  A1_re += gT12_re * a2_re;
777  A1_re -= gT12_im * a2_im;
778  spinorFloat A1_im = 0;
779  A1_im += gT10_re * a0_im;
780  A1_im += gT10_im * a0_re;
781  A1_im += gT11_re * a1_im;
782  A1_im += gT11_im * a1_re;
783  A1_im += gT12_re * a2_im;
784  A1_im += gT12_im * a2_re;
785  spinorFloat B1_re = 0;
786  B1_re += gT10_re * b0_re;
787  B1_re -= gT10_im * b0_im;
788  B1_re += gT11_re * b1_re;
789  B1_re -= gT11_im * b1_im;
790  B1_re += gT12_re * b2_re;
791  B1_re -= gT12_im * b2_im;
792  spinorFloat B1_im = 0;
793  B1_im += gT10_re * b0_im;
794  B1_im += gT10_im * b0_re;
795  B1_im += gT11_re * b1_im;
796  B1_im += gT11_im * b1_re;
797  B1_im += gT12_re * b2_im;
798  B1_im += gT12_im * b2_re;
799 
800  // multiply row 2
801  spinorFloat A2_re = 0;
802  A2_re += gT20_re * a0_re;
803  A2_re -= gT20_im * a0_im;
804  A2_re += gT21_re * a1_re;
805  A2_re -= gT21_im * a1_im;
806  A2_re += gT22_re * a2_re;
807  A2_re -= gT22_im * a2_im;
808  spinorFloat A2_im = 0;
809  A2_im += gT20_re * a0_im;
810  A2_im += gT20_im * a0_re;
811  A2_im += gT21_re * a1_im;
812  A2_im += gT21_im * a1_re;
813  A2_im += gT22_re * a2_im;
814  A2_im += gT22_im * a2_re;
815  spinorFloat B2_re = 0;
816  B2_re += gT20_re * b0_re;
817  B2_re -= gT20_im * b0_im;
818  B2_re += gT21_re * b1_re;
819  B2_re -= gT21_im * b1_im;
820  B2_re += gT22_re * b2_re;
821  B2_re -= gT22_im * b2_im;
822  spinorFloat B2_im = 0;
823  B2_im += gT20_re * b0_im;
824  B2_im += gT20_im * b0_re;
825  B2_im += gT21_re * b1_im;
826  B2_im += gT21_im * b1_re;
827  B2_im += gT22_re * b2_im;
828  B2_im += gT22_im * b2_re;
829 
830  o00_re += A0_re;
831  o00_im += A0_im;
832  o10_re += B0_re;
833  o10_im += B0_im;
834  o20_re += B0_im;
835  o20_im -= B0_re;
836  o30_re += A0_im;
837  o30_im -= A0_re;
838 
839  o01_re += A1_re;
840  o01_im += A1_im;
841  o11_re += B1_re;
842  o11_im += B1_im;
843  o21_re += B1_im;
844  o21_im -= B1_re;
845  o31_re += A1_im;
846  o31_im -= A1_re;
847 
848  o02_re += A2_re;
849  o02_im += A2_im;
850  o12_re += B2_re;
851  o12_im += B2_im;
852  o22_re += B2_im;
853  o22_im -= B2_re;
854  o32_re += A2_im;
855  o32_im -= A2_re;
856 
857 }
858 
859 #ifdef MULTI_GPU
860 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
861  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
862 #endif
863 {
864  // Projector P1-
865  // 1 0 0 -1
866  // 0 1 1 0
867  // 0 1 1 0
868  // -1 0 0 1
869 
870 #ifdef MULTI_GPU
871  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
872  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
873 #else
874  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
875 #endif
876 
877  const int ga_idx = sid;
878 
885 
886 #ifdef MULTI_GPU
887  if (kernel_type == INTERIOR_KERNEL) {
888 #endif
889 
890  if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
891  // read spinor from device memory
892  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
893 
894  // project spinor into half spinors
895  a0_re = +i00_re-i30_re;
896  a0_im = +i00_im-i30_im;
897  a1_re = +i01_re-i31_re;
898  a1_im = +i01_im-i31_im;
899  a2_re = +i02_re-i32_re;
900  a2_im = +i02_im-i32_im;
901  b0_re = +i10_re+i20_re;
902  b0_im = +i10_im+i20_im;
903  b1_re = +i11_re+i21_re;
904  b1_im = +i11_im+i21_im;
905  b2_re = +i12_re+i22_re;
906  b2_im = +i12_im+i22_im;
907  } else {
908  // load spinor from shared memory
909  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
910  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
911  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
912 
913  // project spinor into half spinors
914  a0_re = +i00_re-i30_re;
915  a0_im = +i00_im-i30_im;
916  a1_re = +i01_re-i31_re;
917  a1_im = +i01_im-i31_im;
918  a2_re = +i02_re-i32_re;
919  a2_im = +i02_im-i32_im;
920  b0_re = +i10_re+i20_re;
921  b0_im = +i10_im+i20_im;
922  b1_re = +i11_re+i21_re;
923  b1_im = +i11_im+i21_im;
924  b2_re = +i12_re+i22_re;
925  b2_im = +i12_im+i22_im;
926  }
927 
928 #ifdef MULTI_GPU
929  } else {
930 
931  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
932 
933  // read half spinor from device memory
934  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
935 
936  a0_re = i00_re; a0_im = i00_im;
937  a1_re = i01_re; a1_im = i01_im;
938  a2_re = i02_re; a2_im = i02_im;
939  b0_re = i10_re; b0_im = i10_im;
940  b1_re = i11_re; b1_im = i11_im;
941  b2_re = i12_re; b2_im = i12_im;
942 
943  }
944 #endif // MULTI_GPU
945 
946  // read gauge matrix from device memory
947  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
948 
949  // reconstruct gauge matrix
951 
952  // multiply row 0
953  spinorFloat A0_re = 0;
954  A0_re += g00_re * a0_re;
955  A0_re -= g00_im * a0_im;
956  A0_re += g01_re * a1_re;
957  A0_re -= g01_im * a1_im;
958  A0_re += g02_re * a2_re;
959  A0_re -= g02_im * a2_im;
960  spinorFloat A0_im = 0;
961  A0_im += g00_re * a0_im;
962  A0_im += g00_im * a0_re;
963  A0_im += g01_re * a1_im;
964  A0_im += g01_im * a1_re;
965  A0_im += g02_re * a2_im;
966  A0_im += g02_im * a2_re;
967  spinorFloat B0_re = 0;
968  B0_re += g00_re * b0_re;
969  B0_re -= g00_im * b0_im;
970  B0_re += g01_re * b1_re;
971  B0_re -= g01_im * b1_im;
972  B0_re += g02_re * b2_re;
973  B0_re -= g02_im * b2_im;
974  spinorFloat B0_im = 0;
975  B0_im += g00_re * b0_im;
976  B0_im += g00_im * b0_re;
977  B0_im += g01_re * b1_im;
978  B0_im += g01_im * b1_re;
979  B0_im += g02_re * b2_im;
980  B0_im += g02_im * b2_re;
981 
982  // multiply row 1
983  spinorFloat A1_re = 0;
984  A1_re += g10_re * a0_re;
985  A1_re -= g10_im * a0_im;
986  A1_re += g11_re * a1_re;
987  A1_re -= g11_im * a1_im;
988  A1_re += g12_re * a2_re;
989  A1_re -= g12_im * a2_im;
990  spinorFloat A1_im = 0;
991  A1_im += g10_re * a0_im;
992  A1_im += g10_im * a0_re;
993  A1_im += g11_re * a1_im;
994  A1_im += g11_im * a1_re;
995  A1_im += g12_re * a2_im;
996  A1_im += g12_im * a2_re;
997  spinorFloat B1_re = 0;
998  B1_re += g10_re * b0_re;
999  B1_re -= g10_im * b0_im;
1000  B1_re += g11_re * b1_re;
1001  B1_re -= g11_im * b1_im;
1002  B1_re += g12_re * b2_re;
1003  B1_re -= g12_im * b2_im;
1004  spinorFloat B1_im = 0;
1005  B1_im += g10_re * b0_im;
1006  B1_im += g10_im * b0_re;
1007  B1_im += g11_re * b1_im;
1008  B1_im += g11_im * b1_re;
1009  B1_im += g12_re * b2_im;
1010  B1_im += g12_im * b2_re;
1011 
1012  // multiply row 2
1013  spinorFloat A2_re = 0;
1014  A2_re += g20_re * a0_re;
1015  A2_re -= g20_im * a0_im;
1016  A2_re += g21_re * a1_re;
1017  A2_re -= g21_im * a1_im;
1018  A2_re += g22_re * a2_re;
1019  A2_re -= g22_im * a2_im;
1020  spinorFloat A2_im = 0;
1021  A2_im += g20_re * a0_im;
1022  A2_im += g20_im * a0_re;
1023  A2_im += g21_re * a1_im;
1024  A2_im += g21_im * a1_re;
1025  A2_im += g22_re * a2_im;
1026  A2_im += g22_im * a2_re;
1027  spinorFloat B2_re = 0;
1028  B2_re += g20_re * b0_re;
1029  B2_re -= g20_im * b0_im;
1030  B2_re += g21_re * b1_re;
1031  B2_re -= g21_im * b1_im;
1032  B2_re += g22_re * b2_re;
1033  B2_re -= g22_im * b2_im;
1034  spinorFloat B2_im = 0;
1035  B2_im += g20_re * b0_im;
1036  B2_im += g20_im * b0_re;
1037  B2_im += g21_re * b1_im;
1038  B2_im += g21_im * b1_re;
1039  B2_im += g22_re * b2_im;
1040  B2_im += g22_im * b2_re;
1041 
1042  o00_re += A0_re;
1043  o00_im += A0_im;
1044  o10_re += B0_re;
1045  o10_im += B0_im;
1046  o20_re += B0_re;
1047  o20_im += B0_im;
1048  o30_re -= A0_re;
1049  o30_im -= A0_im;
1050 
1051  o01_re += A1_re;
1052  o01_im += A1_im;
1053  o11_re += B1_re;
1054  o11_im += B1_im;
1055  o21_re += B1_re;
1056  o21_im += B1_im;
1057  o31_re -= A1_re;
1058  o31_im -= A1_im;
1059 
1060  o02_re += A2_re;
1061  o02_im += A2_im;
1062  o12_re += B2_re;
1063  o12_im += B2_im;
1064  o22_re += B2_re;
1065  o22_im += B2_im;
1066  o32_re -= A2_re;
1067  o32_im -= A2_im;
1068 
1069 }
1070 
1071 #ifdef MULTI_GPU
1072 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1073  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1074 #endif
1075 {
1076  // Projector P1+
1077  // 1 0 0 1
1078  // 0 1 -1 0
1079  // 0 -1 1 0
1080  // 1 0 0 1
1081 
1082 #ifdef MULTI_GPU
1083  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1084  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1085 #else
1086  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1087 #endif
1088 
1089 #ifdef MULTI_GPU
1090  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1091 #else
1092  const int ga_idx = sp_idx;
1093 #endif
1094 
1101 
1102 #ifdef MULTI_GPU
1103  if (kernel_type == INTERIOR_KERNEL) {
1104 #endif
1105 
1106  if (threadIdx.y == 0 && blockDim.y < X2) {
1107  // read spinor from device memory
1108  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1109 
1110  // project spinor into half spinors
1111  a0_re = +i00_re+i30_re;
1112  a0_im = +i00_im+i30_im;
1113  a1_re = +i01_re+i31_re;
1114  a1_im = +i01_im+i31_im;
1115  a2_re = +i02_re+i32_re;
1116  a2_im = +i02_im+i32_im;
1117  b0_re = +i10_re-i20_re;
1118  b0_im = +i10_im-i20_im;
1119  b1_re = +i11_re-i21_re;
1120  b1_im = +i11_im-i21_im;
1121  b2_re = +i12_re-i22_re;
1122  b2_im = +i12_im-i22_im;
1123  } else {
1124  // load spinor from shared memory
1125  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1126  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1127  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1128 
1129  // project spinor into half spinors
1130  a0_re = +i00_re+i30_re;
1131  a0_im = +i00_im+i30_im;
1132  a1_re = +i01_re+i31_re;
1133  a1_im = +i01_im+i31_im;
1134  a2_re = +i02_re+i32_re;
1135  a2_im = +i02_im+i32_im;
1136  b0_re = +i10_re-i20_re;
1137  b0_im = +i10_im-i20_im;
1138  b1_re = +i11_re-i21_re;
1139  b1_im = +i11_im-i21_im;
1140  b2_re = +i12_re-i22_re;
1141  b2_im = +i12_im-i22_im;
1142  }
1143 
1144 #ifdef MULTI_GPU
1145  } else {
1146 
1147  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1148 
1149  // read half spinor from device memory
1150  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1151 
1152  a0_re = i00_re; a0_im = i00_im;
1153  a1_re = i01_re; a1_im = i01_im;
1154  a2_re = i02_re; a2_im = i02_im;
1155  b0_re = i10_re; b0_im = i10_im;
1156  b1_re = i11_re; b1_im = i11_im;
1157  b2_re = i12_re; b2_im = i12_im;
1158 
1159  }
1160 #endif // MULTI_GPU
1161 
1162  // read gauge matrix from device memory
1163  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1164 
1165  // reconstruct gauge matrix
1167 
1168  // multiply row 0
1169  spinorFloat A0_re = 0;
1170  A0_re += gT00_re * a0_re;
1171  A0_re -= gT00_im * a0_im;
1172  A0_re += gT01_re * a1_re;
1173  A0_re -= gT01_im * a1_im;
1174  A0_re += gT02_re * a2_re;
1175  A0_re -= gT02_im * a2_im;
1176  spinorFloat A0_im = 0;
1177  A0_im += gT00_re * a0_im;
1178  A0_im += gT00_im * a0_re;
1179  A0_im += gT01_re * a1_im;
1180  A0_im += gT01_im * a1_re;
1181  A0_im += gT02_re * a2_im;
1182  A0_im += gT02_im * a2_re;
1183  spinorFloat B0_re = 0;
1184  B0_re += gT00_re * b0_re;
1185  B0_re -= gT00_im * b0_im;
1186  B0_re += gT01_re * b1_re;
1187  B0_re -= gT01_im * b1_im;
1188  B0_re += gT02_re * b2_re;
1189  B0_re -= gT02_im * b2_im;
1190  spinorFloat B0_im = 0;
1191  B0_im += gT00_re * b0_im;
1192  B0_im += gT00_im * b0_re;
1193  B0_im += gT01_re * b1_im;
1194  B0_im += gT01_im * b1_re;
1195  B0_im += gT02_re * b2_im;
1196  B0_im += gT02_im * b2_re;
1197 
1198  // multiply row 1
1199  spinorFloat A1_re = 0;
1200  A1_re += gT10_re * a0_re;
1201  A1_re -= gT10_im * a0_im;
1202  A1_re += gT11_re * a1_re;
1203  A1_re -= gT11_im * a1_im;
1204  A1_re += gT12_re * a2_re;
1205  A1_re -= gT12_im * a2_im;
1206  spinorFloat A1_im = 0;
1207  A1_im += gT10_re * a0_im;
1208  A1_im += gT10_im * a0_re;
1209  A1_im += gT11_re * a1_im;
1210  A1_im += gT11_im * a1_re;
1211  A1_im += gT12_re * a2_im;
1212  A1_im += gT12_im * a2_re;
1213  spinorFloat B1_re = 0;
1214  B1_re += gT10_re * b0_re;
1215  B1_re -= gT10_im * b0_im;
1216  B1_re += gT11_re * b1_re;
1217  B1_re -= gT11_im * b1_im;
1218  B1_re += gT12_re * b2_re;
1219  B1_re -= gT12_im * b2_im;
1220  spinorFloat B1_im = 0;
1221  B1_im += gT10_re * b0_im;
1222  B1_im += gT10_im * b0_re;
1223  B1_im += gT11_re * b1_im;
1224  B1_im += gT11_im * b1_re;
1225  B1_im += gT12_re * b2_im;
1226  B1_im += gT12_im * b2_re;
1227 
1228  // multiply row 2
1229  spinorFloat A2_re = 0;
1230  A2_re += gT20_re * a0_re;
1231  A2_re -= gT20_im * a0_im;
1232  A2_re += gT21_re * a1_re;
1233  A2_re -= gT21_im * a1_im;
1234  A2_re += gT22_re * a2_re;
1235  A2_re -= gT22_im * a2_im;
1236  spinorFloat A2_im = 0;
1237  A2_im += gT20_re * a0_im;
1238  A2_im += gT20_im * a0_re;
1239  A2_im += gT21_re * a1_im;
1240  A2_im += gT21_im * a1_re;
1241  A2_im += gT22_re * a2_im;
1242  A2_im += gT22_im * a2_re;
1243  spinorFloat B2_re = 0;
1244  B2_re += gT20_re * b0_re;
1245  B2_re -= gT20_im * b0_im;
1246  B2_re += gT21_re * b1_re;
1247  B2_re -= gT21_im * b1_im;
1248  B2_re += gT22_re * b2_re;
1249  B2_re -= gT22_im * b2_im;
1250  spinorFloat B2_im = 0;
1251  B2_im += gT20_re * b0_im;
1252  B2_im += gT20_im * b0_re;
1253  B2_im += gT21_re * b1_im;
1254  B2_im += gT21_im * b1_re;
1255  B2_im += gT22_re * b2_im;
1256  B2_im += gT22_im * b2_re;
1257 
1258  o00_re += A0_re;
1259  o00_im += A0_im;
1260  o10_re += B0_re;
1261  o10_im += B0_im;
1262  o20_re -= B0_re;
1263  o20_im -= B0_im;
1264  o30_re += A0_re;
1265  o30_im += A0_im;
1266 
1267  o01_re += A1_re;
1268  o01_im += A1_im;
1269  o11_re += B1_re;
1270  o11_im += B1_im;
1271  o21_re -= B1_re;
1272  o21_im -= B1_im;
1273  o31_re += A1_re;
1274  o31_im += A1_im;
1275 
1276  o02_re += A2_re;
1277  o02_im += A2_im;
1278  o12_re += B2_re;
1279  o12_im += B2_im;
1280  o22_re -= B2_re;
1281  o22_im -= B2_im;
1282  o32_re += A2_re;
1283  o32_im += A2_im;
1284 
1285 }
1286 
1287 #ifdef MULTI_GPU
1288 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1289  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1290 #endif
1291 {
1292  // Projector P2-
1293  // 1 0 -i 0
1294  // 0 1 0 i
1295  // i 0 1 0
1296  // 0 -i 0 1
1297 
1298 #ifdef MULTI_GPU
1299  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1300  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1301 #else
1302  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1303 #endif
1304 
1305  const int ga_idx = sid;
1306 
1313 
1314 #ifdef MULTI_GPU
1315  if (kernel_type == INTERIOR_KERNEL) {
1316 #endif
1317 
1318  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1319  // read spinor from device memory
1320  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1321 
1322  // project spinor into half spinors
1323  a0_re = +i00_re+i20_im;
1324  a0_im = +i00_im-i20_re;
1325  a1_re = +i01_re+i21_im;
1326  a1_im = +i01_im-i21_re;
1327  a2_re = +i02_re+i22_im;
1328  a2_im = +i02_im-i22_re;
1329  b0_re = +i10_re-i30_im;
1330  b0_im = +i10_im+i30_re;
1331  b1_re = +i11_re-i31_im;
1332  b1_im = +i11_im+i31_re;
1333  b2_re = +i12_re-i32_im;
1334  b2_im = +i12_im+i32_re;
1335  } else {
1336  // load spinor from shared memory
1337  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1338  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1339  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1340 
1341  // project spinor into half spinors
1342  a0_re = +i00_re+i20_im;
1343  a0_im = +i00_im-i20_re;
1344  a1_re = +i01_re+i21_im;
1345  a1_im = +i01_im-i21_re;
1346  a2_re = +i02_re+i22_im;
1347  a2_im = +i02_im-i22_re;
1348  b0_re = +i10_re-i30_im;
1349  b0_im = +i10_im+i30_re;
1350  b1_re = +i11_re-i31_im;
1351  b1_im = +i11_im+i31_re;
1352  b2_re = +i12_re-i32_im;
1353  b2_im = +i12_im+i32_re;
1354  }
1355 
1356 #ifdef MULTI_GPU
1357  } else {
1358 
1359  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1360 
1361  // read half spinor from device memory
1362  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1363 
1364  a0_re = i00_re; a0_im = i00_im;
1365  a1_re = i01_re; a1_im = i01_im;
1366  a2_re = i02_re; a2_im = i02_im;
1367  b0_re = i10_re; b0_im = i10_im;
1368  b1_re = i11_re; b1_im = i11_im;
1369  b2_re = i12_re; b2_im = i12_im;
1370 
1371  }
1372 #endif // MULTI_GPU
1373 
1374  // read gauge matrix from device memory
1375  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1376 
1377  // reconstruct gauge matrix
1379 
1380  // multiply row 0
1381  spinorFloat A0_re = 0;
1382  A0_re += g00_re * a0_re;
1383  A0_re -= g00_im * a0_im;
1384  A0_re += g01_re * a1_re;
1385  A0_re -= g01_im * a1_im;
1386  A0_re += g02_re * a2_re;
1387  A0_re -= g02_im * a2_im;
1388  spinorFloat A0_im = 0;
1389  A0_im += g00_re * a0_im;
1390  A0_im += g00_im * a0_re;
1391  A0_im += g01_re * a1_im;
1392  A0_im += g01_im * a1_re;
1393  A0_im += g02_re * a2_im;
1394  A0_im += g02_im * a2_re;
1395  spinorFloat B0_re = 0;
1396  B0_re += g00_re * b0_re;
1397  B0_re -= g00_im * b0_im;
1398  B0_re += g01_re * b1_re;
1399  B0_re -= g01_im * b1_im;
1400  B0_re += g02_re * b2_re;
1401  B0_re -= g02_im * b2_im;
1402  spinorFloat B0_im = 0;
1403  B0_im += g00_re * b0_im;
1404  B0_im += g00_im * b0_re;
1405  B0_im += g01_re * b1_im;
1406  B0_im += g01_im * b1_re;
1407  B0_im += g02_re * b2_im;
1408  B0_im += g02_im * b2_re;
1409 
1410  // multiply row 1
1411  spinorFloat A1_re = 0;
1412  A1_re += g10_re * a0_re;
1413  A1_re -= g10_im * a0_im;
1414  A1_re += g11_re * a1_re;
1415  A1_re -= g11_im * a1_im;
1416  A1_re += g12_re * a2_re;
1417  A1_re -= g12_im * a2_im;
1418  spinorFloat A1_im = 0;
1419  A1_im += g10_re * a0_im;
1420  A1_im += g10_im * a0_re;
1421  A1_im += g11_re * a1_im;
1422  A1_im += g11_im * a1_re;
1423  A1_im += g12_re * a2_im;
1424  A1_im += g12_im * a2_re;
1425  spinorFloat B1_re = 0;
1426  B1_re += g10_re * b0_re;
1427  B1_re -= g10_im * b0_im;
1428  B1_re += g11_re * b1_re;
1429  B1_re -= g11_im * b1_im;
1430  B1_re += g12_re * b2_re;
1431  B1_re -= g12_im * b2_im;
1432  spinorFloat B1_im = 0;
1433  B1_im += g10_re * b0_im;
1434  B1_im += g10_im * b0_re;
1435  B1_im += g11_re * b1_im;
1436  B1_im += g11_im * b1_re;
1437  B1_im += g12_re * b2_im;
1438  B1_im += g12_im * b2_re;
1439 
1440  // multiply row 2
1441  spinorFloat A2_re = 0;
1442  A2_re += g20_re * a0_re;
1443  A2_re -= g20_im * a0_im;
1444  A2_re += g21_re * a1_re;
1445  A2_re -= g21_im * a1_im;
1446  A2_re += g22_re * a2_re;
1447  A2_re -= g22_im * a2_im;
1448  spinorFloat A2_im = 0;
1449  A2_im += g20_re * a0_im;
1450  A2_im += g20_im * a0_re;
1451  A2_im += g21_re * a1_im;
1452  A2_im += g21_im * a1_re;
1453  A2_im += g22_re * a2_im;
1454  A2_im += g22_im * a2_re;
1455  spinorFloat B2_re = 0;
1456  B2_re += g20_re * b0_re;
1457  B2_re -= g20_im * b0_im;
1458  B2_re += g21_re * b1_re;
1459  B2_re -= g21_im * b1_im;
1460  B2_re += g22_re * b2_re;
1461  B2_re -= g22_im * b2_im;
1462  spinorFloat B2_im = 0;
1463  B2_im += g20_re * b0_im;
1464  B2_im += g20_im * b0_re;
1465  B2_im += g21_re * b1_im;
1466  B2_im += g21_im * b1_re;
1467  B2_im += g22_re * b2_im;
1468  B2_im += g22_im * b2_re;
1469 
1470  o00_re += A0_re;
1471  o00_im += A0_im;
1472  o10_re += B0_re;
1473  o10_im += B0_im;
1474  o20_re -= A0_im;
1475  o20_im += A0_re;
1476  o30_re += B0_im;
1477  o30_im -= B0_re;
1478 
1479  o01_re += A1_re;
1480  o01_im += A1_im;
1481  o11_re += B1_re;
1482  o11_im += B1_im;
1483  o21_re -= A1_im;
1484  o21_im += A1_re;
1485  o31_re += B1_im;
1486  o31_im -= B1_re;
1487 
1488  o02_re += A2_re;
1489  o02_im += A2_im;
1490  o12_re += B2_re;
1491  o12_im += B2_im;
1492  o22_re -= A2_im;
1493  o22_im += A2_re;
1494  o32_re += B2_im;
1495  o32_im -= B2_re;
1496 
1497 }
1498 
1499 #ifdef MULTI_GPU
1500 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1501  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1502 #endif
1503 {
1504  // Projector P2+
1505  // 1 0 i 0
1506  // 0 1 0 -i
1507  // -i 0 1 0
1508  // 0 i 0 1
1509 
1510 #ifdef MULTI_GPU
1511  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1512  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1513 #else
1514  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1515 #endif
1516 
1517 #ifdef MULTI_GPU
1518  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1519 #else
1520  const int ga_idx = sp_idx;
1521 #endif
1522 
1529 
1530 #ifdef MULTI_GPU
1531  if (kernel_type == INTERIOR_KERNEL) {
1532 #endif
1533 
1534  if (threadIdx.z == 0 && blockDim.z < X3) {
1535  // read spinor from device memory
1536  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1537 
1538  // project spinor into half spinors
1539  a0_re = +i00_re-i20_im;
1540  a0_im = +i00_im+i20_re;
1541  a1_re = +i01_re-i21_im;
1542  a1_im = +i01_im+i21_re;
1543  a2_re = +i02_re-i22_im;
1544  a2_im = +i02_im+i22_re;
1545  b0_re = +i10_re+i30_im;
1546  b0_im = +i10_im-i30_re;
1547  b1_re = +i11_re+i31_im;
1548  b1_im = +i11_im-i31_re;
1549  b2_re = +i12_re+i32_im;
1550  b2_im = +i12_im-i32_re;
1551  } else {
1552  // load spinor from shared memory
1553  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1554  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
1555  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1556 
1557  // project spinor into half spinors
1558  a0_re = +i00_re-i20_im;
1559  a0_im = +i00_im+i20_re;
1560  a1_re = +i01_re-i21_im;
1561  a1_im = +i01_im+i21_re;
1562  a2_re = +i02_re-i22_im;
1563  a2_im = +i02_im+i22_re;
1564  b0_re = +i10_re+i30_im;
1565  b0_im = +i10_im-i30_re;
1566  b1_re = +i11_re+i31_im;
1567  b1_im = +i11_im-i31_re;
1568  b2_re = +i12_re+i32_im;
1569  b2_im = +i12_im-i32_re;
1570  }
1571 
1572 #ifdef MULTI_GPU
1573  } else {
1574 
1575  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1576 
1577  // read half spinor from device memory
1578  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1579 
1580  a0_re = i00_re; a0_im = i00_im;
1581  a1_re = i01_re; a1_im = i01_im;
1582  a2_re = i02_re; a2_im = i02_im;
1583  b0_re = i10_re; b0_im = i10_im;
1584  b1_re = i11_re; b1_im = i11_im;
1585  b2_re = i12_re; b2_im = i12_im;
1586 
1587  }
1588 #endif // MULTI_GPU
1589 
1590  // read gauge matrix from device memory
1591  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1592 
1593  // reconstruct gauge matrix
1595 
1596  // multiply row 0
1597  spinorFloat A0_re = 0;
1598  A0_re += gT00_re * a0_re;
1599  A0_re -= gT00_im * a0_im;
1600  A0_re += gT01_re * a1_re;
1601  A0_re -= gT01_im * a1_im;
1602  A0_re += gT02_re * a2_re;
1603  A0_re -= gT02_im * a2_im;
1604  spinorFloat A0_im = 0;
1605  A0_im += gT00_re * a0_im;
1606  A0_im += gT00_im * a0_re;
1607  A0_im += gT01_re * a1_im;
1608  A0_im += gT01_im * a1_re;
1609  A0_im += gT02_re * a2_im;
1610  A0_im += gT02_im * a2_re;
1611  spinorFloat B0_re = 0;
1612  B0_re += gT00_re * b0_re;
1613  B0_re -= gT00_im * b0_im;
1614  B0_re += gT01_re * b1_re;
1615  B0_re -= gT01_im * b1_im;
1616  B0_re += gT02_re * b2_re;
1617  B0_re -= gT02_im * b2_im;
1618  spinorFloat B0_im = 0;
1619  B0_im += gT00_re * b0_im;
1620  B0_im += gT00_im * b0_re;
1621  B0_im += gT01_re * b1_im;
1622  B0_im += gT01_im * b1_re;
1623  B0_im += gT02_re * b2_im;
1624  B0_im += gT02_im * b2_re;
1625 
1626  // multiply row 1
1627  spinorFloat A1_re = 0;
1628  A1_re += gT10_re * a0_re;
1629  A1_re -= gT10_im * a0_im;
1630  A1_re += gT11_re * a1_re;
1631  A1_re -= gT11_im * a1_im;
1632  A1_re += gT12_re * a2_re;
1633  A1_re -= gT12_im * a2_im;
1634  spinorFloat A1_im = 0;
1635  A1_im += gT10_re * a0_im;
1636  A1_im += gT10_im * a0_re;
1637  A1_im += gT11_re * a1_im;
1638  A1_im += gT11_im * a1_re;
1639  A1_im += gT12_re * a2_im;
1640  A1_im += gT12_im * a2_re;
1641  spinorFloat B1_re = 0;
1642  B1_re += gT10_re * b0_re;
1643  B1_re -= gT10_im * b0_im;
1644  B1_re += gT11_re * b1_re;
1645  B1_re -= gT11_im * b1_im;
1646  B1_re += gT12_re * b2_re;
1647  B1_re -= gT12_im * b2_im;
1648  spinorFloat B1_im = 0;
1649  B1_im += gT10_re * b0_im;
1650  B1_im += gT10_im * b0_re;
1651  B1_im += gT11_re * b1_im;
1652  B1_im += gT11_im * b1_re;
1653  B1_im += gT12_re * b2_im;
1654  B1_im += gT12_im * b2_re;
1655 
1656  // multiply row 2
1657  spinorFloat A2_re = 0;
1658  A2_re += gT20_re * a0_re;
1659  A2_re -= gT20_im * a0_im;
1660  A2_re += gT21_re * a1_re;
1661  A2_re -= gT21_im * a1_im;
1662  A2_re += gT22_re * a2_re;
1663  A2_re -= gT22_im * a2_im;
1664  spinorFloat A2_im = 0;
1665  A2_im += gT20_re * a0_im;
1666  A2_im += gT20_im * a0_re;
1667  A2_im += gT21_re * a1_im;
1668  A2_im += gT21_im * a1_re;
1669  A2_im += gT22_re * a2_im;
1670  A2_im += gT22_im * a2_re;
1671  spinorFloat B2_re = 0;
1672  B2_re += gT20_re * b0_re;
1673  B2_re -= gT20_im * b0_im;
1674  B2_re += gT21_re * b1_re;
1675  B2_re -= gT21_im * b1_im;
1676  B2_re += gT22_re * b2_re;
1677  B2_re -= gT22_im * b2_im;
1678  spinorFloat B2_im = 0;
1679  B2_im += gT20_re * b0_im;
1680  B2_im += gT20_im * b0_re;
1681  B2_im += gT21_re * b1_im;
1682  B2_im += gT21_im * b1_re;
1683  B2_im += gT22_re * b2_im;
1684  B2_im += gT22_im * b2_re;
1685 
1686  o00_re += A0_re;
1687  o00_im += A0_im;
1688  o10_re += B0_re;
1689  o10_im += B0_im;
1690  o20_re += A0_im;
1691  o20_im -= A0_re;
1692  o30_re -= B0_im;
1693  o30_im += B0_re;
1694 
1695  o01_re += A1_re;
1696  o01_im += A1_im;
1697  o11_re += B1_re;
1698  o11_im += B1_im;
1699  o21_re += A1_im;
1700  o21_im -= A1_re;
1701  o31_re -= B1_im;
1702  o31_im += B1_re;
1703 
1704  o02_re += A2_re;
1705  o02_im += A2_im;
1706  o12_re += B2_re;
1707  o12_im += B2_im;
1708  o22_re += A2_im;
1709  o22_im -= A2_re;
1710  o32_re -= B2_im;
1711  o32_im += B2_re;
1712 
1713 }
1714 
1715 #ifdef MULTI_GPU
1716 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1717  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1718 #endif
1719 {
1720  // Projector P3-
1721  // 0 0 0 0
1722  // 0 0 0 0
1723  // 0 0 2 0
1724  // 0 0 0 2
1725 
1726 #ifdef MULTI_GPU
1727  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1728  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1729 #else
1730  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1731 #endif
1732 
1733  const int ga_idx = sid;
1734 
1735  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1736  {
1743 
1744 #ifdef MULTI_GPU
1745  if (kernel_type == INTERIOR_KERNEL) {
1746 #endif
1747 
1748  // read spinor from device memory
1749  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1750 
1751  // project spinor into half spinors
1752  a0_re = +2*i20_re;
1753  a0_im = +2*i20_im;
1754  a1_re = +2*i21_re;
1755  a1_im = +2*i21_im;
1756  a2_re = +2*i22_re;
1757  a2_im = +2*i22_im;
1758  b0_re = +2*i30_re;
1759  b0_im = +2*i30_im;
1760  b1_re = +2*i31_re;
1761  b1_im = +2*i31_im;
1762  b2_re = +2*i32_re;
1763  b2_im = +2*i32_im;
1764 
1765 #ifdef MULTI_GPU
1766  } else {
1767 
1768  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1769  const int t_proj_scale = TPROJSCALE;
1770 
1771  // read half spinor from device memory
1772  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1773 
1774  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1775  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1776  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1777  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1778  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1779  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1780 
1781  }
1782 #endif // MULTI_GPU
1783 
1784  // identity gauge matrix
1791 
1792  o20_re += A0_re;
1793  o20_im += A0_im;
1794  o30_re += B0_re;
1795  o30_im += B0_im;
1796 
1797  o21_re += A1_re;
1798  o21_im += A1_im;
1799  o31_re += B1_re;
1800  o31_im += B1_im;
1801 
1802  o22_re += A2_re;
1803  o22_im += A2_im;
1804  o32_re += B2_re;
1805  o32_im += B2_im;
1806 
1807  } else {
1814 
1815 #ifdef MULTI_GPU
1816  if (kernel_type == INTERIOR_KERNEL) {
1817 #endif
1818 
1819  // read spinor from device memory
1820  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1821 
1822  // project spinor into half spinors
1823  a0_re = +2*i20_re;
1824  a0_im = +2*i20_im;
1825  a1_re = +2*i21_re;
1826  a1_im = +2*i21_im;
1827  a2_re = +2*i22_re;
1828  a2_im = +2*i22_im;
1829  b0_re = +2*i30_re;
1830  b0_im = +2*i30_im;
1831  b1_re = +2*i31_re;
1832  b1_im = +2*i31_im;
1833  b2_re = +2*i32_re;
1834  b2_im = +2*i32_im;
1835 
1836 #ifdef MULTI_GPU
1837  } else {
1838 
1839  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1840  const int t_proj_scale = TPROJSCALE;
1841 
1842  // read half spinor from device memory
1843  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1844 
1845  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1846  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1847  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1848  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1849  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1850  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1851 
1852  }
1853 #endif // MULTI_GPU
1854 
1855  // read gauge matrix from device memory
1856  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1857 
1858  // reconstruct gauge matrix
1860 
1861  // multiply row 0
1862  spinorFloat A0_re = 0;
1863  A0_re += g00_re * a0_re;
1864  A0_re -= g00_im * a0_im;
1865  A0_re += g01_re * a1_re;
1866  A0_re -= g01_im * a1_im;
1867  A0_re += g02_re * a2_re;
1868  A0_re -= g02_im * a2_im;
1869  spinorFloat A0_im = 0;
1870  A0_im += g00_re * a0_im;
1871  A0_im += g00_im * a0_re;
1872  A0_im += g01_re * a1_im;
1873  A0_im += g01_im * a1_re;
1874  A0_im += g02_re * a2_im;
1875  A0_im += g02_im * a2_re;
1876  spinorFloat B0_re = 0;
1877  B0_re += g00_re * b0_re;
1878  B0_re -= g00_im * b0_im;
1879  B0_re += g01_re * b1_re;
1880  B0_re -= g01_im * b1_im;
1881  B0_re += g02_re * b2_re;
1882  B0_re -= g02_im * b2_im;
1883  spinorFloat B0_im = 0;
1884  B0_im += g00_re * b0_im;
1885  B0_im += g00_im * b0_re;
1886  B0_im += g01_re * b1_im;
1887  B0_im += g01_im * b1_re;
1888  B0_im += g02_re * b2_im;
1889  B0_im += g02_im * b2_re;
1890 
1891  // multiply row 1
1892  spinorFloat A1_re = 0;
1893  A1_re += g10_re * a0_re;
1894  A1_re -= g10_im * a0_im;
1895  A1_re += g11_re * a1_re;
1896  A1_re -= g11_im * a1_im;
1897  A1_re += g12_re * a2_re;
1898  A1_re -= g12_im * a2_im;
1899  spinorFloat A1_im = 0;
1900  A1_im += g10_re * a0_im;
1901  A1_im += g10_im * a0_re;
1902  A1_im += g11_re * a1_im;
1903  A1_im += g11_im * a1_re;
1904  A1_im += g12_re * a2_im;
1905  A1_im += g12_im * a2_re;
1906  spinorFloat B1_re = 0;
1907  B1_re += g10_re * b0_re;
1908  B1_re -= g10_im * b0_im;
1909  B1_re += g11_re * b1_re;
1910  B1_re -= g11_im * b1_im;
1911  B1_re += g12_re * b2_re;
1912  B1_re -= g12_im * b2_im;
1913  spinorFloat B1_im = 0;
1914  B1_im += g10_re * b0_im;
1915  B1_im += g10_im * b0_re;
1916  B1_im += g11_re * b1_im;
1917  B1_im += g11_im * b1_re;
1918  B1_im += g12_re * b2_im;
1919  B1_im += g12_im * b2_re;
1920 
1921  // multiply row 2
1922  spinorFloat A2_re = 0;
1923  A2_re += g20_re * a0_re;
1924  A2_re -= g20_im * a0_im;
1925  A2_re += g21_re * a1_re;
1926  A2_re -= g21_im * a1_im;
1927  A2_re += g22_re * a2_re;
1928  A2_re -= g22_im * a2_im;
1929  spinorFloat A2_im = 0;
1930  A2_im += g20_re * a0_im;
1931  A2_im += g20_im * a0_re;
1932  A2_im += g21_re * a1_im;
1933  A2_im += g21_im * a1_re;
1934  A2_im += g22_re * a2_im;
1935  A2_im += g22_im * a2_re;
1936  spinorFloat B2_re = 0;
1937  B2_re += g20_re * b0_re;
1938  B2_re -= g20_im * b0_im;
1939  B2_re += g21_re * b1_re;
1940  B2_re -= g21_im * b1_im;
1941  B2_re += g22_re * b2_re;
1942  B2_re -= g22_im * b2_im;
1943  spinorFloat B2_im = 0;
1944  B2_im += g20_re * b0_im;
1945  B2_im += g20_im * b0_re;
1946  B2_im += g21_re * b1_im;
1947  B2_im += g21_im * b1_re;
1948  B2_im += g22_re * b2_im;
1949  B2_im += g22_im * b2_re;
1950 
1951  o20_re += A0_re;
1952  o20_im += A0_im;
1953  o30_re += B0_re;
1954  o30_im += B0_im;
1955 
1956  o21_re += A1_re;
1957  o21_im += A1_im;
1958  o31_re += B1_re;
1959  o31_im += B1_im;
1960 
1961  o22_re += A2_re;
1962  o22_im += A2_im;
1963  o32_re += B2_re;
1964  o32_im += B2_im;
1965 
1966  }
1967 }
1968 
1969 #ifdef MULTI_GPU
1970 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1971  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1972 #endif
1973 {
1974  // Projector P3+
1975  // 2 0 0 0
1976  // 0 2 0 0
1977  // 0 0 0 0
1978  // 0 0 0 0
1979 
1980 #ifdef MULTI_GPU
1981  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1982  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1983 #else
1984  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1985 #endif
1986 
1987 #ifdef MULTI_GPU
1988  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1989 #else
1990  const int ga_idx = sp_idx;
1991 #endif
1992 
1993  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1994  {
2001 
2002 #ifdef MULTI_GPU
2003  if (kernel_type == INTERIOR_KERNEL) {
2004 #endif
2005 
2006  // read spinor from device memory
2007  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2008 
2009  // project spinor into half spinors
2010  a0_re = +2*i00_re;
2011  a0_im = +2*i00_im;
2012  a1_re = +2*i01_re;
2013  a1_im = +2*i01_im;
2014  a2_re = +2*i02_re;
2015  a2_im = +2*i02_im;
2016  b0_re = +2*i10_re;
2017  b0_im = +2*i10_im;
2018  b1_re = +2*i11_re;
2019  b1_im = +2*i11_im;
2020  b2_re = +2*i12_re;
2021  b2_im = +2*i12_im;
2022 
2023 #ifdef MULTI_GPU
2024  } else {
2025 
2026  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2027  const int t_proj_scale = TPROJSCALE;
2028 
2029  // read half spinor from device memory
2030  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2031 
2032  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2033  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2034  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2035  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2036  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2037  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2038 
2039  }
2040 #endif // MULTI_GPU
2041 
2042  // identity gauge matrix
2049 
2050  o00_re += A0_re;
2051  o00_im += A0_im;
2052  o10_re += B0_re;
2053  o10_im += B0_im;
2054 
2055  o01_re += A1_re;
2056  o01_im += A1_im;
2057  o11_re += B1_re;
2058  o11_im += B1_im;
2059 
2060  o02_re += A2_re;
2061  o02_im += A2_im;
2062  o12_re += B2_re;
2063  o12_im += B2_im;
2064 
2065  } else {
2072 
2073 #ifdef MULTI_GPU
2074  if (kernel_type == INTERIOR_KERNEL) {
2075 #endif
2076 
2077  // read spinor from device memory
2078  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2079 
2080  // project spinor into half spinors
2081  a0_re = +2*i00_re;
2082  a0_im = +2*i00_im;
2083  a1_re = +2*i01_re;
2084  a1_im = +2*i01_im;
2085  a2_re = +2*i02_re;
2086  a2_im = +2*i02_im;
2087  b0_re = +2*i10_re;
2088  b0_im = +2*i10_im;
2089  b1_re = +2*i11_re;
2090  b1_im = +2*i11_im;
2091  b2_re = +2*i12_re;
2092  b2_im = +2*i12_im;
2093 
2094 #ifdef MULTI_GPU
2095  } else {
2096 
2097  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2098  const int t_proj_scale = TPROJSCALE;
2099 
2100  // read half spinor from device memory
2101  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2102 
2103  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2104  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2105  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2106  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2107  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2108  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2109 
2110  }
2111 #endif // MULTI_GPU
2112 
2113  // read gauge matrix from device memory
2114  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2115 
2116  // reconstruct gauge matrix
2118 
2119  // multiply row 0
2120  spinorFloat A0_re = 0;
2121  A0_re += gT00_re * a0_re;
2122  A0_re -= gT00_im * a0_im;
2123  A0_re += gT01_re * a1_re;
2124  A0_re -= gT01_im * a1_im;
2125  A0_re += gT02_re * a2_re;
2126  A0_re -= gT02_im * a2_im;
2127  spinorFloat A0_im = 0;
2128  A0_im += gT00_re * a0_im;
2129  A0_im += gT00_im * a0_re;
2130  A0_im += gT01_re * a1_im;
2131  A0_im += gT01_im * a1_re;
2132  A0_im += gT02_re * a2_im;
2133  A0_im += gT02_im * a2_re;
2134  spinorFloat B0_re = 0;
2135  B0_re += gT00_re * b0_re;
2136  B0_re -= gT00_im * b0_im;
2137  B0_re += gT01_re * b1_re;
2138  B0_re -= gT01_im * b1_im;
2139  B0_re += gT02_re * b2_re;
2140  B0_re -= gT02_im * b2_im;
2141  spinorFloat B0_im = 0;
2142  B0_im += gT00_re * b0_im;
2143  B0_im += gT00_im * b0_re;
2144  B0_im += gT01_re * b1_im;
2145  B0_im += gT01_im * b1_re;
2146  B0_im += gT02_re * b2_im;
2147  B0_im += gT02_im * b2_re;
2148 
2149  // multiply row 1
2150  spinorFloat A1_re = 0;
2151  A1_re += gT10_re * a0_re;
2152  A1_re -= gT10_im * a0_im;
2153  A1_re += gT11_re * a1_re;
2154  A1_re -= gT11_im * a1_im;
2155  A1_re += gT12_re * a2_re;
2156  A1_re -= gT12_im * a2_im;
2157  spinorFloat A1_im = 0;
2158  A1_im += gT10_re * a0_im;
2159  A1_im += gT10_im * a0_re;
2160  A1_im += gT11_re * a1_im;
2161  A1_im += gT11_im * a1_re;
2162  A1_im += gT12_re * a2_im;
2163  A1_im += gT12_im * a2_re;
2164  spinorFloat B1_re = 0;
2165  B1_re += gT10_re * b0_re;
2166  B1_re -= gT10_im * b0_im;
2167  B1_re += gT11_re * b1_re;
2168  B1_re -= gT11_im * b1_im;
2169  B1_re += gT12_re * b2_re;
2170  B1_re -= gT12_im * b2_im;
2171  spinorFloat B1_im = 0;
2172  B1_im += gT10_re * b0_im;
2173  B1_im += gT10_im * b0_re;
2174  B1_im += gT11_re * b1_im;
2175  B1_im += gT11_im * b1_re;
2176  B1_im += gT12_re * b2_im;
2177  B1_im += gT12_im * b2_re;
2178 
2179  // multiply row 2
2180  spinorFloat A2_re = 0;
2181  A2_re += gT20_re * a0_re;
2182  A2_re -= gT20_im * a0_im;
2183  A2_re += gT21_re * a1_re;
2184  A2_re -= gT21_im * a1_im;
2185  A2_re += gT22_re * a2_re;
2186  A2_re -= gT22_im * a2_im;
2187  spinorFloat A2_im = 0;
2188  A2_im += gT20_re * a0_im;
2189  A2_im += gT20_im * a0_re;
2190  A2_im += gT21_re * a1_im;
2191  A2_im += gT21_im * a1_re;
2192  A2_im += gT22_re * a2_im;
2193  A2_im += gT22_im * a2_re;
2194  spinorFloat B2_re = 0;
2195  B2_re += gT20_re * b0_re;
2196  B2_re -= gT20_im * b0_im;
2197  B2_re += gT21_re * b1_re;
2198  B2_re -= gT21_im * b1_im;
2199  B2_re += gT22_re * b2_re;
2200  B2_re -= gT22_im * b2_im;
2201  spinorFloat B2_im = 0;
2202  B2_im += gT20_re * b0_im;
2203  B2_im += gT20_im * b0_re;
2204  B2_im += gT21_re * b1_im;
2205  B2_im += gT21_im * b1_re;
2206  B2_im += gT22_re * b2_im;
2207  B2_im += gT22_im * b2_re;
2208 
2209  o00_re += A0_re;
2210  o00_im += A0_im;
2211  o10_re += B0_re;
2212  o10_im += B0_im;
2213 
2214  o01_re += A1_re;
2215  o01_im += A1_im;
2216  o11_re += B1_re;
2217  o11_im += B1_im;
2218 
2219  o02_re += A2_re;
2220  o02_im += A2_im;
2221  o12_re += B2_re;
2222  o12_im += B2_im;
2223 
2224  }
2225 }
2226 
2227 #if defined MULTI_GPU && (defined DSLASH_XPAY || defined DSLASH_CLOVER)
2228 
2229 int incomplete = 0; // Have all 8 contributions been computed for this site?
2230 
2231 switch(kernel_type) { // intentional fall-through
2232 case INTERIOR_KERNEL:
2233  incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2234 case EXTERIOR_KERNEL_T:
2235  incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2236 case EXTERIOR_KERNEL_Z:
2237  incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2238 case EXTERIOR_KERNEL_Y:
2239  incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2240 }
2241 
2242 if (!incomplete)
2243 #endif // MULTI_GPU
2244 {
2245 #ifdef DSLASH_CLOVER
2246 
2247  // change to chiral basis
2248  {
2255  spinorFloat a30_re = o00_re - o20_re;
2256  spinorFloat a30_im = o00_im - o20_im;
2257 
2258  o00_re = a00_re; o00_im = a00_im;
2259  o10_re = a10_re; o10_im = a10_im;
2260  o20_re = a20_re; o20_im = a20_im;
2261  o30_re = a30_re; o30_im = a30_im;
2262  }
2263 
2264  {
2271  spinorFloat a31_re = o01_re - o21_re;
2272  spinorFloat a31_im = o01_im - o21_im;
2273 
2274  o01_re = a01_re; o01_im = a01_im;
2275  o11_re = a11_re; o11_im = a11_im;
2276  o21_re = a21_re; o21_im = a21_im;
2277  o31_re = a31_re; o31_im = a31_im;
2278  }
2279 
2280  {
2287  spinorFloat a32_re = o02_re - o22_re;
2288  spinorFloat a32_im = o02_im - o22_im;
2289 
2290  o02_re = a02_re; o02_im = a02_im;
2291  o12_re = a12_re; o12_im = a12_im;
2292  o22_re = a22_re; o22_im = a22_im;
2293  o32_re = a32_re; o32_im = a32_im;
2294  }
2295 
2296  // apply first chiral block
2297  {
2299 
2306 
2307  a00_re += c00_00_re * o00_re;
2308  a00_im += c00_00_re * o00_im;
2309  a00_re += c00_01_re * o01_re;
2310  a00_re -= c00_01_im * o01_im;
2311  a00_im += c00_01_re * o01_im;
2312  a00_im += c00_01_im * o01_re;
2313  a00_re += c00_02_re * o02_re;
2314  a00_re -= c00_02_im * o02_im;
2315  a00_im += c00_02_re * o02_im;
2316  a00_im += c00_02_im * o02_re;
2317  a00_re += c00_10_re * o10_re;
2318  a00_re -= c00_10_im * o10_im;
2319  a00_im += c00_10_re * o10_im;
2320  a00_im += c00_10_im * o10_re;
2321  a00_re += c00_11_re * o11_re;
2322  a00_re -= c00_11_im * o11_im;
2323  a00_im += c00_11_re * o11_im;
2324  a00_im += c00_11_im * o11_re;
2325  a00_re += c00_12_re * o12_re;
2326  a00_re -= c00_12_im * o12_im;
2327  a00_im += c00_12_re * o12_im;
2328  a00_im += c00_12_im * o12_re;
2329 
2330  a01_re += c01_00_re * o00_re;
2331  a01_re -= c01_00_im * o00_im;
2332  a01_im += c01_00_re * o00_im;
2333  a01_im += c01_00_im * o00_re;
2334  a01_re += c01_01_re * o01_re;
2335  a01_im += c01_01_re * o01_im;
2336  a01_re += c01_02_re * o02_re;
2337  a01_re -= c01_02_im * o02_im;
2338  a01_im += c01_02_re * o02_im;
2339  a01_im += c01_02_im * o02_re;
2340  a01_re += c01_10_re * o10_re;
2341  a01_re -= c01_10_im * o10_im;
2342  a01_im += c01_10_re * o10_im;
2343  a01_im += c01_10_im * o10_re;
2344  a01_re += c01_11_re * o11_re;
2345  a01_re -= c01_11_im * o11_im;
2346  a01_im += c01_11_re * o11_im;
2347  a01_im += c01_11_im * o11_re;
2348  a01_re += c01_12_re * o12_re;
2349  a01_re -= c01_12_im * o12_im;
2350  a01_im += c01_12_re * o12_im;
2351  a01_im += c01_12_im * o12_re;
2352 
2353  a02_re += c02_00_re * o00_re;
2354  a02_re -= c02_00_im * o00_im;
2355  a02_im += c02_00_re * o00_im;
2356  a02_im += c02_00_im * o00_re;
2357  a02_re += c02_01_re * o01_re;
2358  a02_re -= c02_01_im * o01_im;
2359  a02_im += c02_01_re * o01_im;
2360  a02_im += c02_01_im * o01_re;
2361  a02_re += c02_02_re * o02_re;
2362  a02_im += c02_02_re * o02_im;
2363  a02_re += c02_10_re * o10_re;
2364  a02_re -= c02_10_im * o10_im;
2365  a02_im += c02_10_re * o10_im;
2366  a02_im += c02_10_im * o10_re;
2367  a02_re += c02_11_re * o11_re;
2368  a02_re -= c02_11_im * o11_im;
2369  a02_im += c02_11_re * o11_im;
2370  a02_im += c02_11_im * o11_re;
2371  a02_re += c02_12_re * o12_re;
2372  a02_re -= c02_12_im * o12_im;
2373  a02_im += c02_12_re * o12_im;
2374  a02_im += c02_12_im * o12_re;
2375 
2376  a10_re += c10_00_re * o00_re;
2377  a10_re -= c10_00_im * o00_im;
2378  a10_im += c10_00_re * o00_im;
2379  a10_im += c10_00_im * o00_re;
2380  a10_re += c10_01_re * o01_re;
2381  a10_re -= c10_01_im * o01_im;
2382  a10_im += c10_01_re * o01_im;
2383  a10_im += c10_01_im * o01_re;
2384  a10_re += c10_02_re * o02_re;
2385  a10_re -= c10_02_im * o02_im;
2386  a10_im += c10_02_re * o02_im;
2387  a10_im += c10_02_im * o02_re;
2388  a10_re += c10_10_re * o10_re;
2389  a10_im += c10_10_re * o10_im;
2390  a10_re += c10_11_re * o11_re;
2391  a10_re -= c10_11_im * o11_im;
2392  a10_im += c10_11_re * o11_im;
2393  a10_im += c10_11_im * o11_re;
2394  a10_re += c10_12_re * o12_re;
2395  a10_re -= c10_12_im * o12_im;
2396  a10_im += c10_12_re * o12_im;
2397  a10_im += c10_12_im * o12_re;
2398 
2399  a11_re += c11_00_re * o00_re;
2400  a11_re -= c11_00_im * o00_im;
2401  a11_im += c11_00_re * o00_im;
2402  a11_im += c11_00_im * o00_re;
2403  a11_re += c11_01_re * o01_re;
2404  a11_re -= c11_01_im * o01_im;
2405  a11_im += c11_01_re * o01_im;
2406  a11_im += c11_01_im * o01_re;
2407  a11_re += c11_02_re * o02_re;
2408  a11_re -= c11_02_im * o02_im;
2409  a11_im += c11_02_re * o02_im;
2410  a11_im += c11_02_im * o02_re;
2411  a11_re += c11_10_re * o10_re;
2412  a11_re -= c11_10_im * o10_im;
2413  a11_im += c11_10_re * o10_im;
2414  a11_im += c11_10_im * o10_re;
2415  a11_re += c11_11_re * o11_re;
2416  a11_im += c11_11_re * o11_im;
2417  a11_re += c11_12_re * o12_re;
2418  a11_re -= c11_12_im * o12_im;
2419  a11_im += c11_12_re * o12_im;
2420  a11_im += c11_12_im * o12_re;
2421 
2422  a12_re += c12_00_re * o00_re;
2423  a12_re -= c12_00_im * o00_im;
2424  a12_im += c12_00_re * o00_im;
2425  a12_im += c12_00_im * o00_re;
2426  a12_re += c12_01_re * o01_re;
2427  a12_re -= c12_01_im * o01_im;
2428  a12_im += c12_01_re * o01_im;
2429  a12_im += c12_01_im * o01_re;
2430  a12_re += c12_02_re * o02_re;
2431  a12_re -= c12_02_im * o02_im;
2432  a12_im += c12_02_re * o02_im;
2433  a12_im += c12_02_im * o02_re;
2434  a12_re += c12_10_re * o10_re;
2435  a12_re -= c12_10_im * o10_im;
2436  a12_im += c12_10_re * o10_im;
2437  a12_im += c12_10_im * o10_re;
2438  a12_re += c12_11_re * o11_re;
2439  a12_re -= c12_11_im * o11_im;
2440  a12_im += c12_11_re * o11_im;
2441  a12_im += c12_11_im * o11_re;
2442  a12_re += c12_12_re * o12_re;
2443  a12_im += c12_12_re * o12_im;
2444 
2445  o00_re = a00_re; o00_im = a00_im;
2446  o01_re = a01_re; o01_im = a01_im;
2447  o02_re = a02_re; o02_im = a02_im;
2448  o10_re = a10_re; o10_im = a10_im;
2449  o11_re = a11_re; o11_im = a11_im;
2450  o12_re = a12_re; o12_im = a12_im;
2451 
2452  }
2453 
2454  // apply second chiral block
2455  {
// Multiply-accumulate the lower-chirality half of the clover term:
// a = C * o over spin components 2,3 and colors 0..2 (a 6x6 complex
// matrix-vector product, fully unrolled by the code generator).
// Diagonal entries (c20_20, c21_21, c22_22, c30_30, c31_31, c32_32)
// contribute only a *_re factor — their imaginary part is treated as
// zero, consistent with a Hermitian clover block.
// NOTE(review): this Doxygen extraction elides original lines 2456-2460;
// presumably they declare/zero a20_re/a20_im, a21_re/a21_im and
// a22_re/a22_im analogously to a30..a32 below — confirm against the
// generated source.
2457 
2461  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2462  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2463  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2464 
// row (spin 2, color 0): a20 += sum_j c20_j * o_j, complex arithmetic
// expanded as (re*re - im*im) and (re*im + im*re).
2465  a20_re += c20_20_re * o20_re;
2466  a20_im += c20_20_re * o20_im;
2467  a20_re += c20_21_re * o21_re;
2468  a20_re -= c20_21_im * o21_im;
2469  a20_im += c20_21_re * o21_im;
2470  a20_im += c20_21_im * o21_re;
2471  a20_re += c20_22_re * o22_re;
2472  a20_re -= c20_22_im * o22_im;
2473  a20_im += c20_22_re * o22_im;
2474  a20_im += c20_22_im * o22_re;
2475  a20_re += c20_30_re * o30_re;
2476  a20_re -= c20_30_im * o30_im;
2477  a20_im += c20_30_re * o30_im;
2478  a20_im += c20_30_im * o30_re;
2479  a20_re += c20_31_re * o31_re;
2480  a20_re -= c20_31_im * o31_im;
2481  a20_im += c20_31_re * o31_im;
2482  a20_im += c20_31_im * o31_re;
2483  a20_re += c20_32_re * o32_re;
2484  a20_re -= c20_32_im * o32_im;
2485  a20_im += c20_32_re * o32_im;
2486  a20_im += c20_32_im * o32_re;
2487 
// row (spin 2, color 1)
2488  a21_re += c21_20_re * o20_re;
2489  a21_re -= c21_20_im * o20_im;
2490  a21_im += c21_20_re * o20_im;
2491  a21_im += c21_20_im * o20_re;
2492  a21_re += c21_21_re * o21_re;
2493  a21_im += c21_21_re * o21_im;
2494  a21_re += c21_22_re * o22_re;
2495  a21_re -= c21_22_im * o22_im;
2496  a21_im += c21_22_re * o22_im;
2497  a21_im += c21_22_im * o22_re;
2498  a21_re += c21_30_re * o30_re;
2499  a21_re -= c21_30_im * o30_im;
2500  a21_im += c21_30_re * o30_im;
2501  a21_im += c21_30_im * o30_re;
2502  a21_re += c21_31_re * o31_re;
2503  a21_re -= c21_31_im * o31_im;
2504  a21_im += c21_31_re * o31_im;
2505  a21_im += c21_31_im * o31_re;
2506  a21_re += c21_32_re * o32_re;
2507  a21_re -= c21_32_im * o32_im;
2508  a21_im += c21_32_re * o32_im;
2509  a21_im += c21_32_im * o32_re;
2510 
// row (spin 2, color 2)
2511  a22_re += c22_20_re * o20_re;
2512  a22_re -= c22_20_im * o20_im;
2513  a22_im += c22_20_re * o20_im;
2514  a22_im += c22_20_im * o20_re;
2515  a22_re += c22_21_re * o21_re;
2516  a22_re -= c22_21_im * o21_im;
2517  a22_im += c22_21_re * o21_im;
2518  a22_im += c22_21_im * o21_re;
2519  a22_re += c22_22_re * o22_re;
2520  a22_im += c22_22_re * o22_im;
2521  a22_re += c22_30_re * o30_re;
2522  a22_re -= c22_30_im * o30_im;
2523  a22_im += c22_30_re * o30_im;
2524  a22_im += c22_30_im * o30_re;
2525  a22_re += c22_31_re * o31_re;
2526  a22_re -= c22_31_im * o31_im;
2527  a22_im += c22_31_re * o31_im;
2528  a22_im += c22_31_im * o31_re;
2529  a22_re += c22_32_re * o32_re;
2530  a22_re -= c22_32_im * o32_im;
2531  a22_im += c22_32_re * o32_im;
2532  a22_im += c22_32_im * o32_re;
2533 
// row (spin 3, color 0)
2534  a30_re += c30_20_re * o20_re;
2535  a30_re -= c30_20_im * o20_im;
2536  a30_im += c30_20_re * o20_im;
2537  a30_im += c30_20_im * o20_re;
2538  a30_re += c30_21_re * o21_re;
2539  a30_re -= c30_21_im * o21_im;
2540  a30_im += c30_21_re * o21_im;
2541  a30_im += c30_21_im * o21_re;
2542  a30_re += c30_22_re * o22_re;
2543  a30_re -= c30_22_im * o22_im;
2544  a30_im += c30_22_re * o22_im;
2545  a30_im += c30_22_im * o22_re;
2546  a30_re += c30_30_re * o30_re;
2547  a30_im += c30_30_re * o30_im;
2548  a30_re += c30_31_re * o31_re;
2549  a30_re -= c30_31_im * o31_im;
2550  a30_im += c30_31_re * o31_im;
2551  a30_im += c30_31_im * o31_re;
2552  a30_re += c30_32_re * o32_re;
2553  a30_re -= c30_32_im * o32_im;
2554  a30_im += c30_32_re * o32_im;
2555  a30_im += c30_32_im * o32_re;
2556 
// row (spin 3, color 1)
2557  a31_re += c31_20_re * o20_re;
2558  a31_re -= c31_20_im * o20_im;
2559  a31_im += c31_20_re * o20_im;
2560  a31_im += c31_20_im * o20_re;
2561  a31_re += c31_21_re * o21_re;
2562  a31_re -= c31_21_im * o21_im;
2563  a31_im += c31_21_re * o21_im;
2564  a31_im += c31_21_im * o21_re;
2565  a31_re += c31_22_re * o22_re;
2566  a31_re -= c31_22_im * o22_im;
2567  a31_im += c31_22_re * o22_im;
2568  a31_im += c31_22_im * o22_re;
2569  a31_re += c31_30_re * o30_re;
2570  a31_re -= c31_30_im * o30_im;
2571  a31_im += c31_30_re * o30_im;
2572  a31_im += c31_30_im * o30_re;
2573  a31_re += c31_31_re * o31_re;
2574  a31_im += c31_31_re * o31_im;
2575  a31_re += c31_32_re * o32_re;
2576  a31_re -= c31_32_im * o32_im;
2577  a31_im += c31_32_re * o32_im;
2578  a31_im += c31_32_im * o32_re;
2579 
// row (spin 3, color 2)
2580  a32_re += c32_20_re * o20_re;
2581  a32_re -= c32_20_im * o20_im;
2582  a32_im += c32_20_re * o20_im;
2583  a32_im += c32_20_im * o20_re;
2584  a32_re += c32_21_re * o21_re;
2585  a32_re -= c32_21_im * o21_im;
2586  a32_im += c32_21_re * o21_im;
2587  a32_im += c32_21_im * o21_re;
2588  a32_re += c32_22_re * o22_re;
2589  a32_re -= c32_22_im * o22_im;
2590  a32_im += c32_22_re * o22_im;
2591  a32_im += c32_22_im * o22_re;
2592  a32_re += c32_30_re * o30_re;
2593  a32_re -= c32_30_im * o30_im;
2594  a32_im += c32_30_re * o30_im;
2595  a32_im += c32_30_im * o30_re;
2596  a32_re += c32_31_re * o31_re;
2597  a32_re -= c32_31_im * o31_im;
2598  a32_im += c32_31_re * o31_im;
2599  a32_im += c32_31_im * o31_re;
2600  a32_re += c32_32_re * o32_re;
2601  a32_im += c32_32_re * o32_im;
2602 
// write the clover-multiplied result back into the output spinor registers
2603  o20_re = a20_re; o20_im = a20_im;
2604  o21_re = a21_re; o21_im = a21_im;
2605  o22_re = a22_re; o22_im = a22_im;
2606  o30_re = a30_re; o30_im = a30_im;
2607  o31_re = a31_re; o31_im = a31_im;
2608  o32_re = a32_re; o32_im = a32_im;
2609 
2610  }
2611 
2612  // change back from chiral basis
2613  // (note: required factor of 1/2 is included in clover term normalization)
2614  {
// Color component 0: rotate the four spin components out of the chiral
// basis via (s0,s1,s2,s3) <- (s1+s3, -s0-s2, s1-s3, -s0+s2).
// Temporaries are needed because each output mixes two inputs that are
// themselves overwritten below.
2615  spinorFloat a00_re = o10_re + o30_re;
2616  spinorFloat a00_im = o10_im + o30_im;
2617  spinorFloat a10_re = -o00_re - o20_re;
2618  spinorFloat a10_im = -o00_im - o20_im;
2619  spinorFloat a20_re = o10_re - o30_re;
2620  spinorFloat a20_im = o10_im - o30_im;
2621  spinorFloat a30_re = -o00_re + o20_re;
2622  spinorFloat a30_im = -o00_im + o20_im;
2623 
2624  o00_re = a00_re; o00_im = a00_im;
2625  o10_re = a10_re; o10_im = a10_im;
2626  o20_re = a20_re; o20_im = a20_im;
2627  o30_re = a30_re; o30_im = a30_im;
2628  }
2629 
2630  {
// Color component 1: same spin rotation as for color 0 above,
// (s0,s1,s2,s3) <- (s1+s3, -s0-s2, s1-s3, -s0+s2).
2631  spinorFloat a01_re = o11_re + o31_re;
2632  spinorFloat a01_im = o11_im + o31_im;
2633  spinorFloat a11_re = -o01_re - o21_re;
2634  spinorFloat a11_im = -o01_im - o21_im;
2635  spinorFloat a21_re = o11_re - o31_re;
2636  spinorFloat a21_im = o11_im - o31_im;
2637  spinorFloat a31_re = -o01_re + o21_re;
2638  spinorFloat a31_im = -o01_im + o21_im;
2639 
2640  o01_re = a01_re; o01_im = a01_im;
2641  o11_re = a11_re; o11_im = a11_im;
2642  o21_re = a21_re; o21_im = a21_im;
2643  o31_re = a31_re; o31_im = a31_im;
2644  }
2645 
2646  {
// Color component 2: same spin rotation as for colors 0 and 1,
// (s0,s1,s2,s3) <- (s1+s3, -s0-s2, s1-s3, -s0+s2).
2647  spinorFloat a02_re = o12_re + o32_re;
2648  spinorFloat a02_im = o12_im + o32_im;
2649  spinorFloat a12_re = -o02_re - o22_re;
2650  spinorFloat a12_im = -o02_im - o22_im;
2651  spinorFloat a22_re = o12_re - o32_re;
2652  spinorFloat a22_im = o12_im - o32_im;
2653  spinorFloat a32_re = -o02_re + o22_re;
2654  spinorFloat a32_im = -o02_im + o22_im;
2655 
2656  o02_re = a02_re; o02_im = a02_im;
2657  o12_re = a12_re; o12_im = a12_im;
2658  o22_re = a22_re; o22_im = a22_im;
2659  o32_re = a32_re; o32_im = a32_im;
2660  }
2661 
2662 #endif // DSLASH_CLOVER
2663 
2664 #ifdef DSLASH_XPAY
2665 
// xpay variant: load the accumulator spinor (acc*) for this site and
// form out = a*out + accum for all 12 complex components, fully unrolled.
2666  READ_ACCUM(ACCUMTEX, param.sp_stride)
2667 
2668  o00_re = a*o00_re+acc00_re;
2669  o00_im = a*o00_im+acc00_im;
2670  o01_re = a*o01_re+acc01_re;
2671  o01_im = a*o01_im+acc01_im;
2672  o02_re = a*o02_re+acc02_re;
2673  o02_im = a*o02_im+acc02_im;
2674  o10_re = a*o10_re+acc10_re;
2675  o10_im = a*o10_im+acc10_im;
2676  o11_re = a*o11_re+acc11_re;
2677  o11_im = a*o11_im+acc11_im;
2678  o12_re = a*o12_re+acc12_re;
2679  o12_im = a*o12_im+acc12_im;
2680  o20_re = a*o20_re+acc20_re;
2681  o20_im = a*o20_im+acc20_im;
2682  o21_re = a*o21_re+acc21_re;
2683  o21_im = a*o21_im+acc21_im;
2684  o22_re = a*o22_re+acc22_re;
2685  o22_im = a*o22_im+acc22_im;
2686  o30_re = a*o30_re+acc30_re;
2687  o30_im = a*o30_im+acc30_im;
2688  o31_re = a*o31_re+acc31_re;
2689  o31_im = a*o31_im+acc31_im;
2690  o32_re = a*o32_re+acc32_re;
2691  o32_im = a*o32_im+acc32_im;
2692 #endif // DSLASH_XPAY
2693 }
2694 
2695 // write spinor field back to device memory
2696 WRITE_SPINOR(param.sp_stride);
2697 
2698 // undefine to prevent warning when precision is changed
2699 #undef spinorFloat
2700 #undef WRITE_SPINOR_SHARED
2701 #undef READ_SPINOR_SHARED
2702 #undef SHARED_STRIDE
2703 
2704 #undef g00_re
2705 #undef g00_im
2706 #undef g01_re
2707 #undef g01_im
2708 #undef g02_re
2709 #undef g02_im
2710 #undef g10_re
2711 #undef g10_im
2712 #undef g11_re
2713 #undef g11_im
2714 #undef g12_re
2715 #undef g12_im
2716 #undef g20_re
2717 #undef g20_im
2718 #undef g21_re
2719 #undef g21_im
2720 #undef g22_re
2721 #undef g22_im
2722 
2723 #undef i00_re
2724 #undef i00_im
2725 #undef i01_re
2726 #undef i01_im
2727 #undef i02_re
2728 #undef i02_im
2729 #undef i10_re
2730 #undef i10_im
2731 #undef i11_re
2732 #undef i11_im
2733 #undef i12_re
2734 #undef i12_im
2735 #undef i20_re
2736 #undef i20_im
2737 #undef i21_re
2738 #undef i21_im
2739 #undef i22_re
2740 #undef i22_im
2741 #undef i30_re
2742 #undef i30_im
2743 #undef i31_re
2744 #undef i31_im
2745 #undef i32_re
2746 #undef i32_im
2747 
2748 #undef acc00_re
2749 #undef acc00_im
2750 #undef acc01_re
2751 #undef acc01_im
2752 #undef acc02_re
2753 #undef acc02_im
2754 #undef acc10_re
2755 #undef acc10_im
2756 #undef acc11_re
2757 #undef acc11_im
2758 #undef acc12_re
2759 #undef acc12_im
2760 #undef acc20_re
2761 #undef acc20_im
2762 #undef acc21_re
2763 #undef acc21_im
2764 #undef acc22_re
2765 #undef acc22_im
2766 #undef acc30_re
2767 #undef acc30_im
2768 #undef acc31_re
2769 #undef acc31_im
2770 #undef acc32_re
2771 #undef acc32_im
2772 
2773 #undef c00_00_re
2774 #undef c01_01_re
2775 #undef c02_02_re
2776 #undef c10_10_re
2777 #undef c11_11_re
2778 #undef c12_12_re
2779 #undef c01_00_re
2780 #undef c01_00_im
2781 #undef c02_00_re
2782 #undef c02_00_im
2783 #undef c10_00_re
2784 #undef c10_00_im
2785 #undef c11_00_re
2786 #undef c11_00_im
2787 #undef c12_00_re
2788 #undef c12_00_im
2789 #undef c02_01_re
2790 #undef c02_01_im
2791 #undef c10_01_re
2792 #undef c10_01_im
2793 #undef c11_01_re
2794 #undef c11_01_im
2795 #undef c12_01_re
2796 #undef c12_01_im
2797 #undef c10_02_re
2798 #undef c10_02_im
2799 #undef c11_02_re
2800 #undef c11_02_im
2801 #undef c12_02_re
2802 #undef c12_02_im
2803 #undef c11_10_re
2804 #undef c11_10_im
2805 #undef c12_10_re
2806 #undef c12_10_im
2807 #undef c12_11_re
2808 #undef c12_11_im
2809 
2810 #undef o00_re
2811 #undef o00_im
2812 #undef o01_re
2813 #undef o01_im
2814 #undef o02_re
2815 #undef o02_im
2816 #undef o10_re
2817 #undef o10_im
2818 #undef o11_re
2819 #undef o11_im
2820 #undef o12_re
2821 #undef o12_im
2822 #undef o20_re
2823 #undef o20_im
2824 #undef o21_re
2825 #undef o21_im
2826 #undef o22_re
2827 #undef o22_im
2828 #undef o30_re
2829 #undef o30_im
2830 #undef o31_re
2831 #undef o31_im
2832 #undef o32_re
2833 #undef o32_im
2834 
2835 #undef VOLATILE
#define c12_10_im
__constant__ int Vh
#define a22_re
Definition: llfat_core.h:131
#define c31_20_im
#define c21_22_re
#define g11_re
#define acc11_re
#define gT21_re
__syncthreads()
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
RECONSTRUCT_GAUGE_MATRIX(0)
#define i22_re
__constant__ int X2
#define acc31_im
#define c02_11_re
#define c32_30_im
#define gT00_re
#define i02_re
spinorFloat a0_im
VOLATILE spinorFloat o31_im
__constant__ int X2X1mX1
#define g20_im
#define c01_12_im
#define spinorFloat
#define g01_im
#define c12_11_re
#define CLOVERTEX
Definition: clover_def.h:101
#define c31_30_im
#define c02_12_im
#define a02_im
Definition: llfat_core.h:120
spinorFloat a1_im
VOLATILE spinorFloat o10_im
#define i01_im
__constant__ int X3X2X1mX2X1
#define c00_01_re
#define c12_02_re
#define acc02_re
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
#define c10_11_re
#define acc21_re
#define c01_02_re
#define c12_12_re
int sp_idx
#define g02_re
#define a22_im
Definition: llfat_core.h:132
spinorFloat A1_im
#define gT02_re
VOLATILE spinorFloat o21_re
#define c10_11_im
#define gT02_im
#define acc32_im
#define c20_30_im
#define c22_22_re
#define c02_01_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o11_re
__constant__ int X3X2X1
#define c00_11_re
#define c30_22_im
#define c11_10_im
#define c12_11_im
#define c32_31_im
#define c22_32_re
spinorFloat a0_re
#define c21_30_im
#define c20_21_re
#define i22_im
#define a01_re
Definition: llfat_core.h:117
spinorFloat a2_re
#define gT22_re
#define c31_32_re
#define c20_32_im
#define c00_02_im
spinorFloat B2_im
VOLATILE spinorFloat o01_im
#define acc01_im
#define c22_31_re
#define c20_22_im
#define acc30_re
spinorFloat b1_im
#define c30_20_re
#define c01_00_re
#define c01_11_im
#define c00_10_re
#define c10_02_im
#define a02_re
Definition: llfat_core.h:119
#define gT12_re
#define c22_21_im
#define a20_re
Definition: llfat_core.h:127
#define i11_re
#define c20_31_im
#define a12_im
Definition: llfat_core.h:126
#define c10_00_re
spinorFloat A0_im
#define c02_12_re
#define i11_im
#define acc10_re
#define a20_im
Definition: llfat_core.h:128
#define c11_00_re
#define c22_32_im
#define g11_im
#define READ_SPINOR_SHARED
#define c11_10_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define c12_00_re
VOLATILE spinorFloat o20_im
#define g02_im
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define c10_01_re
#define gT11_im
VOLATILE spinorFloat o02_re
#define c12_01_re
#define c11_12_im
#define c01_00_im
#define i31_im
#define c01_01_re
#define c30_21_re
#define c22_20_re
#define c21_22_im
#define c11_02_re
#define gT00_im
#define i10_im
#define c01_11_re
#define acc00_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define c10_12_im
spinorFloat A2_re
#define c00_12_re
#define a01_im
Definition: llfat_core.h:118
#define c31_30_re
VOLATILE spinorFloat o30_im
#define a12_re
Definition: llfat_core.h:125
#define i00_im
#define GAUGE0TEX
Definition: covDev.h:112
#define acc00_re
#define c30_32_re
#define a11_re
Definition: llfat_core.h:123
#define c20_30_re
#define c32_22_re
#define c22_31_im
#define c12_01_im
#define g00_re
#define c32_20_re
VOLATILE spinorFloat o12_im
VOLATILE spinorFloat o02_im
#define acc01_re
VOLATILE spinorFloat o00_re
#define g01_re
#define c20_22_re
#define gT20_im
spinorFloat b0_re
#define c32_30_re
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o32_im
#define g12_im
#define g00_im
#define c22_20_im
#define c00_11_im
#define c12_00_im
#define c32_20_im
#define c20_31_re
#define c10_02_re
#define c31_20_re
#define acc32_re
#define c01_10_re
#define c02_00_im
#define gT01_re
#define c21_31_re
#define c22_30_im
__constant__ int X2m1
#define c21_32_re
#define c02_10_im
spinorFloat B1_re
#define c11_12_re
VOLATILE spinorFloat o22_re
#define g21_re
spinorFloat B0_re
#define c32_21_im
VOLATILE spinorFloat o20_re
#define c20_21_im
#define SPINORTEX
Definition: clover_def.h:40
#define acc10_im
__constant__ int gauge_fixed
#define c10_00_im
#define i00_re
#define acc12_im
#define c01_10_im
#define c22_30_re
VOLATILE spinorFloat o01_re
#define c11_01_re
__constant__ int X4X3X2X1mX3X2X1
#define c12_10_re
#define c00_10_im
#define i21_im
VOLATILE spinorFloat o10_re
#define g10_im
#define c30_32_im
#define i30_im
#define c00_12_im
#define SPINOR_HOP
Definition: covDev.h:158
spinorFloat A0_re
VOLATILE spinorFloat o21_im
#define acc22_re
#define g10_re
#define WRITE_SPINOR_SHARED
spinorFloat A2_im
#define c31_32_im
VOLATILE spinorFloat o30_re
#define c11_02_im
#define c31_22_re
#define c00_00_re
spinorFloat B0_im
__constant__ int ga_stride
#define c02_00_re
#define gT01_im
#define i20_re
#define c20_32_re
#define c10_01_im
#define c00_02_re
#define a00_re
Definition: llfat_core.h:115
spinorFloat a1_re
#define gT11_re
#define g20_re
#define c11_00_im
#define c30_20_im
#define i31_re
#define acc11_im
__constant__ int X1m1
#define c10_10_re
VOLATILE spinorFloat o31_re
#define g22_im
__constant__ int X3
spinorFloat B1_im
#define c21_20_im
VOLATILE spinorFloat o12_re
#define c30_21_im
#define g22_re
#define a11_im
Definition: llfat_core.h:124
#define i21_re
#define c21_20_re
#define gT21_im
#define i20_im
#define c00_01_im
#define c30_31_im
spinorFloat a2_im
#define acc20_re
VOLATILE spinorFloat o32_re
#define i12_re
coordsFromIndex3D< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
#define a10_re
Definition: llfat_core.h:121
#define i10_re
#define i12_im
#define c30_31_re
#define i01_re
#define c32_32_re
#define c31_31_re
spinorFloat A1_re
#define GAUGE1TEX
Definition: covDev.h:113
#define c22_21_re
#define acc21_im
#define acc12_re
VOLATILE spinorFloat o00_im
#define gT22_im
#define acc22_im
#define c21_31_im
#define gT10_re
#define c32_31_re
#define g21_im
#define c11_11_re
#define c30_30_re
#define a10_im
Definition: llfat_core.h:122
#define c01_12_re
__constant__ int X4m1
#define c02_10_re
#define a21_re
Definition: llfat_core.h:129
#define c20_20_re
#define c21_30_re
#define acc02_im
#define c32_21_re
#define c30_22_re
#define WRITE_SPINOR
Definition: clover_def.h:48
#define acc30_im
#define g12_re
#define c02_01_im
#define c21_32_im
#define gT20_re
#define c10_12_re
#define gT10_im
#define VOLATILE
#define c02_02_re
spinorFloat b2_re
spinorFloat b2_im
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define c02_11_im
#define i30_re
__constant__ int X4X3X2X1hmX3X2X1h
#define i32_im
#define i02_im
#define c31_21_re
const int dims[]
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define c11_01_im
#define acc20_im
#define a21_im
Definition: llfat_core.h:130
#define acc31_re
KernelType kernel_type
#define READ_CLOVER
Definition: clover_def.h:103
#define c31_22_im
#define c32_22_im
#define c31_21_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define c01_02_im
#define c12_02_im
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
spinorFloat B2_re
#define gT12_im
#define i32_re
#define a00_im
Definition: llfat_core.h:116
const int ga_idx
spinorFloat b1_re
#define c21_21_re
__constant__ int X2X1
spinorFloat b0_im