QUDA v0.7.0
A library for QCD on GPUs

asym_wilson_clover_dslash_fermi_core.h
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
4 
5 
6 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
7 #define VOLATILE
8 #else // Open64 compiler
9 #define VOLATILE volatile
10 #endif
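/* Usage sketch: the macro qualifies the per-thread output registers declared
   further down, e.g.

     VOLATILE spinorFloat o00_re;

   which compiles to a plain register under NVVM and to a volatile one under
   the old Open64 back end (presumably to rein in that compiler's optimizer;
   the reason is not documented here). */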
11 // input spinor
12 #ifdef SPINOR_DOUBLE
13 #define spinorFloat double
14 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
15 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
67 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
68 #define i00_re I0.x
69 #define i00_im I0.y
70 #define i01_re I0.z
71 #define i01_im I0.w
72 #define i02_re I1.x
73 #define i02_im I1.y
74 #define i10_re I1.z
75 #define i10_im I1.w
76 #define i11_re I2.x
77 #define i11_im I2.y
78 #define i12_re I2.z
79 #define i12_im I2.w
80 #define i20_re I3.x
81 #define i20_im I3.y
82 #define i21_re I3.z
83 #define i21_im I3.w
84 #define i22_re I4.x
85 #define i22_im I4.y
86 #define i30_re I4.z
87 #define i30_im I4.w
88 #define i31_re I5.x
89 #define i31_im I5.y
90 #define i32_re I5.z
91 #define i32_im I5.w
92 #define acc00_re accum0.x
93 #define acc00_im accum0.y
94 #define acc01_re accum0.z
95 #define acc01_im accum0.w
96 #define acc02_re accum1.x
97 #define acc02_im accum1.y
98 #define acc10_re accum1.z
99 #define acc10_im accum1.w
100 #define acc11_re accum2.x
101 #define acc11_im accum2.y
102 #define acc12_re accum2.z
103 #define acc12_im accum2.w
104 #define acc20_re accum3.x
105 #define acc20_im accum3.y
106 #define acc21_re accum3.z
107 #define acc21_im accum3.w
108 #define acc22_re accum4.x
109 #define acc22_im accum4.y
110 #define acc30_re accum4.z
111 #define acc30_im accum4.w
112 #define acc31_re accum5.x
113 #define acc31_im accum5.y
114 #define acc32_re accum5.z
115 #define acc32_im accum5.w
116 #endif // SPINOR_DOUBLE
117 
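/* Packing sketch (restating the #defines above): the 24 spinor reals
   (4 spins x 3 colors x re/im) occupy twelve double2 registers I0..I11 in
   double precision, or six float4 registers I0..I5 in single precision.
   For complex component k = 3*spin + color (k = 0..11):

     double2: re = I[k].x,     im = I[k].y
     float4:  re = I[k/2].x|z, im = I[k/2].y|w   // even k -> .x/.y, odd k -> .z/.w
*/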
118 // gauge link
119 #ifdef GAUGE_FLOAT2
120 #define g00_re G0.x
121 #define g00_im G0.y
122 #define g01_re G1.x
123 #define g01_im G1.y
124 #define g02_re G2.x
125 #define g02_im G2.y
126 #define g10_re G3.x
127 #define g10_im G3.y
128 #define g11_re G4.x
129 #define g11_im G4.y
130 #define g12_re G5.x
131 #define g12_im G5.y
132 #define g20_re G6.x
133 #define g20_im G6.y
134 #define g21_re G7.x
135 #define g21_im G7.y
136 #define g22_re G8.x
137 #define g22_im G8.y
138 
139 #else
140 #define g00_re G0.x
141 #define g00_im G0.y
142 #define g01_re G0.z
143 #define g01_im G0.w
144 #define g02_re G1.x
145 #define g02_im G1.y
146 #define g10_re G1.z
147 #define g10_im G1.w
148 #define g11_re G2.x
149 #define g11_im G2.y
150 #define g12_re G2.z
151 #define g12_im G2.w
152 #define g20_re G3.x
153 #define g20_im G3.y
154 #define g21_re G3.z
155 #define g21_im G3.w
156 #define g22_re G4.x
157 #define g22_im G4.y
158 
159 #endif // GAUGE_FLOAT2
160 
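/* Packing sketch: with GAUGE_FLOAT2 the nine complex link entries sit in nine
   float2/double2 registers G0..G8, one entry each; otherwise the 18 reals are
   packed row-major into float4 registers G0..G4 (G4 only half used), so entry
   (row,col) lives at

     k = 3*row + col;   re = G[k/2].x|z,  im = G[k/2].y|w
*/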
161 // conjugated gauge link
162 #define gT00_re (+g00_re)
163 #define gT00_im (-g00_im)
164 #define gT01_re (+g10_re)
165 #define gT01_im (-g10_im)
166 #define gT02_re (+g20_re)
167 #define gT02_im (-g20_im)
168 #define gT10_re (+g01_re)
169 #define gT10_im (-g01_im)
170 #define gT11_re (+g11_re)
171 #define gT11_im (-g11_im)
172 #define gT12_re (+g21_re)
173 #define gT12_im (-g21_im)
174 #define gT20_re (+g02_re)
175 #define gT20_im (-g02_im)
176 #define gT21_re (+g12_re)
177 #define gT21_im (-g12_im)
178 #define gT22_re (+g22_re)
179 #define gT22_im (-g22_im)
180 
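/* The gT macros give the Hermitian conjugate of the link, used below for the
   hops in the negative directions: gT[i][j] = conj(g[j][i]). No extra loads
   are needed; the transpose and the sign flips are folded into the
   multiply-add chains at macro-expansion time. */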
181 // first chiral block of inverted clover term
182 #ifdef CLOVER_DOUBLE
183 #define c00_00_re C0.x
184 #define c01_01_re C0.y
185 #define c02_02_re C1.x
186 #define c10_10_re C1.y
187 #define c11_11_re C2.x
188 #define c12_12_re C2.y
189 #define c01_00_re C3.x
190 #define c01_00_im C3.y
191 #define c02_00_re C4.x
192 #define c02_00_im C4.y
193 #define c10_00_re C5.x
194 #define c10_00_im C5.y
195 #define c11_00_re C6.x
196 #define c11_00_im C6.y
197 #define c12_00_re C7.x
198 #define c12_00_im C7.y
199 #define c02_01_re C8.x
200 #define c02_01_im C8.y
201 #define c10_01_re C9.x
202 #define c10_01_im C9.y
203 #define c11_01_re C10.x
204 #define c11_01_im C10.y
205 #define c12_01_re C11.x
206 #define c12_01_im C11.y
207 #define c10_02_re C12.x
208 #define c10_02_im C12.y
209 #define c11_02_re C13.x
210 #define c11_02_im C13.y
211 #define c12_02_re C14.x
212 #define c12_02_im C14.y
213 #define c11_10_re C15.x
214 #define c11_10_im C15.y
215 #define c12_10_re C16.x
216 #define c12_10_im C16.y
217 #define c12_11_re C17.x
218 #define c12_11_im C17.y
219 #else
220 #define c00_00_re C0.x
221 #define c01_01_re C0.y
222 #define c02_02_re C0.z
223 #define c10_10_re C0.w
224 #define c11_11_re C1.x
225 #define c12_12_re C1.y
226 #define c01_00_re C1.z
227 #define c01_00_im C1.w
228 #define c02_00_re C2.x
229 #define c02_00_im C2.y
230 #define c10_00_re C2.z
231 #define c10_00_im C2.w
232 #define c11_00_re C3.x
233 #define c11_00_im C3.y
234 #define c12_00_re C3.z
235 #define c12_00_im C3.w
236 #define c02_01_re C4.x
237 #define c02_01_im C4.y
238 #define c10_01_re C4.z
239 #define c10_01_im C4.w
240 #define c11_01_re C5.x
241 #define c11_01_im C5.y
242 #define c12_01_re C5.z
243 #define c12_01_im C5.w
244 #define c10_02_re C6.x
245 #define c10_02_im C6.y
246 #define c11_02_re C6.z
247 #define c11_02_im C6.w
248 #define c12_02_re C7.x
249 #define c12_02_im C7.y
250 #define c11_10_re C7.z
251 #define c11_10_im C7.w
252 #define c12_10_re C8.x
253 #define c12_10_im C8.y
254 #define c12_11_re C8.z
255 #define c12_11_im C8.w
256 #endif // CLOVER_DOUBLE
257 
258 #define c00_01_re (+c01_00_re)
259 #define c00_01_im (-c01_00_im)
260 #define c00_02_re (+c02_00_re)
261 #define c00_02_im (-c02_00_im)
262 #define c01_02_re (+c02_01_re)
263 #define c01_02_im (-c02_01_im)
264 #define c00_10_re (+c10_00_re)
265 #define c00_10_im (-c10_00_im)
266 #define c01_10_re (+c10_01_re)
267 #define c01_10_im (-c10_01_im)
268 #define c02_10_re (+c10_02_re)
269 #define c02_10_im (-c10_02_im)
270 #define c00_11_re (+c11_00_re)
271 #define c00_11_im (-c11_00_im)
272 #define c01_11_re (+c11_01_re)
273 #define c01_11_im (-c11_01_im)
274 #define c02_11_re (+c11_02_re)
275 #define c02_11_im (-c11_02_im)
276 #define c10_11_re (+c11_10_re)
277 #define c10_11_im (-c11_10_im)
278 #define c00_12_re (+c12_00_re)
279 #define c00_12_im (-c12_00_im)
280 #define c01_12_re (+c12_01_re)
281 #define c01_12_im (-c12_01_im)
282 #define c02_12_re (+c12_02_re)
283 #define c02_12_im (-c12_02_im)
284 #define c10_12_re (+c12_10_re)
285 #define c10_12_im (-c12_10_im)
286 #define c11_12_re (+c12_11_re)
287 #define c11_12_im (-c12_11_im)
288 
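/* Each 6x6 chiral block of the inverse clover term is Hermitian, so the
   packed data stores only the 6 real diagonal entries plus the 15 complex
   entries below the diagonal (36 reals per block). The #defines above recover
   the upper triangle by conjugation, i.e. A[i][j] = conj(A[j][i]):

     c<ij>_re = +c<ji>_re;
     c<ij>_im = -c<ji>_im;
*/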
289 // second chiral block of inverted clover term (reuses the same C registers as the first block)
290 #define c20_20_re c00_00_re
291 #define c21_20_re c01_00_re
292 #define c21_20_im c01_00_im
293 #define c22_20_re c02_00_re
294 #define c22_20_im c02_00_im
295 #define c30_20_re c10_00_re
296 #define c30_20_im c10_00_im
297 #define c31_20_re c11_00_re
298 #define c31_20_im c11_00_im
299 #define c32_20_re c12_00_re
300 #define c32_20_im c12_00_im
301 #define c20_21_re c00_01_re
302 #define c20_21_im c00_01_im
303 #define c21_21_re c01_01_re
304 #define c22_21_re c02_01_re
305 #define c22_21_im c02_01_im
306 #define c30_21_re c10_01_re
307 #define c30_21_im c10_01_im
308 #define c31_21_re c11_01_re
309 #define c31_21_im c11_01_im
310 #define c32_21_re c12_01_re
311 #define c32_21_im c12_01_im
312 #define c20_22_re c00_02_re
313 #define c20_22_im c00_02_im
314 #define c21_22_re c01_02_re
315 #define c21_22_im c01_02_im
316 #define c22_22_re c02_02_re
317 #define c30_22_re c10_02_re
318 #define c30_22_im c10_02_im
319 #define c31_22_re c11_02_re
320 #define c31_22_im c11_02_im
321 #define c32_22_re c12_02_re
322 #define c32_22_im c12_02_im
323 #define c20_30_re c00_10_re
324 #define c20_30_im c00_10_im
325 #define c21_30_re c01_10_re
326 #define c21_30_im c01_10_im
327 #define c22_30_re c02_10_re
328 #define c22_30_im c02_10_im
329 #define c30_30_re c10_10_re
330 #define c31_30_re c11_10_re
331 #define c31_30_im c11_10_im
332 #define c32_30_re c12_10_re
333 #define c32_30_im c12_10_im
334 #define c20_31_re c00_11_re
335 #define c20_31_im c00_11_im
336 #define c21_31_re c01_11_re
337 #define c21_31_im c01_11_im
338 #define c22_31_re c02_11_re
339 #define c22_31_im c02_11_im
340 #define c30_31_re c10_11_re
341 #define c30_31_im c10_11_im
342 #define c31_31_re c11_11_re
343 #define c32_31_re c12_11_re
344 #define c32_31_im c12_11_im
345 #define c20_32_re c00_12_re
346 #define c20_32_im c00_12_im
347 #define c21_32_re c01_12_re
348 #define c21_32_im c01_12_im
349 #define c22_32_re c02_12_re
350 #define c22_32_im c02_12_im
351 #define c30_32_re c10_12_re
352 #define c30_32_im c10_12_im
353 #define c31_32_re c11_12_re
354 #define c31_32_im c11_12_im
355 #define c32_32_re c12_12_re
356 
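/* Both chiral blocks alias the same C registers: the first block is loaded
   and applied, then the second block's data overwrites C0,... and the same
   unrolled mat-vec runs again. In the chiral basis the clover term is block
   diagonal, schematically

     A^{-1} = [ A+  0  ]    A+ acting on spins (0,1), A- on spins (2,3),
              [ 0   A- ]

   which is why spins (2,3) never mix with (0,1) in the clover section. */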
357 // output spinor
358 VOLATILE spinorFloat o00_re;
359 VOLATILE spinorFloat o00_im;
360 VOLATILE spinorFloat o01_re;
361 VOLATILE spinorFloat o01_im;
362 VOLATILE spinorFloat o02_re;
363 VOLATILE spinorFloat o02_im;
364 VOLATILE spinorFloat o10_re;
365 VOLATILE spinorFloat o10_im;
366 VOLATILE spinorFloat o11_re;
367 VOLATILE spinorFloat o11_im;
368 VOLATILE spinorFloat o12_re;
369 VOLATILE spinorFloat o12_im;
370 VOLATILE spinorFloat o20_re;
371 VOLATILE spinorFloat o20_im;
372 VOLATILE spinorFloat o21_re;
373 VOLATILE spinorFloat o21_im;
374 VOLATILE spinorFloat o22_re;
375 VOLATILE spinorFloat o22_im;
376 VOLATILE spinorFloat o30_re;
377 VOLATILE spinorFloat o30_im;
378 VOLATILE spinorFloat o31_re;
379 VOLATILE spinorFloat o31_im;
380 VOLATILE spinorFloat o32_re;
381 VOLATILE spinorFloat o32_im;
382 
383 #ifdef SPINOR_DOUBLE
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
385 #else
386 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
387 #endif
388 
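/* Addressing sketch for SHARED_STRIDE (an illustration of the strided layout
   the io_spinor.h macros use, not the exact macro body): thread t keeps its
   DSLASH_SHARED_FLOATS_PER_THREAD spinor floats at

     base = (t / SHARED_STRIDE) * SHARED_STRIDE * DSLASH_SHARED_FLOATS_PER_THREAD
          + (t % SHARED_STRIDE);
     // component k of thread t: shared[base + k*SHARED_STRIDE]

   With a stride of 16 doubles (32 floats), each component k of a warp's
   threads falls in distinct 32-bit banks on Fermi. */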
389 #include "read_gauge.h"
390 #include "read_clover.h"
391 #include "io_spinor.h"
392 
393 int x1, x2, x3, x4;
394 int X;
395 
396 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
397 int sp_norm_idx;
398 #endif // MULTI_GPU half precision
399 
400 int sid;
401 
402 #ifdef MULTI_GPU
403 int face_idx;
404 if (kernel_type == INTERIOR_KERNEL) {
405 #endif
406 
407  // Inline by hand for the moment and assume even dimensions
408  const int dims[] = {X1, X2, X3, X4};
409  coordsFromIndex3D<EVEN_X>(X, x1, x2, x3, x4, sid, param.parity, dims);
410 
411  // only need to check the Y and Z dims currently, since X and T are set to match exactly
412  if (x2 >= X2) return;
413  if (x3 >= X3) return;
414 
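/* Rough sketch of what coordsFromIndex3D<EVEN_X> computes (illustration, not
   the exact implementation): given the checkerboard site index sid and the
   parity, form the full lexicographic index X and peel off the coordinates:

     X  = 2*sid + par;            // par in {0,1} chosen so the site parity
     x1 = X % X1;                 // matches param.parity
     x2 = (X / X1) % X2;
     x3 = (X / (X1*X2)) % X3;
     x4 =  X / (X1*X2*X3);

   EVEN_X encodes the assumption of an even X1, so the checkerboarding splits
   the x1 dimension exactly in half. */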
415  o00_re = 0; o00_im = 0;
416  o01_re = 0; o01_im = 0;
417  o02_re = 0; o02_im = 0;
418  o10_re = 0; o10_im = 0;
419  o11_re = 0; o11_im = 0;
420  o12_re = 0; o12_im = 0;
421  o20_re = 0; o20_im = 0;
422  o21_re = 0; o21_im = 0;
423  o22_re = 0; o22_im = 0;
424  o30_re = 0; o30_im = 0;
425  o31_re = 0; o31_im = 0;
426  o32_re = 0; o32_im = 0;
427 #ifdef DSLASH_CLOVER_XPAY
428 
429  READ_ACCUM(ACCUMTEX, param.sp_stride)
430 
431 #ifdef DSLASH_CLOVER
432 
433  // change to chiral basis
434  {
435  spinorFloat a00_re = -acc10_re - acc30_re;
436  spinorFloat a00_im = -acc10_im - acc30_im;
437  spinorFloat a10_re = acc00_re + acc20_re;
438  spinorFloat a10_im = acc00_im + acc20_im;
439  spinorFloat a20_re = -acc10_re + acc30_re;
440  spinorFloat a20_im = -acc10_im + acc30_im;
441  spinorFloat a30_re = acc00_re - acc20_re;
442  spinorFloat a30_im = acc00_im - acc20_im;
443 
444  acc00_re = a00_re; acc00_im = a00_im;
445  acc10_re = a10_re; acc10_im = a10_im;
446  acc20_re = a20_re; acc20_im = a20_im;
447  acc30_re = a30_re; acc30_im = a30_im;
448  }
449 
450  {
451  spinorFloat a01_re = -acc11_re - acc31_re;
452  spinorFloat a01_im = -acc11_im - acc31_im;
453  spinorFloat a11_re = acc01_re + acc21_re;
454  spinorFloat a11_im = acc01_im + acc21_im;
455  spinorFloat a21_re = -acc11_re + acc31_re;
456  spinorFloat a21_im = -acc11_im + acc31_im;
457  spinorFloat a31_re = acc01_re - acc21_re;
458  spinorFloat a31_im = acc01_im - acc21_im;
459 
460  acc01_re = a01_re; acc01_im = a01_im;
461  acc11_re = a11_re; acc11_im = a11_im;
462  acc21_re = a21_re; acc21_im = a21_im;
463  acc31_re = a31_re; acc31_im = a31_im;
464  }
465 
466  {
467  spinorFloat a02_re = -acc12_re - acc32_re;
468  spinorFloat a02_im = -acc12_im - acc32_im;
469  spinorFloat a12_re = acc02_re + acc22_re;
470  spinorFloat a12_im = acc02_im + acc22_im;
471  spinorFloat a22_re = -acc12_re + acc32_re;
472  spinorFloat a22_im = -acc12_im + acc32_im;
473  spinorFloat a32_re = acc02_re - acc22_re;
474  spinorFloat a32_im = acc02_im - acc22_im;
475 
476  acc02_re = a02_re; acc02_im = a02_im;
477  acc12_re = a12_re; acc12_im = a12_im;
478  acc22_re = a22_re; acc22_im = a22_im;
479  acc32_re = a32_re; acc32_im = a32_im;
480  }
481 
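/* Per color, the three blocks above apply one 4x4 spin rotation T into the
   chiral basis (reconstructed here from the inverse transform applied after
   the clover blocks; the 1/2 of T^{-1} = T^T/2 is absorbed into the clover
   normalization):

     chi = T psi,   T = [  0 -1  0 -1 ]
                        [ +1  0 +1  0 ]
                        [  0 -1  0 +1 ]
                        [ +1  0 -1  0 ]
*/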
482  // apply first chiral block
483  {
485 
486  spinorFloat a00_re = 0; spinorFloat a00_im = 0;
487  spinorFloat a01_re = 0; spinorFloat a01_im = 0;
488  spinorFloat a02_re = 0; spinorFloat a02_im = 0;
489  spinorFloat a10_re = 0; spinorFloat a10_im = 0;
490  spinorFloat a11_re = 0; spinorFloat a11_im = 0;
491  spinorFloat a12_re = 0; spinorFloat a12_im = 0;
492 
493  a00_re += c00_00_re * acc00_re;
494  a00_im += c00_00_re * acc00_im;
495  a00_re += c00_01_re * acc01_re;
496  a00_re -= c00_01_im * acc01_im;
497  a00_im += c00_01_re * acc01_im;
498  a00_im += c00_01_im * acc01_re;
499  a00_re += c00_02_re * acc02_re;
500  a00_re -= c00_02_im * acc02_im;
501  a00_im += c00_02_re * acc02_im;
502  a00_im += c00_02_im * acc02_re;
503  a00_re += c00_10_re * acc10_re;
504  a00_re -= c00_10_im * acc10_im;
505  a00_im += c00_10_re * acc10_im;
506  a00_im += c00_10_im * acc10_re;
507  a00_re += c00_11_re * acc11_re;
508  a00_re -= c00_11_im * acc11_im;
509  a00_im += c00_11_re * acc11_im;
510  a00_im += c00_11_im * acc11_re;
511  a00_re += c00_12_re * acc12_re;
512  a00_re -= c00_12_im * acc12_im;
513  a00_im += c00_12_re * acc12_im;
514  a00_im += c00_12_im * acc12_re;
515 
516  a01_re += c01_00_re * acc00_re;
517  a01_re -= c01_00_im * acc00_im;
518  a01_im += c01_00_re * acc00_im;
519  a01_im += c01_00_im * acc00_re;
520  a01_re += c01_01_re * acc01_re;
521  a01_im += c01_01_re * acc01_im;
522  a01_re += c01_02_re * acc02_re;
523  a01_re -= c01_02_im * acc02_im;
524  a01_im += c01_02_re * acc02_im;
525  a01_im += c01_02_im * acc02_re;
526  a01_re += c01_10_re * acc10_re;
527  a01_re -= c01_10_im * acc10_im;
528  a01_im += c01_10_re * acc10_im;
529  a01_im += c01_10_im * acc10_re;
530  a01_re += c01_11_re * acc11_re;
531  a01_re -= c01_11_im * acc11_im;
532  a01_im += c01_11_re * acc11_im;
533  a01_im += c01_11_im * acc11_re;
534  a01_re += c01_12_re * acc12_re;
535  a01_re -= c01_12_im * acc12_im;
536  a01_im += c01_12_re * acc12_im;
537  a01_im += c01_12_im * acc12_re;
538 
539  a02_re += c02_00_re * acc00_re;
540  a02_re -= c02_00_im * acc00_im;
541  a02_im += c02_00_re * acc00_im;
542  a02_im += c02_00_im * acc00_re;
543  a02_re += c02_01_re * acc01_re;
544  a02_re -= c02_01_im * acc01_im;
545  a02_im += c02_01_re * acc01_im;
546  a02_im += c02_01_im * acc01_re;
547  a02_re += c02_02_re * acc02_re;
548  a02_im += c02_02_re * acc02_im;
549  a02_re += c02_10_re * acc10_re;
550  a02_re -= c02_10_im * acc10_im;
551  a02_im += c02_10_re * acc10_im;
552  a02_im += c02_10_im * acc10_re;
553  a02_re += c02_11_re * acc11_re;
554  a02_re -= c02_11_im * acc11_im;
555  a02_im += c02_11_re * acc11_im;
556  a02_im += c02_11_im * acc11_re;
557  a02_re += c02_12_re * acc12_re;
558  a02_re -= c02_12_im * acc12_im;
559  a02_im += c02_12_re * acc12_im;
560  a02_im += c02_12_im * acc12_re;
561 
562  a10_re += c10_00_re * acc00_re;
563  a10_re -= c10_00_im * acc00_im;
564  a10_im += c10_00_re * acc00_im;
565  a10_im += c10_00_im * acc00_re;
566  a10_re += c10_01_re * acc01_re;
567  a10_re -= c10_01_im * acc01_im;
568  a10_im += c10_01_re * acc01_im;
569  a10_im += c10_01_im * acc01_re;
570  a10_re += c10_02_re * acc02_re;
571  a10_re -= c10_02_im * acc02_im;
572  a10_im += c10_02_re * acc02_im;
573  a10_im += c10_02_im * acc02_re;
574  a10_re += c10_10_re * acc10_re;
575  a10_im += c10_10_re * acc10_im;
576  a10_re += c10_11_re * acc11_re;
577  a10_re -= c10_11_im * acc11_im;
578  a10_im += c10_11_re * acc11_im;
579  a10_im += c10_11_im * acc11_re;
580  a10_re += c10_12_re * acc12_re;
581  a10_re -= c10_12_im * acc12_im;
582  a10_im += c10_12_re * acc12_im;
583  a10_im += c10_12_im * acc12_re;
584 
585  a11_re += c11_00_re * acc00_re;
586  a11_re -= c11_00_im * acc00_im;
587  a11_im += c11_00_re * acc00_im;
588  a11_im += c11_00_im * acc00_re;
589  a11_re += c11_01_re * acc01_re;
590  a11_re -= c11_01_im * acc01_im;
591  a11_im += c11_01_re * acc01_im;
592  a11_im += c11_01_im * acc01_re;
593  a11_re += c11_02_re * acc02_re;
594  a11_re -= c11_02_im * acc02_im;
595  a11_im += c11_02_re * acc02_im;
596  a11_im += c11_02_im * acc02_re;
597  a11_re += c11_10_re * acc10_re;
598  a11_re -= c11_10_im * acc10_im;
599  a11_im += c11_10_re * acc10_im;
600  a11_im += c11_10_im * acc10_re;
601  a11_re += c11_11_re * acc11_re;
602  a11_im += c11_11_re * acc11_im;
603  a11_re += c11_12_re * acc12_re;
604  a11_re -= c11_12_im * acc12_im;
605  a11_im += c11_12_re * acc12_im;
606  a11_im += c11_12_im * acc12_re;
607 
608  a12_re += c12_00_re * acc00_re;
609  a12_re -= c12_00_im * acc00_im;
610  a12_im += c12_00_re * acc00_im;
611  a12_im += c12_00_im * acc00_re;
612  a12_re += c12_01_re * acc01_re;
613  a12_re -= c12_01_im * acc01_im;
614  a12_im += c12_01_re * acc01_im;
615  a12_im += c12_01_im * acc01_re;
616  a12_re += c12_02_re * acc02_re;
617  a12_re -= c12_02_im * acc02_im;
618  a12_im += c12_02_re * acc02_im;
619  a12_im += c12_02_im * acc02_re;
620  a12_re += c12_10_re * acc10_re;
621  a12_re -= c12_10_im * acc10_im;
622  a12_im += c12_10_re * acc10_im;
623  a12_im += c12_10_im * acc10_re;
624  a12_re += c12_11_re * acc11_re;
625  a12_re -= c12_11_im * acc11_im;
626  a12_im += c12_11_re * acc11_im;
627  a12_im += c12_11_im * acc11_re;
628  a12_re += c12_12_re * acc12_re;
629  a12_im += c12_12_re * acc12_im;
630 
631  acc00_re = a00_re; acc00_im = a00_im;
632  acc01_re = a01_re; acc01_im = a01_im;
633  acc02_re = a02_re; acc02_im = a02_im;
634  acc10_re = a10_re; acc10_im = a10_im;
635  acc11_re = a11_re; acc11_im = a11_im;
636  acc12_re = a12_re; acc12_im = a12_im;
637 
638  }
639 
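/* The block above is a dense Hermitian 6x6 complex mat-vec, fully unrolled.
   The recurring pattern is a complex multiply-accumulate, a += c * v:

     a_re += c_re * v_re - c_im * v_im;
     a_im += c_re * v_im + c_im * v_re;

   (four FMAs), while the real diagonal entries c<kk>_re need only two. */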
640  // apply second chiral block
641  {
643 
644  spinorFloat a20_re = 0; spinorFloat a20_im = 0;
645  spinorFloat a21_re = 0; spinorFloat a21_im = 0;
646  spinorFloat a22_re = 0; spinorFloat a22_im = 0;
647  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
648  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
649  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
650 
651  a20_re += c20_20_re * acc20_re;
652  a20_im += c20_20_re * acc20_im;
653  a20_re += c20_21_re * acc21_re;
654  a20_re -= c20_21_im * acc21_im;
655  a20_im += c20_21_re * acc21_im;
656  a20_im += c20_21_im * acc21_re;
657  a20_re += c20_22_re * acc22_re;
658  a20_re -= c20_22_im * acc22_im;
659  a20_im += c20_22_re * acc22_im;
660  a20_im += c20_22_im * acc22_re;
661  a20_re += c20_30_re * acc30_re;
662  a20_re -= c20_30_im * acc30_im;
663  a20_im += c20_30_re * acc30_im;
664  a20_im += c20_30_im * acc30_re;
665  a20_re += c20_31_re * acc31_re;
666  a20_re -= c20_31_im * acc31_im;
667  a20_im += c20_31_re * acc31_im;
668  a20_im += c20_31_im * acc31_re;
669  a20_re += c20_32_re * acc32_re;
670  a20_re -= c20_32_im * acc32_im;
671  a20_im += c20_32_re * acc32_im;
672  a20_im += c20_32_im * acc32_re;
673 
674  a21_re += c21_20_re * acc20_re;
675  a21_re -= c21_20_im * acc20_im;
676  a21_im += c21_20_re * acc20_im;
677  a21_im += c21_20_im * acc20_re;
678  a21_re += c21_21_re * acc21_re;
679  a21_im += c21_21_re * acc21_im;
680  a21_re += c21_22_re * acc22_re;
681  a21_re -= c21_22_im * acc22_im;
682  a21_im += c21_22_re * acc22_im;
683  a21_im += c21_22_im * acc22_re;
684  a21_re += c21_30_re * acc30_re;
685  a21_re -= c21_30_im * acc30_im;
686  a21_im += c21_30_re * acc30_im;
687  a21_im += c21_30_im * acc30_re;
688  a21_re += c21_31_re * acc31_re;
689  a21_re -= c21_31_im * acc31_im;
690  a21_im += c21_31_re * acc31_im;
691  a21_im += c21_31_im * acc31_re;
692  a21_re += c21_32_re * acc32_re;
693  a21_re -= c21_32_im * acc32_im;
694  a21_im += c21_32_re * acc32_im;
695  a21_im += c21_32_im * acc32_re;
696 
697  a22_re += c22_20_re * acc20_re;
698  a22_re -= c22_20_im * acc20_im;
699  a22_im += c22_20_re * acc20_im;
700  a22_im += c22_20_im * acc20_re;
701  a22_re += c22_21_re * acc21_re;
702  a22_re -= c22_21_im * acc21_im;
703  a22_im += c22_21_re * acc21_im;
704  a22_im += c22_21_im * acc21_re;
705  a22_re += c22_22_re * acc22_re;
706  a22_im += c22_22_re * acc22_im;
707  a22_re += c22_30_re * acc30_re;
708  a22_re -= c22_30_im * acc30_im;
709  a22_im += c22_30_re * acc30_im;
710  a22_im += c22_30_im * acc30_re;
711  a22_re += c22_31_re * acc31_re;
712  a22_re -= c22_31_im * acc31_im;
713  a22_im += c22_31_re * acc31_im;
714  a22_im += c22_31_im * acc31_re;
715  a22_re += c22_32_re * acc32_re;
716  a22_re -= c22_32_im * acc32_im;
717  a22_im += c22_32_re * acc32_im;
718  a22_im += c22_32_im * acc32_re;
719 
720  a30_re += c30_20_re * acc20_re;
721  a30_re -= c30_20_im * acc20_im;
722  a30_im += c30_20_re * acc20_im;
723  a30_im += c30_20_im * acc20_re;
724  a30_re += c30_21_re * acc21_re;
725  a30_re -= c30_21_im * acc21_im;
726  a30_im += c30_21_re * acc21_im;
727  a30_im += c30_21_im * acc21_re;
728  a30_re += c30_22_re * acc22_re;
729  a30_re -= c30_22_im * acc22_im;
730  a30_im += c30_22_re * acc22_im;
731  a30_im += c30_22_im * acc22_re;
732  a30_re += c30_30_re * acc30_re;
733  a30_im += c30_30_re * acc30_im;
734  a30_re += c30_31_re * acc31_re;
735  a30_re -= c30_31_im * acc31_im;
736  a30_im += c30_31_re * acc31_im;
737  a30_im += c30_31_im * acc31_re;
738  a30_re += c30_32_re * acc32_re;
739  a30_re -= c30_32_im * acc32_im;
740  a30_im += c30_32_re * acc32_im;
741  a30_im += c30_32_im * acc32_re;
742 
743  a31_re += c31_20_re * acc20_re;
744  a31_re -= c31_20_im * acc20_im;
745  a31_im += c31_20_re * acc20_im;
746  a31_im += c31_20_im * acc20_re;
747  a31_re += c31_21_re * acc21_re;
748  a31_re -= c31_21_im * acc21_im;
749  a31_im += c31_21_re * acc21_im;
750  a31_im += c31_21_im * acc21_re;
751  a31_re += c31_22_re * acc22_re;
752  a31_re -= c31_22_im * acc22_im;
753  a31_im += c31_22_re * acc22_im;
754  a31_im += c31_22_im * acc22_re;
755  a31_re += c31_30_re * acc30_re;
756  a31_re -= c31_30_im * acc30_im;
757  a31_im += c31_30_re * acc30_im;
758  a31_im += c31_30_im * acc30_re;
759  a31_re += c31_31_re * acc31_re;
760  a31_im += c31_31_re * acc31_im;
761  a31_re += c31_32_re * acc32_re;
762  a31_re -= c31_32_im * acc32_im;
763  a31_im += c31_32_re * acc32_im;
764  a31_im += c31_32_im * acc32_re;
765 
766  a32_re += c32_20_re * acc20_re;
767  a32_re -= c32_20_im * acc20_im;
768  a32_im += c32_20_re * acc20_im;
769  a32_im += c32_20_im * acc20_re;
770  a32_re += c32_21_re * acc21_re;
771  a32_re -= c32_21_im * acc21_im;
772  a32_im += c32_21_re * acc21_im;
773  a32_im += c32_21_im * acc21_re;
774  a32_re += c32_22_re * acc22_re;
775  a32_re -= c32_22_im * acc22_im;
776  a32_im += c32_22_re * acc22_im;
777  a32_im += c32_22_im * acc22_re;
778  a32_re += c32_30_re * acc30_re;
779  a32_re -= c32_30_im * acc30_im;
780  a32_im += c32_30_re * acc30_im;
781  a32_im += c32_30_im * acc30_re;
782  a32_re += c32_31_re * acc31_re;
783  a32_re -= c32_31_im * acc31_im;
784  a32_im += c32_31_re * acc31_im;
785  a32_im += c32_31_im * acc31_re;
786  a32_re += c32_32_re * acc32_re;
787  a32_im += c32_32_re * acc32_im;
788 
789  acc20_re = a20_re; acc20_im = a20_im;
790  acc21_re = a21_re; acc21_im = a21_im;
791  acc22_re = a22_re; acc22_im = a22_im;
792  acc30_re = a30_re; acc30_im = a30_im;
793  acc31_re = a31_re; acc31_im = a31_im;
794  acc32_re = a32_re; acc32_im = a32_im;
795 
796  }
797 
798  // change back from chiral basis
799  // (note: required factor of 1/2 is included in clover term normalization)
800  {
801  spinorFloat a00_re = acc10_re + acc30_re;
802  spinorFloat a00_im = acc10_im + acc30_im;
803  spinorFloat a10_re = -acc00_re - acc20_re;
804  spinorFloat a10_im = -acc00_im - acc20_im;
805  spinorFloat a20_re = acc10_re - acc30_re;
806  spinorFloat a20_im = acc10_im - acc30_im;
807  spinorFloat a30_re = -acc00_re + acc20_re;
808  spinorFloat a30_im = -acc00_im + acc20_im;
809 
810  acc00_re = a00_re; acc00_im = a00_im;
811  acc10_re = a10_re; acc10_im = a10_im;
812  acc20_re = a20_re; acc20_im = a20_im;
813  acc30_re = a30_re; acc30_im = a30_im;
814  }
815 
816  {
817  spinorFloat a01_re = acc11_re + acc31_re;
818  spinorFloat a01_im = acc11_im + acc31_im;
819  spinorFloat a11_re = -acc01_re - acc21_re;
820  spinorFloat a11_im = -acc01_im - acc21_im;
821  spinorFloat a21_re = acc11_re - acc31_re;
822  spinorFloat a21_im = acc11_im - acc31_im;
823  spinorFloat a31_re = -acc01_re + acc21_re;
824  spinorFloat a31_im = -acc01_im + acc21_im;
825 
826  acc01_re = a01_re; acc01_im = a01_im;
827  acc11_re = a11_re; acc11_im = a11_im;
828  acc21_re = a21_re; acc21_im = a21_im;
829  acc31_re = a31_re; acc31_im = a31_im;
830  }
831 
832  {
833  spinorFloat a02_re = acc12_re + acc32_re;
834  spinorFloat a02_im = acc12_im + acc32_im;
835  spinorFloat a12_re = -acc02_re - acc22_re;
836  spinorFloat a12_im = -acc02_im - acc22_im;
837  spinorFloat a22_re = acc12_re - acc32_re;
838  spinorFloat a22_im = acc12_im - acc32_im;
839  spinorFloat a32_re = -acc02_re + acc22_re;
840  spinorFloat a32_im = -acc02_im + acc22_im;
841 
842  acc02_re = a02_re; acc02_im = a02_im;
843  acc12_re = a12_re; acc12_im = a12_im;
844  acc22_re = a22_re; acc22_im = a22_im;
845  acc32_re = a32_re; acc32_im = a32_im;
846  }
847 
848 #endif // DSLASH_CLOVER
849 
850  o00_re = acc00_re;
851  o00_im = acc00_im;
852  o01_re = acc01_re;
853  o01_im = acc01_im;
854  o02_re = acc02_re;
855  o02_im = acc02_im;
856  o10_re = acc10_re;
857  o10_im = acc10_im;
858  o11_re = acc11_re;
859  o11_im = acc11_im;
860  o12_re = acc12_re;
861  o12_im = acc12_im;
862  o20_re = acc20_re;
863  o20_im = acc20_im;
864  o21_re = acc21_re;
865  o21_im = acc21_im;
866  o22_re = acc22_re;
867  o22_im = acc22_im;
868  o30_re = acc30_re;
869  o30_im = acc30_im;
870  o31_re = acc31_re;
871  o31_im = acc31_im;
872  o32_re = acc32_re;
873  o32_im = acc32_im;
874 #endif // DSLASH_CLOVER_XPAY
875 
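/* Net effect of the DSLASH_CLOVER_XPAY preamble above: the output spinor is
   seeded with the inverse clover term applied to the accumulator,
   o = A^{-1} x, and the direction blocks below add the hopping term scaled by
   the coefficient a, schematically o = A^{-1} x + a D psi. This is the
   "asymmetric" operator variant: the clover inverse multiplies only the
   accumulator, not the hopping term. */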
876 #ifdef MULTI_GPU
877 } else { // exterior kernel
878 
879  sid = blockIdx.x*blockDim.x + threadIdx.x;
880  if (sid >= param.threads) return;
881 
882  const int dim = static_cast<int>(kernel_type);
883  const int face_volume = (param.threads >> 1); // volume of one face
884  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
885  face_idx = sid - face_num*face_volume; // index into the respective face
886 
887  // ghostOffset is scaled to include the body (i.e. it includes the stride) and the number of FloatN arrays (SPINOR_HOP)
888  // use face_idx rather than sid, since the faces are spin projected and share the same volume index (modulo the UP/DOWN reading offset)
889  //sp_idx = face_idx + param.ghostOffset[dim];
890 
891 #if (DD_PREC==2) // half precision
892  sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
893 #endif
894 
895  const int dims[] = {X1, X2, X3, X4};
896  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
897 
898  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
899 
900  o00_re = i00_re; o00_im = i00_im;
901  o01_re = i01_re; o01_im = i01_im;
902  o02_re = i02_re; o02_im = i02_im;
903  o10_re = i10_re; o10_im = i10_im;
904  o11_re = i11_re; o11_im = i11_im;
905  o12_re = i12_re; o12_im = i12_im;
906  o20_re = i20_re; o20_im = i20_im;
907  o21_re = i21_re; o21_im = i21_im;
908  o22_re = i22_re; o22_im = i22_im;
909  o30_re = i30_re; o30_im = i30_im;
910  o31_re = i31_re; o31_im = i31_im;
911  o32_re = i32_re; o32_im = i32_im;
912 }
913 #endif // MULTI_GPU
914 
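/* Thread mapping of the exterior kernel above, restated: param.threads spans
   both faces of the active dimension, so

     face_volume = param.threads / 2;
     face_num    = (sid >= face_volume);          // which of the two faces
     face_idx    = sid - face_num * face_volume;  // index within that face

   coordsFromFaceIndex<1> then converts face_idx back to 4-d coordinates. */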
915 
916 #ifdef MULTI_GPU
917 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
918  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
919 #endif
920 {
921  // Projector P0-
922  // 1 0 0 -i
923  // 0 1 -i 0
924  // 0 i 1 0
925  // i 0 0 1
926 
927 #ifdef MULTI_GPU
928  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
929  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
930 #else
931  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
932 #endif
933 
934  const int ga_idx = sid;
935 
936  spinorFloat a0_re, a0_im;
937  spinorFloat a1_re, a1_im;
938  spinorFloat a2_re, a2_im;
939  spinorFloat b0_re, b0_im;
940  spinorFloat b1_re, b1_im;
941  spinorFloat b2_re, b2_im;
942 
943 #ifdef MULTI_GPU
944  if (kernel_type == INTERIOR_KERNEL) {
945 #endif
946 
947  // read spinor from device memory
948  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
949 
950  // store spinor into shared memory
951  WRITE_SPINOR_SHARED(threadIdx.x, threadIdx.y, threadIdx.z, i);
952 
953  // project spinor into half spinors
954  a0_re = +i00_re+i30_im;
955  a0_im = +i00_im-i30_re;
956  a1_re = +i01_re+i31_im;
957  a1_im = +i01_im-i31_re;
958  a2_re = +i02_re+i32_im;
959  a2_im = +i02_im-i32_re;
960  b0_re = +i10_re+i20_im;
961  b0_im = +i10_im-i20_re;
962  b1_re = +i11_re+i21_im;
963  b1_im = +i11_im-i21_re;
964  b2_re = +i12_re+i22_im;
965  b2_im = +i12_im-i22_re;
966 
967 #ifdef MULTI_GPU
968  } else {
969 
970  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
971 
972  // read half spinor from device memory
973  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
974 
975  a0_re = i00_re; a0_im = i00_im;
976  a1_re = i01_re; a1_im = i01_im;
977  a2_re = i02_re; a2_im = i02_im;
978  b0_re = i10_re; b0_im = i10_im;
979  b1_re = i11_re; b1_im = i11_im;
980  b2_re = i12_re; b2_im = i12_im;
981 
982  }
983 #endif // MULTI_GPU
984 
985  // read gauge matrix from device memory
986  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
987 
988  // reconstruct gauge matrix
989  RECONSTRUCT_GAUGE_MATRIX(0);
990 
991  // multiply row 0
992  spinorFloat A0_re = 0;
993  A0_re += g00_re * a0_re;
994  A0_re -= g00_im * a0_im;
995  A0_re += g01_re * a1_re;
996  A0_re -= g01_im * a1_im;
997  A0_re += g02_re * a2_re;
998  A0_re -= g02_im * a2_im;
999  spinorFloat A0_im = 0;
1000  A0_im += g00_re * a0_im;
1001  A0_im += g00_im * a0_re;
1002  A0_im += g01_re * a1_im;
1003  A0_im += g01_im * a1_re;
1004  A0_im += g02_re * a2_im;
1005  A0_im += g02_im * a2_re;
1006  spinorFloat B0_re = 0;
1007  B0_re += g00_re * b0_re;
1008  B0_re -= g00_im * b0_im;
1009  B0_re += g01_re * b1_re;
1010  B0_re -= g01_im * b1_im;
1011  B0_re += g02_re * b2_re;
1012  B0_re -= g02_im * b2_im;
1013  spinorFloat B0_im = 0;
1014  B0_im += g00_re * b0_im;
1015  B0_im += g00_im * b0_re;
1016  B0_im += g01_re * b1_im;
1017  B0_im += g01_im * b1_re;
1018  B0_im += g02_re * b2_im;
1019  B0_im += g02_im * b2_re;
1020 
1021  // multiply row 1
1022  spinorFloat A1_re = 0;
1023  A1_re += g10_re * a0_re;
1024  A1_re -= g10_im * a0_im;
1025  A1_re += g11_re * a1_re;
1026  A1_re -= g11_im * a1_im;
1027  A1_re += g12_re * a2_re;
1028  A1_re -= g12_im * a2_im;
1029  spinorFloat A1_im = 0;
1030  A1_im += g10_re * a0_im;
1031  A1_im += g10_im * a0_re;
1032  A1_im += g11_re * a1_im;
1033  A1_im += g11_im * a1_re;
1034  A1_im += g12_re * a2_im;
1035  A1_im += g12_im * a2_re;
1036  spinorFloat B1_re = 0;
1037  B1_re += g10_re * b0_re;
1038  B1_re -= g10_im * b0_im;
1039  B1_re += g11_re * b1_re;
1040  B1_re -= g11_im * b1_im;
1041  B1_re += g12_re * b2_re;
1042  B1_re -= g12_im * b2_im;
1043  spinorFloat B1_im = 0;
1044  B1_im += g10_re * b0_im;
1045  B1_im += g10_im * b0_re;
1046  B1_im += g11_re * b1_im;
1047  B1_im += g11_im * b1_re;
1048  B1_im += g12_re * b2_im;
1049  B1_im += g12_im * b2_re;
1050 
1051  // multiply row 2
1052  spinorFloat A2_re = 0;
1053  A2_re += g20_re * a0_re;
1054  A2_re -= g20_im * a0_im;
1055  A2_re += g21_re * a1_re;
1056  A2_re -= g21_im * a1_im;
1057  A2_re += g22_re * a2_re;
1058  A2_re -= g22_im * a2_im;
1059  spinorFloat A2_im = 0;
1060  A2_im += g20_re * a0_im;
1061  A2_im += g20_im * a0_re;
1062  A2_im += g21_re * a1_im;
1063  A2_im += g21_im * a1_re;
1064  A2_im += g22_re * a2_im;
1065  A2_im += g22_im * a2_re;
1066  spinorFloat B2_re = 0;
1067  B2_re += g20_re * b0_re;
1068  B2_re -= g20_im * b0_im;
1069  B2_re += g21_re * b1_re;
1070  B2_re -= g21_im * b1_im;
1071  B2_re += g22_re * b2_re;
1072  B2_re -= g22_im * b2_im;
1073  spinorFloat B2_im = 0;
1074  B2_im += g20_re * b0_im;
1075  B2_im += g20_im * b0_re;
1076  B2_im += g21_re * b1_im;
1077  B2_im += g21_im * b1_re;
1078  B2_im += g22_re * b2_im;
1079  B2_im += g22_im * b2_re;
1080 
1081  o00_re += a*A0_re;
1082  o00_im += a*A0_im;
1083  o10_re += a*B0_re;
1084  o10_im += a*B0_im;
1085  o20_re -= a*B0_im;
1086  o20_im += a*B0_re;
1087  o30_re -= a*A0_im;
1088  o30_im += a*A0_re;
1089 
1090  o01_re += a*A1_re;
1091  o01_im += a*A1_im;
1092  o11_re += a*B1_re;
1093  o11_im += a*B1_im;
1094  o21_re -= a*B1_im;
1095  o21_im += a*B1_re;
1096  o31_re -= a*A1_im;
1097  o31_im += a*A1_re;
1098 
1099  o02_re += a*A2_re;
1100  o02_im += a*A2_im;
1101  o12_re += a*B2_re;
1102  o12_im += a*B2_im;
1103  o22_re -= a*B2_im;
1104  o22_im += a*B2_re;
1105  o32_re -= a*A2_im;
1106  o32_im += a*A2_re;
1107 
1108 }
1109 
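/* Template for every direction block, shown for the forward-x hop above: the
   projector P has rank two, so the projected spinor keeps only two spin
   components (a, b), and only those are multiplied by the SU(3) link. The
   lower spin rows of the result are fixed linear combinations of the upper
   ones; for P0- the reconstruction reads

     out0 += a*A;  out1 += a*B;  out2 += a*(i*B);  out3 += a*(i*A);

   halving the matrix work per direction. */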
1110 #ifdef MULTI_GPU
1111 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
1112  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
1113 #endif
1114 {
1115  // Projector P0+
1116  // 1 0 0 i
1117  // 0 1 i 0
1118  // 0 -i 1 0
1119  // -i 0 0 1
1120 
1121 #ifdef MULTI_GPU
1122  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
1123  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1124 #else
1125  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
1126 #endif
1127 
1128 #ifdef MULTI_GPU
1129  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1130 #else
1131  const int ga_idx = sp_idx;
1132 #endif
1133 
1134  spinorFloat a0_re, a0_im;
1135  spinorFloat a1_re, a1_im;
1136  spinorFloat a2_re, a2_im;
1137  spinorFloat b0_re, b0_im;
1138  spinorFloat b1_re, b1_im;
1139  spinorFloat b2_re, b2_im;
1140 
1141 #ifdef MULTI_GPU
1142  if (kernel_type == INTERIOR_KERNEL) {
1143 #endif
1144 
1145  // load spinor from shared memory
1146  int tx = (threadIdx.x > 0) ? threadIdx.x-1 : blockDim.x-1;
1147  __syncthreads();
1148  READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);
1149 
1150  // project spinor into half spinors
1151  a0_re = +i00_re-i30_im;
1152  a0_im = +i00_im+i30_re;
1153  a1_re = +i01_re-i31_im;
1154  a1_im = +i01_im+i31_re;
1155  a2_re = +i02_re-i32_im;
1156  a2_im = +i02_im+i32_re;
1157  b0_re = +i10_re-i20_im;
1158  b0_im = +i10_im+i20_re;
1159  b1_re = +i11_re-i21_im;
1160  b1_im = +i11_im+i21_re;
1161  b2_re = +i12_re-i22_im;
1162  b2_im = +i12_im+i22_re;
1163 
1164 #ifdef MULTI_GPU
1165  } else {
1166 
1167  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1168 
1169  // read half spinor from device memory
1170  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1171 
1172  a0_re = i00_re; a0_im = i00_im;
1173  a1_re = i01_re; a1_im = i01_im;
1174  a2_re = i02_re; a2_im = i02_im;
1175  b0_re = i10_re; b0_im = i10_im;
1176  b1_re = i11_re; b1_im = i11_im;
1177  b2_re = i12_re; b2_im = i12_im;
1178 
1179  }
1180 #endif // MULTI_GPU
1181 
1182  // read gauge matrix from device memory
1183  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
1184 
1185  // reconstruct gauge matrix
1186  RECONSTRUCT_GAUGE_MATRIX(1);
1187 
1188  // multiply row 0
1189  spinorFloat A0_re = 0;
1190  A0_re += gT00_re * a0_re;
1191  A0_re -= gT00_im * a0_im;
1192  A0_re += gT01_re * a1_re;
1193  A0_re -= gT01_im * a1_im;
1194  A0_re += gT02_re * a2_re;
1195  A0_re -= gT02_im * a2_im;
1196  spinorFloat A0_im = 0;
1197  A0_im += gT00_re * a0_im;
1198  A0_im += gT00_im * a0_re;
1199  A0_im += gT01_re * a1_im;
1200  A0_im += gT01_im * a1_re;
1201  A0_im += gT02_re * a2_im;
1202  A0_im += gT02_im * a2_re;
1203  spinorFloat B0_re = 0;
1204  B0_re += gT00_re * b0_re;
1205  B0_re -= gT00_im * b0_im;
1206  B0_re += gT01_re * b1_re;
1207  B0_re -= gT01_im * b1_im;
1208  B0_re += gT02_re * b2_re;
1209  B0_re -= gT02_im * b2_im;
1210  spinorFloat B0_im = 0;
1211  B0_im += gT00_re * b0_im;
1212  B0_im += gT00_im * b0_re;
1213  B0_im += gT01_re * b1_im;
1214  B0_im += gT01_im * b1_re;
1215  B0_im += gT02_re * b2_im;
1216  B0_im += gT02_im * b2_re;
1217 
1218  // multiply row 1
1219  spinorFloat A1_re = 0;
1220  A1_re += gT10_re * a0_re;
1221  A1_re -= gT10_im * a0_im;
1222  A1_re += gT11_re * a1_re;
1223  A1_re -= gT11_im * a1_im;
1224  A1_re += gT12_re * a2_re;
1225  A1_re -= gT12_im * a2_im;
1226  spinorFloat A1_im = 0;
1227  A1_im += gT10_re * a0_im;
1228  A1_im += gT10_im * a0_re;
1229  A1_im += gT11_re * a1_im;
1230  A1_im += gT11_im * a1_re;
1231  A1_im += gT12_re * a2_im;
1232  A1_im += gT12_im * a2_re;
1233  spinorFloat B1_re = 0;
1234  B1_re += gT10_re * b0_re;
1235  B1_re -= gT10_im * b0_im;
1236  B1_re += gT11_re * b1_re;
1237  B1_re -= gT11_im * b1_im;
1238  B1_re += gT12_re * b2_re;
1239  B1_re -= gT12_im * b2_im;
1240  spinorFloat B1_im = 0;
1241  B1_im += gT10_re * b0_im;
1242  B1_im += gT10_im * b0_re;
1243  B1_im += gT11_re * b1_im;
1244  B1_im += gT11_im * b1_re;
1245  B1_im += gT12_re * b2_im;
1246  B1_im += gT12_im * b2_re;
1247 
1248  // multiply row 2
1249  spinorFloat A2_re = 0;
1250  A2_re += gT20_re * a0_re;
1251  A2_re -= gT20_im * a0_im;
1252  A2_re += gT21_re * a1_re;
1253  A2_re -= gT21_im * a1_im;
1254  A2_re += gT22_re * a2_re;
1255  A2_re -= gT22_im * a2_im;
1256  spinorFloat A2_im = 0;
1257  A2_im += gT20_re * a0_im;
1258  A2_im += gT20_im * a0_re;
1259  A2_im += gT21_re * a1_im;
1260  A2_im += gT21_im * a1_re;
1261  A2_im += gT22_re * a2_im;
1262  A2_im += gT22_im * a2_re;
1263  spinorFloat B2_re = 0;
1264  B2_re += gT20_re * b0_re;
1265  B2_re -= gT20_im * b0_im;
1266  B2_re += gT21_re * b1_re;
1267  B2_re -= gT21_im * b1_im;
1268  B2_re += gT22_re * b2_re;
1269  B2_re -= gT22_im * b2_im;
1270  spinorFloat B2_im = 0;
1271  B2_im += gT20_re * b0_im;
1272  B2_im += gT20_im * b0_re;
1273  B2_im += gT21_re * b1_im;
1274  B2_im += gT21_im * b1_re;
1275  B2_im += gT22_re * b2_im;
1276  B2_im += gT22_im * b2_re;
1277 
1278  o00_re += a*A0_re;
1279  o00_im += a*A0_im;
1280  o10_re += a*B0_re;
1281  o10_im += a*B0_im;
1282  o20_re += a*B0_im;
1283  o20_im -= a*B0_re;
1284  o30_re += a*A0_im;
1285  o30_im -= a*A0_re;
1286 
1287  o01_re += a*A1_re;
1288  o01_im += a*A1_im;
1289  o11_re += a*B1_re;
1290  o11_im += a*B1_im;
1291  o21_re += a*B1_im;
1292  o21_im -= a*B1_re;
1293  o31_re += a*A1_im;
1294  o31_im -= a*A1_re;
1295 
1296  o02_re += a*A2_re;
1297  o02_im += a*A2_im;
1298  o12_re += a*B2_re;
1299  o12_im += a*B2_im;
1300  o22_re += a*B2_im;
1301  o22_im -= a*B2_re;
1302  o32_re += a*A2_im;
1303  o32_im -= a*A2_re;
1304 
1305 }
1306 
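/* The backward-x block above shows the shared-memory reuse scheme: the
   forward-x block cached every thread's input spinor via WRITE_SPINOR_SHARED,
   so after a barrier the x-1 neighbour is fetched from shared memory instead
   of global memory:

     int tx = (threadIdx.x > 0) ? threadIdx.x - 1 : blockDim.x - 1;
     __syncthreads();
     READ_SPINOR_SHARED(tx, threadIdx.y, threadIdx.z);

   The y and z blocks below do the same, falling back to a global READ_SPINOR
   only for sites on the thread-block boundary. */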
1307 #ifdef MULTI_GPU
1308 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
1309  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
1310 #endif
1311 {
1312  // Projector P1-
1313  // 1 0 0 -1
1314  // 0 1 1 0
1315  // 0 1 1 0
1316  // -1 0 0 1
1317 
1318 #ifdef MULTI_GPU
1319  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
1320  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1321 #else
1322  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
1323 #endif
1324 
1325  const int ga_idx = sid;
1326 
1327  spinorFloat a0_re, a0_im;
1328  spinorFloat a1_re, a1_im;
1329  spinorFloat a2_re, a2_im;
1330  spinorFloat b0_re, b0_im;
1331  spinorFloat b1_re, b1_im;
1332  spinorFloat b2_re, b2_im;
1333 
1334 #ifdef MULTI_GPU
1335  if (kernel_type == INTERIOR_KERNEL) {
1336 #endif
1337 
1338  if (threadIdx.y == blockDim.y-1 && blockDim.y < X2 ) {
1339  // read spinor from device memory
1340  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1341 
1342  // project spinor into half spinors
1343  a0_re = +i00_re-i30_re;
1344  a0_im = +i00_im-i30_im;
1345  a1_re = +i01_re-i31_re;
1346  a1_im = +i01_im-i31_im;
1347  a2_re = +i02_re-i32_re;
1348  a2_im = +i02_im-i32_im;
1349  b0_re = +i10_re+i20_re;
1350  b0_im = +i10_im+i20_im;
1351  b1_re = +i11_re+i21_re;
1352  b1_im = +i11_im+i21_im;
1353  b2_re = +i12_re+i22_re;
1354  b2_im = +i12_im+i22_im;
1355  } else {
1356  // load spinor from shared memory
1357  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1358  int ty = (threadIdx.y < blockDim.y - 1) ? threadIdx.y + 1 : 0;
1359  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1360 
1361  // project spinor into half spinors
1362  a0_re = +i00_re-i30_re;
1363  a0_im = +i00_im-i30_im;
1364  a1_re = +i01_re-i31_re;
1365  a1_im = +i01_im-i31_im;
1366  a2_re = +i02_re-i32_re;
1367  a2_im = +i02_im-i32_im;
1368  b0_re = +i10_re+i20_re;
1369  b0_im = +i10_im+i20_im;
1370  b1_re = +i11_re+i21_re;
1371  b1_im = +i11_im+i21_im;
1372  b2_re = +i12_re+i22_re;
1373  b2_im = +i12_im+i22_im;
1374  }
1375 
1376 #ifdef MULTI_GPU
1377  } else {
1378 
1379  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1380 
1381  // read half spinor from device memory
1382  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1383 
1384  a0_re = i00_re; a0_im = i00_im;
1385  a1_re = i01_re; a1_im = i01_im;
1386  a2_re = i02_re; a2_im = i02_im;
1387  b0_re = i10_re; b0_im = i10_im;
1388  b1_re = i11_re; b1_im = i11_im;
1389  b2_re = i12_re; b2_im = i12_im;
1390 
1391  }
1392 #endif // MULTI_GPU
1393 
1394  // read gauge matrix from device memory
1395  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1396 
1397  // reconstruct gauge matrix
1398  RECONSTRUCT_GAUGE_MATRIX(2);
1399 
1400  // multiply row 0
1401  spinorFloat A0_re = 0;
1402  A0_re += g00_re * a0_re;
1403  A0_re -= g00_im * a0_im;
1404  A0_re += g01_re * a1_re;
1405  A0_re -= g01_im * a1_im;
1406  A0_re += g02_re * a2_re;
1407  A0_re -= g02_im * a2_im;
1408  spinorFloat A0_im = 0;
1409  A0_im += g00_re * a0_im;
1410  A0_im += g00_im * a0_re;
1411  A0_im += g01_re * a1_im;
1412  A0_im += g01_im * a1_re;
1413  A0_im += g02_re * a2_im;
1414  A0_im += g02_im * a2_re;
1415  spinorFloat B0_re = 0;
1416  B0_re += g00_re * b0_re;
1417  B0_re -= g00_im * b0_im;
1418  B0_re += g01_re * b1_re;
1419  B0_re -= g01_im * b1_im;
1420  B0_re += g02_re * b2_re;
1421  B0_re -= g02_im * b2_im;
1422  spinorFloat B0_im = 0;
1423  B0_im += g00_re * b0_im;
1424  B0_im += g00_im * b0_re;
1425  B0_im += g01_re * b1_im;
1426  B0_im += g01_im * b1_re;
1427  B0_im += g02_re * b2_im;
1428  B0_im += g02_im * b2_re;
1429 
1430  // multiply row 1
1431  spinorFloat A1_re = 0;
1432  A1_re += g10_re * a0_re;
1433  A1_re -= g10_im * a0_im;
1434  A1_re += g11_re * a1_re;
1435  A1_re -= g11_im * a1_im;
1436  A1_re += g12_re * a2_re;
1437  A1_re -= g12_im * a2_im;
1438  spinorFloat A1_im = 0;
1439  A1_im += g10_re * a0_im;
1440  A1_im += g10_im * a0_re;
1441  A1_im += g11_re * a1_im;
1442  A1_im += g11_im * a1_re;
1443  A1_im += g12_re * a2_im;
1444  A1_im += g12_im * a2_re;
1445  spinorFloat B1_re = 0;
1446  B1_re += g10_re * b0_re;
1447  B1_re -= g10_im * b0_im;
1448  B1_re += g11_re * b1_re;
1449  B1_re -= g11_im * b1_im;
1450  B1_re += g12_re * b2_re;
1451  B1_re -= g12_im * b2_im;
1452  spinorFloat B1_im = 0;
1453  B1_im += g10_re * b0_im;
1454  B1_im += g10_im * b0_re;
1455  B1_im += g11_re * b1_im;
1456  B1_im += g11_im * b1_re;
1457  B1_im += g12_re * b2_im;
1458  B1_im += g12_im * b2_re;
1459 
1460  // multiply row 2
1461  spinorFloat A2_re = 0;
1462  A2_re += g20_re * a0_re;
1463  A2_re -= g20_im * a0_im;
1464  A2_re += g21_re * a1_re;
1465  A2_re -= g21_im * a1_im;
1466  A2_re += g22_re * a2_re;
1467  A2_re -= g22_im * a2_im;
1468  spinorFloat A2_im = 0;
1469  A2_im += g20_re * a0_im;
1470  A2_im += g20_im * a0_re;
1471  A2_im += g21_re * a1_im;
1472  A2_im += g21_im * a1_re;
1473  A2_im += g22_re * a2_im;
1474  A2_im += g22_im * a2_re;
1475  spinorFloat B2_re = 0;
1476  B2_re += g20_re * b0_re;
1477  B2_re -= g20_im * b0_im;
1478  B2_re += g21_re * b1_re;
1479  B2_re -= g21_im * b1_im;
1480  B2_re += g22_re * b2_re;
1481  B2_re -= g22_im * b2_im;
1482  spinorFloat B2_im = 0;
1483  B2_im += g20_re * b0_im;
1484  B2_im += g20_im * b0_re;
1485  B2_im += g21_re * b1_im;
1486  B2_im += g21_im * b1_re;
1487  B2_im += g22_re * b2_im;
1488  B2_im += g22_im * b2_re;
1489 
1490  o00_re += a*A0_re;
1491  o00_im += a*A0_im;
1492  o10_re += a*B0_re;
1493  o10_im += a*B0_im;
1494  o20_re += a*B0_re;
1495  o20_im += a*B0_im;
1496  o30_re -= a*A0_re;
1497  o30_im -= a*A0_im;
1498 
1499  o01_re += a*A1_re;
1500  o01_im += a*A1_im;
1501  o11_re += a*B1_re;
1502  o11_im += a*B1_im;
1503  o21_re += a*B1_re;
1504  o21_im += a*B1_im;
1505  o31_re -= a*A1_re;
1506  o31_im -= a*A1_im;
1507 
1508  o02_re += a*A2_re;
1509  o02_im += a*A2_im;
1510  o12_re += a*B2_re;
1511  o12_im += a*B2_im;
1512  o22_re += a*B2_re;
1513  o22_im += a*B2_im;
1514  o32_re -= a*A2_re;
1515  o32_im -= a*A2_im;
1516 
1517 }
1518 
1519 #ifdef MULTI_GPU
1520 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
1521  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
1522 #endif
1523 {
1524  // Projector P1+
1525  // 1 0 0 1
1526  // 0 1 -1 0
1527  // 0 -1 1 0
1528  // 1 0 0 1
1529 
1530 #ifdef MULTI_GPU
1531  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
1532  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1533 #else
1534  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
1535 #endif
1536 
1537 #ifdef MULTI_GPU
1538  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1539 #else
1540  const int ga_idx = sp_idx;
1541 #endif
1542 
1543  spinorFloat a0_re, a0_im;
1544  spinorFloat a1_re, a1_im;
1545  spinorFloat a2_re, a2_im;
1546  spinorFloat b0_re, b0_im;
1547  spinorFloat b1_re, b1_im;
1548  spinorFloat b2_re, b2_im;
1549 
1550 #ifdef MULTI_GPU
1551  if (kernel_type == INTERIOR_KERNEL) {
1552 #endif
1553 
1554  if (threadIdx.y == 0 && blockDim.y < X2) {
1555  // read spinor from device memory
1556  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1557 
1558  // project spinor into half spinors
1559  a0_re = +i00_re+i30_re;
1560  a0_im = +i00_im+i30_im;
1561  a1_re = +i01_re+i31_re;
1562  a1_im = +i01_im+i31_im;
1563  a2_re = +i02_re+i32_re;
1564  a2_im = +i02_im+i32_im;
1565  b0_re = +i10_re-i20_re;
1566  b0_im = +i10_im-i20_im;
1567  b1_re = +i11_re-i21_re;
1568  b1_im = +i11_im-i21_im;
1569  b2_re = +i12_re-i22_re;
1570  b2_im = +i12_im-i22_im;
1571  } else {
1572  // load spinor from shared memory
1573  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
1574  int ty = (threadIdx.y > 0) ? threadIdx.y - 1 : blockDim.y - 1;
1575  READ_SPINOR_SHARED(tx, ty, threadIdx.z);
1576 
1577  // project spinor into half spinors
1578  a0_re = +i00_re+i30_re;
1579  a0_im = +i00_im+i30_im;
1580  a1_re = +i01_re+i31_re;
1581  a1_im = +i01_im+i31_im;
1582  a2_re = +i02_re+i32_re;
1583  a2_im = +i02_im+i32_im;
1584  b0_re = +i10_re-i20_re;
1585  b0_im = +i10_im-i20_im;
1586  b1_re = +i11_re-i21_re;
1587  b1_im = +i11_im-i21_im;
1588  b2_re = +i12_re-i22_re;
1589  b2_im = +i12_im-i22_im;
1590  }
1591 
1592 #ifdef MULTI_GPU
1593  } else {
1594 
1595  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1596 
1597  // read half spinor from device memory
1598  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1599 
1600  a0_re = i00_re; a0_im = i00_im;
1601  a1_re = i01_re; a1_im = i01_im;
1602  a2_re = i02_re; a2_im = i02_im;
1603  b0_re = i10_re; b0_im = i10_im;
1604  b1_re = i11_re; b1_im = i11_im;
1605  b2_re = i12_re; b2_im = i12_im;
1606 
1607  }
1608 #endif // MULTI_GPU
1609 
1610  // read gauge matrix from device memory
1611  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1612 
1613  // reconstruct gauge matrix
1614  RECONSTRUCT_GAUGE_MATRIX(3);
1615 
1616  // multiply row 0
1617  spinorFloat A0_re = 0;
1618  A0_re += gT00_re * a0_re;
1619  A0_re -= gT00_im * a0_im;
1620  A0_re += gT01_re * a1_re;
1621  A0_re -= gT01_im * a1_im;
1622  A0_re += gT02_re * a2_re;
1623  A0_re -= gT02_im * a2_im;
1624  spinorFloat A0_im = 0;
1625  A0_im += gT00_re * a0_im;
1626  A0_im += gT00_im * a0_re;
1627  A0_im += gT01_re * a1_im;
1628  A0_im += gT01_im * a1_re;
1629  A0_im += gT02_re * a2_im;
1630  A0_im += gT02_im * a2_re;
1631  spinorFloat B0_re = 0;
1632  B0_re += gT00_re * b0_re;
1633  B0_re -= gT00_im * b0_im;
1634  B0_re += gT01_re * b1_re;
1635  B0_re -= gT01_im * b1_im;
1636  B0_re += gT02_re * b2_re;
1637  B0_re -= gT02_im * b2_im;
1638  spinorFloat B0_im = 0;
1639  B0_im += gT00_re * b0_im;
1640  B0_im += gT00_im * b0_re;
1641  B0_im += gT01_re * b1_im;
1642  B0_im += gT01_im * b1_re;
1643  B0_im += gT02_re * b2_im;
1644  B0_im += gT02_im * b2_re;
1645 
1646  // multiply row 1
1647  spinorFloat A1_re = 0;
1648  A1_re += gT10_re * a0_re;
1649  A1_re -= gT10_im * a0_im;
1650  A1_re += gT11_re * a1_re;
1651  A1_re -= gT11_im * a1_im;
1652  A1_re += gT12_re * a2_re;
1653  A1_re -= gT12_im * a2_im;
1654  spinorFloat A1_im = 0;
1655  A1_im += gT10_re * a0_im;
1656  A1_im += gT10_im * a0_re;
1657  A1_im += gT11_re * a1_im;
1658  A1_im += gT11_im * a1_re;
1659  A1_im += gT12_re * a2_im;
1660  A1_im += gT12_im * a2_re;
1661  spinorFloat B1_re = 0;
1662  B1_re += gT10_re * b0_re;
1663  B1_re -= gT10_im * b0_im;
1664  B1_re += gT11_re * b1_re;
1665  B1_re -= gT11_im * b1_im;
1666  B1_re += gT12_re * b2_re;
1667  B1_re -= gT12_im * b2_im;
1668  spinorFloat B1_im = 0;
1669  B1_im += gT10_re * b0_im;
1670  B1_im += gT10_im * b0_re;
1671  B1_im += gT11_re * b1_im;
1672  B1_im += gT11_im * b1_re;
1673  B1_im += gT12_re * b2_im;
1674  B1_im += gT12_im * b2_re;
1675 
1676  // multiply row 2
1677  spinorFloat A2_re = 0;
1678  A2_re += gT20_re * a0_re;
1679  A2_re -= gT20_im * a0_im;
1680  A2_re += gT21_re * a1_re;
1681  A2_re -= gT21_im * a1_im;
1682  A2_re += gT22_re * a2_re;
1683  A2_re -= gT22_im * a2_im;
1684  spinorFloat A2_im = 0;
1685  A2_im += gT20_re * a0_im;
1686  A2_im += gT20_im * a0_re;
1687  A2_im += gT21_re * a1_im;
1688  A2_im += gT21_im * a1_re;
1689  A2_im += gT22_re * a2_im;
1690  A2_im += gT22_im * a2_re;
1691  spinorFloat B2_re = 0;
1692  B2_re += gT20_re * b0_re;
1693  B2_re -= gT20_im * b0_im;
1694  B2_re += gT21_re * b1_re;
1695  B2_re -= gT21_im * b1_im;
1696  B2_re += gT22_re * b2_re;
1697  B2_re -= gT22_im * b2_im;
1698  spinorFloat B2_im = 0;
1699  B2_im += gT20_re * b0_im;
1700  B2_im += gT20_im * b0_re;
1701  B2_im += gT21_re * b1_im;
1702  B2_im += gT21_im * b1_re;
1703  B2_im += gT22_re * b2_im;
1704  B2_im += gT22_im * b2_re;
1705 
1706  o00_re += a*A0_re;
1707  o00_im += a*A0_im;
1708  o10_re += a*B0_re;
1709  o10_im += a*B0_im;
1710  o20_re -= a*B0_re;
1711  o20_im -= a*B0_im;
1712  o30_re += a*A0_re;
1713  o30_im += a*A0_im;
1714 
1715  o01_re += a*A1_re;
1716  o01_im += a*A1_im;
1717  o11_re += a*B1_re;
1718  o11_im += a*B1_im;
1719  o21_re -= a*B1_re;
1720  o21_im -= a*B1_im;
1721  o31_re += a*A1_re;
1722  o31_im += a*A1_im;
1723 
1724  o02_re += a*A2_re;
1725  o02_im += a*A2_im;
1726  o12_re += a*B2_re;
1727  o12_im += a*B2_im;
1728  o22_re -= a*B2_re;
1729  o22_im -= a*B2_im;
1730  o32_re += a*A2_re;
1731  o32_im += a*A2_im;
1732 
1733 }
1734 
1735 #ifdef MULTI_GPU
1736 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1737  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1738 #endif
1739 {
1740  // Projector P2-
1741  // 1 0 -i 0
1742  // 0 1 0 i
1743  // i 0 1 0
1744  // 0 -i 0 1
1745 
1746 #ifdef MULTI_GPU
1747  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1748  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1749 #else
1750  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1751 #endif
1752 
1753  const int ga_idx = sid;
1754 
1755  spinorFloat a0_re, a0_im;
1756  spinorFloat a1_re, a1_im;
1757  spinorFloat a2_re, a2_im;
1758  spinorFloat b0_re, b0_im;
1759  spinorFloat b1_re, b1_im;
1760  spinorFloat b2_re, b2_im;
1761 
1762 #ifdef MULTI_GPU
1763  if (kernel_type == INTERIOR_KERNEL) {
1764 #endif
1765 
1766  if (threadIdx.z == blockDim.z-1 && blockDim.z < X3) {
1767  // read spinor from device memory
1768  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1769 
1770  // project spinor into half spinors
1771  a0_re = +i00_re+i20_im;
1772  a0_im = +i00_im-i20_re;
1773  a1_re = +i01_re+i21_im;
1774  a1_im = +i01_im-i21_re;
1775  a2_re = +i02_re+i22_im;
1776  a2_im = +i02_im-i22_re;
1777  b0_re = +i10_re-i30_im;
1778  b0_im = +i10_im+i30_re;
1779  b1_re = +i11_re-i31_im;
1780  b1_im = +i11_im+i31_re;
1781  b2_re = +i12_re-i32_im;
1782  b2_im = +i12_im+i32_re;
1783  } else {
1784  // load spinor from shared memory
1785  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1) ) % blockDim.x;
1786  int tz = (threadIdx.z < blockDim.z - 1) ? threadIdx.z + 1 : 0;
1787  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
1788 
1789  // project spinor into half spinors
1790  a0_re = +i00_re+i20_im;
1791  a0_im = +i00_im-i20_re;
1792  a1_re = +i01_re+i21_im;
1793  a1_im = +i01_im-i21_re;
1794  a2_re = +i02_re+i22_im;
1795  a2_im = +i02_im-i22_re;
1796  b0_re = +i10_re-i30_im;
1797  b0_im = +i10_im+i30_re;
1798  b1_re = +i11_re-i31_im;
1799  b1_im = +i11_im+i31_re;
1800  b2_re = +i12_re-i32_im;
1801  b2_im = +i12_im+i32_re;
1802  }
1803 
1804 #ifdef MULTI_GPU
1805  } else {
1806 
1807  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
1808 
1809  // read half spinor from device memory
1810  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1811 
1812  a0_re = i00_re; a0_im = i00_im;
1813  a1_re = i01_re; a1_im = i01_im;
1814  a2_re = i02_re; a2_im = i02_im;
1815  b0_re = i10_re; b0_im = i10_im;
1816  b1_re = i11_re; b1_im = i11_im;
1817  b2_re = i12_re; b2_im = i12_im;
1818 
1819  }
1820 #endif // MULTI_GPU
1821 
1822  // read gauge matrix from device memory
1823  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1824 
1825  // reconstruct gauge matrix
1826  RECONSTRUCT_GAUGE_MATRIX(4);
1827 
1828  // multiply row 0
1829  spinorFloat A0_re = 0;
1830  A0_re += g00_re * a0_re;
1831  A0_re -= g00_im * a0_im;
1832  A0_re += g01_re * a1_re;
1833  A0_re -= g01_im * a1_im;
1834  A0_re += g02_re * a2_re;
1835  A0_re -= g02_im * a2_im;
1836  spinorFloat A0_im = 0;
1837  A0_im += g00_re * a0_im;
1838  A0_im += g00_im * a0_re;
1839  A0_im += g01_re * a1_im;
1840  A0_im += g01_im * a1_re;
1841  A0_im += g02_re * a2_im;
1842  A0_im += g02_im * a2_re;
1843  spinorFloat B0_re = 0;
1844  B0_re += g00_re * b0_re;
1845  B0_re -= g00_im * b0_im;
1846  B0_re += g01_re * b1_re;
1847  B0_re -= g01_im * b1_im;
1848  B0_re += g02_re * b2_re;
1849  B0_re -= g02_im * b2_im;
1850  spinorFloat B0_im = 0;
1851  B0_im += g00_re * b0_im;
1852  B0_im += g00_im * b0_re;
1853  B0_im += g01_re * b1_im;
1854  B0_im += g01_im * b1_re;
1855  B0_im += g02_re * b2_im;
1856  B0_im += g02_im * b2_re;
1857 
1858  // multiply row 1
1859  spinorFloat A1_re = 0;
1860  A1_re += g10_re * a0_re;
1861  A1_re -= g10_im * a0_im;
1862  A1_re += g11_re * a1_re;
1863  A1_re -= g11_im * a1_im;
1864  A1_re += g12_re * a2_re;
1865  A1_re -= g12_im * a2_im;
1866  spinorFloat A1_im = 0;
1867  A1_im += g10_re * a0_im;
1868  A1_im += g10_im * a0_re;
1869  A1_im += g11_re * a1_im;
1870  A1_im += g11_im * a1_re;
1871  A1_im += g12_re * a2_im;
1872  A1_im += g12_im * a2_re;
1873  spinorFloat B1_re = 0;
1874  B1_re += g10_re * b0_re;
1875  B1_re -= g10_im * b0_im;
1876  B1_re += g11_re * b1_re;
1877  B1_re -= g11_im * b1_im;
1878  B1_re += g12_re * b2_re;
1879  B1_re -= g12_im * b2_im;
1880  spinorFloat B1_im = 0;
1881  B1_im += g10_re * b0_im;
1882  B1_im += g10_im * b0_re;
1883  B1_im += g11_re * b1_im;
1884  B1_im += g11_im * b1_re;
1885  B1_im += g12_re * b2_im;
1886  B1_im += g12_im * b2_re;
1887 
1888  // multiply row 2
1889  spinorFloat A2_re = 0;
1890  A2_re += g20_re * a0_re;
1891  A2_re -= g20_im * a0_im;
1892  A2_re += g21_re * a1_re;
1893  A2_re -= g21_im * a1_im;
1894  A2_re += g22_re * a2_re;
1895  A2_re -= g22_im * a2_im;
1896  spinorFloat A2_im = 0;
1897  A2_im += g20_re * a0_im;
1898  A2_im += g20_im * a0_re;
1899  A2_im += g21_re * a1_im;
1900  A2_im += g21_im * a1_re;
1901  A2_im += g22_re * a2_im;
1902  A2_im += g22_im * a2_re;
1903  spinorFloat B2_re = 0;
1904  B2_re += g20_re * b0_re;
1905  B2_re -= g20_im * b0_im;
1906  B2_re += g21_re * b1_re;
1907  B2_re -= g21_im * b1_im;
1908  B2_re += g22_re * b2_re;
1909  B2_re -= g22_im * b2_im;
1910  spinorFloat B2_im = 0;
1911  B2_im += g20_re * b0_im;
1912  B2_im += g20_im * b0_re;
1913  B2_im += g21_re * b1_im;
1914  B2_im += g21_im * b1_re;
1915  B2_im += g22_re * b2_im;
1916  B2_im += g22_im * b2_re;
1917 
1918  o00_re += a*A0_re;
1919  o00_im += a*A0_im;
1920  o10_re += a*B0_re;
1921  o10_im += a*B0_im;
1922  o20_re -= a*A0_im;
1923  o20_im += a*A0_re;
1924  o30_re += a*B0_im;
1925  o30_im -= a*B0_re;
1926 
1927  o01_re += a*A1_re;
1928  o01_im += a*A1_im;
1929  o11_re += a*B1_re;
1930  o11_im += a*B1_im;
1931  o21_re -= a*A1_im;
1932  o21_im += a*A1_re;
1933  o31_re += a*B1_im;
1934  o31_im -= a*B1_re;
1935 
1936  o02_re += a*A2_re;
1937  o02_im += a*A2_im;
1938  o12_re += a*B2_re;
1939  o12_im += a*B2_im;
1940  o22_re -= a*A2_im;
1941  o22_im += a*A2_re;
1942  o32_re += a*B2_im;
1943  o32_im -= a*B2_re;
1944 
1945 }
1946 
1947 #ifdef MULTI_GPU
1948 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1949  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1950 #endif
1951 {
1952  // Projector P2+
1953  // 1 0 i 0
1954  // 0 1 0 -i
1955  // -i 0 1 0
1956  // 0 i 0 1
1957 
1958 #ifdef MULTI_GPU
1959  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1960  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1961 #else
1962  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1963 #endif
1964 
1965 #ifdef MULTI_GPU
1966  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
1967 #else
1968  const int ga_idx = sp_idx;
1969 #endif
1970 
1971  spinorFloat a0_re, a0_im;
1972  spinorFloat a1_re, a1_im;
1973  spinorFloat a2_re, a2_im;
1974  spinorFloat b0_re, b0_im;
1975  spinorFloat b1_re, b1_im;
1976  spinorFloat b2_re, b2_im;
1977 
1978 #ifdef MULTI_GPU
1979  if (kernel_type == INTERIOR_KERNEL) {
1980 #endif
1981 
1982  if (threadIdx.z == 0 && blockDim.z < X3) {
1983  // read spinor from device memory
1984  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1985 
1986  // project spinor into half spinors
1987  a0_re = +i00_re-i20_im;
1988  a0_im = +i00_im+i20_re;
1989  a1_re = +i01_re-i21_im;
1990  a1_im = +i01_im+i21_re;
1991  a2_re = +i02_re-i22_im;
1992  a2_im = +i02_im+i22_re;
1993  b0_re = +i10_re+i30_im;
1994  b0_im = +i10_im-i30_re;
1995  b1_re = +i11_re+i31_im;
1996  b1_im = +i11_im-i31_re;
1997  b2_re = +i12_re+i32_im;
1998  b2_im = +i12_im-i32_re;
1999  } else {
2000  // load spinor from shared memory
2001  int tx = (threadIdx.x + blockDim.x - ((x1+1)&1)) % blockDim.x;
2002  int tz = (threadIdx.z > 0) ? threadIdx.z - 1 : blockDim.z - 1;
2003  READ_SPINOR_SHARED(tx, threadIdx.y, tz);
2004 
2005  // project spinor into half spinors
2006  a0_re = +i00_re-i20_im;
2007  a0_im = +i00_im+i20_re;
2008  a1_re = +i01_re-i21_im;
2009  a1_im = +i01_im+i21_re;
2010  a2_re = +i02_re-i22_im;
2011  a2_im = +i02_im+i22_re;
2012  b0_re = +i10_re+i30_im;
2013  b0_im = +i10_im-i30_re;
2014  b1_re = +i11_re+i31_im;
2015  b1_im = +i11_im-i31_re;
2016  b2_re = +i12_re+i32_im;
2017  b2_im = +i12_im-i32_re;
2018  }
2019 
2020 #ifdef MULTI_GPU
2021  } else {
2022 
2023  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2024 
2025  // read half spinor from device memory
2026  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2027 
2028  a0_re = i00_re; a0_im = i00_im;
2029  a1_re = i01_re; a1_im = i01_im;
2030  a2_re = i02_re; a2_im = i02_im;
2031  b0_re = i10_re; b0_im = i10_im;
2032  b1_re = i11_re; b1_im = i11_im;
2033  b2_re = i12_re; b2_im = i12_im;
2034 
2035  }
2036 #endif // MULTI_GPU
2037 
2038  // read gauge matrix from device memory
2039  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
2040 
2041  // reconstruct gauge matrix
2042  RECONSTRUCT_GAUGE_MATRIX(5);
2043 
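// gT00..gT22 are the elements of the conjugate-transposed link,
// gT(i,j) = conj(g(j,i)), supplied by the reconstruction above, so the
// row multiplies below compute U^dagger acting on the half spinor.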
2044  // multiply row 0
2045  spinorFloat A0_re = 0;
2046  A0_re += gT00_re * a0_re;
2047  A0_re -= gT00_im * a0_im;
2048  A0_re += gT01_re * a1_re;
2049  A0_re -= gT01_im * a1_im;
2050  A0_re += gT02_re * a2_re;
2051  A0_re -= gT02_im * a2_im;
2052  spinorFloat A0_im = 0;
2053  A0_im += gT00_re * a0_im;
2054  A0_im += gT00_im * a0_re;
2055  A0_im += gT01_re * a1_im;
2056  A0_im += gT01_im * a1_re;
2057  A0_im += gT02_re * a2_im;
2058  A0_im += gT02_im * a2_re;
2059  spinorFloat B0_re = 0;
2060  B0_re += gT00_re * b0_re;
2061  B0_re -= gT00_im * b0_im;
2062  B0_re += gT01_re * b1_re;
2063  B0_re -= gT01_im * b1_im;
2064  B0_re += gT02_re * b2_re;
2065  B0_re -= gT02_im * b2_im;
2066  spinorFloat B0_im = 0;
2067  B0_im += gT00_re * b0_im;
2068  B0_im += gT00_im * b0_re;
2069  B0_im += gT01_re * b1_im;
2070  B0_im += gT01_im * b1_re;
2071  B0_im += gT02_re * b2_im;
2072  B0_im += gT02_im * b2_re;
2073 
2074  // multiply row 1
2075  spinorFloat A1_re = 0;
2076  A1_re += gT10_re * a0_re;
2077  A1_re -= gT10_im * a0_im;
2078  A1_re += gT11_re * a1_re;
2079  A1_re -= gT11_im * a1_im;
2080  A1_re += gT12_re * a2_re;
2081  A1_re -= gT12_im * a2_im;
2082  spinorFloat A1_im = 0;
2083  A1_im += gT10_re * a0_im;
2084  A1_im += gT10_im * a0_re;
2085  A1_im += gT11_re * a1_im;
2086  A1_im += gT11_im * a1_re;
2087  A1_im += gT12_re * a2_im;
2088  A1_im += gT12_im * a2_re;
2089  spinorFloat B1_re = 0;
2090  B1_re += gT10_re * b0_re;
2091  B1_re -= gT10_im * b0_im;
2092  B1_re += gT11_re * b1_re;
2093  B1_re -= gT11_im * b1_im;
2094  B1_re += gT12_re * b2_re;
2095  B1_re -= gT12_im * b2_im;
2096  spinorFloat B1_im = 0;
2097  B1_im += gT10_re * b0_im;
2098  B1_im += gT10_im * b0_re;
2099  B1_im += gT11_re * b1_im;
2100  B1_im += gT11_im * b1_re;
2101  B1_im += gT12_re * b2_im;
2102  B1_im += gT12_im * b2_re;
2103 
2104  // multiply row 2
2105  spinorFloat A2_re = 0;
2106  A2_re += gT20_re * a0_re;
2107  A2_re -= gT20_im * a0_im;
2108  A2_re += gT21_re * a1_re;
2109  A2_re -= gT21_im * a1_im;
2110  A2_re += gT22_re * a2_re;
2111  A2_re -= gT22_im * a2_im;
2112  spinorFloat A2_im = 0;
2113  A2_im += gT20_re * a0_im;
2114  A2_im += gT20_im * a0_re;
2115  A2_im += gT21_re * a1_im;
2116  A2_im += gT21_im * a1_re;
2117  A2_im += gT22_re * a2_im;
2118  A2_im += gT22_im * a2_re;
2119  spinorFloat B2_re = 0;
2120  B2_re += gT20_re * b0_re;
2121  B2_re -= gT20_im * b0_im;
2122  B2_re += gT21_re * b1_re;
2123  B2_re -= gT21_im * b1_im;
2124  B2_re += gT22_re * b2_re;
2125  B2_re -= gT22_im * b2_im;
2126  spinorFloat B2_im = 0;
2127  B2_im += gT20_re * b0_im;
2128  B2_im += gT20_im * b0_re;
2129  B2_im += gT21_re * b1_im;
2130  B2_im += gT21_im * b1_re;
2131  B2_im += gT22_re * b2_im;
2132  B2_im += gT22_im * b2_re;
2133 
2134  o00_re += a*A0_re;
2135  o00_im += a*A0_im;
2136  o10_re += a*B0_re;
2137  o10_im += a*B0_im;
2138  o20_re += a*A0_im;
2139  o20_im -= a*A0_re;
2140  o30_re -= a*B0_im;
2141  o30_im += a*B0_re;
2142 
2143  o01_re += a*A1_re;
2144  o01_im += a*A1_im;
2145  o11_re += a*B1_re;
2146  o11_im += a*B1_im;
2147  o21_re += a*A1_im;
2148  o21_im -= a*A1_re;
2149  o31_re -= a*B1_im;
2150  o31_im += a*B1_re;
2151 
2152  o02_re += a*A2_re;
2153  o02_im += a*A2_im;
2154  o12_re += a*B2_re;
2155  o12_im += a*B2_im;
2156  o22_re += a*A2_im;
2157  o22_im -= a*A2_re;
2158  o32_re -= a*B2_im;
2159  o32_im += a*B2_re;
2160 
2161 }
2162 
2163 #ifdef MULTI_GPU
2164 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
2165  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
2166 #endif
2167 {
2168  // Projector P3-
2169  // 0 0 0 0
2170  // 0 0 0 0
2171  // 0 0 2 0
2172  // 0 0 0 2
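//
// In this basis P3- = 1 - gamma_4 is diagonal, so only the two lower
// spin components survive (with their factor of 2) and no
// upper-component reconstruction is needed after the multiply.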
2173 
2174 #ifdef MULTI_GPU
2175  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
2176  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2177 #else
2178  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
2179 #endif
2180 
2181  const int ga_idx = sid;
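// Forward t hop: the link is attached to the current site, so the
// gauge index is simply sid.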
2182 
2183  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2184  {
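// Temporal gauge fixing: away from the last time slice
// (ga_idx < X4X3X2X1hmX3X2X1h) the t-direction links are fixed to the
// identity, so the SU(3) multiply is skipped and the half spinor is
// copied straight through below.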
2185  spinorFloat a0_re, a0_im;
2186  spinorFloat a1_re, a1_im;
2187  spinorFloat a2_re, a2_im;
2188  spinorFloat b0_re, b0_im;
2189  spinorFloat b1_re, b1_im;
2190  spinorFloat b2_re, b2_im;
2191 
2192 #ifdef MULTI_GPU
2193  if (kernel_type == INTERIOR_KERNEL) {
2194 #endif
2195 
2196  // read spinor from device memory
2197  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2198 
2199  // project spinor into half spinors
2200  a0_re = +2*i20_re;
2201  a0_im = +2*i20_im;
2202  a1_re = +2*i21_re;
2203  a1_im = +2*i21_im;
2204  a2_re = +2*i22_re;
2205  a2_im = +2*i22_im;
2206  b0_re = +2*i30_re;
2207  b0_im = +2*i30_im;
2208  b1_re = +2*i31_re;
2209  b1_im = +2*i31_im;
2210  b2_re = +2*i32_re;
2211  b2_im = +2*i32_im;
2212 
2213 #ifdef MULTI_GPU
2214  } else {
2215 
2216  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2217  const int t_proj_scale = TPROJSCALE;
2218 
2219  // read half spinor from device memory
2220  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
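// The ghost half spinor was projected (and, by convention, rescaled) on
// the sending side; t_proj_scale restores the projector's factor-of-2
// normalization. The (SPINOR_HOP/2)*sp_stride_pad offset appears to
// select the half of the t-face buffer holding the P3- components.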
2221 
2222  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2223  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2224  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2225  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2226  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2227  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2228 
2229  }
2230 #endif // MULTI_GPU
2231 
2232  // identity gauge matrix
2233  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2234  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2235  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2236  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2237  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2238  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2239 
2240  o20_re += a*A0_re;
2241  o20_im += a*A0_im;
2242  o30_re += a*B0_re;
2243  o30_im += a*B0_im;
2244 
2245  o21_re += a*A1_re;
2246  o21_im += a*A1_im;
2247  o31_re += a*B1_re;
2248  o31_im += a*B1_im;
2249 
2250  o22_re += a*A2_re;
2251  o22_im += a*A2_im;
2252  o32_re += a*B2_re;
2253  o32_im += a*B2_im;
2254 
2255  } else {
2256  spinorFloat a0_re, a0_im;
2257  spinorFloat a1_re, a1_im;
2258  spinorFloat a2_re, a2_im;
2259  spinorFloat b0_re, b0_im;
2260  spinorFloat b1_re, b1_im;
2261  spinorFloat b2_re, b2_im;
2262 
2263 #ifdef MULTI_GPU
2264  if (kernel_type == INTERIOR_KERNEL) {
2265 #endif
2266 
2267  // read spinor from device memory
2268  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2269 
2270  // project spinor into half spinors
2271  a0_re = +2*i20_re;
2272  a0_im = +2*i20_im;
2273  a1_re = +2*i21_re;
2274  a1_im = +2*i21_im;
2275  a2_re = +2*i22_re;
2276  a2_im = +2*i22_im;
2277  b0_re = +2*i30_re;
2278  b0_im = +2*i30_im;
2279  b1_re = +2*i31_re;
2280  b1_im = +2*i31_im;
2281  b2_re = +2*i32_re;
2282  b2_im = +2*i32_im;
2283 
2284 #ifdef MULTI_GPU
2285  } else {
2286 
2287  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2288  const int t_proj_scale = TPROJSCALE;
2289 
2290  // read half spinor from device memory
2291  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
2292 
2293  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2294  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2295  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2296  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2297  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2298  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2299 
2300  }
2301 #endif // MULTI_GPU
2302 
2303  // read gauge matrix from device memory
2304  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
2305 
2306  // reconstruct gauge matrix
2307  RECONSTRUCT_GAUGE_MATRIX(6);
2308 
2309  // multiply row 0
2310  spinorFloat A0_re = 0;
2311  A0_re += g00_re * a0_re;
2312  A0_re -= g00_im * a0_im;
2313  A0_re += g01_re * a1_re;
2314  A0_re -= g01_im * a1_im;
2315  A0_re += g02_re * a2_re;
2316  A0_re -= g02_im * a2_im;
2317  spinorFloat A0_im = 0;
2318  A0_im += g00_re * a0_im;
2319  A0_im += g00_im * a0_re;
2320  A0_im += g01_re * a1_im;
2321  A0_im += g01_im * a1_re;
2322  A0_im += g02_re * a2_im;
2323  A0_im += g02_im * a2_re;
2324  spinorFloat B0_re = 0;
2325  B0_re += g00_re * b0_re;
2326  B0_re -= g00_im * b0_im;
2327  B0_re += g01_re * b1_re;
2328  B0_re -= g01_im * b1_im;
2329  B0_re += g02_re * b2_re;
2330  B0_re -= g02_im * b2_im;
2331  spinorFloat B0_im = 0;
2332  B0_im += g00_re * b0_im;
2333  B0_im += g00_im * b0_re;
2334  B0_im += g01_re * b1_im;
2335  B0_im += g01_im * b1_re;
2336  B0_im += g02_re * b2_im;
2337  B0_im += g02_im * b2_re;
2338 
2339  // multiply row 1
2340  spinorFloat A1_re = 0;
2341  A1_re += g10_re * a0_re;
2342  A1_re -= g10_im * a0_im;
2343  A1_re += g11_re * a1_re;
2344  A1_re -= g11_im * a1_im;
2345  A1_re += g12_re * a2_re;
2346  A1_re -= g12_im * a2_im;
2347  spinorFloat A1_im = 0;
2348  A1_im += g10_re * a0_im;
2349  A1_im += g10_im * a0_re;
2350  A1_im += g11_re * a1_im;
2351  A1_im += g11_im * a1_re;
2352  A1_im += g12_re * a2_im;
2353  A1_im += g12_im * a2_re;
2354  spinorFloat B1_re = 0;
2355  B1_re += g10_re * b0_re;
2356  B1_re -= g10_im * b0_im;
2357  B1_re += g11_re * b1_re;
2358  B1_re -= g11_im * b1_im;
2359  B1_re += g12_re * b2_re;
2360  B1_re -= g12_im * b2_im;
2361  spinorFloat B1_im = 0;
2362  B1_im += g10_re * b0_im;
2363  B1_im += g10_im * b0_re;
2364  B1_im += g11_re * b1_im;
2365  B1_im += g11_im * b1_re;
2366  B1_im += g12_re * b2_im;
2367  B1_im += g12_im * b2_re;
2368 
2369  // multiply row 2
2370  spinorFloat A2_re = 0;
2371  A2_re += g20_re * a0_re;
2372  A2_re -= g20_im * a0_im;
2373  A2_re += g21_re * a1_re;
2374  A2_re -= g21_im * a1_im;
2375  A2_re += g22_re * a2_re;
2376  A2_re -= g22_im * a2_im;
2377  spinorFloat A2_im = 0;
2378  A2_im += g20_re * a0_im;
2379  A2_im += g20_im * a0_re;
2380  A2_im += g21_re * a1_im;
2381  A2_im += g21_im * a1_re;
2382  A2_im += g22_re * a2_im;
2383  A2_im += g22_im * a2_re;
2384  spinorFloat B2_re = 0;
2385  B2_re += g20_re * b0_re;
2386  B2_re -= g20_im * b0_im;
2387  B2_re += g21_re * b1_re;
2388  B2_re -= g21_im * b1_im;
2389  B2_re += g22_re * b2_re;
2390  B2_re -= g22_im * b2_im;
2391  spinorFloat B2_im = 0;
2392  B2_im += g20_re * b0_im;
2393  B2_im += g20_im * b0_re;
2394  B2_im += g21_re * b1_im;
2395  B2_im += g21_im * b1_re;
2396  B2_im += g22_re * b2_im;
2397  B2_im += g22_im * b2_re;
2398 
2399  o20_re += a*A0_re;
2400  o20_im += a*A0_im;
2401  o30_re += a*B0_re;
2402  o30_im += a*B0_im;
2403 
2404  o21_re += a*A1_re;
2405  o21_im += a*A1_im;
2406  o31_re += a*B1_re;
2407  o31_im += a*B1_im;
2408 
2409  o22_re += a*A2_re;
2410  o22_im += a*A2_im;
2411  o32_re += a*B2_re;
2412  o32_im += a*B2_im;
2413 
2414  }
2415 }
2416 
2417 #ifdef MULTI_GPU
2418 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
2419  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
2420 #endif
2421 {
2422  // Projector P3+
2423  // 2 0 0 0
2424  // 0 2 0 0
2425  // 0 0 0 0
2426  // 0 0 0 0
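//
// P3+ = 1 + gamma_4 is likewise diagonal here: only the two upper spin
// components contribute, so this block updates o0* and o1* alone.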
2427 
2428 #ifdef MULTI_GPU
2429  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
2430  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
2431 #else
2432  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
2433 #endif
2434 
2435 #ifdef MULTI_GPU
2436  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx : Vh+face_idx);
2437 #else
2438  const int ga_idx = sp_idx;
2439 #endif
2440 
2441  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
2442  {
2443  spinorFloat a0_re, a0_im;
2444  spinorFloat a1_re, a1_im;
2445  spinorFloat a2_re, a2_im;
2446  spinorFloat b0_re, b0_im;
2447  spinorFloat b1_re, b1_im;
2448  spinorFloat b2_re, b2_im;
2449 
2450 #ifdef MULTI_GPU
2451  if (kernel_type == INTERIOR_KERNEL) {
2452 #endif
2453 
2454  // read spinor from device memory
2455  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2456 
2457  // project spinor into half spinors
2458  a0_re = +2*i00_re;
2459  a0_im = +2*i00_im;
2460  a1_re = +2*i01_re;
2461  a1_im = +2*i01_im;
2462  a2_re = +2*i02_re;
2463  a2_im = +2*i02_im;
2464  b0_re = +2*i10_re;
2465  b0_im = +2*i10_im;
2466  b1_re = +2*i11_re;
2467  b1_im = +2*i11_im;
2468  b2_re = +2*i12_re;
2469  b2_im = +2*i12_im;
2470 
2471 #ifdef MULTI_GPU
2472  } else {
2473 
2474  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2475  const int t_proj_scale = TPROJSCALE;
2476 
2477  // read half spinor from device memory
2478  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2479 
2480  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2481  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2482  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2483  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2484  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2485  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2486 
2487  }
2488 #endif // MULTI_GPU
2489 
2490  // identity gauge matrix
2491  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
2492  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
2493  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
2494  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
2495  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
2496  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
2497 
2498  o00_re += a*A0_re;
2499  o00_im += a*A0_im;
2500  o10_re += a*B0_re;
2501  o10_im += a*B0_im;
2502 
2503  o01_re += a*A1_re;
2504  o01_im += a*A1_im;
2505  o11_re += a*B1_re;
2506  o11_im += a*B1_im;
2507 
2508  o02_re += a*A2_re;
2509  o02_im += a*A2_im;
2510  o12_re += a*B2_re;
2511  o12_im += a*B2_im;
2512 
2513  } else {
2514  spinorFloat a0_re, a0_im;
2515  spinorFloat a1_re, a1_im;
2516  spinorFloat a2_re, a2_im;
2517  spinorFloat b0_re, b0_im;
2518  spinorFloat b1_re, b1_im;
2519  spinorFloat b2_re, b2_im;
2520 
2521 #ifdef MULTI_GPU
2522  if (kernel_type == INTERIOR_KERNEL) {
2523 #endif
2524 
2525  // read spinor from device memory
2526  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
2527 
2528  // project spinor into half spinors
2529  a0_re = +2*i00_re;
2530  a0_im = +2*i00_im;
2531  a1_re = +2*i01_re;
2532  a1_im = +2*i01_im;
2533  a2_re = +2*i02_re;
2534  a2_im = +2*i02_im;
2535  b0_re = +2*i10_re;
2536  b0_im = +2*i10_im;
2537  b1_re = +2*i11_re;
2538  b1_im = +2*i11_im;
2539  b2_re = +2*i12_re;
2540  b2_im = +2*i12_im;
2541 
2542 #ifdef MULTI_GPU
2543  } else {
2544 
2545  const int sp_stride_pad = ghostFace[static_cast<int>(kernel_type)];
2546  const int t_proj_scale = TPROJSCALE;
2547 
2548  // read half spinor from device memory
2549  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
2550 
2551  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
2552  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
2553  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
2554  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
2555  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
2556  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
2557 
2558  }
2559 #endif // MULTI_GPU
2560 
2561  // read gauge matrix from device memory
2562  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
2563 
2564  // reconstruct gauge matrix
2565  RECONSTRUCT_GAUGE_MATRIX(7);
2566 
2567  // multiply row 0
2568  spinorFloat A0_re = 0;
2569  A0_re += gT00_re * a0_re;
2570  A0_re -= gT00_im * a0_im;
2571  A0_re += gT01_re * a1_re;
2572  A0_re -= gT01_im * a1_im;
2573  A0_re += gT02_re * a2_re;
2574  A0_re -= gT02_im * a2_im;
2575  spinorFloat A0_im = 0;
2576  A0_im += gT00_re * a0_im;
2577  A0_im += gT00_im * a0_re;
2578  A0_im += gT01_re * a1_im;
2579  A0_im += gT01_im * a1_re;
2580  A0_im += gT02_re * a2_im;
2581  A0_im += gT02_im * a2_re;
2582  spinorFloat B0_re = 0;
2583  B0_re += gT00_re * b0_re;
2584  B0_re -= gT00_im * b0_im;
2585  B0_re += gT01_re * b1_re;
2586  B0_re -= gT01_im * b1_im;
2587  B0_re += gT02_re * b2_re;
2588  B0_re -= gT02_im * b2_im;
2589  spinorFloat B0_im = 0;
2590  B0_im += gT00_re * b0_im;
2591  B0_im += gT00_im * b0_re;
2592  B0_im += gT01_re * b1_im;
2593  B0_im += gT01_im * b1_re;
2594  B0_im += gT02_re * b2_im;
2595  B0_im += gT02_im * b2_re;
2596 
2597  // multiply row 1
2598  spinorFloat A1_re = 0;
2599  A1_re += gT10_re * a0_re;
2600  A1_re -= gT10_im * a0_im;
2601  A1_re += gT11_re * a1_re;
2602  A1_re -= gT11_im * a1_im;
2603  A1_re += gT12_re * a2_re;
2604  A1_re -= gT12_im * a2_im;
2605  spinorFloat A1_im = 0;
2606  A1_im += gT10_re * a0_im;
2607  A1_im += gT10_im * a0_re;
2608  A1_im += gT11_re * a1_im;
2609  A1_im += gT11_im * a1_re;
2610  A1_im += gT12_re * a2_im;
2611  A1_im += gT12_im * a2_re;
2612  spinorFloat B1_re = 0;
2613  B1_re += gT10_re * b0_re;
2614  B1_re -= gT10_im * b0_im;
2615  B1_re += gT11_re * b1_re;
2616  B1_re -= gT11_im * b1_im;
2617  B1_re += gT12_re * b2_re;
2618  B1_re -= gT12_im * b2_im;
2619  spinorFloat B1_im = 0;
2620  B1_im += gT10_re * b0_im;
2621  B1_im += gT10_im * b0_re;
2622  B1_im += gT11_re * b1_im;
2623  B1_im += gT11_im * b1_re;
2624  B1_im += gT12_re * b2_im;
2625  B1_im += gT12_im * b2_re;
2626 
2627  // multiply row 2
2628  spinorFloat A2_re = 0;
2629  A2_re += gT20_re * a0_re;
2630  A2_re -= gT20_im * a0_im;
2631  A2_re += gT21_re * a1_re;
2632  A2_re -= gT21_im * a1_im;
2633  A2_re += gT22_re * a2_re;
2634  A2_re -= gT22_im * a2_im;
2635  spinorFloat A2_im = 0;
2636  A2_im += gT20_re * a0_im;
2637  A2_im += gT20_im * a0_re;
2638  A2_im += gT21_re * a1_im;
2639  A2_im += gT21_im * a1_re;
2640  A2_im += gT22_re * a2_im;
2641  A2_im += gT22_im * a2_re;
2642  spinorFloat B2_re = 0;
2643  B2_re += gT20_re * b0_re;
2644  B2_re -= gT20_im * b0_im;
2645  B2_re += gT21_re * b1_re;
2646  B2_re -= gT21_im * b1_im;
2647  B2_re += gT22_re * b2_re;
2648  B2_re -= gT22_im * b2_im;
2649  spinorFloat B2_im = 0;
2650  B2_im += gT20_re * b0_im;
2651  B2_im += gT20_im * b0_re;
2652  B2_im += gT21_re * b1_im;
2653  B2_im += gT21_im * b1_re;
2654  B2_im += gT22_re * b2_im;
2655  B2_im += gT22_im * b2_re;
2656 
2657  o00_re += a*A0_re;
2658  o00_im += a*A0_im;
2659  o10_re += a*B0_re;
2660  o10_im += a*B0_im;
2661 
2662  o01_re += a*A1_re;
2663  o01_im += a*A1_im;
2664  o11_re += a*B1_re;
2665  o11_im += a*B1_im;
2666 
2667  o02_re += a*A2_re;
2668  o02_im += a*A2_im;
2669  o12_re += a*B2_re;
2670  o12_im += a*B2_im;
2671 
2672  }
2673 }
2674 
2675 
2676 
2677 // write spinor field back to device memory
2678 WRITE_SPINOR(param.sp_stride);
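// o00..o32 now hold the fully accumulated output spinor; WRITE_SPINOR
// stores it with stride param.sp_stride in the precision and layout
// selected by the spinorFloat macros.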
2679 
2680 // undefine to prevent warning when precision is changed
2681 #undef spinorFloat
2682 #undef WRITE_SPINOR_SHARED
2683 #undef READ_SPINOR_SHARED
2684 #undef SHARED_STRIDE
2685 
2686 #undef g00_re
2687 #undef g00_im
2688 #undef g01_re
2689 #undef g01_im
2690 #undef g02_re
2691 #undef g02_im
2692 #undef g10_re
2693 #undef g10_im
2694 #undef g11_re
2695 #undef g11_im
2696 #undef g12_re
2697 #undef g12_im
2698 #undef g20_re
2699 #undef g20_im
2700 #undef g21_re
2701 #undef g21_im
2702 #undef g22_re
2703 #undef g22_im
2704 
2705 #undef i00_re
2706 #undef i00_im
2707 #undef i01_re
2708 #undef i01_im
2709 #undef i02_re
2710 #undef i02_im
2711 #undef i10_re
2712 #undef i10_im
2713 #undef i11_re
2714 #undef i11_im
2715 #undef i12_re
2716 #undef i12_im
2717 #undef i20_re
2718 #undef i20_im
2719 #undef i21_re
2720 #undef i21_im
2721 #undef i22_re
2722 #undef i22_im
2723 #undef i30_re
2724 #undef i30_im
2725 #undef i31_re
2726 #undef i31_im
2727 #undef i32_re
2728 #undef i32_im
2729 
2730 #undef acc00_re
2731 #undef acc00_im
2732 #undef acc01_re
2733 #undef acc01_im
2734 #undef acc02_re
2735 #undef acc02_im
2736 #undef acc10_re
2737 #undef acc10_im
2738 #undef acc11_re
2739 #undef acc11_im
2740 #undef acc12_re
2741 #undef acc12_im
2742 #undef acc20_re
2743 #undef acc20_im
2744 #undef acc21_re
2745 #undef acc21_im
2746 #undef acc22_re
2747 #undef acc22_im
2748 #undef acc30_re
2749 #undef acc30_im
2750 #undef acc31_re
2751 #undef acc31_im
2752 #undef acc32_re
2753 #undef acc32_im
2754 
2755 #undef c00_00_re
2756 #undef c01_01_re
2757 #undef c02_02_re
2758 #undef c10_10_re
2759 #undef c11_11_re
2760 #undef c12_12_re
2761 #undef c01_00_re
2762 #undef c01_00_im
2763 #undef c02_00_re
2764 #undef c02_00_im
2765 #undef c10_00_re
2766 #undef c10_00_im
2767 #undef c11_00_re
2768 #undef c11_00_im
2769 #undef c12_00_re
2770 #undef c12_00_im
2771 #undef c02_01_re
2772 #undef c02_01_im
2773 #undef c10_01_re
2774 #undef c10_01_im
2775 #undef c11_01_re
2776 #undef c11_01_im
2777 #undef c12_01_re
2778 #undef c12_01_im
2779 #undef c10_02_re
2780 #undef c10_02_im
2781 #undef c11_02_re
2782 #undef c11_02_im
2783 #undef c12_02_re
2784 #undef c12_02_im
2785 #undef c11_10_re
2786 #undef c11_10_im
2787 #undef c12_10_re
2788 #undef c12_10_im
2789 #undef c12_11_re
2790 #undef c12_11_im
2791 
2792 #undef o00_re
2793 #undef o00_im
2794 #undef o01_re
2795 #undef o01_im
2796 #undef o02_re
2797 #undef o02_im
2798 #undef o10_re
2799 #undef o10_im
2800 #undef o11_re
2801 #undef o11_im
2802 #undef o12_re
2803 #undef o12_im
2804 #undef o20_re
2805 #undef o20_im
2806 #undef o21_re
2807 #undef o21_im
2808 #undef o22_re
2809 #undef o22_im
2810 #undef o30_re
2811 #undef o30_im
2812 #undef o31_re
2813 #undef o31_im
2814 #undef o32_re
2815 #undef o32_im
2816 
2817 #undef VOLATILE