QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tmc_fused_exterior_dslash_g80_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define i00_re I0.x
67 #define i00_im I0.y
68 #define i01_re I0.z
69 #define i01_im I0.w
70 #define i02_re I1.x
71 #define i02_im I1.y
72 #define i10_re I1.z
73 #define i10_im I1.w
74 #define i11_re I2.x
75 #define i11_im I2.y
76 #define i12_re I2.z
77 #define i12_im I2.w
78 #define i20_re I3.x
79 #define i20_im I3.y
80 #define i21_re I3.z
81 #define i21_im I3.w
82 #define i22_re I4.x
83 #define i22_im I4.y
84 #define i30_re I4.z
85 #define i30_im I4.w
86 #define i31_re I5.x
87 #define i31_im I5.y
88 #define i32_re I5.z
89 #define i32_im I5.w
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
115 
116 // gauge link
117 #ifdef GAUGE_FLOAT2
118 #define g00_re G0.x
119 #define g00_im G0.y
120 #define g01_re G1.x
121 #define g01_im G1.y
122 #define g02_re G2.x
123 #define g02_im G2.y
124 #define g10_re G3.x
125 #define g10_im G3.y
126 #define g11_re G4.x
127 #define g11_im G4.y
128 #define g12_re G5.x
129 #define g12_im G5.y
130 #define g20_re G6.x
131 #define g20_im G6.y
132 #define g21_re G7.x
133 #define g21_im G7.y
134 #define g22_re G8.x
135 #define g22_im G8.y
136 
137 #else
138 #define g00_re G0.x
139 #define g00_im G0.y
140 #define g01_re G0.z
141 #define g01_im G0.w
142 #define g02_re G1.x
143 #define g02_im G1.y
144 #define g10_re G1.z
145 #define g10_im G1.w
146 #define g11_re G2.x
147 #define g11_im G2.y
148 #define g12_re G2.z
149 #define g12_im G2.w
150 #define g20_re G3.x
151 #define g20_im G3.y
152 #define g21_re G3.z
153 #define g21_im G3.w
154 #define g22_re G4.x
155 #define g22_im G4.y
156 
157 #endif // GAUGE_DOUBLE
158 
159 // conjugated gauge link
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
178 
179 // first chiral block of clover term
180 #ifdef CLOVER_DOUBLE
181 #define c00_00_re C0.x
182 #define c01_01_re C0.y
183 #define c02_02_re C1.x
184 #define c10_10_re C1.y
185 #define c11_11_re C2.x
186 #define c12_12_re C2.y
187 #define c01_00_re C3.x
188 #define c01_00_im C3.y
189 #define c02_00_re C4.x
190 #define c02_00_im C4.y
191 #define c10_00_re C5.x
192 #define c10_00_im C5.y
193 #define c11_00_re C6.x
194 #define c11_00_im C6.y
195 #define c12_00_re C7.x
196 #define c12_00_im C7.y
197 #define c02_01_re C8.x
198 #define c02_01_im C8.y
199 #define c10_01_re C9.x
200 #define c10_01_im C9.y
201 #define c11_01_re C10.x
202 #define c11_01_im C10.y
203 #define c12_01_re C11.x
204 #define c12_01_im C11.y
205 #define c10_02_re C12.x
206 #define c10_02_im C12.y
207 #define c11_02_re C13.x
208 #define c11_02_im C13.y
209 #define c12_02_re C14.x
210 #define c12_02_im C14.y
211 #define c11_10_re C15.x
212 #define c11_10_im C15.y
213 #define c12_10_re C16.x
214 #define c12_10_im C16.y
215 #define c12_11_re C17.x
216 #define c12_11_im C17.y
217 #else
218 #define c00_00_re C0.x
219 #define c01_01_re C0.y
220 #define c02_02_re C0.z
221 #define c10_10_re C0.w
222 #define c11_11_re C1.x
223 #define c12_12_re C1.y
224 #define c01_00_re C1.z
225 #define c01_00_im C1.w
226 #define c02_00_re C2.x
227 #define c02_00_im C2.y
228 #define c10_00_re C2.z
229 #define c10_00_im C2.w
230 #define c11_00_re C3.x
231 #define c11_00_im C3.y
232 #define c12_00_re C3.z
233 #define c12_00_im C3.w
234 #define c02_01_re C4.x
235 #define c02_01_im C4.y
236 #define c10_01_re C4.z
237 #define c10_01_im C4.w
238 #define c11_01_re C5.x
239 #define c11_01_im C5.y
240 #define c12_01_re C5.z
241 #define c12_01_im C5.w
242 #define c10_02_re C6.x
243 #define c10_02_im C6.y
244 #define c11_02_re C6.z
245 #define c11_02_im C6.w
246 #define c12_02_re C7.x
247 #define c12_02_im C7.y
248 #define c11_10_re C7.z
249 #define c11_10_im C7.w
250 #define c12_10_re C8.x
251 #define c12_10_im C8.y
252 #define c12_11_re C8.z
253 #define c12_11_im C8.w
254 #endif // CLOVER_DOUBLE
255 
256 #define c00_01_re (+c01_00_re)
257 #define c00_01_im (-c01_00_im)
258 #define c00_02_re (+c02_00_re)
259 #define c00_02_im (-c02_00_im)
260 #define c01_02_re (+c02_01_re)
261 #define c01_02_im (-c02_01_im)
262 #define c00_10_re (+c10_00_re)
263 #define c00_10_im (-c10_00_im)
264 #define c01_10_re (+c10_01_re)
265 #define c01_10_im (-c10_01_im)
266 #define c02_10_re (+c10_02_re)
267 #define c02_10_im (-c10_02_im)
268 #define c00_11_re (+c11_00_re)
269 #define c00_11_im (-c11_00_im)
270 #define c01_11_re (+c11_01_re)
271 #define c01_11_im (-c11_01_im)
272 #define c02_11_re (+c11_02_re)
273 #define c02_11_im (-c11_02_im)
274 #define c10_11_re (+c11_10_re)
275 #define c10_11_im (-c11_10_im)
276 #define c00_12_re (+c12_00_re)
277 #define c00_12_im (-c12_00_im)
278 #define c01_12_re (+c12_01_re)
279 #define c01_12_im (-c12_01_im)
280 #define c02_12_re (+c12_02_re)
281 #define c02_12_im (-c12_02_im)
282 #define c10_12_re (+c12_10_re)
283 #define c10_12_im (-c12_10_im)
284 #define c11_12_re (+c12_11_re)
285 #define c11_12_im (-c12_11_im)
286 
287 // second chiral block of clover term (reuses C0,...,C9)
288 #define c20_20_re c00_00_re
289 #define c21_20_re c01_00_re
290 #define c21_20_im c01_00_im
291 #define c22_20_re c02_00_re
292 #define c22_20_im c02_00_im
293 #define c30_20_re c10_00_re
294 #define c30_20_im c10_00_im
295 #define c31_20_re c11_00_re
296 #define c31_20_im c11_00_im
297 #define c32_20_re c12_00_re
298 #define c32_20_im c12_00_im
299 #define c20_21_re c00_01_re
300 #define c20_21_im c00_01_im
301 #define c21_21_re c01_01_re
302 #define c22_21_re c02_01_re
303 #define c22_21_im c02_01_im
304 #define c30_21_re c10_01_re
305 #define c30_21_im c10_01_im
306 #define c31_21_re c11_01_re
307 #define c31_21_im c11_01_im
308 #define c32_21_re c12_01_re
309 #define c32_21_im c12_01_im
310 #define c20_22_re c00_02_re
311 #define c20_22_im c00_02_im
312 #define c21_22_re c01_02_re
313 #define c21_22_im c01_02_im
314 #define c22_22_re c02_02_re
315 #define c30_22_re c10_02_re
316 #define c30_22_im c10_02_im
317 #define c31_22_re c11_02_re
318 #define c31_22_im c11_02_im
319 #define c32_22_re c12_02_re
320 #define c32_22_im c12_02_im
321 #define c20_30_re c00_10_re
322 #define c20_30_im c00_10_im
323 #define c21_30_re c01_10_re
324 #define c21_30_im c01_10_im
325 #define c22_30_re c02_10_re
326 #define c22_30_im c02_10_im
327 #define c30_30_re c10_10_re
328 #define c31_30_re c11_10_re
329 #define c31_30_im c11_10_im
330 #define c32_30_re c12_10_re
331 #define c32_30_im c12_10_im
332 #define c20_31_re c00_11_re
333 #define c20_31_im c00_11_im
334 #define c21_31_re c01_11_re
335 #define c21_31_im c01_11_im
336 #define c22_31_re c02_11_re
337 #define c22_31_im c02_11_im
338 #define c30_31_re c10_11_re
339 #define c30_31_im c10_11_im
340 #define c31_31_re c11_11_re
341 #define c32_31_re c12_11_re
342 #define c32_31_im c12_11_im
343 #define c20_32_re c00_12_re
344 #define c20_32_im c00_12_im
345 #define c21_32_re c01_12_re
346 #define c21_32_im c01_12_im
347 #define c22_32_re c02_12_re
348 #define c22_32_im c02_12_im
349 #define c30_32_re c10_12_re
350 #define c30_32_im c10_12_im
351 #define c31_32_re c11_12_re
352 #define c31_32_im c11_12_im
353 #define c32_32_re c12_12_re
354 
355 
356 // first chiral block of inverted clover term
357 #ifdef CLOVER_DOUBLE
358 #define cinv00_00_re C0.x
359 #define cinv01_01_re C0.y
360 #define cinv02_02_re C1.x
361 #define cinv10_10_re C1.y
362 #define cinv11_11_re C2.x
363 #define cinv12_12_re C2.y
364 #define cinv01_00_re C3.x
365 #define cinv01_00_im C3.y
366 #define cinv02_00_re C4.x
367 #define cinv02_00_im C4.y
368 #define cinv10_00_re C5.x
369 #define cinv10_00_im C5.y
370 #define cinv11_00_re C6.x
371 #define cinv11_00_im C6.y
372 #define cinv12_00_re C7.x
373 #define cinv12_00_im C7.y
374 #define cinv02_01_re C8.x
375 #define cinv02_01_im C8.y
376 #define cinv10_01_re C9.x
377 #define cinv10_01_im C9.y
378 #define cinv11_01_re C10.x
379 #define cinv11_01_im C10.y
380 #define cinv12_01_re C11.x
381 #define cinv12_01_im C11.y
382 #define cinv10_02_re C12.x
383 #define cinv10_02_im C12.y
384 #define cinv11_02_re C13.x
385 #define cinv11_02_im C13.y
386 #define cinv12_02_re C14.x
387 #define cinv12_02_im C14.y
388 #define cinv11_10_re C15.x
389 #define cinv11_10_im C15.y
390 #define cinv12_10_re C16.x
391 #define cinv12_10_im C16.y
392 #define cinv12_11_re C17.x
393 #define cinv12_11_im C17.y
394 #else
395 #define cinv00_00_re C0.x
396 #define cinv01_01_re C0.y
397 #define cinv02_02_re C0.z
398 #define cinv10_10_re C0.w
399 #define cinv11_11_re C1.x
400 #define cinv12_12_re C1.y
401 #define cinv01_00_re C1.z
402 #define cinv01_00_im C1.w
403 #define cinv02_00_re C2.x
404 #define cinv02_00_im C2.y
405 #define cinv10_00_re C2.z
406 #define cinv10_00_im C2.w
407 #define cinv11_00_re C3.x
408 #define cinv11_00_im C3.y
409 #define cinv12_00_re C3.z
410 #define cinv12_00_im C3.w
411 #define cinv02_01_re C4.x
412 #define cinv02_01_im C4.y
413 #define cinv10_01_re C4.z
414 #define cinv10_01_im C4.w
415 #define cinv11_01_re C5.x
416 #define cinv11_01_im C5.y
417 #define cinv12_01_re C5.z
418 #define cinv12_01_im C5.w
419 #define cinv10_02_re C6.x
420 #define cinv10_02_im C6.y
421 #define cinv11_02_re C6.z
422 #define cinv11_02_im C6.w
423 #define cinv12_02_re C7.x
424 #define cinv12_02_im C7.y
425 #define cinv11_10_re C7.z
426 #define cinv11_10_im C7.w
427 #define cinv12_10_re C8.x
428 #define cinv12_10_im C8.y
429 #define cinv12_11_re C8.z
430 #define cinv12_11_im C8.w
431 #endif // CLOVER_DOUBLE
432 
433 #define cinv00_01_re (+cinv01_00_re)
434 #define cinv00_01_im (-cinv01_00_im)
435 #define cinv00_02_re (+cinv02_00_re)
436 #define cinv00_02_im (-cinv02_00_im)
437 #define cinv01_02_re (+cinv02_01_re)
438 #define cinv01_02_im (-cinv02_01_im)
439 #define cinv00_10_re (+cinv10_00_re)
440 #define cinv00_10_im (-cinv10_00_im)
441 #define cinv01_10_re (+cinv10_01_re)
442 #define cinv01_10_im (-cinv10_01_im)
443 #define cinv02_10_re (+cinv10_02_re)
444 #define cinv02_10_im (-cinv10_02_im)
445 #define cinv00_11_re (+cinv11_00_re)
446 #define cinv00_11_im (-cinv11_00_im)
447 #define cinv01_11_re (+cinv11_01_re)
448 #define cinv01_11_im (-cinv11_01_im)
449 #define cinv02_11_re (+cinv11_02_re)
450 #define cinv02_11_im (-cinv11_02_im)
451 #define cinv10_11_re (+cinv11_10_re)
452 #define cinv10_11_im (-cinv11_10_im)
453 #define cinv00_12_re (+cinv12_00_re)
454 #define cinv00_12_im (-cinv12_00_im)
455 #define cinv01_12_re (+cinv12_01_re)
456 #define cinv01_12_im (-cinv12_01_im)
457 #define cinv02_12_re (+cinv12_02_re)
458 #define cinv02_12_im (-cinv12_02_im)
459 #define cinv10_12_re (+cinv12_10_re)
460 #define cinv10_12_im (-cinv12_10_im)
461 #define cinv11_12_re (+cinv12_11_re)
462 #define cinv11_12_im (-cinv12_11_im)
463 
464 // second chiral block of inverted clover term (reuses C0,...,C9)
465 #define cinv20_20_re cinv00_00_re
466 #define cinv21_20_re cinv01_00_re
467 #define cinv21_20_im cinv01_00_im
468 #define cinv22_20_re cinv02_00_re
469 #define cinv22_20_im cinv02_00_im
470 #define cinv30_20_re cinv10_00_re
471 #define cinv30_20_im cinv10_00_im
472 #define cinv31_20_re cinv11_00_re
473 #define cinv31_20_im cinv11_00_im
474 #define cinv32_20_re cinv12_00_re
475 #define cinv32_20_im cinv12_00_im
476 #define cinv20_21_re cinv00_01_re
477 #define cinv20_21_im cinv00_01_im
478 #define cinv21_21_re cinv01_01_re
479 #define cinv22_21_re cinv02_01_re
480 #define cinv22_21_im cinv02_01_im
481 #define cinv30_21_re cinv10_01_re
482 #define cinv30_21_im cinv10_01_im
483 #define cinv31_21_re cinv11_01_re
484 #define cinv31_21_im cinv11_01_im
485 #define cinv32_21_re cinv12_01_re
486 #define cinv32_21_im cinv12_01_im
487 #define cinv20_22_re cinv00_02_re
488 #define cinv20_22_im cinv00_02_im
489 #define cinv21_22_re cinv01_02_re
490 #define cinv21_22_im cinv01_02_im
491 #define cinv22_22_re cinv02_02_re
492 #define cinv30_22_re cinv10_02_re
493 #define cinv30_22_im cinv10_02_im
494 #define cinv31_22_re cinv11_02_re
495 #define cinv31_22_im cinv11_02_im
496 #define cinv32_22_re cinv12_02_re
497 #define cinv32_22_im cinv12_02_im
498 #define cinv20_30_re cinv00_10_re
499 #define cinv20_30_im cinv00_10_im
500 #define cinv21_30_re cinv01_10_re
501 #define cinv21_30_im cinv01_10_im
502 #define cinv22_30_re cinv02_10_re
503 #define cinv22_30_im cinv02_10_im
504 #define cinv30_30_re cinv10_10_re
505 #define cinv31_30_re cinv11_10_re
506 #define cinv31_30_im cinv11_10_im
507 #define cinv32_30_re cinv12_10_re
508 #define cinv32_30_im cinv12_10_im
509 #define cinv20_31_re cinv00_11_re
510 #define cinv20_31_im cinv00_11_im
511 #define cinv21_31_re cinv01_11_re
512 #define cinv21_31_im cinv01_11_im
513 #define cinv22_31_re cinv02_11_re
514 #define cinv22_31_im cinv02_11_im
515 #define cinv30_31_re cinv10_11_re
516 #define cinv30_31_im cinv10_11_im
517 #define cinv31_31_re cinv11_11_re
518 #define cinv32_31_re cinv12_11_re
519 #define cinv32_31_im cinv12_11_im
520 #define cinv20_32_re cinv00_12_re
521 #define cinv20_32_im cinv00_12_im
522 #define cinv21_32_re cinv01_12_re
523 #define cinv21_32_im cinv01_12_im
524 #define cinv22_32_re cinv02_12_re
525 #define cinv22_32_im cinv02_12_im
526 #define cinv30_32_re cinv10_12_re
527 #define cinv30_32_im cinv10_12_im
528 #define cinv31_32_re cinv11_12_re
529 #define cinv31_32_im cinv11_12_im
530 #define cinv32_32_re cinv12_12_re
531 
532 
533 
534 // declare C## here and use ASSN below instead of READ
535 #ifdef CLOVER_DOUBLE
536 double2 C0;
537 double2 C1;
538 double2 C2;
539 double2 C3;
540 double2 C4;
541 double2 C5;
542 double2 C6;
543 double2 C7;
544 double2 C8;
545 double2 C9;
546 double2 C10;
547 double2 C11;
548 double2 C12;
549 double2 C13;
550 double2 C14;
551 double2 C15;
552 double2 C16;
553 double2 C17;
554 #else
555 float4 C0;
556 float4 C1;
557 float4 C2;
558 float4 C3;
559 float4 C4;
560 float4 C5;
561 float4 C6;
562 float4 C7;
563 float4 C8;
564 
565 #if (DD_PREC==2)
566 float K;
567 #endif
568 
569 #endif // CLOVER_DOUBLE
570 // output spinor
571 #define o00_re s[0*SHARED_STRIDE]
572 #define o00_im s[1*SHARED_STRIDE]
573 #define o01_re s[2*SHARED_STRIDE]
574 #define o01_im s[3*SHARED_STRIDE]
575 #define o02_re s[4*SHARED_STRIDE]
576 #define o02_im s[5*SHARED_STRIDE]
577 #define o10_re s[6*SHARED_STRIDE]
578 #define o10_im s[7*SHARED_STRIDE]
579 #define o11_re s[8*SHARED_STRIDE]
580 #define o11_im s[9*SHARED_STRIDE]
581 #define o12_re s[10*SHARED_STRIDE]
582 #define o12_im s[11*SHARED_STRIDE]
583 #define o20_re s[12*SHARED_STRIDE]
584 #define o20_im s[13*SHARED_STRIDE]
585 #define o21_re s[14*SHARED_STRIDE]
586 #define o21_im s[15*SHARED_STRIDE]
587 #define o22_re s[16*SHARED_STRIDE]
588 #define o22_im s[17*SHARED_STRIDE]
589 #define o30_re s[18*SHARED_STRIDE]
595 
596 #ifdef SPINOR_DOUBLE
597 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
598 #else
599 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
600 #endif
601 
602 extern __shared__ char s_data[];
603 
605  + (threadIdx.x % SHARED_STRIDE);
606 
607 #include "read_gauge.h"
608 #include "io_spinor.h"
609 #include "read_clover.h"
610 #include "tmc_core.h"
611 
612 int x1, x2, x3, x4;
613 int X;
614 
615 #if (DD_PREC==2) // half precision
616 int sp_norm_idx;
617 #endif // half precision
618 
619 int sid;
620 
621 int dim;
622 int face_idx;
623 int Y[4] = {X1,X2,X3,X4};
624 int faceVolume[4];
625 faceVolume[0] = (X2*X3*X4)>>1;
626 faceVolume[1] = (X1*X3*X4)>>1;
627 faceVolume[2] = (X1*X2*X4)>>1;
628 faceVolume[3] = (X1*X2*X3)>>1;
629 
630  sid = blockIdx.x*blockDim.x + threadIdx.x;
631  if (sid >= param.threads) return;
632 
633 
634  dim = dimFromFaceIndex(sid, param); // sid is also modified
635 
636  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
637  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
638  face_idx = sid - face_num*face_volume; // index into the respective face
639 
640 
641  const int dims[] = {X1, X2, X3, X4};
642  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
643 
644  bool active = false;
645  for(int dir=0; dir<4; ++dir){
646  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
647  }
648  if(!active) return;
649 
650 
651  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
652 
653  o00_re = i00_re; o00_im = i00_im;
654  o01_re = i01_re; o01_im = i01_im;
655  o02_re = i02_re; o02_im = i02_im;
656  o10_re = i10_re; o10_im = i10_im;
657  o11_re = i11_re; o11_im = i11_im;
658  o12_re = i12_re; o12_im = i12_im;
659  o20_re = i20_re; o20_im = i20_im;
660  o21_re = i21_re; o21_im = i21_im;
661  o22_re = i22_re; o22_im = i22_im;
662  o30_re = i30_re; o30_im = i30_im;
663  o31_re = i31_re; o31_im = i31_im;
664  o32_re = i32_re; o32_im = i32_im;
665 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
666 {
667  // Projector P0-
668  // 1 0 0 -i
669  // 0 1 -i 0
670  // 0 i 1 0
671  // i 0 0 1
672 
673  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
674  const int sp_idx = face_idx + param.ghostOffset[0];
675 #if (DD_PREC==2)
676  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
677 #endif
678 
679  const int ga_idx = sid;
680 
687 
688 
689  const int sp_stride_pad = ghostFace[0];
690 
691  // read half spinor from device memory
692  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
693 
694  a0_re = i00_re; a0_im = i00_im;
695  a1_re = i01_re; a1_im = i01_im;
696  a2_re = i02_re; a2_im = i02_im;
697  b0_re = i10_re; b0_im = i10_im;
698  b1_re = i11_re; b1_im = i11_im;
699  b2_re = i12_re; b2_im = i12_im;
700 
701  // read gauge matrix from device memory
702  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
703 
704  // reconstruct gauge matrix
706 
707  // multiply row 0
708  spinorFloat A0_re = 0;
709  A0_re += g00_re * a0_re;
710  A0_re -= g00_im * a0_im;
711  A0_re += g01_re * a1_re;
712  A0_re -= g01_im * a1_im;
713  A0_re += g02_re * a2_re;
714  A0_re -= g02_im * a2_im;
715  spinorFloat A0_im = 0;
716  A0_im += g00_re * a0_im;
717  A0_im += g00_im * a0_re;
718  A0_im += g01_re * a1_im;
719  A0_im += g01_im * a1_re;
720  A0_im += g02_re * a2_im;
721  A0_im += g02_im * a2_re;
722  spinorFloat B0_re = 0;
723  B0_re += g00_re * b0_re;
724  B0_re -= g00_im * b0_im;
725  B0_re += g01_re * b1_re;
726  B0_re -= g01_im * b1_im;
727  B0_re += g02_re * b2_re;
728  B0_re -= g02_im * b2_im;
729  spinorFloat B0_im = 0;
730  B0_im += g00_re * b0_im;
731  B0_im += g00_im * b0_re;
732  B0_im += g01_re * b1_im;
733  B0_im += g01_im * b1_re;
734  B0_im += g02_re * b2_im;
735  B0_im += g02_im * b2_re;
736 
737  // multiply row 1
738  spinorFloat A1_re = 0;
739  A1_re += g10_re * a0_re;
740  A1_re -= g10_im * a0_im;
741  A1_re += g11_re * a1_re;
742  A1_re -= g11_im * a1_im;
743  A1_re += g12_re * a2_re;
744  A1_re -= g12_im * a2_im;
745  spinorFloat A1_im = 0;
746  A1_im += g10_re * a0_im;
747  A1_im += g10_im * a0_re;
748  A1_im += g11_re * a1_im;
749  A1_im += g11_im * a1_re;
750  A1_im += g12_re * a2_im;
751  A1_im += g12_im * a2_re;
752  spinorFloat B1_re = 0;
753  B1_re += g10_re * b0_re;
754  B1_re -= g10_im * b0_im;
755  B1_re += g11_re * b1_re;
756  B1_re -= g11_im * b1_im;
757  B1_re += g12_re * b2_re;
758  B1_re -= g12_im * b2_im;
759  spinorFloat B1_im = 0;
760  B1_im += g10_re * b0_im;
761  B1_im += g10_im * b0_re;
762  B1_im += g11_re * b1_im;
763  B1_im += g11_im * b1_re;
764  B1_im += g12_re * b2_im;
765  B1_im += g12_im * b2_re;
766 
767  // multiply row 2
768  spinorFloat A2_re = 0;
769  A2_re += g20_re * a0_re;
770  A2_re -= g20_im * a0_im;
771  A2_re += g21_re * a1_re;
772  A2_re -= g21_im * a1_im;
773  A2_re += g22_re * a2_re;
774  A2_re -= g22_im * a2_im;
775  spinorFloat A2_im = 0;
776  A2_im += g20_re * a0_im;
777  A2_im += g20_im * a0_re;
778  A2_im += g21_re * a1_im;
779  A2_im += g21_im * a1_re;
780  A2_im += g22_re * a2_im;
781  A2_im += g22_im * a2_re;
782  spinorFloat B2_re = 0;
783  B2_re += g20_re * b0_re;
784  B2_re -= g20_im * b0_im;
785  B2_re += g21_re * b1_re;
786  B2_re -= g21_im * b1_im;
787  B2_re += g22_re * b2_re;
788  B2_re -= g22_im * b2_im;
789  spinorFloat B2_im = 0;
790  B2_im += g20_re * b0_im;
791  B2_im += g20_im * b0_re;
792  B2_im += g21_re * b1_im;
793  B2_im += g21_im * b1_re;
794  B2_im += g22_re * b2_im;
795  B2_im += g22_im * b2_re;
796 
797  o00_re += A0_re;
798  o00_im += A0_im;
799  o10_re += B0_re;
800  o10_im += B0_im;
801  o20_re -= B0_im;
802  o20_im += B0_re;
803  o30_re -= A0_im;
804  o30_im += A0_re;
805 
806  o01_re += A1_re;
807  o01_im += A1_im;
808  o11_re += B1_re;
809  o11_im += B1_im;
810  o21_re -= B1_im;
811  o21_im += B1_re;
812  o31_re -= A1_im;
813  o31_im += A1_re;
814 
815  o02_re += A2_re;
816  o02_im += A2_im;
817  o12_re += B2_re;
818  o12_im += B2_im;
819  o22_re -= B2_im;
820  o22_im += B2_re;
821  o32_re -= A2_im;
822  o32_im += A2_re;
823 
824 }
825 
826 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
827 {
828  // Projector P0+
829  // 1 0 0 i
830  // 0 1 i 0
831  // 0 -i 1 0
832  // -i 0 0 1
833 
834  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
835  const int sp_idx = face_idx + param.ghostOffset[0];
836 #if (DD_PREC==2)
837  sp_norm_idx = face_idx + param.ghostNormOffset[0];
838 #endif
839 
840  const int ga_idx = Vh+face_idx;
841 
848 
849 
850  const int sp_stride_pad = ghostFace[0];
851 
852  // read half spinor from device memory
853  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
854 
855  a0_re = i00_re; a0_im = i00_im;
856  a1_re = i01_re; a1_im = i01_im;
857  a2_re = i02_re; a2_im = i02_im;
858  b0_re = i10_re; b0_im = i10_im;
859  b1_re = i11_re; b1_im = i11_im;
860  b2_re = i12_re; b2_im = i12_im;
861 
862  // read gauge matrix from device memory
863  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
864 
865  // reconstruct gauge matrix
867 
868  // multiply row 0
869  spinorFloat A0_re = 0;
870  A0_re += gT00_re * a0_re;
871  A0_re -= gT00_im * a0_im;
872  A0_re += gT01_re * a1_re;
873  A0_re -= gT01_im * a1_im;
874  A0_re += gT02_re * a2_re;
875  A0_re -= gT02_im * a2_im;
876  spinorFloat A0_im = 0;
877  A0_im += gT00_re * a0_im;
878  A0_im += gT00_im * a0_re;
879  A0_im += gT01_re * a1_im;
880  A0_im += gT01_im * a1_re;
881  A0_im += gT02_re * a2_im;
882  A0_im += gT02_im * a2_re;
883  spinorFloat B0_re = 0;
884  B0_re += gT00_re * b0_re;
885  B0_re -= gT00_im * b0_im;
886  B0_re += gT01_re * b1_re;
887  B0_re -= gT01_im * b1_im;
888  B0_re += gT02_re * b2_re;
889  B0_re -= gT02_im * b2_im;
890  spinorFloat B0_im = 0;
891  B0_im += gT00_re * b0_im;
892  B0_im += gT00_im * b0_re;
893  B0_im += gT01_re * b1_im;
894  B0_im += gT01_im * b1_re;
895  B0_im += gT02_re * b2_im;
896  B0_im += gT02_im * b2_re;
897 
898  // multiply row 1
899  spinorFloat A1_re = 0;
900  A1_re += gT10_re * a0_re;
901  A1_re -= gT10_im * a0_im;
902  A1_re += gT11_re * a1_re;
903  A1_re -= gT11_im * a1_im;
904  A1_re += gT12_re * a2_re;
905  A1_re -= gT12_im * a2_im;
906  spinorFloat A1_im = 0;
907  A1_im += gT10_re * a0_im;
908  A1_im += gT10_im * a0_re;
909  A1_im += gT11_re * a1_im;
910  A1_im += gT11_im * a1_re;
911  A1_im += gT12_re * a2_im;
912  A1_im += gT12_im * a2_re;
913  spinorFloat B1_re = 0;
914  B1_re += gT10_re * b0_re;
915  B1_re -= gT10_im * b0_im;
916  B1_re += gT11_re * b1_re;
917  B1_re -= gT11_im * b1_im;
918  B1_re += gT12_re * b2_re;
919  B1_re -= gT12_im * b2_im;
920  spinorFloat B1_im = 0;
921  B1_im += gT10_re * b0_im;
922  B1_im += gT10_im * b0_re;
923  B1_im += gT11_re * b1_im;
924  B1_im += gT11_im * b1_re;
925  B1_im += gT12_re * b2_im;
926  B1_im += gT12_im * b2_re;
927 
928  // multiply row 2
929  spinorFloat A2_re = 0;
930  A2_re += gT20_re * a0_re;
931  A2_re -= gT20_im * a0_im;
932  A2_re += gT21_re * a1_re;
933  A2_re -= gT21_im * a1_im;
934  A2_re += gT22_re * a2_re;
935  A2_re -= gT22_im * a2_im;
936  spinorFloat A2_im = 0;
937  A2_im += gT20_re * a0_im;
938  A2_im += gT20_im * a0_re;
939  A2_im += gT21_re * a1_im;
940  A2_im += gT21_im * a1_re;
941  A2_im += gT22_re * a2_im;
942  A2_im += gT22_im * a2_re;
943  spinorFloat B2_re = 0;
944  B2_re += gT20_re * b0_re;
945  B2_re -= gT20_im * b0_im;
946  B2_re += gT21_re * b1_re;
947  B2_re -= gT21_im * b1_im;
948  B2_re += gT22_re * b2_re;
949  B2_re -= gT22_im * b2_im;
950  spinorFloat B2_im = 0;
951  B2_im += gT20_re * b0_im;
952  B2_im += gT20_im * b0_re;
953  B2_im += gT21_re * b1_im;
954  B2_im += gT21_im * b1_re;
955  B2_im += gT22_re * b2_im;
956  B2_im += gT22_im * b2_re;
957 
958  o00_re += A0_re;
959  o00_im += A0_im;
960  o10_re += B0_re;
961  o10_im += B0_im;
962  o20_re += B0_im;
963  o20_im -= B0_re;
964  o30_re += A0_im;
965  o30_im -= A0_re;
966 
967  o01_re += A1_re;
968  o01_im += A1_im;
969  o11_re += B1_re;
970  o11_im += B1_im;
971  o21_re += B1_im;
972  o21_im -= B1_re;
973  o31_re += A1_im;
974  o31_im -= A1_re;
975 
976  o02_re += A2_re;
977  o02_im += A2_im;
978  o12_re += B2_re;
979  o12_im += B2_im;
980  o22_re += B2_im;
981  o22_im -= B2_re;
982  o32_re += A2_im;
983  o32_im -= A2_re;
984 
985 }
986 
987 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
988 {
989  // Projector P1-
990  // 1 0 0 -1
991  // 0 1 1 0
992  // 0 1 1 0
993  // -1 0 0 1
994 
995  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
996  const int sp_idx = face_idx + param.ghostOffset[1];
997 #if (DD_PREC==2)
998  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
999 #endif
1000 
1001  const int ga_idx = sid;
1002 
1009 
1010 
1011  const int sp_stride_pad = ghostFace[1];
1012 
1013  // read half spinor from device memory
1014  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1015 
1016  a0_re = i00_re; a0_im = i00_im;
1017  a1_re = i01_re; a1_im = i01_im;
1018  a2_re = i02_re; a2_im = i02_im;
1019  b0_re = i10_re; b0_im = i10_im;
1020  b1_re = i11_re; b1_im = i11_im;
1021  b2_re = i12_re; b2_im = i12_im;
1022 
1023  // read gauge matrix from device memory
1024  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
1025 
1026  // reconstruct gauge matrix
1028 
1029  // multiply row 0
1030  spinorFloat A0_re = 0;
1031  A0_re += g00_re * a0_re;
1032  A0_re -= g00_im * a0_im;
1033  A0_re += g01_re * a1_re;
1034  A0_re -= g01_im * a1_im;
1035  A0_re += g02_re * a2_re;
1036  A0_re -= g02_im * a2_im;
1037  spinorFloat A0_im = 0;
1038  A0_im += g00_re * a0_im;
1039  A0_im += g00_im * a0_re;
1040  A0_im += g01_re * a1_im;
1041  A0_im += g01_im * a1_re;
1042  A0_im += g02_re * a2_im;
1043  A0_im += g02_im * a2_re;
1044  spinorFloat B0_re = 0;
1045  B0_re += g00_re * b0_re;
1046  B0_re -= g00_im * b0_im;
1047  B0_re += g01_re * b1_re;
1048  B0_re -= g01_im * b1_im;
1049  B0_re += g02_re * b2_re;
1050  B0_re -= g02_im * b2_im;
1051  spinorFloat B0_im = 0;
1052  B0_im += g00_re * b0_im;
1053  B0_im += g00_im * b0_re;
1054  B0_im += g01_re * b1_im;
1055  B0_im += g01_im * b1_re;
1056  B0_im += g02_re * b2_im;
1057  B0_im += g02_im * b2_re;
1058 
1059  // multiply row 1
1060  spinorFloat A1_re = 0;
1061  A1_re += g10_re * a0_re;
1062  A1_re -= g10_im * a0_im;
1063  A1_re += g11_re * a1_re;
1064  A1_re -= g11_im * a1_im;
1065  A1_re += g12_re * a2_re;
1066  A1_re -= g12_im * a2_im;
1067  spinorFloat A1_im = 0;
1068  A1_im += g10_re * a0_im;
1069  A1_im += g10_im * a0_re;
1070  A1_im += g11_re * a1_im;
1071  A1_im += g11_im * a1_re;
1072  A1_im += g12_re * a2_im;
1073  A1_im += g12_im * a2_re;
1074  spinorFloat B1_re = 0;
1075  B1_re += g10_re * b0_re;
1076  B1_re -= g10_im * b0_im;
1077  B1_re += g11_re * b1_re;
1078  B1_re -= g11_im * b1_im;
1079  B1_re += g12_re * b2_re;
1080  B1_re -= g12_im * b2_im;
1081  spinorFloat B1_im = 0;
1082  B1_im += g10_re * b0_im;
1083  B1_im += g10_im * b0_re;
1084  B1_im += g11_re * b1_im;
1085  B1_im += g11_im * b1_re;
1086  B1_im += g12_re * b2_im;
1087  B1_im += g12_im * b2_re;
1088 
1089  // multiply row 2
1090  spinorFloat A2_re = 0;
1091  A2_re += g20_re * a0_re;
1092  A2_re -= g20_im * a0_im;
1093  A2_re += g21_re * a1_re;
1094  A2_re -= g21_im * a1_im;
1095  A2_re += g22_re * a2_re;
1096  A2_re -= g22_im * a2_im;
1097  spinorFloat A2_im = 0;
1098  A2_im += g20_re * a0_im;
1099  A2_im += g20_im * a0_re;
1100  A2_im += g21_re * a1_im;
1101  A2_im += g21_im * a1_re;
1102  A2_im += g22_re * a2_im;
1103  A2_im += g22_im * a2_re;
1104  spinorFloat B2_re = 0;
1105  B2_re += g20_re * b0_re;
1106  B2_re -= g20_im * b0_im;
1107  B2_re += g21_re * b1_re;
1108  B2_re -= g21_im * b1_im;
1109  B2_re += g22_re * b2_re;
1110  B2_re -= g22_im * b2_im;
1111  spinorFloat B2_im = 0;
1112  B2_im += g20_re * b0_im;
1113  B2_im += g20_im * b0_re;
1114  B2_im += g21_re * b1_im;
1115  B2_im += g21_im * b1_re;
1116  B2_im += g22_re * b2_im;
1117  B2_im += g22_im * b2_re;
1118 
1119  o00_re += A0_re;
1120  o00_im += A0_im;
1121  o10_re += B0_re;
1122  o10_im += B0_im;
1123  o20_re += B0_re;
1124  o20_im += B0_im;
1125  o30_re -= A0_re;
1126  o30_im -= A0_im;
1127 
1128  o01_re += A1_re;
1129  o01_im += A1_im;
1130  o11_re += B1_re;
1131  o11_im += B1_im;
1132  o21_re += B1_re;
1133  o21_im += B1_im;
1134  o31_re -= A1_re;
1135  o31_im -= A1_im;
1136 
1137  o02_re += A2_re;
1138  o02_im += A2_im;
1139  o12_re += B2_re;
1140  o12_im += B2_im;
1141  o22_re += B2_re;
1142  o22_im += B2_im;
1143  o32_re -= A2_re;
1144  o32_im -= A2_im;
1145 
1146 }
1147 
1148 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
1149 {
1150  // Projector P1+
1151  // 1 0 0 1
1152  // 0 1 -1 0
1153  // 0 -1 1 0
1154  // 1 0 0 1
1155 
1156  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
1157  const int sp_idx = face_idx + param.ghostOffset[1];
1158 #if (DD_PREC==2)
1159  sp_norm_idx = face_idx + param.ghostNormOffset[1];
1160 #endif
1161 
1162  const int ga_idx = Vh+face_idx;
1163 
1170 
1171 
1172  const int sp_stride_pad = ghostFace[1];
1173 
1174  // read half spinor from device memory
1175  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1176 
1177  a0_re = i00_re; a0_im = i00_im;
1178  a1_re = i01_re; a1_im = i01_im;
1179  a2_re = i02_re; a2_im = i02_im;
1180  b0_re = i10_re; b0_im = i10_im;
1181  b1_re = i11_re; b1_im = i11_im;
1182  b2_re = i12_re; b2_im = i12_im;
1183 
1184  // read gauge matrix from device memory
1185  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
1186 
1187  // reconstruct gauge matrix
1189 
1190  // multiply row 0
1191  spinorFloat A0_re = 0;
1192  A0_re += gT00_re * a0_re;
1193  A0_re -= gT00_im * a0_im;
1194  A0_re += gT01_re * a1_re;
1195  A0_re -= gT01_im * a1_im;
1196  A0_re += gT02_re * a2_re;
1197  A0_re -= gT02_im * a2_im;
1198  spinorFloat A0_im = 0;
1199  A0_im += gT00_re * a0_im;
1200  A0_im += gT00_im * a0_re;
1201  A0_im += gT01_re * a1_im;
1202  A0_im += gT01_im * a1_re;
1203  A0_im += gT02_re * a2_im;
1204  A0_im += gT02_im * a2_re;
1205  spinorFloat B0_re = 0;
1206  B0_re += gT00_re * b0_re;
1207  B0_re -= gT00_im * b0_im;
1208  B0_re += gT01_re * b1_re;
1209  B0_re -= gT01_im * b1_im;
1210  B0_re += gT02_re * b2_re;
1211  B0_re -= gT02_im * b2_im;
1212  spinorFloat B0_im = 0;
1213  B0_im += gT00_re * b0_im;
1214  B0_im += gT00_im * b0_re;
1215  B0_im += gT01_re * b1_im;
1216  B0_im += gT01_im * b1_re;
1217  B0_im += gT02_re * b2_im;
1218  B0_im += gT02_im * b2_re;
1219 
1220  // multiply row 1
1221  spinorFloat A1_re = 0;
1222  A1_re += gT10_re * a0_re;
1223  A1_re -= gT10_im * a0_im;
1224  A1_re += gT11_re * a1_re;
1225  A1_re -= gT11_im * a1_im;
1226  A1_re += gT12_re * a2_re;
1227  A1_re -= gT12_im * a2_im;
1228  spinorFloat A1_im = 0;
1229  A1_im += gT10_re * a0_im;
1230  A1_im += gT10_im * a0_re;
1231  A1_im += gT11_re * a1_im;
1232  A1_im += gT11_im * a1_re;
1233  A1_im += gT12_re * a2_im;
1234  A1_im += gT12_im * a2_re;
1235  spinorFloat B1_re = 0;
1236  B1_re += gT10_re * b0_re;
1237  B1_re -= gT10_im * b0_im;
1238  B1_re += gT11_re * b1_re;
1239  B1_re -= gT11_im * b1_im;
1240  B1_re += gT12_re * b2_re;
1241  B1_re -= gT12_im * b2_im;
1242  spinorFloat B1_im = 0;
1243  B1_im += gT10_re * b0_im;
1244  B1_im += gT10_im * b0_re;
1245  B1_im += gT11_re * b1_im;
1246  B1_im += gT11_im * b1_re;
1247  B1_im += gT12_re * b2_im;
1248  B1_im += gT12_im * b2_re;
1249 
1250  // multiply row 2
1251  spinorFloat A2_re = 0;
1252  A2_re += gT20_re * a0_re;
1253  A2_re -= gT20_im * a0_im;
1254  A2_re += gT21_re * a1_re;
1255  A2_re -= gT21_im * a1_im;
1256  A2_re += gT22_re * a2_re;
1257  A2_re -= gT22_im * a2_im;
1258  spinorFloat A2_im = 0;
1259  A2_im += gT20_re * a0_im;
1260  A2_im += gT20_im * a0_re;
1261  A2_im += gT21_re * a1_im;
1262  A2_im += gT21_im * a1_re;
1263  A2_im += gT22_re * a2_im;
1264  A2_im += gT22_im * a2_re;
1265  spinorFloat B2_re = 0;
1266  B2_re += gT20_re * b0_re;
1267  B2_re -= gT20_im * b0_im;
1268  B2_re += gT21_re * b1_re;
1269  B2_re -= gT21_im * b1_im;
1270  B2_re += gT22_re * b2_re;
1271  B2_re -= gT22_im * b2_im;
1272  spinorFloat B2_im = 0;
1273  B2_im += gT20_re * b0_im;
1274  B2_im += gT20_im * b0_re;
1275  B2_im += gT21_re * b1_im;
1276  B2_im += gT21_im * b1_re;
1277  B2_im += gT22_re * b2_im;
1278  B2_im += gT22_im * b2_re;
1279 
1280  o00_re += A0_re;
1281  o00_im += A0_im;
1282  o10_re += B0_re;
1283  o10_im += B0_im;
1284  o20_re -= B0_re;
1285  o20_im -= B0_im;
1286  o30_re += A0_re;
1287  o30_im += A0_im;
1288 
1289  o01_re += A1_re;
1290  o01_im += A1_im;
1291  o11_re += B1_re;
1292  o11_im += B1_im;
1293  o21_re -= B1_re;
1294  o21_im -= B1_im;
1295  o31_re += A1_re;
1296  o31_im += A1_im;
1297 
1298  o02_re += A2_re;
1299  o02_im += A2_im;
1300  o12_re += B2_re;
1301  o12_im += B2_im;
1302  o22_re -= B2_re;
1303  o22_im -= B2_im;
1304  o32_re += A2_re;
1305  o32_im += A2_im;
1306 
1307 }
1308 
1309 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
1310 {
1311  // Projector P2-
1312  // 1 0 -i 0
1313  // 0 1 0 i
1314  // i 0 1 0
1315  // 0 -i 0 1
1316 
1317  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1318  const int sp_idx = face_idx + param.ghostOffset[2];
1319 #if (DD_PREC==2)
1320  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
1321 #endif
1322 
1323  const int ga_idx = sid;
1324 
1331 
1332 
1333  const int sp_stride_pad = ghostFace[2];
1334 
1335  // read half spinor from device memory
1336  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1337 
1338  a0_re = i00_re; a0_im = i00_im;
1339  a1_re = i01_re; a1_im = i01_im;
1340  a2_re = i02_re; a2_im = i02_im;
1341  b0_re = i10_re; b0_im = i10_im;
1342  b1_re = i11_re; b1_im = i11_im;
1343  b2_re = i12_re; b2_im = i12_im;
1344 
1345  // read gauge matrix from device memory
1346  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1347 
1348  // reconstruct gauge matrix
1350 
1351  // multiply row 0
1352  spinorFloat A0_re = 0;
1353  A0_re += g00_re * a0_re;
1354  A0_re -= g00_im * a0_im;
1355  A0_re += g01_re * a1_re;
1356  A0_re -= g01_im * a1_im;
1357  A0_re += g02_re * a2_re;
1358  A0_re -= g02_im * a2_im;
1359  spinorFloat A0_im = 0;
1360  A0_im += g00_re * a0_im;
1361  A0_im += g00_im * a0_re;
1362  A0_im += g01_re * a1_im;
1363  A0_im += g01_im * a1_re;
1364  A0_im += g02_re * a2_im;
1365  A0_im += g02_im * a2_re;
1366  spinorFloat B0_re = 0;
1367  B0_re += g00_re * b0_re;
1368  B0_re -= g00_im * b0_im;
1369  B0_re += g01_re * b1_re;
1370  B0_re -= g01_im * b1_im;
1371  B0_re += g02_re * b2_re;
1372  B0_re -= g02_im * b2_im;
1373  spinorFloat B0_im = 0;
1374  B0_im += g00_re * b0_im;
1375  B0_im += g00_im * b0_re;
1376  B0_im += g01_re * b1_im;
1377  B0_im += g01_im * b1_re;
1378  B0_im += g02_re * b2_im;
1379  B0_im += g02_im * b2_re;
1380 
1381  // multiply row 1
1382  spinorFloat A1_re = 0;
1383  A1_re += g10_re * a0_re;
1384  A1_re -= g10_im * a0_im;
1385  A1_re += g11_re * a1_re;
1386  A1_re -= g11_im * a1_im;
1387  A1_re += g12_re * a2_re;
1388  A1_re -= g12_im * a2_im;
1389  spinorFloat A1_im = 0;
1390  A1_im += g10_re * a0_im;
1391  A1_im += g10_im * a0_re;
1392  A1_im += g11_re * a1_im;
1393  A1_im += g11_im * a1_re;
1394  A1_im += g12_re * a2_im;
1395  A1_im += g12_im * a2_re;
1396  spinorFloat B1_re = 0;
1397  B1_re += g10_re * b0_re;
1398  B1_re -= g10_im * b0_im;
1399  B1_re += g11_re * b1_re;
1400  B1_re -= g11_im * b1_im;
1401  B1_re += g12_re * b2_re;
1402  B1_re -= g12_im * b2_im;
1403  spinorFloat B1_im = 0;
1404  B1_im += g10_re * b0_im;
1405  B1_im += g10_im * b0_re;
1406  B1_im += g11_re * b1_im;
1407  B1_im += g11_im * b1_re;
1408  B1_im += g12_re * b2_im;
1409  B1_im += g12_im * b2_re;
1410 
1411  // multiply row 2
1412  spinorFloat A2_re = 0;
1413  A2_re += g20_re * a0_re;
1414  A2_re -= g20_im * a0_im;
1415  A2_re += g21_re * a1_re;
1416  A2_re -= g21_im * a1_im;
1417  A2_re += g22_re * a2_re;
1418  A2_re -= g22_im * a2_im;
1419  spinorFloat A2_im = 0;
1420  A2_im += g20_re * a0_im;
1421  A2_im += g20_im * a0_re;
1422  A2_im += g21_re * a1_im;
1423  A2_im += g21_im * a1_re;
1424  A2_im += g22_re * a2_im;
1425  A2_im += g22_im * a2_re;
1426  spinorFloat B2_re = 0;
1427  B2_re += g20_re * b0_re;
1428  B2_re -= g20_im * b0_im;
1429  B2_re += g21_re * b1_re;
1430  B2_re -= g21_im * b1_im;
1431  B2_re += g22_re * b2_re;
1432  B2_re -= g22_im * b2_im;
1433  spinorFloat B2_im = 0;
1434  B2_im += g20_re * b0_im;
1435  B2_im += g20_im * b0_re;
1436  B2_im += g21_re * b1_im;
1437  B2_im += g21_im * b1_re;
1438  B2_im += g22_re * b2_im;
1439  B2_im += g22_im * b2_re;
1440 
1441  o00_re += A0_re;
1442  o00_im += A0_im;
1443  o10_re += B0_re;
1444  o10_im += B0_im;
1445  o20_re -= A0_im;
1446  o20_im += A0_re;
1447  o30_re += B0_im;
1448  o30_im -= B0_re;
1449 
1450  o01_re += A1_re;
1451  o01_im += A1_im;
1452  o11_re += B1_re;
1453  o11_im += B1_im;
1454  o21_re -= A1_im;
1455  o21_im += A1_re;
1456  o31_re += B1_im;
1457  o31_im -= B1_re;
1458 
1459  o02_re += A2_re;
1460  o02_im += A2_im;
1461  o12_re += B2_re;
1462  o12_im += B2_im;
1463  o22_re -= A2_im;
1464  o22_im += A2_re;
1465  o32_re += B2_im;
1466  o32_im -= B2_re;
1467 
1468 }
1469 
1470 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1471 {
1472  // Projector P2+
1473  // 1 0 i 0
1474  // 0 1 0 -i
1475  // -i 0 1 0
1476  // 0 i 0 1
1477 
1478  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1479  const int sp_idx = face_idx + param.ghostOffset[2];
1480 #if (DD_PREC==2)
1481  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1482 #endif
1483 
1484  const int ga_idx = Vh+face_idx;
1485 
1492 
1493 
1494  const int sp_stride_pad = ghostFace[2];
1495 
1496  // read half spinor from device memory
1497  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1498 
1499  a0_re = i00_re; a0_im = i00_im;
1500  a1_re = i01_re; a1_im = i01_im;
1501  a2_re = i02_re; a2_im = i02_im;
1502  b0_re = i10_re; b0_im = i10_im;
1503  b1_re = i11_re; b1_im = i11_im;
1504  b2_re = i12_re; b2_im = i12_im;
1505 
1506  // read gauge matrix from device memory
1507  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1508 
1509  // reconstruct gauge matrix
1511 
1512  // multiply row 0
1513  spinorFloat A0_re = 0;
1514  A0_re += gT00_re * a0_re;
1515  A0_re -= gT00_im * a0_im;
1516  A0_re += gT01_re * a1_re;
1517  A0_re -= gT01_im * a1_im;
1518  A0_re += gT02_re * a2_re;
1519  A0_re -= gT02_im * a2_im;
1520  spinorFloat A0_im = 0;
1521  A0_im += gT00_re * a0_im;
1522  A0_im += gT00_im * a0_re;
1523  A0_im += gT01_re * a1_im;
1524  A0_im += gT01_im * a1_re;
1525  A0_im += gT02_re * a2_im;
1526  A0_im += gT02_im * a2_re;
1527  spinorFloat B0_re = 0;
1528  B0_re += gT00_re * b0_re;
1529  B0_re -= gT00_im * b0_im;
1530  B0_re += gT01_re * b1_re;
1531  B0_re -= gT01_im * b1_im;
1532  B0_re += gT02_re * b2_re;
1533  B0_re -= gT02_im * b2_im;
1534  spinorFloat B0_im = 0;
1535  B0_im += gT00_re * b0_im;
1536  B0_im += gT00_im * b0_re;
1537  B0_im += gT01_re * b1_im;
1538  B0_im += gT01_im * b1_re;
1539  B0_im += gT02_re * b2_im;
1540  B0_im += gT02_im * b2_re;
1541 
1542  // multiply row 1
1543  spinorFloat A1_re = 0;
1544  A1_re += gT10_re * a0_re;
1545  A1_re -= gT10_im * a0_im;
1546  A1_re += gT11_re * a1_re;
1547  A1_re -= gT11_im * a1_im;
1548  A1_re += gT12_re * a2_re;
1549  A1_re -= gT12_im * a2_im;
1550  spinorFloat A1_im = 0;
1551  A1_im += gT10_re * a0_im;
1552  A1_im += gT10_im * a0_re;
1553  A1_im += gT11_re * a1_im;
1554  A1_im += gT11_im * a1_re;
1555  A1_im += gT12_re * a2_im;
1556  A1_im += gT12_im * a2_re;
1557  spinorFloat B1_re = 0;
1558  B1_re += gT10_re * b0_re;
1559  B1_re -= gT10_im * b0_im;
1560  B1_re += gT11_re * b1_re;
1561  B1_re -= gT11_im * b1_im;
1562  B1_re += gT12_re * b2_re;
1563  B1_re -= gT12_im * b2_im;
1564  spinorFloat B1_im = 0;
1565  B1_im += gT10_re * b0_im;
1566  B1_im += gT10_im * b0_re;
1567  B1_im += gT11_re * b1_im;
1568  B1_im += gT11_im * b1_re;
1569  B1_im += gT12_re * b2_im;
1570  B1_im += gT12_im * b2_re;
1571 
1572  // multiply row 2
1573  spinorFloat A2_re = 0;
1574  A2_re += gT20_re * a0_re;
1575  A2_re -= gT20_im * a0_im;
1576  A2_re += gT21_re * a1_re;
1577  A2_re -= gT21_im * a1_im;
1578  A2_re += gT22_re * a2_re;
1579  A2_re -= gT22_im * a2_im;
1580  spinorFloat A2_im = 0;
1581  A2_im += gT20_re * a0_im;
1582  A2_im += gT20_im * a0_re;
1583  A2_im += gT21_re * a1_im;
1584  A2_im += gT21_im * a1_re;
1585  A2_im += gT22_re * a2_im;
1586  A2_im += gT22_im * a2_re;
1587  spinorFloat B2_re = 0;
1588  B2_re += gT20_re * b0_re;
1589  B2_re -= gT20_im * b0_im;
1590  B2_re += gT21_re * b1_re;
1591  B2_re -= gT21_im * b1_im;
1592  B2_re += gT22_re * b2_re;
1593  B2_re -= gT22_im * b2_im;
1594  spinorFloat B2_im = 0;
1595  B2_im += gT20_re * b0_im;
1596  B2_im += gT20_im * b0_re;
1597  B2_im += gT21_re * b1_im;
1598  B2_im += gT21_im * b1_re;
1599  B2_im += gT22_re * b2_im;
1600  B2_im += gT22_im * b2_re;
1601 
1602  o00_re += A0_re;
1603  o00_im += A0_im;
1604  o10_re += B0_re;
1605  o10_im += B0_im;
1606  o20_re += A0_im;
1607  o20_im -= A0_re;
1608  o30_re -= B0_im;
1609  o30_im += B0_re;
1610 
1611  o01_re += A1_re;
1612  o01_im += A1_im;
1613  o11_re += B1_re;
1614  o11_im += B1_im;
1615  o21_re += A1_im;
1616  o21_im -= A1_re;
1617  o31_re -= B1_im;
1618  o31_im += B1_re;
1619 
1620  o02_re += A2_re;
1621  o02_im += A2_im;
1622  o12_re += B2_re;
1623  o12_im += B2_im;
1624  o22_re += A2_im;
1625  o22_im -= A2_re;
1626  o32_re -= B2_im;
1627  o32_im += B2_re;
1628 
1629 }
1630 
1631 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1632 {
1633  // Projector P3-
1634  // 0 0 0 0
1635  // 0 0 0 0
1636  // 0 0 2 0
1637  // 0 0 0 2
1638 
1639  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1640  const int sp_idx = face_idx + param.ghostOffset[3];
1641 #if (DD_PREC==2)
1642  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1643 #endif
1644 
1645  const int ga_idx = sid;
1646 
1647  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1648  {
1655 
1656 
1657  const int sp_stride_pad = ghostFace[3];
1658  //const int t_proj_scale = TPROJSCALE;
1659 
1660  // read half spinor from device memory
1661  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1662 
1663 #ifdef CLOVER_TWIST_INV_DSLASH
1664  a0_re = i00_re; a0_im = i00_im;
1665  a1_re = i01_re; a1_im = i01_im;
1666  a2_re = i02_re; a2_im = i02_im;
1667  b0_re = i10_re; b0_im = i10_im;
1668  b1_re = i11_re; b1_im = i11_im;
1669  b2_re = i12_re; b2_im = i12_im;
1670 #else
1671  a0_re = 2*i00_re; a0_im = 2*i00_im;
1672  a1_re = 2*i01_re; a1_im = 2*i01_im;
1673  a2_re = 2*i02_re; a2_im = 2*i02_im;
1674  b0_re = 2*i10_re; b0_im = 2*i10_im;
1675  b1_re = 2*i11_re; b1_im = 2*i11_im;
1676  b2_re = 2*i12_re; b2_im = 2*i12_im;
1677 #endif
1678 
1679  // identity gauge matrix
1686 
1687  o20_re += A0_re;
1688  o20_im += A0_im;
1689  o30_re += B0_re;
1690  o30_im += B0_im;
1691 
1692  o21_re += A1_re;
1693  o21_im += A1_im;
1694  o31_re += B1_re;
1695  o31_im += B1_im;
1696 
1697  o22_re += A2_re;
1698  o22_im += A2_im;
1699  o32_re += B2_re;
1700  o32_im += B2_im;
1701 
1702  } else {
1709 
1710 
1711  const int sp_stride_pad = ghostFace[3];
1712  //const int t_proj_scale = TPROJSCALE;
1713 
1714  // read half spinor from device memory
1715  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1716 
1717 #ifdef CLOVER_TWIST_INV_DSLASH
1718  a0_re = i00_re; a0_im = i00_im;
1719  a1_re = i01_re; a1_im = i01_im;
1720  a2_re = i02_re; a2_im = i02_im;
1721  b0_re = i10_re; b0_im = i10_im;
1722  b1_re = i11_re; b1_im = i11_im;
1723  b2_re = i12_re; b2_im = i12_im;
1724 #else
1725  a0_re = 2*i00_re; a0_im = 2*i00_im;
1726  a1_re = 2*i01_re; a1_im = 2*i01_im;
1727  a2_re = 2*i02_re; a2_im = 2*i02_im;
1728  b0_re = 2*i10_re; b0_im = 2*i10_im;
1729  b1_re = 2*i11_re; b1_im = 2*i11_im;
1730  b2_re = 2*i12_re; b2_im = 2*i12_im;
1731 #endif
1732 
1733  // read gauge matrix from device memory
1734  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1735 
1736  // reconstruct gauge matrix
1738 
1739  // multiply row 0
1740  spinorFloat A0_re = 0;
1741  A0_re += g00_re * a0_re;
1742  A0_re -= g00_im * a0_im;
1743  A0_re += g01_re * a1_re;
1744  A0_re -= g01_im * a1_im;
1745  A0_re += g02_re * a2_re;
1746  A0_re -= g02_im * a2_im;
1747  spinorFloat A0_im = 0;
1748  A0_im += g00_re * a0_im;
1749  A0_im += g00_im * a0_re;
1750  A0_im += g01_re * a1_im;
1751  A0_im += g01_im * a1_re;
1752  A0_im += g02_re * a2_im;
1753  A0_im += g02_im * a2_re;
1754  spinorFloat B0_re = 0;
1755  B0_re += g00_re * b0_re;
1756  B0_re -= g00_im * b0_im;
1757  B0_re += g01_re * b1_re;
1758  B0_re -= g01_im * b1_im;
1759  B0_re += g02_re * b2_re;
1760  B0_re -= g02_im * b2_im;
1761  spinorFloat B0_im = 0;
1762  B0_im += g00_re * b0_im;
1763  B0_im += g00_im * b0_re;
1764  B0_im += g01_re * b1_im;
1765  B0_im += g01_im * b1_re;
1766  B0_im += g02_re * b2_im;
1767  B0_im += g02_im * b2_re;
1768 
1769  // multiply row 1
1770  spinorFloat A1_re = 0;
1771  A1_re += g10_re * a0_re;
1772  A1_re -= g10_im * a0_im;
1773  A1_re += g11_re * a1_re;
1774  A1_re -= g11_im * a1_im;
1775  A1_re += g12_re * a2_re;
1776  A1_re -= g12_im * a2_im;
1777  spinorFloat A1_im = 0;
1778  A1_im += g10_re * a0_im;
1779  A1_im += g10_im * a0_re;
1780  A1_im += g11_re * a1_im;
1781  A1_im += g11_im * a1_re;
1782  A1_im += g12_re * a2_im;
1783  A1_im += g12_im * a2_re;
1784  spinorFloat B1_re = 0;
1785  B1_re += g10_re * b0_re;
1786  B1_re -= g10_im * b0_im;
1787  B1_re += g11_re * b1_re;
1788  B1_re -= g11_im * b1_im;
1789  B1_re += g12_re * b2_re;
1790  B1_re -= g12_im * b2_im;
1791  spinorFloat B1_im = 0;
1792  B1_im += g10_re * b0_im;
1793  B1_im += g10_im * b0_re;
1794  B1_im += g11_re * b1_im;
1795  B1_im += g11_im * b1_re;
1796  B1_im += g12_re * b2_im;
1797  B1_im += g12_im * b2_re;
1798 
1799  // multiply row 2
1800  spinorFloat A2_re = 0;
1801  A2_re += g20_re * a0_re;
1802  A2_re -= g20_im * a0_im;
1803  A2_re += g21_re * a1_re;
1804  A2_re -= g21_im * a1_im;
1805  A2_re += g22_re * a2_re;
1806  A2_re -= g22_im * a2_im;
1807  spinorFloat A2_im = 0;
1808  A2_im += g20_re * a0_im;
1809  A2_im += g20_im * a0_re;
1810  A2_im += g21_re * a1_im;
1811  A2_im += g21_im * a1_re;
1812  A2_im += g22_re * a2_im;
1813  A2_im += g22_im * a2_re;
1814  spinorFloat B2_re = 0;
1815  B2_re += g20_re * b0_re;
1816  B2_re -= g20_im * b0_im;
1817  B2_re += g21_re * b1_re;
1818  B2_re -= g21_im * b1_im;
1819  B2_re += g22_re * b2_re;
1820  B2_re -= g22_im * b2_im;
1821  spinorFloat B2_im = 0;
1822  B2_im += g20_re * b0_im;
1823  B2_im += g20_im * b0_re;
1824  B2_im += g21_re * b1_im;
1825  B2_im += g21_im * b1_re;
1826  B2_im += g22_re * b2_im;
1827  B2_im += g22_im * b2_re;
1828 
1829  o20_re += A0_re;
1830  o20_im += A0_im;
1831  o30_re += B0_re;
1832  o30_im += B0_im;
1833 
1834  o21_re += A1_re;
1835  o21_im += A1_im;
1836  o31_re += B1_re;
1837  o31_im += B1_im;
1838 
1839  o22_re += A2_re;
1840  o22_im += A2_im;
1841  o32_re += B2_re;
1842  o32_im += B2_im;
1843 
1844  }
1845 }
1846 
1847 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1848 {
1849  // Projector P3+
1850  // 2 0 0 0
1851  // 0 2 0 0
1852  // 0 0 0 0
1853  // 0 0 0 0
1854 
1855  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1856  const int sp_idx = face_idx + param.ghostOffset[3];
1857 #if (DD_PREC==2)
1858  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1859 #endif
1860 
1861  const int ga_idx = Vh+face_idx;
1862 
1863  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1864  {
1871 
1872 
1873  const int sp_stride_pad = ghostFace[3];
1874  //const int t_proj_scale = TPROJSCALE;
1875 
1876  // read half spinor from device memory
1877  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1878 
1879 #ifdef CLOVER_TWIST_INV_DSLASH
1880  a0_re = i00_re; a0_im = i00_im;
1881  a1_re = i01_re; a1_im = i01_im;
1882  a2_re = i02_re; a2_im = i02_im;
1883  b0_re = i10_re; b0_im = i10_im;
1884  b1_re = i11_re; b1_im = i11_im;
1885  b2_re = i12_re; b2_im = i12_im;
1886 #else
1887  a0_re = 2*i00_re; a0_im = 2*i00_im;
1888  a1_re = 2*i01_re; a1_im = 2*i01_im;
1889  a2_re = 2*i02_re; a2_im = 2*i02_im;
1890  b0_re = 2*i10_re; b0_im = 2*i10_im;
1891  b1_re = 2*i11_re; b1_im = 2*i11_im;
1892  b2_re = 2*i12_re; b2_im = 2*i12_im;
1893 #endif
1894 
1895  // identity gauge matrix
1902 
1903  o00_re += A0_re;
1904  o00_im += A0_im;
1905  o10_re += B0_re;
1906  o10_im += B0_im;
1907 
1908  o01_re += A1_re;
1909  o01_im += A1_im;
1910  o11_re += B1_re;
1911  o11_im += B1_im;
1912 
1913  o02_re += A2_re;
1914  o02_im += A2_im;
1915  o12_re += B2_re;
1916  o12_im += B2_im;
1917 
1918  } else {
1925 
1926 
1927  const int sp_stride_pad = ghostFace[3];
1928  //const int t_proj_scale = TPROJSCALE;
1929 
1930  // read half spinor from device memory
1931  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1932 
1933 #ifdef CLOVER_TWIST_INV_DSLASH
1934  a0_re = i00_re; a0_im = i00_im;
1935  a1_re = i01_re; a1_im = i01_im;
1936  a2_re = i02_re; a2_im = i02_im;
1937  b0_re = i10_re; b0_im = i10_im;
1938  b1_re = i11_re; b1_im = i11_im;
1939  b2_re = i12_re; b2_im = i12_im;
1940 #else
1941  a0_re = 2*i00_re; a0_im = 2*i00_im;
1942  a1_re = 2*i01_re; a1_im = 2*i01_im;
1943  a2_re = 2*i02_re; a2_im = 2*i02_im;
1944  b0_re = 2*i10_re; b0_im = 2*i10_im;
1945  b1_re = 2*i11_re; b1_im = 2*i11_im;
1946  b2_re = 2*i12_re; b2_im = 2*i12_im;
1947 #endif
1948 
1949  // read gauge matrix from device memory
1950  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1951 
1952  // reconstruct gauge matrix
1954 
1955  // multiply row 0
1956  spinorFloat A0_re = 0;
1957  A0_re += gT00_re * a0_re;
1958  A0_re -= gT00_im * a0_im;
1959  A0_re += gT01_re * a1_re;
1960  A0_re -= gT01_im * a1_im;
1961  A0_re += gT02_re * a2_re;
1962  A0_re -= gT02_im * a2_im;
1963  spinorFloat A0_im = 0;
1964  A0_im += gT00_re * a0_im;
1965  A0_im += gT00_im * a0_re;
1966  A0_im += gT01_re * a1_im;
1967  A0_im += gT01_im * a1_re;
1968  A0_im += gT02_re * a2_im;
1969  A0_im += gT02_im * a2_re;
1970  spinorFloat B0_re = 0;
1971  B0_re += gT00_re * b0_re;
1972  B0_re -= gT00_im * b0_im;
1973  B0_re += gT01_re * b1_re;
1974  B0_re -= gT01_im * b1_im;
1975  B0_re += gT02_re * b2_re;
1976  B0_re -= gT02_im * b2_im;
1977  spinorFloat B0_im = 0;
1978  B0_im += gT00_re * b0_im;
1979  B0_im += gT00_im * b0_re;
1980  B0_im += gT01_re * b1_im;
1981  B0_im += gT01_im * b1_re;
1982  B0_im += gT02_re * b2_im;
1983  B0_im += gT02_im * b2_re;
1984 
1985  // multiply row 1
1986  spinorFloat A1_re = 0;
1987  A1_re += gT10_re * a0_re;
1988  A1_re -= gT10_im * a0_im;
1989  A1_re += gT11_re * a1_re;
1990  A1_re -= gT11_im * a1_im;
1991  A1_re += gT12_re * a2_re;
1992  A1_re -= gT12_im * a2_im;
1993  spinorFloat A1_im = 0;
1994  A1_im += gT10_re * a0_im;
1995  A1_im += gT10_im * a0_re;
1996  A1_im += gT11_re * a1_im;
1997  A1_im += gT11_im * a1_re;
1998  A1_im += gT12_re * a2_im;
1999  A1_im += gT12_im * a2_re;
2000  spinorFloat B1_re = 0;
2001  B1_re += gT10_re * b0_re;
2002  B1_re -= gT10_im * b0_im;
2003  B1_re += gT11_re * b1_re;
2004  B1_re -= gT11_im * b1_im;
2005  B1_re += gT12_re * b2_re;
2006  B1_re -= gT12_im * b2_im;
2007  spinorFloat B1_im = 0;
2008  B1_im += gT10_re * b0_im;
2009  B1_im += gT10_im * b0_re;
2010  B1_im += gT11_re * b1_im;
2011  B1_im += gT11_im * b1_re;
2012  B1_im += gT12_re * b2_im;
2013  B1_im += gT12_im * b2_re;
2014 
2015  // multiply row 2
2016  spinorFloat A2_re = 0;
2017  A2_re += gT20_re * a0_re;
2018  A2_re -= gT20_im * a0_im;
2019  A2_re += gT21_re * a1_re;
2020  A2_re -= gT21_im * a1_im;
2021  A2_re += gT22_re * a2_re;
2022  A2_re -= gT22_im * a2_im;
2023  spinorFloat A2_im = 0;
2024  A2_im += gT20_re * a0_im;
2025  A2_im += gT20_im * a0_re;
2026  A2_im += gT21_re * a1_im;
2027  A2_im += gT21_im * a1_re;
2028  A2_im += gT22_re * a2_im;
2029  A2_im += gT22_im * a2_re;
2030  spinorFloat B2_re = 0;
2031  B2_re += gT20_re * b0_re;
2032  B2_re -= gT20_im * b0_im;
2033  B2_re += gT21_re * b1_re;
2034  B2_re -= gT21_im * b1_im;
2035  B2_re += gT22_re * b2_re;
2036  B2_re -= gT22_im * b2_im;
2037  spinorFloat B2_im = 0;
2038  B2_im += gT20_re * b0_im;
2039  B2_im += gT20_im * b0_re;
2040  B2_im += gT21_re * b1_im;
2041  B2_im += gT21_im * b1_re;
2042  B2_im += gT22_re * b2_im;
2043  B2_im += gT22_im * b2_re;
2044 
2045  o00_re += A0_re;
2046  o00_im += A0_im;
2047  o10_re += B0_re;
2048  o10_im += B0_im;
2049 
2050  o01_re += A1_re;
2051  o01_im += A1_im;
2052  o11_re += B1_re;
2053  o11_im += B1_im;
2054 
2055  o02_re += A2_re;
2056  o02_im += A2_im;
2057  o12_re += B2_re;
2058  o12_im += B2_im;
2059 
2060  }
2061 }
2062 
2063 {
2064 #ifdef DSLASH_XPAY
2065  READ_ACCUM(ACCUMTEX, param.sp_stride)
2066 
2067 #ifndef CLOVER_TWIST_XPAY
2068  //perform invert twist first:
2069  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2070  o00_re = b*o00_re + acc00_re;
2071  o00_im = b*o00_im + acc00_im;
2072  o01_re = b*o01_re + acc01_re;
2073  o01_im = b*o01_im + acc01_im;
2074  o02_re = b*o02_re + acc02_re;
2075  o02_im = b*o02_im + acc02_im;
2076  o10_re = b*o10_re + acc10_re;
2077  o10_im = b*o10_im + acc10_im;
2078  o11_re = b*o11_re + acc11_re;
2079  o11_im = b*o11_im + acc11_im;
2080  o12_re = b*o12_re + acc12_re;
2081  o12_im = b*o12_im + acc12_im;
2082  o20_re = b*o20_re + acc20_re;
2083  o20_im = b*o20_im + acc20_im;
2084  o21_re = b*o21_re + acc21_re;
2085  o21_im = b*o21_im + acc21_im;
2086  o22_re = b*o22_re + acc22_re;
2087  o22_im = b*o22_im + acc22_im;
2088  o30_re = b*o30_re + acc30_re;
2089  o30_im = b*o30_im + acc30_im;
2090  o31_re = b*o31_re + acc31_re;
2091  o31_im = b*o31_im + acc31_im;
2092  o32_re = b*o32_re + acc32_re;
2093  o32_im = b*o32_im + acc32_im;
2094 #else
2095  APPLY_CLOVER_TWIST(c, a, acc);
2096  o00_re = b*o00_re + acc00_re;
2097  o00_im = b*o00_im + acc00_im;
2098  o01_re = b*o01_re + acc01_re;
2099  o01_im = b*o01_im + acc01_im;
2100  o02_re = b*o02_re + acc02_re;
2101  o02_im = b*o02_im + acc02_im;
2102  o10_re = b*o10_re + acc10_re;
2103  o10_im = b*o10_im + acc10_im;
2104  o11_re = b*o11_re + acc11_re;
2105  o11_im = b*o11_im + acc11_im;
2106  o12_re = b*o12_re + acc12_re;
2107  o12_im = b*o12_im + acc12_im;
2108  o20_re = b*o20_re + acc20_re;
2109  o20_im = b*o20_im + acc20_im;
2110  o21_re = b*o21_re + acc21_re;
2111  o21_im = b*o21_im + acc21_im;
2112  o22_re = b*o22_re + acc22_re;
2113  o22_im = b*o22_im + acc22_im;
2114  o30_re = b*o30_re + acc30_re;
2115  o30_im = b*o30_im + acc30_im;
2116  o31_re = b*o31_re + acc31_re;
2117  o31_im = b*o31_im + acc31_im;
2118  o32_re = b*o32_re + acc32_re;
2119  o32_im = b*o32_im + acc32_im;
2120 #endif//CLOVER_TWIST_XPAY
2121 #else //no XPAY
2122  APPLY_CLOVER_TWIST_INV(c, cinv, a, o);
2123 #endif
2124 }
2125 
2126 // write spinor field back to device memory
2127 WRITE_SPINOR(param.sp_stride);
2128 
2129 // undefine to prevent warning when precision is changed
2130 #undef spinorFloat
2131 #undef SHARED_STRIDE
2132 
2133 #undef g00_re
2134 #undef g00_im
2135 #undef g01_re
2136 #undef g01_im
2137 #undef g02_re
2138 #undef g02_im
2139 #undef g10_re
2140 #undef g10_im
2141 #undef g11_re
2142 #undef g11_im
2143 #undef g12_re
2144 #undef g12_im
2145 #undef g20_re
2146 #undef g20_im
2147 #undef g21_re
2148 #undef g21_im
2149 #undef g22_re
2150 #undef g22_im
2151 
2152 #undef i00_re
2153 #undef i00_im
2154 #undef i01_re
2155 #undef i01_im
2156 #undef i02_re
2157 #undef i02_im
2158 #undef i10_re
2159 #undef i10_im
2160 #undef i11_re
2161 #undef i11_im
2162 #undef i12_re
2163 #undef i12_im
2164 #undef i20_re
2165 #undef i20_im
2166 #undef i21_re
2167 #undef i21_im
2168 #undef i22_re
2169 #undef i22_im
2170 #undef i30_re
2171 #undef i30_im
2172 #undef i31_re
2173 #undef i31_im
2174 #undef i32_re
2175 #undef i32_im
2176 
2177 #undef acc00_re
2178 #undef acc00_im
2179 #undef acc01_re
2180 #undef acc01_im
2181 #undef acc02_re
2182 #undef acc02_im
2183 #undef acc10_re
2184 #undef acc10_im
2185 #undef acc11_re
2186 #undef acc11_im
2187 #undef acc12_re
2188 #undef acc12_im
2189 #undef acc20_re
2190 #undef acc20_im
2191 #undef acc21_re
2192 #undef acc21_im
2193 #undef acc22_re
2194 #undef acc22_im
2195 #undef acc30_re
2196 #undef acc30_im
2197 #undef acc31_re
2198 #undef acc31_im
2199 #undef acc32_re
2200 #undef acc32_im
2201 
2202 
2203 #undef o00_re
2204 #undef o00_im
2205 #undef o01_re
2206 #undef o01_im
2207 #undef o02_re
2208 #undef o02_im
2209 #undef o10_re
2210 #undef o10_im
2211 #undef o11_re
2212 #undef o11_im
2213 #undef o12_re
2214 #undef o12_im
2215 #undef o20_re
2216 #undef o20_im
2217 #undef o21_re
2218 #undef o21_im
2219 #undef o22_re
2220 #undef o22_im
2221 #undef o30_re
2222 
2223 #undef c00_00_re
2224 #undef c01_01_re
2225 #undef c02_02_re
2226 #undef c10_10_re
2227 #undef c11_11_re
2228 #undef c12_12_re
2229 #undef c01_00_re
2230 #undef c01_00_im
2231 #undef c02_00_re
2232 #undef c02_00_im
2233 #undef c10_00_re
2234 #undef c10_00_im
2235 #undef c11_00_re
2236 #undef c11_00_im
2237 #undef c12_00_re
2238 #undef c12_00_im
2239 #undef c02_01_re
2240 #undef c02_01_im
2241 #undef c10_01_re
2242 #undef c10_01_im
2243 #undef c11_01_re
2244 #undef c11_01_im
2245 #undef c12_01_re
2246 #undef c12_01_im
2247 #undef c10_02_re
2248 #undef c10_02_im
2249 #undef c11_02_re
2250 #undef c11_02_im
2251 #undef c12_02_re
2252 #undef c12_02_im
2253 #undef c11_10_re
2254 #undef c11_10_im
2255 #undef c12_10_re
2256 #undef c12_10_im
2257 #undef c12_11_re
2258 #undef c12_11_im
2259 
2260 #undef cinv00_00_re
2261 #undef cinv01_01_re
2262 #undef cinv02_02_re
2263 #undef cinv10_10_re
2264 #undef cinv11_11_re
2265 #undef cinv12_12_re
2266 #undef cinv01_00_re
2267 #undef cinv01_00_im
2268 #undef cinv02_00_re
2269 #undef cinv02_00_im
2270 #undef cinv10_00_re
2271 #undef cinv10_00_im
2272 #undef cinv11_00_re
2273 #undef cinv11_00_im
2274 #undef cinv12_00_re
2275 #undef cinv12_00_im
2276 #undef cinv02_01_re
2277 #undef cinv02_01_im
2278 #undef cinv10_01_re
2279 #undef cinv10_01_im
2280 #undef cinv11_01_re
2281 #undef cinv11_01_im
2282 #undef cinv12_01_re
2283 #undef cinv12_01_im
2284 #undef cinv10_02_re
2285 #undef cinv10_02_im
2286 #undef cinv11_02_re
2287 #undef cinv11_02_im
2288 #undef cinv12_02_re
2289 #undef cinv12_02_im
2290 #undef cinv11_10_re
2291 #undef cinv11_10_im
2292 #undef cinv12_10_re
2293 #undef cinv12_10_im
2294 #undef cinv12_11_re
2295 #undef cinv12_11_im
2296 
2297 #undef VOLATILE
2298 
2299 #endif // MULTI_GPU
__constant__ int Vh
#define APPLY_CLOVER_TWIST(c, a, reg)
Definition: tmc_core.h:1
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define o31_im
Definition: gamma5.h:293
#define DSLASH_SHARED_FLOATS_PER_THREAD
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
#define APPLY_CLOVER_TWIST_INV(c, cinv, a, reg)
Definition: tmc_core.h:432
__shared__ char s_data[]
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
VOLATILE spinorFloat * s
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1