QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
wilson_fused_exterior_dslash_gt200_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #ifdef MULTI_GPU
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define i00_re I0.x
67 #define i00_im I0.y
68 #define i01_re I0.z
69 #define i01_im I0.w
70 #define i02_re I1.x
71 #define i02_im I1.y
72 #define i10_re I1.z
73 #define i10_im I1.w
74 #define i11_re I2.x
75 #define i11_im I2.y
76 #define i12_re I2.z
77 #define i12_im I2.w
78 #define i20_re I3.x
79 #define i20_im I3.y
80 #define i21_re I3.z
81 #define i21_im I3.w
82 #define i22_re I4.x
83 #define i22_im I4.y
84 #define i30_re I4.z
85 #define i30_im I4.w
86 #define i31_re I5.x
87 #define i31_im I5.y
88 #define i32_re I5.z
89 #define i32_im I5.w
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
115 
116 // gauge link
117 #ifdef GAUGE_FLOAT2
118 #define g00_re G0.x
119 #define g00_im G0.y
120 #define g01_re G1.x
121 #define g01_im G1.y
122 #define g02_re G2.x
123 #define g02_im G2.y
124 #define g10_re G3.x
125 #define g10_im G3.y
126 #define g11_re G4.x
127 #define g11_im G4.y
128 #define g12_re G5.x
129 #define g12_im G5.y
130 #define g20_re G6.x
131 #define g20_im G6.y
132 #define g21_re G7.x
133 #define g21_im G7.y
134 #define g22_re G8.x
135 #define g22_im G8.y
136 
137 #else
138 #define g00_re G0.x
139 #define g00_im G0.y
140 #define g01_re G0.z
141 #define g01_im G0.w
142 #define g02_re G1.x
143 #define g02_im G1.y
144 #define g10_re G1.z
145 #define g10_im G1.w
146 #define g11_re G2.x
147 #define g11_im G2.y
148 #define g12_re G2.z
149 #define g12_im G2.w
150 #define g20_re G3.x
151 #define g20_im G3.y
152 #define g21_re G3.z
153 #define g21_im G3.w
154 #define g22_re G4.x
155 #define g22_im G4.y
156 
157 #endif // GAUGE_DOUBLE
158 
159 // conjugated gauge link
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
178 
179 // first chiral block of inverted clover term
180 #ifdef CLOVER_DOUBLE
181 #define c00_00_re C0.x
182 #define c01_01_re C0.y
183 #define c02_02_re C1.x
184 #define c10_10_re C1.y
185 #define c11_11_re C2.x
186 #define c12_12_re C2.y
187 #define c01_00_re C3.x
188 #define c01_00_im C3.y
189 #define c02_00_re C4.x
190 #define c02_00_im C4.y
191 #define c10_00_re C5.x
192 #define c10_00_im C5.y
193 #define c11_00_re C6.x
194 #define c11_00_im C6.y
195 #define c12_00_re C7.x
196 #define c12_00_im C7.y
197 #define c02_01_re C8.x
198 #define c02_01_im C8.y
199 #define c10_01_re C9.x
200 #define c10_01_im C9.y
201 #define c11_01_re C10.x
202 #define c11_01_im C10.y
203 #define c12_01_re C11.x
204 #define c12_01_im C11.y
205 #define c10_02_re C12.x
206 #define c10_02_im C12.y
207 #define c11_02_re C13.x
208 #define c11_02_im C13.y
209 #define c12_02_re C14.x
210 #define c12_02_im C14.y
211 #define c11_10_re C15.x
212 #define c11_10_im C15.y
213 #define c12_10_re C16.x
214 #define c12_10_im C16.y
215 #define c12_11_re C17.x
216 #define c12_11_im C17.y
217 #else
218 #define c00_00_re C0.x
219 #define c01_01_re C0.y
220 #define c02_02_re C0.z
221 #define c10_10_re C0.w
222 #define c11_11_re C1.x
223 #define c12_12_re C1.y
224 #define c01_00_re C1.z
225 #define c01_00_im C1.w
226 #define c02_00_re C2.x
227 #define c02_00_im C2.y
228 #define c10_00_re C2.z
229 #define c10_00_im C2.w
230 #define c11_00_re C3.x
231 #define c11_00_im C3.y
232 #define c12_00_re C3.z
233 #define c12_00_im C3.w
234 #define c02_01_re C4.x
235 #define c02_01_im C4.y
236 #define c10_01_re C4.z
237 #define c10_01_im C4.w
238 #define c11_01_re C5.x
239 #define c11_01_im C5.y
240 #define c12_01_re C5.z
241 #define c12_01_im C5.w
242 #define c10_02_re C6.x
243 #define c10_02_im C6.y
244 #define c11_02_re C6.z
245 #define c11_02_im C6.w
246 #define c12_02_re C7.x
247 #define c12_02_im C7.y
248 #define c11_10_re C7.z
249 #define c11_10_im C7.w
250 #define c12_10_re C8.x
251 #define c12_10_im C8.y
252 #define c12_11_re C8.z
253 #define c12_11_im C8.w
254 #endif // CLOVER_DOUBLE
255 
256 #define c00_01_re (+c01_00_re)
257 #define c00_01_im (-c01_00_im)
258 #define c00_02_re (+c02_00_re)
259 #define c00_02_im (-c02_00_im)
260 #define c01_02_re (+c02_01_re)
261 #define c01_02_im (-c02_01_im)
262 #define c00_10_re (+c10_00_re)
263 #define c00_10_im (-c10_00_im)
264 #define c01_10_re (+c10_01_re)
265 #define c01_10_im (-c10_01_im)
266 #define c02_10_re (+c10_02_re)
267 #define c02_10_im (-c10_02_im)
268 #define c00_11_re (+c11_00_re)
269 #define c00_11_im (-c11_00_im)
270 #define c01_11_re (+c11_01_re)
271 #define c01_11_im (-c11_01_im)
272 #define c02_11_re (+c11_02_re)
273 #define c02_11_im (-c11_02_im)
274 #define c10_11_re (+c11_10_re)
275 #define c10_11_im (-c11_10_im)
276 #define c00_12_re (+c12_00_re)
277 #define c00_12_im (-c12_00_im)
278 #define c01_12_re (+c12_01_re)
279 #define c01_12_im (-c12_01_im)
280 #define c02_12_re (+c12_02_re)
281 #define c02_12_im (-c12_02_im)
282 #define c10_12_re (+c12_10_re)
283 #define c10_12_im (-c12_10_im)
284 #define c11_12_re (+c12_11_re)
285 #define c11_12_im (-c12_11_im)
286 
287 // second chiral block of inverted clover term (reuses C0,...,C9)
288 #define c20_20_re c00_00_re
289 #define c21_20_re c01_00_re
290 #define c21_20_im c01_00_im
291 #define c22_20_re c02_00_re
292 #define c22_20_im c02_00_im
293 #define c30_20_re c10_00_re
294 #define c30_20_im c10_00_im
295 #define c31_20_re c11_00_re
296 #define c31_20_im c11_00_im
297 #define c32_20_re c12_00_re
298 #define c32_20_im c12_00_im
299 #define c20_21_re c00_01_re
300 #define c20_21_im c00_01_im
301 #define c21_21_re c01_01_re
302 #define c22_21_re c02_01_re
303 #define c22_21_im c02_01_im
304 #define c30_21_re c10_01_re
305 #define c30_21_im c10_01_im
306 #define c31_21_re c11_01_re
307 #define c31_21_im c11_01_im
308 #define c32_21_re c12_01_re
309 #define c32_21_im c12_01_im
310 #define c20_22_re c00_02_re
311 #define c20_22_im c00_02_im
312 #define c21_22_re c01_02_re
313 #define c21_22_im c01_02_im
314 #define c22_22_re c02_02_re
315 #define c30_22_re c10_02_re
316 #define c30_22_im c10_02_im
317 #define c31_22_re c11_02_re
318 #define c31_22_im c11_02_im
319 #define c32_22_re c12_02_re
320 #define c32_22_im c12_02_im
321 #define c20_30_re c00_10_re
322 #define c20_30_im c00_10_im
323 #define c21_30_re c01_10_re
324 #define c21_30_im c01_10_im
325 #define c22_30_re c02_10_re
326 #define c22_30_im c02_10_im
327 #define c30_30_re c10_10_re
328 #define c31_30_re c11_10_re
329 #define c31_30_im c11_10_im
330 #define c32_30_re c12_10_re
331 #define c32_30_im c12_10_im
332 #define c20_31_re c00_11_re
333 #define c20_31_im c00_11_im
334 #define c21_31_re c01_11_re
335 #define c21_31_im c01_11_im
336 #define c22_31_re c02_11_re
337 #define c22_31_im c02_11_im
338 #define c30_31_re c10_11_re
339 #define c30_31_im c10_11_im
340 #define c31_31_re c11_11_re
341 #define c32_31_re c12_11_re
342 #define c32_31_im c12_11_im
343 #define c20_32_re c00_12_re
344 #define c20_32_im c00_12_im
345 #define c21_32_re c01_12_re
346 #define c21_32_im c01_12_im
347 #define c22_32_re c02_12_re
348 #define c22_32_im c02_12_im
349 #define c30_32_re c10_12_re
350 #define c30_32_im c10_12_im
351 #define c31_32_re c11_12_re
352 #define c31_32_im c11_12_im
353 #define c32_32_re c12_12_re
354 
355 // output spinor
380 
381 #include "read_gauge.h"
382 #include "read_clover.h"
383 #include "io_spinor.h"
384 
385 int x1, x2, x3, x4;
386 int X;
387 
388 #if (DD_PREC==2) // half precision
389 int sp_norm_idx;
390 #endif // half precision
391 
392 int sid;
393 
394 int dim;
395 int face_num;
396 int face_idx;
397 int Y[4] = {X1,X2,X3,X4};
398 int faceVolume[4];
399 faceVolume[0] = (X2*X3*X4)>>1;
400 faceVolume[1] = (X1*X3*X4)>>1;
401 faceVolume[2] = (X1*X2*X4)>>1;
402 faceVolume[3] = (X1*X2*X3)>>1;
403 
404 
405 
406 
407 
408  sid = blockIdx.x*blockDim.x + threadIdx.x;
409  if (sid >= param.threads) return;
410 
411  dim = dimFromFaceIndex(sid, param); // sid is also modified
412 
413 
414  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1); // volume of one face
415  face_num = (sid >= face_volume); // is this thread updating face 0 or 1
416  face_idx = sid - face_num*face_volume; // index into the respective face
417 
418  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
419  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
420  //sp_idx = face_idx + param.ghostOffset[dim];
421 
422 
423  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity,Y);
424 
425  {
426  bool active = false;
427  for(int dir=0; dir<4; ++dir){
428  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
429  }
430  if(!active) return;
431  }
432 
433 
434 
435  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
436 
437  o00_re = i00_re; o00_im = i00_im;
438  o01_re = i01_re; o01_im = i01_im;
439  o02_re = i02_re; o02_im = i02_im;
440  o10_re = i10_re; o10_im = i10_im;
441  o11_re = i11_re; o11_im = i11_im;
442  o12_re = i12_re; o12_im = i12_im;
443  o20_re = i20_re; o20_im = i20_im;
444  o21_re = i21_re; o21_im = i21_im;
445  o22_re = i22_re; o22_im = i22_im;
446  o30_re = i30_re; o30_im = i30_im;
447  o31_re = i31_re; o31_im = i31_im;
448  o32_re = i32_re; o32_im = i32_im;
449 
450 
451 if ( isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
452 {
453  // Projector P0-
454  // 1 0 0 -i
455  // 0 1 -i 0
456  // 0 i 1 0
457  // i 0 0 1
458 
459  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
460  const int sp_idx = face_idx + param.ghostOffset[0];
461 #if (DD_PREC==2)
462  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
463 #endif
464 
465  const int ga_idx = sid;
466 
473 
474 
475  const int sp_stride_pad = ghostFace[0];
476 
477  // read half spinor from device memory
478  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
479 
480  a0_re = i00_re; a0_im = i00_im;
481  a1_re = i01_re; a1_im = i01_im;
482  a2_re = i02_re; a2_im = i02_im;
483  b0_re = i10_re; b0_im = i10_im;
484  b1_re = i11_re; b1_im = i11_im;
485  b2_re = i12_re; b2_im = i12_im;
486 
487 
488  // read gauge matrix from device memory
489  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
490 
491  // reconstruct gauge matrix
493 
494  // multiply row 0
495  spinorFloat A0_re = 0;
496  A0_re += g00_re * a0_re;
497  A0_re -= g00_im * a0_im;
498  A0_re += g01_re * a1_re;
499  A0_re -= g01_im * a1_im;
500  A0_re += g02_re * a2_re;
501  A0_re -= g02_im * a2_im;
502  spinorFloat A0_im = 0;
503  A0_im += g00_re * a0_im;
504  A0_im += g00_im * a0_re;
505  A0_im += g01_re * a1_im;
506  A0_im += g01_im * a1_re;
507  A0_im += g02_re * a2_im;
508  A0_im += g02_im * a2_re;
509  spinorFloat B0_re = 0;
510  B0_re += g00_re * b0_re;
511  B0_re -= g00_im * b0_im;
512  B0_re += g01_re * b1_re;
513  B0_re -= g01_im * b1_im;
514  B0_re += g02_re * b2_re;
515  B0_re -= g02_im * b2_im;
516  spinorFloat B0_im = 0;
517  B0_im += g00_re * b0_im;
518  B0_im += g00_im * b0_re;
519  B0_im += g01_re * b1_im;
520  B0_im += g01_im * b1_re;
521  B0_im += g02_re * b2_im;
522  B0_im += g02_im * b2_re;
523 
524  // multiply row 1
525  spinorFloat A1_re = 0;
526  A1_re += g10_re * a0_re;
527  A1_re -= g10_im * a0_im;
528  A1_re += g11_re * a1_re;
529  A1_re -= g11_im * a1_im;
530  A1_re += g12_re * a2_re;
531  A1_re -= g12_im * a2_im;
532  spinorFloat A1_im = 0;
533  A1_im += g10_re * a0_im;
534  A1_im += g10_im * a0_re;
535  A1_im += g11_re * a1_im;
536  A1_im += g11_im * a1_re;
537  A1_im += g12_re * a2_im;
538  A1_im += g12_im * a2_re;
539  spinorFloat B1_re = 0;
540  B1_re += g10_re * b0_re;
541  B1_re -= g10_im * b0_im;
542  B1_re += g11_re * b1_re;
543  B1_re -= g11_im * b1_im;
544  B1_re += g12_re * b2_re;
545  B1_re -= g12_im * b2_im;
546  spinorFloat B1_im = 0;
547  B1_im += g10_re * b0_im;
548  B1_im += g10_im * b0_re;
549  B1_im += g11_re * b1_im;
550  B1_im += g11_im * b1_re;
551  B1_im += g12_re * b2_im;
552  B1_im += g12_im * b2_re;
553 
554  // multiply row 2
555  spinorFloat A2_re = 0;
556  A2_re += g20_re * a0_re;
557  A2_re -= g20_im * a0_im;
558  A2_re += g21_re * a1_re;
559  A2_re -= g21_im * a1_im;
560  A2_re += g22_re * a2_re;
561  A2_re -= g22_im * a2_im;
562  spinorFloat A2_im = 0;
563  A2_im += g20_re * a0_im;
564  A2_im += g20_im * a0_re;
565  A2_im += g21_re * a1_im;
566  A2_im += g21_im * a1_re;
567  A2_im += g22_re * a2_im;
568  A2_im += g22_im * a2_re;
569  spinorFloat B2_re = 0;
570  B2_re += g20_re * b0_re;
571  B2_re -= g20_im * b0_im;
572  B2_re += g21_re * b1_re;
573  B2_re -= g21_im * b1_im;
574  B2_re += g22_re * b2_re;
575  B2_re -= g22_im * b2_im;
576  spinorFloat B2_im = 0;
577  B2_im += g20_re * b0_im;
578  B2_im += g20_im * b0_re;
579  B2_im += g21_re * b1_im;
580  B2_im += g21_im * b1_re;
581  B2_im += g22_re * b2_im;
582  B2_im += g22_im * b2_re;
583 
584  o00_re += A0_re;
585  o00_im += A0_im;
586  o10_re += B0_re;
587  o10_im += B0_im;
588  o20_re -= B0_im;
589  o20_im += B0_re;
590  o30_re -= A0_im;
591  o30_im += A0_re;
592 
593  o01_re += A1_re;
594  o01_im += A1_im;
595  o11_re += B1_re;
596  o11_im += B1_im;
597  o21_re -= B1_im;
598  o21_im += B1_re;
599  o31_re -= A1_im;
600  o31_im += A1_re;
601 
602  o02_re += A2_re;
603  o02_im += A2_im;
604  o12_re += B2_re;
605  o12_im += B2_im;
606  o22_re -= B2_im;
607  o22_im += B2_re;
608  o32_re -= A2_im;
609  o32_im += A2_re;
610 
611 }
612 
613 if ( isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
614 {
615  // Projector P0+
616  // 1 0 0 i
617  // 0 1 i 0
618  // 0 -i 1 0
619  // -i 0 0 1
620 
621  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
622  const int sp_idx = face_idx + param.ghostOffset[0];
623 #if (DD_PREC==2)
624  sp_norm_idx = face_idx + param.ghostNormOffset[0];
625 #endif
626 
627  const int ga_idx = Vh+face_idx;
628 
635 
636 
637  const int sp_stride_pad = ghostFace[0];
638 
639  // read half spinor from device memory
640  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
641 
642  a0_re = i00_re; a0_im = i00_im;
643  a1_re = i01_re; a1_im = i01_im;
644  a2_re = i02_re; a2_im = i02_im;
645  b0_re = i10_re; b0_im = i10_im;
646  b1_re = i11_re; b1_im = i11_im;
647  b2_re = i12_re; b2_im = i12_im;
648 
649 
650  // read gauge matrix from device memory
651  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
652 
653  // reconstruct gauge matrix
655 
656  // multiply row 0
657  spinorFloat A0_re = 0;
658  A0_re += gT00_re * a0_re;
659  A0_re -= gT00_im * a0_im;
660  A0_re += gT01_re * a1_re;
661  A0_re -= gT01_im * a1_im;
662  A0_re += gT02_re * a2_re;
663  A0_re -= gT02_im * a2_im;
664  spinorFloat A0_im = 0;
665  A0_im += gT00_re * a0_im;
666  A0_im += gT00_im * a0_re;
667  A0_im += gT01_re * a1_im;
668  A0_im += gT01_im * a1_re;
669  A0_im += gT02_re * a2_im;
670  A0_im += gT02_im * a2_re;
671  spinorFloat B0_re = 0;
672  B0_re += gT00_re * b0_re;
673  B0_re -= gT00_im * b0_im;
674  B0_re += gT01_re * b1_re;
675  B0_re -= gT01_im * b1_im;
676  B0_re += gT02_re * b2_re;
677  B0_re -= gT02_im * b2_im;
678  spinorFloat B0_im = 0;
679  B0_im += gT00_re * b0_im;
680  B0_im += gT00_im * b0_re;
681  B0_im += gT01_re * b1_im;
682  B0_im += gT01_im * b1_re;
683  B0_im += gT02_re * b2_im;
684  B0_im += gT02_im * b2_re;
685 
686  // multiply row 1
687  spinorFloat A1_re = 0;
688  A1_re += gT10_re * a0_re;
689  A1_re -= gT10_im * a0_im;
690  A1_re += gT11_re * a1_re;
691  A1_re -= gT11_im * a1_im;
692  A1_re += gT12_re * a2_re;
693  A1_re -= gT12_im * a2_im;
694  spinorFloat A1_im = 0;
695  A1_im += gT10_re * a0_im;
696  A1_im += gT10_im * a0_re;
697  A1_im += gT11_re * a1_im;
698  A1_im += gT11_im * a1_re;
699  A1_im += gT12_re * a2_im;
700  A1_im += gT12_im * a2_re;
701  spinorFloat B1_re = 0;
702  B1_re += gT10_re * b0_re;
703  B1_re -= gT10_im * b0_im;
704  B1_re += gT11_re * b1_re;
705  B1_re -= gT11_im * b1_im;
706  B1_re += gT12_re * b2_re;
707  B1_re -= gT12_im * b2_im;
708  spinorFloat B1_im = 0;
709  B1_im += gT10_re * b0_im;
710  B1_im += gT10_im * b0_re;
711  B1_im += gT11_re * b1_im;
712  B1_im += gT11_im * b1_re;
713  B1_im += gT12_re * b2_im;
714  B1_im += gT12_im * b2_re;
715 
716  // multiply row 2
717  spinorFloat A2_re = 0;
718  A2_re += gT20_re * a0_re;
719  A2_re -= gT20_im * a0_im;
720  A2_re += gT21_re * a1_re;
721  A2_re -= gT21_im * a1_im;
722  A2_re += gT22_re * a2_re;
723  A2_re -= gT22_im * a2_im;
724  spinorFloat A2_im = 0;
725  A2_im += gT20_re * a0_im;
726  A2_im += gT20_im * a0_re;
727  A2_im += gT21_re * a1_im;
728  A2_im += gT21_im * a1_re;
729  A2_im += gT22_re * a2_im;
730  A2_im += gT22_im * a2_re;
731  spinorFloat B2_re = 0;
732  B2_re += gT20_re * b0_re;
733  B2_re -= gT20_im * b0_im;
734  B2_re += gT21_re * b1_re;
735  B2_re -= gT21_im * b1_im;
736  B2_re += gT22_re * b2_re;
737  B2_re -= gT22_im * b2_im;
738  spinorFloat B2_im = 0;
739  B2_im += gT20_re * b0_im;
740  B2_im += gT20_im * b0_re;
741  B2_im += gT21_re * b1_im;
742  B2_im += gT21_im * b1_re;
743  B2_im += gT22_re * b2_im;
744  B2_im += gT22_im * b2_re;
745 
746  o00_re += A0_re;
747  o00_im += A0_im;
748  o10_re += B0_re;
749  o10_im += B0_im;
750  o20_re += B0_im;
751  o20_im -= B0_re;
752  o30_re += A0_im;
753  o30_im -= A0_re;
754 
755  o01_re += A1_re;
756  o01_im += A1_im;
757  o11_re += B1_re;
758  o11_im += B1_im;
759  o21_re += B1_im;
760  o21_im -= B1_re;
761  o31_re += A1_im;
762  o31_im -= A1_re;
763 
764  o02_re += A2_re;
765  o02_im += A2_im;
766  o12_re += B2_re;
767  o12_im += B2_im;
768  o22_re += B2_im;
769  o22_im -= B2_re;
770  o32_re += A2_im;
771  o32_im -= A2_re;
772 
773 }
774 
775 if ( isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
776 {
777  // Projector P1-
778  // 1 0 0 -1
779  // 0 1 1 0
780  // 0 1 1 0
781  // -1 0 0 1
782 
783  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
784  const int sp_idx = face_idx + param.ghostOffset[1];
785 #if (DD_PREC==2)
786  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
787 #endif
788 
789  const int ga_idx = sid;
790 
797 
798 
799  const int sp_stride_pad = ghostFace[1];
800 
801  // read half spinor from device memory
802  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
803 
804  a0_re = i00_re; a0_im = i00_im;
805  a1_re = i01_re; a1_im = i01_im;
806  a2_re = i02_re; a2_im = i02_im;
807  b0_re = i10_re; b0_im = i10_im;
808  b1_re = i11_re; b1_im = i11_im;
809  b2_re = i12_re; b2_im = i12_im;
810 
811 
812  // read gauge matrix from device memory
813  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
814 
815  // reconstruct gauge matrix
817 
818  // multiply row 0
819  spinorFloat A0_re = 0;
820  A0_re += g00_re * a0_re;
821  A0_re -= g00_im * a0_im;
822  A0_re += g01_re * a1_re;
823  A0_re -= g01_im * a1_im;
824  A0_re += g02_re * a2_re;
825  A0_re -= g02_im * a2_im;
826  spinorFloat A0_im = 0;
827  A0_im += g00_re * a0_im;
828  A0_im += g00_im * a0_re;
829  A0_im += g01_re * a1_im;
830  A0_im += g01_im * a1_re;
831  A0_im += g02_re * a2_im;
832  A0_im += g02_im * a2_re;
833  spinorFloat B0_re = 0;
834  B0_re += g00_re * b0_re;
835  B0_re -= g00_im * b0_im;
836  B0_re += g01_re * b1_re;
837  B0_re -= g01_im * b1_im;
838  B0_re += g02_re * b2_re;
839  B0_re -= g02_im * b2_im;
840  spinorFloat B0_im = 0;
841  B0_im += g00_re * b0_im;
842  B0_im += g00_im * b0_re;
843  B0_im += g01_re * b1_im;
844  B0_im += g01_im * b1_re;
845  B0_im += g02_re * b2_im;
846  B0_im += g02_im * b2_re;
847 
848  // multiply row 1
849  spinorFloat A1_re = 0;
850  A1_re += g10_re * a0_re;
851  A1_re -= g10_im * a0_im;
852  A1_re += g11_re * a1_re;
853  A1_re -= g11_im * a1_im;
854  A1_re += g12_re * a2_re;
855  A1_re -= g12_im * a2_im;
856  spinorFloat A1_im = 0;
857  A1_im += g10_re * a0_im;
858  A1_im += g10_im * a0_re;
859  A1_im += g11_re * a1_im;
860  A1_im += g11_im * a1_re;
861  A1_im += g12_re * a2_im;
862  A1_im += g12_im * a2_re;
863  spinorFloat B1_re = 0;
864  B1_re += g10_re * b0_re;
865  B1_re -= g10_im * b0_im;
866  B1_re += g11_re * b1_re;
867  B1_re -= g11_im * b1_im;
868  B1_re += g12_re * b2_re;
869  B1_re -= g12_im * b2_im;
870  spinorFloat B1_im = 0;
871  B1_im += g10_re * b0_im;
872  B1_im += g10_im * b0_re;
873  B1_im += g11_re * b1_im;
874  B1_im += g11_im * b1_re;
875  B1_im += g12_re * b2_im;
876  B1_im += g12_im * b2_re;
877 
878  // multiply row 2
879  spinorFloat A2_re = 0;
880  A2_re += g20_re * a0_re;
881  A2_re -= g20_im * a0_im;
882  A2_re += g21_re * a1_re;
883  A2_re -= g21_im * a1_im;
884  A2_re += g22_re * a2_re;
885  A2_re -= g22_im * a2_im;
886  spinorFloat A2_im = 0;
887  A2_im += g20_re * a0_im;
888  A2_im += g20_im * a0_re;
889  A2_im += g21_re * a1_im;
890  A2_im += g21_im * a1_re;
891  A2_im += g22_re * a2_im;
892  A2_im += g22_im * a2_re;
893  spinorFloat B2_re = 0;
894  B2_re += g20_re * b0_re;
895  B2_re -= g20_im * b0_im;
896  B2_re += g21_re * b1_re;
897  B2_re -= g21_im * b1_im;
898  B2_re += g22_re * b2_re;
899  B2_re -= g22_im * b2_im;
900  spinorFloat B2_im = 0;
901  B2_im += g20_re * b0_im;
902  B2_im += g20_im * b0_re;
903  B2_im += g21_re * b1_im;
904  B2_im += g21_im * b1_re;
905  B2_im += g22_re * b2_im;
906  B2_im += g22_im * b2_re;
907 
908  o00_re += A0_re;
909  o00_im += A0_im;
910  o10_re += B0_re;
911  o10_im += B0_im;
912  o20_re += B0_re;
913  o20_im += B0_im;
914  o30_re -= A0_re;
915  o30_im -= A0_im;
916 
917  o01_re += A1_re;
918  o01_im += A1_im;
919  o11_re += B1_re;
920  o11_im += B1_im;
921  o21_re += B1_re;
922  o21_im += B1_im;
923  o31_re -= A1_re;
924  o31_im -= A1_im;
925 
926  o02_re += A2_re;
927  o02_im += A2_im;
928  o12_re += B2_re;
929  o12_im += B2_im;
930  o22_re += B2_re;
931  o22_im += B2_im;
932  o32_re -= A2_re;
933  o32_im -= A2_im;
934 
935 }
936 
937 if ( isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
938 {
939  // Projector P1+
940  // 1 0 0 1
941  // 0 1 -1 0
942  // 0 -1 1 0
943  // 1 0 0 1
944 
945  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
946  const int sp_idx = face_idx + param.ghostOffset[1];
947 #if (DD_PREC==2)
948  sp_norm_idx = face_idx + param.ghostNormOffset[1];
949 #endif
950 
951  const int ga_idx = Vh+face_idx;
952 
959 
960 
961  const int sp_stride_pad = ghostFace[1];
962 
963  // read half spinor from device memory
964  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
965 
966  a0_re = i00_re; a0_im = i00_im;
967  a1_re = i01_re; a1_im = i01_im;
968  a2_re = i02_re; a2_im = i02_im;
969  b0_re = i10_re; b0_im = i10_im;
970  b1_re = i11_re; b1_im = i11_im;
971  b2_re = i12_re; b2_im = i12_im;
972 
973 
974  // read gauge matrix from device memory
975  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
976 
977  // reconstruct gauge matrix
979 
980  // multiply row 0
981  spinorFloat A0_re = 0;
982  A0_re += gT00_re * a0_re;
983  A0_re -= gT00_im * a0_im;
984  A0_re += gT01_re * a1_re;
985  A0_re -= gT01_im * a1_im;
986  A0_re += gT02_re * a2_re;
987  A0_re -= gT02_im * a2_im;
988  spinorFloat A0_im = 0;
989  A0_im += gT00_re * a0_im;
990  A0_im += gT00_im * a0_re;
991  A0_im += gT01_re * a1_im;
992  A0_im += gT01_im * a1_re;
993  A0_im += gT02_re * a2_im;
994  A0_im += gT02_im * a2_re;
995  spinorFloat B0_re = 0;
996  B0_re += gT00_re * b0_re;
997  B0_re -= gT00_im * b0_im;
998  B0_re += gT01_re * b1_re;
999  B0_re -= gT01_im * b1_im;
1000  B0_re += gT02_re * b2_re;
1001  B0_re -= gT02_im * b2_im;
1002  spinorFloat B0_im = 0;
1003  B0_im += gT00_re * b0_im;
1004  B0_im += gT00_im * b0_re;
1005  B0_im += gT01_re * b1_im;
1006  B0_im += gT01_im * b1_re;
1007  B0_im += gT02_re * b2_im;
1008  B0_im += gT02_im * b2_re;
1009 
1010  // multiply row 1
1011  spinorFloat A1_re = 0;
1012  A1_re += gT10_re * a0_re;
1013  A1_re -= gT10_im * a0_im;
1014  A1_re += gT11_re * a1_re;
1015  A1_re -= gT11_im * a1_im;
1016  A1_re += gT12_re * a2_re;
1017  A1_re -= gT12_im * a2_im;
1018  spinorFloat A1_im = 0;
1019  A1_im += gT10_re * a0_im;
1020  A1_im += gT10_im * a0_re;
1021  A1_im += gT11_re * a1_im;
1022  A1_im += gT11_im * a1_re;
1023  A1_im += gT12_re * a2_im;
1024  A1_im += gT12_im * a2_re;
1025  spinorFloat B1_re = 0;
1026  B1_re += gT10_re * b0_re;
1027  B1_re -= gT10_im * b0_im;
1028  B1_re += gT11_re * b1_re;
1029  B1_re -= gT11_im * b1_im;
1030  B1_re += gT12_re * b2_re;
1031  B1_re -= gT12_im * b2_im;
1032  spinorFloat B1_im = 0;
1033  B1_im += gT10_re * b0_im;
1034  B1_im += gT10_im * b0_re;
1035  B1_im += gT11_re * b1_im;
1036  B1_im += gT11_im * b1_re;
1037  B1_im += gT12_re * b2_im;
1038  B1_im += gT12_im * b2_re;
1039 
1040  // multiply row 2
1041  spinorFloat A2_re = 0;
1042  A2_re += gT20_re * a0_re;
1043  A2_re -= gT20_im * a0_im;
1044  A2_re += gT21_re * a1_re;
1045  A2_re -= gT21_im * a1_im;
1046  A2_re += gT22_re * a2_re;
1047  A2_re -= gT22_im * a2_im;
1048  spinorFloat A2_im = 0;
1049  A2_im += gT20_re * a0_im;
1050  A2_im += gT20_im * a0_re;
1051  A2_im += gT21_re * a1_im;
1052  A2_im += gT21_im * a1_re;
1053  A2_im += gT22_re * a2_im;
1054  A2_im += gT22_im * a2_re;
1055  spinorFloat B2_re = 0;
1056  B2_re += gT20_re * b0_re;
1057  B2_re -= gT20_im * b0_im;
1058  B2_re += gT21_re * b1_re;
1059  B2_re -= gT21_im * b1_im;
1060  B2_re += gT22_re * b2_re;
1061  B2_re -= gT22_im * b2_im;
1062  spinorFloat B2_im = 0;
1063  B2_im += gT20_re * b0_im;
1064  B2_im += gT20_im * b0_re;
1065  B2_im += gT21_re * b1_im;
1066  B2_im += gT21_im * b1_re;
1067  B2_im += gT22_re * b2_im;
1068  B2_im += gT22_im * b2_re;
1069 
1070  o00_re += A0_re;
1071  o00_im += A0_im;
1072  o10_re += B0_re;
1073  o10_im += B0_im;
1074  o20_re -= B0_re;
1075  o20_im -= B0_im;
1076  o30_re += A0_re;
1077  o30_im += A0_im;
1078 
1079  o01_re += A1_re;
1080  o01_im += A1_im;
1081  o11_re += B1_re;
1082  o11_im += B1_im;
1083  o21_re -= B1_re;
1084  o21_im -= B1_im;
1085  o31_re += A1_re;
1086  o31_im += A1_im;
1087 
1088  o02_re += A2_re;
1089  o02_im += A2_im;
1090  o12_re += B2_re;
1091  o12_im += B2_im;
1092  o22_re -= B2_re;
1093  o22_im -= B2_im;
1094  o32_re += A2_re;
1095  o32_im += A2_im;
1096 
1097 }
1098 
1099 if ( isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
1100 {
1101  // Projector P2-
1102  // 1 0 -i 0
1103  // 0 1 0 i
1104  // i 0 1 0
1105  // 0 -i 0 1
1106 
1107  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1108  const int sp_idx = face_idx + param.ghostOffset[2];
1109 #if (DD_PREC==2)
1110  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
1111 #endif
1112 
1113  const int ga_idx = sid;
1114 
1121 
1122 
1123  const int sp_stride_pad = ghostFace[2];
1124 
1125  // read half spinor from device memory
1126  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1127 
1128  a0_re = i00_re; a0_im = i00_im;
1129  a1_re = i01_re; a1_im = i01_im;
1130  a2_re = i02_re; a2_im = i02_im;
1131  b0_re = i10_re; b0_im = i10_im;
1132  b1_re = i11_re; b1_im = i11_im;
1133  b2_re = i12_re; b2_im = i12_im;
1134 
1135 
1136  // read gauge matrix from device memory
1137  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1138 
1139  // reconstruct gauge matrix
1141 
1142  // multiply row 0
1143  spinorFloat A0_re = 0;
1144  A0_re += g00_re * a0_re;
1145  A0_re -= g00_im * a0_im;
1146  A0_re += g01_re * a1_re;
1147  A0_re -= g01_im * a1_im;
1148  A0_re += g02_re * a2_re;
1149  A0_re -= g02_im * a2_im;
1150  spinorFloat A0_im = 0;
1151  A0_im += g00_re * a0_im;
1152  A0_im += g00_im * a0_re;
1153  A0_im += g01_re * a1_im;
1154  A0_im += g01_im * a1_re;
1155  A0_im += g02_re * a2_im;
1156  A0_im += g02_im * a2_re;
1157  spinorFloat B0_re = 0;
1158  B0_re += g00_re * b0_re;
1159  B0_re -= g00_im * b0_im;
1160  B0_re += g01_re * b1_re;
1161  B0_re -= g01_im * b1_im;
1162  B0_re += g02_re * b2_re;
1163  B0_re -= g02_im * b2_im;
1164  spinorFloat B0_im = 0;
1165  B0_im += g00_re * b0_im;
1166  B0_im += g00_im * b0_re;
1167  B0_im += g01_re * b1_im;
1168  B0_im += g01_im * b1_re;
1169  B0_im += g02_re * b2_im;
1170  B0_im += g02_im * b2_re;
1171 
1172  // multiply row 1
1173  spinorFloat A1_re = 0;
1174  A1_re += g10_re * a0_re;
1175  A1_re -= g10_im * a0_im;
1176  A1_re += g11_re * a1_re;
1177  A1_re -= g11_im * a1_im;
1178  A1_re += g12_re * a2_re;
1179  A1_re -= g12_im * a2_im;
1180  spinorFloat A1_im = 0;
1181  A1_im += g10_re * a0_im;
1182  A1_im += g10_im * a0_re;
1183  A1_im += g11_re * a1_im;
1184  A1_im += g11_im * a1_re;
1185  A1_im += g12_re * a2_im;
1186  A1_im += g12_im * a2_re;
1187  spinorFloat B1_re = 0;
1188  B1_re += g10_re * b0_re;
1189  B1_re -= g10_im * b0_im;
1190  B1_re += g11_re * b1_re;
1191  B1_re -= g11_im * b1_im;
1192  B1_re += g12_re * b2_re;
1193  B1_re -= g12_im * b2_im;
1194  spinorFloat B1_im = 0;
1195  B1_im += g10_re * b0_im;
1196  B1_im += g10_im * b0_re;
1197  B1_im += g11_re * b1_im;
1198  B1_im += g11_im * b1_re;
1199  B1_im += g12_re * b2_im;
1200  B1_im += g12_im * b2_re;
1201 
1202  // multiply row 2
1203  spinorFloat A2_re = 0;
1204  A2_re += g20_re * a0_re;
1205  A2_re -= g20_im * a0_im;
1206  A2_re += g21_re * a1_re;
1207  A2_re -= g21_im * a1_im;
1208  A2_re += g22_re * a2_re;
1209  A2_re -= g22_im * a2_im;
1210  spinorFloat A2_im = 0;
1211  A2_im += g20_re * a0_im;
1212  A2_im += g20_im * a0_re;
1213  A2_im += g21_re * a1_im;
1214  A2_im += g21_im * a1_re;
1215  A2_im += g22_re * a2_im;
1216  A2_im += g22_im * a2_re;
1217  spinorFloat B2_re = 0;
1218  B2_re += g20_re * b0_re;
1219  B2_re -= g20_im * b0_im;
1220  B2_re += g21_re * b1_re;
1221  B2_re -= g21_im * b1_im;
1222  B2_re += g22_re * b2_re;
1223  B2_re -= g22_im * b2_im;
1224  spinorFloat B2_im = 0;
1225  B2_im += g20_re * b0_im;
1226  B2_im += g20_im * b0_re;
1227  B2_im += g21_re * b1_im;
1228  B2_im += g21_im * b1_re;
1229  B2_im += g22_re * b2_im;
1230  B2_im += g22_im * b2_re;
1231 
1232  o00_re += A0_re;
1233  o00_im += A0_im;
1234  o10_re += B0_re;
1235  o10_im += B0_im;
1236  o20_re -= A0_im;
1237  o20_im += A0_re;
1238  o30_re += B0_im;
1239  o30_im -= B0_re;
1240 
1241  o01_re += A1_re;
1242  o01_im += A1_im;
1243  o11_re += B1_re;
1244  o11_im += B1_im;
1245  o21_re -= A1_im;
1246  o21_im += A1_re;
1247  o31_re += B1_im;
1248  o31_im -= B1_re;
1249 
1250  o02_re += A2_re;
1251  o02_im += A2_im;
1252  o12_re += B2_re;
1253  o12_im += B2_im;
1254  o22_re -= A2_im;
1255  o22_im += A2_re;
1256  o32_re += B2_im;
1257  o32_im -= B2_re;
1258 
1259 }
1260 
1261 if ( isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1262 {
1263  // Projector P2+
1264  // 1 0 i 0
1265  // 0 1 0 -i
1266  // -i 0 1 0
1267  // 0 i 0 1
1268 
1269  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1270  const int sp_idx = face_idx + param.ghostOffset[2];
1271 #if (DD_PREC==2)
1272  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1273 #endif
1274 
1275  const int ga_idx = Vh+face_idx;
1276 
1283 
1284 
1285  const int sp_stride_pad = ghostFace[2];
1286 
1287  // read half spinor from device memory
1288  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1289 
1290  a0_re = i00_re; a0_im = i00_im;
1291  a1_re = i01_re; a1_im = i01_im;
1292  a2_re = i02_re; a2_im = i02_im;
1293  b0_re = i10_re; b0_im = i10_im;
1294  b1_re = i11_re; b1_im = i11_im;
1295  b2_re = i12_re; b2_im = i12_im;
1296 
1297 
1298  // read gauge matrix from device memory
1299  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1300 
1301  // reconstruct gauge matrix
1303 
1304  // multiply row 0
1305  spinorFloat A0_re = 0;
1306  A0_re += gT00_re * a0_re;
1307  A0_re -= gT00_im * a0_im;
1308  A0_re += gT01_re * a1_re;
1309  A0_re -= gT01_im * a1_im;
1310  A0_re += gT02_re * a2_re;
1311  A0_re -= gT02_im * a2_im;
1312  spinorFloat A0_im = 0;
1313  A0_im += gT00_re * a0_im;
1314  A0_im += gT00_im * a0_re;
1315  A0_im += gT01_re * a1_im;
1316  A0_im += gT01_im * a1_re;
1317  A0_im += gT02_re * a2_im;
1318  A0_im += gT02_im * a2_re;
1319  spinorFloat B0_re = 0;
1320  B0_re += gT00_re * b0_re;
1321  B0_re -= gT00_im * b0_im;
1322  B0_re += gT01_re * b1_re;
1323  B0_re -= gT01_im * b1_im;
1324  B0_re += gT02_re * b2_re;
1325  B0_re -= gT02_im * b2_im;
1326  spinorFloat B0_im = 0;
1327  B0_im += gT00_re * b0_im;
1328  B0_im += gT00_im * b0_re;
1329  B0_im += gT01_re * b1_im;
1330  B0_im += gT01_im * b1_re;
1331  B0_im += gT02_re * b2_im;
1332  B0_im += gT02_im * b2_re;
1333 
1334  // multiply row 1
1335  spinorFloat A1_re = 0;
1336  A1_re += gT10_re * a0_re;
1337  A1_re -= gT10_im * a0_im;
1338  A1_re += gT11_re * a1_re;
1339  A1_re -= gT11_im * a1_im;
1340  A1_re += gT12_re * a2_re;
1341  A1_re -= gT12_im * a2_im;
1342  spinorFloat A1_im = 0;
1343  A1_im += gT10_re * a0_im;
1344  A1_im += gT10_im * a0_re;
1345  A1_im += gT11_re * a1_im;
1346  A1_im += gT11_im * a1_re;
1347  A1_im += gT12_re * a2_im;
1348  A1_im += gT12_im * a2_re;
1349  spinorFloat B1_re = 0;
1350  B1_re += gT10_re * b0_re;
1351  B1_re -= gT10_im * b0_im;
1352  B1_re += gT11_re * b1_re;
1353  B1_re -= gT11_im * b1_im;
1354  B1_re += gT12_re * b2_re;
1355  B1_re -= gT12_im * b2_im;
1356  spinorFloat B1_im = 0;
1357  B1_im += gT10_re * b0_im;
1358  B1_im += gT10_im * b0_re;
1359  B1_im += gT11_re * b1_im;
1360  B1_im += gT11_im * b1_re;
1361  B1_im += gT12_re * b2_im;
1362  B1_im += gT12_im * b2_re;
1363 
1364  // multiply row 2
1365  spinorFloat A2_re = 0;
1366  A2_re += gT20_re * a0_re;
1367  A2_re -= gT20_im * a0_im;
1368  A2_re += gT21_re * a1_re;
1369  A2_re -= gT21_im * a1_im;
1370  A2_re += gT22_re * a2_re;
1371  A2_re -= gT22_im * a2_im;
1372  spinorFloat A2_im = 0;
1373  A2_im += gT20_re * a0_im;
1374  A2_im += gT20_im * a0_re;
1375  A2_im += gT21_re * a1_im;
1376  A2_im += gT21_im * a1_re;
1377  A2_im += gT22_re * a2_im;
1378  A2_im += gT22_im * a2_re;
1379  spinorFloat B2_re = 0;
1380  B2_re += gT20_re * b0_re;
1381  B2_re -= gT20_im * b0_im;
1382  B2_re += gT21_re * b1_re;
1383  B2_re -= gT21_im * b1_im;
1384  B2_re += gT22_re * b2_re;
1385  B2_re -= gT22_im * b2_im;
1386  spinorFloat B2_im = 0;
1387  B2_im += gT20_re * b0_im;
1388  B2_im += gT20_im * b0_re;
1389  B2_im += gT21_re * b1_im;
1390  B2_im += gT21_im * b1_re;
1391  B2_im += gT22_re * b2_im;
1392  B2_im += gT22_im * b2_re;
1393 
1394  o00_re += A0_re;
1395  o00_im += A0_im;
1396  o10_re += B0_re;
1397  o10_im += B0_im;
1398  o20_re += A0_im;
1399  o20_im -= A0_re;
1400  o30_re -= B0_im;
1401  o30_im += B0_re;
1402 
1403  o01_re += A1_re;
1404  o01_im += A1_im;
1405  o11_re += B1_re;
1406  o11_im += B1_im;
1407  o21_re += A1_im;
1408  o21_im -= A1_re;
1409  o31_re -= B1_im;
1410  o31_im += B1_re;
1411 
1412  o02_re += A2_re;
1413  o02_im += A2_im;
1414  o12_re += B2_re;
1415  o12_im += B2_im;
1416  o22_re += A2_im;
1417  o22_im -= A2_re;
1418  o32_re -= B2_im;
1419  o32_im += B2_re;
1420 
1421 }
1422 
1423 if ( isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1424 {
1425  // Projector P3-
1426  // 0 0 0 0
1427  // 0 0 0 0
1428  // 0 0 2 0
1429  // 0 0 0 2
1430 
1431  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1432  const int sp_idx = face_idx + param.ghostOffset[3];
1433 #if (DD_PREC==2)
1434  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1435 #endif
1436 
1437  const int ga_idx = sid;
1438 
1439  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1440  {
1447 
1448 
1449  const int sp_stride_pad = ghostFace[3];
1450  const int t_proj_scale = TPROJSCALE;
1451 
1452  // read half spinor from device memory
1453  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1454 
1455  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1456  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1457  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1458  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1459  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1460  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1461 
1462 
1463  // identity gauge matrix
1470 
1471  o20_re += A0_re;
1472  o20_im += A0_im;
1473  o30_re += B0_re;
1474  o30_im += B0_im;
1475 
1476  o21_re += A1_re;
1477  o21_im += A1_im;
1478  o31_re += B1_re;
1479  o31_im += B1_im;
1480 
1481  o22_re += A2_re;
1482  o22_im += A2_im;
1483  o32_re += B2_re;
1484  o32_im += B2_im;
1485 
1486  } else {
1493 
1494 
1495  const int sp_stride_pad = ghostFace[3];
1496  const int t_proj_scale = TPROJSCALE;
1497 
1498  // read half spinor from device memory
1499  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1500 
1501  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1502  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1503  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1504  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1505  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1506  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1507 
1508 
1509  // read gauge matrix from device memory
1510  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1511 
1512  // reconstruct gauge matrix
1514 
1515  // multiply row 0
1516  spinorFloat A0_re = 0;
1517  A0_re += g00_re * a0_re;
1518  A0_re -= g00_im * a0_im;
1519  A0_re += g01_re * a1_re;
1520  A0_re -= g01_im * a1_im;
1521  A0_re += g02_re * a2_re;
1522  A0_re -= g02_im * a2_im;
1523  spinorFloat A0_im = 0;
1524  A0_im += g00_re * a0_im;
1525  A0_im += g00_im * a0_re;
1526  A0_im += g01_re * a1_im;
1527  A0_im += g01_im * a1_re;
1528  A0_im += g02_re * a2_im;
1529  A0_im += g02_im * a2_re;
1530  spinorFloat B0_re = 0;
1531  B0_re += g00_re * b0_re;
1532  B0_re -= g00_im * b0_im;
1533  B0_re += g01_re * b1_re;
1534  B0_re -= g01_im * b1_im;
1535  B0_re += g02_re * b2_re;
1536  B0_re -= g02_im * b2_im;
1537  spinorFloat B0_im = 0;
1538  B0_im += g00_re * b0_im;
1539  B0_im += g00_im * b0_re;
1540  B0_im += g01_re * b1_im;
1541  B0_im += g01_im * b1_re;
1542  B0_im += g02_re * b2_im;
1543  B0_im += g02_im * b2_re;
1544 
1545  // multiply row 1
1546  spinorFloat A1_re = 0;
1547  A1_re += g10_re * a0_re;
1548  A1_re -= g10_im * a0_im;
1549  A1_re += g11_re * a1_re;
1550  A1_re -= g11_im * a1_im;
1551  A1_re += g12_re * a2_re;
1552  A1_re -= g12_im * a2_im;
1553  spinorFloat A1_im = 0;
1554  A1_im += g10_re * a0_im;
1555  A1_im += g10_im * a0_re;
1556  A1_im += g11_re * a1_im;
1557  A1_im += g11_im * a1_re;
1558  A1_im += g12_re * a2_im;
1559  A1_im += g12_im * a2_re;
1560  spinorFloat B1_re = 0;
1561  B1_re += g10_re * b0_re;
1562  B1_re -= g10_im * b0_im;
1563  B1_re += g11_re * b1_re;
1564  B1_re -= g11_im * b1_im;
1565  B1_re += g12_re * b2_re;
1566  B1_re -= g12_im * b2_im;
1567  spinorFloat B1_im = 0;
1568  B1_im += g10_re * b0_im;
1569  B1_im += g10_im * b0_re;
1570  B1_im += g11_re * b1_im;
1571  B1_im += g11_im * b1_re;
1572  B1_im += g12_re * b2_im;
1573  B1_im += g12_im * b2_re;
1574 
1575  // multiply row 2
1576  spinorFloat A2_re = 0;
1577  A2_re += g20_re * a0_re;
1578  A2_re -= g20_im * a0_im;
1579  A2_re += g21_re * a1_re;
1580  A2_re -= g21_im * a1_im;
1581  A2_re += g22_re * a2_re;
1582  A2_re -= g22_im * a2_im;
1583  spinorFloat A2_im = 0;
1584  A2_im += g20_re * a0_im;
1585  A2_im += g20_im * a0_re;
1586  A2_im += g21_re * a1_im;
1587  A2_im += g21_im * a1_re;
1588  A2_im += g22_re * a2_im;
1589  A2_im += g22_im * a2_re;
1590  spinorFloat B2_re = 0;
1591  B2_re += g20_re * b0_re;
1592  B2_re -= g20_im * b0_im;
1593  B2_re += g21_re * b1_re;
1594  B2_re -= g21_im * b1_im;
1595  B2_re += g22_re * b2_re;
1596  B2_re -= g22_im * b2_im;
1597  spinorFloat B2_im = 0;
1598  B2_im += g20_re * b0_im;
1599  B2_im += g20_im * b0_re;
1600  B2_im += g21_re * b1_im;
1601  B2_im += g21_im * b1_re;
1602  B2_im += g22_re * b2_im;
1603  B2_im += g22_im * b2_re;
1604 
1605  o20_re += A0_re;
1606  o20_im += A0_im;
1607  o30_re += B0_re;
1608  o30_im += B0_im;
1609 
1610  o21_re += A1_re;
1611  o21_im += A1_im;
1612  o31_re += B1_re;
1613  o31_im += B1_im;
1614 
1615  o22_re += A2_re;
1616  o22_im += A2_im;
1617  o32_re += B2_re;
1618  o32_im += B2_im;
1619 
1620  }
1621 }
1622 
1623 if ( isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1624 {
1625  // Projector P3+
1626  // 2 0 0 0
1627  // 0 2 0 0
1628  // 0 0 0 0
1629  // 0 0 0 0
1630 
1631  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1632  const int sp_idx = face_idx + param.ghostOffset[3];
1633 #if (DD_PREC==2)
1634  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1635 #endif
1636 
1637  const int ga_idx = Vh+face_idx;
1638 
1639  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1640  {
1647 
1648 
1649  const int sp_stride_pad = ghostFace[3];
1650  const int t_proj_scale = TPROJSCALE;
1651 
1652  // read half spinor from device memory
1653  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1654 
1655  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1656  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1657  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1658  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1659  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1660  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1661 
1662 
1663  // identity gauge matrix
1670 
1671  o00_re += A0_re;
1672  o00_im += A0_im;
1673  o10_re += B0_re;
1674  o10_im += B0_im;
1675 
1676  o01_re += A1_re;
1677  o01_im += A1_im;
1678  o11_re += B1_re;
1679  o11_im += B1_im;
1680 
1681  o02_re += A2_re;
1682  o02_im += A2_im;
1683  o12_re += B2_re;
1684  o12_im += B2_im;
1685 
1686  } else {
1693 
1694 
1695  const int sp_stride_pad = ghostFace[3];
1696  const int t_proj_scale = TPROJSCALE;
1697 
1698  // read half spinor from device memory
1699  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1700 
1701  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1702  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1703  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1704  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1705  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1706  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1707 
1708 
1709  // read gauge matrix from device memory
1710  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1711 
1712  // reconstruct gauge matrix
1714 
1715  // multiply row 0
1716  spinorFloat A0_re = 0;
1717  A0_re += gT00_re * a0_re;
1718  A0_re -= gT00_im * a0_im;
1719  A0_re += gT01_re * a1_re;
1720  A0_re -= gT01_im * a1_im;
1721  A0_re += gT02_re * a2_re;
1722  A0_re -= gT02_im * a2_im;
1723  spinorFloat A0_im = 0;
1724  A0_im += gT00_re * a0_im;
1725  A0_im += gT00_im * a0_re;
1726  A0_im += gT01_re * a1_im;
1727  A0_im += gT01_im * a1_re;
1728  A0_im += gT02_re * a2_im;
1729  A0_im += gT02_im * a2_re;
1730  spinorFloat B0_re = 0;
1731  B0_re += gT00_re * b0_re;
1732  B0_re -= gT00_im * b0_im;
1733  B0_re += gT01_re * b1_re;
1734  B0_re -= gT01_im * b1_im;
1735  B0_re += gT02_re * b2_re;
1736  B0_re -= gT02_im * b2_im;
1737  spinorFloat B0_im = 0;
1738  B0_im += gT00_re * b0_im;
1739  B0_im += gT00_im * b0_re;
1740  B0_im += gT01_re * b1_im;
1741  B0_im += gT01_im * b1_re;
1742  B0_im += gT02_re * b2_im;
1743  B0_im += gT02_im * b2_re;
1744 
1745  // multiply row 1
1746  spinorFloat A1_re = 0;
1747  A1_re += gT10_re * a0_re;
1748  A1_re -= gT10_im * a0_im;
1749  A1_re += gT11_re * a1_re;
1750  A1_re -= gT11_im * a1_im;
1751  A1_re += gT12_re * a2_re;
1752  A1_re -= gT12_im * a2_im;
1753  spinorFloat A1_im = 0;
1754  A1_im += gT10_re * a0_im;
1755  A1_im += gT10_im * a0_re;
1756  A1_im += gT11_re * a1_im;
1757  A1_im += gT11_im * a1_re;
1758  A1_im += gT12_re * a2_im;
1759  A1_im += gT12_im * a2_re;
1760  spinorFloat B1_re = 0;
1761  B1_re += gT10_re * b0_re;
1762  B1_re -= gT10_im * b0_im;
1763  B1_re += gT11_re * b1_re;
1764  B1_re -= gT11_im * b1_im;
1765  B1_re += gT12_re * b2_re;
1766  B1_re -= gT12_im * b2_im;
1767  spinorFloat B1_im = 0;
1768  B1_im += gT10_re * b0_im;
1769  B1_im += gT10_im * b0_re;
1770  B1_im += gT11_re * b1_im;
1771  B1_im += gT11_im * b1_re;
1772  B1_im += gT12_re * b2_im;
1773  B1_im += gT12_im * b2_re;
1774 
1775  // multiply row 2
1776  spinorFloat A2_re = 0;
1777  A2_re += gT20_re * a0_re;
1778  A2_re -= gT20_im * a0_im;
1779  A2_re += gT21_re * a1_re;
1780  A2_re -= gT21_im * a1_im;
1781  A2_re += gT22_re * a2_re;
1782  A2_re -= gT22_im * a2_im;
1783  spinorFloat A2_im = 0;
1784  A2_im += gT20_re * a0_im;
1785  A2_im += gT20_im * a0_re;
1786  A2_im += gT21_re * a1_im;
1787  A2_im += gT21_im * a1_re;
1788  A2_im += gT22_re * a2_im;
1789  A2_im += gT22_im * a2_re;
1790  spinorFloat B2_re = 0;
1791  B2_re += gT20_re * b0_re;
1792  B2_re -= gT20_im * b0_im;
1793  B2_re += gT21_re * b1_re;
1794  B2_re -= gT21_im * b1_im;
1795  B2_re += gT22_re * b2_re;
1796  B2_re -= gT22_im * b2_im;
1797  spinorFloat B2_im = 0;
1798  B2_im += gT20_re * b0_im;
1799  B2_im += gT20_im * b0_re;
1800  B2_im += gT21_re * b1_im;
1801  B2_im += gT21_im * b1_re;
1802  B2_im += gT22_re * b2_im;
1803  B2_im += gT22_im * b2_re;
1804 
1805  o00_re += A0_re;
1806  o00_im += A0_im;
1807  o10_re += B0_re;
1808  o10_im += B0_im;
1809 
1810  o01_re += A1_re;
1811  o01_im += A1_im;
1812  o11_re += B1_re;
1813  o11_im += B1_im;
1814 
1815  o02_re += A2_re;
1816  o02_im += A2_im;
1817  o12_re += B2_re;
1818  o12_im += B2_im;
1819 
1820  }
1821 }
1822 
1823 {
1824 #ifdef DSLASH_CLOVER
1825 
1826  // change to chiral basis
1827  {
1834  spinorFloat a30_re = o00_re - o20_re;
1835  spinorFloat a30_im = o00_im - o20_im;
1836 
1837  o00_re = a00_re; o00_im = a00_im;
1838  o10_re = a10_re; o10_im = a10_im;
1839  o20_re = a20_re; o20_im = a20_im;
1840  o30_re = a30_re; o30_im = a30_im;
1841  }
1842 
1843  {
1850  spinorFloat a31_re = o01_re - o21_re;
1851  spinorFloat a31_im = o01_im - o21_im;
1852 
1853  o01_re = a01_re; o01_im = a01_im;
1854  o11_re = a11_re; o11_im = a11_im;
1855  o21_re = a21_re; o21_im = a21_im;
1856  o31_re = a31_re; o31_im = a31_im;
1857  }
1858 
1859  {
1866  spinorFloat a32_re = o02_re - o22_re;
1867  spinorFloat a32_im = o02_im - o22_im;
1868 
1869  o02_re = a02_re; o02_im = a02_im;
1870  o12_re = a12_re; o12_im = a12_im;
1871  o22_re = a22_re; o22_im = a22_im;
1872  o32_re = a32_re; o32_im = a32_im;
1873  }
1874 
1875  // apply first chiral block
1876  {
1878 
1885 
1886  a00_re += c00_00_re * o00_re;
1887  a00_im += c00_00_re * o00_im;
1888  a00_re += c00_01_re * o01_re;
1889  a00_re -= c00_01_im * o01_im;
1890  a00_im += c00_01_re * o01_im;
1891  a00_im += c00_01_im * o01_re;
1892  a00_re += c00_02_re * o02_re;
1893  a00_re -= c00_02_im * o02_im;
1894  a00_im += c00_02_re * o02_im;
1895  a00_im += c00_02_im * o02_re;
1896  a00_re += c00_10_re * o10_re;
1897  a00_re -= c00_10_im * o10_im;
1898  a00_im += c00_10_re * o10_im;
1899  a00_im += c00_10_im * o10_re;
1900  a00_re += c00_11_re * o11_re;
1901  a00_re -= c00_11_im * o11_im;
1902  a00_im += c00_11_re * o11_im;
1903  a00_im += c00_11_im * o11_re;
1904  a00_re += c00_12_re * o12_re;
1905  a00_re -= c00_12_im * o12_im;
1906  a00_im += c00_12_re * o12_im;
1907  a00_im += c00_12_im * o12_re;
1908 
1909  a01_re += c01_00_re * o00_re;
1910  a01_re -= c01_00_im * o00_im;
1911  a01_im += c01_00_re * o00_im;
1912  a01_im += c01_00_im * o00_re;
1913  a01_re += c01_01_re * o01_re;
1914  a01_im += c01_01_re * o01_im;
1915  a01_re += c01_02_re * o02_re;
1916  a01_re -= c01_02_im * o02_im;
1917  a01_im += c01_02_re * o02_im;
1918  a01_im += c01_02_im * o02_re;
1919  a01_re += c01_10_re * o10_re;
1920  a01_re -= c01_10_im * o10_im;
1921  a01_im += c01_10_re * o10_im;
1922  a01_im += c01_10_im * o10_re;
1923  a01_re += c01_11_re * o11_re;
1924  a01_re -= c01_11_im * o11_im;
1925  a01_im += c01_11_re * o11_im;
1926  a01_im += c01_11_im * o11_re;
1927  a01_re += c01_12_re * o12_re;
1928  a01_re -= c01_12_im * o12_im;
1929  a01_im += c01_12_re * o12_im;
1930  a01_im += c01_12_im * o12_re;
1931 
1932  a02_re += c02_00_re * o00_re;
1933  a02_re -= c02_00_im * o00_im;
1934  a02_im += c02_00_re * o00_im;
1935  a02_im += c02_00_im * o00_re;
1936  a02_re += c02_01_re * o01_re;
1937  a02_re -= c02_01_im * o01_im;
1938  a02_im += c02_01_re * o01_im;
1939  a02_im += c02_01_im * o01_re;
1940  a02_re += c02_02_re * o02_re;
1941  a02_im += c02_02_re * o02_im;
1942  a02_re += c02_10_re * o10_re;
1943  a02_re -= c02_10_im * o10_im;
1944  a02_im += c02_10_re * o10_im;
1945  a02_im += c02_10_im * o10_re;
1946  a02_re += c02_11_re * o11_re;
1947  a02_re -= c02_11_im * o11_im;
1948  a02_im += c02_11_re * o11_im;
1949  a02_im += c02_11_im * o11_re;
1950  a02_re += c02_12_re * o12_re;
1951  a02_re -= c02_12_im * o12_im;
1952  a02_im += c02_12_re * o12_im;
1953  a02_im += c02_12_im * o12_re;
1954 
1955  a10_re += c10_00_re * o00_re;
1956  a10_re -= c10_00_im * o00_im;
1957  a10_im += c10_00_re * o00_im;
1958  a10_im += c10_00_im * o00_re;
1959  a10_re += c10_01_re * o01_re;
1960  a10_re -= c10_01_im * o01_im;
1961  a10_im += c10_01_re * o01_im;
1962  a10_im += c10_01_im * o01_re;
1963  a10_re += c10_02_re * o02_re;
1964  a10_re -= c10_02_im * o02_im;
1965  a10_im += c10_02_re * o02_im;
1966  a10_im += c10_02_im * o02_re;
1967  a10_re += c10_10_re * o10_re;
1968  a10_im += c10_10_re * o10_im;
1969  a10_re += c10_11_re * o11_re;
1970  a10_re -= c10_11_im * o11_im;
1971  a10_im += c10_11_re * o11_im;
1972  a10_im += c10_11_im * o11_re;
1973  a10_re += c10_12_re * o12_re;
1974  a10_re -= c10_12_im * o12_im;
1975  a10_im += c10_12_re * o12_im;
1976  a10_im += c10_12_im * o12_re;
1977 
1978  a11_re += c11_00_re * o00_re;
1979  a11_re -= c11_00_im * o00_im;
1980  a11_im += c11_00_re * o00_im;
1981  a11_im += c11_00_im * o00_re;
1982  a11_re += c11_01_re * o01_re;
1983  a11_re -= c11_01_im * o01_im;
1984  a11_im += c11_01_re * o01_im;
1985  a11_im += c11_01_im * o01_re;
1986  a11_re += c11_02_re * o02_re;
1987  a11_re -= c11_02_im * o02_im;
1988  a11_im += c11_02_re * o02_im;
1989  a11_im += c11_02_im * o02_re;
1990  a11_re += c11_10_re * o10_re;
1991  a11_re -= c11_10_im * o10_im;
1992  a11_im += c11_10_re * o10_im;
1993  a11_im += c11_10_im * o10_re;
1994  a11_re += c11_11_re * o11_re;
1995  a11_im += c11_11_re * o11_im;
1996  a11_re += c11_12_re * o12_re;
1997  a11_re -= c11_12_im * o12_im;
1998  a11_im += c11_12_re * o12_im;
1999  a11_im += c11_12_im * o12_re;
2000 
2001  a12_re += c12_00_re * o00_re;
2002  a12_re -= c12_00_im * o00_im;
2003  a12_im += c12_00_re * o00_im;
2004  a12_im += c12_00_im * o00_re;
2005  a12_re += c12_01_re * o01_re;
2006  a12_re -= c12_01_im * o01_im;
2007  a12_im += c12_01_re * o01_im;
2008  a12_im += c12_01_im * o01_re;
2009  a12_re += c12_02_re * o02_re;
2010  a12_re -= c12_02_im * o02_im;
2011  a12_im += c12_02_re * o02_im;
2012  a12_im += c12_02_im * o02_re;
2013  a12_re += c12_10_re * o10_re;
2014  a12_re -= c12_10_im * o10_im;
2015  a12_im += c12_10_re * o10_im;
2016  a12_im += c12_10_im * o10_re;
2017  a12_re += c12_11_re * o11_re;
2018  a12_re -= c12_11_im * o11_im;
2019  a12_im += c12_11_re * o11_im;
2020  a12_im += c12_11_im * o11_re;
2021  a12_re += c12_12_re * o12_re;
2022  a12_im += c12_12_re * o12_im;
2023 
2024  o00_re = a00_re; o00_im = a00_im;
2025  o01_re = a01_re; o01_im = a01_im;
2026  o02_re = a02_re; o02_im = a02_im;
2027  o10_re = a10_re; o10_im = a10_im;
2028  o11_re = a11_re; o11_im = a11_im;
2029  o12_re = a12_re; o12_im = a12_im;
2030 
2031  }
2032 
2033  // apply second chiral block
2034  {
2036 
2040  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2041  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2042  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2043 
2044  a20_re += c20_20_re * o20_re;
2045  a20_im += c20_20_re * o20_im;
2046  a20_re += c20_21_re * o21_re;
2047  a20_re -= c20_21_im * o21_im;
2048  a20_im += c20_21_re * o21_im;
2049  a20_im += c20_21_im * o21_re;
2050  a20_re += c20_22_re * o22_re;
2051  a20_re -= c20_22_im * o22_im;
2052  a20_im += c20_22_re * o22_im;
2053  a20_im += c20_22_im * o22_re;
2054  a20_re += c20_30_re * o30_re;
2055  a20_re -= c20_30_im * o30_im;
2056  a20_im += c20_30_re * o30_im;
2057  a20_im += c20_30_im * o30_re;
2058  a20_re += c20_31_re * o31_re;
2059  a20_re -= c20_31_im * o31_im;
2060  a20_im += c20_31_re * o31_im;
2061  a20_im += c20_31_im * o31_re;
2062  a20_re += c20_32_re * o32_re;
2063  a20_re -= c20_32_im * o32_im;
2064  a20_im += c20_32_re * o32_im;
2065  a20_im += c20_32_im * o32_re;
2066 
2067  a21_re += c21_20_re * o20_re;
2068  a21_re -= c21_20_im * o20_im;
2069  a21_im += c21_20_re * o20_im;
2070  a21_im += c21_20_im * o20_re;
2071  a21_re += c21_21_re * o21_re;
2072  a21_im += c21_21_re * o21_im;
2073  a21_re += c21_22_re * o22_re;
2074  a21_re -= c21_22_im * o22_im;
2075  a21_im += c21_22_re * o22_im;
2076  a21_im += c21_22_im * o22_re;
2077  a21_re += c21_30_re * o30_re;
2078  a21_re -= c21_30_im * o30_im;
2079  a21_im += c21_30_re * o30_im;
2080  a21_im += c21_30_im * o30_re;
2081  a21_re += c21_31_re * o31_re;
2082  a21_re -= c21_31_im * o31_im;
2083  a21_im += c21_31_re * o31_im;
2084  a21_im += c21_31_im * o31_re;
2085  a21_re += c21_32_re * o32_re;
2086  a21_re -= c21_32_im * o32_im;
2087  a21_im += c21_32_re * o32_im;
2088  a21_im += c21_32_im * o32_re;
2089 
2090  a22_re += c22_20_re * o20_re;
2091  a22_re -= c22_20_im * o20_im;
2092  a22_im += c22_20_re * o20_im;
2093  a22_im += c22_20_im * o20_re;
2094  a22_re += c22_21_re * o21_re;
2095  a22_re -= c22_21_im * o21_im;
2096  a22_im += c22_21_re * o21_im;
2097  a22_im += c22_21_im * o21_re;
2098  a22_re += c22_22_re * o22_re;
2099  a22_im += c22_22_re * o22_im;
2100  a22_re += c22_30_re * o30_re;
2101  a22_re -= c22_30_im * o30_im;
2102  a22_im += c22_30_re * o30_im;
2103  a22_im += c22_30_im * o30_re;
2104  a22_re += c22_31_re * o31_re;
2105  a22_re -= c22_31_im * o31_im;
2106  a22_im += c22_31_re * o31_im;
2107  a22_im += c22_31_im * o31_re;
2108  a22_re += c22_32_re * o32_re;
2109  a22_re -= c22_32_im * o32_im;
2110  a22_im += c22_32_re * o32_im;
2111  a22_im += c22_32_im * o32_re;
2112 
2113  a30_re += c30_20_re * o20_re;
2114  a30_re -= c30_20_im * o20_im;
2115  a30_im += c30_20_re * o20_im;
2116  a30_im += c30_20_im * o20_re;
2117  a30_re += c30_21_re * o21_re;
2118  a30_re -= c30_21_im * o21_im;
2119  a30_im += c30_21_re * o21_im;
2120  a30_im += c30_21_im * o21_re;
2121  a30_re += c30_22_re * o22_re;
2122  a30_re -= c30_22_im * o22_im;
2123  a30_im += c30_22_re * o22_im;
2124  a30_im += c30_22_im * o22_re;
2125  a30_re += c30_30_re * o30_re;
2126  a30_im += c30_30_re * o30_im;
2127  a30_re += c30_31_re * o31_re;
2128  a30_re -= c30_31_im * o31_im;
2129  a30_im += c30_31_re * o31_im;
2130  a30_im += c30_31_im * o31_re;
2131  a30_re += c30_32_re * o32_re;
2132  a30_re -= c30_32_im * o32_im;
2133  a30_im += c30_32_re * o32_im;
2134  a30_im += c30_32_im * o32_re;
2135 
2136  a31_re += c31_20_re * o20_re;
2137  a31_re -= c31_20_im * o20_im;
2138  a31_im += c31_20_re * o20_im;
2139  a31_im += c31_20_im * o20_re;
2140  a31_re += c31_21_re * o21_re;
2141  a31_re -= c31_21_im * o21_im;
2142  a31_im += c31_21_re * o21_im;
2143  a31_im += c31_21_im * o21_re;
2144  a31_re += c31_22_re * o22_re;
2145  a31_re -= c31_22_im * o22_im;
2146  a31_im += c31_22_re * o22_im;
2147  a31_im += c31_22_im * o22_re;
2148  a31_re += c31_30_re * o30_re;
2149  a31_re -= c31_30_im * o30_im;
2150  a31_im += c31_30_re * o30_im;
2151  a31_im += c31_30_im * o30_re;
2152  a31_re += c31_31_re * o31_re;
2153  a31_im += c31_31_re * o31_im;
2154  a31_re += c31_32_re * o32_re;
2155  a31_re -= c31_32_im * o32_im;
2156  a31_im += c31_32_re * o32_im;
2157  a31_im += c31_32_im * o32_re;
2158 
2159  a32_re += c32_20_re * o20_re;
2160  a32_re -= c32_20_im * o20_im;
2161  a32_im += c32_20_re * o20_im;
2162  a32_im += c32_20_im * o20_re;
2163  a32_re += c32_21_re * o21_re;
2164  a32_re -= c32_21_im * o21_im;
2165  a32_im += c32_21_re * o21_im;
2166  a32_im += c32_21_im * o21_re;
2167  a32_re += c32_22_re * o22_re;
2168  a32_re -= c32_22_im * o22_im;
2169  a32_im += c32_22_re * o22_im;
2170  a32_im += c32_22_im * o22_re;
2171  a32_re += c32_30_re * o30_re;
2172  a32_re -= c32_30_im * o30_im;
2173  a32_im += c32_30_re * o30_im;
2174  a32_im += c32_30_im * o30_re;
2175  a32_re += c32_31_re * o31_re;
2176  a32_re -= c32_31_im * o31_im;
2177  a32_im += c32_31_re * o31_im;
2178  a32_im += c32_31_im * o31_re;
2179  a32_re += c32_32_re * o32_re;
2180  a32_im += c32_32_re * o32_im;
2181 
2182  o20_re = a20_re; o20_im = a20_im;
2183  o21_re = a21_re; o21_im = a21_im;
2184  o22_re = a22_re; o22_im = a22_im;
2185  o30_re = a30_re; o30_im = a30_im;
2186  o31_re = a31_re; o31_im = a31_im;
2187  o32_re = a32_re; o32_im = a32_im;
2188 
2189  }
2190 
2191  // change back from chiral basis
2192  // (note: required factor of 1/2 is included in clover term normalization)
2193  {
2194  spinorFloat a00_re = o10_re + o30_re;
2195  spinorFloat a00_im = o10_im + o30_im;
2196  spinorFloat a10_re = -o00_re - o20_re;
2197  spinorFloat a10_im = -o00_im - o20_im;
2198  spinorFloat a20_re = o10_re - o30_re;
2199  spinorFloat a20_im = o10_im - o30_im;
2200  spinorFloat a30_re = -o00_re + o20_re;
2201  spinorFloat a30_im = -o00_im + o20_im;
2202 
2203  o00_re = a00_re; o00_im = a00_im;
2204  o10_re = a10_re; o10_im = a10_im;
2205  o20_re = a20_re; o20_im = a20_im;
2206  o30_re = a30_re; o30_im = a30_im;
2207  }
2208 
2209  {
2210  spinorFloat a01_re = o11_re + o31_re;
2211  spinorFloat a01_im = o11_im + o31_im;
2212  spinorFloat a11_re = -o01_re - o21_re;
2213  spinorFloat a11_im = -o01_im - o21_im;
2214  spinorFloat a21_re = o11_re - o31_re;
2215  spinorFloat a21_im = o11_im - o31_im;
2216  spinorFloat a31_re = -o01_re + o21_re;
2217  spinorFloat a31_im = -o01_im + o21_im;
2218 
2219  o01_re = a01_re; o01_im = a01_im;
2220  o11_re = a11_re; o11_im = a11_im;
2221  o21_re = a21_re; o21_im = a21_im;
2222  o31_re = a31_re; o31_im = a31_im;
2223  }
2224 
2225  {
2226  spinorFloat a02_re = o12_re + o32_re;
2227  spinorFloat a02_im = o12_im + o32_im;
2228  spinorFloat a12_re = -o02_re - o22_re;
2229  spinorFloat a12_im = -o02_im - o22_im;
2230  spinorFloat a22_re = o12_re - o32_re;
2231  spinorFloat a22_im = o12_im - o32_im;
2232  spinorFloat a32_re = -o02_re + o22_re;
2233  spinorFloat a32_im = -o02_im + o22_im;
2234 
2235  o02_re = a02_re; o02_im = a02_im;
2236  o12_re = a12_re; o12_im = a12_im;
2237  o22_re = a22_re; o22_im = a22_im;
2238  o32_re = a32_re; o32_im = a32_im;
2239  }
2240 
2241 #endif // DSLASH_CLOVER
2242 
2243 #ifdef DSLASH_XPAY
2244 
2245  READ_ACCUM(ACCUMTEX, param.sp_stride)
2246 
2247  o00_re = a*o00_re+acc00_re;
2248  o00_im = a*o00_im+acc00_im;
2249  o01_re = a*o01_re+acc01_re;
2250  o01_im = a*o01_im+acc01_im;
2251  o02_re = a*o02_re+acc02_re;
2252  o02_im = a*o02_im+acc02_im;
2253  o10_re = a*o10_re+acc10_re;
2254  o10_im = a*o10_im+acc10_im;
2255  o11_re = a*o11_re+acc11_re;
2256  o11_im = a*o11_im+acc11_im;
2257  o12_re = a*o12_re+acc12_re;
2258  o12_im = a*o12_im+acc12_im;
2259  o20_re = a*o20_re+acc20_re;
2260  o20_im = a*o20_im+acc20_im;
2261  o21_re = a*o21_re+acc21_re;
2262  o21_im = a*o21_im+acc21_im;
2263  o22_re = a*o22_re+acc22_re;
2264  o22_im = a*o22_im+acc22_im;
2265  o30_re = a*o30_re+acc30_re;
2266  o30_im = a*o30_im+acc30_im;
2267  o31_re = a*o31_re+acc31_re;
2268  o31_im = a*o31_im+acc31_im;
2269  o32_re = a*o32_re+acc32_re;
2270  o32_im = a*o32_im+acc32_im;
2271 #endif // DSLASH_XPAY
2272 }
2273 
2274 // write spinor field back to device memory
2275 WRITE_SPINOR(param.sp_stride);
2276 
2277 // undefine to prevent warning when precision is changed
2278 #undef spinorFloat
2279 #undef g00_re
2280 #undef g00_im
2281 #undef g01_re
2282 #undef g01_im
2283 #undef g02_re
2284 #undef g02_im
2285 #undef g10_re
2286 #undef g10_im
2287 #undef g11_re
2288 #undef g11_im
2289 #undef g12_re
2290 #undef g12_im
2291 #undef g20_re
2292 #undef g20_im
2293 #undef g21_re
2294 #undef g21_im
2295 #undef g22_re
2296 #undef g22_im
2297 
2298 #undef i00_re
2299 #undef i00_im
2300 #undef i01_re
2301 #undef i01_im
2302 #undef i02_re
2303 #undef i02_im
2304 #undef i10_re
2305 #undef i10_im
2306 #undef i11_re
2307 #undef i11_im
2308 #undef i12_re
2309 #undef i12_im
2310 #undef i20_re
2311 #undef i20_im
2312 #undef i21_re
2313 #undef i21_im
2314 #undef i22_re
2315 #undef i22_im
2316 #undef i30_re
2317 #undef i30_im
2318 #undef i31_re
2319 #undef i31_im
2320 #undef i32_re
2321 #undef i32_im
2322 
2323 #undef acc00_re
2324 #undef acc00_im
2325 #undef acc01_re
2326 #undef acc01_im
2327 #undef acc02_re
2328 #undef acc02_im
2329 #undef acc10_re
2330 #undef acc10_im
2331 #undef acc11_re
2332 #undef acc11_im
2333 #undef acc12_re
2334 #undef acc12_im
2335 #undef acc20_re
2336 #undef acc20_im
2337 #undef acc21_re
2338 #undef acc21_im
2339 #undef acc22_re
2340 #undef acc22_im
2341 #undef acc30_re
2342 #undef acc30_im
2343 #undef acc31_re
2344 #undef acc31_im
2345 #undef acc32_re
2346 #undef acc32_im
2347 
2348 #undef c00_00_re
2349 #undef c01_01_re
2350 #undef c02_02_re
2351 #undef c10_10_re
2352 #undef c11_11_re
2353 #undef c12_12_re
2354 #undef c01_00_re
2355 #undef c01_00_im
2356 #undef c02_00_re
2357 #undef c02_00_im
2358 #undef c10_00_re
2359 #undef c10_00_im
2360 #undef c11_00_re
2361 #undef c11_00_im
2362 #undef c12_00_re
2363 #undef c12_00_im
2364 #undef c02_01_re
2365 #undef c02_01_im
2366 #undef c10_01_re
2367 #undef c10_01_im
2368 #undef c11_01_re
2369 #undef c11_01_im
2370 #undef c12_01_re
2371 #undef c12_01_im
2372 #undef c10_02_re
2373 #undef c10_02_im
2374 #undef c11_02_re
2375 #undef c11_02_im
2376 #undef c12_02_re
2377 #undef c12_02_im
2378 #undef c11_10_re
2379 #undef c11_10_im
2380 #undef c12_10_re
2381 #undef c12_10_im
2382 #undef c12_11_re
2383 #undef c12_11_im
2384 
2385 
2386 #undef VOLATILE
2387 
2388 #endif // MULTI_GPU
__constant__ int Vh
#define a22_re
Definition: llfat_core.h:131
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
#define CLOVERTEX
Definition: clover_def.h:101
#define a02_im
Definition: llfat_core.h:120
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define a22_im
Definition: llfat_core.h:132
#define a01_re
Definition: llfat_core.h:117
#define a02_re
Definition: llfat_core.h:119
#define a20_re
Definition: llfat_core.h:127
#define o31_im
Definition: gamma5.h:293
#define a12_im
Definition: llfat_core.h:126
#define a20_im
Definition: llfat_core.h:128
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
#define a01_im
Definition: llfat_core.h:118
#define a12_re
Definition: llfat_core.h:125
#define GAUGE0TEX
Definition: covDev.h:112
#define a11_re
Definition: llfat_core.h:123
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
#define a00_re
Definition: llfat_core.h:115
__constant__ int X1m1
__constant__ int X3
#define a11_im
Definition: llfat_core.h:124
#define a10_re
Definition: llfat_core.h:121
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
#define a10_im
Definition: llfat_core.h:122
__constant__ int X4m1
#define a21_re
Definition: llfat_core.h:129
#define WRITE_SPINOR
Definition: clover_def.h:48
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
#define a21_im
Definition: llfat_core.h:130
#define READ_CLOVER
Definition: clover_def.h:103
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
#define a00_im
Definition: llfat_core.h:116