QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
wilson_fused_exterior_dslash_g80_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
4 
5 
6 #ifdef MULTI_GPU
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define i00_re I0.x
67 #define i00_im I0.y
68 #define i01_re I0.z
69 #define i01_im I0.w
70 #define i02_re I1.x
71 #define i02_im I1.y
72 #define i10_re I1.z
73 #define i10_im I1.w
74 #define i11_re I2.x
75 #define i11_im I2.y
76 #define i12_re I2.z
77 #define i12_im I2.w
78 #define i20_re I3.x
79 #define i20_im I3.y
80 #define i21_re I3.z
81 #define i21_im I3.w
82 #define i22_re I4.x
83 #define i22_im I4.y
84 #define i30_re I4.z
85 #define i30_im I4.w
86 #define i31_re I5.x
87 #define i31_im I5.y
88 #define i32_re I5.z
89 #define i32_im I5.w
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
115 
116 // gauge link
117 #ifdef GAUGE_FLOAT2
118 #define g00_re G0.x
119 #define g00_im G0.y
120 #define g01_re G1.x
121 #define g01_im G1.y
122 #define g02_re G2.x
123 #define g02_im G2.y
124 #define g10_re G3.x
125 #define g10_im G3.y
126 #define g11_re G4.x
127 #define g11_im G4.y
128 #define g12_re G5.x
129 #define g12_im G5.y
130 #define g20_re G6.x
131 #define g20_im G6.y
132 #define g21_re G7.x
133 #define g21_im G7.y
134 #define g22_re G8.x
135 #define g22_im G8.y
136 
137 #else
138 #define g00_re G0.x
139 #define g00_im G0.y
140 #define g01_re G0.z
141 #define g01_im G0.w
142 #define g02_re G1.x
143 #define g02_im G1.y
144 #define g10_re G1.z
145 #define g10_im G1.w
146 #define g11_re G2.x
147 #define g11_im G2.y
148 #define g12_re G2.z
149 #define g12_im G2.w
150 #define g20_re G3.x
151 #define g20_im G3.y
152 #define g21_re G3.z
153 #define g21_im G3.w
154 #define g22_re G4.x
155 #define g22_im G4.y
156 
157 #endif // GAUGE_DOUBLE
158 
159 // conjugated gauge link
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
178 
179 // first chiral block of inverted clover term
180 #ifdef CLOVER_DOUBLE
181 #define c00_00_re C0.x
182 #define c01_01_re C0.y
183 #define c02_02_re C1.x
184 #define c10_10_re C1.y
185 #define c11_11_re C2.x
186 #define c12_12_re C2.y
187 #define c01_00_re C3.x
188 #define c01_00_im C3.y
189 #define c02_00_re C4.x
190 #define c02_00_im C4.y
191 #define c10_00_re C5.x
192 #define c10_00_im C5.y
193 #define c11_00_re C6.x
194 #define c11_00_im C6.y
195 #define c12_00_re C7.x
196 #define c12_00_im C7.y
197 #define c02_01_re C8.x
198 #define c02_01_im C8.y
199 #define c10_01_re C9.x
200 #define c10_01_im C9.y
201 #define c11_01_re C10.x
202 #define c11_01_im C10.y
203 #define c12_01_re C11.x
204 #define c12_01_im C11.y
205 #define c10_02_re C12.x
206 #define c10_02_im C12.y
207 #define c11_02_re C13.x
208 #define c11_02_im C13.y
209 #define c12_02_re C14.x
210 #define c12_02_im C14.y
211 #define c11_10_re C15.x
212 #define c11_10_im C15.y
213 #define c12_10_re C16.x
214 #define c12_10_im C16.y
215 #define c12_11_re C17.x
216 #define c12_11_im C17.y
217 #else
218 #define c00_00_re C0.x
219 #define c01_01_re C0.y
220 #define c02_02_re C0.z
221 #define c10_10_re C0.w
222 #define c11_11_re C1.x
223 #define c12_12_re C1.y
224 #define c01_00_re C1.z
225 #define c01_00_im C1.w
226 #define c02_00_re C2.x
227 #define c02_00_im C2.y
228 #define c10_00_re C2.z
229 #define c10_00_im C2.w
230 #define c11_00_re C3.x
231 #define c11_00_im C3.y
232 #define c12_00_re C3.z
233 #define c12_00_im C3.w
234 #define c02_01_re C4.x
235 #define c02_01_im C4.y
236 #define c10_01_re C4.z
237 #define c10_01_im C4.w
238 #define c11_01_re C5.x
239 #define c11_01_im C5.y
240 #define c12_01_re C5.z
241 #define c12_01_im C5.w
242 #define c10_02_re C6.x
243 #define c10_02_im C6.y
244 #define c11_02_re C6.z
245 #define c11_02_im C6.w
246 #define c12_02_re C7.x
247 #define c12_02_im C7.y
248 #define c11_10_re C7.z
249 #define c11_10_im C7.w
250 #define c12_10_re C8.x
251 #define c12_10_im C8.y
252 #define c12_11_re C8.z
253 #define c12_11_im C8.w
254 #endif // CLOVER_DOUBLE
255 
256 #define c00_01_re (+c01_00_re)
257 #define c00_01_im (-c01_00_im)
258 #define c00_02_re (+c02_00_re)
259 #define c00_02_im (-c02_00_im)
260 #define c01_02_re (+c02_01_re)
261 #define c01_02_im (-c02_01_im)
262 #define c00_10_re (+c10_00_re)
263 #define c00_10_im (-c10_00_im)
264 #define c01_10_re (+c10_01_re)
265 #define c01_10_im (-c10_01_im)
266 #define c02_10_re (+c10_02_re)
267 #define c02_10_im (-c10_02_im)
268 #define c00_11_re (+c11_00_re)
269 #define c00_11_im (-c11_00_im)
270 #define c01_11_re (+c11_01_re)
271 #define c01_11_im (-c11_01_im)
272 #define c02_11_re (+c11_02_re)
273 #define c02_11_im (-c11_02_im)
274 #define c10_11_re (+c11_10_re)
275 #define c10_11_im (-c11_10_im)
276 #define c00_12_re (+c12_00_re)
277 #define c00_12_im (-c12_00_im)
278 #define c01_12_re (+c12_01_re)
279 #define c01_12_im (-c12_01_im)
280 #define c02_12_re (+c12_02_re)
281 #define c02_12_im (-c12_02_im)
282 #define c10_12_re (+c12_10_re)
283 #define c10_12_im (-c12_10_im)
284 #define c11_12_re (+c12_11_re)
285 #define c11_12_im (-c12_11_im)
286 
287 // second chiral block of inverted clover term (reuses C0,...,C9)
288 #define c20_20_re c00_00_re
289 #define c21_20_re c01_00_re
290 #define c21_20_im c01_00_im
291 #define c22_20_re c02_00_re
292 #define c22_20_im c02_00_im
293 #define c30_20_re c10_00_re
294 #define c30_20_im c10_00_im
295 #define c31_20_re c11_00_re
296 #define c31_20_im c11_00_im
297 #define c32_20_re c12_00_re
298 #define c32_20_im c12_00_im
299 #define c20_21_re c00_01_re
300 #define c20_21_im c00_01_im
301 #define c21_21_re c01_01_re
302 #define c22_21_re c02_01_re
303 #define c22_21_im c02_01_im
304 #define c30_21_re c10_01_re
305 #define c30_21_im c10_01_im
306 #define c31_21_re c11_01_re
307 #define c31_21_im c11_01_im
308 #define c32_21_re c12_01_re
309 #define c32_21_im c12_01_im
310 #define c20_22_re c00_02_re
311 #define c20_22_im c00_02_im
312 #define c21_22_re c01_02_re
313 #define c21_22_im c01_02_im
314 #define c22_22_re c02_02_re
315 #define c30_22_re c10_02_re
316 #define c30_22_im c10_02_im
317 #define c31_22_re c11_02_re
318 #define c31_22_im c11_02_im
319 #define c32_22_re c12_02_re
320 #define c32_22_im c12_02_im
321 #define c20_30_re c00_10_re
322 #define c20_30_im c00_10_im
323 #define c21_30_re c01_10_re
324 #define c21_30_im c01_10_im
325 #define c22_30_re c02_10_re
326 #define c22_30_im c02_10_im
327 #define c30_30_re c10_10_re
328 #define c31_30_re c11_10_re
329 #define c31_30_im c11_10_im
330 #define c32_30_re c12_10_re
331 #define c32_30_im c12_10_im
332 #define c20_31_re c00_11_re
333 #define c20_31_im c00_11_im
334 #define c21_31_re c01_11_re
335 #define c21_31_im c01_11_im
336 #define c22_31_re c02_11_re
337 #define c22_31_im c02_11_im
338 #define c30_31_re c10_11_re
339 #define c30_31_im c10_11_im
340 #define c31_31_re c11_11_re
341 #define c32_31_re c12_11_re
342 #define c32_31_im c12_11_im
343 #define c20_32_re c00_12_re
344 #define c20_32_im c00_12_im
345 #define c21_32_re c01_12_re
346 #define c21_32_im c01_12_im
347 #define c22_32_re c02_12_re
348 #define c22_32_im c02_12_im
349 #define c30_32_re c10_12_re
350 #define c30_32_im c10_12_im
351 #define c31_32_re c11_12_re
352 #define c31_32_im c11_12_im
353 #define c32_32_re c12_12_re
354 
355 // output spinor
356 #define o00_re s[0*SHARED_STRIDE]
357 #define o00_im s[1*SHARED_STRIDE]
358 #define o01_re s[2*SHARED_STRIDE]
359 #define o01_im s[3*SHARED_STRIDE]
360 #define o02_re s[4*SHARED_STRIDE]
361 #define o02_im s[5*SHARED_STRIDE]
362 #define o10_re s[6*SHARED_STRIDE]
363 #define o10_im s[7*SHARED_STRIDE]
364 #define o11_re s[8*SHARED_STRIDE]
365 #define o11_im s[9*SHARED_STRIDE]
366 #define o12_re s[10*SHARED_STRIDE]
367 #define o12_im s[11*SHARED_STRIDE]
368 #define o20_re s[12*SHARED_STRIDE]
369 #define o20_im s[13*SHARED_STRIDE]
370 #define o21_re s[14*SHARED_STRIDE]
371 #define o21_im s[15*SHARED_STRIDE]
372 #define o22_re s[16*SHARED_STRIDE]
373 #define o22_im s[17*SHARED_STRIDE]
374 #define o30_re s[18*SHARED_STRIDE]
380 
381 #ifdef SPINOR_DOUBLE
382 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
383 #else
384 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
385 #endif
386 
387 extern __shared__ char s_data[];
388 
390  + (threadIdx.x % SHARED_STRIDE);
391 
392 #include "read_gauge.h"
393 #include "read_clover.h"
394 #include "io_spinor.h"
395 
396 int x1, x2, x3, x4;
397 int X;
398 
399 #if (DD_PREC==2) // half precision
400 int sp_norm_idx;
401 #endif // half precision
402 
403 int sid;
404 
405 int dim;
406 int face_num;
407 int face_idx;
408 int Y[4] = {X1,X2,X3,X4};
409 int faceVolume[4];
410 faceVolume[0] = (X2*X3*X4)>>1;
411 faceVolume[1] = (X1*X3*X4)>>1;
412 faceVolume[2] = (X1*X2*X4)>>1;
413 faceVolume[3] = (X1*X2*X3)>>1;
414 
415 
416 
417 
418 
419  sid = blockIdx.x*blockDim.x + threadIdx.x;
420  if (sid >= param.threads) return;
421 
422  dim = dimFromFaceIndex(sid, param); // sid is also modified
423 
424 
425  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1); // volume of one face
426  face_num = (sid >= face_volume); // is this thread updating face 0 or 1
427  face_idx = sid - face_num*face_volume; // index into the respective face
428 
429  // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
430  // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
431  //sp_idx = face_idx + param.ghostOffset[dim];
432 
433 
434  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity,Y);
435 
436  {
437  bool active = false;
438  for(int dir=0; dir<4; ++dir){
439  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
440  }
441  if(!active) return;
442  }
443 
444 
445 
446  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
447 
448  o00_re = i00_re; o00_im = i00_im;
449  o01_re = i01_re; o01_im = i01_im;
450  o02_re = i02_re; o02_im = i02_im;
451  o10_re = i10_re; o10_im = i10_im;
452  o11_re = i11_re; o11_im = i11_im;
453  o12_re = i12_re; o12_im = i12_im;
454  o20_re = i20_re; o20_im = i20_im;
455  o21_re = i21_re; o21_im = i21_im;
456  o22_re = i22_re; o22_im = i22_im;
457  o30_re = i30_re; o30_im = i30_im;
458  o31_re = i31_re; o31_im = i31_im;
459  o32_re = i32_re; o32_im = i32_im;
460 
461 
462 if ( isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
463 {
464  // Projector P0-
465  // 1 0 0 -i
466  // 0 1 -i 0
467  // 0 i 1 0
468  // i 0 0 1
469 
470  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
471  const int sp_idx = face_idx + param.ghostOffset[0];
472 #if (DD_PREC==2)
473  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
474 #endif
475 
476  const int ga_idx = sid;
477 
484 
485 
486  const int sp_stride_pad = ghostFace[0];
487 
488  // read half spinor from device memory
489  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
490 
491  a0_re = i00_re; a0_im = i00_im;
492  a1_re = i01_re; a1_im = i01_im;
493  a2_re = i02_re; a2_im = i02_im;
494  b0_re = i10_re; b0_im = i10_im;
495  b1_re = i11_re; b1_im = i11_im;
496  b2_re = i12_re; b2_im = i12_im;
497 
498 
499  // read gauge matrix from device memory
500  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
501 
502  // reconstruct gauge matrix
504 
505  // multiply row 0
506  spinorFloat A0_re = 0;
507  A0_re += g00_re * a0_re;
508  A0_re -= g00_im * a0_im;
509  A0_re += g01_re * a1_re;
510  A0_re -= g01_im * a1_im;
511  A0_re += g02_re * a2_re;
512  A0_re -= g02_im * a2_im;
513  spinorFloat A0_im = 0;
514  A0_im += g00_re * a0_im;
515  A0_im += g00_im * a0_re;
516  A0_im += g01_re * a1_im;
517  A0_im += g01_im * a1_re;
518  A0_im += g02_re * a2_im;
519  A0_im += g02_im * a2_re;
520  spinorFloat B0_re = 0;
521  B0_re += g00_re * b0_re;
522  B0_re -= g00_im * b0_im;
523  B0_re += g01_re * b1_re;
524  B0_re -= g01_im * b1_im;
525  B0_re += g02_re * b2_re;
526  B0_re -= g02_im * b2_im;
527  spinorFloat B0_im = 0;
528  B0_im += g00_re * b0_im;
529  B0_im += g00_im * b0_re;
530  B0_im += g01_re * b1_im;
531  B0_im += g01_im * b1_re;
532  B0_im += g02_re * b2_im;
533  B0_im += g02_im * b2_re;
534 
535  // multiply row 1
536  spinorFloat A1_re = 0;
537  A1_re += g10_re * a0_re;
538  A1_re -= g10_im * a0_im;
539  A1_re += g11_re * a1_re;
540  A1_re -= g11_im * a1_im;
541  A1_re += g12_re * a2_re;
542  A1_re -= g12_im * a2_im;
543  spinorFloat A1_im = 0;
544  A1_im += g10_re * a0_im;
545  A1_im += g10_im * a0_re;
546  A1_im += g11_re * a1_im;
547  A1_im += g11_im * a1_re;
548  A1_im += g12_re * a2_im;
549  A1_im += g12_im * a2_re;
550  spinorFloat B1_re = 0;
551  B1_re += g10_re * b0_re;
552  B1_re -= g10_im * b0_im;
553  B1_re += g11_re * b1_re;
554  B1_re -= g11_im * b1_im;
555  B1_re += g12_re * b2_re;
556  B1_re -= g12_im * b2_im;
557  spinorFloat B1_im = 0;
558  B1_im += g10_re * b0_im;
559  B1_im += g10_im * b0_re;
560  B1_im += g11_re * b1_im;
561  B1_im += g11_im * b1_re;
562  B1_im += g12_re * b2_im;
563  B1_im += g12_im * b2_re;
564 
565  // multiply row 2
566  spinorFloat A2_re = 0;
567  A2_re += g20_re * a0_re;
568  A2_re -= g20_im * a0_im;
569  A2_re += g21_re * a1_re;
570  A2_re -= g21_im * a1_im;
571  A2_re += g22_re * a2_re;
572  A2_re -= g22_im * a2_im;
573  spinorFloat A2_im = 0;
574  A2_im += g20_re * a0_im;
575  A2_im += g20_im * a0_re;
576  A2_im += g21_re * a1_im;
577  A2_im += g21_im * a1_re;
578  A2_im += g22_re * a2_im;
579  A2_im += g22_im * a2_re;
580  spinorFloat B2_re = 0;
581  B2_re += g20_re * b0_re;
582  B2_re -= g20_im * b0_im;
583  B2_re += g21_re * b1_re;
584  B2_re -= g21_im * b1_im;
585  B2_re += g22_re * b2_re;
586  B2_re -= g22_im * b2_im;
587  spinorFloat B2_im = 0;
588  B2_im += g20_re * b0_im;
589  B2_im += g20_im * b0_re;
590  B2_im += g21_re * b1_im;
591  B2_im += g21_im * b1_re;
592  B2_im += g22_re * b2_im;
593  B2_im += g22_im * b2_re;
594 
595  o00_re += A0_re;
596  o00_im += A0_im;
597  o10_re += B0_re;
598  o10_im += B0_im;
599  o20_re -= B0_im;
600  o20_im += B0_re;
601  o30_re -= A0_im;
602  o30_im += A0_re;
603 
604  o01_re += A1_re;
605  o01_im += A1_im;
606  o11_re += B1_re;
607  o11_im += B1_im;
608  o21_re -= B1_im;
609  o21_im += B1_re;
610  o31_re -= A1_im;
611  o31_im += A1_re;
612 
613  o02_re += A2_re;
614  o02_im += A2_im;
615  o12_re += B2_re;
616  o12_im += B2_im;
617  o22_re -= B2_im;
618  o22_im += B2_re;
619  o32_re -= A2_im;
620  o32_im += A2_re;
621 
622 }
623 
624 if ( isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
625 {
626  // Projector P0+
627  // 1 0 0 i
628  // 0 1 i 0
629  // 0 -i 1 0
630  // -i 0 0 1
631 
632  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
633  const int sp_idx = face_idx + param.ghostOffset[0];
634 #if (DD_PREC==2)
635  sp_norm_idx = face_idx + param.ghostNormOffset[0];
636 #endif
637 
638  const int ga_idx = Vh+face_idx;
639 
646 
647 
648  const int sp_stride_pad = ghostFace[0];
649 
650  // read half spinor from device memory
651  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
652 
653  a0_re = i00_re; a0_im = i00_im;
654  a1_re = i01_re; a1_im = i01_im;
655  a2_re = i02_re; a2_im = i02_im;
656  b0_re = i10_re; b0_im = i10_im;
657  b1_re = i11_re; b1_im = i11_im;
658  b2_re = i12_re; b2_im = i12_im;
659 
660 
661  // read gauge matrix from device memory
662  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
663 
664  // reconstruct gauge matrix
666 
667  // multiply row 0
668  spinorFloat A0_re = 0;
669  A0_re += gT00_re * a0_re;
670  A0_re -= gT00_im * a0_im;
671  A0_re += gT01_re * a1_re;
672  A0_re -= gT01_im * a1_im;
673  A0_re += gT02_re * a2_re;
674  A0_re -= gT02_im * a2_im;
675  spinorFloat A0_im = 0;
676  A0_im += gT00_re * a0_im;
677  A0_im += gT00_im * a0_re;
678  A0_im += gT01_re * a1_im;
679  A0_im += gT01_im * a1_re;
680  A0_im += gT02_re * a2_im;
681  A0_im += gT02_im * a2_re;
682  spinorFloat B0_re = 0;
683  B0_re += gT00_re * b0_re;
684  B0_re -= gT00_im * b0_im;
685  B0_re += gT01_re * b1_re;
686  B0_re -= gT01_im * b1_im;
687  B0_re += gT02_re * b2_re;
688  B0_re -= gT02_im * b2_im;
689  spinorFloat B0_im = 0;
690  B0_im += gT00_re * b0_im;
691  B0_im += gT00_im * b0_re;
692  B0_im += gT01_re * b1_im;
693  B0_im += gT01_im * b1_re;
694  B0_im += gT02_re * b2_im;
695  B0_im += gT02_im * b2_re;
696 
697  // multiply row 1
698  spinorFloat A1_re = 0;
699  A1_re += gT10_re * a0_re;
700  A1_re -= gT10_im * a0_im;
701  A1_re += gT11_re * a1_re;
702  A1_re -= gT11_im * a1_im;
703  A1_re += gT12_re * a2_re;
704  A1_re -= gT12_im * a2_im;
705  spinorFloat A1_im = 0;
706  A1_im += gT10_re * a0_im;
707  A1_im += gT10_im * a0_re;
708  A1_im += gT11_re * a1_im;
709  A1_im += gT11_im * a1_re;
710  A1_im += gT12_re * a2_im;
711  A1_im += gT12_im * a2_re;
712  spinorFloat B1_re = 0;
713  B1_re += gT10_re * b0_re;
714  B1_re -= gT10_im * b0_im;
715  B1_re += gT11_re * b1_re;
716  B1_re -= gT11_im * b1_im;
717  B1_re += gT12_re * b2_re;
718  B1_re -= gT12_im * b2_im;
719  spinorFloat B1_im = 0;
720  B1_im += gT10_re * b0_im;
721  B1_im += gT10_im * b0_re;
722  B1_im += gT11_re * b1_im;
723  B1_im += gT11_im * b1_re;
724  B1_im += gT12_re * b2_im;
725  B1_im += gT12_im * b2_re;
726 
727  // multiply row 2
728  spinorFloat A2_re = 0;
729  A2_re += gT20_re * a0_re;
730  A2_re -= gT20_im * a0_im;
731  A2_re += gT21_re * a1_re;
732  A2_re -= gT21_im * a1_im;
733  A2_re += gT22_re * a2_re;
734  A2_re -= gT22_im * a2_im;
735  spinorFloat A2_im = 0;
736  A2_im += gT20_re * a0_im;
737  A2_im += gT20_im * a0_re;
738  A2_im += gT21_re * a1_im;
739  A2_im += gT21_im * a1_re;
740  A2_im += gT22_re * a2_im;
741  A2_im += gT22_im * a2_re;
742  spinorFloat B2_re = 0;
743  B2_re += gT20_re * b0_re;
744  B2_re -= gT20_im * b0_im;
745  B2_re += gT21_re * b1_re;
746  B2_re -= gT21_im * b1_im;
747  B2_re += gT22_re * b2_re;
748  B2_re -= gT22_im * b2_im;
749  spinorFloat B2_im = 0;
750  B2_im += gT20_re * b0_im;
751  B2_im += gT20_im * b0_re;
752  B2_im += gT21_re * b1_im;
753  B2_im += gT21_im * b1_re;
754  B2_im += gT22_re * b2_im;
755  B2_im += gT22_im * b2_re;
756 
757  o00_re += A0_re;
758  o00_im += A0_im;
759  o10_re += B0_re;
760  o10_im += B0_im;
761  o20_re += B0_im;
762  o20_im -= B0_re;
763  o30_re += A0_im;
764  o30_im -= A0_re;
765 
766  o01_re += A1_re;
767  o01_im += A1_im;
768  o11_re += B1_re;
769  o11_im += B1_im;
770  o21_re += B1_im;
771  o21_im -= B1_re;
772  o31_re += A1_im;
773  o31_im -= A1_re;
774 
775  o02_re += A2_re;
776  o02_im += A2_im;
777  o12_re += B2_re;
778  o12_im += B2_im;
779  o22_re += B2_im;
780  o22_im -= B2_re;
781  o32_re += A2_im;
782  o32_im -= A2_re;
783 
784 }
785 
786 if ( isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
787 {
788  // Projector P1-
789  // 1 0 0 -1
790  // 0 1 1 0
791  // 0 1 1 0
792  // -1 0 0 1
793 
794  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
795  const int sp_idx = face_idx + param.ghostOffset[1];
796 #if (DD_PREC==2)
797  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
798 #endif
799 
800  const int ga_idx = sid;
801 
808 
809 
810  const int sp_stride_pad = ghostFace[1];
811 
812  // read half spinor from device memory
813  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
814 
815  a0_re = i00_re; a0_im = i00_im;
816  a1_re = i01_re; a1_im = i01_im;
817  a2_re = i02_re; a2_im = i02_im;
818  b0_re = i10_re; b0_im = i10_im;
819  b1_re = i11_re; b1_im = i11_im;
820  b2_re = i12_re; b2_im = i12_im;
821 
822 
823  // read gauge matrix from device memory
824  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
825 
826  // reconstruct gauge matrix
828 
829  // multiply row 0
830  spinorFloat A0_re = 0;
831  A0_re += g00_re * a0_re;
832  A0_re -= g00_im * a0_im;
833  A0_re += g01_re * a1_re;
834  A0_re -= g01_im * a1_im;
835  A0_re += g02_re * a2_re;
836  A0_re -= g02_im * a2_im;
837  spinorFloat A0_im = 0;
838  A0_im += g00_re * a0_im;
839  A0_im += g00_im * a0_re;
840  A0_im += g01_re * a1_im;
841  A0_im += g01_im * a1_re;
842  A0_im += g02_re * a2_im;
843  A0_im += g02_im * a2_re;
844  spinorFloat B0_re = 0;
845  B0_re += g00_re * b0_re;
846  B0_re -= g00_im * b0_im;
847  B0_re += g01_re * b1_re;
848  B0_re -= g01_im * b1_im;
849  B0_re += g02_re * b2_re;
850  B0_re -= g02_im * b2_im;
851  spinorFloat B0_im = 0;
852  B0_im += g00_re * b0_im;
853  B0_im += g00_im * b0_re;
854  B0_im += g01_re * b1_im;
855  B0_im += g01_im * b1_re;
856  B0_im += g02_re * b2_im;
857  B0_im += g02_im * b2_re;
858 
859  // multiply row 1
860  spinorFloat A1_re = 0;
861  A1_re += g10_re * a0_re;
862  A1_re -= g10_im * a0_im;
863  A1_re += g11_re * a1_re;
864  A1_re -= g11_im * a1_im;
865  A1_re += g12_re * a2_re;
866  A1_re -= g12_im * a2_im;
867  spinorFloat A1_im = 0;
868  A1_im += g10_re * a0_im;
869  A1_im += g10_im * a0_re;
870  A1_im += g11_re * a1_im;
871  A1_im += g11_im * a1_re;
872  A1_im += g12_re * a2_im;
873  A1_im += g12_im * a2_re;
874  spinorFloat B1_re = 0;
875  B1_re += g10_re * b0_re;
876  B1_re -= g10_im * b0_im;
877  B1_re += g11_re * b1_re;
878  B1_re -= g11_im * b1_im;
879  B1_re += g12_re * b2_re;
880  B1_re -= g12_im * b2_im;
881  spinorFloat B1_im = 0;
882  B1_im += g10_re * b0_im;
883  B1_im += g10_im * b0_re;
884  B1_im += g11_re * b1_im;
885  B1_im += g11_im * b1_re;
886  B1_im += g12_re * b2_im;
887  B1_im += g12_im * b2_re;
888 
889  // multiply row 2
890  spinorFloat A2_re = 0;
891  A2_re += g20_re * a0_re;
892  A2_re -= g20_im * a0_im;
893  A2_re += g21_re * a1_re;
894  A2_re -= g21_im * a1_im;
895  A2_re += g22_re * a2_re;
896  A2_re -= g22_im * a2_im;
897  spinorFloat A2_im = 0;
898  A2_im += g20_re * a0_im;
899  A2_im += g20_im * a0_re;
900  A2_im += g21_re * a1_im;
901  A2_im += g21_im * a1_re;
902  A2_im += g22_re * a2_im;
903  A2_im += g22_im * a2_re;
904  spinorFloat B2_re = 0;
905  B2_re += g20_re * b0_re;
906  B2_re -= g20_im * b0_im;
907  B2_re += g21_re * b1_re;
908  B2_re -= g21_im * b1_im;
909  B2_re += g22_re * b2_re;
910  B2_re -= g22_im * b2_im;
911  spinorFloat B2_im = 0;
912  B2_im += g20_re * b0_im;
913  B2_im += g20_im * b0_re;
914  B2_im += g21_re * b1_im;
915  B2_im += g21_im * b1_re;
916  B2_im += g22_re * b2_im;
917  B2_im += g22_im * b2_re;
918 
919  o00_re += A0_re;
920  o00_im += A0_im;
921  o10_re += B0_re;
922  o10_im += B0_im;
923  o20_re += B0_re;
924  o20_im += B0_im;
925  o30_re -= A0_re;
926  o30_im -= A0_im;
927 
928  o01_re += A1_re;
929  o01_im += A1_im;
930  o11_re += B1_re;
931  o11_im += B1_im;
932  o21_re += B1_re;
933  o21_im += B1_im;
934  o31_re -= A1_re;
935  o31_im -= A1_im;
936 
937  o02_re += A2_re;
938  o02_im += A2_im;
939  o12_re += B2_re;
940  o12_im += B2_im;
941  o22_re += B2_re;
942  o22_im += B2_im;
943  o32_re -= A2_re;
944  o32_im -= A2_im;
945 
946 }
947 
948 if ( isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
949 {
950  // Projector P1+
951  // 1 0 0 1
952  // 0 1 -1 0
953  // 0 -1 1 0
954  // 1 0 0 1
955 
956  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
957  const int sp_idx = face_idx + param.ghostOffset[1];
958 #if (DD_PREC==2)
959  sp_norm_idx = face_idx + param.ghostNormOffset[1];
960 #endif
961 
962  const int ga_idx = Vh+face_idx;
963 
970 
971 
972  const int sp_stride_pad = ghostFace[1];
973 
974  // read half spinor from device memory
975  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
976 
977  a0_re = i00_re; a0_im = i00_im;
978  a1_re = i01_re; a1_im = i01_im;
979  a2_re = i02_re; a2_im = i02_im;
980  b0_re = i10_re; b0_im = i10_im;
981  b1_re = i11_re; b1_im = i11_im;
982  b2_re = i12_re; b2_im = i12_im;
983 
984 
985  // read gauge matrix from device memory
986  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
987 
988  // reconstruct gauge matrix
990 
991  // multiply row 0
992  spinorFloat A0_re = 0;
993  A0_re += gT00_re * a0_re;
994  A0_re -= gT00_im * a0_im;
995  A0_re += gT01_re * a1_re;
996  A0_re -= gT01_im * a1_im;
997  A0_re += gT02_re * a2_re;
998  A0_re -= gT02_im * a2_im;
999  spinorFloat A0_im = 0;
1000  A0_im += gT00_re * a0_im;
1001  A0_im += gT00_im * a0_re;
1002  A0_im += gT01_re * a1_im;
1003  A0_im += gT01_im * a1_re;
1004  A0_im += gT02_re * a2_im;
1005  A0_im += gT02_im * a2_re;
1006  spinorFloat B0_re = 0;
1007  B0_re += gT00_re * b0_re;
1008  B0_re -= gT00_im * b0_im;
1009  B0_re += gT01_re * b1_re;
1010  B0_re -= gT01_im * b1_im;
1011  B0_re += gT02_re * b2_re;
1012  B0_re -= gT02_im * b2_im;
1013  spinorFloat B0_im = 0;
1014  B0_im += gT00_re * b0_im;
1015  B0_im += gT00_im * b0_re;
1016  B0_im += gT01_re * b1_im;
1017  B0_im += gT01_im * b1_re;
1018  B0_im += gT02_re * b2_im;
1019  B0_im += gT02_im * b2_re;
1020 
1021  // multiply row 1
1022  spinorFloat A1_re = 0;
1023  A1_re += gT10_re * a0_re;
1024  A1_re -= gT10_im * a0_im;
1025  A1_re += gT11_re * a1_re;
1026  A1_re -= gT11_im * a1_im;
1027  A1_re += gT12_re * a2_re;
1028  A1_re -= gT12_im * a2_im;
1029  spinorFloat A1_im = 0;
1030  A1_im += gT10_re * a0_im;
1031  A1_im += gT10_im * a0_re;
1032  A1_im += gT11_re * a1_im;
1033  A1_im += gT11_im * a1_re;
1034  A1_im += gT12_re * a2_im;
1035  A1_im += gT12_im * a2_re;
1036  spinorFloat B1_re = 0;
1037  B1_re += gT10_re * b0_re;
1038  B1_re -= gT10_im * b0_im;
1039  B1_re += gT11_re * b1_re;
1040  B1_re -= gT11_im * b1_im;
1041  B1_re += gT12_re * b2_re;
1042  B1_re -= gT12_im * b2_im;
1043  spinorFloat B1_im = 0;
1044  B1_im += gT10_re * b0_im;
1045  B1_im += gT10_im * b0_re;
1046  B1_im += gT11_re * b1_im;
1047  B1_im += gT11_im * b1_re;
1048  B1_im += gT12_re * b2_im;
1049  B1_im += gT12_im * b2_re;
1050 
1051  // multiply row 2
1052  spinorFloat A2_re = 0;
1053  A2_re += gT20_re * a0_re;
1054  A2_re -= gT20_im * a0_im;
1055  A2_re += gT21_re * a1_re;
1056  A2_re -= gT21_im * a1_im;
1057  A2_re += gT22_re * a2_re;
1058  A2_re -= gT22_im * a2_im;
1059  spinorFloat A2_im = 0;
1060  A2_im += gT20_re * a0_im;
1061  A2_im += gT20_im * a0_re;
1062  A2_im += gT21_re * a1_im;
1063  A2_im += gT21_im * a1_re;
1064  A2_im += gT22_re * a2_im;
1065  A2_im += gT22_im * a2_re;
1066  spinorFloat B2_re = 0;
1067  B2_re += gT20_re * b0_re;
1068  B2_re -= gT20_im * b0_im;
1069  B2_re += gT21_re * b1_re;
1070  B2_re -= gT21_im * b1_im;
1071  B2_re += gT22_re * b2_re;
1072  B2_re -= gT22_im * b2_im;
1073  spinorFloat B2_im = 0;
1074  B2_im += gT20_re * b0_im;
1075  B2_im += gT20_im * b0_re;
1076  B2_im += gT21_re * b1_im;
1077  B2_im += gT21_im * b1_re;
1078  B2_im += gT22_re * b2_im;
1079  B2_im += gT22_im * b2_re;
1080 
1081  o00_re += A0_re;
1082  o00_im += A0_im;
1083  o10_re += B0_re;
1084  o10_im += B0_im;
1085  o20_re -= B0_re;
1086  o20_im -= B0_im;
1087  o30_re += A0_re;
1088  o30_im += A0_im;
1089 
1090  o01_re += A1_re;
1091  o01_im += A1_im;
1092  o11_re += B1_re;
1093  o11_im += B1_im;
1094  o21_re -= B1_re;
1095  o21_im -= B1_im;
1096  o31_re += A1_re;
1097  o31_im += A1_im;
1098 
1099  o02_re += A2_re;
1100  o02_im += A2_im;
1101  o12_re += B2_re;
1102  o12_im += B2_im;
1103  o22_re -= B2_re;
1104  o22_im -= B2_im;
1105  o32_re += A2_re;
1106  o32_im += A2_im;
1107 
1108 }
1109 
1110 if ( isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
1111 {
1112  // Projector P2-
1113  // 1 0 -i 0
1114  // 0 1 0 i
1115  // i 0 1 0
1116  // 0 -i 0 1
1117 
1118  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1119  const int sp_idx = face_idx + param.ghostOffset[2];
1120 #if (DD_PREC==2)
1121  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
1122 #endif
1123 
1124  const int ga_idx = sid;
1125 
1132 
1133 
1134  const int sp_stride_pad = ghostFace[2];
1135 
1136  // read half spinor from device memory
1137  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1138 
1139  a0_re = i00_re; a0_im = i00_im;
1140  a1_re = i01_re; a1_im = i01_im;
1141  a2_re = i02_re; a2_im = i02_im;
1142  b0_re = i10_re; b0_im = i10_im;
1143  b1_re = i11_re; b1_im = i11_im;
1144  b2_re = i12_re; b2_im = i12_im;
1145 
1146 
1147  // read gauge matrix from device memory
1148  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1149 
1150  // reconstruct gauge matrix
1152 
1153  // multiply row 0
1154  spinorFloat A0_re = 0;
1155  A0_re += g00_re * a0_re;
1156  A0_re -= g00_im * a0_im;
1157  A0_re += g01_re * a1_re;
1158  A0_re -= g01_im * a1_im;
1159  A0_re += g02_re * a2_re;
1160  A0_re -= g02_im * a2_im;
1161  spinorFloat A0_im = 0;
1162  A0_im += g00_re * a0_im;
1163  A0_im += g00_im * a0_re;
1164  A0_im += g01_re * a1_im;
1165  A0_im += g01_im * a1_re;
1166  A0_im += g02_re * a2_im;
1167  A0_im += g02_im * a2_re;
1168  spinorFloat B0_re = 0;
1169  B0_re += g00_re * b0_re;
1170  B0_re -= g00_im * b0_im;
1171  B0_re += g01_re * b1_re;
1172  B0_re -= g01_im * b1_im;
1173  B0_re += g02_re * b2_re;
1174  B0_re -= g02_im * b2_im;
1175  spinorFloat B0_im = 0;
1176  B0_im += g00_re * b0_im;
1177  B0_im += g00_im * b0_re;
1178  B0_im += g01_re * b1_im;
1179  B0_im += g01_im * b1_re;
1180  B0_im += g02_re * b2_im;
1181  B0_im += g02_im * b2_re;
1182 
1183  // multiply row 1
1184  spinorFloat A1_re = 0;
1185  A1_re += g10_re * a0_re;
1186  A1_re -= g10_im * a0_im;
1187  A1_re += g11_re * a1_re;
1188  A1_re -= g11_im * a1_im;
1189  A1_re += g12_re * a2_re;
1190  A1_re -= g12_im * a2_im;
1191  spinorFloat A1_im = 0;
1192  A1_im += g10_re * a0_im;
1193  A1_im += g10_im * a0_re;
1194  A1_im += g11_re * a1_im;
1195  A1_im += g11_im * a1_re;
1196  A1_im += g12_re * a2_im;
1197  A1_im += g12_im * a2_re;
1198  spinorFloat B1_re = 0;
1199  B1_re += g10_re * b0_re;
1200  B1_re -= g10_im * b0_im;
1201  B1_re += g11_re * b1_re;
1202  B1_re -= g11_im * b1_im;
1203  B1_re += g12_re * b2_re;
1204  B1_re -= g12_im * b2_im;
1205  spinorFloat B1_im = 0;
1206  B1_im += g10_re * b0_im;
1207  B1_im += g10_im * b0_re;
1208  B1_im += g11_re * b1_im;
1209  B1_im += g11_im * b1_re;
1210  B1_im += g12_re * b2_im;
1211  B1_im += g12_im * b2_re;
1212 
1213  // multiply row 2
1214  spinorFloat A2_re = 0;
1215  A2_re += g20_re * a0_re;
1216  A2_re -= g20_im * a0_im;
1217  A2_re += g21_re * a1_re;
1218  A2_re -= g21_im * a1_im;
1219  A2_re += g22_re * a2_re;
1220  A2_re -= g22_im * a2_im;
1221  spinorFloat A2_im = 0;
1222  A2_im += g20_re * a0_im;
1223  A2_im += g20_im * a0_re;
1224  A2_im += g21_re * a1_im;
1225  A2_im += g21_im * a1_re;
1226  A2_im += g22_re * a2_im;
1227  A2_im += g22_im * a2_re;
1228  spinorFloat B2_re = 0;
1229  B2_re += g20_re * b0_re;
1230  B2_re -= g20_im * b0_im;
1231  B2_re += g21_re * b1_re;
1232  B2_re -= g21_im * b1_im;
1233  B2_re += g22_re * b2_re;
1234  B2_re -= g22_im * b2_im;
1235  spinorFloat B2_im = 0;
1236  B2_im += g20_re * b0_im;
1237  B2_im += g20_im * b0_re;
1238  B2_im += g21_re * b1_im;
1239  B2_im += g21_im * b1_re;
1240  B2_im += g22_re * b2_im;
1241  B2_im += g22_im * b2_re;
1242 
1243  o00_re += A0_re;
1244  o00_im += A0_im;
1245  o10_re += B0_re;
1246  o10_im += B0_im;
1247  o20_re -= A0_im;
1248  o20_im += A0_re;
1249  o30_re += B0_im;
1250  o30_im -= B0_re;
1251 
1252  o01_re += A1_re;
1253  o01_im += A1_im;
1254  o11_re += B1_re;
1255  o11_im += B1_im;
1256  o21_re -= A1_im;
1257  o21_im += A1_re;
1258  o31_re += B1_im;
1259  o31_im -= B1_re;
1260 
1261  o02_re += A2_re;
1262  o02_im += A2_im;
1263  o12_re += B2_re;
1264  o12_im += B2_im;
1265  o22_re -= A2_im;
1266  o22_im += A2_re;
1267  o32_re += B2_im;
1268  o32_im -= B2_re;
1269 
1270 }
1271 
1272 if ( isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1273 {
1274  // Projector P2+
1275  // 1 0 i 0
1276  // 0 1 0 -i
1277  // -i 0 1 0
1278  // 0 i 0 1
1279 
1280  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1281  const int sp_idx = face_idx + param.ghostOffset[2];
1282 #if (DD_PREC==2)
1283  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1284 #endif
1285 
1286  const int ga_idx = Vh+face_idx;
1287 
1294 
1295 
1296  const int sp_stride_pad = ghostFace[2];
1297 
1298  // read half spinor from device memory
1299  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1300 
1301  a0_re = i00_re; a0_im = i00_im;
1302  a1_re = i01_re; a1_im = i01_im;
1303  a2_re = i02_re; a2_im = i02_im;
1304  b0_re = i10_re; b0_im = i10_im;
1305  b1_re = i11_re; b1_im = i11_im;
1306  b2_re = i12_re; b2_im = i12_im;
1307 
1308 
1309  // read gauge matrix from device memory
1310  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1311 
1312  // reconstruct gauge matrix
1314 
1315  // multiply row 0
1316  spinorFloat A0_re = 0;
1317  A0_re += gT00_re * a0_re;
1318  A0_re -= gT00_im * a0_im;
1319  A0_re += gT01_re * a1_re;
1320  A0_re -= gT01_im * a1_im;
1321  A0_re += gT02_re * a2_re;
1322  A0_re -= gT02_im * a2_im;
1323  spinorFloat A0_im = 0;
1324  A0_im += gT00_re * a0_im;
1325  A0_im += gT00_im * a0_re;
1326  A0_im += gT01_re * a1_im;
1327  A0_im += gT01_im * a1_re;
1328  A0_im += gT02_re * a2_im;
1329  A0_im += gT02_im * a2_re;
1330  spinorFloat B0_re = 0;
1331  B0_re += gT00_re * b0_re;
1332  B0_re -= gT00_im * b0_im;
1333  B0_re += gT01_re * b1_re;
1334  B0_re -= gT01_im * b1_im;
1335  B0_re += gT02_re * b2_re;
1336  B0_re -= gT02_im * b2_im;
1337  spinorFloat B0_im = 0;
1338  B0_im += gT00_re * b0_im;
1339  B0_im += gT00_im * b0_re;
1340  B0_im += gT01_re * b1_im;
1341  B0_im += gT01_im * b1_re;
1342  B0_im += gT02_re * b2_im;
1343  B0_im += gT02_im * b2_re;
1344 
1345  // multiply row 1
1346  spinorFloat A1_re = 0;
1347  A1_re += gT10_re * a0_re;
1348  A1_re -= gT10_im * a0_im;
1349  A1_re += gT11_re * a1_re;
1350  A1_re -= gT11_im * a1_im;
1351  A1_re += gT12_re * a2_re;
1352  A1_re -= gT12_im * a2_im;
1353  spinorFloat A1_im = 0;
1354  A1_im += gT10_re * a0_im;
1355  A1_im += gT10_im * a0_re;
1356  A1_im += gT11_re * a1_im;
1357  A1_im += gT11_im * a1_re;
1358  A1_im += gT12_re * a2_im;
1359  A1_im += gT12_im * a2_re;
1360  spinorFloat B1_re = 0;
1361  B1_re += gT10_re * b0_re;
1362  B1_re -= gT10_im * b0_im;
1363  B1_re += gT11_re * b1_re;
1364  B1_re -= gT11_im * b1_im;
1365  B1_re += gT12_re * b2_re;
1366  B1_re -= gT12_im * b2_im;
1367  spinorFloat B1_im = 0;
1368  B1_im += gT10_re * b0_im;
1369  B1_im += gT10_im * b0_re;
1370  B1_im += gT11_re * b1_im;
1371  B1_im += gT11_im * b1_re;
1372  B1_im += gT12_re * b2_im;
1373  B1_im += gT12_im * b2_re;
1374 
1375  // multiply row 2
1376  spinorFloat A2_re = 0;
1377  A2_re += gT20_re * a0_re;
1378  A2_re -= gT20_im * a0_im;
1379  A2_re += gT21_re * a1_re;
1380  A2_re -= gT21_im * a1_im;
1381  A2_re += gT22_re * a2_re;
1382  A2_re -= gT22_im * a2_im;
1383  spinorFloat A2_im = 0;
1384  A2_im += gT20_re * a0_im;
1385  A2_im += gT20_im * a0_re;
1386  A2_im += gT21_re * a1_im;
1387  A2_im += gT21_im * a1_re;
1388  A2_im += gT22_re * a2_im;
1389  A2_im += gT22_im * a2_re;
1390  spinorFloat B2_re = 0;
1391  B2_re += gT20_re * b0_re;
1392  B2_re -= gT20_im * b0_im;
1393  B2_re += gT21_re * b1_re;
1394  B2_re -= gT21_im * b1_im;
1395  B2_re += gT22_re * b2_re;
1396  B2_re -= gT22_im * b2_im;
1397  spinorFloat B2_im = 0;
1398  B2_im += gT20_re * b0_im;
1399  B2_im += gT20_im * b0_re;
1400  B2_im += gT21_re * b1_im;
1401  B2_im += gT21_im * b1_re;
1402  B2_im += gT22_re * b2_im;
1403  B2_im += gT22_im * b2_re;
1404 
1405  o00_re += A0_re;
1406  o00_im += A0_im;
1407  o10_re += B0_re;
1408  o10_im += B0_im;
1409  o20_re += A0_im;
1410  o20_im -= A0_re;
1411  o30_re -= B0_im;
1412  o30_im += B0_re;
1413 
1414  o01_re += A1_re;
1415  o01_im += A1_im;
1416  o11_re += B1_re;
1417  o11_im += B1_im;
1418  o21_re += A1_im;
1419  o21_im -= A1_re;
1420  o31_re -= B1_im;
1421  o31_im += B1_re;
1422 
1423  o02_re += A2_re;
1424  o02_im += A2_im;
1425  o12_re += B2_re;
1426  o12_im += B2_im;
1427  o22_re += A2_im;
1428  o22_im -= A2_re;
1429  o32_re -= B2_im;
1430  o32_im += B2_re;
1431 
1432 }
1433 
1434 if ( isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1435 {
1436  // Projector P3-
1437  // 0 0 0 0
1438  // 0 0 0 0
1439  // 0 0 2 0
1440  // 0 0 0 2
1441 
1442  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1443  const int sp_idx = face_idx + param.ghostOffset[3];
1444 #if (DD_PREC==2)
1445  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1446 #endif
1447 
1448  const int ga_idx = sid;
1449 
1450  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1451  {
1458 
1459 
1460  const int sp_stride_pad = ghostFace[3];
1461  const int t_proj_scale = TPROJSCALE;
1462 
1463  // read half spinor from device memory
1464  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1465 
1466  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1467  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1468  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1469  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1470  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1471  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1472 
1473 
1474  // identity gauge matrix
1481 
1482  o20_re += A0_re;
1483  o20_im += A0_im;
1484  o30_re += B0_re;
1485  o30_im += B0_im;
1486 
1487  o21_re += A1_re;
1488  o21_im += A1_im;
1489  o31_re += B1_re;
1490  o31_im += B1_im;
1491 
1492  o22_re += A2_re;
1493  o22_im += A2_im;
1494  o32_re += B2_re;
1495  o32_im += B2_im;
1496 
1497  } else {
1504 
1505 
1506  const int sp_stride_pad = ghostFace[3];
1507  const int t_proj_scale = TPROJSCALE;
1508 
1509  // read half spinor from device memory
1510  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1511 
1512  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1513  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1514  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1515  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1516  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1517  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1518 
1519 
1520  // read gauge matrix from device memory
1521  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1522 
1523  // reconstruct gauge matrix
1525 
1526  // multiply row 0
1527  spinorFloat A0_re = 0;
1528  A0_re += g00_re * a0_re;
1529  A0_re -= g00_im * a0_im;
1530  A0_re += g01_re * a1_re;
1531  A0_re -= g01_im * a1_im;
1532  A0_re += g02_re * a2_re;
1533  A0_re -= g02_im * a2_im;
1534  spinorFloat A0_im = 0;
1535  A0_im += g00_re * a0_im;
1536  A0_im += g00_im * a0_re;
1537  A0_im += g01_re * a1_im;
1538  A0_im += g01_im * a1_re;
1539  A0_im += g02_re * a2_im;
1540  A0_im += g02_im * a2_re;
1541  spinorFloat B0_re = 0;
1542  B0_re += g00_re * b0_re;
1543  B0_re -= g00_im * b0_im;
1544  B0_re += g01_re * b1_re;
1545  B0_re -= g01_im * b1_im;
1546  B0_re += g02_re * b2_re;
1547  B0_re -= g02_im * b2_im;
1548  spinorFloat B0_im = 0;
1549  B0_im += g00_re * b0_im;
1550  B0_im += g00_im * b0_re;
1551  B0_im += g01_re * b1_im;
1552  B0_im += g01_im * b1_re;
1553  B0_im += g02_re * b2_im;
1554  B0_im += g02_im * b2_re;
1555 
1556  // multiply row 1
1557  spinorFloat A1_re = 0;
1558  A1_re += g10_re * a0_re;
1559  A1_re -= g10_im * a0_im;
1560  A1_re += g11_re * a1_re;
1561  A1_re -= g11_im * a1_im;
1562  A1_re += g12_re * a2_re;
1563  A1_re -= g12_im * a2_im;
1564  spinorFloat A1_im = 0;
1565  A1_im += g10_re * a0_im;
1566  A1_im += g10_im * a0_re;
1567  A1_im += g11_re * a1_im;
1568  A1_im += g11_im * a1_re;
1569  A1_im += g12_re * a2_im;
1570  A1_im += g12_im * a2_re;
1571  spinorFloat B1_re = 0;
1572  B1_re += g10_re * b0_re;
1573  B1_re -= g10_im * b0_im;
1574  B1_re += g11_re * b1_re;
1575  B1_re -= g11_im * b1_im;
1576  B1_re += g12_re * b2_re;
1577  B1_re -= g12_im * b2_im;
1578  spinorFloat B1_im = 0;
1579  B1_im += g10_re * b0_im;
1580  B1_im += g10_im * b0_re;
1581  B1_im += g11_re * b1_im;
1582  B1_im += g11_im * b1_re;
1583  B1_im += g12_re * b2_im;
1584  B1_im += g12_im * b2_re;
1585 
1586  // multiply row 2
1587  spinorFloat A2_re = 0;
1588  A2_re += g20_re * a0_re;
1589  A2_re -= g20_im * a0_im;
1590  A2_re += g21_re * a1_re;
1591  A2_re -= g21_im * a1_im;
1592  A2_re += g22_re * a2_re;
1593  A2_re -= g22_im * a2_im;
1594  spinorFloat A2_im = 0;
1595  A2_im += g20_re * a0_im;
1596  A2_im += g20_im * a0_re;
1597  A2_im += g21_re * a1_im;
1598  A2_im += g21_im * a1_re;
1599  A2_im += g22_re * a2_im;
1600  A2_im += g22_im * a2_re;
1601  spinorFloat B2_re = 0;
1602  B2_re += g20_re * b0_re;
1603  B2_re -= g20_im * b0_im;
1604  B2_re += g21_re * b1_re;
1605  B2_re -= g21_im * b1_im;
1606  B2_re += g22_re * b2_re;
1607  B2_re -= g22_im * b2_im;
1608  spinorFloat B2_im = 0;
1609  B2_im += g20_re * b0_im;
1610  B2_im += g20_im * b0_re;
1611  B2_im += g21_re * b1_im;
1612  B2_im += g21_im * b1_re;
1613  B2_im += g22_re * b2_im;
1614  B2_im += g22_im * b2_re;
1615 
1616  o20_re += A0_re;
1617  o20_im += A0_im;
1618  o30_re += B0_re;
1619  o30_im += B0_im;
1620 
1621  o21_re += A1_re;
1622  o21_im += A1_im;
1623  o31_re += B1_re;
1624  o31_im += B1_im;
1625 
1626  o22_re += A2_re;
1627  o22_im += A2_im;
1628  o32_re += B2_re;
1629  o32_im += B2_im;
1630 
1631  }
1632 }
1633 
1634 if ( isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1635 {
1636  // Projector P3+
1637  // 2 0 0 0
1638  // 0 2 0 0
1639  // 0 0 0 0
1640  // 0 0 0 0
1641 
1642  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1643  const int sp_idx = face_idx + param.ghostOffset[3];
1644 #if (DD_PREC==2)
1645  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1646 #endif
1647 
1648  const int ga_idx = Vh+face_idx;
1649 
1650  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1651  {
1658 
1659 
1660  const int sp_stride_pad = ghostFace[3];
1661  const int t_proj_scale = TPROJSCALE;
1662 
1663  // read half spinor from device memory
1664  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1665 
1666  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1667  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1668  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1669  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1670  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1671  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1672 
1673 
1674  // identity gauge matrix
1681 
1682  o00_re += A0_re;
1683  o00_im += A0_im;
1684  o10_re += B0_re;
1685  o10_im += B0_im;
1686 
1687  o01_re += A1_re;
1688  o01_im += A1_im;
1689  o11_re += B1_re;
1690  o11_im += B1_im;
1691 
1692  o02_re += A2_re;
1693  o02_im += A2_im;
1694  o12_re += B2_re;
1695  o12_im += B2_im;
1696 
1697  } else {
1704 
1705 
1706  const int sp_stride_pad = ghostFace[3];
1707  const int t_proj_scale = TPROJSCALE;
1708 
1709  // read half spinor from device memory
1710  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1711 
1712  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1713  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1714  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1715  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1716  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1717  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1718 
1719 
1720  // read gauge matrix from device memory
1721  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1722 
1723  // reconstruct gauge matrix
1725 
1726  // multiply row 0
1727  spinorFloat A0_re = 0;
1728  A0_re += gT00_re * a0_re;
1729  A0_re -= gT00_im * a0_im;
1730  A0_re += gT01_re * a1_re;
1731  A0_re -= gT01_im * a1_im;
1732  A0_re += gT02_re * a2_re;
1733  A0_re -= gT02_im * a2_im;
1734  spinorFloat A0_im = 0;
1735  A0_im += gT00_re * a0_im;
1736  A0_im += gT00_im * a0_re;
1737  A0_im += gT01_re * a1_im;
1738  A0_im += gT01_im * a1_re;
1739  A0_im += gT02_re * a2_im;
1740  A0_im += gT02_im * a2_re;
1741  spinorFloat B0_re = 0;
1742  B0_re += gT00_re * b0_re;
1743  B0_re -= gT00_im * b0_im;
1744  B0_re += gT01_re * b1_re;
1745  B0_re -= gT01_im * b1_im;
1746  B0_re += gT02_re * b2_re;
1747  B0_re -= gT02_im * b2_im;
1748  spinorFloat B0_im = 0;
1749  B0_im += gT00_re * b0_im;
1750  B0_im += gT00_im * b0_re;
1751  B0_im += gT01_re * b1_im;
1752  B0_im += gT01_im * b1_re;
1753  B0_im += gT02_re * b2_im;
1754  B0_im += gT02_im * b2_re;
1755 
1756  // multiply row 1
1757  spinorFloat A1_re = 0;
1758  A1_re += gT10_re * a0_re;
1759  A1_re -= gT10_im * a0_im;
1760  A1_re += gT11_re * a1_re;
1761  A1_re -= gT11_im * a1_im;
1762  A1_re += gT12_re * a2_re;
1763  A1_re -= gT12_im * a2_im;
1764  spinorFloat A1_im = 0;
1765  A1_im += gT10_re * a0_im;
1766  A1_im += gT10_im * a0_re;
1767  A1_im += gT11_re * a1_im;
1768  A1_im += gT11_im * a1_re;
1769  A1_im += gT12_re * a2_im;
1770  A1_im += gT12_im * a2_re;
1771  spinorFloat B1_re = 0;
1772  B1_re += gT10_re * b0_re;
1773  B1_re -= gT10_im * b0_im;
1774  B1_re += gT11_re * b1_re;
1775  B1_re -= gT11_im * b1_im;
1776  B1_re += gT12_re * b2_re;
1777  B1_re -= gT12_im * b2_im;
1778  spinorFloat B1_im = 0;
1779  B1_im += gT10_re * b0_im;
1780  B1_im += gT10_im * b0_re;
1781  B1_im += gT11_re * b1_im;
1782  B1_im += gT11_im * b1_re;
1783  B1_im += gT12_re * b2_im;
1784  B1_im += gT12_im * b2_re;
1785 
1786  // multiply row 2
1787  spinorFloat A2_re = 0;
1788  A2_re += gT20_re * a0_re;
1789  A2_re -= gT20_im * a0_im;
1790  A2_re += gT21_re * a1_re;
1791  A2_re -= gT21_im * a1_im;
1792  A2_re += gT22_re * a2_re;
1793  A2_re -= gT22_im * a2_im;
1794  spinorFloat A2_im = 0;
1795  A2_im += gT20_re * a0_im;
1796  A2_im += gT20_im * a0_re;
1797  A2_im += gT21_re * a1_im;
1798  A2_im += gT21_im * a1_re;
1799  A2_im += gT22_re * a2_im;
1800  A2_im += gT22_im * a2_re;
1801  spinorFloat B2_re = 0;
1802  B2_re += gT20_re * b0_re;
1803  B2_re -= gT20_im * b0_im;
1804  B2_re += gT21_re * b1_re;
1805  B2_re -= gT21_im * b1_im;
1806  B2_re += gT22_re * b2_re;
1807  B2_re -= gT22_im * b2_im;
1808  spinorFloat B2_im = 0;
1809  B2_im += gT20_re * b0_im;
1810  B2_im += gT20_im * b0_re;
1811  B2_im += gT21_re * b1_im;
1812  B2_im += gT21_im * b1_re;
1813  B2_im += gT22_re * b2_im;
1814  B2_im += gT22_im * b2_re;
1815 
1816  o00_re += A0_re;
1817  o00_im += A0_im;
1818  o10_re += B0_re;
1819  o10_im += B0_im;
1820 
1821  o01_re += A1_re;
1822  o01_im += A1_im;
1823  o11_re += B1_re;
1824  o11_im += B1_im;
1825 
1826  o02_re += A2_re;
1827  o02_im += A2_im;
1828  o12_re += B2_re;
1829  o12_im += B2_im;
1830 
1831  }
1832 }
1833 
1834 {
1835 #ifdef DSLASH_CLOVER
1836 
1837  // change to chiral basis
1838  {
1845  spinorFloat a30_re = o00_re - o20_re;
1846  spinorFloat a30_im = o00_im - o20_im;
1847 
1848  o00_re = a00_re; o00_im = a00_im;
1849  o10_re = a10_re; o10_im = a10_im;
1850  o20_re = a20_re; o20_im = a20_im;
1851  o30_re = a30_re; o30_im = a30_im;
1852  }
1853 
1854  {
1861  spinorFloat a31_re = o01_re - o21_re;
1862  spinorFloat a31_im = o01_im - o21_im;
1863 
1864  o01_re = a01_re; o01_im = a01_im;
1865  o11_re = a11_re; o11_im = a11_im;
1866  o21_re = a21_re; o21_im = a21_im;
1867  o31_re = a31_re; o31_im = a31_im;
1868  }
1869 
1870  {
1877  spinorFloat a32_re = o02_re - o22_re;
1878  spinorFloat a32_im = o02_im - o22_im;
1879 
1880  o02_re = a02_re; o02_im = a02_im;
1881  o12_re = a12_re; o12_im = a12_im;
1882  o22_re = a22_re; o22_im = a22_im;
1883  o32_re = a32_re; o32_im = a32_im;
1884  }
1885 
1886  // apply first chiral block
1887  {
1889 
1896 
1897  a00_re += c00_00_re * o00_re;
1898  a00_im += c00_00_re * o00_im;
1899  a00_re += c00_01_re * o01_re;
1900  a00_re -= c00_01_im * o01_im;
1901  a00_im += c00_01_re * o01_im;
1902  a00_im += c00_01_im * o01_re;
1903  a00_re += c00_02_re * o02_re;
1904  a00_re -= c00_02_im * o02_im;
1905  a00_im += c00_02_re * o02_im;
1906  a00_im += c00_02_im * o02_re;
1907  a00_re += c00_10_re * o10_re;
1908  a00_re -= c00_10_im * o10_im;
1909  a00_im += c00_10_re * o10_im;
1910  a00_im += c00_10_im * o10_re;
1911  a00_re += c00_11_re * o11_re;
1912  a00_re -= c00_11_im * o11_im;
1913  a00_im += c00_11_re * o11_im;
1914  a00_im += c00_11_im * o11_re;
1915  a00_re += c00_12_re * o12_re;
1916  a00_re -= c00_12_im * o12_im;
1917  a00_im += c00_12_re * o12_im;
1918  a00_im += c00_12_im * o12_re;
1919 
1920  a01_re += c01_00_re * o00_re;
1921  a01_re -= c01_00_im * o00_im;
1922  a01_im += c01_00_re * o00_im;
1923  a01_im += c01_00_im * o00_re;
1924  a01_re += c01_01_re * o01_re;
1925  a01_im += c01_01_re * o01_im;
1926  a01_re += c01_02_re * o02_re;
1927  a01_re -= c01_02_im * o02_im;
1928  a01_im += c01_02_re * o02_im;
1929  a01_im += c01_02_im * o02_re;
1930  a01_re += c01_10_re * o10_re;
1931  a01_re -= c01_10_im * o10_im;
1932  a01_im += c01_10_re * o10_im;
1933  a01_im += c01_10_im * o10_re;
1934  a01_re += c01_11_re * o11_re;
1935  a01_re -= c01_11_im * o11_im;
1936  a01_im += c01_11_re * o11_im;
1937  a01_im += c01_11_im * o11_re;
1938  a01_re += c01_12_re * o12_re;
1939  a01_re -= c01_12_im * o12_im;
1940  a01_im += c01_12_re * o12_im;
1941  a01_im += c01_12_im * o12_re;
1942 
1943  a02_re += c02_00_re * o00_re;
1944  a02_re -= c02_00_im * o00_im;
1945  a02_im += c02_00_re * o00_im;
1946  a02_im += c02_00_im * o00_re;
1947  a02_re += c02_01_re * o01_re;
1948  a02_re -= c02_01_im * o01_im;
1949  a02_im += c02_01_re * o01_im;
1950  a02_im += c02_01_im * o01_re;
1951  a02_re += c02_02_re * o02_re;
1952  a02_im += c02_02_re * o02_im;
1953  a02_re += c02_10_re * o10_re;
1954  a02_re -= c02_10_im * o10_im;
1955  a02_im += c02_10_re * o10_im;
1956  a02_im += c02_10_im * o10_re;
1957  a02_re += c02_11_re * o11_re;
1958  a02_re -= c02_11_im * o11_im;
1959  a02_im += c02_11_re * o11_im;
1960  a02_im += c02_11_im * o11_re;
1961  a02_re += c02_12_re * o12_re;
1962  a02_re -= c02_12_im * o12_im;
1963  a02_im += c02_12_re * o12_im;
1964  a02_im += c02_12_im * o12_re;
1965 
1966  a10_re += c10_00_re * o00_re;
1967  a10_re -= c10_00_im * o00_im;
1968  a10_im += c10_00_re * o00_im;
1969  a10_im += c10_00_im * o00_re;
1970  a10_re += c10_01_re * o01_re;
1971  a10_re -= c10_01_im * o01_im;
1972  a10_im += c10_01_re * o01_im;
1973  a10_im += c10_01_im * o01_re;
1974  a10_re += c10_02_re * o02_re;
1975  a10_re -= c10_02_im * o02_im;
1976  a10_im += c10_02_re * o02_im;
1977  a10_im += c10_02_im * o02_re;
1978  a10_re += c10_10_re * o10_re;
1979  a10_im += c10_10_re * o10_im;
1980  a10_re += c10_11_re * o11_re;
1981  a10_re -= c10_11_im * o11_im;
1982  a10_im += c10_11_re * o11_im;
1983  a10_im += c10_11_im * o11_re;
1984  a10_re += c10_12_re * o12_re;
1985  a10_re -= c10_12_im * o12_im;
1986  a10_im += c10_12_re * o12_im;
1987  a10_im += c10_12_im * o12_re;
1988 
1989  a11_re += c11_00_re * o00_re;
1990  a11_re -= c11_00_im * o00_im;
1991  a11_im += c11_00_re * o00_im;
1992  a11_im += c11_00_im * o00_re;
1993  a11_re += c11_01_re * o01_re;
1994  a11_re -= c11_01_im * o01_im;
1995  a11_im += c11_01_re * o01_im;
1996  a11_im += c11_01_im * o01_re;
1997  a11_re += c11_02_re * o02_re;
1998  a11_re -= c11_02_im * o02_im;
1999  a11_im += c11_02_re * o02_im;
2000  a11_im += c11_02_im * o02_re;
2001  a11_re += c11_10_re * o10_re;
2002  a11_re -= c11_10_im * o10_im;
2003  a11_im += c11_10_re * o10_im;
2004  a11_im += c11_10_im * o10_re;
2005  a11_re += c11_11_re * o11_re;
2006  a11_im += c11_11_re * o11_im;
2007  a11_re += c11_12_re * o12_re;
2008  a11_re -= c11_12_im * o12_im;
2009  a11_im += c11_12_re * o12_im;
2010  a11_im += c11_12_im * o12_re;
2011 
2012  a12_re += c12_00_re * o00_re;
2013  a12_re -= c12_00_im * o00_im;
2014  a12_im += c12_00_re * o00_im;
2015  a12_im += c12_00_im * o00_re;
2016  a12_re += c12_01_re * o01_re;
2017  a12_re -= c12_01_im * o01_im;
2018  a12_im += c12_01_re * o01_im;
2019  a12_im += c12_01_im * o01_re;
2020  a12_re += c12_02_re * o02_re;
2021  a12_re -= c12_02_im * o02_im;
2022  a12_im += c12_02_re * o02_im;
2023  a12_im += c12_02_im * o02_re;
2024  a12_re += c12_10_re * o10_re;
2025  a12_re -= c12_10_im * o10_im;
2026  a12_im += c12_10_re * o10_im;
2027  a12_im += c12_10_im * o10_re;
2028  a12_re += c12_11_re * o11_re;
2029  a12_re -= c12_11_im * o11_im;
2030  a12_im += c12_11_re * o11_im;
2031  a12_im += c12_11_im * o11_re;
2032  a12_re += c12_12_re * o12_re;
2033  a12_im += c12_12_re * o12_im;
2034 
2035  o00_re = a00_re; o00_im = a00_im;
2036  o01_re = a01_re; o01_im = a01_im;
2037  o02_re = a02_re; o02_im = a02_im;
2038  o10_re = a10_re; o10_im = a10_im;
2039  o11_re = a11_re; o11_im = a11_im;
2040  o12_re = a12_re; o12_im = a12_im;
2041 
2042  }
2043 
2044  // apply second chiral block
2045  {
2047 
2051  spinorFloat a30_re = 0; spinorFloat a30_im = 0;
2052  spinorFloat a31_re = 0; spinorFloat a31_im = 0;
2053  spinorFloat a32_re = 0; spinorFloat a32_im = 0;
2054 
2055  a20_re += c20_20_re * o20_re;
2056  a20_im += c20_20_re * o20_im;
2057  a20_re += c20_21_re * o21_re;
2058  a20_re -= c20_21_im * o21_im;
2059  a20_im += c20_21_re * o21_im;
2060  a20_im += c20_21_im * o21_re;
2061  a20_re += c20_22_re * o22_re;
2062  a20_re -= c20_22_im * o22_im;
2063  a20_im += c20_22_re * o22_im;
2064  a20_im += c20_22_im * o22_re;
2065  a20_re += c20_30_re * o30_re;
2066  a20_re -= c20_30_im * o30_im;
2067  a20_im += c20_30_re * o30_im;
2068  a20_im += c20_30_im * o30_re;
2069  a20_re += c20_31_re * o31_re;
2070  a20_re -= c20_31_im * o31_im;
2071  a20_im += c20_31_re * o31_im;
2072  a20_im += c20_31_im * o31_re;
2073  a20_re += c20_32_re * o32_re;
2074  a20_re -= c20_32_im * o32_im;
2075  a20_im += c20_32_re * o32_im;
2076  a20_im += c20_32_im * o32_re;
2077 
2078  a21_re += c21_20_re * o20_re;
2079  a21_re -= c21_20_im * o20_im;
2080  a21_im += c21_20_re * o20_im;
2081  a21_im += c21_20_im * o20_re;
2082  a21_re += c21_21_re * o21_re;
2083  a21_im += c21_21_re * o21_im;
2084  a21_re += c21_22_re * o22_re;
2085  a21_re -= c21_22_im * o22_im;
2086  a21_im += c21_22_re * o22_im;
2087  a21_im += c21_22_im * o22_re;
2088  a21_re += c21_30_re * o30_re;
2089  a21_re -= c21_30_im * o30_im;
2090  a21_im += c21_30_re * o30_im;
2091  a21_im += c21_30_im * o30_re;
2092  a21_re += c21_31_re * o31_re;
2093  a21_re -= c21_31_im * o31_im;
2094  a21_im += c21_31_re * o31_im;
2095  a21_im += c21_31_im * o31_re;
2096  a21_re += c21_32_re * o32_re;
2097  a21_re -= c21_32_im * o32_im;
2098  a21_im += c21_32_re * o32_im;
2099  a21_im += c21_32_im * o32_re;
2100 
2101  a22_re += c22_20_re * o20_re;
2102  a22_re -= c22_20_im * o20_im;
2103  a22_im += c22_20_re * o20_im;
2104  a22_im += c22_20_im * o20_re;
2105  a22_re += c22_21_re * o21_re;
2106  a22_re -= c22_21_im * o21_im;
2107  a22_im += c22_21_re * o21_im;
2108  a22_im += c22_21_im * o21_re;
2109  a22_re += c22_22_re * o22_re;
2110  a22_im += c22_22_re * o22_im;
2111  a22_re += c22_30_re * o30_re;
2112  a22_re -= c22_30_im * o30_im;
2113  a22_im += c22_30_re * o30_im;
2114  a22_im += c22_30_im * o30_re;
2115  a22_re += c22_31_re * o31_re;
2116  a22_re -= c22_31_im * o31_im;
2117  a22_im += c22_31_re * o31_im;
2118  a22_im += c22_31_im * o31_re;
2119  a22_re += c22_32_re * o32_re;
2120  a22_re -= c22_32_im * o32_im;
2121  a22_im += c22_32_re * o32_im;
2122  a22_im += c22_32_im * o32_re;
2123 
2124  a30_re += c30_20_re * o20_re;
2125  a30_re -= c30_20_im * o20_im;
2126  a30_im += c30_20_re * o20_im;
2127  a30_im += c30_20_im * o20_re;
2128  a30_re += c30_21_re * o21_re;
2129  a30_re -= c30_21_im * o21_im;
2130  a30_im += c30_21_re * o21_im;
2131  a30_im += c30_21_im * o21_re;
2132  a30_re += c30_22_re * o22_re;
2133  a30_re -= c30_22_im * o22_im;
2134  a30_im += c30_22_re * o22_im;
2135  a30_im += c30_22_im * o22_re;
2136  a30_re += c30_30_re * o30_re;
2137  a30_im += c30_30_re * o30_im;
2138  a30_re += c30_31_re * o31_re;
2139  a30_re -= c30_31_im * o31_im;
2140  a30_im += c30_31_re * o31_im;
2141  a30_im += c30_31_im * o31_re;
2142  a30_re += c30_32_re * o32_re;
2143  a30_re -= c30_32_im * o32_im;
2144  a30_im += c30_32_re * o32_im;
2145  a30_im += c30_32_im * o32_re;
2146 
2147  a31_re += c31_20_re * o20_re;
2148  a31_re -= c31_20_im * o20_im;
2149  a31_im += c31_20_re * o20_im;
2150  a31_im += c31_20_im * o20_re;
2151  a31_re += c31_21_re * o21_re;
2152  a31_re -= c31_21_im * o21_im;
2153  a31_im += c31_21_re * o21_im;
2154  a31_im += c31_21_im * o21_re;
2155  a31_re += c31_22_re * o22_re;
2156  a31_re -= c31_22_im * o22_im;
2157  a31_im += c31_22_re * o22_im;
2158  a31_im += c31_22_im * o22_re;
2159  a31_re += c31_30_re * o30_re;
2160  a31_re -= c31_30_im * o30_im;
2161  a31_im += c31_30_re * o30_im;
2162  a31_im += c31_30_im * o30_re;
2163  a31_re += c31_31_re * o31_re;
2164  a31_im += c31_31_re * o31_im;
2165  a31_re += c31_32_re * o32_re;
2166  a31_re -= c31_32_im * o32_im;
2167  a31_im += c31_32_re * o32_im;
2168  a31_im += c31_32_im * o32_re;
2169 
2170  a32_re += c32_20_re * o20_re;
2171  a32_re -= c32_20_im * o20_im;
2172  a32_im += c32_20_re * o20_im;
2173  a32_im += c32_20_im * o20_re;
2174  a32_re += c32_21_re * o21_re;
2175  a32_re -= c32_21_im * o21_im;
2176  a32_im += c32_21_re * o21_im;
2177  a32_im += c32_21_im * o21_re;
2178  a32_re += c32_22_re * o22_re;
2179  a32_re -= c32_22_im * o22_im;
2180  a32_im += c32_22_re * o22_im;
2181  a32_im += c32_22_im * o22_re;
2182  a32_re += c32_30_re * o30_re;
2183  a32_re -= c32_30_im * o30_im;
2184  a32_im += c32_30_re * o30_im;
2185  a32_im += c32_30_im * o30_re;
2186  a32_re += c32_31_re * o31_re;
2187  a32_re -= c32_31_im * o31_im;
2188  a32_im += c32_31_re * o31_im;
2189  a32_im += c32_31_im * o31_re;
2190  a32_re += c32_32_re * o32_re;
2191  a32_im += c32_32_re * o32_im;
2192 
2193  o20_re = a20_re; o20_im = a20_im;
2194  o21_re = a21_re; o21_im = a21_im;
2195  o22_re = a22_re; o22_im = a22_im;
2196  o30_re = a30_re; o30_im = a30_im;
2197  o31_re = a31_re; o31_im = a31_im;
2198  o32_re = a32_re; o32_im = a32_im;
2199 
2200  }
2201 
2202  // change back from chiral basis
2203  // (note: required factor of 1/2 is included in clover term normalization)
2204  {
2205  spinorFloat a00_re = o10_re + o30_re;
2206  spinorFloat a00_im = o10_im + o30_im;
2207  spinorFloat a10_re = -o00_re - o20_re;
2208  spinorFloat a10_im = -o00_im - o20_im;
2209  spinorFloat a20_re = o10_re - o30_re;
2210  spinorFloat a20_im = o10_im - o30_im;
2211  spinorFloat a30_re = -o00_re + o20_re;
2212  spinorFloat a30_im = -o00_im + o20_im;
2213 
2214  o00_re = a00_re; o00_im = a00_im;
2215  o10_re = a10_re; o10_im = a10_im;
2216  o20_re = a20_re; o20_im = a20_im;
2217  o30_re = a30_re; o30_im = a30_im;
2218  }
2219 
2220  {
2221  spinorFloat a01_re = o11_re + o31_re;
2222  spinorFloat a01_im = o11_im + o31_im;
2223  spinorFloat a11_re = -o01_re - o21_re;
2224  spinorFloat a11_im = -o01_im - o21_im;
2225  spinorFloat a21_re = o11_re - o31_re;
2226  spinorFloat a21_im = o11_im - o31_im;
2227  spinorFloat a31_re = -o01_re + o21_re;
2228  spinorFloat a31_im = -o01_im + o21_im;
2229 
2230  o01_re = a01_re; o01_im = a01_im;
2231  o11_re = a11_re; o11_im = a11_im;
2232  o21_re = a21_re; o21_im = a21_im;
2233  o31_re = a31_re; o31_im = a31_im;
2234  }
2235 
2236  {
2237  spinorFloat a02_re = o12_re + o32_re;
2238  spinorFloat a02_im = o12_im + o32_im;
2239  spinorFloat a12_re = -o02_re - o22_re;
2240  spinorFloat a12_im = -o02_im - o22_im;
2241  spinorFloat a22_re = o12_re - o32_re;
2242  spinorFloat a22_im = o12_im - o32_im;
2243  spinorFloat a32_re = -o02_re + o22_re;
2244  spinorFloat a32_im = -o02_im + o22_im;
2245 
2246  o02_re = a02_re; o02_im = a02_im;
2247  o12_re = a12_re; o12_im = a12_im;
2248  o22_re = a22_re; o22_im = a22_im;
2249  o32_re = a32_re; o32_im = a32_im;
2250  }
2251 
2252 #endif // DSLASH_CLOVER
2253 
2254 #ifdef DSLASH_XPAY
2255 
2256  READ_ACCUM(ACCUMTEX, param.sp_stride)
2257 
2258  o00_re = a*o00_re+acc00_re;
2259  o00_im = a*o00_im+acc00_im;
2260  o01_re = a*o01_re+acc01_re;
2261  o01_im = a*o01_im+acc01_im;
2262  o02_re = a*o02_re+acc02_re;
2263  o02_im = a*o02_im+acc02_im;
2264  o10_re = a*o10_re+acc10_re;
2265  o10_im = a*o10_im+acc10_im;
2266  o11_re = a*o11_re+acc11_re;
2267  o11_im = a*o11_im+acc11_im;
2268  o12_re = a*o12_re+acc12_re;
2269  o12_im = a*o12_im+acc12_im;
2270  o20_re = a*o20_re+acc20_re;
2271  o20_im = a*o20_im+acc20_im;
2272  o21_re = a*o21_re+acc21_re;
2273  o21_im = a*o21_im+acc21_im;
2274  o22_re = a*o22_re+acc22_re;
2275  o22_im = a*o22_im+acc22_im;
2276  o30_re = a*o30_re+acc30_re;
2277  o30_im = a*o30_im+acc30_im;
2278  o31_re = a*o31_re+acc31_re;
2279  o31_im = a*o31_im+acc31_im;
2280  o32_re = a*o32_re+acc32_re;
2281  o32_im = a*o32_im+acc32_im;
2282 #endif // DSLASH_XPAY
2283 }
2284 
2285 // write spinor field back to device memory
2286 WRITE_SPINOR(param.sp_stride);
2287 
2288 // undefine to prevent warning when precision is changed
2289 #undef spinorFloat
2290 #undef SHARED_STRIDE
2291 
2292 #undef g00_re
2293 #undef g00_im
2294 #undef g01_re
2295 #undef g01_im
2296 #undef g02_re
2297 #undef g02_im
2298 #undef g10_re
2299 #undef g10_im
2300 #undef g11_re
2301 #undef g11_im
2302 #undef g12_re
2303 #undef g12_im
2304 #undef g20_re
2305 #undef g20_im
2306 #undef g21_re
2307 #undef g21_im
2308 #undef g22_re
2309 #undef g22_im
2310 
2311 #undef i00_re
2312 #undef i00_im
2313 #undef i01_re
2314 #undef i01_im
2315 #undef i02_re
2316 #undef i02_im
2317 #undef i10_re
2318 #undef i10_im
2319 #undef i11_re
2320 #undef i11_im
2321 #undef i12_re
2322 #undef i12_im
2323 #undef i20_re
2324 #undef i20_im
2325 #undef i21_re
2326 #undef i21_im
2327 #undef i22_re
2328 #undef i22_im
2329 #undef i30_re
2330 #undef i30_im
2331 #undef i31_re
2332 #undef i31_im
2333 #undef i32_re
2334 #undef i32_im
2335 
2336 #undef acc00_re
2337 #undef acc00_im
2338 #undef acc01_re
2339 #undef acc01_im
2340 #undef acc02_re
2341 #undef acc02_im
2342 #undef acc10_re
2343 #undef acc10_im
2344 #undef acc11_re
2345 #undef acc11_im
2346 #undef acc12_re
2347 #undef acc12_im
2348 #undef acc20_re
2349 #undef acc20_im
2350 #undef acc21_re
2351 #undef acc21_im
2352 #undef acc22_re
2353 #undef acc22_im
2354 #undef acc30_re
2355 #undef acc30_im
2356 #undef acc31_re
2357 #undef acc31_im
2358 #undef acc32_re
2359 #undef acc32_im
2360 
2361 #undef c00_00_re
2362 #undef c01_01_re
2363 #undef c02_02_re
2364 #undef c10_10_re
2365 #undef c11_11_re
2366 #undef c12_12_re
2367 #undef c01_00_re
2368 #undef c01_00_im
2369 #undef c02_00_re
2370 #undef c02_00_im
2371 #undef c10_00_re
2372 #undef c10_00_im
2373 #undef c11_00_re
2374 #undef c11_00_im
2375 #undef c12_00_re
2376 #undef c12_00_im
2377 #undef c02_01_re
2378 #undef c02_01_im
2379 #undef c10_01_re
2380 #undef c10_01_im
2381 #undef c11_01_re
2382 #undef c11_01_im
2383 #undef c12_01_re
2384 #undef c12_01_im
2385 #undef c10_02_re
2386 #undef c10_02_im
2387 #undef c11_02_re
2388 #undef c11_02_im
2389 #undef c12_02_re
2390 #undef c12_02_im
2391 #undef c11_10_re
2392 #undef c11_10_im
2393 #undef c12_10_re
2394 #undef c12_10_im
2395 #undef c12_11_re
2396 #undef c12_11_im
2397 
2398 #undef o00_re
2399 #undef o00_im
2400 #undef o01_re
2401 #undef o01_im
2402 #undef o02_re
2403 #undef o02_im
2404 #undef o10_re
2405 #undef o10_im
2406 #undef o11_re
2407 #undef o11_im
2408 #undef o12_re
2409 #undef o12_im
2410 #undef o20_re
2411 #undef o20_im
2412 #undef o21_re
2413 #undef o21_im
2414 #undef o22_re
2415 #undef o22_im
2416 #undef o30_re
2417 
2418 #undef VOLATILE
2419 
2420 #endif // MULTI_GPU
__constant__ int Vh
#define a22_re
Definition: llfat_core.h:131
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
#define CLOVERTEX
Definition: clover_def.h:101
#define a02_im
Definition: llfat_core.h:120
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define a22_im
Definition: llfat_core.h:132
#define a01_re
Definition: llfat_core.h:117
#define a02_re
Definition: llfat_core.h:119
#define a20_re
Definition: llfat_core.h:127
#define o31_im
Definition: gamma5.h:293
#define a12_im
Definition: llfat_core.h:126
#define a20_im
Definition: llfat_core.h:128
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
__shared__ char s_data[]
#define a01_im
Definition: llfat_core.h:118
#define a12_re
Definition: llfat_core.h:125
#define GAUGE0TEX
Definition: covDev.h:112
#define a11_re
Definition: llfat_core.h:123
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
#define DSLASH_SHARED_FLOATS_PER_THREAD
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
#define a00_re
Definition: llfat_core.h:115
__constant__ int X1m1
__constant__ int X3
#define a11_im
Definition: llfat_core.h:124
#define a10_re
Definition: llfat_core.h:121
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
#define a10_im
Definition: llfat_core.h:122
__constant__ int X4m1
#define a21_re
Definition: llfat_core.h:129
#define WRITE_SPINOR
Definition: clover_def.h:48
VOLATILE spinorFloat * s
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
#define a21_im
Definition: llfat_core.h:130
#define READ_CLOVER
Definition: clover_def.h:103
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
#define a00_im
Definition: llfat_core.h:116