QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_fused_exterior_dslash_fermi_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 24
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_DOUBLE2
17 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_DOUBLE2
18 #define i00_re I0.x
19 #define i00_im I0.y
20 #define i01_re I1.x
21 #define i01_im I1.y
22 #define i02_re I2.x
23 #define i02_im I2.y
24 #define i10_re I3.x
25 #define i10_im I3.y
26 #define i11_re I4.x
27 #define i11_im I4.y
28 #define i12_re I5.x
29 #define i12_im I5.y
30 #define i20_re I6.x
31 #define i20_im I6.y
32 #define i21_re I7.x
33 #define i21_im I7.y
34 #define i22_re I8.x
35 #define i22_im I8.y
36 #define i30_re I9.x
37 #define i30_im I9.y
38 #define i31_re I10.x
39 #define i31_im I10.y
40 #define i32_re I11.x
41 #define i32_im I11.y
42 #define acc00_re accum0.x
43 #define acc00_im accum0.y
44 #define acc01_re accum1.x
45 #define acc01_im accum1.y
46 #define acc02_re accum2.x
47 #define acc02_im accum2.y
48 #define acc10_re accum3.x
49 #define acc10_im accum3.y
50 #define acc11_re accum4.x
51 #define acc11_im accum4.y
52 #define acc12_re accum5.x
53 #define acc12_im accum5.y
54 #define acc20_re accum6.x
55 #define acc20_im accum6.y
56 #define acc21_re accum7.x
57 #define acc21_im accum7.y
58 #define acc22_re accum8.x
59 #define acc22_im accum8.y
60 #define acc30_re accum9.x
61 #define acc30_im accum9.y
62 #define acc31_re accum10.x
63 #define acc31_im accum10.y
64 #define acc32_re accum11.x
65 #define acc32_im accum11.y
66 #else
67 #define spinorFloat float
68 #define WRITE_SPINOR_SHARED WRITE_SPINOR_SHARED_FLOAT4
69 #define READ_SPINOR_SHARED READ_SPINOR_SHARED_FLOAT4
70 #define i00_re I0.x
71 #define i00_im I0.y
72 #define i01_re I0.z
73 #define i01_im I0.w
74 #define i02_re I1.x
75 #define i02_im I1.y
76 #define i10_re I1.z
77 #define i10_im I1.w
78 #define i11_re I2.x
79 #define i11_im I2.y
80 #define i12_re I2.z
81 #define i12_im I2.w
82 #define i20_re I3.x
83 #define i20_im I3.y
84 #define i21_re I3.z
85 #define i21_im I3.w
86 #define i22_re I4.x
87 #define i22_im I4.y
88 #define i30_re I4.z
89 #define i30_im I4.w
90 #define i31_re I5.x
91 #define i31_im I5.y
92 #define i32_re I5.z
93 #define i32_im I5.w
94 #define acc00_re accum0.x
95 #define acc00_im accum0.y
96 #define acc01_re accum0.z
97 #define acc01_im accum0.w
98 #define acc02_re accum1.x
99 #define acc02_im accum1.y
100 #define acc10_re accum1.z
101 #define acc10_im accum1.w
102 #define acc11_re accum2.x
103 #define acc11_im accum2.y
104 #define acc12_re accum2.z
105 #define acc12_im accum2.w
106 #define acc20_re accum3.x
107 #define acc20_im accum3.y
108 #define acc21_re accum3.z
109 #define acc21_im accum3.w
110 #define acc22_re accum4.x
111 #define acc22_im accum4.y
112 #define acc30_re accum4.z
113 #define acc30_im accum4.w
114 #define acc31_re accum5.x
115 #define acc31_im accum5.y
116 #define acc32_re accum5.z
117 #define acc32_im accum5.w
118 #endif // SPINOR_DOUBLE
119 
120 // gauge link
121 #ifdef GAUGE_FLOAT2
122 #define g00_re G0.x
123 #define g00_im G0.y
124 #define g01_re G1.x
125 #define g01_im G1.y
126 #define g02_re G2.x
127 #define g02_im G2.y
128 #define g10_re G3.x
129 #define g10_im G3.y
130 #define g11_re G4.x
131 #define g11_im G4.y
132 #define g12_re G5.x
133 #define g12_im G5.y
134 #define g20_re G6.x
135 #define g20_im G6.y
136 #define g21_re G7.x
137 #define g21_im G7.y
138 #define g22_re G8.x
139 #define g22_im G8.y
140 
141 #else
142 #define g00_re G0.x
143 #define g00_im G0.y
144 #define g01_re G0.z
145 #define g01_im G0.w
146 #define g02_re G1.x
147 #define g02_im G1.y
148 #define g10_re G1.z
149 #define g10_im G1.w
150 #define g11_re G2.x
151 #define g11_im G2.y
152 #define g12_re G2.z
153 #define g12_im G2.w
154 #define g20_re G3.x
155 #define g20_im G3.y
156 #define g21_re G3.z
157 #define g21_im G3.w
158 #define g22_re G4.x
159 #define g22_im G4.y
160 
161 #endif // GAUGE_DOUBLE
162 
163 // conjugated gauge link
164 #define gT00_re (+g00_re)
165 #define gT00_im (-g00_im)
166 #define gT01_re (+g10_re)
167 #define gT01_im (-g10_im)
168 #define gT02_re (+g20_re)
169 #define gT02_im (-g20_im)
170 #define gT10_re (+g01_re)
171 #define gT10_im (-g01_im)
172 #define gT11_re (+g11_re)
173 #define gT11_im (-g11_im)
174 #define gT12_re (+g21_re)
175 #define gT12_im (-g21_im)
176 #define gT20_re (+g02_re)
177 #define gT20_im (-g02_im)
178 #define gT21_re (+g12_re)
179 #define gT21_im (-g12_im)
180 #define gT22_re (+g22_re)
181 #define gT22_im (-g22_im)
182 
183 // output spinor
208 
209 #ifdef SPINOR_DOUBLE
210 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
211 #else
212 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
213 #endif
214 
215 #include "read_gauge.h"
216 #include "io_spinor.h"
217 
218 int x1, x2, x3, x4;
219 int X;
220 
221 #if (DD_PREC==2) // half precision
222 int sp_norm_idx;
223 #endif // half precision
224 
225 int sid;
226 
227 int dim;
228 int face_idx;
229 int Y[4] = {X1,X2,X3,X4};
230 int faceVolume[4];
231 faceVolume[0] = (X2*X3*X4)>>1;
232 faceVolume[1] = (X1*X3*X4)>>1;
233 faceVolume[2] = (X1*X2*X4)>>1;
234 faceVolume[3] = (X1*X2*X3)>>1;
235 
236  sid = blockIdx.x*blockDim.x + threadIdx.x;
237  if (sid >= param.threads) return;
238 
239 
240  dim = dimFromFaceIndex(sid, param); // sid is also modified
241 
242  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
243  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
244  face_idx = sid - face_num*face_volume; // index into the respective face
245 
246 
247  const int dims[] = {X1, X2, X3, X4};
248  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
249 
250  bool active = false;
251  for(int dir=0; dir<4; ++dir){
252  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
253  }
254  if(!active) return;
255 
256 
257  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
258 
259  o00_re = i00_re; o00_im = i00_im;
260  o01_re = i01_re; o01_im = i01_im;
261  o02_re = i02_re; o02_im = i02_im;
262  o10_re = i10_re; o10_im = i10_im;
263  o11_re = i11_re; o11_im = i11_im;
264  o12_re = i12_re; o12_im = i12_im;
265  o20_re = i20_re; o20_im = i20_im;
266  o21_re = i21_re; o21_im = i21_im;
267  o22_re = i22_re; o22_im = i22_im;
268  o30_re = i30_re; o30_im = i30_im;
269  o31_re = i31_re; o31_im = i31_im;
270  o32_re = i32_re; o32_im = i32_im;
271 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
272 {
273  // Projector P0-
274  // 1 0 0 -i
275  // 0 1 -i 0
276  // 0 i 1 0
277  // i 0 0 1
278 
279  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
280  const int sp_idx = face_idx + param.ghostOffset[0];
281 #if (DD_PREC==2)
282  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
283 #endif
284 
285  const int ga_idx = sid;
286 
293 
294 
295  const int sp_stride_pad = ghostFace[0];
296 
297  // read half spinor from device memory
298  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
299 
300  a0_re = i00_re; a0_im = i00_im;
301  a1_re = i01_re; a1_im = i01_im;
302  a2_re = i02_re; a2_im = i02_im;
303  b0_re = i10_re; b0_im = i10_im;
304  b1_re = i11_re; b1_im = i11_im;
305  b2_re = i12_re; b2_im = i12_im;
306 
307  // read gauge matrix from device memory
308  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
309 
310  // reconstruct gauge matrix
312 
313  // multiply row 0
314  spinorFloat A0_re = 0;
315  A0_re += g00_re * a0_re;
316  A0_re -= g00_im * a0_im;
317  A0_re += g01_re * a1_re;
318  A0_re -= g01_im * a1_im;
319  A0_re += g02_re * a2_re;
320  A0_re -= g02_im * a2_im;
321  spinorFloat A0_im = 0;
322  A0_im += g00_re * a0_im;
323  A0_im += g00_im * a0_re;
324  A0_im += g01_re * a1_im;
325  A0_im += g01_im * a1_re;
326  A0_im += g02_re * a2_im;
327  A0_im += g02_im * a2_re;
328  spinorFloat B0_re = 0;
329  B0_re += g00_re * b0_re;
330  B0_re -= g00_im * b0_im;
331  B0_re += g01_re * b1_re;
332  B0_re -= g01_im * b1_im;
333  B0_re += g02_re * b2_re;
334  B0_re -= g02_im * b2_im;
335  spinorFloat B0_im = 0;
336  B0_im += g00_re * b0_im;
337  B0_im += g00_im * b0_re;
338  B0_im += g01_re * b1_im;
339  B0_im += g01_im * b1_re;
340  B0_im += g02_re * b2_im;
341  B0_im += g02_im * b2_re;
342 
343  // multiply row 1
344  spinorFloat A1_re = 0;
345  A1_re += g10_re * a0_re;
346  A1_re -= g10_im * a0_im;
347  A1_re += g11_re * a1_re;
348  A1_re -= g11_im * a1_im;
349  A1_re += g12_re * a2_re;
350  A1_re -= g12_im * a2_im;
351  spinorFloat A1_im = 0;
352  A1_im += g10_re * a0_im;
353  A1_im += g10_im * a0_re;
354  A1_im += g11_re * a1_im;
355  A1_im += g11_im * a1_re;
356  A1_im += g12_re * a2_im;
357  A1_im += g12_im * a2_re;
358  spinorFloat B1_re = 0;
359  B1_re += g10_re * b0_re;
360  B1_re -= g10_im * b0_im;
361  B1_re += g11_re * b1_re;
362  B1_re -= g11_im * b1_im;
363  B1_re += g12_re * b2_re;
364  B1_re -= g12_im * b2_im;
365  spinorFloat B1_im = 0;
366  B1_im += g10_re * b0_im;
367  B1_im += g10_im * b0_re;
368  B1_im += g11_re * b1_im;
369  B1_im += g11_im * b1_re;
370  B1_im += g12_re * b2_im;
371  B1_im += g12_im * b2_re;
372 
373  // multiply row 2
374  spinorFloat A2_re = 0;
375  A2_re += g20_re * a0_re;
376  A2_re -= g20_im * a0_im;
377  A2_re += g21_re * a1_re;
378  A2_re -= g21_im * a1_im;
379  A2_re += g22_re * a2_re;
380  A2_re -= g22_im * a2_im;
381  spinorFloat A2_im = 0;
382  A2_im += g20_re * a0_im;
383  A2_im += g20_im * a0_re;
384  A2_im += g21_re * a1_im;
385  A2_im += g21_im * a1_re;
386  A2_im += g22_re * a2_im;
387  A2_im += g22_im * a2_re;
388  spinorFloat B2_re = 0;
389  B2_re += g20_re * b0_re;
390  B2_re -= g20_im * b0_im;
391  B2_re += g21_re * b1_re;
392  B2_re -= g21_im * b1_im;
393  B2_re += g22_re * b2_re;
394  B2_re -= g22_im * b2_im;
395  spinorFloat B2_im = 0;
396  B2_im += g20_re * b0_im;
397  B2_im += g20_im * b0_re;
398  B2_im += g21_re * b1_im;
399  B2_im += g21_im * b1_re;
400  B2_im += g22_re * b2_im;
401  B2_im += g22_im * b2_re;
402 
403  o00_re += A0_re;
404  o00_im += A0_im;
405  o10_re += B0_re;
406  o10_im += B0_im;
407  o20_re -= B0_im;
408  o20_im += B0_re;
409  o30_re -= A0_im;
410  o30_im += A0_re;
411 
412  o01_re += A1_re;
413  o01_im += A1_im;
414  o11_re += B1_re;
415  o11_im += B1_im;
416  o21_re -= B1_im;
417  o21_im += B1_re;
418  o31_re -= A1_im;
419  o31_im += A1_re;
420 
421  o02_re += A2_re;
422  o02_im += A2_im;
423  o12_re += B2_re;
424  o12_im += B2_im;
425  o22_re -= B2_im;
426  o22_im += B2_re;
427  o32_re -= A2_im;
428  o32_im += A2_re;
429 
430 }
431 
432 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
433 {
434  // Projector P0+
435  // 1 0 0 i
436  // 0 1 i 0
437  // 0 -i 1 0
438  // -i 0 0 1
439 
440  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
441  const int sp_idx = face_idx + param.ghostOffset[0];
442 #if (DD_PREC==2)
443  sp_norm_idx = face_idx + param.ghostNormOffset[0];
444 #endif
445 
446  const int ga_idx = Vh+face_idx;
447 
454 
455 
456  const int sp_stride_pad = ghostFace[0];
457 
458  // read half spinor from device memory
459  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
460 
461  a0_re = i00_re; a0_im = i00_im;
462  a1_re = i01_re; a1_im = i01_im;
463  a2_re = i02_re; a2_im = i02_im;
464  b0_re = i10_re; b0_im = i10_im;
465  b1_re = i11_re; b1_im = i11_im;
466  b2_re = i12_re; b2_im = i12_im;
467 
468  // read gauge matrix from device memory
469  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
470 
471  // reconstruct gauge matrix
473 
474  // multiply row 0
475  spinorFloat A0_re = 0;
476  A0_re += gT00_re * a0_re;
477  A0_re -= gT00_im * a0_im;
478  A0_re += gT01_re * a1_re;
479  A0_re -= gT01_im * a1_im;
480  A0_re += gT02_re * a2_re;
481  A0_re -= gT02_im * a2_im;
482  spinorFloat A0_im = 0;
483  A0_im += gT00_re * a0_im;
484  A0_im += gT00_im * a0_re;
485  A0_im += gT01_re * a1_im;
486  A0_im += gT01_im * a1_re;
487  A0_im += gT02_re * a2_im;
488  A0_im += gT02_im * a2_re;
489  spinorFloat B0_re = 0;
490  B0_re += gT00_re * b0_re;
491  B0_re -= gT00_im * b0_im;
492  B0_re += gT01_re * b1_re;
493  B0_re -= gT01_im * b1_im;
494  B0_re += gT02_re * b2_re;
495  B0_re -= gT02_im * b2_im;
496  spinorFloat B0_im = 0;
497  B0_im += gT00_re * b0_im;
498  B0_im += gT00_im * b0_re;
499  B0_im += gT01_re * b1_im;
500  B0_im += gT01_im * b1_re;
501  B0_im += gT02_re * b2_im;
502  B0_im += gT02_im * b2_re;
503 
504  // multiply row 1
505  spinorFloat A1_re = 0;
506  A1_re += gT10_re * a0_re;
507  A1_re -= gT10_im * a0_im;
508  A1_re += gT11_re * a1_re;
509  A1_re -= gT11_im * a1_im;
510  A1_re += gT12_re * a2_re;
511  A1_re -= gT12_im * a2_im;
512  spinorFloat A1_im = 0;
513  A1_im += gT10_re * a0_im;
514  A1_im += gT10_im * a0_re;
515  A1_im += gT11_re * a1_im;
516  A1_im += gT11_im * a1_re;
517  A1_im += gT12_re * a2_im;
518  A1_im += gT12_im * a2_re;
519  spinorFloat B1_re = 0;
520  B1_re += gT10_re * b0_re;
521  B1_re -= gT10_im * b0_im;
522  B1_re += gT11_re * b1_re;
523  B1_re -= gT11_im * b1_im;
524  B1_re += gT12_re * b2_re;
525  B1_re -= gT12_im * b2_im;
526  spinorFloat B1_im = 0;
527  B1_im += gT10_re * b0_im;
528  B1_im += gT10_im * b0_re;
529  B1_im += gT11_re * b1_im;
530  B1_im += gT11_im * b1_re;
531  B1_im += gT12_re * b2_im;
532  B1_im += gT12_im * b2_re;
533 
534  // multiply row 2
535  spinorFloat A2_re = 0;
536  A2_re += gT20_re * a0_re;
537  A2_re -= gT20_im * a0_im;
538  A2_re += gT21_re * a1_re;
539  A2_re -= gT21_im * a1_im;
540  A2_re += gT22_re * a2_re;
541  A2_re -= gT22_im * a2_im;
542  spinorFloat A2_im = 0;
543  A2_im += gT20_re * a0_im;
544  A2_im += gT20_im * a0_re;
545  A2_im += gT21_re * a1_im;
546  A2_im += gT21_im * a1_re;
547  A2_im += gT22_re * a2_im;
548  A2_im += gT22_im * a2_re;
549  spinorFloat B2_re = 0;
550  B2_re += gT20_re * b0_re;
551  B2_re -= gT20_im * b0_im;
552  B2_re += gT21_re * b1_re;
553  B2_re -= gT21_im * b1_im;
554  B2_re += gT22_re * b2_re;
555  B2_re -= gT22_im * b2_im;
556  spinorFloat B2_im = 0;
557  B2_im += gT20_re * b0_im;
558  B2_im += gT20_im * b0_re;
559  B2_im += gT21_re * b1_im;
560  B2_im += gT21_im * b1_re;
561  B2_im += gT22_re * b2_im;
562  B2_im += gT22_im * b2_re;
563 
564  o00_re += A0_re;
565  o00_im += A0_im;
566  o10_re += B0_re;
567  o10_im += B0_im;
568  o20_re += B0_im;
569  o20_im -= B0_re;
570  o30_re += A0_im;
571  o30_im -= A0_re;
572 
573  o01_re += A1_re;
574  o01_im += A1_im;
575  o11_re += B1_re;
576  o11_im += B1_im;
577  o21_re += B1_im;
578  o21_im -= B1_re;
579  o31_re += A1_im;
580  o31_im -= A1_re;
581 
582  o02_re += A2_re;
583  o02_im += A2_im;
584  o12_re += B2_re;
585  o12_im += B2_im;
586  o22_re += B2_im;
587  o22_im -= B2_re;
588  o32_re += A2_im;
589  o32_im -= A2_re;
590 
591 }
592 
593 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
594 {
595  // Projector P1-
596  // 1 0 0 -1
597  // 0 1 1 0
598  // 0 1 1 0
599  // -1 0 0 1
600 
601  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
602  const int sp_idx = face_idx + param.ghostOffset[1];
603 #if (DD_PREC==2)
604  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
605 #endif
606 
607  const int ga_idx = sid;
608 
615 
616 
617  const int sp_stride_pad = ghostFace[1];
618 
619  // read half spinor from device memory
620  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
621 
622  a0_re = i00_re; a0_im = i00_im;
623  a1_re = i01_re; a1_im = i01_im;
624  a2_re = i02_re; a2_im = i02_im;
625  b0_re = i10_re; b0_im = i10_im;
626  b1_re = i11_re; b1_im = i11_im;
627  b2_re = i12_re; b2_im = i12_im;
628 
629  // read gauge matrix from device memory
630  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
631 
632  // reconstruct gauge matrix
634 
635  // multiply row 0
636  spinorFloat A0_re = 0;
637  A0_re += g00_re * a0_re;
638  A0_re -= g00_im * a0_im;
639  A0_re += g01_re * a1_re;
640  A0_re -= g01_im * a1_im;
641  A0_re += g02_re * a2_re;
642  A0_re -= g02_im * a2_im;
643  spinorFloat A0_im = 0;
644  A0_im += g00_re * a0_im;
645  A0_im += g00_im * a0_re;
646  A0_im += g01_re * a1_im;
647  A0_im += g01_im * a1_re;
648  A0_im += g02_re * a2_im;
649  A0_im += g02_im * a2_re;
650  spinorFloat B0_re = 0;
651  B0_re += g00_re * b0_re;
652  B0_re -= g00_im * b0_im;
653  B0_re += g01_re * b1_re;
654  B0_re -= g01_im * b1_im;
655  B0_re += g02_re * b2_re;
656  B0_re -= g02_im * b2_im;
657  spinorFloat B0_im = 0;
658  B0_im += g00_re * b0_im;
659  B0_im += g00_im * b0_re;
660  B0_im += g01_re * b1_im;
661  B0_im += g01_im * b1_re;
662  B0_im += g02_re * b2_im;
663  B0_im += g02_im * b2_re;
664 
665  // multiply row 1
666  spinorFloat A1_re = 0;
667  A1_re += g10_re * a0_re;
668  A1_re -= g10_im * a0_im;
669  A1_re += g11_re * a1_re;
670  A1_re -= g11_im * a1_im;
671  A1_re += g12_re * a2_re;
672  A1_re -= g12_im * a2_im;
673  spinorFloat A1_im = 0;
674  A1_im += g10_re * a0_im;
675  A1_im += g10_im * a0_re;
676  A1_im += g11_re * a1_im;
677  A1_im += g11_im * a1_re;
678  A1_im += g12_re * a2_im;
679  A1_im += g12_im * a2_re;
680  spinorFloat B1_re = 0;
681  B1_re += g10_re * b0_re;
682  B1_re -= g10_im * b0_im;
683  B1_re += g11_re * b1_re;
684  B1_re -= g11_im * b1_im;
685  B1_re += g12_re * b2_re;
686  B1_re -= g12_im * b2_im;
687  spinorFloat B1_im = 0;
688  B1_im += g10_re * b0_im;
689  B1_im += g10_im * b0_re;
690  B1_im += g11_re * b1_im;
691  B1_im += g11_im * b1_re;
692  B1_im += g12_re * b2_im;
693  B1_im += g12_im * b2_re;
694 
695  // multiply row 2
696  spinorFloat A2_re = 0;
697  A2_re += g20_re * a0_re;
698  A2_re -= g20_im * a0_im;
699  A2_re += g21_re * a1_re;
700  A2_re -= g21_im * a1_im;
701  A2_re += g22_re * a2_re;
702  A2_re -= g22_im * a2_im;
703  spinorFloat A2_im = 0;
704  A2_im += g20_re * a0_im;
705  A2_im += g20_im * a0_re;
706  A2_im += g21_re * a1_im;
707  A2_im += g21_im * a1_re;
708  A2_im += g22_re * a2_im;
709  A2_im += g22_im * a2_re;
710  spinorFloat B2_re = 0;
711  B2_re += g20_re * b0_re;
712  B2_re -= g20_im * b0_im;
713  B2_re += g21_re * b1_re;
714  B2_re -= g21_im * b1_im;
715  B2_re += g22_re * b2_re;
716  B2_re -= g22_im * b2_im;
717  spinorFloat B2_im = 0;
718  B2_im += g20_re * b0_im;
719  B2_im += g20_im * b0_re;
720  B2_im += g21_re * b1_im;
721  B2_im += g21_im * b1_re;
722  B2_im += g22_re * b2_im;
723  B2_im += g22_im * b2_re;
724 
725  o00_re += A0_re;
726  o00_im += A0_im;
727  o10_re += B0_re;
728  o10_im += B0_im;
729  o20_re += B0_re;
730  o20_im += B0_im;
731  o30_re -= A0_re;
732  o30_im -= A0_im;
733 
734  o01_re += A1_re;
735  o01_im += A1_im;
736  o11_re += B1_re;
737  o11_im += B1_im;
738  o21_re += B1_re;
739  o21_im += B1_im;
740  o31_re -= A1_re;
741  o31_im -= A1_im;
742 
743  o02_re += A2_re;
744  o02_im += A2_im;
745  o12_re += B2_re;
746  o12_im += B2_im;
747  o22_re += B2_re;
748  o22_im += B2_im;
749  o32_re -= A2_re;
750  o32_im -= A2_im;
751 
752 }
753 
754 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
755 {
756  // Projector P1+
757  // 1 0 0 1
758  // 0 1 -1 0
759  // 0 -1 1 0
760  // 1 0 0 1
761 
762  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
763  const int sp_idx = face_idx + param.ghostOffset[1];
764 #if (DD_PREC==2)
765  sp_norm_idx = face_idx + param.ghostNormOffset[1];
766 #endif
767 
768  const int ga_idx = Vh+face_idx;
769 
776 
777 
778  const int sp_stride_pad = ghostFace[1];
779 
780  // read half spinor from device memory
781  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
782 
783  a0_re = i00_re; a0_im = i00_im;
784  a1_re = i01_re; a1_im = i01_im;
785  a2_re = i02_re; a2_im = i02_im;
786  b0_re = i10_re; b0_im = i10_im;
787  b1_re = i11_re; b1_im = i11_im;
788  b2_re = i12_re; b2_im = i12_im;
789 
790  // read gauge matrix from device memory
791  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
792 
793  // reconstruct gauge matrix
795 
796  // multiply row 0
797  spinorFloat A0_re = 0;
798  A0_re += gT00_re * a0_re;
799  A0_re -= gT00_im * a0_im;
800  A0_re += gT01_re * a1_re;
801  A0_re -= gT01_im * a1_im;
802  A0_re += gT02_re * a2_re;
803  A0_re -= gT02_im * a2_im;
804  spinorFloat A0_im = 0;
805  A0_im += gT00_re * a0_im;
806  A0_im += gT00_im * a0_re;
807  A0_im += gT01_re * a1_im;
808  A0_im += gT01_im * a1_re;
809  A0_im += gT02_re * a2_im;
810  A0_im += gT02_im * a2_re;
811  spinorFloat B0_re = 0;
812  B0_re += gT00_re * b0_re;
813  B0_re -= gT00_im * b0_im;
814  B0_re += gT01_re * b1_re;
815  B0_re -= gT01_im * b1_im;
816  B0_re += gT02_re * b2_re;
817  B0_re -= gT02_im * b2_im;
818  spinorFloat B0_im = 0;
819  B0_im += gT00_re * b0_im;
820  B0_im += gT00_im * b0_re;
821  B0_im += gT01_re * b1_im;
822  B0_im += gT01_im * b1_re;
823  B0_im += gT02_re * b2_im;
824  B0_im += gT02_im * b2_re;
825 
826  // multiply row 1
827  spinorFloat A1_re = 0;
828  A1_re += gT10_re * a0_re;
829  A1_re -= gT10_im * a0_im;
830  A1_re += gT11_re * a1_re;
831  A1_re -= gT11_im * a1_im;
832  A1_re += gT12_re * a2_re;
833  A1_re -= gT12_im * a2_im;
834  spinorFloat A1_im = 0;
835  A1_im += gT10_re * a0_im;
836  A1_im += gT10_im * a0_re;
837  A1_im += gT11_re * a1_im;
838  A1_im += gT11_im * a1_re;
839  A1_im += gT12_re * a2_im;
840  A1_im += gT12_im * a2_re;
841  spinorFloat B1_re = 0;
842  B1_re += gT10_re * b0_re;
843  B1_re -= gT10_im * b0_im;
844  B1_re += gT11_re * b1_re;
845  B1_re -= gT11_im * b1_im;
846  B1_re += gT12_re * b2_re;
847  B1_re -= gT12_im * b2_im;
848  spinorFloat B1_im = 0;
849  B1_im += gT10_re * b0_im;
850  B1_im += gT10_im * b0_re;
851  B1_im += gT11_re * b1_im;
852  B1_im += gT11_im * b1_re;
853  B1_im += gT12_re * b2_im;
854  B1_im += gT12_im * b2_re;
855 
856  // multiply row 2
857  spinorFloat A2_re = 0;
858  A2_re += gT20_re * a0_re;
859  A2_re -= gT20_im * a0_im;
860  A2_re += gT21_re * a1_re;
861  A2_re -= gT21_im * a1_im;
862  A2_re += gT22_re * a2_re;
863  A2_re -= gT22_im * a2_im;
864  spinorFloat A2_im = 0;
865  A2_im += gT20_re * a0_im;
866  A2_im += gT20_im * a0_re;
867  A2_im += gT21_re * a1_im;
868  A2_im += gT21_im * a1_re;
869  A2_im += gT22_re * a2_im;
870  A2_im += gT22_im * a2_re;
871  spinorFloat B2_re = 0;
872  B2_re += gT20_re * b0_re;
873  B2_re -= gT20_im * b0_im;
874  B2_re += gT21_re * b1_re;
875  B2_re -= gT21_im * b1_im;
876  B2_re += gT22_re * b2_re;
877  B2_re -= gT22_im * b2_im;
878  spinorFloat B2_im = 0;
879  B2_im += gT20_re * b0_im;
880  B2_im += gT20_im * b0_re;
881  B2_im += gT21_re * b1_im;
882  B2_im += gT21_im * b1_re;
883  B2_im += gT22_re * b2_im;
884  B2_im += gT22_im * b2_re;
885 
886  o00_re += A0_re;
887  o00_im += A0_im;
888  o10_re += B0_re;
889  o10_im += B0_im;
890  o20_re -= B0_re;
891  o20_im -= B0_im;
892  o30_re += A0_re;
893  o30_im += A0_im;
894 
895  o01_re += A1_re;
896  o01_im += A1_im;
897  o11_re += B1_re;
898  o11_im += B1_im;
899  o21_re -= B1_re;
900  o21_im -= B1_im;
901  o31_re += A1_re;
902  o31_im += A1_im;
903 
904  o02_re += A2_re;
905  o02_im += A2_im;
906  o12_re += B2_re;
907  o12_im += B2_im;
908  o22_re -= B2_re;
909  o22_im -= B2_im;
910  o32_re += A2_re;
911  o32_im += A2_im;
912 
913 }
914 
915 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
916 {
917  // Projector P2-
918  // 1 0 -i 0
919  // 0 1 0 i
920  // i 0 1 0
921  // 0 -i 0 1
922 
923  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
924  const int sp_idx = face_idx + param.ghostOffset[2];
925 #if (DD_PREC==2)
926  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
927 #endif
928 
929  const int ga_idx = sid;
930 
937 
938 
939  const int sp_stride_pad = ghostFace[2];
940 
941  // read half spinor from device memory
942  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
943 
944  a0_re = i00_re; a0_im = i00_im;
945  a1_re = i01_re; a1_im = i01_im;
946  a2_re = i02_re; a2_im = i02_im;
947  b0_re = i10_re; b0_im = i10_im;
948  b1_re = i11_re; b1_im = i11_im;
949  b2_re = i12_re; b2_im = i12_im;
950 
951  // read gauge matrix from device memory
952  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
953 
954  // reconstruct gauge matrix
956 
957  // multiply row 0
958  spinorFloat A0_re = 0;
959  A0_re += g00_re * a0_re;
960  A0_re -= g00_im * a0_im;
961  A0_re += g01_re * a1_re;
962  A0_re -= g01_im * a1_im;
963  A0_re += g02_re * a2_re;
964  A0_re -= g02_im * a2_im;
965  spinorFloat A0_im = 0;
966  A0_im += g00_re * a0_im;
967  A0_im += g00_im * a0_re;
968  A0_im += g01_re * a1_im;
969  A0_im += g01_im * a1_re;
970  A0_im += g02_re * a2_im;
971  A0_im += g02_im * a2_re;
972  spinorFloat B0_re = 0;
973  B0_re += g00_re * b0_re;
974  B0_re -= g00_im * b0_im;
975  B0_re += g01_re * b1_re;
976  B0_re -= g01_im * b1_im;
977  B0_re += g02_re * b2_re;
978  B0_re -= g02_im * b2_im;
979  spinorFloat B0_im = 0;
980  B0_im += g00_re * b0_im;
981  B0_im += g00_im * b0_re;
982  B0_im += g01_re * b1_im;
983  B0_im += g01_im * b1_re;
984  B0_im += g02_re * b2_im;
985  B0_im += g02_im * b2_re;
986 
987  // multiply row 1
988  spinorFloat A1_re = 0;
989  A1_re += g10_re * a0_re;
990  A1_re -= g10_im * a0_im;
991  A1_re += g11_re * a1_re;
992  A1_re -= g11_im * a1_im;
993  A1_re += g12_re * a2_re;
994  A1_re -= g12_im * a2_im;
995  spinorFloat A1_im = 0;
996  A1_im += g10_re * a0_im;
997  A1_im += g10_im * a0_re;
998  A1_im += g11_re * a1_im;
999  A1_im += g11_im * a1_re;
1000  A1_im += g12_re * a2_im;
1001  A1_im += g12_im * a2_re;
1002  spinorFloat B1_re = 0;
1003  B1_re += g10_re * b0_re;
1004  B1_re -= g10_im * b0_im;
1005  B1_re += g11_re * b1_re;
1006  B1_re -= g11_im * b1_im;
1007  B1_re += g12_re * b2_re;
1008  B1_re -= g12_im * b2_im;
1009  spinorFloat B1_im = 0;
1010  B1_im += g10_re * b0_im;
1011  B1_im += g10_im * b0_re;
1012  B1_im += g11_re * b1_im;
1013  B1_im += g11_im * b1_re;
1014  B1_im += g12_re * b2_im;
1015  B1_im += g12_im * b2_re;
1016 
1017  // multiply row 2
1018  spinorFloat A2_re = 0;
1019  A2_re += g20_re * a0_re;
1020  A2_re -= g20_im * a0_im;
1021  A2_re += g21_re * a1_re;
1022  A2_re -= g21_im * a1_im;
1023  A2_re += g22_re * a2_re;
1024  A2_re -= g22_im * a2_im;
1025  spinorFloat A2_im = 0;
1026  A2_im += g20_re * a0_im;
1027  A2_im += g20_im * a0_re;
1028  A2_im += g21_re * a1_im;
1029  A2_im += g21_im * a1_re;
1030  A2_im += g22_re * a2_im;
1031  A2_im += g22_im * a2_re;
1032  spinorFloat B2_re = 0;
1033  B2_re += g20_re * b0_re;
1034  B2_re -= g20_im * b0_im;
1035  B2_re += g21_re * b1_re;
1036  B2_re -= g21_im * b1_im;
1037  B2_re += g22_re * b2_re;
1038  B2_re -= g22_im * b2_im;
1039  spinorFloat B2_im = 0;
1040  B2_im += g20_re * b0_im;
1041  B2_im += g20_im * b0_re;
1042  B2_im += g21_re * b1_im;
1043  B2_im += g21_im * b1_re;
1044  B2_im += g22_re * b2_im;
1045  B2_im += g22_im * b2_re;
1046 
1047  o00_re += A0_re;
1048  o00_im += A0_im;
1049  o10_re += B0_re;
1050  o10_im += B0_im;
1051  o20_re -= A0_im;
1052  o20_im += A0_re;
1053  o30_re += B0_im;
1054  o30_im -= B0_re;
1055 
1056  o01_re += A1_re;
1057  o01_im += A1_im;
1058  o11_re += B1_re;
1059  o11_im += B1_im;
1060  o21_re -= A1_im;
1061  o21_im += A1_re;
1062  o31_re += B1_im;
1063  o31_im -= B1_re;
1064 
1065  o02_re += A2_re;
1066  o02_im += A2_im;
1067  o12_re += B2_re;
1068  o12_im += B2_im;
1069  o22_re -= A2_im;
1070  o22_im += A2_re;
1071  o32_re += B2_im;
1072  o32_im -= B2_re;
1073 
1074 }
1075 
1076 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1077 {
1078  // Projector P2+
1079  // 1 0 i 0
1080  // 0 1 0 -i
1081  // -i 0 1 0
1082  // 0 i 0 1
1083 
1084  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1085  const int sp_idx = face_idx + param.ghostOffset[2];
1086 #if (DD_PREC==2)
1087  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1088 #endif
1089 
1090  const int ga_idx = Vh+face_idx;
1091 
1098 
1099 
1100  const int sp_stride_pad = ghostFace[2];
1101 
1102  // read half spinor from device memory
1103  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1104 
1105  a0_re = i00_re; a0_im = i00_im;
1106  a1_re = i01_re; a1_im = i01_im;
1107  a2_re = i02_re; a2_im = i02_im;
1108  b0_re = i10_re; b0_im = i10_im;
1109  b1_re = i11_re; b1_im = i11_im;
1110  b2_re = i12_re; b2_im = i12_im;
1111 
1112  // read gauge matrix from device memory
1113  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1114 
1115  // reconstruct gauge matrix
1117 
1118  // multiply row 0
1119  spinorFloat A0_re = 0;
1120  A0_re += gT00_re * a0_re;
1121  A0_re -= gT00_im * a0_im;
1122  A0_re += gT01_re * a1_re;
1123  A0_re -= gT01_im * a1_im;
1124  A0_re += gT02_re * a2_re;
1125  A0_re -= gT02_im * a2_im;
1126  spinorFloat A0_im = 0;
1127  A0_im += gT00_re * a0_im;
1128  A0_im += gT00_im * a0_re;
1129  A0_im += gT01_re * a1_im;
1130  A0_im += gT01_im * a1_re;
1131  A0_im += gT02_re * a2_im;
1132  A0_im += gT02_im * a2_re;
1133  spinorFloat B0_re = 0;
1134  B0_re += gT00_re * b0_re;
1135  B0_re -= gT00_im * b0_im;
1136  B0_re += gT01_re * b1_re;
1137  B0_re -= gT01_im * b1_im;
1138  B0_re += gT02_re * b2_re;
1139  B0_re -= gT02_im * b2_im;
1140  spinorFloat B0_im = 0;
1141  B0_im += gT00_re * b0_im;
1142  B0_im += gT00_im * b0_re;
1143  B0_im += gT01_re * b1_im;
1144  B0_im += gT01_im * b1_re;
1145  B0_im += gT02_re * b2_im;
1146  B0_im += gT02_im * b2_re;
1147 
1148  // multiply row 1
1149  spinorFloat A1_re = 0;
1150  A1_re += gT10_re * a0_re;
1151  A1_re -= gT10_im * a0_im;
1152  A1_re += gT11_re * a1_re;
1153  A1_re -= gT11_im * a1_im;
1154  A1_re += gT12_re * a2_re;
1155  A1_re -= gT12_im * a2_im;
1156  spinorFloat A1_im = 0;
1157  A1_im += gT10_re * a0_im;
1158  A1_im += gT10_im * a0_re;
1159  A1_im += gT11_re * a1_im;
1160  A1_im += gT11_im * a1_re;
1161  A1_im += gT12_re * a2_im;
1162  A1_im += gT12_im * a2_re;
1163  spinorFloat B1_re = 0;
1164  B1_re += gT10_re * b0_re;
1165  B1_re -= gT10_im * b0_im;
1166  B1_re += gT11_re * b1_re;
1167  B1_re -= gT11_im * b1_im;
1168  B1_re += gT12_re * b2_re;
1169  B1_re -= gT12_im * b2_im;
1170  spinorFloat B1_im = 0;
1171  B1_im += gT10_re * b0_im;
1172  B1_im += gT10_im * b0_re;
1173  B1_im += gT11_re * b1_im;
1174  B1_im += gT11_im * b1_re;
1175  B1_im += gT12_re * b2_im;
1176  B1_im += gT12_im * b2_re;
1177 
1178  // multiply row 2
1179  spinorFloat A2_re = 0;
1180  A2_re += gT20_re * a0_re;
1181  A2_re -= gT20_im * a0_im;
1182  A2_re += gT21_re * a1_re;
1183  A2_re -= gT21_im * a1_im;
1184  A2_re += gT22_re * a2_re;
1185  A2_re -= gT22_im * a2_im;
1186  spinorFloat A2_im = 0;
1187  A2_im += gT20_re * a0_im;
1188  A2_im += gT20_im * a0_re;
1189  A2_im += gT21_re * a1_im;
1190  A2_im += gT21_im * a1_re;
1191  A2_im += gT22_re * a2_im;
1192  A2_im += gT22_im * a2_re;
1193  spinorFloat B2_re = 0;
1194  B2_re += gT20_re * b0_re;
1195  B2_re -= gT20_im * b0_im;
1196  B2_re += gT21_re * b1_re;
1197  B2_re -= gT21_im * b1_im;
1198  B2_re += gT22_re * b2_re;
1199  B2_re -= gT22_im * b2_im;
1200  spinorFloat B2_im = 0;
1201  B2_im += gT20_re * b0_im;
1202  B2_im += gT20_im * b0_re;
1203  B2_im += gT21_re * b1_im;
1204  B2_im += gT21_im * b1_re;
1205  B2_im += gT22_re * b2_im;
1206  B2_im += gT22_im * b2_re;
1207 
1208  o00_re += A0_re;
1209  o00_im += A0_im;
1210  o10_re += B0_re;
1211  o10_im += B0_im;
1212  o20_re += A0_im;
1213  o20_im -= A0_re;
1214  o30_re -= B0_im;
1215  o30_im += B0_re;
1216 
1217  o01_re += A1_re;
1218  o01_im += A1_im;
1219  o11_re += B1_re;
1220  o11_im += B1_im;
1221  o21_re += A1_im;
1222  o21_im -= A1_re;
1223  o31_re -= B1_im;
1224  o31_im += B1_re;
1225 
1226  o02_re += A2_re;
1227  o02_im += A2_im;
1228  o12_re += B2_re;
1229  o12_im += B2_im;
1230  o22_re += A2_im;
1231  o22_im -= A2_re;
1232  o32_re -= B2_im;
1233  o32_im += B2_re;
1234 
1235 }
1236 
1237 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1238 {
1239  // Projector P3-
1240  // 0 0 0 0
1241  // 0 0 0 0
1242  // 0 0 2 0
1243  // 0 0 0 2
1244 
1245  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1246  const int sp_idx = face_idx + param.ghostOffset[3];
1247 #if (DD_PREC==2)
1248  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1249 #endif
1250 
1251  const int ga_idx = sid;
1252 
1253  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1254  {
1261 
1262 
1263  const int sp_stride_pad = ghostFace[3];
1264  //const int t_proj_scale = TPROJSCALE;
1265 
1266  // read half spinor from device memory
1267  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1268 
1269 #ifdef TWIST_INV_DSLASH
1270  a0_re = i00_re; a0_im = i00_im;
1271  a1_re = i01_re; a1_im = i01_im;
1272  a2_re = i02_re; a2_im = i02_im;
1273  b0_re = i10_re; b0_im = i10_im;
1274  b1_re = i11_re; b1_im = i11_im;
1275  b2_re = i12_re; b2_im = i12_im;
1276 #else
1277  a0_re = 2*i00_re; a0_im = 2*i00_im;
1278  a1_re = 2*i01_re; a1_im = 2*i01_im;
1279  a2_re = 2*i02_re; a2_im = 2*i02_im;
1280  b0_re = 2*i10_re; b0_im = 2*i10_im;
1281  b1_re = 2*i11_re; b1_im = 2*i11_im;
1282  b2_re = 2*i12_re; b2_im = 2*i12_im;
1283 #endif
1284 
1285  // identity gauge matrix
1292 
1293  o20_re += A0_re;
1294  o20_im += A0_im;
1295  o30_re += B0_re;
1296  o30_im += B0_im;
1297 
1298  o21_re += A1_re;
1299  o21_im += A1_im;
1300  o31_re += B1_re;
1301  o31_im += B1_im;
1302 
1303  o22_re += A2_re;
1304  o22_im += A2_im;
1305  o32_re += B2_re;
1306  o32_im += B2_im;
1307 
1308  } else {
1315 
1316 
1317  const int sp_stride_pad = ghostFace[3];
1318  //const int t_proj_scale = TPROJSCALE;
1319 
1320  // read half spinor from device memory
1321  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1322 
1323 #ifdef TWIST_INV_DSLASH
1324  a0_re = i00_re; a0_im = i00_im;
1325  a1_re = i01_re; a1_im = i01_im;
1326  a2_re = i02_re; a2_im = i02_im;
1327  b0_re = i10_re; b0_im = i10_im;
1328  b1_re = i11_re; b1_im = i11_im;
1329  b2_re = i12_re; b2_im = i12_im;
1330 #else
1331  a0_re = 2*i00_re; a0_im = 2*i00_im;
1332  a1_re = 2*i01_re; a1_im = 2*i01_im;
1333  a2_re = 2*i02_re; a2_im = 2*i02_im;
1334  b0_re = 2*i10_re; b0_im = 2*i10_im;
1335  b1_re = 2*i11_re; b1_im = 2*i11_im;
1336  b2_re = 2*i12_re; b2_im = 2*i12_im;
1337 #endif
1338 
1339  // read gauge matrix from device memory
1340  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1341 
1342  // reconstruct gauge matrix
1344 
1345  // multiply row 0
1346  spinorFloat A0_re = 0;
1347  A0_re += g00_re * a0_re;
1348  A0_re -= g00_im * a0_im;
1349  A0_re += g01_re * a1_re;
1350  A0_re -= g01_im * a1_im;
1351  A0_re += g02_re * a2_re;
1352  A0_re -= g02_im * a2_im;
1353  spinorFloat A0_im = 0;
1354  A0_im += g00_re * a0_im;
1355  A0_im += g00_im * a0_re;
1356  A0_im += g01_re * a1_im;
1357  A0_im += g01_im * a1_re;
1358  A0_im += g02_re * a2_im;
1359  A0_im += g02_im * a2_re;
1360  spinorFloat B0_re = 0;
1361  B0_re += g00_re * b0_re;
1362  B0_re -= g00_im * b0_im;
1363  B0_re += g01_re * b1_re;
1364  B0_re -= g01_im * b1_im;
1365  B0_re += g02_re * b2_re;
1366  B0_re -= g02_im * b2_im;
1367  spinorFloat B0_im = 0;
1368  B0_im += g00_re * b0_im;
1369  B0_im += g00_im * b0_re;
1370  B0_im += g01_re * b1_im;
1371  B0_im += g01_im * b1_re;
1372  B0_im += g02_re * b2_im;
1373  B0_im += g02_im * b2_re;
1374 
1375  // multiply row 1
1376  spinorFloat A1_re = 0;
1377  A1_re += g10_re * a0_re;
1378  A1_re -= g10_im * a0_im;
1379  A1_re += g11_re * a1_re;
1380  A1_re -= g11_im * a1_im;
1381  A1_re += g12_re * a2_re;
1382  A1_re -= g12_im * a2_im;
1383  spinorFloat A1_im = 0;
1384  A1_im += g10_re * a0_im;
1385  A1_im += g10_im * a0_re;
1386  A1_im += g11_re * a1_im;
1387  A1_im += g11_im * a1_re;
1388  A1_im += g12_re * a2_im;
1389  A1_im += g12_im * a2_re;
1390  spinorFloat B1_re = 0;
1391  B1_re += g10_re * b0_re;
1392  B1_re -= g10_im * b0_im;
1393  B1_re += g11_re * b1_re;
1394  B1_re -= g11_im * b1_im;
1395  B1_re += g12_re * b2_re;
1396  B1_re -= g12_im * b2_im;
1397  spinorFloat B1_im = 0;
1398  B1_im += g10_re * b0_im;
1399  B1_im += g10_im * b0_re;
1400  B1_im += g11_re * b1_im;
1401  B1_im += g11_im * b1_re;
1402  B1_im += g12_re * b2_im;
1403  B1_im += g12_im * b2_re;
1404 
1405  // multiply row 2
1406  spinorFloat A2_re = 0;
1407  A2_re += g20_re * a0_re;
1408  A2_re -= g20_im * a0_im;
1409  A2_re += g21_re * a1_re;
1410  A2_re -= g21_im * a1_im;
1411  A2_re += g22_re * a2_re;
1412  A2_re -= g22_im * a2_im;
1413  spinorFloat A2_im = 0;
1414  A2_im += g20_re * a0_im;
1415  A2_im += g20_im * a0_re;
1416  A2_im += g21_re * a1_im;
1417  A2_im += g21_im * a1_re;
1418  A2_im += g22_re * a2_im;
1419  A2_im += g22_im * a2_re;
1420  spinorFloat B2_re = 0;
1421  B2_re += g20_re * b0_re;
1422  B2_re -= g20_im * b0_im;
1423  B2_re += g21_re * b1_re;
1424  B2_re -= g21_im * b1_im;
1425  B2_re += g22_re * b2_re;
1426  B2_re -= g22_im * b2_im;
1427  spinorFloat B2_im = 0;
1428  B2_im += g20_re * b0_im;
1429  B2_im += g20_im * b0_re;
1430  B2_im += g21_re * b1_im;
1431  B2_im += g21_im * b1_re;
1432  B2_im += g22_re * b2_im;
1433  B2_im += g22_im * b2_re;
1434 
1435  o20_re += A0_re;
1436  o20_im += A0_im;
1437  o30_re += B0_re;
1438  o30_im += B0_im;
1439 
1440  o21_re += A1_re;
1441  o21_im += A1_im;
1442  o31_re += B1_re;
1443  o31_im += B1_im;
1444 
1445  o22_re += A2_re;
1446  o22_im += A2_im;
1447  o32_re += B2_re;
1448  o32_im += B2_im;
1449 
1450  }
1451 }
1452 
1453 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1454 {
1455  // Projector P3+
1456  // 2 0 0 0
1457  // 0 2 0 0
1458  // 0 0 0 0
1459  // 0 0 0 0
1460 
1461  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1462  const int sp_idx = face_idx + param.ghostOffset[3];
1463 #if (DD_PREC==2)
1464  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1465 #endif
1466 
1467  const int ga_idx = Vh+face_idx;
1468 
1469  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1470  {
1477 
1478 
1479  const int sp_stride_pad = ghostFace[3];
1480  //const int t_proj_scale = TPROJSCALE;
1481 
1482  // read half spinor from device memory
1483  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1484 
1485 #ifdef TWIST_INV_DSLASH
1486  a0_re = i00_re; a0_im = i00_im;
1487  a1_re = i01_re; a1_im = i01_im;
1488  a2_re = i02_re; a2_im = i02_im;
1489  b0_re = i10_re; b0_im = i10_im;
1490  b1_re = i11_re; b1_im = i11_im;
1491  b2_re = i12_re; b2_im = i12_im;
1492 #else
1493  a0_re = 2*i00_re; a0_im = 2*i00_im;
1494  a1_re = 2*i01_re; a1_im = 2*i01_im;
1495  a2_re = 2*i02_re; a2_im = 2*i02_im;
1496  b0_re = 2*i10_re; b0_im = 2*i10_im;
1497  b1_re = 2*i11_re; b1_im = 2*i11_im;
1498  b2_re = 2*i12_re; b2_im = 2*i12_im;
1499 #endif
1500 
1501  // identity gauge matrix
1508 
1509  o00_re += A0_re;
1510  o00_im += A0_im;
1511  o10_re += B0_re;
1512  o10_im += B0_im;
1513 
1514  o01_re += A1_re;
1515  o01_im += A1_im;
1516  o11_re += B1_re;
1517  o11_im += B1_im;
1518 
1519  o02_re += A2_re;
1520  o02_im += A2_im;
1521  o12_re += B2_re;
1522  o12_im += B2_im;
1523 
1524  } else {
1531 
1532 
1533  const int sp_stride_pad = ghostFace[3];
1534  //const int t_proj_scale = TPROJSCALE;
1535 
1536  // read half spinor from device memory
1537  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1538 
1539 #ifdef TWIST_INV_DSLASH
1540  a0_re = i00_re; a0_im = i00_im;
1541  a1_re = i01_re; a1_im = i01_im;
1542  a2_re = i02_re; a2_im = i02_im;
1543  b0_re = i10_re; b0_im = i10_im;
1544  b1_re = i11_re; b1_im = i11_im;
1545  b2_re = i12_re; b2_im = i12_im;
1546 #else
1547  a0_re = 2*i00_re; a0_im = 2*i00_im;
1548  a1_re = 2*i01_re; a1_im = 2*i01_im;
1549  a2_re = 2*i02_re; a2_im = 2*i02_im;
1550  b0_re = 2*i10_re; b0_im = 2*i10_im;
1551  b1_re = 2*i11_re; b1_im = 2*i11_im;
1552  b2_re = 2*i12_re; b2_im = 2*i12_im;
1553 #endif
1554 
1555  // read gauge matrix from device memory
1556  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1557 
1558  // reconstruct gauge matrix
1560 
1561  // multiply row 0
1562  spinorFloat A0_re = 0;
1563  A0_re += gT00_re * a0_re;
1564  A0_re -= gT00_im * a0_im;
1565  A0_re += gT01_re * a1_re;
1566  A0_re -= gT01_im * a1_im;
1567  A0_re += gT02_re * a2_re;
1568  A0_re -= gT02_im * a2_im;
1569  spinorFloat A0_im = 0;
1570  A0_im += gT00_re * a0_im;
1571  A0_im += gT00_im * a0_re;
1572  A0_im += gT01_re * a1_im;
1573  A0_im += gT01_im * a1_re;
1574  A0_im += gT02_re * a2_im;
1575  A0_im += gT02_im * a2_re;
1576  spinorFloat B0_re = 0;
1577  B0_re += gT00_re * b0_re;
1578  B0_re -= gT00_im * b0_im;
1579  B0_re += gT01_re * b1_re;
1580  B0_re -= gT01_im * b1_im;
1581  B0_re += gT02_re * b2_re;
1582  B0_re -= gT02_im * b2_im;
1583  spinorFloat B0_im = 0;
1584  B0_im += gT00_re * b0_im;
1585  B0_im += gT00_im * b0_re;
1586  B0_im += gT01_re * b1_im;
1587  B0_im += gT01_im * b1_re;
1588  B0_im += gT02_re * b2_im;
1589  B0_im += gT02_im * b2_re;
1590 
1591  // multiply row 1
1592  spinorFloat A1_re = 0;
1593  A1_re += gT10_re * a0_re;
1594  A1_re -= gT10_im * a0_im;
1595  A1_re += gT11_re * a1_re;
1596  A1_re -= gT11_im * a1_im;
1597  A1_re += gT12_re * a2_re;
1598  A1_re -= gT12_im * a2_im;
1599  spinorFloat A1_im = 0;
1600  A1_im += gT10_re * a0_im;
1601  A1_im += gT10_im * a0_re;
1602  A1_im += gT11_re * a1_im;
1603  A1_im += gT11_im * a1_re;
1604  A1_im += gT12_re * a2_im;
1605  A1_im += gT12_im * a2_re;
1606  spinorFloat B1_re = 0;
1607  B1_re += gT10_re * b0_re;
1608  B1_re -= gT10_im * b0_im;
1609  B1_re += gT11_re * b1_re;
1610  B1_re -= gT11_im * b1_im;
1611  B1_re += gT12_re * b2_re;
1612  B1_re -= gT12_im * b2_im;
1613  spinorFloat B1_im = 0;
1614  B1_im += gT10_re * b0_im;
1615  B1_im += gT10_im * b0_re;
1616  B1_im += gT11_re * b1_im;
1617  B1_im += gT11_im * b1_re;
1618  B1_im += gT12_re * b2_im;
1619  B1_im += gT12_im * b2_re;
1620 
1621  // multiply row 2
1622  spinorFloat A2_re = 0;
1623  A2_re += gT20_re * a0_re;
1624  A2_re -= gT20_im * a0_im;
1625  A2_re += gT21_re * a1_re;
1626  A2_re -= gT21_im * a1_im;
1627  A2_re += gT22_re * a2_re;
1628  A2_re -= gT22_im * a2_im;
1629  spinorFloat A2_im = 0;
1630  A2_im += gT20_re * a0_im;
1631  A2_im += gT20_im * a0_re;
1632  A2_im += gT21_re * a1_im;
1633  A2_im += gT21_im * a1_re;
1634  A2_im += gT22_re * a2_im;
1635  A2_im += gT22_im * a2_re;
1636  spinorFloat B2_re = 0;
1637  B2_re += gT20_re * b0_re;
1638  B2_re -= gT20_im * b0_im;
1639  B2_re += gT21_re * b1_re;
1640  B2_re -= gT21_im * b1_im;
1641  B2_re += gT22_re * b2_re;
1642  B2_re -= gT22_im * b2_im;
1643  spinorFloat B2_im = 0;
1644  B2_im += gT20_re * b0_im;
1645  B2_im += gT20_im * b0_re;
1646  B2_im += gT21_re * b1_im;
1647  B2_im += gT21_im * b1_re;
1648  B2_im += gT22_re * b2_im;
1649  B2_im += gT22_im * b2_re;
1650 
1651  o00_re += A0_re;
1652  o00_im += A0_im;
1653  o10_re += B0_re;
1654  o10_im += B0_im;
1655 
1656  o01_re += A1_re;
1657  o01_im += A1_im;
1658  o11_re += B1_re;
1659  o11_im += B1_im;
1660 
1661  o02_re += A2_re;
1662  o02_im += A2_im;
1663  o12_re += B2_re;
1664  o12_im += B2_im;
1665 
1666  }
1667 }
1668 
1669 {
1670 #ifdef DSLASH_XPAY
1671  READ_ACCUM(ACCUMTEX, param.sp_stride)
1672 
1673 #ifndef TWIST_XPAY
1674 #ifndef TWIST_INV_DSLASH
1675  //perform invert twist first:
1676  APPLY_TWIST_INV( a, b, o);
1677 #endif
1678  o00_re += acc00_re;
1679  o00_im += acc00_im;
1680  o01_re += acc01_re;
1681  o01_im += acc01_im;
1682  o02_re += acc02_re;
1683  o02_im += acc02_im;
1684  o10_re += acc10_re;
1685  o10_im += acc10_im;
1686  o11_re += acc11_re;
1687  o11_im += acc11_im;
1688  o12_re += acc12_re;
1689  o12_im += acc12_im;
1690  o20_re += acc20_re;
1691  o20_im += acc20_im;
1692  o21_re += acc21_re;
1693  o21_im += acc21_im;
1694  o22_re += acc22_re;
1695  o22_im += acc22_im;
1696  o30_re += acc30_re;
1697  o30_im += acc30_im;
1698  o31_re += acc31_re;
1699  o31_im += acc31_im;
1700  o32_re += acc32_re;
1701  o32_im += acc32_im;
1702 #else
1703  APPLY_TWIST( a, acc);
1704  //warning! b is unrelated to the twisted mass parameter in this case!
1705 
1706  o00_re = b*o00_re+acc00_re;
1707  o00_im = b*o00_im+acc00_im;
1708  o01_re = b*o01_re+acc01_re;
1709  o01_im = b*o01_im+acc01_im;
1710  o02_re = b*o02_re+acc02_re;
1711  o02_im = b*o02_im+acc02_im;
1712  o10_re = b*o10_re+acc10_re;
1713  o10_im = b*o10_im+acc10_im;
1714  o11_re = b*o11_re+acc11_re;
1715  o11_im = b*o11_im+acc11_im;
1716  o12_re = b*o12_re+acc12_re;
1717  o12_im = b*o12_im+acc12_im;
1718  o20_re = b*o20_re+acc20_re;
1719  o20_im = b*o20_im+acc20_im;
1720  o21_re = b*o21_re+acc21_re;
1721  o21_im = b*o21_im+acc21_im;
1722  o22_re = b*o22_re+acc22_re;
1723  o22_im = b*o22_im+acc22_im;
1724  o30_re = b*o30_re+acc30_re;
1725  o30_im = b*o30_im+acc30_im;
1726  o31_re = b*o31_re+acc31_re;
1727  o31_im = b*o31_im+acc31_im;
1728  o32_re = b*o32_re+acc32_re;
1729  o32_im = b*o32_im+acc32_im;
1730 #endif//TWIST_XPAY
1731 #else //no XPAY
1732 #ifndef TWIST_INV_DSLASH
1733  APPLY_TWIST_INV( a, b, o);
1734 #endif
1735 #endif
1736 }
1737 
1738 // write spinor field back to device memory
1739 WRITE_SPINOR(param.sp_stride);
1740 
1741 // undefine to prevent warning when precision is changed
1742 #undef spinorFloat
1743 #undef WRITE_SPINOR_SHARED
1744 #undef READ_SPINOR_SHARED
1745 #undef SHARED_STRIDE
1746 
1747 #undef g00_re
1748 #undef g00_im
1749 #undef g01_re
1750 #undef g01_im
1751 #undef g02_re
1752 #undef g02_im
1753 #undef g10_re
1754 #undef g10_im
1755 #undef g11_re
1756 #undef g11_im
1757 #undef g12_re
1758 #undef g12_im
1759 #undef g20_re
1760 #undef g20_im
1761 #undef g21_re
1762 #undef g21_im
1763 #undef g22_re
1764 #undef g22_im
1765 
1766 #undef i00_re
1767 #undef i00_im
1768 #undef i01_re
1769 #undef i01_im
1770 #undef i02_re
1771 #undef i02_im
1772 #undef i10_re
1773 #undef i10_im
1774 #undef i11_re
1775 #undef i11_im
1776 #undef i12_re
1777 #undef i12_im
1778 #undef i20_re
1779 #undef i20_im
1780 #undef i21_re
1781 #undef i21_im
1782 #undef i22_re
1783 #undef i22_im
1784 #undef i30_re
1785 #undef i30_im
1786 #undef i31_re
1787 #undef i31_im
1788 #undef i32_re
1789 #undef i32_im
1790 
1791 #undef acc00_re
1792 #undef acc00_im
1793 #undef acc01_re
1794 #undef acc01_im
1795 #undef acc02_re
1796 #undef acc02_im
1797 #undef acc10_re
1798 #undef acc10_im
1799 #undef acc11_re
1800 #undef acc11_im
1801 #undef acc12_re
1802 #undef acc12_im
1803 #undef acc20_re
1804 #undef acc20_im
1805 #undef acc21_re
1806 #undef acc21_im
1807 #undef acc22_re
1808 #undef acc22_im
1809 #undef acc30_re
1810 #undef acc30_im
1811 #undef acc31_re
1812 #undef acc31_im
1813 #undef acc32_re
1814 #undef acc32_im
1815 
1816 
1817 #undef o00_re
1818 #undef o00_im
1819 #undef o01_re
1820 #undef o01_im
1821 #undef o02_re
1822 #undef o02_im
1823 #undef o10_re
1824 #undef o10_im
1825 #undef o11_re
1826 #undef o11_im
1827 #undef o12_re
1828 #undef o12_im
1829 #undef o20_re
1830 #undef o20_im
1831 #undef o21_re
1832 #undef o21_im
1833 #undef o22_re
1834 #undef o22_im
1835 #undef o30_re
1836 #undef o30_im
1837 #undef o31_re
1838 #undef o31_im
1839 #undef o32_re
1840 #undef o32_im
1841 
1842 #undef VOLATILE
1843 
1844 #endif // MULTI_GPU
__constant__ int Vh
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define o31_im
Definition: gamma5.h:293
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1