QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_fused_exterior_dslash_gt200_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define i00_re I0.x
67 #define i00_im I0.y
68 #define i01_re I0.z
69 #define i01_im I0.w
70 #define i02_re I1.x
71 #define i02_im I1.y
72 #define i10_re I1.z
73 #define i10_im I1.w
74 #define i11_re I2.x
75 #define i11_im I2.y
76 #define i12_re I2.z
77 #define i12_im I2.w
78 #define i20_re I3.x
79 #define i20_im I3.y
80 #define i21_re I3.z
81 #define i21_im I3.w
82 #define i22_re I4.x
83 #define i22_im I4.y
84 #define i30_re I4.z
85 #define i30_im I4.w
86 #define i31_re I5.x
87 #define i31_im I5.y
88 #define i32_re I5.z
89 #define i32_im I5.w
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
115 
116 // gauge link
117 #ifdef GAUGE_FLOAT2
118 #define g00_re G0.x
119 #define g00_im G0.y
120 #define g01_re G1.x
121 #define g01_im G1.y
122 #define g02_re G2.x
123 #define g02_im G2.y
124 #define g10_re G3.x
125 #define g10_im G3.y
126 #define g11_re G4.x
127 #define g11_im G4.y
128 #define g12_re G5.x
129 #define g12_im G5.y
130 #define g20_re G6.x
131 #define g20_im G6.y
132 #define g21_re G7.x
133 #define g21_im G7.y
134 #define g22_re G8.x
135 #define g22_im G8.y
136 
137 #else
138 #define g00_re G0.x
139 #define g00_im G0.y
140 #define g01_re G0.z
141 #define g01_im G0.w
142 #define g02_re G1.x
143 #define g02_im G1.y
144 #define g10_re G1.z
145 #define g10_im G1.w
146 #define g11_re G2.x
147 #define g11_im G2.y
148 #define g12_re G2.z
149 #define g12_im G2.w
150 #define g20_re G3.x
151 #define g20_im G3.y
152 #define g21_re G3.z
153 #define g21_im G3.w
154 #define g22_re G4.x
155 #define g22_im G4.y
156 
157 #endif // GAUGE_DOUBLE
158 
159 // conjugated gauge link
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
178 
179 // output spinor
204 
205 #include "read_gauge.h"
206 #include "io_spinor.h"
207 
208 int x1, x2, x3, x4;
209 int X;
210 
211 #if (DD_PREC==2) // half precision
212 int sp_norm_idx;
213 #endif // half precision
214 
215 int sid;
216 
217 int dim;
218 int face_idx;
219 int Y[4] = {X1,X2,X3,X4};
220 int faceVolume[4];
221 faceVolume[0] = (X2*X3*X4)>>1;
222 faceVolume[1] = (X1*X3*X4)>>1;
223 faceVolume[2] = (X1*X2*X4)>>1;
224 faceVolume[3] = (X1*X2*X3)>>1;
225 
226  sid = blockIdx.x*blockDim.x + threadIdx.x;
227  if (sid >= param.threads) return;
228 
229 
230  dim = dimFromFaceIndex(sid, param); // sid is also modified
231 
232  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
233  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
234  face_idx = sid - face_num*face_volume; // index into the respective face
235 
236 
237  const int dims[] = {X1, X2, X3, X4};
238  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
239 
240  bool active = false;
241  for(int dir=0; dir<4; ++dir){
242  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
243  }
244  if(!active) return;
245 
246 
247  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
248 
249  o00_re = i00_re; o00_im = i00_im;
250  o01_re = i01_re; o01_im = i01_im;
251  o02_re = i02_re; o02_im = i02_im;
252  o10_re = i10_re; o10_im = i10_im;
253  o11_re = i11_re; o11_im = i11_im;
254  o12_re = i12_re; o12_im = i12_im;
255  o20_re = i20_re; o20_im = i20_im;
256  o21_re = i21_re; o21_im = i21_im;
257  o22_re = i22_re; o22_im = i22_im;
258  o30_re = i30_re; o30_im = i30_im;
259  o31_re = i31_re; o31_im = i31_im;
260  o32_re = i32_re; o32_im = i32_im;
261 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
262 {
263  // Projector P0-
264  // 1 0 0 -i
265  // 0 1 -i 0
266  // 0 i 1 0
267  // i 0 0 1
268 
269  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
270  const int sp_idx = face_idx + param.ghostOffset[0];
271 #if (DD_PREC==2)
272  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
273 #endif
274 
275  const int ga_idx = sid;
276 
283 
284 
285  const int sp_stride_pad = ghostFace[0];
286 
287  // read half spinor from device memory
288  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
289 
290  a0_re = i00_re; a0_im = i00_im;
291  a1_re = i01_re; a1_im = i01_im;
292  a2_re = i02_re; a2_im = i02_im;
293  b0_re = i10_re; b0_im = i10_im;
294  b1_re = i11_re; b1_im = i11_im;
295  b2_re = i12_re; b2_im = i12_im;
296 
297  // read gauge matrix from device memory
298  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
299 
300  // reconstruct gauge matrix
302 
303  // multiply row 0
304  spinorFloat A0_re = 0;
305  A0_re += g00_re * a0_re;
306  A0_re -= g00_im * a0_im;
307  A0_re += g01_re * a1_re;
308  A0_re -= g01_im * a1_im;
309  A0_re += g02_re * a2_re;
310  A0_re -= g02_im * a2_im;
311  spinorFloat A0_im = 0;
312  A0_im += g00_re * a0_im;
313  A0_im += g00_im * a0_re;
314  A0_im += g01_re * a1_im;
315  A0_im += g01_im * a1_re;
316  A0_im += g02_re * a2_im;
317  A0_im += g02_im * a2_re;
318  spinorFloat B0_re = 0;
319  B0_re += g00_re * b0_re;
320  B0_re -= g00_im * b0_im;
321  B0_re += g01_re * b1_re;
322  B0_re -= g01_im * b1_im;
323  B0_re += g02_re * b2_re;
324  B0_re -= g02_im * b2_im;
325  spinorFloat B0_im = 0;
326  B0_im += g00_re * b0_im;
327  B0_im += g00_im * b0_re;
328  B0_im += g01_re * b1_im;
329  B0_im += g01_im * b1_re;
330  B0_im += g02_re * b2_im;
331  B0_im += g02_im * b2_re;
332 
333  // multiply row 1
334  spinorFloat A1_re = 0;
335  A1_re += g10_re * a0_re;
336  A1_re -= g10_im * a0_im;
337  A1_re += g11_re * a1_re;
338  A1_re -= g11_im * a1_im;
339  A1_re += g12_re * a2_re;
340  A1_re -= g12_im * a2_im;
341  spinorFloat A1_im = 0;
342  A1_im += g10_re * a0_im;
343  A1_im += g10_im * a0_re;
344  A1_im += g11_re * a1_im;
345  A1_im += g11_im * a1_re;
346  A1_im += g12_re * a2_im;
347  A1_im += g12_im * a2_re;
348  spinorFloat B1_re = 0;
349  B1_re += g10_re * b0_re;
350  B1_re -= g10_im * b0_im;
351  B1_re += g11_re * b1_re;
352  B1_re -= g11_im * b1_im;
353  B1_re += g12_re * b2_re;
354  B1_re -= g12_im * b2_im;
355  spinorFloat B1_im = 0;
356  B1_im += g10_re * b0_im;
357  B1_im += g10_im * b0_re;
358  B1_im += g11_re * b1_im;
359  B1_im += g11_im * b1_re;
360  B1_im += g12_re * b2_im;
361  B1_im += g12_im * b2_re;
362 
363  // multiply row 2
364  spinorFloat A2_re = 0;
365  A2_re += g20_re * a0_re;
366  A2_re -= g20_im * a0_im;
367  A2_re += g21_re * a1_re;
368  A2_re -= g21_im * a1_im;
369  A2_re += g22_re * a2_re;
370  A2_re -= g22_im * a2_im;
371  spinorFloat A2_im = 0;
372  A2_im += g20_re * a0_im;
373  A2_im += g20_im * a0_re;
374  A2_im += g21_re * a1_im;
375  A2_im += g21_im * a1_re;
376  A2_im += g22_re * a2_im;
377  A2_im += g22_im * a2_re;
378  spinorFloat B2_re = 0;
379  B2_re += g20_re * b0_re;
380  B2_re -= g20_im * b0_im;
381  B2_re += g21_re * b1_re;
382  B2_re -= g21_im * b1_im;
383  B2_re += g22_re * b2_re;
384  B2_re -= g22_im * b2_im;
385  spinorFloat B2_im = 0;
386  B2_im += g20_re * b0_im;
387  B2_im += g20_im * b0_re;
388  B2_im += g21_re * b1_im;
389  B2_im += g21_im * b1_re;
390  B2_im += g22_re * b2_im;
391  B2_im += g22_im * b2_re;
392 
393  o00_re += A0_re;
394  o00_im += A0_im;
395  o10_re += B0_re;
396  o10_im += B0_im;
397  o20_re -= B0_im;
398  o20_im += B0_re;
399  o30_re -= A0_im;
400  o30_im += A0_re;
401 
402  o01_re += A1_re;
403  o01_im += A1_im;
404  o11_re += B1_re;
405  o11_im += B1_im;
406  o21_re -= B1_im;
407  o21_im += B1_re;
408  o31_re -= A1_im;
409  o31_im += A1_re;
410 
411  o02_re += A2_re;
412  o02_im += A2_im;
413  o12_re += B2_re;
414  o12_im += B2_im;
415  o22_re -= B2_im;
416  o22_im += B2_re;
417  o32_re -= A2_im;
418  o32_im += A2_re;
419 
420 }
421 
422 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
423 {
424  // Projector P0+
425  // 1 0 0 i
426  // 0 1 i 0
427  // 0 -i 1 0
428  // -i 0 0 1
429 
430  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
431  const int sp_idx = face_idx + param.ghostOffset[0];
432 #if (DD_PREC==2)
433  sp_norm_idx = face_idx + param.ghostNormOffset[0];
434 #endif
435 
436  const int ga_idx = Vh+face_idx;
437 
444 
445 
446  const int sp_stride_pad = ghostFace[0];
447 
448  // read half spinor from device memory
449  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
450 
451  a0_re = i00_re; a0_im = i00_im;
452  a1_re = i01_re; a1_im = i01_im;
453  a2_re = i02_re; a2_im = i02_im;
454  b0_re = i10_re; b0_im = i10_im;
455  b1_re = i11_re; b1_im = i11_im;
456  b2_re = i12_re; b2_im = i12_im;
457 
458  // read gauge matrix from device memory
459  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
460 
461  // reconstruct gauge matrix
463 
464  // multiply row 0
465  spinorFloat A0_re = 0;
466  A0_re += gT00_re * a0_re;
467  A0_re -= gT00_im * a0_im;
468  A0_re += gT01_re * a1_re;
469  A0_re -= gT01_im * a1_im;
470  A0_re += gT02_re * a2_re;
471  A0_re -= gT02_im * a2_im;
472  spinorFloat A0_im = 0;
473  A0_im += gT00_re * a0_im;
474  A0_im += gT00_im * a0_re;
475  A0_im += gT01_re * a1_im;
476  A0_im += gT01_im * a1_re;
477  A0_im += gT02_re * a2_im;
478  A0_im += gT02_im * a2_re;
479  spinorFloat B0_re = 0;
480  B0_re += gT00_re * b0_re;
481  B0_re -= gT00_im * b0_im;
482  B0_re += gT01_re * b1_re;
483  B0_re -= gT01_im * b1_im;
484  B0_re += gT02_re * b2_re;
485  B0_re -= gT02_im * b2_im;
486  spinorFloat B0_im = 0;
487  B0_im += gT00_re * b0_im;
488  B0_im += gT00_im * b0_re;
489  B0_im += gT01_re * b1_im;
490  B0_im += gT01_im * b1_re;
491  B0_im += gT02_re * b2_im;
492  B0_im += gT02_im * b2_re;
493 
494  // multiply row 1
495  spinorFloat A1_re = 0;
496  A1_re += gT10_re * a0_re;
497  A1_re -= gT10_im * a0_im;
498  A1_re += gT11_re * a1_re;
499  A1_re -= gT11_im * a1_im;
500  A1_re += gT12_re * a2_re;
501  A1_re -= gT12_im * a2_im;
502  spinorFloat A1_im = 0;
503  A1_im += gT10_re * a0_im;
504  A1_im += gT10_im * a0_re;
505  A1_im += gT11_re * a1_im;
506  A1_im += gT11_im * a1_re;
507  A1_im += gT12_re * a2_im;
508  A1_im += gT12_im * a2_re;
509  spinorFloat B1_re = 0;
510  B1_re += gT10_re * b0_re;
511  B1_re -= gT10_im * b0_im;
512  B1_re += gT11_re * b1_re;
513  B1_re -= gT11_im * b1_im;
514  B1_re += gT12_re * b2_re;
515  B1_re -= gT12_im * b2_im;
516  spinorFloat B1_im = 0;
517  B1_im += gT10_re * b0_im;
518  B1_im += gT10_im * b0_re;
519  B1_im += gT11_re * b1_im;
520  B1_im += gT11_im * b1_re;
521  B1_im += gT12_re * b2_im;
522  B1_im += gT12_im * b2_re;
523 
524  // multiply row 2
525  spinorFloat A2_re = 0;
526  A2_re += gT20_re * a0_re;
527  A2_re -= gT20_im * a0_im;
528  A2_re += gT21_re * a1_re;
529  A2_re -= gT21_im * a1_im;
530  A2_re += gT22_re * a2_re;
531  A2_re -= gT22_im * a2_im;
532  spinorFloat A2_im = 0;
533  A2_im += gT20_re * a0_im;
534  A2_im += gT20_im * a0_re;
535  A2_im += gT21_re * a1_im;
536  A2_im += gT21_im * a1_re;
537  A2_im += gT22_re * a2_im;
538  A2_im += gT22_im * a2_re;
539  spinorFloat B2_re = 0;
540  B2_re += gT20_re * b0_re;
541  B2_re -= gT20_im * b0_im;
542  B2_re += gT21_re * b1_re;
543  B2_re -= gT21_im * b1_im;
544  B2_re += gT22_re * b2_re;
545  B2_re -= gT22_im * b2_im;
546  spinorFloat B2_im = 0;
547  B2_im += gT20_re * b0_im;
548  B2_im += gT20_im * b0_re;
549  B2_im += gT21_re * b1_im;
550  B2_im += gT21_im * b1_re;
551  B2_im += gT22_re * b2_im;
552  B2_im += gT22_im * b2_re;
553 
554  o00_re += A0_re;
555  o00_im += A0_im;
556  o10_re += B0_re;
557  o10_im += B0_im;
558  o20_re += B0_im;
559  o20_im -= B0_re;
560  o30_re += A0_im;
561  o30_im -= A0_re;
562 
563  o01_re += A1_re;
564  o01_im += A1_im;
565  o11_re += B1_re;
566  o11_im += B1_im;
567  o21_re += B1_im;
568  o21_im -= B1_re;
569  o31_re += A1_im;
570  o31_im -= A1_re;
571 
572  o02_re += A2_re;
573  o02_im += A2_im;
574  o12_re += B2_re;
575  o12_im += B2_im;
576  o22_re += B2_im;
577  o22_im -= B2_re;
578  o32_re += A2_im;
579  o32_im -= A2_re;
580 
581 }
582 
583 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
584 {
585  // Projector P1-
586  // 1 0 0 -1
587  // 0 1 1 0
588  // 0 1 1 0
589  // -1 0 0 1
590 
591  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
592  const int sp_idx = face_idx + param.ghostOffset[1];
593 #if (DD_PREC==2)
594  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
595 #endif
596 
597  const int ga_idx = sid;
598 
605 
606 
607  const int sp_stride_pad = ghostFace[1];
608 
609  // read half spinor from device memory
610  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
611 
612  a0_re = i00_re; a0_im = i00_im;
613  a1_re = i01_re; a1_im = i01_im;
614  a2_re = i02_re; a2_im = i02_im;
615  b0_re = i10_re; b0_im = i10_im;
616  b1_re = i11_re; b1_im = i11_im;
617  b2_re = i12_re; b2_im = i12_im;
618 
619  // read gauge matrix from device memory
620  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
621 
622  // reconstruct gauge matrix
624 
625  // multiply row 0
626  spinorFloat A0_re = 0;
627  A0_re += g00_re * a0_re;
628  A0_re -= g00_im * a0_im;
629  A0_re += g01_re * a1_re;
630  A0_re -= g01_im * a1_im;
631  A0_re += g02_re * a2_re;
632  A0_re -= g02_im * a2_im;
633  spinorFloat A0_im = 0;
634  A0_im += g00_re * a0_im;
635  A0_im += g00_im * a0_re;
636  A0_im += g01_re * a1_im;
637  A0_im += g01_im * a1_re;
638  A0_im += g02_re * a2_im;
639  A0_im += g02_im * a2_re;
640  spinorFloat B0_re = 0;
641  B0_re += g00_re * b0_re;
642  B0_re -= g00_im * b0_im;
643  B0_re += g01_re * b1_re;
644  B0_re -= g01_im * b1_im;
645  B0_re += g02_re * b2_re;
646  B0_re -= g02_im * b2_im;
647  spinorFloat B0_im = 0;
648  B0_im += g00_re * b0_im;
649  B0_im += g00_im * b0_re;
650  B0_im += g01_re * b1_im;
651  B0_im += g01_im * b1_re;
652  B0_im += g02_re * b2_im;
653  B0_im += g02_im * b2_re;
654 
655  // multiply row 1
656  spinorFloat A1_re = 0;
657  A1_re += g10_re * a0_re;
658  A1_re -= g10_im * a0_im;
659  A1_re += g11_re * a1_re;
660  A1_re -= g11_im * a1_im;
661  A1_re += g12_re * a2_re;
662  A1_re -= g12_im * a2_im;
663  spinorFloat A1_im = 0;
664  A1_im += g10_re * a0_im;
665  A1_im += g10_im * a0_re;
666  A1_im += g11_re * a1_im;
667  A1_im += g11_im * a1_re;
668  A1_im += g12_re * a2_im;
669  A1_im += g12_im * a2_re;
670  spinorFloat B1_re = 0;
671  B1_re += g10_re * b0_re;
672  B1_re -= g10_im * b0_im;
673  B1_re += g11_re * b1_re;
674  B1_re -= g11_im * b1_im;
675  B1_re += g12_re * b2_re;
676  B1_re -= g12_im * b2_im;
677  spinorFloat B1_im = 0;
678  B1_im += g10_re * b0_im;
679  B1_im += g10_im * b0_re;
680  B1_im += g11_re * b1_im;
681  B1_im += g11_im * b1_re;
682  B1_im += g12_re * b2_im;
683  B1_im += g12_im * b2_re;
684 
685  // multiply row 2
686  spinorFloat A2_re = 0;
687  A2_re += g20_re * a0_re;
688  A2_re -= g20_im * a0_im;
689  A2_re += g21_re * a1_re;
690  A2_re -= g21_im * a1_im;
691  A2_re += g22_re * a2_re;
692  A2_re -= g22_im * a2_im;
693  spinorFloat A2_im = 0;
694  A2_im += g20_re * a0_im;
695  A2_im += g20_im * a0_re;
696  A2_im += g21_re * a1_im;
697  A2_im += g21_im * a1_re;
698  A2_im += g22_re * a2_im;
699  A2_im += g22_im * a2_re;
700  spinorFloat B2_re = 0;
701  B2_re += g20_re * b0_re;
702  B2_re -= g20_im * b0_im;
703  B2_re += g21_re * b1_re;
704  B2_re -= g21_im * b1_im;
705  B2_re += g22_re * b2_re;
706  B2_re -= g22_im * b2_im;
707  spinorFloat B2_im = 0;
708  B2_im += g20_re * b0_im;
709  B2_im += g20_im * b0_re;
710  B2_im += g21_re * b1_im;
711  B2_im += g21_im * b1_re;
712  B2_im += g22_re * b2_im;
713  B2_im += g22_im * b2_re;
714 
715  o00_re += A0_re;
716  o00_im += A0_im;
717  o10_re += B0_re;
718  o10_im += B0_im;
719  o20_re += B0_re;
720  o20_im += B0_im;
721  o30_re -= A0_re;
722  o30_im -= A0_im;
723 
724  o01_re += A1_re;
725  o01_im += A1_im;
726  o11_re += B1_re;
727  o11_im += B1_im;
728  o21_re += B1_re;
729  o21_im += B1_im;
730  o31_re -= A1_re;
731  o31_im -= A1_im;
732 
733  o02_re += A2_re;
734  o02_im += A2_im;
735  o12_re += B2_re;
736  o12_im += B2_im;
737  o22_re += B2_re;
738  o22_im += B2_im;
739  o32_re -= A2_re;
740  o32_im -= A2_im;
741 
742 }
743 
744 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
745 {
746  // Projector P1+
747  // 1 0 0 1
748  // 0 1 -1 0
749  // 0 -1 1 0
750  // 1 0 0 1
751 
752  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
753  const int sp_idx = face_idx + param.ghostOffset[1];
754 #if (DD_PREC==2)
755  sp_norm_idx = face_idx + param.ghostNormOffset[1];
756 #endif
757 
758  const int ga_idx = Vh+face_idx;
759 
766 
767 
768  const int sp_stride_pad = ghostFace[1];
769 
770  // read half spinor from device memory
771  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
772 
773  a0_re = i00_re; a0_im = i00_im;
774  a1_re = i01_re; a1_im = i01_im;
775  a2_re = i02_re; a2_im = i02_im;
776  b0_re = i10_re; b0_im = i10_im;
777  b1_re = i11_re; b1_im = i11_im;
778  b2_re = i12_re; b2_im = i12_im;
779 
780  // read gauge matrix from device memory
781  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
782 
783  // reconstruct gauge matrix
785 
786  // multiply row 0
787  spinorFloat A0_re = 0;
788  A0_re += gT00_re * a0_re;
789  A0_re -= gT00_im * a0_im;
790  A0_re += gT01_re * a1_re;
791  A0_re -= gT01_im * a1_im;
792  A0_re += gT02_re * a2_re;
793  A0_re -= gT02_im * a2_im;
794  spinorFloat A0_im = 0;
795  A0_im += gT00_re * a0_im;
796  A0_im += gT00_im * a0_re;
797  A0_im += gT01_re * a1_im;
798  A0_im += gT01_im * a1_re;
799  A0_im += gT02_re * a2_im;
800  A0_im += gT02_im * a2_re;
801  spinorFloat B0_re = 0;
802  B0_re += gT00_re * b0_re;
803  B0_re -= gT00_im * b0_im;
804  B0_re += gT01_re * b1_re;
805  B0_re -= gT01_im * b1_im;
806  B0_re += gT02_re * b2_re;
807  B0_re -= gT02_im * b2_im;
808  spinorFloat B0_im = 0;
809  B0_im += gT00_re * b0_im;
810  B0_im += gT00_im * b0_re;
811  B0_im += gT01_re * b1_im;
812  B0_im += gT01_im * b1_re;
813  B0_im += gT02_re * b2_im;
814  B0_im += gT02_im * b2_re;
815 
816  // multiply row 1
817  spinorFloat A1_re = 0;
818  A1_re += gT10_re * a0_re;
819  A1_re -= gT10_im * a0_im;
820  A1_re += gT11_re * a1_re;
821  A1_re -= gT11_im * a1_im;
822  A1_re += gT12_re * a2_re;
823  A1_re -= gT12_im * a2_im;
824  spinorFloat A1_im = 0;
825  A1_im += gT10_re * a0_im;
826  A1_im += gT10_im * a0_re;
827  A1_im += gT11_re * a1_im;
828  A1_im += gT11_im * a1_re;
829  A1_im += gT12_re * a2_im;
830  A1_im += gT12_im * a2_re;
831  spinorFloat B1_re = 0;
832  B1_re += gT10_re * b0_re;
833  B1_re -= gT10_im * b0_im;
834  B1_re += gT11_re * b1_re;
835  B1_re -= gT11_im * b1_im;
836  B1_re += gT12_re * b2_re;
837  B1_re -= gT12_im * b2_im;
838  spinorFloat B1_im = 0;
839  B1_im += gT10_re * b0_im;
840  B1_im += gT10_im * b0_re;
841  B1_im += gT11_re * b1_im;
842  B1_im += gT11_im * b1_re;
843  B1_im += gT12_re * b2_im;
844  B1_im += gT12_im * b2_re;
845 
846  // multiply row 2
847  spinorFloat A2_re = 0;
848  A2_re += gT20_re * a0_re;
849  A2_re -= gT20_im * a0_im;
850  A2_re += gT21_re * a1_re;
851  A2_re -= gT21_im * a1_im;
852  A2_re += gT22_re * a2_re;
853  A2_re -= gT22_im * a2_im;
854  spinorFloat A2_im = 0;
855  A2_im += gT20_re * a0_im;
856  A2_im += gT20_im * a0_re;
857  A2_im += gT21_re * a1_im;
858  A2_im += gT21_im * a1_re;
859  A2_im += gT22_re * a2_im;
860  A2_im += gT22_im * a2_re;
861  spinorFloat B2_re = 0;
862  B2_re += gT20_re * b0_re;
863  B2_re -= gT20_im * b0_im;
864  B2_re += gT21_re * b1_re;
865  B2_re -= gT21_im * b1_im;
866  B2_re += gT22_re * b2_re;
867  B2_re -= gT22_im * b2_im;
868  spinorFloat B2_im = 0;
869  B2_im += gT20_re * b0_im;
870  B2_im += gT20_im * b0_re;
871  B2_im += gT21_re * b1_im;
872  B2_im += gT21_im * b1_re;
873  B2_im += gT22_re * b2_im;
874  B2_im += gT22_im * b2_re;
875 
876  o00_re += A0_re;
877  o00_im += A0_im;
878  o10_re += B0_re;
879  o10_im += B0_im;
880  o20_re -= B0_re;
881  o20_im -= B0_im;
882  o30_re += A0_re;
883  o30_im += A0_im;
884 
885  o01_re += A1_re;
886  o01_im += A1_im;
887  o11_re += B1_re;
888  o11_im += B1_im;
889  o21_re -= B1_re;
890  o21_im -= B1_im;
891  o31_re += A1_re;
892  o31_im += A1_im;
893 
894  o02_re += A2_re;
895  o02_im += A2_im;
896  o12_re += B2_re;
897  o12_im += B2_im;
898  o22_re -= B2_re;
899  o22_im -= B2_im;
900  o32_re += A2_re;
901  o32_im += A2_im;
902 
903 }
904 
905 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
906 {
907  // Projector P2-
908  // 1 0 -i 0
909  // 0 1 0 i
910  // i 0 1 0
911  // 0 -i 0 1
912 
913  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
914  const int sp_idx = face_idx + param.ghostOffset[2];
915 #if (DD_PREC==2)
916  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
917 #endif
918 
919  const int ga_idx = sid;
920 
927 
928 
929  const int sp_stride_pad = ghostFace[2];
930 
931  // read half spinor from device memory
932  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
933 
934  a0_re = i00_re; a0_im = i00_im;
935  a1_re = i01_re; a1_im = i01_im;
936  a2_re = i02_re; a2_im = i02_im;
937  b0_re = i10_re; b0_im = i10_im;
938  b1_re = i11_re; b1_im = i11_im;
939  b2_re = i12_re; b2_im = i12_im;
940 
941  // read gauge matrix from device memory
942  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
943 
944  // reconstruct gauge matrix
946 
947  // multiply row 0
948  spinorFloat A0_re = 0;
949  A0_re += g00_re * a0_re;
950  A0_re -= g00_im * a0_im;
951  A0_re += g01_re * a1_re;
952  A0_re -= g01_im * a1_im;
953  A0_re += g02_re * a2_re;
954  A0_re -= g02_im * a2_im;
955  spinorFloat A0_im = 0;
956  A0_im += g00_re * a0_im;
957  A0_im += g00_im * a0_re;
958  A0_im += g01_re * a1_im;
959  A0_im += g01_im * a1_re;
960  A0_im += g02_re * a2_im;
961  A0_im += g02_im * a2_re;
962  spinorFloat B0_re = 0;
963  B0_re += g00_re * b0_re;
964  B0_re -= g00_im * b0_im;
965  B0_re += g01_re * b1_re;
966  B0_re -= g01_im * b1_im;
967  B0_re += g02_re * b2_re;
968  B0_re -= g02_im * b2_im;
969  spinorFloat B0_im = 0;
970  B0_im += g00_re * b0_im;
971  B0_im += g00_im * b0_re;
972  B0_im += g01_re * b1_im;
973  B0_im += g01_im * b1_re;
974  B0_im += g02_re * b2_im;
975  B0_im += g02_im * b2_re;
976 
977  // multiply row 1
978  spinorFloat A1_re = 0;
979  A1_re += g10_re * a0_re;
980  A1_re -= g10_im * a0_im;
981  A1_re += g11_re * a1_re;
982  A1_re -= g11_im * a1_im;
983  A1_re += g12_re * a2_re;
984  A1_re -= g12_im * a2_im;
985  spinorFloat A1_im = 0;
986  A1_im += g10_re * a0_im;
987  A1_im += g10_im * a0_re;
988  A1_im += g11_re * a1_im;
989  A1_im += g11_im * a1_re;
990  A1_im += g12_re * a2_im;
991  A1_im += g12_im * a2_re;
992  spinorFloat B1_re = 0;
993  B1_re += g10_re * b0_re;
994  B1_re -= g10_im * b0_im;
995  B1_re += g11_re * b1_re;
996  B1_re -= g11_im * b1_im;
997  B1_re += g12_re * b2_re;
998  B1_re -= g12_im * b2_im;
999  spinorFloat B1_im = 0;
1000  B1_im += g10_re * b0_im;
1001  B1_im += g10_im * b0_re;
1002  B1_im += g11_re * b1_im;
1003  B1_im += g11_im * b1_re;
1004  B1_im += g12_re * b2_im;
1005  B1_im += g12_im * b2_re;
1006 
1007  // multiply row 2
1008  spinorFloat A2_re = 0;
1009  A2_re += g20_re * a0_re;
1010  A2_re -= g20_im * a0_im;
1011  A2_re += g21_re * a1_re;
1012  A2_re -= g21_im * a1_im;
1013  A2_re += g22_re * a2_re;
1014  A2_re -= g22_im * a2_im;
1015  spinorFloat A2_im = 0;
1016  A2_im += g20_re * a0_im;
1017  A2_im += g20_im * a0_re;
1018  A2_im += g21_re * a1_im;
1019  A2_im += g21_im * a1_re;
1020  A2_im += g22_re * a2_im;
1021  A2_im += g22_im * a2_re;
1022  spinorFloat B2_re = 0;
1023  B2_re += g20_re * b0_re;
1024  B2_re -= g20_im * b0_im;
1025  B2_re += g21_re * b1_re;
1026  B2_re -= g21_im * b1_im;
1027  B2_re += g22_re * b2_re;
1028  B2_re -= g22_im * b2_im;
1029  spinorFloat B2_im = 0;
1030  B2_im += g20_re * b0_im;
1031  B2_im += g20_im * b0_re;
1032  B2_im += g21_re * b1_im;
1033  B2_im += g21_im * b1_re;
1034  B2_im += g22_re * b2_im;
1035  B2_im += g22_im * b2_re;
1036 
1037  o00_re += A0_re;
1038  o00_im += A0_im;
1039  o10_re += B0_re;
1040  o10_im += B0_im;
1041  o20_re -= A0_im;
1042  o20_im += A0_re;
1043  o30_re += B0_im;
1044  o30_im -= B0_re;
1045 
1046  o01_re += A1_re;
1047  o01_im += A1_im;
1048  o11_re += B1_re;
1049  o11_im += B1_im;
1050  o21_re -= A1_im;
1051  o21_im += A1_re;
1052  o31_re += B1_im;
1053  o31_im -= B1_re;
1054 
1055  o02_re += A2_re;
1056  o02_im += A2_im;
1057  o12_re += B2_re;
1058  o12_im += B2_im;
1059  o22_re -= A2_im;
1060  o22_im += A2_re;
1061  o32_re += B2_im;
1062  o32_im -= B2_re;
1063 
1064 }
1065 
1066 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1067 {
1068  // Projector P2+
1069  // 1 0 i 0
1070  // 0 1 0 -i
1071  // -i 0 1 0
1072  // 0 i 0 1
1073 
1074  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1075  const int sp_idx = face_idx + param.ghostOffset[2];
1076 #if (DD_PREC==2)
1077  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1078 #endif
1079 
1080  const int ga_idx = Vh+face_idx;
1081 
1088 
1089 
1090  const int sp_stride_pad = ghostFace[2];
1091 
1092  // read half spinor from device memory
1093  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1094 
1095  a0_re = i00_re; a0_im = i00_im;
1096  a1_re = i01_re; a1_im = i01_im;
1097  a2_re = i02_re; a2_im = i02_im;
1098  b0_re = i10_re; b0_im = i10_im;
1099  b1_re = i11_re; b1_im = i11_im;
1100  b2_re = i12_re; b2_im = i12_im;
1101 
1102  // read gauge matrix from device memory
1103  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1104 
1105  // reconstruct gauge matrix
1107 
1108  // multiply row 0
1109  spinorFloat A0_re = 0;
1110  A0_re += gT00_re * a0_re;
1111  A0_re -= gT00_im * a0_im;
1112  A0_re += gT01_re * a1_re;
1113  A0_re -= gT01_im * a1_im;
1114  A0_re += gT02_re * a2_re;
1115  A0_re -= gT02_im * a2_im;
1116  spinorFloat A0_im = 0;
1117  A0_im += gT00_re * a0_im;
1118  A0_im += gT00_im * a0_re;
1119  A0_im += gT01_re * a1_im;
1120  A0_im += gT01_im * a1_re;
1121  A0_im += gT02_re * a2_im;
1122  A0_im += gT02_im * a2_re;
1123  spinorFloat B0_re = 0;
1124  B0_re += gT00_re * b0_re;
1125  B0_re -= gT00_im * b0_im;
1126  B0_re += gT01_re * b1_re;
1127  B0_re -= gT01_im * b1_im;
1128  B0_re += gT02_re * b2_re;
1129  B0_re -= gT02_im * b2_im;
1130  spinorFloat B0_im = 0;
1131  B0_im += gT00_re * b0_im;
1132  B0_im += gT00_im * b0_re;
1133  B0_im += gT01_re * b1_im;
1134  B0_im += gT01_im * b1_re;
1135  B0_im += gT02_re * b2_im;
1136  B0_im += gT02_im * b2_re;
1137 
1138  // multiply row 1
1139  spinorFloat A1_re = 0;
1140  A1_re += gT10_re * a0_re;
1141  A1_re -= gT10_im * a0_im;
1142  A1_re += gT11_re * a1_re;
1143  A1_re -= gT11_im * a1_im;
1144  A1_re += gT12_re * a2_re;
1145  A1_re -= gT12_im * a2_im;
1146  spinorFloat A1_im = 0;
1147  A1_im += gT10_re * a0_im;
1148  A1_im += gT10_im * a0_re;
1149  A1_im += gT11_re * a1_im;
1150  A1_im += gT11_im * a1_re;
1151  A1_im += gT12_re * a2_im;
1152  A1_im += gT12_im * a2_re;
1153  spinorFloat B1_re = 0;
1154  B1_re += gT10_re * b0_re;
1155  B1_re -= gT10_im * b0_im;
1156  B1_re += gT11_re * b1_re;
1157  B1_re -= gT11_im * b1_im;
1158  B1_re += gT12_re * b2_re;
1159  B1_re -= gT12_im * b2_im;
1160  spinorFloat B1_im = 0;
1161  B1_im += gT10_re * b0_im;
1162  B1_im += gT10_im * b0_re;
1163  B1_im += gT11_re * b1_im;
1164  B1_im += gT11_im * b1_re;
1165  B1_im += gT12_re * b2_im;
1166  B1_im += gT12_im * b2_re;
1167 
1168  // multiply row 2
1169  spinorFloat A2_re = 0;
1170  A2_re += gT20_re * a0_re;
1171  A2_re -= gT20_im * a0_im;
1172  A2_re += gT21_re * a1_re;
1173  A2_re -= gT21_im * a1_im;
1174  A2_re += gT22_re * a2_re;
1175  A2_re -= gT22_im * a2_im;
1176  spinorFloat A2_im = 0;
1177  A2_im += gT20_re * a0_im;
1178  A2_im += gT20_im * a0_re;
1179  A2_im += gT21_re * a1_im;
1180  A2_im += gT21_im * a1_re;
1181  A2_im += gT22_re * a2_im;
1182  A2_im += gT22_im * a2_re;
1183  spinorFloat B2_re = 0;
1184  B2_re += gT20_re * b0_re;
1185  B2_re -= gT20_im * b0_im;
1186  B2_re += gT21_re * b1_re;
1187  B2_re -= gT21_im * b1_im;
1188  B2_re += gT22_re * b2_re;
1189  B2_re -= gT22_im * b2_im;
1190  spinorFloat B2_im = 0;
1191  B2_im += gT20_re * b0_im;
1192  B2_im += gT20_im * b0_re;
1193  B2_im += gT21_re * b1_im;
1194  B2_im += gT21_im * b1_re;
1195  B2_im += gT22_re * b2_im;
1196  B2_im += gT22_im * b2_re;
1197 
1198  o00_re += A0_re;
1199  o00_im += A0_im;
1200  o10_re += B0_re;
1201  o10_im += B0_im;
1202  o20_re += A0_im;
1203  o20_im -= A0_re;
1204  o30_re -= B0_im;
1205  o30_im += B0_re;
1206 
1207  o01_re += A1_re;
1208  o01_im += A1_im;
1209  o11_re += B1_re;
1210  o11_im += B1_im;
1211  o21_re += A1_im;
1212  o21_im -= A1_re;
1213  o31_re -= B1_im;
1214  o31_im += B1_re;
1215 
1216  o02_re += A2_re;
1217  o02_im += A2_im;
1218  o12_re += B2_re;
1219  o12_im += B2_im;
1220  o22_re += A2_im;
1221  o22_im -= A2_re;
1222  o32_re -= B2_im;
1223  o32_im += B2_re;
1224 
1225 }
1226 
1227 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1228 {
1229  // Projector P3-
1230  // 0 0 0 0
1231  // 0 0 0 0
1232  // 0 0 2 0
1233  // 0 0 0 2
1234 
1235  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1236  const int sp_idx = face_idx + param.ghostOffset[3];
1237 #if (DD_PREC==2)
1238  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1239 #endif
1240 
1241  const int ga_idx = sid;
1242 
1243  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1244  {
1251 
1252 
1253  const int sp_stride_pad = ghostFace[3];
1254  //const int t_proj_scale = TPROJSCALE;
1255 
1256  // read half spinor from device memory
1257  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1258 
1259 #ifdef TWIST_INV_DSLASH
1260  a0_re = i00_re; a0_im = i00_im;
1261  a1_re = i01_re; a1_im = i01_im;
1262  a2_re = i02_re; a2_im = i02_im;
1263  b0_re = i10_re; b0_im = i10_im;
1264  b1_re = i11_re; b1_im = i11_im;
1265  b2_re = i12_re; b2_im = i12_im;
1266 #else
1267  a0_re = 2*i00_re; a0_im = 2*i00_im;
1268  a1_re = 2*i01_re; a1_im = 2*i01_im;
1269  a2_re = 2*i02_re; a2_im = 2*i02_im;
1270  b0_re = 2*i10_re; b0_im = 2*i10_im;
1271  b1_re = 2*i11_re; b1_im = 2*i11_im;
1272  b2_re = 2*i12_re; b2_im = 2*i12_im;
1273 #endif
1274 
1275  // identity gauge matrix
1282 
1283  o20_re += A0_re;
1284  o20_im += A0_im;
1285  o30_re += B0_re;
1286  o30_im += B0_im;
1287 
1288  o21_re += A1_re;
1289  o21_im += A1_im;
1290  o31_re += B1_re;
1291  o31_im += B1_im;
1292 
1293  o22_re += A2_re;
1294  o22_im += A2_im;
1295  o32_re += B2_re;
1296  o32_im += B2_im;
1297 
1298  } else {
1305 
1306 
1307  const int sp_stride_pad = ghostFace[3];
1308  //const int t_proj_scale = TPROJSCALE;
1309 
1310  // read half spinor from device memory
1311  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1312 
1313 #ifdef TWIST_INV_DSLASH
1314  a0_re = i00_re; a0_im = i00_im;
1315  a1_re = i01_re; a1_im = i01_im;
1316  a2_re = i02_re; a2_im = i02_im;
1317  b0_re = i10_re; b0_im = i10_im;
1318  b1_re = i11_re; b1_im = i11_im;
1319  b2_re = i12_re; b2_im = i12_im;
1320 #else
1321  a0_re = 2*i00_re; a0_im = 2*i00_im;
1322  a1_re = 2*i01_re; a1_im = 2*i01_im;
1323  a2_re = 2*i02_re; a2_im = 2*i02_im;
1324  b0_re = 2*i10_re; b0_im = 2*i10_im;
1325  b1_re = 2*i11_re; b1_im = 2*i11_im;
1326  b2_re = 2*i12_re; b2_im = 2*i12_im;
1327 #endif
1328 
1329  // read gauge matrix from device memory
1330  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1331 
1332  // reconstruct gauge matrix
1334 
1335  // multiply row 0
1336  spinorFloat A0_re = 0;
1337  A0_re += g00_re * a0_re;
1338  A0_re -= g00_im * a0_im;
1339  A0_re += g01_re * a1_re;
1340  A0_re -= g01_im * a1_im;
1341  A0_re += g02_re * a2_re;
1342  A0_re -= g02_im * a2_im;
1343  spinorFloat A0_im = 0;
1344  A0_im += g00_re * a0_im;
1345  A0_im += g00_im * a0_re;
1346  A0_im += g01_re * a1_im;
1347  A0_im += g01_im * a1_re;
1348  A0_im += g02_re * a2_im;
1349  A0_im += g02_im * a2_re;
1350  spinorFloat B0_re = 0;
1351  B0_re += g00_re * b0_re;
1352  B0_re -= g00_im * b0_im;
1353  B0_re += g01_re * b1_re;
1354  B0_re -= g01_im * b1_im;
1355  B0_re += g02_re * b2_re;
1356  B0_re -= g02_im * b2_im;
1357  spinorFloat B0_im = 0;
1358  B0_im += g00_re * b0_im;
1359  B0_im += g00_im * b0_re;
1360  B0_im += g01_re * b1_im;
1361  B0_im += g01_im * b1_re;
1362  B0_im += g02_re * b2_im;
1363  B0_im += g02_im * b2_re;
1364 
1365  // multiply row 1
1366  spinorFloat A1_re = 0;
1367  A1_re += g10_re * a0_re;
1368  A1_re -= g10_im * a0_im;
1369  A1_re += g11_re * a1_re;
1370  A1_re -= g11_im * a1_im;
1371  A1_re += g12_re * a2_re;
1372  A1_re -= g12_im * a2_im;
1373  spinorFloat A1_im = 0;
1374  A1_im += g10_re * a0_im;
1375  A1_im += g10_im * a0_re;
1376  A1_im += g11_re * a1_im;
1377  A1_im += g11_im * a1_re;
1378  A1_im += g12_re * a2_im;
1379  A1_im += g12_im * a2_re;
1380  spinorFloat B1_re = 0;
1381  B1_re += g10_re * b0_re;
1382  B1_re -= g10_im * b0_im;
1383  B1_re += g11_re * b1_re;
1384  B1_re -= g11_im * b1_im;
1385  B1_re += g12_re * b2_re;
1386  B1_re -= g12_im * b2_im;
1387  spinorFloat B1_im = 0;
1388  B1_im += g10_re * b0_im;
1389  B1_im += g10_im * b0_re;
1390  B1_im += g11_re * b1_im;
1391  B1_im += g11_im * b1_re;
1392  B1_im += g12_re * b2_im;
1393  B1_im += g12_im * b2_re;
1394 
1395  // multiply row 2
1396  spinorFloat A2_re = 0;
1397  A2_re += g20_re * a0_re;
1398  A2_re -= g20_im * a0_im;
1399  A2_re += g21_re * a1_re;
1400  A2_re -= g21_im * a1_im;
1401  A2_re += g22_re * a2_re;
1402  A2_re -= g22_im * a2_im;
1403  spinorFloat A2_im = 0;
1404  A2_im += g20_re * a0_im;
1405  A2_im += g20_im * a0_re;
1406  A2_im += g21_re * a1_im;
1407  A2_im += g21_im * a1_re;
1408  A2_im += g22_re * a2_im;
1409  A2_im += g22_im * a2_re;
1410  spinorFloat B2_re = 0;
1411  B2_re += g20_re * b0_re;
1412  B2_re -= g20_im * b0_im;
1413  B2_re += g21_re * b1_re;
1414  B2_re -= g21_im * b1_im;
1415  B2_re += g22_re * b2_re;
1416  B2_re -= g22_im * b2_im;
1417  spinorFloat B2_im = 0;
1418  B2_im += g20_re * b0_im;
1419  B2_im += g20_im * b0_re;
1420  B2_im += g21_re * b1_im;
1421  B2_im += g21_im * b1_re;
1422  B2_im += g22_re * b2_im;
1423  B2_im += g22_im * b2_re;
1424 
1425  o20_re += A0_re;
1426  o20_im += A0_im;
1427  o30_re += B0_re;
1428  o30_im += B0_im;
1429 
1430  o21_re += A1_re;
1431  o21_im += A1_im;
1432  o31_re += B1_re;
1433  o31_im += B1_im;
1434 
1435  o22_re += A2_re;
1436  o22_im += A2_im;
1437  o32_re += B2_re;
1438  o32_im += B2_im;
1439 
1440  }
1441 }
1442 
1443 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1444 {
1445  // Projector P3+
1446  // 2 0 0 0
1447  // 0 2 0 0
1448  // 0 0 0 0
1449  // 0 0 0 0
1450 
1451  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1452  const int sp_idx = face_idx + param.ghostOffset[3];
1453 #if (DD_PREC==2)
1454  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1455 #endif
1456 
1457  const int ga_idx = Vh+face_idx;
1458 
1459  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1460  {
1467 
1468 
1469  const int sp_stride_pad = ghostFace[3];
1470  //const int t_proj_scale = TPROJSCALE;
1471 
1472  // read half spinor from device memory
1473  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1474 
1475 #ifdef TWIST_INV_DSLASH
1476  a0_re = i00_re; a0_im = i00_im;
1477  a1_re = i01_re; a1_im = i01_im;
1478  a2_re = i02_re; a2_im = i02_im;
1479  b0_re = i10_re; b0_im = i10_im;
1480  b1_re = i11_re; b1_im = i11_im;
1481  b2_re = i12_re; b2_im = i12_im;
1482 #else
1483  a0_re = 2*i00_re; a0_im = 2*i00_im;
1484  a1_re = 2*i01_re; a1_im = 2*i01_im;
1485  a2_re = 2*i02_re; a2_im = 2*i02_im;
1486  b0_re = 2*i10_re; b0_im = 2*i10_im;
1487  b1_re = 2*i11_re; b1_im = 2*i11_im;
1488  b2_re = 2*i12_re; b2_im = 2*i12_im;
1489 #endif
1490 
1491  // identity gauge matrix
1498 
1499  o00_re += A0_re;
1500  o00_im += A0_im;
1501  o10_re += B0_re;
1502  o10_im += B0_im;
1503 
1504  o01_re += A1_re;
1505  o01_im += A1_im;
1506  o11_re += B1_re;
1507  o11_im += B1_im;
1508 
1509  o02_re += A2_re;
1510  o02_im += A2_im;
1511  o12_re += B2_re;
1512  o12_im += B2_im;
1513 
1514  } else {
1521 
1522 
1523  const int sp_stride_pad = ghostFace[3];
1524  //const int t_proj_scale = TPROJSCALE;
1525 
1526  // read half spinor from device memory
1527  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1528 
1529 #ifdef TWIST_INV_DSLASH
1530  a0_re = i00_re; a0_im = i00_im;
1531  a1_re = i01_re; a1_im = i01_im;
1532  a2_re = i02_re; a2_im = i02_im;
1533  b0_re = i10_re; b0_im = i10_im;
1534  b1_re = i11_re; b1_im = i11_im;
1535  b2_re = i12_re; b2_im = i12_im;
1536 #else
1537  a0_re = 2*i00_re; a0_im = 2*i00_im;
1538  a1_re = 2*i01_re; a1_im = 2*i01_im;
1539  a2_re = 2*i02_re; a2_im = 2*i02_im;
1540  b0_re = 2*i10_re; b0_im = 2*i10_im;
1541  b1_re = 2*i11_re; b1_im = 2*i11_im;
1542  b2_re = 2*i12_re; b2_im = 2*i12_im;
1543 #endif
1544 
1545  // read gauge matrix from device memory
1546  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1547 
1548  // reconstruct gauge matrix
1550 
1551  // multiply row 0
1552  spinorFloat A0_re = 0;
1553  A0_re += gT00_re * a0_re;
1554  A0_re -= gT00_im * a0_im;
1555  A0_re += gT01_re * a1_re;
1556  A0_re -= gT01_im * a1_im;
1557  A0_re += gT02_re * a2_re;
1558  A0_re -= gT02_im * a2_im;
1559  spinorFloat A0_im = 0;
1560  A0_im += gT00_re * a0_im;
1561  A0_im += gT00_im * a0_re;
1562  A0_im += gT01_re * a1_im;
1563  A0_im += gT01_im * a1_re;
1564  A0_im += gT02_re * a2_im;
1565  A0_im += gT02_im * a2_re;
1566  spinorFloat B0_re = 0;
1567  B0_re += gT00_re * b0_re;
1568  B0_re -= gT00_im * b0_im;
1569  B0_re += gT01_re * b1_re;
1570  B0_re -= gT01_im * b1_im;
1571  B0_re += gT02_re * b2_re;
1572  B0_re -= gT02_im * b2_im;
1573  spinorFloat B0_im = 0;
1574  B0_im += gT00_re * b0_im;
1575  B0_im += gT00_im * b0_re;
1576  B0_im += gT01_re * b1_im;
1577  B0_im += gT01_im * b1_re;
1578  B0_im += gT02_re * b2_im;
1579  B0_im += gT02_im * b2_re;
1580 
1581  // multiply row 1
1582  spinorFloat A1_re = 0;
1583  A1_re += gT10_re * a0_re;
1584  A1_re -= gT10_im * a0_im;
1585  A1_re += gT11_re * a1_re;
1586  A1_re -= gT11_im * a1_im;
1587  A1_re += gT12_re * a2_re;
1588  A1_re -= gT12_im * a2_im;
1589  spinorFloat A1_im = 0;
1590  A1_im += gT10_re * a0_im;
1591  A1_im += gT10_im * a0_re;
1592  A1_im += gT11_re * a1_im;
1593  A1_im += gT11_im * a1_re;
1594  A1_im += gT12_re * a2_im;
1595  A1_im += gT12_im * a2_re;
1596  spinorFloat B1_re = 0;
1597  B1_re += gT10_re * b0_re;
1598  B1_re -= gT10_im * b0_im;
1599  B1_re += gT11_re * b1_re;
1600  B1_re -= gT11_im * b1_im;
1601  B1_re += gT12_re * b2_re;
1602  B1_re -= gT12_im * b2_im;
1603  spinorFloat B1_im = 0;
1604  B1_im += gT10_re * b0_im;
1605  B1_im += gT10_im * b0_re;
1606  B1_im += gT11_re * b1_im;
1607  B1_im += gT11_im * b1_re;
1608  B1_im += gT12_re * b2_im;
1609  B1_im += gT12_im * b2_re;
1610 
1611  // multiply row 2
1612  spinorFloat A2_re = 0;
1613  A2_re += gT20_re * a0_re;
1614  A2_re -= gT20_im * a0_im;
1615  A2_re += gT21_re * a1_re;
1616  A2_re -= gT21_im * a1_im;
1617  A2_re += gT22_re * a2_re;
1618  A2_re -= gT22_im * a2_im;
1619  spinorFloat A2_im = 0;
1620  A2_im += gT20_re * a0_im;
1621  A2_im += gT20_im * a0_re;
1622  A2_im += gT21_re * a1_im;
1623  A2_im += gT21_im * a1_re;
1624  A2_im += gT22_re * a2_im;
1625  A2_im += gT22_im * a2_re;
1626  spinorFloat B2_re = 0;
1627  B2_re += gT20_re * b0_re;
1628  B2_re -= gT20_im * b0_im;
1629  B2_re += gT21_re * b1_re;
1630  B2_re -= gT21_im * b1_im;
1631  B2_re += gT22_re * b2_re;
1632  B2_re -= gT22_im * b2_im;
1633  spinorFloat B2_im = 0;
1634  B2_im += gT20_re * b0_im;
1635  B2_im += gT20_im * b0_re;
1636  B2_im += gT21_re * b1_im;
1637  B2_im += gT21_im * b1_re;
1638  B2_im += gT22_re * b2_im;
1639  B2_im += gT22_im * b2_re;
1640 
1641  o00_re += A0_re;
1642  o00_im += A0_im;
1643  o10_re += B0_re;
1644  o10_im += B0_im;
1645 
1646  o01_re += A1_re;
1647  o01_im += A1_im;
1648  o11_re += B1_re;
1649  o11_im += B1_im;
1650 
1651  o02_re += A2_re;
1652  o02_im += A2_im;
1653  o12_re += B2_re;
1654  o12_im += B2_im;
1655 
1656  }
1657 }
1658 
1659 {
1660 #ifdef DSLASH_XPAY
1661  READ_ACCUM(ACCUMTEX, param.sp_stride)
1662 
1663 #ifndef TWIST_XPAY
1664 #ifndef TWIST_INV_DSLASH
1665  //perform invert twist first:
1666  APPLY_TWIST_INV( a, b, o);
1667 #endif
1668  o00_re += acc00_re;
1669  o00_im += acc00_im;
1670  o01_re += acc01_re;
1671  o01_im += acc01_im;
1672  o02_re += acc02_re;
1673  o02_im += acc02_im;
1674  o10_re += acc10_re;
1675  o10_im += acc10_im;
1676  o11_re += acc11_re;
1677  o11_im += acc11_im;
1678  o12_re += acc12_re;
1679  o12_im += acc12_im;
1680  o20_re += acc20_re;
1681  o20_im += acc20_im;
1682  o21_re += acc21_re;
1683  o21_im += acc21_im;
1684  o22_re += acc22_re;
1685  o22_im += acc22_im;
1686  o30_re += acc30_re;
1687  o30_im += acc30_im;
1688  o31_re += acc31_re;
1689  o31_im += acc31_im;
1690  o32_re += acc32_re;
1691  o32_im += acc32_im;
1692 #else
1693  APPLY_TWIST( a, acc);
1694  //warning! b is unrelated to the twisted mass parameter in this case!
1695 
1696  o00_re = b*o00_re+acc00_re;
1697  o00_im = b*o00_im+acc00_im;
1698  o01_re = b*o01_re+acc01_re;
1699  o01_im = b*o01_im+acc01_im;
1700  o02_re = b*o02_re+acc02_re;
1701  o02_im = b*o02_im+acc02_im;
1702  o10_re = b*o10_re+acc10_re;
1703  o10_im = b*o10_im+acc10_im;
1704  o11_re = b*o11_re+acc11_re;
1705  o11_im = b*o11_im+acc11_im;
1706  o12_re = b*o12_re+acc12_re;
1707  o12_im = b*o12_im+acc12_im;
1708  o20_re = b*o20_re+acc20_re;
1709  o20_im = b*o20_im+acc20_im;
1710  o21_re = b*o21_re+acc21_re;
1711  o21_im = b*o21_im+acc21_im;
1712  o22_re = b*o22_re+acc22_re;
1713  o22_im = b*o22_im+acc22_im;
1714  o30_re = b*o30_re+acc30_re;
1715  o30_im = b*o30_im+acc30_im;
1716  o31_re = b*o31_re+acc31_re;
1717  o31_im = b*o31_im+acc31_im;
1718  o32_re = b*o32_re+acc32_re;
1719  o32_im = b*o32_im+acc32_im;
1720 #endif//TWIST_XPAY
1721 #else //no XPAY
1722 #ifndef TWIST_INV_DSLASH
1723  APPLY_TWIST_INV( a, b, o);
1724 #endif
1725 #endif
1726 }
1727 
1728 // write spinor field back to device memory
1729 WRITE_SPINOR(param.sp_stride);
1730 
1731 // undefine to prevent warning when precision is changed
1732 #undef spinorFloat
1733 #undef g00_re
1734 #undef g00_im
1735 #undef g01_re
1736 #undef g01_im
1737 #undef g02_re
1738 #undef g02_im
1739 #undef g10_re
1740 #undef g10_im
1741 #undef g11_re
1742 #undef g11_im
1743 #undef g12_re
1744 #undef g12_im
1745 #undef g20_re
1746 #undef g20_im
1747 #undef g21_re
1748 #undef g21_im
1749 #undef g22_re
1750 #undef g22_im
1751 
1752 #undef i00_re
1753 #undef i00_im
1754 #undef i01_re
1755 #undef i01_im
1756 #undef i02_re
1757 #undef i02_im
1758 #undef i10_re
1759 #undef i10_im
1760 #undef i11_re
1761 #undef i11_im
1762 #undef i12_re
1763 #undef i12_im
1764 #undef i20_re
1765 #undef i20_im
1766 #undef i21_re
1767 #undef i21_im
1768 #undef i22_re
1769 #undef i22_im
1770 #undef i30_re
1771 #undef i30_im
1772 #undef i31_re
1773 #undef i31_im
1774 #undef i32_re
1775 #undef i32_im
1776 
1777 #undef acc00_re
1778 #undef acc00_im
1779 #undef acc01_re
1780 #undef acc01_im
1781 #undef acc02_re
1782 #undef acc02_im
1783 #undef acc10_re
1784 #undef acc10_im
1785 #undef acc11_re
1786 #undef acc11_im
1787 #undef acc12_re
1788 #undef acc12_im
1789 #undef acc20_re
1790 #undef acc20_im
1791 #undef acc21_re
1792 #undef acc21_im
1793 #undef acc22_re
1794 #undef acc22_im
1795 #undef acc30_re
1796 #undef acc30_im
1797 #undef acc31_re
1798 #undef acc31_im
1799 #undef acc32_re
1800 #undef acc32_im
1801 
1802 
1803 
1804 #undef VOLATILE
1805 
1806 #endif // MULTI_GPU
__constant__ int Vh
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define o31_im
Definition: gamma5.h:293
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1