QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
tm_fused_exterior_dslash_g80_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 19
6 
7 
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
9 #define VOLATILE
10 #else // Open64 compiler
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define acc00_re accum0.x
41 #define acc00_im accum0.y
42 #define acc01_re accum1.x
43 #define acc01_im accum1.y
44 #define acc02_re accum2.x
45 #define acc02_im accum2.y
46 #define acc10_re accum3.x
47 #define acc10_im accum3.y
48 #define acc11_re accum4.x
49 #define acc11_im accum4.y
50 #define acc12_re accum5.x
51 #define acc12_im accum5.y
52 #define acc20_re accum6.x
53 #define acc20_im accum6.y
54 #define acc21_re accum7.x
55 #define acc21_im accum7.y
56 #define acc22_re accum8.x
57 #define acc22_im accum8.y
58 #define acc30_re accum9.x
59 #define acc30_im accum9.y
60 #define acc31_re accum10.x
61 #define acc31_im accum10.y
62 #define acc32_re accum11.x
63 #define acc32_im accum11.y
64 #else
65 #define spinorFloat float
66 #define i00_re I0.x
67 #define i00_im I0.y
68 #define i01_re I0.z
69 #define i01_im I0.w
70 #define i02_re I1.x
71 #define i02_im I1.y
72 #define i10_re I1.z
73 #define i10_im I1.w
74 #define i11_re I2.x
75 #define i11_im I2.y
76 #define i12_re I2.z
77 #define i12_im I2.w
78 #define i20_re I3.x
79 #define i20_im I3.y
80 #define i21_re I3.z
81 #define i21_im I3.w
82 #define i22_re I4.x
83 #define i22_im I4.y
84 #define i30_re I4.z
85 #define i30_im I4.w
86 #define i31_re I5.x
87 #define i31_im I5.y
88 #define i32_re I5.z
89 #define i32_im I5.w
90 #define acc00_re accum0.x
91 #define acc00_im accum0.y
92 #define acc01_re accum0.z
93 #define acc01_im accum0.w
94 #define acc02_re accum1.x
95 #define acc02_im accum1.y
96 #define acc10_re accum1.z
97 #define acc10_im accum1.w
98 #define acc11_re accum2.x
99 #define acc11_im accum2.y
100 #define acc12_re accum2.z
101 #define acc12_im accum2.w
102 #define acc20_re accum3.x
103 #define acc20_im accum3.y
104 #define acc21_re accum3.z
105 #define acc21_im accum3.w
106 #define acc22_re accum4.x
107 #define acc22_im accum4.y
108 #define acc30_re accum4.z
109 #define acc30_im accum4.w
110 #define acc31_re accum5.x
111 #define acc31_im accum5.y
112 #define acc32_re accum5.z
113 #define acc32_im accum5.w
114 #endif // SPINOR_DOUBLE
115 
116 // gauge link
117 #ifdef GAUGE_FLOAT2
118 #define g00_re G0.x
119 #define g00_im G0.y
120 #define g01_re G1.x
121 #define g01_im G1.y
122 #define g02_re G2.x
123 #define g02_im G2.y
124 #define g10_re G3.x
125 #define g10_im G3.y
126 #define g11_re G4.x
127 #define g11_im G4.y
128 #define g12_re G5.x
129 #define g12_im G5.y
130 #define g20_re G6.x
131 #define g20_im G6.y
132 #define g21_re G7.x
133 #define g21_im G7.y
134 #define g22_re G8.x
135 #define g22_im G8.y
136 
137 #else
138 #define g00_re G0.x
139 #define g00_im G0.y
140 #define g01_re G0.z
141 #define g01_im G0.w
142 #define g02_re G1.x
143 #define g02_im G1.y
144 #define g10_re G1.z
145 #define g10_im G1.w
146 #define g11_re G2.x
147 #define g11_im G2.y
148 #define g12_re G2.z
149 #define g12_im G2.w
150 #define g20_re G3.x
151 #define g20_im G3.y
152 #define g21_re G3.z
153 #define g21_im G3.w
154 #define g22_re G4.x
155 #define g22_im G4.y
156 
157 #endif // GAUGE_DOUBLE
158 
159 // conjugated gauge link
160 #define gT00_re (+g00_re)
161 #define gT00_im (-g00_im)
162 #define gT01_re (+g10_re)
163 #define gT01_im (-g10_im)
164 #define gT02_re (+g20_re)
165 #define gT02_im (-g20_im)
166 #define gT10_re (+g01_re)
167 #define gT10_im (-g01_im)
168 #define gT11_re (+g11_re)
169 #define gT11_im (-g11_im)
170 #define gT12_re (+g21_re)
171 #define gT12_im (-g21_im)
172 #define gT20_re (+g02_re)
173 #define gT20_im (-g02_im)
174 #define gT21_re (+g12_re)
175 #define gT21_im (-g12_im)
176 #define gT22_re (+g22_re)
177 #define gT22_im (-g22_im)
178 
179 // output spinor
180 #define o00_re s[0*SHARED_STRIDE]
181 #define o00_im s[1*SHARED_STRIDE]
182 #define o01_re s[2*SHARED_STRIDE]
183 #define o01_im s[3*SHARED_STRIDE]
184 #define o02_re s[4*SHARED_STRIDE]
185 #define o02_im s[5*SHARED_STRIDE]
186 #define o10_re s[6*SHARED_STRIDE]
187 #define o10_im s[7*SHARED_STRIDE]
188 #define o11_re s[8*SHARED_STRIDE]
189 #define o11_im s[9*SHARED_STRIDE]
190 #define o12_re s[10*SHARED_STRIDE]
191 #define o12_im s[11*SHARED_STRIDE]
192 #define o20_re s[12*SHARED_STRIDE]
193 #define o20_im s[13*SHARED_STRIDE]
194 #define o21_re s[14*SHARED_STRIDE]
195 #define o21_im s[15*SHARED_STRIDE]
196 #define o22_re s[16*SHARED_STRIDE]
197 #define o22_im s[17*SHARED_STRIDE]
198 #define o30_re s[18*SHARED_STRIDE]
204 
205 #ifdef SPINOR_DOUBLE
206 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
207 #else
208 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
209 #endif
210 
211 extern __shared__ char s_data[];
212 
214  + (threadIdx.x % SHARED_STRIDE);
215 
216 #include "read_gauge.h"
217 #include "io_spinor.h"
218 
219 int x1, x2, x3, x4;
220 int X;
221 
222 #if (DD_PREC==2) // half precision
223 int sp_norm_idx;
224 #endif // half precision
225 
226 int sid;
227 
228 int dim;
229 int face_idx;
230 int Y[4] = {X1,X2,X3,X4};
231 int faceVolume[4];
232 faceVolume[0] = (X2*X3*X4)>>1;
233 faceVolume[1] = (X1*X3*X4)>>1;
234 faceVolume[2] = (X1*X2*X4)>>1;
235 faceVolume[3] = (X1*X2*X3)>>1;
236 
237  sid = blockIdx.x*blockDim.x + threadIdx.x;
238  if (sid >= param.threads) return;
239 
240 
241  dim = dimFromFaceIndex(sid, param); // sid is also modified
242 
243  const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim]) >> 1);
244  const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
245  face_idx = sid - face_num*face_volume; // index into the respective face
246 
247 
248  const int dims[] = {X1, X2, X3, X4};
249  coordsFromFaceIndex<1>(X, sid, x1, x2, x3, x4, face_idx, face_volume, dim, face_num, param.parity, dims);
250 
251  bool active = false;
252  for(int dir=0; dir<4; ++dir){
253  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
254  }
255  if(!active) return;
256 
257 
258  READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
259 
260  o00_re = i00_re; o00_im = i00_im;
261  o01_re = i01_re; o01_im = i01_im;
262  o02_re = i02_re; o02_im = i02_im;
263  o10_re = i10_re; o10_im = i10_im;
264  o11_re = i11_re; o11_im = i11_im;
265  o12_re = i12_re; o12_im = i12_im;
266  o20_re = i20_re; o20_im = i20_im;
267  o21_re = i21_re; o21_im = i21_im;
268  o22_re = i22_re; o22_im = i22_im;
269  o30_re = i30_re; o30_im = i30_im;
270  o31_re = i31_re; o31_im = i31_im;
271  o32_re = i32_re; o32_im = i32_im;
272 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
273 {
274  // Projector P0-
275  // 1 0 0 -i
276  // 0 1 -i 0
277  // 0 i 1 0
278  // i 0 0 1
279 
280  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
281  const int sp_idx = face_idx + param.ghostOffset[0];
282 #if (DD_PREC==2)
283  sp_norm_idx = face_idx + faceVolume[0] + param.ghostNormOffset[0];
284 #endif
285 
286  const int ga_idx = sid;
287 
294 
295 
296  const int sp_stride_pad = ghostFace[0];
297 
298  // read half spinor from device memory
299  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
300 
301  a0_re = i00_re; a0_im = i00_im;
302  a1_re = i01_re; a1_im = i01_im;
303  a2_re = i02_re; a2_im = i02_im;
304  b0_re = i10_re; b0_im = i10_im;
305  b1_re = i11_re; b1_im = i11_im;
306  b2_re = i12_re; b2_im = i12_im;
307 
308  // read gauge matrix from device memory
309  READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
310 
311  // reconstruct gauge matrix
313 
314  // multiply row 0
315  spinorFloat A0_re = 0;
316  A0_re += g00_re * a0_re;
317  A0_re -= g00_im * a0_im;
318  A0_re += g01_re * a1_re;
319  A0_re -= g01_im * a1_im;
320  A0_re += g02_re * a2_re;
321  A0_re -= g02_im * a2_im;
322  spinorFloat A0_im = 0;
323  A0_im += g00_re * a0_im;
324  A0_im += g00_im * a0_re;
325  A0_im += g01_re * a1_im;
326  A0_im += g01_im * a1_re;
327  A0_im += g02_re * a2_im;
328  A0_im += g02_im * a2_re;
329  spinorFloat B0_re = 0;
330  B0_re += g00_re * b0_re;
331  B0_re -= g00_im * b0_im;
332  B0_re += g01_re * b1_re;
333  B0_re -= g01_im * b1_im;
334  B0_re += g02_re * b2_re;
335  B0_re -= g02_im * b2_im;
336  spinorFloat B0_im = 0;
337  B0_im += g00_re * b0_im;
338  B0_im += g00_im * b0_re;
339  B0_im += g01_re * b1_im;
340  B0_im += g01_im * b1_re;
341  B0_im += g02_re * b2_im;
342  B0_im += g02_im * b2_re;
343 
344  // multiply row 1
345  spinorFloat A1_re = 0;
346  A1_re += g10_re * a0_re;
347  A1_re -= g10_im * a0_im;
348  A1_re += g11_re * a1_re;
349  A1_re -= g11_im * a1_im;
350  A1_re += g12_re * a2_re;
351  A1_re -= g12_im * a2_im;
352  spinorFloat A1_im = 0;
353  A1_im += g10_re * a0_im;
354  A1_im += g10_im * a0_re;
355  A1_im += g11_re * a1_im;
356  A1_im += g11_im * a1_re;
357  A1_im += g12_re * a2_im;
358  A1_im += g12_im * a2_re;
359  spinorFloat B1_re = 0;
360  B1_re += g10_re * b0_re;
361  B1_re -= g10_im * b0_im;
362  B1_re += g11_re * b1_re;
363  B1_re -= g11_im * b1_im;
364  B1_re += g12_re * b2_re;
365  B1_re -= g12_im * b2_im;
366  spinorFloat B1_im = 0;
367  B1_im += g10_re * b0_im;
368  B1_im += g10_im * b0_re;
369  B1_im += g11_re * b1_im;
370  B1_im += g11_im * b1_re;
371  B1_im += g12_re * b2_im;
372  B1_im += g12_im * b2_re;
373 
374  // multiply row 2
375  spinorFloat A2_re = 0;
376  A2_re += g20_re * a0_re;
377  A2_re -= g20_im * a0_im;
378  A2_re += g21_re * a1_re;
379  A2_re -= g21_im * a1_im;
380  A2_re += g22_re * a2_re;
381  A2_re -= g22_im * a2_im;
382  spinorFloat A2_im = 0;
383  A2_im += g20_re * a0_im;
384  A2_im += g20_im * a0_re;
385  A2_im += g21_re * a1_im;
386  A2_im += g21_im * a1_re;
387  A2_im += g22_re * a2_im;
388  A2_im += g22_im * a2_re;
389  spinorFloat B2_re = 0;
390  B2_re += g20_re * b0_re;
391  B2_re -= g20_im * b0_im;
392  B2_re += g21_re * b1_re;
393  B2_re -= g21_im * b1_im;
394  B2_re += g22_re * b2_re;
395  B2_re -= g22_im * b2_im;
396  spinorFloat B2_im = 0;
397  B2_im += g20_re * b0_im;
398  B2_im += g20_im * b0_re;
399  B2_im += g21_re * b1_im;
400  B2_im += g21_im * b1_re;
401  B2_im += g22_re * b2_im;
402  B2_im += g22_im * b2_re;
403 
404  o00_re += A0_re;
405  o00_im += A0_im;
406  o10_re += B0_re;
407  o10_im += B0_im;
408  o20_re -= B0_im;
409  o20_im += B0_re;
410  o30_re -= A0_im;
411  o30_im += A0_re;
412 
413  o01_re += A1_re;
414  o01_im += A1_im;
415  o11_re += B1_re;
416  o11_im += B1_im;
417  o21_re -= B1_im;
418  o21_im += B1_re;
419  o31_re -= A1_im;
420  o31_im += A1_re;
421 
422  o02_re += A2_re;
423  o02_im += A2_im;
424  o12_re += B2_re;
425  o12_im += B2_im;
426  o22_re -= B2_im;
427  o22_im += B2_re;
428  o32_re -= A2_im;
429  o32_im += A2_re;
430 
431 }
432 
433 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
434 {
435  // Projector P0+
436  // 1 0 0 i
437  // 0 1 i 0
438  // 0 -i 1 0
439  // -i 0 0 1
440 
441  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,0,Y);
442  const int sp_idx = face_idx + param.ghostOffset[0];
443 #if (DD_PREC==2)
444  sp_norm_idx = face_idx + param.ghostNormOffset[0];
445 #endif
446 
447  const int ga_idx = Vh+face_idx;
448 
455 
456 
457  const int sp_stride_pad = ghostFace[0];
458 
459  // read half spinor from device memory
460  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
461 
462  a0_re = i00_re; a0_im = i00_im;
463  a1_re = i01_re; a1_im = i01_im;
464  a2_re = i02_re; a2_im = i02_im;
465  b0_re = i10_re; b0_im = i10_im;
466  b1_re = i11_re; b1_im = i11_im;
467  b2_re = i12_re; b2_im = i12_im;
468 
469  // read gauge matrix from device memory
470  READ_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
471 
472  // reconstruct gauge matrix
474 
475  // multiply row 0
476  spinorFloat A0_re = 0;
477  A0_re += gT00_re * a0_re;
478  A0_re -= gT00_im * a0_im;
479  A0_re += gT01_re * a1_re;
480  A0_re -= gT01_im * a1_im;
481  A0_re += gT02_re * a2_re;
482  A0_re -= gT02_im * a2_im;
483  spinorFloat A0_im = 0;
484  A0_im += gT00_re * a0_im;
485  A0_im += gT00_im * a0_re;
486  A0_im += gT01_re * a1_im;
487  A0_im += gT01_im * a1_re;
488  A0_im += gT02_re * a2_im;
489  A0_im += gT02_im * a2_re;
490  spinorFloat B0_re = 0;
491  B0_re += gT00_re * b0_re;
492  B0_re -= gT00_im * b0_im;
493  B0_re += gT01_re * b1_re;
494  B0_re -= gT01_im * b1_im;
495  B0_re += gT02_re * b2_re;
496  B0_re -= gT02_im * b2_im;
497  spinorFloat B0_im = 0;
498  B0_im += gT00_re * b0_im;
499  B0_im += gT00_im * b0_re;
500  B0_im += gT01_re * b1_im;
501  B0_im += gT01_im * b1_re;
502  B0_im += gT02_re * b2_im;
503  B0_im += gT02_im * b2_re;
504 
505  // multiply row 1
506  spinorFloat A1_re = 0;
507  A1_re += gT10_re * a0_re;
508  A1_re -= gT10_im * a0_im;
509  A1_re += gT11_re * a1_re;
510  A1_re -= gT11_im * a1_im;
511  A1_re += gT12_re * a2_re;
512  A1_re -= gT12_im * a2_im;
513  spinorFloat A1_im = 0;
514  A1_im += gT10_re * a0_im;
515  A1_im += gT10_im * a0_re;
516  A1_im += gT11_re * a1_im;
517  A1_im += gT11_im * a1_re;
518  A1_im += gT12_re * a2_im;
519  A1_im += gT12_im * a2_re;
520  spinorFloat B1_re = 0;
521  B1_re += gT10_re * b0_re;
522  B1_re -= gT10_im * b0_im;
523  B1_re += gT11_re * b1_re;
524  B1_re -= gT11_im * b1_im;
525  B1_re += gT12_re * b2_re;
526  B1_re -= gT12_im * b2_im;
527  spinorFloat B1_im = 0;
528  B1_im += gT10_re * b0_im;
529  B1_im += gT10_im * b0_re;
530  B1_im += gT11_re * b1_im;
531  B1_im += gT11_im * b1_re;
532  B1_im += gT12_re * b2_im;
533  B1_im += gT12_im * b2_re;
534 
535  // multiply row 2
536  spinorFloat A2_re = 0;
537  A2_re += gT20_re * a0_re;
538  A2_re -= gT20_im * a0_im;
539  A2_re += gT21_re * a1_re;
540  A2_re -= gT21_im * a1_im;
541  A2_re += gT22_re * a2_re;
542  A2_re -= gT22_im * a2_im;
543  spinorFloat A2_im = 0;
544  A2_im += gT20_re * a0_im;
545  A2_im += gT20_im * a0_re;
546  A2_im += gT21_re * a1_im;
547  A2_im += gT21_im * a1_re;
548  A2_im += gT22_re * a2_im;
549  A2_im += gT22_im * a2_re;
550  spinorFloat B2_re = 0;
551  B2_re += gT20_re * b0_re;
552  B2_re -= gT20_im * b0_im;
553  B2_re += gT21_re * b1_re;
554  B2_re -= gT21_im * b1_im;
555  B2_re += gT22_re * b2_re;
556  B2_re -= gT22_im * b2_im;
557  spinorFloat B2_im = 0;
558  B2_im += gT20_re * b0_im;
559  B2_im += gT20_im * b0_re;
560  B2_im += gT21_re * b1_im;
561  B2_im += gT21_im * b1_re;
562  B2_im += gT22_re * b2_im;
563  B2_im += gT22_im * b2_re;
564 
565  o00_re += A0_re;
566  o00_im += A0_im;
567  o10_re += B0_re;
568  o10_im += B0_im;
569  o20_re += B0_im;
570  o20_im -= B0_re;
571  o30_re += A0_im;
572  o30_im -= A0_re;
573 
574  o01_re += A1_re;
575  o01_im += A1_im;
576  o11_re += B1_re;
577  o11_im += B1_im;
578  o21_re += B1_im;
579  o21_im -= B1_re;
580  o31_re += A1_im;
581  o31_im -= A1_re;
582 
583  o02_re += A2_re;
584  o02_im += A2_im;
585  o12_re += B2_re;
586  o12_im += B2_im;
587  o22_re += B2_im;
588  o22_im -= B2_re;
589  o32_re += A2_im;
590  o32_im -= A2_re;
591 
592 }
593 
594 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
595 {
596  // Projector P1-
597  // 1 0 0 -1
598  // 0 1 1 0
599  // 0 1 1 0
600  // -1 0 0 1
601 
602  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
603  const int sp_idx = face_idx + param.ghostOffset[1];
604 #if (DD_PREC==2)
605  sp_norm_idx = face_idx + faceVolume[1] + param.ghostNormOffset[1];
606 #endif
607 
608  const int ga_idx = sid;
609 
616 
617 
618  const int sp_stride_pad = ghostFace[1];
619 
620  // read half spinor from device memory
621  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
622 
623  a0_re = i00_re; a0_im = i00_im;
624  a1_re = i01_re; a1_im = i01_im;
625  a2_re = i02_re; a2_im = i02_im;
626  b0_re = i10_re; b0_im = i10_im;
627  b1_re = i11_re; b1_im = i11_im;
628  b2_re = i12_re; b2_im = i12_im;
629 
630  // read gauge matrix from device memory
631  READ_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
632 
633  // reconstruct gauge matrix
635 
636  // multiply row 0
637  spinorFloat A0_re = 0;
638  A0_re += g00_re * a0_re;
639  A0_re -= g00_im * a0_im;
640  A0_re += g01_re * a1_re;
641  A0_re -= g01_im * a1_im;
642  A0_re += g02_re * a2_re;
643  A0_re -= g02_im * a2_im;
644  spinorFloat A0_im = 0;
645  A0_im += g00_re * a0_im;
646  A0_im += g00_im * a0_re;
647  A0_im += g01_re * a1_im;
648  A0_im += g01_im * a1_re;
649  A0_im += g02_re * a2_im;
650  A0_im += g02_im * a2_re;
651  spinorFloat B0_re = 0;
652  B0_re += g00_re * b0_re;
653  B0_re -= g00_im * b0_im;
654  B0_re += g01_re * b1_re;
655  B0_re -= g01_im * b1_im;
656  B0_re += g02_re * b2_re;
657  B0_re -= g02_im * b2_im;
658  spinorFloat B0_im = 0;
659  B0_im += g00_re * b0_im;
660  B0_im += g00_im * b0_re;
661  B0_im += g01_re * b1_im;
662  B0_im += g01_im * b1_re;
663  B0_im += g02_re * b2_im;
664  B0_im += g02_im * b2_re;
665 
666  // multiply row 1
667  spinorFloat A1_re = 0;
668  A1_re += g10_re * a0_re;
669  A1_re -= g10_im * a0_im;
670  A1_re += g11_re * a1_re;
671  A1_re -= g11_im * a1_im;
672  A1_re += g12_re * a2_re;
673  A1_re -= g12_im * a2_im;
674  spinorFloat A1_im = 0;
675  A1_im += g10_re * a0_im;
676  A1_im += g10_im * a0_re;
677  A1_im += g11_re * a1_im;
678  A1_im += g11_im * a1_re;
679  A1_im += g12_re * a2_im;
680  A1_im += g12_im * a2_re;
681  spinorFloat B1_re = 0;
682  B1_re += g10_re * b0_re;
683  B1_re -= g10_im * b0_im;
684  B1_re += g11_re * b1_re;
685  B1_re -= g11_im * b1_im;
686  B1_re += g12_re * b2_re;
687  B1_re -= g12_im * b2_im;
688  spinorFloat B1_im = 0;
689  B1_im += g10_re * b0_im;
690  B1_im += g10_im * b0_re;
691  B1_im += g11_re * b1_im;
692  B1_im += g11_im * b1_re;
693  B1_im += g12_re * b2_im;
694  B1_im += g12_im * b2_re;
695 
696  // multiply row 2
697  spinorFloat A2_re = 0;
698  A2_re += g20_re * a0_re;
699  A2_re -= g20_im * a0_im;
700  A2_re += g21_re * a1_re;
701  A2_re -= g21_im * a1_im;
702  A2_re += g22_re * a2_re;
703  A2_re -= g22_im * a2_im;
704  spinorFloat A2_im = 0;
705  A2_im += g20_re * a0_im;
706  A2_im += g20_im * a0_re;
707  A2_im += g21_re * a1_im;
708  A2_im += g21_im * a1_re;
709  A2_im += g22_re * a2_im;
710  A2_im += g22_im * a2_re;
711  spinorFloat B2_re = 0;
712  B2_re += g20_re * b0_re;
713  B2_re -= g20_im * b0_im;
714  B2_re += g21_re * b1_re;
715  B2_re -= g21_im * b1_im;
716  B2_re += g22_re * b2_re;
717  B2_re -= g22_im * b2_im;
718  spinorFloat B2_im = 0;
719  B2_im += g20_re * b0_im;
720  B2_im += g20_im * b0_re;
721  B2_im += g21_re * b1_im;
722  B2_im += g21_im * b1_re;
723  B2_im += g22_re * b2_im;
724  B2_im += g22_im * b2_re;
725 
726  o00_re += A0_re;
727  o00_im += A0_im;
728  o10_re += B0_re;
729  o10_im += B0_im;
730  o20_re += B0_re;
731  o20_im += B0_im;
732  o30_re -= A0_re;
733  o30_im -= A0_im;
734 
735  o01_re += A1_re;
736  o01_im += A1_im;
737  o11_re += B1_re;
738  o11_im += B1_im;
739  o21_re += B1_re;
740  o21_im += B1_im;
741  o31_re -= A1_re;
742  o31_im -= A1_im;
743 
744  o02_re += A2_re;
745  o02_im += A2_im;
746  o12_re += B2_re;
747  o12_im += B2_im;
748  o22_re += B2_re;
749  o22_im += B2_im;
750  o32_re -= A2_re;
751  o32_im -= A2_im;
752 
753 }
754 
755 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
756 {
757  // Projector P1+
758  // 1 0 0 1
759  // 0 1 -1 0
760  // 0 -1 1 0
761  // 1 0 0 1
762 
763  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,1,Y);
764  const int sp_idx = face_idx + param.ghostOffset[1];
765 #if (DD_PREC==2)
766  sp_norm_idx = face_idx + param.ghostNormOffset[1];
767 #endif
768 
769  const int ga_idx = Vh+face_idx;
770 
777 
778 
779  const int sp_stride_pad = ghostFace[1];
780 
781  // read half spinor from device memory
782  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
783 
784  a0_re = i00_re; a0_im = i00_im;
785  a1_re = i01_re; a1_im = i01_im;
786  a2_re = i02_re; a2_im = i02_im;
787  b0_re = i10_re; b0_im = i10_im;
788  b1_re = i11_re; b1_im = i11_im;
789  b2_re = i12_re; b2_im = i12_im;
790 
791  // read gauge matrix from device memory
792  READ_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
793 
794  // reconstruct gauge matrix
796 
797  // multiply row 0
798  spinorFloat A0_re = 0;
799  A0_re += gT00_re * a0_re;
800  A0_re -= gT00_im * a0_im;
801  A0_re += gT01_re * a1_re;
802  A0_re -= gT01_im * a1_im;
803  A0_re += gT02_re * a2_re;
804  A0_re -= gT02_im * a2_im;
805  spinorFloat A0_im = 0;
806  A0_im += gT00_re * a0_im;
807  A0_im += gT00_im * a0_re;
808  A0_im += gT01_re * a1_im;
809  A0_im += gT01_im * a1_re;
810  A0_im += gT02_re * a2_im;
811  A0_im += gT02_im * a2_re;
812  spinorFloat B0_re = 0;
813  B0_re += gT00_re * b0_re;
814  B0_re -= gT00_im * b0_im;
815  B0_re += gT01_re * b1_re;
816  B0_re -= gT01_im * b1_im;
817  B0_re += gT02_re * b2_re;
818  B0_re -= gT02_im * b2_im;
819  spinorFloat B0_im = 0;
820  B0_im += gT00_re * b0_im;
821  B0_im += gT00_im * b0_re;
822  B0_im += gT01_re * b1_im;
823  B0_im += gT01_im * b1_re;
824  B0_im += gT02_re * b2_im;
825  B0_im += gT02_im * b2_re;
826 
827  // multiply row 1
828  spinorFloat A1_re = 0;
829  A1_re += gT10_re * a0_re;
830  A1_re -= gT10_im * a0_im;
831  A1_re += gT11_re * a1_re;
832  A1_re -= gT11_im * a1_im;
833  A1_re += gT12_re * a2_re;
834  A1_re -= gT12_im * a2_im;
835  spinorFloat A1_im = 0;
836  A1_im += gT10_re * a0_im;
837  A1_im += gT10_im * a0_re;
838  A1_im += gT11_re * a1_im;
839  A1_im += gT11_im * a1_re;
840  A1_im += gT12_re * a2_im;
841  A1_im += gT12_im * a2_re;
842  spinorFloat B1_re = 0;
843  B1_re += gT10_re * b0_re;
844  B1_re -= gT10_im * b0_im;
845  B1_re += gT11_re * b1_re;
846  B1_re -= gT11_im * b1_im;
847  B1_re += gT12_re * b2_re;
848  B1_re -= gT12_im * b2_im;
849  spinorFloat B1_im = 0;
850  B1_im += gT10_re * b0_im;
851  B1_im += gT10_im * b0_re;
852  B1_im += gT11_re * b1_im;
853  B1_im += gT11_im * b1_re;
854  B1_im += gT12_re * b2_im;
855  B1_im += gT12_im * b2_re;
856 
857  // multiply row 2
858  spinorFloat A2_re = 0;
859  A2_re += gT20_re * a0_re;
860  A2_re -= gT20_im * a0_im;
861  A2_re += gT21_re * a1_re;
862  A2_re -= gT21_im * a1_im;
863  A2_re += gT22_re * a2_re;
864  A2_re -= gT22_im * a2_im;
865  spinorFloat A2_im = 0;
866  A2_im += gT20_re * a0_im;
867  A2_im += gT20_im * a0_re;
868  A2_im += gT21_re * a1_im;
869  A2_im += gT21_im * a1_re;
870  A2_im += gT22_re * a2_im;
871  A2_im += gT22_im * a2_re;
872  spinorFloat B2_re = 0;
873  B2_re += gT20_re * b0_re;
874  B2_re -= gT20_im * b0_im;
875  B2_re += gT21_re * b1_re;
876  B2_re -= gT21_im * b1_im;
877  B2_re += gT22_re * b2_re;
878  B2_re -= gT22_im * b2_im;
879  spinorFloat B2_im = 0;
880  B2_im += gT20_re * b0_im;
881  B2_im += gT20_im * b0_re;
882  B2_im += gT21_re * b1_im;
883  B2_im += gT21_im * b1_re;
884  B2_im += gT22_re * b2_im;
885  B2_im += gT22_im * b2_re;
886 
887  o00_re += A0_re;
888  o00_im += A0_im;
889  o10_re += B0_re;
890  o10_im += B0_im;
891  o20_re -= B0_re;
892  o20_im -= B0_im;
893  o30_re += A0_re;
894  o30_im += A0_im;
895 
896  o01_re += A1_re;
897  o01_im += A1_im;
898  o11_re += B1_re;
899  o11_im += B1_im;
900  o21_re -= B1_re;
901  o21_im -= B1_im;
902  o31_re += A1_re;
903  o31_im += A1_im;
904 
905  o02_re += A2_re;
906  o02_im += A2_im;
907  o12_re += B2_re;
908  o12_im += B2_im;
909  o22_re -= B2_re;
910  o22_im -= B2_im;
911  o32_re += A2_re;
912  o32_im += A2_im;
913 
914 }
915 
916 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
917 {
918  // Projector P2-
919  // 1 0 -i 0
920  // 0 1 0 i
921  // i 0 1 0
922  // 0 -i 0 1
923 
924  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
925  const int sp_idx = face_idx + param.ghostOffset[2];
926 #if (DD_PREC==2)
927  sp_norm_idx = face_idx + faceVolume[2] + param.ghostNormOffset[2];
928 #endif
929 
930  const int ga_idx = sid;
931 
938 
939 
940  const int sp_stride_pad = ghostFace[2];
941 
942  // read half spinor from device memory
943  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
944 
945  a0_re = i00_re; a0_im = i00_im;
946  a1_re = i01_re; a1_im = i01_im;
947  a2_re = i02_re; a2_im = i02_im;
948  b0_re = i10_re; b0_im = i10_im;
949  b1_re = i11_re; b1_im = i11_im;
950  b2_re = i12_re; b2_im = i12_im;
951 
952  // read gauge matrix from device memory
953  READ_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
954 
955  // reconstruct gauge matrix
957 
958  // multiply row 0
959  spinorFloat A0_re = 0;
960  A0_re += g00_re * a0_re;
961  A0_re -= g00_im * a0_im;
962  A0_re += g01_re * a1_re;
963  A0_re -= g01_im * a1_im;
964  A0_re += g02_re * a2_re;
965  A0_re -= g02_im * a2_im;
966  spinorFloat A0_im = 0;
967  A0_im += g00_re * a0_im;
968  A0_im += g00_im * a0_re;
969  A0_im += g01_re * a1_im;
970  A0_im += g01_im * a1_re;
971  A0_im += g02_re * a2_im;
972  A0_im += g02_im * a2_re;
973  spinorFloat B0_re = 0;
974  B0_re += g00_re * b0_re;
975  B0_re -= g00_im * b0_im;
976  B0_re += g01_re * b1_re;
977  B0_re -= g01_im * b1_im;
978  B0_re += g02_re * b2_re;
979  B0_re -= g02_im * b2_im;
980  spinorFloat B0_im = 0;
981  B0_im += g00_re * b0_im;
982  B0_im += g00_im * b0_re;
983  B0_im += g01_re * b1_im;
984  B0_im += g01_im * b1_re;
985  B0_im += g02_re * b2_im;
986  B0_im += g02_im * b2_re;
987 
988  // multiply row 1
989  spinorFloat A1_re = 0;
990  A1_re += g10_re * a0_re;
991  A1_re -= g10_im * a0_im;
992  A1_re += g11_re * a1_re;
993  A1_re -= g11_im * a1_im;
994  A1_re += g12_re * a2_re;
995  A1_re -= g12_im * a2_im;
996  spinorFloat A1_im = 0;
997  A1_im += g10_re * a0_im;
998  A1_im += g10_im * a0_re;
999  A1_im += g11_re * a1_im;
1000  A1_im += g11_im * a1_re;
1001  A1_im += g12_re * a2_im;
1002  A1_im += g12_im * a2_re;
1003  spinorFloat B1_re = 0;
1004  B1_re += g10_re * b0_re;
1005  B1_re -= g10_im * b0_im;
1006  B1_re += g11_re * b1_re;
1007  B1_re -= g11_im * b1_im;
1008  B1_re += g12_re * b2_re;
1009  B1_re -= g12_im * b2_im;
1010  spinorFloat B1_im = 0;
1011  B1_im += g10_re * b0_im;
1012  B1_im += g10_im * b0_re;
1013  B1_im += g11_re * b1_im;
1014  B1_im += g11_im * b1_re;
1015  B1_im += g12_re * b2_im;
1016  B1_im += g12_im * b2_re;
1017 
1018  // multiply row 2
1019  spinorFloat A2_re = 0;
1020  A2_re += g20_re * a0_re;
1021  A2_re -= g20_im * a0_im;
1022  A2_re += g21_re * a1_re;
1023  A2_re -= g21_im * a1_im;
1024  A2_re += g22_re * a2_re;
1025  A2_re -= g22_im * a2_im;
1026  spinorFloat A2_im = 0;
1027  A2_im += g20_re * a0_im;
1028  A2_im += g20_im * a0_re;
1029  A2_im += g21_re * a1_im;
1030  A2_im += g21_im * a1_re;
1031  A2_im += g22_re * a2_im;
1032  A2_im += g22_im * a2_re;
1033  spinorFloat B2_re = 0;
1034  B2_re += g20_re * b0_re;
1035  B2_re -= g20_im * b0_im;
1036  B2_re += g21_re * b1_re;
1037  B2_re -= g21_im * b1_im;
1038  B2_re += g22_re * b2_re;
1039  B2_re -= g22_im * b2_im;
1040  spinorFloat B2_im = 0;
1041  B2_im += g20_re * b0_im;
1042  B2_im += g20_im * b0_re;
1043  B2_im += g21_re * b1_im;
1044  B2_im += g21_im * b1_re;
1045  B2_im += g22_re * b2_im;
1046  B2_im += g22_im * b2_re;
1047 
1048  o00_re += A0_re;
1049  o00_im += A0_im;
1050  o10_re += B0_re;
1051  o10_im += B0_im;
1052  o20_re -= A0_im;
1053  o20_im += A0_re;
1054  o30_re += B0_im;
1055  o30_im -= B0_re;
1056 
1057  o01_re += A1_re;
1058  o01_im += A1_im;
1059  o11_re += B1_re;
1060  o11_im += B1_im;
1061  o21_re -= A1_im;
1062  o21_im += A1_re;
1063  o31_re += B1_im;
1064  o31_im -= B1_re;
1065 
1066  o02_re += A2_re;
1067  o02_im += A2_im;
1068  o12_re += B2_re;
1069  o12_im += B2_im;
1070  o22_re -= A2_im;
1071  o22_im += A2_re;
1072  o32_re += B2_im;
1073  o32_im -= B2_re;
1074 
1075 }
1076 
1077 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1078 {
1079  // Projector P2+
1080  // 1 0 i 0
1081  // 0 1 0 -i
1082  // -i 0 1 0
1083  // 0 i 0 1
1084 
1085  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,2,Y);
1086  const int sp_idx = face_idx + param.ghostOffset[2];
1087 #if (DD_PREC==2)
1088  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1089 #endif
1090 
1091  const int ga_idx = Vh+face_idx;
1092 
1099 
1100 
1101  const int sp_stride_pad = ghostFace[2];
1102 
1103  // read half spinor from device memory
1104  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1105 
1106  a0_re = i00_re; a0_im = i00_im;
1107  a1_re = i01_re; a1_im = i01_im;
1108  a2_re = i02_re; a2_im = i02_im;
1109  b0_re = i10_re; b0_im = i10_im;
1110  b1_re = i11_re; b1_im = i11_im;
1111  b2_re = i12_re; b2_im = i12_im;
1112 
1113  // read gauge matrix from device memory
1114  READ_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1115 
1116  // reconstruct gauge matrix
1118 
1119  // multiply row 0
1120  spinorFloat A0_re = 0;
1121  A0_re += gT00_re * a0_re;
1122  A0_re -= gT00_im * a0_im;
1123  A0_re += gT01_re * a1_re;
1124  A0_re -= gT01_im * a1_im;
1125  A0_re += gT02_re * a2_re;
1126  A0_re -= gT02_im * a2_im;
1127  spinorFloat A0_im = 0;
1128  A0_im += gT00_re * a0_im;
1129  A0_im += gT00_im * a0_re;
1130  A0_im += gT01_re * a1_im;
1131  A0_im += gT01_im * a1_re;
1132  A0_im += gT02_re * a2_im;
1133  A0_im += gT02_im * a2_re;
1134  spinorFloat B0_re = 0;
1135  B0_re += gT00_re * b0_re;
1136  B0_re -= gT00_im * b0_im;
1137  B0_re += gT01_re * b1_re;
1138  B0_re -= gT01_im * b1_im;
1139  B0_re += gT02_re * b2_re;
1140  B0_re -= gT02_im * b2_im;
1141  spinorFloat B0_im = 0;
1142  B0_im += gT00_re * b0_im;
1143  B0_im += gT00_im * b0_re;
1144  B0_im += gT01_re * b1_im;
1145  B0_im += gT01_im * b1_re;
1146  B0_im += gT02_re * b2_im;
1147  B0_im += gT02_im * b2_re;
1148 
1149  // multiply row 1
1150  spinorFloat A1_re = 0;
1151  A1_re += gT10_re * a0_re;
1152  A1_re -= gT10_im * a0_im;
1153  A1_re += gT11_re * a1_re;
1154  A1_re -= gT11_im * a1_im;
1155  A1_re += gT12_re * a2_re;
1156  A1_re -= gT12_im * a2_im;
1157  spinorFloat A1_im = 0;
1158  A1_im += gT10_re * a0_im;
1159  A1_im += gT10_im * a0_re;
1160  A1_im += gT11_re * a1_im;
1161  A1_im += gT11_im * a1_re;
1162  A1_im += gT12_re * a2_im;
1163  A1_im += gT12_im * a2_re;
1164  spinorFloat B1_re = 0;
1165  B1_re += gT10_re * b0_re;
1166  B1_re -= gT10_im * b0_im;
1167  B1_re += gT11_re * b1_re;
1168  B1_re -= gT11_im * b1_im;
1169  B1_re += gT12_re * b2_re;
1170  B1_re -= gT12_im * b2_im;
1171  spinorFloat B1_im = 0;
1172  B1_im += gT10_re * b0_im;
1173  B1_im += gT10_im * b0_re;
1174  B1_im += gT11_re * b1_im;
1175  B1_im += gT11_im * b1_re;
1176  B1_im += gT12_re * b2_im;
1177  B1_im += gT12_im * b2_re;
1178 
1179  // multiply row 2
1180  spinorFloat A2_re = 0;
1181  A2_re += gT20_re * a0_re;
1182  A2_re -= gT20_im * a0_im;
1183  A2_re += gT21_re * a1_re;
1184  A2_re -= gT21_im * a1_im;
1185  A2_re += gT22_re * a2_re;
1186  A2_re -= gT22_im * a2_im;
1187  spinorFloat A2_im = 0;
1188  A2_im += gT20_re * a0_im;
1189  A2_im += gT20_im * a0_re;
1190  A2_im += gT21_re * a1_im;
1191  A2_im += gT21_im * a1_re;
1192  A2_im += gT22_re * a2_im;
1193  A2_im += gT22_im * a2_re;
1194  spinorFloat B2_re = 0;
1195  B2_re += gT20_re * b0_re;
1196  B2_re -= gT20_im * b0_im;
1197  B2_re += gT21_re * b1_re;
1198  B2_re -= gT21_im * b1_im;
1199  B2_re += gT22_re * b2_re;
1200  B2_re -= gT22_im * b2_im;
1201  spinorFloat B2_im = 0;
1202  B2_im += gT20_re * b0_im;
1203  B2_im += gT20_im * b0_re;
1204  B2_im += gT21_re * b1_im;
1205  B2_im += gT21_im * b1_re;
1206  B2_im += gT22_re * b2_im;
1207  B2_im += gT22_im * b2_re;
1208 
1209  o00_re += A0_re;
1210  o00_im += A0_im;
1211  o10_re += B0_re;
1212  o10_im += B0_im;
1213  o20_re += A0_im;
1214  o20_im -= A0_re;
1215  o30_re -= B0_im;
1216  o30_im += B0_re;
1217 
1218  o01_re += A1_re;
1219  o01_im += A1_im;
1220  o11_re += B1_re;
1221  o11_im += B1_im;
1222  o21_re += A1_im;
1223  o21_im -= A1_re;
1224  o31_re -= B1_im;
1225  o31_im += B1_re;
1226 
1227  o02_re += A2_re;
1228  o02_im += A2_im;
1229  o12_re += B2_re;
1230  o12_im += B2_im;
1231  o22_re += A2_im;
1232  o22_im -= A2_re;
1233  o32_re -= B2_im;
1234  o32_im += B2_re;
1235 
1236 }
1237 
1238 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1239 {
1240  // Projector P3-
1241  // 0 0 0 0
1242  // 0 0 0 0
1243  // 0 0 2 0
1244  // 0 0 0 2
1245 
1246  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1247  const int sp_idx = face_idx + param.ghostOffset[3];
1248 #if (DD_PREC==2)
1249  sp_norm_idx = face_idx + faceVolume[3] + param.ghostNormOffset[3];
1250 #endif
1251 
1252  const int ga_idx = sid;
1253 
1254  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1255  {
1262 
1263 
1264  const int sp_stride_pad = ghostFace[3];
1265  //const int t_proj_scale = TPROJSCALE;
1266 
1267  // read half spinor from device memory
1268  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1269 
1270 #ifdef TWIST_INV_DSLASH
1271  a0_re = i00_re; a0_im = i00_im;
1272  a1_re = i01_re; a1_im = i01_im;
1273  a2_re = i02_re; a2_im = i02_im;
1274  b0_re = i10_re; b0_im = i10_im;
1275  b1_re = i11_re; b1_im = i11_im;
1276  b2_re = i12_re; b2_im = i12_im;
1277 #else
1278  a0_re = 2*i00_re; a0_im = 2*i00_im;
1279  a1_re = 2*i01_re; a1_im = 2*i01_im;
1280  a2_re = 2*i02_re; a2_im = 2*i02_im;
1281  b0_re = 2*i10_re; b0_im = 2*i10_im;
1282  b1_re = 2*i11_re; b1_im = 2*i11_im;
1283  b2_re = 2*i12_re; b2_im = 2*i12_im;
1284 #endif
1285 
1286  // identity gauge matrix
1293 
1294  o20_re += A0_re;
1295  o20_im += A0_im;
1296  o30_re += B0_re;
1297  o30_im += B0_im;
1298 
1299  o21_re += A1_re;
1300  o21_im += A1_im;
1301  o31_re += B1_re;
1302  o31_im += B1_im;
1303 
1304  o22_re += A2_re;
1305  o22_im += A2_im;
1306  o32_re += B2_re;
1307  o32_im += B2_im;
1308 
1309  } else {
1316 
1317 
1318  const int sp_stride_pad = ghostFace[3];
1319  //const int t_proj_scale = TPROJSCALE;
1320 
1321  // read half spinor from device memory
1322  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1323 
1324 #ifdef TWIST_INV_DSLASH
1325  a0_re = i00_re; a0_im = i00_im;
1326  a1_re = i01_re; a1_im = i01_im;
1327  a2_re = i02_re; a2_im = i02_im;
1328  b0_re = i10_re; b0_im = i10_im;
1329  b1_re = i11_re; b1_im = i11_im;
1330  b2_re = i12_re; b2_im = i12_im;
1331 #else
1332  a0_re = 2*i00_re; a0_im = 2*i00_im;
1333  a1_re = 2*i01_re; a1_im = 2*i01_im;
1334  a2_re = 2*i02_re; a2_im = 2*i02_im;
1335  b0_re = 2*i10_re; b0_im = 2*i10_im;
1336  b1_re = 2*i11_re; b1_im = 2*i11_im;
1337  b2_re = 2*i12_re; b2_im = 2*i12_im;
1338 #endif
1339 
1340  // read gauge matrix from device memory
1341  READ_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1342 
1343  // reconstruct gauge matrix
1345 
1346  // multiply row 0
1347  spinorFloat A0_re = 0;
1348  A0_re += g00_re * a0_re;
1349  A0_re -= g00_im * a0_im;
1350  A0_re += g01_re * a1_re;
1351  A0_re -= g01_im * a1_im;
1352  A0_re += g02_re * a2_re;
1353  A0_re -= g02_im * a2_im;
1354  spinorFloat A0_im = 0;
1355  A0_im += g00_re * a0_im;
1356  A0_im += g00_im * a0_re;
1357  A0_im += g01_re * a1_im;
1358  A0_im += g01_im * a1_re;
1359  A0_im += g02_re * a2_im;
1360  A0_im += g02_im * a2_re;
1361  spinorFloat B0_re = 0;
1362  B0_re += g00_re * b0_re;
1363  B0_re -= g00_im * b0_im;
1364  B0_re += g01_re * b1_re;
1365  B0_re -= g01_im * b1_im;
1366  B0_re += g02_re * b2_re;
1367  B0_re -= g02_im * b2_im;
1368  spinorFloat B0_im = 0;
1369  B0_im += g00_re * b0_im;
1370  B0_im += g00_im * b0_re;
1371  B0_im += g01_re * b1_im;
1372  B0_im += g01_im * b1_re;
1373  B0_im += g02_re * b2_im;
1374  B0_im += g02_im * b2_re;
1375 
1376  // multiply row 1
1377  spinorFloat A1_re = 0;
1378  A1_re += g10_re * a0_re;
1379  A1_re -= g10_im * a0_im;
1380  A1_re += g11_re * a1_re;
1381  A1_re -= g11_im * a1_im;
1382  A1_re += g12_re * a2_re;
1383  A1_re -= g12_im * a2_im;
1384  spinorFloat A1_im = 0;
1385  A1_im += g10_re * a0_im;
1386  A1_im += g10_im * a0_re;
1387  A1_im += g11_re * a1_im;
1388  A1_im += g11_im * a1_re;
1389  A1_im += g12_re * a2_im;
1390  A1_im += g12_im * a2_re;
1391  spinorFloat B1_re = 0;
1392  B1_re += g10_re * b0_re;
1393  B1_re -= g10_im * b0_im;
1394  B1_re += g11_re * b1_re;
1395  B1_re -= g11_im * b1_im;
1396  B1_re += g12_re * b2_re;
1397  B1_re -= g12_im * b2_im;
1398  spinorFloat B1_im = 0;
1399  B1_im += g10_re * b0_im;
1400  B1_im += g10_im * b0_re;
1401  B1_im += g11_re * b1_im;
1402  B1_im += g11_im * b1_re;
1403  B1_im += g12_re * b2_im;
1404  B1_im += g12_im * b2_re;
1405 
1406  // multiply row 2
1407  spinorFloat A2_re = 0;
1408  A2_re += g20_re * a0_re;
1409  A2_re -= g20_im * a0_im;
1410  A2_re += g21_re * a1_re;
1411  A2_re -= g21_im * a1_im;
1412  A2_re += g22_re * a2_re;
1413  A2_re -= g22_im * a2_im;
1414  spinorFloat A2_im = 0;
1415  A2_im += g20_re * a0_im;
1416  A2_im += g20_im * a0_re;
1417  A2_im += g21_re * a1_im;
1418  A2_im += g21_im * a1_re;
1419  A2_im += g22_re * a2_im;
1420  A2_im += g22_im * a2_re;
1421  spinorFloat B2_re = 0;
1422  B2_re += g20_re * b0_re;
1423  B2_re -= g20_im * b0_im;
1424  B2_re += g21_re * b1_re;
1425  B2_re -= g21_im * b1_im;
1426  B2_re += g22_re * b2_re;
1427  B2_re -= g22_im * b2_im;
1428  spinorFloat B2_im = 0;
1429  B2_im += g20_re * b0_im;
1430  B2_im += g20_im * b0_re;
1431  B2_im += g21_re * b1_im;
1432  B2_im += g21_im * b1_re;
1433  B2_im += g22_re * b2_im;
1434  B2_im += g22_im * b2_re;
1435 
1436  o20_re += A0_re;
1437  o20_im += A0_im;
1438  o30_re += B0_re;
1439  o30_im += B0_im;
1440 
1441  o21_re += A1_re;
1442  o21_im += A1_im;
1443  o31_re += B1_re;
1444  o31_im += B1_im;
1445 
1446  o22_re += A2_re;
1447  o22_im += A2_im;
1448  o32_re += B2_re;
1449  o32_im += B2_im;
1450 
1451  }
1452 }
1453 
1454 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1455 {
1456  // Projector P3+
1457  // 2 0 0 0
1458  // 0 2 0 0
1459  // 0 0 0 0
1460  // 0 0 0 0
1461 
1462  faceIndexFromCoords<1>(face_idx,x1,x2,x3,x4,3,Y);
1463  const int sp_idx = face_idx + param.ghostOffset[3];
1464 #if (DD_PREC==2)
1465  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1466 #endif
1467 
1468  const int ga_idx = Vh+face_idx;
1469 
1470  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1471  {
1478 
1479 
1480  const int sp_stride_pad = ghostFace[3];
1481  //const int t_proj_scale = TPROJSCALE;
1482 
1483  // read half spinor from device memory
1484  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1485 
1486 #ifdef TWIST_INV_DSLASH
1487  a0_re = i00_re; a0_im = i00_im;
1488  a1_re = i01_re; a1_im = i01_im;
1489  a2_re = i02_re; a2_im = i02_im;
1490  b0_re = i10_re; b0_im = i10_im;
1491  b1_re = i11_re; b1_im = i11_im;
1492  b2_re = i12_re; b2_im = i12_im;
1493 #else
1494  a0_re = 2*i00_re; a0_im = 2*i00_im;
1495  a1_re = 2*i01_re; a1_im = 2*i01_im;
1496  a2_re = 2*i02_re; a2_im = 2*i02_im;
1497  b0_re = 2*i10_re; b0_im = 2*i10_im;
1498  b1_re = 2*i11_re; b1_im = 2*i11_im;
1499  b2_re = 2*i12_re; b2_im = 2*i12_im;
1500 #endif
1501 
1502  // identity gauge matrix
1509 
1510  o00_re += A0_re;
1511  o00_im += A0_im;
1512  o10_re += B0_re;
1513  o10_im += B0_im;
1514 
1515  o01_re += A1_re;
1516  o01_im += A1_im;
1517  o11_re += B1_re;
1518  o11_im += B1_im;
1519 
1520  o02_re += A2_re;
1521  o02_im += A2_im;
1522  o12_re += B2_re;
1523  o12_im += B2_im;
1524 
1525  } else {
1532 
1533 
1534  const int sp_stride_pad = ghostFace[3];
1535  //const int t_proj_scale = TPROJSCALE;
1536 
1537  // read half spinor from device memory
1538  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1539 
1540 #ifdef TWIST_INV_DSLASH
1541  a0_re = i00_re; a0_im = i00_im;
1542  a1_re = i01_re; a1_im = i01_im;
1543  a2_re = i02_re; a2_im = i02_im;
1544  b0_re = i10_re; b0_im = i10_im;
1545  b1_re = i11_re; b1_im = i11_im;
1546  b2_re = i12_re; b2_im = i12_im;
1547 #else
1548  a0_re = 2*i00_re; a0_im = 2*i00_im;
1549  a1_re = 2*i01_re; a1_im = 2*i01_im;
1550  a2_re = 2*i02_re; a2_im = 2*i02_im;
1551  b0_re = 2*i10_re; b0_im = 2*i10_im;
1552  b1_re = 2*i11_re; b1_im = 2*i11_im;
1553  b2_re = 2*i12_re; b2_im = 2*i12_im;
1554 #endif
1555 
1556  // read gauge matrix from device memory
1557  READ_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1558 
1559  // reconstruct gauge matrix
1561 
1562  // multiply row 0
1563  spinorFloat A0_re = 0;
1564  A0_re += gT00_re * a0_re;
1565  A0_re -= gT00_im * a0_im;
1566  A0_re += gT01_re * a1_re;
1567  A0_re -= gT01_im * a1_im;
1568  A0_re += gT02_re * a2_re;
1569  A0_re -= gT02_im * a2_im;
1570  spinorFloat A0_im = 0;
1571  A0_im += gT00_re * a0_im;
1572  A0_im += gT00_im * a0_re;
1573  A0_im += gT01_re * a1_im;
1574  A0_im += gT01_im * a1_re;
1575  A0_im += gT02_re * a2_im;
1576  A0_im += gT02_im * a2_re;
1577  spinorFloat B0_re = 0;
1578  B0_re += gT00_re * b0_re;
1579  B0_re -= gT00_im * b0_im;
1580  B0_re += gT01_re * b1_re;
1581  B0_re -= gT01_im * b1_im;
1582  B0_re += gT02_re * b2_re;
1583  B0_re -= gT02_im * b2_im;
1584  spinorFloat B0_im = 0;
1585  B0_im += gT00_re * b0_im;
1586  B0_im += gT00_im * b0_re;
1587  B0_im += gT01_re * b1_im;
1588  B0_im += gT01_im * b1_re;
1589  B0_im += gT02_re * b2_im;
1590  B0_im += gT02_im * b2_re;
1591 
1592  // multiply row 1
1593  spinorFloat A1_re = 0;
1594  A1_re += gT10_re * a0_re;
1595  A1_re -= gT10_im * a0_im;
1596  A1_re += gT11_re * a1_re;
1597  A1_re -= gT11_im * a1_im;
1598  A1_re += gT12_re * a2_re;
1599  A1_re -= gT12_im * a2_im;
1600  spinorFloat A1_im = 0;
1601  A1_im += gT10_re * a0_im;
1602  A1_im += gT10_im * a0_re;
1603  A1_im += gT11_re * a1_im;
1604  A1_im += gT11_im * a1_re;
1605  A1_im += gT12_re * a2_im;
1606  A1_im += gT12_im * a2_re;
1607  spinorFloat B1_re = 0;
1608  B1_re += gT10_re * b0_re;
1609  B1_re -= gT10_im * b0_im;
1610  B1_re += gT11_re * b1_re;
1611  B1_re -= gT11_im * b1_im;
1612  B1_re += gT12_re * b2_re;
1613  B1_re -= gT12_im * b2_im;
1614  spinorFloat B1_im = 0;
1615  B1_im += gT10_re * b0_im;
1616  B1_im += gT10_im * b0_re;
1617  B1_im += gT11_re * b1_im;
1618  B1_im += gT11_im * b1_re;
1619  B1_im += gT12_re * b2_im;
1620  B1_im += gT12_im * b2_re;
1621 
1622  // multiply row 2
1623  spinorFloat A2_re = 0;
1624  A2_re += gT20_re * a0_re;
1625  A2_re -= gT20_im * a0_im;
1626  A2_re += gT21_re * a1_re;
1627  A2_re -= gT21_im * a1_im;
1628  A2_re += gT22_re * a2_re;
1629  A2_re -= gT22_im * a2_im;
1630  spinorFloat A2_im = 0;
1631  A2_im += gT20_re * a0_im;
1632  A2_im += gT20_im * a0_re;
1633  A2_im += gT21_re * a1_im;
1634  A2_im += gT21_im * a1_re;
1635  A2_im += gT22_re * a2_im;
1636  A2_im += gT22_im * a2_re;
1637  spinorFloat B2_re = 0;
1638  B2_re += gT20_re * b0_re;
1639  B2_re -= gT20_im * b0_im;
1640  B2_re += gT21_re * b1_re;
1641  B2_re -= gT21_im * b1_im;
1642  B2_re += gT22_re * b2_re;
1643  B2_re -= gT22_im * b2_im;
1644  spinorFloat B2_im = 0;
1645  B2_im += gT20_re * b0_im;
1646  B2_im += gT20_im * b0_re;
1647  B2_im += gT21_re * b1_im;
1648  B2_im += gT21_im * b1_re;
1649  B2_im += gT22_re * b2_im;
1650  B2_im += gT22_im * b2_re;
1651 
1652  o00_re += A0_re;
1653  o00_im += A0_im;
1654  o10_re += B0_re;
1655  o10_im += B0_im;
1656 
1657  o01_re += A1_re;
1658  o01_im += A1_im;
1659  o11_re += B1_re;
1660  o11_im += B1_im;
1661 
1662  o02_re += A2_re;
1663  o02_im += A2_im;
1664  o12_re += B2_re;
1665  o12_im += B2_im;
1666 
1667  }
1668 }
1669 
1670 {
1671 #ifdef DSLASH_XPAY
1672  READ_ACCUM(ACCUMTEX, param.sp_stride)
1673 
1674 #ifndef TWIST_XPAY
1675 #ifndef TWIST_INV_DSLASH
1676  //perform invert twist first:
1677  APPLY_TWIST_INV( a, b, o);
1678 #endif
1679  o00_re += acc00_re;
1680  o00_im += acc00_im;
1681  o01_re += acc01_re;
1682  o01_im += acc01_im;
1683  o02_re += acc02_re;
1684  o02_im += acc02_im;
1685  o10_re += acc10_re;
1686  o10_im += acc10_im;
1687  o11_re += acc11_re;
1688  o11_im += acc11_im;
1689  o12_re += acc12_re;
1690  o12_im += acc12_im;
1691  o20_re += acc20_re;
1692  o20_im += acc20_im;
1693  o21_re += acc21_re;
1694  o21_im += acc21_im;
1695  o22_re += acc22_re;
1696  o22_im += acc22_im;
1697  o30_re += acc30_re;
1698  o30_im += acc30_im;
1699  o31_re += acc31_re;
1700  o31_im += acc31_im;
1701  o32_re += acc32_re;
1702  o32_im += acc32_im;
1703 #else
1704  APPLY_TWIST( a, acc);
1705  //warning! b is unrelated to the twisted mass parameter in this case!
1706 
1707  o00_re = b*o00_re+acc00_re;
1708  o00_im = b*o00_im+acc00_im;
1709  o01_re = b*o01_re+acc01_re;
1710  o01_im = b*o01_im+acc01_im;
1711  o02_re = b*o02_re+acc02_re;
1712  o02_im = b*o02_im+acc02_im;
1713  o10_re = b*o10_re+acc10_re;
1714  o10_im = b*o10_im+acc10_im;
1715  o11_re = b*o11_re+acc11_re;
1716  o11_im = b*o11_im+acc11_im;
1717  o12_re = b*o12_re+acc12_re;
1718  o12_im = b*o12_im+acc12_im;
1719  o20_re = b*o20_re+acc20_re;
1720  o20_im = b*o20_im+acc20_im;
1721  o21_re = b*o21_re+acc21_re;
1722  o21_im = b*o21_im+acc21_im;
1723  o22_re = b*o22_re+acc22_re;
1724  o22_im = b*o22_im+acc22_im;
1725  o30_re = b*o30_re+acc30_re;
1726  o30_im = b*o30_im+acc30_im;
1727  o31_re = b*o31_re+acc31_re;
1728  o31_im = b*o31_im+acc31_im;
1729  o32_re = b*o32_re+acc32_re;
1730  o32_im = b*o32_im+acc32_im;
1731 #endif//TWIST_XPAY
1732 #else //no XPAY
1733 #ifndef TWIST_INV_DSLASH
1734  APPLY_TWIST_INV( a, b, o);
1735 #endif
1736 #endif
1737 }
1738 
1739 // write spinor field back to device memory
1740 WRITE_SPINOR(param.sp_stride);
1741 
1742 // undefine to prevent warning when precision is changed
1743 #undef spinorFloat
1744 #undef SHARED_STRIDE
1745 
1746 #undef g00_re
1747 #undef g00_im
1748 #undef g01_re
1749 #undef g01_im
1750 #undef g02_re
1751 #undef g02_im
1752 #undef g10_re
1753 #undef g10_im
1754 #undef g11_re
1755 #undef g11_im
1756 #undef g12_re
1757 #undef g12_im
1758 #undef g20_re
1759 #undef g20_im
1760 #undef g21_re
1761 #undef g21_im
1762 #undef g22_re
1763 #undef g22_im
1764 
1765 #undef i00_re
1766 #undef i00_im
1767 #undef i01_re
1768 #undef i01_im
1769 #undef i02_re
1770 #undef i02_im
1771 #undef i10_re
1772 #undef i10_im
1773 #undef i11_re
1774 #undef i11_im
1775 #undef i12_re
1776 #undef i12_im
1777 #undef i20_re
1778 #undef i20_im
1779 #undef i21_re
1780 #undef i21_im
1781 #undef i22_re
1782 #undef i22_im
1783 #undef i30_re
1784 #undef i30_im
1785 #undef i31_re
1786 #undef i31_im
1787 #undef i32_re
1788 #undef i32_im
1789 
1790 #undef acc00_re
1791 #undef acc00_im
1792 #undef acc01_re
1793 #undef acc01_im
1794 #undef acc02_re
1795 #undef acc02_im
1796 #undef acc10_re
1797 #undef acc10_im
1798 #undef acc11_re
1799 #undef acc11_im
1800 #undef acc12_re
1801 #undef acc12_im
1802 #undef acc20_re
1803 #undef acc20_im
1804 #undef acc21_re
1805 #undef acc21_im
1806 #undef acc22_re
1807 #undef acc22_im
1808 #undef acc30_re
1809 #undef acc30_im
1810 #undef acc31_re
1811 #undef acc31_im
1812 #undef acc32_re
1813 #undef acc32_im
1814 
1815 
1816 #undef o00_re
1817 #undef o00_im
1818 #undef o01_re
1819 #undef o01_im
1820 #undef o02_re
1821 #undef o02_im
1822 #undef o10_re
1823 #undef o10_im
1824 #undef o11_re
1825 #undef o11_im
1826 #undef o12_re
1827 #undef o12_im
1828 #undef o20_re
1829 #undef o20_im
1830 #undef o21_re
1831 #undef o21_im
1832 #undef o22_re
1833 #undef o22_im
1834 #undef o30_re
1835 
1836 #undef VOLATILE
1837 
1838 #endif // MULTI_GPU
__constant__ int Vh
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
#define APPLY_TWIST(a, reg)
Definition: io_spinor.h:1187
#define APPLY_TWIST_INV(a, b, reg)
**************************only for deg tm:*******************************
Definition: io_spinor.h:1122
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define o31_im
Definition: gamma5.h:293
#define DSLASH_SHARED_FLOATS_PER_THREAD
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
__shared__ char s_data[]
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
#define SPINOR_HOP
Definition: covDev.h:158
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
#define READ_GAUGE_MATRIX
Definition: covDev.h:44
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
VOLATILE spinorFloat * s
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1