QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_fused_exterior_dslash_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
6 #ifdef MULTI_GPU
7 
8 #if (CUDA_VERSION >= 4010)
9 #define VOLATILE
10 #else
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #else
41 #define spinorFloat float
42 #define i00_re I0.x
43 #define i00_im I0.y
44 #define i01_re I0.z
45 #define i01_im I0.w
46 #define i02_re I1.x
47 #define i02_im I1.y
48 #define i10_re I1.z
49 #define i10_im I1.w
50 #define i11_re I2.x
51 #define i11_im I2.y
52 #define i12_re I2.z
53 #define i12_im I2.w
54 #define i20_re I3.x
55 #define i20_im I3.y
56 #define i21_re I3.z
57 #define i21_im I3.w
58 #define i22_re I4.x
59 #define i22_im I4.y
60 #define i30_re I4.z
61 #define i30_im I4.w
62 #define i31_re I5.x
63 #define i31_im I5.y
64 #define i32_re I5.z
65 #define i32_im I5.w
66 #endif // SPINOR_DOUBLE
67 
68 // gauge link
69 #ifdef GAUGE_FLOAT2
70 #define g00_re G0.x
71 #define g00_im G0.y
72 #define g01_re G1.x
73 #define g01_im G1.y
74 #define g02_re G2.x
75 #define g02_im G2.y
76 #define g10_re G3.x
77 #define g10_im G3.y
78 #define g11_re G4.x
79 #define g11_im G4.y
80 #define g12_re G5.x
81 #define g12_im G5.y
82 #define g20_re G6.x
83 #define g20_im G6.y
84 #define g21_re G7.x
85 #define g21_im G7.y
86 #define g22_re G8.x
87 #define g22_im G8.y
88 
89 #else
90 #define g00_re G0.x
91 #define g00_im G0.y
92 #define g01_re G0.z
93 #define g01_im G0.w
94 #define g02_re G1.x
95 #define g02_im G1.y
96 #define g10_re G1.z
97 #define g10_im G1.w
98 #define g11_re G2.x
99 #define g11_im G2.y
100 #define g12_re G2.z
101 #define g12_im G2.w
102 #define g20_re G3.x
103 #define g20_im G3.y
104 #define g21_re G3.z
105 #define g21_im G3.w
106 #define g22_re G4.x
107 #define g22_im G4.y
108 
109 #endif // GAUGE_DOUBLE
110 
111 // conjugated gauge link
112 #define gT00_re (+g00_re)
113 #define gT00_im (-g00_im)
114 #define gT01_re (+g10_re)
115 #define gT01_im (-g10_im)
116 #define gT02_re (+g20_re)
117 #define gT02_im (-g20_im)
118 #define gT10_re (+g01_re)
119 #define gT10_im (-g01_im)
120 #define gT11_re (+g11_re)
121 #define gT11_im (-g11_im)
122 #define gT12_re (+g21_re)
123 #define gT12_im (-g21_im)
124 #define gT20_re (+g02_re)
125 #define gT20_im (-g02_im)
126 #define gT21_re (+g12_re)
127 #define gT21_im (-g12_im)
128 #define gT22_re (+g22_re)
129 #define gT22_im (-g22_im)
130 
131 // output spinor
156 
157 #ifdef SPINOR_DOUBLE
158 #if (__COMPUTE_CAPABILITY__ >= 200)
159 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
160 #else
161 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
162 #endif
163 #else
164 #if (__COMPUTE_CAPABILITY__ >= 200)
165 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
166 #else
167 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
168 #endif
169 #endif
170 
171 #include "read_gauge.h"
172 #include "io_spinor.h"
173 
174 #if (DD_PREC==2) // half precision
175 int sp_norm_idx;
176 #endif // half precision
177 
178 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
179 if (sid >= param.threads*param.Ls) return;
180 
181 
182 int dim;
183 int face_num;
184 int face_idx;
185 int Y[4] = {X1,X2,X3,X4};
186 int faceVolume[4];
187 faceVolume[0] = (X2*X3*X4)>>1;
188 faceVolume[1] = (X1*X3*X4)>>1;
189 faceVolume[2] = (X1*X2*X4)>>1;
190 faceVolume[3] = (X1*X2*X3)>>1;
191 
192 
193 int X, x1, x2, x3, x4, xs;
194 
196 
197 
198 
199 { // exterior kernel
200 
201 dim = dimFromDWFaceIndex(sid, param); // sid is also modified
202 
203 const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.Ls >> 1);
204 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
205 face_idx = sid - face_num*face_volume; // index into the respective face
206 
207 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
208 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
209 //sp_idx = face_idx + param.ghostOffset[dim];
210 
211 const int dims[] = {X1, X2, X3, X4};
212 coordsFromDWFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
213 
214 {
215  bool active = false;
216  for(int dir=0; dir<4; ++dir){
217  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
218  }
219  if(!active) return;
220 }
221 
222 
223 s_parity = ( sid/(X4*X3*X2*X1h) ) % 2;
224 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
225 
226 X = 2*sid + (boundaryCrossing + param.parity) % 2;
227 
228 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
229 
230  o00_re = i00_re; o00_im = i00_im;
231  o01_re = i01_re; o01_im = i01_im;
232  o02_re = i02_re; o02_im = i02_im;
233  o10_re = i10_re; o10_im = i10_im;
234  o11_re = i11_re; o11_im = i11_im;
235  o12_re = i12_re; o12_im = i12_im;
236  o20_re = i20_re; o20_im = i20_im;
237  o21_re = i21_re; o21_im = i21_im;
238  o22_re = i22_re; o22_im = i22_im;
239  o30_re = i30_re; o30_im = i30_im;
240  o31_re = i31_re; o31_im = i31_im;
241  o32_re = i32_re; o32_im = i32_im;
242 }
243 
244 // declare G## here and use ASSN below instead of READ
245 #ifdef GAUGE_FLOAT2
246 #if (DD_PREC==0) //temporal hack
247 double2 G0;
248 double2 G1;
249 double2 G2;
250 double2 G3;
251 double2 G4;
252 double2 G5;
253 double2 G6;
254 double2 G7;
255 double2 G8;
256 #else
257 float2 G0;
258 float2 G1;
259 float2 G2;
260 float2 G3;
261 float2 G4;
262 float2 G5;
263 float2 G6;
264 float2 G7;
265 float2 G8;
266 #endif
267 #else
268 float4 G0;
269 float4 G1;
270 float4 G2;
271 float4 G3;
272 float4 G4;
273 #endif
274 
275 
276 
277 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
278 {
279  // Projector P0+
280  // 1 0 0 i
281  // 0 1 i 0
282  // 0 -i 1 0
283  // -i 0 0 1
284 
285  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,0,Y);
286  const int sp_idx = face_idx + param.ghostOffset[0];
287 #if (DD_PREC==2) // half precision
288  sp_norm_idx = face_idx + param.Ls*ghostFace[0] + param.ghostNormOffset[0];
289 #endif
290 
291 
292  const int ga_idx = sid % Vh;
293 
294  // read gauge matrix from device memory
295  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); }
296  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, ga_stride); }
297 
304 
305  {
306 
307  const int sp_stride_pad = param.Ls*ghostFace[0];
308 
309  // read half spinor from device memory
310  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
311 
312  a0_re = i00_re; a0_im = i00_im;
313  a1_re = i01_re; a1_im = i01_im;
314  a2_re = i02_re; a2_im = i02_im;
315  b0_re = i10_re; b0_im = i10_im;
316  b1_re = i11_re; b1_im = i11_im;
317  b2_re = i12_re; b2_im = i12_im;
318 
319  }
320 
321  // reconstruct gauge matrix
323 
324  // multiply row 0
325  spinorFloat A0_re = 0;
326  A0_re += g00_re * a0_re;
327  A0_re -= g00_im * a0_im;
328  A0_re += g01_re * a1_re;
329  A0_re -= g01_im * a1_im;
330  A0_re += g02_re * a2_re;
331  A0_re -= g02_im * a2_im;
332  spinorFloat A0_im = 0;
333  A0_im += g00_re * a0_im;
334  A0_im += g00_im * a0_re;
335  A0_im += g01_re * a1_im;
336  A0_im += g01_im * a1_re;
337  A0_im += g02_re * a2_im;
338  A0_im += g02_im * a2_re;
339  spinorFloat B0_re = 0;
340  B0_re += g00_re * b0_re;
341  B0_re -= g00_im * b0_im;
342  B0_re += g01_re * b1_re;
343  B0_re -= g01_im * b1_im;
344  B0_re += g02_re * b2_re;
345  B0_re -= g02_im * b2_im;
346  spinorFloat B0_im = 0;
347  B0_im += g00_re * b0_im;
348  B0_im += g00_im * b0_re;
349  B0_im += g01_re * b1_im;
350  B0_im += g01_im * b1_re;
351  B0_im += g02_re * b2_im;
352  B0_im += g02_im * b2_re;
353 
354  // multiply row 1
355  spinorFloat A1_re = 0;
356  A1_re += g10_re * a0_re;
357  A1_re -= g10_im * a0_im;
358  A1_re += g11_re * a1_re;
359  A1_re -= g11_im * a1_im;
360  A1_re += g12_re * a2_re;
361  A1_re -= g12_im * a2_im;
362  spinorFloat A1_im = 0;
363  A1_im += g10_re * a0_im;
364  A1_im += g10_im * a0_re;
365  A1_im += g11_re * a1_im;
366  A1_im += g11_im * a1_re;
367  A1_im += g12_re * a2_im;
368  A1_im += g12_im * a2_re;
369  spinorFloat B1_re = 0;
370  B1_re += g10_re * b0_re;
371  B1_re -= g10_im * b0_im;
372  B1_re += g11_re * b1_re;
373  B1_re -= g11_im * b1_im;
374  B1_re += g12_re * b2_re;
375  B1_re -= g12_im * b2_im;
376  spinorFloat B1_im = 0;
377  B1_im += g10_re * b0_im;
378  B1_im += g10_im * b0_re;
379  B1_im += g11_re * b1_im;
380  B1_im += g11_im * b1_re;
381  B1_im += g12_re * b2_im;
382  B1_im += g12_im * b2_re;
383 
384  // multiply row 2
385  spinorFloat A2_re = 0;
386  A2_re += g20_re * a0_re;
387  A2_re -= g20_im * a0_im;
388  A2_re += g21_re * a1_re;
389  A2_re -= g21_im * a1_im;
390  A2_re += g22_re * a2_re;
391  A2_re -= g22_im * a2_im;
392  spinorFloat A2_im = 0;
393  A2_im += g20_re * a0_im;
394  A2_im += g20_im * a0_re;
395  A2_im += g21_re * a1_im;
396  A2_im += g21_im * a1_re;
397  A2_im += g22_re * a2_im;
398  A2_im += g22_im * a2_re;
399  spinorFloat B2_re = 0;
400  B2_re += g20_re * b0_re;
401  B2_re -= g20_im * b0_im;
402  B2_re += g21_re * b1_re;
403  B2_re -= g21_im * b1_im;
404  B2_re += g22_re * b2_re;
405  B2_re -= g22_im * b2_im;
406  spinorFloat B2_im = 0;
407  B2_im += g20_re * b0_im;
408  B2_im += g20_im * b0_re;
409  B2_im += g21_re * b1_im;
410  B2_im += g21_im * b1_re;
411  B2_im += g22_re * b2_im;
412  B2_im += g22_im * b2_re;
413 
414  o00_re += A0_re;
415  o00_im += A0_im;
416  o10_re += B0_re;
417  o10_im += B0_im;
418  o20_re += B0_im;
419  o20_im -= B0_re;
420  o30_re += A0_im;
421  o30_im -= A0_re;
422 
423  o01_re += A1_re;
424  o01_im += A1_im;
425  o11_re += B1_re;
426  o11_im += B1_im;
427  o21_re += B1_im;
428  o21_im -= B1_re;
429  o31_re += A1_im;
430  o31_im -= A1_re;
431 
432  o02_re += A2_re;
433  o02_im += A2_im;
434  o12_re += B2_re;
435  o12_im += B2_im;
436  o22_re += B2_im;
437  o22_im -= B2_re;
438  o32_re += A2_im;
439  o32_im -= A2_re;
440 }
441 
442 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
443 {
444  // Projector P0-
445  // 1 0 0 -i
446  // 0 1 -i 0
447  // 0 i 1 0
448  // i 0 0 1
449 
450  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,0,Y);
451  const int sp_idx = face_idx + param.ghostOffset[0];
452 #if (DD_PREC==2) // half precision
453  sp_norm_idx = face_idx + param.ghostNormOffset[0];
454 #endif
455 
456 
457  const int ga_idx = Vh+(face_idx % ghostFace[0]);
458 
459  // read gauge matrix from device memory
460  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); }
461  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, ga_stride); }
462 
469 
470  {
471 
472  const int sp_stride_pad = param.Ls*ghostFace[0];
473 
474  // read half spinor from device memory
475  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
476 
477  a0_re = i00_re; a0_im = i00_im;
478  a1_re = i01_re; a1_im = i01_im;
479  a2_re = i02_re; a2_im = i02_im;
480  b0_re = i10_re; b0_im = i10_im;
481  b1_re = i11_re; b1_im = i11_im;
482  b2_re = i12_re; b2_im = i12_im;
483 
484  }
485 
486  // reconstruct gauge matrix
488 
489  // multiply row 0
490  spinorFloat A0_re = 0;
491  A0_re += gT00_re * a0_re;
492  A0_re -= gT00_im * a0_im;
493  A0_re += gT01_re * a1_re;
494  A0_re -= gT01_im * a1_im;
495  A0_re += gT02_re * a2_re;
496  A0_re -= gT02_im * a2_im;
497  spinorFloat A0_im = 0;
498  A0_im += gT00_re * a0_im;
499  A0_im += gT00_im * a0_re;
500  A0_im += gT01_re * a1_im;
501  A0_im += gT01_im * a1_re;
502  A0_im += gT02_re * a2_im;
503  A0_im += gT02_im * a2_re;
504  spinorFloat B0_re = 0;
505  B0_re += gT00_re * b0_re;
506  B0_re -= gT00_im * b0_im;
507  B0_re += gT01_re * b1_re;
508  B0_re -= gT01_im * b1_im;
509  B0_re += gT02_re * b2_re;
510  B0_re -= gT02_im * b2_im;
511  spinorFloat B0_im = 0;
512  B0_im += gT00_re * b0_im;
513  B0_im += gT00_im * b0_re;
514  B0_im += gT01_re * b1_im;
515  B0_im += gT01_im * b1_re;
516  B0_im += gT02_re * b2_im;
517  B0_im += gT02_im * b2_re;
518 
519  // multiply row 1
520  spinorFloat A1_re = 0;
521  A1_re += gT10_re * a0_re;
522  A1_re -= gT10_im * a0_im;
523  A1_re += gT11_re * a1_re;
524  A1_re -= gT11_im * a1_im;
525  A1_re += gT12_re * a2_re;
526  A1_re -= gT12_im * a2_im;
527  spinorFloat A1_im = 0;
528  A1_im += gT10_re * a0_im;
529  A1_im += gT10_im * a0_re;
530  A1_im += gT11_re * a1_im;
531  A1_im += gT11_im * a1_re;
532  A1_im += gT12_re * a2_im;
533  A1_im += gT12_im * a2_re;
534  spinorFloat B1_re = 0;
535  B1_re += gT10_re * b0_re;
536  B1_re -= gT10_im * b0_im;
537  B1_re += gT11_re * b1_re;
538  B1_re -= gT11_im * b1_im;
539  B1_re += gT12_re * b2_re;
540  B1_re -= gT12_im * b2_im;
541  spinorFloat B1_im = 0;
542  B1_im += gT10_re * b0_im;
543  B1_im += gT10_im * b0_re;
544  B1_im += gT11_re * b1_im;
545  B1_im += gT11_im * b1_re;
546  B1_im += gT12_re * b2_im;
547  B1_im += gT12_im * b2_re;
548 
549  // multiply row 2
550  spinorFloat A2_re = 0;
551  A2_re += gT20_re * a0_re;
552  A2_re -= gT20_im * a0_im;
553  A2_re += gT21_re * a1_re;
554  A2_re -= gT21_im * a1_im;
555  A2_re += gT22_re * a2_re;
556  A2_re -= gT22_im * a2_im;
557  spinorFloat A2_im = 0;
558  A2_im += gT20_re * a0_im;
559  A2_im += gT20_im * a0_re;
560  A2_im += gT21_re * a1_im;
561  A2_im += gT21_im * a1_re;
562  A2_im += gT22_re * a2_im;
563  A2_im += gT22_im * a2_re;
564  spinorFloat B2_re = 0;
565  B2_re += gT20_re * b0_re;
566  B2_re -= gT20_im * b0_im;
567  B2_re += gT21_re * b1_re;
568  B2_re -= gT21_im * b1_im;
569  B2_re += gT22_re * b2_re;
570  B2_re -= gT22_im * b2_im;
571  spinorFloat B2_im = 0;
572  B2_im += gT20_re * b0_im;
573  B2_im += gT20_im * b0_re;
574  B2_im += gT21_re * b1_im;
575  B2_im += gT21_im * b1_re;
576  B2_im += gT22_re * b2_im;
577  B2_im += gT22_im * b2_re;
578 
579  o00_re += A0_re;
580  o00_im += A0_im;
581  o10_re += B0_re;
582  o10_im += B0_im;
583  o20_re -= B0_im;
584  o20_im += B0_re;
585  o30_re -= A0_im;
586  o30_im += A0_re;
587 
588  o01_re += A1_re;
589  o01_im += A1_im;
590  o11_re += B1_re;
591  o11_im += B1_im;
592  o21_re -= B1_im;
593  o21_im += B1_re;
594  o31_re -= A1_im;
595  o31_im += A1_re;
596 
597  o02_re += A2_re;
598  o02_im += A2_im;
599  o12_re += B2_re;
600  o12_im += B2_im;
601  o22_re -= B2_im;
602  o22_im += B2_re;
603  o32_re -= A2_im;
604  o32_im += A2_re;
605 }
606 
607 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
608 {
609  // Projector P1+
610  // 1 0 0 1
611  // 0 1 -1 0
612  // 0 -1 1 0
613  // 1 0 0 1
614 
615  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,1,Y);
616  const int sp_idx = face_idx + param.ghostOffset[1];
617 #if (DD_PREC==2) // half precision
618  sp_norm_idx = face_idx + param.Ls*ghostFace[1] + param.ghostNormOffset[1];
619 #endif
620 
621 
622  const int ga_idx = sid % Vh;
623 
624  // read gauge matrix from device memory
625  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); }
626  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, ga_stride); }
627 
634 
635  {
636 
637  const int sp_stride_pad = param.Ls*ghostFace[1];
638 
639  // read half spinor from device memory
640  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
641 
642  a0_re = i00_re; a0_im = i00_im;
643  a1_re = i01_re; a1_im = i01_im;
644  a2_re = i02_re; a2_im = i02_im;
645  b0_re = i10_re; b0_im = i10_im;
646  b1_re = i11_re; b1_im = i11_im;
647  b2_re = i12_re; b2_im = i12_im;
648 
649  }
650 
651  // reconstruct gauge matrix
653 
654  // multiply row 0
655  spinorFloat A0_re = 0;
656  A0_re += g00_re * a0_re;
657  A0_re -= g00_im * a0_im;
658  A0_re += g01_re * a1_re;
659  A0_re -= g01_im * a1_im;
660  A0_re += g02_re * a2_re;
661  A0_re -= g02_im * a2_im;
662  spinorFloat A0_im = 0;
663  A0_im += g00_re * a0_im;
664  A0_im += g00_im * a0_re;
665  A0_im += g01_re * a1_im;
666  A0_im += g01_im * a1_re;
667  A0_im += g02_re * a2_im;
668  A0_im += g02_im * a2_re;
669  spinorFloat B0_re = 0;
670  B0_re += g00_re * b0_re;
671  B0_re -= g00_im * b0_im;
672  B0_re += g01_re * b1_re;
673  B0_re -= g01_im * b1_im;
674  B0_re += g02_re * b2_re;
675  B0_re -= g02_im * b2_im;
676  spinorFloat B0_im = 0;
677  B0_im += g00_re * b0_im;
678  B0_im += g00_im * b0_re;
679  B0_im += g01_re * b1_im;
680  B0_im += g01_im * b1_re;
681  B0_im += g02_re * b2_im;
682  B0_im += g02_im * b2_re;
683 
684  // multiply row 1
685  spinorFloat A1_re = 0;
686  A1_re += g10_re * a0_re;
687  A1_re -= g10_im * a0_im;
688  A1_re += g11_re * a1_re;
689  A1_re -= g11_im * a1_im;
690  A1_re += g12_re * a2_re;
691  A1_re -= g12_im * a2_im;
692  spinorFloat A1_im = 0;
693  A1_im += g10_re * a0_im;
694  A1_im += g10_im * a0_re;
695  A1_im += g11_re * a1_im;
696  A1_im += g11_im * a1_re;
697  A1_im += g12_re * a2_im;
698  A1_im += g12_im * a2_re;
699  spinorFloat B1_re = 0;
700  B1_re += g10_re * b0_re;
701  B1_re -= g10_im * b0_im;
702  B1_re += g11_re * b1_re;
703  B1_re -= g11_im * b1_im;
704  B1_re += g12_re * b2_re;
705  B1_re -= g12_im * b2_im;
706  spinorFloat B1_im = 0;
707  B1_im += g10_re * b0_im;
708  B1_im += g10_im * b0_re;
709  B1_im += g11_re * b1_im;
710  B1_im += g11_im * b1_re;
711  B1_im += g12_re * b2_im;
712  B1_im += g12_im * b2_re;
713 
714  // multiply row 2
715  spinorFloat A2_re = 0;
716  A2_re += g20_re * a0_re;
717  A2_re -= g20_im * a0_im;
718  A2_re += g21_re * a1_re;
719  A2_re -= g21_im * a1_im;
720  A2_re += g22_re * a2_re;
721  A2_re -= g22_im * a2_im;
722  spinorFloat A2_im = 0;
723  A2_im += g20_re * a0_im;
724  A2_im += g20_im * a0_re;
725  A2_im += g21_re * a1_im;
726  A2_im += g21_im * a1_re;
727  A2_im += g22_re * a2_im;
728  A2_im += g22_im * a2_re;
729  spinorFloat B2_re = 0;
730  B2_re += g20_re * b0_re;
731  B2_re -= g20_im * b0_im;
732  B2_re += g21_re * b1_re;
733  B2_re -= g21_im * b1_im;
734  B2_re += g22_re * b2_re;
735  B2_re -= g22_im * b2_im;
736  spinorFloat B2_im = 0;
737  B2_im += g20_re * b0_im;
738  B2_im += g20_im * b0_re;
739  B2_im += g21_re * b1_im;
740  B2_im += g21_im * b1_re;
741  B2_im += g22_re * b2_im;
742  B2_im += g22_im * b2_re;
743 
744  o00_re += A0_re;
745  o00_im += A0_im;
746  o10_re += B0_re;
747  o10_im += B0_im;
748  o20_re -= B0_re;
749  o20_im -= B0_im;
750  o30_re += A0_re;
751  o30_im += A0_im;
752 
753  o01_re += A1_re;
754  o01_im += A1_im;
755  o11_re += B1_re;
756  o11_im += B1_im;
757  o21_re -= B1_re;
758  o21_im -= B1_im;
759  o31_re += A1_re;
760  o31_im += A1_im;
761 
762  o02_re += A2_re;
763  o02_im += A2_im;
764  o12_re += B2_re;
765  o12_im += B2_im;
766  o22_re -= B2_re;
767  o22_im -= B2_im;
768  o32_re += A2_re;
769  o32_im += A2_im;
770 }
771 
772 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
773 {
774  // Projector P1-
775  // 1 0 0 -1
776  // 0 1 1 0
777  // 0 1 1 0
778  // -1 0 0 1
779 
780  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,1,Y);
781  const int sp_idx = face_idx + param.ghostOffset[1];
782 #if (DD_PREC==2) // half precision
783  sp_norm_idx = face_idx + param.ghostNormOffset[1];
784 #endif
785 
786 
787  const int ga_idx = Vh+(face_idx % ghostFace[1]);
788 
789  // read gauge matrix from device memory
790  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); }
791  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, ga_stride); }
792 
799 
800  {
801 
802  const int sp_stride_pad = param.Ls*ghostFace[1];
803 
804  // read half spinor from device memory
805  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
806 
807  a0_re = i00_re; a0_im = i00_im;
808  a1_re = i01_re; a1_im = i01_im;
809  a2_re = i02_re; a2_im = i02_im;
810  b0_re = i10_re; b0_im = i10_im;
811  b1_re = i11_re; b1_im = i11_im;
812  b2_re = i12_re; b2_im = i12_im;
813 
814  }
815 
816  // reconstruct gauge matrix
818 
819  // multiply row 0
820  spinorFloat A0_re = 0;
821  A0_re += gT00_re * a0_re;
822  A0_re -= gT00_im * a0_im;
823  A0_re += gT01_re * a1_re;
824  A0_re -= gT01_im * a1_im;
825  A0_re += gT02_re * a2_re;
826  A0_re -= gT02_im * a2_im;
827  spinorFloat A0_im = 0;
828  A0_im += gT00_re * a0_im;
829  A0_im += gT00_im * a0_re;
830  A0_im += gT01_re * a1_im;
831  A0_im += gT01_im * a1_re;
832  A0_im += gT02_re * a2_im;
833  A0_im += gT02_im * a2_re;
834  spinorFloat B0_re = 0;
835  B0_re += gT00_re * b0_re;
836  B0_re -= gT00_im * b0_im;
837  B0_re += gT01_re * b1_re;
838  B0_re -= gT01_im * b1_im;
839  B0_re += gT02_re * b2_re;
840  B0_re -= gT02_im * b2_im;
841  spinorFloat B0_im = 0;
842  B0_im += gT00_re * b0_im;
843  B0_im += gT00_im * b0_re;
844  B0_im += gT01_re * b1_im;
845  B0_im += gT01_im * b1_re;
846  B0_im += gT02_re * b2_im;
847  B0_im += gT02_im * b2_re;
848 
849  // multiply row 1
850  spinorFloat A1_re = 0;
851  A1_re += gT10_re * a0_re;
852  A1_re -= gT10_im * a0_im;
853  A1_re += gT11_re * a1_re;
854  A1_re -= gT11_im * a1_im;
855  A1_re += gT12_re * a2_re;
856  A1_re -= gT12_im * a2_im;
857  spinorFloat A1_im = 0;
858  A1_im += gT10_re * a0_im;
859  A1_im += gT10_im * a0_re;
860  A1_im += gT11_re * a1_im;
861  A1_im += gT11_im * a1_re;
862  A1_im += gT12_re * a2_im;
863  A1_im += gT12_im * a2_re;
864  spinorFloat B1_re = 0;
865  B1_re += gT10_re * b0_re;
866  B1_re -= gT10_im * b0_im;
867  B1_re += gT11_re * b1_re;
868  B1_re -= gT11_im * b1_im;
869  B1_re += gT12_re * b2_re;
870  B1_re -= gT12_im * b2_im;
871  spinorFloat B1_im = 0;
872  B1_im += gT10_re * b0_im;
873  B1_im += gT10_im * b0_re;
874  B1_im += gT11_re * b1_im;
875  B1_im += gT11_im * b1_re;
876  B1_im += gT12_re * b2_im;
877  B1_im += gT12_im * b2_re;
878 
879  // multiply row 2
880  spinorFloat A2_re = 0;
881  A2_re += gT20_re * a0_re;
882  A2_re -= gT20_im * a0_im;
883  A2_re += gT21_re * a1_re;
884  A2_re -= gT21_im * a1_im;
885  A2_re += gT22_re * a2_re;
886  A2_re -= gT22_im * a2_im;
887  spinorFloat A2_im = 0;
888  A2_im += gT20_re * a0_im;
889  A2_im += gT20_im * a0_re;
890  A2_im += gT21_re * a1_im;
891  A2_im += gT21_im * a1_re;
892  A2_im += gT22_re * a2_im;
893  A2_im += gT22_im * a2_re;
894  spinorFloat B2_re = 0;
895  B2_re += gT20_re * b0_re;
896  B2_re -= gT20_im * b0_im;
897  B2_re += gT21_re * b1_re;
898  B2_re -= gT21_im * b1_im;
899  B2_re += gT22_re * b2_re;
900  B2_re -= gT22_im * b2_im;
901  spinorFloat B2_im = 0;
902  B2_im += gT20_re * b0_im;
903  B2_im += gT20_im * b0_re;
904  B2_im += gT21_re * b1_im;
905  B2_im += gT21_im * b1_re;
906  B2_im += gT22_re * b2_im;
907  B2_im += gT22_im * b2_re;
908 
909  o00_re += A0_re;
910  o00_im += A0_im;
911  o10_re += B0_re;
912  o10_im += B0_im;
913  o20_re += B0_re;
914  o20_im += B0_im;
915  o30_re -= A0_re;
916  o30_im -= A0_im;
917 
918  o01_re += A1_re;
919  o01_im += A1_im;
920  o11_re += B1_re;
921  o11_im += B1_im;
922  o21_re += B1_re;
923  o21_im += B1_im;
924  o31_re -= A1_re;
925  o31_im -= A1_im;
926 
927  o02_re += A2_re;
928  o02_im += A2_im;
929  o12_re += B2_re;
930  o12_im += B2_im;
931  o22_re += B2_re;
932  o22_im += B2_im;
933  o32_re -= A2_re;
934  o32_im -= A2_im;
935 }
936 
937 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
938 {
939  // Projector P2+
940  // 1 0 i 0
941  // 0 1 0 -i
942  // -i 0 1 0
943  // 0 i 0 1
944 
945  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,2,Y);
946  const int sp_idx = face_idx + param.ghostOffset[2];
947 #if (DD_PREC==2) // half precision
948  sp_norm_idx = face_idx + param.Ls*ghostFace[2] + param.ghostNormOffset[2];
949 #endif
950 
951 
952  const int ga_idx = sid % Vh;
953 
954  // read gauge matrix from device memory
955  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); }
956  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, ga_stride); }
957 
964 
965  {
966 
967  const int sp_stride_pad = param.Ls*ghostFace[2];
968 
969  // read half spinor from device memory
970  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
971 
972  a0_re = i00_re; a0_im = i00_im;
973  a1_re = i01_re; a1_im = i01_im;
974  a2_re = i02_re; a2_im = i02_im;
975  b0_re = i10_re; b0_im = i10_im;
976  b1_re = i11_re; b1_im = i11_im;
977  b2_re = i12_re; b2_im = i12_im;
978 
979  }
980 
981  // reconstruct gauge matrix
983 
984  // multiply row 0
985  spinorFloat A0_re = 0;
986  A0_re += g00_re * a0_re;
987  A0_re -= g00_im * a0_im;
988  A0_re += g01_re * a1_re;
989  A0_re -= g01_im * a1_im;
990  A0_re += g02_re * a2_re;
991  A0_re -= g02_im * a2_im;
992  spinorFloat A0_im = 0;
993  A0_im += g00_re * a0_im;
994  A0_im += g00_im * a0_re;
995  A0_im += g01_re * a1_im;
996  A0_im += g01_im * a1_re;
997  A0_im += g02_re * a2_im;
998  A0_im += g02_im * a2_re;
999  spinorFloat B0_re = 0;
1000  B0_re += g00_re * b0_re;
1001  B0_re -= g00_im * b0_im;
1002  B0_re += g01_re * b1_re;
1003  B0_re -= g01_im * b1_im;
1004  B0_re += g02_re * b2_re;
1005  B0_re -= g02_im * b2_im;
1006  spinorFloat B0_im = 0;
1007  B0_im += g00_re * b0_im;
1008  B0_im += g00_im * b0_re;
1009  B0_im += g01_re * b1_im;
1010  B0_im += g01_im * b1_re;
1011  B0_im += g02_re * b2_im;
1012  B0_im += g02_im * b2_re;
1013 
1014  // multiply row 1
1015  spinorFloat A1_re = 0;
1016  A1_re += g10_re * a0_re;
1017  A1_re -= g10_im * a0_im;
1018  A1_re += g11_re * a1_re;
1019  A1_re -= g11_im * a1_im;
1020  A1_re += g12_re * a2_re;
1021  A1_re -= g12_im * a2_im;
1022  spinorFloat A1_im = 0;
1023  A1_im += g10_re * a0_im;
1024  A1_im += g10_im * a0_re;
1025  A1_im += g11_re * a1_im;
1026  A1_im += g11_im * a1_re;
1027  A1_im += g12_re * a2_im;
1028  A1_im += g12_im * a2_re;
1029  spinorFloat B1_re = 0;
1030  B1_re += g10_re * b0_re;
1031  B1_re -= g10_im * b0_im;
1032  B1_re += g11_re * b1_re;
1033  B1_re -= g11_im * b1_im;
1034  B1_re += g12_re * b2_re;
1035  B1_re -= g12_im * b2_im;
1036  spinorFloat B1_im = 0;
1037  B1_im += g10_re * b0_im;
1038  B1_im += g10_im * b0_re;
1039  B1_im += g11_re * b1_im;
1040  B1_im += g11_im * b1_re;
1041  B1_im += g12_re * b2_im;
1042  B1_im += g12_im * b2_re;
1043 
1044  // multiply row 2
1045  spinorFloat A2_re = 0;
1046  A2_re += g20_re * a0_re;
1047  A2_re -= g20_im * a0_im;
1048  A2_re += g21_re * a1_re;
1049  A2_re -= g21_im * a1_im;
1050  A2_re += g22_re * a2_re;
1051  A2_re -= g22_im * a2_im;
1052  spinorFloat A2_im = 0;
1053  A2_im += g20_re * a0_im;
1054  A2_im += g20_im * a0_re;
1055  A2_im += g21_re * a1_im;
1056  A2_im += g21_im * a1_re;
1057  A2_im += g22_re * a2_im;
1058  A2_im += g22_im * a2_re;
1059  spinorFloat B2_re = 0;
1060  B2_re += g20_re * b0_re;
1061  B2_re -= g20_im * b0_im;
1062  B2_re += g21_re * b1_re;
1063  B2_re -= g21_im * b1_im;
1064  B2_re += g22_re * b2_re;
1065  B2_re -= g22_im * b2_im;
1066  spinorFloat B2_im = 0;
1067  B2_im += g20_re * b0_im;
1068  B2_im += g20_im * b0_re;
1069  B2_im += g21_re * b1_im;
1070  B2_im += g21_im * b1_re;
1071  B2_im += g22_re * b2_im;
1072  B2_im += g22_im * b2_re;
1073 
1074  o00_re += A0_re;
1075  o00_im += A0_im;
1076  o10_re += B0_re;
1077  o10_im += B0_im;
1078  o20_re += A0_im;
1079  o20_im -= A0_re;
1080  o30_re -= B0_im;
1081  o30_im += B0_re;
1082 
1083  o01_re += A1_re;
1084  o01_im += A1_im;
1085  o11_re += B1_re;
1086  o11_im += B1_im;
1087  o21_re += A1_im;
1088  o21_im -= A1_re;
1089  o31_re -= B1_im;
1090  o31_im += B1_re;
1091 
1092  o02_re += A2_re;
1093  o02_im += A2_im;
1094  o12_re += B2_re;
1095  o12_im += B2_im;
1096  o22_re += A2_im;
1097  o22_im -= A2_re;
1098  o32_re -= B2_im;
1099  o32_im += B2_re;
1100 }
1101 
1102 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1103 {
1104  // Projector P2-
1105  // 1 0 -i 0
1106  // 0 1 0 i
1107  // i 0 1 0
1108  // 0 -i 0 1
1109 
1110  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,2,Y);
1111  const int sp_idx = face_idx + param.ghostOffset[2];
1112 #if (DD_PREC==2) // half precision
1113  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1114 #endif
1115 
1116 
1117  const int ga_idx = Vh+(face_idx % ghostFace[2]);
1118 
1119  // read gauge matrix from device memory
1120  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); }
1121  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, ga_stride); }
1122 
1129 
1130  {
1131 
1132  const int sp_stride_pad = param.Ls*ghostFace[2];
1133 
1134  // read half spinor from device memory
1135  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1136 
1137  a0_re = i00_re; a0_im = i00_im;
1138  a1_re = i01_re; a1_im = i01_im;
1139  a2_re = i02_re; a2_im = i02_im;
1140  b0_re = i10_re; b0_im = i10_im;
1141  b1_re = i11_re; b1_im = i11_im;
1142  b2_re = i12_re; b2_im = i12_im;
1143 
1144  }
1145 
1146  // reconstruct gauge matrix
1148 
1149  // multiply row 0
1150  spinorFloat A0_re = 0;
1151  A0_re += gT00_re * a0_re;
1152  A0_re -= gT00_im * a0_im;
1153  A0_re += gT01_re * a1_re;
1154  A0_re -= gT01_im * a1_im;
1155  A0_re += gT02_re * a2_re;
1156  A0_re -= gT02_im * a2_im;
1157  spinorFloat A0_im = 0;
1158  A0_im += gT00_re * a0_im;
1159  A0_im += gT00_im * a0_re;
1160  A0_im += gT01_re * a1_im;
1161  A0_im += gT01_im * a1_re;
1162  A0_im += gT02_re * a2_im;
1163  A0_im += gT02_im * a2_re;
1164  spinorFloat B0_re = 0;
1165  B0_re += gT00_re * b0_re;
1166  B0_re -= gT00_im * b0_im;
1167  B0_re += gT01_re * b1_re;
1168  B0_re -= gT01_im * b1_im;
1169  B0_re += gT02_re * b2_re;
1170  B0_re -= gT02_im * b2_im;
1171  spinorFloat B0_im = 0;
1172  B0_im += gT00_re * b0_im;
1173  B0_im += gT00_im * b0_re;
1174  B0_im += gT01_re * b1_im;
1175  B0_im += gT01_im * b1_re;
1176  B0_im += gT02_re * b2_im;
1177  B0_im += gT02_im * b2_re;
1178 
1179  // multiply row 1
1180  spinorFloat A1_re = 0;
1181  A1_re += gT10_re * a0_re;
1182  A1_re -= gT10_im * a0_im;
1183  A1_re += gT11_re * a1_re;
1184  A1_re -= gT11_im * a1_im;
1185  A1_re += gT12_re * a2_re;
1186  A1_re -= gT12_im * a2_im;
1187  spinorFloat A1_im = 0;
1188  A1_im += gT10_re * a0_im;
1189  A1_im += gT10_im * a0_re;
1190  A1_im += gT11_re * a1_im;
1191  A1_im += gT11_im * a1_re;
1192  A1_im += gT12_re * a2_im;
1193  A1_im += gT12_im * a2_re;
1194  spinorFloat B1_re = 0;
1195  B1_re += gT10_re * b0_re;
1196  B1_re -= gT10_im * b0_im;
1197  B1_re += gT11_re * b1_re;
1198  B1_re -= gT11_im * b1_im;
1199  B1_re += gT12_re * b2_re;
1200  B1_re -= gT12_im * b2_im;
1201  spinorFloat B1_im = 0;
1202  B1_im += gT10_re * b0_im;
1203  B1_im += gT10_im * b0_re;
1204  B1_im += gT11_re * b1_im;
1205  B1_im += gT11_im * b1_re;
1206  B1_im += gT12_re * b2_im;
1207  B1_im += gT12_im * b2_re;
1208 
1209  // multiply row 2
1210  spinorFloat A2_re = 0;
1211  A2_re += gT20_re * a0_re;
1212  A2_re -= gT20_im * a0_im;
1213  A2_re += gT21_re * a1_re;
1214  A2_re -= gT21_im * a1_im;
1215  A2_re += gT22_re * a2_re;
1216  A2_re -= gT22_im * a2_im;
1217  spinorFloat A2_im = 0;
1218  A2_im += gT20_re * a0_im;
1219  A2_im += gT20_im * a0_re;
1220  A2_im += gT21_re * a1_im;
1221  A2_im += gT21_im * a1_re;
1222  A2_im += gT22_re * a2_im;
1223  A2_im += gT22_im * a2_re;
1224  spinorFloat B2_re = 0;
1225  B2_re += gT20_re * b0_re;
1226  B2_re -= gT20_im * b0_im;
1227  B2_re += gT21_re * b1_re;
1228  B2_re -= gT21_im * b1_im;
1229  B2_re += gT22_re * b2_re;
1230  B2_re -= gT22_im * b2_im;
1231  spinorFloat B2_im = 0;
1232  B2_im += gT20_re * b0_im;
1233  B2_im += gT20_im * b0_re;
1234  B2_im += gT21_re * b1_im;
1235  B2_im += gT21_im * b1_re;
1236  B2_im += gT22_re * b2_im;
1237  B2_im += gT22_im * b2_re;
1238 
1239  o00_re += A0_re;
1240  o00_im += A0_im;
1241  o10_re += B0_re;
1242  o10_im += B0_im;
1243  o20_re -= A0_im;
1244  o20_im += A0_re;
1245  o30_re += B0_im;
1246  o30_im -= B0_re;
1247 
1248  o01_re += A1_re;
1249  o01_im += A1_im;
1250  o11_re += B1_re;
1251  o11_im += B1_im;
1252  o21_re -= A1_im;
1253  o21_im += A1_re;
1254  o31_re += B1_im;
1255  o31_im -= B1_re;
1256 
1257  o02_re += A2_re;
1258  o02_im += A2_im;
1259  o12_re += B2_re;
1260  o12_im += B2_im;
1261  o22_re -= A2_im;
1262  o22_im += A2_re;
1263  o32_re += B2_im;
1264  o32_im -= B2_re;
1265 }
1266 
1267 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1268 {
1269  // Projector P3+
1270  // 2 0 0 0
1271  // 0 2 0 0
1272  // 0 0 0 0
1273  // 0 0 0 0
1274 
1275  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,3,Y);
1276  const int sp_idx = face_idx + param.ghostOffset[3];
1277 #if (DD_PREC==2) // half precision
1278  sp_norm_idx = face_idx + param.Ls*ghostFace[3] + param.ghostNormOffset[3];
1279 #endif
1280 
1281 
1282  const int ga_idx = sid % Vh;
1283 
1284  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1285  {
1292 
1293  {
1294 
1295  const int sp_stride_pad = param.Ls*ghostFace[3];
1296  const int t_proj_scale = TPROJSCALE;
1297 
1298  // read half spinor from device memory
1299  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1300 
1301  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1302  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1303  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1304  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1305  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1306  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1307 
1308  }
1309 
1310  // identity gauge matrix
1317 
1318  o00_re += A0_re;
1319  o00_im += A0_im;
1320  o10_re += B0_re;
1321  o10_im += B0_im;
1322 
1323  o01_re += A1_re;
1324  o01_im += A1_im;
1325  o11_re += B1_re;
1326  o11_im += B1_im;
1327 
1328  o02_re += A2_re;
1329  o02_im += A2_im;
1330  o12_re += B2_re;
1331  o12_im += B2_im;
1332  } else {
1333  // read gauge matrix from device memory
1334  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); }
1335  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, ga_stride); }
1336 
1343 
1344  {
1345 
1346  const int sp_stride_pad = param.Ls*ghostFace[3];
1347  const int t_proj_scale = TPROJSCALE;
1348 
1349  // read half spinor from device memory
1350  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1351 
1352  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1353  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1354  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1355  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1356  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1357  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1358 
1359  }
1360 
1361  // reconstruct gauge matrix
1363 
1364  // multiply row 0
1365  spinorFloat A0_re = 0;
1366  A0_re += g00_re * a0_re;
1367  A0_re -= g00_im * a0_im;
1368  A0_re += g01_re * a1_re;
1369  A0_re -= g01_im * a1_im;
1370  A0_re += g02_re * a2_re;
1371  A0_re -= g02_im * a2_im;
1372  spinorFloat A0_im = 0;
1373  A0_im += g00_re * a0_im;
1374  A0_im += g00_im * a0_re;
1375  A0_im += g01_re * a1_im;
1376  A0_im += g01_im * a1_re;
1377  A0_im += g02_re * a2_im;
1378  A0_im += g02_im * a2_re;
1379  spinorFloat B0_re = 0;
1380  B0_re += g00_re * b0_re;
1381  B0_re -= g00_im * b0_im;
1382  B0_re += g01_re * b1_re;
1383  B0_re -= g01_im * b1_im;
1384  B0_re += g02_re * b2_re;
1385  B0_re -= g02_im * b2_im;
1386  spinorFloat B0_im = 0;
1387  B0_im += g00_re * b0_im;
1388  B0_im += g00_im * b0_re;
1389  B0_im += g01_re * b1_im;
1390  B0_im += g01_im * b1_re;
1391  B0_im += g02_re * b2_im;
1392  B0_im += g02_im * b2_re;
1393 
1394  // multiply row 1
1395  spinorFloat A1_re = 0;
1396  A1_re += g10_re * a0_re;
1397  A1_re -= g10_im * a0_im;
1398  A1_re += g11_re * a1_re;
1399  A1_re -= g11_im * a1_im;
1400  A1_re += g12_re * a2_re;
1401  A1_re -= g12_im * a2_im;
1402  spinorFloat A1_im = 0;
1403  A1_im += g10_re * a0_im;
1404  A1_im += g10_im * a0_re;
1405  A1_im += g11_re * a1_im;
1406  A1_im += g11_im * a1_re;
1407  A1_im += g12_re * a2_im;
1408  A1_im += g12_im * a2_re;
1409  spinorFloat B1_re = 0;
1410  B1_re += g10_re * b0_re;
1411  B1_re -= g10_im * b0_im;
1412  B1_re += g11_re * b1_re;
1413  B1_re -= g11_im * b1_im;
1414  B1_re += g12_re * b2_re;
1415  B1_re -= g12_im * b2_im;
1416  spinorFloat B1_im = 0;
1417  B1_im += g10_re * b0_im;
1418  B1_im += g10_im * b0_re;
1419  B1_im += g11_re * b1_im;
1420  B1_im += g11_im * b1_re;
1421  B1_im += g12_re * b2_im;
1422  B1_im += g12_im * b2_re;
1423 
1424  // multiply row 2
1425  spinorFloat A2_re = 0;
1426  A2_re += g20_re * a0_re;
1427  A2_re -= g20_im * a0_im;
1428  A2_re += g21_re * a1_re;
1429  A2_re -= g21_im * a1_im;
1430  A2_re += g22_re * a2_re;
1431  A2_re -= g22_im * a2_im;
1432  spinorFloat A2_im = 0;
1433  A2_im += g20_re * a0_im;
1434  A2_im += g20_im * a0_re;
1435  A2_im += g21_re * a1_im;
1436  A2_im += g21_im * a1_re;
1437  A2_im += g22_re * a2_im;
1438  A2_im += g22_im * a2_re;
1439  spinorFloat B2_re = 0;
1440  B2_re += g20_re * b0_re;
1441  B2_re -= g20_im * b0_im;
1442  B2_re += g21_re * b1_re;
1443  B2_re -= g21_im * b1_im;
1444  B2_re += g22_re * b2_re;
1445  B2_re -= g22_im * b2_im;
1446  spinorFloat B2_im = 0;
1447  B2_im += g20_re * b0_im;
1448  B2_im += g20_im * b0_re;
1449  B2_im += g21_re * b1_im;
1450  B2_im += g21_im * b1_re;
1451  B2_im += g22_re * b2_im;
1452  B2_im += g22_im * b2_re;
1453 
1454  o00_re += A0_re;
1455  o00_im += A0_im;
1456  o10_re += B0_re;
1457  o10_im += B0_im;
1458 
1459  o01_re += A1_re;
1460  o01_im += A1_im;
1461  o11_re += B1_re;
1462  o11_im += B1_im;
1463 
1464  o02_re += A2_re;
1465  o02_im += A2_im;
1466  o12_re += B2_re;
1467  o12_im += B2_im;
1468  }
1469 }
1470 
1471 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1472 {
1473  // Projector P3-
1474  // 0 0 0 0
1475  // 0 0 0 0
1476  // 0 0 2 0
1477  // 0 0 0 2
1478 
1479  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,3,Y);
1480  const int sp_idx = face_idx + param.ghostOffset[3];
1481 #if (DD_PREC==2) // half precision
1482  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1483 #endif
1484 
1485 
1486  const int ga_idx = Vh+(face_idx % ghostFace[3]);
1487 
1488  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1489  {
1496 
1497  {
1498 
1499  const int sp_stride_pad = param.Ls*ghostFace[3];
1500  const int t_proj_scale = TPROJSCALE;
1501 
1502  // read half spinor from device memory
1503  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1504 
1505  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1506  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1507  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1508  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1509  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1510  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1511 
1512  }
1513 
1514  // identity gauge matrix
1521 
1522  o20_re += A0_re;
1523  o20_im += A0_im;
1524  o30_re += B0_re;
1525  o30_im += B0_im;
1526 
1527  o21_re += A1_re;
1528  o21_im += A1_im;
1529  o31_re += B1_re;
1530  o31_im += B1_im;
1531 
1532  o22_re += A2_re;
1533  o22_im += A2_im;
1534  o32_re += B2_re;
1535  o32_im += B2_im;
1536  } else {
1537  // read gauge matrix from device memory
1538  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); }
1539  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, ga_stride); }
1540 
1547 
1548  {
1549 
1550  const int sp_stride_pad = param.Ls*ghostFace[3];
1551  const int t_proj_scale = TPROJSCALE;
1552 
1553  // read half spinor from device memory
1554  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1555 
1556  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1557  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1558  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1559  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1560  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1561  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1562 
1563  }
1564 
1565  // reconstruct gauge matrix
1567 
1568  // multiply row 0
1569  spinorFloat A0_re = 0;
1570  A0_re += gT00_re * a0_re;
1571  A0_re -= gT00_im * a0_im;
1572  A0_re += gT01_re * a1_re;
1573  A0_re -= gT01_im * a1_im;
1574  A0_re += gT02_re * a2_re;
1575  A0_re -= gT02_im * a2_im;
1576  spinorFloat A0_im = 0;
1577  A0_im += gT00_re * a0_im;
1578  A0_im += gT00_im * a0_re;
1579  A0_im += gT01_re * a1_im;
1580  A0_im += gT01_im * a1_re;
1581  A0_im += gT02_re * a2_im;
1582  A0_im += gT02_im * a2_re;
1583  spinorFloat B0_re = 0;
1584  B0_re += gT00_re * b0_re;
1585  B0_re -= gT00_im * b0_im;
1586  B0_re += gT01_re * b1_re;
1587  B0_re -= gT01_im * b1_im;
1588  B0_re += gT02_re * b2_re;
1589  B0_re -= gT02_im * b2_im;
1590  spinorFloat B0_im = 0;
1591  B0_im += gT00_re * b0_im;
1592  B0_im += gT00_im * b0_re;
1593  B0_im += gT01_re * b1_im;
1594  B0_im += gT01_im * b1_re;
1595  B0_im += gT02_re * b2_im;
1596  B0_im += gT02_im * b2_re;
1597 
1598  // multiply row 1
1599  spinorFloat A1_re = 0;
1600  A1_re += gT10_re * a0_re;
1601  A1_re -= gT10_im * a0_im;
1602  A1_re += gT11_re * a1_re;
1603  A1_re -= gT11_im * a1_im;
1604  A1_re += gT12_re * a2_re;
1605  A1_re -= gT12_im * a2_im;
1606  spinorFloat A1_im = 0;
1607  A1_im += gT10_re * a0_im;
1608  A1_im += gT10_im * a0_re;
1609  A1_im += gT11_re * a1_im;
1610  A1_im += gT11_im * a1_re;
1611  A1_im += gT12_re * a2_im;
1612  A1_im += gT12_im * a2_re;
1613  spinorFloat B1_re = 0;
1614  B1_re += gT10_re * b0_re;
1615  B1_re -= gT10_im * b0_im;
1616  B1_re += gT11_re * b1_re;
1617  B1_re -= gT11_im * b1_im;
1618  B1_re += gT12_re * b2_re;
1619  B1_re -= gT12_im * b2_im;
1620  spinorFloat B1_im = 0;
1621  B1_im += gT10_re * b0_im;
1622  B1_im += gT10_im * b0_re;
1623  B1_im += gT11_re * b1_im;
1624  B1_im += gT11_im * b1_re;
1625  B1_im += gT12_re * b2_im;
1626  B1_im += gT12_im * b2_re;
1627 
1628  // multiply row 2
1629  spinorFloat A2_re = 0;
1630  A2_re += gT20_re * a0_re;
1631  A2_re -= gT20_im * a0_im;
1632  A2_re += gT21_re * a1_re;
1633  A2_re -= gT21_im * a1_im;
1634  A2_re += gT22_re * a2_re;
1635  A2_re -= gT22_im * a2_im;
1636  spinorFloat A2_im = 0;
1637  A2_im += gT20_re * a0_im;
1638  A2_im += gT20_im * a0_re;
1639  A2_im += gT21_re * a1_im;
1640  A2_im += gT21_im * a1_re;
1641  A2_im += gT22_re * a2_im;
1642  A2_im += gT22_im * a2_re;
1643  spinorFloat B2_re = 0;
1644  B2_re += gT20_re * b0_re;
1645  B2_re -= gT20_im * b0_im;
1646  B2_re += gT21_re * b1_re;
1647  B2_re -= gT21_im * b1_im;
1648  B2_re += gT22_re * b2_re;
1649  B2_re -= gT22_im * b2_im;
1650  spinorFloat B2_im = 0;
1651  B2_im += gT20_re * b0_im;
1652  B2_im += gT20_im * b0_re;
1653  B2_im += gT21_re * b1_im;
1654  B2_im += gT21_im * b1_re;
1655  B2_im += gT22_re * b2_im;
1656  B2_im += gT22_im * b2_re;
1657 
1658  o20_re += A0_re;
1659  o20_im += A0_im;
1660  o30_re += B0_re;
1661  o30_im += B0_im;
1662 
1663  o21_re += A1_re;
1664  o21_im += A1_im;
1665  o31_re += B1_re;
1666  o31_im += B1_im;
1667 
1668  o22_re += A2_re;
1669  o22_im += A2_im;
1670  o32_re += B2_re;
1671  o32_im += B2_im;
1672  }
1673 }
1674 
1675 {
1676 
1677 #ifdef DSLASH_XPAY
1678 #ifdef SPINOR_DOUBLE
1679  o00_re = a*o00_re;
1680  o00_im = a*o00_im;
1681  o01_re = a*o01_re;
1682  o01_im = a*o01_im;
1683  o02_re = a*o02_re;
1684  o02_im = a*o02_im;
1685  o10_re = a*o10_re;
1686  o10_im = a*o10_im;
1687  o11_re = a*o11_re;
1688  o11_im = a*o11_im;
1689  o12_re = a*o12_re;
1690  o12_im = a*o12_im;
1691  o20_re = a*o20_re;
1692  o20_im = a*o20_im;
1693  o21_re = a*o21_re;
1694  o21_im = a*o21_im;
1695  o22_re = a*o22_re;
1696  o22_im = a*o22_im;
1697  o30_re = a*o30_re;
1698  o30_im = a*o30_im;
1699  o31_re = a*o31_re;
1700  o31_im = a*o31_im;
1701  o32_re = a*o32_re;
1702  o32_im = a*o32_im;
1703 #else
1704  o00_re = a*o00_re;
1705  o00_im = a*o00_im;
1706  o01_re = a*o01_re;
1707  o01_im = a*o01_im;
1708  o02_re = a*o02_re;
1709  o02_im = a*o02_im;
1710  o10_re = a*o10_re;
1711  o10_im = a*o10_im;
1712  o11_re = a*o11_re;
1713  o11_im = a*o11_im;
1714  o12_re = a*o12_re;
1715  o12_im = a*o12_im;
1716  o20_re = a*o20_re;
1717  o20_im = a*o20_im;
1718  o21_re = a*o21_re;
1719  o21_im = a*o21_im;
1720  o22_re = a*o22_re;
1721  o22_im = a*o22_im;
1722  o30_re = a*o30_re;
1723  o30_im = a*o30_im;
1724  o31_re = a*o31_re;
1725  o31_im = a*o31_im;
1726  o32_re = a*o32_re;
1727  o32_im = a*o32_im;
1728 #endif // SPINOR_DOUBLE
1729 
1730 #endif // DSLASH_XPAY
1731 }
1732 
1733 // write spinor field back to device memory
1734 WRITE_SPINOR(param.sp_stride);
1735 
1736 // undefine to prevent warning when precision is changed
1737 #undef spinorFloat
1738 #undef SHARED_STRIDE
1739 
1740 #undef g00_re
1741 #undef g00_im
1742 #undef g01_re
1743 #undef g01_im
1744 #undef g02_re
1745 #undef g02_im
1746 #undef g10_re
1747 #undef g10_im
1748 #undef g11_re
1749 #undef g11_im
1750 #undef g12_re
1751 #undef g12_im
1752 #undef g20_re
1753 #undef g20_im
1754 #undef g21_re
1755 #undef g21_im
1756 #undef g22_re
1757 #undef g22_im
1758 
1759 #undef i00_re
1760 #undef i00_im
1761 #undef i01_re
1762 #undef i01_im
1763 #undef i02_re
1764 #undef i02_im
1765 #undef i10_re
1766 #undef i10_im
1767 #undef i11_re
1768 #undef i11_im
1769 #undef i12_re
1770 #undef i12_im
1771 #undef i20_re
1772 #undef i20_im
1773 #undef i21_re
1774 #undef i21_im
1775 #undef i22_re
1776 #undef i22_im
1777 #undef i30_re
1778 #undef i30_im
1779 #undef i31_re
1780 #undef i31_im
1781 #undef i32_re
1782 #undef i32_im
1783 
1784 
1785 
1786 #undef VOLATILE
1787 
1788 #endif // MULTI_GPU
int s_parity
__constant__ int Vh
float4 G2
__constant__ int X1h
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define o31_im
Definition: gamma5.h:293
int xs
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
float4 G1
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
float4 G0
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ int X2m1
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
float4 G4
#define SPINOR_HOP
Definition: covDev.h:158
int boundaryCrossing
float4 G3
#define ASSN_GAUGE_MATRIX
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101