QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_fused_exterior_dslash4_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
6 
7 
8 #if (CUDA_VERSION >= 4010)
9 #define VOLATILE
10 #else
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define m5 m5_d
41 #define mdwf_b5 mdwf_b5_d
42 #define mdwf_c5 mdwf_c5_d
43 #else
44 #define spinorFloat float
45 #define i00_re I0.x
46 #define i00_im I0.y
47 #define i01_re I0.z
48 #define i01_im I0.w
49 #define i02_re I1.x
50 #define i02_im I1.y
51 #define i10_re I1.z
52 #define i10_im I1.w
53 #define i11_re I2.x
54 #define i11_im I2.y
55 #define i12_re I2.z
56 #define i12_im I2.w
57 #define i20_re I3.x
58 #define i20_im I3.y
59 #define i21_re I3.z
60 #define i21_im I3.w
61 #define i22_re I4.x
62 #define i22_im I4.y
63 #define i30_re I4.z
64 #define i30_im I4.w
65 #define i31_re I5.x
66 #define i31_im I5.y
67 #define i32_re I5.z
68 #define i32_im I5.w
69 #define m5 m5_f
70 #define mdwf_b5 mdwf_b5_f
71 #define mdwf_c5 mdwf_c5_f
72 #endif // SPINOR_DOUBLE
73 
74 // gauge link
75 #ifdef GAUGE_FLOAT2
76 #define g00_re G0.x
77 #define g00_im G0.y
78 #define g01_re G1.x
79 #define g01_im G1.y
80 #define g02_re G2.x
81 #define g02_im G2.y
82 #define g10_re G3.x
83 #define g10_im G3.y
84 #define g11_re G4.x
85 #define g11_im G4.y
86 #define g12_re G5.x
87 #define g12_im G5.y
88 #define g20_re G6.x
89 #define g20_im G6.y
90 #define g21_re G7.x
91 #define g21_im G7.y
92 #define g22_re G8.x
93 #define g22_im G8.y
94 
95 #else
96 #define g00_re G0.x
97 #define g00_im G0.y
98 #define g01_re G0.z
99 #define g01_im G0.w
100 #define g02_re G1.x
101 #define g02_im G1.y
102 #define g10_re G1.z
103 #define g10_im G1.w
104 #define g11_re G2.x
105 #define g11_im G2.y
106 #define g12_re G2.z
107 #define g12_im G2.w
108 #define g20_re G3.x
109 #define g20_im G3.y
110 #define g21_re G3.z
111 #define g21_im G3.w
112 #define g22_re G4.x
113 #define g22_im G4.y
114 
115 #endif // GAUGE_DOUBLE
116 
117 // conjugated gauge link
118 #define gT00_re (+g00_re)
119 #define gT00_im (-g00_im)
120 #define gT01_re (+g10_re)
121 #define gT01_im (-g10_im)
122 #define gT02_re (+g20_re)
123 #define gT02_im (-g20_im)
124 #define gT10_re (+g01_re)
125 #define gT10_im (-g01_im)
126 #define gT11_re (+g11_re)
127 #define gT11_im (-g11_im)
128 #define gT12_re (+g21_re)
129 #define gT12_im (-g21_im)
130 #define gT20_re (+g02_re)
131 #define gT20_im (-g02_im)
132 #define gT21_re (+g12_re)
133 #define gT21_im (-g12_im)
134 #define gT22_re (+g22_re)
135 #define gT22_im (-g22_im)
136 
137 // output spinor
162 
163 #ifdef SPINOR_DOUBLE
164 #if (__COMPUTE_CAPABILITY__ >= 200)
165 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
166 #else
167 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
168 #endif
169 #else
170 #if (__COMPUTE_CAPABILITY__ >= 200)
171 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
172 #else
173 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
174 #endif
175 #endif
176 
177 #include "read_gauge.h"
178 #include "io_spinor.h"
179 
180 #if (DD_PREC==2) // half precision
181 int sp_norm_idx;
182 #endif // half precision
183 
184 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
185 if (sid >= param.threads*param.Ls) return;
186 
187 int dim;
188 int face_num;
189 int face_idx;
190 int Y[4] = {X1,X2,X3,X4};
191 int faceVolume[4];
192 faceVolume[0] = (X2*X3*X4)>>1;
193 faceVolume[1] = (X1*X3*X4)>>1;
194 faceVolume[2] = (X1*X2*X4)>>1;
195 faceVolume[3] = (X1*X2*X3)>>1;
196 
197 
198 
199 
200 int boundaryCrossing;
201 
202 int X, x1, x2, x3, x4, xs;
203 
204 { // exterior kernel
205 
206 dim = dimFromDWFaceIndex(sid, param); // sid is also modified
207 
208 //const int face_volume = (param.threads*param.Ls >> 1); // volume of one face
209 const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.Ls >> 1);
210 
211 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
212 face_idx = sid - face_num*face_volume; // index into the respective face
213 
214 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
215 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
216 //sp_idx = face_idx + param.ghostOffset[dim];
217 
218 
219 const int dims[] = {X1, X2, X3, X4};
220 coordsFromDW4DFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
221 
222 {
223  bool active = false;
224  for(int dir=0; dir<4; ++dir){
225  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
226  }
227  if(!active) return;
228 }
229 
230 
231 
232 
233 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h);
234 
235 X = 2*sid + (boundaryCrossing + param.parity) % 2;
236 
237 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
238  o00_re = i00_re; o00_im = i00_im;
239  o01_re = i01_re; o01_im = i01_im;
240  o02_re = i02_re; o02_im = i02_im;
241  o10_re = i10_re; o10_im = i10_im;
242  o11_re = i11_re; o11_im = i11_im;
243  o12_re = i12_re; o12_im = i12_im;
244  o20_re = i20_re; o20_im = i20_im;
245  o21_re = i21_re; o21_im = i21_im;
246  o22_re = i22_re; o22_im = i22_im;
247  o30_re = i30_re; o30_im = i30_im;
248  o31_re = i31_re; o31_im = i31_im;
249  o32_re = i32_re; o32_im = i32_im;
250 }
251 
252 // declare G## here and use ASSN below instead of READ
253 #ifdef GAUGE_FLOAT2
254 #if (DD_PREC==0) //temporal hack
255 double2 G0;
256 double2 G1;
257 double2 G2;
258 double2 G3;
259 double2 G4;
260 double2 G5;
261 double2 G6;
262 double2 G7;
263 double2 G8;
264 #else
265 float2 G0;
266 float2 G1;
267 float2 G2;
268 float2 G3;
269 float2 G4;
270 float2 G5;
271 float2 G6;
272 float2 G7;
273 float2 G8;
274 #endif
275 #else
276 float4 G0;
277 float4 G1;
278 float4 G2;
279 float4 G3;
280 float4 G4;
281 #endif
282 
283 
284 
285 if (isActive(dim,0,+1,x1,x2,x3,x4,param.commDim,param.X) && x1==X1m1 )
286 {
287  // Projector P0-
288  // 1 0 0 -i
289  // 0 1 -i 0
290  // 0 i 1 0
291  // i 0 0 1
292 
293  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,0,Y);
294  const int sp_idx = face_idx + param.ghostOffset[0];
295 #if (DD_PREC==2) // half precision
296  sp_norm_idx = face_idx + param.Ls*ghostFace[0] + param.ghostNormOffset[0];
297 #endif
298 
299 
300  const int ga_idx = sid % Vh;
301 
302  // read gauge matrix from device memory
303  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
304 
311 
312 
313 
314  const int sp_stride_pad = param.Ls*ghostFace[0];
315 
316  // read half spinor from device memory
317  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
318 
319  a0_re = i00_re; a0_im = i00_im;
320  a1_re = i01_re; a1_im = i01_im;
321  a2_re = i02_re; a2_im = i02_im;
322  b0_re = i10_re; b0_im = i10_im;
323  b1_re = i11_re; b1_im = i11_im;
324  b2_re = i12_re; b2_im = i12_im;
325 
326 
327  // reconstruct gauge matrix
329 
330  // multiply row 0
331  spinorFloat A0_re = 0;
332  A0_re += g00_re * a0_re;
333  A0_re -= g00_im * a0_im;
334  A0_re += g01_re * a1_re;
335  A0_re -= g01_im * a1_im;
336  A0_re += g02_re * a2_re;
337  A0_re -= g02_im * a2_im;
338  spinorFloat A0_im = 0;
339  A0_im += g00_re * a0_im;
340  A0_im += g00_im * a0_re;
341  A0_im += g01_re * a1_im;
342  A0_im += g01_im * a1_re;
343  A0_im += g02_re * a2_im;
344  A0_im += g02_im * a2_re;
345  spinorFloat B0_re = 0;
346  B0_re += g00_re * b0_re;
347  B0_re -= g00_im * b0_im;
348  B0_re += g01_re * b1_re;
349  B0_re -= g01_im * b1_im;
350  B0_re += g02_re * b2_re;
351  B0_re -= g02_im * b2_im;
352  spinorFloat B0_im = 0;
353  B0_im += g00_re * b0_im;
354  B0_im += g00_im * b0_re;
355  B0_im += g01_re * b1_im;
356  B0_im += g01_im * b1_re;
357  B0_im += g02_re * b2_im;
358  B0_im += g02_im * b2_re;
359 
360  // multiply row 1
361  spinorFloat A1_re = 0;
362  A1_re += g10_re * a0_re;
363  A1_re -= g10_im * a0_im;
364  A1_re += g11_re * a1_re;
365  A1_re -= g11_im * a1_im;
366  A1_re += g12_re * a2_re;
367  A1_re -= g12_im * a2_im;
368  spinorFloat A1_im = 0;
369  A1_im += g10_re * a0_im;
370  A1_im += g10_im * a0_re;
371  A1_im += g11_re * a1_im;
372  A1_im += g11_im * a1_re;
373  A1_im += g12_re * a2_im;
374  A1_im += g12_im * a2_re;
375  spinorFloat B1_re = 0;
376  B1_re += g10_re * b0_re;
377  B1_re -= g10_im * b0_im;
378  B1_re += g11_re * b1_re;
379  B1_re -= g11_im * b1_im;
380  B1_re += g12_re * b2_re;
381  B1_re -= g12_im * b2_im;
382  spinorFloat B1_im = 0;
383  B1_im += g10_re * b0_im;
384  B1_im += g10_im * b0_re;
385  B1_im += g11_re * b1_im;
386  B1_im += g11_im * b1_re;
387  B1_im += g12_re * b2_im;
388  B1_im += g12_im * b2_re;
389 
390  // multiply row 2
391  spinorFloat A2_re = 0;
392  A2_re += g20_re * a0_re;
393  A2_re -= g20_im * a0_im;
394  A2_re += g21_re * a1_re;
395  A2_re -= g21_im * a1_im;
396  A2_re += g22_re * a2_re;
397  A2_re -= g22_im * a2_im;
398  spinorFloat A2_im = 0;
399  A2_im += g20_re * a0_im;
400  A2_im += g20_im * a0_re;
401  A2_im += g21_re * a1_im;
402  A2_im += g21_im * a1_re;
403  A2_im += g22_re * a2_im;
404  A2_im += g22_im * a2_re;
405  spinorFloat B2_re = 0;
406  B2_re += g20_re * b0_re;
407  B2_re -= g20_im * b0_im;
408  B2_re += g21_re * b1_re;
409  B2_re -= g21_im * b1_im;
410  B2_re += g22_re * b2_re;
411  B2_re -= g22_im * b2_im;
412  spinorFloat B2_im = 0;
413  B2_im += g20_re * b0_im;
414  B2_im += g20_im * b0_re;
415  B2_im += g21_re * b1_im;
416  B2_im += g21_im * b1_re;
417  B2_im += g22_re * b2_im;
418  B2_im += g22_im * b2_re;
419 
420  o00_re += A0_re;
421  o00_im += A0_im;
422  o10_re += B0_re;
423  o10_im += B0_im;
424  o20_re -= B0_im;
425  o20_im += B0_re;
426  o30_re -= A0_im;
427  o30_im += A0_re;
428 
429  o01_re += A1_re;
430  o01_im += A1_im;
431  o11_re += B1_re;
432  o11_im += B1_im;
433  o21_re -= B1_im;
434  o21_im += B1_re;
435  o31_re -= A1_im;
436  o31_im += A1_re;
437 
438  o02_re += A2_re;
439  o02_im += A2_im;
440  o12_re += B2_re;
441  o12_im += B2_im;
442  o22_re -= B2_im;
443  o22_im += B2_re;
444  o32_re -= A2_im;
445  o32_im += A2_re;
446 }
447 
448 if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
449 {
450  // Projector P0+
451  // 1 0 0 i
452  // 0 1 i 0
453  // 0 -i 1 0
454  // -i 0 0 1
455 
456  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,0,Y);
457  const int sp_idx = face_idx + param.ghostOffset[0];
458 #if (DD_PREC==2) // half precision
459  sp_norm_idx = face_idx + param.ghostNormOffset[0];
460 #endif
461 
462 
463  const int ga_idx = Vh+(face_idx % ghostFace[0]);
464 
465  // read gauge matrix from device memory
466  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
467 
474 
475 
476 
477  const int sp_stride_pad = param.Ls*ghostFace[0];
478 
479  // read half spinor from device memory
480  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
481 
482  a0_re = i00_re; a0_im = i00_im;
483  a1_re = i01_re; a1_im = i01_im;
484  a2_re = i02_re; a2_im = i02_im;
485  b0_re = i10_re; b0_im = i10_im;
486  b1_re = i11_re; b1_im = i11_im;
487  b2_re = i12_re; b2_im = i12_im;
488 
489 
490  // reconstruct gauge matrix
492 
493  // multiply row 0
494  spinorFloat A0_re = 0;
495  A0_re += gT00_re * a0_re;
496  A0_re -= gT00_im * a0_im;
497  A0_re += gT01_re * a1_re;
498  A0_re -= gT01_im * a1_im;
499  A0_re += gT02_re * a2_re;
500  A0_re -= gT02_im * a2_im;
501  spinorFloat A0_im = 0;
502  A0_im += gT00_re * a0_im;
503  A0_im += gT00_im * a0_re;
504  A0_im += gT01_re * a1_im;
505  A0_im += gT01_im * a1_re;
506  A0_im += gT02_re * a2_im;
507  A0_im += gT02_im * a2_re;
508  spinorFloat B0_re = 0;
509  B0_re += gT00_re * b0_re;
510  B0_re -= gT00_im * b0_im;
511  B0_re += gT01_re * b1_re;
512  B0_re -= gT01_im * b1_im;
513  B0_re += gT02_re * b2_re;
514  B0_re -= gT02_im * b2_im;
515  spinorFloat B0_im = 0;
516  B0_im += gT00_re * b0_im;
517  B0_im += gT00_im * b0_re;
518  B0_im += gT01_re * b1_im;
519  B0_im += gT01_im * b1_re;
520  B0_im += gT02_re * b2_im;
521  B0_im += gT02_im * b2_re;
522 
523  // multiply row 1
524  spinorFloat A1_re = 0;
525  A1_re += gT10_re * a0_re;
526  A1_re -= gT10_im * a0_im;
527  A1_re += gT11_re * a1_re;
528  A1_re -= gT11_im * a1_im;
529  A1_re += gT12_re * a2_re;
530  A1_re -= gT12_im * a2_im;
531  spinorFloat A1_im = 0;
532  A1_im += gT10_re * a0_im;
533  A1_im += gT10_im * a0_re;
534  A1_im += gT11_re * a1_im;
535  A1_im += gT11_im * a1_re;
536  A1_im += gT12_re * a2_im;
537  A1_im += gT12_im * a2_re;
538  spinorFloat B1_re = 0;
539  B1_re += gT10_re * b0_re;
540  B1_re -= gT10_im * b0_im;
541  B1_re += gT11_re * b1_re;
542  B1_re -= gT11_im * b1_im;
543  B1_re += gT12_re * b2_re;
544  B1_re -= gT12_im * b2_im;
545  spinorFloat B1_im = 0;
546  B1_im += gT10_re * b0_im;
547  B1_im += gT10_im * b0_re;
548  B1_im += gT11_re * b1_im;
549  B1_im += gT11_im * b1_re;
550  B1_im += gT12_re * b2_im;
551  B1_im += gT12_im * b2_re;
552 
553  // multiply row 2
554  spinorFloat A2_re = 0;
555  A2_re += gT20_re * a0_re;
556  A2_re -= gT20_im * a0_im;
557  A2_re += gT21_re * a1_re;
558  A2_re -= gT21_im * a1_im;
559  A2_re += gT22_re * a2_re;
560  A2_re -= gT22_im * a2_im;
561  spinorFloat A2_im = 0;
562  A2_im += gT20_re * a0_im;
563  A2_im += gT20_im * a0_re;
564  A2_im += gT21_re * a1_im;
565  A2_im += gT21_im * a1_re;
566  A2_im += gT22_re * a2_im;
567  A2_im += gT22_im * a2_re;
568  spinorFloat B2_re = 0;
569  B2_re += gT20_re * b0_re;
570  B2_re -= gT20_im * b0_im;
571  B2_re += gT21_re * b1_re;
572  B2_re -= gT21_im * b1_im;
573  B2_re += gT22_re * b2_re;
574  B2_re -= gT22_im * b2_im;
575  spinorFloat B2_im = 0;
576  B2_im += gT20_re * b0_im;
577  B2_im += gT20_im * b0_re;
578  B2_im += gT21_re * b1_im;
579  B2_im += gT21_im * b1_re;
580  B2_im += gT22_re * b2_im;
581  B2_im += gT22_im * b2_re;
582 
583  o00_re += A0_re;
584  o00_im += A0_im;
585  o10_re += B0_re;
586  o10_im += B0_im;
587  o20_re += B0_im;
588  o20_im -= B0_re;
589  o30_re += A0_im;
590  o30_im -= A0_re;
591 
592  o01_re += A1_re;
593  o01_im += A1_im;
594  o11_re += B1_re;
595  o11_im += B1_im;
596  o21_re += B1_im;
597  o21_im -= B1_re;
598  o31_re += A1_im;
599  o31_im -= A1_re;
600 
601  o02_re += A2_re;
602  o02_im += A2_im;
603  o12_re += B2_re;
604  o12_im += B2_im;
605  o22_re += B2_im;
606  o22_im -= B2_re;
607  o32_re += A2_im;
608  o32_im -= A2_re;
609 }
610 
611 if (isActive(dim,1,+1,x1,x2,x3,x4,param.commDim,param.X) && x2==X2m1 )
612 {
613  // Projector P1-
614  // 1 0 0 -1
615  // 0 1 1 0
616  // 0 1 1 0
617  // -1 0 0 1
618 
619  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,1,Y);
620  const int sp_idx = face_idx + param.ghostOffset[1];
621 #if (DD_PREC==2) // half precision
622  sp_norm_idx = face_idx + param.Ls*ghostFace[1] + param.ghostNormOffset[1];
623 #endif
624 
625 
626  const int ga_idx = sid % Vh;
627 
628  // read gauge matrix from device memory
629  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
630 
637 
638 
639 
640  const int sp_stride_pad = param.Ls*ghostFace[1];
641 
642  // read half spinor from device memory
643  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
644 
645  a0_re = i00_re; a0_im = i00_im;
646  a1_re = i01_re; a1_im = i01_im;
647  a2_re = i02_re; a2_im = i02_im;
648  b0_re = i10_re; b0_im = i10_im;
649  b1_re = i11_re; b1_im = i11_im;
650  b2_re = i12_re; b2_im = i12_im;
651 
652 
653  // reconstruct gauge matrix
655 
656  // multiply row 0
657  spinorFloat A0_re = 0;
658  A0_re += g00_re * a0_re;
659  A0_re -= g00_im * a0_im;
660  A0_re += g01_re * a1_re;
661  A0_re -= g01_im * a1_im;
662  A0_re += g02_re * a2_re;
663  A0_re -= g02_im * a2_im;
664  spinorFloat A0_im = 0;
665  A0_im += g00_re * a0_im;
666  A0_im += g00_im * a0_re;
667  A0_im += g01_re * a1_im;
668  A0_im += g01_im * a1_re;
669  A0_im += g02_re * a2_im;
670  A0_im += g02_im * a2_re;
671  spinorFloat B0_re = 0;
672  B0_re += g00_re * b0_re;
673  B0_re -= g00_im * b0_im;
674  B0_re += g01_re * b1_re;
675  B0_re -= g01_im * b1_im;
676  B0_re += g02_re * b2_re;
677  B0_re -= g02_im * b2_im;
678  spinorFloat B0_im = 0;
679  B0_im += g00_re * b0_im;
680  B0_im += g00_im * b0_re;
681  B0_im += g01_re * b1_im;
682  B0_im += g01_im * b1_re;
683  B0_im += g02_re * b2_im;
684  B0_im += g02_im * b2_re;
685 
686  // multiply row 1
687  spinorFloat A1_re = 0;
688  A1_re += g10_re * a0_re;
689  A1_re -= g10_im * a0_im;
690  A1_re += g11_re * a1_re;
691  A1_re -= g11_im * a1_im;
692  A1_re += g12_re * a2_re;
693  A1_re -= g12_im * a2_im;
694  spinorFloat A1_im = 0;
695  A1_im += g10_re * a0_im;
696  A1_im += g10_im * a0_re;
697  A1_im += g11_re * a1_im;
698  A1_im += g11_im * a1_re;
699  A1_im += g12_re * a2_im;
700  A1_im += g12_im * a2_re;
701  spinorFloat B1_re = 0;
702  B1_re += g10_re * b0_re;
703  B1_re -= g10_im * b0_im;
704  B1_re += g11_re * b1_re;
705  B1_re -= g11_im * b1_im;
706  B1_re += g12_re * b2_re;
707  B1_re -= g12_im * b2_im;
708  spinorFloat B1_im = 0;
709  B1_im += g10_re * b0_im;
710  B1_im += g10_im * b0_re;
711  B1_im += g11_re * b1_im;
712  B1_im += g11_im * b1_re;
713  B1_im += g12_re * b2_im;
714  B1_im += g12_im * b2_re;
715 
716  // multiply row 2
717  spinorFloat A2_re = 0;
718  A2_re += g20_re * a0_re;
719  A2_re -= g20_im * a0_im;
720  A2_re += g21_re * a1_re;
721  A2_re -= g21_im * a1_im;
722  A2_re += g22_re * a2_re;
723  A2_re -= g22_im * a2_im;
724  spinorFloat A2_im = 0;
725  A2_im += g20_re * a0_im;
726  A2_im += g20_im * a0_re;
727  A2_im += g21_re * a1_im;
728  A2_im += g21_im * a1_re;
729  A2_im += g22_re * a2_im;
730  A2_im += g22_im * a2_re;
731  spinorFloat B2_re = 0;
732  B2_re += g20_re * b0_re;
733  B2_re -= g20_im * b0_im;
734  B2_re += g21_re * b1_re;
735  B2_re -= g21_im * b1_im;
736  B2_re += g22_re * b2_re;
737  B2_re -= g22_im * b2_im;
738  spinorFloat B2_im = 0;
739  B2_im += g20_re * b0_im;
740  B2_im += g20_im * b0_re;
741  B2_im += g21_re * b1_im;
742  B2_im += g21_im * b1_re;
743  B2_im += g22_re * b2_im;
744  B2_im += g22_im * b2_re;
745 
746  o00_re += A0_re;
747  o00_im += A0_im;
748  o10_re += B0_re;
749  o10_im += B0_im;
750  o20_re += B0_re;
751  o20_im += B0_im;
752  o30_re -= A0_re;
753  o30_im -= A0_im;
754 
755  o01_re += A1_re;
756  o01_im += A1_im;
757  o11_re += B1_re;
758  o11_im += B1_im;
759  o21_re += B1_re;
760  o21_im += B1_im;
761  o31_re -= A1_re;
762  o31_im -= A1_im;
763 
764  o02_re += A2_re;
765  o02_im += A2_im;
766  o12_re += B2_re;
767  o12_im += B2_im;
768  o22_re += B2_re;
769  o22_im += B2_im;
770  o32_re -= A2_re;
771  o32_im -= A2_im;
772 }
773 
774 if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
775 {
776  // Projector P1+
777  // 1 0 0 1
778  // 0 1 -1 0
779  // 0 -1 1 0
780  // 1 0 0 1
781 
782  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,1,Y);
783  const int sp_idx = face_idx + param.ghostOffset[1];
784 #if (DD_PREC==2) // half precision
785  sp_norm_idx = face_idx + param.ghostNormOffset[1];
786 #endif
787 
788 
789  const int ga_idx = Vh+(face_idx % ghostFace[1]);
790 
791  // read gauge matrix from device memory
792  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
793 
800 
801 
802 
803  const int sp_stride_pad = param.Ls*ghostFace[1];
804 
805  // read half spinor from device memory
806  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
807 
808  a0_re = i00_re; a0_im = i00_im;
809  a1_re = i01_re; a1_im = i01_im;
810  a2_re = i02_re; a2_im = i02_im;
811  b0_re = i10_re; b0_im = i10_im;
812  b1_re = i11_re; b1_im = i11_im;
813  b2_re = i12_re; b2_im = i12_im;
814 
815 
816  // reconstruct gauge matrix
818 
819  // multiply row 0
820  spinorFloat A0_re = 0;
821  A0_re += gT00_re * a0_re;
822  A0_re -= gT00_im * a0_im;
823  A0_re += gT01_re * a1_re;
824  A0_re -= gT01_im * a1_im;
825  A0_re += gT02_re * a2_re;
826  A0_re -= gT02_im * a2_im;
827  spinorFloat A0_im = 0;
828  A0_im += gT00_re * a0_im;
829  A0_im += gT00_im * a0_re;
830  A0_im += gT01_re * a1_im;
831  A0_im += gT01_im * a1_re;
832  A0_im += gT02_re * a2_im;
833  A0_im += gT02_im * a2_re;
834  spinorFloat B0_re = 0;
835  B0_re += gT00_re * b0_re;
836  B0_re -= gT00_im * b0_im;
837  B0_re += gT01_re * b1_re;
838  B0_re -= gT01_im * b1_im;
839  B0_re += gT02_re * b2_re;
840  B0_re -= gT02_im * b2_im;
841  spinorFloat B0_im = 0;
842  B0_im += gT00_re * b0_im;
843  B0_im += gT00_im * b0_re;
844  B0_im += gT01_re * b1_im;
845  B0_im += gT01_im * b1_re;
846  B0_im += gT02_re * b2_im;
847  B0_im += gT02_im * b2_re;
848 
849  // multiply row 1
850  spinorFloat A1_re = 0;
851  A1_re += gT10_re * a0_re;
852  A1_re -= gT10_im * a0_im;
853  A1_re += gT11_re * a1_re;
854  A1_re -= gT11_im * a1_im;
855  A1_re += gT12_re * a2_re;
856  A1_re -= gT12_im * a2_im;
857  spinorFloat A1_im = 0;
858  A1_im += gT10_re * a0_im;
859  A1_im += gT10_im * a0_re;
860  A1_im += gT11_re * a1_im;
861  A1_im += gT11_im * a1_re;
862  A1_im += gT12_re * a2_im;
863  A1_im += gT12_im * a2_re;
864  spinorFloat B1_re = 0;
865  B1_re += gT10_re * b0_re;
866  B1_re -= gT10_im * b0_im;
867  B1_re += gT11_re * b1_re;
868  B1_re -= gT11_im * b1_im;
869  B1_re += gT12_re * b2_re;
870  B1_re -= gT12_im * b2_im;
871  spinorFloat B1_im = 0;
872  B1_im += gT10_re * b0_im;
873  B1_im += gT10_im * b0_re;
874  B1_im += gT11_re * b1_im;
875  B1_im += gT11_im * b1_re;
876  B1_im += gT12_re * b2_im;
877  B1_im += gT12_im * b2_re;
878 
879  // multiply row 2
880  spinorFloat A2_re = 0;
881  A2_re += gT20_re * a0_re;
882  A2_re -= gT20_im * a0_im;
883  A2_re += gT21_re * a1_re;
884  A2_re -= gT21_im * a1_im;
885  A2_re += gT22_re * a2_re;
886  A2_re -= gT22_im * a2_im;
887  spinorFloat A2_im = 0;
888  A2_im += gT20_re * a0_im;
889  A2_im += gT20_im * a0_re;
890  A2_im += gT21_re * a1_im;
891  A2_im += gT21_im * a1_re;
892  A2_im += gT22_re * a2_im;
893  A2_im += gT22_im * a2_re;
894  spinorFloat B2_re = 0;
895  B2_re += gT20_re * b0_re;
896  B2_re -= gT20_im * b0_im;
897  B2_re += gT21_re * b1_re;
898  B2_re -= gT21_im * b1_im;
899  B2_re += gT22_re * b2_re;
900  B2_re -= gT22_im * b2_im;
901  spinorFloat B2_im = 0;
902  B2_im += gT20_re * b0_im;
903  B2_im += gT20_im * b0_re;
904  B2_im += gT21_re * b1_im;
905  B2_im += gT21_im * b1_re;
906  B2_im += gT22_re * b2_im;
907  B2_im += gT22_im * b2_re;
908 
909  o00_re += A0_re;
910  o00_im += A0_im;
911  o10_re += B0_re;
912  o10_im += B0_im;
913  o20_re -= B0_re;
914  o20_im -= B0_im;
915  o30_re += A0_re;
916  o30_im += A0_im;
917 
918  o01_re += A1_re;
919  o01_im += A1_im;
920  o11_re += B1_re;
921  o11_im += B1_im;
922  o21_re -= B1_re;
923  o21_im -= B1_im;
924  o31_re += A1_re;
925  o31_im += A1_im;
926 
927  o02_re += A2_re;
928  o02_im += A2_im;
929  o12_re += B2_re;
930  o12_im += B2_im;
931  o22_re -= B2_re;
932  o22_im -= B2_im;
933  o32_re += A2_re;
934  o32_im += A2_im;
935 }
936 
937 if (isActive(dim,2,+1,x1,x2,x3,x4,param.commDim,param.X) && x3==X3m1 )
938 {
939  // Projector P2-
940  // 1 0 -i 0
941  // 0 1 0 i
942  // i 0 1 0
943  // 0 -i 0 1
944 
945  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,2,Y);
946  const int sp_idx = face_idx + param.ghostOffset[2];
947 #if (DD_PREC==2) // half precision
948  sp_norm_idx = face_idx + param.Ls*ghostFace[2] + param.ghostNormOffset[2];
949 #endif
950 
951 
952  const int ga_idx = sid % Vh;
953 
954  // read gauge matrix from device memory
955  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
956 
963 
964 
965 
966  const int sp_stride_pad = param.Ls*ghostFace[2];
967 
968  // read half spinor from device memory
969  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
970 
971  a0_re = i00_re; a0_im = i00_im;
972  a1_re = i01_re; a1_im = i01_im;
973  a2_re = i02_re; a2_im = i02_im;
974  b0_re = i10_re; b0_im = i10_im;
975  b1_re = i11_re; b1_im = i11_im;
976  b2_re = i12_re; b2_im = i12_im;
977 
978 
979  // reconstruct gauge matrix
981 
982  // multiply row 0
983  spinorFloat A0_re = 0;
984  A0_re += g00_re * a0_re;
985  A0_re -= g00_im * a0_im;
986  A0_re += g01_re * a1_re;
987  A0_re -= g01_im * a1_im;
988  A0_re += g02_re * a2_re;
989  A0_re -= g02_im * a2_im;
990  spinorFloat A0_im = 0;
991  A0_im += g00_re * a0_im;
992  A0_im += g00_im * a0_re;
993  A0_im += g01_re * a1_im;
994  A0_im += g01_im * a1_re;
995  A0_im += g02_re * a2_im;
996  A0_im += g02_im * a2_re;
997  spinorFloat B0_re = 0;
998  B0_re += g00_re * b0_re;
999  B0_re -= g00_im * b0_im;
1000  B0_re += g01_re * b1_re;
1001  B0_re -= g01_im * b1_im;
1002  B0_re += g02_re * b2_re;
1003  B0_re -= g02_im * b2_im;
1004  spinorFloat B0_im = 0;
1005  B0_im += g00_re * b0_im;
1006  B0_im += g00_im * b0_re;
1007  B0_im += g01_re * b1_im;
1008  B0_im += g01_im * b1_re;
1009  B0_im += g02_re * b2_im;
1010  B0_im += g02_im * b2_re;
1011 
1012  // multiply row 1
1013  spinorFloat A1_re = 0;
1014  A1_re += g10_re * a0_re;
1015  A1_re -= g10_im * a0_im;
1016  A1_re += g11_re * a1_re;
1017  A1_re -= g11_im * a1_im;
1018  A1_re += g12_re * a2_re;
1019  A1_re -= g12_im * a2_im;
1020  spinorFloat A1_im = 0;
1021  A1_im += g10_re * a0_im;
1022  A1_im += g10_im * a0_re;
1023  A1_im += g11_re * a1_im;
1024  A1_im += g11_im * a1_re;
1025  A1_im += g12_re * a2_im;
1026  A1_im += g12_im * a2_re;
1027  spinorFloat B1_re = 0;
1028  B1_re += g10_re * b0_re;
1029  B1_re -= g10_im * b0_im;
1030  B1_re += g11_re * b1_re;
1031  B1_re -= g11_im * b1_im;
1032  B1_re += g12_re * b2_re;
1033  B1_re -= g12_im * b2_im;
1034  spinorFloat B1_im = 0;
1035  B1_im += g10_re * b0_im;
1036  B1_im += g10_im * b0_re;
1037  B1_im += g11_re * b1_im;
1038  B1_im += g11_im * b1_re;
1039  B1_im += g12_re * b2_im;
1040  B1_im += g12_im * b2_re;
1041 
1042  // multiply row 2
1043  spinorFloat A2_re = 0;
1044  A2_re += g20_re * a0_re;
1045  A2_re -= g20_im * a0_im;
1046  A2_re += g21_re * a1_re;
1047  A2_re -= g21_im * a1_im;
1048  A2_re += g22_re * a2_re;
1049  A2_re -= g22_im * a2_im;
1050  spinorFloat A2_im = 0;
1051  A2_im += g20_re * a0_im;
1052  A2_im += g20_im * a0_re;
1053  A2_im += g21_re * a1_im;
1054  A2_im += g21_im * a1_re;
1055  A2_im += g22_re * a2_im;
1056  A2_im += g22_im * a2_re;
1057  spinorFloat B2_re = 0;
1058  B2_re += g20_re * b0_re;
1059  B2_re -= g20_im * b0_im;
1060  B2_re += g21_re * b1_re;
1061  B2_re -= g21_im * b1_im;
1062  B2_re += g22_re * b2_re;
1063  B2_re -= g22_im * b2_im;
1064  spinorFloat B2_im = 0;
1065  B2_im += g20_re * b0_im;
1066  B2_im += g20_im * b0_re;
1067  B2_im += g21_re * b1_im;
1068  B2_im += g21_im * b1_re;
1069  B2_im += g22_re * b2_im;
1070  B2_im += g22_im * b2_re;
1071 
1072  o00_re += A0_re;
1073  o00_im += A0_im;
1074  o10_re += B0_re;
1075  o10_im += B0_im;
1076  o20_re -= A0_im;
1077  o20_im += A0_re;
1078  o30_re += B0_im;
1079  o30_im -= B0_re;
1080 
1081  o01_re += A1_re;
1082  o01_im += A1_im;
1083  o11_re += B1_re;
1084  o11_im += B1_im;
1085  o21_re -= A1_im;
1086  o21_im += A1_re;
1087  o31_re += B1_im;
1088  o31_im -= B1_re;
1089 
1090  o02_re += A2_re;
1091  o02_im += A2_im;
1092  o12_re += B2_re;
1093  o12_im += B2_im;
1094  o22_re -= A2_im;
1095  o22_im += A2_re;
1096  o32_re += B2_im;
1097  o32_im -= B2_re;
1098 }
1099 
1100 if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
1101 {
1102  // Projector P2+
1103  // 1 0 i 0
1104  // 0 1 0 -i
1105  // -i 0 1 0
1106  // 0 i 0 1
1107 
1108  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,2,Y);
1109  const int sp_idx = face_idx + param.ghostOffset[2];
1110 #if (DD_PREC==2) // half precision
1111  sp_norm_idx = face_idx + param.ghostNormOffset[2];
1112 #endif
1113 
1114 
1115  const int ga_idx = Vh+(face_idx % ghostFace[2]);
1116 
1117  // read gauge matrix from device memory
1118  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1119 
1126 
1127 
1128 
1129  const int sp_stride_pad = param.Ls*ghostFace[2];
1130 
1131  // read half spinor from device memory
1132  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1133 
1134  a0_re = i00_re; a0_im = i00_im;
1135  a1_re = i01_re; a1_im = i01_im;
1136  a2_re = i02_re; a2_im = i02_im;
1137  b0_re = i10_re; b0_im = i10_im;
1138  b1_re = i11_re; b1_im = i11_im;
1139  b2_re = i12_re; b2_im = i12_im;
1140 
1141 
1142  // reconstruct gauge matrix
1144 
1145  // multiply row 0
1146  spinorFloat A0_re = 0;
1147  A0_re += gT00_re * a0_re;
1148  A0_re -= gT00_im * a0_im;
1149  A0_re += gT01_re * a1_re;
1150  A0_re -= gT01_im * a1_im;
1151  A0_re += gT02_re * a2_re;
1152  A0_re -= gT02_im * a2_im;
1153  spinorFloat A0_im = 0;
1154  A0_im += gT00_re * a0_im;
1155  A0_im += gT00_im * a0_re;
1156  A0_im += gT01_re * a1_im;
1157  A0_im += gT01_im * a1_re;
1158  A0_im += gT02_re * a2_im;
1159  A0_im += gT02_im * a2_re;
1160  spinorFloat B0_re = 0;
1161  B0_re += gT00_re * b0_re;
1162  B0_re -= gT00_im * b0_im;
1163  B0_re += gT01_re * b1_re;
1164  B0_re -= gT01_im * b1_im;
1165  B0_re += gT02_re * b2_re;
1166  B0_re -= gT02_im * b2_im;
1167  spinorFloat B0_im = 0;
1168  B0_im += gT00_re * b0_im;
1169  B0_im += gT00_im * b0_re;
1170  B0_im += gT01_re * b1_im;
1171  B0_im += gT01_im * b1_re;
1172  B0_im += gT02_re * b2_im;
1173  B0_im += gT02_im * b2_re;
1174 
1175  // multiply row 1
1176  spinorFloat A1_re = 0;
1177  A1_re += gT10_re * a0_re;
1178  A1_re -= gT10_im * a0_im;
1179  A1_re += gT11_re * a1_re;
1180  A1_re -= gT11_im * a1_im;
1181  A1_re += gT12_re * a2_re;
1182  A1_re -= gT12_im * a2_im;
1183  spinorFloat A1_im = 0;
1184  A1_im += gT10_re * a0_im;
1185  A1_im += gT10_im * a0_re;
1186  A1_im += gT11_re * a1_im;
1187  A1_im += gT11_im * a1_re;
1188  A1_im += gT12_re * a2_im;
1189  A1_im += gT12_im * a2_re;
1190  spinorFloat B1_re = 0;
1191  B1_re += gT10_re * b0_re;
1192  B1_re -= gT10_im * b0_im;
1193  B1_re += gT11_re * b1_re;
1194  B1_re -= gT11_im * b1_im;
1195  B1_re += gT12_re * b2_re;
1196  B1_re -= gT12_im * b2_im;
1197  spinorFloat B1_im = 0;
1198  B1_im += gT10_re * b0_im;
1199  B1_im += gT10_im * b0_re;
1200  B1_im += gT11_re * b1_im;
1201  B1_im += gT11_im * b1_re;
1202  B1_im += gT12_re * b2_im;
1203  B1_im += gT12_im * b2_re;
1204 
1205  // multiply row 2
1206  spinorFloat A2_re = 0;
1207  A2_re += gT20_re * a0_re;
1208  A2_re -= gT20_im * a0_im;
1209  A2_re += gT21_re * a1_re;
1210  A2_re -= gT21_im * a1_im;
1211  A2_re += gT22_re * a2_re;
1212  A2_re -= gT22_im * a2_im;
1213  spinorFloat A2_im = 0;
1214  A2_im += gT20_re * a0_im;
1215  A2_im += gT20_im * a0_re;
1216  A2_im += gT21_re * a1_im;
1217  A2_im += gT21_im * a1_re;
1218  A2_im += gT22_re * a2_im;
1219  A2_im += gT22_im * a2_re;
1220  spinorFloat B2_re = 0;
1221  B2_re += gT20_re * b0_re;
1222  B2_re -= gT20_im * b0_im;
1223  B2_re += gT21_re * b1_re;
1224  B2_re -= gT21_im * b1_im;
1225  B2_re += gT22_re * b2_re;
1226  B2_re -= gT22_im * b2_im;
1227  spinorFloat B2_im = 0;
1228  B2_im += gT20_re * b0_im;
1229  B2_im += gT20_im * b0_re;
1230  B2_im += gT21_re * b1_im;
1231  B2_im += gT21_im * b1_re;
1232  B2_im += gT22_re * b2_im;
1233  B2_im += gT22_im * b2_re;
1234 
1235  o00_re += A0_re;
1236  o00_im += A0_im;
1237  o10_re += B0_re;
1238  o10_im += B0_im;
1239  o20_re += A0_im;
1240  o20_im -= A0_re;
1241  o30_re -= B0_im;
1242  o30_im += B0_re;
1243 
1244  o01_re += A1_re;
1245  o01_im += A1_im;
1246  o11_re += B1_re;
1247  o11_im += B1_im;
1248  o21_re += A1_im;
1249  o21_im -= A1_re;
1250  o31_re -= B1_im;
1251  o31_im += B1_re;
1252 
1253  o02_re += A2_re;
1254  o02_im += A2_im;
1255  o12_re += B2_re;
1256  o12_im += B2_im;
1257  o22_re += A2_im;
1258  o22_im -= A2_re;
1259  o32_re -= B2_im;
1260  o32_im += B2_re;
1261 }
1262 
1263 if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
1264 {
1265  // Projector P3-
1266  // 0 0 0 0
1267  // 0 0 0 0
1268  // 0 0 2 0
1269  // 0 0 0 2
1270 
1271  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,3,Y);
1272  const int sp_idx = face_idx + param.ghostOffset[3];
1273 #if (DD_PREC==2) // half precision
1274  sp_norm_idx = face_idx + param.Ls*ghostFace[3] + param.ghostNormOffset[3];
1275 #endif
1276 
1277 
1278  const int ga_idx = sid % Vh;
1279 
1280  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1281  {
1288 
1289 
1290 
1291  const int sp_stride_pad = param.Ls*ghostFace[3];
1292  const int t_proj_scale = TPROJSCALE;
1293 
1294  // read half spinor from device memory
1295  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1296 
1297  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1298  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1299  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1300  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1301  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1302  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1303 
1304 
1305  // identity gauge matrix
1312 
1313  o20_re += A0_re;
1314  o20_im += A0_im;
1315  o30_re += B0_re;
1316  o30_im += B0_im;
1317 
1318  o21_re += A1_re;
1319  o21_im += A1_im;
1320  o31_re += B1_re;
1321  o31_im += B1_im;
1322 
1323  o22_re += A2_re;
1324  o22_im += A2_im;
1325  o32_re += B2_re;
1326  o32_im += B2_im;
1327  } else {
1328  // read gauge matrix from device memory
1329  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1330 
1337 
1338 
1339 
1340  const int sp_stride_pad = param.Ls*ghostFace[3];
1341  const int t_proj_scale = TPROJSCALE;
1342 
1343  // read half spinor from device memory
1344  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1345 
1346  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1347  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1348  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1349  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1350  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1351  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1352 
1353 
1354  // reconstruct gauge matrix
1356 
1357  // multiply row 0
1358  spinorFloat A0_re = 0;
1359  A0_re += g00_re * a0_re;
1360  A0_re -= g00_im * a0_im;
1361  A0_re += g01_re * a1_re;
1362  A0_re -= g01_im * a1_im;
1363  A0_re += g02_re * a2_re;
1364  A0_re -= g02_im * a2_im;
1365  spinorFloat A0_im = 0;
1366  A0_im += g00_re * a0_im;
1367  A0_im += g00_im * a0_re;
1368  A0_im += g01_re * a1_im;
1369  A0_im += g01_im * a1_re;
1370  A0_im += g02_re * a2_im;
1371  A0_im += g02_im * a2_re;
1372  spinorFloat B0_re = 0;
1373  B0_re += g00_re * b0_re;
1374  B0_re -= g00_im * b0_im;
1375  B0_re += g01_re * b1_re;
1376  B0_re -= g01_im * b1_im;
1377  B0_re += g02_re * b2_re;
1378  B0_re -= g02_im * b2_im;
1379  spinorFloat B0_im = 0;
1380  B0_im += g00_re * b0_im;
1381  B0_im += g00_im * b0_re;
1382  B0_im += g01_re * b1_im;
1383  B0_im += g01_im * b1_re;
1384  B0_im += g02_re * b2_im;
1385  B0_im += g02_im * b2_re;
1386 
1387  // multiply row 1
1388  spinorFloat A1_re = 0;
1389  A1_re += g10_re * a0_re;
1390  A1_re -= g10_im * a0_im;
1391  A1_re += g11_re * a1_re;
1392  A1_re -= g11_im * a1_im;
1393  A1_re += g12_re * a2_re;
1394  A1_re -= g12_im * a2_im;
1395  spinorFloat A1_im = 0;
1396  A1_im += g10_re * a0_im;
1397  A1_im += g10_im * a0_re;
1398  A1_im += g11_re * a1_im;
1399  A1_im += g11_im * a1_re;
1400  A1_im += g12_re * a2_im;
1401  A1_im += g12_im * a2_re;
1402  spinorFloat B1_re = 0;
1403  B1_re += g10_re * b0_re;
1404  B1_re -= g10_im * b0_im;
1405  B1_re += g11_re * b1_re;
1406  B1_re -= g11_im * b1_im;
1407  B1_re += g12_re * b2_re;
1408  B1_re -= g12_im * b2_im;
1409  spinorFloat B1_im = 0;
1410  B1_im += g10_re * b0_im;
1411  B1_im += g10_im * b0_re;
1412  B1_im += g11_re * b1_im;
1413  B1_im += g11_im * b1_re;
1414  B1_im += g12_re * b2_im;
1415  B1_im += g12_im * b2_re;
1416 
1417  // multiply row 2
1418  spinorFloat A2_re = 0;
1419  A2_re += g20_re * a0_re;
1420  A2_re -= g20_im * a0_im;
1421  A2_re += g21_re * a1_re;
1422  A2_re -= g21_im * a1_im;
1423  A2_re += g22_re * a2_re;
1424  A2_re -= g22_im * a2_im;
1425  spinorFloat A2_im = 0;
1426  A2_im += g20_re * a0_im;
1427  A2_im += g20_im * a0_re;
1428  A2_im += g21_re * a1_im;
1429  A2_im += g21_im * a1_re;
1430  A2_im += g22_re * a2_im;
1431  A2_im += g22_im * a2_re;
1432  spinorFloat B2_re = 0;
1433  B2_re += g20_re * b0_re;
1434  B2_re -= g20_im * b0_im;
1435  B2_re += g21_re * b1_re;
1436  B2_re -= g21_im * b1_im;
1437  B2_re += g22_re * b2_re;
1438  B2_re -= g22_im * b2_im;
1439  spinorFloat B2_im = 0;
1440  B2_im += g20_re * b0_im;
1441  B2_im += g20_im * b0_re;
1442  B2_im += g21_re * b1_im;
1443  B2_im += g21_im * b1_re;
1444  B2_im += g22_re * b2_im;
1445  B2_im += g22_im * b2_re;
1446 
1447  o20_re += A0_re;
1448  o20_im += A0_im;
1449  o30_re += B0_re;
1450  o30_im += B0_im;
1451 
1452  o21_re += A1_re;
1453  o21_im += A1_im;
1454  o31_re += B1_re;
1455  o31_im += B1_im;
1456 
1457  o22_re += A2_re;
1458  o22_im += A2_im;
1459  o32_re += B2_re;
1460  o32_im += B2_im;
1461  }
1462 }
1463 
1464 if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
1465 {
1466  // Projector P3+
1467  // 2 0 0 0
1468  // 0 2 0 0
1469  // 0 0 0 0
1470  // 0 0 0 0
1471 
1472  faceIndexFromDWCoords<1>(face_idx,x1,x2,x3,x4,xs,3,Y);
1473  const int sp_idx = face_idx + param.ghostOffset[3];
1474 #if (DD_PREC==2) // half precision
1475  sp_norm_idx = face_idx + param.ghostNormOffset[3];
1476 #endif
1477 
1478 
1479  const int ga_idx = Vh+(face_idx % ghostFace[3]);
1480 
1481  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1482  {
1489 
1490 
1491 
1492  const int sp_stride_pad = param.Ls*ghostFace[3];
1493  const int t_proj_scale = TPROJSCALE;
1494 
1495  // read half spinor from device memory
1496  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1497 
1498  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1499  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1500  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1501  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1502  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1503  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1504 
1505 
1506  // identity gauge matrix
1513 
1514  o00_re += A0_re;
1515  o00_im += A0_im;
1516  o10_re += B0_re;
1517  o10_im += B0_im;
1518 
1519  o01_re += A1_re;
1520  o01_im += A1_im;
1521  o11_re += B1_re;
1522  o11_im += B1_im;
1523 
1524  o02_re += A2_re;
1525  o02_im += A2_im;
1526  o12_re += B2_re;
1527  o12_im += B2_im;
1528  } else {
1529  // read gauge matrix from device memory
1530  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1531 
1538 
1539 
1540 
1541  const int sp_stride_pad = param.Ls*ghostFace[3];
1542  const int t_proj_scale = TPROJSCALE;
1543 
1544  // read half spinor from device memory
1545  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1546 
1547  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1548  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1549  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1550  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1551  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1552  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1553 
1554 
1555  // reconstruct gauge matrix
1557 
1558  // multiply row 0
1559  spinorFloat A0_re = 0;
1560  A0_re += gT00_re * a0_re;
1561  A0_re -= gT00_im * a0_im;
1562  A0_re += gT01_re * a1_re;
1563  A0_re -= gT01_im * a1_im;
1564  A0_re += gT02_re * a2_re;
1565  A0_re -= gT02_im * a2_im;
1566  spinorFloat A0_im = 0;
1567  A0_im += gT00_re * a0_im;
1568  A0_im += gT00_im * a0_re;
1569  A0_im += gT01_re * a1_im;
1570  A0_im += gT01_im * a1_re;
1571  A0_im += gT02_re * a2_im;
1572  A0_im += gT02_im * a2_re;
1573  spinorFloat B0_re = 0;
1574  B0_re += gT00_re * b0_re;
1575  B0_re -= gT00_im * b0_im;
1576  B0_re += gT01_re * b1_re;
1577  B0_re -= gT01_im * b1_im;
1578  B0_re += gT02_re * b2_re;
1579  B0_re -= gT02_im * b2_im;
1580  spinorFloat B0_im = 0;
1581  B0_im += gT00_re * b0_im;
1582  B0_im += gT00_im * b0_re;
1583  B0_im += gT01_re * b1_im;
1584  B0_im += gT01_im * b1_re;
1585  B0_im += gT02_re * b2_im;
1586  B0_im += gT02_im * b2_re;
1587 
1588  // multiply row 1
1589  spinorFloat A1_re = 0;
1590  A1_re += gT10_re * a0_re;
1591  A1_re -= gT10_im * a0_im;
1592  A1_re += gT11_re * a1_re;
1593  A1_re -= gT11_im * a1_im;
1594  A1_re += gT12_re * a2_re;
1595  A1_re -= gT12_im * a2_im;
1596  spinorFloat A1_im = 0;
1597  A1_im += gT10_re * a0_im;
1598  A1_im += gT10_im * a0_re;
1599  A1_im += gT11_re * a1_im;
1600  A1_im += gT11_im * a1_re;
1601  A1_im += gT12_re * a2_im;
1602  A1_im += gT12_im * a2_re;
1603  spinorFloat B1_re = 0;
1604  B1_re += gT10_re * b0_re;
1605  B1_re -= gT10_im * b0_im;
1606  B1_re += gT11_re * b1_re;
1607  B1_re -= gT11_im * b1_im;
1608  B1_re += gT12_re * b2_re;
1609  B1_re -= gT12_im * b2_im;
1610  spinorFloat B1_im = 0;
1611  B1_im += gT10_re * b0_im;
1612  B1_im += gT10_im * b0_re;
1613  B1_im += gT11_re * b1_im;
1614  B1_im += gT11_im * b1_re;
1615  B1_im += gT12_re * b2_im;
1616  B1_im += gT12_im * b2_re;
1617 
1618  // multiply row 2
1619  spinorFloat A2_re = 0;
1620  A2_re += gT20_re * a0_re;
1621  A2_re -= gT20_im * a0_im;
1622  A2_re += gT21_re * a1_re;
1623  A2_re -= gT21_im * a1_im;
1624  A2_re += gT22_re * a2_re;
1625  A2_re -= gT22_im * a2_im;
1626  spinorFloat A2_im = 0;
1627  A2_im += gT20_re * a0_im;
1628  A2_im += gT20_im * a0_re;
1629  A2_im += gT21_re * a1_im;
1630  A2_im += gT21_im * a1_re;
1631  A2_im += gT22_re * a2_im;
1632  A2_im += gT22_im * a2_re;
1633  spinorFloat B2_re = 0;
1634  B2_re += gT20_re * b0_re;
1635  B2_re -= gT20_im * b0_im;
1636  B2_re += gT21_re * b1_re;
1637  B2_re -= gT21_im * b1_im;
1638  B2_re += gT22_re * b2_re;
1639  B2_re -= gT22_im * b2_im;
1640  spinorFloat B2_im = 0;
1641  B2_im += gT20_re * b0_im;
1642  B2_im += gT20_im * b0_re;
1643  B2_im += gT21_re * b1_im;
1644  B2_im += gT21_im * b1_re;
1645  B2_im += gT22_re * b2_im;
1646  B2_im += gT22_im * b2_re;
1647 
1648  o00_re += A0_re;
1649  o00_im += A0_im;
1650  o10_re += B0_re;
1651  o10_im += B0_im;
1652 
1653  o01_re += A1_re;
1654  o01_im += A1_im;
1655  o11_re += B1_re;
1656  o11_im += B1_im;
1657 
1658  o02_re += A2_re;
1659  o02_im += A2_im;
1660  o12_re += B2_re;
1661  o12_im += B2_im;
1662  }
1663 }
1664 
1665 {
1666 
1667 #ifdef DSLASH_XPAY
1668  READ_ACCUM(ACCUMTEX, param.sp_stride)
1670 
1671 #ifdef MDWF_mode
1672  coeff = (spinorFloat)(0.5*a/(mdwf_b5[xs]*(m5+4.0) + 1.0));
1673 #else
1674  coeff = a;
1675 #endif
1676 
1677 #ifdef SPINOR_DOUBLE
1678  o00_re = coeff*o00_re + accum0.x;
1679  o00_im = coeff*o00_im + accum0.y;
1680  o01_re = coeff*o01_re + accum1.x;
1681  o01_im = coeff*o01_im + accum1.y;
1682  o02_re = coeff*o02_re + accum2.x;
1683  o02_im = coeff*o02_im + accum2.y;
1684  o10_re = coeff*o10_re + accum3.x;
1685  o10_im = coeff*o10_im + accum3.y;
1686  o11_re = coeff*o11_re + accum4.x;
1687  o11_im = coeff*o11_im + accum4.y;
1688  o12_re = coeff*o12_re + accum5.x;
1689  o12_im = coeff*o12_im + accum5.y;
1690  o20_re = coeff*o20_re + accum6.x;
1691  o20_im = coeff*o20_im + accum6.y;
1692  o21_re = coeff*o21_re + accum7.x;
1693  o21_im = coeff*o21_im + accum7.y;
1694  o22_re = coeff*o22_re + accum8.x;
1695  o22_im = coeff*o22_im + accum8.y;
1696  o30_re = coeff*o30_re + accum9.x;
1697  o30_im = coeff*o30_im + accum9.y;
1698  o31_re = coeff*o31_re + accum10.x;
1699  o31_im = coeff*o31_im + accum10.y;
1700  o32_re = coeff*o32_re + accum11.x;
1701  o32_im = coeff*o32_im + accum11.y;
1702 #else
1703  o00_re = coeff*o00_re + accum0.x;
1704  o00_im = coeff*o00_im + accum0.y;
1705  o01_re = coeff*o01_re + accum0.z;
1706  o01_im = coeff*o01_im + accum0.w;
1707  o02_re = coeff*o02_re + accum1.x;
1708  o02_im = coeff*o02_im + accum1.y;
1709  o10_re = coeff*o10_re + accum1.z;
1710  o10_im = coeff*o10_im + accum1.w;
1711  o11_re = coeff*o11_re + accum2.x;
1712  o11_im = coeff*o11_im + accum2.y;
1713  o12_re = coeff*o12_re + accum2.z;
1714  o12_im = coeff*o12_im + accum2.w;
1715  o20_re = coeff*o20_re + accum3.x;
1716  o20_im = coeff*o20_im + accum3.y;
1717  o21_re = coeff*o21_re + accum3.z;
1718  o21_im = coeff*o21_im + accum3.w;
1719  o22_re = coeff*o22_re + accum4.x;
1720  o22_im = coeff*o22_im + accum4.y;
1721  o30_re = coeff*o30_re + accum4.z;
1722  o30_im = coeff*o30_im + accum4.w;
1723  o31_re = coeff*o31_re + accum5.x;
1724  o31_im = coeff*o31_im + accum5.y;
1725  o32_re = coeff*o32_re + accum5.z;
1726  o32_im = coeff*o32_im + accum5.w;
1727 #endif // SPINOR_DOUBLE
1728 #endif // DSLASH_XPAY
1729 }
1730 
1731 // write spinor field back to device memory
1732 WRITE_SPINOR(param.sp_stride);
1733 
1734 // undefine to prevent warning when precision is changed
1735 #undef m5
1736 #undef mdwf_b5
1737 #undef mdwf_c5
1738 #undef spinorFloat
1739 #undef SHARED_STRIDE
1740 
1741 #undef g00_re
1742 #undef g00_im
1743 #undef g01_re
1744 #undef g01_im
1745 #undef g02_re
1746 #undef g02_im
1747 #undef g10_re
1748 #undef g10_im
1749 #undef g11_re
1750 #undef g11_im
1751 #undef g12_re
1752 #undef g12_im
1753 #undef g20_re
1754 #undef g20_im
1755 #undef g21_re
1756 #undef g21_im
1757 #undef g22_re
1758 #undef g22_im
1759 
1760 #undef i00_re
1761 #undef i00_im
1762 #undef i01_re
1763 #undef i01_im
1764 #undef i02_re
1765 #undef i02_im
1766 #undef i10_re
1767 #undef i10_im
1768 #undef i11_re
1769 #undef i11_im
1770 #undef i12_re
1771 #undef i12_im
1772 #undef i20_re
1773 #undef i20_im
1774 #undef i21_re
1775 #undef i21_im
1776 #undef i22_re
1777 #undef i22_im
1778 #undef i30_re
1779 #undef i30_im
1780 #undef i31_re
1781 #undef i31_im
1782 #undef i32_re
1783 #undef i32_im
1784 
1785 
1786 
1787 #undef VOLATILE
1788 #endif // MULTI_GPU
__constant__ int Vh
float4 G2
__constant__ int X1h
__constant__ int X2
#define o32_im
Definition: gamma5.h:295
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
#define m5
#define o31_im
Definition: gamma5.h:293
int xs
QudaGaugeParam param
Definition: pack_test.cpp:17
__constant__ int ghostFace[QUDA_MAX_DIM+1]
float4 G1
#define RECONSTRUCT_GAUGE_MATRIX
Definition: covDev.h:39
float4 G0
#define GAUGE0TEX
Definition: covDev.h:112
#define o30_im
Definition: gamma5.h:291
__constant__ double coeff
__constant__ int X2m1
#define mdwf_b5
#define SPINORTEX
Definition: clover_def.h:40
#define o32_re
Definition: gamma5.h:294
int X[4]
Definition: quda.h:29
__constant__ int gauge_fixed
#define o31_re
Definition: gamma5.h:292
float4 G4
#define SPINOR_HOP
Definition: covDev.h:158
int boundaryCrossing
float4 G3
#define ASSN_GAUGE_MATRIX
__constant__ int ga_stride
__constant__ int X1m1
__constant__ int X3
#define GAUGE1TEX
Definition: covDev.h:113
__constant__ int X4m1
#define WRITE_SPINOR
Definition: clover_def.h:48
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
__constant__ int X4X3X2X1hmX3X2X1h
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101