QUDA  0.9.0
dw_fused_exterior_dslash4_core.h
Go to the documentation of this file.
1 #ifdef MULTI_GPU
2 
3 // *** CUDA DSLASH ***
4 
5 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
6 
7 
8 #if (CUDA_VERSION >= 4010)
9 #define VOLATILE
10 #else
11 #define VOLATILE volatile
12 #endif
13 // input spinor
14 #ifdef SPINOR_DOUBLE
15 #define spinorFloat double
16 #define i00_re I0.x
17 #define i00_im I0.y
18 #define i01_re I1.x
19 #define i01_im I1.y
20 #define i02_re I2.x
21 #define i02_im I2.y
22 #define i10_re I3.x
23 #define i10_im I3.y
24 #define i11_re I4.x
25 #define i11_im I4.y
26 #define i12_re I5.x
27 #define i12_im I5.y
28 #define i20_re I6.x
29 #define i20_im I6.y
30 #define i21_re I7.x
31 #define i21_im I7.y
32 #define i22_re I8.x
33 #define i22_im I8.y
34 #define i30_re I9.x
35 #define i30_im I9.y
36 #define i31_re I10.x
37 #define i31_im I10.y
38 #define i32_re I11.x
39 #define i32_im I11.y
40 #define m5 param.m5_d
41 #define mdwf_b5 param.mdwf_b5_d
42 #define mdwf_c5 param.mdwf_c5_d
43 #define mferm param.mferm
44 #define a param.a
45 #define b param.b
46 #else
47 #define spinorFloat float
48 #define i00_re I0.x
49 #define i00_im I0.y
50 #define i01_re I0.z
51 #define i01_im I0.w
52 #define i02_re I1.x
53 #define i02_im I1.y
54 #define i10_re I1.z
55 #define i10_im I1.w
56 #define i11_re I2.x
57 #define i11_im I2.y
58 #define i12_re I2.z
59 #define i12_im I2.w
60 #define i20_re I3.x
61 #define i20_im I3.y
62 #define i21_re I3.z
63 #define i21_im I3.w
64 #define i22_re I4.x
65 #define i22_im I4.y
66 #define i30_re I4.z
67 #define i30_im I4.w
68 #define i31_re I5.x
69 #define i31_im I5.y
70 #define i32_re I5.z
71 #define i32_im I5.w
72 #define m5 param.m5_f
73 #define mdwf_b5 param.mdwf_b5_f
74 #define mdwf_c5 param.mdwf_c5_f
75 #define mferm param.mferm_f
76 #define a param.a_f
77 #define b param.b_f
78 #endif // SPINOR_DOUBLE
79 
80 // gauge link
81 #ifdef GAUGE_FLOAT2
82 #define g00_re G0.x
83 #define g00_im G0.y
84 #define g01_re G1.x
85 #define g01_im G1.y
86 #define g02_re G2.x
87 #define g02_im G2.y
88 #define g10_re G3.x
89 #define g10_im G3.y
90 #define g11_re G4.x
91 #define g11_im G4.y
92 #define g12_re G5.x
93 #define g12_im G5.y
94 #define g20_re G6.x
95 #define g20_im G6.y
96 #define g21_re G7.x
97 #define g21_im G7.y
98 #define g22_re G8.x
99 #define g22_im G8.y
100 
101 #else
102 #define g00_re G0.x
103 #define g00_im G0.y
104 #define g01_re G0.z
105 #define g01_im G0.w
106 #define g02_re G1.x
107 #define g02_im G1.y
108 #define g10_re G1.z
109 #define g10_im G1.w
110 #define g11_re G2.x
111 #define g11_im G2.y
112 #define g12_re G2.z
113 #define g12_im G2.w
114 #define g20_re G3.x
115 #define g20_im G3.y
116 #define g21_re G3.z
117 #define g21_im G3.w
118 #define g22_re G4.x
119 #define g22_im G4.y
120 
121 #endif // GAUGE_DOUBLE
122 
123 // conjugated gauge link
124 #define gT00_re (+g00_re)
125 #define gT00_im (-g00_im)
126 #define gT01_re (+g10_re)
127 #define gT01_im (-g10_im)
128 #define gT02_re (+g20_re)
129 #define gT02_im (-g20_im)
130 #define gT10_re (+g01_re)
131 #define gT10_im (-g01_im)
132 #define gT11_re (+g11_re)
133 #define gT11_im (-g11_im)
134 #define gT12_re (+g21_re)
135 #define gT12_im (-g21_im)
136 #define gT20_re (+g02_re)
137 #define gT20_im (-g02_im)
138 #define gT21_re (+g12_re)
139 #define gT21_im (-g12_im)
140 #define gT22_re (+g22_re)
141 #define gT22_im (-g22_im)
142 
143 // output spinor
168 
169 #ifdef SPINOR_DOUBLE
170 #if (__COMPUTE_CAPABILITY__ >= 200)
171 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
172 #else
173 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
174 #endif
175 #else
176 #if (__COMPUTE_CAPABILITY__ >= 200)
177 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
178 #else
179 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
180 #endif
181 #endif
182 
183 #include "read_gauge.h"
184 #include "io_spinor.h"
185 
186 #if (DD_PREC==2) // half precision
187 int sp_norm_idx;
188 #endif // half precision
189 
190 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
191 if (sid >= param.threads*param.dc.Ls) return;
192 
193 int dim;
194 int face_idx;
195 
196 
197 
198 int coord[5];
199 int X;
200 
201 { // exterior kernel
202 
203 dim = dimFromFaceIndex<5>(sid, param); // sid is also modified
204 
205 //const int face_volume = (param.threads*param.dc.Ls >> 1); // volume of one face
206 const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.dc.Ls >> 1);
207 
208 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
209 face_idx = sid - face_num*face_volume; // index into the respective face
210 
211 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
212 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
213 //sp_idx = face_idx + param.ghostOffset[dim];
214 
215 switch(dim) {
216 case 0:
217  coordsFromFaceIndex<5,QUDA_4D_PC,0,1>(X, sid, coord, face_idx, face_num, param);
218  break;
219 case 1:
220  coordsFromFaceIndex<5,QUDA_4D_PC,1,1>(X, sid, coord, face_idx, face_num, param);
221  break;
222 case 2:
223  coordsFromFaceIndex<5,QUDA_4D_PC,2,1>(X, sid, coord, face_idx, face_num, param);
224  break;
225 case 3:
226  coordsFromFaceIndex<5,QUDA_4D_PC,3,1>(X, sid, coord, face_idx, face_num, param);
227  break;
228 }
229 
230 {
231  bool active = false;
232  for(int dir=0; dir<4; ++dir){
233  active = active || isActive(dim,dir,+1,coord,param.commDim,param.dc.X);
234  }
235  if(!active) return;
236 }
237 
238 
239 
241  o00_re = i00_re; o00_im = i00_im;
242  o01_re = i01_re; o01_im = i01_im;
243  o02_re = i02_re; o02_im = i02_im;
244  o10_re = i10_re; o10_im = i10_im;
245  o11_re = i11_re; o11_im = i11_im;
246  o12_re = i12_re; o12_im = i12_im;
247  o20_re = i20_re; o20_im = i20_im;
248  o21_re = i21_re; o21_im = i21_im;
249  o22_re = i22_re; o22_im = i22_im;
250  o30_re = i30_re; o30_im = i30_im;
251  o31_re = i31_re; o31_im = i31_im;
252  o32_re = i32_re; o32_im = i32_im;
253 }
254 
255 // declare G## here and use ASSN below instead of READ
256 #ifdef GAUGE_FLOAT2
257 #if (DD_PREC==0) //temporal hack
258 double2 G0;
259 double2 G1;
260 double2 G2;
261 double2 G3;
262 double2 G4;
263 double2 G5;
264 double2 G6;
265 double2 G7;
266 double2 G8;
267 #else
268 float2 G0;
269 float2 G1;
270 float2 G2;
271 float2 G3;
272 float2 G4;
273 float2 G5;
274 float2 G6;
275 float2 G7;
276 float2 G8;
277 #endif
278 #else
279 float4 G0;
280 float4 G1;
281 float4 G2;
282 float4 G3;
283 float4 G4;
284 #endif
285 
286 
287 
288 if (isActive(dim,0,+1,coord,param.commDim,param.dc.X) && coord[0]==(param.dc.X[0]-1) )
289 {
290  // Projector P0-
291  // 1 0 0 -i
292  // 0 1 -i 0
293  // 0 i 1 0
294  // i 0 0 1
295 
296  faceIndexFromCoords<5,1>(face_idx,coord,0,param);
297  const int sp_idx = face_idx + param.ghostOffset[0][1];
298 #if (DD_PREC==2) // half precision
299  sp_norm_idx = face_idx + param.ghostNormOffset[0][1];
300 #endif
301 
302 
303  const int ga_idx = sid % param.dc.volume_4d_cb;
304 
305  // read gauge matrix from device memory
306  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
307 
314 
315 
316 
317  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[0];
318 
319  // read half spinor from device memory
320  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
321 
322  a0_re = i00_re; a0_im = i00_im;
323  a1_re = i01_re; a1_im = i01_im;
324  a2_re = i02_re; a2_im = i02_im;
325  b0_re = i10_re; b0_im = i10_im;
326  b1_re = i11_re; b1_im = i11_im;
327  b2_re = i12_re; b2_im = i12_im;
328 
329 
330  // reconstruct gauge matrix
332 
333  // multiply row 0
334  spinorFloat A0_re = 0;
335  A0_re += g00_re * a0_re;
336  A0_re -= g00_im * a0_im;
337  A0_re += g01_re * a1_re;
338  A0_re -= g01_im * a1_im;
339  A0_re += g02_re * a2_re;
340  A0_re -= g02_im * a2_im;
341  spinorFloat A0_im = 0;
342  A0_im += g00_re * a0_im;
343  A0_im += g00_im * a0_re;
344  A0_im += g01_re * a1_im;
345  A0_im += g01_im * a1_re;
346  A0_im += g02_re * a2_im;
347  A0_im += g02_im * a2_re;
348  spinorFloat B0_re = 0;
349  B0_re += g00_re * b0_re;
350  B0_re -= g00_im * b0_im;
351  B0_re += g01_re * b1_re;
352  B0_re -= g01_im * b1_im;
353  B0_re += g02_re * b2_re;
354  B0_re -= g02_im * b2_im;
355  spinorFloat B0_im = 0;
356  B0_im += g00_re * b0_im;
357  B0_im += g00_im * b0_re;
358  B0_im += g01_re * b1_im;
359  B0_im += g01_im * b1_re;
360  B0_im += g02_re * b2_im;
361  B0_im += g02_im * b2_re;
362 
363  // multiply row 1
364  spinorFloat A1_re = 0;
365  A1_re += g10_re * a0_re;
366  A1_re -= g10_im * a0_im;
367  A1_re += g11_re * a1_re;
368  A1_re -= g11_im * a1_im;
369  A1_re += g12_re * a2_re;
370  A1_re -= g12_im * a2_im;
371  spinorFloat A1_im = 0;
372  A1_im += g10_re * a0_im;
373  A1_im += g10_im * a0_re;
374  A1_im += g11_re * a1_im;
375  A1_im += g11_im * a1_re;
376  A1_im += g12_re * a2_im;
377  A1_im += g12_im * a2_re;
378  spinorFloat B1_re = 0;
379  B1_re += g10_re * b0_re;
380  B1_re -= g10_im * b0_im;
381  B1_re += g11_re * b1_re;
382  B1_re -= g11_im * b1_im;
383  B1_re += g12_re * b2_re;
384  B1_re -= g12_im * b2_im;
385  spinorFloat B1_im = 0;
386  B1_im += g10_re * b0_im;
387  B1_im += g10_im * b0_re;
388  B1_im += g11_re * b1_im;
389  B1_im += g11_im * b1_re;
390  B1_im += g12_re * b2_im;
391  B1_im += g12_im * b2_re;
392 
393  // multiply row 2
394  spinorFloat A2_re = 0;
395  A2_re += g20_re * a0_re;
396  A2_re -= g20_im * a0_im;
397  A2_re += g21_re * a1_re;
398  A2_re -= g21_im * a1_im;
399  A2_re += g22_re * a2_re;
400  A2_re -= g22_im * a2_im;
401  spinorFloat A2_im = 0;
402  A2_im += g20_re * a0_im;
403  A2_im += g20_im * a0_re;
404  A2_im += g21_re * a1_im;
405  A2_im += g21_im * a1_re;
406  A2_im += g22_re * a2_im;
407  A2_im += g22_im * a2_re;
408  spinorFloat B2_re = 0;
409  B2_re += g20_re * b0_re;
410  B2_re -= g20_im * b0_im;
411  B2_re += g21_re * b1_re;
412  B2_re -= g21_im * b1_im;
413  B2_re += g22_re * b2_re;
414  B2_re -= g22_im * b2_im;
415  spinorFloat B2_im = 0;
416  B2_im += g20_re * b0_im;
417  B2_im += g20_im * b0_re;
418  B2_im += g21_re * b1_im;
419  B2_im += g21_im * b1_re;
420  B2_im += g22_re * b2_im;
421  B2_im += g22_im * b2_re;
422 
423  o00_re += A0_re;
424  o00_im += A0_im;
425  o10_re += B0_re;
426  o10_im += B0_im;
427  o20_re -= B0_im;
428  o20_im += B0_re;
429  o30_re -= A0_im;
430  o30_im += A0_re;
431 
432  o01_re += A1_re;
433  o01_im += A1_im;
434  o11_re += B1_re;
435  o11_im += B1_im;
436  o21_re -= B1_im;
437  o21_im += B1_re;
438  o31_re -= A1_im;
439  o31_im += A1_re;
440 
441  o02_re += A2_re;
442  o02_im += A2_im;
443  o12_re += B2_re;
444  o12_im += B2_im;
445  o22_re -= B2_im;
446  o22_im += B2_re;
447  o32_re -= A2_im;
448  o32_im += A2_re;
449 }
450 
451 if (isActive(dim,0,-1,coord,param.commDim,param.dc.X) && coord[0]==0 )
452 {
453  // Projector P0+
454  // 1 0 0 i
455  // 0 1 i 0
456  // 0 -i 1 0
457  // -i 0 0 1
458 
459  faceIndexFromCoords<5,1>(face_idx,coord,0,param);
460  const int sp_idx = face_idx + param.ghostOffset[0][0];
461 #if (DD_PREC==2) // half precision
462  sp_norm_idx = face_idx + param.ghostNormOffset[0][0];
463 #endif
464 
465 
466  const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[0]);
467 
468  // read gauge matrix from device memory
469  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
470 
477 
478 
479 
480  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[0];
481 
482  // read half spinor from device memory
483  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
484 
485  a0_re = i00_re; a0_im = i00_im;
486  a1_re = i01_re; a1_im = i01_im;
487  a2_re = i02_re; a2_im = i02_im;
488  b0_re = i10_re; b0_im = i10_im;
489  b1_re = i11_re; b1_im = i11_im;
490  b2_re = i12_re; b2_im = i12_im;
491 
492 
493  // reconstruct gauge matrix
495 
496  // multiply row 0
497  spinorFloat A0_re = 0;
498  A0_re += gT00_re * a0_re;
499  A0_re -= gT00_im * a0_im;
500  A0_re += gT01_re * a1_re;
501  A0_re -= gT01_im * a1_im;
502  A0_re += gT02_re * a2_re;
503  A0_re -= gT02_im * a2_im;
504  spinorFloat A0_im = 0;
505  A0_im += gT00_re * a0_im;
506  A0_im += gT00_im * a0_re;
507  A0_im += gT01_re * a1_im;
508  A0_im += gT01_im * a1_re;
509  A0_im += gT02_re * a2_im;
510  A0_im += gT02_im * a2_re;
511  spinorFloat B0_re = 0;
512  B0_re += gT00_re * b0_re;
513  B0_re -= gT00_im * b0_im;
514  B0_re += gT01_re * b1_re;
515  B0_re -= gT01_im * b1_im;
516  B0_re += gT02_re * b2_re;
517  B0_re -= gT02_im * b2_im;
518  spinorFloat B0_im = 0;
519  B0_im += gT00_re * b0_im;
520  B0_im += gT00_im * b0_re;
521  B0_im += gT01_re * b1_im;
522  B0_im += gT01_im * b1_re;
523  B0_im += gT02_re * b2_im;
524  B0_im += gT02_im * b2_re;
525 
526  // multiply row 1
527  spinorFloat A1_re = 0;
528  A1_re += gT10_re * a0_re;
529  A1_re -= gT10_im * a0_im;
530  A1_re += gT11_re * a1_re;
531  A1_re -= gT11_im * a1_im;
532  A1_re += gT12_re * a2_re;
533  A1_re -= gT12_im * a2_im;
534  spinorFloat A1_im = 0;
535  A1_im += gT10_re * a0_im;
536  A1_im += gT10_im * a0_re;
537  A1_im += gT11_re * a1_im;
538  A1_im += gT11_im * a1_re;
539  A1_im += gT12_re * a2_im;
540  A1_im += gT12_im * a2_re;
541  spinorFloat B1_re = 0;
542  B1_re += gT10_re * b0_re;
543  B1_re -= gT10_im * b0_im;
544  B1_re += gT11_re * b1_re;
545  B1_re -= gT11_im * b1_im;
546  B1_re += gT12_re * b2_re;
547  B1_re -= gT12_im * b2_im;
548  spinorFloat B1_im = 0;
549  B1_im += gT10_re * b0_im;
550  B1_im += gT10_im * b0_re;
551  B1_im += gT11_re * b1_im;
552  B1_im += gT11_im * b1_re;
553  B1_im += gT12_re * b2_im;
554  B1_im += gT12_im * b2_re;
555 
556  // multiply row 2
557  spinorFloat A2_re = 0;
558  A2_re += gT20_re * a0_re;
559  A2_re -= gT20_im * a0_im;
560  A2_re += gT21_re * a1_re;
561  A2_re -= gT21_im * a1_im;
562  A2_re += gT22_re * a2_re;
563  A2_re -= gT22_im * a2_im;
564  spinorFloat A2_im = 0;
565  A2_im += gT20_re * a0_im;
566  A2_im += gT20_im * a0_re;
567  A2_im += gT21_re * a1_im;
568  A2_im += gT21_im * a1_re;
569  A2_im += gT22_re * a2_im;
570  A2_im += gT22_im * a2_re;
571  spinorFloat B2_re = 0;
572  B2_re += gT20_re * b0_re;
573  B2_re -= gT20_im * b0_im;
574  B2_re += gT21_re * b1_re;
575  B2_re -= gT21_im * b1_im;
576  B2_re += gT22_re * b2_re;
577  B2_re -= gT22_im * b2_im;
578  spinorFloat B2_im = 0;
579  B2_im += gT20_re * b0_im;
580  B2_im += gT20_im * b0_re;
581  B2_im += gT21_re * b1_im;
582  B2_im += gT21_im * b1_re;
583  B2_im += gT22_re * b2_im;
584  B2_im += gT22_im * b2_re;
585 
586  o00_re += A0_re;
587  o00_im += A0_im;
588  o10_re += B0_re;
589  o10_im += B0_im;
590  o20_re += B0_im;
591  o20_im -= B0_re;
592  o30_re += A0_im;
593  o30_im -= A0_re;
594 
595  o01_re += A1_re;
596  o01_im += A1_im;
597  o11_re += B1_re;
598  o11_im += B1_im;
599  o21_re += B1_im;
600  o21_im -= B1_re;
601  o31_re += A1_im;
602  o31_im -= A1_re;
603 
604  o02_re += A2_re;
605  o02_im += A2_im;
606  o12_re += B2_re;
607  o12_im += B2_im;
608  o22_re += B2_im;
609  o22_im -= B2_re;
610  o32_re += A2_im;
611  o32_im -= A2_re;
612 }
613 
614 if (isActive(dim,1,+1,coord,param.commDim,param.dc.X) && coord[1]==(param.dc.X[1]-1) )
615 {
616  // Projector P1-
617  // 1 0 0 -1
618  // 0 1 1 0
619  // 0 1 1 0
620  // -1 0 0 1
621 
622  faceIndexFromCoords<5,1>(face_idx,coord,1,param);
623  const int sp_idx = face_idx + param.ghostOffset[1][1];
624 #if (DD_PREC==2) // half precision
625  sp_norm_idx = face_idx + param.ghostNormOffset[1][1];
626 #endif
627 
628 
629  const int ga_idx = sid % param.dc.volume_4d_cb;
630 
631  // read gauge matrix from device memory
632  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
633 
640 
641 
642 
643  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[1];
644 
645  // read half spinor from device memory
646  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
647 
648  a0_re = i00_re; a0_im = i00_im;
649  a1_re = i01_re; a1_im = i01_im;
650  a2_re = i02_re; a2_im = i02_im;
651  b0_re = i10_re; b0_im = i10_im;
652  b1_re = i11_re; b1_im = i11_im;
653  b2_re = i12_re; b2_im = i12_im;
654 
655 
656  // reconstruct gauge matrix
658 
659  // multiply row 0
660  spinorFloat A0_re = 0;
661  A0_re += g00_re * a0_re;
662  A0_re -= g00_im * a0_im;
663  A0_re += g01_re * a1_re;
664  A0_re -= g01_im * a1_im;
665  A0_re += g02_re * a2_re;
666  A0_re -= g02_im * a2_im;
667  spinorFloat A0_im = 0;
668  A0_im += g00_re * a0_im;
669  A0_im += g00_im * a0_re;
670  A0_im += g01_re * a1_im;
671  A0_im += g01_im * a1_re;
672  A0_im += g02_re * a2_im;
673  A0_im += g02_im * a2_re;
674  spinorFloat B0_re = 0;
675  B0_re += g00_re * b0_re;
676  B0_re -= g00_im * b0_im;
677  B0_re += g01_re * b1_re;
678  B0_re -= g01_im * b1_im;
679  B0_re += g02_re * b2_re;
680  B0_re -= g02_im * b2_im;
681  spinorFloat B0_im = 0;
682  B0_im += g00_re * b0_im;
683  B0_im += g00_im * b0_re;
684  B0_im += g01_re * b1_im;
685  B0_im += g01_im * b1_re;
686  B0_im += g02_re * b2_im;
687  B0_im += g02_im * b2_re;
688 
689  // multiply row 1
690  spinorFloat A1_re = 0;
691  A1_re += g10_re * a0_re;
692  A1_re -= g10_im * a0_im;
693  A1_re += g11_re * a1_re;
694  A1_re -= g11_im * a1_im;
695  A1_re += g12_re * a2_re;
696  A1_re -= g12_im * a2_im;
697  spinorFloat A1_im = 0;
698  A1_im += g10_re * a0_im;
699  A1_im += g10_im * a0_re;
700  A1_im += g11_re * a1_im;
701  A1_im += g11_im * a1_re;
702  A1_im += g12_re * a2_im;
703  A1_im += g12_im * a2_re;
704  spinorFloat B1_re = 0;
705  B1_re += g10_re * b0_re;
706  B1_re -= g10_im * b0_im;
707  B1_re += g11_re * b1_re;
708  B1_re -= g11_im * b1_im;
709  B1_re += g12_re * b2_re;
710  B1_re -= g12_im * b2_im;
711  spinorFloat B1_im = 0;
712  B1_im += g10_re * b0_im;
713  B1_im += g10_im * b0_re;
714  B1_im += g11_re * b1_im;
715  B1_im += g11_im * b1_re;
716  B1_im += g12_re * b2_im;
717  B1_im += g12_im * b2_re;
718 
719  // multiply row 2
720  spinorFloat A2_re = 0;
721  A2_re += g20_re * a0_re;
722  A2_re -= g20_im * a0_im;
723  A2_re += g21_re * a1_re;
724  A2_re -= g21_im * a1_im;
725  A2_re += g22_re * a2_re;
726  A2_re -= g22_im * a2_im;
727  spinorFloat A2_im = 0;
728  A2_im += g20_re * a0_im;
729  A2_im += g20_im * a0_re;
730  A2_im += g21_re * a1_im;
731  A2_im += g21_im * a1_re;
732  A2_im += g22_re * a2_im;
733  A2_im += g22_im * a2_re;
734  spinorFloat B2_re = 0;
735  B2_re += g20_re * b0_re;
736  B2_re -= g20_im * b0_im;
737  B2_re += g21_re * b1_re;
738  B2_re -= g21_im * b1_im;
739  B2_re += g22_re * b2_re;
740  B2_re -= g22_im * b2_im;
741  spinorFloat B2_im = 0;
742  B2_im += g20_re * b0_im;
743  B2_im += g20_im * b0_re;
744  B2_im += g21_re * b1_im;
745  B2_im += g21_im * b1_re;
746  B2_im += g22_re * b2_im;
747  B2_im += g22_im * b2_re;
748 
749  o00_re += A0_re;
750  o00_im += A0_im;
751  o10_re += B0_re;
752  o10_im += B0_im;
753  o20_re += B0_re;
754  o20_im += B0_im;
755  o30_re -= A0_re;
756  o30_im -= A0_im;
757 
758  o01_re += A1_re;
759  o01_im += A1_im;
760  o11_re += B1_re;
761  o11_im += B1_im;
762  o21_re += B1_re;
763  o21_im += B1_im;
764  o31_re -= A1_re;
765  o31_im -= A1_im;
766 
767  o02_re += A2_re;
768  o02_im += A2_im;
769  o12_re += B2_re;
770  o12_im += B2_im;
771  o22_re += B2_re;
772  o22_im += B2_im;
773  o32_re -= A2_re;
774  o32_im -= A2_im;
775 }
776 
777 if (isActive(dim,1,-1,coord,param.commDim,param.dc.X) && coord[1]==0 )
778 {
779  // Projector P1+
780  // 1 0 0 1
781  // 0 1 -1 0
782  // 0 -1 1 0
783  // 1 0 0 1
784 
785  faceIndexFromCoords<5,1>(face_idx,coord,1,param);
786  const int sp_idx = face_idx + param.ghostOffset[1][0];
787 #if (DD_PREC==2) // half precision
788  sp_norm_idx = face_idx + param.ghostNormOffset[1][0];
789 #endif
790 
791 
792  const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[1]);
793 
794  // read gauge matrix from device memory
795  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
796 
803 
804 
805 
806  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[1];
807 
808  // read half spinor from device memory
809  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
810 
811  a0_re = i00_re; a0_im = i00_im;
812  a1_re = i01_re; a1_im = i01_im;
813  a2_re = i02_re; a2_im = i02_im;
814  b0_re = i10_re; b0_im = i10_im;
815  b1_re = i11_re; b1_im = i11_im;
816  b2_re = i12_re; b2_im = i12_im;
817 
818 
819  // reconstruct gauge matrix
821 
822  // multiply row 0
823  spinorFloat A0_re = 0;
824  A0_re += gT00_re * a0_re;
825  A0_re -= gT00_im * a0_im;
826  A0_re += gT01_re * a1_re;
827  A0_re -= gT01_im * a1_im;
828  A0_re += gT02_re * a2_re;
829  A0_re -= gT02_im * a2_im;
830  spinorFloat A0_im = 0;
831  A0_im += gT00_re * a0_im;
832  A0_im += gT00_im * a0_re;
833  A0_im += gT01_re * a1_im;
834  A0_im += gT01_im * a1_re;
835  A0_im += gT02_re * a2_im;
836  A0_im += gT02_im * a2_re;
837  spinorFloat B0_re = 0;
838  B0_re += gT00_re * b0_re;
839  B0_re -= gT00_im * b0_im;
840  B0_re += gT01_re * b1_re;
841  B0_re -= gT01_im * b1_im;
842  B0_re += gT02_re * b2_re;
843  B0_re -= gT02_im * b2_im;
844  spinorFloat B0_im = 0;
845  B0_im += gT00_re * b0_im;
846  B0_im += gT00_im * b0_re;
847  B0_im += gT01_re * b1_im;
848  B0_im += gT01_im * b1_re;
849  B0_im += gT02_re * b2_im;
850  B0_im += gT02_im * b2_re;
851 
852  // multiply row 1
853  spinorFloat A1_re = 0;
854  A1_re += gT10_re * a0_re;
855  A1_re -= gT10_im * a0_im;
856  A1_re += gT11_re * a1_re;
857  A1_re -= gT11_im * a1_im;
858  A1_re += gT12_re * a2_re;
859  A1_re -= gT12_im * a2_im;
860  spinorFloat A1_im = 0;
861  A1_im += gT10_re * a0_im;
862  A1_im += gT10_im * a0_re;
863  A1_im += gT11_re * a1_im;
864  A1_im += gT11_im * a1_re;
865  A1_im += gT12_re * a2_im;
866  A1_im += gT12_im * a2_re;
867  spinorFloat B1_re = 0;
868  B1_re += gT10_re * b0_re;
869  B1_re -= gT10_im * b0_im;
870  B1_re += gT11_re * b1_re;
871  B1_re -= gT11_im * b1_im;
872  B1_re += gT12_re * b2_re;
873  B1_re -= gT12_im * b2_im;
874  spinorFloat B1_im = 0;
875  B1_im += gT10_re * b0_im;
876  B1_im += gT10_im * b0_re;
877  B1_im += gT11_re * b1_im;
878  B1_im += gT11_im * b1_re;
879  B1_im += gT12_re * b2_im;
880  B1_im += gT12_im * b2_re;
881 
882  // multiply row 2
883  spinorFloat A2_re = 0;
884  A2_re += gT20_re * a0_re;
885  A2_re -= gT20_im * a0_im;
886  A2_re += gT21_re * a1_re;
887  A2_re -= gT21_im * a1_im;
888  A2_re += gT22_re * a2_re;
889  A2_re -= gT22_im * a2_im;
890  spinorFloat A2_im = 0;
891  A2_im += gT20_re * a0_im;
892  A2_im += gT20_im * a0_re;
893  A2_im += gT21_re * a1_im;
894  A2_im += gT21_im * a1_re;
895  A2_im += gT22_re * a2_im;
896  A2_im += gT22_im * a2_re;
897  spinorFloat B2_re = 0;
898  B2_re += gT20_re * b0_re;
899  B2_re -= gT20_im * b0_im;
900  B2_re += gT21_re * b1_re;
901  B2_re -= gT21_im * b1_im;
902  B2_re += gT22_re * b2_re;
903  B2_re -= gT22_im * b2_im;
904  spinorFloat B2_im = 0;
905  B2_im += gT20_re * b0_im;
906  B2_im += gT20_im * b0_re;
907  B2_im += gT21_re * b1_im;
908  B2_im += gT21_im * b1_re;
909  B2_im += gT22_re * b2_im;
910  B2_im += gT22_im * b2_re;
911 
912  o00_re += A0_re;
913  o00_im += A0_im;
914  o10_re += B0_re;
915  o10_im += B0_im;
916  o20_re -= B0_re;
917  o20_im -= B0_im;
918  o30_re += A0_re;
919  o30_im += A0_im;
920 
921  o01_re += A1_re;
922  o01_im += A1_im;
923  o11_re += B1_re;
924  o11_im += B1_im;
925  o21_re -= B1_re;
926  o21_im -= B1_im;
927  o31_re += A1_re;
928  o31_im += A1_im;
929 
930  o02_re += A2_re;
931  o02_im += A2_im;
932  o12_re += B2_re;
933  o12_im += B2_im;
934  o22_re -= B2_re;
935  o22_im -= B2_im;
936  o32_re += A2_re;
937  o32_im += A2_im;
938 }
939 
940 if (isActive(dim,2,+1,coord,param.commDim,param.dc.X) && coord[2]==(param.dc.X[2]-1) )
941 {
942  // Projector P2-
943  // 1 0 -i 0
944  // 0 1 0 i
945  // i 0 1 0
946  // 0 -i 0 1
947 
948  faceIndexFromCoords<5,1>(face_idx,coord,2,param);
949  const int sp_idx = face_idx + param.ghostOffset[2][1];
950 #if (DD_PREC==2) // half precision
951  sp_norm_idx = face_idx + param.ghostNormOffset[2][1];
952 #endif
953 
954 
955  const int ga_idx = sid % param.dc.volume_4d_cb;
956 
957  // read gauge matrix from device memory
958  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
959 
966 
967 
968 
969  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[2];
970 
971  // read half spinor from device memory
972  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
973 
974  a0_re = i00_re; a0_im = i00_im;
975  a1_re = i01_re; a1_im = i01_im;
976  a2_re = i02_re; a2_im = i02_im;
977  b0_re = i10_re; b0_im = i10_im;
978  b1_re = i11_re; b1_im = i11_im;
979  b2_re = i12_re; b2_im = i12_im;
980 
981 
982  // reconstruct gauge matrix
984 
985  // multiply row 0
986  spinorFloat A0_re = 0;
987  A0_re += g00_re * a0_re;
988  A0_re -= g00_im * a0_im;
989  A0_re += g01_re * a1_re;
990  A0_re -= g01_im * a1_im;
991  A0_re += g02_re * a2_re;
992  A0_re -= g02_im * a2_im;
993  spinorFloat A0_im = 0;
994  A0_im += g00_re * a0_im;
995  A0_im += g00_im * a0_re;
996  A0_im += g01_re * a1_im;
997  A0_im += g01_im * a1_re;
998  A0_im += g02_re * a2_im;
999  A0_im += g02_im * a2_re;
1000  spinorFloat B0_re = 0;
1001  B0_re += g00_re * b0_re;
1002  B0_re -= g00_im * b0_im;
1003  B0_re += g01_re * b1_re;
1004  B0_re -= g01_im * b1_im;
1005  B0_re += g02_re * b2_re;
1006  B0_re -= g02_im * b2_im;
1007  spinorFloat B0_im = 0;
1008  B0_im += g00_re * b0_im;
1009  B0_im += g00_im * b0_re;
1010  B0_im += g01_re * b1_im;
1011  B0_im += g01_im * b1_re;
1012  B0_im += g02_re * b2_im;
1013  B0_im += g02_im * b2_re;
1014 
1015  // multiply row 1
1016  spinorFloat A1_re = 0;
1017  A1_re += g10_re * a0_re;
1018  A1_re -= g10_im * a0_im;
1019  A1_re += g11_re * a1_re;
1020  A1_re -= g11_im * a1_im;
1021  A1_re += g12_re * a2_re;
1022  A1_re -= g12_im * a2_im;
1023  spinorFloat A1_im = 0;
1024  A1_im += g10_re * a0_im;
1025  A1_im += g10_im * a0_re;
1026  A1_im += g11_re * a1_im;
1027  A1_im += g11_im * a1_re;
1028  A1_im += g12_re * a2_im;
1029  A1_im += g12_im * a2_re;
1030  spinorFloat B1_re = 0;
1031  B1_re += g10_re * b0_re;
1032  B1_re -= g10_im * b0_im;
1033  B1_re += g11_re * b1_re;
1034  B1_re -= g11_im * b1_im;
1035  B1_re += g12_re * b2_re;
1036  B1_re -= g12_im * b2_im;
1037  spinorFloat B1_im = 0;
1038  B1_im += g10_re * b0_im;
1039  B1_im += g10_im * b0_re;
1040  B1_im += g11_re * b1_im;
1041  B1_im += g11_im * b1_re;
1042  B1_im += g12_re * b2_im;
1043  B1_im += g12_im * b2_re;
1044 
1045  // multiply row 2
1046  spinorFloat A2_re = 0;
1047  A2_re += g20_re * a0_re;
1048  A2_re -= g20_im * a0_im;
1049  A2_re += g21_re * a1_re;
1050  A2_re -= g21_im * a1_im;
1051  A2_re += g22_re * a2_re;
1052  A2_re -= g22_im * a2_im;
1053  spinorFloat A2_im = 0;
1054  A2_im += g20_re * a0_im;
1055  A2_im += g20_im * a0_re;
1056  A2_im += g21_re * a1_im;
1057  A2_im += g21_im * a1_re;
1058  A2_im += g22_re * a2_im;
1059  A2_im += g22_im * a2_re;
1060  spinorFloat B2_re = 0;
1061  B2_re += g20_re * b0_re;
1062  B2_re -= g20_im * b0_im;
1063  B2_re += g21_re * b1_re;
1064  B2_re -= g21_im * b1_im;
1065  B2_re += g22_re * b2_re;
1066  B2_re -= g22_im * b2_im;
1067  spinorFloat B2_im = 0;
1068  B2_im += g20_re * b0_im;
1069  B2_im += g20_im * b0_re;
1070  B2_im += g21_re * b1_im;
1071  B2_im += g21_im * b1_re;
1072  B2_im += g22_re * b2_im;
1073  B2_im += g22_im * b2_re;
1074 
1075  o00_re += A0_re;
1076  o00_im += A0_im;
1077  o10_re += B0_re;
1078  o10_im += B0_im;
1079  o20_re -= A0_im;
1080  o20_im += A0_re;
1081  o30_re += B0_im;
1082  o30_im -= B0_re;
1083 
1084  o01_re += A1_re;
1085  o01_im += A1_im;
1086  o11_re += B1_re;
1087  o11_im += B1_im;
1088  o21_re -= A1_im;
1089  o21_im += A1_re;
1090  o31_re += B1_im;
1091  o31_im -= B1_re;
1092 
1093  o02_re += A2_re;
1094  o02_im += A2_im;
1095  o12_re += B2_re;
1096  o12_im += B2_im;
1097  o22_re -= A2_im;
1098  o22_im += A2_re;
1099  o32_re += B2_im;
1100  o32_im -= B2_re;
1101 }
1102 
1103 if (isActive(dim,2,-1,coord,param.commDim,param.dc.X) && coord[2]==0 )
1104 {
1105  // Projector P2+
1106  // 1 0 i 0
1107  // 0 1 0 -i
1108  // -i 0 1 0
1109  // 0 i 0 1
1110 
1111  faceIndexFromCoords<5,1>(face_idx,coord,2,param);
1112  const int sp_idx = face_idx + param.ghostOffset[2][0];
1113 #if (DD_PREC==2) // half precision
1114  sp_norm_idx = face_idx + param.ghostNormOffset[2][0];
1115 #endif
1116 
1117 
1118  const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[2]);
1119 
1120  // read gauge matrix from device memory
1121  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
1122 
1129 
1130 
1131 
1132  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[2];
1133 
1134  // read half spinor from device memory
1135  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1136 
1137  a0_re = i00_re; a0_im = i00_im;
1138  a1_re = i01_re; a1_im = i01_im;
1139  a2_re = i02_re; a2_im = i02_im;
1140  b0_re = i10_re; b0_im = i10_im;
1141  b1_re = i11_re; b1_im = i11_im;
1142  b2_re = i12_re; b2_im = i12_im;
1143 
1144 
1145  // reconstruct gauge matrix
1147 
1148  // multiply row 0
1149  spinorFloat A0_re = 0;
1150  A0_re += gT00_re * a0_re;
1151  A0_re -= gT00_im * a0_im;
1152  A0_re += gT01_re * a1_re;
1153  A0_re -= gT01_im * a1_im;
1154  A0_re += gT02_re * a2_re;
1155  A0_re -= gT02_im * a2_im;
1156  spinorFloat A0_im = 0;
1157  A0_im += gT00_re * a0_im;
1158  A0_im += gT00_im * a0_re;
1159  A0_im += gT01_re * a1_im;
1160  A0_im += gT01_im * a1_re;
1161  A0_im += gT02_re * a2_im;
1162  A0_im += gT02_im * a2_re;
1163  spinorFloat B0_re = 0;
1164  B0_re += gT00_re * b0_re;
1165  B0_re -= gT00_im * b0_im;
1166  B0_re += gT01_re * b1_re;
1167  B0_re -= gT01_im * b1_im;
1168  B0_re += gT02_re * b2_re;
1169  B0_re -= gT02_im * b2_im;
1170  spinorFloat B0_im = 0;
1171  B0_im += gT00_re * b0_im;
1172  B0_im += gT00_im * b0_re;
1173  B0_im += gT01_re * b1_im;
1174  B0_im += gT01_im * b1_re;
1175  B0_im += gT02_re * b2_im;
1176  B0_im += gT02_im * b2_re;
1177 
1178  // multiply row 1
1179  spinorFloat A1_re = 0;
1180  A1_re += gT10_re * a0_re;
1181  A1_re -= gT10_im * a0_im;
1182  A1_re += gT11_re * a1_re;
1183  A1_re -= gT11_im * a1_im;
1184  A1_re += gT12_re * a2_re;
1185  A1_re -= gT12_im * a2_im;
1186  spinorFloat A1_im = 0;
1187  A1_im += gT10_re * a0_im;
1188  A1_im += gT10_im * a0_re;
1189  A1_im += gT11_re * a1_im;
1190  A1_im += gT11_im * a1_re;
1191  A1_im += gT12_re * a2_im;
1192  A1_im += gT12_im * a2_re;
1193  spinorFloat B1_re = 0;
1194  B1_re += gT10_re * b0_re;
1195  B1_re -= gT10_im * b0_im;
1196  B1_re += gT11_re * b1_re;
1197  B1_re -= gT11_im * b1_im;
1198  B1_re += gT12_re * b2_re;
1199  B1_re -= gT12_im * b2_im;
1200  spinorFloat B1_im = 0;
1201  B1_im += gT10_re * b0_im;
1202  B1_im += gT10_im * b0_re;
1203  B1_im += gT11_re * b1_im;
1204  B1_im += gT11_im * b1_re;
1205  B1_im += gT12_re * b2_im;
1206  B1_im += gT12_im * b2_re;
1207 
1208  // multiply row 2
1209  spinorFloat A2_re = 0;
1210  A2_re += gT20_re * a0_re;
1211  A2_re -= gT20_im * a0_im;
1212  A2_re += gT21_re * a1_re;
1213  A2_re -= gT21_im * a1_im;
1214  A2_re += gT22_re * a2_re;
1215  A2_re -= gT22_im * a2_im;
1216  spinorFloat A2_im = 0;
1217  A2_im += gT20_re * a0_im;
1218  A2_im += gT20_im * a0_re;
1219  A2_im += gT21_re * a1_im;
1220  A2_im += gT21_im * a1_re;
1221  A2_im += gT22_re * a2_im;
1222  A2_im += gT22_im * a2_re;
1223  spinorFloat B2_re = 0;
1224  B2_re += gT20_re * b0_re;
1225  B2_re -= gT20_im * b0_im;
1226  B2_re += gT21_re * b1_re;
1227  B2_re -= gT21_im * b1_im;
1228  B2_re += gT22_re * b2_re;
1229  B2_re -= gT22_im * b2_im;
1230  spinorFloat B2_im = 0;
1231  B2_im += gT20_re * b0_im;
1232  B2_im += gT20_im * b0_re;
1233  B2_im += gT21_re * b1_im;
1234  B2_im += gT21_im * b1_re;
1235  B2_im += gT22_re * b2_im;
1236  B2_im += gT22_im * b2_re;
1237 
1238  o00_re += A0_re;
1239  o00_im += A0_im;
1240  o10_re += B0_re;
1241  o10_im += B0_im;
1242  o20_re += A0_im;
1243  o20_im -= A0_re;
1244  o30_re -= B0_im;
1245  o30_im += B0_re;
1246 
1247  o01_re += A1_re;
1248  o01_im += A1_im;
1249  o11_re += B1_re;
1250  o11_im += B1_im;
1251  o21_re += A1_im;
1252  o21_im -= A1_re;
1253  o31_re -= B1_im;
1254  o31_im += B1_re;
1255 
1256  o02_re += A2_re;
1257  o02_im += A2_im;
1258  o12_re += B2_re;
1259  o12_im += B2_im;
1260  o22_re += A2_im;
1261  o22_im -= A2_re;
1262  o32_re -= B2_im;
1263  o32_im += B2_re;
1264 }
1265 
1266 if (isActive(dim,3,+1,coord,param.commDim,param.dc.X) && coord[3]==(param.dc.X[3]-1) )
1267 {
1268  // Projector P3-
1269  // 0 0 0 0
1270  // 0 0 0 0
1271  // 0 0 2 0
1272  // 0 0 0 2
1273 
1274  faceIndexFromCoords<5,1>(face_idx,coord,3,param);
1275  const int sp_idx = face_idx + param.ghostOffset[3][1];
1276 #if (DD_PREC==2) // half precision
1277  sp_norm_idx = face_idx + param.ghostNormOffset[3][1];
1278 #endif
1279 
1280 
1281  const int ga_idx = sid % param.dc.volume_4d_cb;
1282 
1283  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1284  {
1291 
1292 
1293 
1294  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1295  const int t_proj_scale = TPROJSCALE;
1296 
1297  // read half spinor from device memory
1298  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1299 
1300  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1301  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1302  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1303  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1304  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1305  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1306 
1307 
1308  // identity gauge matrix
1315 
1316  o20_re += A0_re;
1317  o20_im += A0_im;
1318  o30_re += B0_re;
1319  o30_im += B0_im;
1320 
1321  o21_re += A1_re;
1322  o21_im += A1_im;
1323  o31_re += B1_re;
1324  o31_im += B1_im;
1325 
1326  o22_re += A2_re;
1327  o22_im += A2_im;
1328  o32_re += B2_re;
1329  o32_im += B2_im;
1330  } else {
1331  // read gauge matrix from device memory
1332  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
1333 
1340 
1341 
1342 
1343  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1344  const int t_proj_scale = TPROJSCALE;
1345 
1346  // read half spinor from device memory
1347  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1348 
1349  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1350  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1351  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1352  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1353  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1354  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1355 
1356 
1357  // reconstruct gauge matrix
1359 
1360  // multiply row 0
1361  spinorFloat A0_re = 0;
1362  A0_re += g00_re * a0_re;
1363  A0_re -= g00_im * a0_im;
1364  A0_re += g01_re * a1_re;
1365  A0_re -= g01_im * a1_im;
1366  A0_re += g02_re * a2_re;
1367  A0_re -= g02_im * a2_im;
1368  spinorFloat A0_im = 0;
1369  A0_im += g00_re * a0_im;
1370  A0_im += g00_im * a0_re;
1371  A0_im += g01_re * a1_im;
1372  A0_im += g01_im * a1_re;
1373  A0_im += g02_re * a2_im;
1374  A0_im += g02_im * a2_re;
1375  spinorFloat B0_re = 0;
1376  B0_re += g00_re * b0_re;
1377  B0_re -= g00_im * b0_im;
1378  B0_re += g01_re * b1_re;
1379  B0_re -= g01_im * b1_im;
1380  B0_re += g02_re * b2_re;
1381  B0_re -= g02_im * b2_im;
1382  spinorFloat B0_im = 0;
1383  B0_im += g00_re * b0_im;
1384  B0_im += g00_im * b0_re;
1385  B0_im += g01_re * b1_im;
1386  B0_im += g01_im * b1_re;
1387  B0_im += g02_re * b2_im;
1388  B0_im += g02_im * b2_re;
1389 
1390  // multiply row 1
1391  spinorFloat A1_re = 0;
1392  A1_re += g10_re * a0_re;
1393  A1_re -= g10_im * a0_im;
1394  A1_re += g11_re * a1_re;
1395  A1_re -= g11_im * a1_im;
1396  A1_re += g12_re * a2_re;
1397  A1_re -= g12_im * a2_im;
1398  spinorFloat A1_im = 0;
1399  A1_im += g10_re * a0_im;
1400  A1_im += g10_im * a0_re;
1401  A1_im += g11_re * a1_im;
1402  A1_im += g11_im * a1_re;
1403  A1_im += g12_re * a2_im;
1404  A1_im += g12_im * a2_re;
1405  spinorFloat B1_re = 0;
1406  B1_re += g10_re * b0_re;
1407  B1_re -= g10_im * b0_im;
1408  B1_re += g11_re * b1_re;
1409  B1_re -= g11_im * b1_im;
1410  B1_re += g12_re * b2_re;
1411  B1_re -= g12_im * b2_im;
1412  spinorFloat B1_im = 0;
1413  B1_im += g10_re * b0_im;
1414  B1_im += g10_im * b0_re;
1415  B1_im += g11_re * b1_im;
1416  B1_im += g11_im * b1_re;
1417  B1_im += g12_re * b2_im;
1418  B1_im += g12_im * b2_re;
1419 
1420  // multiply row 2
1421  spinorFloat A2_re = 0;
1422  A2_re += g20_re * a0_re;
1423  A2_re -= g20_im * a0_im;
1424  A2_re += g21_re * a1_re;
1425  A2_re -= g21_im * a1_im;
1426  A2_re += g22_re * a2_re;
1427  A2_re -= g22_im * a2_im;
1428  spinorFloat A2_im = 0;
1429  A2_im += g20_re * a0_im;
1430  A2_im += g20_im * a0_re;
1431  A2_im += g21_re * a1_im;
1432  A2_im += g21_im * a1_re;
1433  A2_im += g22_re * a2_im;
1434  A2_im += g22_im * a2_re;
1435  spinorFloat B2_re = 0;
1436  B2_re += g20_re * b0_re;
1437  B2_re -= g20_im * b0_im;
1438  B2_re += g21_re * b1_re;
1439  B2_re -= g21_im * b1_im;
1440  B2_re += g22_re * b2_re;
1441  B2_re -= g22_im * b2_im;
1442  spinorFloat B2_im = 0;
1443  B2_im += g20_re * b0_im;
1444  B2_im += g20_im * b0_re;
1445  B2_im += g21_re * b1_im;
1446  B2_im += g21_im * b1_re;
1447  B2_im += g22_re * b2_im;
1448  B2_im += g22_im * b2_re;
1449 
1450  o20_re += A0_re;
1451  o20_im += A0_im;
1452  o30_re += B0_re;
1453  o30_im += B0_im;
1454 
1455  o21_re += A1_re;
1456  o21_im += A1_im;
1457  o31_re += B1_re;
1458  o31_im += B1_im;
1459 
1460  o22_re += A2_re;
1461  o22_im += A2_im;
1462  o32_re += B2_re;
1463  o32_im += B2_im;
1464  }
1465 }
1466 
1467 if (isActive(dim,3,-1,coord,param.commDim,param.dc.X) && coord[3]==0 )
1468 {
1469  // Projector P3+
1470  // 2 0 0 0
1471  // 0 2 0 0
1472  // 0 0 0 0
1473  // 0 0 0 0
1474 
1475  faceIndexFromCoords<5,1>(face_idx,coord,3,param);
1476  const int sp_idx = face_idx + param.ghostOffset[3][0];
1477 #if (DD_PREC==2) // half precision
1478  sp_norm_idx = face_idx + param.ghostNormOffset[3][0];
1479 #endif
1480 
1481 
1482  const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[3]);
1483 
1484  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1485  {
1492 
1493 
1494 
1495  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1496  const int t_proj_scale = TPROJSCALE;
1497 
1498  // read half spinor from device memory
1499  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1500 
1501  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1502  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1503  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1504  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1505  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1506  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1507 
1508 
1509  // identity gauge matrix
1516 
1517  o00_re += A0_re;
1518  o00_im += A0_im;
1519  o10_re += B0_re;
1520  o10_im += B0_im;
1521 
1522  o01_re += A1_re;
1523  o01_im += A1_im;
1524  o11_re += B1_re;
1525  o11_im += B1_im;
1526 
1527  o02_re += A2_re;
1528  o02_im += A2_im;
1529  o12_re += B2_re;
1530  o12_im += B2_im;
1531  } else {
1532  // read gauge matrix from device memory
1533  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
1534 
1541 
1542 
1543 
1544  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1545  const int t_proj_scale = TPROJSCALE;
1546 
1547  // read half spinor from device memory
1548  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1549 
1550  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1551  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1552  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1553  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1554  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1555  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1556 
1557 
1558  // reconstruct gauge matrix
1560 
1561  // multiply row 0
1562  spinorFloat A0_re = 0;
1563  A0_re += gT00_re * a0_re;
1564  A0_re -= gT00_im * a0_im;
1565  A0_re += gT01_re * a1_re;
1566  A0_re -= gT01_im * a1_im;
1567  A0_re += gT02_re * a2_re;
1568  A0_re -= gT02_im * a2_im;
1569  spinorFloat A0_im = 0;
1570  A0_im += gT00_re * a0_im;
1571  A0_im += gT00_im * a0_re;
1572  A0_im += gT01_re * a1_im;
1573  A0_im += gT01_im * a1_re;
1574  A0_im += gT02_re * a2_im;
1575  A0_im += gT02_im * a2_re;
1576  spinorFloat B0_re = 0;
1577  B0_re += gT00_re * b0_re;
1578  B0_re -= gT00_im * b0_im;
1579  B0_re += gT01_re * b1_re;
1580  B0_re -= gT01_im * b1_im;
1581  B0_re += gT02_re * b2_re;
1582  B0_re -= gT02_im * b2_im;
1583  spinorFloat B0_im = 0;
1584  B0_im += gT00_re * b0_im;
1585  B0_im += gT00_im * b0_re;
1586  B0_im += gT01_re * b1_im;
1587  B0_im += gT01_im * b1_re;
1588  B0_im += gT02_re * b2_im;
1589  B0_im += gT02_im * b2_re;
1590 
1591  // multiply row 1
1592  spinorFloat A1_re = 0;
1593  A1_re += gT10_re * a0_re;
1594  A1_re -= gT10_im * a0_im;
1595  A1_re += gT11_re * a1_re;
1596  A1_re -= gT11_im * a1_im;
1597  A1_re += gT12_re * a2_re;
1598  A1_re -= gT12_im * a2_im;
1599  spinorFloat A1_im = 0;
1600  A1_im += gT10_re * a0_im;
1601  A1_im += gT10_im * a0_re;
1602  A1_im += gT11_re * a1_im;
1603  A1_im += gT11_im * a1_re;
1604  A1_im += gT12_re * a2_im;
1605  A1_im += gT12_im * a2_re;
1606  spinorFloat B1_re = 0;
1607  B1_re += gT10_re * b0_re;
1608  B1_re -= gT10_im * b0_im;
1609  B1_re += gT11_re * b1_re;
1610  B1_re -= gT11_im * b1_im;
1611  B1_re += gT12_re * b2_re;
1612  B1_re -= gT12_im * b2_im;
1613  spinorFloat B1_im = 0;
1614  B1_im += gT10_re * b0_im;
1615  B1_im += gT10_im * b0_re;
1616  B1_im += gT11_re * b1_im;
1617  B1_im += gT11_im * b1_re;
1618  B1_im += gT12_re * b2_im;
1619  B1_im += gT12_im * b2_re;
1620 
1621  // multiply row 2
1622  spinorFloat A2_re = 0;
1623  A2_re += gT20_re * a0_re;
1624  A2_re -= gT20_im * a0_im;
1625  A2_re += gT21_re * a1_re;
1626  A2_re -= gT21_im * a1_im;
1627  A2_re += gT22_re * a2_re;
1628  A2_re -= gT22_im * a2_im;
1629  spinorFloat A2_im = 0;
1630  A2_im += gT20_re * a0_im;
1631  A2_im += gT20_im * a0_re;
1632  A2_im += gT21_re * a1_im;
1633  A2_im += gT21_im * a1_re;
1634  A2_im += gT22_re * a2_im;
1635  A2_im += gT22_im * a2_re;
1636  spinorFloat B2_re = 0;
1637  B2_re += gT20_re * b0_re;
1638  B2_re -= gT20_im * b0_im;
1639  B2_re += gT21_re * b1_re;
1640  B2_re -= gT21_im * b1_im;
1641  B2_re += gT22_re * b2_re;
1642  B2_re -= gT22_im * b2_im;
1643  spinorFloat B2_im = 0;
1644  B2_im += gT20_re * b0_im;
1645  B2_im += gT20_im * b0_re;
1646  B2_im += gT21_re * b1_im;
1647  B2_im += gT21_im * b1_re;
1648  B2_im += gT22_re * b2_im;
1649  B2_im += gT22_im * b2_re;
1650 
1651  o00_re += A0_re;
1652  o00_im += A0_im;
1653  o10_re += B0_re;
1654  o10_im += B0_im;
1655 
1656  o01_re += A1_re;
1657  o01_im += A1_im;
1658  o11_re += B1_re;
1659  o11_im += B1_im;
1660 
1661  o02_re += A2_re;
1662  o02_im += A2_im;
1663  o12_re += B2_re;
1664  o12_im += B2_im;
1665  }
1666 }
1667 
1668 {
1669 
1670 #ifdef DSLASH_XPAY
1671  READ_ACCUM(ACCUMTEX, param.sp_stride)
1673 
1674 #ifdef MDWF_mode
1675  coeff = (spinorFloat)(0.5*a/(mdwf_b5[coord[4]]*(m5+4.0) + 1.0));
1676 #else
1677  coeff = a;
1678 #endif
1679 
1680 #ifdef SPINOR_DOUBLE
1681  o00_re = coeff*o00_re + accum0.x;
1682  o00_im = coeff*o00_im + accum0.y;
1683  o01_re = coeff*o01_re + accum1.x;
1684  o01_im = coeff*o01_im + accum1.y;
1685  o02_re = coeff*o02_re + accum2.x;
1686  o02_im = coeff*o02_im + accum2.y;
1687  o10_re = coeff*o10_re + accum3.x;
1688  o10_im = coeff*o10_im + accum3.y;
1689  o11_re = coeff*o11_re + accum4.x;
1690  o11_im = coeff*o11_im + accum4.y;
1691  o12_re = coeff*o12_re + accum5.x;
1692  o12_im = coeff*o12_im + accum5.y;
1693  o20_re = coeff*o20_re + accum6.x;
1694  o20_im = coeff*o20_im + accum6.y;
1695  o21_re = coeff*o21_re + accum7.x;
1696  o21_im = coeff*o21_im + accum7.y;
1697  o22_re = coeff*o22_re + accum8.x;
1698  o22_im = coeff*o22_im + accum8.y;
1699  o30_re = coeff*o30_re + accum9.x;
1700  o30_im = coeff*o30_im + accum9.y;
1701  o31_re = coeff*o31_re + accum10.x;
1702  o31_im = coeff*o31_im + accum10.y;
1703  o32_re = coeff*o32_re + accum11.x;
1704  o32_im = coeff*o32_im + accum11.y;
1705 #else
1706  o00_re = coeff*o00_re + accum0.x;
1707  o00_im = coeff*o00_im + accum0.y;
1708  o01_re = coeff*o01_re + accum0.z;
1709  o01_im = coeff*o01_im + accum0.w;
1710  o02_re = coeff*o02_re + accum1.x;
1711  o02_im = coeff*o02_im + accum1.y;
1712  o10_re = coeff*o10_re + accum1.z;
1713  o10_im = coeff*o10_im + accum1.w;
1714  o11_re = coeff*o11_re + accum2.x;
1715  o11_im = coeff*o11_im + accum2.y;
1716  o12_re = coeff*o12_re + accum2.z;
1717  o12_im = coeff*o12_im + accum2.w;
1718  o20_re = coeff*o20_re + accum3.x;
1719  o20_im = coeff*o20_im + accum3.y;
1720  o21_re = coeff*o21_re + accum3.z;
1721  o21_im = coeff*o21_im + accum3.w;
1722  o22_re = coeff*o22_re + accum4.x;
1723  o22_im = coeff*o22_im + accum4.y;
1724  o30_re = coeff*o30_re + accum4.z;
1725  o30_im = coeff*o30_im + accum4.w;
1726  o31_re = coeff*o31_re + accum5.x;
1727  o31_im = coeff*o31_im + accum5.y;
1728  o32_re = coeff*o32_re + accum5.z;
1729  o32_im = coeff*o32_im + accum5.w;
1730 #endif // SPINOR_DOUBLE
1731 #endif // DSLASH_XPAY
1732 }
1733 
1734 // write spinor field back to device memory
1735 WRITE_SPINOR(param.sp_stride);
1736 
1737 // undefine to prevent warning when precision is changed
1738 #undef m5
1739 #undef mdwf_b5
1740 #undef mdwf_c5
1741 #undef mferm
1742 #undef a
1743 #undef b
1744 #undef spinorFloat
1745 #undef SHARED_STRIDE
1746 
1747 #undef g00_re
1748 #undef g00_im
1749 #undef g01_re
1750 #undef g01_im
1751 #undef g02_re
1752 #undef g02_im
1753 #undef g10_re
1754 #undef g10_im
1755 #undef g11_re
1756 #undef g11_im
1757 #undef g12_re
1758 #undef g12_im
1759 #undef g20_re
1760 #undef g20_im
1761 #undef g21_re
1762 #undef g21_im
1763 #undef g22_re
1764 #undef g22_im
1765 
1766 #undef i00_re
1767 #undef i00_im
1768 #undef i01_re
1769 #undef i01_im
1770 #undef i02_re
1771 #undef i02_im
1772 #undef i10_re
1773 #undef i10_im
1774 #undef i11_re
1775 #undef i11_im
1776 #undef i12_re
1777 #undef i12_im
1778 #undef i20_re
1779 #undef i20_im
1780 #undef i21_re
1781 #undef i21_im
1782 #undef i22_re
1783 #undef i22_im
1784 #undef i30_re
1785 #undef i30_im
1786 #undef i31_re
1787 #undef i31_im
1788 #undef i32_re
1789 #undef i32_im
1790 
1791 
1792 
1793 #undef VOLATILE
1794 #endif // MULTI_GPU
dim3 dim3 blockDim
float4 G2
VOLATILE spinorFloat o22_im
#define WRITE_SPINOR
VOLATILE spinorFloat o01_re
int sp_idx
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
#define m5
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating the a given offsetDim halo. This is used by the fused halo region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labelled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at en edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
VOLATILE spinorFloat o01_im
#define GAUGE0TEX
VOLATILE spinorFloat o20_re
QudaGaugeParam param
Definition: pack_test.cpp:17
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
float4 G1
float4 G0
VOLATILE spinorFloat o00_re
#define GAUGE1TEX
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o31_im
#define mdwf_b5
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o10_im
float4 G4
#define READ_SPINOR_GHOST
float4 G3
#define ASSN_GAUGE_MATRIX
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o31_re
#define RECONSTRUCT_GAUGE_MATRIX
#define INTERTEX
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_re
int face_idx
VOLATILE spinorFloat o21_re
const int face_num
#define TPROJSCALE
VOLATILE spinorFloat o00_im
#define GHOSTSPINORTEX
VOLATILE spinorFloat o12_im
#define a
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re