QUDA  0.9.0
dw_fused_exterior_dslash_core.h
Go to the documentation of this file.
// *** CUDA DSLASH ***

// Number of floats of shared memory this kernel stages per thread: zero --
// this fused exterior kernel keeps all spinor data in registers.
#define DSLASH_SHARED_FLOATS_PER_THREAD 0


#ifdef MULTI_GPU

// Compiler workaround: pre-4.1 CUDA toolchains needed 'volatile' on some
// register declarations; on newer toolchains VOLATILE expands to nothing.
#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif
// input spinor
// Map the input spinor components i{spin}{color}_{re,im} onto the raw load
// registers: twelve double2 registers I0..I11 in double precision, six
// float4 registers I0..I5 otherwise (two complex numbers per float4).
#ifdef SPINOR_DOUBLE
#define spinorFloat double
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I1.x
#define i01_im I1.y
#define i02_re I2.x
#define i02_im I2.y
#define i10_re I3.x
#define i10_im I3.y
#define i11_re I4.x
#define i11_im I4.y
#define i12_re I5.x
#define i12_im I5.y
#define i20_re I6.x
#define i20_im I6.y
#define i21_re I7.x
#define i21_im I7.y
#define i22_re I8.x
#define i22_im I8.y
#define i30_re I9.x
#define i30_im I9.y
#define i31_re I10.x
#define i31_im I10.y
#define i32_re I11.x
#define i32_im I11.y
#else
#define spinorFloat float
#define i00_re I0.x
#define i00_im I0.y
#define i01_re I0.z
#define i01_im I0.w
#define i02_re I1.x
#define i02_im I1.y
#define i10_re I1.z
#define i10_im I1.w
#define i11_re I2.x
#define i11_im I2.y
#define i12_re I2.z
#define i12_im I2.w
#define i20_re I3.x
#define i20_im I3.y
#define i21_re I3.z
#define i21_im I3.w
#define i22_re I4.x
#define i22_im I4.y
#define i30_re I4.z
#define i30_im I4.w
#define i31_re I5.x
#define i31_im I5.y
#define i32_re I5.z
#define i32_im I5.w
#endif // SPINOR_DOUBLE
67 
// gauge link
// Map the 3x3 gauge matrix elements g{row}{col}_{re,im} onto the raw load
// registers: nine FloatN2 registers G0..G8 when the gauge field is stored
// as float2/double2 pairs, five float4 registers G0..G4 otherwise.
#ifdef GAUGE_FLOAT2
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G1.x
#define g01_im G1.y
#define g02_re G2.x
#define g02_im G2.y
#define g10_re G3.x
#define g10_im G3.y
#define g11_re G4.x
#define g11_im G4.y
#define g12_re G5.x
#define g12_im G5.y
#define g20_re G6.x
#define g20_im G6.y
#define g21_re G7.x
#define g21_im G7.y
#define g22_re G8.x
#define g22_im G8.y

#else
#define g00_re G0.x
#define g00_im G0.y
#define g01_re G0.z
#define g01_im G0.w
#define g02_re G1.x
#define g02_im G1.y
#define g10_re G1.z
#define g10_im G1.w
#define g11_re G2.x
#define g11_im G2.y
#define g12_re G2.z
#define g12_im G2.w
#define g20_re G3.x
#define g20_im G3.y
#define g21_re G3.z
#define g21_im G3.w
#define g22_re G4.x
#define g22_im G4.y

#endif // GAUGE_FLOAT2
110 
// conjugated gauge link
// Hermitian conjugate of the link: gT{row}{col} = conj(g{col}{row}),
// i.e. real parts transposed, imaginary parts transposed and negated.
// Used when the link is applied in the dagger (backward) direction.
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
130 
// output spinor
// NOTE(review): the o{spin}{color}_{re,im} accumulator declarations that
// belong here (listing lines 132-155) are elided from this listing.

// Shared-memory stride (in elements) chosen per precision and architecture
// so that consecutive threads map to distinct shared-memory banks.
#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif
170 
#include "read_gauge.h"
#include "io_spinor.h"

#if (DD_PREC==2) // half precision
// half precision stores spinor norms separately; this indexes the ghost norm
int sp_norm_idx;
#endif // half precision

// Flatten the 2-d block / 2-d thread launch geometry into a single thread
// index covering param.threads face sites times the fifth dimension Ls.
int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
if (sid >= param.threads*param.dc.Ls) return;

int dim;      // which dimension's ghost zone this thread services
int face_idx; // index within that face
int coord[5]; // full 5-d coordinates of the site being updated
int X;        // linearized site index
int s_parity; // parity selector derived from sid (see below)
186 
187 
188 
{ // exterior kernel

 // Determine which dimension's face this thread belongs to; sid is rescaled
 // to be relative to the start of that dimension's thread range.
dim = dimFromFaceIndex<5>(sid, param); // sid is also modified

const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.dc.Ls >> 1);
const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
face_idx = sid - face_num*face_volume; // index into the respective face

// ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
// face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
//sp_idx = face_idx + param.ghostOffset[dim];

// Recover the 5-d coordinates and linear index X of the boundary site this
// face element updates (second template argument is the face dimension).
switch(dim) {
case 0:
 coordsFromFaceIndex<5,QUDA_5D_PC,0,1>(X, sid, coord, face_idx, face_num, param);
 break;
case 1:
 coordsFromFaceIndex<5,QUDA_5D_PC,1,1>(X, sid, coord, face_idx, face_num, param);
 break;
case 2:
 coordsFromFaceIndex<5,QUDA_5D_PC,2,1>(X, sid, coord, face_idx, face_num, param);
 break;
case 3:
 coordsFromFaceIndex<5,QUDA_5D_PC,3,1>(X, sid, coord, face_idx, face_num, param);
 break;
}

// Bail out unless at least one partitioned direction actually touches this
// site's boundary.
bool active = false;
for(int dir=0; dir<4; ++dir){
 active = active || isActive(dim,dir,+1,coord,param.commDim,param.dc.X);
}
if(!active) return;


// NOTE(review): presumably the parity of the fifth-dimension coordinate s
// (sid counts 4-d checkerboard sites per s slice) -- confirm against caller.
s_parity = ( sid/param.dc.volume_4d_cb ) % 2;

// Initialize the output accumulators from the spinor registers i## (the
// load that fills them -- listing line 225 -- is elided from this listing).
 o00_re = i00_re; o00_im = i00_im;
 o01_re = i01_re; o01_im = i01_im;
 o02_re = i02_re; o02_im = i02_im;
 o10_re = i10_re; o10_im = i10_im;
 o11_re = i11_re; o11_im = i11_im;
 o12_re = i12_re; o12_im = i12_im;
 o20_re = i20_re; o20_im = i20_im;
 o21_re = i21_re; o21_im = i21_im;
 o22_re = i22_re; o22_im = i22_im;
 o30_re = i30_re; o30_im = i30_im;
 o31_re = i31_re; o31_im = i31_im;
 o32_re = i32_re; o32_im = i32_im;
}
240 
// declare G## here and use ASSN below instead of READ
// Registers backing the g##_re/g##_im macros above; declared once so each
// per-direction branch can assign (ASSN) into them rather than re-declare.
#ifdef GAUGE_FLOAT2
#if (DD_PREC==0) //temporal hack
double2 G0;
double2 G1;
double2 G2;
double2 G3;
double2 G4;
double2 G5;
double2 G6;
double2 G7;
double2 G8;
#else
float2 G0;
float2 G1;
float2 G2;
float2 G3;
float2 G4;
float2 G5;
float2 G6;
float2 G7;
float2 G8;
#endif
#else
float4 G0;
float4 G1;
float4 G2;
float4 G3;
float4 G4;
#endif
271 
272 
273 
// Forward X hop: update a site on the x = X[0]-1 boundary using the
// spin-projected halo spinor received from the +x neighbor.
if (isActive(dim,0,+1,coord,param.commDim,param.dc.X) && coord[0]==(param.dc.X[0]-1) )
{
 // Projector P0-
 // 1 0 0 -i
 // 0 1 -i 0
 // 0 i 1 0
 // i 0 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,0,param);
 // ghostOffset[0][1]: ghost zone of dimension 0, forward (upper) face
 const int sp_idx = face_idx + param.ghostOffset[0][1];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[0][1];
#endif


 // forward link lives on the local site: 4-d checkerboard index
 const int ga_idx = sid % param.dc.volume_4d_cb;

 // read gauge matrix from device memory
 // NOTE(review): texture choice flips with s_parity -- presumably the two
 // textures hold the two gauge-field parities; confirm against the caller.
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[0];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);

 // halo spinor is already spin-projected: copy its two spin rows into the
 // working half-spinor (a = spin row 0, b = spin row 1; color index 0..2)
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // reconstruct the full spinor from the projected halves and accumulate;
 // the sign/swap pattern matches the P0- matrix above (spin 2 = +i*spin 1,
 // spin 3 = +i*spin 0 on the projected result)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= B0_im;
 o20_im += B0_re;
 o30_re -= A0_im;
 o30_im += A0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= B1_im;
 o21_im += B1_re;
 o31_re -= A1_im;
 o31_im += A1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= B2_im;
 o22_im += B2_re;
 o32_re -= A2_im;
 o32_im += A2_re;
}
438 
// Backward X hop: update a site on the x = 0 boundary using the halo spinor
// from the -x neighbor; the link is applied daggered (gT = conj-transpose).
if (isActive(dim,0,-1,coord,param.commDim,param.dc.X) && coord[0]==0 )
{
 // Projector P0+
 // 1 0 0 i
 // 0 1 i 0
 // 0 -i 1 0
 // -i 0 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,0,param);
 // ghostOffset[0][0]: ghost zone of dimension 0, backward (lower) face
 const int sp_idx = face_idx + param.ghostOffset[0][0];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[0][0];
#endif


 // backward link belongs to the neighbor: read from the ghost gauge region
 // appended after the local 4-d checkerboard volume
 const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[0]);

 // read gauge matrix from device memory (texture choice is the opposite of
 // the forward hop since the link sits on the neighboring site)
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[0];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);

 // copy the spin-projected halo into the working half-spinor
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // accumulate with the P0+ reconstruction (signs opposite to the P0- case)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += B0_im;
 o20_im -= B0_re;
 o30_re += A0_im;
 o30_im -= A0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += B1_im;
 o21_im -= B1_re;
 o31_re += A1_im;
 o31_im -= A1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += B2_im;
 o22_im -= B2_re;
 o32_re += A2_im;
 o32_im -= A2_re;
}
603 
// Forward Y hop: update a site on the y = X[1]-1 boundary using the halo
// spinor received from the +y neighbor.
if (isActive(dim,1,+1,coord,param.commDim,param.dc.X) && coord[1]==(param.dc.X[1]-1) )
{
 // Projector P1-
 // 1 0 0 -1
 // 0 1 1 0
 // 0 1 1 0
 // -1 0 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,1,param);
 // ghostOffset[1][1]: ghost zone of dimension 1, forward (upper) face
 const int sp_idx = face_idx + param.ghostOffset[1][1];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[1][1];
#endif


 // forward link lives on the local site: 4-d checkerboard index
 const int ga_idx = sid % param.dc.volume_4d_cb;

 // read gauge matrix from device memory (parity selects the texture)
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[1];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);

 // copy the spin-projected halo into the working half-spinor
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // accumulate with the P1- reconstruction (real mixing, no factors of i)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += B0_re;
 o20_im += B0_im;
 o30_re -= A0_re;
 o30_im -= A0_im;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += B1_re;
 o21_im += B1_im;
 o31_re -= A1_re;
 o31_im -= A1_im;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += B2_re;
 o22_im += B2_im;
 o32_re -= A2_re;
 o32_im -= A2_im;
}
768 
// Backward Y hop: update a site on the y = 0 boundary using the halo spinor
// from the -y neighbor; the link is applied daggered (gT = conj-transpose).
if (isActive(dim,1,-1,coord,param.commDim,param.dc.X) && coord[1]==0 )
{
 // Projector P1+
 // 1 0 0 1
 // 0 1 -1 0
 // 0 -1 1 0
 // 1 0 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,1,param);
 // ghostOffset[1][0]: ghost zone of dimension 1, backward (lower) face
 const int sp_idx = face_idx + param.ghostOffset[1][0];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[1][0];
#endif


 // backward link belongs to the neighbor: read from the ghost gauge region
 const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[1]);

 // read gauge matrix from device memory (opposite texture vs. forward hop)
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[1];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);

 // copy the spin-projected halo into the working half-spinor
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // accumulate with the P1+ reconstruction (signs opposite to the P1- case)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= B0_re;
 o20_im -= B0_im;
 o30_re += A0_re;
 o30_im += A0_im;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= B1_re;
 o21_im -= B1_im;
 o31_re += A1_re;
 o31_im += A1_im;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= B2_re;
 o22_im -= B2_im;
 o32_re += A2_re;
 o32_im += A2_im;
}
933 
// Forward Z hop: update a site on the z = X[2]-1 boundary using the halo
// spinor received from the +z neighbor.
if (isActive(dim,2,+1,coord,param.commDim,param.dc.X) && coord[2]==(param.dc.X[2]-1) )
{
 // Projector P2-
 // 1 0 -i 0
 // 0 1 0 i
 // i 0 1 0
 // 0 -i 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,2,param);
 // ghostOffset[2][1]: ghost zone of dimension 2, forward (upper) face
 const int sp_idx = face_idx + param.ghostOffset[2][1];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[2][1];
#endif


 // forward link lives on the local site: 4-d checkerboard index
 const int ga_idx = sid % param.dc.volume_4d_cb;

 // read gauge matrix from device memory (parity selects the texture)
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[2];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);

 // copy the spin-projected halo into the working half-spinor
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += g00_re * a0_re;
 A0_re -= g00_im * a0_im;
 A0_re += g01_re * a1_re;
 A0_re -= g01_im * a1_im;
 A0_re += g02_re * a2_re;
 A0_re -= g02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += g00_re * a0_im;
 A0_im += g00_im * a0_re;
 A0_im += g01_re * a1_im;
 A0_im += g01_im * a1_re;
 A0_im += g02_re * a2_im;
 A0_im += g02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += g00_re * b0_re;
 B0_re -= g00_im * b0_im;
 B0_re += g01_re * b1_re;
 B0_re -= g01_im * b1_im;
 B0_re += g02_re * b2_re;
 B0_re -= g02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += g00_re * b0_im;
 B0_im += g00_im * b0_re;
 B0_im += g01_re * b1_im;
 B0_im += g01_im * b1_re;
 B0_im += g02_re * b2_im;
 B0_im += g02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += g10_re * a0_re;
 A1_re -= g10_im * a0_im;
 A1_re += g11_re * a1_re;
 A1_re -= g11_im * a1_im;
 A1_re += g12_re * a2_re;
 A1_re -= g12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += g10_re * a0_im;
 A1_im += g10_im * a0_re;
 A1_im += g11_re * a1_im;
 A1_im += g11_im * a1_re;
 A1_im += g12_re * a2_im;
 A1_im += g12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += g10_re * b0_re;
 B1_re -= g10_im * b0_im;
 B1_re += g11_re * b1_re;
 B1_re -= g11_im * b1_im;
 B1_re += g12_re * b2_re;
 B1_re -= g12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += g10_re * b0_im;
 B1_im += g10_im * b0_re;
 B1_im += g11_re * b1_im;
 B1_im += g11_im * b1_re;
 B1_im += g12_re * b2_im;
 B1_im += g12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += g20_re * a0_re;
 A2_re -= g20_im * a0_im;
 A2_re += g21_re * a1_re;
 A2_re -= g21_im * a1_im;
 A2_re += g22_re * a2_re;
 A2_re -= g22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += g20_re * a0_im;
 A2_im += g20_im * a0_re;
 A2_im += g21_re * a1_im;
 A2_im += g21_im * a1_re;
 A2_im += g22_re * a2_im;
 A2_im += g22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += g20_re * b0_re;
 B2_re -= g20_im * b0_im;
 B2_re += g21_re * b1_re;
 B2_re -= g21_im * b1_im;
 B2_re += g22_re * b2_re;
 B2_re -= g22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += g20_re * b0_im;
 B2_im += g20_im * b0_re;
 B2_im += g21_re * b1_im;
 B2_im += g21_im * b1_re;
 B2_im += g22_re * b2_im;
 B2_im += g22_im * b2_re;

 // accumulate with the P2- reconstruction (spin 2 mixes with A, spin 3
 // with B, via factors of +/- i as in the matrix above)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re -= A0_im;
 o20_im += A0_re;
 o30_re += B0_im;
 o30_im -= B0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re -= A1_im;
 o21_im += A1_re;
 o31_re += B1_im;
 o31_im -= B1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re -= A2_im;
 o22_im += A2_re;
 o32_re += B2_im;
 o32_im -= B2_re;
}
1098 
// Backward Z hop: update a site on the z = 0 boundary using the halo spinor
// from the -z neighbor; the link is applied daggered (gT = conj-transpose).
if (isActive(dim,2,-1,coord,param.commDim,param.dc.X) && coord[2]==0 )
{
 // Projector P2+
 // 1 0 i 0
 // 0 1 0 -i
 // -i 0 1 0
 // 0 i 0 1

 faceIndexFromCoords<5,1>(face_idx,coord,2,param);
 // ghostOffset[2][0]: ghost zone of dimension 2, backward (lower) face
 const int sp_idx = face_idx + param.ghostOffset[2][0];
#if (DD_PREC==2) // half precision
 sp_norm_idx = face_idx + param.ghostNormOffset[2][0];
#endif


 // backward link belongs to the neighbor: read from the ghost gauge region
 const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[2]);

 // read gauge matrix from device memory (opposite texture vs. forward hop)
 if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride); }
 else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, param.gauge_stride); }


 {

 const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[2];

 // read half spinor from device memory
 READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);

 // copy the spin-projected halo into the working half-spinor
 a0_re = i00_re; a0_im = i00_im;
 a1_re = i01_re; a1_im = i01_im;
 a2_re = i02_re; a2_im = i02_im;
 b0_re = i10_re; b0_im = i10_im;
 b1_re = i11_re; b1_im = i11_im;
 b2_re = i12_re; b2_im = i12_im;

 }

 // reconstruct gauge matrix (RECONSTRUCT macro line elided in this listing)

 // multiply row 0
 spinorFloat A0_re = 0;
 A0_re += gT00_re * a0_re;
 A0_re -= gT00_im * a0_im;
 A0_re += gT01_re * a1_re;
 A0_re -= gT01_im * a1_im;
 A0_re += gT02_re * a2_re;
 A0_re -= gT02_im * a2_im;
 spinorFloat A0_im = 0;
 A0_im += gT00_re * a0_im;
 A0_im += gT00_im * a0_re;
 A0_im += gT01_re * a1_im;
 A0_im += gT01_im * a1_re;
 A0_im += gT02_re * a2_im;
 A0_im += gT02_im * a2_re;
 spinorFloat B0_re = 0;
 B0_re += gT00_re * b0_re;
 B0_re -= gT00_im * b0_im;
 B0_re += gT01_re * b1_re;
 B0_re -= gT01_im * b1_im;
 B0_re += gT02_re * b2_re;
 B0_re -= gT02_im * b2_im;
 spinorFloat B0_im = 0;
 B0_im += gT00_re * b0_im;
 B0_im += gT00_im * b0_re;
 B0_im += gT01_re * b1_im;
 B0_im += gT01_im * b1_re;
 B0_im += gT02_re * b2_im;
 B0_im += gT02_im * b2_re;

 // multiply row 1
 spinorFloat A1_re = 0;
 A1_re += gT10_re * a0_re;
 A1_re -= gT10_im * a0_im;
 A1_re += gT11_re * a1_re;
 A1_re -= gT11_im * a1_im;
 A1_re += gT12_re * a2_re;
 A1_re -= gT12_im * a2_im;
 spinorFloat A1_im = 0;
 A1_im += gT10_re * a0_im;
 A1_im += gT10_im * a0_re;
 A1_im += gT11_re * a1_im;
 A1_im += gT11_im * a1_re;
 A1_im += gT12_re * a2_im;
 A1_im += gT12_im * a2_re;
 spinorFloat B1_re = 0;
 B1_re += gT10_re * b0_re;
 B1_re -= gT10_im * b0_im;
 B1_re += gT11_re * b1_re;
 B1_re -= gT11_im * b1_im;
 B1_re += gT12_re * b2_re;
 B1_re -= gT12_im * b2_im;
 spinorFloat B1_im = 0;
 B1_im += gT10_re * b0_im;
 B1_im += gT10_im * b0_re;
 B1_im += gT11_re * b1_im;
 B1_im += gT11_im * b1_re;
 B1_im += gT12_re * b2_im;
 B1_im += gT12_im * b2_re;

 // multiply row 2
 spinorFloat A2_re = 0;
 A2_re += gT20_re * a0_re;
 A2_re -= gT20_im * a0_im;
 A2_re += gT21_re * a1_re;
 A2_re -= gT21_im * a1_im;
 A2_re += gT22_re * a2_re;
 A2_re -= gT22_im * a2_im;
 spinorFloat A2_im = 0;
 A2_im += gT20_re * a0_im;
 A2_im += gT20_im * a0_re;
 A2_im += gT21_re * a1_im;
 A2_im += gT21_im * a1_re;
 A2_im += gT22_re * a2_im;
 A2_im += gT22_im * a2_re;
 spinorFloat B2_re = 0;
 B2_re += gT20_re * b0_re;
 B2_re -= gT20_im * b0_im;
 B2_re += gT21_re * b1_re;
 B2_re -= gT21_im * b1_im;
 B2_re += gT22_re * b2_re;
 B2_re -= gT22_im * b2_im;
 spinorFloat B2_im = 0;
 B2_im += gT20_re * b0_im;
 B2_im += gT20_im * b0_re;
 B2_im += gT21_re * b1_im;
 B2_im += gT21_im * b1_re;
 B2_im += gT22_re * b2_im;
 B2_im += gT22_im * b2_re;

 // accumulate with the P2+ reconstruction (signs opposite to the P2- case)
 o00_re += A0_re;
 o00_im += A0_im;
 o10_re += B0_re;
 o10_im += B0_im;
 o20_re += A0_im;
 o20_im -= A0_re;
 o30_re -= B0_im;
 o30_im += B0_re;

 o01_re += A1_re;
 o01_im += A1_im;
 o11_re += B1_re;
 o11_im += B1_im;
 o21_re += A1_im;
 o21_im -= A1_re;
 o31_re -= B1_im;
 o31_im += B1_re;

 o02_re += A2_re;
 o02_im += A2_im;
 o12_re += B2_re;
 o12_im += B2_im;
 o22_re += A2_im;
 o22_im -= A2_re;
 o32_re -= B2_im;
 o32_im += B2_re;
}
1263 
1264 if (isActive(dim,3,+1,coord,param.commDim,param.dc.X) && coord[3]==(param.dc.X[3]-1) )
1265 {
1266  // Projector P3-
1267  // 0 0 0 0
1268  // 0 0 0 0
1269  // 0 0 2 0
1270  // 0 0 0 2
1271 
 // Exterior (halo) contribution from the forward T face: this site sits on
 // the last local t-slice, so its +t neighbour spinor comes from the ghost
 // zone received from the neighbouring rank (ghostOffset[3][1] = forward).
1272  faceIndexFromCoords<5,1>(face_idx,coord,3,param);
1273  const int sp_idx = face_idx + param.ghostOffset[3][1];
1274 #if (DD_PREC==2) // half precision
1275  sp_norm_idx = face_idx + param.ghostNormOffset[3][1];
1276 #endif
1277 
1278 
 // 4-d checkerboard index of the gauge link for this 5-d site
1279  const int ga_idx = sid % param.dc.volume_4d_cb;
1280 
 // Temporal-gauge-fixed shortcut: links away from the boundary slice are
 // unit matrices, so the SU(3) multiply can be skipped entirely.
1281  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1282  {
 // NOTE(review): the listing skips generated lines 1283-1288 and 1308-1313
 // here; presumably they declare A0..B2 and copy a0..b2 into them for the
 // identity-link case -- confirm against the code generator output.
1289 
1290  {
1291 
1292  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1293  const int t_proj_scale = TPROJSCALE;
1294 
1295  // read half spinor from device memory
1296  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1297 
 // scale the projected half-spinor by the temporal projection factor
 // (TPROJSCALE; declared int here -- assumes an integral macro value)
1298  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1299  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1300  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1301  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1302  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1303  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1304 
1305  }
1306 
1307  // identity gauge matrix
1314 
 // P3- has support only on the lower two spin components: accumulate there
1315  o20_re += A0_re;
1316  o20_im += A0_im;
1317  o30_re += B0_re;
1318  o30_im += B0_im;
1319 
1320  o21_re += A1_re;
1321  o21_im += A1_im;
1322  o31_re += B1_re;
1323  o31_im += B1_im;
1324 
1325  o22_re += A2_re;
1326  o22_im += A2_im;
1327  o32_re += B2_re;
1328  o32_im += B2_im;
1329  } else {
1330  // read gauge matrix from device memory
1331  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride); }
1332  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, param.gauge_stride); }
1333 
1340 
1341  {
1342 
1343  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1344  const int t_proj_scale = TPROJSCALE;
1345 
1346  // read half spinor from device memory
1347  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1348 
1349  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1350  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1351  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1352  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1353  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1354  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1355 
1356  }
1357 
1358  // reconstruct gauge matrix
1360 
 // Complex 3x3 matrix-vector multiply, (A,B) = U * (a,b); forward link,
 // so the un-daggered matrix elements g??_re/g??_im are used.
1361  // multiply row 0
1362  spinorFloat A0_re = 0;
1363  A0_re += g00_re * a0_re;
1364  A0_re -= g00_im * a0_im;
1365  A0_re += g01_re * a1_re;
1366  A0_re -= g01_im * a1_im;
1367  A0_re += g02_re * a2_re;
1368  A0_re -= g02_im * a2_im;
1369  spinorFloat A0_im = 0;
1370  A0_im += g00_re * a0_im;
1371  A0_im += g00_im * a0_re;
1372  A0_im += g01_re * a1_im;
1373  A0_im += g01_im * a1_re;
1374  A0_im += g02_re * a2_im;
1375  A0_im += g02_im * a2_re;
1376  spinorFloat B0_re = 0;
1377  B0_re += g00_re * b0_re;
1378  B0_re -= g00_im * b0_im;
1379  B0_re += g01_re * b1_re;
1380  B0_re -= g01_im * b1_im;
1381  B0_re += g02_re * b2_re;
1382  B0_re -= g02_im * b2_im;
1383  spinorFloat B0_im = 0;
1384  B0_im += g00_re * b0_im;
1385  B0_im += g00_im * b0_re;
1386  B0_im += g01_re * b1_im;
1387  B0_im += g01_im * b1_re;
1388  B0_im += g02_re * b2_im;
1389  B0_im += g02_im * b2_re;
1390 
1391  // multiply row 1
1392  spinorFloat A1_re = 0;
1393  A1_re += g10_re * a0_re;
1394  A1_re -= g10_im * a0_im;
1395  A1_re += g11_re * a1_re;
1396  A1_re -= g11_im * a1_im;
1397  A1_re += g12_re * a2_re;
1398  A1_re -= g12_im * a2_im;
1399  spinorFloat A1_im = 0;
1400  A1_im += g10_re * a0_im;
1401  A1_im += g10_im * a0_re;
1402  A1_im += g11_re * a1_im;
1403  A1_im += g11_im * a1_re;
1404  A1_im += g12_re * a2_im;
1405  A1_im += g12_im * a2_re;
1406  spinorFloat B1_re = 0;
1407  B1_re += g10_re * b0_re;
1408  B1_re -= g10_im * b0_im;
1409  B1_re += g11_re * b1_re;
1410  B1_re -= g11_im * b1_im;
1411  B1_re += g12_re * b2_re;
1412  B1_re -= g12_im * b2_im;
1413  spinorFloat B1_im = 0;
1414  B1_im += g10_re * b0_im;
1415  B1_im += g10_im * b0_re;
1416  B1_im += g11_re * b1_im;
1417  B1_im += g11_im * b1_re;
1418  B1_im += g12_re * b2_im;
1419  B1_im += g12_im * b2_re;
1420 
1421  // multiply row 2
1422  spinorFloat A2_re = 0;
1423  A2_re += g20_re * a0_re;
1424  A2_re -= g20_im * a0_im;
1425  A2_re += g21_re * a1_re;
1426  A2_re -= g21_im * a1_im;
1427  A2_re += g22_re * a2_re;
1428  A2_re -= g22_im * a2_im;
1429  spinorFloat A2_im = 0;
1430  A2_im += g20_re * a0_im;
1431  A2_im += g20_im * a0_re;
1432  A2_im += g21_re * a1_im;
1433  A2_im += g21_im * a1_re;
1434  A2_im += g22_re * a2_im;
1435  A2_im += g22_im * a2_re;
1436  spinorFloat B2_re = 0;
1437  B2_re += g20_re * b0_re;
1438  B2_re -= g20_im * b0_im;
1439  B2_re += g21_re * b1_re;
1440  B2_re -= g21_im * b1_im;
1441  B2_re += g22_re * b2_re;
1442  B2_re -= g22_im * b2_im;
1443  spinorFloat B2_im = 0;
1444  B2_im += g20_re * b0_im;
1445  B2_im += g20_im * b0_re;
1446  B2_im += g21_re * b1_im;
1447  B2_im += g21_im * b1_re;
1448  B2_im += g22_re * b2_im;
1449  B2_im += g22_im * b2_re;
1450 
 // accumulate into the lower two spin components (P3- support only)
1451  o20_re += A0_re;
1452  o20_im += A0_im;
1453  o30_re += B0_re;
1454  o30_im += B0_im;
1455 
1456  o21_re += A1_re;
1457  o21_im += A1_im;
1458  o31_re += B1_re;
1459  o31_im += B1_im;
1460 
1461  o22_re += A2_re;
1462  o22_im += A2_im;
1463  o32_re += B2_re;
1464  o32_im += B2_im;
1465  }
1466 }
1467 
1468 if (isActive(dim,3,-1,coord,param.commDim,param.dc.X) && coord[3]==0 )
1469 {
1470  // Projector P3+
1471  // 2 0 0 0
1472  // 0 2 0 0
1473  // 0 0 0 0
1474  // 0 0 0 0
1475 
 // Exterior (halo) contribution from the backward T face: this site sits on
 // the first local t-slice, so its -t neighbour spinor comes from the ghost
 // zone received from the neighbouring rank (ghostOffset[3][0] = backward).
1476  faceIndexFromCoords<5,1>(face_idx,coord,3,param);
1477  const int sp_idx = face_idx + param.ghostOffset[3][0];
1478 #if (DD_PREC==2) // half precision
1479  sp_norm_idx = face_idx + param.ghostNormOffset[3][0];
1480 #endif
1481 
1482 
 // backward link lives on the neighbouring rank, so the gauge field is
 // indexed into its ghost region past volume_4d_cb
1483  const int ga_idx = param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[3]);
1484 
 // Temporal-gauge-fixed shortcut: links away from the boundary slice are
 // unit matrices, so the SU(3) multiply can be skipped entirely.
1485  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1486  {
 // NOTE(review): the listing skips generated lines 1487-1492 and 1512-1517
 // here; presumably they declare A0..B2 and copy a0..b2 into them for the
 // identity-link case -- confirm against the code generator output.
1493 
1494  {
1495 
1496  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1497  const int t_proj_scale = TPROJSCALE;
1498 
1499  // read half spinor from device memory
1500  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1501 
 // scale the projected half-spinor by the temporal projection factor
1502  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1503  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1504  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1505  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1506  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1507  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1508 
1509  }
1510 
1511  // identity gauge matrix
1518 
 // P3+ has support only on the upper two spin components: accumulate there
1519  o00_re += A0_re;
1520  o00_im += A0_im;
1521  o10_re += B0_re;
1522  o10_im += B0_im;
1523 
1524  o01_re += A1_re;
1525  o01_im += A1_im;
1526  o11_re += B1_re;
1527  o11_im += B1_im;
1528 
1529  o02_re += A2_re;
1530  o02_im += A2_im;
1531  o12_re += B2_re;
1532  o12_im += B2_im;
1533  } else {
1534  // read gauge matrix from device memory
 // note: texture parity is swapped relative to the forward face because
 // the backward link belongs to the neighbouring (opposite-parity) site
1535  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride); }
1536  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, param.gauge_stride); }
1537 
1544 
1545  {
1546 
1547  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[3];
1548  const int t_proj_scale = TPROJSCALE;
1549 
1550  // read half spinor from device memory
1551  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1552 
1553  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1554  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1555  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1556  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1557  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1558  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1559 
1560  }
1561 
1562  // reconstruct gauge matrix
1564 
 // Complex 3x3 matrix-vector multiply, (A,B) = U^dagger * (a,b); backward
 // link, so the daggered matrix elements gT??_re/gT??_im are used.
1565  // multiply row 0
1566  spinorFloat A0_re = 0;
1567  A0_re += gT00_re * a0_re;
1568  A0_re -= gT00_im * a0_im;
1569  A0_re += gT01_re * a1_re;
1570  A0_re -= gT01_im * a1_im;
1571  A0_re += gT02_re * a2_re;
1572  A0_re -= gT02_im * a2_im;
1573  spinorFloat A0_im = 0;
1574  A0_im += gT00_re * a0_im;
1575  A0_im += gT00_im * a0_re;
1576  A0_im += gT01_re * a1_im;
1577  A0_im += gT01_im * a1_re;
1578  A0_im += gT02_re * a2_im;
1579  A0_im += gT02_im * a2_re;
1580  spinorFloat B0_re = 0;
1581  B0_re += gT00_re * b0_re;
1582  B0_re -= gT00_im * b0_im;
1583  B0_re += gT01_re * b1_re;
1584  B0_re -= gT01_im * b1_im;
1585  B0_re += gT02_re * b2_re;
1586  B0_re -= gT02_im * b2_im;
1587  spinorFloat B0_im = 0;
1588  B0_im += gT00_re * b0_im;
1589  B0_im += gT00_im * b0_re;
1590  B0_im += gT01_re * b1_im;
1591  B0_im += gT01_im * b1_re;
1592  B0_im += gT02_re * b2_im;
1593  B0_im += gT02_im * b2_re;
1594 
1595  // multiply row 1
1596  spinorFloat A1_re = 0;
1597  A1_re += gT10_re * a0_re;
1598  A1_re -= gT10_im * a0_im;
1599  A1_re += gT11_re * a1_re;
1600  A1_re -= gT11_im * a1_im;
1601  A1_re += gT12_re * a2_re;
1602  A1_re -= gT12_im * a2_im;
1603  spinorFloat A1_im = 0;
1604  A1_im += gT10_re * a0_im;
1605  A1_im += gT10_im * a0_re;
1606  A1_im += gT11_re * a1_im;
1607  A1_im += gT11_im * a1_re;
1608  A1_im += gT12_re * a2_im;
1609  A1_im += gT12_im * a2_re;
1610  spinorFloat B1_re = 0;
1611  B1_re += gT10_re * b0_re;
1612  B1_re -= gT10_im * b0_im;
1613  B1_re += gT11_re * b1_re;
1614  B1_re -= gT11_im * b1_im;
1615  B1_re += gT12_re * b2_re;
1616  B1_re -= gT12_im * b2_im;
1617  spinorFloat B1_im = 0;
1618  B1_im += gT10_re * b0_im;
1619  B1_im += gT10_im * b0_re;
1620  B1_im += gT11_re * b1_im;
1621  B1_im += gT11_im * b1_re;
1622  B1_im += gT12_re * b2_im;
1623  B1_im += gT12_im * b2_re;
1624 
1625  // multiply row 2
1626  spinorFloat A2_re = 0;
1627  A2_re += gT20_re * a0_re;
1628  A2_re -= gT20_im * a0_im;
1629  A2_re += gT21_re * a1_re;
1630  A2_re -= gT21_im * a1_im;
1631  A2_re += gT22_re * a2_re;
1632  A2_re -= gT22_im * a2_im;
1633  spinorFloat A2_im = 0;
1634  A2_im += gT20_re * a0_im;
1635  A2_im += gT20_im * a0_re;
1636  A2_im += gT21_re * a1_im;
1637  A2_im += gT21_im * a1_re;
1638  A2_im += gT22_re * a2_im;
1639  A2_im += gT22_im * a2_re;
1640  spinorFloat B2_re = 0;
1641  B2_re += gT20_re * b0_re;
1642  B2_re -= gT20_im * b0_im;
1643  B2_re += gT21_re * b1_re;
1644  B2_re -= gT21_im * b1_im;
1645  B2_re += gT22_re * b2_re;
1646  B2_re -= gT22_im * b2_im;
1647  spinorFloat B2_im = 0;
1648  B2_im += gT20_re * b0_im;
1649  B2_im += gT20_im * b0_re;
1650  B2_im += gT21_re * b1_im;
1651  B2_im += gT21_im * b1_re;
1652  B2_im += gT22_re * b2_im;
1653  B2_im += gT22_im * b2_re;
1654 
 // accumulate into the upper two spin components (P3+ support only)
1655  o00_re += A0_re;
1656  o00_im += A0_im;
1657  o10_re += B0_re;
1658  o10_im += B0_im;
1659 
1660  o01_re += A1_re;
1661  o01_im += A1_im;
1662  o11_re += B1_re;
1663  o11_im += B1_im;
1664 
1665  o02_re += A2_re;
1666  o02_im += A2_im;
1667  o12_re += B2_re;
1668  o12_im += B2_im;
1669  }
1670 }
1671 
1672 {
1673 
 // XPAY epilogue: when DSLASH_XPAY is enabled, rescale the accumulated
 // output spinor by the coefficient a (precision-matched field of param).
 // NOTE(review): the two SPINOR_DOUBLE branches below are byte-identical;
 // the generator emits them per-precision even though only the coefficient
 // selection above actually differs.
1674 #ifdef DSLASH_XPAY
1675 #ifdef SPINOR_DOUBLE
1676  spinorFloat a = param.a;
1677 #else
1678  spinorFloat a = param.a_f;
1679 #endif
1680 #ifdef SPINOR_DOUBLE
1681  o00_re = a*o00_re;
1682  o00_im = a*o00_im;
1683  o01_re = a*o01_re;
1684  o01_im = a*o01_im;
1685  o02_re = a*o02_re;
1686  o02_im = a*o02_im;
1687  o10_re = a*o10_re;
1688  o10_im = a*o10_im;
1689  o11_re = a*o11_re;
1690  o11_im = a*o11_im;
1691  o12_re = a*o12_re;
1692  o12_im = a*o12_im;
1693  o20_re = a*o20_re;
1694  o20_im = a*o20_im;
1695  o21_re = a*o21_re;
1696  o21_im = a*o21_im;
1697  o22_re = a*o22_re;
1698  o22_im = a*o22_im;
1699  o30_re = a*o30_re;
1700  o30_im = a*o30_im;
1701  o31_re = a*o31_re;
1702  o31_im = a*o31_im;
1703  o32_re = a*o32_re;
1704  o32_im = a*o32_im;
1705 #else
1706  o00_re = a*o00_re;
1707  o00_im = a*o00_im;
1708  o01_re = a*o01_re;
1709  o01_im = a*o01_im;
1710  o02_re = a*o02_re;
1711  o02_im = a*o02_im;
1712  o10_re = a*o10_re;
1713  o10_im = a*o10_im;
1714  o11_re = a*o11_re;
1715  o11_im = a*o11_im;
1716  o12_re = a*o12_re;
1717  o12_im = a*o12_im;
1718  o20_re = a*o20_re;
1719  o20_im = a*o20_im;
1720  o21_re = a*o21_re;
1721  o21_im = a*o21_im;
1722  o22_re = a*o22_re;
1723  o22_im = a*o22_im;
1724  o30_re = a*o30_re;
1725  o30_im = a*o30_im;
1726  o31_re = a*o31_re;
1727  o31_im = a*o31_im;
1728  o32_re = a*o32_re;
1729  o32_im = a*o32_im;
1730 #endif // SPINOR_DOUBLE
1731 
1732 #endif // DSLASH_XPAY
1733 }
1734 
1735 // write spinor field back to device memory
1736 WRITE_SPINOR(param.sp_stride);
1737 
 // Cleanup: this core file is #included repeatedly with different precision
 // and gauge-layout macros, so every alias defined above must be undefined
 // to avoid redefinition warnings on the next inclusion.
1738 // undefine to prevent warning when precision is changed
1739 #undef spinorFloat
1740 #undef SHARED_STRIDE
1741 
1742 #undef g00_re
1743 #undef g00_im
1744 #undef g01_re
1745 #undef g01_im
1746 #undef g02_re
1747 #undef g02_im
1748 #undef g10_re
1749 #undef g10_im
1750 #undef g11_re
1751 #undef g11_im
1752 #undef g12_re
1753 #undef g12_im
1754 #undef g20_re
1755 #undef g20_im
1756 #undef g21_re
1757 #undef g21_im
1758 #undef g22_re
1759 #undef g22_im
1760 
1761 #undef i00_re
1762 #undef i00_im
1763 #undef i01_re
1764 #undef i01_im
1765 #undef i02_re
1766 #undef i02_im
1767 #undef i10_re
1768 #undef i10_im
1769 #undef i11_re
1770 #undef i11_im
1771 #undef i12_re
1772 #undef i12_im
1773 #undef i20_re
1774 #undef i20_im
1775 #undef i21_re
1776 #undef i21_im
1777 #undef i22_re
1778 #undef i22_im
1779 #undef i30_re
1780 #undef i30_im
1781 #undef i31_re
1782 #undef i31_im
1783 #undef i32_re
1784 #undef i32_im
1785 
1786 
1787 
1788 #undef VOLATILE
1789 
1790 #endif // MULTI_GPU
int s_parity
dim3 dim3 blockDim
float4 G2
VOLATILE spinorFloat o22_im
#define WRITE_SPINOR
VOLATILE spinorFloat o01_re
int sp_idx
static __inline__ dim3 dim3 void size_t cudaStream_t int dim
static __device__ bool isActive(const int threadDim, int offsetDim, int offset, const int y[], const int partitioned[], const T X[])
Compute whether this thread should be active for updating a given offsetDim halo. This is used by the fused halo-region update kernels: here every thread has a prescribed dimension it is tasked with updating, but for the edges and vertices, the thread responsible for the entire update is the "greatest" one. Hence some threads may be labeled as a given dimension, but they have to update other dimensions too. Conversely, a given thread may be labeled for a given dimension, but if that thread lies at an edge or vertex, and we have partitioned a higher dimension, then that thread will cede to the higher thread.
VOLATILE spinorFloat o01_im
#define GAUGE0TEX
VOLATILE spinorFloat o20_re
QudaGaugeParam param
Definition: pack_test.cpp:17
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o02_re
VOLATILE spinorFloat o02_im
float4 G1
float4 G0
VOLATILE spinorFloat o00_re
#define GAUGE1TEX
VOLATILE spinorFloat o20_im
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o32_im
VOLATILE spinorFloat o31_im
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o10_im
float4 G4
#define READ_SPINOR_GHOST
float4 G3
#define ASSN_GAUGE_MATRIX
VOLATILE spinorFloat o11_im
VOLATILE spinorFloat o31_re
#define RECONSTRUCT_GAUGE_MATRIX
#define INTERTEX
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_re
int face_idx
VOLATILE spinorFloat o21_re
const int face_num
#define TPROJSCALE
VOLATILE spinorFloat o00_im
#define GHOSTSPINORTEX
VOLATILE spinorFloat o12_im
#define a
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o30_im
VOLATILE spinorFloat o32_re
VOLATILE spinorFloat o22_re
VOLATILE spinorFloat o30_re