QUDA  0.9.0
dw_dslash4_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
// NOTE(review): auto-generated kernel-body fragment (included into a dslash
// kernel); the number at the start of each line is the generator's listing index.
2 
// This kernel variant stages nothing in shared memory (value 0).
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 
// VOLATILE expands to nothing on CUDA >= 4.1; presumably a register-handling
// workaround was needed on older toolkits -- TODO confirm against QUDA history.
6 #if (CUDA_VERSION >= 4010)
7 #define VOLATILE
8 #else
9 #define VOLATILE volatile
10 #endif
11 // input spinor
// Maps the 12 complex spinor components i<spin><color>_{re,im} onto the
// vector registers I0..I11 (double) or I0..I5 (float) filled by READ_SPINOR.
12 #ifdef SPINOR_DOUBLE
// Double precision: one complex component per double2 register.
13 #define spinorFloat double
14 // workaround for C++11 bug in CUDA 6.5/7.0
15 #if CUDA_VERSION >= 6050 && CUDA_VERSION < 7050
16 #define POW(a, b) pow(a, static_cast<spinorFloat>(b))
17 #else
18 #define POW(a, b) pow(a, b)
19 #endif
20 
21 #define i00_re I0.x
22 #define i00_im I0.y
23 #define i01_re I1.x
24 #define i01_im I1.y
25 #define i02_re I2.x
26 #define i02_im I2.y
27 #define i10_re I3.x
28 #define i10_im I3.y
29 #define i11_re I4.x
30 #define i11_im I4.y
31 #define i12_re I5.x
32 #define i12_im I5.y
33 #define i20_re I6.x
34 #define i20_im I6.y
35 #define i21_re I7.x
36 #define i21_im I7.y
37 #define i22_re I8.x
38 #define i22_im I8.y
39 #define i30_re I9.x
40 #define i30_im I9.y
41 #define i31_re I10.x
42 #define i31_im I10.y
43 #define i32_re I11.x
44 #define i32_im I11.y
// Precision-specific parameter aliases (5th-dimension mass, Mobius b5/c5, ...).
45 #define m5 param.m5_d
46 #define mdwf_b5 param.mdwf_b5_d
47 #define mdwf_c5 param.mdwf_c5_d
// NOTE(review): double branch uses param.mferm while the float branch below
// uses param.mferm_f -- presumably intended; confirm against the param struct.
48 #define mferm param.mferm
49 #define a param.a
50 #define b param.b
51 #else
// Single precision: two complex components packed per float4 register.
52 #define spinorFloat float
53 #define POW(a, b) __fast_pow(a, b)
54 #define i00_re I0.x
55 #define i00_im I0.y
56 #define i01_re I0.z
57 #define i01_im I0.w
58 #define i02_re I1.x
59 #define i02_im I1.y
60 #define i10_re I1.z
61 #define i10_im I1.w
62 #define i11_re I2.x
63 #define i11_im I2.y
64 #define i12_re I2.z
65 #define i12_im I2.w
66 #define i20_re I3.x
67 #define i20_im I3.y
68 #define i21_re I3.z
69 #define i21_im I3.w
70 #define i22_re I4.x
71 #define i22_im I4.y
72 #define i30_re I4.z
73 #define i30_im I4.w
74 #define i31_re I5.x
75 #define i31_im I5.y
76 #define i32_re I5.z
77 #define i32_im I5.w
78 #define m5 param.m5_f
79 #define mdwf_b5 param.mdwf_b5_f
80 #define mdwf_c5 param.mdwf_c5_f
81 #define mferm param.mferm_f
82 #define a param.a
83 #define b param.b
84 #endif // SPINOR_DOUBLE
85 
86 // gauge link
// Maps the 3x3 complex link entries g<row><col>_{re,im} onto the registers
// G0.. filled by ASSN_GAUGE_MATRIX below.
87 #ifdef GAUGE_FLOAT2
// FLOAT2 layout: one complex entry per G register (G0..G8).
88 #define g00_re G0.x
89 #define g00_im G0.y
90 #define g01_re G1.x
91 #define g01_im G1.y
92 #define g02_re G2.x
93 #define g02_im G2.y
94 #define g10_re G3.x
95 #define g10_im G3.y
96 #define g11_re G4.x
97 #define g11_im G4.y
98 #define g12_re G5.x
99 #define g12_im G5.y
100 #define g20_re G6.x
101 #define g20_im G6.y
102 #define g21_re G7.x
103 #define g21_im G7.y
104 #define g22_re G8.x
105 #define g22_im G8.y
106 
107 #else
// FLOAT4 layout: two complex entries packed per G register (G0..G4).
108 #define g00_re G0.x
109 #define g00_im G0.y
110 #define g01_re G0.z
111 #define g01_im G0.w
112 #define g02_re G1.x
113 #define g02_im G1.y
114 #define g10_re G1.z
115 #define g10_im G1.w
116 #define g11_re G2.x
117 #define g11_im G2.y
118 #define g12_re G2.z
119 #define g12_im G2.w
120 #define g20_re G3.x
121 #define g20_im G3.y
122 #define g21_re G3.z
123 #define g21_im G3.w
124 #define g22_re G4.x
125 #define g22_im G4.y
126 
127 #endif // GAUGE_FLOAT2
128 
129 // conjugated gauge link
// gT is the Hermitian conjugate of g: entries transposed (row<->col) and
// imaginary parts negated.  Used for the backward (negative-direction) hops.
130 #define gT00_re (+g00_re)
131 #define gT00_im (-g00_im)
132 #define gT01_re (+g10_re)
133 #define gT01_im (-g10_im)
134 #define gT02_re (+g20_re)
135 #define gT02_im (-g20_im)
136 #define gT10_re (+g01_re)
137 #define gT10_im (-g01_im)
138 #define gT11_re (+g11_re)
139 #define gT11_im (-g11_im)
140 #define gT12_re (+g21_re)
141 #define gT12_im (-g21_im)
142 #define gT20_re (+g02_re)
143 #define gT20_im (-g02_im)
144 #define gT21_re (+g12_re)
145 #define gT21_im (-g12_im)
146 #define gT22_re (+g22_re)
147 #define gT22_im (-g22_im)
148 
149 // output spinor
// NOTE(review): listing lines 150-173 (the output-spinor accumulator
// declarations o00_re .. o32_im used throughout the body below) are elided
// from this listing view.
174 
// Stride (in elements) between per-thread spinor slots, chosen so a warp's
// accesses spread across shared-memory banks on each architecture.
175 #ifdef SPINOR_DOUBLE
176 #if (__COMPUTE_CAPABILITY__ >= 200)
177 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
178 #else
179 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
180 #endif
181 #else
182 #if (__COMPUTE_CAPABILITY__ >= 200)
183 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
184 #else
185 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
186 #endif
187 #endif
188 
189 #include "read_gauge.h"
190 #include "io_spinor.h"
191 
// Flattened 1D site index over the (4d checkerboard volume) x Ls worth of
// threads; guard against the padded tail of the launch grid.
192 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
193 if (sid >= param.threads*param.dc.Ls) return;
194 
195 
196 int X, coord[5];
197 
199 if (kernel_type == INTERIOR_KERNEL) {
200 
201 
202 
203 
204  // Assume even dimensions
205  coordsFromIndex<5,QUDA_4D_PC,EVEN_X>(X, coord, sid, param);
206 
// Interior pass starts from a zeroed output accumulator.
207  o00_re = 0; o00_im = 0;
208  o01_re = 0; o01_im = 0;
209  o02_re = 0; o02_im = 0;
210  o10_re = 0; o10_im = 0;
211  o11_re = 0; o11_im = 0;
212  o12_re = 0; o12_im = 0;
213  o20_re = 0; o20_im = 0;
214  o21_re = 0; o21_im = 0;
215  o22_re = 0; o22_im = 0;
216  o30_re = 0; o30_im = 0;
217  o31_re = 0; o31_im = 0;
218  o32_re = 0; o32_im = 0;
219 
220 } else { // exterior kernel
221 
222 const int face_volume = (param.threads*param.dc.Ls >> 1); // volume of one face
223 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
224 face_idx = sid - face_num*face_volume; // index into the respective face
225 
226 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
227 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
228 //sp_idx = face_idx + param.ghostOffset[dim];
229 
231 
// Exterior pass seeds the accumulator from the partial result held in the
// I registers.  NOTE(review): the load that fills I0..I11 before this copy
// (listing lines ~230-232) is elided from this listing view.
233  o00_re = i00_re; o00_im = i00_im;
234  o01_re = i01_re; o01_im = i01_im;
235  o02_re = i02_re; o02_im = i02_im;
236  o10_re = i10_re; o10_im = i10_im;
237  o11_re = i11_re; o11_im = i11_im;
238  o12_re = i12_re; o12_im = i12_im;
239  o20_re = i20_re; o20_im = i20_im;
240  o21_re = i21_re; o21_im = i21_im;
241  o22_re = i22_re; o22_im = i22_im;
242  o30_re = i30_re; o30_im = i30_im;
243  o31_re = i31_re; o31_im = i31_im;
244  o32_re = i32_re; o32_im = i32_im;
245 }
246 
247 // declare G## here and use ASSN below instead of READ
// Declaring the link registers once up front lets each direction block assign
// into them (ASSN_GAUGE_MATRIX) rather than redeclare them per hop.
248 #ifdef GAUGE_FLOAT2
249 #if (DD_PREC==0) // temporary hack: use double2 storage only in full double precision
250 double2 G0;
251 double2 G1;
252 double2 G2;
253 double2 G3;
254 double2 G4;
255 double2 G5;
256 double2 G6;
257 double2 G7;
258 double2 G8;
259 #else
260 float2 G0;
261 float2 G1;
262 float2 G2;
263 float2 G3;
264 float2 G4;
265 float2 G5;
266 float2 G6;
267 float2 G7;
268 float2 G8;
269 #endif
270 #else
271 float4 G0;
272 float4 G1;
273 float4 G2;
274 float4 G3;
275 float4 G4;
276 #endif
277 
278 
279 
// ---- +X neighbor contribution (dagger projector P0+) ----
// Run for interior sites with an on-node +X neighbor, or in the X exterior
// kernel for sites on the +X boundary.
280 #ifdef MULTI_GPU
281 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
282  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
283 #endif
284 {
285  // Projector P0+
286  // 1 0 0 i
287  // 0 1 i 0
288  // 0 -i 1 0
289  // -i 0 0 1
290 
// Checkerboard index of the +X neighbor (wraps in X for the interior), or a
// ghost-zone offset in the exterior kernel.
291 #ifdef MULTI_GPU
292  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
293  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
294 #if (DD_PREC==2) // half precision
295  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
296 #endif
297 #else
298  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
299 #endif
300 
// Gauge field is 4-d: strip the 5th-dimension part of the site index.
301  const int ga_idx = sid % param.dc.volume_4d_cb;
302 
303  // read gauge matrix from device memory
304  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride);
305 
// NOTE(review): gauge-reconstruction code (listing lines 305-311) and the
// accumulator declarations `spinorFloat A0_re = 0;` etc. (the odd listing
// lines 353-443) are elided from this listing view; compare the -X block
// below, where they are visible.
312 
313 #ifdef MULTI_GPU
314  if (kernel_type == INTERIOR_KERNEL) {
315 #endif
316 
317  // read spinor from device memory
318  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
319 
320  // project spinor into half spinors
321  a0_re = +i00_re-i30_im;
322  a0_im = +i00_im+i30_re;
323  a1_re = +i01_re-i31_im;
324  a1_im = +i01_im+i31_re;
325  a2_re = +i02_re-i32_im;
326  a2_im = +i02_im+i32_re;
327  b0_re = +i10_re-i20_im;
328  b0_im = +i10_im+i20_re;
329  b1_re = +i11_re-i21_im;
330  b1_im = +i11_im+i21_re;
331  b2_re = +i12_re-i22_im;
332  b2_im = +i12_im+i22_re;
333 
334 #ifdef MULTI_GPU
335  } else {
336 
337  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
338 
339  // read half spinor from device memory
340  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
341 
// Ghost data is already spin-projected: copy the half spinor straight through.
342  a0_re = i00_re; a0_im = i00_im;
343  a1_re = i01_re; a1_im = i01_im;
344  a2_re = i02_re; a2_im = i02_im;
345  b0_re = i10_re; b0_im = i10_im;
346  b1_re = i11_re; b1_im = i11_im;
347  b2_re = i12_re; b2_im = i12_im;
348 
349  }
350 #endif // MULTI_GPU
351 
352  // reconstruct gauge matrix
354 
// Complex 3x3 matrix-vector products: (A,B) = U * (a,b).
355  // multiply row 0
357  A0_re += g00_re * a0_re;
358  A0_re -= g00_im * a0_im;
359  A0_re += g01_re * a1_re;
360  A0_re -= g01_im * a1_im;
361  A0_re += g02_re * a2_re;
362  A0_re -= g02_im * a2_im;
364  A0_im += g00_re * a0_im;
365  A0_im += g00_im * a0_re;
366  A0_im += g01_re * a1_im;
367  A0_im += g01_im * a1_re;
368  A0_im += g02_re * a2_im;
369  A0_im += g02_im * a2_re;
371  B0_re += g00_re * b0_re;
372  B0_re -= g00_im * b0_im;
373  B0_re += g01_re * b1_re;
374  B0_re -= g01_im * b1_im;
375  B0_re += g02_re * b2_re;
376  B0_re -= g02_im * b2_im;
378  B0_im += g00_re * b0_im;
379  B0_im += g00_im * b0_re;
380  B0_im += g01_re * b1_im;
381  B0_im += g01_im * b1_re;
382  B0_im += g02_re * b2_im;
383  B0_im += g02_im * b2_re;
384 
385  // multiply row 1
387  A1_re += g10_re * a0_re;
388  A1_re -= g10_im * a0_im;
389  A1_re += g11_re * a1_re;
390  A1_re -= g11_im * a1_im;
391  A1_re += g12_re * a2_re;
392  A1_re -= g12_im * a2_im;
394  A1_im += g10_re * a0_im;
395  A1_im += g10_im * a0_re;
396  A1_im += g11_re * a1_im;
397  A1_im += g11_im * a1_re;
398  A1_im += g12_re * a2_im;
399  A1_im += g12_im * a2_re;
401  B1_re += g10_re * b0_re;
402  B1_re -= g10_im * b0_im;
403  B1_re += g11_re * b1_re;
404  B1_re -= g11_im * b1_im;
405  B1_re += g12_re * b2_re;
406  B1_re -= g12_im * b2_im;
408  B1_im += g10_re * b0_im;
409  B1_im += g10_im * b0_re;
410  B1_im += g11_re * b1_im;
411  B1_im += g11_im * b1_re;
412  B1_im += g12_re * b2_im;
413  B1_im += g12_im * b2_re;
414 
415  // multiply row 2
417  A2_re += g20_re * a0_re;
418  A2_re -= g20_im * a0_im;
419  A2_re += g21_re * a1_re;
420  A2_re -= g21_im * a1_im;
421  A2_re += g22_re * a2_re;
422  A2_re -= g22_im * a2_im;
424  A2_im += g20_re * a0_im;
425  A2_im += g20_im * a0_re;
426  A2_im += g21_re * a1_im;
427  A2_im += g21_im * a1_re;
428  A2_im += g22_re * a2_im;
429  A2_im += g22_im * a2_re;
431  B2_re += g20_re * b0_re;
432  B2_re -= g20_im * b0_im;
433  B2_re += g21_re * b1_re;
434  B2_re -= g21_im * b1_im;
435  B2_re += g22_re * b2_re;
436  B2_re -= g22_im * b2_im;
438  B2_im += g20_re * b0_im;
439  B2_im += g20_im * b0_re;
440  B2_im += g21_re * b1_im;
441  B2_im += g21_im * b1_re;
442  B2_im += g22_re * b2_im;
443  B2_im += g22_im * b2_re;
444 
// Expand the half spinor back to four spin components (P0+ structure) and
// accumulate into the output registers.
445  o00_re += A0_re;
446  o00_im += A0_im;
447  o10_re += B0_re;
448  o10_im += B0_im;
449  o20_re += B0_im;
450  o20_im -= B0_re;
451  o30_re += A0_im;
452  o30_im -= A0_re;
453 
454  o01_re += A1_re;
455  o01_im += A1_im;
456  o11_re += B1_re;
457  o11_im += B1_im;
458  o21_re += B1_im;
459  o21_im -= B1_re;
460  o31_re += A1_im;
461  o31_im -= A1_re;
462 
463  o02_re += A2_re;
464  o02_im += A2_im;
465  o12_re += B2_re;
466  o12_im += B2_im;
467  o22_re += B2_im;
468  o22_im -= B2_re;
469  o32_re += A2_im;
470  o32_im -= A2_re;
471 }
472 
// ---- -X neighbor contribution (dagger projector P0-) ----
// Uses the conjugated link gT = U^dagger stored at the -X neighbor.
473 #ifdef MULTI_GPU
474 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
475  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
476 #endif
477 {
478  // Projector P0-
479  // 1 0 0 -i
480  // 0 1 -i 0
481  // 0 i 1 0
482  // i 0 0 1
483 
// Checkerboard index of the -X neighbor (wraps in X), or a ghost-zone offset.
484 #ifdef MULTI_GPU
485  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
486  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
487 #if (DD_PREC==2) // half precision
488  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
489 #endif
490 #else
491  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
492 #endif
493 
// Backward hop: the link lives at the neighbor site (or in the gauge ghost
// zone when the neighbor is off-node).
494 #ifdef MULTI_GPU
495  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
496 #else
497  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
498 #endif
499 
500  // read gauge matrix from device memory
501  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride);
502 
// NOTE(review): gauge-reconstruction code (listing lines 503-508) is elided
// from this listing view.
509 
510 #ifdef MULTI_GPU
511  if (kernel_type == INTERIOR_KERNEL) {
512 #endif
513 
514  // read spinor from device memory
515  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
516 
517  // project spinor into half spinors
518  a0_re = +i00_re+i30_im;
519  a0_im = +i00_im-i30_re;
520  a1_re = +i01_re+i31_im;
521  a1_im = +i01_im-i31_re;
522  a2_re = +i02_re+i32_im;
523  a2_im = +i02_im-i32_re;
524  b0_re = +i10_re+i20_im;
525  b0_im = +i10_im-i20_re;
526  b1_re = +i11_re+i21_im;
527  b1_im = +i11_im-i21_re;
528  b2_re = +i12_re+i22_im;
529  b2_im = +i12_im-i22_re;
530 
531 #ifdef MULTI_GPU
532  } else {
533 
534  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
535 
536  // read half spinor from device memory
537  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
538 
// Ghost data is already spin-projected: copy the half spinor straight through.
539  a0_re = i00_re; a0_im = i00_im;
540  a1_re = i01_re; a1_im = i01_im;
541  a2_re = i02_re; a2_im = i02_im;
542  b0_re = i10_re; b0_im = i10_im;
543  b1_re = i11_re; b1_im = i11_im;
544  b2_re = i12_re; b2_im = i12_im;
545 
546  }
547 #endif // MULTI_GPU
548 
549  // reconstruct gauge matrix
551 
// Complex 3x3 matrix-vector products with the conjugated link:
// (A,B) = U^dagger * (a,b).
552  // multiply row 0
553  spinorFloat A0_re = 0;
554  A0_re += gT00_re * a0_re;
555  A0_re -= gT00_im * a0_im;
556  A0_re += gT01_re * a1_re;
557  A0_re -= gT01_im * a1_im;
558  A0_re += gT02_re * a2_re;
559  A0_re -= gT02_im * a2_im;
560  spinorFloat A0_im = 0;
561  A0_im += gT00_re * a0_im;
562  A0_im += gT00_im * a0_re;
563  A0_im += gT01_re * a1_im;
564  A0_im += gT01_im * a1_re;
565  A0_im += gT02_re * a2_im;
566  A0_im += gT02_im * a2_re;
567  spinorFloat B0_re = 0;
568  B0_re += gT00_re * b0_re;
569  B0_re -= gT00_im * b0_im;
570  B0_re += gT01_re * b1_re;
571  B0_re -= gT01_im * b1_im;
572  B0_re += gT02_re * b2_re;
573  B0_re -= gT02_im * b2_im;
574  spinorFloat B0_im = 0;
575  B0_im += gT00_re * b0_im;
576  B0_im += gT00_im * b0_re;
577  B0_im += gT01_re * b1_im;
578  B0_im += gT01_im * b1_re;
579  B0_im += gT02_re * b2_im;
580  B0_im += gT02_im * b2_re;
581 
582  // multiply row 1
583  spinorFloat A1_re = 0;
584  A1_re += gT10_re * a0_re;
585  A1_re -= gT10_im * a0_im;
586  A1_re += gT11_re * a1_re;
587  A1_re -= gT11_im * a1_im;
588  A1_re += gT12_re * a2_re;
589  A1_re -= gT12_im * a2_im;
590  spinorFloat A1_im = 0;
591  A1_im += gT10_re * a0_im;
592  A1_im += gT10_im * a0_re;
593  A1_im += gT11_re * a1_im;
594  A1_im += gT11_im * a1_re;
595  A1_im += gT12_re * a2_im;
596  A1_im += gT12_im * a2_re;
597  spinorFloat B1_re = 0;
598  B1_re += gT10_re * b0_re;
599  B1_re -= gT10_im * b0_im;
600  B1_re += gT11_re * b1_re;
601  B1_re -= gT11_im * b1_im;
602  B1_re += gT12_re * b2_re;
603  B1_re -= gT12_im * b2_im;
604  spinorFloat B1_im = 0;
605  B1_im += gT10_re * b0_im;
606  B1_im += gT10_im * b0_re;
607  B1_im += gT11_re * b1_im;
608  B1_im += gT11_im * b1_re;
609  B1_im += gT12_re * b2_im;
610  B1_im += gT12_im * b2_re;
611 
612  // multiply row 2
613  spinorFloat A2_re = 0;
614  A2_re += gT20_re * a0_re;
615  A2_re -= gT20_im * a0_im;
616  A2_re += gT21_re * a1_re;
617  A2_re -= gT21_im * a1_im;
618  A2_re += gT22_re * a2_re;
619  A2_re -= gT22_im * a2_im;
620  spinorFloat A2_im = 0;
621  A2_im += gT20_re * a0_im;
622  A2_im += gT20_im * a0_re;
623  A2_im += gT21_re * a1_im;
624  A2_im += gT21_im * a1_re;
625  A2_im += gT22_re * a2_im;
626  A2_im += gT22_im * a2_re;
627  spinorFloat B2_re = 0;
628  B2_re += gT20_re * b0_re;
629  B2_re -= gT20_im * b0_im;
630  B2_re += gT21_re * b1_re;
631  B2_re -= gT21_im * b1_im;
632  B2_re += gT22_re * b2_re;
633  B2_re -= gT22_im * b2_im;
634  spinorFloat B2_im = 0;
635  B2_im += gT20_re * b0_im;
636  B2_im += gT20_im * b0_re;
637  B2_im += gT21_re * b1_im;
638  B2_im += gT21_im * b1_re;
639  B2_im += gT22_re * b2_im;
640  B2_im += gT22_im * b2_re;
641 
// Expand back to four spin components (P0- structure) and accumulate.
642  o00_re += A0_re;
643  o00_im += A0_im;
644  o10_re += B0_re;
645  o10_im += B0_im;
646  o20_re -= B0_im;
647  o20_im += B0_re;
648  o30_re -= A0_im;
649  o30_im += A0_re;
650 
651  o01_re += A1_re;
652  o01_im += A1_im;
653  o11_re += B1_re;
654  o11_im += B1_im;
655  o21_re -= B1_im;
656  o21_im += B1_re;
657  o31_re -= A1_im;
658  o31_im += A1_re;
659 
660  o02_re += A2_re;
661  o02_im += A2_im;
662  o12_re += B2_re;
663  o12_im += B2_im;
664  o22_re -= B2_im;
665  o22_im += B2_re;
666  o32_re -= A2_im;
667  o32_im += A2_re;
668 }
669 
// ---- +Y neighbor contribution (dagger projector P1+) ----
670 #ifdef MULTI_GPU
671 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
672  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
673 #endif
674 {
675  // Projector P1+
676  // 1 0 0 1
677  // 0 1 -1 0
678  // 0 -1 1 0
679  // 1 0 0 1
680 
// Checkerboard index of the +Y neighbor (wraps in Y), or a ghost-zone offset.
681 #ifdef MULTI_GPU
682  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
683  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
684 #if (DD_PREC==2) // half precision
685  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
686 #endif
687 #else
688  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
689 #endif
690 
// Gauge field is 4-d: strip the 5th-dimension part of the site index.
691  const int ga_idx = sid % param.dc.volume_4d_cb;
692 
693  // read gauge matrix from device memory
694  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride);
695 
// NOTE(review): gauge-reconstruction code (listing lines 696-701) is elided
// from this listing view.
702 
703 #ifdef MULTI_GPU
704  if (kernel_type == INTERIOR_KERNEL) {
705 #endif
706 
707  // read spinor from device memory
708  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
709 
710  // project spinor into half spinors
711  a0_re = +i00_re+i30_re;
712  a0_im = +i00_im+i30_im;
713  a1_re = +i01_re+i31_re;
714  a1_im = +i01_im+i31_im;
715  a2_re = +i02_re+i32_re;
716  a2_im = +i02_im+i32_im;
717  b0_re = +i10_re-i20_re;
718  b0_im = +i10_im-i20_im;
719  b1_re = +i11_re-i21_re;
720  b1_im = +i11_im-i21_im;
721  b2_re = +i12_re-i22_re;
722  b2_im = +i12_im-i22_im;
723 
724 #ifdef MULTI_GPU
725  } else {
726 
727  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
728 
729  // read half spinor from device memory
730  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
731 
// Ghost data is already spin-projected: copy the half spinor straight through.
732  a0_re = i00_re; a0_im = i00_im;
733  a1_re = i01_re; a1_im = i01_im;
734  a2_re = i02_re; a2_im = i02_im;
735  b0_re = i10_re; b0_im = i10_im;
736  b1_re = i11_re; b1_im = i11_im;
737  b2_re = i12_re; b2_im = i12_im;
738 
739  }
740 #endif // MULTI_GPU
741 
742  // reconstruct gauge matrix
744 
// Complex 3x3 matrix-vector products: (A,B) = U * (a,b).
745  // multiply row 0
746  spinorFloat A0_re = 0;
747  A0_re += g00_re * a0_re;
748  A0_re -= g00_im * a0_im;
749  A0_re += g01_re * a1_re;
750  A0_re -= g01_im * a1_im;
751  A0_re += g02_re * a2_re;
752  A0_re -= g02_im * a2_im;
753  spinorFloat A0_im = 0;
754  A0_im += g00_re * a0_im;
755  A0_im += g00_im * a0_re;
756  A0_im += g01_re * a1_im;
757  A0_im += g01_im * a1_re;
758  A0_im += g02_re * a2_im;
759  A0_im += g02_im * a2_re;
760  spinorFloat B0_re = 0;
761  B0_re += g00_re * b0_re;
762  B0_re -= g00_im * b0_im;
763  B0_re += g01_re * b1_re;
764  B0_re -= g01_im * b1_im;
765  B0_re += g02_re * b2_re;
766  B0_re -= g02_im * b2_im;
767  spinorFloat B0_im = 0;
768  B0_im += g00_re * b0_im;
769  B0_im += g00_im * b0_re;
770  B0_im += g01_re * b1_im;
771  B0_im += g01_im * b1_re;
772  B0_im += g02_re * b2_im;
773  B0_im += g02_im * b2_re;
774 
775  // multiply row 1
776  spinorFloat A1_re = 0;
777  A1_re += g10_re * a0_re;
778  A1_re -= g10_im * a0_im;
779  A1_re += g11_re * a1_re;
780  A1_re -= g11_im * a1_im;
781  A1_re += g12_re * a2_re;
782  A1_re -= g12_im * a2_im;
783  spinorFloat A1_im = 0;
784  A1_im += g10_re * a0_im;
785  A1_im += g10_im * a0_re;
786  A1_im += g11_re * a1_im;
787  A1_im += g11_im * a1_re;
788  A1_im += g12_re * a2_im;
789  A1_im += g12_im * a2_re;
790  spinorFloat B1_re = 0;
791  B1_re += g10_re * b0_re;
792  B1_re -= g10_im * b0_im;
793  B1_re += g11_re * b1_re;
794  B1_re -= g11_im * b1_im;
795  B1_re += g12_re * b2_re;
796  B1_re -= g12_im * b2_im;
797  spinorFloat B1_im = 0;
798  B1_im += g10_re * b0_im;
799  B1_im += g10_im * b0_re;
800  B1_im += g11_re * b1_im;
801  B1_im += g11_im * b1_re;
802  B1_im += g12_re * b2_im;
803  B1_im += g12_im * b2_re;
804 
805  // multiply row 2
806  spinorFloat A2_re = 0;
807  A2_re += g20_re * a0_re;
808  A2_re -= g20_im * a0_im;
809  A2_re += g21_re * a1_re;
810  A2_re -= g21_im * a1_im;
811  A2_re += g22_re * a2_re;
812  A2_re -= g22_im * a2_im;
813  spinorFloat A2_im = 0;
814  A2_im += g20_re * a0_im;
815  A2_im += g20_im * a0_re;
816  A2_im += g21_re * a1_im;
817  A2_im += g21_im * a1_re;
818  A2_im += g22_re * a2_im;
819  A2_im += g22_im * a2_re;
820  spinorFloat B2_re = 0;
821  B2_re += g20_re * b0_re;
822  B2_re -= g20_im * b0_im;
823  B2_re += g21_re * b1_re;
824  B2_re -= g21_im * b1_im;
825  B2_re += g22_re * b2_re;
826  B2_re -= g22_im * b2_im;
827  spinorFloat B2_im = 0;
828  B2_im += g20_re * b0_im;
829  B2_im += g20_im * b0_re;
830  B2_im += g21_re * b1_im;
831  B2_im += g21_im * b1_re;
832  B2_im += g22_re * b2_im;
833  B2_im += g22_im * b2_re;
834 
// Expand back to four spin components (P1+ structure) and accumulate.
835  o00_re += A0_re;
836  o00_im += A0_im;
837  o10_re += B0_re;
838  o10_im += B0_im;
839  o20_re -= B0_re;
840  o20_im -= B0_im;
841  o30_re += A0_re;
842  o30_im += A0_im;
843 
844  o01_re += A1_re;
845  o01_im += A1_im;
846  o11_re += B1_re;
847  o11_im += B1_im;
848  o21_re -= B1_re;
849  o21_im -= B1_im;
850  o31_re += A1_re;
851  o31_im += A1_im;
852 
853  o02_re += A2_re;
854  o02_im += A2_im;
855  o12_re += B2_re;
856  o12_im += B2_im;
857  o22_re -= B2_re;
858  o22_im -= B2_im;
859  o32_re += A2_re;
860  o32_im += A2_im;
861 }
862 
// ---- -Y neighbor contribution (dagger projector P1-) ----
// Uses the conjugated link gT = U^dagger stored at the -Y neighbor.
863 #ifdef MULTI_GPU
864 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
865  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
866 #endif
867 {
868  // Projector P1-
869  // 1 0 0 -1
870  // 0 1 1 0
871  // 0 1 1 0
872  // -1 0 0 1
873 
// Checkerboard index of the -Y neighbor (wraps in Y), or a ghost-zone offset.
874 #ifdef MULTI_GPU
875  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
876  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
877 #if (DD_PREC==2) // half precision
878  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
879 #endif
880 #else
881  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
882 #endif
883 
// Backward hop: the link lives at the neighbor site (or in the gauge ghost
// zone when the neighbor is off-node).
884 #ifdef MULTI_GPU
885  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
886 #else
887  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
888 #endif
889 
890  // read gauge matrix from device memory
891  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride);
892 
// NOTE(review): gauge-reconstruction code (listing lines 893-898) is elided
// from this listing view.
899 
900 #ifdef MULTI_GPU
901  if (kernel_type == INTERIOR_KERNEL) {
902 #endif
903 
904  // read spinor from device memory
905  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
906 
907  // project spinor into half spinors
908  a0_re = +i00_re-i30_re;
909  a0_im = +i00_im-i30_im;
910  a1_re = +i01_re-i31_re;
911  a1_im = +i01_im-i31_im;
912  a2_re = +i02_re-i32_re;
913  a2_im = +i02_im-i32_im;
914  b0_re = +i10_re+i20_re;
915  b0_im = +i10_im+i20_im;
916  b1_re = +i11_re+i21_re;
917  b1_im = +i11_im+i21_im;
918  b2_re = +i12_re+i22_re;
919  b2_im = +i12_im+i22_im;
920 
921 #ifdef MULTI_GPU
922  } else {
923 
924  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
925 
926  // read half spinor from device memory
927  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
928 
// Ghost data is already spin-projected: copy the half spinor straight through.
929  a0_re = i00_re; a0_im = i00_im;
930  a1_re = i01_re; a1_im = i01_im;
931  a2_re = i02_re; a2_im = i02_im;
932  b0_re = i10_re; b0_im = i10_im;
933  b1_re = i11_re; b1_im = i11_im;
934  b2_re = i12_re; b2_im = i12_im;
935 
936  }
937 #endif // MULTI_GPU
938 
939  // reconstruct gauge matrix
941 
// Complex 3x3 matrix-vector products with the conjugated link:
// (A,B) = U^dagger * (a,b).
942  // multiply row 0
943  spinorFloat A0_re = 0;
944  A0_re += gT00_re * a0_re;
945  A0_re -= gT00_im * a0_im;
946  A0_re += gT01_re * a1_re;
947  A0_re -= gT01_im * a1_im;
948  A0_re += gT02_re * a2_re;
949  A0_re -= gT02_im * a2_im;
950  spinorFloat A0_im = 0;
951  A0_im += gT00_re * a0_im;
952  A0_im += gT00_im * a0_re;
953  A0_im += gT01_re * a1_im;
954  A0_im += gT01_im * a1_re;
955  A0_im += gT02_re * a2_im;
956  A0_im += gT02_im * a2_re;
957  spinorFloat B0_re = 0;
958  B0_re += gT00_re * b0_re;
959  B0_re -= gT00_im * b0_im;
960  B0_re += gT01_re * b1_re;
961  B0_re -= gT01_im * b1_im;
962  B0_re += gT02_re * b2_re;
963  B0_re -= gT02_im * b2_im;
964  spinorFloat B0_im = 0;
965  B0_im += gT00_re * b0_im;
966  B0_im += gT00_im * b0_re;
967  B0_im += gT01_re * b1_im;
968  B0_im += gT01_im * b1_re;
969  B0_im += gT02_re * b2_im;
970  B0_im += gT02_im * b2_re;
971 
972  // multiply row 1
973  spinorFloat A1_re = 0;
974  A1_re += gT10_re * a0_re;
975  A1_re -= gT10_im * a0_im;
976  A1_re += gT11_re * a1_re;
977  A1_re -= gT11_im * a1_im;
978  A1_re += gT12_re * a2_re;
979  A1_re -= gT12_im * a2_im;
980  spinorFloat A1_im = 0;
981  A1_im += gT10_re * a0_im;
982  A1_im += gT10_im * a0_re;
983  A1_im += gT11_re * a1_im;
984  A1_im += gT11_im * a1_re;
985  A1_im += gT12_re * a2_im;
986  A1_im += gT12_im * a2_re;
987  spinorFloat B1_re = 0;
988  B1_re += gT10_re * b0_re;
989  B1_re -= gT10_im * b0_im;
990  B1_re += gT11_re * b1_re;
991  B1_re -= gT11_im * b1_im;
992  B1_re += gT12_re * b2_re;
993  B1_re -= gT12_im * b2_im;
994  spinorFloat B1_im = 0;
995  B1_im += gT10_re * b0_im;
996  B1_im += gT10_im * b0_re;
997  B1_im += gT11_re * b1_im;
998  B1_im += gT11_im * b1_re;
999  B1_im += gT12_re * b2_im;
1000  B1_im += gT12_im * b2_re;
1001 
1002  // multiply row 2
1003  spinorFloat A2_re = 0;
1004  A2_re += gT20_re * a0_re;
1005  A2_re -= gT20_im * a0_im;
1006  A2_re += gT21_re * a1_re;
1007  A2_re -= gT21_im * a1_im;
1008  A2_re += gT22_re * a2_re;
1009  A2_re -= gT22_im * a2_im;
1010  spinorFloat A2_im = 0;
1011  A2_im += gT20_re * a0_im;
1012  A2_im += gT20_im * a0_re;
1013  A2_im += gT21_re * a1_im;
1014  A2_im += gT21_im * a1_re;
1015  A2_im += gT22_re * a2_im;
1016  A2_im += gT22_im * a2_re;
1017  spinorFloat B2_re = 0;
1018  B2_re += gT20_re * b0_re;
1019  B2_re -= gT20_im * b0_im;
1020  B2_re += gT21_re * b1_re;
1021  B2_re -= gT21_im * b1_im;
1022  B2_re += gT22_re * b2_re;
1023  B2_re -= gT22_im * b2_im;
1024  spinorFloat B2_im = 0;
1025  B2_im += gT20_re * b0_im;
1026  B2_im += gT20_im * b0_re;
1027  B2_im += gT21_re * b1_im;
1028  B2_im += gT21_im * b1_re;
1029  B2_im += gT22_re * b2_im;
1030  B2_im += gT22_im * b2_re;
1031 
// Expand back to four spin components (P1- structure) and accumulate.
1032  o00_re += A0_re;
1033  o00_im += A0_im;
1034  o10_re += B0_re;
1035  o10_im += B0_im;
1036  o20_re += B0_re;
1037  o20_im += B0_im;
1038  o30_re -= A0_re;
1039  o30_im -= A0_im;
1040 
1041  o01_re += A1_re;
1042  o01_im += A1_im;
1043  o11_re += B1_re;
1044  o11_im += B1_im;
1045  o21_re += B1_re;
1046  o21_im += B1_im;
1047  o31_re -= A1_re;
1048  o31_im -= A1_im;
1049 
1050  o02_re += A2_re;
1051  o02_im += A2_im;
1052  o12_re += B2_re;
1053  o12_im += B2_im;
1054  o22_re += B2_re;
1055  o22_im += B2_im;
1056  o32_re -= A2_re;
1057  o32_im -= A2_im;
1058 }
1059 
// ---- +Z neighbor contribution (dagger projector P2+) ----
1060 #ifdef MULTI_GPU
1061 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1062  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1063 #endif
1064 {
1065  // Projector P2+
1066  // 1 0 i 0
1067  // 0 1 0 -i
1068  // -i 0 1 0
1069  // 0 i 0 1
1070 
// Checkerboard index of the +Z neighbor (wraps in Z), or a ghost-zone offset.
1071 #ifdef MULTI_GPU
1072  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1073  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1074 #if (DD_PREC==2) // half precision
1075  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1076 #endif
1077 #else
1078  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1079 #endif
1080 
// Gauge field is 4-d: strip the 5th-dimension part of the site index.
1081  const int ga_idx = sid % param.dc.volume_4d_cb;
1082 
1083  // read gauge matrix from device memory
1084  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride);
1085 
// NOTE(review): gauge-reconstruction code (listing lines 1086-1091) is elided
// from this listing view.
1092 
1093 #ifdef MULTI_GPU
1094  if (kernel_type == INTERIOR_KERNEL) {
1095 #endif
1096 
1097  // read spinor from device memory
1098  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1099 
1100  // project spinor into half spinors
1101  a0_re = +i00_re-i20_im;
1102  a0_im = +i00_im+i20_re;
1103  a1_re = +i01_re-i21_im;
1104  a1_im = +i01_im+i21_re;
1105  a2_re = +i02_re-i22_im;
1106  a2_im = +i02_im+i22_re;
1107  b0_re = +i10_re+i30_im;
1108  b0_im = +i10_im-i30_re;
1109  b1_re = +i11_re+i31_im;
1110  b1_im = +i11_im-i31_re;
1111  b2_re = +i12_re+i32_im;
1112  b2_im = +i12_im-i32_re;
1113 
1114 #ifdef MULTI_GPU
1115  } else {
1116 
1117  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1118 
1119  // read half spinor from device memory
1120  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1121 
// Ghost data is already spin-projected: copy the half spinor straight through.
1122  a0_re = i00_re; a0_im = i00_im;
1123  a1_re = i01_re; a1_im = i01_im;
1124  a2_re = i02_re; a2_im = i02_im;
1125  b0_re = i10_re; b0_im = i10_im;
1126  b1_re = i11_re; b1_im = i11_im;
1127  b2_re = i12_re; b2_im = i12_im;
1128 
1129  }
1130 #endif // MULTI_GPU
1131 
1132  // reconstruct gauge matrix
1134 
// Complex 3x3 matrix-vector products: (A,B) = U * (a,b).
1135  // multiply row 0
1136  spinorFloat A0_re = 0;
1137  A0_re += g00_re * a0_re;
1138  A0_re -= g00_im * a0_im;
1139  A0_re += g01_re * a1_re;
1140  A0_re -= g01_im * a1_im;
1141  A0_re += g02_re * a2_re;
1142  A0_re -= g02_im * a2_im;
1143  spinorFloat A0_im = 0;
1144  A0_im += g00_re * a0_im;
1145  A0_im += g00_im * a0_re;
1146  A0_im += g01_re * a1_im;
1147  A0_im += g01_im * a1_re;
1148  A0_im += g02_re * a2_im;
1149  A0_im += g02_im * a2_re;
1150  spinorFloat B0_re = 0;
1151  B0_re += g00_re * b0_re;
1152  B0_re -= g00_im * b0_im;
1153  B0_re += g01_re * b1_re;
1154  B0_re -= g01_im * b1_im;
1155  B0_re += g02_re * b2_re;
1156  B0_re -= g02_im * b2_im;
1157  spinorFloat B0_im = 0;
1158  B0_im += g00_re * b0_im;
1159  B0_im += g00_im * b0_re;
1160  B0_im += g01_re * b1_im;
1161  B0_im += g01_im * b1_re;
1162  B0_im += g02_re * b2_im;
1163  B0_im += g02_im * b2_re;
1164 
1165  // multiply row 1
1166  spinorFloat A1_re = 0;
1167  A1_re += g10_re * a0_re;
1168  A1_re -= g10_im * a0_im;
1169  A1_re += g11_re * a1_re;
1170  A1_re -= g11_im * a1_im;
1171  A1_re += g12_re * a2_re;
1172  A1_re -= g12_im * a2_im;
1173  spinorFloat A1_im = 0;
1174  A1_im += g10_re * a0_im;
1175  A1_im += g10_im * a0_re;
1176  A1_im += g11_re * a1_im;
1177  A1_im += g11_im * a1_re;
1178  A1_im += g12_re * a2_im;
1179  A1_im += g12_im * a2_re;
1180  spinorFloat B1_re = 0;
1181  B1_re += g10_re * b0_re;
1182  B1_re -= g10_im * b0_im;
1183  B1_re += g11_re * b1_re;
1184  B1_re -= g11_im * b1_im;
1185  B1_re += g12_re * b2_re;
1186  B1_re -= g12_im * b2_im;
1187  spinorFloat B1_im = 0;
1188  B1_im += g10_re * b0_im;
1189  B1_im += g10_im * b0_re;
1190  B1_im += g11_re * b1_im;
1191  B1_im += g11_im * b1_re;
1192  B1_im += g12_re * b2_im;
1193  B1_im += g12_im * b2_re;
1194 
1195  // multiply row 2
1196  spinorFloat A2_re = 0;
1197  A2_re += g20_re * a0_re;
1198  A2_re -= g20_im * a0_im;
1199  A2_re += g21_re * a1_re;
1200  A2_re -= g21_im * a1_im;
1201  A2_re += g22_re * a2_re;
1202  A2_re -= g22_im * a2_im;
1203  spinorFloat A2_im = 0;
1204  A2_im += g20_re * a0_im;
1205  A2_im += g20_im * a0_re;
1206  A2_im += g21_re * a1_im;
1207  A2_im += g21_im * a1_re;
1208  A2_im += g22_re * a2_im;
1209  A2_im += g22_im * a2_re;
1210  spinorFloat B2_re = 0;
1211  B2_re += g20_re * b0_re;
1212  B2_re -= g20_im * b0_im;
1213  B2_re += g21_re * b1_re;
1214  B2_re -= g21_im * b1_im;
1215  B2_re += g22_re * b2_re;
1216  B2_re -= g22_im * b2_im;
1217  spinorFloat B2_im = 0;
1218  B2_im += g20_re * b0_im;
1219  B2_im += g20_im * b0_re;
1220  B2_im += g21_re * b1_im;
1221  B2_im += g21_im * b1_re;
1222  B2_im += g22_re * b2_im;
1223  B2_im += g22_im * b2_re;
1224 
// Expand back to four spin components (P2+ structure) and accumulate.
1225  o00_re += A0_re;
1226  o00_im += A0_im;
1227  o10_re += B0_re;
1228  o10_im += B0_im;
1229  o20_re += A0_im;
1230  o20_im -= A0_re;
1231  o30_re -= B0_im;
1232  o30_im += B0_re;
1233 
1234  o01_re += A1_re;
1235  o01_im += A1_im;
1236  o11_re += B1_re;
1237  o11_im += B1_im;
1238  o21_re += A1_im;
1239  o21_im -= A1_re;
1240  o31_re -= B1_im;
1241  o31_im += B1_re;
1242 
1243  o02_re += A2_re;
1244  o02_im += A2_im;
1245  o12_re += B2_re;
1246  o12_im += B2_im;
1247  o22_re += A2_im;
1248  o22_im -= A2_re;
1249  o32_re -= B2_im;
1250  o32_im += B2_re;
1251 }
1252 
// Z-direction backward-neighbor hop: load the neighbor spinor (or the
// pre-projected ghost half spinor on multi-GPU runs), project with P2-
// (matrix shown below), multiply by the conjugate-transposed Z link (gT??),
// and accumulate into the output spinor o??_??.
1253 #ifdef MULTI_GPU
1254 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1255  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1256 #endif
1257 {
1258  // Projector P2-
1259  // 1 0 -i 0
1260  // 0 1 0 i
1261  // i 0 1 0
1262  // 0 -i 0 1
1263 
// sp_idx: checkerboarded index of the z-1 neighbor, wrapping through
// X3X2X1mX2X1 at the boundary; exterior kernels index into the ghost zone.
1264 #ifdef MULTI_GPU
1265  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1266  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1267 #if (DD_PREC==2) // half precision
1268  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1269 #endif
1270 #else
1271  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1272 #endif
1273 
// ga_idx: 4d-checkerboard gauge index (the 5th dimension shares links).
1274 #ifdef MULTI_GPU
1275  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
1276 #else
1277  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
1278 #endif
1279 
1280  // read gauge matrix from device memory
1281  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride);
// NOTE(review): original lines 1282-1288 were dropped by the doc extraction
// (presumably the half-spinor declarations a0_re ... b2_im, which were
// hyperlinks on the doc page) — restore from the generated source.
1289 
1290 #ifdef MULTI_GPU
1291  if (kernel_type == INTERIOR_KERNEL) {
1292 #endif
1293 
1294  // read spinor from device memory
1295  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1296 
1297  // project spinor into half spinors
1298  a0_re = +i00_re+i20_im;
1299  a0_im = +i00_im-i20_re;
1300  a1_re = +i01_re+i21_im;
1301  a1_im = +i01_im-i21_re;
1302  a2_re = +i02_re+i22_im;
1303  a2_im = +i02_im-i22_re;
1304  b0_re = +i10_re-i30_im;
1305  b0_im = +i10_im+i30_re;
1306  b1_re = +i11_re-i31_im;
1307  b1_im = +i11_im+i31_re;
1308  b2_re = +i12_re-i32_im;
1309  b2_im = +i12_im+i32_re;
1310 
1311 #ifdef MULTI_GPU
1312  } else {
1313 
1314  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1315 
1316  // read half spinor from device memory
1317  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1318 
// ghost data is already projected: only the six upper complex components
// are read, so copy straight into the half spinor.
1319  a0_re = i00_re; a0_im = i00_im;
1320  a1_re = i01_re; a1_im = i01_im;
1321  a2_re = i02_re; a2_im = i02_im;
1322  b0_re = i10_re; b0_im = i10_im;
1323  b1_re = i11_re; b1_im = i11_im;
1324  b2_re = i12_re; b2_im = i12_im;
1325 
1326  }
1327 #endif // MULTI_GPU
1328 
1329  // reconstruct gauge matrix
// NOTE(review): original line 1330 is missing here (it was a hyperlink on
// the doc page) — presumably RECONSTRUCT_GAUGE_MATRIX(5); — confirm against
// the generated source.
1331 
1332  // multiply row 0
1333  spinorFloat A0_re = 0;
1334  A0_re += gT00_re * a0_re;
1335  A0_re -= gT00_im * a0_im;
1336  A0_re += gT01_re * a1_re;
1337  A0_re -= gT01_im * a1_im;
1338  A0_re += gT02_re * a2_re;
1339  A0_re -= gT02_im * a2_im;
1340  spinorFloat A0_im = 0;
1341  A0_im += gT00_re * a0_im;
1342  A0_im += gT00_im * a0_re;
1343  A0_im += gT01_re * a1_im;
1344  A0_im += gT01_im * a1_re;
1345  A0_im += gT02_re * a2_im;
1346  A0_im += gT02_im * a2_re;
1347  spinorFloat B0_re = 0;
1348  B0_re += gT00_re * b0_re;
1349  B0_re -= gT00_im * b0_im;
1350  B0_re += gT01_re * b1_re;
1351  B0_re -= gT01_im * b1_im;
1352  B0_re += gT02_re * b2_re;
1353  B0_re -= gT02_im * b2_im;
1354  spinorFloat B0_im = 0;
1355  B0_im += gT00_re * b0_im;
1356  B0_im += gT00_im * b0_re;
1357  B0_im += gT01_re * b1_im;
1358  B0_im += gT01_im * b1_re;
1359  B0_im += gT02_re * b2_im;
1360  B0_im += gT02_im * b2_re;
1361 
1362  // multiply row 1
1363  spinorFloat A1_re = 0;
1364  A1_re += gT10_re * a0_re;
1365  A1_re -= gT10_im * a0_im;
1366  A1_re += gT11_re * a1_re;
1367  A1_re -= gT11_im * a1_im;
1368  A1_re += gT12_re * a2_re;
1369  A1_re -= gT12_im * a2_im;
1370  spinorFloat A1_im = 0;
1371  A1_im += gT10_re * a0_im;
1372  A1_im += gT10_im * a0_re;
1373  A1_im += gT11_re * a1_im;
1374  A1_im += gT11_im * a1_re;
1375  A1_im += gT12_re * a2_im;
1376  A1_im += gT12_im * a2_re;
1377  spinorFloat B1_re = 0;
1378  B1_re += gT10_re * b0_re;
1379  B1_re -= gT10_im * b0_im;
1380  B1_re += gT11_re * b1_re;
1381  B1_re -= gT11_im * b1_im;
1382  B1_re += gT12_re * b2_re;
1383  B1_re -= gT12_im * b2_im;
1384  spinorFloat B1_im = 0;
1385  B1_im += gT10_re * b0_im;
1386  B1_im += gT10_im * b0_re;
1387  B1_im += gT11_re * b1_im;
1388  B1_im += gT11_im * b1_re;
1389  B1_im += gT12_re * b2_im;
1390  B1_im += gT12_im * b2_re;
1391 
1392  // multiply row 2
1393  spinorFloat A2_re = 0;
1394  A2_re += gT20_re * a0_re;
1395  A2_re -= gT20_im * a0_im;
1396  A2_re += gT21_re * a1_re;
1397  A2_re -= gT21_im * a1_im;
1398  A2_re += gT22_re * a2_re;
1399  A2_re -= gT22_im * a2_im;
1400  spinorFloat A2_im = 0;
1401  A2_im += gT20_re * a0_im;
1402  A2_im += gT20_im * a0_re;
1403  A2_im += gT21_re * a1_im;
1404  A2_im += gT21_im * a1_re;
1405  A2_im += gT22_re * a2_im;
1406  A2_im += gT22_im * a2_re;
1407  spinorFloat B2_re = 0;
1408  B2_re += gT20_re * b0_re;
1409  B2_re -= gT20_im * b0_im;
1410  B2_re += gT21_re * b1_re;
1411  B2_re -= gT21_im * b1_im;
1412  B2_re += gT22_re * b2_re;
1413  B2_re -= gT22_im * b2_im;
1414  spinorFloat B2_im = 0;
1415  B2_im += gT20_re * b0_im;
1416  B2_im += gT20_im * b0_re;
1417  B2_im += gT21_re * b1_im;
1418  B2_im += gT21_im * b1_re;
1419  B2_im += gT22_re * b2_im;
1420  B2_im += gT22_im * b2_re;
1421 
// accumulate: the upper spin components take A/B directly; the lower two
// are reconstructed via the +/-i entries of the projector's lower rows
// (hence the re/im swaps and sign flips below).
1422  o00_re += A0_re;
1423  o00_im += A0_im;
1424  o10_re += B0_re;
1425  o10_im += B0_im;
1426  o20_re -= A0_im;
1427  o20_im += A0_re;
1428  o30_re += B0_im;
1429  o30_im -= B0_re;
1430 
1431  o01_re += A1_re;
1432  o01_im += A1_im;
1433  o11_re += B1_re;
1434  o11_im += B1_im;
1435  o21_re -= A1_im;
1436  o21_im += A1_re;
1437  o31_re += B1_im;
1438  o31_im -= B1_re;
1439 
1440  o02_re += A2_re;
1441  o02_im += A2_im;
1442  o12_re += B2_re;
1443  o12_im += B2_im;
1444  o22_re -= A2_im;
1445  o22_im += A2_re;
1446  o32_re += B2_im;
1447  o32_im -= B2_re;
1448 }
1449 
// T-direction forward-neighbor hop: project with P3+ (only the upper two
// spin components survive, scaled by 2), multiply by the forward T link
// (g??) — or skip the multiply entirely in the temporal-gauge-fixed bulk,
// where the link is the identity — and accumulate into o??_??.
1450 #ifdef MULTI_GPU
1451 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1452  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1453 #endif
1454 {
1455  // Projector P3+
1456  // 2 0 0 0
1457  // 0 2 0 0
1458  // 0 0 0 0
1459  // 0 0 0 0
1460 
// sp_idx: checkerboarded index of the t+1 neighbor, wrapping through
// X4X3X2X1mX3X2X1 at the boundary; exterior kernel reads the ghost zone.
1461 #ifdef MULTI_GPU
1462  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1463  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1464 #if (DD_PREC==2) // half precision
1465  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1466 #endif
1467 #else
1468  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1469 #endif
1470 
1471  const int ga_idx = sid % param.dc.volume_4d_cb;
1472 
// Temporal-gauge-fixed bulk: links are the identity, so no gauge load or
// matrix multiply is needed for sites below the top timeslice.
1473  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1474  {
// NOTE(review): original lines 1475-1480 were dropped by the doc extraction
// (presumably the half-spinor declarations a0_re ... b2_im) — restore from
// the generated source.
1481 
1482 #ifdef MULTI_GPU
1483  if (kernel_type == INTERIOR_KERNEL) {
1484 #endif
1485 
1486  // read spinor from device memory
1487  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1488 
1489  // project spinor into half spinors
1490  a0_re = +2*i00_re;
1491  a0_im = +2*i00_im;
1492  a1_re = +2*i01_re;
1493  a1_im = +2*i01_im;
1494  a2_re = +2*i02_re;
1495  a2_im = +2*i02_im;
1496  b0_re = +2*i10_re;
1497  b0_im = +2*i10_im;
1498  b1_re = +2*i11_re;
1499  b1_im = +2*i11_im;
1500  b2_re = +2*i12_re;
1501  b2_im = +2*i12_im;
1502 
1503 #ifdef MULTI_GPU
1504  } else {
1505 
1506  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1507  const int t_proj_scale = TPROJSCALE;
1508 
1509  // read half spinor from device memory
1510  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1511 
1512  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1513  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1514  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1515  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1516  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1517  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1518 
1519  }
1520 #endif // MULTI_GPU
1521 
1522  // identity gauge matrix
// NOTE(review): original lines 1523-1528 were dropped by the doc extraction
// (presumably spinorFloat A?_?? = a?_??; spinorFloat B?_?? = b?_??; — the
// identity-link pass-through) — restore from the generated source.
1529 
1530  o00_re += A0_re;
1531  o00_im += A0_im;
1532  o10_re += B0_re;
1533  o10_im += B0_im;
1534 
1535  o01_re += A1_re;
1536  o01_im += A1_im;
1537  o11_re += B1_re;
1538  o11_im += B1_im;
1539 
1540  o02_re += A2_re;
1541  o02_im += A2_im;
1542  o12_re += B2_re;
1543  o12_im += B2_im;
1544  } else {
1545  // read gauge matrix from device memory
1546  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride);
// NOTE(review): original lines 1547-1553 were dropped by the doc extraction
// (presumably the half-spinor declarations a0_re ... b2_im) — restore from
// the generated source.
1554 
1555 #ifdef MULTI_GPU
1556  if (kernel_type == INTERIOR_KERNEL) {
1557 #endif
1558 
1559  // read spinor from device memory
1560  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1561 
1562  // project spinor into half spinors
1563  a0_re = +2*i00_re;
1564  a0_im = +2*i00_im;
1565  a1_re = +2*i01_re;
1566  a1_im = +2*i01_im;
1567  a2_re = +2*i02_re;
1568  a2_im = +2*i02_im;
1569  b0_re = +2*i10_re;
1570  b0_im = +2*i10_im;
1571  b1_re = +2*i11_re;
1572  b1_im = +2*i11_im;
1573  b2_re = +2*i12_re;
1574  b2_im = +2*i12_im;
1575 
1576 #ifdef MULTI_GPU
1577  } else {
1578 
1579  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1580  const int t_proj_scale = TPROJSCALE;
1581 
1582  // read half spinor from device memory
1583  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1584 
1585  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1586  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1587  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1588  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1589  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1590  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1591 
1592  }
1593 #endif // MULTI_GPU
1594 
1595  // reconstruct gauge matrix
// NOTE(review): original line 1596 is missing here — presumably
// RECONSTRUCT_GAUGE_MATRIX(6); — confirm against the generated source.
1597 
1598  // multiply row 0
1599  spinorFloat A0_re = 0;
1600  A0_re += g00_re * a0_re;
1601  A0_re -= g00_im * a0_im;
1602  A0_re += g01_re * a1_re;
1603  A0_re -= g01_im * a1_im;
1604  A0_re += g02_re * a2_re;
1605  A0_re -= g02_im * a2_im;
1606  spinorFloat A0_im = 0;
1607  A0_im += g00_re * a0_im;
1608  A0_im += g00_im * a0_re;
1609  A0_im += g01_re * a1_im;
1610  A0_im += g01_im * a1_re;
1611  A0_im += g02_re * a2_im;
1612  A0_im += g02_im * a2_re;
1613  spinorFloat B0_re = 0;
1614  B0_re += g00_re * b0_re;
1615  B0_re -= g00_im * b0_im;
1616  B0_re += g01_re * b1_re;
1617  B0_re -= g01_im * b1_im;
1618  B0_re += g02_re * b2_re;
1619  B0_re -= g02_im * b2_im;
1620  spinorFloat B0_im = 0;
1621  B0_im += g00_re * b0_im;
1622  B0_im += g00_im * b0_re;
1623  B0_im += g01_re * b1_im;
1624  B0_im += g01_im * b1_re;
1625  B0_im += g02_re * b2_im;
1626  B0_im += g02_im * b2_re;
1627 
1628  // multiply row 1
1629  spinorFloat A1_re = 0;
1630  A1_re += g10_re * a0_re;
1631  A1_re -= g10_im * a0_im;
1632  A1_re += g11_re * a1_re;
1633  A1_re -= g11_im * a1_im;
1634  A1_re += g12_re * a2_re;
1635  A1_re -= g12_im * a2_im;
1636  spinorFloat A1_im = 0;
1637  A1_im += g10_re * a0_im;
1638  A1_im += g10_im * a0_re;
1639  A1_im += g11_re * a1_im;
1640  A1_im += g11_im * a1_re;
1641  A1_im += g12_re * a2_im;
1642  A1_im += g12_im * a2_re;
1643  spinorFloat B1_re = 0;
1644  B1_re += g10_re * b0_re;
1645  B1_re -= g10_im * b0_im;
1646  B1_re += g11_re * b1_re;
1647  B1_re -= g11_im * b1_im;
1648  B1_re += g12_re * b2_re;
1649  B1_re -= g12_im * b2_im;
1650  spinorFloat B1_im = 0;
1651  B1_im += g10_re * b0_im;
1652  B1_im += g10_im * b0_re;
1653  B1_im += g11_re * b1_im;
1654  B1_im += g11_im * b1_re;
1655  B1_im += g12_re * b2_im;
1656  B1_im += g12_im * b2_re;
1657 
1658  // multiply row 2
1659  spinorFloat A2_re = 0;
1660  A2_re += g20_re * a0_re;
1661  A2_re -= g20_im * a0_im;
1662  A2_re += g21_re * a1_re;
1663  A2_re -= g21_im * a1_im;
1664  A2_re += g22_re * a2_re;
1665  A2_re -= g22_im * a2_im;
1666  spinorFloat A2_im = 0;
1667  A2_im += g20_re * a0_im;
1668  A2_im += g20_im * a0_re;
1669  A2_im += g21_re * a1_im;
1670  A2_im += g21_im * a1_re;
1671  A2_im += g22_re * a2_im;
1672  A2_im += g22_im * a2_re;
1673  spinorFloat B2_re = 0;
1674  B2_re += g20_re * b0_re;
1675  B2_re -= g20_im * b0_im;
1676  B2_re += g21_re * b1_re;
1677  B2_re -= g21_im * b1_im;
1678  B2_re += g22_re * b2_re;
1679  B2_re -= g22_im * b2_im;
1680  spinorFloat B2_im = 0;
1681  B2_im += g20_re * b0_im;
1682  B2_im += g20_im * b0_re;
1683  B2_im += g21_re * b1_im;
1684  B2_im += g21_im * b1_re;
1685  B2_im += g22_re * b2_im;
1686  B2_im += g22_im * b2_re;
1687 
// P3+ kills the lower spin components, so only the upper half is updated.
1688  o00_re += A0_re;
1689  o00_im += A0_im;
1690  o10_re += B0_re;
1691  o10_im += B0_im;
1692 
1693  o01_re += A1_re;
1694  o01_im += A1_im;
1695  o11_re += B1_re;
1696  o11_im += B1_im;
1697 
1698  o02_re += A2_re;
1699  o02_im += A2_im;
1700  o12_re += B2_re;
1701  o12_im += B2_im;
1702  }
1703 }
1704 
// T-direction backward-neighbor hop: project with P3- (only the lower two
// spin components survive, scaled by 2), multiply by the conjugate-transposed
// T link (gT??) — or pass through unchanged in the temporal-gauge-fixed
// bulk, where the link is the identity — and accumulate into o??_??.
1705 #ifdef MULTI_GPU
1706 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1707  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1708 #endif
1709 {
1710  // Projector P3-
1711  // 0 0 0 0
1712  // 0 0 0 0
1713  // 0 0 2 0
1714  // 0 0 0 2
1715 
// sp_idx: checkerboarded index of the t-1 neighbor, wrapping through
// X4X3X2X1mX3X2X1 at the boundary; exterior kernel reads the ghost zone.
1716 #ifdef MULTI_GPU
1717  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1718  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1719 #if (DD_PREC==2) // half precision
1720  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1721 #endif
1722 #else
1723  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1724 #endif
1725 
1726 #ifdef MULTI_GPU
1727  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
1728 #else
1729  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
1730 #endif
1731 
// Temporal-gauge-fixed bulk: backward link is the identity, skip the load.
1732  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1733  {
// NOTE(review): original lines 1734-1739 were dropped by the doc extraction
// (presumably the half-spinor declarations a0_re ... b2_im) — restore from
// the generated source.
1740 
1741 #ifdef MULTI_GPU
1742  if (kernel_type == INTERIOR_KERNEL) {
1743 #endif
1744 
1745  // read spinor from device memory
// NOTE(review): original line 1746 is missing here — presumably
// READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx); (that call
// appears in the file's symbol index) — confirm against the generated source.
1747 
1748  // project spinor into half spinors
1749  a0_re = +2*i20_re;
1750  a0_im = +2*i20_im;
1751  a1_re = +2*i21_re;
1752  a1_im = +2*i21_im;
1753  a2_re = +2*i22_re;
1754  a2_im = +2*i22_im;
1755  b0_re = +2*i30_re;
1756  b0_im = +2*i30_im;
1757  b1_re = +2*i31_re;
1758  b1_im = +2*i31_im;
1759  b2_re = +2*i32_re;
1760  b2_im = +2*i32_im;
1761 
1762 #ifdef MULTI_GPU
1763  } else {
1764 
1765  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1766  const int t_proj_scale = TPROJSCALE;
1767 
1768  // read half spinor from device memory
1769  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1770 
1771  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1772  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1773  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1774  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1775  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1776  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1777 
1778  }
1779 #endif // MULTI_GPU
1780 
1781  // identity gauge matrix
// NOTE(review): original lines 1782-1787 were dropped by the doc extraction
// (presumably spinorFloat A?_?? = a?_??; spinorFloat B?_?? = b?_??; — the
// identity-link pass-through) — restore from the generated source.
1788 
1789  o20_re += A0_re;
1790  o20_im += A0_im;
1791  o30_re += B0_re;
1792  o30_im += B0_im;
1793 
1794  o21_re += A1_re;
1795  o21_im += A1_im;
1796  o31_re += B1_re;
1797  o31_im += B1_im;
1798 
1799  o22_re += A2_re;
1800  o22_im += A2_im;
1801  o32_re += B2_re;
1802  o32_im += B2_im;
1803  } else {
1804  // read gauge matrix from device memory
1805  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride);
1806 
// NOTE(review): original lines 1807-1812 were dropped by the doc extraction
// (presumably the half-spinor declarations a0_re ... b2_im) — restore from
// the generated source.
1813 
1814 #ifdef MULTI_GPU
1815  if (kernel_type == INTERIOR_KERNEL) {
1816 #endif
1817 
1818  // read spinor from device memory
// NOTE(review): original line 1819 is missing here — presumably
// READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx); — confirm
// against the generated source.
1820 
1821  // project spinor into half spinors
1822  a0_re = +2*i20_re;
1823  a0_im = +2*i20_im;
1824  a1_re = +2*i21_re;
1825  a1_im = +2*i21_im;
1826  a2_re = +2*i22_re;
1827  a2_im = +2*i22_im;
1828  b0_re = +2*i30_re;
1829  b0_im = +2*i30_im;
1830  b1_re = +2*i31_re;
1831  b1_im = +2*i31_im;
1832  b2_re = +2*i32_re;
1833  b2_im = +2*i32_im;
1834 
1835 #ifdef MULTI_GPU
1836  } else {
1837 
1838  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1839  const int t_proj_scale = TPROJSCALE;
1840 
1841  // read half spinor from device memory
1842  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1843 
1844  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1845  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1846  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1847  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1848  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1849  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1850 
1851  }
1852 #endif // MULTI_GPU
1853 
1854  // reconstruct gauge matrix
// NOTE(review): original line 1855 is missing here — presumably
// RECONSTRUCT_GAUGE_MATRIX(7); — confirm against the generated source.
1856 
1857  // multiply row 0
1858  spinorFloat A0_re = 0;
1859  A0_re += gT00_re * a0_re;
1860  A0_re -= gT00_im * a0_im;
1861  A0_re += gT01_re * a1_re;
1862  A0_re -= gT01_im * a1_im;
1863  A0_re += gT02_re * a2_re;
1864  A0_re -= gT02_im * a2_im;
1865  spinorFloat A0_im = 0;
1866  A0_im += gT00_re * a0_im;
1867  A0_im += gT00_im * a0_re;
1868  A0_im += gT01_re * a1_im;
1869  A0_im += gT01_im * a1_re;
1870  A0_im += gT02_re * a2_im;
1871  A0_im += gT02_im * a2_re;
1872  spinorFloat B0_re = 0;
1873  B0_re += gT00_re * b0_re;
1874  B0_re -= gT00_im * b0_im;
1875  B0_re += gT01_re * b1_re;
1876  B0_re -= gT01_im * b1_im;
1877  B0_re += gT02_re * b2_re;
1878  B0_re -= gT02_im * b2_im;
1879  spinorFloat B0_im = 0;
1880  B0_im += gT00_re * b0_im;
1881  B0_im += gT00_im * b0_re;
1882  B0_im += gT01_re * b1_im;
1883  B0_im += gT01_im * b1_re;
1884  B0_im += gT02_re * b2_im;
1885  B0_im += gT02_im * b2_re;
1886 
1887  // multiply row 1
1888  spinorFloat A1_re = 0;
1889  A1_re += gT10_re * a0_re;
1890  A1_re -= gT10_im * a0_im;
1891  A1_re += gT11_re * a1_re;
1892  A1_re -= gT11_im * a1_im;
1893  A1_re += gT12_re * a2_re;
1894  A1_re -= gT12_im * a2_im;
1895  spinorFloat A1_im = 0;
1896  A1_im += gT10_re * a0_im;
1897  A1_im += gT10_im * a0_re;
1898  A1_im += gT11_re * a1_im;
1899  A1_im += gT11_im * a1_re;
1900  A1_im += gT12_re * a2_im;
1901  A1_im += gT12_im * a2_re;
1902  spinorFloat B1_re = 0;
1903  B1_re += gT10_re * b0_re;
1904  B1_re -= gT10_im * b0_im;
1905  B1_re += gT11_re * b1_re;
1906  B1_re -= gT11_im * b1_im;
1907  B1_re += gT12_re * b2_re;
1908  B1_re -= gT12_im * b2_im;
1909  spinorFloat B1_im = 0;
1910  B1_im += gT10_re * b0_im;
1911  B1_im += gT10_im * b0_re;
1912  B1_im += gT11_re * b1_im;
1913  B1_im += gT11_im * b1_re;
1914  B1_im += gT12_re * b2_im;
1915  B1_im += gT12_im * b2_re;
1916 
1917  // multiply row 2
1918  spinorFloat A2_re = 0;
1919  A2_re += gT20_re * a0_re;
1920  A2_re -= gT20_im * a0_im;
1921  A2_re += gT21_re * a1_re;
1922  A2_re -= gT21_im * a1_im;
1923  A2_re += gT22_re * a2_re;
1924  A2_re -= gT22_im * a2_im;
1925  spinorFloat A2_im = 0;
1926  A2_im += gT20_re * a0_im;
1927  A2_im += gT20_im * a0_re;
1928  A2_im += gT21_re * a1_im;
1929  A2_im += gT21_im * a1_re;
1930  A2_im += gT22_re * a2_im;
1931  A2_im += gT22_im * a2_re;
1932  spinorFloat B2_re = 0;
1933  B2_re += gT20_re * b0_re;
1934  B2_re -= gT20_im * b0_im;
1935  B2_re += gT21_re * b1_re;
1936  B2_re -= gT21_im * b1_im;
1937  B2_re += gT22_re * b2_re;
1938  B2_re -= gT22_im * b2_im;
1939  spinorFloat B2_im = 0;
1940  B2_im += gT20_re * b0_im;
1941  B2_im += gT20_im * b0_re;
1942  B2_im += gT21_re * b1_im;
1943  B2_im += gT21_im * b1_re;
1944  B2_im += gT22_re * b2_im;
1945  B2_im += gT22_im * b2_re;
1946 
// P3- kills the upper spin components, so only the lower half is updated.
1947  o20_re += A0_re;
1948  o20_im += A0_im;
1949  o30_re += B0_re;
1950  o30_im += B0_im;
1951 
1952  o21_re += A1_re;
1953  o21_im += A1_im;
1954  o31_re += B1_re;
1955  o31_im += B1_im;
1956 
1957  o22_re += A2_re;
1958  o22_im += A2_im;
1959  o32_re += B2_re;
1960  o32_im += B2_im;
1961  }
1962 }
1963 
// Final stage: on multi-GPU XPAY runs, only the kernel invocation that adds
// the last boundary contribution for this site may apply the axpy and write
// out — the fall-through switch below flags the site as incomplete if a
// later exterior kernel still has a contribution pending.
1964 #if defined MULTI_GPU && defined DSLASH_XPAY
1965 
1966 int incomplete = 0; // Have all 8 contributions been computed for this site?
1967 
1968 switch(kernel_type) { // intentional fall-through
1969 case INTERIOR_KERNEL:
1970  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
1971 case EXTERIOR_KERNEL_T:
1972  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
1973 case EXTERIOR_KERNEL_Z:
1974  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
1975 case EXTERIOR_KERNEL_Y:
1976  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
1977 }
1978 
1979 if (!incomplete)
1980 #endif // MULTI_GPU
1981 {
1982 
1983 #ifdef DSLASH_XPAY
// out = coeff*out + accum: read the accumulator spinor and axpy it in.
1984  READ_ACCUM(ACCUMTEX, param.sp_stride)
// NOTE(review): original line 1985 is missing here (doc extraction) —
// presumably the declaration of coeff, which is assigned just below —
// confirm against the generated source.
1986 
// Mobius DWF rescales the axpy coefficient by the b5/c5 parameters of the
// current 5th-dimension slice coord[4]; plain DWF uses a directly.
1987 #ifdef MDWF_mode
1988  coeff = static_cast<spinorFloat>(0.5)*a/(mdwf_b5[coord[4]]*(m5+static_cast<spinorFloat>(4.0)) + static_cast<spinorFloat>(1.0));
1989 #else
1990  coeff = a;
1991 #endif
1992 
1993 #ifdef SPINOR_DOUBLE
1994  o00_re = coeff*o00_re + accum0.x;
1995  o00_im = coeff*o00_im + accum0.y;
1996  o01_re = coeff*o01_re + accum1.x;
1997  o01_im = coeff*o01_im + accum1.y;
1998  o02_re = coeff*o02_re + accum2.x;
1999  o02_im = coeff*o02_im + accum2.y;
2000  o10_re = coeff*o10_re + accum3.x;
2001  o10_im = coeff*o10_im + accum3.y;
2002  o11_re = coeff*o11_re + accum4.x;
2003  o11_im = coeff*o11_im + accum4.y;
2004  o12_re = coeff*o12_re + accum5.x;
2005  o12_im = coeff*o12_im + accum5.y;
2006  o20_re = coeff*o20_re + accum6.x;
2007  o20_im = coeff*o20_im + accum6.y;
2008  o21_re = coeff*o21_re + accum7.x;
2009  o21_im = coeff*o21_im + accum7.y;
2010  o22_re = coeff*o22_re + accum8.x;
2011  o22_im = coeff*o22_im + accum8.y;
2012  o30_re = coeff*o30_re + accum9.x;
2013  o30_im = coeff*o30_im + accum9.y;
2014  o31_re = coeff*o31_re + accum10.x;
2015  o31_im = coeff*o31_im + accum10.y;
2016  o32_re = coeff*o32_re + accum11.x;
2017  o32_im = coeff*o32_im + accum11.y;
2018 #else
2019  o00_re = coeff*o00_re + accum0.x;
2020  o00_im = coeff*o00_im + accum0.y;
2021  o01_re = coeff*o01_re + accum0.z;
2022  o01_im = coeff*o01_im + accum0.w;
2023  o02_re = coeff*o02_re + accum1.x;
2024  o02_im = coeff*o02_im + accum1.y;
2025  o10_re = coeff*o10_re + accum1.z;
2026  o10_im = coeff*o10_im + accum1.w;
2027  o11_re = coeff*o11_re + accum2.x;
2028  o11_im = coeff*o11_im + accum2.y;
2029  o12_re = coeff*o12_re + accum2.z;
2030  o12_im = coeff*o12_im + accum2.w;
2031  o20_re = coeff*o20_re + accum3.x;
2032  o20_im = coeff*o20_im + accum3.y;
2033  o21_re = coeff*o21_re + accum3.z;
2034  o21_im = coeff*o21_im + accum3.w;
2035  o22_re = coeff*o22_re + accum4.x;
2036  o22_im = coeff*o22_im + accum4.y;
2037  o30_re = coeff*o30_re + accum4.z;
2038  o30_im = coeff*o30_im + accum4.w;
2039  o31_re = coeff*o31_re + accum5.x;
2040  o31_im = coeff*o31_im + accum5.y;
2041  o32_re = coeff*o32_re + accum5.z;
2042  o32_im = coeff*o32_im + accum5.w;
2043 #endif // SPINOR_DOUBLE
2044 #endif // DSLASH_XPAY
2045 }
2046 
2047 // write spinor field back to device memory
2048 WRITE_SPINOR(param.sp_stride);
2049 
// Clean up every helper macro defined at the top of this core file so the
// file can be re-included with a different precision (SPINOR_DOUBLE on/off)
// without redefinition warnings.
2050 // undefine to prevent warning when precision is changed
2051 #undef m5
2052 #undef mdwf_b5
2053 #undef mdwf_c5
2054 #undef mferm
2055 #undef a
2056 #undef b
2057 #undef spinorFloat
2058 #undef POW
2059 #undef SHARED_STRIDE
2060 
// gauge-matrix component accessors
2061 #undef g00_re
2062 #undef g00_im
2063 #undef g01_re
2064 #undef g01_im
2065 #undef g02_re
2066 #undef g02_im
2067 #undef g10_re
2068 #undef g10_im
2069 #undef g11_re
2070 #undef g11_im
2071 #undef g12_re
2072 #undef g12_im
2073 #undef g20_re
2074 #undef g20_im
2075 #undef g21_re
2076 #undef g21_im
2077 #undef g22_re
2078 #undef g22_im
2079 
// input-spinor component accessors
2080 #undef i00_re
2081 #undef i00_im
2082 #undef i01_re
2083 #undef i01_im
2084 #undef i02_re
2085 #undef i02_im
2086 #undef i10_re
2087 #undef i10_im
2088 #undef i11_re
2089 #undef i11_im
2090 #undef i12_re
2091 #undef i12_im
2092 #undef i20_re
2093 #undef i20_im
2094 #undef i21_re
2095 #undef i21_im
2096 #undef i22_re
2097 #undef i22_im
2098 #undef i30_re
2099 #undef i30_im
2100 #undef i31_re
2101 #undef i31_im
2102 #undef i32_re
2103 #undef i32_im
2104 
2105 
2106 
2107 #undef VOLATILE
spinorFloat A2_re
spinorFloat b1_im
#define g21_re
VOLATILE spinorFloat o20_re
#define i20_re
dim3 dim3 blockDim
#define gT10_re
VOLATILE spinorFloat o11_re
#define i11_im
VOLATILE spinorFloat o11_im
#define i22_im
#define i30_im
VOLATILE spinorFloat o10_re
#define gT21_re
#define i10_re
#define gT11_re
#define WRITE_SPINOR
#define i20_im
#define i12_re
spinorFloat b2_re
#define i31_re
spinorFloat B1_re
int sp_idx
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o00_re
float4 G2
VOLATILE spinorFloat o32_im
int face_idx
#define gT22_re
const int ga_idx
VOLATILE spinorFloat o12_im
float4 G3
#define g10_re
#define i31_im
#define i01_re
#define gT00_re
ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride)
#define gT22_im
#define g20_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define g01_re
#define GAUGE0TEX
VOLATILE spinorFloat o31_re
spinorFloat A0_re
spinorFloat A2_im
spinorFloat B1_im
#define g12_re
spinorFloat b0_re
#define i32_im
VOLATILE spinorFloat o22_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define g11_re
#define m5
const int face_num
spinorFloat B0_re
spinorFloat A1_re
spinorFloat b0_im
spinorFloat b1_re
#define i12_im
spinorFloat a0_re
#define gT12_im
float4 G0
spinorFloat a2_im
spinorFloat a2_re
#define GAUGE1TEX
coordsFromFaceIndex< 5, QUDA_4D_PC, kernel_type, 1 >(X, sid, coord, face_idx, face_num, param)
#define SPINORTEX
spinorFloat A0_im
#define gT01_re
spinorFloat B2_re
#define g21_im
#define a
#define i32_re
#define gT21_im
VOLATILE spinorFloat o20_im
#define i01_im
VOLATILE spinorFloat o12_re
#define gT20_im
VOLATILE spinorFloat o02_re
#define g02_re
#define VOLATILE
int X[4]
Definition: quda.h:29
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o00_im
RECONSTRUCT_GAUGE_MATRIX(0)
#define i02_re
VOLATILE spinorFloat o21_re
#define gT01_im
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define READ_SPINOR_GHOST
#define g02_im
#define i02_im
VOLATILE spinorFloat o31_im
#define i21_im
#define g12_im
#define gT11_im
spinorFloat a0_im
spinorFloat a1_im
VOLATILE spinorFloat o01_im
#define i22_re
VOLATILE spinorFloat o30_re
#define i00_im
#define gT12_re
#define i10_im
#define INTERTEX
#define gT02_re
VOLATILE spinorFloat o10_im
#define mdwf_b5
#define g20_re
#define g11_im
#define g22_re
#define g01_im
VOLATILE spinorFloat o02_im
#define spinorFloat
#define TPROJSCALE
#define g00_im
spinorFloat B0_im
#define gT10_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define GHOSTSPINORTEX
#define i30_re
int coord[5]
spinorFloat A1_im
#define i11_re
spinorFloat a1_re
float4 G4
float4 G1
VOLATILE spinorFloat o30_im
#define i21_re
READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid)
#define gT20_re
#define g00_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o32_re
#define g22_im
#define i00_re
#define g10_im
#define gT00_im
spinorFloat B2_im
spinorFloat b2_im
#define gT02_im