QUDA 0.9.0
dw_dslash_core.h — generated domain-wall dslash kernel core (Doxygen source listing; the embedded numbers at the start of each line below are the original file's line numbers).
Go to the documentation of this file.
1 // *** CUDA DSLASH ***
2 
// This generated core is textually #included inside a kernel body: the bare
// statements further down (e.g. `int sid = ...; if (...) return;`) execute in
// kernel scope, and names like I0/G0/param come from the including kernel.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
// On CUDA 4.1+ toolchains VOLATILE expands to nothing; older compilers get
// `volatile` (a historical workaround for compiler register handling).
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
// i<spin><color>_re / i<spin><color>_im name the 12 complex color-spin
// components of the input spinor. Double precision packs one complex number
// per double2 register (I0..I11); single precision packs two complex numbers
// per float4 register (I0..I5).
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #else
40 #define spinorFloat float
41 #define i00_re I0.x
42 #define i00_im I0.y
43 #define i01_re I0.z
44 #define i01_im I0.w
45 #define i02_re I1.x
46 #define i02_im I1.y
47 #define i10_re I1.z
48 #define i10_im I1.w
49 #define i11_re I2.x
50 #define i11_im I2.y
51 #define i12_re I2.z
52 #define i12_im I2.w
53 #define i20_re I3.x
54 #define i20_im I3.y
55 #define i21_re I3.z
56 #define i21_im I3.w
57 #define i22_re I4.x
58 #define i22_im I4.y
59 #define i30_re I4.z
60 #define i30_im I4.w
61 #define i31_re I5.x
62 #define i31_im I5.y
63 #define i32_re I5.z
64 #define i32_im I5.w
65 #endif // SPINOR_DOUBLE
66 
67 // gauge link
// g<row><col>_re / _im name the 9 complex entries of the 3x3 SU(3) link
// matrix. With GAUGE_FLOAT2 each complex entry occupies one 2-component
// register (G0..G8); otherwise two entries are packed per float4 (G0..G4).
68 #ifdef GAUGE_FLOAT2
69 #define g00_re G0.x
70 #define g00_im G0.y
71 #define g01_re G1.x
72 #define g01_im G1.y
73 #define g02_re G2.x
74 #define g02_im G2.y
75 #define g10_re G3.x
76 #define g10_im G3.y
77 #define g11_re G4.x
78 #define g11_im G4.y
79 #define g12_re G5.x
80 #define g12_im G5.y
81 #define g20_re G6.x
82 #define g20_im G6.y
83 #define g21_re G7.x
84 #define g21_im G7.y
85 #define g22_re G8.x
86 #define g22_im G8.y
87 
88 #else
89 #define g00_re G0.x
90 #define g00_im G0.y
91 #define g01_re G0.z
92 #define g01_im G0.w
93 #define g02_re G1.x
94 #define g02_im G1.y
95 #define g10_re G1.z
96 #define g10_im G1.w
97 #define g11_re G2.x
98 #define g11_im G2.y
99 #define g12_re G2.z
100 #define g12_im G2.w
101 #define g20_re G3.x
102 #define g20_im G3.y
103 #define g21_re G3.z
104 #define g21_im G3.w
105 #define g22_re G4.x
106 #define g22_im G4.y
107 
108 #endif // GAUGE_FLOAT2
109 
110 // conjugated gauge link
// gT is the Hermitian conjugate (dagger) of g: gT[i][j] = conj(g[j][i]).
// These expand to signed references into the same G registers, so no extra
// storage is used; they are applied when gathering from the negative
// direction.
111 #define gT00_re (+g00_re)
112 #define gT00_im (-g00_im)
113 #define gT01_re (+g10_re)
114 #define gT01_im (-g10_im)
115 #define gT02_re (+g20_re)
116 #define gT02_im (-g20_im)
117 #define gT10_re (+g01_re)
118 #define gT10_im (-g01_im)
119 #define gT11_re (+g11_re)
120 #define gT11_im (-g11_im)
121 #define gT12_re (+g21_re)
122 #define gT12_im (-g21_im)
123 #define gT20_re (+g02_re)
124 #define gT20_im (-g02_im)
125 #define gT21_re (+g12_re)
126 #define gT21_im (-g12_im)
127 #define gT22_re (+g22_re)
128 #define gT22_im (-g22_im)
129 
130 // output spinor
// NOTE(review): original lines 131-154 are absent from this extract (the
// numbering jumps from 130 to 155). Given the header above, they presumably
// declare the o##_re/o##_im output accumulators initialized below — confirm
// against the full file.
155 
// SHARED_STRIDE is the padding stride for shared-memory spinor storage; the
// per-architecture values are chosen (per the original comments) to avoid
// shared-memory bank conflicts, and differ between double (8-byte) and
// single (4-byte) element sizes.
156 #ifdef SPINOR_DOUBLE
157 #if (__COMPUTE_CAPABILITY__ >= 200)
158 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
159 #else
160 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
161 #endif
162 #else
163 #if (__COMPUTE_CAPABILITY__ >= 200)
164 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
165 #else
166 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
167 #endif
168 #endif
169 
// Helper macros: READ_GAUGE/ASSN_GAUGE_MATRIX and READ_SPINOR* used below.
170 #include "read_gauge.h"
171 #include "io_spinor.h"
172 
173 
// Flattened global thread index over the 5d (4d checkerboard x Ls) volume;
// one thread per output site. Threads past the logical volume exit early.
174 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
175 if (sid >= param.threads*param.dc.Ls) return;
176 
177 int X, coord[5];
178 
// NOTE(review): original line 179 is missing from this extract; it
// presumably computes X and coord[] from sid for the interior kernel
// (the exterior branch below gets them from coordsFromFaceIndex) — confirm
// against the full file.
180 
181 #ifdef MULTI_GPU
182 int face_idx;
183 if (kernel_type == INTERIOR_KERNEL) {
184 #endif
185 
186 
// Parity of the 5th-dimension slice this thread works on; selects which
// gauge-field parity texture is read in the gather blocks below.
188 s_parity = ( sid/param.dc.volume_4d_cb ) % 2;
189 
// Interior kernel: start the output accumulators at zero.
190  o00_re = 0; o00_im = 0;
191  o01_re = 0; o01_im = 0;
192  o02_re = 0; o02_im = 0;
193  o10_re = 0; o10_im = 0;
194  o11_re = 0; o11_im = 0;
195  o12_re = 0; o12_im = 0;
196  o20_re = 0; o20_im = 0;
197  o21_re = 0; o21_im = 0;
198  o22_re = 0; o22_im = 0;
199  o30_re = 0; o30_im = 0;
200  o31_re = 0; o31_im = 0;
201  o32_re = 0; o32_im = 0;
202 
203 #ifdef MULTI_GPU
204 } else { // exterior kernel
205 
206 const int face_volume = (param.threads*param.dc.Ls >> 1); // volume of one face
207 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
208 face_idx = sid - face_num*face_volume; // index into the respective face
209 
210 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
211 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
212 //sp_idx = face_idx + param.ghostOffset[dim];
213 
214 coordsFromFaceIndex<5,QUDA_5D_PC,kernel_type,1>(X, sid, coord, face_idx, face_num, param);
215 s_parity = ( sid/param.dc.volume_4d_cb ) % 2;
216 
// NOTE(review): original line 217 is missing from this extract; the i##
// registers copied into o## just below must be loaded by it (presumably a
// read of the partially-accumulated output spinor) — confirm against the
// full file.
218 
// Exterior kernel: seed the accumulators with the previously stored result
// so the boundary contributions computed below are added on top of it.
219  o00_re = i00_re; o00_im = i00_im;
220  o01_re = i01_re; o01_im = i01_im;
221  o02_re = i02_re; o02_im = i02_im;
222  o10_re = i10_re; o10_im = i10_im;
223  o11_re = i11_re; o11_im = i11_im;
224  o12_re = i12_re; o12_im = i12_im;
225  o20_re = i20_re; o20_im = i20_im;
226  o21_re = i21_re; o21_im = i21_im;
227  o22_re = i22_re; o22_im = i22_im;
228  o30_re = i30_re; o30_im = i30_im;
229  o31_re = i31_re; o31_im = i31_im;
230  o32_re = i32_re; o32_im = i32_im;
231 }
232 #endif // MULTI_GPU
233 
234 // declare G## here and use ASSN below instead of READ
// The G## registers back the g##_re/_im macros defined above and are
// assigned per-direction by ASSN_GAUGE_MATRIX in each gather block.
235 #ifdef GAUGE_FLOAT2
// DD_PREC==0 appears to select the double-precision build here (the
// original author flagged this as a "temporal hack") — confirm.
236 #if (DD_PREC==0) //temporal hack
237 double2 G0;
238 double2 G1;
239 double2 G2;
240 double2 G3;
241 double2 G4;
242 double2 G5;
243 double2 G6;
244 double2 G7;
245 double2 G8;
246 #else
247 float2 G0;
248 float2 G1;
249 float2 G2;
250 float2 G3;
251 float2 G4;
252 float2 G5;
253 float2 G6;
254 float2 G7;
255 float2 G8;
256 #endif
257 #else
258 float4 G0;
259 float4 G1;
260 float4 G2;
261 float4 G3;
262 float4 G4;
263 #endif
264 
265 
266 
267 #ifdef MULTI_GPU
268 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]<(param.dc.X[0]-1))) ||
269  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==(param.dc.X[0]-1)) )
270 #endif
271 {
272  // Projector P0-
273  // 1 0 0 -i
274  // 0 1 -i 0
275  // 0 i 1 0
276  // i 0 0 1
277 
278 #ifdef MULTI_GPU
279  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1 :
280  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
281 #if (DD_PREC==2) // half precision
282  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
283 #endif
284 #else
285  const int sp_idx = (coord[0]==(param.dc.X[0]-1) ? X-(param.dc.X[0]-1) : X+1) >> 1;
286 #endif
287 
288  const int ga_idx = sid % param.dc.volume_4d_cb;
289 
290  // read gauge matrix from device memory
291  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, param.gauge_stride); }
292  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, param.gauge_stride); }
293 
300 
301 #ifdef MULTI_GPU
302  if (kernel_type == INTERIOR_KERNEL) {
303 #endif
304 
305  // read spinor from device memory
306  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
307 
308  // project spinor into half spinors
309  a0_re = +i00_re+i30_im;
310  a0_im = +i00_im-i30_re;
311  a1_re = +i01_re+i31_im;
312  a1_im = +i01_im-i31_re;
313  a2_re = +i02_re+i32_im;
314  a2_im = +i02_im-i32_re;
315  b0_re = +i10_re+i20_im;
316  b0_im = +i10_im-i20_re;
317  b1_re = +i11_re+i21_im;
318  b1_im = +i11_im-i21_re;
319  b2_re = +i12_re+i22_im;
320  b2_im = +i12_im-i22_re;
321 
322 #ifdef MULTI_GPU
323  } else {
324 
325  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
326 
327  // read half spinor from device memory
328  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 0);
329 
330  a0_re = i00_re; a0_im = i00_im;
331  a1_re = i01_re; a1_im = i01_im;
332  a2_re = i02_re; a2_im = i02_im;
333  b0_re = i10_re; b0_im = i10_im;
334  b1_re = i11_re; b1_im = i11_im;
335  b2_re = i12_re; b2_im = i12_im;
336 
337  }
338 #endif // MULTI_GPU
339 
340  // reconstruct gauge matrix
342 
343  // multiply row 0
345  A0_re += g00_re * a0_re;
346  A0_re -= g00_im * a0_im;
347  A0_re += g01_re * a1_re;
348  A0_re -= g01_im * a1_im;
349  A0_re += g02_re * a2_re;
350  A0_re -= g02_im * a2_im;
352  A0_im += g00_re * a0_im;
353  A0_im += g00_im * a0_re;
354  A0_im += g01_re * a1_im;
355  A0_im += g01_im * a1_re;
356  A0_im += g02_re * a2_im;
357  A0_im += g02_im * a2_re;
359  B0_re += g00_re * b0_re;
360  B0_re -= g00_im * b0_im;
361  B0_re += g01_re * b1_re;
362  B0_re -= g01_im * b1_im;
363  B0_re += g02_re * b2_re;
364  B0_re -= g02_im * b2_im;
366  B0_im += g00_re * b0_im;
367  B0_im += g00_im * b0_re;
368  B0_im += g01_re * b1_im;
369  B0_im += g01_im * b1_re;
370  B0_im += g02_re * b2_im;
371  B0_im += g02_im * b2_re;
372 
373  // multiply row 1
375  A1_re += g10_re * a0_re;
376  A1_re -= g10_im * a0_im;
377  A1_re += g11_re * a1_re;
378  A1_re -= g11_im * a1_im;
379  A1_re += g12_re * a2_re;
380  A1_re -= g12_im * a2_im;
382  A1_im += g10_re * a0_im;
383  A1_im += g10_im * a0_re;
384  A1_im += g11_re * a1_im;
385  A1_im += g11_im * a1_re;
386  A1_im += g12_re * a2_im;
387  A1_im += g12_im * a2_re;
389  B1_re += g10_re * b0_re;
390  B1_re -= g10_im * b0_im;
391  B1_re += g11_re * b1_re;
392  B1_re -= g11_im * b1_im;
393  B1_re += g12_re * b2_re;
394  B1_re -= g12_im * b2_im;
396  B1_im += g10_re * b0_im;
397  B1_im += g10_im * b0_re;
398  B1_im += g11_re * b1_im;
399  B1_im += g11_im * b1_re;
400  B1_im += g12_re * b2_im;
401  B1_im += g12_im * b2_re;
402 
403  // multiply row 2
405  A2_re += g20_re * a0_re;
406  A2_re -= g20_im * a0_im;
407  A2_re += g21_re * a1_re;
408  A2_re -= g21_im * a1_im;
409  A2_re += g22_re * a2_re;
410  A2_re -= g22_im * a2_im;
412  A2_im += g20_re * a0_im;
413  A2_im += g20_im * a0_re;
414  A2_im += g21_re * a1_im;
415  A2_im += g21_im * a1_re;
416  A2_im += g22_re * a2_im;
417  A2_im += g22_im * a2_re;
419  B2_re += g20_re * b0_re;
420  B2_re -= g20_im * b0_im;
421  B2_re += g21_re * b1_re;
422  B2_re -= g21_im * b1_im;
423  B2_re += g22_re * b2_re;
424  B2_re -= g22_im * b2_im;
426  B2_im += g20_re * b0_im;
427  B2_im += g20_im * b0_re;
428  B2_im += g21_re * b1_im;
429  B2_im += g21_im * b1_re;
430  B2_im += g22_re * b2_im;
431  B2_im += g22_im * b2_re;
432 
433  o00_re += A0_re;
434  o00_im += A0_im;
435  o10_re += B0_re;
436  o10_im += B0_im;
437  o20_re -= B0_im;
438  o20_im += B0_re;
439  o30_re -= A0_im;
440  o30_im += A0_re;
441 
442  o01_re += A1_re;
443  o01_im += A1_im;
444  o11_re += B1_re;
445  o11_im += B1_im;
446  o21_re -= B1_im;
447  o21_im += B1_re;
448  o31_re -= A1_im;
449  o31_im += A1_re;
450 
451  o02_re += A2_re;
452  o02_im += A2_im;
453  o12_re += B2_re;
454  o12_im += B2_im;
455  o22_re -= B2_im;
456  o22_im += B2_re;
457  o32_re -= A2_im;
458  o32_im += A2_re;
459 }
460 
461 #ifdef MULTI_GPU
462 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || coord[0]>0)) ||
463  (kernel_type == EXTERIOR_KERNEL_X && coord[0]==0) )
464 #endif
465 {
466  // Projector P0+
467  // 1 0 0 i
468  // 0 1 i 0
469  // 0 -i 1 0
470  // -i 0 0 1
471 
472 #ifdef MULTI_GPU
473  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1 :
474  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
475 #if (DD_PREC==2) // half precision
476  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
477 #endif
478 #else
479  const int sp_idx = (coord[0]==0 ? X+(param.dc.X[0]-1) : X-1) >> 1;
480 #endif
481 
482 #ifdef MULTI_GPU
483  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
484 #else
485  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
486 #endif
487 
488  // read gauge matrix from device memory
489  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, param.gauge_stride); }
490  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, param.gauge_stride); }
491 
498 
499 #ifdef MULTI_GPU
500  if (kernel_type == INTERIOR_KERNEL) {
501 #endif
502 
503  // read spinor from device memory
504  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
505 
506  // project spinor into half spinors
507  a0_re = +i00_re-i30_im;
508  a0_im = +i00_im+i30_re;
509  a1_re = +i01_re-i31_im;
510  a1_im = +i01_im+i31_re;
511  a2_re = +i02_re-i32_im;
512  a2_im = +i02_im+i32_re;
513  b0_re = +i10_re-i20_im;
514  b0_im = +i10_im+i20_re;
515  b1_re = +i11_re-i21_im;
516  b1_im = +i11_im+i21_re;
517  b2_re = +i12_re-i22_im;
518  b2_im = +i12_im+i22_re;
519 
520 #ifdef MULTI_GPU
521  } else {
522 
523  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
524 
525  // read half spinor from device memory
526  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 1);
527 
528  a0_re = i00_re; a0_im = i00_im;
529  a1_re = i01_re; a1_im = i01_im;
530  a2_re = i02_re; a2_im = i02_im;
531  b0_re = i10_re; b0_im = i10_im;
532  b1_re = i11_re; b1_im = i11_im;
533  b2_re = i12_re; b2_im = i12_im;
534 
535  }
536 #endif // MULTI_GPU
537 
538  // reconstruct gauge matrix
540 
541  // multiply row 0
542  spinorFloat A0_re = 0;
543  A0_re += gT00_re * a0_re;
544  A0_re -= gT00_im * a0_im;
545  A0_re += gT01_re * a1_re;
546  A0_re -= gT01_im * a1_im;
547  A0_re += gT02_re * a2_re;
548  A0_re -= gT02_im * a2_im;
549  spinorFloat A0_im = 0;
550  A0_im += gT00_re * a0_im;
551  A0_im += gT00_im * a0_re;
552  A0_im += gT01_re * a1_im;
553  A0_im += gT01_im * a1_re;
554  A0_im += gT02_re * a2_im;
555  A0_im += gT02_im * a2_re;
556  spinorFloat B0_re = 0;
557  B0_re += gT00_re * b0_re;
558  B0_re -= gT00_im * b0_im;
559  B0_re += gT01_re * b1_re;
560  B0_re -= gT01_im * b1_im;
561  B0_re += gT02_re * b2_re;
562  B0_re -= gT02_im * b2_im;
563  spinorFloat B0_im = 0;
564  B0_im += gT00_re * b0_im;
565  B0_im += gT00_im * b0_re;
566  B0_im += gT01_re * b1_im;
567  B0_im += gT01_im * b1_re;
568  B0_im += gT02_re * b2_im;
569  B0_im += gT02_im * b2_re;
570 
571  // multiply row 1
572  spinorFloat A1_re = 0;
573  A1_re += gT10_re * a0_re;
574  A1_re -= gT10_im * a0_im;
575  A1_re += gT11_re * a1_re;
576  A1_re -= gT11_im * a1_im;
577  A1_re += gT12_re * a2_re;
578  A1_re -= gT12_im * a2_im;
579  spinorFloat A1_im = 0;
580  A1_im += gT10_re * a0_im;
581  A1_im += gT10_im * a0_re;
582  A1_im += gT11_re * a1_im;
583  A1_im += gT11_im * a1_re;
584  A1_im += gT12_re * a2_im;
585  A1_im += gT12_im * a2_re;
586  spinorFloat B1_re = 0;
587  B1_re += gT10_re * b0_re;
588  B1_re -= gT10_im * b0_im;
589  B1_re += gT11_re * b1_re;
590  B1_re -= gT11_im * b1_im;
591  B1_re += gT12_re * b2_re;
592  B1_re -= gT12_im * b2_im;
593  spinorFloat B1_im = 0;
594  B1_im += gT10_re * b0_im;
595  B1_im += gT10_im * b0_re;
596  B1_im += gT11_re * b1_im;
597  B1_im += gT11_im * b1_re;
598  B1_im += gT12_re * b2_im;
599  B1_im += gT12_im * b2_re;
600 
601  // multiply row 2
602  spinorFloat A2_re = 0;
603  A2_re += gT20_re * a0_re;
604  A2_re -= gT20_im * a0_im;
605  A2_re += gT21_re * a1_re;
606  A2_re -= gT21_im * a1_im;
607  A2_re += gT22_re * a2_re;
608  A2_re -= gT22_im * a2_im;
609  spinorFloat A2_im = 0;
610  A2_im += gT20_re * a0_im;
611  A2_im += gT20_im * a0_re;
612  A2_im += gT21_re * a1_im;
613  A2_im += gT21_im * a1_re;
614  A2_im += gT22_re * a2_im;
615  A2_im += gT22_im * a2_re;
616  spinorFloat B2_re = 0;
617  B2_re += gT20_re * b0_re;
618  B2_re -= gT20_im * b0_im;
619  B2_re += gT21_re * b1_re;
620  B2_re -= gT21_im * b1_im;
621  B2_re += gT22_re * b2_re;
622  B2_re -= gT22_im * b2_im;
623  spinorFloat B2_im = 0;
624  B2_im += gT20_re * b0_im;
625  B2_im += gT20_im * b0_re;
626  B2_im += gT21_re * b1_im;
627  B2_im += gT21_im * b1_re;
628  B2_im += gT22_re * b2_im;
629  B2_im += gT22_im * b2_re;
630 
631  o00_re += A0_re;
632  o00_im += A0_im;
633  o10_re += B0_re;
634  o10_im += B0_im;
635  o20_re += B0_im;
636  o20_im -= B0_re;
637  o30_re += A0_im;
638  o30_im -= A0_re;
639 
640  o01_re += A1_re;
641  o01_im += A1_im;
642  o11_re += B1_re;
643  o11_im += B1_im;
644  o21_re += B1_im;
645  o21_im -= B1_re;
646  o31_re += A1_im;
647  o31_im -= A1_re;
648 
649  o02_re += A2_re;
650  o02_im += A2_im;
651  o12_re += B2_re;
652  o12_im += B2_im;
653  o22_re += B2_im;
654  o22_im -= B2_re;
655  o32_re += A2_im;
656  o32_im -= A2_re;
657 }
658 
659 #ifdef MULTI_GPU
660 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]<(param.dc.X[1]-1))) ||
661  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==(param.dc.X[1]-1)) )
662 #endif
663 {
664  // Projector P1-
665  // 1 0 0 -1
666  // 0 1 1 0
667  // 0 1 1 0
668  // -1 0 0 1
669 
670 #ifdef MULTI_GPU
671  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1 :
672  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
673 #if (DD_PREC==2) // half precision
674  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
675 #endif
676 #else
677  const int sp_idx = (coord[1]==(param.dc.X[1]-1) ? X-param.dc.X2X1mX1 : X+param.dc.X[0]) >> 1;
678 #endif
679 
680  const int ga_idx = sid % param.dc.volume_4d_cb;
681 
682  // read gauge matrix from device memory
683  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, param.gauge_stride); }
684  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, param.gauge_stride); }
685 
692 
693 #ifdef MULTI_GPU
694  if (kernel_type == INTERIOR_KERNEL) {
695 #endif
696 
697  // read spinor from device memory
698  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
699 
700  // project spinor into half spinors
701  a0_re = +i00_re-i30_re;
702  a0_im = +i00_im-i30_im;
703  a1_re = +i01_re-i31_re;
704  a1_im = +i01_im-i31_im;
705  a2_re = +i02_re-i32_re;
706  a2_im = +i02_im-i32_im;
707  b0_re = +i10_re+i20_re;
708  b0_im = +i10_im+i20_im;
709  b1_re = +i11_re+i21_re;
710  b1_im = +i11_im+i21_im;
711  b2_re = +i12_re+i22_re;
712  b2_im = +i12_im+i22_im;
713 
714 #ifdef MULTI_GPU
715  } else {
716 
717  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
718 
719  // read half spinor from device memory
720  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 2);
721 
722  a0_re = i00_re; a0_im = i00_im;
723  a1_re = i01_re; a1_im = i01_im;
724  a2_re = i02_re; a2_im = i02_im;
725  b0_re = i10_re; b0_im = i10_im;
726  b1_re = i11_re; b1_im = i11_im;
727  b2_re = i12_re; b2_im = i12_im;
728 
729  }
730 #endif // MULTI_GPU
731 
732  // reconstruct gauge matrix
734 
735  // multiply row 0
736  spinorFloat A0_re = 0;
737  A0_re += g00_re * a0_re;
738  A0_re -= g00_im * a0_im;
739  A0_re += g01_re * a1_re;
740  A0_re -= g01_im * a1_im;
741  A0_re += g02_re * a2_re;
742  A0_re -= g02_im * a2_im;
743  spinorFloat A0_im = 0;
744  A0_im += g00_re * a0_im;
745  A0_im += g00_im * a0_re;
746  A0_im += g01_re * a1_im;
747  A0_im += g01_im * a1_re;
748  A0_im += g02_re * a2_im;
749  A0_im += g02_im * a2_re;
750  spinorFloat B0_re = 0;
751  B0_re += g00_re * b0_re;
752  B0_re -= g00_im * b0_im;
753  B0_re += g01_re * b1_re;
754  B0_re -= g01_im * b1_im;
755  B0_re += g02_re * b2_re;
756  B0_re -= g02_im * b2_im;
757  spinorFloat B0_im = 0;
758  B0_im += g00_re * b0_im;
759  B0_im += g00_im * b0_re;
760  B0_im += g01_re * b1_im;
761  B0_im += g01_im * b1_re;
762  B0_im += g02_re * b2_im;
763  B0_im += g02_im * b2_re;
764 
765  // multiply row 1
766  spinorFloat A1_re = 0;
767  A1_re += g10_re * a0_re;
768  A1_re -= g10_im * a0_im;
769  A1_re += g11_re * a1_re;
770  A1_re -= g11_im * a1_im;
771  A1_re += g12_re * a2_re;
772  A1_re -= g12_im * a2_im;
773  spinorFloat A1_im = 0;
774  A1_im += g10_re * a0_im;
775  A1_im += g10_im * a0_re;
776  A1_im += g11_re * a1_im;
777  A1_im += g11_im * a1_re;
778  A1_im += g12_re * a2_im;
779  A1_im += g12_im * a2_re;
780  spinorFloat B1_re = 0;
781  B1_re += g10_re * b0_re;
782  B1_re -= g10_im * b0_im;
783  B1_re += g11_re * b1_re;
784  B1_re -= g11_im * b1_im;
785  B1_re += g12_re * b2_re;
786  B1_re -= g12_im * b2_im;
787  spinorFloat B1_im = 0;
788  B1_im += g10_re * b0_im;
789  B1_im += g10_im * b0_re;
790  B1_im += g11_re * b1_im;
791  B1_im += g11_im * b1_re;
792  B1_im += g12_re * b2_im;
793  B1_im += g12_im * b2_re;
794 
795  // multiply row 2
796  spinorFloat A2_re = 0;
797  A2_re += g20_re * a0_re;
798  A2_re -= g20_im * a0_im;
799  A2_re += g21_re * a1_re;
800  A2_re -= g21_im * a1_im;
801  A2_re += g22_re * a2_re;
802  A2_re -= g22_im * a2_im;
803  spinorFloat A2_im = 0;
804  A2_im += g20_re * a0_im;
805  A2_im += g20_im * a0_re;
806  A2_im += g21_re * a1_im;
807  A2_im += g21_im * a1_re;
808  A2_im += g22_re * a2_im;
809  A2_im += g22_im * a2_re;
810  spinorFloat B2_re = 0;
811  B2_re += g20_re * b0_re;
812  B2_re -= g20_im * b0_im;
813  B2_re += g21_re * b1_re;
814  B2_re -= g21_im * b1_im;
815  B2_re += g22_re * b2_re;
816  B2_re -= g22_im * b2_im;
817  spinorFloat B2_im = 0;
818  B2_im += g20_re * b0_im;
819  B2_im += g20_im * b0_re;
820  B2_im += g21_re * b1_im;
821  B2_im += g21_im * b1_re;
822  B2_im += g22_re * b2_im;
823  B2_im += g22_im * b2_re;
824 
825  o00_re += A0_re;
826  o00_im += A0_im;
827  o10_re += B0_re;
828  o10_im += B0_im;
829  o20_re += B0_re;
830  o20_im += B0_im;
831  o30_re -= A0_re;
832  o30_im -= A0_im;
833 
834  o01_re += A1_re;
835  o01_im += A1_im;
836  o11_re += B1_re;
837  o11_im += B1_im;
838  o21_re += B1_re;
839  o21_im += B1_im;
840  o31_re -= A1_re;
841  o31_im -= A1_im;
842 
843  o02_re += A2_re;
844  o02_im += A2_im;
845  o12_re += B2_re;
846  o12_im += B2_im;
847  o22_re += B2_re;
848  o22_im += B2_im;
849  o32_re -= A2_re;
850  o32_im -= A2_im;
851 }
852 
853 #ifdef MULTI_GPU
854 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || coord[1]>0)) ||
855  (kernel_type == EXTERIOR_KERNEL_Y && coord[1]==0) )
856 #endif
857 {
858  // Projector P1+
859  // 1 0 0 1
860  // 0 1 -1 0
861  // 0 -1 1 0
862  // 1 0 0 1
863 
864 #ifdef MULTI_GPU
865  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1 :
866  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
867 #if (DD_PREC==2) // half precision
868  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
869 #endif
870 #else
871  const int sp_idx = (coord[1]==0 ? X+param.dc.X2X1mX1 : X-param.dc.X[0]) >> 1;
872 #endif
873 
874 #ifdef MULTI_GPU
875  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
876 #else
877  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
878 #endif
879 
880  // read gauge matrix from device memory
881  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, param.gauge_stride); }
882  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, param.gauge_stride); }
883 
890 
891 #ifdef MULTI_GPU
892  if (kernel_type == INTERIOR_KERNEL) {
893 #endif
894 
895  // read spinor from device memory
896  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
897 
898  // project spinor into half spinors
899  a0_re = +i00_re+i30_re;
900  a0_im = +i00_im+i30_im;
901  a1_re = +i01_re+i31_re;
902  a1_im = +i01_im+i31_im;
903  a2_re = +i02_re+i32_re;
904  a2_im = +i02_im+i32_im;
905  b0_re = +i10_re-i20_re;
906  b0_im = +i10_im-i20_im;
907  b1_re = +i11_re-i21_re;
908  b1_im = +i11_im-i21_im;
909  b2_re = +i12_re-i22_re;
910  b2_im = +i12_im-i22_im;
911 
912 #ifdef MULTI_GPU
913  } else {
914 
915  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
916 
917  // read half spinor from device memory
918  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 3);
919 
920  a0_re = i00_re; a0_im = i00_im;
921  a1_re = i01_re; a1_im = i01_im;
922  a2_re = i02_re; a2_im = i02_im;
923  b0_re = i10_re; b0_im = i10_im;
924  b1_re = i11_re; b1_im = i11_im;
925  b2_re = i12_re; b2_im = i12_im;
926 
927  }
928 #endif // MULTI_GPU
929 
930  // reconstruct gauge matrix
932 
933  // multiply row 0
934  spinorFloat A0_re = 0;
935  A0_re += gT00_re * a0_re;
936  A0_re -= gT00_im * a0_im;
937  A0_re += gT01_re * a1_re;
938  A0_re -= gT01_im * a1_im;
939  A0_re += gT02_re * a2_re;
940  A0_re -= gT02_im * a2_im;
941  spinorFloat A0_im = 0;
942  A0_im += gT00_re * a0_im;
943  A0_im += gT00_im * a0_re;
944  A0_im += gT01_re * a1_im;
945  A0_im += gT01_im * a1_re;
946  A0_im += gT02_re * a2_im;
947  A0_im += gT02_im * a2_re;
948  spinorFloat B0_re = 0;
949  B0_re += gT00_re * b0_re;
950  B0_re -= gT00_im * b0_im;
951  B0_re += gT01_re * b1_re;
952  B0_re -= gT01_im * b1_im;
953  B0_re += gT02_re * b2_re;
954  B0_re -= gT02_im * b2_im;
955  spinorFloat B0_im = 0;
956  B0_im += gT00_re * b0_im;
957  B0_im += gT00_im * b0_re;
958  B0_im += gT01_re * b1_im;
959  B0_im += gT01_im * b1_re;
960  B0_im += gT02_re * b2_im;
961  B0_im += gT02_im * b2_re;
962 
963  // multiply row 1
964  spinorFloat A1_re = 0;
965  A1_re += gT10_re * a0_re;
966  A1_re -= gT10_im * a0_im;
967  A1_re += gT11_re * a1_re;
968  A1_re -= gT11_im * a1_im;
969  A1_re += gT12_re * a2_re;
970  A1_re -= gT12_im * a2_im;
971  spinorFloat A1_im = 0;
972  A1_im += gT10_re * a0_im;
973  A1_im += gT10_im * a0_re;
974  A1_im += gT11_re * a1_im;
975  A1_im += gT11_im * a1_re;
976  A1_im += gT12_re * a2_im;
977  A1_im += gT12_im * a2_re;
978  spinorFloat B1_re = 0;
979  B1_re += gT10_re * b0_re;
980  B1_re -= gT10_im * b0_im;
981  B1_re += gT11_re * b1_re;
982  B1_re -= gT11_im * b1_im;
983  B1_re += gT12_re * b2_re;
984  B1_re -= gT12_im * b2_im;
985  spinorFloat B1_im = 0;
986  B1_im += gT10_re * b0_im;
987  B1_im += gT10_im * b0_re;
988  B1_im += gT11_re * b1_im;
989  B1_im += gT11_im * b1_re;
990  B1_im += gT12_re * b2_im;
991  B1_im += gT12_im * b2_re;
992 
993  // multiply row 2
994  spinorFloat A2_re = 0;
995  A2_re += gT20_re * a0_re;
996  A2_re -= gT20_im * a0_im;
997  A2_re += gT21_re * a1_re;
998  A2_re -= gT21_im * a1_im;
999  A2_re += gT22_re * a2_re;
1000  A2_re -= gT22_im * a2_im;
1001  spinorFloat A2_im = 0;
1002  A2_im += gT20_re * a0_im;
1003  A2_im += gT20_im * a0_re;
1004  A2_im += gT21_re * a1_im;
1005  A2_im += gT21_im * a1_re;
1006  A2_im += gT22_re * a2_im;
1007  A2_im += gT22_im * a2_re;
1008  spinorFloat B2_re = 0;
1009  B2_re += gT20_re * b0_re;
1010  B2_re -= gT20_im * b0_im;
1011  B2_re += gT21_re * b1_re;
1012  B2_re -= gT21_im * b1_im;
1013  B2_re += gT22_re * b2_re;
1014  B2_re -= gT22_im * b2_im;
1015  spinorFloat B2_im = 0;
1016  B2_im += gT20_re * b0_im;
1017  B2_im += gT20_im * b0_re;
1018  B2_im += gT21_re * b1_im;
1019  B2_im += gT21_im * b1_re;
1020  B2_im += gT22_re * b2_im;
1021  B2_im += gT22_im * b2_re;
1022 
1023  o00_re += A0_re;
1024  o00_im += A0_im;
1025  o10_re += B0_re;
1026  o10_im += B0_im;
1027  o20_re -= B0_re;
1028  o20_im -= B0_im;
1029  o30_re += A0_re;
1030  o30_im += A0_im;
1031 
1032  o01_re += A1_re;
1033  o01_im += A1_im;
1034  o11_re += B1_re;
1035  o11_im += B1_im;
1036  o21_re -= B1_re;
1037  o21_im -= B1_im;
1038  o31_re += A1_re;
1039  o31_im += A1_im;
1040 
1041  o02_re += A2_re;
1042  o02_im += A2_im;
1043  o12_re += B2_re;
1044  o12_im += B2_im;
1045  o22_re -= B2_re;
1046  o22_im -= B2_im;
1047  o32_re += A2_re;
1048  o32_im += A2_im;
1049 }
1050 
1051 #ifdef MULTI_GPU
1052 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]<(param.dc.X[2]-1))) ||
1053  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==(param.dc.X[2]-1)) )
1054 #endif
1055 {
1056  // Projector P2-
1057  // 1 0 -i 0
1058  // 0 1 0 i
1059  // i 0 1 0
1060  // 0 -i 0 1
1061 
1062 #ifdef MULTI_GPU
1063  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1 :
1064  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1065 #if (DD_PREC==2) // half precision
1066  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1067 #endif
1068 #else
1069  const int sp_idx = (coord[2]==(param.dc.X[2]-1) ? X-param.dc.X3X2X1mX2X1 : X+param.dc.X2X1) >> 1;
1070 #endif
1071 
1072  const int ga_idx = sid % param.dc.volume_4d_cb;
1073 
1074  // read gauge matrix from device memory
1075  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, param.gauge_stride); }
1076  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, param.gauge_stride); }
1077 
1084 
1085 #ifdef MULTI_GPU
1086  if (kernel_type == INTERIOR_KERNEL) {
1087 #endif
1088 
1089  // read spinor from device memory
1090  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1091 
1092  // project spinor into half spinors
1093  a0_re = +i00_re+i20_im;
1094  a0_im = +i00_im-i20_re;
1095  a1_re = +i01_re+i21_im;
1096  a1_im = +i01_im-i21_re;
1097  a2_re = +i02_re+i22_im;
1098  a2_im = +i02_im-i22_re;
1099  b0_re = +i10_re-i30_im;
1100  b0_im = +i10_im+i30_re;
1101  b1_re = +i11_re-i31_im;
1102  b1_im = +i11_im+i31_re;
1103  b2_re = +i12_re-i32_im;
1104  b2_im = +i12_im+i32_re;
1105 
1106 #ifdef MULTI_GPU
1107  } else {
1108 
1109  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1110 
1111  // read half spinor from device memory
1112  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 4);
1113 
1114  a0_re = i00_re; a0_im = i00_im;
1115  a1_re = i01_re; a1_im = i01_im;
1116  a2_re = i02_re; a2_im = i02_im;
1117  b0_re = i10_re; b0_im = i10_im;
1118  b1_re = i11_re; b1_im = i11_im;
1119  b2_re = i12_re; b2_im = i12_im;
1120 
1121  }
1122 #endif // MULTI_GPU
1123 
1124  // reconstruct gauge matrix
1126 
1127  // multiply row 0
1128  spinorFloat A0_re = 0;
1129  A0_re += g00_re * a0_re;
1130  A0_re -= g00_im * a0_im;
1131  A0_re += g01_re * a1_re;
1132  A0_re -= g01_im * a1_im;
1133  A0_re += g02_re * a2_re;
1134  A0_re -= g02_im * a2_im;
1135  spinorFloat A0_im = 0;
1136  A0_im += g00_re * a0_im;
1137  A0_im += g00_im * a0_re;
1138  A0_im += g01_re * a1_im;
1139  A0_im += g01_im * a1_re;
1140  A0_im += g02_re * a2_im;
1141  A0_im += g02_im * a2_re;
1142  spinorFloat B0_re = 0;
1143  B0_re += g00_re * b0_re;
1144  B0_re -= g00_im * b0_im;
1145  B0_re += g01_re * b1_re;
1146  B0_re -= g01_im * b1_im;
1147  B0_re += g02_re * b2_re;
1148  B0_re -= g02_im * b2_im;
1149  spinorFloat B0_im = 0;
1150  B0_im += g00_re * b0_im;
1151  B0_im += g00_im * b0_re;
1152  B0_im += g01_re * b1_im;
1153  B0_im += g01_im * b1_re;
1154  B0_im += g02_re * b2_im;
1155  B0_im += g02_im * b2_re;
1156 
1157  // multiply row 1
1158  spinorFloat A1_re = 0;
1159  A1_re += g10_re * a0_re;
1160  A1_re -= g10_im * a0_im;
1161  A1_re += g11_re * a1_re;
1162  A1_re -= g11_im * a1_im;
1163  A1_re += g12_re * a2_re;
1164  A1_re -= g12_im * a2_im;
1165  spinorFloat A1_im = 0;
1166  A1_im += g10_re * a0_im;
1167  A1_im += g10_im * a0_re;
1168  A1_im += g11_re * a1_im;
1169  A1_im += g11_im * a1_re;
1170  A1_im += g12_re * a2_im;
1171  A1_im += g12_im * a2_re;
1172  spinorFloat B1_re = 0;
1173  B1_re += g10_re * b0_re;
1174  B1_re -= g10_im * b0_im;
1175  B1_re += g11_re * b1_re;
1176  B1_re -= g11_im * b1_im;
1177  B1_re += g12_re * b2_re;
1178  B1_re -= g12_im * b2_im;
1179  spinorFloat B1_im = 0;
1180  B1_im += g10_re * b0_im;
1181  B1_im += g10_im * b0_re;
1182  B1_im += g11_re * b1_im;
1183  B1_im += g11_im * b1_re;
1184  B1_im += g12_re * b2_im;
1185  B1_im += g12_im * b2_re;
1186 
1187  // multiply row 2
1188  spinorFloat A2_re = 0;
1189  A2_re += g20_re * a0_re;
1190  A2_re -= g20_im * a0_im;
1191  A2_re += g21_re * a1_re;
1192  A2_re -= g21_im * a1_im;
1193  A2_re += g22_re * a2_re;
1194  A2_re -= g22_im * a2_im;
1195  spinorFloat A2_im = 0;
1196  A2_im += g20_re * a0_im;
1197  A2_im += g20_im * a0_re;
1198  A2_im += g21_re * a1_im;
1199  A2_im += g21_im * a1_re;
1200  A2_im += g22_re * a2_im;
1201  A2_im += g22_im * a2_re;
1202  spinorFloat B2_re = 0;
1203  B2_re += g20_re * b0_re;
1204  B2_re -= g20_im * b0_im;
1205  B2_re += g21_re * b1_re;
1206  B2_re -= g21_im * b1_im;
1207  B2_re += g22_re * b2_re;
1208  B2_re -= g22_im * b2_im;
1209  spinorFloat B2_im = 0;
1210  B2_im += g20_re * b0_im;
1211  B2_im += g20_im * b0_re;
1212  B2_im += g21_re * b1_im;
1213  B2_im += g21_im * b1_re;
1214  B2_im += g22_re * b2_im;
1215  B2_im += g22_im * b2_re;
1216 
1217  o00_re += A0_re;
1218  o00_im += A0_im;
1219  o10_re += B0_re;
1220  o10_im += B0_im;
1221  o20_re -= A0_im;
1222  o20_im += A0_re;
1223  o30_re += B0_im;
1224  o30_im -= B0_re;
1225 
1226  o01_re += A1_re;
1227  o01_im += A1_im;
1228  o11_re += B1_re;
1229  o11_im += B1_im;
1230  o21_re -= A1_im;
1231  o21_im += A1_re;
1232  o31_re += B1_im;
1233  o31_im -= B1_re;
1234 
1235  o02_re += A2_re;
1236  o02_im += A2_im;
1237  o12_re += B2_re;
1238  o12_im += B2_im;
1239  o22_re -= A2_im;
1240  o22_im += A2_re;
1241  o32_re += B2_im;
1242  o32_im -= B2_re;
1243 }
1244 
1245 #ifdef MULTI_GPU
1246 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || coord[2]>0)) ||
1247  (kernel_type == EXTERIOR_KERNEL_Z && coord[2]==0) )
1248 #endif
1249 {
1250  // Projector P2+
1251  // 1 0 i 0
1252  // 0 1 0 -i
1253  // -i 0 1 0
1254  // 0 i 0 1
1255 
1256 #ifdef MULTI_GPU
1257  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1 :
1258  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1259 #if (DD_PREC==2) // half precision
1260  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1261 #endif
1262 #else
1263  const int sp_idx = (coord[2]==0 ? X+param.dc.X3X2X1mX2X1 : X-param.dc.X2X1) >> 1;
1264 #endif
1265 
1266 #ifdef MULTI_GPU
1267  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
1268 #else
1269  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
1270 #endif
1271 
1272  // read gauge matrix from device memory
1273  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, param.gauge_stride); }
1274  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, param.gauge_stride); }
1275 
1282 
1283 #ifdef MULTI_GPU
1284  if (kernel_type == INTERIOR_KERNEL) {
1285 #endif
1286 
1287  // read spinor from device memory
1288  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1289 
1290  // project spinor into half spinors
1291  a0_re = +i00_re-i20_im;
1292  a0_im = +i00_im+i20_re;
1293  a1_re = +i01_re-i21_im;
1294  a1_im = +i01_im+i21_re;
1295  a2_re = +i02_re-i22_im;
1296  a2_im = +i02_im+i22_re;
1297  b0_re = +i10_re+i30_im;
1298  b0_im = +i10_im-i30_re;
1299  b1_re = +i11_re+i31_im;
1300  b1_im = +i11_im-i31_re;
1301  b2_re = +i12_re+i32_im;
1302  b2_im = +i12_im-i32_re;
1303 
1304 #ifdef MULTI_GPU
1305  } else {
1306 
1307  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1308 
1309  // read half spinor from device memory
1310  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 5);
1311 
1312  a0_re = i00_re; a0_im = i00_im;
1313  a1_re = i01_re; a1_im = i01_im;
1314  a2_re = i02_re; a2_im = i02_im;
1315  b0_re = i10_re; b0_im = i10_im;
1316  b1_re = i11_re; b1_im = i11_im;
1317  b2_re = i12_re; b2_im = i12_im;
1318 
1319  }
1320 #endif // MULTI_GPU
1321 
1322  // reconstruct gauge matrix
1324 
1325  // multiply row 0
1326  spinorFloat A0_re = 0;
1327  A0_re += gT00_re * a0_re;
1328  A0_re -= gT00_im * a0_im;
1329  A0_re += gT01_re * a1_re;
1330  A0_re -= gT01_im * a1_im;
1331  A0_re += gT02_re * a2_re;
1332  A0_re -= gT02_im * a2_im;
1333  spinorFloat A0_im = 0;
1334  A0_im += gT00_re * a0_im;
1335  A0_im += gT00_im * a0_re;
1336  A0_im += gT01_re * a1_im;
1337  A0_im += gT01_im * a1_re;
1338  A0_im += gT02_re * a2_im;
1339  A0_im += gT02_im * a2_re;
1340  spinorFloat B0_re = 0;
1341  B0_re += gT00_re * b0_re;
1342  B0_re -= gT00_im * b0_im;
1343  B0_re += gT01_re * b1_re;
1344  B0_re -= gT01_im * b1_im;
1345  B0_re += gT02_re * b2_re;
1346  B0_re -= gT02_im * b2_im;
1347  spinorFloat B0_im = 0;
1348  B0_im += gT00_re * b0_im;
1349  B0_im += gT00_im * b0_re;
1350  B0_im += gT01_re * b1_im;
1351  B0_im += gT01_im * b1_re;
1352  B0_im += gT02_re * b2_im;
1353  B0_im += gT02_im * b2_re;
1354 
1355  // multiply row 1
1356  spinorFloat A1_re = 0;
1357  A1_re += gT10_re * a0_re;
1358  A1_re -= gT10_im * a0_im;
1359  A1_re += gT11_re * a1_re;
1360  A1_re -= gT11_im * a1_im;
1361  A1_re += gT12_re * a2_re;
1362  A1_re -= gT12_im * a2_im;
1363  spinorFloat A1_im = 0;
1364  A1_im += gT10_re * a0_im;
1365  A1_im += gT10_im * a0_re;
1366  A1_im += gT11_re * a1_im;
1367  A1_im += gT11_im * a1_re;
1368  A1_im += gT12_re * a2_im;
1369  A1_im += gT12_im * a2_re;
1370  spinorFloat B1_re = 0;
1371  B1_re += gT10_re * b0_re;
1372  B1_re -= gT10_im * b0_im;
1373  B1_re += gT11_re * b1_re;
1374  B1_re -= gT11_im * b1_im;
1375  B1_re += gT12_re * b2_re;
1376  B1_re -= gT12_im * b2_im;
1377  spinorFloat B1_im = 0;
1378  B1_im += gT10_re * b0_im;
1379  B1_im += gT10_im * b0_re;
1380  B1_im += gT11_re * b1_im;
1381  B1_im += gT11_im * b1_re;
1382  B1_im += gT12_re * b2_im;
1383  B1_im += gT12_im * b2_re;
1384 
1385  // multiply row 2
1386  spinorFloat A2_re = 0;
1387  A2_re += gT20_re * a0_re;
1388  A2_re -= gT20_im * a0_im;
1389  A2_re += gT21_re * a1_re;
1390  A2_re -= gT21_im * a1_im;
1391  A2_re += gT22_re * a2_re;
1392  A2_re -= gT22_im * a2_im;
1393  spinorFloat A2_im = 0;
1394  A2_im += gT20_re * a0_im;
1395  A2_im += gT20_im * a0_re;
1396  A2_im += gT21_re * a1_im;
1397  A2_im += gT21_im * a1_re;
1398  A2_im += gT22_re * a2_im;
1399  A2_im += gT22_im * a2_re;
1400  spinorFloat B2_re = 0;
1401  B2_re += gT20_re * b0_re;
1402  B2_re -= gT20_im * b0_im;
1403  B2_re += gT21_re * b1_re;
1404  B2_re -= gT21_im * b1_im;
1405  B2_re += gT22_re * b2_re;
1406  B2_re -= gT22_im * b2_im;
1407  spinorFloat B2_im = 0;
1408  B2_im += gT20_re * b0_im;
1409  B2_im += gT20_im * b0_re;
1410  B2_im += gT21_re * b1_im;
1411  B2_im += gT21_im * b1_re;
1412  B2_im += gT22_re * b2_im;
1413  B2_im += gT22_im * b2_re;
1414 
1415  o00_re += A0_re;
1416  o00_im += A0_im;
1417  o10_re += B0_re;
1418  o10_im += B0_im;
1419  o20_re += A0_im;
1420  o20_im -= A0_re;
1421  o30_re -= B0_im;
1422  o30_im += B0_re;
1423 
1424  o01_re += A1_re;
1425  o01_im += A1_im;
1426  o11_re += B1_re;
1427  o11_im += B1_im;
1428  o21_re += A1_im;
1429  o21_im -= A1_re;
1430  o31_re -= B1_im;
1431  o31_im += B1_re;
1432 
1433  o02_re += A2_re;
1434  o02_im += A2_im;
1435  o12_re += B2_re;
1436  o12_im += B2_im;
1437  o22_re += A2_im;
1438  o22_im -= A2_re;
1439  o32_re -= B2_im;
1440  o32_im += B2_re;
1441 }
1442 
1443 #ifdef MULTI_GPU
1444 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]<(param.dc.X[3]-1))) ||
1445  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==(param.dc.X[3]-1)) )
1446 #endif
1447 {
1448  // Projector P3-
1449  // 0 0 0 0
1450  // 0 0 0 0
1451  // 0 0 2 0
1452  // 0 0 0 2
1453 
1454 #ifdef MULTI_GPU
1455  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1 :
1456  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][1];
1457 #if (DD_PREC==2) // half precision
1458  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][1];
1459 #endif
1460 #else
1461  const int sp_idx = (coord[3]==(param.dc.X[3]-1) ? X-param.dc.X4X3X2X1mX3X2X1 : X+param.dc.X3X2X1) >> 1;
1462 #endif
1463 
1464  const int ga_idx = sid % param.dc.volume_4d_cb;
1465 
1466  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1467  {
1474 
1475 #ifdef MULTI_GPU
1476  if (kernel_type == INTERIOR_KERNEL) {
1477 #endif
1478 
1479  // read spinor from device memory
1481 
1482  // project spinor into half spinors
1483  a0_re = +2*i20_re;
1484  a0_im = +2*i20_im;
1485  a1_re = +2*i21_re;
1486  a1_im = +2*i21_im;
1487  a2_re = +2*i22_re;
1488  a2_im = +2*i22_im;
1489  b0_re = +2*i30_re;
1490  b0_im = +2*i30_im;
1491  b1_re = +2*i31_re;
1492  b1_im = +2*i31_im;
1493  b2_re = +2*i32_re;
1494  b2_im = +2*i32_im;
1495 
1496 #ifdef MULTI_GPU
1497  } else {
1498 
1499  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1500  const int t_proj_scale = TPROJSCALE;
1501 
1502  // read half spinor from device memory
1503  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1504 
1505  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1506  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1507  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1508  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1509  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1510  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1511 
1512  }
1513 #endif // MULTI_GPU
1514 
1515  // identity gauge matrix
1522 
1523  o20_re += A0_re;
1524  o20_im += A0_im;
1525  o30_re += B0_re;
1526  o30_im += B0_im;
1527 
1528  o21_re += A1_re;
1529  o21_im += A1_im;
1530  o31_re += B1_re;
1531  o31_im += B1_im;
1532 
1533  o22_re += A2_re;
1534  o22_im += A2_im;
1535  o32_re += B2_re;
1536  o32_im += B2_im;
1537  } else {
1538  // read gauge matrix from device memory
1539  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, param.gauge_stride); }
1540  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, param.gauge_stride); }
1541 
1548 
1549 #ifdef MULTI_GPU
1550  if (kernel_type == INTERIOR_KERNEL) {
1551 #endif
1552 
1553  // read spinor from device memory
1555 
1556  // project spinor into half spinors
1557  a0_re = +2*i20_re;
1558  a0_im = +2*i20_im;
1559  a1_re = +2*i21_re;
1560  a1_im = +2*i21_im;
1561  a2_re = +2*i22_re;
1562  a2_im = +2*i22_im;
1563  b0_re = +2*i30_re;
1564  b0_im = +2*i30_im;
1565  b1_re = +2*i31_re;
1566  b1_im = +2*i31_im;
1567  b2_re = +2*i32_re;
1568  b2_im = +2*i32_im;
1569 
1570 #ifdef MULTI_GPU
1571  } else {
1572 
1573  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1574  const int t_proj_scale = TPROJSCALE;
1575 
1576  // read half spinor from device memory
1577  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 6);
1578 
1579  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1580  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1581  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1582  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1583  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1584  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1585 
1586  }
1587 #endif // MULTI_GPU
1588 
1589  // reconstruct gauge matrix
1591 
1592  // multiply row 0
1593  spinorFloat A0_re = 0;
1594  A0_re += g00_re * a0_re;
1595  A0_re -= g00_im * a0_im;
1596  A0_re += g01_re * a1_re;
1597  A0_re -= g01_im * a1_im;
1598  A0_re += g02_re * a2_re;
1599  A0_re -= g02_im * a2_im;
1600  spinorFloat A0_im = 0;
1601  A0_im += g00_re * a0_im;
1602  A0_im += g00_im * a0_re;
1603  A0_im += g01_re * a1_im;
1604  A0_im += g01_im * a1_re;
1605  A0_im += g02_re * a2_im;
1606  A0_im += g02_im * a2_re;
1607  spinorFloat B0_re = 0;
1608  B0_re += g00_re * b0_re;
1609  B0_re -= g00_im * b0_im;
1610  B0_re += g01_re * b1_re;
1611  B0_re -= g01_im * b1_im;
1612  B0_re += g02_re * b2_re;
1613  B0_re -= g02_im * b2_im;
1614  spinorFloat B0_im = 0;
1615  B0_im += g00_re * b0_im;
1616  B0_im += g00_im * b0_re;
1617  B0_im += g01_re * b1_im;
1618  B0_im += g01_im * b1_re;
1619  B0_im += g02_re * b2_im;
1620  B0_im += g02_im * b2_re;
1621 
1622  // multiply row 1
1623  spinorFloat A1_re = 0;
1624  A1_re += g10_re * a0_re;
1625  A1_re -= g10_im * a0_im;
1626  A1_re += g11_re * a1_re;
1627  A1_re -= g11_im * a1_im;
1628  A1_re += g12_re * a2_re;
1629  A1_re -= g12_im * a2_im;
1630  spinorFloat A1_im = 0;
1631  A1_im += g10_re * a0_im;
1632  A1_im += g10_im * a0_re;
1633  A1_im += g11_re * a1_im;
1634  A1_im += g11_im * a1_re;
1635  A1_im += g12_re * a2_im;
1636  A1_im += g12_im * a2_re;
1637  spinorFloat B1_re = 0;
1638  B1_re += g10_re * b0_re;
1639  B1_re -= g10_im * b0_im;
1640  B1_re += g11_re * b1_re;
1641  B1_re -= g11_im * b1_im;
1642  B1_re += g12_re * b2_re;
1643  B1_re -= g12_im * b2_im;
1644  spinorFloat B1_im = 0;
1645  B1_im += g10_re * b0_im;
1646  B1_im += g10_im * b0_re;
1647  B1_im += g11_re * b1_im;
1648  B1_im += g11_im * b1_re;
1649  B1_im += g12_re * b2_im;
1650  B1_im += g12_im * b2_re;
1651 
1652  // multiply row 2
1653  spinorFloat A2_re = 0;
1654  A2_re += g20_re * a0_re;
1655  A2_re -= g20_im * a0_im;
1656  A2_re += g21_re * a1_re;
1657  A2_re -= g21_im * a1_im;
1658  A2_re += g22_re * a2_re;
1659  A2_re -= g22_im * a2_im;
1660  spinorFloat A2_im = 0;
1661  A2_im += g20_re * a0_im;
1662  A2_im += g20_im * a0_re;
1663  A2_im += g21_re * a1_im;
1664  A2_im += g21_im * a1_re;
1665  A2_im += g22_re * a2_im;
1666  A2_im += g22_im * a2_re;
1667  spinorFloat B2_re = 0;
1668  B2_re += g20_re * b0_re;
1669  B2_re -= g20_im * b0_im;
1670  B2_re += g21_re * b1_re;
1671  B2_re -= g21_im * b1_im;
1672  B2_re += g22_re * b2_re;
1673  B2_re -= g22_im * b2_im;
1674  spinorFloat B2_im = 0;
1675  B2_im += g20_re * b0_im;
1676  B2_im += g20_im * b0_re;
1677  B2_im += g21_re * b1_im;
1678  B2_im += g21_im * b1_re;
1679  B2_im += g22_re * b2_im;
1680  B2_im += g22_im * b2_re;
1681 
1682  o20_re += A0_re;
1683  o20_im += A0_im;
1684  o30_re += B0_re;
1685  o30_im += B0_im;
1686 
1687  o21_re += A1_re;
1688  o21_im += A1_im;
1689  o31_re += B1_re;
1690  o31_im += B1_im;
1691 
1692  o22_re += A2_re;
1693  o22_im += A2_im;
1694  o32_re += B2_re;
1695  o32_im += B2_im;
1696  }
1697 }
1698 
1699 #ifdef MULTI_GPU
1700 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || coord[3]>0)) ||
1701  (kernel_type == EXTERIOR_KERNEL_T && coord[3]==0) )
1702 #endif
1703 {
1704  // Projector P3+
1705  // 2 0 0 0
1706  // 0 2 0 0
1707  // 0 0 0 0
1708  // 0 0 0 0
1709 
1710 #ifdef MULTI_GPU
1711  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1 :
1712  face_idx + param.ghostOffset[static_cast<int>(kernel_type)][0];
1713 #if (DD_PREC==2) // half precision
1714  const int sp_norm_idx = face_idx + param.ghostNormOffset[static_cast<int>(kernel_type)][0];
1715 #endif
1716 #else
1717  const int sp_idx = (coord[3]==0 ? X+param.dc.X4X3X2X1mX3X2X1 : X-param.dc.X3X2X1) >> 1;
1718 #endif
1719 
1720 #ifdef MULTI_GPU
1721  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % param.dc.volume_4d_cb : param.dc.volume_4d_cb+(face_idx % param.dc.ghostFace[static_cast<int>(kernel_type)]));
1722 #else
1723  const int ga_idx = sp_idx % param.dc.volume_4d_cb;
1724 #endif
1725 
1726  if (param.gauge_fixed && ga_idx < param.dc.X4X3X2X1hmX3X2X1h)
1727  {
1734 
1735 #ifdef MULTI_GPU
1736  if (kernel_type == INTERIOR_KERNEL) {
1737 #endif
1738 
1739  // read spinor from device memory
1740  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1741 
1742  // project spinor into half spinors
1743  a0_re = +2*i00_re;
1744  a0_im = +2*i00_im;
1745  a1_re = +2*i01_re;
1746  a1_im = +2*i01_im;
1747  a2_re = +2*i02_re;
1748  a2_im = +2*i02_im;
1749  b0_re = +2*i10_re;
1750  b0_im = +2*i10_im;
1751  b1_re = +2*i11_re;
1752  b1_im = +2*i11_im;
1753  b2_re = +2*i12_re;
1754  b2_im = +2*i12_im;
1755 
1756 #ifdef MULTI_GPU
1757  } else {
1758 
1759  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1760  const int t_proj_scale = TPROJSCALE;
1761 
1762  // read half spinor from device memory
1763  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1764 
1765  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1766  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1767  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1768  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1769  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1770  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1771 
1772  }
1773 #endif // MULTI_GPU
1774 
1775  // identity gauge matrix
1782 
1783  o00_re += A0_re;
1784  o00_im += A0_im;
1785  o10_re += B0_re;
1786  o10_im += B0_im;
1787 
1788  o01_re += A1_re;
1789  o01_im += A1_im;
1790  o11_re += B1_re;
1791  o11_im += B1_im;
1792 
1793  o02_re += A2_re;
1794  o02_im += A2_im;
1795  o12_re += B2_re;
1796  o12_im += B2_im;
1797  } else {
1798  // read gauge matrix from device memory
1799  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, param.gauge_stride); }
1800  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, param.gauge_stride); }
1801 
1808 
1809 #ifdef MULTI_GPU
1810  if (kernel_type == INTERIOR_KERNEL) {
1811 #endif
1812 
1813  // read spinor from device memory
1814  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1815 
1816  // project spinor into half spinors
1817  a0_re = +2*i00_re;
1818  a0_im = +2*i00_im;
1819  a1_re = +2*i01_re;
1820  a1_im = +2*i01_im;
1821  a2_re = +2*i02_re;
1822  a2_im = +2*i02_im;
1823  b0_re = +2*i10_re;
1824  b0_im = +2*i10_im;
1825  b1_re = +2*i11_re;
1826  b1_im = +2*i11_im;
1827  b2_re = +2*i12_re;
1828  b2_im = +2*i12_im;
1829 
1830 #ifdef MULTI_GPU
1831  } else {
1832 
1833  const int sp_stride_pad = param.dc.Ls*param.dc.ghostFace[static_cast<int>(kernel_type)];
1834  const int t_proj_scale = TPROJSCALE;
1835 
1836  // read half spinor from device memory
1837  READ_SPINOR_GHOST(GHOSTSPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx, 7);
1838 
1839  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1840  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1841  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1842  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1843  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1844  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1845 
1846  }
1847 #endif // MULTI_GPU
1848 
1849  // reconstruct gauge matrix
1851 
1852  // multiply row 0
1853  spinorFloat A0_re = 0;
1854  A0_re += gT00_re * a0_re;
1855  A0_re -= gT00_im * a0_im;
1856  A0_re += gT01_re * a1_re;
1857  A0_re -= gT01_im * a1_im;
1858  A0_re += gT02_re * a2_re;
1859  A0_re -= gT02_im * a2_im;
1860  spinorFloat A0_im = 0;
1861  A0_im += gT00_re * a0_im;
1862  A0_im += gT00_im * a0_re;
1863  A0_im += gT01_re * a1_im;
1864  A0_im += gT01_im * a1_re;
1865  A0_im += gT02_re * a2_im;
1866  A0_im += gT02_im * a2_re;
1867  spinorFloat B0_re = 0;
1868  B0_re += gT00_re * b0_re;
1869  B0_re -= gT00_im * b0_im;
1870  B0_re += gT01_re * b1_re;
1871  B0_re -= gT01_im * b1_im;
1872  B0_re += gT02_re * b2_re;
1873  B0_re -= gT02_im * b2_im;
1874  spinorFloat B0_im = 0;
1875  B0_im += gT00_re * b0_im;
1876  B0_im += gT00_im * b0_re;
1877  B0_im += gT01_re * b1_im;
1878  B0_im += gT01_im * b1_re;
1879  B0_im += gT02_re * b2_im;
1880  B0_im += gT02_im * b2_re;
1881 
1882  // multiply row 1
1883  spinorFloat A1_re = 0;
1884  A1_re += gT10_re * a0_re;
1885  A1_re -= gT10_im * a0_im;
1886  A1_re += gT11_re * a1_re;
1887  A1_re -= gT11_im * a1_im;
1888  A1_re += gT12_re * a2_re;
1889  A1_re -= gT12_im * a2_im;
1890  spinorFloat A1_im = 0;
1891  A1_im += gT10_re * a0_im;
1892  A1_im += gT10_im * a0_re;
1893  A1_im += gT11_re * a1_im;
1894  A1_im += gT11_im * a1_re;
1895  A1_im += gT12_re * a2_im;
1896  A1_im += gT12_im * a2_re;
1897  spinorFloat B1_re = 0;
1898  B1_re += gT10_re * b0_re;
1899  B1_re -= gT10_im * b0_im;
1900  B1_re += gT11_re * b1_re;
1901  B1_re -= gT11_im * b1_im;
1902  B1_re += gT12_re * b2_re;
1903  B1_re -= gT12_im * b2_im;
1904  spinorFloat B1_im = 0;
1905  B1_im += gT10_re * b0_im;
1906  B1_im += gT10_im * b0_re;
1907  B1_im += gT11_re * b1_im;
1908  B1_im += gT11_im * b1_re;
1909  B1_im += gT12_re * b2_im;
1910  B1_im += gT12_im * b2_re;
1911 
1912  // multiply row 2
1913  spinorFloat A2_re = 0;
1914  A2_re += gT20_re * a0_re;
1915  A2_re -= gT20_im * a0_im;
1916  A2_re += gT21_re * a1_re;
1917  A2_re -= gT21_im * a1_im;
1918  A2_re += gT22_re * a2_re;
1919  A2_re -= gT22_im * a2_im;
1920  spinorFloat A2_im = 0;
1921  A2_im += gT20_re * a0_im;
1922  A2_im += gT20_im * a0_re;
1923  A2_im += gT21_re * a1_im;
1924  A2_im += gT21_im * a1_re;
1925  A2_im += gT22_re * a2_im;
1926  A2_im += gT22_im * a2_re;
1927  spinorFloat B2_re = 0;
1928  B2_re += gT20_re * b0_re;
1929  B2_re -= gT20_im * b0_im;
1930  B2_re += gT21_re * b1_re;
1931  B2_re -= gT21_im * b1_im;
1932  B2_re += gT22_re * b2_re;
1933  B2_re -= gT22_im * b2_im;
1934  spinorFloat B2_im = 0;
1935  B2_im += gT20_re * b0_im;
1936  B2_im += gT20_im * b0_re;
1937  B2_im += gT21_re * b1_im;
1938  B2_im += gT21_im * b1_re;
1939  B2_im += gT22_re * b2_im;
1940  B2_im += gT22_im * b2_re;
1941 
1942  o00_re += A0_re;
1943  o00_im += A0_im;
1944  o10_re += B0_re;
1945  o10_im += B0_im;
1946 
1947  o01_re += A1_re;
1948  o01_im += A1_im;
1949  o11_re += B1_re;
1950  o11_im += B1_im;
1951 
1952  o02_re += A2_re;
1953  o02_im += A2_im;
1954  o12_re += B2_re;
1955  o12_im += B2_im;
1956  }
1957 }
1958 
1959 
1960 
1961 // 5th dimension -- NB: not partitionable!
1962 #ifdef MULTI_GPU
1963 if(kernel_type == INTERIOR_KERNEL)
1964 #endif
1965 {
1966 // 2 P_L = 2 P_- = ( ( +1, -1 ), ( -1, +1 ) )
1967  {
1968  int sp_idx = ( coord[4] == param.dc.Ls-1 ? X-(param.dc.Ls-1)*2*param.dc.volume_4d_cb : X+2*param.dc.volume_4d_cb ) / 2;
1969 
1970 // read spinor from device memory
1971  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
1972 
1973  if ( coord[4] != param.dc.Ls-1 )
1974  {
1978 
1982 
1986 
1990  }
1991  else
1992  {
1993  o00_re += -param.mferm*(+i00_re-i20_re); o00_im += -param.mferm*(+i00_im-i20_im);
1994  o01_re += -param.mferm*(+i01_re-i21_re); o01_im += -param.mferm*(+i01_im-i21_im);
1995  o02_re += -param.mferm*(+i02_re-i22_re); o02_im += -param.mferm*(+i02_im-i22_im);
1996 
1997  o10_re += -param.mferm*(+i10_re-i30_re); o10_im += -param.mferm*(+i10_im-i30_im);
1998  o11_re += -param.mferm*(+i11_re-i31_re); o11_im += -param.mferm*(+i11_im-i31_im);
1999  o12_re += -param.mferm*(+i12_re-i32_re); o12_im += -param.mferm*(+i12_im-i32_im);
2000 
2001  o20_re += -param.mferm*(-i00_re+i20_re); o20_im += -param.mferm*(-i00_im+i20_im);
2002  o21_re += -param.mferm*(-i01_re+i21_re); o21_im += -param.mferm*(-i01_im+i21_im);
2003  o22_re += -param.mferm*(-i02_re+i22_re); o22_im += -param.mferm*(-i02_im+i22_im);
2004 
2005  o30_re += -param.mferm*(-i10_re+i30_re); o30_im += -param.mferm*(-i10_im+i30_im);
2006  o31_re += -param.mferm*(-i11_re+i31_re); o31_im += -param.mferm*(-i11_im+i31_im);
2007  o32_re += -param.mferm*(-i12_re+i32_re); o32_im += -param.mferm*(-i12_im+i32_im);
2008  } // end if ( coord[4]!= param.dc.Ls-1 )
2009  } // end P_L
2010 
2011  // 2 P_R = 2 P_+ = ( ( +1, +1 ), ( +1, +1 ) )
2012  {
2013  int sp_idx = ( coord[4] == 0 ? X+(param.dc.Ls-1)*2*param.dc.volume_4d_cb : X-2*param.dc.volume_4d_cb ) / 2;
2014 
2015 // read spinor from device memory
2016  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
2017 
2018  if ( coord[4] != 0 )
2019  {
2023 
2027 
2031 
2035  }
2036  else
2037  {
2038  o00_re += -param.mferm*(+i00_re+i20_re); o00_im += -param.mferm*(+i00_im+i20_im);
2039  o01_re += -param.mferm*(+i01_re+i21_re); o01_im += -param.mferm*(+i01_im+i21_im);
2040  o02_re += -param.mferm*(+i02_re+i22_re); o02_im += -param.mferm*(+i02_im+i22_im);
2041 
2042  o10_re += -param.mferm*(+i10_re+i30_re); o10_im += -param.mferm*(+i10_im+i30_im);
2043  o11_re += -param.mferm*(+i11_re+i31_re); o11_im += -param.mferm*(+i11_im+i31_im);
2044  o12_re += -param.mferm*(+i12_re+i32_re); o12_im += -param.mferm*(+i12_im+i32_im);
2045 
2046  o20_re += -param.mferm*(+i00_re+i20_re); o20_im += -param.mferm*(+i00_im+i20_im);
2047  o21_re += -param.mferm*(+i01_re+i21_re); o21_im += -param.mferm*(+i01_im+i21_im);
2048  o22_re += -param.mferm*(+i02_re+i22_re); o22_im += -param.mferm*(+i02_im+i22_im);
2049 
2050  o30_re += -param.mferm*(+i10_re+i30_re); o30_im += -param.mferm*(+i10_im+i30_im);
2051  o31_re += -param.mferm*(+i11_re+i31_re); o31_im += -param.mferm*(+i11_im+i31_im);
2052  o32_re += -param.mferm*(+i12_re+i32_re); o32_im += -param.mferm*(+i12_im+i32_im);
2053  } // end if ( coord[4] != 0 )
2054  } // end P_R
2055 } // end 5th dimension
2056 
2057 
2058 #if defined MULTI_GPU && defined DSLASH_XPAY
2059 if (kernel_type == INTERIOR_KERNEL)
2060 #endif
2061 {
2062 #ifdef DSLASH_XPAY
2063  READ_ACCUM(ACCUMTEX, param.sp_stride)
2064 #ifdef SPINOR_DOUBLE
2065 spinorFloat a_inv = param.a_inv;
2066 #else
2067 spinorFloat a_inv = param.a_inv_f;
2068 #endif
2069 #ifdef SPINOR_DOUBLE
2070  o00_re = o00_re + a_inv*accum0.x;
2071  o00_im = o00_im + a_inv*accum0.y;
2072  o01_re = o01_re + a_inv*accum1.x;
2073  o01_im = o01_im + a_inv*accum1.y;
2074  o02_re = o02_re + a_inv*accum2.x;
2075  o02_im = o02_im + a_inv*accum2.y;
2076  o10_re = o10_re + a_inv*accum3.x;
2077  o10_im = o10_im + a_inv*accum3.y;
2078  o11_re = o11_re + a_inv*accum4.x;
2079  o11_im = o11_im + a_inv*accum4.y;
2080  o12_re = o12_re + a_inv*accum5.x;
2081  o12_im = o12_im + a_inv*accum5.y;
2082  o20_re = o20_re + a_inv*accum6.x;
2083  o20_im = o20_im + a_inv*accum6.y;
2084  o21_re = o21_re + a_inv*accum7.x;
2085  o21_im = o21_im + a_inv*accum7.y;
2086  o22_re = o22_re + a_inv*accum8.x;
2087  o22_im = o22_im + a_inv*accum8.y;
2088  o30_re = o30_re + a_inv*accum9.x;
2089  o30_im = o30_im + a_inv*accum9.y;
2090  o31_re = o31_re + a_inv*accum10.x;
2091  o31_im = o31_im + a_inv*accum10.y;
2092  o32_re = o32_re + a_inv*accum11.x;
2093  o32_im = o32_im + a_inv*accum11.y;
2094 #else
2095  o00_re = o00_re + a_inv*accum0.x;
2096  o00_im = o00_im + a_inv*accum0.y;
2097  o01_re = o01_re + a_inv*accum0.z;
2098  o01_im = o01_im + a_inv*accum0.w;
2099  o02_re = o02_re + a_inv*accum1.x;
2100  o02_im = o02_im + a_inv*accum1.y;
2101  o10_re = o10_re + a_inv*accum1.z;
2102  o10_im = o10_im + a_inv*accum1.w;
2103  o11_re = o11_re + a_inv*accum2.x;
2104  o11_im = o11_im + a_inv*accum2.y;
2105  o12_re = o12_re + a_inv*accum2.z;
2106  o12_im = o12_im + a_inv*accum2.w;
2107  o20_re = o20_re + a_inv*accum3.x;
2108  o20_im = o20_im + a_inv*accum3.y;
2109  o21_re = o21_re + a_inv*accum3.z;
2110  o21_im = o21_im + a_inv*accum3.w;
2111  o22_re = o22_re + a_inv*accum4.x;
2112  o22_im = o22_im + a_inv*accum4.y;
2113  o30_re = o30_re + a_inv*accum4.z;
2114  o30_im = o30_im + a_inv*accum4.w;
2115  o31_re = o31_re + a_inv*accum5.x;
2116  o31_im = o31_im + a_inv*accum5.y;
2117  o32_re = o32_re + a_inv*accum5.z;
2118  o32_im = o32_im + a_inv*accum5.w;
2119 #endif // SPINOR_DOUBLE
2120 
2121 #endif // DSLASH_XPAY
2122 }
2123 
2124 #if defined MULTI_GPU && defined DSLASH_XPAY
2125 
2126 int incomplete = 0; // Have all 8 contributions been computed for this site?
2127 
2128 switch(kernel_type) { // intentional fall-through
2129 case INTERIOR_KERNEL:
2130  incomplete = incomplete || (param.commDim[3] && (coord[3]==0 || coord[3]==(param.dc.X[3]-1)));
2131 case EXTERIOR_KERNEL_T:
2132  incomplete = incomplete || (param.commDim[2] && (coord[2]==0 || coord[2]==(param.dc.X[2]-1)));
2133 case EXTERIOR_KERNEL_Z:
2134  incomplete = incomplete || (param.commDim[1] && (coord[1]==0 || coord[1]==(param.dc.X[1]-1)));
2135 case EXTERIOR_KERNEL_Y:
2136  incomplete = incomplete || (param.commDim[0] && (coord[0]==0 || coord[0]==(param.dc.X[0]-1)));
2137 }
2138 
2139 if (!incomplete)
2140 #endif // MULTI_GPU
2141 {
2142 
2143 #ifdef DSLASH_XPAY
2144 #ifdef SPINOR_DOUBLE
2145  spinorFloat a = param.a;
2146 #else
2147  spinorFloat a = param.a_f;
2148 #endif
2149 #ifdef SPINOR_DOUBLE
2150  o00_re = a*o00_re;
2151  o00_im = a*o00_im;
2152  o01_re = a*o01_re;
2153  o01_im = a*o01_im;
2154  o02_re = a*o02_re;
2155  o02_im = a*o02_im;
2156  o10_re = a*o10_re;
2157  o10_im = a*o10_im;
2158  o11_re = a*o11_re;
2159  o11_im = a*o11_im;
2160  o12_re = a*o12_re;
2161  o12_im = a*o12_im;
2162  o20_re = a*o20_re;
2163  o20_im = a*o20_im;
2164  o21_re = a*o21_re;
2165  o21_im = a*o21_im;
2166  o22_re = a*o22_re;
2167  o22_im = a*o22_im;
2168  o30_re = a*o30_re;
2169  o30_im = a*o30_im;
2170  o31_re = a*o31_re;
2171  o31_im = a*o31_im;
2172  o32_re = a*o32_re;
2173  o32_im = a*o32_im;
2174 #else
2175  o00_re = a*o00_re;
2176  o00_im = a*o00_im;
2177  o01_re = a*o01_re;
2178  o01_im = a*o01_im;
2179  o02_re = a*o02_re;
2180  o02_im = a*o02_im;
2181  o10_re = a*o10_re;
2182  o10_im = a*o10_im;
2183  o11_re = a*o11_re;
2184  o11_im = a*o11_im;
2185  o12_re = a*o12_re;
2186  o12_im = a*o12_im;
2187  o20_re = a*o20_re;
2188  o20_im = a*o20_im;
2189  o21_re = a*o21_re;
2190  o21_im = a*o21_im;
2191  o22_re = a*o22_re;
2192  o22_im = a*o22_im;
2193  o30_re = a*o30_re;
2194  o30_im = a*o30_im;
2195  o31_re = a*o31_re;
2196  o31_im = a*o31_im;
2197  o32_re = a*o32_re;
2198  o32_im = a*o32_im;
2199 #endif // SPINOR_DOUBLE
2200 
2201 #endif // DSLASH_XPAY
2202 }
2203 
2204 // write spinor field back to device memory
2205 WRITE_SPINOR(param.sp_stride);
2206 
2207 // undefine to prevent warning when precision is changed
2208 #undef spinorFloat
2209 #undef SHARED_STRIDE
2210 
2211 #undef g00_re
2212 #undef g00_im
2213 #undef g01_re
2214 #undef g01_im
2215 #undef g02_re
2216 #undef g02_im
2217 #undef g10_re
2218 #undef g10_im
2219 #undef g11_re
2220 #undef g11_im
2221 #undef g12_re
2222 #undef g12_im
2223 #undef g20_re
2224 #undef g20_im
2225 #undef g21_re
2226 #undef g21_im
2227 #undef g22_re
2228 #undef g22_im
2229 
2230 #undef i00_re
2231 #undef i00_im
2232 #undef i01_re
2233 #undef i01_im
2234 #undef i02_re
2235 #undef i02_im
2236 #undef i10_re
2237 #undef i10_im
2238 #undef i11_re
2239 #undef i11_im
2240 #undef i12_re
2241 #undef i12_im
2242 #undef i20_re
2243 #undef i20_im
2244 #undef i21_re
2245 #undef i21_im
2246 #undef i22_re
2247 #undef i22_im
2248 #undef i30_re
2249 #undef i30_im
2250 #undef i31_re
2251 #undef i31_im
2252 #undef i32_re
2253 #undef i32_im
2254 
2255 
2256 
2257 #undef VOLATILE
#define g21_im
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
int s_parity
spinorFloat B0_re
VOLATILE spinorFloat o30_im
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o20_re
dim3 dim3 blockDim
#define i32_im
#define g22_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define g10_im
#define i10_im
VOLATILE spinorFloat o10_re
VOLATILE spinorFloat o11_im
spinorFloat b0_im
#define gT12_re
#define g02_re
float4 G4
spinorFloat A0_re
#define i01_im
spinorFloat A2_re
#define i12_im
#define WRITE_SPINOR
#define gT10_re
spinorFloat a0_re
#define gT20_re
VOLATILE spinorFloat o02_re
spinorFloat b0_re
VOLATILE spinorFloat o02_im
int sp_idx
#define gT21_re
#define gT00_im
spinorFloat A0_im
VOLATILE spinorFloat o30_re
#define g10_re
#define i10_re
#define gT11_im
VOLATILE spinorFloat o21_re
#define gT20_im
#define i21_re
#define i11_im
VOLATILE spinorFloat o10_im
#define i22_im
#define gT01_im
VOLATILE spinorFloat o22_im
const int ga_idx
spinorFloat b1_re
VOLATILE spinorFloat o20_im
int X
spinorFloat B1_re
#define g22_re
VOLATILE spinorFloat o31_im
float4 G2
#define GAUGE0TEX
#define i01_re
#define i11_re
float4 G0
spinorFloat B2_re
spinorFloat a0_im
#define gT22_im
QudaGaugeParam param
Definition: pack_test.cpp:17
VOLATILE spinorFloat o12_re
VOLATILE spinorFloat o00_re
spinorFloat A1_re
#define g12_im
#define i31_im
#define i12_re
VOLATILE spinorFloat o31_re
#define i21_im
#define gT21_im
#define g01_re
spinorFloat b2_re
#define i32_re
#define g11_re
#define gT02_im
spinorFloat B0_im
#define GAUGE1TEX
#define g02_im
VOLATILE spinorFloat o01_re
spinorFloat B1_im
spinorFloat A1_im
VOLATILE spinorFloat o01_im
#define SPINORTEX
#define g00_re
#define g12_re
#define g20_re
spinorFloat b1_im
#define gT12_im
VOLATILE spinorFloat o21_im
#define READ_INTERMEDIATE_SPINOR
spinorFloat a2_im
spinorFloat b2_im
VOLATILE spinorFloat o00_im
int X[4]
Definition: quda.h:29
#define i30_re
#define i20_re
#define g20_im
#define i00_im
coordsFromIndex< 5, QUDA_5D_PC, EVEN_X >(X, coord, sid, param)
#define i20_im
VOLATILE spinorFloat o32_im
#define READ_SPINOR_GHOST
#define gT02_re
#define i02_re
#define VOLATILE
#define ASSN_GAUGE_MATRIX
#define g21_re
#define gT22_re
spinorFloat a2_re
#define gT00_re
spinorFloat a1_re
#define i00_re
#define i22_re
VOLATILE spinorFloat o11_re
#define INTERTEX
#define gT10_im
int sid
int face_idx
#define i30_im
#define i02_im
const int face_num
#define TPROJSCALE
VOLATILE spinorFloat o12_im
#define i31_re
#define GHOSTSPINORTEX
VOLATILE spinorFloat o32_re
#define g00_im
float4 G3
#define g01_im
#define gT01_re
spinorFloat B2_im
VOLATILE spinorFloat o22_re
spinorFloat a1_im
#define gT11_re
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
float4 G1
#define g11_im
#define a
spinorFloat A2_im
int coord[5]
#define spinorFloat