QUDA  v0.5.0
A library for QCD on GPUs
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_dslash_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #else
40 #define spinorFloat float
41 #define i00_re I0.x
42 #define i00_im I0.y
43 #define i01_re I0.z
44 #define i01_im I0.w
45 #define i02_re I1.x
46 #define i02_im I1.y
47 #define i10_re I1.z
48 #define i10_im I1.w
49 #define i11_re I2.x
50 #define i11_im I2.y
51 #define i12_re I2.z
52 #define i12_im I2.w
53 #define i20_re I3.x
54 #define i20_im I3.y
55 #define i21_re I3.z
56 #define i21_im I3.w
57 #define i22_re I4.x
58 #define i22_im I4.y
59 #define i30_re I4.z
60 #define i30_im I4.w
61 #define i31_re I5.x
62 #define i31_im I5.y
63 #define i32_re I5.z
64 #define i32_im I5.w
65 #endif // SPINOR_DOUBLE
66 
67 // gauge link
68 #ifdef GAUGE_FLOAT2
69 #define g00_re G0.x
70 #define g00_im G0.y
71 #define g01_re G1.x
72 #define g01_im G1.y
73 #define g02_re G2.x
74 #define g02_im G2.y
75 #define g10_re G3.x
76 #define g10_im G3.y
77 #define g11_re G4.x
78 #define g11_im G4.y
79 #define g12_re G5.x
80 #define g12_im G5.y
81 #define g20_re G6.x
82 #define g20_im G6.y
83 #define g21_re G7.x
84 #define g21_im G7.y
85 #define g22_re G8.x
86 #define g22_im G8.y
87 
88 #else
89 #define g00_re G0.x
90 #define g00_im G0.y
91 #define g01_re G0.z
92 #define g01_im G0.w
93 #define g02_re G1.x
94 #define g02_im G1.y
95 #define g10_re G1.z
96 #define g10_im G1.w
97 #define g11_re G2.x
98 #define g11_im G2.y
99 #define g12_re G2.z
100 #define g12_im G2.w
101 #define g20_re G3.x
102 #define g20_im G3.y
103 #define g21_re G3.z
104 #define g21_im G3.w
105 #define g22_re G4.x
106 #define g22_im G4.y
107 
108 #endif // GAUGE_DOUBLE
109 
110 // conjugated gauge link
111 #define gT00_re (+g00_re)
112 #define gT00_im (-g00_im)
113 #define gT01_re (+g10_re)
114 #define gT01_im (-g10_im)
115 #define gT02_re (+g20_re)
116 #define gT02_im (-g20_im)
117 #define gT10_re (+g01_re)
118 #define gT10_im (-g01_im)
119 #define gT11_re (+g11_re)
120 #define gT11_im (-g11_im)
121 #define gT12_re (+g21_re)
122 #define gT12_im (-g21_im)
123 #define gT20_re (+g02_re)
124 #define gT20_im (-g02_im)
125 #define gT21_re (+g12_re)
126 #define gT21_im (-g12_im)
127 #define gT22_re (+g22_re)
128 #define gT22_im (-g22_im)
129 
130 // output spinor
155 
156 #ifdef SPINOR_DOUBLE
157 #if (__COMPUTE_CAPABILITY__ >= 200)
158 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
159 #else
160 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
161 #endif
162 #else
163 #if (__COMPUTE_CAPABILITY__ >= 200)
164 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
165 #else
166 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
167 #endif
168 #endif
169 
170 #include "read_gauge.h"
171 #include "io_spinor.h"
172 
173 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
174 int sp_norm_idx;
175 #endif // MULTI_GPU half precision
176 
177 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
178 if (sid >= param.threads*Ls) return;
179 
180 int X, x1, x2, x3, x4, xs;
181 
183 
184 #ifdef MULTI_GPU
185 int face_idx;
186 if (kernel_type == INTERIOR_KERNEL) {
187 #endif
188 
189 // Inline by hand for the moment and assume even dimensions
190 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);
191 
192 s_parity = ( sid/(X4*X3*X2*X1h) ) % 2;
193 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
194 
195 X = 2*sid + (boundaryCrossing + param.parity) % 2;
196 x1 = X % X1;
197 x2 = (X/X1) % X2;
198 x3 = (X/(X1*X2)) % X3;
199 x4 = (X/(X1*X2*X3)) % X4;
200 xs = X/(X1*X2*X3*X4);
201 
202  o00_re = 0; o00_im = 0;
203  o01_re = 0; o01_im = 0;
204  o02_re = 0; o02_im = 0;
205  o10_re = 0; o10_im = 0;
206  o11_re = 0; o11_im = 0;
207  o12_re = 0; o12_im = 0;
208  o20_re = 0; o20_im = 0;
209  o21_re = 0; o21_im = 0;
210  o22_re = 0; o22_im = 0;
211  o30_re = 0; o30_im = 0;
212  o31_re = 0; o31_im = 0;
213  o32_re = 0; o32_im = 0;
214 
215 #ifdef MULTI_GPU
216 } else { // exterior kernel
217 
218 const int dim = static_cast<int>(kernel_type);
219 const int face_volume = (param.threads*Ls >> 1); // volume of one face
220 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
221 face_idx = sid - face_num*face_volume; // index into the respective face
222 
223 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
224 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
225 //sp_idx = face_idx + param.ghostOffset[dim];
226 
227 #if (DD_PREC==2) // half precision
228 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
229 #endif
230 
231 coordsFromDWFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity);
232 
233 s_parity = ( sid/(X4*X3*X2*X1h) ) % 2;
234 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
235 
236 X = 2*sid + (boundaryCrossing + param.parity) % 2;
237 
239 
240  o00_re = i00_re; o00_im = i00_im;
241  o01_re = i01_re; o01_im = i01_im;
242  o02_re = i02_re; o02_im = i02_im;
243  o10_re = i10_re; o10_im = i10_im;
244  o11_re = i11_re; o11_im = i11_im;
245  o12_re = i12_re; o12_im = i12_im;
246  o20_re = i20_re; o20_im = i20_im;
247  o21_re = i21_re; o21_im = i21_im;
248  o22_re = i22_re; o22_im = i22_im;
249  o30_re = i30_re; o30_im = i30_im;
250  o31_re = i31_re; o31_im = i31_im;
251  o32_re = i32_re; o32_im = i32_im;
252 }
253 #endif // MULTI_GPU
254 
255 // declare G## here and use ASSN below instead of READ
256 #ifdef GAUGE_FLOAT2
257 #if (DD_PREC==0) //temporal hack
258 double2 G0;
259 double2 G1;
260 double2 G2;
261 double2 G3;
262 double2 G4;
263 double2 G5;
264 double2 G6;
265 double2 G7;
266 double2 G8;
267 #else
268 float2 G0;
269 float2 G1;
270 float2 G2;
271 float2 G3;
272 float2 G4;
273 float2 G5;
274 float2 G6;
275 float2 G7;
276 float2 G8;
277 #endif
278 #else
279 float4 G0;
280 float4 G1;
281 float4 G2;
282 float4 G3;
283 float4 G4;
284 #endif
285 
286 
287 
288 #ifdef MULTI_GPU
289 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
290  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
291 #endif
292 {
293  // Projector P0+
294  // 1 0 0 i
295  // 0 1 i 0
296  // 0 -i 1 0
297  // -i 0 0 1
298 
299 #ifdef MULTI_GPU
300  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
301  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
302 #else
303  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
304 #endif
305 
306  const int ga_idx = sid % Vh;
307 
308  // read gauge matrix from device memory
309  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); }
310  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, ga_stride); }
311 
318 
319 #ifdef MULTI_GPU
320  if (kernel_type == INTERIOR_KERNEL) {
321 #endif
322 
323  // read spinor from device memory
324  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
325 
326  // project spinor into half spinors
327  a0_re = +i00_re-i30_im;
328  a0_im = +i00_im+i30_re;
329  a1_re = +i01_re-i31_im;
330  a1_im = +i01_im+i31_re;
331  a2_re = +i02_re-i32_im;
332  a2_im = +i02_im+i32_re;
333  b0_re = +i10_re-i20_im;
334  b0_im = +i10_im+i20_re;
335  b1_re = +i11_re-i21_im;
336  b1_im = +i11_im+i21_re;
337  b2_re = +i12_re-i22_im;
338  b2_im = +i12_im+i22_re;
339 
340 #ifdef MULTI_GPU
341  } else {
342 
343  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
344 
345  // read half spinor from device memory
346  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
347 
348  a0_re = i00_re; a0_im = i00_im;
349  a1_re = i01_re; a1_im = i01_im;
350  a2_re = i02_re; a2_im = i02_im;
351  b0_re = i10_re; b0_im = i10_im;
352  b1_re = i11_re; b1_im = i11_im;
353  b2_re = i12_re; b2_im = i12_im;
354 
355  }
356 #endif // MULTI_GPU
357 
358  // reconstruct gauge matrix
360 
361  // multiply row 0
363  A0_re += g00_re * a0_re;
364  A0_re -= g00_im * a0_im;
365  A0_re += g01_re * a1_re;
366  A0_re -= g01_im * a1_im;
367  A0_re += g02_re * a2_re;
368  A0_re -= g02_im * a2_im;
370  A0_im += g00_re * a0_im;
371  A0_im += g00_im * a0_re;
372  A0_im += g01_re * a1_im;
373  A0_im += g01_im * a1_re;
374  A0_im += g02_re * a2_im;
375  A0_im += g02_im * a2_re;
377  B0_re += g00_re * b0_re;
378  B0_re -= g00_im * b0_im;
379  B0_re += g01_re * b1_re;
380  B0_re -= g01_im * b1_im;
381  B0_re += g02_re * b2_re;
382  B0_re -= g02_im * b2_im;
384  B0_im += g00_re * b0_im;
385  B0_im += g00_im * b0_re;
386  B0_im += g01_re * b1_im;
387  B0_im += g01_im * b1_re;
388  B0_im += g02_re * b2_im;
389  B0_im += g02_im * b2_re;
390 
391  // multiply row 1
393  A1_re += g10_re * a0_re;
394  A1_re -= g10_im * a0_im;
395  A1_re += g11_re * a1_re;
396  A1_re -= g11_im * a1_im;
397  A1_re += g12_re * a2_re;
398  A1_re -= g12_im * a2_im;
400  A1_im += g10_re * a0_im;
401  A1_im += g10_im * a0_re;
402  A1_im += g11_re * a1_im;
403  A1_im += g11_im * a1_re;
404  A1_im += g12_re * a2_im;
405  A1_im += g12_im * a2_re;
407  B1_re += g10_re * b0_re;
408  B1_re -= g10_im * b0_im;
409  B1_re += g11_re * b1_re;
410  B1_re -= g11_im * b1_im;
411  B1_re += g12_re * b2_re;
412  B1_re -= g12_im * b2_im;
414  B1_im += g10_re * b0_im;
415  B1_im += g10_im * b0_re;
416  B1_im += g11_re * b1_im;
417  B1_im += g11_im * b1_re;
418  B1_im += g12_re * b2_im;
419  B1_im += g12_im * b2_re;
420 
421  // multiply row 2
423  A2_re += g20_re * a0_re;
424  A2_re -= g20_im * a0_im;
425  A2_re += g21_re * a1_re;
426  A2_re -= g21_im * a1_im;
427  A2_re += g22_re * a2_re;
428  A2_re -= g22_im * a2_im;
430  A2_im += g20_re * a0_im;
431  A2_im += g20_im * a0_re;
432  A2_im += g21_re * a1_im;
433  A2_im += g21_im * a1_re;
434  A2_im += g22_re * a2_im;
435  A2_im += g22_im * a2_re;
437  B2_re += g20_re * b0_re;
438  B2_re -= g20_im * b0_im;
439  B2_re += g21_re * b1_re;
440  B2_re -= g21_im * b1_im;
441  B2_re += g22_re * b2_re;
442  B2_re -= g22_im * b2_im;
444  B2_im += g20_re * b0_im;
445  B2_im += g20_im * b0_re;
446  B2_im += g21_re * b1_im;
447  B2_im += g21_im * b1_re;
448  B2_im += g22_re * b2_im;
449  B2_im += g22_im * b2_re;
450 
451  o00_re += A0_re;
452  o00_im += A0_im;
453  o10_re += B0_re;
454  o10_im += B0_im;
455  o20_re += B0_im;
456  o20_im -= B0_re;
457  o30_re += A0_im;
458  o30_im -= A0_re;
459 
460  o01_re += A1_re;
461  o01_im += A1_im;
462  o11_re += B1_re;
463  o11_im += B1_im;
464  o21_re += B1_im;
465  o21_im -= B1_re;
466  o31_re += A1_im;
467  o31_im -= A1_re;
468 
469  o02_re += A2_re;
470  o02_im += A2_im;
471  o12_re += B2_re;
472  o12_im += B2_im;
473  o22_re += B2_im;
474  o22_im -= B2_re;
475  o32_re += A2_im;
476  o32_im -= A2_re;
477 }
478 
479 #ifdef MULTI_GPU
480 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
481  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
482 #endif
483 {
484  // Projector P0-
485  // 1 0 0 -i
486  // 0 1 -i 0
487  // 0 i 1 0
488  // i 0 0 1
489 
490 #ifdef MULTI_GPU
491  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
492  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
493 #else
494  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
495 #endif
496 
497 #ifdef MULTI_GPU
498  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
499 #else
500  const int ga_idx = sp_idx % Vh;
501 #endif
502 
503  // read gauge matrix from device memory
504  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); }
505  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, ga_stride); }
506 
513 
514 #ifdef MULTI_GPU
515  if (kernel_type == INTERIOR_KERNEL) {
516 #endif
517 
518  // read spinor from device memory
519  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
520 
521  // project spinor into half spinors
522  a0_re = +i00_re+i30_im;
523  a0_im = +i00_im-i30_re;
524  a1_re = +i01_re+i31_im;
525  a1_im = +i01_im-i31_re;
526  a2_re = +i02_re+i32_im;
527  a2_im = +i02_im-i32_re;
528  b0_re = +i10_re+i20_im;
529  b0_im = +i10_im-i20_re;
530  b1_re = +i11_re+i21_im;
531  b1_im = +i11_im-i21_re;
532  b2_re = +i12_re+i22_im;
533  b2_im = +i12_im-i22_re;
534 
535 #ifdef MULTI_GPU
536  } else {
537 
538  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
539 
540  // read half spinor from device memory
541  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
542 
543  a0_re = i00_re; a0_im = i00_im;
544  a1_re = i01_re; a1_im = i01_im;
545  a2_re = i02_re; a2_im = i02_im;
546  b0_re = i10_re; b0_im = i10_im;
547  b1_re = i11_re; b1_im = i11_im;
548  b2_re = i12_re; b2_im = i12_im;
549 
550  }
551 #endif // MULTI_GPU
552 
553  // reconstruct gauge matrix
555 
556  // multiply row 0
557  spinorFloat A0_re = 0;
558  A0_re += gT00_re * a0_re;
559  A0_re -= gT00_im * a0_im;
560  A0_re += gT01_re * a1_re;
561  A0_re -= gT01_im * a1_im;
562  A0_re += gT02_re * a2_re;
563  A0_re -= gT02_im * a2_im;
564  spinorFloat A0_im = 0;
565  A0_im += gT00_re * a0_im;
566  A0_im += gT00_im * a0_re;
567  A0_im += gT01_re * a1_im;
568  A0_im += gT01_im * a1_re;
569  A0_im += gT02_re * a2_im;
570  A0_im += gT02_im * a2_re;
571  spinorFloat B0_re = 0;
572  B0_re += gT00_re * b0_re;
573  B0_re -= gT00_im * b0_im;
574  B0_re += gT01_re * b1_re;
575  B0_re -= gT01_im * b1_im;
576  B0_re += gT02_re * b2_re;
577  B0_re -= gT02_im * b2_im;
578  spinorFloat B0_im = 0;
579  B0_im += gT00_re * b0_im;
580  B0_im += gT00_im * b0_re;
581  B0_im += gT01_re * b1_im;
582  B0_im += gT01_im * b1_re;
583  B0_im += gT02_re * b2_im;
584  B0_im += gT02_im * b2_re;
585 
586  // multiply row 1
587  spinorFloat A1_re = 0;
588  A1_re += gT10_re * a0_re;
589  A1_re -= gT10_im * a0_im;
590  A1_re += gT11_re * a1_re;
591  A1_re -= gT11_im * a1_im;
592  A1_re += gT12_re * a2_re;
593  A1_re -= gT12_im * a2_im;
594  spinorFloat A1_im = 0;
595  A1_im += gT10_re * a0_im;
596  A1_im += gT10_im * a0_re;
597  A1_im += gT11_re * a1_im;
598  A1_im += gT11_im * a1_re;
599  A1_im += gT12_re * a2_im;
600  A1_im += gT12_im * a2_re;
601  spinorFloat B1_re = 0;
602  B1_re += gT10_re * b0_re;
603  B1_re -= gT10_im * b0_im;
604  B1_re += gT11_re * b1_re;
605  B1_re -= gT11_im * b1_im;
606  B1_re += gT12_re * b2_re;
607  B1_re -= gT12_im * b2_im;
608  spinorFloat B1_im = 0;
609  B1_im += gT10_re * b0_im;
610  B1_im += gT10_im * b0_re;
611  B1_im += gT11_re * b1_im;
612  B1_im += gT11_im * b1_re;
613  B1_im += gT12_re * b2_im;
614  B1_im += gT12_im * b2_re;
615 
616  // multiply row 2
617  spinorFloat A2_re = 0;
618  A2_re += gT20_re * a0_re;
619  A2_re -= gT20_im * a0_im;
620  A2_re += gT21_re * a1_re;
621  A2_re -= gT21_im * a1_im;
622  A2_re += gT22_re * a2_re;
623  A2_re -= gT22_im * a2_im;
624  spinorFloat A2_im = 0;
625  A2_im += gT20_re * a0_im;
626  A2_im += gT20_im * a0_re;
627  A2_im += gT21_re * a1_im;
628  A2_im += gT21_im * a1_re;
629  A2_im += gT22_re * a2_im;
630  A2_im += gT22_im * a2_re;
631  spinorFloat B2_re = 0;
632  B2_re += gT20_re * b0_re;
633  B2_re -= gT20_im * b0_im;
634  B2_re += gT21_re * b1_re;
635  B2_re -= gT21_im * b1_im;
636  B2_re += gT22_re * b2_re;
637  B2_re -= gT22_im * b2_im;
638  spinorFloat B2_im = 0;
639  B2_im += gT20_re * b0_im;
640  B2_im += gT20_im * b0_re;
641  B2_im += gT21_re * b1_im;
642  B2_im += gT21_im * b1_re;
643  B2_im += gT22_re * b2_im;
644  B2_im += gT22_im * b2_re;
645 
646  o00_re += A0_re;
647  o00_im += A0_im;
648  o10_re += B0_re;
649  o10_im += B0_im;
650  o20_re -= B0_im;
651  o20_im += B0_re;
652  o30_re -= A0_im;
653  o30_im += A0_re;
654 
655  o01_re += A1_re;
656  o01_im += A1_im;
657  o11_re += B1_re;
658  o11_im += B1_im;
659  o21_re -= B1_im;
660  o21_im += B1_re;
661  o31_re -= A1_im;
662  o31_im += A1_re;
663 
664  o02_re += A2_re;
665  o02_im += A2_im;
666  o12_re += B2_re;
667  o12_im += B2_im;
668  o22_re -= B2_im;
669  o22_im += B2_re;
670  o32_re -= A2_im;
671  o32_im += A2_re;
672 }
673 
674 #ifdef MULTI_GPU
675 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
676  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
677 #endif
678 {
679  // Projector P1+
680  // 1 0 0 1
681  // 0 1 -1 0
682  // 0 -1 1 0
683  // 1 0 0 1
684 
685 #ifdef MULTI_GPU
686  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
687  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
688 #else
689  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
690 #endif
691 
692  const int ga_idx = sid % Vh;
693 
694  // read gauge matrix from device memory
695  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); }
696  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, ga_stride); }
697 
704 
705 #ifdef MULTI_GPU
706  if (kernel_type == INTERIOR_KERNEL) {
707 #endif
708 
709  // read spinor from device memory
710  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
711 
712  // project spinor into half spinors
713  a0_re = +i00_re+i30_re;
714  a0_im = +i00_im+i30_im;
715  a1_re = +i01_re+i31_re;
716  a1_im = +i01_im+i31_im;
717  a2_re = +i02_re+i32_re;
718  a2_im = +i02_im+i32_im;
719  b0_re = +i10_re-i20_re;
720  b0_im = +i10_im-i20_im;
721  b1_re = +i11_re-i21_re;
722  b1_im = +i11_im-i21_im;
723  b2_re = +i12_re-i22_re;
724  b2_im = +i12_im-i22_im;
725 
726 #ifdef MULTI_GPU
727  } else {
728 
729  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
730 
731  // read half spinor from device memory
732  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
733 
734  a0_re = i00_re; a0_im = i00_im;
735  a1_re = i01_re; a1_im = i01_im;
736  a2_re = i02_re; a2_im = i02_im;
737  b0_re = i10_re; b0_im = i10_im;
738  b1_re = i11_re; b1_im = i11_im;
739  b2_re = i12_re; b2_im = i12_im;
740 
741  }
742 #endif // MULTI_GPU
743 
744  // reconstruct gauge matrix
746 
747  // multiply row 0
748  spinorFloat A0_re = 0;
749  A0_re += g00_re * a0_re;
750  A0_re -= g00_im * a0_im;
751  A0_re += g01_re * a1_re;
752  A0_re -= g01_im * a1_im;
753  A0_re += g02_re * a2_re;
754  A0_re -= g02_im * a2_im;
755  spinorFloat A0_im = 0;
756  A0_im += g00_re * a0_im;
757  A0_im += g00_im * a0_re;
758  A0_im += g01_re * a1_im;
759  A0_im += g01_im * a1_re;
760  A0_im += g02_re * a2_im;
761  A0_im += g02_im * a2_re;
762  spinorFloat B0_re = 0;
763  B0_re += g00_re * b0_re;
764  B0_re -= g00_im * b0_im;
765  B0_re += g01_re * b1_re;
766  B0_re -= g01_im * b1_im;
767  B0_re += g02_re * b2_re;
768  B0_re -= g02_im * b2_im;
769  spinorFloat B0_im = 0;
770  B0_im += g00_re * b0_im;
771  B0_im += g00_im * b0_re;
772  B0_im += g01_re * b1_im;
773  B0_im += g01_im * b1_re;
774  B0_im += g02_re * b2_im;
775  B0_im += g02_im * b2_re;
776 
777  // multiply row 1
778  spinorFloat A1_re = 0;
779  A1_re += g10_re * a0_re;
780  A1_re -= g10_im * a0_im;
781  A1_re += g11_re * a1_re;
782  A1_re -= g11_im * a1_im;
783  A1_re += g12_re * a2_re;
784  A1_re -= g12_im * a2_im;
785  spinorFloat A1_im = 0;
786  A1_im += g10_re * a0_im;
787  A1_im += g10_im * a0_re;
788  A1_im += g11_re * a1_im;
789  A1_im += g11_im * a1_re;
790  A1_im += g12_re * a2_im;
791  A1_im += g12_im * a2_re;
792  spinorFloat B1_re = 0;
793  B1_re += g10_re * b0_re;
794  B1_re -= g10_im * b0_im;
795  B1_re += g11_re * b1_re;
796  B1_re -= g11_im * b1_im;
797  B1_re += g12_re * b2_re;
798  B1_re -= g12_im * b2_im;
799  spinorFloat B1_im = 0;
800  B1_im += g10_re * b0_im;
801  B1_im += g10_im * b0_re;
802  B1_im += g11_re * b1_im;
803  B1_im += g11_im * b1_re;
804  B1_im += g12_re * b2_im;
805  B1_im += g12_im * b2_re;
806 
807  // multiply row 2
808  spinorFloat A2_re = 0;
809  A2_re += g20_re * a0_re;
810  A2_re -= g20_im * a0_im;
811  A2_re += g21_re * a1_re;
812  A2_re -= g21_im * a1_im;
813  A2_re += g22_re * a2_re;
814  A2_re -= g22_im * a2_im;
815  spinorFloat A2_im = 0;
816  A2_im += g20_re * a0_im;
817  A2_im += g20_im * a0_re;
818  A2_im += g21_re * a1_im;
819  A2_im += g21_im * a1_re;
820  A2_im += g22_re * a2_im;
821  A2_im += g22_im * a2_re;
822  spinorFloat B2_re = 0;
823  B2_re += g20_re * b0_re;
824  B2_re -= g20_im * b0_im;
825  B2_re += g21_re * b1_re;
826  B2_re -= g21_im * b1_im;
827  B2_re += g22_re * b2_re;
828  B2_re -= g22_im * b2_im;
829  spinorFloat B2_im = 0;
830  B2_im += g20_re * b0_im;
831  B2_im += g20_im * b0_re;
832  B2_im += g21_re * b1_im;
833  B2_im += g21_im * b1_re;
834  B2_im += g22_re * b2_im;
835  B2_im += g22_im * b2_re;
836 
837  o00_re += A0_re;
838  o00_im += A0_im;
839  o10_re += B0_re;
840  o10_im += B0_im;
841  o20_re -= B0_re;
842  o20_im -= B0_im;
843  o30_re += A0_re;
844  o30_im += A0_im;
845 
846  o01_re += A1_re;
847  o01_im += A1_im;
848  o11_re += B1_re;
849  o11_im += B1_im;
850  o21_re -= B1_re;
851  o21_im -= B1_im;
852  o31_re += A1_re;
853  o31_im += A1_im;
854 
855  o02_re += A2_re;
856  o02_im += A2_im;
857  o12_re += B2_re;
858  o12_im += B2_im;
859  o22_re -= B2_re;
860  o22_im -= B2_im;
861  o32_re += A2_re;
862  o32_im += A2_im;
863 }
864 
865 #ifdef MULTI_GPU
866 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
867  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
868 #endif
869 {
870  // Projector P1-
871  // 1 0 0 -1
872  // 0 1 1 0
873  // 0 1 1 0
874  // -1 0 0 1
875 
876 #ifdef MULTI_GPU
877  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
878  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
879 #else
880  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
881 #endif
882 
883 #ifdef MULTI_GPU
884  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
885 #else
886  const int ga_idx = sp_idx % Vh;
887 #endif
888 
889  // read gauge matrix from device memory
890  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); }
891  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, ga_stride); }
892 
899 
900 #ifdef MULTI_GPU
901  if (kernel_type == INTERIOR_KERNEL) {
902 #endif
903 
904  // read spinor from device memory
905  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
906 
907  // project spinor into half spinors
908  a0_re = +i00_re-i30_re;
909  a0_im = +i00_im-i30_im;
910  a1_re = +i01_re-i31_re;
911  a1_im = +i01_im-i31_im;
912  a2_re = +i02_re-i32_re;
913  a2_im = +i02_im-i32_im;
914  b0_re = +i10_re+i20_re;
915  b0_im = +i10_im+i20_im;
916  b1_re = +i11_re+i21_re;
917  b1_im = +i11_im+i21_im;
918  b2_re = +i12_re+i22_re;
919  b2_im = +i12_im+i22_im;
920 
921 #ifdef MULTI_GPU
922  } else {
923 
924  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
925 
926  // read half spinor from device memory
927  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
928 
929  a0_re = i00_re; a0_im = i00_im;
930  a1_re = i01_re; a1_im = i01_im;
931  a2_re = i02_re; a2_im = i02_im;
932  b0_re = i10_re; b0_im = i10_im;
933  b1_re = i11_re; b1_im = i11_im;
934  b2_re = i12_re; b2_im = i12_im;
935 
936  }
937 #endif // MULTI_GPU
938 
939  // reconstruct gauge matrix
941 
942  // multiply row 0
943  spinorFloat A0_re = 0;
944  A0_re += gT00_re * a0_re;
945  A0_re -= gT00_im * a0_im;
946  A0_re += gT01_re * a1_re;
947  A0_re -= gT01_im * a1_im;
948  A0_re += gT02_re * a2_re;
949  A0_re -= gT02_im * a2_im;
950  spinorFloat A0_im = 0;
951  A0_im += gT00_re * a0_im;
952  A0_im += gT00_im * a0_re;
953  A0_im += gT01_re * a1_im;
954  A0_im += gT01_im * a1_re;
955  A0_im += gT02_re * a2_im;
956  A0_im += gT02_im * a2_re;
957  spinorFloat B0_re = 0;
958  B0_re += gT00_re * b0_re;
959  B0_re -= gT00_im * b0_im;
960  B0_re += gT01_re * b1_re;
961  B0_re -= gT01_im * b1_im;
962  B0_re += gT02_re * b2_re;
963  B0_re -= gT02_im * b2_im;
964  spinorFloat B0_im = 0;
965  B0_im += gT00_re * b0_im;
966  B0_im += gT00_im * b0_re;
967  B0_im += gT01_re * b1_im;
968  B0_im += gT01_im * b1_re;
969  B0_im += gT02_re * b2_im;
970  B0_im += gT02_im * b2_re;
971 
972  // multiply row 1
973  spinorFloat A1_re = 0;
974  A1_re += gT10_re * a0_re;
975  A1_re -= gT10_im * a0_im;
976  A1_re += gT11_re * a1_re;
977  A1_re -= gT11_im * a1_im;
978  A1_re += gT12_re * a2_re;
979  A1_re -= gT12_im * a2_im;
980  spinorFloat A1_im = 0;
981  A1_im += gT10_re * a0_im;
982  A1_im += gT10_im * a0_re;
983  A1_im += gT11_re * a1_im;
984  A1_im += gT11_im * a1_re;
985  A1_im += gT12_re * a2_im;
986  A1_im += gT12_im * a2_re;
987  spinorFloat B1_re = 0;
988  B1_re += gT10_re * b0_re;
989  B1_re -= gT10_im * b0_im;
990  B1_re += gT11_re * b1_re;
991  B1_re -= gT11_im * b1_im;
992  B1_re += gT12_re * b2_re;
993  B1_re -= gT12_im * b2_im;
994  spinorFloat B1_im = 0;
995  B1_im += gT10_re * b0_im;
996  B1_im += gT10_im * b0_re;
997  B1_im += gT11_re * b1_im;
998  B1_im += gT11_im * b1_re;
999  B1_im += gT12_re * b2_im;
1000  B1_im += gT12_im * b2_re;
1001 
1002  // multiply row 2
1003  spinorFloat A2_re = 0;
1004  A2_re += gT20_re * a0_re;
1005  A2_re -= gT20_im * a0_im;
1006  A2_re += gT21_re * a1_re;
1007  A2_re -= gT21_im * a1_im;
1008  A2_re += gT22_re * a2_re;
1009  A2_re -= gT22_im * a2_im;
1010  spinorFloat A2_im = 0;
1011  A2_im += gT20_re * a0_im;
1012  A2_im += gT20_im * a0_re;
1013  A2_im += gT21_re * a1_im;
1014  A2_im += gT21_im * a1_re;
1015  A2_im += gT22_re * a2_im;
1016  A2_im += gT22_im * a2_re;
1017  spinorFloat B2_re = 0;
1018  B2_re += gT20_re * b0_re;
1019  B2_re -= gT20_im * b0_im;
1020  B2_re += gT21_re * b1_re;
1021  B2_re -= gT21_im * b1_im;
1022  B2_re += gT22_re * b2_re;
1023  B2_re -= gT22_im * b2_im;
1024  spinorFloat B2_im = 0;
1025  B2_im += gT20_re * b0_im;
1026  B2_im += gT20_im * b0_re;
1027  B2_im += gT21_re * b1_im;
1028  B2_im += gT21_im * b1_re;
1029  B2_im += gT22_re * b2_im;
1030  B2_im += gT22_im * b2_re;
1031 
1032  o00_re += A0_re;
1033  o00_im += A0_im;
1034  o10_re += B0_re;
1035  o10_im += B0_im;
1036  o20_re += B0_re;
1037  o20_im += B0_im;
1038  o30_re -= A0_re;
1039  o30_im -= A0_im;
1040 
1041  o01_re += A1_re;
1042  o01_im += A1_im;
1043  o11_re += B1_re;
1044  o11_im += B1_im;
1045  o21_re += B1_re;
1046  o21_im += B1_im;
1047  o31_re -= A1_re;
1048  o31_im -= A1_im;
1049 
1050  o02_re += A2_re;
1051  o02_im += A2_im;
1052  o12_re += B2_re;
1053  o12_im += B2_im;
1054  o22_re += B2_re;
1055  o22_im += B2_im;
1056  o32_re -= A2_re;
1057  o32_im -= A2_im;
1058 }
1059 
1060 #ifdef MULTI_GPU
1061 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1062  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1063 #endif
1064 {
1065  // Projector P2+
1066  // 1 0 i 0
1067  // 0 1 0 -i
1068  // -i 0 1 0
1069  // 0 i 0 1
1070 
1071 #ifdef MULTI_GPU
1072  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1073  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1074 #else
1075  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1076 #endif
1077 
1078  const int ga_idx = sid % Vh;
1079 
1080  // read gauge matrix from device memory
1081  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); }
1082  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, ga_stride); }
1083 
1090 
1091 #ifdef MULTI_GPU
1092  if (kernel_type == INTERIOR_KERNEL) {
1093 #endif
1094 
1095  // read spinor from device memory
1096  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1097 
1098  // project spinor into half spinors
1099  a0_re = +i00_re-i20_im;
1100  a0_im = +i00_im+i20_re;
1101  a1_re = +i01_re-i21_im;
1102  a1_im = +i01_im+i21_re;
1103  a2_re = +i02_re-i22_im;
1104  a2_im = +i02_im+i22_re;
1105  b0_re = +i10_re+i30_im;
1106  b0_im = +i10_im-i30_re;
1107  b1_re = +i11_re+i31_im;
1108  b1_im = +i11_im-i31_re;
1109  b2_re = +i12_re+i32_im;
1110  b2_im = +i12_im-i32_re;
1111 
1112 #ifdef MULTI_GPU
1113  } else {
1114 
1115  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1116 
1117  // read half spinor from device memory
1118  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1119 
1120  a0_re = i00_re; a0_im = i00_im;
1121  a1_re = i01_re; a1_im = i01_im;
1122  a2_re = i02_re; a2_im = i02_im;
1123  b0_re = i10_re; b0_im = i10_im;
1124  b1_re = i11_re; b1_im = i11_im;
1125  b2_re = i12_re; b2_im = i12_im;
1126 
1127  }
1128 #endif // MULTI_GPU
1129 
1130  // reconstruct gauge matrix
1132 
1133  // multiply row 0
1134  spinorFloat A0_re = 0;
1135  A0_re += g00_re * a0_re;
1136  A0_re -= g00_im * a0_im;
1137  A0_re += g01_re * a1_re;
1138  A0_re -= g01_im * a1_im;
1139  A0_re += g02_re * a2_re;
1140  A0_re -= g02_im * a2_im;
1141  spinorFloat A0_im = 0;
1142  A0_im += g00_re * a0_im;
1143  A0_im += g00_im * a0_re;
1144  A0_im += g01_re * a1_im;
1145  A0_im += g01_im * a1_re;
1146  A0_im += g02_re * a2_im;
1147  A0_im += g02_im * a2_re;
1148  spinorFloat B0_re = 0;
1149  B0_re += g00_re * b0_re;
1150  B0_re -= g00_im * b0_im;
1151  B0_re += g01_re * b1_re;
1152  B0_re -= g01_im * b1_im;
1153  B0_re += g02_re * b2_re;
1154  B0_re -= g02_im * b2_im;
1155  spinorFloat B0_im = 0;
1156  B0_im += g00_re * b0_im;
1157  B0_im += g00_im * b0_re;
1158  B0_im += g01_re * b1_im;
1159  B0_im += g01_im * b1_re;
1160  B0_im += g02_re * b2_im;
1161  B0_im += g02_im * b2_re;
1162 
1163  // multiply row 1
1164  spinorFloat A1_re = 0;
1165  A1_re += g10_re * a0_re;
1166  A1_re -= g10_im * a0_im;
1167  A1_re += g11_re * a1_re;
1168  A1_re -= g11_im * a1_im;
1169  A1_re += g12_re * a2_re;
1170  A1_re -= g12_im * a2_im;
1171  spinorFloat A1_im = 0;
1172  A1_im += g10_re * a0_im;
1173  A1_im += g10_im * a0_re;
1174  A1_im += g11_re * a1_im;
1175  A1_im += g11_im * a1_re;
1176  A1_im += g12_re * a2_im;
1177  A1_im += g12_im * a2_re;
1178  spinorFloat B1_re = 0;
1179  B1_re += g10_re * b0_re;
1180  B1_re -= g10_im * b0_im;
1181  B1_re += g11_re * b1_re;
1182  B1_re -= g11_im * b1_im;
1183  B1_re += g12_re * b2_re;
1184  B1_re -= g12_im * b2_im;
1185  spinorFloat B1_im = 0;
1186  B1_im += g10_re * b0_im;
1187  B1_im += g10_im * b0_re;
1188  B1_im += g11_re * b1_im;
1189  B1_im += g11_im * b1_re;
1190  B1_im += g12_re * b2_im;
1191  B1_im += g12_im * b2_re;
1192 
1193  // multiply row 2
1194  spinorFloat A2_re = 0;
1195  A2_re += g20_re * a0_re;
1196  A2_re -= g20_im * a0_im;
1197  A2_re += g21_re * a1_re;
1198  A2_re -= g21_im * a1_im;
1199  A2_re += g22_re * a2_re;
1200  A2_re -= g22_im * a2_im;
1201  spinorFloat A2_im = 0;
1202  A2_im += g20_re * a0_im;
1203  A2_im += g20_im * a0_re;
1204  A2_im += g21_re * a1_im;
1205  A2_im += g21_im * a1_re;
1206  A2_im += g22_re * a2_im;
1207  A2_im += g22_im * a2_re;
1208  spinorFloat B2_re = 0;
1209  B2_re += g20_re * b0_re;
1210  B2_re -= g20_im * b0_im;
1211  B2_re += g21_re * b1_re;
1212  B2_re -= g21_im * b1_im;
1213  B2_re += g22_re * b2_re;
1214  B2_re -= g22_im * b2_im;
1215  spinorFloat B2_im = 0;
1216  B2_im += g20_re * b0_im;
1217  B2_im += g20_im * b0_re;
1218  B2_im += g21_re * b1_im;
1219  B2_im += g21_im * b1_re;
1220  B2_im += g22_re * b2_im;
1221  B2_im += g22_im * b2_re;
1222 
1223  o00_re += A0_re;
1224  o00_im += A0_im;
1225  o10_re += B0_re;
1226  o10_im += B0_im;
1227  o20_re += A0_im;
1228  o20_im -= A0_re;
1229  o30_re -= B0_im;
1230  o30_im += B0_re;
1231 
1232  o01_re += A1_re;
1233  o01_im += A1_im;
1234  o11_re += B1_re;
1235  o11_im += B1_im;
1236  o21_re += A1_im;
1237  o21_im -= A1_re;
1238  o31_re -= B1_im;
1239  o31_im += B1_re;
1240 
1241  o02_re += A2_re;
1242  o02_im += A2_im;
1243  o12_re += B2_re;
1244  o12_im += B2_im;
1245  o22_re += A2_im;
1246  o22_im -= A2_re;
1247  o32_re -= B2_im;
1248  o32_im += B2_re;
1249 }
1250 
1251 #ifdef MULTI_GPU
1252 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1253  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1254 #endif
1255 {
1256  // Projector P2-
1257  // 1 0 -i 0
1258  // 0 1 0 i
1259  // i 0 1 0
1260  // 0 -i 0 1
1261 
1262 #ifdef MULTI_GPU
1263  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1264  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1265 #else
1266  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1267 #endif
1268 
1269 #ifdef MULTI_GPU
1270  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1271 #else
1272  const int ga_idx = sp_idx % Vh;
1273 #endif
1274 
1275  // read gauge matrix from device memory
1276  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); }
1277  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, ga_stride); }
1278 
1285 
1286 #ifdef MULTI_GPU
1287  if (kernel_type == INTERIOR_KERNEL) {
1288 #endif
1289 
1290  // read spinor from device memory
1291  READ_SPINOR(SPINORTEX, sp_stride, sp_idx, sp_idx);
1292 
1293  // project spinor into half spinors
1294  a0_re = +i00_re+i20_im;
1295  a0_im = +i00_im-i20_re;
1296  a1_re = +i01_re+i21_im;
1297  a1_im = +i01_im-i21_re;
1298  a2_re = +i02_re+i22_im;
1299  a2_im = +i02_im-i22_re;
1300  b0_re = +i10_re-i30_im;
1301  b0_im = +i10_im+i30_re;
1302  b1_re = +i11_re-i31_im;
1303  b1_im = +i11_im+i31_re;
1304  b2_re = +i12_re-i32_im;
1305  b2_im = +i12_im+i32_re;
1306 
1307 #ifdef MULTI_GPU
1308  } else {
1309 
1310  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1311 
1312  // read half spinor from device memory
1313  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1314 
1315  a0_re = i00_re; a0_im = i00_im;
1316  a1_re = i01_re; a1_im = i01_im;
1317  a2_re = i02_re; a2_im = i02_im;
1318  b0_re = i10_re; b0_im = i10_im;
1319  b1_re = i11_re; b1_im = i11_im;
1320  b2_re = i12_re; b2_im = i12_im;
1321 
1322  }
1323 #endif // MULTI_GPU
1324 
1325  // reconstruct gauge matrix
1327 
1328  // multiply row 0
1329  spinorFloat A0_re = 0;
1330  A0_re += gT00_re * a0_re;
1331  A0_re -= gT00_im * a0_im;
1332  A0_re += gT01_re * a1_re;
1333  A0_re -= gT01_im * a1_im;
1334  A0_re += gT02_re * a2_re;
1335  A0_re -= gT02_im * a2_im;
1336  spinorFloat A0_im = 0;
1337  A0_im += gT00_re * a0_im;
1338  A0_im += gT00_im * a0_re;
1339  A0_im += gT01_re * a1_im;
1340  A0_im += gT01_im * a1_re;
1341  A0_im += gT02_re * a2_im;
1342  A0_im += gT02_im * a2_re;
1343  spinorFloat B0_re = 0;
1344  B0_re += gT00_re * b0_re;
1345  B0_re -= gT00_im * b0_im;
1346  B0_re += gT01_re * b1_re;
1347  B0_re -= gT01_im * b1_im;
1348  B0_re += gT02_re * b2_re;
1349  B0_re -= gT02_im * b2_im;
1350  spinorFloat B0_im = 0;
1351  B0_im += gT00_re * b0_im;
1352  B0_im += gT00_im * b0_re;
1353  B0_im += gT01_re * b1_im;
1354  B0_im += gT01_im * b1_re;
1355  B0_im += gT02_re * b2_im;
1356  B0_im += gT02_im * b2_re;
1357 
1358  // multiply row 1
1359  spinorFloat A1_re = 0;
1360  A1_re += gT10_re * a0_re;
1361  A1_re -= gT10_im * a0_im;
1362  A1_re += gT11_re * a1_re;
1363  A1_re -= gT11_im * a1_im;
1364  A1_re += gT12_re * a2_re;
1365  A1_re -= gT12_im * a2_im;
1366  spinorFloat A1_im = 0;
1367  A1_im += gT10_re * a0_im;
1368  A1_im += gT10_im * a0_re;
1369  A1_im += gT11_re * a1_im;
1370  A1_im += gT11_im * a1_re;
1371  A1_im += gT12_re * a2_im;
1372  A1_im += gT12_im * a2_re;
1373  spinorFloat B1_re = 0;
1374  B1_re += gT10_re * b0_re;
1375  B1_re -= gT10_im * b0_im;
1376  B1_re += gT11_re * b1_re;
1377  B1_re -= gT11_im * b1_im;
1378  B1_re += gT12_re * b2_re;
1379  B1_re -= gT12_im * b2_im;
1380  spinorFloat B1_im = 0;
1381  B1_im += gT10_re * b0_im;
1382  B1_im += gT10_im * b0_re;
1383  B1_im += gT11_re * b1_im;
1384  B1_im += gT11_im * b1_re;
1385  B1_im += gT12_re * b2_im;
1386  B1_im += gT12_im * b2_re;
1387 
1388  // multiply row 2
1389  spinorFloat A2_re = 0;
1390  A2_re += gT20_re * a0_re;
1391  A2_re -= gT20_im * a0_im;
1392  A2_re += gT21_re * a1_re;
1393  A2_re -= gT21_im * a1_im;
1394  A2_re += gT22_re * a2_re;
1395  A2_re -= gT22_im * a2_im;
1396  spinorFloat A2_im = 0;
1397  A2_im += gT20_re * a0_im;
1398  A2_im += gT20_im * a0_re;
1399  A2_im += gT21_re * a1_im;
1400  A2_im += gT21_im * a1_re;
1401  A2_im += gT22_re * a2_im;
1402  A2_im += gT22_im * a2_re;
1403  spinorFloat B2_re = 0;
1404  B2_re += gT20_re * b0_re;
1405  B2_re -= gT20_im * b0_im;
1406  B2_re += gT21_re * b1_re;
1407  B2_re -= gT21_im * b1_im;
1408  B2_re += gT22_re * b2_re;
1409  B2_re -= gT22_im * b2_im;
1410  spinorFloat B2_im = 0;
1411  B2_im += gT20_re * b0_im;
1412  B2_im += gT20_im * b0_re;
1413  B2_im += gT21_re * b1_im;
1414  B2_im += gT21_im * b1_re;
1415  B2_im += gT22_re * b2_im;
1416  B2_im += gT22_im * b2_re;
1417 
1418  o00_re += A0_re;
1419  o00_im += A0_im;
1420  o10_re += B0_re;
1421  o10_im += B0_im;
1422  o20_re -= A0_im;
1423  o20_im += A0_re;
1424  o30_re += B0_im;
1425  o30_im -= B0_re;
1426 
1427  o01_re += A1_re;
1428  o01_im += A1_im;
1429  o11_re += B1_re;
1430  o11_im += B1_im;
1431  o21_re -= A1_im;
1432  o21_im += A1_re;
1433  o31_re += B1_im;
1434  o31_im -= B1_re;
1435 
1436  o02_re += A2_re;
1437  o02_im += A2_im;
1438  o12_re += B2_re;
1439  o12_im += B2_im;
1440  o22_re -= A2_im;
1441  o22_im += A2_re;
1442  o32_re += B2_im;
1443  o32_im -= B2_re;
1444 }
1445 
1446 #ifdef MULTI_GPU
1447 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1448  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1449 #endif
1450 {
1451  // Projector P3+
1452  // 2 0 0 0
1453  // 0 2 0 0
1454  // 0 0 0 0
1455  // 0 0 0 0
1456 
1457 #ifdef MULTI_GPU
1458  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1459  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1460 #else
1461  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1462 #endif
1463 
1464  const int ga_idx = sid % Vh;
1465 
1467  {
1474 
1475 #ifdef MULTI_GPU
1476  if (kernel_type == INTERIOR_KERNEL) {
1477 #endif
1478 
1479  // read spinor from device memory
1480  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1481 
1482  // project spinor into half spinors
1483  a0_re = +2*i00_re;
1484  a0_im = +2*i00_im;
1485  a1_re = +2*i01_re;
1486  a1_im = +2*i01_im;
1487  a2_re = +2*i02_re;
1488  a2_im = +2*i02_im;
1489  b0_re = +2*i10_re;
1490  b0_im = +2*i10_im;
1491  b1_re = +2*i11_re;
1492  b1_im = +2*i11_im;
1493  b2_re = +2*i12_re;
1494  b2_im = +2*i12_im;
1495 
1496 #ifdef MULTI_GPU
1497  } else {
1498 
1499  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1500  const int t_proj_scale = TPROJSCALE;
1501 
1502  // read half spinor from device memory
1503  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1504 
1505  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1506  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1507  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1508  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1509  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1510  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1511 
1512  }
1513 #endif // MULTI_GPU
1514 
1515  // identity gauge matrix
1522 
1523  o00_re += A0_re;
1524  o00_im += A0_im;
1525  o10_re += B0_re;
1526  o10_im += B0_im;
1527 
1528  o01_re += A1_re;
1529  o01_im += A1_im;
1530  o11_re += B1_re;
1531  o11_im += B1_im;
1532 
1533  o02_re += A2_re;
1534  o02_im += A2_im;
1535  o12_re += B2_re;
1536  o12_im += B2_im;
1537  } else {
1538  // read gauge matrix from device memory
1539  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); }
1540  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, ga_stride); }
1541 
1548 
1549 #ifdef MULTI_GPU
1550  if (kernel_type == INTERIOR_KERNEL) {
1551 #endif
1552 
1553  // read spinor from device memory
1554  READ_SPINOR_UP(SPINORTEX, sp_stride, sp_idx, sp_idx);
1555 
1556  // project spinor into half spinors
1557  a0_re = +2*i00_re;
1558  a0_im = +2*i00_im;
1559  a1_re = +2*i01_re;
1560  a1_im = +2*i01_im;
1561  a2_re = +2*i02_re;
1562  a2_im = +2*i02_im;
1563  b0_re = +2*i10_re;
1564  b0_im = +2*i10_im;
1565  b1_re = +2*i11_re;
1566  b1_im = +2*i11_im;
1567  b2_re = +2*i12_re;
1568  b2_im = +2*i12_im;
1569 
1570 #ifdef MULTI_GPU
1571  } else {
1572 
1573  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1574  const int t_proj_scale = TPROJSCALE;
1575 
1576  // read half spinor from device memory
1577  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1578 
1579  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1580  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1581  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1582  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1583  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1584  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1585 
1586  }
1587 #endif // MULTI_GPU
1588 
1589  // reconstruct gauge matrix
1591 
1592  // multiply row 0
1593  spinorFloat A0_re = 0;
1594  A0_re += g00_re * a0_re;
1595  A0_re -= g00_im * a0_im;
1596  A0_re += g01_re * a1_re;
1597  A0_re -= g01_im * a1_im;
1598  A0_re += g02_re * a2_re;
1599  A0_re -= g02_im * a2_im;
1600  spinorFloat A0_im = 0;
1601  A0_im += g00_re * a0_im;
1602  A0_im += g00_im * a0_re;
1603  A0_im += g01_re * a1_im;
1604  A0_im += g01_im * a1_re;
1605  A0_im += g02_re * a2_im;
1606  A0_im += g02_im * a2_re;
1607  spinorFloat B0_re = 0;
1608  B0_re += g00_re * b0_re;
1609  B0_re -= g00_im * b0_im;
1610  B0_re += g01_re * b1_re;
1611  B0_re -= g01_im * b1_im;
1612  B0_re += g02_re * b2_re;
1613  B0_re -= g02_im * b2_im;
1614  spinorFloat B0_im = 0;
1615  B0_im += g00_re * b0_im;
1616  B0_im += g00_im * b0_re;
1617  B0_im += g01_re * b1_im;
1618  B0_im += g01_im * b1_re;
1619  B0_im += g02_re * b2_im;
1620  B0_im += g02_im * b2_re;
1621 
1622  // multiply row 1
1623  spinorFloat A1_re = 0;
1624  A1_re += g10_re * a0_re;
1625  A1_re -= g10_im * a0_im;
1626  A1_re += g11_re * a1_re;
1627  A1_re -= g11_im * a1_im;
1628  A1_re += g12_re * a2_re;
1629  A1_re -= g12_im * a2_im;
1630  spinorFloat A1_im = 0;
1631  A1_im += g10_re * a0_im;
1632  A1_im += g10_im * a0_re;
1633  A1_im += g11_re * a1_im;
1634  A1_im += g11_im * a1_re;
1635  A1_im += g12_re * a2_im;
1636  A1_im += g12_im * a2_re;
1637  spinorFloat B1_re = 0;
1638  B1_re += g10_re * b0_re;
1639  B1_re -= g10_im * b0_im;
1640  B1_re += g11_re * b1_re;
1641  B1_re -= g11_im * b1_im;
1642  B1_re += g12_re * b2_re;
1643  B1_re -= g12_im * b2_im;
1644  spinorFloat B1_im = 0;
1645  B1_im += g10_re * b0_im;
1646  B1_im += g10_im * b0_re;
1647  B1_im += g11_re * b1_im;
1648  B1_im += g11_im * b1_re;
1649  B1_im += g12_re * b2_im;
1650  B1_im += g12_im * b2_re;
1651 
1652  // multiply row 2
1653  spinorFloat A2_re = 0;
1654  A2_re += g20_re * a0_re;
1655  A2_re -= g20_im * a0_im;
1656  A2_re += g21_re * a1_re;
1657  A2_re -= g21_im * a1_im;
1658  A2_re += g22_re * a2_re;
1659  A2_re -= g22_im * a2_im;
1660  spinorFloat A2_im = 0;
1661  A2_im += g20_re * a0_im;
1662  A2_im += g20_im * a0_re;
1663  A2_im += g21_re * a1_im;
1664  A2_im += g21_im * a1_re;
1665  A2_im += g22_re * a2_im;
1666  A2_im += g22_im * a2_re;
1667  spinorFloat B2_re = 0;
1668  B2_re += g20_re * b0_re;
1669  B2_re -= g20_im * b0_im;
1670  B2_re += g21_re * b1_re;
1671  B2_re -= g21_im * b1_im;
1672  B2_re += g22_re * b2_re;
1673  B2_re -= g22_im * b2_im;
1674  spinorFloat B2_im = 0;
1675  B2_im += g20_re * b0_im;
1676  B2_im += g20_im * b0_re;
1677  B2_im += g21_re * b1_im;
1678  B2_im += g21_im * b1_re;
1679  B2_im += g22_re * b2_im;
1680  B2_im += g22_im * b2_re;
1681 
1682  o00_re += A0_re;
1683  o00_im += A0_im;
1684  o10_re += B0_re;
1685  o10_im += B0_im;
1686 
1687  o01_re += A1_re;
1688  o01_im += A1_im;
1689  o11_re += B1_re;
1690  o11_im += B1_im;
1691 
1692  o02_re += A2_re;
1693  o02_im += A2_im;
1694  o12_re += B2_re;
1695  o12_im += B2_im;
1696  }
1697 }
1698 
1699 #ifdef MULTI_GPU
1700 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1701  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1702 #endif
1703 {
1704  // Projector P3-
1705  // 0 0 0 0
1706  // 0 0 0 0
1707  // 0 0 2 0
1708  // 0 0 0 2
1709 
1710 #ifdef MULTI_GPU
1711  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1712  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1713 #else
1714  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1715 #endif
1716 
1717 #ifdef MULTI_GPU
1718  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1719 #else
1720  const int ga_idx = sp_idx % Vh;
1721 #endif
1722 
1723  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1724  {
1731 
1732 #ifdef MULTI_GPU
1733  if (kernel_type == INTERIOR_KERNEL) {
1734 #endif
1735 
1736  // read spinor from device memory
1737  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1738 
1739  // project spinor into half spinors
1740  a0_re = +2*i20_re;
1741  a0_im = +2*i20_im;
1742  a1_re = +2*i21_re;
1743  a1_im = +2*i21_im;
1744  a2_re = +2*i22_re;
1745  a2_im = +2*i22_im;
1746  b0_re = +2*i30_re;
1747  b0_im = +2*i30_im;
1748  b1_re = +2*i31_re;
1749  b1_im = +2*i31_im;
1750  b2_re = +2*i32_re;
1751  b2_im = +2*i32_im;
1752 
1753 #ifdef MULTI_GPU
1754  } else {
1755 
1756  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1757  const int t_proj_scale = TPROJSCALE;
1758 
1759  // read half spinor from device memory
1760  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1761 
1762  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1763  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1764  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1765  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1766  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1767  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1768 
1769  }
1770 #endif // MULTI_GPU
1771 
1772  // identity gauge matrix
1779 
1780  o20_re += A0_re;
1781  o20_im += A0_im;
1782  o30_re += B0_re;
1783  o30_im += B0_im;
1784 
1785  o21_re += A1_re;
1786  o21_im += A1_im;
1787  o31_re += B1_re;
1788  o31_im += B1_im;
1789 
1790  o22_re += A2_re;
1791  o22_im += A2_im;
1792  o32_re += B2_re;
1793  o32_im += B2_im;
1794  } else {
1795  // read gauge matrix from device memory
1796  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); }
1797  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, ga_stride); }
1798 
1805 
1806 #ifdef MULTI_GPU
1807  if (kernel_type == INTERIOR_KERNEL) {
1808 #endif
1809 
1810  // read spinor from device memory
1811  READ_SPINOR_DOWN(SPINORTEX, sp_stride, sp_idx, sp_idx);
1812 
1813  // project spinor into half spinors
1814  a0_re = +2*i20_re;
1815  a0_im = +2*i20_im;
1816  a1_re = +2*i21_re;
1817  a1_im = +2*i21_im;
1818  a2_re = +2*i22_re;
1819  a2_im = +2*i22_im;
1820  b0_re = +2*i30_re;
1821  b0_im = +2*i30_im;
1822  b1_re = +2*i31_re;
1823  b1_im = +2*i31_im;
1824  b2_re = +2*i32_re;
1825  b2_im = +2*i32_im;
1826 
1827 #ifdef MULTI_GPU
1828  } else {
1829 
1830  const int sp_stride_pad = Ls*ghostFace[static_cast<int>(kernel_type)];
1831  const int t_proj_scale = TPROJSCALE;
1832 
1833  // read half spinor from device memory
1834  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1835 
1836  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1837  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1838  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1839  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1840  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1841  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1842 
1843  }
1844 #endif // MULTI_GPU
1845 
1846  // reconstruct gauge matrix
1848 
1849  // multiply row 0
1850  spinorFloat A0_re = 0;
1851  A0_re += gT00_re * a0_re;
1852  A0_re -= gT00_im * a0_im;
1853  A0_re += gT01_re * a1_re;
1854  A0_re -= gT01_im * a1_im;
1855  A0_re += gT02_re * a2_re;
1856  A0_re -= gT02_im * a2_im;
1857  spinorFloat A0_im = 0;
1858  A0_im += gT00_re * a0_im;
1859  A0_im += gT00_im * a0_re;
1860  A0_im += gT01_re * a1_im;
1861  A0_im += gT01_im * a1_re;
1862  A0_im += gT02_re * a2_im;
1863  A0_im += gT02_im * a2_re;
1864  spinorFloat B0_re = 0;
1865  B0_re += gT00_re * b0_re;
1866  B0_re -= gT00_im * b0_im;
1867  B0_re += gT01_re * b1_re;
1868  B0_re -= gT01_im * b1_im;
1869  B0_re += gT02_re * b2_re;
1870  B0_re -= gT02_im * b2_im;
1871  spinorFloat B0_im = 0;
1872  B0_im += gT00_re * b0_im;
1873  B0_im += gT00_im * b0_re;
1874  B0_im += gT01_re * b1_im;
1875  B0_im += gT01_im * b1_re;
1876  B0_im += gT02_re * b2_im;
1877  B0_im += gT02_im * b2_re;
1878 
1879  // multiply row 1
1880  spinorFloat A1_re = 0;
1881  A1_re += gT10_re * a0_re;
1882  A1_re -= gT10_im * a0_im;
1883  A1_re += gT11_re * a1_re;
1884  A1_re -= gT11_im * a1_im;
1885  A1_re += gT12_re * a2_re;
1886  A1_re -= gT12_im * a2_im;
1887  spinorFloat A1_im = 0;
1888  A1_im += gT10_re * a0_im;
1889  A1_im += gT10_im * a0_re;
1890  A1_im += gT11_re * a1_im;
1891  A1_im += gT11_im * a1_re;
1892  A1_im += gT12_re * a2_im;
1893  A1_im += gT12_im * a2_re;
1894  spinorFloat B1_re = 0;
1895  B1_re += gT10_re * b0_re;
1896  B1_re -= gT10_im * b0_im;
1897  B1_re += gT11_re * b1_re;
1898  B1_re -= gT11_im * b1_im;
1899  B1_re += gT12_re * b2_re;
1900  B1_re -= gT12_im * b2_im;
1901  spinorFloat B1_im = 0;
1902  B1_im += gT10_re * b0_im;
1903  B1_im += gT10_im * b0_re;
1904  B1_im += gT11_re * b1_im;
1905  B1_im += gT11_im * b1_re;
1906  B1_im += gT12_re * b2_im;
1907  B1_im += gT12_im * b2_re;
1908 
1909  // multiply row 2
1910  spinorFloat A2_re = 0;
1911  A2_re += gT20_re * a0_re;
1912  A2_re -= gT20_im * a0_im;
1913  A2_re += gT21_re * a1_re;
1914  A2_re -= gT21_im * a1_im;
1915  A2_re += gT22_re * a2_re;
1916  A2_re -= gT22_im * a2_im;
1917  spinorFloat A2_im = 0;
1918  A2_im += gT20_re * a0_im;
1919  A2_im += gT20_im * a0_re;
1920  A2_im += gT21_re * a1_im;
1921  A2_im += gT21_im * a1_re;
1922  A2_im += gT22_re * a2_im;
1923  A2_im += gT22_im * a2_re;
1924  spinorFloat B2_re = 0;
1925  B2_re += gT20_re * b0_re;
1926  B2_re -= gT20_im * b0_im;
1927  B2_re += gT21_re * b1_re;
1928  B2_re -= gT21_im * b1_im;
1929  B2_re += gT22_re * b2_re;
1930  B2_re -= gT22_im * b2_im;
1931  spinorFloat B2_im = 0;
1932  B2_im += gT20_re * b0_im;
1933  B2_im += gT20_im * b0_re;
1934  B2_im += gT21_re * b1_im;
1935  B2_im += gT21_im * b1_re;
1936  B2_im += gT22_re * b2_im;
1937  B2_im += gT22_im * b2_re;
1938 
1939  o20_re += A0_re;
1940  o20_im += A0_im;
1941  o30_re += B0_re;
1942  o30_im += B0_im;
1943 
1944  o21_re += A1_re;
1945  o21_im += A1_im;
1946  o31_re += B1_re;
1947  o31_im += B1_im;
1948 
1949  o22_re += A2_re;
1950  o22_im += A2_im;
1951  o32_re += B2_re;
1952  o32_im += B2_im;
1953  }
1954 }
1955 
1956 
1957 
1958 // 5th dimension -- NB: not partitionable!
1959 #ifdef MULTI_GPU
1960 if(kernel_type == INTERIOR_KERNEL)
1961 #endif
1962 {
1963 // 2 P_L = 2 P_- = ( ( +1, -1 ), ( -1, +1 ) )
1964  {
1965  int sp_idx = ( xs == 0 ? X+(Ls-1)*2*Vh : X-2*Vh ) / 2;
1966 
1967 // read spinor from device memory
1968  READ_SPINOR( SPINORTEX, sp_stride, sp_idx, sp_idx );
1969 
1970  if ( xs != 0 )
1971  {
1975 
1979 
1983 
1987  }
1988  else
1989  {
1990  o00_re += -mferm*(+i00_re-i20_re); o00_im += -mferm*(+i00_im-i20_im);
1991  o01_re += -mferm*(+i01_re-i21_re); o01_im += -mferm*(+i01_im-i21_im);
1992  o02_re += -mferm*(+i02_re-i22_re); o02_im += -mferm*(+i02_im-i22_im);
1993 
1994  o10_re += -mferm*(+i10_re-i30_re); o10_im += -mferm*(+i10_im-i30_im);
1995  o11_re += -mferm*(+i11_re-i31_re); o11_im += -mferm*(+i11_im-i31_im);
1996  o12_re += -mferm*(+i12_re-i32_re); o12_im += -mferm*(+i12_im-i32_im);
1997 
1998  o20_re += -mferm*(-i00_re+i20_re); o20_im += -mferm*(-i00_im+i20_im);
1999  o21_re += -mferm*(-i01_re+i21_re); o21_im += -mferm*(-i01_im+i21_im);
2000  o22_re += -mferm*(-i02_re+i22_re); o22_im += -mferm*(-i02_im+i22_im);
2001 
2002  o30_re += -mferm*(-i10_re+i30_re); o30_im += -mferm*(-i10_im+i30_im);
2003  o31_re += -mferm*(-i11_re+i31_re); o31_im += -mferm*(-i11_im+i31_im);
2004  o32_re += -mferm*(-i12_re+i32_re); o32_im += -mferm*(-i12_im+i32_im);
2005  } // end if ( xs != 0 )
2006  } // end P_L
2007 
2008  // 2 P_R = 2 P_+ = ( ( +1, +1 ), ( +1, +1 ) )
2009  {
2010  int sp_idx = ( xs == Ls-1 ? X-(Ls-1)*2*Vh : X+2*Vh ) / 2;
2011 
2012 // read spinor from device memory
2013  READ_SPINOR( SPINORTEX, sp_stride, sp_idx, sp_idx );
2014 
2015  if ( xs != Ls-1 )
2016  {
2020 
2024 
2028 
2032  }
2033  else
2034  {
2035  o00_re += -mferm*(+i00_re+i20_re); o00_im += -mferm*(+i00_im+i20_im);
2036  o01_re += -mferm*(+i01_re+i21_re); o01_im += -mferm*(+i01_im+i21_im);
2037  o02_re += -mferm*(+i02_re+i22_re); o02_im += -mferm*(+i02_im+i22_im);
2038 
2039  o10_re += -mferm*(+i10_re+i30_re); o10_im += -mferm*(+i10_im+i30_im);
2040  o11_re += -mferm*(+i11_re+i31_re); o11_im += -mferm*(+i11_im+i31_im);
2041  o12_re += -mferm*(+i12_re+i32_re); o12_im += -mferm*(+i12_im+i32_im);
2042 
2043  o20_re += -mferm*(+i00_re+i20_re); o20_im += -mferm*(+i00_im+i20_im);
2044  o21_re += -mferm*(+i01_re+i21_re); o21_im += -mferm*(+i01_im+i21_im);
2045  o22_re += -mferm*(+i02_re+i22_re); o22_im += -mferm*(+i02_im+i22_im);
2046 
2047  o30_re += -mferm*(+i10_re+i30_re); o30_im += -mferm*(+i10_im+i30_im);
2048  o31_re += -mferm*(+i11_re+i31_re); o31_im += -mferm*(+i11_im+i31_im);
2049  o32_re += -mferm*(+i12_re+i32_re); o32_im += -mferm*(+i12_im+i32_im);
2050  } // end if ( xs != Ls-1 )
2051  } // end P_R
2052 } // end 5th dimension
2053 
2054 
2055 #if defined MULTI_GPU && defined DSLASH_XPAY
2056 
2057 int incomplete = 0; // Have all 8 contributions been computed for this site?
2058 
2059 switch(kernel_type) { // intentional fall-through
2060 case INTERIOR_KERNEL:
2061 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2062 case EXTERIOR_KERNEL_T:
2063 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2064 case EXTERIOR_KERNEL_Z:
2065 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2066 case EXTERIOR_KERNEL_Y:
2067 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2068 }
2069 
2070 if (!incomplete)
2071 #endif // MULTI_GPU
2072 {
2073 
2074 #ifdef DSLASH_XPAY
2075 
2076  READ_ACCUM(ACCUMTEX, sp_stride)
2077 
2078 #ifdef SPINOR_DOUBLE
2079  o00_re = a*o00_re + accum0.x;
2080  o00_im = a*o00_im + accum0.y;
2081  o01_re = a*o01_re + accum1.x;
2082  o01_im = a*o01_im + accum1.y;
2083  o02_re = a*o02_re + accum2.x;
2084  o02_im = a*o02_im + accum2.y;
2085  o10_re = a*o10_re + accum3.x;
2086  o10_im = a*o10_im + accum3.y;
2087  o11_re = a*o11_re + accum4.x;
2088  o11_im = a*o11_im + accum4.y;
2089  o12_re = a*o12_re + accum5.x;
2090  o12_im = a*o12_im + accum5.y;
2091  o20_re = a*o20_re + accum6.x;
2092  o20_im = a*o20_im + accum6.y;
2093  o21_re = a*o21_re + accum7.x;
2094  o21_im = a*o21_im + accum7.y;
2095  o22_re = a*o22_re + accum8.x;
2096  o22_im = a*o22_im + accum8.y;
2097  o30_re = a*o30_re + accum9.x;
2098  o30_im = a*o30_im + accum9.y;
2099  o31_re = a*o31_re + accum10.x;
2100  o31_im = a*o31_im + accum10.y;
2101  o32_re = a*o32_re + accum11.x;
2102  o32_im = a*o32_im + accum11.y;
2103 #else
2104  o00_re = a*o00_re + accum0.x;
2105  o00_im = a*o00_im + accum0.y;
2106  o01_re = a*o01_re + accum0.z;
2107  o01_im = a*o01_im + accum0.w;
2108  o02_re = a*o02_re + accum1.x;
2109  o02_im = a*o02_im + accum1.y;
2110  o10_re = a*o10_re + accum1.z;
2111  o10_im = a*o10_im + accum1.w;
2112  o11_re = a*o11_re + accum2.x;
2113  o11_im = a*o11_im + accum2.y;
2114  o12_re = a*o12_re + accum2.z;
2115  o12_im = a*o12_im + accum2.w;
2116  o20_re = a*o20_re + accum3.x;
2117  o20_im = a*o20_im + accum3.y;
2118  o21_re = a*o21_re + accum3.z;
2119  o21_im = a*o21_im + accum3.w;
2120  o22_re = a*o22_re + accum4.x;
2121  o22_im = a*o22_im + accum4.y;
2122  o30_re = a*o30_re + accum4.z;
2123  o30_im = a*o30_im + accum4.w;
2124  o31_re = a*o31_re + accum5.x;
2125  o31_im = a*o31_im + accum5.y;
2126  o32_re = a*o32_re + accum5.z;
2127  o32_im = a*o32_im + accum5.w;
2128 #endif // SPINOR_DOUBLE
2129 
2130 #endif // DSLASH_XPAY
2131 }
2132 
2133 // write spinor field back to device memory
2135 
2136 // undefine to prevent warning when precision is changed
2137 #undef spinorFloat
2138 #undef SHARED_STRIDE
2139 
2140 #undef g00_re
2141 #undef g00_im
2142 #undef g01_re
2143 #undef g01_im
2144 #undef g02_re
2145 #undef g02_im
2146 #undef g10_re
2147 #undef g10_im
2148 #undef g11_re
2149 #undef g11_im
2150 #undef g12_re
2151 #undef g12_im
2152 #undef g20_re
2153 #undef g20_im
2154 #undef g21_re
2155 #undef g21_im
2156 #undef g22_re
2157 #undef g22_im
2158 
2159 #undef i00_re
2160 #undef i00_im
2161 #undef i01_re
2162 #undef i01_im
2163 #undef i02_re
2164 #undef i02_im
2165 #undef i10_re
2166 #undef i10_im
2167 #undef i11_re
2168 #undef i11_im
2169 #undef i12_re
2170 #undef i12_im
2171 #undef i20_re
2172 #undef i20_im
2173 #undef i21_re
2174 #undef i21_im
2175 #undef i22_re
2176 #undef i22_im
2177 #undef i30_re
2178 #undef i30_im
2179 #undef i31_re
2180 #undef i31_im
2181 #undef i32_re
2182 #undef i32_im
2183 
2184 
2185 
2186 #undef VOLATILE