QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dw_dslash4_dagger_core.h
Go to the documentation of this file.
1 // *** CUDA DSLASH DAGGER ***
// NOTE(review): this appears to be an auto-generated kernel-body fragment
// (dagger variant of the domain-wall dslash) that is textually #included
// into a kernel; undeclared identifiers (param, o##, X1, Vh, ...) are
// supplied by the including translation unit.  Edit the generator, not
// this file.
2 
// This variant keeps the whole spinor in registers: no shared-memory
// floats are reserved per thread.
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
// VOLATILE expands to nothing on CUDA >= 4.1; on older toolkits it applies
// 'volatile' -- presumably a workaround for over-aggressive optimization
// in pre-4.1 nvcc (TODO confirm against the generator).
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
// Accessor macros i<spin><color>_{re,im} mapping the 12 complex components
// of the input spinor onto the raw load registers:
//   SPINOR_DOUBLE: twelve 2-component registers I0..I11, one complex
//                  number per register (.x = re, .y = im);
//   otherwise:     six 4-component registers I0..I5, two complex numbers
//                  per register (.x/.y and .z/.w).
// m5 / mdwf_b5 / mdwf_c5 select the precision-specific domain-wall mass
// and Mobius coefficients (the _d/_f symbols are presumably defined by the
// including file -- confirm).
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #define m5 m5_d
40 #define mdwf_b5 mdwf_b5_d
41 #define mdwf_c5 mdwf_c5_d
42 #else
43 #define spinorFloat float
44 #define i00_re I0.x
45 #define i00_im I0.y
46 #define i01_re I0.z
47 #define i01_im I0.w
48 #define i02_re I1.x
49 #define i02_im I1.y
50 #define i10_re I1.z
51 #define i10_im I1.w
52 #define i11_re I2.x
53 #define i11_im I2.y
54 #define i12_re I2.z
55 #define i12_im I2.w
56 #define i20_re I3.x
57 #define i20_im I3.y
58 #define i21_re I3.z
59 #define i21_im I3.w
60 #define i22_re I4.x
61 #define i22_im I4.y
62 #define i30_re I4.z
63 #define i30_im I4.w
64 #define i31_re I5.x
65 #define i31_im I5.y
66 #define i32_re I5.z
67 #define i32_im I5.w
68 #define m5 m5_f
69 #define mdwf_b5 mdwf_b5_f
70 #define mdwf_c5 mdwf_c5_f
71 #endif // SPINOR_DOUBLE
72 
73 // gauge link
// Accessor macros g<row><col>_{re,im} for the 3x3 gauge-link matrix held
// in registers:
//   GAUGE_FLOAT2: nine 2-component registers G0..G8, one complex entry
//                 per register;
//   otherwise:    five float4 registers G0..G4 packing the 18 reals
//                 (G4.z/.w unused by this mapping).
74 #ifdef GAUGE_FLOAT2
75 #define g00_re G0.x
76 #define g00_im G0.y
77 #define g01_re G1.x
78 #define g01_im G1.y
79 #define g02_re G2.x
80 #define g02_im G2.y
81 #define g10_re G3.x
82 #define g10_im G3.y
83 #define g11_re G4.x
84 #define g11_im G4.y
85 #define g12_re G5.x
86 #define g12_im G5.y
87 #define g20_re G6.x
88 #define g20_im G6.y
89 #define g21_re G7.x
90 #define g21_im G7.y
91 #define g22_re G8.x
92 #define g22_im G8.y
93 
94 #else
95 #define g00_re G0.x
96 #define g00_im G0.y
97 #define g01_re G0.z
98 #define g01_im G0.w
99 #define g02_re G1.x
100 #define g02_im G1.y
101 #define g10_re G1.z
102 #define g10_im G1.w
103 #define g11_re G2.x
104 #define g11_im G2.y
105 #define g12_re G2.z
106 #define g12_im G2.w
107 #define g20_re G3.x
108 #define g20_im G3.y
109 #define g21_re G3.z
110 #define g21_im G3.w
111 #define g22_re G4.x
112 #define g22_im G4.y
113 
114 #endif // GAUGE_FLOAT2
115 
116 // conjugated gauge link
// Hermitian conjugate of the link expressed through the plain-link macros:
// gT<r><c> = conj(g<c><r>) (indices transposed, imaginary part negated),
// so the dagger needs no extra registers.
117 #define gT00_re (+g00_re)
118 #define gT00_im (-g00_im)
119 #define gT01_re (+g10_re)
120 #define gT01_im (-g10_im)
121 #define gT02_re (+g20_re)
122 #define gT02_im (-g20_im)
123 #define gT10_re (+g01_re)
124 #define gT10_im (-g01_im)
125 #define gT11_re (+g11_re)
126 #define gT11_im (-g11_im)
127 #define gT12_re (+g21_re)
128 #define gT12_im (-g21_im)
129 #define gT20_re (+g02_re)
130 #define gT20_im (-g02_im)
131 #define gT21_re (+g12_re)
132 #define gT21_im (-g12_im)
133 #define gT22_re (+g22_re)
134 #define gT22_im (-g22_im)
135 
136 // output spinor
161 
// Per-architecture stride for shared-memory tiles; double precision uses
// half the stride of single.  NOTE(review): with
// DSLASH_SHARED_FLOATS_PER_THREAD == 0 this variant may not actually use
// shared memory -- confirm against the generator.
162 #ifdef SPINOR_DOUBLE
163 #if (__COMPUTE_CAPABILITY__ >= 200)
164 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
165 #else
166 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
167 #endif
168 #else
169 #if (__COMPUTE_CAPABILITY__ >= 200)
170 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
171 #else
172 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
173 #endif
174 #endif
175 
176 #include "read_gauge.h"
177 #include "io_spinor.h"
178 
// Half precision keeps a separate index into the ghost-zone norm array.
179 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
180 int sp_norm_idx;
181 #endif // MULTI_GPU half precision
182 
// Flatten the 2-D grid / 2-D block launch into one site index over the
// checkerboarded 4-d volume times the fifth (Ls) dimension; threads past
// the end of the lattice exit immediately.
183 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
184 if (sid >= param.threads*param.Ls) return;
185 
// NOTE(review): the interior-kernel branch head matching the '} else {'
// at line 221 below, and the declaration of boundaryCrossing, are on
// lines elided from this listing -- consult the full generated source.
187 
188 int X, x1, x2, x3, x4, xs;
189 
190 #ifdef MULTI_GPU
191 int face_idx;
193 #endif
194 
195 // Inline by hand for the moment and assume even dimensions
196 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);
197 
// Recover the full lexicographic site index X from the checkerboard index
// sid: boundaryCrossing counts the X-rows, XY-planes and XYZ-volumes
// preceding sid, whose combined parity (with param.parity) fixes the
// even/odd offset of this site (even lattice dimensions assumed, per the
// comment above).
198 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h);
199 
200 X = 2*sid + (boundaryCrossing + param.parity) % 2;
// Decompose X into 4-d coordinates x1..x4 plus the fifth coordinate xs.
201 x1 = X % X1;
202 x2 = (X/X1) % X2;
203 x3 = (X/(X1*X2)) % X3;
204 x4 = (X/(X1*X2*X3)) % X4;
205 xs = X/(X1*X2*X3*X4);
206 
// Zero the output-spinor accumulators (o## are declared in the elided
// "output spinor" section above); the direction hops below accumulate
// their projected contributions into them.
207  o00_re = 0; o00_im = 0;
208  o01_re = 0; o01_im = 0;
209  o02_re = 0; o02_im = 0;
210  o10_re = 0; o10_im = 0;
211  o11_re = 0; o11_im = 0;
212  o12_re = 0; o12_im = 0;
213  o20_re = 0; o20_im = 0;
214  o21_re = 0; o21_im = 0;
215  o22_re = 0; o22_im = 0;
216  o30_re = 0; o30_im = 0;
217  o31_re = 0; o31_im = 0;
218  o32_re = 0; o32_im = 0;
219 
220 #ifdef MULTI_GPU
// Exterior (boundary-update) kernel: derive the site coordinates from the
// face index instead, then seed o## with the partial result the interior
// kernel already stored, so the face hops accumulate onto it.
221 } else { // exterior kernel
222 
223 const int dim = static_cast<int>(kernel_type);
224 const int face_volume = (param.threads*param.Ls >> 1); // volume of one face
225 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
226 face_idx = sid - face_num*face_volume; // index into the respective face
227 
228 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
229 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
230 //sp_idx = face_idx + param.ghostOffset[dim];
231 
232 #if (DD_PREC==2) // half precision
233 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
234 #endif
235 
236 const int dims[] = {X1, X2, X3, X4};
237 coordsFromDW4DFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
238 
239 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h);
240 
241 X = 2*sid + (boundaryCrossing + param.parity) % 2;
242 
// Reload the partial output the interior kernel wrote for this site.
243 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
244  o00_re = i00_re; o00_im = i00_im;
245  o01_re = i01_re; o01_im = i01_im;
246  o02_re = i02_re; o02_im = i02_im;
247  o10_re = i10_re; o10_im = i10_im;
248  o11_re = i11_re; o11_im = i11_im;
249  o12_re = i12_re; o12_im = i12_im;
250  o20_re = i20_re; o20_im = i20_im;
251  o21_re = i21_re; o21_im = i21_im;
252  o22_re = i22_re; o22_im = i22_im;
253  o30_re = i30_re; o30_im = i30_im;
254  o31_re = i31_re; o31_im = i31_im;
255  o32_re = i32_re; o32_im = i32_im;
256 }
257 #endif // MULTI_GPU
258 
// Gauge-link registers are declared once up front; each direction hop
// below fills them with ASSN_GAUGE_MATRIX (plain assignment) rather than
// a READ_* macro, presumably because READ_* would declare them again in
// an inner scope.
259 // declare G## here and use ASSN below instead of READ
260 #ifdef GAUGE_FLOAT2
// NOTE(review): DD_PREC==0 appears to select double precision here, per
// the "temporal hack" remark -- confirm against the generator.
261 #if (DD_PREC==0) //temporal hack
262 double2 G0;
263 double2 G1;
264 double2 G2;
265 double2 G3;
266 double2 G4;
267 double2 G5;
268 double2 G6;
269 double2 G7;
270 double2 G8;
271 #else
272 float2 G0;
273 float2 G1;
274 float2 G2;
275 float2 G3;
276 float2 G4;
277 float2 G5;
278 float2 G6;
279 float2 G7;
280 float2 G8;
281 #endif
282 #else
// Packed float4 layout: five 4-component registers hold the link.
283 float4 G0;
284 float4 G1;
285 float4 G2;
286 float4 G3;
287 float4 G4;
288 #endif
289 
290 
291 
292 #ifdef MULTI_GPU
293 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
294  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
295 #endif
296 {
297  // Projector P0+
298  // 1 0 0 i
299  // 0 1 i 0
300  // 0 -i 1 0
301  // -i 0 0 1
302 
303 #ifdef MULTI_GPU
304  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
305  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
306 #else
307  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
308 #endif
309 
310  const int ga_idx = sid % Vh;
311 
312  // read gauge matrix from device memory
313  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride);
314 
321 
322 #ifdef MULTI_GPU
323  if (kernel_type == INTERIOR_KERNEL) {
324 #endif
325 
326  // read spinor from device memory
327  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
328 
329  // project spinor into half spinors
330  a0_re = +i00_re-i30_im;
331  a0_im = +i00_im+i30_re;
332  a1_re = +i01_re-i31_im;
333  a1_im = +i01_im+i31_re;
334  a2_re = +i02_re-i32_im;
335  a2_im = +i02_im+i32_re;
336  b0_re = +i10_re-i20_im;
337  b0_im = +i10_im+i20_re;
338  b1_re = +i11_re-i21_im;
339  b1_im = +i11_im+i21_re;
340  b2_re = +i12_re-i22_im;
341  b2_im = +i12_im+i22_re;
342 
343 #ifdef MULTI_GPU
344  } else {
345 
346  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
347 
348  // read half spinor from device memory
349  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
350 
351  a0_re = i00_re; a0_im = i00_im;
352  a1_re = i01_re; a1_im = i01_im;
353  a2_re = i02_re; a2_im = i02_im;
354  b0_re = i10_re; b0_im = i10_im;
355  b1_re = i11_re; b1_im = i11_im;
356  b2_re = i12_re; b2_im = i12_im;
357 
358  }
359 #endif // MULTI_GPU
360 
361  // reconstruct gauge matrix
363 
364  // multiply row 0
366  A0_re += g00_re * a0_re;
367  A0_re -= g00_im * a0_im;
368  A0_re += g01_re * a1_re;
369  A0_re -= g01_im * a1_im;
370  A0_re += g02_re * a2_re;
371  A0_re -= g02_im * a2_im;
373  A0_im += g00_re * a0_im;
374  A0_im += g00_im * a0_re;
375  A0_im += g01_re * a1_im;
376  A0_im += g01_im * a1_re;
377  A0_im += g02_re * a2_im;
378  A0_im += g02_im * a2_re;
380  B0_re += g00_re * b0_re;
381  B0_re -= g00_im * b0_im;
382  B0_re += g01_re * b1_re;
383  B0_re -= g01_im * b1_im;
384  B0_re += g02_re * b2_re;
385  B0_re -= g02_im * b2_im;
387  B0_im += g00_re * b0_im;
388  B0_im += g00_im * b0_re;
389  B0_im += g01_re * b1_im;
390  B0_im += g01_im * b1_re;
391  B0_im += g02_re * b2_im;
392  B0_im += g02_im * b2_re;
393 
394  // multiply row 1
396  A1_re += g10_re * a0_re;
397  A1_re -= g10_im * a0_im;
398  A1_re += g11_re * a1_re;
399  A1_re -= g11_im * a1_im;
400  A1_re += g12_re * a2_re;
401  A1_re -= g12_im * a2_im;
403  A1_im += g10_re * a0_im;
404  A1_im += g10_im * a0_re;
405  A1_im += g11_re * a1_im;
406  A1_im += g11_im * a1_re;
407  A1_im += g12_re * a2_im;
408  A1_im += g12_im * a2_re;
410  B1_re += g10_re * b0_re;
411  B1_re -= g10_im * b0_im;
412  B1_re += g11_re * b1_re;
413  B1_re -= g11_im * b1_im;
414  B1_re += g12_re * b2_re;
415  B1_re -= g12_im * b2_im;
417  B1_im += g10_re * b0_im;
418  B1_im += g10_im * b0_re;
419  B1_im += g11_re * b1_im;
420  B1_im += g11_im * b1_re;
421  B1_im += g12_re * b2_im;
422  B1_im += g12_im * b2_re;
423 
424  // multiply row 2
426  A2_re += g20_re * a0_re;
427  A2_re -= g20_im * a0_im;
428  A2_re += g21_re * a1_re;
429  A2_re -= g21_im * a1_im;
430  A2_re += g22_re * a2_re;
431  A2_re -= g22_im * a2_im;
433  A2_im += g20_re * a0_im;
434  A2_im += g20_im * a0_re;
435  A2_im += g21_re * a1_im;
436  A2_im += g21_im * a1_re;
437  A2_im += g22_re * a2_im;
438  A2_im += g22_im * a2_re;
440  B2_re += g20_re * b0_re;
441  B2_re -= g20_im * b0_im;
442  B2_re += g21_re * b1_re;
443  B2_re -= g21_im * b1_im;
444  B2_re += g22_re * b2_re;
445  B2_re -= g22_im * b2_im;
447  B2_im += g20_re * b0_im;
448  B2_im += g20_im * b0_re;
449  B2_im += g21_re * b1_im;
450  B2_im += g21_im * b1_re;
451  B2_im += g22_re * b2_im;
452  B2_im += g22_im * b2_re;
453 
454  o00_re += A0_re;
455  o00_im += A0_im;
456  o10_re += B0_re;
457  o10_im += B0_im;
458  o20_re += B0_im;
459  o20_im -= B0_re;
460  o30_re += A0_im;
461  o30_im -= A0_re;
462 
463  o01_re += A1_re;
464  o01_im += A1_im;
465  o11_re += B1_re;
466  o11_im += B1_im;
467  o21_re += B1_im;
468  o21_im -= B1_re;
469  o31_re += A1_im;
470  o31_im -= A1_re;
471 
472  o02_re += A2_re;
473  o02_im += A2_im;
474  o12_re += B2_re;
475  o12_im += B2_im;
476  o22_re += B2_im;
477  o22_im -= B2_re;
478  o32_re += A2_im;
479  o32_im -= A2_re;
480 }
481 
482 #ifdef MULTI_GPU
483 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
484  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
485 #endif
486 {
487  // Projector P0-
488  // 1 0 0 -i
489  // 0 1 -i 0
490  // 0 i 1 0
491  // i 0 0 1
492 
493 #ifdef MULTI_GPU
494  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
495  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
496 #else
497  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
498 #endif
499 
500 #ifdef MULTI_GPU
501  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
502 #else
503  const int ga_idx = sp_idx % Vh;
504 #endif
505 
506  // read gauge matrix from device memory
507  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride);
508 
515 
516 #ifdef MULTI_GPU
517  if (kernel_type == INTERIOR_KERNEL) {
518 #endif
519 
520  // read spinor from device memory
521  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
522 
523  // project spinor into half spinors
524  a0_re = +i00_re+i30_im;
525  a0_im = +i00_im-i30_re;
526  a1_re = +i01_re+i31_im;
527  a1_im = +i01_im-i31_re;
528  a2_re = +i02_re+i32_im;
529  a2_im = +i02_im-i32_re;
530  b0_re = +i10_re+i20_im;
531  b0_im = +i10_im-i20_re;
532  b1_re = +i11_re+i21_im;
533  b1_im = +i11_im-i21_re;
534  b2_re = +i12_re+i22_im;
535  b2_im = +i12_im-i22_re;
536 
537 #ifdef MULTI_GPU
538  } else {
539 
540  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
541 
542  // read half spinor from device memory
543  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
544 
545  a0_re = i00_re; a0_im = i00_im;
546  a1_re = i01_re; a1_im = i01_im;
547  a2_re = i02_re; a2_im = i02_im;
548  b0_re = i10_re; b0_im = i10_im;
549  b1_re = i11_re; b1_im = i11_im;
550  b2_re = i12_re; b2_im = i12_im;
551 
552  }
553 #endif // MULTI_GPU
554 
555  // reconstruct gauge matrix
557 
558  // multiply row 0
559  spinorFloat A0_re = 0;
560  A0_re += gT00_re * a0_re;
561  A0_re -= gT00_im * a0_im;
562  A0_re += gT01_re * a1_re;
563  A0_re -= gT01_im * a1_im;
564  A0_re += gT02_re * a2_re;
565  A0_re -= gT02_im * a2_im;
566  spinorFloat A0_im = 0;
567  A0_im += gT00_re * a0_im;
568  A0_im += gT00_im * a0_re;
569  A0_im += gT01_re * a1_im;
570  A0_im += gT01_im * a1_re;
571  A0_im += gT02_re * a2_im;
572  A0_im += gT02_im * a2_re;
573  spinorFloat B0_re = 0;
574  B0_re += gT00_re * b0_re;
575  B0_re -= gT00_im * b0_im;
576  B0_re += gT01_re * b1_re;
577  B0_re -= gT01_im * b1_im;
578  B0_re += gT02_re * b2_re;
579  B0_re -= gT02_im * b2_im;
580  spinorFloat B0_im = 0;
581  B0_im += gT00_re * b0_im;
582  B0_im += gT00_im * b0_re;
583  B0_im += gT01_re * b1_im;
584  B0_im += gT01_im * b1_re;
585  B0_im += gT02_re * b2_im;
586  B0_im += gT02_im * b2_re;
587 
588  // multiply row 1
589  spinorFloat A1_re = 0;
590  A1_re += gT10_re * a0_re;
591  A1_re -= gT10_im * a0_im;
592  A1_re += gT11_re * a1_re;
593  A1_re -= gT11_im * a1_im;
594  A1_re += gT12_re * a2_re;
595  A1_re -= gT12_im * a2_im;
596  spinorFloat A1_im = 0;
597  A1_im += gT10_re * a0_im;
598  A1_im += gT10_im * a0_re;
599  A1_im += gT11_re * a1_im;
600  A1_im += gT11_im * a1_re;
601  A1_im += gT12_re * a2_im;
602  A1_im += gT12_im * a2_re;
603  spinorFloat B1_re = 0;
604  B1_re += gT10_re * b0_re;
605  B1_re -= gT10_im * b0_im;
606  B1_re += gT11_re * b1_re;
607  B1_re -= gT11_im * b1_im;
608  B1_re += gT12_re * b2_re;
609  B1_re -= gT12_im * b2_im;
610  spinorFloat B1_im = 0;
611  B1_im += gT10_re * b0_im;
612  B1_im += gT10_im * b0_re;
613  B1_im += gT11_re * b1_im;
614  B1_im += gT11_im * b1_re;
615  B1_im += gT12_re * b2_im;
616  B1_im += gT12_im * b2_re;
617 
618  // multiply row 2
619  spinorFloat A2_re = 0;
620  A2_re += gT20_re * a0_re;
621  A2_re -= gT20_im * a0_im;
622  A2_re += gT21_re * a1_re;
623  A2_re -= gT21_im * a1_im;
624  A2_re += gT22_re * a2_re;
625  A2_re -= gT22_im * a2_im;
626  spinorFloat A2_im = 0;
627  A2_im += gT20_re * a0_im;
628  A2_im += gT20_im * a0_re;
629  A2_im += gT21_re * a1_im;
630  A2_im += gT21_im * a1_re;
631  A2_im += gT22_re * a2_im;
632  A2_im += gT22_im * a2_re;
633  spinorFloat B2_re = 0;
634  B2_re += gT20_re * b0_re;
635  B2_re -= gT20_im * b0_im;
636  B2_re += gT21_re * b1_re;
637  B2_re -= gT21_im * b1_im;
638  B2_re += gT22_re * b2_re;
639  B2_re -= gT22_im * b2_im;
640  spinorFloat B2_im = 0;
641  B2_im += gT20_re * b0_im;
642  B2_im += gT20_im * b0_re;
643  B2_im += gT21_re * b1_im;
644  B2_im += gT21_im * b1_re;
645  B2_im += gT22_re * b2_im;
646  B2_im += gT22_im * b2_re;
647 
648  o00_re += A0_re;
649  o00_im += A0_im;
650  o10_re += B0_re;
651  o10_im += B0_im;
652  o20_re -= B0_im;
653  o20_im += B0_re;
654  o30_re -= A0_im;
655  o30_im += A0_re;
656 
657  o01_re += A1_re;
658  o01_im += A1_im;
659  o11_re += B1_re;
660  o11_im += B1_im;
661  o21_re -= B1_im;
662  o21_im += B1_re;
663  o31_re -= A1_im;
664  o31_im += A1_re;
665 
666  o02_re += A2_re;
667  o02_im += A2_im;
668  o12_re += B2_re;
669  o12_im += B2_im;
670  o22_re -= B2_im;
671  o22_im += B2_re;
672  o32_re -= A2_im;
673  o32_im += A2_re;
674 }
675 
676 #ifdef MULTI_GPU
677 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
678  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
679 #endif
680 {
681  // Projector P1+
682  // 1 0 0 1
683  // 0 1 -1 0
684  // 0 -1 1 0
685  // 1 0 0 1
686 
687 #ifdef MULTI_GPU
688  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
689  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
690 #else
691  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
692 #endif
693 
694  const int ga_idx = sid % Vh;
695 
696  // read gauge matrix from device memory
697  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride);
698 
705 
706 #ifdef MULTI_GPU
707  if (kernel_type == INTERIOR_KERNEL) {
708 #endif
709 
710  // read spinor from device memory
711  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
712 
713  // project spinor into half spinors
714  a0_re = +i00_re+i30_re;
715  a0_im = +i00_im+i30_im;
716  a1_re = +i01_re+i31_re;
717  a1_im = +i01_im+i31_im;
718  a2_re = +i02_re+i32_re;
719  a2_im = +i02_im+i32_im;
720  b0_re = +i10_re-i20_re;
721  b0_im = +i10_im-i20_im;
722  b1_re = +i11_re-i21_re;
723  b1_im = +i11_im-i21_im;
724  b2_re = +i12_re-i22_re;
725  b2_im = +i12_im-i22_im;
726 
727 #ifdef MULTI_GPU
728  } else {
729 
730  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
731 
732  // read half spinor from device memory
733  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
734 
735  a0_re = i00_re; a0_im = i00_im;
736  a1_re = i01_re; a1_im = i01_im;
737  a2_re = i02_re; a2_im = i02_im;
738  b0_re = i10_re; b0_im = i10_im;
739  b1_re = i11_re; b1_im = i11_im;
740  b2_re = i12_re; b2_im = i12_im;
741 
742  }
743 #endif // MULTI_GPU
744 
745  // reconstruct gauge matrix
747 
748  // multiply row 0
749  spinorFloat A0_re = 0;
750  A0_re += g00_re * a0_re;
751  A0_re -= g00_im * a0_im;
752  A0_re += g01_re * a1_re;
753  A0_re -= g01_im * a1_im;
754  A0_re += g02_re * a2_re;
755  A0_re -= g02_im * a2_im;
756  spinorFloat A0_im = 0;
757  A0_im += g00_re * a0_im;
758  A0_im += g00_im * a0_re;
759  A0_im += g01_re * a1_im;
760  A0_im += g01_im * a1_re;
761  A0_im += g02_re * a2_im;
762  A0_im += g02_im * a2_re;
763  spinorFloat B0_re = 0;
764  B0_re += g00_re * b0_re;
765  B0_re -= g00_im * b0_im;
766  B0_re += g01_re * b1_re;
767  B0_re -= g01_im * b1_im;
768  B0_re += g02_re * b2_re;
769  B0_re -= g02_im * b2_im;
770  spinorFloat B0_im = 0;
771  B0_im += g00_re * b0_im;
772  B0_im += g00_im * b0_re;
773  B0_im += g01_re * b1_im;
774  B0_im += g01_im * b1_re;
775  B0_im += g02_re * b2_im;
776  B0_im += g02_im * b2_re;
777 
778  // multiply row 1
779  spinorFloat A1_re = 0;
780  A1_re += g10_re * a0_re;
781  A1_re -= g10_im * a0_im;
782  A1_re += g11_re * a1_re;
783  A1_re -= g11_im * a1_im;
784  A1_re += g12_re * a2_re;
785  A1_re -= g12_im * a2_im;
786  spinorFloat A1_im = 0;
787  A1_im += g10_re * a0_im;
788  A1_im += g10_im * a0_re;
789  A1_im += g11_re * a1_im;
790  A1_im += g11_im * a1_re;
791  A1_im += g12_re * a2_im;
792  A1_im += g12_im * a2_re;
793  spinorFloat B1_re = 0;
794  B1_re += g10_re * b0_re;
795  B1_re -= g10_im * b0_im;
796  B1_re += g11_re * b1_re;
797  B1_re -= g11_im * b1_im;
798  B1_re += g12_re * b2_re;
799  B1_re -= g12_im * b2_im;
800  spinorFloat B1_im = 0;
801  B1_im += g10_re * b0_im;
802  B1_im += g10_im * b0_re;
803  B1_im += g11_re * b1_im;
804  B1_im += g11_im * b1_re;
805  B1_im += g12_re * b2_im;
806  B1_im += g12_im * b2_re;
807 
808  // multiply row 2
809  spinorFloat A2_re = 0;
810  A2_re += g20_re * a0_re;
811  A2_re -= g20_im * a0_im;
812  A2_re += g21_re * a1_re;
813  A2_re -= g21_im * a1_im;
814  A2_re += g22_re * a2_re;
815  A2_re -= g22_im * a2_im;
816  spinorFloat A2_im = 0;
817  A2_im += g20_re * a0_im;
818  A2_im += g20_im * a0_re;
819  A2_im += g21_re * a1_im;
820  A2_im += g21_im * a1_re;
821  A2_im += g22_re * a2_im;
822  A2_im += g22_im * a2_re;
823  spinorFloat B2_re = 0;
824  B2_re += g20_re * b0_re;
825  B2_re -= g20_im * b0_im;
826  B2_re += g21_re * b1_re;
827  B2_re -= g21_im * b1_im;
828  B2_re += g22_re * b2_re;
829  B2_re -= g22_im * b2_im;
830  spinorFloat B2_im = 0;
831  B2_im += g20_re * b0_im;
832  B2_im += g20_im * b0_re;
833  B2_im += g21_re * b1_im;
834  B2_im += g21_im * b1_re;
835  B2_im += g22_re * b2_im;
836  B2_im += g22_im * b2_re;
837 
838  o00_re += A0_re;
839  o00_im += A0_im;
840  o10_re += B0_re;
841  o10_im += B0_im;
842  o20_re -= B0_re;
843  o20_im -= B0_im;
844  o30_re += A0_re;
845  o30_im += A0_im;
846 
847  o01_re += A1_re;
848  o01_im += A1_im;
849  o11_re += B1_re;
850  o11_im += B1_im;
851  o21_re -= B1_re;
852  o21_im -= B1_im;
853  o31_re += A1_re;
854  o31_im += A1_im;
855 
856  o02_re += A2_re;
857  o02_im += A2_im;
858  o12_re += B2_re;
859  o12_im += B2_im;
860  o22_re -= B2_re;
861  o22_im -= B2_im;
862  o32_re += A2_re;
863  o32_im += A2_im;
864 }
865 
866 #ifdef MULTI_GPU
867 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
868  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
869 #endif
870 {
871  // Projector P1-
872  // 1 0 0 -1
873  // 0 1 1 0
874  // 0 1 1 0
875  // -1 0 0 1
876 
877 #ifdef MULTI_GPU
878  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
879  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
880 #else
881  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
882 #endif
883 
884 #ifdef MULTI_GPU
885  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
886 #else
887  const int ga_idx = sp_idx % Vh;
888 #endif
889 
890  // read gauge matrix from device memory
891  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride);
892 
899 
900 #ifdef MULTI_GPU
901  if (kernel_type == INTERIOR_KERNEL) {
902 #endif
903 
904  // read spinor from device memory
905  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
906 
907  // project spinor into half spinors
908  a0_re = +i00_re-i30_re;
909  a0_im = +i00_im-i30_im;
910  a1_re = +i01_re-i31_re;
911  a1_im = +i01_im-i31_im;
912  a2_re = +i02_re-i32_re;
913  a2_im = +i02_im-i32_im;
914  b0_re = +i10_re+i20_re;
915  b0_im = +i10_im+i20_im;
916  b1_re = +i11_re+i21_re;
917  b1_im = +i11_im+i21_im;
918  b2_re = +i12_re+i22_re;
919  b2_im = +i12_im+i22_im;
920 
921 #ifdef MULTI_GPU
922  } else {
923 
924  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
925 
926  // read half spinor from device memory
927  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
928 
929  a0_re = i00_re; a0_im = i00_im;
930  a1_re = i01_re; a1_im = i01_im;
931  a2_re = i02_re; a2_im = i02_im;
932  b0_re = i10_re; b0_im = i10_im;
933  b1_re = i11_re; b1_im = i11_im;
934  b2_re = i12_re; b2_im = i12_im;
935 
936  }
937 #endif // MULTI_GPU
938 
939  // reconstruct gauge matrix
941 
942  // multiply row 0
943  spinorFloat A0_re = 0;
944  A0_re += gT00_re * a0_re;
945  A0_re -= gT00_im * a0_im;
946  A0_re += gT01_re * a1_re;
947  A0_re -= gT01_im * a1_im;
948  A0_re += gT02_re * a2_re;
949  A0_re -= gT02_im * a2_im;
950  spinorFloat A0_im = 0;
951  A0_im += gT00_re * a0_im;
952  A0_im += gT00_im * a0_re;
953  A0_im += gT01_re * a1_im;
954  A0_im += gT01_im * a1_re;
955  A0_im += gT02_re * a2_im;
956  A0_im += gT02_im * a2_re;
957  spinorFloat B0_re = 0;
958  B0_re += gT00_re * b0_re;
959  B0_re -= gT00_im * b0_im;
960  B0_re += gT01_re * b1_re;
961  B0_re -= gT01_im * b1_im;
962  B0_re += gT02_re * b2_re;
963  B0_re -= gT02_im * b2_im;
964  spinorFloat B0_im = 0;
965  B0_im += gT00_re * b0_im;
966  B0_im += gT00_im * b0_re;
967  B0_im += gT01_re * b1_im;
968  B0_im += gT01_im * b1_re;
969  B0_im += gT02_re * b2_im;
970  B0_im += gT02_im * b2_re;
971 
972  // multiply row 1
973  spinorFloat A1_re = 0;
974  A1_re += gT10_re * a0_re;
975  A1_re -= gT10_im * a0_im;
976  A1_re += gT11_re * a1_re;
977  A1_re -= gT11_im * a1_im;
978  A1_re += gT12_re * a2_re;
979  A1_re -= gT12_im * a2_im;
980  spinorFloat A1_im = 0;
981  A1_im += gT10_re * a0_im;
982  A1_im += gT10_im * a0_re;
983  A1_im += gT11_re * a1_im;
984  A1_im += gT11_im * a1_re;
985  A1_im += gT12_re * a2_im;
986  A1_im += gT12_im * a2_re;
987  spinorFloat B1_re = 0;
988  B1_re += gT10_re * b0_re;
989  B1_re -= gT10_im * b0_im;
990  B1_re += gT11_re * b1_re;
991  B1_re -= gT11_im * b1_im;
992  B1_re += gT12_re * b2_re;
993  B1_re -= gT12_im * b2_im;
994  spinorFloat B1_im = 0;
995  B1_im += gT10_re * b0_im;
996  B1_im += gT10_im * b0_re;
997  B1_im += gT11_re * b1_im;
998  B1_im += gT11_im * b1_re;
999  B1_im += gT12_re * b2_im;
1000  B1_im += gT12_im * b2_re;
1001 
1002  // multiply row 2
1003  spinorFloat A2_re = 0;
1004  A2_re += gT20_re * a0_re;
1005  A2_re -= gT20_im * a0_im;
1006  A2_re += gT21_re * a1_re;
1007  A2_re -= gT21_im * a1_im;
1008  A2_re += gT22_re * a2_re;
1009  A2_re -= gT22_im * a2_im;
1010  spinorFloat A2_im = 0;
1011  A2_im += gT20_re * a0_im;
1012  A2_im += gT20_im * a0_re;
1013  A2_im += gT21_re * a1_im;
1014  A2_im += gT21_im * a1_re;
1015  A2_im += gT22_re * a2_im;
1016  A2_im += gT22_im * a2_re;
1017  spinorFloat B2_re = 0;
1018  B2_re += gT20_re * b0_re;
1019  B2_re -= gT20_im * b0_im;
1020  B2_re += gT21_re * b1_re;
1021  B2_re -= gT21_im * b1_im;
1022  B2_re += gT22_re * b2_re;
1023  B2_re -= gT22_im * b2_im;
1024  spinorFloat B2_im = 0;
1025  B2_im += gT20_re * b0_im;
1026  B2_im += gT20_im * b0_re;
1027  B2_im += gT21_re * b1_im;
1028  B2_im += gT21_im * b1_re;
1029  B2_im += gT22_re * b2_im;
1030  B2_im += gT22_im * b2_re;
1031 
1032  o00_re += A0_re;
1033  o00_im += A0_im;
1034  o10_re += B0_re;
1035  o10_im += B0_im;
1036  o20_re += B0_re;
1037  o20_im += B0_im;
1038  o30_re -= A0_re;
1039  o30_im -= A0_im;
1040 
1041  o01_re += A1_re;
1042  o01_im += A1_im;
1043  o11_re += B1_re;
1044  o11_im += B1_im;
1045  o21_re += B1_re;
1046  o21_im += B1_im;
1047  o31_re -= A1_re;
1048  o31_im -= A1_im;
1049 
1050  o02_re += A2_re;
1051  o02_im += A2_im;
1052  o12_re += B2_re;
1053  o12_im += B2_im;
1054  o22_re += B2_re;
1055  o22_im += B2_im;
1056  o32_re -= A2_re;
1057  o32_im -= A2_im;
1058 }
1059 
1060 #ifdef MULTI_GPU
1061 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1062  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1063 #endif
1064 {
1065  // Projector P2+
1066  // 1 0 i 0
1067  // 0 1 0 -i
1068  // -i 0 1 0
1069  // 0 i 0 1
1070 
1071 #ifdef MULTI_GPU
1072  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1073  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1074 #else
1075  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1076 #endif
1077 
1078  const int ga_idx = sid % Vh;
1079 
1080  // read gauge matrix from device memory
1081  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride);
1082 
1089 
1090 #ifdef MULTI_GPU
1091  if (kernel_type == INTERIOR_KERNEL) {
1092 #endif
1093 
1094  // read spinor from device memory
1095  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1096 
1097  // project spinor into half spinors
1098  a0_re = +i00_re-i20_im;
1099  a0_im = +i00_im+i20_re;
1100  a1_re = +i01_re-i21_im;
1101  a1_im = +i01_im+i21_re;
1102  a2_re = +i02_re-i22_im;
1103  a2_im = +i02_im+i22_re;
1104  b0_re = +i10_re+i30_im;
1105  b0_im = +i10_im-i30_re;
1106  b1_re = +i11_re+i31_im;
1107  b1_im = +i11_im-i31_re;
1108  b2_re = +i12_re+i32_im;
1109  b2_im = +i12_im-i32_re;
1110 
1111 #ifdef MULTI_GPU
1112  } else {
1113 
1114  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1115 
1116  // read half spinor from device memory
1117  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1118 
1119  a0_re = i00_re; a0_im = i00_im;
1120  a1_re = i01_re; a1_im = i01_im;
1121  a2_re = i02_re; a2_im = i02_im;
1122  b0_re = i10_re; b0_im = i10_im;
1123  b1_re = i11_re; b1_im = i11_im;
1124  b2_re = i12_re; b2_im = i12_im;
1125 
1126  }
1127 #endif // MULTI_GPU
1128 
1129  // reconstruct gauge matrix
1131 
1132  // multiply row 0
1133  spinorFloat A0_re = 0;
1134  A0_re += g00_re * a0_re;
1135  A0_re -= g00_im * a0_im;
1136  A0_re += g01_re * a1_re;
1137  A0_re -= g01_im * a1_im;
1138  A0_re += g02_re * a2_re;
1139  A0_re -= g02_im * a2_im;
1140  spinorFloat A0_im = 0;
1141  A0_im += g00_re * a0_im;
1142  A0_im += g00_im * a0_re;
1143  A0_im += g01_re * a1_im;
1144  A0_im += g01_im * a1_re;
1145  A0_im += g02_re * a2_im;
1146  A0_im += g02_im * a2_re;
1147  spinorFloat B0_re = 0;
1148  B0_re += g00_re * b0_re;
1149  B0_re -= g00_im * b0_im;
1150  B0_re += g01_re * b1_re;
1151  B0_re -= g01_im * b1_im;
1152  B0_re += g02_re * b2_re;
1153  B0_re -= g02_im * b2_im;
1154  spinorFloat B0_im = 0;
1155  B0_im += g00_re * b0_im;
1156  B0_im += g00_im * b0_re;
1157  B0_im += g01_re * b1_im;
1158  B0_im += g01_im * b1_re;
1159  B0_im += g02_re * b2_im;
1160  B0_im += g02_im * b2_re;
1161 
1162  // multiply row 1
// NOTE(review): this is the tail of a directional-hop contribution whose opening
// (projector comment, gauge-link load, and the row-0 multiply producing A0/B0)
// lies above this chunk — presumably one of the spatial hops using the
// un-daggered link g[ij]; confirm against the full generated file.
// A1/B1 below are row 1 of the complex 3x3 link matrix applied to the two
// projected half-spinors a(=a0,a1,a2) and b(=b0,b1,b2).
1163  spinorFloat A1_re = 0;
1164  A1_re += g10_re * a0_re;
1165  A1_re -= g10_im * a0_im;
1166  A1_re += g11_re * a1_re;
1167  A1_re -= g11_im * a1_im;
1168  A1_re += g12_re * a2_re;
1169  A1_re -= g12_im * a2_im;
1170  spinorFloat A1_im = 0;
1171  A1_im += g10_re * a0_im;
1172  A1_im += g10_im * a0_re;
1173  A1_im += g11_re * a1_im;
1174  A1_im += g11_im * a1_re;
1175  A1_im += g12_re * a2_im;
1176  A1_im += g12_im * a2_re;
1177  spinorFloat B1_re = 0;
1178  B1_re += g10_re * b0_re;
1179  B1_re -= g10_im * b0_im;
1180  B1_re += g11_re * b1_re;
1181  B1_re -= g11_im * b1_im;
1182  B1_re += g12_re * b2_re;
1183  B1_re -= g12_im * b2_im;
1184  spinorFloat B1_im = 0;
1185  B1_im += g10_re * b0_im;
1186  B1_im += g10_im * b0_re;
1187  B1_im += g11_re * b1_im;
1188  B1_im += g11_im * b1_re;
1189  B1_im += g12_re * b2_im;
1190  B1_im += g12_im * b2_re;
1191 
1192  // multiply row 2
1193  spinorFloat A2_re = 0;
1194  A2_re += g20_re * a0_re;
1195  A2_re -= g20_im * a0_im;
1196  A2_re += g21_re * a1_re;
1197  A2_re -= g21_im * a1_im;
1198  A2_re += g22_re * a2_re;
1199  A2_re -= g22_im * a2_im;
1200  spinorFloat A2_im = 0;
1201  A2_im += g20_re * a0_im;
1202  A2_im += g20_im * a0_re;
1203  A2_im += g21_re * a1_im;
1204  A2_im += g21_im * a1_re;
1205  A2_im += g22_re * a2_im;
1206  A2_im += g22_im * a2_re;
1207  spinorFloat B2_re = 0;
1208  B2_re += g20_re * b0_re;
1209  B2_re -= g20_im * b0_im;
1210  B2_re += g21_re * b1_re;
1211  B2_re -= g21_im * b1_im;
1212  B2_re += g22_re * b2_re;
1213  B2_re -= g22_im * b2_im;
1214  spinorFloat B2_im = 0;
1215  B2_im += g20_re * b0_im;
1216  B2_im += g20_im * b0_re;
1217  B2_im += g21_re * b1_im;
1218  B2_im += g21_im * b1_re;
1219  B2_im += g22_re * b2_im;
1220  B2_im += g22_im * b2_re;
1221 
// Accumulate into the output spinor. The upper spin components (o0x, o1x)
// receive A/B directly; the lower components receive the reconstructed
// projector mixing: o2x += -i*A (re += A_im, im -= A_re) and
// o3x += +i*B (re -= B_im, im += B_re), as the sign pattern below shows.
1222  o00_re += A0_re;
1223  o00_im += A0_im;
1224  o10_re += B0_re;
1225  o10_im += B0_im;
1226  o20_re += A0_im;
1227  o20_im -= A0_re;
1228  o30_re -= B0_im;
1229  o30_im += B0_re;
1230 
1231  o01_re += A1_re;
1232  o01_im += A1_im;
1233  o11_re += B1_re;
1234  o11_im += B1_im;
1235  o21_re += A1_im;
1236  o21_im -= A1_re;
1237  o31_re -= B1_im;
1238  o31_im += B1_re;
1239 
1240  o02_re += A2_re;
1241  o02_im += A2_im;
1242  o12_re += B2_re;
1243  o12_im += B2_im;
1244  o22_re += A2_im;
1245  o22_im -= A2_re;
1246  o32_re -= B2_im;
1247  o32_im += B2_re;
1248 }
1249 
1250 #ifdef MULTI_GPU
1251 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1252  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1253 #endif
1254 {
1255  // Projector P2-
1256  // 1 0 -i 0
1257  // 0 1 0 i
1258  // i 0 1 0
1259  // 0 -i 0 1
1260 
1261 #ifdef MULTI_GPU
1262  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1263  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1264 #else
1265  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1266 #endif
1267 
1268 #ifdef MULTI_GPU
1269  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1270 #else
1271  const int ga_idx = sp_idx % Vh;
1272 #endif
1273 
1274  // read gauge matrix from device memory
1275  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride);
1276 
1283 
1284 #ifdef MULTI_GPU
1285  if (kernel_type == INTERIOR_KERNEL) {
1286 #endif
1287 
1288  // read spinor from device memory
1289  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1290 
1291  // project spinor into half spinors
1292  a0_re = +i00_re+i20_im;
1293  a0_im = +i00_im-i20_re;
1294  a1_re = +i01_re+i21_im;
1295  a1_im = +i01_im-i21_re;
1296  a2_re = +i02_re+i22_im;
1297  a2_im = +i02_im-i22_re;
1298  b0_re = +i10_re-i30_im;
1299  b0_im = +i10_im+i30_re;
1300  b1_re = +i11_re-i31_im;
1301  b1_im = +i11_im+i31_re;
1302  b2_re = +i12_re-i32_im;
1303  b2_im = +i12_im+i32_re;
1304 
1305 #ifdef MULTI_GPU
1306  } else {
1307 
1308  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1309 
1310  // read half spinor from device memory
1311  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1312 
1313  a0_re = i00_re; a0_im = i00_im;
1314  a1_re = i01_re; a1_im = i01_im;
1315  a2_re = i02_re; a2_im = i02_im;
1316  b0_re = i10_re; b0_im = i10_im;
1317  b1_re = i11_re; b1_im = i11_im;
1318  b2_re = i12_re; b2_im = i12_im;
1319 
1320  }
1321 #endif // MULTI_GPU
1322 
1323  // reconstruct gauge matrix
1325 
1326  // multiply row 0
1327  spinorFloat A0_re = 0;
1328  A0_re += gT00_re * a0_re;
1329  A0_re -= gT00_im * a0_im;
1330  A0_re += gT01_re * a1_re;
1331  A0_re -= gT01_im * a1_im;
1332  A0_re += gT02_re * a2_re;
1333  A0_re -= gT02_im * a2_im;
1334  spinorFloat A0_im = 0;
1335  A0_im += gT00_re * a0_im;
1336  A0_im += gT00_im * a0_re;
1337  A0_im += gT01_re * a1_im;
1338  A0_im += gT01_im * a1_re;
1339  A0_im += gT02_re * a2_im;
1340  A0_im += gT02_im * a2_re;
1341  spinorFloat B0_re = 0;
1342  B0_re += gT00_re * b0_re;
1343  B0_re -= gT00_im * b0_im;
1344  B0_re += gT01_re * b1_re;
1345  B0_re -= gT01_im * b1_im;
1346  B0_re += gT02_re * b2_re;
1347  B0_re -= gT02_im * b2_im;
1348  spinorFloat B0_im = 0;
1349  B0_im += gT00_re * b0_im;
1350  B0_im += gT00_im * b0_re;
1351  B0_im += gT01_re * b1_im;
1352  B0_im += gT01_im * b1_re;
1353  B0_im += gT02_re * b2_im;
1354  B0_im += gT02_im * b2_re;
1355 
1356  // multiply row 1
1357  spinorFloat A1_re = 0;
1358  A1_re += gT10_re * a0_re;
1359  A1_re -= gT10_im * a0_im;
1360  A1_re += gT11_re * a1_re;
1361  A1_re -= gT11_im * a1_im;
1362  A1_re += gT12_re * a2_re;
1363  A1_re -= gT12_im * a2_im;
1364  spinorFloat A1_im = 0;
1365  A1_im += gT10_re * a0_im;
1366  A1_im += gT10_im * a0_re;
1367  A1_im += gT11_re * a1_im;
1368  A1_im += gT11_im * a1_re;
1369  A1_im += gT12_re * a2_im;
1370  A1_im += gT12_im * a2_re;
1371  spinorFloat B1_re = 0;
1372  B1_re += gT10_re * b0_re;
1373  B1_re -= gT10_im * b0_im;
1374  B1_re += gT11_re * b1_re;
1375  B1_re -= gT11_im * b1_im;
1376  B1_re += gT12_re * b2_re;
1377  B1_re -= gT12_im * b2_im;
1378  spinorFloat B1_im = 0;
1379  B1_im += gT10_re * b0_im;
1380  B1_im += gT10_im * b0_re;
1381  B1_im += gT11_re * b1_im;
1382  B1_im += gT11_im * b1_re;
1383  B1_im += gT12_re * b2_im;
1384  B1_im += gT12_im * b2_re;
1385 
1386  // multiply row 2
1387  spinorFloat A2_re = 0;
1388  A2_re += gT20_re * a0_re;
1389  A2_re -= gT20_im * a0_im;
1390  A2_re += gT21_re * a1_re;
1391  A2_re -= gT21_im * a1_im;
1392  A2_re += gT22_re * a2_re;
1393  A2_re -= gT22_im * a2_im;
1394  spinorFloat A2_im = 0;
1395  A2_im += gT20_re * a0_im;
1396  A2_im += gT20_im * a0_re;
1397  A2_im += gT21_re * a1_im;
1398  A2_im += gT21_im * a1_re;
1399  A2_im += gT22_re * a2_im;
1400  A2_im += gT22_im * a2_re;
1401  spinorFloat B2_re = 0;
1402  B2_re += gT20_re * b0_re;
1403  B2_re -= gT20_im * b0_im;
1404  B2_re += gT21_re * b1_re;
1405  B2_re -= gT21_im * b1_im;
1406  B2_re += gT22_re * b2_re;
1407  B2_re -= gT22_im * b2_im;
1408  spinorFloat B2_im = 0;
1409  B2_im += gT20_re * b0_im;
1410  B2_im += gT20_im * b0_re;
1411  B2_im += gT21_re * b1_im;
1412  B2_im += gT21_im * b1_re;
1413  B2_im += gT22_re * b2_im;
1414  B2_im += gT22_im * b2_re;
1415 
1416  o00_re += A0_re;
1417  o00_im += A0_im;
1418  o10_re += B0_re;
1419  o10_im += B0_im;
1420  o20_re -= A0_im;
1421  o20_im += A0_re;
1422  o30_re += B0_im;
1423  o30_im -= B0_re;
1424 
1425  o01_re += A1_re;
1426  o01_im += A1_im;
1427  o11_re += B1_re;
1428  o11_im += B1_im;
1429  o21_re -= A1_im;
1430  o21_im += A1_re;
1431  o31_re += B1_im;
1432  o31_im -= B1_re;
1433 
1434  o02_re += A2_re;
1435  o02_im += A2_im;
1436  o12_re += B2_re;
1437  o12_im += B2_im;
1438  o22_re -= A2_im;
1439  o22_im += A2_re;
1440  o32_re += B2_im;
1441  o32_im -= B2_re;
1442 }
1443 
1444 #ifdef MULTI_GPU
1445 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1446  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1447 #endif
1448 {
1449  // Projector P3+
1450  // 2 0 0 0
1451  // 0 2 0 0
1452  // 0 0 0 0
1453  // 0 0 0 0
1454 
1455 #ifdef MULTI_GPU
1456  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1457  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1458 #else
1459  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1460 #endif
1461 
1462  const int ga_idx = sid % Vh;
1463 
1465  {
1472 
1473 #ifdef MULTI_GPU
1474  if (kernel_type == INTERIOR_KERNEL) {
1475 #endif
1476 
1477  // read spinor from device memory
1478  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1479 
1480  // project spinor into half spinors
1481  a0_re = +2*i00_re;
1482  a0_im = +2*i00_im;
1483  a1_re = +2*i01_re;
1484  a1_im = +2*i01_im;
1485  a2_re = +2*i02_re;
1486  a2_im = +2*i02_im;
1487  b0_re = +2*i10_re;
1488  b0_im = +2*i10_im;
1489  b1_re = +2*i11_re;
1490  b1_im = +2*i11_im;
1491  b2_re = +2*i12_re;
1492  b2_im = +2*i12_im;
1493 
1494 #ifdef MULTI_GPU
1495  } else {
1496 
1497  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1498  const int t_proj_scale = TPROJSCALE;
1499 
1500  // read half spinor from device memory
1501  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1502 
1503  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1504  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1505  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1506  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1507  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1508  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1509 
1510  }
1511 #endif // MULTI_GPU
1512 
1513  // identity gauge matrix
1520 
1521  o00_re += A0_re;
1522  o00_im += A0_im;
1523  o10_re += B0_re;
1524  o10_im += B0_im;
1525 
1526  o01_re += A1_re;
1527  o01_im += A1_im;
1528  o11_re += B1_re;
1529  o11_im += B1_im;
1530 
1531  o02_re += A2_re;
1532  o02_im += A2_im;
1533  o12_re += B2_re;
1534  o12_im += B2_im;
1535  } else {
1536  // read gauge matrix from device memory
1537  ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride);
1538 
1545 
1546 #ifdef MULTI_GPU
1547  if (kernel_type == INTERIOR_KERNEL) {
1548 #endif
1549 
1550  // read spinor from device memory
1551  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1552 
1553  // project spinor into half spinors
1554  a0_re = +2*i00_re;
1555  a0_im = +2*i00_im;
1556  a1_re = +2*i01_re;
1557  a1_im = +2*i01_im;
1558  a2_re = +2*i02_re;
1559  a2_im = +2*i02_im;
1560  b0_re = +2*i10_re;
1561  b0_im = +2*i10_im;
1562  b1_re = +2*i11_re;
1563  b1_im = +2*i11_im;
1564  b2_re = +2*i12_re;
1565  b2_im = +2*i12_im;
1566 
1567 #ifdef MULTI_GPU
1568  } else {
1569 
1570  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1571  const int t_proj_scale = TPROJSCALE;
1572 
1573  // read half spinor from device memory
1574  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1575 
1576  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1577  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1578  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1579  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1580  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1581  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1582 
1583  }
1584 #endif // MULTI_GPU
1585 
1586  // reconstruct gauge matrix
1588 
1589  // multiply row 0
1590  spinorFloat A0_re = 0;
1591  A0_re += g00_re * a0_re;
1592  A0_re -= g00_im * a0_im;
1593  A0_re += g01_re * a1_re;
1594  A0_re -= g01_im * a1_im;
1595  A0_re += g02_re * a2_re;
1596  A0_re -= g02_im * a2_im;
1597  spinorFloat A0_im = 0;
1598  A0_im += g00_re * a0_im;
1599  A0_im += g00_im * a0_re;
1600  A0_im += g01_re * a1_im;
1601  A0_im += g01_im * a1_re;
1602  A0_im += g02_re * a2_im;
1603  A0_im += g02_im * a2_re;
1604  spinorFloat B0_re = 0;
1605  B0_re += g00_re * b0_re;
1606  B0_re -= g00_im * b0_im;
1607  B0_re += g01_re * b1_re;
1608  B0_re -= g01_im * b1_im;
1609  B0_re += g02_re * b2_re;
1610  B0_re -= g02_im * b2_im;
1611  spinorFloat B0_im = 0;
1612  B0_im += g00_re * b0_im;
1613  B0_im += g00_im * b0_re;
1614  B0_im += g01_re * b1_im;
1615  B0_im += g01_im * b1_re;
1616  B0_im += g02_re * b2_im;
1617  B0_im += g02_im * b2_re;
1618 
1619  // multiply row 1
1620  spinorFloat A1_re = 0;
1621  A1_re += g10_re * a0_re;
1622  A1_re -= g10_im * a0_im;
1623  A1_re += g11_re * a1_re;
1624  A1_re -= g11_im * a1_im;
1625  A1_re += g12_re * a2_re;
1626  A1_re -= g12_im * a2_im;
1627  spinorFloat A1_im = 0;
1628  A1_im += g10_re * a0_im;
1629  A1_im += g10_im * a0_re;
1630  A1_im += g11_re * a1_im;
1631  A1_im += g11_im * a1_re;
1632  A1_im += g12_re * a2_im;
1633  A1_im += g12_im * a2_re;
1634  spinorFloat B1_re = 0;
1635  B1_re += g10_re * b0_re;
1636  B1_re -= g10_im * b0_im;
1637  B1_re += g11_re * b1_re;
1638  B1_re -= g11_im * b1_im;
1639  B1_re += g12_re * b2_re;
1640  B1_re -= g12_im * b2_im;
1641  spinorFloat B1_im = 0;
1642  B1_im += g10_re * b0_im;
1643  B1_im += g10_im * b0_re;
1644  B1_im += g11_re * b1_im;
1645  B1_im += g11_im * b1_re;
1646  B1_im += g12_re * b2_im;
1647  B1_im += g12_im * b2_re;
1648 
1649  // multiply row 2
1650  spinorFloat A2_re = 0;
1651  A2_re += g20_re * a0_re;
1652  A2_re -= g20_im * a0_im;
1653  A2_re += g21_re * a1_re;
1654  A2_re -= g21_im * a1_im;
1655  A2_re += g22_re * a2_re;
1656  A2_re -= g22_im * a2_im;
1657  spinorFloat A2_im = 0;
1658  A2_im += g20_re * a0_im;
1659  A2_im += g20_im * a0_re;
1660  A2_im += g21_re * a1_im;
1661  A2_im += g21_im * a1_re;
1662  A2_im += g22_re * a2_im;
1663  A2_im += g22_im * a2_re;
1664  spinorFloat B2_re = 0;
1665  B2_re += g20_re * b0_re;
1666  B2_re -= g20_im * b0_im;
1667  B2_re += g21_re * b1_re;
1668  B2_re -= g21_im * b1_im;
1669  B2_re += g22_re * b2_re;
1670  B2_re -= g22_im * b2_im;
1671  spinorFloat B2_im = 0;
1672  B2_im += g20_re * b0_im;
1673  B2_im += g20_im * b0_re;
1674  B2_im += g21_re * b1_im;
1675  B2_im += g21_im * b1_re;
1676  B2_im += g22_re * b2_im;
1677  B2_im += g22_im * b2_re;
1678 
1679  o00_re += A0_re;
1680  o00_im += A0_im;
1681  o10_re += B0_re;
1682  o10_im += B0_im;
1683 
1684  o01_re += A1_re;
1685  o01_im += A1_im;
1686  o11_re += B1_re;
1687  o11_im += B1_im;
1688 
1689  o02_re += A2_re;
1690  o02_im += A2_im;
1691  o12_re += B2_re;
1692  o12_im += B2_im;
1693  }
1694 }
1695 
1696 #ifdef MULTI_GPU
1697 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1698  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1699 #endif
1700 {
1701  // Projector P3-
1702  // 0 0 0 0
1703  // 0 0 0 0
1704  // 0 0 2 0
1705  // 0 0 0 2
1706 
1707 #ifdef MULTI_GPU
1708  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1709  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1710 #else
1711  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1712 #endif
1713 
1714 #ifdef MULTI_GPU
1715  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1716 #else
1717  const int ga_idx = sp_idx % Vh;
1718 #endif
1719 
1720  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1721  {
1728 
1729 #ifdef MULTI_GPU
1730  if (kernel_type == INTERIOR_KERNEL) {
1731 #endif
1732 
1733  // read spinor from device memory
1734  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1735 
1736  // project spinor into half spinors
1737  a0_re = +2*i20_re;
1738  a0_im = +2*i20_im;
1739  a1_re = +2*i21_re;
1740  a1_im = +2*i21_im;
1741  a2_re = +2*i22_re;
1742  a2_im = +2*i22_im;
1743  b0_re = +2*i30_re;
1744  b0_im = +2*i30_im;
1745  b1_re = +2*i31_re;
1746  b1_im = +2*i31_im;
1747  b2_re = +2*i32_re;
1748  b2_im = +2*i32_im;
1749 
1750 #ifdef MULTI_GPU
1751  } else {
1752 
1753  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1754  const int t_proj_scale = TPROJSCALE;
1755 
1756  // read half spinor from device memory
1757  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1758 
1759  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1760  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1761  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1762  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1763  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1764  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1765 
1766  }
1767 #endif // MULTI_GPU
1768 
1769  // identity gauge matrix
1776 
1777  o20_re += A0_re;
1778  o20_im += A0_im;
1779  o30_re += B0_re;
1780  o30_im += B0_im;
1781 
1782  o21_re += A1_re;
1783  o21_im += A1_im;
1784  o31_re += B1_re;
1785  o31_im += B1_im;
1786 
1787  o22_re += A2_re;
1788  o22_im += A2_im;
1789  o32_re += B2_re;
1790  o32_im += B2_im;
1791  } else {
1792  // read gauge matrix from device memory
1793  ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride);
1794 
1801 
1802 #ifdef MULTI_GPU
1803  if (kernel_type == INTERIOR_KERNEL) {
1804 #endif
1805 
1806  // read spinor from device memory
1807  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1808 
1809  // project spinor into half spinors
1810  a0_re = +2*i20_re;
1811  a0_im = +2*i20_im;
1812  a1_re = +2*i21_re;
1813  a1_im = +2*i21_im;
1814  a2_re = +2*i22_re;
1815  a2_im = +2*i22_im;
1816  b0_re = +2*i30_re;
1817  b0_im = +2*i30_im;
1818  b1_re = +2*i31_re;
1819  b1_im = +2*i31_im;
1820  b2_re = +2*i32_re;
1821  b2_im = +2*i32_im;
1822 
1823 #ifdef MULTI_GPU
1824  } else {
1825 
1826  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1827  const int t_proj_scale = TPROJSCALE;
1828 
1829  // read half spinor from device memory
1830  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1831 
1832  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1833  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1834  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1835  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1836  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1837  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1838 
1839  }
1840 #endif // MULTI_GPU
1841 
1842  // reconstruct gauge matrix
1844 
1845  // multiply row 0
1846  spinorFloat A0_re = 0;
1847  A0_re += gT00_re * a0_re;
1848  A0_re -= gT00_im * a0_im;
1849  A0_re += gT01_re * a1_re;
1850  A0_re -= gT01_im * a1_im;
1851  A0_re += gT02_re * a2_re;
1852  A0_re -= gT02_im * a2_im;
1853  spinorFloat A0_im = 0;
1854  A0_im += gT00_re * a0_im;
1855  A0_im += gT00_im * a0_re;
1856  A0_im += gT01_re * a1_im;
1857  A0_im += gT01_im * a1_re;
1858  A0_im += gT02_re * a2_im;
1859  A0_im += gT02_im * a2_re;
1860  spinorFloat B0_re = 0;
1861  B0_re += gT00_re * b0_re;
1862  B0_re -= gT00_im * b0_im;
1863  B0_re += gT01_re * b1_re;
1864  B0_re -= gT01_im * b1_im;
1865  B0_re += gT02_re * b2_re;
1866  B0_re -= gT02_im * b2_im;
1867  spinorFloat B0_im = 0;
1868  B0_im += gT00_re * b0_im;
1869  B0_im += gT00_im * b0_re;
1870  B0_im += gT01_re * b1_im;
1871  B0_im += gT01_im * b1_re;
1872  B0_im += gT02_re * b2_im;
1873  B0_im += gT02_im * b2_re;
1874 
1875  // multiply row 1
1876  spinorFloat A1_re = 0;
1877  A1_re += gT10_re * a0_re;
1878  A1_re -= gT10_im * a0_im;
1879  A1_re += gT11_re * a1_re;
1880  A1_re -= gT11_im * a1_im;
1881  A1_re += gT12_re * a2_re;
1882  A1_re -= gT12_im * a2_im;
1883  spinorFloat A1_im = 0;
1884  A1_im += gT10_re * a0_im;
1885  A1_im += gT10_im * a0_re;
1886  A1_im += gT11_re * a1_im;
1887  A1_im += gT11_im * a1_re;
1888  A1_im += gT12_re * a2_im;
1889  A1_im += gT12_im * a2_re;
1890  spinorFloat B1_re = 0;
1891  B1_re += gT10_re * b0_re;
1892  B1_re -= gT10_im * b0_im;
1893  B1_re += gT11_re * b1_re;
1894  B1_re -= gT11_im * b1_im;
1895  B1_re += gT12_re * b2_re;
1896  B1_re -= gT12_im * b2_im;
1897  spinorFloat B1_im = 0;
1898  B1_im += gT10_re * b0_im;
1899  B1_im += gT10_im * b0_re;
1900  B1_im += gT11_re * b1_im;
1901  B1_im += gT11_im * b1_re;
1902  B1_im += gT12_re * b2_im;
1903  B1_im += gT12_im * b2_re;
1904 
1905  // multiply row 2
1906  spinorFloat A2_re = 0;
1907  A2_re += gT20_re * a0_re;
1908  A2_re -= gT20_im * a0_im;
1909  A2_re += gT21_re * a1_re;
1910  A2_re -= gT21_im * a1_im;
1911  A2_re += gT22_re * a2_re;
1912  A2_re -= gT22_im * a2_im;
1913  spinorFloat A2_im = 0;
1914  A2_im += gT20_re * a0_im;
1915  A2_im += gT20_im * a0_re;
1916  A2_im += gT21_re * a1_im;
1917  A2_im += gT21_im * a1_re;
1918  A2_im += gT22_re * a2_im;
1919  A2_im += gT22_im * a2_re;
1920  spinorFloat B2_re = 0;
1921  B2_re += gT20_re * b0_re;
1922  B2_re -= gT20_im * b0_im;
1923  B2_re += gT21_re * b1_re;
1924  B2_re -= gT21_im * b1_im;
1925  B2_re += gT22_re * b2_re;
1926  B2_re -= gT22_im * b2_im;
1927  spinorFloat B2_im = 0;
1928  B2_im += gT20_re * b0_im;
1929  B2_im += gT20_im * b0_re;
1930  B2_im += gT21_re * b1_im;
1931  B2_im += gT21_im * b1_re;
1932  B2_im += gT22_re * b2_im;
1933  B2_im += gT22_im * b2_re;
1934 
1935  o20_re += A0_re;
1936  o20_im += A0_im;
1937  o30_re += B0_re;
1938  o30_im += B0_im;
1939 
1940  o21_re += A1_re;
1941  o21_im += A1_im;
1942  o31_re += B1_re;
1943  o31_im += B1_im;
1944 
1945  o22_re += A2_re;
1946  o22_im += A2_im;
1947  o32_re += B2_re;
1948  o32_im += B2_im;
1949  }
1950 }
1951 
// Epilogue: optional xpay (out = coeff*out + accum) applied only once per site.
// Under MULTI_GPU the same site may be visited by the interior kernel and by
// one or more exterior (face) kernels; the axpy must be applied exactly once,
// by whichever kernel completes the last of the 8 hop contributions.
1952 #if defined MULTI_GPU && defined DSLASH_XPAY
1953 
1954 int incomplete = 0; // Have all 8 contributions been computed for this site?
1955 
// Each case marks the site incomplete if a later-ordered partitioned dimension
// still owes a face contribution. Fall-through is deliberate: an exterior
// kernel for dimension d must also check all dimensions ordered after d.
1956 switch(kernel_type) { // intentional fall-through
1957 case INTERIOR_KERNEL:
1958 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
1959 case EXTERIOR_KERNEL_T:
1960 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
1961 case EXTERIOR_KERNEL_Z:
1962 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
1963 case EXTERIOR_KERNEL_Y:
1964 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
1965 }
1966 
1967 if (!incomplete)
1968 #endif // MULTI_GPU
1969 {
1970 
1971 #ifdef DSLASH_XPAY
// Load the accumulator spinor (accum0..accum11 double2, or accum0..accum5
// float4 — see the SPINOR_DOUBLE layouts below) for this site.
1972  READ_ACCUM(ACCUMTEX, param.sp_stride)
1974 
// Mobius DWF rescales the xpay coefficient by the 5th-dimension parameters
// (mdwf_b5, m5 are the precision-suffixed names #defined at the top of this
// file); otherwise the plain coefficient 'a' is used.
1975 #ifdef MDWF_mode
1976  coeff = (spinorFloat)(0.5*a/(mdwf_b5[xs]*(m5+4.0) + 1.0));
1977 #else
1978  coeff = a;
1979 #endif
1980 
// o = coeff*o + accum, component by component. Double precision packs one
// complex number per double2 register (accum0..accum11); single precision
// packs two per float4 (accum0..accum5), hence the .z/.w members.
1981 #ifdef SPINOR_DOUBLE
1982  o00_re = coeff*o00_re + accum0.x;
1983  o00_im = coeff*o00_im + accum0.y;
1984  o01_re = coeff*o01_re + accum1.x;
1985  o01_im = coeff*o01_im + accum1.y;
1986  o02_re = coeff*o02_re + accum2.x;
1987  o02_im = coeff*o02_im + accum2.y;
1988  o10_re = coeff*o10_re + accum3.x;
1989  o10_im = coeff*o10_im + accum3.y;
1990  o11_re = coeff*o11_re + accum4.x;
1991  o11_im = coeff*o11_im + accum4.y;
1992  o12_re = coeff*o12_re + accum5.x;
1993  o12_im = coeff*o12_im + accum5.y;
1994  o20_re = coeff*o20_re + accum6.x;
1995  o20_im = coeff*o20_im + accum6.y;
1996  o21_re = coeff*o21_re + accum7.x;
1997  o21_im = coeff*o21_im + accum7.y;
1998  o22_re = coeff*o22_re + accum8.x;
1999  o22_im = coeff*o22_im + accum8.y;
2000  o30_re = coeff*o30_re + accum9.x;
2001  o30_im = coeff*o30_im + accum9.y;
2002  o31_re = coeff*o31_re + accum10.x;
2003  o31_im = coeff*o31_im + accum10.y;
2004  o32_re = coeff*o32_re + accum11.x;
2005  o32_im = coeff*o32_im + accum11.y;
2006 #else
2007  o00_re = coeff*o00_re + accum0.x;
2008  o00_im = coeff*o00_im + accum0.y;
2009  o01_re = coeff*o01_re + accum0.z;
2010  o01_im = coeff*o01_im + accum0.w;
2011  o02_re = coeff*o02_re + accum1.x;
2012  o02_im = coeff*o02_im + accum1.y;
2013  o10_re = coeff*o10_re + accum1.z;
2014  o10_im = coeff*o10_im + accum1.w;
2015  o11_re = coeff*o11_re + accum2.x;
2016  o11_im = coeff*o11_im + accum2.y;
2017  o12_re = coeff*o12_re + accum2.z;
2018  o12_im = coeff*o12_im + accum2.w;
2019  o20_re = coeff*o20_re + accum3.x;
2020  o20_im = coeff*o20_im + accum3.y;
2021  o21_re = coeff*o21_re + accum3.z;
2022  o21_im = coeff*o21_im + accum3.w;
2023  o22_re = coeff*o22_re + accum4.x;
2024  o22_im = coeff*o22_im + accum4.y;
2025  o30_re = coeff*o30_re + accum4.z;
2026  o30_im = coeff*o30_im + accum4.w;
2027  o31_re = coeff*o31_re + accum5.x;
2028  o31_im = coeff*o31_im + accum5.y;
2029  o32_re = coeff*o32_re + accum5.z;
2030  o32_im = coeff*o32_im + accum5.w;
2031 #endif // SPINOR_DOUBLE
2032 #endif // DSLASH_XPAY
2033 }
2034 
2035 // write spinor field back to device memory
2036 WRITE_SPINOR(param.sp_stride);
2037 
2038 // undefine to prevent warning when precision is changed
2039 #undef m5
2040 #undef mdwf_b5
2041 #undef mdwf_c5
2042 #undef spinorFloat
2043 #undef SHARED_STRIDE
2044 
2045 #undef g00_re
2046 #undef g00_im
2047 #undef g01_re
2048 #undef g01_im
2049 #undef g02_re
2050 #undef g02_im
2051 #undef g10_re
2052 #undef g10_im
2053 #undef g11_re
2054 #undef g11_im
2055 #undef g12_re
2056 #undef g12_im
2057 #undef g20_re
2058 #undef g20_im
2059 #undef g21_re
2060 #undef g21_im
2061 #undef g22_re
2062 #undef g22_im
2063 
2064 #undef i00_re
2065 #undef i00_im
2066 #undef i01_re
2067 #undef i01_im
2068 #undef i02_re
2069 #undef i02_im
2070 #undef i10_re
2071 #undef i10_im
2072 #undef i11_re
2073 #undef i11_im
2074 #undef i12_re
2075 #undef i12_im
2076 #undef i20_re
2077 #undef i20_im
2078 #undef i21_re
2079 #undef i21_im
2080 #undef i22_re
2081 #undef i22_im
2082 #undef i30_re
2083 #undef i30_im
2084 #undef i31_re
2085 #undef i31_im
2086 #undef i32_re
2087 #undef i32_im
2088 
2089 
2090 
2091 #undef VOLATILE
spinorFloat A2_re
spinorFloat b1_im
__constant__ int Vh
#define g21_re
VOLATILE spinorFloat o20_re
#define i20_re
ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)
#define gT10_re
__constant__ int X1h
VOLATILE spinorFloat o11_re
#define i11_im
__constant__ int X2
VOLATILE spinorFloat o11_im
#define i22_im
#define i30_im
VOLATILE spinorFloat o10_re
__constant__ int X2X1mX1
#define gT21_re
#define i10_re
#define gT11_re
#define i20_im
#define i12_re
spinorFloat b2_re
#define i31_re
int boundaryCrossing
__constant__ int X3X2X1mX2X1
spinorFloat B1_re
__constant__ int X1
#define READ_INTERMEDIATE_SPINOR
Definition: covDev.h:144
int sp_idx
VOLATILE spinorFloat o01_re
VOLATILE spinorFloat o00_re
float4 G2
VOLATILE spinorFloat o32_im
#define gT22_re
const int ga_idx
VOLATILE spinorFloat o12_im
float4 G3
#define g10_re
__constant__ int X3X2X1
#define i31_im
#define i01_re
#define gT00_re
#define gT22_im
#define g20_im
READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define g01_re
VOLATILE spinorFloat o31_re
spinorFloat A0_re
spinorFloat A2_im
spinorFloat B1_im
#define g12_re
spinorFloat b0_re
#define i32_im
VOLATILE spinorFloat o22_re
QudaGaugeParam param
Definition: pack_test.cpp:17
#define g11_re
#define m5
__constant__ int ghostFace[QUDA_MAX_DIM+1]
spinorFloat B0_re
spinorFloat A1_re
spinorFloat b0_im
spinorFloat b1_re
#define i12_im
spinorFloat a0_re
#define gT12_im
float4 G0
spinorFloat a2_im
#define GAUGE0TEX
Definition: covDev.h:112
spinorFloat a2_re
spinorFloat A0_im
#define gT01_re
spinorFloat B2_re
#define g21_im
#define i32_re
#define gT21_im
__constant__ double coeff
VOLATILE spinorFloat o20_im
#define i01_im
__constant__ int X2m1
VOLATILE spinorFloat o12_re
#define gT20_im
VOLATILE spinorFloat o02_re
#define SPINORTEX
Definition: clover_def.h:40
#define g02_re
#define VOLATILE
__constant__ int gauge_fixed
VOLATILE spinorFloat o21_im
VOLATILE spinorFloat o00_im
RECONSTRUCT_GAUGE_MATRIX(0)
#define i02_re
__constant__ int X4X3X2X1mX3X2X1
VOLATILE spinorFloat o21_re
#define gT01_im
#define SPINOR_HOP
Definition: covDev.h:158
READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define g02_im
#define i02_im
VOLATILE spinorFloat o31_im
__constant__ int ga_stride
#define i21_im
#define g12_im
#define gT11_im
spinorFloat a0_im
spinorFloat a1_im
__constant__ int X1m1
VOLATILE spinorFloat o01_im
__constant__ int X3
#define i22_re
VOLATILE spinorFloat o30_re
#define i00_im
#define gT12_re
#define i10_im
#define gT02_re
VOLATILE spinorFloat o10_im
#define mdwf_b5
#define g20_re
#define g11_im
#define g22_re
#define g01_im
#define GAUGE1TEX
Definition: covDev.h:113
VOLATILE spinorFloat o02_im
#define spinorFloat
#define g00_im
spinorFloat B0_im
#define gT10_im
__constant__ int X4m1
READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx)
#define i30_re
#define WRITE_SPINOR
Definition: clover_def.h:48
spinorFloat A1_im
#define READ_HALF_SPINOR
Definition: io_spinor.h:390
#define INTERTEX
Definition: covDev.h:149
#define i11_re
spinorFloat a1_re
float4 G4
__constant__ int X4X3X2X1hmX3X2X1h
float4 G1
VOLATILE spinorFloat o30_im
#define i21_re
#define gT20_re
#define g00_re
VOLATILE spinorFloat o22_im
VOLATILE spinorFloat o32_re
KernelType kernel_type
#define g22_im
#define i00_re
__constant__ int X4
__constant__ int X3m1
#define TPROJSCALE
Definition: covDev.h:101
#define g10_im
#define gT00_im
spinorFloat B2_im
spinorFloat b2_im
#define gT02_im
__constant__ int X2X1