QUDA v0.7.0 - A library for QCD on GPUs
dw_dslash_core.h
1 // *** CUDA DSLASH ***
2 
3 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
4 
5 // NB! Don't trust any MULTI_GPU code
6 
7 #if (CUDA_VERSION >= 4010)
8 #define VOLATILE
9 #else
10 #define VOLATILE volatile
11 #endif
12 // input spinor
13 #ifdef SPINOR_DOUBLE
14 #define spinorFloat double
15 #define i00_re I0.x
16 #define i00_im I0.y
17 #define i01_re I1.x
18 #define i01_im I1.y
19 #define i02_re I2.x
20 #define i02_im I2.y
21 #define i10_re I3.x
22 #define i10_im I3.y
23 #define i11_re I4.x
24 #define i11_im I4.y
25 #define i12_re I5.x
26 #define i12_im I5.y
27 #define i20_re I6.x
28 #define i20_im I6.y
29 #define i21_re I7.x
30 #define i21_im I7.y
31 #define i22_re I8.x
32 #define i22_im I8.y
33 #define i30_re I9.x
34 #define i30_im I9.y
35 #define i31_re I10.x
36 #define i31_im I10.y
37 #define i32_re I11.x
38 #define i32_im I11.y
39 #else
40 #define spinorFloat float
41 #define i00_re I0.x
42 #define i00_im I0.y
43 #define i01_re I0.z
44 #define i01_im I0.w
45 #define i02_re I1.x
46 #define i02_im I1.y
47 #define i10_re I1.z
48 #define i10_im I1.w
49 #define i11_re I2.x
50 #define i11_im I2.y
51 #define i12_re I2.z
52 #define i12_im I2.w
53 #define i20_re I3.x
54 #define i20_im I3.y
55 #define i21_re I3.z
56 #define i21_im I3.w
57 #define i22_re I4.x
58 #define i22_im I4.y
59 #define i30_re I4.z
60 #define i30_im I4.w
61 #define i31_re I5.x
62 #define i31_im I5.y
63 #define i32_re I5.z
64 #define i32_im I5.w
65 #endif // SPINOR_DOUBLE
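/* Editor's note (sketch, not part of the generated kernel): these macros
   flatten the 4 spin x 3 color complex components onto the FloatN input
   registers. In double precision each complex number owns one double2
   (I0..I11); in single precision two complex numbers share one float4
   (I0..I5). A hypothetical helper showing the single-precision mapping:

     // component (spin s, color c), n = 3*s + c in 0..11:
     // re is lane 2*(n%2) of float4 I[n/2], im is the next lane
     float spinor_re(const float4 *I, int s, int c) {
       int n = 3*s + c;
       return reinterpret_cast<const float*>(&I[n/2])[2*(n%2)];
     }
*/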
66 
67 // gauge link
68 #ifdef GAUGE_FLOAT2
69 #define g00_re G0.x
70 #define g00_im G0.y
71 #define g01_re G1.x
72 #define g01_im G1.y
73 #define g02_re G2.x
74 #define g02_im G2.y
75 #define g10_re G3.x
76 #define g10_im G3.y
77 #define g11_re G4.x
78 #define g11_im G4.y
79 #define g12_re G5.x
80 #define g12_im G5.y
81 #define g20_re G6.x
82 #define g20_im G6.y
83 #define g21_re G7.x
84 #define g21_im G7.y
85 #define g22_re G8.x
86 #define g22_im G8.y
87 
88 #else
89 #define g00_re G0.x
90 #define g00_im G0.y
91 #define g01_re G0.z
92 #define g01_im G0.w
93 #define g02_re G1.x
94 #define g02_im G1.y
95 #define g10_re G1.z
96 #define g10_im G1.w
97 #define g11_re G2.x
98 #define g11_im G2.y
99 #define g12_re G2.z
100 #define g12_im G2.w
101 #define g20_re G3.x
102 #define g20_im G3.y
103 #define g21_re G3.z
104 #define g21_im G3.w
105 #define g22_re G4.x
106 #define g22_im G4.y
107 
108 #endif // GAUGE_FLOAT2
109 
110 // conjugated gauge link
111 #define gT00_re (+g00_re)
112 #define gT00_im (-g00_im)
113 #define gT01_re (+g10_re)
114 #define gT01_im (-g10_im)
115 #define gT02_re (+g20_re)
116 #define gT02_im (-g20_im)
117 #define gT10_re (+g01_re)
118 #define gT10_im (-g01_im)
119 #define gT11_re (+g11_re)
120 #define gT11_im (-g11_im)
121 #define gT12_re (+g21_re)
122 #define gT12_im (-g21_im)
123 #define gT20_re (+g02_re)
124 #define gT20_im (-g02_im)
125 #define gT21_re (+g12_re)
126 #define gT21_im (-g12_im)
127 #define gT22_re (+g22_re)
128 #define gT22_im (-g22_im)
129 
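/* Editor's note (sketch): the gT macros give the Hermitian conjugate of the
   link without storing it, since the backward hops need U^dagger:
   (U^dagger)_{ij} = conj(U_{ji}), i.e. swap the indices and negate the
   imaginary part. In cuComplex form, for an assumed row-major u[3][3]:

     cuDoubleComplex udag_ij = cuConj(u[j][i]);  // gT<ij> = conj(g<ji>)
*/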
130 // output spinor
131 VOLATILE spinorFloat o00_re;
132 VOLATILE spinorFloat o00_im;
133 VOLATILE spinorFloat o01_re;
134 VOLATILE spinorFloat o01_im;
135 VOLATILE spinorFloat o02_re;
136 VOLATILE spinorFloat o02_im;
137 VOLATILE spinorFloat o10_re;
138 VOLATILE spinorFloat o10_im;
139 VOLATILE spinorFloat o11_re;
140 VOLATILE spinorFloat o11_im;
141 VOLATILE spinorFloat o12_re;
142 VOLATILE spinorFloat o12_im;
143 VOLATILE spinorFloat o20_re;
144 VOLATILE spinorFloat o20_im;
145 VOLATILE spinorFloat o21_re;
146 VOLATILE spinorFloat o21_im;
147 VOLATILE spinorFloat o22_re;
148 VOLATILE spinorFloat o22_im;
149 VOLATILE spinorFloat o30_re;
150 VOLATILE spinorFloat o30_im;
151 VOLATILE spinorFloat o31_re;
152 VOLATILE spinorFloat o31_im;
153 VOLATILE spinorFloat o32_re;
154 VOLATILE spinorFloat o32_im;
155 
156 #ifdef SPINOR_DOUBLE
157 #if (__COMPUTE_CAPABILITY__ >= 200)
158 #define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
159 #else
160 #define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
161 #endif
162 #else
163 #if (__COMPUTE_CAPABILITY__ >= 200)
164 #define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
165 #else
166 #define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
167 #endif
168 #endif
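/* Editor's note (sketch): SHARED_STRIDE is sized so that consecutive threads
   land in distinct shared-memory banks when spinor components are staged
   through shared memory by the I/O macros. Fermi has 32 four-byte banks
   (32 floats, or 16 doubles, per row); G80/GT200 have 16, hence the halved
   strides. The layout this supports looks roughly like (NCOMP assumed):

     extern __shared__ spinorFloat s_data[];
     // component c of the calling thread:
     // s_data[((threadIdx.x/SHARED_STRIDE)*NCOMP + c)*SHARED_STRIDE
     //        + threadIdx.x % SHARED_STRIDE]
*/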
169 
170 #include "read_gauge.h"
171 #include "io_spinor.h"
172 
173 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
174 int sp_norm_idx;
175 #endif // MULTI_GPU half precision
176 
177 int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
178 if (sid >= param.threads*param.Ls) return;
179 
180 int X, x1, x2, x3, x4, xs;
181 
182 int s_parity, boundaryCrossing;
183 
184 #ifdef MULTI_GPU
185 int face_idx;
186 if (kernel_type == INTERIOR_KERNEL) {
187 #endif
188 
189 // Inline by hand for the moment and assume even dimensions
190 //coordsFromIndex(X, x1, x2, x3, x4, sid, param.parity);
191 
192 s_parity = ( sid/(X4*X3*X2*X1h) ) % 2;
193 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
194 
195 X = 2*sid + (boundaryCrossing + param.parity) % 2;
196 x1 = X % X1;
197 x2 = (X/X1) % X2;
198 x3 = (X/(X1*X2)) % X3;
199 x4 = (X/(X1*X2*X3)) % X4;
200 xs = X/(X1*X2*X3*X4);
201 
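/* Editor's note (worked sketch): sid enumerates the sites of a single parity,
   so the full lexicographic index is 2*sid plus a 0/1 correction. Walking a
   checkerboard in x1-major order flips the local parity at every dimension
   boundary, so summing the boundary crossings and adding param.parity selects
   the right sublattice offset. Host-side equivalent of the mapping above
   (hypothetical standalone function, plain-int lattice dims assumed):

     int fullIndex(int sid, int parity) {
       int bc = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
       return 2*sid + (bc + parity) % 2;   // matches X computed above
     }
*/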
202  o00_re = 0; o00_im = 0;
203  o01_re = 0; o01_im = 0;
204  o02_re = 0; o02_im = 0;
205  o10_re = 0; o10_im = 0;
206  o11_re = 0; o11_im = 0;
207  o12_re = 0; o12_im = 0;
208  o20_re = 0; o20_im = 0;
209  o21_re = 0; o21_im = 0;
210  o22_re = 0; o22_im = 0;
211  o30_re = 0; o30_im = 0;
212  o31_re = 0; o31_im = 0;
213  o32_re = 0; o32_im = 0;
214 
215 #ifdef MULTI_GPU
216 } else { // exterior kernel
217 
218 const int dim = static_cast<int>(kernel_type);
219 const int face_volume = (param.threads*param.Ls >> 1); // volume of one face
220 const int face_num = (sid >= face_volume); // is this thread updating face 0 or 1
221 face_idx = sid - face_num*face_volume; // index into the respective face
222 
223 // ghostOffset is scaled to include body (includes stride) and number of FloatN arrays (SPINOR_HOP)
224 // face_idx not sid since faces are spin projected and share the same volume index (modulo UP/DOWN reading)
225 //sp_idx = face_idx + param.ghostOffset[dim];
226 
227 #if (DD_PREC==2) // half precision
228 sp_norm_idx = sid + param.ghostNormOffset[static_cast<int>(kernel_type)];
229 #endif
230 
231 const int dims[] = {X1, X2, X3, X4};
232 coordsFromDWFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
233 
234 s_parity = ( sid/(X4*X3*X2*X1h) ) % 2;
235 boundaryCrossing = sid/X1h + sid/(X2*X1h) + sid/(X3*X2*X1h) + sid/(X4*X3*X2*X1h);
236 
237 X = 2*sid + (boundaryCrossing + param.parity) % 2;
238 
239 READ_INTERMEDIATE_SPINOR(INTERTEX, param.sp_stride, sid, sid);
240 
241  o00_re = i00_re; o00_im = i00_im;
242  o01_re = i01_re; o01_im = i01_im;
243  o02_re = i02_re; o02_im = i02_im;
244  o10_re = i10_re; o10_im = i10_im;
245  o11_re = i11_re; o11_im = i11_im;
246  o12_re = i12_re; o12_im = i12_im;
247  o20_re = i20_re; o20_im = i20_im;
248  o21_re = i21_re; o21_im = i21_im;
249  o22_re = i22_re; o22_im = i22_im;
250  o30_re = i30_re; o30_im = i30_im;
251  o31_re = i31_re; o31_im = i31_im;
252  o32_re = i32_re; o32_im = i32_im;
253 }
254 #endif // MULTI_GPU
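/* Editor's note (sketch): in the exterior kernels each thread services one
   boundary site; the lower half of the thread range covers face 0 (backward)
   and the upper half face 1 (forward). The ghost spinor is then addressed as

     sp_idx = face_idx + param.ghostOffset[dim];   // see the hop blocks below

   with face_idx rather than sid, because the exchanged faces are spin
   projected and each face is packed as its own contiguous FloatN array.
*/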
255 
256 // declare G## here and use ASSN below instead of READ
257 #ifdef GAUGE_FLOAT2
258 #if (DD_PREC==0) // temporary hack
259 double2 G0;
260 double2 G1;
261 double2 G2;
262 double2 G3;
263 double2 G4;
264 double2 G5;
265 double2 G6;
266 double2 G7;
267 double2 G8;
268 #else
269 float2 G0;
270 float2 G1;
271 float2 G2;
272 float2 G3;
273 float2 G4;
274 float2 G5;
275 float2 G6;
276 float2 G7;
277 float2 G8;
278 #endif
279 #else
280 float4 G0;
281 float4 G1;
282 float4 G2;
283 float4 G3;
284 float4 G4;
285 #endif
286 
287 
288 
289 #ifdef MULTI_GPU
290 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1<X1m1)) ||
291  (kernel_type == EXTERIOR_KERNEL_X && x1==X1m1) )
292 #endif
293 {
294  // Projector P0-
295  // 1 0 0 -i
296  // 0 1 -i 0
297  // 0 i 1 0
298  // i 0 0 1
299 
300 #ifdef MULTI_GPU
301  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==X1m1 ? X-X1m1 : X+1) >> 1 :
302  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
303 #else
304  const int sp_idx = (x1==X1m1 ? X-X1m1 : X+1) >> 1;
305 #endif
306 
307  const int ga_idx = sid % Vh;
308 
309  // read gauge matrix from device memory
310  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride); }
311  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 0, ga_idx, ga_stride); }
312 
312 
313  spinorFloat a0_re, a0_im;
314  spinorFloat a1_re, a1_im;
315  spinorFloat a2_re, a2_im;
316  spinorFloat b0_re, b0_im;
317  spinorFloat b1_re, b1_im;
318  spinorFloat b2_re, b2_im;
319 
320 #ifdef MULTI_GPU
321  if (kernel_type == INTERIOR_KERNEL) {
322 #endif
323 
324  // read spinor from device memory
325  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
326 
327  // project spinor into half spinors
328  a0_re = +i00_re+i30_im;
329  a0_im = +i00_im-i30_re;
330  a1_re = +i01_re+i31_im;
331  a1_im = +i01_im-i31_re;
332  a2_re = +i02_re+i32_im;
333  a2_im = +i02_im-i32_re;
334  b0_re = +i10_re+i20_im;
335  b0_im = +i10_im-i20_re;
336  b1_re = +i11_re+i21_im;
337  b1_im = +i11_im-i21_re;
338  b2_re = +i12_re+i22_im;
339  b2_im = +i12_im-i22_re;
340 
341 #ifdef MULTI_GPU
342  } else {
343 
344  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
345 
346  // read half spinor from device memory
347  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
348 
349  a0_re = i00_re; a0_im = i00_im;
350  a1_re = i01_re; a1_im = i01_im;
351  a2_re = i02_re; a2_im = i02_im;
352  b0_re = i10_re; b0_im = i10_im;
353  b1_re = i11_re; b1_im = i11_im;
354  b2_re = i12_re; b2_im = i12_im;
355 
356  }
357 #endif // MULTI_GPU
358 
359  // reconstruct gauge matrix
360  RECONSTRUCT_GAUGE_MATRIX(0);
361 
362  // multiply row 0
363  spinorFloat A0_re = 0;
364  A0_re += g00_re * a0_re;
365  A0_re -= g00_im * a0_im;
366  A0_re += g01_re * a1_re;
367  A0_re -= g01_im * a1_im;
368  A0_re += g02_re * a2_re;
369  A0_re -= g02_im * a2_im;
370  spinorFloat A0_im = 0;
371  A0_im += g00_re * a0_im;
372  A0_im += g00_im * a0_re;
373  A0_im += g01_re * a1_im;
374  A0_im += g01_im * a1_re;
375  A0_im += g02_re * a2_im;
376  A0_im += g02_im * a2_re;
377  spinorFloat B0_re = 0;
378  B0_re += g00_re * b0_re;
379  B0_re -= g00_im * b0_im;
380  B0_re += g01_re * b1_re;
381  B0_re -= g01_im * b1_im;
382  B0_re += g02_re * b2_re;
383  B0_re -= g02_im * b2_im;
384  spinorFloat B0_im = 0;
385  B0_im += g00_re * b0_im;
386  B0_im += g00_im * b0_re;
387  B0_im += g01_re * b1_im;
388  B0_im += g01_im * b1_re;
389  B0_im += g02_re * b2_im;
390  B0_im += g02_im * b2_re;
391 
392  // multiply row 1
393  spinorFloat A1_re = 0;
394  A1_re += g10_re * a0_re;
395  A1_re -= g10_im * a0_im;
396  A1_re += g11_re * a1_re;
397  A1_re -= g11_im * a1_im;
398  A1_re += g12_re * a2_re;
399  A1_re -= g12_im * a2_im;
400  spinorFloat A1_im = 0;
401  A1_im += g10_re * a0_im;
402  A1_im += g10_im * a0_re;
403  A1_im += g11_re * a1_im;
404  A1_im += g11_im * a1_re;
405  A1_im += g12_re * a2_im;
406  A1_im += g12_im * a2_re;
407  spinorFloat B1_re = 0;
408  B1_re += g10_re * b0_re;
409  B1_re -= g10_im * b0_im;
410  B1_re += g11_re * b1_re;
411  B1_re -= g11_im * b1_im;
412  B1_re += g12_re * b2_re;
413  B1_re -= g12_im * b2_im;
414  spinorFloat B1_im = 0;
415  B1_im += g10_re * b0_im;
416  B1_im += g10_im * b0_re;
417  B1_im += g11_re * b1_im;
418  B1_im += g11_im * b1_re;
419  B1_im += g12_re * b2_im;
420  B1_im += g12_im * b2_re;
421 
422  // multiply row 2
423  spinorFloat A2_re = 0;
424  A2_re += g20_re * a0_re;
425  A2_re -= g20_im * a0_im;
426  A2_re += g21_re * a1_re;
427  A2_re -= g21_im * a1_im;
428  A2_re += g22_re * a2_re;
429  A2_re -= g22_im * a2_im;
430  spinorFloat A2_im = 0;
431  A2_im += g20_re * a0_im;
432  A2_im += g20_im * a0_re;
433  A2_im += g21_re * a1_im;
434  A2_im += g21_im * a1_re;
435  A2_im += g22_re * a2_im;
436  A2_im += g22_im * a2_re;
437  spinorFloat B2_re = 0;
438  B2_re += g20_re * b0_re;
439  B2_re -= g20_im * b0_im;
440  B2_re += g21_re * b1_re;
441  B2_re -= g21_im * b1_im;
442  B2_re += g22_re * b2_re;
443  B2_re -= g22_im * b2_im;
444  spinorFloat B2_im = 0;
445  B2_im += g20_re * b0_im;
446  B2_im += g20_im * b0_re;
447  B2_im += g21_re * b1_im;
448  B2_im += g21_im * b1_re;
449  B2_im += g22_re * b2_im;
450  B2_im += g22_im * b2_re;
451 
452  o00_re += A0_re;
453  o00_im += A0_im;
454  o10_re += B0_re;
455  o10_im += B0_im;
456  o20_re -= B0_im;
457  o20_im += B0_re;
458  o30_re -= A0_im;
459  o30_im += A0_re;
460 
461  o01_re += A1_re;
462  o01_im += A1_im;
463  o11_re += B1_re;
464  o11_im += B1_im;
465  o21_re -= B1_im;
466  o21_im += B1_re;
467  o31_re -= A1_im;
468  o31_im += A1_re;
469 
470  o02_re += A2_re;
471  o02_im += A2_im;
472  o12_re += B2_re;
473  o12_im += B2_im;
474  o22_re -= B2_im;
475  o22_im += B2_re;
476  o32_re -= A2_im;
477  o32_im += A2_re;
478 }
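/* Editor's note (worked sketch): every hop applies a projector 1 +/- gamma_mu
   first, which collapses the 4-spinor onto two color vectors and halves the
   SU(3) work (two mat-vecs instead of four). For P0- above, a = psi_0 - i*psi_3
   and b = psi_1 - i*psi_2; after A = U*a and B = U*b the accumulation rebuilds
   the full spinor as (A, B, +i*B, +i*A). Each unrolled row above is just one
   row of a complex 3x3 mat-vec, e.g.

     A0 = g00*a0 + g01*a1 + g02*a2;   // split into _re/_im to stay in registers
*/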
479 
480 #ifdef MULTI_GPU
481 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[0] || x1>0)) ||
482  (kernel_type == EXTERIOR_KERNEL_X && x1==0) )
483 #endif
484 {
485  // Projector P0+
486  // 1 0 0 i
487  // 0 1 i 0
488  // 0 -i 1 0
489  // -i 0 0 1
490 
491 #ifdef MULTI_GPU
492  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x1==0 ? X+X1m1 : X-1) >> 1 :
493  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
494 #else
495  const int sp_idx = (x1==0 ? X+X1m1 : X-1) >> 1;
496 #endif
497 
498 #ifdef MULTI_GPU
499  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
500 #else
501  const int ga_idx = sp_idx % Vh;
502 #endif
503 
504  // read gauge matrix from device memory
505  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 1, ga_idx, ga_stride); }
506  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 1, ga_idx, ga_stride); }
507 
507 
508  spinorFloat a0_re, a0_im;
509  spinorFloat a1_re, a1_im;
510  spinorFloat a2_re, a2_im;
511  spinorFloat b0_re, b0_im;
512  spinorFloat b1_re, b1_im;
513  spinorFloat b2_re, b2_im;
514 
515 #ifdef MULTI_GPU
516  if (kernel_type == INTERIOR_KERNEL) {
517 #endif
518 
519  // read spinor from device memory
520  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
521 
522  // project spinor into half spinors
523  a0_re = +i00_re-i30_im;
524  a0_im = +i00_im+i30_re;
525  a1_re = +i01_re-i31_im;
526  a1_im = +i01_im+i31_re;
527  a2_re = +i02_re-i32_im;
528  a2_im = +i02_im+i32_re;
529  b0_re = +i10_re-i20_im;
530  b0_im = +i10_im+i20_re;
531  b1_re = +i11_re-i21_im;
532  b1_im = +i11_im+i21_re;
533  b2_re = +i12_re-i22_im;
534  b2_im = +i12_im+i22_re;
535 
536 #ifdef MULTI_GPU
537  } else {
538 
539  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
540 
541  // read half spinor from device memory
542  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
543 
544  a0_re = i00_re; a0_im = i00_im;
545  a1_re = i01_re; a1_im = i01_im;
546  a2_re = i02_re; a2_im = i02_im;
547  b0_re = i10_re; b0_im = i10_im;
548  b1_re = i11_re; b1_im = i11_im;
549  b2_re = i12_re; b2_im = i12_im;
550 
551  }
552 #endif // MULTI_GPU
553 
554  // reconstruct gauge matrix
555  RECONSTRUCT_GAUGE_MATRIX(1);
556 
557  // multiply row 0
558  spinorFloat A0_re = 0;
559  A0_re += gT00_re * a0_re;
560  A0_re -= gT00_im * a0_im;
561  A0_re += gT01_re * a1_re;
562  A0_re -= gT01_im * a1_im;
563  A0_re += gT02_re * a2_re;
564  A0_re -= gT02_im * a2_im;
565  spinorFloat A0_im = 0;
566  A0_im += gT00_re * a0_im;
567  A0_im += gT00_im * a0_re;
568  A0_im += gT01_re * a1_im;
569  A0_im += gT01_im * a1_re;
570  A0_im += gT02_re * a2_im;
571  A0_im += gT02_im * a2_re;
572  spinorFloat B0_re = 0;
573  B0_re += gT00_re * b0_re;
574  B0_re -= gT00_im * b0_im;
575  B0_re += gT01_re * b1_re;
576  B0_re -= gT01_im * b1_im;
577  B0_re += gT02_re * b2_re;
578  B0_re -= gT02_im * b2_im;
579  spinorFloat B0_im = 0;
580  B0_im += gT00_re * b0_im;
581  B0_im += gT00_im * b0_re;
582  B0_im += gT01_re * b1_im;
583  B0_im += gT01_im * b1_re;
584  B0_im += gT02_re * b2_im;
585  B0_im += gT02_im * b2_re;
586 
587  // multiply row 1
588  spinorFloat A1_re = 0;
589  A1_re += gT10_re * a0_re;
590  A1_re -= gT10_im * a0_im;
591  A1_re += gT11_re * a1_re;
592  A1_re -= gT11_im * a1_im;
593  A1_re += gT12_re * a2_re;
594  A1_re -= gT12_im * a2_im;
595  spinorFloat A1_im = 0;
596  A1_im += gT10_re * a0_im;
597  A1_im += gT10_im * a0_re;
598  A1_im += gT11_re * a1_im;
599  A1_im += gT11_im * a1_re;
600  A1_im += gT12_re * a2_im;
601  A1_im += gT12_im * a2_re;
602  spinorFloat B1_re = 0;
603  B1_re += gT10_re * b0_re;
604  B1_re -= gT10_im * b0_im;
605  B1_re += gT11_re * b1_re;
606  B1_re -= gT11_im * b1_im;
607  B1_re += gT12_re * b2_re;
608  B1_re -= gT12_im * b2_im;
609  spinorFloat B1_im = 0;
610  B1_im += gT10_re * b0_im;
611  B1_im += gT10_im * b0_re;
612  B1_im += gT11_re * b1_im;
613  B1_im += gT11_im * b1_re;
614  B1_im += gT12_re * b2_im;
615  B1_im += gT12_im * b2_re;
616 
617  // multiply row 2
618  spinorFloat A2_re = 0;
619  A2_re += gT20_re * a0_re;
620  A2_re -= gT20_im * a0_im;
621  A2_re += gT21_re * a1_re;
622  A2_re -= gT21_im * a1_im;
623  A2_re += gT22_re * a2_re;
624  A2_re -= gT22_im * a2_im;
625  spinorFloat A2_im = 0;
626  A2_im += gT20_re * a0_im;
627  A2_im += gT20_im * a0_re;
628  A2_im += gT21_re * a1_im;
629  A2_im += gT21_im * a1_re;
630  A2_im += gT22_re * a2_im;
631  A2_im += gT22_im * a2_re;
632  spinorFloat B2_re = 0;
633  B2_re += gT20_re * b0_re;
634  B2_re -= gT20_im * b0_im;
635  B2_re += gT21_re * b1_re;
636  B2_re -= gT21_im * b1_im;
637  B2_re += gT22_re * b2_re;
638  B2_re -= gT22_im * b2_im;
639  spinorFloat B2_im = 0;
640  B2_im += gT20_re * b0_im;
641  B2_im += gT20_im * b0_re;
642  B2_im += gT21_re * b1_im;
643  B2_im += gT21_im * b1_re;
644  B2_im += gT22_re * b2_im;
645  B2_im += gT22_im * b2_re;
646 
647  o00_re += A0_re;
648  o00_im += A0_im;
649  o10_re += B0_re;
650  o10_im += B0_im;
651  o20_re += B0_im;
652  o20_im -= B0_re;
653  o30_re += A0_im;
654  o30_im -= A0_re;
655 
656  o01_re += A1_re;
657  o01_im += A1_im;
658  o11_re += B1_re;
659  o11_im += B1_im;
660  o21_re += B1_im;
661  o21_im -= B1_re;
662  o31_re += A1_im;
663  o31_im -= A1_re;
664 
665  o02_re += A2_re;
666  o02_im += A2_im;
667  o12_re += B2_re;
668  o12_im += B2_im;
669  o22_re += B2_im;
670  o22_im -= B2_re;
671  o32_re += A2_im;
672  o32_im -= A2_re;
673 }
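/* Editor's note (sketch): the backward hop from x-mu to x carries the
   conjugated link of the *neighbouring* site, which is why this block indexes
   the gauge field by sp_idx (or by Vh + a face offset into the gauge ghost
   zone when the neighbour is off-node) and multiplies with the gT macros:

     out += (1 + gamma_mu) * U_mu(x - mu)^dagger * psi(x - mu);
*/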
674 
675 #ifdef MULTI_GPU
676 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2<X2m1)) ||
677  (kernel_type == EXTERIOR_KERNEL_Y && x2==X2m1) )
678 #endif
679 {
680  // Projector P1-
681  // 1 0 0 -1
682  // 0 1 1 0
683  // 0 1 1 0
684  // -1 0 0 1
685 
686 #ifdef MULTI_GPU
687  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1 :
688  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
689 #else
690  const int sp_idx = (x2==X2m1 ? X-X2X1mX1 : X+X1) >> 1;
691 #endif
692 
693  const int ga_idx = sid % Vh;
694 
695  // read gauge matrix from device memory
696  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 2, ga_idx, ga_stride); }
697  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 2, ga_idx, ga_stride); }
698 
698 
699  spinorFloat a0_re, a0_im;
700  spinorFloat a1_re, a1_im;
701  spinorFloat a2_re, a2_im;
702  spinorFloat b0_re, b0_im;
703  spinorFloat b1_re, b1_im;
704  spinorFloat b2_re, b2_im;
705 
706 #ifdef MULTI_GPU
707  if (kernel_type == INTERIOR_KERNEL) {
708 #endif
709 
710  // read spinor from device memory
711  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
712 
713  // project spinor into half spinors
714  a0_re = +i00_re-i30_re;
715  a0_im = +i00_im-i30_im;
716  a1_re = +i01_re-i31_re;
717  a1_im = +i01_im-i31_im;
718  a2_re = +i02_re-i32_re;
719  a2_im = +i02_im-i32_im;
720  b0_re = +i10_re+i20_re;
721  b0_im = +i10_im+i20_im;
722  b1_re = +i11_re+i21_re;
723  b1_im = +i11_im+i21_im;
724  b2_re = +i12_re+i22_re;
725  b2_im = +i12_im+i22_im;
726 
727 #ifdef MULTI_GPU
728  } else {
729 
730  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
731 
732  // read half spinor from device memory
733  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
734 
735  a0_re = i00_re; a0_im = i00_im;
736  a1_re = i01_re; a1_im = i01_im;
737  a2_re = i02_re; a2_im = i02_im;
738  b0_re = i10_re; b0_im = i10_im;
739  b1_re = i11_re; b1_im = i11_im;
740  b2_re = i12_re; b2_im = i12_im;
741 
742  }
743 #endif // MULTI_GPU
744 
745  // reconstruct gauge matrix
746  RECONSTRUCT_GAUGE_MATRIX(2);
747 
748  // multiply row 0
749  spinorFloat A0_re = 0;
750  A0_re += g00_re * a0_re;
751  A0_re -= g00_im * a0_im;
752  A0_re += g01_re * a1_re;
753  A0_re -= g01_im * a1_im;
754  A0_re += g02_re * a2_re;
755  A0_re -= g02_im * a2_im;
756  spinorFloat A0_im = 0;
757  A0_im += g00_re * a0_im;
758  A0_im += g00_im * a0_re;
759  A0_im += g01_re * a1_im;
760  A0_im += g01_im * a1_re;
761  A0_im += g02_re * a2_im;
762  A0_im += g02_im * a2_re;
763  spinorFloat B0_re = 0;
764  B0_re += g00_re * b0_re;
765  B0_re -= g00_im * b0_im;
766  B0_re += g01_re * b1_re;
767  B0_re -= g01_im * b1_im;
768  B0_re += g02_re * b2_re;
769  B0_re -= g02_im * b2_im;
770  spinorFloat B0_im = 0;
771  B0_im += g00_re * b0_im;
772  B0_im += g00_im * b0_re;
773  B0_im += g01_re * b1_im;
774  B0_im += g01_im * b1_re;
775  B0_im += g02_re * b2_im;
776  B0_im += g02_im * b2_re;
777 
778  // multiply row 1
779  spinorFloat A1_re = 0;
780  A1_re += g10_re * a0_re;
781  A1_re -= g10_im * a0_im;
782  A1_re += g11_re * a1_re;
783  A1_re -= g11_im * a1_im;
784  A1_re += g12_re * a2_re;
785  A1_re -= g12_im * a2_im;
786  spinorFloat A1_im = 0;
787  A1_im += g10_re * a0_im;
788  A1_im += g10_im * a0_re;
789  A1_im += g11_re * a1_im;
790  A1_im += g11_im * a1_re;
791  A1_im += g12_re * a2_im;
792  A1_im += g12_im * a2_re;
793  spinorFloat B1_re = 0;
794  B1_re += g10_re * b0_re;
795  B1_re -= g10_im * b0_im;
796  B1_re += g11_re * b1_re;
797  B1_re -= g11_im * b1_im;
798  B1_re += g12_re * b2_re;
799  B1_re -= g12_im * b2_im;
800  spinorFloat B1_im = 0;
801  B1_im += g10_re * b0_im;
802  B1_im += g10_im * b0_re;
803  B1_im += g11_re * b1_im;
804  B1_im += g11_im * b1_re;
805  B1_im += g12_re * b2_im;
806  B1_im += g12_im * b2_re;
807 
808  // multiply row 2
809  spinorFloat A2_re = 0;
810  A2_re += g20_re * a0_re;
811  A2_re -= g20_im * a0_im;
812  A2_re += g21_re * a1_re;
813  A2_re -= g21_im * a1_im;
814  A2_re += g22_re * a2_re;
815  A2_re -= g22_im * a2_im;
816  spinorFloat A2_im = 0;
817  A2_im += g20_re * a0_im;
818  A2_im += g20_im * a0_re;
819  A2_im += g21_re * a1_im;
820  A2_im += g21_im * a1_re;
821  A2_im += g22_re * a2_im;
822  A2_im += g22_im * a2_re;
823  spinorFloat B2_re = 0;
824  B2_re += g20_re * b0_re;
825  B2_re -= g20_im * b0_im;
826  B2_re += g21_re * b1_re;
827  B2_re -= g21_im * b1_im;
828  B2_re += g22_re * b2_re;
829  B2_re -= g22_im * b2_im;
830  spinorFloat B2_im = 0;
831  B2_im += g20_re * b0_im;
832  B2_im += g20_im * b0_re;
833  B2_im += g21_re * b1_im;
834  B2_im += g21_im * b1_re;
835  B2_im += g22_re * b2_im;
836  B2_im += g22_im * b2_re;
837 
838  o00_re += A0_re;
839  o00_im += A0_im;
840  o10_re += B0_re;
841  o10_im += B0_im;
842  o20_re += B0_re;
843  o20_im += B0_im;
844  o30_re -= A0_re;
845  o30_im -= A0_im;
846 
847  o01_re += A1_re;
848  o01_im += A1_im;
849  o11_re += B1_re;
850  o11_im += B1_im;
851  o21_re += B1_re;
852  o21_im += B1_im;
853  o31_re -= A1_re;
854  o31_im -= A1_im;
855 
856  o02_re += A2_re;
857  o02_im += A2_im;
858  o12_re += B2_re;
859  o12_im += B2_im;
860  o22_re += B2_re;
861  o22_im += B2_im;
862  o32_re -= A2_re;
863  o32_im -= A2_im;
864 }
865 
866 #ifdef MULTI_GPU
867 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[1] || x2>0)) ||
868  (kernel_type == EXTERIOR_KERNEL_Y && x2==0) )
869 #endif
870 {
871  // Projector P1+
872  // 1 0 0 1
873  // 0 1 -1 0
874  // 0 -1 1 0
875  // 1 0 0 1
876 
877 #ifdef MULTI_GPU
878  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x2==0 ? X+X2X1mX1 : X-X1) >> 1 :
879  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
880 #else
881  const int sp_idx = (x2==0 ? X+X2X1mX1 : X-X1) >> 1;
882 #endif
883 
884 #ifdef MULTI_GPU
885  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
886 #else
887  const int ga_idx = sp_idx % Vh;
888 #endif
889 
890  // read gauge matrix from device memory
891  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 3, ga_idx, ga_stride); }
892  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 3, ga_idx, ga_stride); }
893 
894  spinorFloat a0_re, a0_im;
895  spinorFloat a1_re, a1_im;
896  spinorFloat a2_re, a2_im;
897  spinorFloat b0_re, b0_im;
898  spinorFloat b1_re, b1_im;
899  spinorFloat b2_re, b2_im;
900 
901 #ifdef MULTI_GPU
902  if (kernel_type == INTERIOR_KERNEL) {
903 #endif
904 
905  // read spinor from device memory
906  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
907 
908  // project spinor into half spinors
909  a0_re = +i00_re+i30_re;
910  a0_im = +i00_im+i30_im;
911  a1_re = +i01_re+i31_re;
912  a1_im = +i01_im+i31_im;
913  a2_re = +i02_re+i32_re;
914  a2_im = +i02_im+i32_im;
915  b0_re = +i10_re-i20_re;
916  b0_im = +i10_im-i20_im;
917  b1_re = +i11_re-i21_re;
918  b1_im = +i11_im-i21_im;
919  b2_re = +i12_re-i22_re;
920  b2_im = +i12_im-i22_im;
921 
922 #ifdef MULTI_GPU
923  } else {
924 
925  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
926 
927  // read half spinor from device memory
928  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
929 
930  a0_re = i00_re; a0_im = i00_im;
931  a1_re = i01_re; a1_im = i01_im;
932  a2_re = i02_re; a2_im = i02_im;
933  b0_re = i10_re; b0_im = i10_im;
934  b1_re = i11_re; b1_im = i11_im;
935  b2_re = i12_re; b2_im = i12_im;
936 
937  }
938 #endif // MULTI_GPU
939 
940  // reconstruct gauge matrix
941  RECONSTRUCT_GAUGE_MATRIX(3);
942 
943  // multiply row 0
944  spinorFloat A0_re = 0;
945  A0_re += gT00_re * a0_re;
946  A0_re -= gT00_im * a0_im;
947  A0_re += gT01_re * a1_re;
948  A0_re -= gT01_im * a1_im;
949  A0_re += gT02_re * a2_re;
950  A0_re -= gT02_im * a2_im;
951  spinorFloat A0_im = 0;
952  A0_im += gT00_re * a0_im;
953  A0_im += gT00_im * a0_re;
954  A0_im += gT01_re * a1_im;
955  A0_im += gT01_im * a1_re;
956  A0_im += gT02_re * a2_im;
957  A0_im += gT02_im * a2_re;
958  spinorFloat B0_re = 0;
959  B0_re += gT00_re * b0_re;
960  B0_re -= gT00_im * b0_im;
961  B0_re += gT01_re * b1_re;
962  B0_re -= gT01_im * b1_im;
963  B0_re += gT02_re * b2_re;
964  B0_re -= gT02_im * b2_im;
965  spinorFloat B0_im = 0;
966  B0_im += gT00_re * b0_im;
967  B0_im += gT00_im * b0_re;
968  B0_im += gT01_re * b1_im;
969  B0_im += gT01_im * b1_re;
970  B0_im += gT02_re * b2_im;
971  B0_im += gT02_im * b2_re;
972 
973  // multiply row 1
974  spinorFloat A1_re = 0;
975  A1_re += gT10_re * a0_re;
976  A1_re -= gT10_im * a0_im;
977  A1_re += gT11_re * a1_re;
978  A1_re -= gT11_im * a1_im;
979  A1_re += gT12_re * a2_re;
980  A1_re -= gT12_im * a2_im;
981  spinorFloat A1_im = 0;
982  A1_im += gT10_re * a0_im;
983  A1_im += gT10_im * a0_re;
984  A1_im += gT11_re * a1_im;
985  A1_im += gT11_im * a1_re;
986  A1_im += gT12_re * a2_im;
987  A1_im += gT12_im * a2_re;
988  spinorFloat B1_re = 0;
989  B1_re += gT10_re * b0_re;
990  B1_re -= gT10_im * b0_im;
991  B1_re += gT11_re * b1_re;
992  B1_re -= gT11_im * b1_im;
993  B1_re += gT12_re * b2_re;
994  B1_re -= gT12_im * b2_im;
995  spinorFloat B1_im = 0;
996  B1_im += gT10_re * b0_im;
997  B1_im += gT10_im * b0_re;
998  B1_im += gT11_re * b1_im;
999  B1_im += gT11_im * b1_re;
1000  B1_im += gT12_re * b2_im;
1001  B1_im += gT12_im * b2_re;
1002 
1003  // multiply row 2
1004  spinorFloat A2_re = 0;
1005  A2_re += gT20_re * a0_re;
1006  A2_re -= gT20_im * a0_im;
1007  A2_re += gT21_re * a1_re;
1008  A2_re -= gT21_im * a1_im;
1009  A2_re += gT22_re * a2_re;
1010  A2_re -= gT22_im * a2_im;
1011  spinorFloat A2_im = 0;
1012  A2_im += gT20_re * a0_im;
1013  A2_im += gT20_im * a0_re;
1014  A2_im += gT21_re * a1_im;
1015  A2_im += gT21_im * a1_re;
1016  A2_im += gT22_re * a2_im;
1017  A2_im += gT22_im * a2_re;
1018  spinorFloat B2_re = 0;
1019  B2_re += gT20_re * b0_re;
1020  B2_re -= gT20_im * b0_im;
1021  B2_re += gT21_re * b1_re;
1022  B2_re -= gT21_im * b1_im;
1023  B2_re += gT22_re * b2_re;
1024  B2_re -= gT22_im * b2_im;
1025  spinorFloat B2_im = 0;
1026  B2_im += gT20_re * b0_im;
1027  B2_im += gT20_im * b0_re;
1028  B2_im += gT21_re * b1_im;
1029  B2_im += gT21_im * b1_re;
1030  B2_im += gT22_re * b2_im;
1031  B2_im += gT22_im * b2_re;
1032 
1033  o00_re += A0_re;
1034  o00_im += A0_im;
1035  o10_re += B0_re;
1036  o10_im += B0_im;
1037  o20_re -= B0_re;
1038  o20_im -= B0_im;
1039  o30_re += A0_re;
1040  o30_im += A0_im;
1041 
1042  o01_re += A1_re;
1043  o01_im += A1_im;
1044  o11_re += B1_re;
1045  o11_im += B1_im;
1046  o21_re -= B1_re;
1047  o21_im -= B1_im;
1048  o31_re += A1_re;
1049  o31_im += A1_im;
1050 
1051  o02_re += A2_re;
1052  o02_im += A2_im;
1053  o12_re += B2_re;
1054  o12_im += B2_im;
1055  o22_re -= B2_re;
1056  o22_im -= B2_im;
1057  o32_re += A2_re;
1058  o32_im += A2_im;
1059 }
1060 
1061 #ifdef MULTI_GPU
1062 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3<X3m1)) ||
1063  (kernel_type == EXTERIOR_KERNEL_Z && x3==X3m1) )
1064 #endif
1065 {
1066  // Projector P2-
1067  // 1 0 -i 0
1068  // 0 1 0 i
1069  // i 0 1 0
1070  // 0 -i 0 1
1071 
1072 #ifdef MULTI_GPU
1073  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1 :
1074  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1075 #else
1076  const int sp_idx = (x3==X3m1 ? X-X3X2X1mX2X1 : X+X2X1) >> 1;
1077 #endif
1078 
1079  const int ga_idx = sid % Vh;
1080 
1081  // read gauge matrix from device memory
1082  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 4, ga_idx, ga_stride); }
1083  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 4, ga_idx, ga_stride); }
1084 
1085  spinorFloat a0_re, a0_im;
1086  spinorFloat a1_re, a1_im;
1087  spinorFloat a2_re, a2_im;
1088  spinorFloat b0_re, b0_im;
1089  spinorFloat b1_re, b1_im;
1090  spinorFloat b2_re, b2_im;
1091 
1092 #ifdef MULTI_GPU
1093  if (kernel_type == INTERIOR_KERNEL) {
1094 #endif
1095 
1096  // read spinor from device memory
1097  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1098 
1099  // project spinor into half spinors
1100  a0_re = +i00_re+i20_im;
1101  a0_im = +i00_im-i20_re;
1102  a1_re = +i01_re+i21_im;
1103  a1_im = +i01_im-i21_re;
1104  a2_re = +i02_re+i22_im;
1105  a2_im = +i02_im-i22_re;
1106  b0_re = +i10_re-i30_im;
1107  b0_im = +i10_im+i30_re;
1108  b1_re = +i11_re-i31_im;
1109  b1_im = +i11_im+i31_re;
1110  b2_re = +i12_re-i32_im;
1111  b2_im = +i12_im+i32_re;
1112 
1113 #ifdef MULTI_GPU
1114  } else {
1115 
1116  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1117 
1118  // read half spinor from device memory
1119  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1120 
1121  a0_re = i00_re; a0_im = i00_im;
1122  a1_re = i01_re; a1_im = i01_im;
1123  a2_re = i02_re; a2_im = i02_im;
1124  b0_re = i10_re; b0_im = i10_im;
1125  b1_re = i11_re; b1_im = i11_im;
1126  b2_re = i12_re; b2_im = i12_im;
1127 
1128  }
1129 #endif // MULTI_GPU
1130 
1131  // reconstruct gauge matrix
1132  RECONSTRUCT_GAUGE_MATRIX(4);
1133 
1134  // multiply row 0
1135  spinorFloat A0_re = 0;
1136  A0_re += g00_re * a0_re;
1137  A0_re -= g00_im * a0_im;
1138  A0_re += g01_re * a1_re;
1139  A0_re -= g01_im * a1_im;
1140  A0_re += g02_re * a2_re;
1141  A0_re -= g02_im * a2_im;
1142  spinorFloat A0_im = 0;
1143  A0_im += g00_re * a0_im;
1144  A0_im += g00_im * a0_re;
1145  A0_im += g01_re * a1_im;
1146  A0_im += g01_im * a1_re;
1147  A0_im += g02_re * a2_im;
1148  A0_im += g02_im * a2_re;
1149  spinorFloat B0_re = 0;
1150  B0_re += g00_re * b0_re;
1151  B0_re -= g00_im * b0_im;
1152  B0_re += g01_re * b1_re;
1153  B0_re -= g01_im * b1_im;
1154  B0_re += g02_re * b2_re;
1155  B0_re -= g02_im * b2_im;
1156  spinorFloat B0_im = 0;
1157  B0_im += g00_re * b0_im;
1158  B0_im += g00_im * b0_re;
1159  B0_im += g01_re * b1_im;
1160  B0_im += g01_im * b1_re;
1161  B0_im += g02_re * b2_im;
1162  B0_im += g02_im * b2_re;
1163 
1164  // multiply row 1
1165  spinorFloat A1_re = 0;
1166  A1_re += g10_re * a0_re;
1167  A1_re -= g10_im * a0_im;
1168  A1_re += g11_re * a1_re;
1169  A1_re -= g11_im * a1_im;
1170  A1_re += g12_re * a2_re;
1171  A1_re -= g12_im * a2_im;
1172  spinorFloat A1_im = 0;
1173  A1_im += g10_re * a0_im;
1174  A1_im += g10_im * a0_re;
1175  A1_im += g11_re * a1_im;
1176  A1_im += g11_im * a1_re;
1177  A1_im += g12_re * a2_im;
1178  A1_im += g12_im * a2_re;
1179  spinorFloat B1_re = 0;
1180  B1_re += g10_re * b0_re;
1181  B1_re -= g10_im * b0_im;
1182  B1_re += g11_re * b1_re;
1183  B1_re -= g11_im * b1_im;
1184  B1_re += g12_re * b2_re;
1185  B1_re -= g12_im * b2_im;
1186  spinorFloat B1_im = 0;
1187  B1_im += g10_re * b0_im;
1188  B1_im += g10_im * b0_re;
1189  B1_im += g11_re * b1_im;
1190  B1_im += g11_im * b1_re;
1191  B1_im += g12_re * b2_im;
1192  B1_im += g12_im * b2_re;
1193 
1194  // multiply row 2
1195  spinorFloat A2_re = 0;
1196  A2_re += g20_re * a0_re;
1197  A2_re -= g20_im * a0_im;
1198  A2_re += g21_re * a1_re;
1199  A2_re -= g21_im * a1_im;
1200  A2_re += g22_re * a2_re;
1201  A2_re -= g22_im * a2_im;
1202  spinorFloat A2_im = 0;
1203  A2_im += g20_re * a0_im;
1204  A2_im += g20_im * a0_re;
1205  A2_im += g21_re * a1_im;
1206  A2_im += g21_im * a1_re;
1207  A2_im += g22_re * a2_im;
1208  A2_im += g22_im * a2_re;
1209  spinorFloat B2_re = 0;
1210  B2_re += g20_re * b0_re;
1211  B2_re -= g20_im * b0_im;
1212  B2_re += g21_re * b1_re;
1213  B2_re -= g21_im * b1_im;
1214  B2_re += g22_re * b2_re;
1215  B2_re -= g22_im * b2_im;
1216  spinorFloat B2_im = 0;
1217  B2_im += g20_re * b0_im;
1218  B2_im += g20_im * b0_re;
1219  B2_im += g21_re * b1_im;
1220  B2_im += g21_im * b1_re;
1221  B2_im += g22_re * b2_im;
1222  B2_im += g22_im * b2_re;
1223 
1224  o00_re += A0_re;
1225  o00_im += A0_im;
1226  o10_re += B0_re;
1227  o10_im += B0_im;
1228  o20_re -= A0_im;
1229  o20_im += A0_re;
1230  o30_re += B0_im;
1231  o30_im -= B0_re;
1232 
1233  o01_re += A1_re;
1234  o01_im += A1_im;
1235  o11_re += B1_re;
1236  o11_im += B1_im;
1237  o21_re -= A1_im;
1238  o21_im += A1_re;
1239  o31_re += B1_im;
1240  o31_im -= B1_re;
1241 
1242  o02_re += A2_re;
1243  o02_im += A2_im;
1244  o12_re += B2_re;
1245  o12_im += B2_im;
1246  o22_re -= A2_im;
1247  o22_im += A2_re;
1248  o32_re += B2_im;
1249  o32_im -= B2_re;
1250 }
1251 
1252 #ifdef MULTI_GPU
1253 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[2] || x3>0)) ||
1254  (kernel_type == EXTERIOR_KERNEL_Z && x3==0) )
1255 #endif
1256 {
1257  // Projector P2+
1258  // 1 0 i 0
1259  // 0 1 0 -i
1260  // -i 0 1 0
1261  // 0 i 0 1
1262 
1263 #ifdef MULTI_GPU
1264  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1 :
1265  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1266 #else
1267  const int sp_idx = (x3==0 ? X+X3X2X1mX2X1 : X-X2X1) >> 1;
1268 #endif
1269 
1270 #ifdef MULTI_GPU
1271  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1272 #else
1273  const int ga_idx = sp_idx % Vh;
1274 #endif
1275 
1276  // read gauge matrix from device memory
1277  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 5, ga_idx, ga_stride); }
1278  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 5, ga_idx, ga_stride); }
1279 
1280  spinorFloat a0_re, a0_im;
1281  spinorFloat a1_re, a1_im;
1282  spinorFloat a2_re, a2_im;
1283  spinorFloat b0_re, b0_im;
1284  spinorFloat b1_re, b1_im;
1285  spinorFloat b2_re, b2_im;
1286 
1287 #ifdef MULTI_GPU
1288  if (kernel_type == INTERIOR_KERNEL) {
1289 #endif
1290 
1291  // read spinor from device memory
1292  READ_SPINOR(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1293 
1294  // project spinor into half spinors
1295  a0_re = +i00_re-i20_im;
1296  a0_im = +i00_im+i20_re;
1297  a1_re = +i01_re-i21_im;
1298  a1_im = +i01_im+i21_re;
1299  a2_re = +i02_re-i22_im;
1300  a2_im = +i02_im+i22_re;
1301  b0_re = +i10_re+i30_im;
1302  b0_im = +i10_im-i30_re;
1303  b1_re = +i11_re+i31_im;
1304  b1_im = +i11_im-i31_re;
1305  b2_re = +i12_re+i32_im;
1306  b2_im = +i12_im-i32_re;
1307 
1308 #ifdef MULTI_GPU
1309  } else {
1310 
1311  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1312 
1313  // read half spinor from device memory
1314  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1315 
1316  a0_re = i00_re; a0_im = i00_im;
1317  a1_re = i01_re; a1_im = i01_im;
1318  a2_re = i02_re; a2_im = i02_im;
1319  b0_re = i10_re; b0_im = i10_im;
1320  b1_re = i11_re; b1_im = i11_im;
1321  b2_re = i12_re; b2_im = i12_im;
1322 
1323  }
1324 #endif // MULTI_GPU
1325 
1326  // reconstruct gauge matrix
1327  RECONSTRUCT_GAUGE_MATRIX(5);
1328 
1329  // multiply row 0
1330  spinorFloat A0_re = 0;
1331  A0_re += gT00_re * a0_re;
1332  A0_re -= gT00_im * a0_im;
1333  A0_re += gT01_re * a1_re;
1334  A0_re -= gT01_im * a1_im;
1335  A0_re += gT02_re * a2_re;
1336  A0_re -= gT02_im * a2_im;
1337  spinorFloat A0_im = 0;
1338  A0_im += gT00_re * a0_im;
1339  A0_im += gT00_im * a0_re;
1340  A0_im += gT01_re * a1_im;
1341  A0_im += gT01_im * a1_re;
1342  A0_im += gT02_re * a2_im;
1343  A0_im += gT02_im * a2_re;
1344  spinorFloat B0_re = 0;
1345  B0_re += gT00_re * b0_re;
1346  B0_re -= gT00_im * b0_im;
1347  B0_re += gT01_re * b1_re;
1348  B0_re -= gT01_im * b1_im;
1349  B0_re += gT02_re * b2_re;
1350  B0_re -= gT02_im * b2_im;
1351  spinorFloat B0_im = 0;
1352  B0_im += gT00_re * b0_im;
1353  B0_im += gT00_im * b0_re;
1354  B0_im += gT01_re * b1_im;
1355  B0_im += gT01_im * b1_re;
1356  B0_im += gT02_re * b2_im;
1357  B0_im += gT02_im * b2_re;
1358 
1359  // multiply row 1
1360  spinorFloat A1_re = 0;
1361  A1_re += gT10_re * a0_re;
1362  A1_re -= gT10_im * a0_im;
1363  A1_re += gT11_re * a1_re;
1364  A1_re -= gT11_im * a1_im;
1365  A1_re += gT12_re * a2_re;
1366  A1_re -= gT12_im * a2_im;
1367  spinorFloat A1_im = 0;
1368  A1_im += gT10_re * a0_im;
1369  A1_im += gT10_im * a0_re;
1370  A1_im += gT11_re * a1_im;
1371  A1_im += gT11_im * a1_re;
1372  A1_im += gT12_re * a2_im;
1373  A1_im += gT12_im * a2_re;
1374  spinorFloat B1_re = 0;
1375  B1_re += gT10_re * b0_re;
1376  B1_re -= gT10_im * b0_im;
1377  B1_re += gT11_re * b1_re;
1378  B1_re -= gT11_im * b1_im;
1379  B1_re += gT12_re * b2_re;
1380  B1_re -= gT12_im * b2_im;
1381  spinorFloat B1_im = 0;
1382  B1_im += gT10_re * b0_im;
1383  B1_im += gT10_im * b0_re;
1384  B1_im += gT11_re * b1_im;
1385  B1_im += gT11_im * b1_re;
1386  B1_im += gT12_re * b2_im;
1387  B1_im += gT12_im * b2_re;
1388 
1389  // multiply row 2
1390  spinorFloat A2_re = 0;
1391  A2_re += gT20_re * a0_re;
1392  A2_re -= gT20_im * a0_im;
1393  A2_re += gT21_re * a1_re;
1394  A2_re -= gT21_im * a1_im;
1395  A2_re += gT22_re * a2_re;
1396  A2_re -= gT22_im * a2_im;
1397  spinorFloat A2_im = 0;
1398  A2_im += gT20_re * a0_im;
1399  A2_im += gT20_im * a0_re;
1400  A2_im += gT21_re * a1_im;
1401  A2_im += gT21_im * a1_re;
1402  A2_im += gT22_re * a2_im;
1403  A2_im += gT22_im * a2_re;
1404  spinorFloat B2_re = 0;
1405  B2_re += gT20_re * b0_re;
1406  B2_re -= gT20_im * b0_im;
1407  B2_re += gT21_re * b1_re;
1408  B2_re -= gT21_im * b1_im;
1409  B2_re += gT22_re * b2_re;
1410  B2_re -= gT22_im * b2_im;
1411  spinorFloat B2_im = 0;
1412  B2_im += gT20_re * b0_im;
1413  B2_im += gT20_im * b0_re;
1414  B2_im += gT21_re * b1_im;
1415  B2_im += gT21_im * b1_re;
1416  B2_im += gT22_re * b2_im;
1417  B2_im += gT22_im * b2_re;
1418 
1419  o00_re += A0_re;
1420  o00_im += A0_im;
1421  o10_re += B0_re;
1422  o10_im += B0_im;
1423  o20_re += A0_im;
1424  o20_im -= A0_re;
1425  o30_re -= B0_im;
1426  o30_im += B0_re;
1427 
1428  o01_re += A1_re;
1429  o01_im += A1_im;
1430  o11_re += B1_re;
1431  o11_im += B1_im;
1432  o21_re += A1_im;
1433  o21_im -= A1_re;
1434  o31_re -= B1_im;
1435  o31_im += B1_re;
1436 
1437  o02_re += A2_re;
1438  o02_im += A2_im;
1439  o12_re += B2_re;
1440  o12_im += B2_im;
1441  o22_re += A2_im;
1442  o22_im -= A2_re;
1443  o32_re -= B2_im;
1444  o32_im += B2_re;
1445 }
1446 
1447 #ifdef MULTI_GPU
1448 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4<X4m1)) ||
1449  (kernel_type == EXTERIOR_KERNEL_T && x4==X4m1) )
1450 #endif
1451 {
1452  // Projector P3-
1453  // 0 0 0 0
1454  // 0 0 0 0
1455  // 0 0 2 0
1456  // 0 0 0 2
1457 
1458 #ifdef MULTI_GPU
1459  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1 :
1460  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1461 #else
1462  const int sp_idx = (x4==X4m1 ? X-X4X3X2X1mX3X2X1 : X+X3X2X1) >> 1;
1463 #endif
1464 
1465  const int ga_idx = sid % Vh;
1466 
1467  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1468  {
1469  spinorFloat a0_re, a0_im;
1470  spinorFloat a1_re, a1_im;
1471  spinorFloat a2_re, a2_im;
1472  spinorFloat b0_re, b0_im;
1473  spinorFloat b1_re, b1_im;
1474  spinorFloat b2_re, b2_im;
1475 
1476 #ifdef MULTI_GPU
1477  if (kernel_type == INTERIOR_KERNEL) {
1478 #endif
1479 
1480  // read spinor from device memory
1481  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1482 
1483  // project spinor into half spinors
1484  a0_re = +2*i20_re;
1485  a0_im = +2*i20_im;
1486  a1_re = +2*i21_re;
1487  a1_im = +2*i21_im;
1488  a2_re = +2*i22_re;
1489  a2_im = +2*i22_im;
1490  b0_re = +2*i30_re;
1491  b0_im = +2*i30_im;
1492  b1_re = +2*i31_re;
1493  b1_im = +2*i31_im;
1494  b2_re = +2*i32_re;
1495  b2_im = +2*i32_im;
1496 
1497 #ifdef MULTI_GPU
1498  } else {
1499 
1500  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1501  const int t_proj_scale = TPROJSCALE;
1502 
1503  // read half spinor from device memory
1504  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1505 
1506  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1507  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1508  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1509  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1510  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1511  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1512 
1513  }
1514 #endif // MULTI_GPU
1515 
1516  // identity gauge matrix
1517  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
1518  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
1519  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
1520  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
1521  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
1522  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
1523 
1524  o20_re += A0_re;
1525  o20_im += A0_im;
1526  o30_re += B0_re;
1527  o30_im += B0_im;
1528 
1529  o21_re += A1_re;
1530  o21_im += A1_im;
1531  o31_re += B1_re;
1532  o31_im += B1_im;
1533 
1534  o22_re += A2_re;
1535  o22_im += A2_im;
1536  o32_re += B2_re;
1537  o32_im += B2_im;
1538  } else {
1539  // read gauge matrix from device memory
1540  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 6, ga_idx, ga_stride); }
1541  else { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 6, ga_idx, ga_stride); }
1542 
1543  spinorFloat a0_re, a0_im;
1544  spinorFloat a1_re, a1_im;
1545  spinorFloat a2_re, a2_im;
1546  spinorFloat b0_re, b0_im;
1547  spinorFloat b1_re, b1_im;
1548  spinorFloat b2_re, b2_im;
1549 
1550 #ifdef MULTI_GPU
1551  if (kernel_type == INTERIOR_KERNEL) {
1552 #endif
1553 
1554  // read spinor from device memory
1555  READ_SPINOR_DOWN(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1556 
1557  // project spinor into half spinors
1558  a0_re = +2*i20_re;
1559  a0_im = +2*i20_im;
1560  a1_re = +2*i21_re;
1561  a1_im = +2*i21_im;
1562  a2_re = +2*i22_re;
1563  a2_im = +2*i22_im;
1564  b0_re = +2*i30_re;
1565  b0_im = +2*i30_im;
1566  b1_re = +2*i31_re;
1567  b1_im = +2*i31_im;
1568  b2_re = +2*i32_re;
1569  b2_im = +2*i32_im;
1570 
1571 #ifdef MULTI_GPU
1572  } else {
1573 
1574  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1575  const int t_proj_scale = TPROJSCALE;
1576 
1577  // read half spinor from device memory
1578  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx + (SPINOR_HOP/2)*sp_stride_pad, sp_norm_idx);
1579 
1580  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1581  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1582  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1583  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1584  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1585  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1586 
1587  }
1588 #endif // MULTI_GPU
1589 
1590  // reconstruct gauge matrix
1591  RECONSTRUCT_GAUGE_MATRIX(6);
1592 
1593  // multiply row 0
1594  spinorFloat A0_re = 0;
1595  A0_re += g00_re * a0_re;
1596  A0_re -= g00_im * a0_im;
1597  A0_re += g01_re * a1_re;
1598  A0_re -= g01_im * a1_im;
1599  A0_re += g02_re * a2_re;
1600  A0_re -= g02_im * a2_im;
1601  spinorFloat A0_im = 0;
1602  A0_im += g00_re * a0_im;
1603  A0_im += g00_im * a0_re;
1604  A0_im += g01_re * a1_im;
1605  A0_im += g01_im * a1_re;
1606  A0_im += g02_re * a2_im;
1607  A0_im += g02_im * a2_re;
1608  spinorFloat B0_re = 0;
1609  B0_re += g00_re * b0_re;
1610  B0_re -= g00_im * b0_im;
1611  B0_re += g01_re * b1_re;
1612  B0_re -= g01_im * b1_im;
1613  B0_re += g02_re * b2_re;
1614  B0_re -= g02_im * b2_im;
1615  spinorFloat B0_im = 0;
1616  B0_im += g00_re * b0_im;
1617  B0_im += g00_im * b0_re;
1618  B0_im += g01_re * b1_im;
1619  B0_im += g01_im * b1_re;
1620  B0_im += g02_re * b2_im;
1621  B0_im += g02_im * b2_re;
1622 
1623  // multiply row 1
1624  spinorFloat A1_re = 0;
1625  A1_re += g10_re * a0_re;
1626  A1_re -= g10_im * a0_im;
1627  A1_re += g11_re * a1_re;
1628  A1_re -= g11_im * a1_im;
1629  A1_re += g12_re * a2_re;
1630  A1_re -= g12_im * a2_im;
1631  spinorFloat A1_im = 0;
1632  A1_im += g10_re * a0_im;
1633  A1_im += g10_im * a0_re;
1634  A1_im += g11_re * a1_im;
1635  A1_im += g11_im * a1_re;
1636  A1_im += g12_re * a2_im;
1637  A1_im += g12_im * a2_re;
1638  spinorFloat B1_re = 0;
1639  B1_re += g10_re * b0_re;
1640  B1_re -= g10_im * b0_im;
1641  B1_re += g11_re * b1_re;
1642  B1_re -= g11_im * b1_im;
1643  B1_re += g12_re * b2_re;
1644  B1_re -= g12_im * b2_im;
1645  spinorFloat B1_im = 0;
1646  B1_im += g10_re * b0_im;
1647  B1_im += g10_im * b0_re;
1648  B1_im += g11_re * b1_im;
1649  B1_im += g11_im * b1_re;
1650  B1_im += g12_re * b2_im;
1651  B1_im += g12_im * b2_re;
1652 
1653  // multiply row 2
1654  spinorFloat A2_re = 0;
1655  A2_re += g20_re * a0_re;
1656  A2_re -= g20_im * a0_im;
1657  A2_re += g21_re * a1_re;
1658  A2_re -= g21_im * a1_im;
1659  A2_re += g22_re * a2_re;
1660  A2_re -= g22_im * a2_im;
1661  spinorFloat A2_im = 0;
1662  A2_im += g20_re * a0_im;
1663  A2_im += g20_im * a0_re;
1664  A2_im += g21_re * a1_im;
1665  A2_im += g21_im * a1_re;
1666  A2_im += g22_re * a2_im;
1667  A2_im += g22_im * a2_re;
1668  spinorFloat B2_re = 0;
1669  B2_re += g20_re * b0_re;
1670  B2_re -= g20_im * b0_im;
1671  B2_re += g21_re * b1_re;
1672  B2_re -= g21_im * b1_im;
1673  B2_re += g22_re * b2_re;
1674  B2_re -= g22_im * b2_im;
1675  spinorFloat B2_im = 0;
1676  B2_im += g20_re * b0_im;
1677  B2_im += g20_im * b0_re;
1678  B2_im += g21_re * b1_im;
1679  B2_im += g21_im * b1_re;
1680  B2_im += g22_re * b2_im;
1681  B2_im += g22_im * b2_re;
1682 
1683  o20_re += A0_re;
1684  o20_im += A0_im;
1685  o30_re += B0_re;
1686  o30_im += B0_im;
1687 
1688  o21_re += A1_re;
1689  o21_im += A1_im;
1690  o31_re += B1_re;
1691  o31_im += B1_im;
1692 
1693  o22_re += A2_re;
1694  o22_im += A2_im;
1695  o32_re += B2_re;
1696  o32_im += B2_im;
1697  }
1698 }
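/* Editor's note (sketch): with a temporally gauge-fixed field every time-like
   link away from the last time slice is the identity, so the gauge_fixed
   branch above skips both the gauge load and the mat-vec and sets A = a,
   B = b directly. The guard

     gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h   // i.e. x4 < X4-1

   selects exactly the checkerboarded sites whose forward time link is trivial.
*/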
1699 
1700 #ifdef MULTI_GPU
1701 if ( (kernel_type == INTERIOR_KERNEL && (!param.ghostDim[3] || x4>0)) ||
1702  (kernel_type == EXTERIOR_KERNEL_T && x4==0) )
1703 #endif
1704 {
1705  // Projector P3+
1706  // 2 0 0 0
1707  // 0 2 0 0
1708  // 0 0 0 0
1709  // 0 0 0 0
1710 
1711 #ifdef MULTI_GPU
1712  const int sp_idx = (kernel_type == INTERIOR_KERNEL) ? (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1 :
1713  face_idx + param.ghostOffset[static_cast<int>(kernel_type)];
1714 #else
1715  const int sp_idx = (x4==0 ? X+X4X3X2X1mX3X2X1 : X-X3X2X1) >> 1;
1716 #endif
1717 
1718 #ifdef MULTI_GPU
1719  const int ga_idx = ((kernel_type == INTERIOR_KERNEL) ? sp_idx % Vh : Vh+(face_idx % ghostFace[static_cast<int>(kernel_type)]));
1720 #else
1721  const int ga_idx = sp_idx % Vh;
1722 #endif
1723 
1724  if (gauge_fixed && ga_idx < X4X3X2X1hmX3X2X1h)
1725  {
1726  spinorFloat a0_re, a0_im;
1727  spinorFloat a1_re, a1_im;
1728  spinorFloat a2_re, a2_im;
1729  spinorFloat b0_re, b0_im;
1730  spinorFloat b1_re, b1_im;
1731  spinorFloat b2_re, b2_im;
1732 
1733 #ifdef MULTI_GPU
1734  if (kernel_type == INTERIOR_KERNEL) {
1735 #endif
1736 
1737  // read spinor from device memory
1738  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1739 
1740  // project spinor into half spinors
1741  a0_re = +2*i00_re;
1742  a0_im = +2*i00_im;
1743  a1_re = +2*i01_re;
1744  a1_im = +2*i01_im;
1745  a2_re = +2*i02_re;
1746  a2_im = +2*i02_im;
1747  b0_re = +2*i10_re;
1748  b0_im = +2*i10_im;
1749  b1_re = +2*i11_re;
1750  b1_im = +2*i11_im;
1751  b2_re = +2*i12_re;
1752  b2_im = +2*i12_im;
1753 
1754 #ifdef MULTI_GPU
1755  } else {
1756 
1757  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1758  const int t_proj_scale = TPROJSCALE;
1759 
1760  // read half spinor from device memory
1761  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1762 
1763  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1764  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1765  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1766  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1767  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1768  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1769 
1770  }
1771 #endif // MULTI_GPU
1772 
1773  // identity gauge matrix
1774  spinorFloat A0_re = a0_re; spinorFloat A0_im = a0_im;
1775  spinorFloat A1_re = a1_re; spinorFloat A1_im = a1_im;
1776  spinorFloat A2_re = a2_re; spinorFloat A2_im = a2_im;
1777  spinorFloat B0_re = b0_re; spinorFloat B0_im = b0_im;
1778  spinorFloat B1_re = b1_re; spinorFloat B1_im = b1_im;
1779  spinorFloat B2_re = b2_re; spinorFloat B2_im = b2_im;
1780 
1781  o00_re += A0_re;
1782  o00_im += A0_im;
1783  o10_re += B0_re;
1784  o10_im += B0_im;
1785 
1786  o01_re += A1_re;
1787  o01_im += A1_im;
1788  o11_re += B1_re;
1789  o11_im += B1_im;
1790 
1791  o02_re += A2_re;
1792  o02_im += A2_im;
1793  o12_re += B2_re;
1794  o12_im += B2_im;
1795  } else {
1796  // read gauge matrix from device memory
1797  if ( ! s_parity ) { ASSN_GAUGE_MATRIX(G, GAUGE1TEX, 7, ga_idx, ga_stride); }
1798  else { ASSN_GAUGE_MATRIX(G, GAUGE0TEX, 7, ga_idx, ga_stride); }
1799 
1800  spinorFloat a0_re, a0_im;
1801  spinorFloat a1_re, a1_im;
1802  spinorFloat a2_re, a2_im;
1803  spinorFloat b0_re, b0_im;
1804  spinorFloat b1_re, b1_im;
1805  spinorFloat b2_re, b2_im;
1806 
1807 #ifdef MULTI_GPU
1808  if (kernel_type == INTERIOR_KERNEL) {
1809 #endif
1810 
1811  // read spinor from device memory
1812  READ_SPINOR_UP(SPINORTEX, param.sp_stride, sp_idx, sp_idx);
1813 
1814  // project spinor into half spinors
1815  a0_re = +2*i00_re;
1816  a0_im = +2*i00_im;
1817  a1_re = +2*i01_re;
1818  a1_im = +2*i01_im;
1819  a2_re = +2*i02_re;
1820  a2_im = +2*i02_im;
1821  b0_re = +2*i10_re;
1822  b0_im = +2*i10_im;
1823  b1_re = +2*i11_re;
1824  b1_im = +2*i11_im;
1825  b2_re = +2*i12_re;
1826  b2_im = +2*i12_im;
1827 
1828 #ifdef MULTI_GPU
1829  } else {
1830 
1831  const int sp_stride_pad = param.Ls*ghostFace[static_cast<int>(kernel_type)];
1832  const int t_proj_scale = TPROJSCALE;
1833 
1834  // read half spinor from device memory
1835  READ_HALF_SPINOR(SPINORTEX, sp_stride_pad, sp_idx, sp_norm_idx);
1836 
1837  a0_re = t_proj_scale*i00_re; a0_im = t_proj_scale*i00_im;
1838  a1_re = t_proj_scale*i01_re; a1_im = t_proj_scale*i01_im;
1839  a2_re = t_proj_scale*i02_re; a2_im = t_proj_scale*i02_im;
1840  b0_re = t_proj_scale*i10_re; b0_im = t_proj_scale*i10_im;
1841  b1_re = t_proj_scale*i11_re; b1_im = t_proj_scale*i11_im;
1842  b2_re = t_proj_scale*i12_re; b2_im = t_proj_scale*i12_im;
1843 
1844  }
1845 #endif // MULTI_GPU
1846 
1847  // reconstruct gauge matrix
1848  RECONSTRUCT_GAUGE_MATRIX(7);
1849 
1850  // multiply row 0
1851  spinorFloat A0_re = 0;
1852  A0_re += gT00_re * a0_re;
1853  A0_re -= gT00_im * a0_im;
1854  A0_re += gT01_re * a1_re;
1855  A0_re -= gT01_im * a1_im;
1856  A0_re += gT02_re * a2_re;
1857  A0_re -= gT02_im * a2_im;
1858  spinorFloat A0_im = 0;
1859  A0_im += gT00_re * a0_im;
1860  A0_im += gT00_im * a0_re;
1861  A0_im += gT01_re * a1_im;
1862  A0_im += gT01_im * a1_re;
1863  A0_im += gT02_re * a2_im;
1864  A0_im += gT02_im * a2_re;
1865  spinorFloat B0_re = 0;
1866  B0_re += gT00_re * b0_re;
1867  B0_re -= gT00_im * b0_im;
1868  B0_re += gT01_re * b1_re;
1869  B0_re -= gT01_im * b1_im;
1870  B0_re += gT02_re * b2_re;
1871  B0_re -= gT02_im * b2_im;
1872  spinorFloat B0_im = 0;
1873  B0_im += gT00_re * b0_im;
1874  B0_im += gT00_im * b0_re;
1875  B0_im += gT01_re * b1_im;
1876  B0_im += gT01_im * b1_re;
1877  B0_im += gT02_re * b2_im;
1878  B0_im += gT02_im * b2_re;
1879 
1880  // multiply row 1
1881  spinorFloat A1_re = 0;
1882  A1_re += gT10_re * a0_re;
1883  A1_re -= gT10_im * a0_im;
1884  A1_re += gT11_re * a1_re;
1885  A1_re -= gT11_im * a1_im;
1886  A1_re += gT12_re * a2_re;
1887  A1_re -= gT12_im * a2_im;
1888  spinorFloat A1_im = 0;
1889  A1_im += gT10_re * a0_im;
1890  A1_im += gT10_im * a0_re;
1891  A1_im += gT11_re * a1_im;
1892  A1_im += gT11_im * a1_re;
1893  A1_im += gT12_re * a2_im;
1894  A1_im += gT12_im * a2_re;
1895  spinorFloat B1_re = 0;
1896  B1_re += gT10_re * b0_re;
1897  B1_re -= gT10_im * b0_im;
1898  B1_re += gT11_re * b1_re;
1899  B1_re -= gT11_im * b1_im;
1900  B1_re += gT12_re * b2_re;
1901  B1_re -= gT12_im * b2_im;
1902  spinorFloat B1_im = 0;
1903  B1_im += gT10_re * b0_im;
1904  B1_im += gT10_im * b0_re;
1905  B1_im += gT11_re * b1_im;
1906  B1_im += gT11_im * b1_re;
1907  B1_im += gT12_re * b2_im;
1908  B1_im += gT12_im * b2_re;
1909 
1910  // multiply row 2
1911  spinorFloat A2_re = 0;
1912  A2_re += gT20_re * a0_re;
1913  A2_re -= gT20_im * a0_im;
1914  A2_re += gT21_re * a1_re;
1915  A2_re -= gT21_im * a1_im;
1916  A2_re += gT22_re * a2_re;
1917  A2_re -= gT22_im * a2_im;
1918  spinorFloat A2_im = 0;
1919  A2_im += gT20_re * a0_im;
1920  A2_im += gT20_im * a0_re;
1921  A2_im += gT21_re * a1_im;
1922  A2_im += gT21_im * a1_re;
1923  A2_im += gT22_re * a2_im;
1924  A2_im += gT22_im * a2_re;
1925  spinorFloat B2_re = 0;
1926  B2_re += gT20_re * b0_re;
1927  B2_re -= gT20_im * b0_im;
1928  B2_re += gT21_re * b1_re;
1929  B2_re -= gT21_im * b1_im;
1930  B2_re += gT22_re * b2_re;
1931  B2_re -= gT22_im * b2_im;
1932  spinorFloat B2_im = 0;
1933  B2_im += gT20_re * b0_im;
1934  B2_im += gT20_im * b0_re;
1935  B2_im += gT21_re * b1_im;
1936  B2_im += gT21_im * b1_re;
1937  B2_im += gT22_re * b2_im;
1938  B2_im += gT22_im * b2_re;
1939 
1940  o00_re += A0_re;
1941  o00_im += A0_im;
1942  o10_re += B0_re;
1943  o10_im += B0_im;
1944 
1945  o01_re += A1_re;
1946  o01_im += A1_im;
1947  o11_re += B1_re;
1948  o11_im += B1_im;
1949 
1950  o02_re += A2_re;
1951  o02_im += A2_im;
1952  o12_re += B2_re;
1953  o12_im += B2_im;
1954  }
1955 }
1956 
1957 
1958 
1959 // 5th dimension -- NB: not partitionable!
1960 #ifdef MULTI_GPU
1961 if (kernel_type == INTERIOR_KERNEL)
1962 #endif
1963 {
1964 // 2 P_L = 2 P_- = ( ( +1, -1 ), ( -1, +1 ) )
1965  {
1966  int sp_idx = ( xs == param.Ls-1 ? X-(param.Ls-1)*2*Vh : X+2*Vh ) / 2;
1967 
1968 // read spinor from device memory
1969  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
1970 
1971  if ( xs != param.Ls-1 )
1972  {
1973  o00_re += +i00_re-i20_re; o00_im += +i00_im-i20_im;
1974  o01_re += +i01_re-i21_re; o01_im += +i01_im-i21_im;
1975  o02_re += +i02_re-i22_re; o02_im += +i02_im-i22_im;
1976 
1977  o10_re += +i10_re-i30_re; o10_im += +i10_im-i30_im;
1978  o11_re += +i11_re-i31_re; o11_im += +i11_im-i31_im;
1979  o12_re += +i12_re-i32_re; o12_im += +i12_im-i32_im;
1980 
1981  o20_re += -i00_re+i20_re; o20_im += -i00_im+i20_im;
1982  o21_re += -i01_re+i21_re; o21_im += -i01_im+i21_im;
1983  o22_re += -i02_re+i22_re; o22_im += -i02_im+i22_im;
1984 
1985  o30_re += -i10_re+i30_re; o30_im += -i10_im+i30_im;
1986  o31_re += -i11_re+i31_re; o31_im += -i11_im+i31_im;
1987  o32_re += -i12_re+i32_re; o32_im += -i12_im+i32_im;
1988  }
1989  else
1990  {
1991  o00_re += -mferm*(+i00_re-i20_re); o00_im += -mferm*(+i00_im-i20_im);
1992  o01_re += -mferm*(+i01_re-i21_re); o01_im += -mferm*(+i01_im-i21_im);
1993  o02_re += -mferm*(+i02_re-i22_re); o02_im += -mferm*(+i02_im-i22_im);
1994 
1995  o10_re += -mferm*(+i10_re-i30_re); o10_im += -mferm*(+i10_im-i30_im);
1996  o11_re += -mferm*(+i11_re-i31_re); o11_im += -mferm*(+i11_im-i31_im);
1997  o12_re += -mferm*(+i12_re-i32_re); o12_im += -mferm*(+i12_im-i32_im);
1998 
1999  o20_re += -mferm*(-i00_re+i20_re); o20_im += -mferm*(-i00_im+i20_im);
2000  o21_re += -mferm*(-i01_re+i21_re); o21_im += -mferm*(-i01_im+i21_im);
2001  o22_re += -mferm*(-i02_re+i22_re); o22_im += -mferm*(-i02_im+i22_im);
2002 
2003  o30_re += -mferm*(-i10_re+i30_re); o30_im += -mferm*(-i10_im+i30_im);
2004  o31_re += -mferm*(-i11_re+i31_re); o31_im += -mferm*(-i11_im+i31_im);
2005  o32_re += -mferm*(-i12_re+i32_re); o32_im += -mferm*(-i12_im+i32_im);
2006  } // end if ( xs != param.Ls-1 )
2007  } // end P_L
2008 
2009  // 2 P_R = 2 P_+ = ( ( +1, +1 ), ( +1, +1 ) )
2010  {
2011  int sp_idx = ( xs == 0 ? X+(param.Ls-1)*2*Vh : X-2*Vh ) / 2;
2012 
2013 // read spinor from device memory
2014  READ_SPINOR( SPINORTEX, param.sp_stride, sp_idx, sp_idx );
2015 
2016  if ( xs != 0 )
2017  {
2018  o00_re += +i00_re+i20_re; o00_im += +i00_im+i20_im;
2019  o01_re += +i01_re+i21_re; o01_im += +i01_im+i21_im;
2020  o02_re += +i02_re+i22_re; o02_im += +i02_im+i22_im;
2021 
2022  o10_re += +i10_re+i30_re; o10_im += +i10_im+i30_im;
2023  o11_re += +i11_re+i31_re; o11_im += +i11_im+i31_im;
2024  o12_re += +i12_re+i32_re; o12_im += +i12_im+i32_im;
2025 
2026  o20_re += +i00_re+i20_re; o20_im += +i00_im+i20_im;
2027  o21_re += +i01_re+i21_re; o21_im += +i01_im+i21_im;
2028  o22_re += +i02_re+i22_re; o22_im += +i02_im+i22_im;
2029 
2030  o30_re += +i10_re+i30_re; o30_im += +i10_im+i30_im;
2031  o31_re += +i11_re+i31_re; o31_im += +i11_im+i31_im;
2032  o32_re += +i12_re+i32_re; o32_im += +i12_im+i32_im;
2033  }
2034  else
2035  {
2036  o00_re += -mferm*(+i00_re+i20_re); o00_im += -mferm*(+i00_im+i20_im);
2037  o01_re += -mferm*(+i01_re+i21_re); o01_im += -mferm*(+i01_im+i21_im);
2038  o02_re += -mferm*(+i02_re+i22_re); o02_im += -mferm*(+i02_im+i22_im);
2039 
2040  o10_re += -mferm*(+i10_re+i30_re); o10_im += -mferm*(+i10_im+i30_im);
2041  o11_re += -mferm*(+i11_re+i31_re); o11_im += -mferm*(+i11_im+i31_im);
2042  o12_re += -mferm*(+i12_re+i32_re); o12_im += -mferm*(+i12_im+i32_im);
2043 
2044  o20_re += -mferm*(+i00_re+i20_re); o20_im += -mferm*(+i00_im+i20_im);
2045  o21_re += -mferm*(+i01_re+i21_re); o21_im += -mferm*(+i01_im+i21_im);
2046  o22_re += -mferm*(+i02_re+i22_re); o22_im += -mferm*(+i02_im+i22_im);
2047 
2048  o30_re += -mferm*(+i10_re+i30_re); o30_im += -mferm*(+i10_im+i30_im);
2049  o31_re += -mferm*(+i11_re+i31_re); o31_im += -mferm*(+i11_im+i31_im);
2050  o32_re += -mferm*(+i12_re+i32_re); o32_im += -mferm*(+i12_im+i32_im);
2051  } // end if ( xs != 0 )
2052  } // end P_R
2053 } // end 5th dimension
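/* Editor's note (worked sketch): the fifth-dimension term couples slice s to
   s+1 through 2P_L = 1 - gamma_5 and to s-1 through 2P_R = 1 + gamma_5. One
   s-slice is a full 4D checkerboard pair, so the neighbour is reached by
   stepping the lexicographic index by 2*Vh, and the hop wraps at the walls
   with the domain-wall mass attached, e.g. for the forward hop:

     int sp_idx = (xs == param.Ls-1 ? X - (param.Ls-1)*2*Vh : X + 2*Vh) / 2;
     // weight = (xs == param.Ls-1) ? -mferm : 1  -- hence the -mferm branches
*/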
2054 
2055 
2056 #if defined MULTI_GPU && defined DSLASH_XPAY
2057 if (kernel_type == INTERIOR_KERNEL)
2058 #endif
2059 {
2060 #ifdef DSLASH_XPAY
2061  READ_ACCUM(ACCUMTEX, param.sp_stride)
2062  VOLATILE spinorFloat a_inv = 1.0/a;
2063 
2064 #ifdef SPINOR_DOUBLE
2065  o00_re = o00_re + a_inv*accum0.x;
2066  o00_im = o00_im + a_inv*accum0.y;
2067  o01_re = o01_re + a_inv*accum1.x;
2068  o01_im = o01_im + a_inv*accum1.y;
2069  o02_re = o02_re + a_inv*accum2.x;
2070  o02_im = o02_im + a_inv*accum2.y;
2071  o10_re = o10_re + a_inv*accum3.x;
2072  o10_im = o10_im + a_inv*accum3.y;
2073  o11_re = o11_re + a_inv*accum4.x;
2074  o11_im = o11_im + a_inv*accum4.y;
2075  o12_re = o12_re + a_inv*accum5.x;
2076  o12_im = o12_im + a_inv*accum5.y;
2077  o20_re = o20_re + a_inv*accum6.x;
2078  o20_im = o20_im + a_inv*accum6.y;
2079  o21_re = o21_re + a_inv*accum7.x;
2080  o21_im = o21_im + a_inv*accum7.y;
2081  o22_re = o22_re + a_inv*accum8.x;
2082  o22_im = o22_im + a_inv*accum8.y;
2083  o30_re = o30_re + a_inv*accum9.x;
2084  o30_im = o30_im + a_inv*accum9.y;
2085  o31_re = o31_re + a_inv*accum10.x;
2086  o31_im = o31_im + a_inv*accum10.y;
2087  o32_re = o32_re + a_inv*accum11.x;
2088  o32_im = o32_im + a_inv*accum11.y;
2089 #else
2090  o00_re = o00_re + a_inv*accum0.x;
2091  o00_im = o00_im + a_inv*accum0.y;
2092  o01_re = o01_re + a_inv*accum0.z;
2093  o01_im = o01_im + a_inv*accum0.w;
2094  o02_re = o02_re + a_inv*accum1.x;
2095  o02_im = o02_im + a_inv*accum1.y;
2096  o10_re = o10_re + a_inv*accum1.z;
2097  o10_im = o10_im + a_inv*accum1.w;
2098  o11_re = o11_re + a_inv*accum2.x;
2099  o11_im = o11_im + a_inv*accum2.y;
2100  o12_re = o12_re + a_inv*accum2.z;
2101  o12_im = o12_im + a_inv*accum2.w;
2102  o20_re = o20_re + a_inv*accum3.x;
2103  o20_im = o20_im + a_inv*accum3.y;
2104  o21_re = o21_re + a_inv*accum3.z;
2105  o21_im = o21_im + a_inv*accum3.w;
2106  o22_re = o22_re + a_inv*accum4.x;
2107  o22_im = o22_im + a_inv*accum4.y;
2108  o30_re = o30_re + a_inv*accum4.z;
2109  o30_im = o30_im + a_inv*accum4.w;
2110  o31_re = o31_re + a_inv*accum5.x;
2111  o31_im = o31_im + a_inv*accum5.y;
2112  o32_re = o32_re + a_inv*accum5.z;
2113  o32_im = o32_im + a_inv*accum5.w;
2114 #endif // SPINOR_DOUBLE
2115 
2116 #endif // DSLASH_XPAY
2117 }
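/* Editor's note: the XPAY accumulate above adds x/a rather than x because the
   final block below rescales the completed output by a, so the net effect is
   the intended axpy form

     o = a*(D*psi + (1/a)*x) = a*D*psi + x

   at the cost of one divide instead of a second pass over the output spinor.
*/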
2118 
2119 #if defined MULTI_GPU && defined DSLASH_XPAY
2120 
2121 int incomplete = 0; // Have all 8 contributions been computed for this site?
2122 
2123 switch(kernel_type) { // intentional fall-through
2124 case INTERIOR_KERNEL:
2125 incomplete = incomplete || (param.commDim[3] && (x4==0 || x4==X4m1));
2126 case EXTERIOR_KERNEL_T:
2127 incomplete = incomplete || (param.commDim[2] && (x3==0 || x3==X3m1));
2128 case EXTERIOR_KERNEL_Z:
2129 incomplete = incomplete || (param.commDim[1] && (x2==0 || x2==X2m1));
2130 case EXTERIOR_KERNEL_Y:
2131 incomplete = incomplete || (param.commDim[0] && (x1==0 || x1==X1m1));
2132 }
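/* Editor's note (sketch): the omitted "break" statements are deliberate. The
   exterior kernels run in the order T, Z, Y, X, and each case inherits the
   boundary tests of all later-updated dimensions; e.g. for EXTERIOR_KERNEL_Z
   the net test is

     incomplete = (param.commDim[1] && (x2==0 || x2==X2m1))
               || (param.commDim[0] && (x1==0 || x1==X1m1));

   so only the last kernel to touch a site performs the XPAY scaling below.
*/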
2133 
2134 if (!incomplete)
2135 #endif // MULTI_GPU
2136 {
2137 
2138 #ifdef DSLASH_XPAY
2139 #ifdef SPINOR_DOUBLE
2140  o00_re = a*o00_re;
2141  o00_im = a*o00_im;
2142  o01_re = a*o01_re;
2143  o01_im = a*o01_im;
2144  o02_re = a*o02_re;
2145  o02_im = a*o02_im;
2146  o10_re = a*o10_re;
2147  o10_im = a*o10_im;
2148  o11_re = a*o11_re;
2149  o11_im = a*o11_im;
2150  o12_re = a*o12_re;
2151  o12_im = a*o12_im;
2152  o20_re = a*o20_re;
2153  o20_im = a*o20_im;
2154  o21_re = a*o21_re;
2155  o21_im = a*o21_im;
2156  o22_re = a*o22_re;
2157  o22_im = a*o22_im;
2158  o30_re = a*o30_re;
2159  o30_im = a*o30_im;
2160  o31_re = a*o31_re;
2161  o31_im = a*o31_im;
2162  o32_re = a*o32_re;
2163  o32_im = a*o32_im;
2164 #else
2165  o00_re = a*o00_re;
2166  o00_im = a*o00_im;
2167  o01_re = a*o01_re;
2168  o01_im = a*o01_im;
2169  o02_re = a*o02_re;
2170  o02_im = a*o02_im;
2171  o10_re = a*o10_re;
2172  o10_im = a*o10_im;
2173  o11_re = a*o11_re;
2174  o11_im = a*o11_im;
2175  o12_re = a*o12_re;
2176  o12_im = a*o12_im;
2177  o20_re = a*o20_re;
2178  o20_im = a*o20_im;
2179  o21_re = a*o21_re;
2180  o21_im = a*o21_im;
2181  o22_re = a*o22_re;
2182  o22_im = a*o22_im;
2183  o30_re = a*o30_re;
2184  o30_im = a*o30_im;
2185  o31_re = a*o31_re;
2186  o31_im = a*o31_im;
2187  o32_re = a*o32_re;
2188  o32_im = a*o32_im;
2189 #endif // SPINOR_DOUBLE
2190 
2191 #endif // DSLASH_XPAY
2192 }
2193 
2194 // write spinor field back to device memory
2195 WRITE_SPINOR(param.sp_stride);
2196 
2197 // undefine to prevent warning when precision is changed
2198 #undef spinorFloat
2199 #undef SHARED_STRIDE
2200 
2201 #undef g00_re
2202 #undef g00_im
2203 #undef g01_re
2204 #undef g01_im
2205 #undef g02_re
2206 #undef g02_im
2207 #undef g10_re
2208 #undef g10_im
2209 #undef g11_re
2210 #undef g11_im
2211 #undef g12_re
2212 #undef g12_im
2213 #undef g20_re
2214 #undef g20_im
2215 #undef g21_re
2216 #undef g21_im
2217 #undef g22_re
2218 #undef g22_im
2219 
2220 #undef i00_re
2221 #undef i00_im
2222 #undef i01_re
2223 #undef i01_im
2224 #undef i02_re
2225 #undef i02_im
2226 #undef i10_re
2227 #undef i10_im
2228 #undef i11_re
2229 #undef i11_im
2230 #undef i12_re
2231 #undef i12_im
2232 #undef i20_re
2233 #undef i20_im
2234 #undef i21_re
2235 #undef i21_im
2236 #undef i22_re
2237 #undef i22_im
2238 #undef i30_re
2239 #undef i30_im
2240 #undef i31_re
2241 #undef i31_im
2242 #undef i32_re
2243 #undef i32_im
2244 
2245 
2246 
2247 #undef VOLATILE