QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
contract_core_plus.h
Go to the documentation of this file.
1 #ifndef _TWIST_QUDA_CONTRACT_PLUS
2 #define _TWIST_QUDA_CONTRACT_PLUS
3 
4 #define tmp_re tmp.x
5 #define tmp_im tmp.y
6 
7 #define TOTAL_COMPONENTS 16
8 
9 #if (__COMPUTE_CAPABILITY__ >= 130)
10 
11 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
12  double2 J0 = spinor[sp_idx + 0*(stride)]; \
13  double2 J1 = spinor[sp_idx + 1*(stride)]; \
14  double2 J2 = spinor[sp_idx + 2*(stride)]; \
15  double2 J3 = spinor[sp_idx + 3*(stride)]; \
16  double2 J4 = spinor[sp_idx + 4*(stride)]; \
17  double2 J5 = spinor[sp_idx + 5*(stride)]; \
18  double2 J6 = spinor[sp_idx + 6*(stride)]; \
19  double2 J7 = spinor[sp_idx + 7*(stride)]; \
20  double2 J8 = spinor[sp_idx + 8*(stride)]; \
21  double2 J9 = spinor[sp_idx + 9*(stride)]; \
22  double2 J10 = spinor[sp_idx +10*(stride)]; \
23  double2 J11 = spinor[sp_idx +11*(stride)];
24 
25 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
26  double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
27  double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
28  double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
29  double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
30  double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
31  double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
32  double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
33  double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
34  double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
35  double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
36  double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
37  double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
38 
39 #ifdef DIRECT_ACCESS_WILSON_SPINOR
40  #define READ_SPINOR READ_SPINOR_DOUBLE
41  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
42  #define SPINORTEX in
43  #define INTERTEX in2
44 #else
45  #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
46  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
47 
48  #ifdef USE_TEXTURE_OBJECTS
49  #define SPINORTEX param.inTex
50  #define INTERTEX param.outTex
51  #else
52  #define SPINORTEX spinorTexDouble
53  #define INTERTEX interTexDouble
54  #endif // USE_TEXTURE_OBJECTS
55 
56 #define SPINOR_HOP 12
57 
58 #endif
59 
60 __global__ void contractGamma5PlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
61 {
62  int sid = blockIdx.x*blockDim.x + threadIdx.x;
63  int outId = sid;
64  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
65 
66  if (sid >= param.threads)
67  return;
68 
69  volatile double2 tmp;
70  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
71 
72  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
73  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
74 
75  eutId = 2*sid;
76  auxCoord1 = eutId / X1;
77  xCoord1 = eutId - auxCoord1 * X1;
78  auxCoord2 = auxCoord1 / X2;
79  xCoord2 = auxCoord1 - auxCoord2 * X2;
80  xCoord4 = auxCoord2 / X3;
81  xCoord3 = auxCoord2 - xCoord4 * X3;
82 
83  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
84  xCoord1 += auxCoord1;
85  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
86 
87  READ_SPINOR (SPINORTEX, myStride, sid, sid);
88  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
89 
90  //compute in1^dag * gamma5:
91 
92  tmp_re = +I0.x;
93  tmp_im = -I0.y;
94  I0.x = +I6.x;
95  I0.y = -I6.y;
96  I6.x = tmp_re;
97  I6.y = tmp_im;
98 
99  tmp_re = +I3.x;
100  tmp_im = -I3.y;
101  I3.x = +I9.x;
102  I3.y = -I9.y;
103  I9.x = tmp_re;
104  I9.y = tmp_im;
105 
106  tmp_re = +I1.x;
107  tmp_im = -I1.y;
108  I1.x = +I7.x;
109  I1.y = -I7.y;
110  I7.x = tmp_re;
111  I7.y = tmp_im;
112 
113  tmp_re = +I4.x;
114  tmp_im = -I4.y;
115  I4.x = +I10.x;
116  I4.y = -I10.y;
117  I10.x = tmp_re;
118  I10.y = tmp_im;
119 
120  tmp_re = +I2.x;
121  tmp_im = -I2.y;
122  I2.x = +I8.x;
123  I2.y = -I8.y;
124  I8.x = tmp_re;
125  I8.y = tmp_im;
126 
127  tmp_re = +I5.x;
128  tmp_im = -I5.y;
129  I5.x = +I11.x;
130  I5.y = -I11.y;
131  I11.x = tmp_re;
132  I11.y = tmp_im;
133 
134  //do products for all color component here:
135  //00 component:
136  tmp_re = I0.x * J0.x - I0.y * J0.y;
137  tmp_re += I1.x * J1.x - I1.y * J1.y;
138  tmp_re += I2.x * J2.x - I2.y * J2.y;
139  accum_re[0*blockDim.x] = tmp_re;
140 
141  tmp_im = I0.x * J0.y + I0.y * J0.x;
142  tmp_im += I1.x * J1.y + I1.y * J1.x;
143  tmp_im += I2.x * J2.y + I2.y * J2.x;
144  accum_im[0*blockDim.x] = tmp_im;
145 
146  //01 component:
147  tmp_re = I0.x * J3.x - I0.y * J3.y;
148  tmp_re += I1.x * J4.x - I1.y * J4.y;
149  tmp_re += I2.x * J5.x - I2.y * J5.y;
150  accum_re[1*blockDim.x] = tmp_re;
151 
152  tmp_im = I0.x * J3.y + I0.y * J3.x;
153  tmp_im += I1.x * J4.y + I1.y * J4.x;
154  tmp_im += I2.x * J5.y + I2.y * J5.x;
155  accum_im[1*blockDim.x] = tmp_im;
156 
157  //02 component:
158  tmp_re = I0.x * J6.x - I0.y * J6.y;
159  tmp_re += I1.x * J7.x - I1.y * J7.y;
160  tmp_re += I2.x * J8.x - I2.y * J8.y;
161  accum_re[2*blockDim.x] = tmp_re;
162 
163  tmp_im = I0.x * J6.y + I0.y * J6.x;
164  tmp_im += I1.x * J7.y + I1.y * J7.x;
165  tmp_im += I2.x * J8.y + I2.y * J8.x;
166  accum_im[2*blockDim.x] = tmp_im;
167 
168  //03 component:
169  tmp_re = I0.x * J9.x - I0.y * J9.y;
170  tmp_re += I1.x * J10.x - I1.y * J10.y;
171  tmp_re += I2.x * J11.x - I2.y * J11.y;
172  accum_re[3*blockDim.x] = tmp_re;
173 
174  tmp_im = I0.x * J9.y + I0.y * J9.x;
175  tmp_im += I1.x * J10.y + I1.y * J10.x;
176  tmp_im += I2.x * J11.y + I2.y * J11.x;
177  accum_im[3*blockDim.x] = tmp_im;
178 
179  //10 component:
180  tmp_re = I3.x * J0.x - I3.y * J0.y;
181  tmp_re += I4.x * J1.x - I4.y * J1.y;
182  tmp_re += I5.x * J2.x - I5.y * J2.y;
183  accum_re[ 4*blockDim.x] = tmp_re;
184 
185  tmp_im = I3.x * J0.y + I3.y * J0.x;
186  tmp_im += I4.x * J1.y + I4.y * J1.x;
187  tmp_im += I5.x * J2.y + I5.y * J2.x;
188  accum_im[ 4*blockDim.x] = tmp_im;
189 
190  //11 component:
191  tmp_re = I3.x * J3.x - I3.y * J3.y;
192  tmp_re += I4.x * J4.x - I4.y * J4.y;
193  tmp_re += I5.x * J5.x - I5.y * J5.y;
194  accum_re[ 5*blockDim.x] = tmp_re;
195 
196  tmp_im = I3.x * J3.y + I3.y * J3.x;
197  tmp_im += I4.x * J4.y + I4.y * J4.x;
198  tmp_im += I5.x * J5.y + I5.y * J5.x;
199  accum_im[ 5*blockDim.x] = tmp_im;
200 
201  //12 component:
202  tmp_re = I3.x * J6.x - I3.y * J6.y;
203  tmp_re += I4.x * J7.x - I4.y * J7.y;
204  tmp_re += I5.x * J8.x - I5.y * J8.y;
205  accum_re[ 6*blockDim.x] = tmp_re;
206 
207  tmp_im = I3.x * J6.y + I3.y * J6.x;
208  tmp_im += I4.x * J7.y + I4.y * J7.x;
209  tmp_im += I5.x * J8.y + I5.y * J8.x;
210  accum_im[ 6*blockDim.x] = tmp_im;
211 
212  //13 component:
213  tmp_re = I3.x * J9.x - I3.y * J9.y;
214  tmp_re += I4.x * J10.x - I4.y * J10.y;
215  tmp_re += I5.x * J11.x - I5.y * J11.y;
216  accum_re[ 7*blockDim.x] = tmp_re;
217 
218  tmp_im = I3.x * J9.y + I3.y * J9.x;
219  tmp_im += I4.x * J10.y + I4.y * J10.x;
220  tmp_im += I5.x * J11.y + I5.y * J11.x;
221  accum_im[ 7*blockDim.x] = tmp_im;
222 
223  //20 component:
224  tmp_re = I6.x * J0.x - I6.y * J0.y;
225  tmp_re += I7.x * J1.x - I7.y * J1.y;
226  tmp_re += I8.x * J2.x - I8.y * J2.y;
227  accum_re[ 8*blockDim.x] = tmp_re;
228 
229  tmp_im = I6.x * J0.y + I6.y * J0.x;
230  tmp_im += I7.x * J1.y + I7.y * J1.x;
231  tmp_im += I8.x * J2.y + I8.y * J2.x;
232  accum_im[ 8*blockDim.x] = tmp_im;
233 
234  //21 component:
235  tmp_re = I6.x * J3.x - I6.y * J3.y;
236  tmp_re += I7.x * J4.x - I7.y * J4.y;
237  tmp_re += I8.x * J5.x - I8.y * J5.y;
238  accum_re[ 9*blockDim.x] = tmp_re;
239 
240  tmp_im = I6.x * J3.y + I6.y * J3.x;
241  tmp_im += I7.x * J4.y + I7.y * J4.x;
242  tmp_im += I8.x * J5.y + I8.y * J5.x;
243  accum_im[ 9*blockDim.x] = tmp_im;
244 
245  //22 component:
246  tmp_re = I6.x * J6.x - I6.y * J6.y;
247  tmp_re += I7.x * J7.x - I7.y * J7.y;
248  tmp_re += I8.x * J8.x - I8.y * J8.y;
249  accum_re[10*blockDim.x] = tmp_re;
250 
251  tmp_im = I6.x * J6.y + I6.y * J6.x;
252  tmp_im += I7.x * J7.y + I7.y * J7.x;
253  tmp_im += I8.x * J8.y + I8.y * J8.x;
254  accum_im[10*blockDim.x] = tmp_im;
255 
256  //23 component:
257  tmp_re = I6.x * J9.x - I6.y * J9.y;
258  tmp_re += I7.x * J10.x - I7.y * J10.y;
259  tmp_re += I8.x * J11.x - I8.y * J11.y;
260  accum_re[11*blockDim.x] = tmp_re;
261 
262  tmp_im = I6.x * J9.y + I6.y * J9.x;
263  tmp_im += I7.x * J10.y + I7.y * J10.x;
264  tmp_im += I8.x * J11.y + I8.y * J11.x;
265  accum_im[11*blockDim.x] = tmp_im;
266 
267  //30 component:
268  tmp_re = I9.x * J0.x - I9.y * J0.y;
269  tmp_re += I10.x * J1.x - I10.y * J1.y;
270  tmp_re += I11.x * J2.x - I11.y * J2.y;
271  accum_re[12*blockDim.x] = tmp_re;
272 
273  tmp_im = I9.x * J0.y + I9.y * J0.x;
274  tmp_im += I10.x * J1.y + I10.y * J1.x;
275  tmp_im += I11.x * J2.y + I11.y * J2.x;
276  accum_im[12*blockDim.x] = tmp_im;
277 
278  //31 component:
279  tmp_re = I9.x * J3.x - I9.y * J3.y;
280  tmp_re += I10.x * J4.x - I10.y * J4.y;
281  tmp_re += I11.x * J5.x - I11.y * J5.y;
282  accum_re[13*blockDim.x] = tmp_re;
283 
284  tmp_im = I9.x * J3.y + I9.y * J3.x;
285  tmp_im += I10.x * J4.y + I10.y * J4.x;
286  tmp_im += I11.x * J5.y + I11.y * J5.x;
287  accum_im[13*blockDim.x] = tmp_im;
288 
289  //32 component:
290  tmp_re = I9.x * J6.x - I9.y * J6.y;
291  tmp_re += I10.x * J7.x - I10.y * J7.y;
292  tmp_re += I11.x * J8.x - I11.y * J8.y;
293  accum_re[14*blockDim.x] = tmp_re;
294 
295  tmp_im = I9.x * J6.y + I9.y * J6.x;
296  tmp_im += I10.x * J7.y + I10.y * J7.x;
297  tmp_im += I11.x * J8.y + I11.y * J8.x;
298  accum_im[14*blockDim.x] = tmp_im;
299 
300  //33 component:
301  tmp_re = I9.x * J9.x - I9.y * J9.y;
302  tmp_re += I10.x * J10.x - I10.y * J10.y;
303  tmp_re += I11.x * J11.x - I11.y * J11.y;
304  accum_re[15*blockDim.x] = tmp_re;
305 
306  tmp_im = I9.x * J9.y + I9.y * J9.x;
307  tmp_im += I10.x * J10.y + I10.y * J10.x;
308  tmp_im += I11.x * J11.y + I11.y * J11.x;
309  accum_im[15*blockDim.x] = tmp_im;
310 
311  //Store output back to global buffer:
312 
313 
314 /* CONTRACTION FULL VOLUME */
315 
316  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
317  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
318  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
319  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
320  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
321  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
322  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
323  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
324  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
325  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
326  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
327  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
328  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
329  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
330  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
331  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
332 
333  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
334  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
335  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
336  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
337  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
338  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
339  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
340  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
341  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
342  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
343  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
344  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
345  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
346  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
347  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
348  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
349 
350  return;
351 }
352 
353 //Perform trace in color space only and for a given tslice
354 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
355 __global__ void contractTslicePlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
356 {
357  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
358  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
359  int inId = sid + Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
360  int outId;
361  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
362 
363  if (sid >= param.threads) //param.threads == tslice volume
364  return;
365 
366  volatile double2 tmp;
367  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
368 
369  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
370  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
371 
372 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
373 
374  eutId = 2*inId;
375  auxCoord1 = eutId / X1;
376  xCoord1 = eutId - auxCoord1 * X1;
377  auxCoord2 = auxCoord1 / X2;
378  xCoord2 = auxCoord1 - auxCoord2 * X2;
379  xCoord4 = auxCoord2 / X3;
380  xCoord3 = auxCoord2 - xCoord4 * X3;
381 
382  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
383  xCoord1 += auxCoord1;
384  outId = xCoord1 + X1*(xCoord2 + X2*xCoord3);
385 
386  READ_SPINOR (SPINORTEX, myStride, sid, sid);
387  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
388 
389  //compute in1^dag:
390 
391  I0.y = -I0.y;
392  I1.y = -I1.y;
393  I2.y = -I2.y;
394  I3.y = -I3.y;
395  I4.y = -I4.y;
396  I5.y = -I5.y;
397  I6.y = -I6.y;
398  I7.y = -I7.y;
399  I8.y = -I8.y;
400  I9.y = -I9.y;
401  I10.y = -I10.y;
402  I11.y = -I11.y;
403 
404  //do products for all color component here:
405  //00 component:
406  tmp_re = I0.x * J0.x - I0.y * J0.y;
407  tmp_re += I1.x * J1.x - I1.y * J1.y;
408  tmp_re += I2.x * J2.x - I2.y * J2.y;
409  accum_re[0*blockDim.x] = tmp_re;
410 
411  tmp_im = I0.x * J0.y + I0.y * J0.x;
412  tmp_im += I1.x * J1.y + I1.y * J1.x;
413  tmp_im += I2.x * J2.y + I2.y * J2.x;
414  accum_im[0*blockDim.x] = tmp_im;
415 
416  //01 component:
417  tmp_re = I0.x * J3.x - I0.y * J3.y;
418  tmp_re += I1.x * J4.x - I1.y * J4.y;
419  tmp_re += I2.x * J5.x - I2.y * J5.y;
420  accum_re[1*blockDim.x] = tmp_re;
421 
422  tmp_im = I0.x * J3.y + I0.y * J3.x;
423  tmp_im += I1.x * J4.y + I1.y * J4.x;
424  tmp_im += I2.x * J5.y + I2.y * J5.x;
425  accum_im[1*blockDim.x] = tmp_im;
426 
427  //02 component:
428  tmp_re = I0.x * J6.x - I0.y * J6.y;
429  tmp_re += I1.x * J7.x - I1.y * J7.y;
430  tmp_re += I2.x * J8.x - I2.y * J8.y;
431  accum_re[2*blockDim.x] = tmp_re;
432 
433  tmp_im = I0.x * J6.y + I0.y * J6.x;
434  tmp_im += I1.x * J7.y + I1.y * J7.x;
435  tmp_im += I2.x * J8.y + I2.y * J8.x;
436  accum_im[2*blockDim.x] = tmp_im;
437 
438  //03 component:
439  tmp_re = I0.x * J9.x - I0.y * J9.y;
440  tmp_re += I1.x * J10.x - I1.y * J10.y;
441  tmp_re += I2.x * J11.x - I2.y * J11.y;
442  accum_re[3*blockDim.x] = tmp_re;
443 
444  tmp_im = I0.x * J9.y + I0.y * J9.x;
445  tmp_im += I1.x * J10.y + I1.y * J10.x;
446  tmp_im += I2.x * J11.y + I2.y * J11.x;
447  accum_im[3*blockDim.x] = tmp_im;
448 
449  //10 component:
450  tmp_re = I3.x * J0.x - I3.y * J0.y;
451  tmp_re += I4.x * J1.x - I4.y * J1.y;
452  tmp_re += I5.x * J2.x - I5.y * J2.y;
453  accum_re[ 4*blockDim.x] = tmp_re;
454 
455  tmp_im = I3.x * J0.y + I3.y * J0.x;
456  tmp_im += I4.x * J1.y + I4.y * J1.x;
457  tmp_im += I5.x * J2.y + I5.y * J2.x;
458  accum_im[ 4*blockDim.x] = tmp_im;
459 
460  //11 component:
461  tmp_re = I3.x * J3.x - I3.y * J3.y;
462  tmp_re += I4.x * J4.x - I4.y * J4.y;
463  tmp_re += I5.x * J5.x - I5.y * J5.y;
464  accum_re[ 5*blockDim.x] = tmp_re;
465 
466  tmp_im = I3.x * J3.y + I3.y * J3.x;
467  tmp_im += I4.x * J4.y + I4.y * J4.x;
468  tmp_im += I5.x * J5.y + I5.y * J5.x;
469  accum_im[ 5*blockDim.x] = tmp_im;
470 
471  //12 component:
472  tmp_re = I3.x * J6.x - I3.y * J6.y;
473  tmp_re += I4.x * J7.x - I4.y * J7.y;
474  tmp_re += I5.x * J8.x - I5.y * J8.y;
475  accum_re[ 6*blockDim.x] = tmp_re;
476 
477  tmp_im = I3.x * J6.y + I3.y * J6.x;
478  tmp_im += I4.x * J7.y + I4.y * J7.x;
479  tmp_im += I5.x * J8.y + I5.y * J8.x;
480  accum_im[ 6*blockDim.x] = tmp_im;
481 
482  //13 component:
483  tmp_re = I3.x * J9.x - I3.y * J9.y;
484  tmp_re += I4.x * J10.x - I4.y * J10.y;
485  tmp_re += I5.x * J11.x - I5.y * J11.y;
486  accum_re[ 7*blockDim.x] = tmp_re;
487 
488  tmp_im = I3.x * J9.y + I3.y * J9.x;
489  tmp_im += I4.x * J10.y + I4.y * J10.x;
490  tmp_im += I5.x * J11.y + I5.y * J11.x;
491  accum_im[ 7*blockDim.x] = tmp_im;
492 
493  //20 component:
494  tmp_re = I6.x * J0.x - I6.y * J0.y;
495  tmp_re += I7.x * J1.x - I7.y * J1.y;
496  tmp_re += I8.x * J2.x - I8.y * J2.y;
497  accum_re[ 8*blockDim.x] = tmp_re;
498 
499  tmp_im = I6.x * J0.y + I6.y * J0.x;
500  tmp_im += I7.x * J1.y + I7.y * J1.x;
501  tmp_im += I8.x * J2.y + I8.y * J2.x;
502  accum_im[ 8*blockDim.x] = tmp_im;
503 
504  //21 component:
505  tmp_re = I6.x * J3.x - I6.y * J3.y;
506  tmp_re += I7.x * J4.x - I7.y * J4.y;
507  tmp_re += I8.x * J5.x - I8.y * J5.y;
508  accum_re[ 9*blockDim.x] = tmp_re;
509 
510  tmp_im = I6.x * J3.y + I6.y * J3.x;
511  tmp_im += I7.x * J4.y + I7.y * J4.x;
512  tmp_im += I8.x * J5.y + I8.y * J5.x;
513  accum_im[ 9*blockDim.x] = tmp_im;
514 
515  //22 component:
516  tmp_re = I6.x * J6.x - I6.y * J6.y;
517  tmp_re += I7.x * J7.x - I7.y * J7.y;
518  tmp_re += I8.x * J8.x - I8.y * J8.y;
519  accum_re[10*blockDim.x] = tmp_re;
520 
521  tmp_im = I6.x * J6.y + I6.y * J6.x;
522  tmp_im += I7.x * J7.y + I7.y * J7.x;
523  tmp_im += I8.x * J8.y + I8.y * J8.x;
524  accum_im[10*blockDim.x] = tmp_im;
525 
526  //23 component:
527  tmp_re = I6.x * J9.x - I6.y * J9.y;
528  tmp_re += I7.x * J10.x - I7.y * J10.y;
529  tmp_re += I8.x * J11.x - I8.y * J11.y;
530  accum_re[11*blockDim.x] = tmp_re;
531 
532  tmp_im = I6.x * J9.y + I6.y * J9.x;
533  tmp_im += I7.x * J10.y + I7.y * J10.x;
534  tmp_im += I8.x * J11.y + I8.y * J11.x;
535  accum_im[11*blockDim.x] = tmp_im;
536 
537  //30 component:
538  tmp_re = I9.x * J0.x - I9.y * J0.y;
539  tmp_re += I10.x * J1.x - I10.y * J1.y;
540  tmp_re += I11.x * J2.x - I11.y * J2.y;
541  accum_re[12*blockDim.x] = tmp_re;
542 
543  tmp_im = I9.x * J0.y + I9.y * J0.x;
544  tmp_im += I10.x * J1.y + I10.y * J1.x;
545  tmp_im += I11.x * J2.y + I11.y * J2.x;
546  accum_im[12*blockDim.x] = tmp_im;
547 
548  //31 component:
549  tmp_re = I9.x * J3.x - I9.y * J3.y;
550  tmp_re += I10.x * J4.x - I10.y * J4.y;
551  tmp_re += I11.x * J5.x - I11.y * J5.y;
552  accum_re[13*blockDim.x] = tmp_re;
553 
554  tmp_im = I9.x * J3.y + I9.y * J3.x;
555  tmp_im += I10.x * J4.y + I10.y * J4.x;
556  tmp_im += I11.x * J5.y + I11.y * J5.x;
557  accum_im[13*blockDim.x] = tmp_im;
558 
559  //32 component:
560  tmp_re = I9.x * J6.x - I9.y * J6.y;
561  tmp_re += I10.x * J7.x - I10.y * J7.y;
562  tmp_re += I11.x * J8.x - I11.y * J8.y;
563  accum_re[14*blockDim.x] = tmp_re;
564 
565  tmp_im = I9.x * J6.y + I9.y * J6.x;
566  tmp_im += I10.x * J7.y + I10.y * J7.x;
567  tmp_im += I11.x * J8.y + I11.y * J8.x;
568  accum_im[14*blockDim.x] = tmp_im;
569 
570  //33 component:
571  tmp_re = I9.x * J9.x - I9.y * J9.y;
572  tmp_re += I10.x * J10.x - I10.y * J10.y;
573  tmp_re += I11.x * J11.x - I11.y * J11.y;
574  accum_re[15*blockDim.x] = tmp_re;
575 
576  tmp_im = I9.x * J9.y + I9.y * J9.x;
577  tmp_im += I10.x * J10.y + I10.y * J10.x;
578  tmp_im += I11.x * J11.y + I11.y * J11.x;
579  accum_im[15*blockDim.x] = tmp_im;
580 
581  //Store output back to global buffer:
582 
583 
584 /* CONTRACTION FULL VOLUME */
585 
586  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
587  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
588  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
589  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
590  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
591  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
592  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
593  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
594  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
595  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
596  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
597  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
598  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
599  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
600  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
601  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
602 
603  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
604  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
605  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
606  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
607  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
608  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
609  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
610  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
611  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
612  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
613  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
614  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
615  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
616  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
617  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
618  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
619 
620  return;
621 }
622 
623 __global__ void contractPlusKernel (double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
624 {
625  int sid = blockIdx.x*blockDim.x + threadIdx.x;
626  int outId = sid;
627  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
628 
629  if (sid >= param.threads)
630  return;
631 
632  volatile double2 tmp;
633  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
634 
635  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
636  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
637 
638  eutId = 2*sid;
639  auxCoord1 = eutId / X1;
640  xCoord1 = eutId - auxCoord1 * X1;
641  auxCoord2 = auxCoord1 / X2;
642  xCoord2 = auxCoord1 - auxCoord2 * X2;
643  xCoord4 = auxCoord2 / X3;
644  xCoord3 = auxCoord2 - xCoord4 * X3;
645 
646  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
647  xCoord1 += auxCoord1;
648  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
649 
650  READ_SPINOR (SPINORTEX, myStride, sid, sid);
651  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
652 
653  //compute in1^dag:
654 
655  I0.y = -I0.y;
656  I1.y = -I1.y;
657  I2.y = -I2.y;
658  I3.y = -I3.y;
659  I4.y = -I4.y;
660  I5.y = -I5.y;
661  I6.y = -I6.y;
662  I7.y = -I7.y;
663  I8.y = -I8.y;
664  I9.y = -I9.y;
665  I10.y = -I10.y;
666  I11.y = -I11.y;
667 
668  //do products for all color component here:
669  //00 component:
670  tmp_re = I0.x * J0.x - I0.y * J0.y;
671  tmp_re += I1.x * J1.x - I1.y * J1.y;
672  tmp_re += I2.x * J2.x - I2.y * J2.y;
673  accum_re[0*blockDim.x] = tmp_re;
674 
675  tmp_im = I0.x * J0.y + I0.y * J0.x;
676  tmp_im += I1.x * J1.y + I1.y * J1.x;
677  tmp_im += I2.x * J2.y + I2.y * J2.x;
678  accum_im[0*blockDim.x] = tmp_im;
679 
680  //01 component:
681  tmp_re = I0.x * J3.x - I0.y * J3.y;
682  tmp_re += I1.x * J4.x - I1.y * J4.y;
683  tmp_re += I2.x * J5.x - I2.y * J5.y;
684  accum_re[1*blockDim.x] = tmp_re;
685 
686  tmp_im = I0.x * J3.y + I0.y * J3.x;
687  tmp_im += I1.x * J4.y + I1.y * J4.x;
688  tmp_im += I2.x * J5.y + I2.y * J5.x;
689  accum_im[1*blockDim.x] = tmp_im;
690 
691  //02 component:
692  tmp_re = I0.x * J6.x - I0.y * J6.y;
693  tmp_re += I1.x * J7.x - I1.y * J7.y;
694  tmp_re += I2.x * J8.x - I2.y * J8.y;
695  accum_re[2*blockDim.x] = tmp_re;
696 
697  tmp_im = I0.x * J6.y + I0.y * J6.x;
698  tmp_im += I1.x * J7.y + I1.y * J7.x;
699  tmp_im += I2.x * J8.y + I2.y * J8.x;
700  accum_im[2*blockDim.x] = tmp_im;
701 
702  //03 component:
703  tmp_re = I0.x * J9.x - I0.y * J9.y;
704  tmp_re += I1.x * J10.x - I1.y * J10.y;
705  tmp_re += I2.x * J11.x - I2.y * J11.y;
706  accum_re[3*blockDim.x] = tmp_re;
707 
708  tmp_im = I0.x * J9.y + I0.y * J9.x;
709  tmp_im += I1.x * J10.y + I1.y * J10.x;
710  tmp_im += I2.x * J11.y + I2.y * J11.x;
711  accum_im[3*blockDim.x] = tmp_im;
712 
713  //10 component:
714  tmp_re = I3.x * J0.x - I3.y * J0.y;
715  tmp_re += I4.x * J1.x - I4.y * J1.y;
716  tmp_re += I5.x * J2.x - I5.y * J2.y;
717  accum_re[ 4*blockDim.x] = tmp_re;
718 
719  tmp_im = I3.x * J0.y + I3.y * J0.x;
720  tmp_im += I4.x * J1.y + I4.y * J1.x;
721  tmp_im += I5.x * J2.y + I5.y * J2.x;
722  accum_im[ 4*blockDim.x] = tmp_im;
723 
724  //11 component:
725  tmp_re = I3.x * J3.x - I3.y * J3.y;
726  tmp_re += I4.x * J4.x - I4.y * J4.y;
727  tmp_re += I5.x * J5.x - I5.y * J5.y;
728  accum_re[ 5*blockDim.x] = tmp_re;
729 
730  tmp_im = I3.x * J3.y + I3.y * J3.x;
731  tmp_im += I4.x * J4.y + I4.y * J4.x;
732  tmp_im += I5.x * J5.y + I5.y * J5.x;
733  accum_im[ 5*blockDim.x] = tmp_im;
734 
735  //12 component:
736  tmp_re = I3.x * J6.x - I3.y * J6.y;
737  tmp_re += I4.x * J7.x - I4.y * J7.y;
738  tmp_re += I5.x * J8.x - I5.y * J8.y;
739  accum_re[ 6*blockDim.x] = tmp_re;
740 
741  tmp_im = I3.x * J6.y + I3.y * J6.x;
742  tmp_im += I4.x * J7.y + I4.y * J7.x;
743  tmp_im += I5.x * J8.y + I5.y * J8.x;
744  accum_im[ 6*blockDim.x] = tmp_im;
745 
746  //13 component:
747  tmp_re = I3.x * J9.x - I3.y * J9.y;
748  tmp_re += I4.x * J10.x - I4.y * J10.y;
749  tmp_re += I5.x * J11.x - I5.y * J11.y;
750  accum_re[ 7*blockDim.x] = tmp_re;
751 
752  tmp_im = I3.x * J9.y + I3.y * J9.x;
753  tmp_im += I4.x * J10.y + I4.y * J10.x;
754  tmp_im += I5.x * J11.y + I5.y * J11.x;
755  accum_im[ 7*blockDim.x] = tmp_im;
756 
757  //20 component:
758  tmp_re = I6.x * J0.x - I6.y * J0.y;
759  tmp_re += I7.x * J1.x - I7.y * J1.y;
760  tmp_re += I8.x * J2.x - I8.y * J2.y;
761  accum_re[ 8*blockDim.x] = tmp_re;
762 
763  tmp_im = I6.x * J0.y + I6.y * J0.x;
764  tmp_im += I7.x * J1.y + I7.y * J1.x;
765  tmp_im += I8.x * J2.y + I8.y * J2.x;
766  accum_im[ 8*blockDim.x] = tmp_im;
767 
768  //21 component:
769  tmp_re = I6.x * J3.x - I6.y * J3.y;
770  tmp_re += I7.x * J4.x - I7.y * J4.y;
771  tmp_re += I8.x * J5.x - I8.y * J5.y;
772  accum_re[ 9*blockDim.x] = tmp_re;
773 
774  tmp_im = I6.x * J3.y + I6.y * J3.x;
775  tmp_im += I7.x * J4.y + I7.y * J4.x;
776  tmp_im += I8.x * J5.y + I8.y * J5.x;
777  accum_im[ 9*blockDim.x] = tmp_im;
778 
779  //22 component:
780  tmp_re = I6.x * J6.x - I6.y * J6.y;
781  tmp_re += I7.x * J7.x - I7.y * J7.y;
782  tmp_re += I8.x * J8.x - I8.y * J8.y;
783  accum_re[10*blockDim.x] = tmp_re;
784 
785  tmp_im = I6.x * J6.y + I6.y * J6.x;
786  tmp_im += I7.x * J7.y + I7.y * J7.x;
787  tmp_im += I8.x * J8.y + I8.y * J8.x;
788  accum_im[10*blockDim.x] = tmp_im;
789 
790  //23 component:
791  tmp_re = I6.x * J9.x - I6.y * J9.y;
792  tmp_re += I7.x * J10.x - I7.y * J10.y;
793  tmp_re += I8.x * J11.x - I8.y * J11.y;
794  accum_re[11*blockDim.x] = tmp_re;
795 
796  tmp_im = I6.x * J9.y + I6.y * J9.x;
797  tmp_im += I7.x * J10.y + I7.y * J10.x;
798  tmp_im += I8.x * J11.y + I8.y * J11.x;
799  accum_im[11*blockDim.x] = tmp_im;
800 
801  //30 component:
802  tmp_re = I9.x * J0.x - I9.y * J0.y;
803  tmp_re += I10.x * J1.x - I10.y * J1.y;
804  tmp_re += I11.x * J2.x - I11.y * J2.y;
805  accum_re[12*blockDim.x] = tmp_re;
806 
807  tmp_im = I9.x * J0.y + I9.y * J0.x;
808  tmp_im += I10.x * J1.y + I10.y * J1.x;
809  tmp_im += I11.x * J2.y + I11.y * J2.x;
810  accum_im[12*blockDim.x] = tmp_im;
811 
812  //31 component:
813  tmp_re = I9.x * J3.x - I9.y * J3.y;
814  tmp_re += I10.x * J4.x - I10.y * J4.y;
815  tmp_re += I11.x * J5.x - I11.y * J5.y;
816  accum_re[13*blockDim.x] = tmp_re;
817 
818  tmp_im = I9.x * J3.y + I9.y * J3.x;
819  tmp_im += I10.x * J4.y + I10.y * J4.x;
820  tmp_im += I11.x * J5.y + I11.y * J5.x;
821  accum_im[13*blockDim.x] = tmp_im;
822 
823  //32 component:
824  tmp_re = I9.x * J6.x - I9.y * J6.y;
825  tmp_re += I10.x * J7.x - I10.y * J7.y;
826  tmp_re += I11.x * J8.x - I11.y * J8.y;
827  accum_re[14*blockDim.x] = tmp_re;
828 
829  tmp_im = I9.x * J6.y + I9.y * J6.x;
830  tmp_im += I10.x * J7.y + I10.y * J7.x;
831  tmp_im += I11.x * J8.y + I11.y * J8.x;
832  accum_im[14*blockDim.x] = tmp_im;
833 
834  //33 component:
835  tmp_re = I9.x * J9.x - I9.y * J9.y;
836  tmp_re += I10.x * J10.x - I10.y * J10.y;
837  tmp_re += I11.x * J11.x - I11.y * J11.y;
838  accum_re[15*blockDim.x] = tmp_re;
839 
840  tmp_im = I9.x * J9.y + I9.y * J9.x;
841  tmp_im += I10.x * J10.y + I10.y * J10.x;
842  tmp_im += I11.x * J11.y + I11.y * J11.x;
843  accum_im[15*blockDim.x] = tmp_im;
844 
845 /* CONTRACTION FULL VOLUME */
846 
847  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
848  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
849  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
850  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
851  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
852  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
853  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
854  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
855  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
856  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
857  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
858  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
859  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
860  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
861  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
862  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
863 
864  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
865  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
866  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
867  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
868  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
869  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
870  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
871  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
872  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
873  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
874  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
875  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
876  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
877  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
878  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
879  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
880 
881  return;
882 }
883 
884 #undef SPINORTEX
885 #undef INTERTEX
886 
887 #undef READ_SPINOR
888 #undef READ_INTERMEDIATE_SPINOR
889 
890 #undef SPINOR_HOP
891 
892 #endif // (__CUDA_ARCH__ >= 130)
893 
894 
895 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
896  float4 I0 = spinor[sp_idx + 0*(stride)]; \
897  float4 I1 = spinor[sp_idx + 1*(stride)]; \
898  float4 I2 = spinor[sp_idx + 2*(stride)]; \
899  float4 I3 = spinor[sp_idx + 3*(stride)]; \
900  float4 I4 = spinor[sp_idx + 4*(stride)]; \
901  float4 I5 = spinor[sp_idx + 5*(stride)];
902 
903 
904 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
905  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
906  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
907  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
908  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
909  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
910  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
911 
912 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
913  float4 J0 = spinor[sp_idx + 0*(stride)]; \
914  float4 J1 = spinor[sp_idx + 1*(stride)]; \
915  float4 J2 = spinor[sp_idx + 2*(stride)]; \
916  float4 J3 = spinor[sp_idx + 3*(stride)]; \
917  float4 J4 = spinor[sp_idx + 4*(stride)]; \
918  float4 J5 = spinor[sp_idx + 5*(stride)];
919 
920 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
921  float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
922  float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
923  float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
924  float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
925  float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
926  float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
927 
928 
929 #ifdef DIRECT_ACCESS_WILSON_SPINOR
930  #define READ_SPINOR READ_SPINOR_SINGLE
931  #define SPINORTEX in
932 #else
933  #define READ_SPINOR READ_SPINOR_SINGLE_TEX
934 
935  #ifdef USE_TEXTURE_OBJECTS
936  #define SPINORTEX param.inTex
937  #else
938  #define SPINORTEX spinorTexSingle
939  #endif // USE_TEXTURE_OBJECTS
940 #endif
941 
942 #ifdef DIRECT_ACCESS_WILSON_INTER
943  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
944  #define INTERTEX in2
945 #else
946  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
947 
948  #ifdef USE_TEXTURE_OBJECTS
949  #define INTERTEX param.outTex
950  #else
951  #define INTERTEX interTexSingle
952  #endif // USE_TEXTURE_OBJECTS
953 #endif
954 
955 #define SPINOR_HOP 6
956 
957 __global__ void contractGamma5PlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
958 {
959  int sid = blockIdx.x*blockDim.x + threadIdx.x;
960  int outId = sid;
961  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
962 
963  if (sid >= param.threads)
964  return;
965 
966  volatile float2 tmp;
967  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
968 
969  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
970  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
971 
972  eutId = 2*sid;
973  auxCoord1 = eutId / X1;
974  xCoord1 = eutId - auxCoord1 * X1;
975  auxCoord2 = auxCoord1 / X2;
976  xCoord2 = auxCoord1 - auxCoord2 * X2;
977  xCoord4 = auxCoord2 / X3;
978  xCoord3 = auxCoord2 - xCoord4 * X3;
979 
980  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
981  xCoord1 += auxCoord1;
982  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
983 
984  //Load the full input spinors:
985 
986  READ_SPINOR (SPINORTEX, myStride, sid, sid);
987  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
988 
989  //compute in1^dag * gamma5:
990 
991  //First color component
992 
993  tmp_re = +I0.x;
994  tmp_im = -I0.y;
995  I0.x = +I3.x;
996  I0.y = -I3.y;
997  I3.x = tmp_re;
998  I3.y = tmp_im;
999 
1000  tmp_re = +I1.z;
1001  tmp_im = -I1.w;
1002  I1.z = +I4.z;
1003  I1.w = -I4.w;
1004  I4.z = tmp_re;
1005  I4.w = tmp_im;
1006 
1007  //Second color component
1008 
1009  tmp_re = +I0.z;
1010  tmp_im = -I0.w;
1011  I0.z = +I3.z;
1012  I0.w = -I3.w;
1013  I3.z = tmp_re;
1014  I3.w = tmp_im;
1015 
1016  tmp_re = +I2.x;
1017  tmp_im = -I2.y;
1018  I2.x = +I5.x;
1019  I2.y = -I5.y;
1020  I5.x = tmp_re;
1021  I5.y = tmp_im;
1022 
1023  //Third color component
1024 
1025  tmp_re = +I1.x;
1026  tmp_im = -I1.y;
1027  I1.x = +I4.x;
1028  I1.y = -I4.y;
1029  I4.x = tmp_re;
1030  I4.y = tmp_im;
1031 
1032  tmp_re = +I2.z;
1033  tmp_im = -I2.w;
1034  I2.z = +I5.z;
1035  I2.w = -I5.w;
1036  I5.z = tmp_re;
1037  I5.w = tmp_im;
1038 
1039  //do products for first color component here:
1040 
1041  //00 component:
1042  tmp_re = I0.x * J0.x - I0.y * J0.y;
1043  tmp_re += I0.z * J0.z - I0.w * J0.w;
1044  tmp_re += I1.x * J1.x - I1.y * J1.y;
1045  accum_re[0*blockDim.x] = tmp_re;
1046 
1047  tmp_im = I0.x * J0.y + I0.y * J0.x;
1048  tmp_im += I0.z * J0.w + I0.w * J0.z;
1049  tmp_im += I1.x * J1.y + I1.y * J1.x;
1050  accum_im[0*blockDim.x] = tmp_im;
1051 
1052  //01 component:
1053  tmp_re = I0.x * J1.z - I0.y * J1.w;
1054  tmp_re += I0.z * J2.x - I0.w * J2.y;
1055  tmp_re += I1.x * J2.z - I1.y * J2.w;
1056  accum_re[1*blockDim.x] = tmp_re;
1057 
1058  tmp_im = I0.x * J1.w + I0.y * J1.z;
1059  tmp_im += I0.z * J2.y + I0.w * J2.x;
1060  tmp_im += I1.x * J2.w + I1.y * J2.z;
1061  accum_im[1*blockDim.x] = tmp_im;
1062 
1063  //02 component:
1064  tmp_re = I0.x * J3.x - I0.y * J3.y;
1065  tmp_re += I0.z * J3.z - I0.w * J3.w;
1066  tmp_re += I1.x * J4.x - I1.y * J4.y;
1067  accum_re[2*blockDim.x] = tmp_re;
1068 
1069  tmp_im = I0.x * J3.y + I0.y * J3.x;
1070  tmp_im += I0.z * J3.w + I0.w * J3.z;
1071  tmp_im += I1.x * J4.y + I1.y * J4.x;
1072  accum_im[2*blockDim.x] = tmp_im;
1073 
1074  //03 component:
1075  tmp_re = I0.x * J4.z - I0.y * J4.w;
1076  tmp_re += I0.z * J5.x - I0.w * J5.x;
1077  tmp_re += I1.x * J5.z - I1.y * J5.w;
1078 
1079  accum_re[3*blockDim.x] = tmp_re;
1080 
1081  tmp_im = I0.x * J4.w + I0.y * J4.z;
1082  tmp_im += I0.z * J5.y + I0.w * J5.y;
1083  tmp_im += I1.x * J5.w + I1.y * J5.z;
1084  accum_im[3*blockDim.x] = tmp_im;
1085 
1086  //10 component:
1087  tmp_re = I1.z * J0.x - I1.w * J0.y;
1088  tmp_re += I2.x * J0.z - I2.y * J0.w;
1089  tmp_re += I2.z * J1.x - I2.w * J1.y;
1090  accum_re[4*blockDim.x] = tmp_re;
1091 
1092  tmp_im = I1.z * J0.y + I1.w * J0.x;
1093  tmp_im += I2.x * J0.w + I2.y * J0.z;
1094  tmp_im += I2.z * J1.y + I2.w * J1.x;
1095  accum_im[4*blockDim.x] = tmp_im;
1096 
1097  //11 component:
1098  tmp_re = I1.z * J1.z - I1.w * J1.w;
1099  tmp_re += I2.x * J2.x - I2.y * J2.y;
1100  tmp_re += I2.z * J2.z - I2.w * J2.w;
1101  accum_re[5*blockDim.x] = tmp_re;
1102 
1103  tmp_im = I1.z * J1.w + I1.w * J1.z;
1104  tmp_im += I2.x * J2.y + I2.y * J2.x;
1105  tmp_im += I2.z * J2.w + I2.w * J2.z;
1106  accum_im[5*blockDim.x] = tmp_im;
1107 
1108  //12 component:
1109  tmp_re = I1.z * J3.x - I1.w * J3.y;
1110  tmp_re += I2.x * J3.z - I2.y * J3.w;
1111  tmp_re += I2.z * J4.x - I2.w * J4.y;
1112  accum_re[6*blockDim.x] = tmp_re;
1113 
1114  tmp_im = I1.z * J3.y + I1.w * J3.x;
1115  tmp_im += I2.x * J3.w + I2.y * J3.z;
1116  tmp_im += I2.z * J4.y + I2.w * J4.x;
1117  accum_im[6*blockDim.x] = tmp_im;
1118 
1119  //13 component:
1120  tmp_re = I1.z * J4.z - I1.w * J4.w;
1121  tmp_re += I2.x * J5.x - I2.y * J5.y;
1122  tmp_re += I2.z * J5.z - I2.w * J5.w;
1123  accum_re[7*blockDim.x] = tmp_re;
1124 
1125  tmp_im = I1.z * J4.w + I1.w * J4.z;
1126  tmp_im += I2.x * J5.y + I2.y * J5.x;
1127  tmp_im += I2.z * J5.w + I2.w * J5.z;
1128  accum_im[7*blockDim.x] = tmp_im;
1129 
1130  //20 component:
1131  tmp_re = I3.x * J0.x - I3.y * J0.y;
1132  tmp_re += I3.z * J0.z - I3.w * J0.w;
1133  tmp_re += I4.x * J1.x - I4.y * J1.y;
1134  accum_re[8*blockDim.x] = tmp_re;
1135 
1136  tmp_im = I3.x * J0.y + I3.y * J0.x;
1137  tmp_im += I3.z * J0.w + I3.w * J0.z;
1138  tmp_im += I4.x * J1.y + I4.y * J1.x;
1139  accum_im[8*blockDim.x] = tmp_im;
1140 
1141  //21 component:
1142  tmp_re = I3.x * J1.z - I3.y * J1.w;
1143  tmp_re += I3.z * J2.x - I3.w * J2.y;
1144  tmp_re += I4.x * J2.z - I4.y * J2.w;
1145  accum_re[9*blockDim.x] = tmp_re;
1146 
1147  tmp_im = I3.x * J1.w + I3.y * J1.z;
1148  tmp_im += I3.z * J2.y + I3.w * J2.x;
1149  tmp_im += I4.x * J2.w + I4.y * J2.z;
1150  accum_im[9*blockDim.x] = tmp_im;
1151 
1152  //22 component:
1153  tmp_re = I3.x * J3.x - I3.y * J3.y;
1154  tmp_re += I3.z * J3.z - I3.w * J3.w;
1155  tmp_re += I4.x * J4.x - I4.y * J4.y;
1156  accum_re[10*blockDim.x] = tmp_re;
1157 
1158  tmp_im = I3.x * J3.y + I3.y * J3.x;
1159  tmp_im += I3.z * J3.w + I3.w * J3.z;
1160  tmp_im += I4.x * J4.y + I4.y * J4.x;
1161  accum_im[10*blockDim.x] = tmp_im;
1162 
1163  //23 component:
1164  tmp_re = I3.x * J4.z - I3.y * J4.w;
1165  tmp_re += I3.z * J5.x - I3.w * J5.y;
1166  tmp_re += I4.x * J5.z - I4.y * J5.w;
1167  accum_re[11*blockDim.x] = tmp_re;
1168 
1169  tmp_im = I3.x * J4.w + I3.y * J4.z;
1170  tmp_im += I3.z * J5.y + I3.w * J5.x;
1171  tmp_im += I4.x * J5.w + I4.y * J5.z;
1172  accum_im[11*blockDim.x] = tmp_im;
1173 
1174  //30 component:
1175  tmp_re = I4.z * J0.x - I4.w * J0.y;
1176  tmp_re += I5.x * J0.z - I5.y * J0.w;
1177  tmp_re += I5.z * J1.x - I5.w * J1.y;
1178  accum_re[12*blockDim.x] = tmp_re;
1179 
1180  tmp_im = I4.z * J0.y + I4.w * J0.x;
1181  tmp_im += I5.x * J0.w + I5.y * J0.z;
1182  tmp_im += I5.z * J1.y + I5.w * J1.x;
1183  accum_im[12*blockDim.x] = tmp_im;
1184 
1185  //31 component:
1186  tmp_re = I4.z * J1.z - I4.w * J1.w;
1187  tmp_re += I5.x * J2.x - I5.y * J2.y;
1188  tmp_re += I5.z * J2.z - I5.w * J2.w;
1189  accum_re[13*blockDim.x] = tmp_re;
1190 
1191  tmp_im = I4.z * J1.w + I4.w * J1.z;
1192  tmp_im += I5.x * J2.y + I5.y * J2.x;
1193  tmp_im += I5.z * J2.w + I5.w * J2.z;
1194  accum_im[13*blockDim.x] = tmp_im;
1195 
1196  //32 component:
1197  tmp_re = I4.z * J3.x - I4.w * J3.y;
1198  tmp_re += I5.x * J3.z - I5.y * J3.w;
1199  tmp_re += I5.z * J4.x - I5.w * J4.y;
1200  accum_re[14*blockDim.x] = tmp_re;
1201 
1202  tmp_im = I4.z * J3.y + I4.w * J3.x;
1203  tmp_im += I5.x * J3.w + I5.y * J3.z;
1204  tmp_im += I5.z * J4.y + I5.w * J4.x;
1205  accum_im[14*blockDim.x] = tmp_im;
1206 
1207  //33 component:
1208  tmp_re = I4.z * J4.z - I4.w * J4.w;
1209  tmp_re += I5.x * J5.x - I5.y * J5.y;
1210  tmp_re += I5.z * J5.z - I5.w * J5.w;
1211  accum_re[15*blockDim.x] = tmp_re;
1212 
1213  tmp_im = I4.z * J4.w + I4.w * J4.z;
1214  tmp_im += I5.x * J5.y + I5.y * J5.x;
1215  tmp_im += I5.z * J5.w + I5.w * J5.z;
1216  accum_im[15*blockDim.x] = tmp_im;
1217 
1218 
1219 
1220  //Store output back to global buffer:
1221 
1222 
1223  /* CONTRACTION FULL VOLUME */
1224 
1225  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1226  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1227  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1228  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1229  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1230  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1231  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1232  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1233  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1234  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1235  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1236  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1237  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1238  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1239  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1240  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1241 
1242  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1243  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1244  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1245  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1246  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1247  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1248  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1249  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1250  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1251  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1252  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1253  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1254  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1255  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1256  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1257  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1258 
1259  return;
1260 }
1261 
1262 //Perform trace in color space only and for a given tslice
1263 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
1264 __global__ void contractTslicePlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
1265 {
1266  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
1267  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
1268  int inId = sid + Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
1269  int outId;
1270  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1271 
1272  if (sid >= param.threads) //param.threads == tslice volume
1273  return;
1274 
1275  volatile float2 tmp;
1276  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1277 
1278  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1279  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1280 
1281 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
1282 
1283  eutId = 2*inId;
1284  auxCoord1 = eutId / X1;
1285  xCoord1 = eutId - auxCoord1 * X1;
1286  auxCoord2 = auxCoord1 / X2;
1287  xCoord2 = auxCoord1 - auxCoord2 * X2;
1288  xCoord4 = auxCoord2 / X3;
1289  xCoord3 = auxCoord2 - xCoord4 * X3;
1290 
1291  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1292  xCoord1 += auxCoord1;
1293  outId = xCoord1 + X1*(xCoord2 + X2*xCoord3); //AQUI
1294 
1295  //Load the full input spinors:
1296 
1297  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1298  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1299 
1300  //compute in1^dag:
1301 
1302  I0.y = -I0.y;
1303  I0.w = -I0.w;
1304  I1.y = -I1.y;
1305  I1.w = -I1.w;
1306  I2.y = -I2.y;
1307  I2.w = -I2.w;
1308  I3.y = -I3.y;
1309  I3.w = -I3.w;
1310  I4.y = -I4.y;
1311  I4.w = -I4.w;
1312  I5.y = -I5.y;
1313  I5.w = -I5.w;
1314 
1315  //do products for first color component here:
1316  //00 component:
1317  tmp_re = I0.x * J0.x - I0.y * J0.y;
1318  tmp_re += I0.z * J0.z - I0.w * J0.w;
1319  tmp_re += I1.x * J1.x - I1.y * J1.y;
1320  accum_re[0*blockDim.x] = tmp_re;
1321 
1322  tmp_im = I0.x * J0.y + I0.y * J0.x;
1323  tmp_im += I0.z * J0.w + I0.w * J0.z;
1324  tmp_im += I1.x * J1.y + I1.y * J1.x;
1325  accum_im[0*blockDim.x] = tmp_im;
1326 
1327  //01 component:
1328  tmp_re = I0.x * J1.z - I0.y * J1.w;
1329  tmp_re += I0.z * J2.x - I0.w * J2.y;
1330  tmp_re += I1.x * J2.z - I1.y * J2.w;
1331  accum_re[1*blockDim.x] = tmp_re;
1332 
1333  tmp_im = I0.x * J1.w + I0.y * J1.z;
1334  tmp_im += I0.z * J2.y + I0.w * J2.x;
1335  tmp_im += I1.x * J2.w + I1.y * J2.z;
1336  accum_im[1*blockDim.x] = tmp_im;
1337 
1338  //02 component:
1339  tmp_re = I0.x * J3.x - I0.y * J3.y;
1340  tmp_re += I0.z * J3.z - I0.w * J3.w;
1341  tmp_re += I1.x * J4.x - I1.y * J4.y;
1342  accum_re[2*blockDim.x] = tmp_re;
1343 
1344  tmp_im = I0.x * J3.y + I0.y * J3.x;
1345  tmp_im += I0.z * J3.w + I0.w * J3.z;
1346  tmp_im += I1.x * J4.y + I1.y * J4.x;
1347  accum_im[2*blockDim.x] = tmp_im;
1348 
1349  //03 component:
1350  tmp_re = I0.x * J4.z - I0.y * J4.w;
1351  tmp_re += I0.z * J5.x - I0.w * J5.x;
1352  tmp_re += I1.x * J5.z - I1.y * J5.w;
1353 
1354  accum_re[3*blockDim.x] = tmp_re;
1355 
1356  tmp_im = I0.x * J4.w + I0.y * J4.z;
1357  tmp_im += I0.z * J5.y + I0.w * J5.y;
1358  tmp_im += I1.x * J5.w + I1.y * J5.z;
1359  accum_im[3*blockDim.x] = tmp_im;
1360 
1361  //10 component:
1362  tmp_re = I1.z * J0.x - I1.w * J0.y;
1363  tmp_re += I2.x * J0.z - I2.y * J0.w;
1364  tmp_re += I2.z * J1.x - I2.w * J1.y;
1365  accum_re[4*blockDim.x] = tmp_re;
1366 
1367  tmp_im = I1.z * J0.y + I1.w * J0.x;
1368  tmp_im += I2.x * J0.w + I2.y * J0.z;
1369  tmp_im += I2.z * J1.y + I2.w * J1.x;
1370  accum_im[4*blockDim.x] = tmp_im;
1371 
1372  //11 component:
1373  tmp_re = I1.z * J1.z - I1.w * J1.w;
1374  tmp_re += I2.x * J2.x - I2.y * J2.y;
1375  tmp_re += I2.z * J2.z - I2.w * J2.w;
1376  accum_re[5*blockDim.x] = tmp_re;
1377 
1378  tmp_im = I1.z * J1.w + I1.w * J1.z;
1379  tmp_im += I2.x * J2.y + I2.y * J2.x;
1380  tmp_im += I2.z * J2.w + I2.w * J2.z;
1381  accum_im[5*blockDim.x] = tmp_im;
1382 
1383  //12 component:
1384  tmp_re = I1.z * J3.x - I1.w * J3.y;
1385  tmp_re += I2.x * J3.z - I2.y * J3.w;
1386  tmp_re += I2.z * J4.x - I2.w * J4.y;
1387  accum_re[6*blockDim.x] = tmp_re;
1388 
1389  tmp_im = I1.z * J3.y + I1.w * J3.x;
1390  tmp_im += I2.x * J3.w + I2.y * J3.z;
1391  tmp_im += I2.z * J4.y + I2.w * J4.x;
1392  accum_im[6*blockDim.x] = tmp_im;
1393 
1394  //13 component:
1395  tmp_re = I1.z * J4.z - I1.w * J4.w;
1396  tmp_re += I2.x * J5.x - I2.y * J5.y;
1397  tmp_re += I2.z * J5.z - I2.w * J5.w;
1398  accum_re[7*blockDim.x] = tmp_re;
1399 
1400  tmp_im = I1.z * J4.w + I1.w * J4.z;
1401  tmp_im += I2.x * J5.y + I2.y * J5.x;
1402  tmp_im += I2.z * J5.w + I2.w * J5.z;
1403  accum_im[7*blockDim.x] = tmp_im;
1404 
1405  //20 component:
1406  tmp_re = I3.x * J0.x - I3.y * J0.y;
1407  tmp_re += I3.z * J0.z - I3.w * J0.w;
1408  tmp_re += I4.x * J1.x - I4.y * J1.y;
1409  accum_re[8*blockDim.x] = tmp_re;
1410 
1411  tmp_im = I3.x * J0.y + I3.y * J0.x;
1412  tmp_im += I3.z * J0.w + I3.w * J0.z;
1413  tmp_im += I4.x * J1.y + I4.y * J1.x;
1414  accum_im[8*blockDim.x] = tmp_im;
1415 
1416  //21 component:
1417  tmp_re = I3.x * J1.z - I3.y * J1.w;
1418  tmp_re += I3.z * J2.x - I3.w * J2.y;
1419  tmp_re += I4.x * J2.z - I4.y * J2.w;
1420  accum_re[9*blockDim.x] = tmp_re;
1421 
1422  tmp_im = I3.x * J1.w + I3.y * J1.z;
1423  tmp_im += I3.z * J2.y + I3.w * J2.x;
1424  tmp_im += I4.x * J2.w + I4.y * J2.z;
1425  accum_im[9*blockDim.x] = tmp_im;
1426 
1427  //22 component:
1428  tmp_re = I3.x * J3.x - I3.y * J3.y;
1429  tmp_re += I3.z * J3.z - I3.w * J3.w;
1430  tmp_re += I4.x * J4.x - I4.y * J4.y;
1431  accum_re[10*blockDim.x] = tmp_re;
1432 
1433  tmp_im = I3.x * J3.y + I3.y * J3.x;
1434  tmp_im += I3.z * J3.w + I3.w * J3.z;
1435  tmp_im += I4.x * J4.y + I4.y * J4.x;
1436  accum_im[10*blockDim.x] = tmp_im;
1437 
1438  //23 component:
1439  tmp_re = I3.x * J4.z - I3.y * J4.w;
1440  tmp_re += I3.z * J5.x - I3.w * J5.y;
1441  tmp_re += I4.x * J5.z - I4.y * J5.w;
1442  accum_re[11*blockDim.x] = tmp_re;
1443 
1444  tmp_im = I3.x * J4.w + I3.y * J4.z;
1445  tmp_im += I3.z * J5.y + I3.w * J5.x;
1446  tmp_im += I4.x * J5.w + I4.y * J5.z;
1447  accum_im[11*blockDim.x] = tmp_im;
1448 
1449  //30 component:
1450  tmp_re = I4.z * J0.x - I4.w * J0.y;
1451  tmp_re += I5.x * J0.z - I5.y * J0.w;
1452  tmp_re += I5.z * J1.x - I5.w * J1.y;
1453  accum_re[12*blockDim.x] = tmp_re;
1454 
1455  tmp_im = I4.z * J0.y + I4.w * J0.x;
1456  tmp_im += I5.x * J0.w + I5.y * J0.z;
1457  tmp_im += I5.z * J1.y + I5.w * J1.x;
1458  accum_im[12*blockDim.x] = tmp_im;
1459 
1460  //31 component:
1461  tmp_re = I4.z * J1.z - I4.w * J1.w;
1462  tmp_re += I5.x * J2.x - I5.y * J2.y;
1463  tmp_re += I5.z * J2.z - I5.w * J2.w;
1464  accum_re[13*blockDim.x] = tmp_re;
1465 
1466  tmp_im = I4.z * J1.w + I4.w * J1.z;
1467  tmp_im += I5.x * J2.y + I5.y * J2.x;
1468  tmp_im += I5.z * J2.w + I5.w * J2.z;
1469  accum_im[13*blockDim.x] = tmp_im;
1470 
1471  //32 component:
1472  tmp_re = I4.z * J3.x - I4.w * J3.y;
1473  tmp_re += I5.x * J3.z - I5.y * J3.w;
1474  tmp_re += I5.z * J4.x - I5.w * J4.y;
1475  accum_re[14*blockDim.x] = tmp_re;
1476 
1477  tmp_im = I4.z * J3.y + I4.w * J3.x;
1478  tmp_im += I5.x * J3.w + I5.y * J3.z;
1479  tmp_im += I5.z * J4.y + I5.w * J4.x;
1480  accum_im[14*blockDim.x] = tmp_im;
1481 
1482  //33 component:
1483  tmp_re = I4.z * J4.z - I4.w * J4.w;
1484  tmp_re += I5.x * J5.x - I5.y * J5.y;
1485  tmp_re += I5.z * J5.z - I5.w * J5.w;
1486  accum_re[15*blockDim.x] = tmp_re;
1487 
1488  tmp_im = I4.z * J4.w + I4.w * J4.z;
1489  tmp_im += I5.x * J5.y + I5.y * J5.x;
1490  tmp_im += I5.z * J5.w + I5.w * J5.z;
1491  accum_im[15*blockDim.x] = tmp_im;
1492 
1493  //Store output back to global buffer:
1494 
1495 
1496  /* CONTRACTION FULL VOLUME */
1497 
1498  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1499  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1500  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1501  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1502  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1503  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1504  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1505  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1506  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1507  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1508  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1509  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1510  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1511  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1512  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1513  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1514 
1515  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1516  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1517  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1518  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1519  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1520  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1521  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1522  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1523  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1524  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1525  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1526  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1527  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1528  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1529  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1530  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1531 
1532  return;
1533 }
1534 
1535 __global__ void contractPlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
1536 {
1537  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1538  int outId = sid;
1539  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1540 
1541  if (sid >= param.threads)
1542  return;
1543 
1544  volatile float2 tmp;
1545  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1546 
1547  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1548  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1549 
1550  eutId = 2*sid;
1551  auxCoord1 = eutId / X1;
1552  xCoord1 = eutId - auxCoord1 * X1;
1553  auxCoord2 = auxCoord1 / X2;
1554  xCoord2 = auxCoord1 - auxCoord2 * X2;
1555  xCoord4 = auxCoord2 / X3;
1556  xCoord3 = auxCoord2 - xCoord4 * X3;
1557 
1558  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1559  xCoord1 += auxCoord1;
1560  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
1561 
1562  //Load the full input spinors:
1563 
1564  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1565  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1566 
1567  //compute in1^dag:
1568 
1569  I0.y = -I0.y;
1570  I0.w = -I0.w;
1571  I1.y = -I1.y;
1572  I1.w = -I1.w;
1573  I2.y = -I2.y;
1574  I2.w = -I2.w;
1575  I3.y = -I3.y;
1576  I3.w = -I3.w;
1577  I4.y = -I4.y;
1578  I4.w = -I4.w;
1579  I5.y = -I5.y;
1580  I5.w = -I5.w;
1581 
1582  //do products for first color component here:
1583  //00 component:
1584  tmp_re = I0.x * J0.x - I0.y * J0.y;
1585  tmp_re += I0.z * J0.z - I0.w * J0.w;
1586  tmp_re += I1.x * J1.x - I1.y * J1.y;
1587  accum_re[0*blockDim.x] = tmp_re;
1588 
1589  tmp_im = I0.x * J0.y + I0.y * J0.x;
1590  tmp_im += I0.z * J0.w + I0.w * J0.z;
1591  tmp_im += I1.x * J1.y + I1.y * J1.x;
1592  accum_im[0*blockDim.x] = tmp_im;
1593 
1594  //01 component:
1595  tmp_re = I0.x * J1.z - I0.y * J1.w;
1596  tmp_re += I0.z * J2.x - I0.w * J2.y;
1597  tmp_re += I1.x * J2.z - I1.y * J2.w;
1598  accum_re[1*blockDim.x] = tmp_re;
1599 
1600  tmp_im = I0.x * J1.w + I0.y * J1.z;
1601  tmp_im += I0.z * J2.y + I0.w * J2.x;
1602  tmp_im += I1.x * J2.w + I1.y * J2.z;
1603  accum_im[1*blockDim.x] = tmp_im;
1604 
1605  //02 component:
1606  tmp_re = I0.x * J3.x - I0.y * J3.y;
1607  tmp_re += I0.z * J3.z - I0.w * J3.w;
1608  tmp_re += I1.x * J4.x - I1.y * J4.y;
1609  accum_re[2*blockDim.x] = tmp_re;
1610 
1611  tmp_im = I0.x * J3.y + I0.y * J3.x;
1612  tmp_im += I0.z * J3.w + I0.w * J3.z;
1613  tmp_im += I1.x * J4.y + I1.y * J4.x;
1614  accum_im[2*blockDim.x] = tmp_im;
1615 
1616  //03 component:
1617  tmp_re = I0.x * J4.z - I0.y * J4.w;
1618  tmp_re += I0.z * J5.x - I0.w * J5.x;
1619  tmp_re += I1.x * J5.z - I1.y * J5.w;
1620 
1621  accum_re[3*blockDim.x] = tmp_re;
1622 
1623  tmp_im = I0.x * J4.w + I0.y * J4.z;
1624  tmp_im += I0.z * J5.y + I0.w * J5.y;
1625  tmp_im += I1.x * J5.w + I1.y * J5.z;
1626  accum_im[3*blockDim.x] = tmp_im;
1627 
1628  //10 component:
1629  tmp_re = I1.z * J0.x - I1.w * J0.y;
1630  tmp_re += I2.x * J0.z - I2.y * J0.w;
1631  tmp_re += I2.z * J1.x - I2.w * J1.y;
1632  accum_re[4*blockDim.x] = tmp_re;
1633 
1634  tmp_im = I1.z * J0.y + I1.w * J0.x;
1635  tmp_im += I2.x * J0.w + I2.y * J0.z;
1636  tmp_im += I2.z * J1.y + I2.w * J1.x;
1637  accum_im[4*blockDim.x] = tmp_im;
1638 
1639  //11 component:
1640  tmp_re = I1.z * J1.z - I1.w * J1.w;
1641  tmp_re += I2.x * J2.x - I2.y * J2.y;
1642  tmp_re += I2.z * J2.z - I2.w * J2.w;
1643  accum_re[5*blockDim.x] = tmp_re;
1644 
1645  tmp_im = I1.z * J1.w + I1.w * J1.z;
1646  tmp_im += I2.x * J2.y + I2.y * J2.x;
1647  tmp_im += I2.z * J2.w + I2.w * J2.z;
1648  accum_im[5*blockDim.x] = tmp_im;
1649 
1650  //12 component:
1651  tmp_re = I1.z * J3.x - I1.w * J3.y;
1652  tmp_re += I2.x * J3.z - I2.y * J3.w;
1653  tmp_re += I2.z * J4.x - I2.w * J4.y;
1654  accum_re[6*blockDim.x] = tmp_re;
1655 
1656  tmp_im = I1.z * J3.y + I1.w * J3.x;
1657  tmp_im += I2.x * J3.w + I2.y * J3.z;
1658  tmp_im += I2.z * J4.y + I2.w * J4.x;
1659  accum_im[6*blockDim.x] = tmp_im;
1660 
1661  //13 component:
1662  tmp_re = I1.z * J4.z - I1.w * J4.w;
1663  tmp_re += I2.x * J5.x - I2.y * J5.y;
1664  tmp_re += I2.z * J5.z - I2.w * J5.w;
1665  accum_re[7*blockDim.x] = tmp_re;
1666 
1667  tmp_im = I1.z * J4.w + I1.w * J4.z;
1668  tmp_im += I2.x * J5.y + I2.y * J5.x;
1669  tmp_im += I2.z * J5.w + I2.w * J5.z;
1670  accum_im[7*blockDim.x] = tmp_im;
1671 
1672  //20 component:
1673  tmp_re = I3.x * J0.x - I3.y * J0.y;
1674  tmp_re += I3.z * J0.z - I3.w * J0.w;
1675  tmp_re += I4.x * J1.x - I4.y * J1.y;
1676  accum_re[8*blockDim.x] = tmp_re;
1677 
1678  tmp_im = I3.x * J0.y + I3.y * J0.x;
1679  tmp_im += I3.z * J0.w + I3.w * J0.z;
1680  tmp_im += I4.x * J1.y + I4.y * J1.x;
1681  accum_im[8*blockDim.x] = tmp_im;
1682 
1683  //21 component:
1684  tmp_re = I3.x * J1.z - I3.y * J1.w;
1685  tmp_re += I3.z * J2.x - I3.w * J2.y;
1686  tmp_re += I4.x * J2.z - I4.y * J2.w;
1687  accum_re[9*blockDim.x] = tmp_re;
1688 
1689  tmp_im = I3.x * J1.w + I3.y * J1.z;
1690  tmp_im += I3.z * J2.y + I3.w * J2.x;
1691  tmp_im += I4.x * J2.w + I4.y * J2.z;
1692  accum_im[9*blockDim.x] = tmp_im;
1693 
1694  //22 component:
1695  tmp_re = I3.x * J3.x - I3.y * J3.y;
1696  tmp_re += I3.z * J3.z - I3.w * J3.w;
1697  tmp_re += I4.x * J4.x - I4.y * J4.y;
1698  accum_re[10*blockDim.x] = tmp_re;
1699 
1700  tmp_im = I3.x * J3.y + I3.y * J3.x;
1701  tmp_im += I3.z * J3.w + I3.w * J3.z;
1702  tmp_im += I4.x * J4.y + I4.y * J4.x;
1703  accum_im[10*blockDim.x] = tmp_im;
1704 
1705  //23 component:
1706  tmp_re = I3.x * J4.z - I3.y * J4.w;
1707  tmp_re += I3.z * J5.x - I3.w * J5.y;
1708  tmp_re += I4.x * J5.z - I4.y * J5.w;
1709  accum_re[11*blockDim.x] = tmp_re;
1710 
1711  tmp_im = I3.x * J4.w + I3.y * J4.z;
1712  tmp_im += I3.z * J5.y + I3.w * J5.x;
1713  tmp_im += I4.x * J5.w + I4.y * J5.z;
1714  accum_im[11*blockDim.x] = tmp_im;
1715 
1716  //30 component:
1717  tmp_re = I4.z * J0.x - I4.w * J0.y;
1718  tmp_re += I5.x * J0.z - I5.y * J0.w;
1719  tmp_re += I5.z * J1.x - I5.w * J1.y;
1720  accum_re[12*blockDim.x] = tmp_re;
1721 
1722  tmp_im = I4.z * J0.y + I4.w * J0.x;
1723  tmp_im += I5.x * J0.w + I5.y * J0.z;
1724  tmp_im += I5.z * J1.y + I5.w * J1.x;
1725  accum_im[12*blockDim.x] = tmp_im;
1726 
1727  //31 component:
1728  tmp_re = I4.z * J1.z - I4.w * J1.w;
1729  tmp_re += I5.x * J2.x - I5.y * J2.y;
1730  tmp_re += I5.z * J2.z - I5.w * J2.w;
1731  accum_re[13*blockDim.x] = tmp_re;
1732 
1733  tmp_im = I4.z * J1.w + I4.w * J1.z;
1734  tmp_im += I5.x * J2.y + I5.y * J2.x;
1735  tmp_im += I5.z * J2.w + I5.w * J2.z;
1736  accum_im[13*blockDim.x] = tmp_im;
1737 
1738  //32 component:
1739  tmp_re = I4.z * J3.x - I4.w * J3.y;
1740  tmp_re += I5.x * J3.z - I5.y * J3.w;
1741  tmp_re += I5.z * J4.x - I5.w * J4.y;
1742  accum_re[14*blockDim.x] = tmp_re;
1743 
1744  tmp_im = I4.z * J3.y + I4.w * J3.x;
1745  tmp_im += I5.x * J3.w + I5.y * J3.z;
1746  tmp_im += I5.z * J4.y + I5.w * J4.x;
1747  accum_im[14*blockDim.x] = tmp_im;
1748 
1749  //33 component:
1750  tmp_re = I4.z * J4.z - I4.w * J4.w;
1751  tmp_re += I5.x * J5.x - I5.y * J5.y;
1752  tmp_re += I5.z * J5.z - I5.w * J5.w;
1753  accum_re[15*blockDim.x] = tmp_re;
1754 
1755  tmp_im = I4.z * J4.w + I4.w * J4.z;
1756  tmp_im += I5.x * J5.y + I5.y * J5.x;
1757  tmp_im += I5.z * J5.w + I5.w * J5.z;
1758  accum_im[15*blockDim.x] = tmp_im;
1759 
1760  //Store output back to global buffer:
1761 
1762  /* CONTRACTION FULL VOLUME */
1763 
1764  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1765  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1766  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1767  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1768  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1769  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1770  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1771  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1772  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1773  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1774  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1775  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1776  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1777  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1778  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1779  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1780 
1781  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1782  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1783  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1784  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1785  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1786  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1787  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1788  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1789  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1790  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1791  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1792  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1793  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1794  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1795  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1796  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1797 
1798  return;
1799 }
1800 
1801 #undef SPINORTEX
1802 #undef INTERTEX
1803 
1804 #undef tmp_re
1805 #undef tmp_im
1806 
1807 #undef READ_SPINOR
1808 #undef READ_INTERMEDIATE_SPINOR
1809 
1810 #undef SPINOR_HOP
1811 
1812 #endif //_TWIST_QUDA_CONTRACT_PLUS
__constant__ int X2
__constant__ int Vsh
__constant__ int X1
#define SPINORTEX
#define TOTAL_COMPONENTS
__global__ void contractTslicePlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
#define tmp_re
__global__ void contractGamma5PlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define READ_INTERMEDIATE_SPINOR
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaColorSpinorField * tmp
#define INTERTEX
#define tmp_im
cpuColorSpinorField * out
__constant__ int X3
__global__ void contractPlusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define READ_SPINOR