QUDA  v0.7.0
A library for QCD on GPUs
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
contract_core.h
Go to the documentation of this file.
1 #ifndef _TWIST_QUDA_CONTRACT
2 #define _TWIST_QUDA_CONTRACT
3 
4 #define tmp_re tmp.x
5 #define tmp_im tmp.y
6 
7 #define TOTAL_COMPONENTS 16
8 
9 #if (__COMPUTE_CAPABILITY__ >= 130)
10 
11 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
12  double2 J0 = spinor[sp_idx + 0*(stride)]; \
13  double2 J1 = spinor[sp_idx + 1*(stride)]; \
14  double2 J2 = spinor[sp_idx + 2*(stride)]; \
15  double2 J3 = spinor[sp_idx + 3*(stride)]; \
16  double2 J4 = spinor[sp_idx + 4*(stride)]; \
17  double2 J5 = spinor[sp_idx + 5*(stride)]; \
18  double2 J6 = spinor[sp_idx + 6*(stride)]; \
19  double2 J7 = spinor[sp_idx + 7*(stride)]; \
20  double2 J8 = spinor[sp_idx + 8*(stride)]; \
21  double2 J9 = spinor[sp_idx + 9*(stride)]; \
22  double2 J10 = spinor[sp_idx +10*(stride)]; \
23  double2 J11 = spinor[sp_idx +11*(stride)];
24 
25 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
26  double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
27  double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
28  double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
29  double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
30  double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
31  double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
32  double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
33  double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
34  double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
35  double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
36  double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
37  double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
38 
39 #ifdef DIRECT_ACCESS_WILSON_SPINOR
40  #define READ_SPINOR READ_SPINOR_DOUBLE
41  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
42  #define SPINORTEX in
43  #define INTERTEX in2
44 #else
45  #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
46  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
47 
48  #ifdef USE_TEXTURE_OBJECTS
49  #define SPINORTEX param.inTex
50  #define INTERTEX param.outTex
51  #else
52  #define SPINORTEX spinorTexDouble
53  #define INTERTEX interTexDouble
54  #endif // USE_TEXTURE_OBJECTS
55 
56 #define SPINOR_HOP 12
57 
58 #endif
59 
60 __global__ void contractGamma5Kernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
61 {
62  int sid = blockIdx.x*blockDim.x + threadIdx.x;
63  int outId = sid;
64  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
65 
66  if (sid >= param.threads)
67  return;
68 
69  volatile double2 tmp;
70  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
71 
72  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
73  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
74 
75  eutId = 2*sid;
76  auxCoord1 = eutId / X1;
77  xCoord1 = eutId - auxCoord1 * X1;
78  auxCoord2 = auxCoord1 / X2;
79  xCoord2 = auxCoord1 - auxCoord2 * X2;
80  xCoord4 = auxCoord2 / X3;
81  xCoord3 = auxCoord2 - xCoord4 * X3;
82 
83  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
84  xCoord1 += auxCoord1;
85  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
86 
87  READ_SPINOR (SPINORTEX, myStride, sid, sid);
88  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
89 
90  //compute in1^dag * gamma5:
91 
92  tmp_re = +I0.x;
93  tmp_im = -I0.y;
94  I0.x = +I6.x;
95  I0.y = -I6.y;
96  I6.x = tmp_re;
97  I6.y = tmp_im;
98 
99  tmp_re = +I3.x;
100  tmp_im = -I3.y;
101  I3.x = +I9.x;
102  I3.y = -I9.y;
103  I9.x = tmp_re;
104  I9.y = tmp_im;
105 
106  tmp_re = +I1.x;
107  tmp_im = -I1.y;
108  I1.x = +I7.x;
109  I1.y = -I7.y;
110  I7.x = tmp_re;
111  I7.y = tmp_im;
112 
113  tmp_re = +I4.x;
114  tmp_im = -I4.y;
115  I4.x = +I10.x;
116  I4.y = -I10.y;
117  I10.x = tmp_re;
118  I10.y = tmp_im;
119 
120  tmp_re = +I2.x;
121  tmp_im = -I2.y;
122  I2.x = +I8.x;
123  I2.y = -I8.y;
124  I8.x = tmp_re;
125  I8.y = tmp_im;
126 
127  tmp_re = +I5.x;
128  tmp_im = -I5.y;
129  I5.x = +I11.x;
130  I5.y = -I11.y;
131  I11.x = tmp_re;
132  I11.y = tmp_im;
133 
134  //do products for all color component here:
135  //00 component:
136  tmp_re = I0.x * J0.x - I0.y * J0.y;
137  tmp_re += I1.x * J1.x - I1.y * J1.y;
138  tmp_re += I2.x * J2.x - I2.y * J2.y;
139  accum_re[0*blockDim.x] = tmp_re;
140 
141  tmp_im = I0.x * J0.y + I0.y * J0.x;
142  tmp_im += I1.x * J1.y + I1.y * J1.x;
143  tmp_im += I2.x * J2.y + I2.y * J2.x;
144  accum_im[0*blockDim.x] = tmp_im;
145 
146  //01 component:
147  tmp_re = I0.x * J3.x - I0.y * J3.y;
148  tmp_re += I1.x * J4.x - I1.y * J4.y;
149  tmp_re += I2.x * J5.x - I2.y * J5.y;
150  accum_re[1*blockDim.x] = tmp_re;
151 
152  tmp_im = I0.x * J3.y + I0.y * J3.x;
153  tmp_im += I1.x * J4.y + I1.y * J4.x;
154  tmp_im += I2.x * J5.y + I2.y * J5.x;
155  accum_im[1*blockDim.x] = tmp_im;
156 
157  //02 component:
158  tmp_re = I0.x * J6.x - I0.y * J6.y;
159  tmp_re += I1.x * J7.x - I1.y * J7.y;
160  tmp_re += I2.x * J8.x - I2.y * J8.y;
161  accum_re[2*blockDim.x] = tmp_re;
162 
163  tmp_im = I0.x * J6.y + I0.y * J6.x;
164  tmp_im += I1.x * J7.y + I1.y * J7.x;
165  tmp_im += I2.x * J8.y + I2.y * J8.x;
166  accum_im[2*blockDim.x] = tmp_im;
167 
168  //03 component:
169  tmp_re = I0.x * J9.x - I0.y * J9.y;
170  tmp_re += I1.x * J10.x - I1.y * J10.y;
171  tmp_re += I2.x * J11.x - I2.y * J11.y;
172  accum_re[3*blockDim.x] = tmp_re;
173 
174  tmp_im = I0.x * J9.y + I0.y * J9.x;
175  tmp_im += I1.x * J10.y + I1.y * J10.x;
176  tmp_im += I2.x * J11.y + I2.y * J11.x;
177  accum_im[3*blockDim.x] = tmp_im;
178 
179  //10 component:
180  tmp_re = I3.x * J0.x - I3.y * J0.y;
181  tmp_re += I4.x * J1.x - I4.y * J1.y;
182  tmp_re += I5.x * J2.x - I5.y * J2.y;
183  accum_re[ 4*blockDim.x] = tmp_re;
184 
185  tmp_im = I3.x * J0.y + I3.y * J0.x;
186  tmp_im += I4.x * J1.y + I4.y * J1.x;
187  tmp_im += I5.x * J2.y + I5.y * J2.x;
188  accum_im[ 4*blockDim.x] = tmp_im;
189 
190  //11 component:
191  tmp_re = I3.x * J3.x - I3.y * J3.y;
192  tmp_re += I4.x * J4.x - I4.y * J4.y;
193  tmp_re += I5.x * J5.x - I5.y * J5.y;
194  accum_re[ 5*blockDim.x] = tmp_re;
195 
196  tmp_im = I3.x * J3.y + I3.y * J3.x;
197  tmp_im += I4.x * J4.y + I4.y * J4.x;
198  tmp_im += I5.x * J5.y + I5.y * J5.x;
199  accum_im[ 5*blockDim.x] = tmp_im;
200 
201  //12 component:
202  tmp_re = I3.x * J6.x - I3.y * J6.y;
203  tmp_re += I4.x * J7.x - I4.y * J7.y;
204  tmp_re += I5.x * J8.x - I5.y * J8.y;
205  accum_re[ 6*blockDim.x] = tmp_re;
206 
207  tmp_im = I3.x * J6.y + I3.y * J6.x;
208  tmp_im += I4.x * J7.y + I4.y * J7.x;
209  tmp_im += I5.x * J8.y + I5.y * J8.x;
210  accum_im[ 6*blockDim.x] = tmp_im;
211 
212  //13 component:
213  tmp_re = I3.x * J9.x - I3.y * J9.y;
214  tmp_re += I4.x * J10.x - I4.y * J10.y;
215  tmp_re += I5.x * J11.x - I5.y * J11.y;
216  accum_re[ 7*blockDim.x] = tmp_re;
217 
218  tmp_im = I3.x * J9.y + I3.y * J9.x;
219  tmp_im += I4.x * J10.y + I4.y * J10.x;
220  tmp_im += I5.x * J11.y + I5.y * J11.x;
221  accum_im[ 7*blockDim.x] = tmp_im;
222 
223  //20 component:
224  tmp_re = I6.x * J0.x - I6.y * J0.y;
225  tmp_re += I7.x * J1.x - I7.y * J1.y;
226  tmp_re += I8.x * J2.x - I8.y * J2.y;
227  accum_re[ 8*blockDim.x] = tmp_re;
228 
229  tmp_im = I6.x * J0.y + I6.y * J0.x;
230  tmp_im += I7.x * J1.y + I7.y * J1.x;
231  tmp_im += I8.x * J2.y + I8.y * J2.x;
232  accum_im[ 8*blockDim.x] = tmp_im;
233 
234  //21 component:
235  tmp_re = I6.x * J3.x - I6.y * J3.y;
236  tmp_re += I7.x * J4.x - I7.y * J4.y;
237  tmp_re += I8.x * J5.x - I8.y * J5.y;
238  accum_re[ 9*blockDim.x] = tmp_re;
239 
240  tmp_im = I6.x * J3.y + I6.y * J3.x;
241  tmp_im += I7.x * J4.y + I7.y * J4.x;
242  tmp_im += I8.x * J5.y + I8.y * J5.x;
243  accum_im[ 9*blockDim.x] = tmp_im;
244 
245  //22 component:
246  tmp_re = I6.x * J6.x - I6.y * J6.y;
247  tmp_re += I7.x * J7.x - I7.y * J7.y;
248  tmp_re += I8.x * J8.x - I8.y * J8.y;
249  accum_re[10*blockDim.x] = tmp_re;
250 
251  tmp_im = I6.x * J6.y + I6.y * J6.x;
252  tmp_im += I7.x * J7.y + I7.y * J7.x;
253  tmp_im += I8.x * J8.y + I8.y * J8.x;
254  accum_im[10*blockDim.x] = tmp_im;
255 
256  //23 component:
257  tmp_re = I6.x * J9.x - I6.y * J9.y;
258  tmp_re += I7.x * J10.x - I7.y * J10.y;
259  tmp_re += I8.x * J11.x - I8.y * J11.y;
260  accum_re[11*blockDim.x] = tmp_re;
261 
262  tmp_im = I6.x * J9.y + I6.y * J9.x;
263  tmp_im += I7.x * J10.y + I7.y * J10.x;
264  tmp_im += I8.x * J11.y + I8.y * J11.x;
265  accum_im[11*blockDim.x] = tmp_im;
266 
267  //30 component:
268  tmp_re = I9.x * J0.x - I9.y * J0.y;
269  tmp_re += I10.x * J1.x - I10.y * J1.y;
270  tmp_re += I11.x * J2.x - I11.y * J2.y;
271  accum_re[12*blockDim.x] = tmp_re;
272 
273  tmp_im = I9.x * J0.y + I9.y * J0.x;
274  tmp_im += I10.x * J1.y + I10.y * J1.x;
275  tmp_im += I11.x * J2.y + I11.y * J2.x;
276  accum_im[12*blockDim.x] = tmp_im;
277 
278  //31 component:
279  tmp_re = I9.x * J3.x - I9.y * J3.y;
280  tmp_re += I10.x * J4.x - I10.y * J4.y;
281  tmp_re += I11.x * J5.x - I11.y * J5.y;
282  accum_re[13*blockDim.x] = tmp_re;
283 
284  tmp_im = I9.x * J3.y + I9.y * J3.x;
285  tmp_im += I10.x * J4.y + I10.y * J4.x;
286  tmp_im += I11.x * J5.y + I11.y * J5.x;
287  accum_im[13*blockDim.x] = tmp_im;
288 
289  //32 component:
290  tmp_re = I9.x * J6.x - I9.y * J6.y;
291  tmp_re += I10.x * J7.x - I10.y * J7.y;
292  tmp_re += I11.x * J8.x - I11.y * J8.y;
293  accum_re[14*blockDim.x] = tmp_re;
294 
295  tmp_im = I9.x * J6.y + I9.y * J6.x;
296  tmp_im += I10.x * J7.y + I10.y * J7.x;
297  tmp_im += I11.x * J8.y + I11.y * J8.x;
298  accum_im[14*blockDim.x] = tmp_im;
299 
300  //33 component:
301  tmp_re = I9.x * J9.x - I9.y * J9.y;
302  tmp_re += I10.x * J10.x - I10.y * J10.y;
303  tmp_re += I11.x * J11.x - I11.y * J11.y;
304  accum_re[15*blockDim.x] = tmp_re;
305 
306  tmp_im = I9.x * J9.y + I9.y * J9.x;
307  tmp_im += I10.x * J10.y + I10.y * J10.x;
308  tmp_im += I11.x * J11.y + I11.y * J11.x;
309  accum_im[15*blockDim.x] = tmp_im;
310 
311  //Store output back to global buffer:
312 
313 
314 /* CONTRACTION FULL VOLUME */
315 
316  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
317  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
318  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
319  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
320  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
321  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
322  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
323  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
324  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
325  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
326  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
327  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
328  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
329  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
330  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
331  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
332 
333  return;
334 }
335 
336 //Perform trace in color space only and for a given tslice
337 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
338 __global__ void contractTsliceKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
339 {
340  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
341  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
342  int inId = sid + Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
343  int outId;
344  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
345 
346  if (sid >= param.threads) //param.threads == tslice volume
347  return;
348 
349  volatile double2 tmp;
350  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
351 
352  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
353  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
354 
355 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
356 
357  eutId = 2*inId;
358  auxCoord1 = eutId / X1;
359  xCoord1 = eutId - auxCoord1 * X1;
360  auxCoord2 = auxCoord1 / X2;
361  xCoord2 = auxCoord1 - auxCoord2 * X2;
362  xCoord4 = auxCoord2 / X3;
363  xCoord3 = auxCoord2 - xCoord4 * X3;
364 
365  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
366  xCoord1 += auxCoord1;
367  outId = xCoord1 + X1*(xCoord2 + X2*xCoord3); //AQUI
368 
369  READ_SPINOR (SPINORTEX, myStride, sid, sid);
370  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
371 
372  //compute in1^dag:
373 
374  I0.y = -I0.y;
375  I1.y = -I1.y;
376  I2.y = -I2.y;
377  I3.y = -I3.y;
378  I4.y = -I4.y;
379  I5.y = -I5.y;
380  I6.y = -I6.y;
381  I7.y = -I7.y;
382  I8.y = -I8.y;
383  I9.y = -I9.y;
384  I10.y = -I10.y;
385  I11.y = -I11.y;
386 
387  //do products for all color component here:
388  //00 component:
389  tmp_re = I0.x * J0.x - I0.y * J0.y;
390  tmp_re += I1.x * J1.x - I1.y * J1.y;
391  tmp_re += I2.x * J2.x - I2.y * J2.y;
392  accum_re[0*blockDim.x] = tmp_re;
393 
394  tmp_im = I0.x * J0.y + I0.y * J0.x;
395  tmp_im += I1.x * J1.y + I1.y * J1.x;
396  tmp_im += I2.x * J2.y + I2.y * J2.x;
397  accum_im[0*blockDim.x] = tmp_im;
398 
399  //01 component:
400  tmp_re = I0.x * J3.x - I0.y * J3.y;
401  tmp_re += I1.x * J4.x - I1.y * J4.y;
402  tmp_re += I2.x * J5.x - I2.y * J5.y;
403  accum_re[1*blockDim.x] = tmp_re;
404 
405  tmp_im = I0.x * J3.y + I0.y * J3.x;
406  tmp_im += I1.x * J4.y + I1.y * J4.x;
407  tmp_im += I2.x * J5.y + I2.y * J5.x;
408  accum_im[1*blockDim.x] = tmp_im;
409 
410  //02 component:
411  tmp_re = I0.x * J6.x - I0.y * J6.y;
412  tmp_re += I1.x * J7.x - I1.y * J7.y;
413  tmp_re += I2.x * J8.x - I2.y * J8.y;
414  accum_re[2*blockDim.x] = tmp_re;
415 
416  tmp_im = I0.x * J6.y + I0.y * J6.x;
417  tmp_im += I1.x * J7.y + I1.y * J7.x;
418  tmp_im += I2.x * J8.y + I2.y * J8.x;
419  accum_im[2*blockDim.x] = tmp_im;
420 
421  //03 component:
422  tmp_re = I0.x * J9.x - I0.y * J9.y;
423  tmp_re += I1.x * J10.x - I1.y * J10.y;
424  tmp_re += I2.x * J11.x - I2.y * J11.y;
425  accum_re[3*blockDim.x] = tmp_re;
426 
427  tmp_im = I0.x * J9.y + I0.y * J9.x;
428  tmp_im += I1.x * J10.y + I1.y * J10.x;
429  tmp_im += I2.x * J11.y + I2.y * J11.x;
430  accum_im[3*blockDim.x] = tmp_im;
431 
432  //10 component:
433  tmp_re = I3.x * J0.x - I3.y * J0.y;
434  tmp_re += I4.x * J1.x - I4.y * J1.y;
435  tmp_re += I5.x * J2.x - I5.y * J2.y;
436  accum_re[ 4*blockDim.x] = tmp_re;
437 
438  tmp_im = I3.x * J0.y + I3.y * J0.x;
439  tmp_im += I4.x * J1.y + I4.y * J1.x;
440  tmp_im += I5.x * J2.y + I5.y * J2.x;
441  accum_im[ 4*blockDim.x] = tmp_im;
442 
443  //11 component:
444  tmp_re = I3.x * J3.x - I3.y * J3.y;
445  tmp_re += I4.x * J4.x - I4.y * J4.y;
446  tmp_re += I5.x * J5.x - I5.y * J5.y;
447  accum_re[ 5*blockDim.x] = tmp_re;
448 
449  tmp_im = I3.x * J3.y + I3.y * J3.x;
450  tmp_im += I4.x * J4.y + I4.y * J4.x;
451  tmp_im += I5.x * J5.y + I5.y * J5.x;
452  accum_im[ 5*blockDim.x] = tmp_im;
453 
454  //12 component:
455  tmp_re = I3.x * J6.x - I3.y * J6.y;
456  tmp_re += I4.x * J7.x - I4.y * J7.y;
457  tmp_re += I5.x * J8.x - I5.y * J8.y;
458  accum_re[ 6*blockDim.x] = tmp_re;
459 
460  tmp_im = I3.x * J6.y + I3.y * J6.x;
461  tmp_im += I4.x * J7.y + I4.y * J7.x;
462  tmp_im += I5.x * J8.y + I5.y * J8.x;
463  accum_im[ 6*blockDim.x] = tmp_im;
464 
465  //13 component:
466  tmp_re = I3.x * J9.x - I3.y * J9.y;
467  tmp_re += I4.x * J10.x - I4.y * J10.y;
468  tmp_re += I5.x * J11.x - I5.y * J11.y;
469  accum_re[ 7*blockDim.x] = tmp_re;
470 
471  tmp_im = I3.x * J9.y + I3.y * J9.x;
472  tmp_im += I4.x * J10.y + I4.y * J10.x;
473  tmp_im += I5.x * J11.y + I5.y * J11.x;
474  accum_im[ 7*blockDim.x] = tmp_im;
475 
476  //20 component:
477  tmp_re = I6.x * J0.x - I6.y * J0.y;
478  tmp_re += I7.x * J1.x - I7.y * J1.y;
479  tmp_re += I8.x * J2.x - I8.y * J2.y;
480  accum_re[ 8*blockDim.x] = tmp_re;
481 
482  tmp_im = I6.x * J0.y + I6.y * J0.x;
483  tmp_im += I7.x * J1.y + I7.y * J1.x;
484  tmp_im += I8.x * J2.y + I8.y * J2.x;
485  accum_im[ 8*blockDim.x] = tmp_im;
486 
487  //21 component:
488  tmp_re = I6.x * J3.x - I6.y * J3.y;
489  tmp_re += I7.x * J4.x - I7.y * J4.y;
490  tmp_re += I8.x * J5.x - I8.y * J5.y;
491  accum_re[ 9*blockDim.x] = tmp_re;
492 
493  tmp_im = I6.x * J3.y + I6.y * J3.x;
494  tmp_im += I7.x * J4.y + I7.y * J4.x;
495  tmp_im += I8.x * J5.y + I8.y * J5.x;
496  accum_im[ 9*blockDim.x] = tmp_im;
497 
498  //22 component:
499  tmp_re = I6.x * J6.x - I6.y * J6.y;
500  tmp_re += I7.x * J7.x - I7.y * J7.y;
501  tmp_re += I8.x * J8.x - I8.y * J8.y;
502  accum_re[10*blockDim.x] = tmp_re;
503 
504  tmp_im = I6.x * J6.y + I6.y * J6.x;
505  tmp_im += I7.x * J7.y + I7.y * J7.x;
506  tmp_im += I8.x * J8.y + I8.y * J8.x;
507  accum_im[10*blockDim.x] = tmp_im;
508 
509  //23 component:
510  tmp_re = I6.x * J9.x - I6.y * J9.y;
511  tmp_re += I7.x * J10.x - I7.y * J10.y;
512  tmp_re += I8.x * J11.x - I8.y * J11.y;
513  accum_re[11*blockDim.x] = tmp_re;
514 
515  tmp_im = I6.x * J9.y + I6.y * J9.x;
516  tmp_im += I7.x * J10.y + I7.y * J10.x;
517  tmp_im += I8.x * J11.y + I8.y * J11.x;
518  accum_im[11*blockDim.x] = tmp_im;
519 
520  //30 component:
521  tmp_re = I9.x * J0.x - I9.y * J0.y;
522  tmp_re += I10.x * J1.x - I10.y * J1.y;
523  tmp_re += I11.x * J2.x - I11.y * J2.y;
524  accum_re[12*blockDim.x] = tmp_re;
525 
526  tmp_im = I9.x * J0.y + I9.y * J0.x;
527  tmp_im += I10.x * J1.y + I10.y * J1.x;
528  tmp_im += I11.x * J2.y + I11.y * J2.x;
529  accum_im[12*blockDim.x] = tmp_im;
530 
531  //31 component:
532  tmp_re = I9.x * J3.x - I9.y * J3.y;
533  tmp_re += I10.x * J4.x - I10.y * J4.y;
534  tmp_re += I11.x * J5.x - I11.y * J5.y;
535  accum_re[13*blockDim.x] = tmp_re;
536 
537  tmp_im = I9.x * J3.y + I9.y * J3.x;
538  tmp_im += I10.x * J4.y + I10.y * J4.x;
539  tmp_im += I11.x * J5.y + I11.y * J5.x;
540  accum_im[13*blockDim.x] = tmp_im;
541 
542  //32 component:
543  tmp_re = I9.x * J6.x - I9.y * J6.y;
544  tmp_re += I10.x * J7.x - I10.y * J7.y;
545  tmp_re += I11.x * J8.x - I11.y * J8.y;
546  accum_re[14*blockDim.x] = tmp_re;
547 
548  tmp_im = I9.x * J6.y + I9.y * J6.x;
549  tmp_im += I10.x * J7.y + I10.y * J7.x;
550  tmp_im += I11.x * J8.y + I11.y * J8.x;
551  accum_im[14*blockDim.x] = tmp_im;
552 
553  //33 component:
554  tmp_re = I9.x * J9.x - I9.y * J9.y;
555  tmp_re += I10.x * J10.x - I10.y * J10.y;
556  tmp_re += I11.x * J11.x - I11.y * J11.y;
557  accum_re[15*blockDim.x] = tmp_re;
558 
559  tmp_im = I9.x * J9.y + I9.y * J9.x;
560  tmp_im += I10.x * J10.y + I10.y * J10.x;
561  tmp_im += I11.x * J11.y + I11.y * J11.x;
562  accum_im[15*blockDim.x] = tmp_im;
563 
564  //Store output back to global buffer:
565 
566 
567 /* CONTRACTION FULL VOLUME */
568 
569  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
570  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
571  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
572  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
573  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
574  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
575  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
576  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
577  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
578  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
579  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
580  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
581  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
582  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
583  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
584  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
585 
586  return;
587 }
588 
589 __global__ void contractKernel (double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
590 {
591  int sid = blockIdx.x*blockDim.x + threadIdx.x;
592  int outId = sid;
593  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
594 
595  if (sid >= param.threads)
596  return;
597 
598  volatile double2 tmp;
599  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
600 
601  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
602  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
603 
604  eutId = 2*sid;
605  auxCoord1 = eutId / X1;
606  xCoord1 = eutId - auxCoord1 * X1;
607  auxCoord2 = auxCoord1 / X2;
608  xCoord2 = auxCoord1 - auxCoord2 * X2;
609  xCoord4 = auxCoord2 / X3;
610  xCoord3 = auxCoord2 - xCoord4 * X3;
611 
612  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
613  xCoord1 += auxCoord1;
614  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
615 
616  READ_SPINOR (SPINORTEX, myStride, sid, sid);
617  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
618 
619  //compute in1^dag:
620 
621  I0.y = -I0.y;
622  I1.y = -I1.y;
623  I2.y = -I2.y;
624  I3.y = -I3.y;
625  I4.y = -I4.y;
626  I5.y = -I5.y;
627  I6.y = -I6.y;
628  I7.y = -I7.y;
629  I8.y = -I8.y;
630  I9.y = -I9.y;
631  I10.y = -I10.y;
632  I11.y = -I11.y;
633 
634  //do products for all color component here:
635  //00 component:
636  tmp_re = I0.x * J0.x - I0.y * J0.y;
637  tmp_re += I1.x * J1.x - I1.y * J1.y;
638  tmp_re += I2.x * J2.x - I2.y * J2.y;
639  accum_re[0*blockDim.x] = tmp_re;
640 
641  tmp_im = I0.x * J0.y + I0.y * J0.x;
642  tmp_im += I1.x * J1.y + I1.y * J1.x;
643  tmp_im += I2.x * J2.y + I2.y * J2.x;
644  accum_im[0*blockDim.x] = tmp_im;
645 
646  //01 component:
647  tmp_re = I0.x * J3.x - I0.y * J3.y;
648  tmp_re += I1.x * J4.x - I1.y * J4.y;
649  tmp_re += I2.x * J5.x - I2.y * J5.y;
650  accum_re[1*blockDim.x] = tmp_re;
651 
652  tmp_im = I0.x * J3.y + I0.y * J3.x;
653  tmp_im += I1.x * J4.y + I1.y * J4.x;
654  tmp_im += I2.x * J5.y + I2.y * J5.x;
655  accum_im[1*blockDim.x] = tmp_im;
656 
657  //02 component:
658  tmp_re = I0.x * J6.x - I0.y * J6.y;
659  tmp_re += I1.x * J7.x - I1.y * J7.y;
660  tmp_re += I2.x * J8.x - I2.y * J8.y;
661  accum_re[2*blockDim.x] = tmp_re;
662 
663  tmp_im = I0.x * J6.y + I0.y * J6.x;
664  tmp_im += I1.x * J7.y + I1.y * J7.x;
665  tmp_im += I2.x * J8.y + I2.y * J8.x;
666  accum_im[2*blockDim.x] = tmp_im;
667 
668  //03 component:
669  tmp_re = I0.x * J9.x - I0.y * J9.y;
670  tmp_re += I1.x * J10.x - I1.y * J10.y;
671  tmp_re += I2.x * J11.x - I2.y * J11.y;
672  accum_re[3*blockDim.x] = tmp_re;
673 
674  tmp_im = I0.x * J9.y + I0.y * J9.x;
675  tmp_im += I1.x * J10.y + I1.y * J10.x;
676  tmp_im += I2.x * J11.y + I2.y * J11.x;
677  accum_im[3*blockDim.x] = tmp_im;
678 
679  //10 component:
680  tmp_re = I3.x * J0.x - I3.y * J0.y;
681  tmp_re += I4.x * J1.x - I4.y * J1.y;
682  tmp_re += I5.x * J2.x - I5.y * J2.y;
683  accum_re[ 4*blockDim.x] = tmp_re;
684 
685  tmp_im = I3.x * J0.y + I3.y * J0.x;
686  tmp_im += I4.x * J1.y + I4.y * J1.x;
687  tmp_im += I5.x * J2.y + I5.y * J2.x;
688  accum_im[ 4*blockDim.x] = tmp_im;
689 
690  //11 component:
691  tmp_re = I3.x * J3.x - I3.y * J3.y;
692  tmp_re += I4.x * J4.x - I4.y * J4.y;
693  tmp_re += I5.x * J5.x - I5.y * J5.y;
694  accum_re[ 5*blockDim.x] = tmp_re;
695 
696  tmp_im = I3.x * J3.y + I3.y * J3.x;
697  tmp_im += I4.x * J4.y + I4.y * J4.x;
698  tmp_im += I5.x * J5.y + I5.y * J5.x;
699  accum_im[ 5*blockDim.x] = tmp_im;
700 
701  //12 component:
702  tmp_re = I3.x * J6.x - I3.y * J6.y;
703  tmp_re += I4.x * J7.x - I4.y * J7.y;
704  tmp_re += I5.x * J8.x - I5.y * J8.y;
705  accum_re[ 6*blockDim.x] = tmp_re;
706 
707  tmp_im = I3.x * J6.y + I3.y * J6.x;
708  tmp_im += I4.x * J7.y + I4.y * J7.x;
709  tmp_im += I5.x * J8.y + I5.y * J8.x;
710  accum_im[ 6*blockDim.x] = tmp_im;
711 
712  //13 component:
713  tmp_re = I3.x * J9.x - I3.y * J9.y;
714  tmp_re += I4.x * J10.x - I4.y * J10.y;
715  tmp_re += I5.x * J11.x - I5.y * J11.y;
716  accum_re[ 7*blockDim.x] = tmp_re;
717 
718  tmp_im = I3.x * J9.y + I3.y * J9.x;
719  tmp_im += I4.x * J10.y + I4.y * J10.x;
720  tmp_im += I5.x * J11.y + I5.y * J11.x;
721  accum_im[ 7*blockDim.x] = tmp_im;
722 
723  //20 component:
724  tmp_re = I6.x * J0.x - I6.y * J0.y;
725  tmp_re += I7.x * J1.x - I7.y * J1.y;
726  tmp_re += I8.x * J2.x - I8.y * J2.y;
727  accum_re[ 8*blockDim.x] = tmp_re;
728 
729  tmp_im = I6.x * J0.y + I6.y * J0.x;
730  tmp_im += I7.x * J1.y + I7.y * J1.x;
731  tmp_im += I8.x * J2.y + I8.y * J2.x;
732  accum_im[ 8*blockDim.x] = tmp_im;
733 
734  //21 component:
735  tmp_re = I6.x * J3.x - I6.y * J3.y;
736  tmp_re += I7.x * J4.x - I7.y * J4.y;
737  tmp_re += I8.x * J5.x - I8.y * J5.y;
738  accum_re[ 9*blockDim.x] = tmp_re;
739 
740  tmp_im = I6.x * J3.y + I6.y * J3.x;
741  tmp_im += I7.x * J4.y + I7.y * J4.x;
742  tmp_im += I8.x * J5.y + I8.y * J5.x;
743  accum_im[ 9*blockDim.x] = tmp_im;
744 
745  //22 component:
746  tmp_re = I6.x * J6.x - I6.y * J6.y;
747  tmp_re += I7.x * J7.x - I7.y * J7.y;
748  tmp_re += I8.x * J8.x - I8.y * J8.y;
749  accum_re[10*blockDim.x] = tmp_re;
750 
751  tmp_im = I6.x * J6.y + I6.y * J6.x;
752  tmp_im += I7.x * J7.y + I7.y * J7.x;
753  tmp_im += I8.x * J8.y + I8.y * J8.x;
754  accum_im[10*blockDim.x] = tmp_im;
755 
756  //23 component:
757  tmp_re = I6.x * J9.x - I6.y * J9.y;
758  tmp_re += I7.x * J10.x - I7.y * J10.y;
759  tmp_re += I8.x * J11.x - I8.y * J11.y;
760  accum_re[11*blockDim.x] = tmp_re;
761 
762  tmp_im = I6.x * J9.y + I6.y * J9.x;
763  tmp_im += I7.x * J10.y + I7.y * J10.x;
764  tmp_im += I8.x * J11.y + I8.y * J11.x;
765  accum_im[11*blockDim.x] = tmp_im;
766 
767  //30 component:
768  tmp_re = I9.x * J0.x - I9.y * J0.y;
769  tmp_re += I10.x * J1.x - I10.y * J1.y;
770  tmp_re += I11.x * J2.x - I11.y * J2.y;
771  accum_re[12*blockDim.x] = tmp_re;
772 
773  tmp_im = I9.x * J0.y + I9.y * J0.x;
774  tmp_im += I10.x * J1.y + I10.y * J1.x;
775  tmp_im += I11.x * J2.y + I11.y * J2.x;
776  accum_im[12*blockDim.x] = tmp_im;
777 
778  //31 component:
779  tmp_re = I9.x * J3.x - I9.y * J3.y;
780  tmp_re += I10.x * J4.x - I10.y * J4.y;
781  tmp_re += I11.x * J5.x - I11.y * J5.y;
782  accum_re[13*blockDim.x] = tmp_re;
783 
784  tmp_im = I9.x * J3.y + I9.y * J3.x;
785  tmp_im += I10.x * J4.y + I10.y * J4.x;
786  tmp_im += I11.x * J5.y + I11.y * J5.x;
787  accum_im[13*blockDim.x] = tmp_im;
788 
789  //32 component:
790  tmp_re = I9.x * J6.x - I9.y * J6.y;
791  tmp_re += I10.x * J7.x - I10.y * J7.y;
792  tmp_re += I11.x * J8.x - I11.y * J8.y;
793  accum_re[14*blockDim.x] = tmp_re;
794 
795  tmp_im = I9.x * J6.y + I9.y * J6.x;
796  tmp_im += I10.x * J7.y + I10.y * J7.x;
797  tmp_im += I11.x * J8.y + I11.y * J8.x;
798  accum_im[14*blockDim.x] = tmp_im;
799 
800  //33 component:
801  tmp_re = I9.x * J9.x - I9.y * J9.y;
802  tmp_re += I10.x * J10.x - I10.y * J10.y;
803  tmp_re += I11.x * J11.x - I11.y * J11.y;
804  accum_re[15*blockDim.x] = tmp_re;
805 
806  tmp_im = I9.x * J9.y + I9.y * J9.x;
807  tmp_im += I10.x * J10.y + I10.y * J10.x;
808  tmp_im += I11.x * J11.y + I11.y * J11.x;
809  accum_im[15*blockDim.x] = tmp_im;
810 
811 /* CONTRACTION FULL VOLUME */
812 
813  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
814  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
815  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
816  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
817  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
818  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
819  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
820  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
821  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
822  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
823  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
824  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
825  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
826  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
827  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
828  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
829 
830  return;
831 }
832 
833 #undef SPINORTEX
834 #undef INTERTEX
835 
836 #undef READ_SPINOR
837 #undef READ_INTERMEDIATE_SPINOR
838 
839 #undef SPINOR_HOP
840 
841 #endif // (__CUDA_ARCH__ >= 130)
842 
843 
844 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
845  float4 I0 = spinor[sp_idx + 0*(stride)]; \
846  float4 I1 = spinor[sp_idx + 1*(stride)]; \
847  float4 I2 = spinor[sp_idx + 2*(stride)]; \
848  float4 I3 = spinor[sp_idx + 3*(stride)]; \
849  float4 I4 = spinor[sp_idx + 4*(stride)]; \
850  float4 I5 = spinor[sp_idx + 5*(stride)];
851 
852 
853 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
854  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
855  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
856  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
857  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
858  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
859  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
860 
861 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
862  float4 J0 = spinor[sp_idx + 0*(stride)]; \
863  float4 J1 = spinor[sp_idx + 1*(stride)]; \
864  float4 J2 = spinor[sp_idx + 2*(stride)]; \
865  float4 J3 = spinor[sp_idx + 3*(stride)]; \
866  float4 J4 = spinor[sp_idx + 4*(stride)]; \
867  float4 J5 = spinor[sp_idx + 5*(stride)];
868 
869 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
870  float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
871  float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
872  float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
873  float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
874  float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
875  float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
876 
877 
878 #ifdef DIRECT_ACCESS_WILSON_SPINOR
879  #define READ_SPINOR READ_SPINOR_SINGLE
880  #define SPINORTEX in
881 #else
882  #define READ_SPINOR READ_SPINOR_SINGLE_TEX
883 
884  #ifdef USE_TEXTURE_OBJECTS
885  #define SPINORTEX param.inTex
886  #else
887  #define SPINORTEX spinorTexSingle
888  #endif // USE_TEXTURE_OBJECTS
889 #endif
890 
891 #ifdef DIRECT_ACCESS_WILSON_INTER
892  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
893  #define INTERTEX in2
894 #else
895  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
896 
897  #ifdef USE_TEXTURE_OBJECTS
898  #define INTERTEX param.outTex
899  #else
900  #define INTERTEX interTexSingle
901  #endif // USE_TEXTURE_OBJECTS
902 #endif
903 
904 #define SPINOR_HOP 6
905 
906 __global__ void contractGamma5Kernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
907 {
908  int sid = blockIdx.x*blockDim.x + threadIdx.x;
909  int outId = sid;
910  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
911 
912  if (sid >= param.threads)
913  return;
914 
915  volatile float2 tmp;
916  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
917 
918  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
919  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
920 
921  eutId = 2*sid;
922  auxCoord1 = eutId / X1;
923  xCoord1 = eutId - auxCoord1 * X1;
924  auxCoord2 = auxCoord1 / X2;
925  xCoord2 = auxCoord1 - auxCoord2 * X2;
926  xCoord4 = auxCoord2 / X3;
927  xCoord3 = auxCoord2 - xCoord4 * X3;
928 
929  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
930  xCoord1 += auxCoord1;
931  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
932 
933  //Load the full input spinors:
934 
935  READ_SPINOR (SPINORTEX, myStride, sid, sid);
936  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
937 
938  //compute in1^dag * gamma5:
939 
940  //First color component
941 
942  tmp_re = +I0.x;
943  tmp_im = -I0.y;
944  I0.x = +I3.x;
945  I0.y = -I3.y;
946  I3.x = tmp_re;
947  I3.y = tmp_im;
948 
949  tmp_re = +I1.z;
950  tmp_im = -I1.w;
951  I1.z = +I4.z;
952  I1.w = -I4.w;
953  I4.z = tmp_re;
954  I4.w = tmp_im;
955 
956  //Second color component
957 
958  tmp_re = +I0.z;
959  tmp_im = -I0.w;
960  I0.z = +I3.z;
961  I0.w = -I3.w;
962  I3.z = tmp_re;
963  I3.w = tmp_im;
964 
965  tmp_re = +I2.x;
966  tmp_im = -I2.y;
967  I2.x = +I5.x;
968  I2.y = -I5.y;
969  I5.x = tmp_re;
970  I5.y = tmp_im;
971 
972  //Third color component
973 
974  tmp_re = +I1.x;
975  tmp_im = -I1.y;
976  I1.x = +I4.x;
977  I1.y = -I4.y;
978  I4.x = tmp_re;
979  I4.y = tmp_im;
980 
981  tmp_re = +I2.z;
982  tmp_im = -I2.w;
983  I2.z = +I5.z;
984  I2.w = -I5.w;
985  I5.z = tmp_re;
986  I5.w = tmp_im;
987 
988  //do products for first color component here:
989 
990  //00 component:
991  tmp_re = I0.x * J0.x - I0.y * J0.y;
992  tmp_re += I0.z * J0.z - I0.w * J0.w;
993  tmp_re += I1.x * J1.x - I1.y * J1.y;
994  accum_re[0*blockDim.x] = tmp_re;
995 
996  tmp_im = I0.x * J0.y + I0.y * J0.x;
997  tmp_im += I0.z * J0.w + I0.w * J0.z;
998  tmp_im += I1.x * J1.y + I1.y * J1.x;
999  accum_im[0*blockDim.x] = tmp_im;
1000 
1001  //01 component:
1002  tmp_re = I0.x * J1.z - I0.y * J1.w;
1003  tmp_re += I0.z * J2.x - I0.w * J2.y;
1004  tmp_re += I1.x * J2.z - I1.y * J2.w;
1005  accum_re[1*blockDim.x] = tmp_re;
1006 
1007  tmp_im = I0.x * J1.w + I0.y * J1.z;
1008  tmp_im += I0.z * J2.y + I0.w * J2.x;
1009  tmp_im += I1.x * J2.w + I1.y * J2.z;
1010  accum_im[1*blockDim.x] = tmp_im;
1011 
1012  //02 component:
1013  tmp_re = I0.x * J3.x - I0.y * J3.y;
1014  tmp_re += I0.z * J3.z - I0.w * J3.w;
1015  tmp_re += I1.x * J4.x - I1.y * J4.y;
1016  accum_re[2*blockDim.x] = tmp_re;
1017 
1018  tmp_im = I0.x * J3.y + I0.y * J3.x;
1019  tmp_im += I0.z * J3.w + I0.w * J3.z;
1020  tmp_im += I1.x * J4.y + I1.y * J4.x;
1021  accum_im[2*blockDim.x] = tmp_im;
1022 
1023  //03 component:
1024  tmp_re = I0.x * J4.z - I0.y * J4.w;
1025  tmp_re += I0.z * J5.x - I0.w * J5.x;
1026  tmp_re += I1.x * J5.z - I1.y * J5.w;
1027 
1028  accum_re[3*blockDim.x] = tmp_re;
1029 
1030  tmp_im = I0.x * J4.w + I0.y * J4.z;
1031  tmp_im += I0.z * J5.y + I0.w * J5.y;
1032  tmp_im += I1.x * J5.w + I1.y * J5.z;
1033  accum_im[3*blockDim.x] = tmp_im;
1034 
1035  //10 component:
1036  tmp_re = I1.z * J0.x - I1.w * J0.y;
1037  tmp_re += I2.x * J0.z - I2.y * J0.w;
1038  tmp_re += I2.z * J1.x - I2.w * J1.y;
1039  accum_re[4*blockDim.x] = tmp_re;
1040 
1041  tmp_im = I1.z * J0.y + I1.w * J0.x;
1042  tmp_im += I2.x * J0.w + I2.y * J0.z;
1043  tmp_im += I2.z * J1.y + I2.w * J1.x;
1044  accum_im[4*blockDim.x] = tmp_im;
1045 
1046  //11 component:
1047  tmp_re = I1.z * J1.z - I1.w * J1.w;
1048  tmp_re += I2.x * J2.x - I2.y * J2.y;
1049  tmp_re += I2.z * J2.z - I2.w * J2.w;
1050  accum_re[5*blockDim.x] = tmp_re;
1051 
1052  tmp_im = I1.z * J1.w + I1.w * J1.z;
1053  tmp_im += I2.x * J2.y + I2.y * J2.x;
1054  tmp_im += I2.z * J2.w + I2.w * J2.z;
1055  accum_im[5*blockDim.x] = tmp_im;
1056 
1057  //12 component:
1058  tmp_re = I1.z * J3.x - I1.w * J3.y;
1059  tmp_re += I2.x * J3.z - I2.y * J3.w;
1060  tmp_re += I2.z * J4.x - I2.w * J4.y;
1061  accum_re[6*blockDim.x] = tmp_re;
1062 
1063  tmp_im = I1.z * J3.y + I1.w * J3.x;
1064  tmp_im += I2.x * J3.w + I2.y * J3.z;
1065  tmp_im += I2.z * J4.y + I2.w * J4.x;
1066  accum_im[6*blockDim.x] = tmp_im;
1067 
1068  //13 component:
1069  tmp_re = I1.z * J4.z - I1.w * J4.w;
1070  tmp_re += I2.x * J5.x - I2.y * J5.y;
1071  tmp_re += I2.z * J5.z - I2.w * J5.w;
1072  accum_re[7*blockDim.x] = tmp_re;
1073 
1074  tmp_im = I1.z * J4.w + I1.w * J4.z;
1075  tmp_im += I2.x * J5.y + I2.y * J5.x;
1076  tmp_im += I2.z * J5.w + I2.w * J5.z;
1077  accum_im[7*blockDim.x] = tmp_im;
1078 
1079  //20 component:
1080  tmp_re = I3.x * J0.x - I3.y * J0.y;
1081  tmp_re += I3.z * J0.z - I3.w * J0.w;
1082  tmp_re += I4.x * J1.x - I4.y * J1.y;
1083  accum_re[8*blockDim.x] = tmp_re;
1084 
1085  tmp_im = I3.x * J0.y + I3.y * J0.x;
1086  tmp_im += I3.z * J0.w + I3.w * J0.z;
1087  tmp_im += I4.x * J1.y + I4.y * J1.x;
1088  accum_im[8*blockDim.x] = tmp_im;
1089 
1090  //21 component:
1091  tmp_re = I3.x * J1.z - I3.y * J1.w;
1092  tmp_re += I3.z * J2.x - I3.w * J2.y;
1093  tmp_re += I4.x * J2.z - I4.y * J2.w;
1094  accum_re[9*blockDim.x] = tmp_re;
1095 
1096  tmp_im = I3.x * J1.w + I3.y * J1.z;
1097  tmp_im += I3.z * J2.y + I3.w * J2.x;
1098  tmp_im += I4.x * J2.w + I4.y * J2.z;
1099  accum_im[9*blockDim.x] = tmp_im;
1100 
1101  //22 component:
1102  tmp_re = I3.x * J3.x - I3.y * J3.y;
1103  tmp_re += I3.z * J3.z - I3.w * J3.w;
1104  tmp_re += I4.x * J4.x - I4.y * J4.y;
1105  accum_re[10*blockDim.x] = tmp_re;
1106 
1107  tmp_im = I3.x * J3.y + I3.y * J3.x;
1108  tmp_im += I3.z * J3.w + I3.w * J3.z;
1109  tmp_im += I4.x * J4.y + I4.y * J4.x;
1110  accum_im[10*blockDim.x] = tmp_im;
1111 
1112  //23 component:
1113  tmp_re = I3.x * J4.z - I3.y * J4.w;
1114  tmp_re += I3.z * J5.x - I3.w * J5.y;
1115  tmp_re += I4.x * J5.z - I4.y * J5.w;
1116  accum_re[11*blockDim.x] = tmp_re;
1117 
1118  tmp_im = I3.x * J4.w + I3.y * J4.z;
1119  tmp_im += I3.z * J5.y + I3.w * J5.x;
1120  tmp_im += I4.x * J5.w + I4.y * J5.z;
1121  accum_im[11*blockDim.x] = tmp_im;
1122 
1123  //30 component:
1124  tmp_re = I4.z * J0.x - I4.w * J0.y;
1125  tmp_re += I5.x * J0.z - I5.y * J0.w;
1126  tmp_re += I5.z * J1.x - I5.w * J1.y;
1127  accum_re[12*blockDim.x] = tmp_re;
1128 
1129  tmp_im = I4.z * J0.y + I4.w * J0.x;
1130  tmp_im += I5.x * J0.w + I5.y * J0.z;
1131  tmp_im += I5.z * J1.y + I5.w * J1.x;
1132  accum_im[12*blockDim.x] = tmp_im;
1133 
1134  //31 component:
1135  tmp_re = I4.z * J1.z - I4.w * J1.w;
1136  tmp_re += I5.x * J2.x - I5.y * J2.y;
1137  tmp_re += I5.z * J2.z - I5.w * J2.w;
1138  accum_re[13*blockDim.x] = tmp_re;
1139 
1140  tmp_im = I4.z * J1.w + I4.w * J1.z;
1141  tmp_im += I5.x * J2.y + I5.y * J2.x;
1142  tmp_im += I5.z * J2.w + I5.w * J2.z;
1143  accum_im[13*blockDim.x] = tmp_im;
1144 
1145  //32 component:
1146  tmp_re = I4.z * J3.x - I4.w * J3.y;
1147  tmp_re += I5.x * J3.z - I5.y * J3.w;
1148  tmp_re += I5.z * J4.x - I5.w * J4.y;
1149  accum_re[14*blockDim.x] = tmp_re;
1150 
1151  tmp_im = I4.z * J3.y + I4.w * J3.x;
1152  tmp_im += I5.x * J3.w + I5.y * J3.z;
1153  tmp_im += I5.z * J4.y + I5.w * J4.x;
1154  accum_im[14*blockDim.x] = tmp_im;
1155 
1156  //33 component:
1157  tmp_re = I4.z * J4.z - I4.w * J4.w;
1158  tmp_re += I5.x * J5.x - I5.y * J5.y;
1159  tmp_re += I5.z * J5.z - I5.w * J5.w;
1160  accum_re[15*blockDim.x] = tmp_re;
1161 
1162  tmp_im = I4.z * J4.w + I4.w * J4.z;
1163  tmp_im += I5.x * J5.y + I5.y * J5.x;
1164  tmp_im += I5.z * J5.w + I5.w * J5.z;
1165  accum_im[15*blockDim.x] = tmp_im;
1166 
1167 
1168 
1169  //Store output back to global buffer:
1170 
1171 
1172  /* CONTRACTION FULL VOLUME */
1173 
1174  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1175  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1176  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1177  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1178  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1179  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1180  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1181  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1182  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1183  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1184  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1185  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1186  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1187  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1188  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1189  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1190 
1191  return;
1192 }
1193 
1194 //Perform trace in color space only and for a given tslice
1195 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
1196 __global__ void contractTsliceKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
1197 {
1198  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
1199  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
1200  int inId = sid + Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
1201  int outId;
1202  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1203 
1204  if (sid >= param.threads) //param.threads == tslice volume
1205  return;
1206 
1207  volatile float2 tmp;
1208  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1209 
1210  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1211  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1212 
1213 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
1214 
1215  eutId = 2*inId;
1216  auxCoord1 = eutId / X1;
1217  xCoord1 = eutId - auxCoord1 * X1;
1218  auxCoord2 = auxCoord1 / X2;
1219  xCoord2 = auxCoord1 - auxCoord2 * X2;
1220  xCoord4 = auxCoord2 / X3;
1221 
1222 // if (Tslice != xCoord4)
1223 // return;
1224 
1225  xCoord3 = auxCoord2 - xCoord4 * X3;
1226 
1227  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1228  xCoord1 += auxCoord1;
1229  outId = xCoord1 + X1*(xCoord2 + X2*xCoord3); //AQUI
1230 
1231  //Load the full input spinors:
1232 
1233  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1234  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1235 
1236  //compute in1^dag:
1237 
1238  I0.y = -I0.y;
1239  I0.w = -I0.w;
1240  I1.y = -I1.y;
1241  I1.w = -I1.w;
1242  I2.y = -I2.y;
1243  I2.w = -I2.w;
1244  I3.y = -I3.y;
1245  I3.w = -I3.w;
1246  I4.y = -I4.y;
1247  I4.w = -I4.w;
1248  I5.y = -I5.y;
1249  I5.w = -I5.w;
1250 
1251  //do products for first color component here:
1252  //00 component:
1253  tmp_re = I0.x * J0.x - I0.y * J0.y;
1254  tmp_re += I0.z * J0.z - I0.w * J0.w;
1255  tmp_re += I1.x * J1.x - I1.y * J1.y;
1256  accum_re[0*blockDim.x] = tmp_re;
1257 
1258  tmp_im = I0.x * J0.y + I0.y * J0.x;
1259  tmp_im += I0.z * J0.w + I0.w * J0.z;
1260  tmp_im += I1.x * J1.y + I1.y * J1.x;
1261  accum_im[0*blockDim.x] = tmp_im;
1262 
1263  //01 component:
1264  tmp_re = I0.x * J1.z - I0.y * J1.w;
1265  tmp_re += I0.z * J2.x - I0.w * J2.y;
1266  tmp_re += I1.x * J2.z - I1.y * J2.w;
1267  accum_re[1*blockDim.x] = tmp_re;
1268 
1269  tmp_im = I0.x * J1.w + I0.y * J1.z;
1270  tmp_im += I0.z * J2.y + I0.w * J2.x;
1271  tmp_im += I1.x * J2.w + I1.y * J2.z;
1272  accum_im[1*blockDim.x] = tmp_im;
1273 
1274  //02 component:
1275  tmp_re = I0.x * J3.x - I0.y * J3.y;
1276  tmp_re += I0.z * J3.z - I0.w * J3.w;
1277  tmp_re += I1.x * J4.x - I1.y * J4.y;
1278  accum_re[2*blockDim.x] = tmp_re;
1279 
1280  tmp_im = I0.x * J3.y + I0.y * J3.x;
1281  tmp_im += I0.z * J3.w + I0.w * J3.z;
1282  tmp_im += I1.x * J4.y + I1.y * J4.x;
1283  accum_im[2*blockDim.x] = tmp_im;
1284 
1285  //03 component:
1286  tmp_re = I0.x * J4.z - I0.y * J4.w;
1287  tmp_re += I0.z * J5.x - I0.w * J5.x;
1288  tmp_re += I1.x * J5.z - I1.y * J5.w;
1289 
1290  accum_re[3*blockDim.x] = tmp_re;
1291 
1292  tmp_im = I0.x * J4.w + I0.y * J4.z;
1293  tmp_im += I0.z * J5.y + I0.w * J5.y;
1294  tmp_im += I1.x * J5.w + I1.y * J5.z;
1295  accum_im[3*blockDim.x] = tmp_im;
1296 
1297  //10 component:
1298  tmp_re = I1.z * J0.x - I1.w * J0.y;
1299  tmp_re += I2.x * J0.z - I2.y * J0.w;
1300  tmp_re += I2.z * J1.x - I2.w * J1.y;
1301  accum_re[4*blockDim.x] = tmp_re;
1302 
1303  tmp_im = I1.z * J0.y + I1.w * J0.x;
1304  tmp_im += I2.x * J0.w + I2.y * J0.z;
1305  tmp_im += I2.z * J1.y + I2.w * J1.x;
1306  accum_im[4*blockDim.x] = tmp_im;
1307 
1308  //11 component:
1309  tmp_re = I1.z * J1.z - I1.w * J1.w;
1310  tmp_re += I2.x * J2.x - I2.y * J2.y;
1311  tmp_re += I2.z * J2.z - I2.w * J2.w;
1312  accum_re[5*blockDim.x] = tmp_re;
1313 
1314  tmp_im = I1.z * J1.w + I1.w * J1.z;
1315  tmp_im += I2.x * J2.y + I2.y * J2.x;
1316  tmp_im += I2.z * J2.w + I2.w * J2.z;
1317  accum_im[5*blockDim.x] = tmp_im;
1318 
1319  //12 component:
1320  tmp_re = I1.z * J3.x - I1.w * J3.y;
1321  tmp_re += I2.x * J3.z - I2.y * J3.w;
1322  tmp_re += I2.z * J4.x - I2.w * J4.y;
1323  accum_re[6*blockDim.x] = tmp_re;
1324 
1325  tmp_im = I1.z * J3.y + I1.w * J3.x;
1326  tmp_im += I2.x * J3.w + I2.y * J3.z;
1327  tmp_im += I2.z * J4.y + I2.w * J4.x;
1328  accum_im[6*blockDim.x] = tmp_im;
1329 
1330  //13 component:
1331  tmp_re = I1.z * J4.z - I1.w * J4.w;
1332  tmp_re += I2.x * J5.x - I2.y * J5.y;
1333  tmp_re += I2.z * J5.z - I2.w * J5.w;
1334  accum_re[7*blockDim.x] = tmp_re;
1335 
1336  tmp_im = I1.z * J4.w + I1.w * J4.z;
1337  tmp_im += I2.x * J5.y + I2.y * J5.x;
1338  tmp_im += I2.z * J5.w + I2.w * J5.z;
1339  accum_im[7*blockDim.x] = tmp_im;
1340 
1341  //20 component:
1342  tmp_re = I3.x * J0.x - I3.y * J0.y;
1343  tmp_re += I3.z * J0.z - I3.w * J0.w;
1344  tmp_re += I4.x * J1.x - I4.y * J1.y;
1345  accum_re[8*blockDim.x] = tmp_re;
1346 
1347  tmp_im = I3.x * J0.y + I3.y * J0.x;
1348  tmp_im += I3.z * J0.w + I3.w * J0.z;
1349  tmp_im += I4.x * J1.y + I4.y * J1.x;
1350  accum_im[8*blockDim.x] = tmp_im;
1351 
1352  //21 component:
1353  tmp_re = I3.x * J1.z - I3.y * J1.w;
1354  tmp_re += I3.z * J2.x - I3.w * J2.y;
1355  tmp_re += I4.x * J2.z - I4.y * J2.w;
1356  accum_re[9*blockDim.x] = tmp_re;
1357 
1358  tmp_im = I3.x * J1.w + I3.y * J1.z;
1359  tmp_im += I3.z * J2.y + I3.w * J2.x;
1360  tmp_im += I4.x * J2.w + I4.y * J2.z;
1361  accum_im[9*blockDim.x] = tmp_im;
1362 
1363  //22 component:
1364  tmp_re = I3.x * J3.x - I3.y * J3.y;
1365  tmp_re += I3.z * J3.z - I3.w * J3.w;
1366  tmp_re += I4.x * J4.x - I4.y * J4.y;
1367  accum_re[10*blockDim.x] = tmp_re;
1368 
1369  tmp_im = I3.x * J3.y + I3.y * J3.x;
1370  tmp_im += I3.z * J3.w + I3.w * J3.z;
1371  tmp_im += I4.x * J4.y + I4.y * J4.x;
1372  accum_im[10*blockDim.x] = tmp_im;
1373 
1374  //23 component:
1375  tmp_re = I3.x * J4.z - I3.y * J4.w;
1376  tmp_re += I3.z * J5.x - I3.w * J5.y;
1377  tmp_re += I4.x * J5.z - I4.y * J5.w;
1378  accum_re[11*blockDim.x] = tmp_re;
1379 
1380  tmp_im = I3.x * J4.w + I3.y * J4.z;
1381  tmp_im += I3.z * J5.y + I3.w * J5.x;
1382  tmp_im += I4.x * J5.w + I4.y * J5.z;
1383  accum_im[11*blockDim.x] = tmp_im;
1384 
1385  //30 component:
1386  tmp_re = I4.z * J0.x - I4.w * J0.y;
1387  tmp_re += I5.x * J0.z - I5.y * J0.w;
1388  tmp_re += I5.z * J1.x - I5.w * J1.y;
1389  accum_re[12*blockDim.x] = tmp_re;
1390 
1391  tmp_im = I4.z * J0.y + I4.w * J0.x;
1392  tmp_im += I5.x * J0.w + I5.y * J0.z;
1393  tmp_im += I5.z * J1.y + I5.w * J1.x;
1394  accum_im[12*blockDim.x] = tmp_im;
1395 
1396  //31 component:
1397  tmp_re = I4.z * J1.z - I4.w * J1.w;
1398  tmp_re += I5.x * J2.x - I5.y * J2.y;
1399  tmp_re += I5.z * J2.z - I5.w * J2.w;
1400  accum_re[13*blockDim.x] = tmp_re;
1401 
1402  tmp_im = I4.z * J1.w + I4.w * J1.z;
1403  tmp_im += I5.x * J2.y + I5.y * J2.x;
1404  tmp_im += I5.z * J2.w + I5.w * J2.z;
1405  accum_im[13*blockDim.x] = tmp_im;
1406 
1407  //32 component:
1408  tmp_re = I4.z * J3.x - I4.w * J3.y;
1409  tmp_re += I5.x * J3.z - I5.y * J3.w;
1410  tmp_re += I5.z * J4.x - I5.w * J4.y;
1411  accum_re[14*blockDim.x] = tmp_re;
1412 
1413  tmp_im = I4.z * J3.y + I4.w * J3.x;
1414  tmp_im += I5.x * J3.w + I5.y * J3.z;
1415  tmp_im += I5.z * J4.y + I5.w * J4.x;
1416  accum_im[14*blockDim.x] = tmp_im;
1417 
1418  //33 component:
1419  tmp_re = I4.z * J4.z - I4.w * J4.w;
1420  tmp_re += I5.x * J5.x - I5.y * J5.y;
1421  tmp_re += I5.z * J5.z - I5.w * J5.w;
1422  accum_re[15*blockDim.x] = tmp_re;
1423 
1424  tmp_im = I4.z * J4.w + I4.w * J4.z;
1425  tmp_im += I5.x * J5.y + I5.y * J5.x;
1426  tmp_im += I5.z * J5.w + I5.w * J5.z;
1427  accum_im[15*blockDim.x] = tmp_im;
1428 
1429  //Store output back to global buffer:
1430 
1431 
1432  /* CONTRACTION FULL VOLUME */
1433 
1434  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1435  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1436  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1437  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1438  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1439  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1440  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1441  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1442  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1443  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1444  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1445  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1446  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1447  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1448  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1449  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1450 
1451  return;
1452 }
1453 
1454 __global__ void contractKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
1455 {
1456  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1457  int outId = sid;
1458  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1459 
1460  if (sid >= param.threads)
1461  return;
1462 
1463  volatile float2 tmp;
1464  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1465 
1466  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1467  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1468 
1469  eutId = 2*sid;
1470  auxCoord1 = eutId / X1;
1471  xCoord1 = eutId - auxCoord1 * X1;
1472  auxCoord2 = auxCoord1 / X2;
1473  xCoord2 = auxCoord1 - auxCoord2 * X2;
1474  xCoord4 = auxCoord2 / X3;
1475  xCoord3 = auxCoord2 - xCoord4 * X3;
1476 
1477  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1478  xCoord1 += auxCoord1;
1479  outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4)); //AQUI
1480 
1481  //Load the full input spinors:
1482 
1483  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1484  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1485 
1486  //compute in1^dag:
1487 
1488  I0.y = -I0.y;
1489  I0.w = -I0.w;
1490  I1.y = -I1.y;
1491  I1.w = -I1.w;
1492  I2.y = -I2.y;
1493  I2.w = -I2.w;
1494  I3.y = -I3.y;
1495  I3.w = -I3.w;
1496  I4.y = -I4.y;
1497  I4.w = -I4.w;
1498  I5.y = -I5.y;
1499  I5.w = -I5.w;
1500 
1501  //do products for first color component here:
1502  //00 component:
1503  tmp_re = I0.x * J0.x - I0.y * J0.y;
1504  tmp_re += I0.z * J0.z - I0.w * J0.w;
1505  tmp_re += I1.x * J1.x - I1.y * J1.y;
1506  accum_re[0*blockDim.x] = tmp_re;
1507 
1508  tmp_im = I0.x * J0.y + I0.y * J0.x;
1509  tmp_im += I0.z * J0.w + I0.w * J0.z;
1510  tmp_im += I1.x * J1.y + I1.y * J1.x;
1511  accum_im[0*blockDim.x] = tmp_im;
1512 
1513  //01 component:
1514  tmp_re = I0.x * J1.z - I0.y * J1.w;
1515  tmp_re += I0.z * J2.x - I0.w * J2.y;
1516  tmp_re += I1.x * J2.z - I1.y * J2.w;
1517  accum_re[1*blockDim.x] = tmp_re;
1518 
1519  tmp_im = I0.x * J1.w + I0.y * J1.z;
1520  tmp_im += I0.z * J2.y + I0.w * J2.x;
1521  tmp_im += I1.x * J2.w + I1.y * J2.z;
1522  accum_im[1*blockDim.x] = tmp_im;
1523 
1524  //02 component:
1525  tmp_re = I0.x * J3.x - I0.y * J3.y;
1526  tmp_re += I0.z * J3.z - I0.w * J3.w;
1527  tmp_re += I1.x * J4.x - I1.y * J4.y;
1528  accum_re[2*blockDim.x] = tmp_re;
1529 
1530  tmp_im = I0.x * J3.y + I0.y * J3.x;
1531  tmp_im += I0.z * J3.w + I0.w * J3.z;
1532  tmp_im += I1.x * J4.y + I1.y * J4.x;
1533  accum_im[2*blockDim.x] = tmp_im;
1534 
1535  //03 component:
1536  tmp_re = I0.x * J4.z - I0.y * J4.w;
1537  tmp_re += I0.z * J5.x - I0.w * J5.x;
1538  tmp_re += I1.x * J5.z - I1.y * J5.w;
1539 
1540  accum_re[3*blockDim.x] = tmp_re;
1541 
1542  tmp_im = I0.x * J4.w + I0.y * J4.z;
1543  tmp_im += I0.z * J5.y + I0.w * J5.y;
1544  tmp_im += I1.x * J5.w + I1.y * J5.z;
1545  accum_im[3*blockDim.x] = tmp_im;
1546 
1547  //10 component:
1548  tmp_re = I1.z * J0.x - I1.w * J0.y;
1549  tmp_re += I2.x * J0.z - I2.y * J0.w;
1550  tmp_re += I2.z * J1.x - I2.w * J1.y;
1551  accum_re[4*blockDim.x] = tmp_re;
1552 
1553  tmp_im = I1.z * J0.y + I1.w * J0.x;
1554  tmp_im += I2.x * J0.w + I2.y * J0.z;
1555  tmp_im += I2.z * J1.y + I2.w * J1.x;
1556  accum_im[4*blockDim.x] = tmp_im;
1557 
1558  //11 component:
1559  tmp_re = I1.z * J1.z - I1.w * J1.w;
1560  tmp_re += I2.x * J2.x - I2.y * J2.y;
1561  tmp_re += I2.z * J2.z - I2.w * J2.w;
1562  accum_re[5*blockDim.x] = tmp_re;
1563 
1564  tmp_im = I1.z * J1.w + I1.w * J1.z;
1565  tmp_im += I2.x * J2.y + I2.y * J2.x;
1566  tmp_im += I2.z * J2.w + I2.w * J2.z;
1567  accum_im[5*blockDim.x] = tmp_im;
1568 
1569  //12 component:
1570  tmp_re = I1.z * J3.x - I1.w * J3.y;
1571  tmp_re += I2.x * J3.z - I2.y * J3.w;
1572  tmp_re += I2.z * J4.x - I2.w * J4.y;
1573  accum_re[6*blockDim.x] = tmp_re;
1574 
1575  tmp_im = I1.z * J3.y + I1.w * J3.x;
1576  tmp_im += I2.x * J3.w + I2.y * J3.z;
1577  tmp_im += I2.z * J4.y + I2.w * J4.x;
1578  accum_im[6*blockDim.x] = tmp_im;
1579 
1580  //13 component:
1581  tmp_re = I1.z * J4.z - I1.w * J4.w;
1582  tmp_re += I2.x * J5.x - I2.y * J5.y;
1583  tmp_re += I2.z * J5.z - I2.w * J5.w;
1584  accum_re[7*blockDim.x] = tmp_re;
1585 
1586  tmp_im = I1.z * J4.w + I1.w * J4.z;
1587  tmp_im += I2.x * J5.y + I2.y * J5.x;
1588  tmp_im += I2.z * J5.w + I2.w * J5.z;
1589  accum_im[7*blockDim.x] = tmp_im;
1590 
1591  //20 component:
1592  tmp_re = I3.x * J0.x - I3.y * J0.y;
1593  tmp_re += I3.z * J0.z - I3.w * J0.w;
1594  tmp_re += I4.x * J1.x - I4.y * J1.y;
1595  accum_re[8*blockDim.x] = tmp_re;
1596 
1597  tmp_im = I3.x * J0.y + I3.y * J0.x;
1598  tmp_im += I3.z * J0.w + I3.w * J0.z;
1599  tmp_im += I4.x * J1.y + I4.y * J1.x;
1600  accum_im[8*blockDim.x] = tmp_im;
1601 
1602  //21 component:
1603  tmp_re = I3.x * J1.z - I3.y * J1.w;
1604  tmp_re += I3.z * J2.x - I3.w * J2.y;
1605  tmp_re += I4.x * J2.z - I4.y * J2.w;
1606  accum_re[9*blockDim.x] = tmp_re;
1607 
1608  tmp_im = I3.x * J1.w + I3.y * J1.z;
1609  tmp_im += I3.z * J2.y + I3.w * J2.x;
1610  tmp_im += I4.x * J2.w + I4.y * J2.z;
1611  accum_im[9*blockDim.x] = tmp_im;
1612 
1613  //22 component:
1614  tmp_re = I3.x * J3.x - I3.y * J3.y;
1615  tmp_re += I3.z * J3.z - I3.w * J3.w;
1616  tmp_re += I4.x * J4.x - I4.y * J4.y;
1617  accum_re[10*blockDim.x] = tmp_re;
1618 
1619  tmp_im = I3.x * J3.y + I3.y * J3.x;
1620  tmp_im += I3.z * J3.w + I3.w * J3.z;
1621  tmp_im += I4.x * J4.y + I4.y * J4.x;
1622  accum_im[10*blockDim.x] = tmp_im;
1623 
1624  //23 component:
1625  tmp_re = I3.x * J4.z - I3.y * J4.w;
1626  tmp_re += I3.z * J5.x - I3.w * J5.y;
1627  tmp_re += I4.x * J5.z - I4.y * J5.w;
1628  accum_re[11*blockDim.x] = tmp_re;
1629 
1630  tmp_im = I3.x * J4.w + I3.y * J4.z;
1631  tmp_im += I3.z * J5.y + I3.w * J5.x;
1632  tmp_im += I4.x * J5.w + I4.y * J5.z;
1633  accum_im[11*blockDim.x] = tmp_im;
1634 
1635  //30 component:
1636  tmp_re = I4.z * J0.x - I4.w * J0.y;
1637  tmp_re += I5.x * J0.z - I5.y * J0.w;
1638  tmp_re += I5.z * J1.x - I5.w * J1.y;
1639  accum_re[12*blockDim.x] = tmp_re;
1640 
1641  tmp_im = I4.z * J0.y + I4.w * J0.x;
1642  tmp_im += I5.x * J0.w + I5.y * J0.z;
1643  tmp_im += I5.z * J1.y + I5.w * J1.x;
1644  accum_im[12*blockDim.x] = tmp_im;
1645 
1646  //31 component:
1647  tmp_re = I4.z * J1.z - I4.w * J1.w;
1648  tmp_re += I5.x * J2.x - I5.y * J2.y;
1649  tmp_re += I5.z * J2.z - I5.w * J2.w;
1650  accum_re[13*blockDim.x] = tmp_re;
1651 
1652  tmp_im = I4.z * J1.w + I4.w * J1.z;
1653  tmp_im += I5.x * J2.y + I5.y * J2.x;
1654  tmp_im += I5.z * J2.w + I5.w * J2.z;
1655  accum_im[13*blockDim.x] = tmp_im;
1656 
1657  //32 component:
1658  tmp_re = I4.z * J3.x - I4.w * J3.y;
1659  tmp_re += I5.x * J3.z - I5.y * J3.w;
1660  tmp_re += I5.z * J4.x - I5.w * J4.y;
1661  accum_re[14*blockDim.x] = tmp_re;
1662 
1663  tmp_im = I4.z * J3.y + I4.w * J3.x;
1664  tmp_im += I5.x * J3.w + I5.y * J3.z;
1665  tmp_im += I5.z * J4.y + I5.w * J4.x;
1666  accum_im[14*blockDim.x] = tmp_im;
1667 
1668  //33 component:
1669  tmp_re = I4.z * J4.z - I4.w * J4.w;
1670  tmp_re += I5.x * J5.x - I5.y * J5.y;
1671  tmp_re += I5.z * J5.z - I5.w * J5.w;
1672  accum_re[15*blockDim.x] = tmp_re;
1673 
1674  tmp_im = I4.z * J4.w + I4.w * J4.z;
1675  tmp_im += I5.x * J5.y + I5.y * J5.x;
1676  tmp_im += I5.z * J5.w + I5.w * J5.z;
1677  accum_im[15*blockDim.x] = tmp_im;
1678 
1679  //Store output back to global buffer:
1680 
1681  /* CONTRACTION FULL VOLUME */
1682 
1683  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1684  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1685  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1686  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1687  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1688  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1689  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1690  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1691  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1692  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1693  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1694  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1695  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1696  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1697  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1698  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1699 
1700  return;
1701 }
1702 
1703 #undef SPINORTEX
1704 #undef INTERTEX
1705 
1706 #undef tmp_re
1707 #undef tmp_im
1708 
1709 #undef READ_SPINOR
1710 #undef READ_INTERMEDIATE_SPINOR
1711 
1712 #undef SPINOR_HOP
1713 
1714 #endif //_TWIST_QUDA_CONTRACT
__constant__ int X2
__global__ void contractTsliceKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
__constant__ int Vsh
__constant__ int X1
#define TOTAL_COMPONENTS
Definition: contract_core.h:7
QudaGaugeParam param
Definition: pack_test.cpp:17
cudaColorSpinorField * tmp
#define SPINORTEX
__global__ void contractKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define READ_SPINOR
#define READ_INTERMEDIATE_SPINOR
cpuColorSpinorField * out
__constant__ int X3
#define tmp_re
Definition: contract_core.h:4
#define INTERTEX
__global__ void contractGamma5Kernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define tmp_im
Definition: contract_core.h:5