QUDA  0.9.0
contract_core.h
Go to the documentation of this file.
1 #ifdef READ_SPINOR
2 #undef READ_SPINOR
3 #endif
4 
5 #ifndef _TWIST_QUDA_CONTRACT
6 #define _TWIST_QUDA_CONTRACT
7 
8 #define tmp_re tmp.x
9 #define tmp_im tmp.y
10 
11 #define TOTAL_COMPONENTS 16
12 
13 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
14  double2 J0 = spinor[sp_idx + 0*(stride)]; \
15  double2 J1 = spinor[sp_idx + 1*(stride)]; \
16  double2 J2 = spinor[sp_idx + 2*(stride)]; \
17  double2 J3 = spinor[sp_idx + 3*(stride)]; \
18  double2 J4 = spinor[sp_idx + 4*(stride)]; \
19  double2 J5 = spinor[sp_idx + 5*(stride)]; \
20  double2 J6 = spinor[sp_idx + 6*(stride)]; \
21  double2 J7 = spinor[sp_idx + 7*(stride)]; \
22  double2 J8 = spinor[sp_idx + 8*(stride)]; \
23  double2 J9 = spinor[sp_idx + 9*(stride)]; \
24  double2 J10 = spinor[sp_idx +10*(stride)]; \
25  double2 J11 = spinor[sp_idx +11*(stride)];
26 
27 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
28  double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
29  double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
30  double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
31  double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
32  double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
33  double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
34  double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
35  double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
36  double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
37  double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
38  double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
39  double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
40 
41 #ifdef DIRECT_ACCESS_WILSON_SPINOR
42  #define READ_SPINOR READ_SPINOR_DOUBLE
43  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
44  #define SPINORTEX in
45  #define INTERTEX in2
46 #else
47  #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
48  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
49 
50  #ifdef USE_TEXTURE_OBJECTS
51  #define SPINORTEX param.inTex
52  #define INTERTEX param.outTex
53  #else
54  #define SPINORTEX spinorTexDouble
55  #define INTERTEX interTexDouble
56  #endif // USE_TEXTURE_OBJECTS
57 
58 #ifdef SPINOR_HOP
59 #undef SPINOR_HOP
60 #endif
61 #define SPINOR_HOP 12
62 
63 #endif
64 
65 __global__ void contractGamma5Kernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
66 {
67  int sid = blockIdx.x*blockDim.x + threadIdx.x;
68  int outId = sid;
69  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
70 
71  if (sid >= param.threads)
72  return;
73 
74  volatile double2 tmp;
75  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
76 
77  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
78  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
79 
80  eutId = 2*sid;
81  auxCoord1 = eutId / param.dc.X[0];;
82  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
83  auxCoord2 = auxCoord1 / param.dc.X[1];
84  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
85  xCoord4 = auxCoord2 / param.dc.X[2];
86  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
87 
88  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
89  xCoord1 += auxCoord1;
90  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
91 
92  READ_SPINOR (SPINORTEX, myStride, sid, sid);
94 
95  //compute in1^dag * gamma5:
96 
97  tmp_re = +I0.x;
98  tmp_im = -I0.y;
99  I0.x = +I6.x;
100  I0.y = -I6.y;
101  I6.x = tmp_re;
102  I6.y = tmp_im;
103 
104  tmp_re = +I3.x;
105  tmp_im = -I3.y;
106  I3.x = +I9.x;
107  I3.y = -I9.y;
108  I9.x = tmp_re;
109  I9.y = tmp_im;
110 
111  tmp_re = +I1.x;
112  tmp_im = -I1.y;
113  I1.x = +I7.x;
114  I1.y = -I7.y;
115  I7.x = tmp_re;
116  I7.y = tmp_im;
117 
118  tmp_re = +I4.x;
119  tmp_im = -I4.y;
120  I4.x = +I10.x;
121  I4.y = -I10.y;
122  I10.x = tmp_re;
123  I10.y = tmp_im;
124 
125  tmp_re = +I2.x;
126  tmp_im = -I2.y;
127  I2.x = +I8.x;
128  I2.y = -I8.y;
129  I8.x = tmp_re;
130  I8.y = tmp_im;
131 
132  tmp_re = +I5.x;
133  tmp_im = -I5.y;
134  I5.x = +I11.x;
135  I5.y = -I11.y;
136  I11.x = tmp_re;
137  I11.y = tmp_im;
138 
139  //do products for all color component here:
140  //00 component:
141  tmp_re = I0.x * J0.x - I0.y * J0.y;
142  tmp_re += I1.x * J1.x - I1.y * J1.y;
143  tmp_re += I2.x * J2.x - I2.y * J2.y;
144  accum_re[0*blockDim.x] = tmp_re;
145 
146  tmp_im = I0.x * J0.y + I0.y * J0.x;
147  tmp_im += I1.x * J1.y + I1.y * J1.x;
148  tmp_im += I2.x * J2.y + I2.y * J2.x;
149  accum_im[0*blockDim.x] = tmp_im;
150 
151  //01 component:
152  tmp_re = I0.x * J3.x - I0.y * J3.y;
153  tmp_re += I1.x * J4.x - I1.y * J4.y;
154  tmp_re += I2.x * J5.x - I2.y * J5.y;
155  accum_re[1*blockDim.x] = tmp_re;
156 
157  tmp_im = I0.x * J3.y + I0.y * J3.x;
158  tmp_im += I1.x * J4.y + I1.y * J4.x;
159  tmp_im += I2.x * J5.y + I2.y * J5.x;
160  accum_im[1*blockDim.x] = tmp_im;
161 
162  //02 component:
163  tmp_re = I0.x * J6.x - I0.y * J6.y;
164  tmp_re += I1.x * J7.x - I1.y * J7.y;
165  tmp_re += I2.x * J8.x - I2.y * J8.y;
166  accum_re[2*blockDim.x] = tmp_re;
167 
168  tmp_im = I0.x * J6.y + I0.y * J6.x;
169  tmp_im += I1.x * J7.y + I1.y * J7.x;
170  tmp_im += I2.x * J8.y + I2.y * J8.x;
171  accum_im[2*blockDim.x] = tmp_im;
172 
173  //03 component:
174  tmp_re = I0.x * J9.x - I0.y * J9.y;
175  tmp_re += I1.x * J10.x - I1.y * J10.y;
176  tmp_re += I2.x * J11.x - I2.y * J11.y;
177  accum_re[3*blockDim.x] = tmp_re;
178 
179  tmp_im = I0.x * J9.y + I0.y * J9.x;
180  tmp_im += I1.x * J10.y + I1.y * J10.x;
181  tmp_im += I2.x * J11.y + I2.y * J11.x;
182  accum_im[3*blockDim.x] = tmp_im;
183 
184  //10 component:
185  tmp_re = I3.x * J0.x - I3.y * J0.y;
186  tmp_re += I4.x * J1.x - I4.y * J1.y;
187  tmp_re += I5.x * J2.x - I5.y * J2.y;
188  accum_re[ 4*blockDim.x] = tmp_re;
189 
190  tmp_im = I3.x * J0.y + I3.y * J0.x;
191  tmp_im += I4.x * J1.y + I4.y * J1.x;
192  tmp_im += I5.x * J2.y + I5.y * J2.x;
193  accum_im[ 4*blockDim.x] = tmp_im;
194 
195  //11 component:
196  tmp_re = I3.x * J3.x - I3.y * J3.y;
197  tmp_re += I4.x * J4.x - I4.y * J4.y;
198  tmp_re += I5.x * J5.x - I5.y * J5.y;
199  accum_re[ 5*blockDim.x] = tmp_re;
200 
201  tmp_im = I3.x * J3.y + I3.y * J3.x;
202  tmp_im += I4.x * J4.y + I4.y * J4.x;
203  tmp_im += I5.x * J5.y + I5.y * J5.x;
204  accum_im[ 5*blockDim.x] = tmp_im;
205 
206  //12 component:
207  tmp_re = I3.x * J6.x - I3.y * J6.y;
208  tmp_re += I4.x * J7.x - I4.y * J7.y;
209  tmp_re += I5.x * J8.x - I5.y * J8.y;
210  accum_re[ 6*blockDim.x] = tmp_re;
211 
212  tmp_im = I3.x * J6.y + I3.y * J6.x;
213  tmp_im += I4.x * J7.y + I4.y * J7.x;
214  tmp_im += I5.x * J8.y + I5.y * J8.x;
215  accum_im[ 6*blockDim.x] = tmp_im;
216 
217  //13 component:
218  tmp_re = I3.x * J9.x - I3.y * J9.y;
219  tmp_re += I4.x * J10.x - I4.y * J10.y;
220  tmp_re += I5.x * J11.x - I5.y * J11.y;
221  accum_re[ 7*blockDim.x] = tmp_re;
222 
223  tmp_im = I3.x * J9.y + I3.y * J9.x;
224  tmp_im += I4.x * J10.y + I4.y * J10.x;
225  tmp_im += I5.x * J11.y + I5.y * J11.x;
226  accum_im[ 7*blockDim.x] = tmp_im;
227 
228  //20 component:
229  tmp_re = I6.x * J0.x - I6.y * J0.y;
230  tmp_re += I7.x * J1.x - I7.y * J1.y;
231  tmp_re += I8.x * J2.x - I8.y * J2.y;
232  accum_re[ 8*blockDim.x] = tmp_re;
233 
234  tmp_im = I6.x * J0.y + I6.y * J0.x;
235  tmp_im += I7.x * J1.y + I7.y * J1.x;
236  tmp_im += I8.x * J2.y + I8.y * J2.x;
237  accum_im[ 8*blockDim.x] = tmp_im;
238 
239  //21 component:
240  tmp_re = I6.x * J3.x - I6.y * J3.y;
241  tmp_re += I7.x * J4.x - I7.y * J4.y;
242  tmp_re += I8.x * J5.x - I8.y * J5.y;
243  accum_re[ 9*blockDim.x] = tmp_re;
244 
245  tmp_im = I6.x * J3.y + I6.y * J3.x;
246  tmp_im += I7.x * J4.y + I7.y * J4.x;
247  tmp_im += I8.x * J5.y + I8.y * J5.x;
248  accum_im[ 9*blockDim.x] = tmp_im;
249 
250  //22 component:
251  tmp_re = I6.x * J6.x - I6.y * J6.y;
252  tmp_re += I7.x * J7.x - I7.y * J7.y;
253  tmp_re += I8.x * J8.x - I8.y * J8.y;
254  accum_re[10*blockDim.x] = tmp_re;
255 
256  tmp_im = I6.x * J6.y + I6.y * J6.x;
257  tmp_im += I7.x * J7.y + I7.y * J7.x;
258  tmp_im += I8.x * J8.y + I8.y * J8.x;
259  accum_im[10*blockDim.x] = tmp_im;
260 
261  //23 component:
262  tmp_re = I6.x * J9.x - I6.y * J9.y;
263  tmp_re += I7.x * J10.x - I7.y * J10.y;
264  tmp_re += I8.x * J11.x - I8.y * J11.y;
265  accum_re[11*blockDim.x] = tmp_re;
266 
267  tmp_im = I6.x * J9.y + I6.y * J9.x;
268  tmp_im += I7.x * J10.y + I7.y * J10.x;
269  tmp_im += I8.x * J11.y + I8.y * J11.x;
270  accum_im[11*blockDim.x] = tmp_im;
271 
272  //30 component:
273  tmp_re = I9.x * J0.x - I9.y * J0.y;
274  tmp_re += I10.x * J1.x - I10.y * J1.y;
275  tmp_re += I11.x * J2.x - I11.y * J2.y;
276  accum_re[12*blockDim.x] = tmp_re;
277 
278  tmp_im = I9.x * J0.y + I9.y * J0.x;
279  tmp_im += I10.x * J1.y + I10.y * J1.x;
280  tmp_im += I11.x * J2.y + I11.y * J2.x;
281  accum_im[12*blockDim.x] = tmp_im;
282 
283  //31 component:
284  tmp_re = I9.x * J3.x - I9.y * J3.y;
285  tmp_re += I10.x * J4.x - I10.y * J4.y;
286  tmp_re += I11.x * J5.x - I11.y * J5.y;
287  accum_re[13*blockDim.x] = tmp_re;
288 
289  tmp_im = I9.x * J3.y + I9.y * J3.x;
290  tmp_im += I10.x * J4.y + I10.y * J4.x;
291  tmp_im += I11.x * J5.y + I11.y * J5.x;
292  accum_im[13*blockDim.x] = tmp_im;
293 
294  //32 component:
295  tmp_re = I9.x * J6.x - I9.y * J6.y;
296  tmp_re += I10.x * J7.x - I10.y * J7.y;
297  tmp_re += I11.x * J8.x - I11.y * J8.y;
298  accum_re[14*blockDim.x] = tmp_re;
299 
300  tmp_im = I9.x * J6.y + I9.y * J6.x;
301  tmp_im += I10.x * J7.y + I10.y * J7.x;
302  tmp_im += I11.x * J8.y + I11.y * J8.x;
303  accum_im[14*blockDim.x] = tmp_im;
304 
305  //33 component:
306  tmp_re = I9.x * J9.x - I9.y * J9.y;
307  tmp_re += I10.x * J10.x - I10.y * J10.y;
308  tmp_re += I11.x * J11.x - I11.y * J11.y;
309  accum_re[15*blockDim.x] = tmp_re;
310 
311  tmp_im = I9.x * J9.y + I9.y * J9.x;
312  tmp_im += I10.x * J10.y + I10.y * J10.x;
313  tmp_im += I11.x * J11.y + I11.y * J11.x;
314  accum_im[15*blockDim.x] = tmp_im;
315 
316  //Store output back to global buffer:
317 
318 
319 /* CONTRACTION FULL VOLUME */
320 
321  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
322  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
323  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
324  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
325  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
326  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
327  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
328  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
329  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
330  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
331  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
332  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
333  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
334  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
335  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
336  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
337 
338  return;
339 }
340 
341 //Perform trace in color space only and for a given tslice
342 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
343 __global__ void contractTsliceKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
344 {
345  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
346  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
347  int inId = sid + param.Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
348  int outId;
349  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
350 
351  if (sid >= param.threads) //param.threads == tslice volume
352  return;
353 
354  volatile double2 tmp;
355  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
356 
357  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
358  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
359 
360 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
361 
362  eutId = 2*inId;
363  auxCoord1 = eutId / param.dc.X[0];
364  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
365  auxCoord2 = auxCoord1 / param.dc.X[1];
366  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
367  xCoord4 = auxCoord2 / param.dc.X[2];
368  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
369 
370  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
371  xCoord1 += auxCoord1;
372  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
373 
374  READ_SPINOR (SPINORTEX, myStride, sid, sid);
376 
377  //compute in1^dag:
378 
379  I0.y = -I0.y;
380  I1.y = -I1.y;
381  I2.y = -I2.y;
382  I3.y = -I3.y;
383  I4.y = -I4.y;
384  I5.y = -I5.y;
385  I6.y = -I6.y;
386  I7.y = -I7.y;
387  I8.y = -I8.y;
388  I9.y = -I9.y;
389  I10.y = -I10.y;
390  I11.y = -I11.y;
391 
392  //do products for all color component here:
393  //00 component:
394  tmp_re = I0.x * J0.x - I0.y * J0.y;
395  tmp_re += I1.x * J1.x - I1.y * J1.y;
396  tmp_re += I2.x * J2.x - I2.y * J2.y;
397  accum_re[0*blockDim.x] = tmp_re;
398 
399  tmp_im = I0.x * J0.y + I0.y * J0.x;
400  tmp_im += I1.x * J1.y + I1.y * J1.x;
401  tmp_im += I2.x * J2.y + I2.y * J2.x;
402  accum_im[0*blockDim.x] = tmp_im;
403 
404  //01 component:
405  tmp_re = I0.x * J3.x - I0.y * J3.y;
406  tmp_re += I1.x * J4.x - I1.y * J4.y;
407  tmp_re += I2.x * J5.x - I2.y * J5.y;
408  accum_re[1*blockDim.x] = tmp_re;
409 
410  tmp_im = I0.x * J3.y + I0.y * J3.x;
411  tmp_im += I1.x * J4.y + I1.y * J4.x;
412  tmp_im += I2.x * J5.y + I2.y * J5.x;
413  accum_im[1*blockDim.x] = tmp_im;
414 
415  //02 component:
416  tmp_re = I0.x * J6.x - I0.y * J6.y;
417  tmp_re += I1.x * J7.x - I1.y * J7.y;
418  tmp_re += I2.x * J8.x - I2.y * J8.y;
419  accum_re[2*blockDim.x] = tmp_re;
420 
421  tmp_im = I0.x * J6.y + I0.y * J6.x;
422  tmp_im += I1.x * J7.y + I1.y * J7.x;
423  tmp_im += I2.x * J8.y + I2.y * J8.x;
424  accum_im[2*blockDim.x] = tmp_im;
425 
426  //03 component:
427  tmp_re = I0.x * J9.x - I0.y * J9.y;
428  tmp_re += I1.x * J10.x - I1.y * J10.y;
429  tmp_re += I2.x * J11.x - I2.y * J11.y;
430  accum_re[3*blockDim.x] = tmp_re;
431 
432  tmp_im = I0.x * J9.y + I0.y * J9.x;
433  tmp_im += I1.x * J10.y + I1.y * J10.x;
434  tmp_im += I2.x * J11.y + I2.y * J11.x;
435  accum_im[3*blockDim.x] = tmp_im;
436 
437  //10 component:
438  tmp_re = I3.x * J0.x - I3.y * J0.y;
439  tmp_re += I4.x * J1.x - I4.y * J1.y;
440  tmp_re += I5.x * J2.x - I5.y * J2.y;
441  accum_re[ 4*blockDim.x] = tmp_re;
442 
443  tmp_im = I3.x * J0.y + I3.y * J0.x;
444  tmp_im += I4.x * J1.y + I4.y * J1.x;
445  tmp_im += I5.x * J2.y + I5.y * J2.x;
446  accum_im[ 4*blockDim.x] = tmp_im;
447 
448  //11 component:
449  tmp_re = I3.x * J3.x - I3.y * J3.y;
450  tmp_re += I4.x * J4.x - I4.y * J4.y;
451  tmp_re += I5.x * J5.x - I5.y * J5.y;
452  accum_re[ 5*blockDim.x] = tmp_re;
453 
454  tmp_im = I3.x * J3.y + I3.y * J3.x;
455  tmp_im += I4.x * J4.y + I4.y * J4.x;
456  tmp_im += I5.x * J5.y + I5.y * J5.x;
457  accum_im[ 5*blockDim.x] = tmp_im;
458 
459  //12 component:
460  tmp_re = I3.x * J6.x - I3.y * J6.y;
461  tmp_re += I4.x * J7.x - I4.y * J7.y;
462  tmp_re += I5.x * J8.x - I5.y * J8.y;
463  accum_re[ 6*blockDim.x] = tmp_re;
464 
465  tmp_im = I3.x * J6.y + I3.y * J6.x;
466  tmp_im += I4.x * J7.y + I4.y * J7.x;
467  tmp_im += I5.x * J8.y + I5.y * J8.x;
468  accum_im[ 6*blockDim.x] = tmp_im;
469 
470  //13 component:
471  tmp_re = I3.x * J9.x - I3.y * J9.y;
472  tmp_re += I4.x * J10.x - I4.y * J10.y;
473  tmp_re += I5.x * J11.x - I5.y * J11.y;
474  accum_re[ 7*blockDim.x] = tmp_re;
475 
476  tmp_im = I3.x * J9.y + I3.y * J9.x;
477  tmp_im += I4.x * J10.y + I4.y * J10.x;
478  tmp_im += I5.x * J11.y + I5.y * J11.x;
479  accum_im[ 7*blockDim.x] = tmp_im;
480 
481  //20 component:
482  tmp_re = I6.x * J0.x - I6.y * J0.y;
483  tmp_re += I7.x * J1.x - I7.y * J1.y;
484  tmp_re += I8.x * J2.x - I8.y * J2.y;
485  accum_re[ 8*blockDim.x] = tmp_re;
486 
487  tmp_im = I6.x * J0.y + I6.y * J0.x;
488  tmp_im += I7.x * J1.y + I7.y * J1.x;
489  tmp_im += I8.x * J2.y + I8.y * J2.x;
490  accum_im[ 8*blockDim.x] = tmp_im;
491 
492  //21 component:
493  tmp_re = I6.x * J3.x - I6.y * J3.y;
494  tmp_re += I7.x * J4.x - I7.y * J4.y;
495  tmp_re += I8.x * J5.x - I8.y * J5.y;
496  accum_re[ 9*blockDim.x] = tmp_re;
497 
498  tmp_im = I6.x * J3.y + I6.y * J3.x;
499  tmp_im += I7.x * J4.y + I7.y * J4.x;
500  tmp_im += I8.x * J5.y + I8.y * J5.x;
501  accum_im[ 9*blockDim.x] = tmp_im;
502 
503  //22 component:
504  tmp_re = I6.x * J6.x - I6.y * J6.y;
505  tmp_re += I7.x * J7.x - I7.y * J7.y;
506  tmp_re += I8.x * J8.x - I8.y * J8.y;
507  accum_re[10*blockDim.x] = tmp_re;
508 
509  tmp_im = I6.x * J6.y + I6.y * J6.x;
510  tmp_im += I7.x * J7.y + I7.y * J7.x;
511  tmp_im += I8.x * J8.y + I8.y * J8.x;
512  accum_im[10*blockDim.x] = tmp_im;
513 
514  //23 component:
515  tmp_re = I6.x * J9.x - I6.y * J9.y;
516  tmp_re += I7.x * J10.x - I7.y * J10.y;
517  tmp_re += I8.x * J11.x - I8.y * J11.y;
518  accum_re[11*blockDim.x] = tmp_re;
519 
520  tmp_im = I6.x * J9.y + I6.y * J9.x;
521  tmp_im += I7.x * J10.y + I7.y * J10.x;
522  tmp_im += I8.x * J11.y + I8.y * J11.x;
523  accum_im[11*blockDim.x] = tmp_im;
524 
525  //30 component:
526  tmp_re = I9.x * J0.x - I9.y * J0.y;
527  tmp_re += I10.x * J1.x - I10.y * J1.y;
528  tmp_re += I11.x * J2.x - I11.y * J2.y;
529  accum_re[12*blockDim.x] = tmp_re;
530 
531  tmp_im = I9.x * J0.y + I9.y * J0.x;
532  tmp_im += I10.x * J1.y + I10.y * J1.x;
533  tmp_im += I11.x * J2.y + I11.y * J2.x;
534  accum_im[12*blockDim.x] = tmp_im;
535 
536  //31 component:
537  tmp_re = I9.x * J3.x - I9.y * J3.y;
538  tmp_re += I10.x * J4.x - I10.y * J4.y;
539  tmp_re += I11.x * J5.x - I11.y * J5.y;
540  accum_re[13*blockDim.x] = tmp_re;
541 
542  tmp_im = I9.x * J3.y + I9.y * J3.x;
543  tmp_im += I10.x * J4.y + I10.y * J4.x;
544  tmp_im += I11.x * J5.y + I11.y * J5.x;
545  accum_im[13*blockDim.x] = tmp_im;
546 
547  //32 component:
548  tmp_re = I9.x * J6.x - I9.y * J6.y;
549  tmp_re += I10.x * J7.x - I10.y * J7.y;
550  tmp_re += I11.x * J8.x - I11.y * J8.y;
551  accum_re[14*blockDim.x] = tmp_re;
552 
553  tmp_im = I9.x * J6.y + I9.y * J6.x;
554  tmp_im += I10.x * J7.y + I10.y * J7.x;
555  tmp_im += I11.x * J8.y + I11.y * J8.x;
556  accum_im[14*blockDim.x] = tmp_im;
557 
558  //33 component:
559  tmp_re = I9.x * J9.x - I9.y * J9.y;
560  tmp_re += I10.x * J10.x - I10.y * J10.y;
561  tmp_re += I11.x * J11.x - I11.y * J11.y;
562  accum_re[15*blockDim.x] = tmp_re;
563 
564  tmp_im = I9.x * J9.y + I9.y * J9.x;
565  tmp_im += I10.x * J10.y + I10.y * J10.x;
566  tmp_im += I11.x * J11.y + I11.y * J11.x;
567  accum_im[15*blockDim.x] = tmp_im;
568 
569  //Store output back to global buffer:
570 
571 
572 /* CONTRACTION FULL VOLUME */
573 
574  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
575  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
576  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
577  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
578  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
579  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
580  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
581  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
582  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
583  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
584  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
585  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
586  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
587  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
588  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
589  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
590 
591  return;
592 }
593 
594 __global__ void contractKernel (double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
595 {
596  int sid = blockIdx.x*blockDim.x + threadIdx.x;
597  int outId = sid;
598  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
599 
600  if (sid >= param.threads)
601  return;
602 
603  volatile double2 tmp;
604  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
605 
606  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
607  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
608 
609  eutId = 2*sid;
610  auxCoord1 = eutId / param.dc.X[0];
611  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
612  auxCoord2 = auxCoord1 / param.dc.X[1];
613  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
614  xCoord4 = auxCoord2 / param.dc.X[2];
615  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
616 
617  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
618  xCoord1 += auxCoord1;
619  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
620 
621  READ_SPINOR (SPINORTEX, myStride, sid, sid);
623 
624  //compute in1^dag:
625 
626  I0.y = -I0.y;
627  I1.y = -I1.y;
628  I2.y = -I2.y;
629  I3.y = -I3.y;
630  I4.y = -I4.y;
631  I5.y = -I5.y;
632  I6.y = -I6.y;
633  I7.y = -I7.y;
634  I8.y = -I8.y;
635  I9.y = -I9.y;
636  I10.y = -I10.y;
637  I11.y = -I11.y;
638 
639  //do products for all color component here:
640  //00 component:
641  tmp_re = I0.x * J0.x - I0.y * J0.y;
642  tmp_re += I1.x * J1.x - I1.y * J1.y;
643  tmp_re += I2.x * J2.x - I2.y * J2.y;
644  accum_re[0*blockDim.x] = tmp_re;
645 
646  tmp_im = I0.x * J0.y + I0.y * J0.x;
647  tmp_im += I1.x * J1.y + I1.y * J1.x;
648  tmp_im += I2.x * J2.y + I2.y * J2.x;
649  accum_im[0*blockDim.x] = tmp_im;
650 
651  //01 component:
652  tmp_re = I0.x * J3.x - I0.y * J3.y;
653  tmp_re += I1.x * J4.x - I1.y * J4.y;
654  tmp_re += I2.x * J5.x - I2.y * J5.y;
655  accum_re[1*blockDim.x] = tmp_re;
656 
657  tmp_im = I0.x * J3.y + I0.y * J3.x;
658  tmp_im += I1.x * J4.y + I1.y * J4.x;
659  tmp_im += I2.x * J5.y + I2.y * J5.x;
660  accum_im[1*blockDim.x] = tmp_im;
661 
662  //02 component:
663  tmp_re = I0.x * J6.x - I0.y * J6.y;
664  tmp_re += I1.x * J7.x - I1.y * J7.y;
665  tmp_re += I2.x * J8.x - I2.y * J8.y;
666  accum_re[2*blockDim.x] = tmp_re;
667 
668  tmp_im = I0.x * J6.y + I0.y * J6.x;
669  tmp_im += I1.x * J7.y + I1.y * J7.x;
670  tmp_im += I2.x * J8.y + I2.y * J8.x;
671  accum_im[2*blockDim.x] = tmp_im;
672 
673  //03 component:
674  tmp_re = I0.x * J9.x - I0.y * J9.y;
675  tmp_re += I1.x * J10.x - I1.y * J10.y;
676  tmp_re += I2.x * J11.x - I2.y * J11.y;
677  accum_re[3*blockDim.x] = tmp_re;
678 
679  tmp_im = I0.x * J9.y + I0.y * J9.x;
680  tmp_im += I1.x * J10.y + I1.y * J10.x;
681  tmp_im += I2.x * J11.y + I2.y * J11.x;
682  accum_im[3*blockDim.x] = tmp_im;
683 
684  //10 component:
685  tmp_re = I3.x * J0.x - I3.y * J0.y;
686  tmp_re += I4.x * J1.x - I4.y * J1.y;
687  tmp_re += I5.x * J2.x - I5.y * J2.y;
688  accum_re[ 4*blockDim.x] = tmp_re;
689 
690  tmp_im = I3.x * J0.y + I3.y * J0.x;
691  tmp_im += I4.x * J1.y + I4.y * J1.x;
692  tmp_im += I5.x * J2.y + I5.y * J2.x;
693  accum_im[ 4*blockDim.x] = tmp_im;
694 
695  //11 component:
696  tmp_re = I3.x * J3.x - I3.y * J3.y;
697  tmp_re += I4.x * J4.x - I4.y * J4.y;
698  tmp_re += I5.x * J5.x - I5.y * J5.y;
699  accum_re[ 5*blockDim.x] = tmp_re;
700 
701  tmp_im = I3.x * J3.y + I3.y * J3.x;
702  tmp_im += I4.x * J4.y + I4.y * J4.x;
703  tmp_im += I5.x * J5.y + I5.y * J5.x;
704  accum_im[ 5*blockDim.x] = tmp_im;
705 
706  //12 component:
707  tmp_re = I3.x * J6.x - I3.y * J6.y;
708  tmp_re += I4.x * J7.x - I4.y * J7.y;
709  tmp_re += I5.x * J8.x - I5.y * J8.y;
710  accum_re[ 6*blockDim.x] = tmp_re;
711 
712  tmp_im = I3.x * J6.y + I3.y * J6.x;
713  tmp_im += I4.x * J7.y + I4.y * J7.x;
714  tmp_im += I5.x * J8.y + I5.y * J8.x;
715  accum_im[ 6*blockDim.x] = tmp_im;
716 
717  //13 component:
718  tmp_re = I3.x * J9.x - I3.y * J9.y;
719  tmp_re += I4.x * J10.x - I4.y * J10.y;
720  tmp_re += I5.x * J11.x - I5.y * J11.y;
721  accum_re[ 7*blockDim.x] = tmp_re;
722 
723  tmp_im = I3.x * J9.y + I3.y * J9.x;
724  tmp_im += I4.x * J10.y + I4.y * J10.x;
725  tmp_im += I5.x * J11.y + I5.y * J11.x;
726  accum_im[ 7*blockDim.x] = tmp_im;
727 
728  //20 component:
729  tmp_re = I6.x * J0.x - I6.y * J0.y;
730  tmp_re += I7.x * J1.x - I7.y * J1.y;
731  tmp_re += I8.x * J2.x - I8.y * J2.y;
732  accum_re[ 8*blockDim.x] = tmp_re;
733 
734  tmp_im = I6.x * J0.y + I6.y * J0.x;
735  tmp_im += I7.x * J1.y + I7.y * J1.x;
736  tmp_im += I8.x * J2.y + I8.y * J2.x;
737  accum_im[ 8*blockDim.x] = tmp_im;
738 
739  //21 component:
740  tmp_re = I6.x * J3.x - I6.y * J3.y;
741  tmp_re += I7.x * J4.x - I7.y * J4.y;
742  tmp_re += I8.x * J5.x - I8.y * J5.y;
743  accum_re[ 9*blockDim.x] = tmp_re;
744 
745  tmp_im = I6.x * J3.y + I6.y * J3.x;
746  tmp_im += I7.x * J4.y + I7.y * J4.x;
747  tmp_im += I8.x * J5.y + I8.y * J5.x;
748  accum_im[ 9*blockDim.x] = tmp_im;
749 
750  //22 component:
751  tmp_re = I6.x * J6.x - I6.y * J6.y;
752  tmp_re += I7.x * J7.x - I7.y * J7.y;
753  tmp_re += I8.x * J8.x - I8.y * J8.y;
754  accum_re[10*blockDim.x] = tmp_re;
755 
756  tmp_im = I6.x * J6.y + I6.y * J6.x;
757  tmp_im += I7.x * J7.y + I7.y * J7.x;
758  tmp_im += I8.x * J8.y + I8.y * J8.x;
759  accum_im[10*blockDim.x] = tmp_im;
760 
761  //23 component:
762  tmp_re = I6.x * J9.x - I6.y * J9.y;
763  tmp_re += I7.x * J10.x - I7.y * J10.y;
764  tmp_re += I8.x * J11.x - I8.y * J11.y;
765  accum_re[11*blockDim.x] = tmp_re;
766 
767  tmp_im = I6.x * J9.y + I6.y * J9.x;
768  tmp_im += I7.x * J10.y + I7.y * J10.x;
769  tmp_im += I8.x * J11.y + I8.y * J11.x;
770  accum_im[11*blockDim.x] = tmp_im;
771 
772  //30 component:
773  tmp_re = I9.x * J0.x - I9.y * J0.y;
774  tmp_re += I10.x * J1.x - I10.y * J1.y;
775  tmp_re += I11.x * J2.x - I11.y * J2.y;
776  accum_re[12*blockDim.x] = tmp_re;
777 
778  tmp_im = I9.x * J0.y + I9.y * J0.x;
779  tmp_im += I10.x * J1.y + I10.y * J1.x;
780  tmp_im += I11.x * J2.y + I11.y * J2.x;
781  accum_im[12*blockDim.x] = tmp_im;
782 
783  //31 component:
784  tmp_re = I9.x * J3.x - I9.y * J3.y;
785  tmp_re += I10.x * J4.x - I10.y * J4.y;
786  tmp_re += I11.x * J5.x - I11.y * J5.y;
787  accum_re[13*blockDim.x] = tmp_re;
788 
789  tmp_im = I9.x * J3.y + I9.y * J3.x;
790  tmp_im += I10.x * J4.y + I10.y * J4.x;
791  tmp_im += I11.x * J5.y + I11.y * J5.x;
792  accum_im[13*blockDim.x] = tmp_im;
793 
794  //32 component:
795  tmp_re = I9.x * J6.x - I9.y * J6.y;
796  tmp_re += I10.x * J7.x - I10.y * J7.y;
797  tmp_re += I11.x * J8.x - I11.y * J8.y;
798  accum_re[14*blockDim.x] = tmp_re;
799 
800  tmp_im = I9.x * J6.y + I9.y * J6.x;
801  tmp_im += I10.x * J7.y + I10.y * J7.x;
802  tmp_im += I11.x * J8.y + I11.y * J8.x;
803  accum_im[14*blockDim.x] = tmp_im;
804 
805  //33 component:
806  tmp_re = I9.x * J9.x - I9.y * J9.y;
807  tmp_re += I10.x * J10.x - I10.y * J10.y;
808  tmp_re += I11.x * J11.x - I11.y * J11.y;
809  accum_re[15*blockDim.x] = tmp_re;
810 
811  tmp_im = I9.x * J9.y + I9.y * J9.x;
812  tmp_im += I10.x * J10.y + I10.y * J10.x;
813  tmp_im += I11.x * J11.y + I11.y * J11.x;
814  accum_im[15*blockDim.x] = tmp_im;
815 
816 /* CONTRACTION FULL VOLUME */
817 
818  out[outId + 0 *param.threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
819  out[outId + 1 *param.threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
820  out[outId + 2 *param.threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
821  out[outId + 3 *param.threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
822  out[outId + 4 *param.threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
823  out[outId + 5 *param.threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
824  out[outId + 6 *param.threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
825  out[outId + 7 *param.threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
826  out[outId + 8 *param.threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
827  out[outId + 9 *param.threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
828  out[outId + 10*param.threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
829  out[outId + 11*param.threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
830  out[outId + 12*param.threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
831  out[outId + 13*param.threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
832  out[outId + 14*param.threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
833  out[outId + 15*param.threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
834 
835  return;
836 }
837 
838 #undef SPINORTEX
839 #undef INTERTEX
840 
841 #undef READ_SPINOR
842 #undef READ_INTERMEDIATE_SPINOR
843 
844 #undef SPINOR_HOP
845 
846 
847 
848 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
849  float4 I0 = spinor[sp_idx + 0*(stride)]; \
850  float4 I1 = spinor[sp_idx + 1*(stride)]; \
851  float4 I2 = spinor[sp_idx + 2*(stride)]; \
852  float4 I3 = spinor[sp_idx + 3*(stride)]; \
853  float4 I4 = spinor[sp_idx + 4*(stride)]; \
854  float4 I5 = spinor[sp_idx + 5*(stride)];
855 
856 
857 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
858  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
859  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
860  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
861  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
862  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
863  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
864 
865 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
866  float4 J0 = spinor[sp_idx + 0*(stride)]; \
867  float4 J1 = spinor[sp_idx + 1*(stride)]; \
868  float4 J2 = spinor[sp_idx + 2*(stride)]; \
869  float4 J3 = spinor[sp_idx + 3*(stride)]; \
870  float4 J4 = spinor[sp_idx + 4*(stride)]; \
871  float4 J5 = spinor[sp_idx + 5*(stride)];
872 
873 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
874  float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
875  float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
876  float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
877  float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
878  float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
879  float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
880 
881 
882 #ifdef DIRECT_ACCESS_WILSON_SPINOR
883  #define READ_SPINOR READ_SPINOR_SINGLE
884  #define SPINORTEX in
885 #else
886  #define READ_SPINOR READ_SPINOR_SINGLE_TEX
887 
888  #ifdef USE_TEXTURE_OBJECTS
889  #define SPINORTEX param.inTex
890  #else
891  #define SPINORTEX spinorTexSingle
892  #endif // USE_TEXTURE_OBJECTS
893 #endif
894 
895 #ifdef DIRECT_ACCESS_WILSON_INTER
896  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
897  #define INTERTEX in2
898 #else
899  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
900 
901  #ifdef USE_TEXTURE_OBJECTS
902  #define INTERTEX param.outTex
903  #else
904  #define INTERTEX interTexSingle
905  #endif // USE_TEXTURE_OBJECTS
906 #endif
907 
908 #define SPINOR_HOP 6
909 
910 __global__ void contractGamma5Kernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
911 {
912  int sid = blockIdx.x*blockDim.x + threadIdx.x;
913  int outId = sid;
914  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
915 
916  if (sid >= param.threads)
917  return;
918 
919  volatile float2 tmp;
920  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
921 
922  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
923  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
924 
925  eutId = 2*sid;
926  auxCoord1 = eutId / param.dc.X[0];
927  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
928  auxCoord2 = auxCoord1 / param.dc.X[1];
929  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
930  xCoord4 = auxCoord2 / param.dc.X[2];
931  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
932 
933  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
934  xCoord1 += auxCoord1;
935  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
936 
937  //Load the full input spinors:
938 
939  READ_SPINOR (SPINORTEX, myStride, sid, sid);
941 
942  //compute in1^dag * gamma5:
943 
944  //First color component
945 
946  tmp_re = +I0.x;
947  tmp_im = -I0.y;
948  I0.x = +I3.x;
949  I0.y = -I3.y;
950  I3.x = tmp_re;
951  I3.y = tmp_im;
952 
953  tmp_re = +I1.z;
954  tmp_im = -I1.w;
955  I1.z = +I4.z;
956  I1.w = -I4.w;
957  I4.z = tmp_re;
958  I4.w = tmp_im;
959 
960  //Second color component
961 
962  tmp_re = +I0.z;
963  tmp_im = -I0.w;
964  I0.z = +I3.z;
965  I0.w = -I3.w;
966  I3.z = tmp_re;
967  I3.w = tmp_im;
968 
969  tmp_re = +I2.x;
970  tmp_im = -I2.y;
971  I2.x = +I5.x;
972  I2.y = -I5.y;
973  I5.x = tmp_re;
974  I5.y = tmp_im;
975 
976  //Third color component
977 
978  tmp_re = +I1.x;
979  tmp_im = -I1.y;
980  I1.x = +I4.x;
981  I1.y = -I4.y;
982  I4.x = tmp_re;
983  I4.y = tmp_im;
984 
985  tmp_re = +I2.z;
986  tmp_im = -I2.w;
987  I2.z = +I5.z;
988  I2.w = -I5.w;
989  I5.z = tmp_re;
990  I5.w = tmp_im;
991 
992  //do products for first color component here:
993 
994  //00 component:
995  tmp_re = I0.x * J0.x - I0.y * J0.y;
996  tmp_re += I0.z * J0.z - I0.w * J0.w;
997  tmp_re += I1.x * J1.x - I1.y * J1.y;
998  accum_re[0*blockDim.x] = tmp_re;
999 
1000  tmp_im = I0.x * J0.y + I0.y * J0.x;
1001  tmp_im += I0.z * J0.w + I0.w * J0.z;
1002  tmp_im += I1.x * J1.y + I1.y * J1.x;
1003  accum_im[0*blockDim.x] = tmp_im;
1004 
1005  //01 component:
1006  tmp_re = I0.x * J1.z - I0.y * J1.w;
1007  tmp_re += I0.z * J2.x - I0.w * J2.y;
1008  tmp_re += I1.x * J2.z - I1.y * J2.w;
1009  accum_re[1*blockDim.x] = tmp_re;
1010 
1011  tmp_im = I0.x * J1.w + I0.y * J1.z;
1012  tmp_im += I0.z * J2.y + I0.w * J2.x;
1013  tmp_im += I1.x * J2.w + I1.y * J2.z;
1014  accum_im[1*blockDim.x] = tmp_im;
1015 
1016  //02 component:
1017  tmp_re = I0.x * J3.x - I0.y * J3.y;
1018  tmp_re += I0.z * J3.z - I0.w * J3.w;
1019  tmp_re += I1.x * J4.x - I1.y * J4.y;
1020  accum_re[2*blockDim.x] = tmp_re;
1021 
1022  tmp_im = I0.x * J3.y + I0.y * J3.x;
1023  tmp_im += I0.z * J3.w + I0.w * J3.z;
1024  tmp_im += I1.x * J4.y + I1.y * J4.x;
1025  accum_im[2*blockDim.x] = tmp_im;
1026 
1027  //03 component:
1028  tmp_re = I0.x * J4.z - I0.y * J4.w;
1029  tmp_re += I0.z * J5.x - I0.w * J5.x;
1030  tmp_re += I1.x * J5.z - I1.y * J5.w;
1031 
1032  accum_re[3*blockDim.x] = tmp_re;
1033 
1034  tmp_im = I0.x * J4.w + I0.y * J4.z;
1035  tmp_im += I0.z * J5.y + I0.w * J5.y;
1036  tmp_im += I1.x * J5.w + I1.y * J5.z;
1037  accum_im[3*blockDim.x] = tmp_im;
1038 
1039  //10 component:
1040  tmp_re = I1.z * J0.x - I1.w * J0.y;
1041  tmp_re += I2.x * J0.z - I2.y * J0.w;
1042  tmp_re += I2.z * J1.x - I2.w * J1.y;
1043  accum_re[4*blockDim.x] = tmp_re;
1044 
1045  tmp_im = I1.z * J0.y + I1.w * J0.x;
1046  tmp_im += I2.x * J0.w + I2.y * J0.z;
1047  tmp_im += I2.z * J1.y + I2.w * J1.x;
1048  accum_im[4*blockDim.x] = tmp_im;
1049 
1050  //11 component:
1051  tmp_re = I1.z * J1.z - I1.w * J1.w;
1052  tmp_re += I2.x * J2.x - I2.y * J2.y;
1053  tmp_re += I2.z * J2.z - I2.w * J2.w;
1054  accum_re[5*blockDim.x] = tmp_re;
1055 
1056  tmp_im = I1.z * J1.w + I1.w * J1.z;
1057  tmp_im += I2.x * J2.y + I2.y * J2.x;
1058  tmp_im += I2.z * J2.w + I2.w * J2.z;
1059  accum_im[5*blockDim.x] = tmp_im;
1060 
1061  //12 component:
1062  tmp_re = I1.z * J3.x - I1.w * J3.y;
1063  tmp_re += I2.x * J3.z - I2.y * J3.w;
1064  tmp_re += I2.z * J4.x - I2.w * J4.y;
1065  accum_re[6*blockDim.x] = tmp_re;
1066 
1067  tmp_im = I1.z * J3.y + I1.w * J3.x;
1068  tmp_im += I2.x * J3.w + I2.y * J3.z;
1069  tmp_im += I2.z * J4.y + I2.w * J4.x;
1070  accum_im[6*blockDim.x] = tmp_im;
1071 
1072  //13 component:
1073  tmp_re = I1.z * J4.z - I1.w * J4.w;
1074  tmp_re += I2.x * J5.x - I2.y * J5.y;
1075  tmp_re += I2.z * J5.z - I2.w * J5.w;
1076  accum_re[7*blockDim.x] = tmp_re;
1077 
1078  tmp_im = I1.z * J4.w + I1.w * J4.z;
1079  tmp_im += I2.x * J5.y + I2.y * J5.x;
1080  tmp_im += I2.z * J5.w + I2.w * J5.z;
1081  accum_im[7*blockDim.x] = tmp_im;
1082 
1083  //20 component:
1084  tmp_re = I3.x * J0.x - I3.y * J0.y;
1085  tmp_re += I3.z * J0.z - I3.w * J0.w;
1086  tmp_re += I4.x * J1.x - I4.y * J1.y;
1087  accum_re[8*blockDim.x] = tmp_re;
1088 
1089  tmp_im = I3.x * J0.y + I3.y * J0.x;
1090  tmp_im += I3.z * J0.w + I3.w * J0.z;
1091  tmp_im += I4.x * J1.y + I4.y * J1.x;
1092  accum_im[8*blockDim.x] = tmp_im;
1093 
1094  //21 component:
1095  tmp_re = I3.x * J1.z - I3.y * J1.w;
1096  tmp_re += I3.z * J2.x - I3.w * J2.y;
1097  tmp_re += I4.x * J2.z - I4.y * J2.w;
1098  accum_re[9*blockDim.x] = tmp_re;
1099 
1100  tmp_im = I3.x * J1.w + I3.y * J1.z;
1101  tmp_im += I3.z * J2.y + I3.w * J2.x;
1102  tmp_im += I4.x * J2.w + I4.y * J2.z;
1103  accum_im[9*blockDim.x] = tmp_im;
1104 
1105  //22 component:
1106  tmp_re = I3.x * J3.x - I3.y * J3.y;
1107  tmp_re += I3.z * J3.z - I3.w * J3.w;
1108  tmp_re += I4.x * J4.x - I4.y * J4.y;
1109  accum_re[10*blockDim.x] = tmp_re;
1110 
1111  tmp_im = I3.x * J3.y + I3.y * J3.x;
1112  tmp_im += I3.z * J3.w + I3.w * J3.z;
1113  tmp_im += I4.x * J4.y + I4.y * J4.x;
1114  accum_im[10*blockDim.x] = tmp_im;
1115 
1116  //23 component:
1117  tmp_re = I3.x * J4.z - I3.y * J4.w;
1118  tmp_re += I3.z * J5.x - I3.w * J5.y;
1119  tmp_re += I4.x * J5.z - I4.y * J5.w;
1120  accum_re[11*blockDim.x] = tmp_re;
1121 
1122  tmp_im = I3.x * J4.w + I3.y * J4.z;
1123  tmp_im += I3.z * J5.y + I3.w * J5.x;
1124  tmp_im += I4.x * J5.w + I4.y * J5.z;
1125  accum_im[11*blockDim.x] = tmp_im;
1126 
1127  //30 component:
1128  tmp_re = I4.z * J0.x - I4.w * J0.y;
1129  tmp_re += I5.x * J0.z - I5.y * J0.w;
1130  tmp_re += I5.z * J1.x - I5.w * J1.y;
1131  accum_re[12*blockDim.x] = tmp_re;
1132 
1133  tmp_im = I4.z * J0.y + I4.w * J0.x;
1134  tmp_im += I5.x * J0.w + I5.y * J0.z;
1135  tmp_im += I5.z * J1.y + I5.w * J1.x;
1136  accum_im[12*blockDim.x] = tmp_im;
1137 
1138  //31 component:
1139  tmp_re = I4.z * J1.z - I4.w * J1.w;
1140  tmp_re += I5.x * J2.x - I5.y * J2.y;
1141  tmp_re += I5.z * J2.z - I5.w * J2.w;
1142  accum_re[13*blockDim.x] = tmp_re;
1143 
1144  tmp_im = I4.z * J1.w + I4.w * J1.z;
1145  tmp_im += I5.x * J2.y + I5.y * J2.x;
1146  tmp_im += I5.z * J2.w + I5.w * J2.z;
1147  accum_im[13*blockDim.x] = tmp_im;
1148 
1149  //32 component:
1150  tmp_re = I4.z * J3.x - I4.w * J3.y;
1151  tmp_re += I5.x * J3.z - I5.y * J3.w;
1152  tmp_re += I5.z * J4.x - I5.w * J4.y;
1153  accum_re[14*blockDim.x] = tmp_re;
1154 
1155  tmp_im = I4.z * J3.y + I4.w * J3.x;
1156  tmp_im += I5.x * J3.w + I5.y * J3.z;
1157  tmp_im += I5.z * J4.y + I5.w * J4.x;
1158  accum_im[14*blockDim.x] = tmp_im;
1159 
1160  //33 component:
1161  tmp_re = I4.z * J4.z - I4.w * J4.w;
1162  tmp_re += I5.x * J5.x - I5.y * J5.y;
1163  tmp_re += I5.z * J5.z - I5.w * J5.w;
1164  accum_re[15*blockDim.x] = tmp_re;
1165 
1166  tmp_im = I4.z * J4.w + I4.w * J4.z;
1167  tmp_im += I5.x * J5.y + I5.y * J5.x;
1168  tmp_im += I5.z * J5.w + I5.w * J5.z;
1169  accum_im[15*blockDim.x] = tmp_im;
1170 
1171 
1172 
1173  //Store output back to global buffer:
1174 
1175 
1176  /* CONTRACTION FULL VOLUME */
1177 
1178  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1179  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1180  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1181  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1182  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1183  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1184  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1185  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1186  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1187  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1188  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1189  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1190  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1191  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1192  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1193  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1194 
1195  return;
1196 }
1197 
1198 //Perform trace in color space only and for a given tslice
1199 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
1200 __global__ void contractTsliceKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
1201 {
1202  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
1203  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
1204  int inId = sid + param.Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
1205  int outId;
1206  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1207 
1208  if (sid >= param.threads) //param.threads == tslice volume
1209  return;
1210 
1211  volatile float2 tmp;
1212  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1213 
1214  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1215  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1216 
1217 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
1218 
1219  eutId = 2*inId;
1220  auxCoord1 = eutId / param.dc.X[0];
1221  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
1222  auxCoord2 = auxCoord1 / param.dc.X[1];
1223  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
1224  xCoord4 = auxCoord2 / param.dc.X[2];
1225  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
1226 
1227  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1228  xCoord1 += auxCoord1;
1229  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
1230 
1231  //Load the full input spinors:
1232 
1233  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1234  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1235 
1236  //compute in1^dag:
1237 
1238  I0.y = -I0.y;
1239  I0.w = -I0.w;
1240  I1.y = -I1.y;
1241  I1.w = -I1.w;
1242  I2.y = -I2.y;
1243  I2.w = -I2.w;
1244  I3.y = -I3.y;
1245  I3.w = -I3.w;
1246  I4.y = -I4.y;
1247  I4.w = -I4.w;
1248  I5.y = -I5.y;
1249  I5.w = -I5.w;
1250 
1251  //do products for first color component here:
1252  //00 component:
1253  tmp_re = I0.x * J0.x - I0.y * J0.y;
1254  tmp_re += I0.z * J0.z - I0.w * J0.w;
1255  tmp_re += I1.x * J1.x - I1.y * J1.y;
1256  accum_re[0*blockDim.x] = tmp_re;
1257 
1258  tmp_im = I0.x * J0.y + I0.y * J0.x;
1259  tmp_im += I0.z * J0.w + I0.w * J0.z;
1260  tmp_im += I1.x * J1.y + I1.y * J1.x;
1261  accum_im[0*blockDim.x] = tmp_im;
1262 
1263  //01 component:
1264  tmp_re = I0.x * J1.z - I0.y * J1.w;
1265  tmp_re += I0.z * J2.x - I0.w * J2.y;
1266  tmp_re += I1.x * J2.z - I1.y * J2.w;
1267  accum_re[1*blockDim.x] = tmp_re;
1268 
1269  tmp_im = I0.x * J1.w + I0.y * J1.z;
1270  tmp_im += I0.z * J2.y + I0.w * J2.x;
1271  tmp_im += I1.x * J2.w + I1.y * J2.z;
1272  accum_im[1*blockDim.x] = tmp_im;
1273 
1274  //02 component:
1275  tmp_re = I0.x * J3.x - I0.y * J3.y;
1276  tmp_re += I0.z * J3.z - I0.w * J3.w;
1277  tmp_re += I1.x * J4.x - I1.y * J4.y;
1278  accum_re[2*blockDim.x] = tmp_re;
1279 
1280  tmp_im = I0.x * J3.y + I0.y * J3.x;
1281  tmp_im += I0.z * J3.w + I0.w * J3.z;
1282  tmp_im += I1.x * J4.y + I1.y * J4.x;
1283  accum_im[2*blockDim.x] = tmp_im;
1284 
1285  //03 component:
1286  tmp_re = I0.x * J4.z - I0.y * J4.w;
1287  tmp_re += I0.z * J5.x - I0.w * J5.x;
1288  tmp_re += I1.x * J5.z - I1.y * J5.w;
1289 
1290  accum_re[3*blockDim.x] = tmp_re;
1291 
1292  tmp_im = I0.x * J4.w + I0.y * J4.z;
1293  tmp_im += I0.z * J5.y + I0.w * J5.y;
1294  tmp_im += I1.x * J5.w + I1.y * J5.z;
1295  accum_im[3*blockDim.x] = tmp_im;
1296 
1297  //10 component:
1298  tmp_re = I1.z * J0.x - I1.w * J0.y;
1299  tmp_re += I2.x * J0.z - I2.y * J0.w;
1300  tmp_re += I2.z * J1.x - I2.w * J1.y;
1301  accum_re[4*blockDim.x] = tmp_re;
1302 
1303  tmp_im = I1.z * J0.y + I1.w * J0.x;
1304  tmp_im += I2.x * J0.w + I2.y * J0.z;
1305  tmp_im += I2.z * J1.y + I2.w * J1.x;
1306  accum_im[4*blockDim.x] = tmp_im;
1307 
1308  //11 component:
1309  tmp_re = I1.z * J1.z - I1.w * J1.w;
1310  tmp_re += I2.x * J2.x - I2.y * J2.y;
1311  tmp_re += I2.z * J2.z - I2.w * J2.w;
1312  accum_re[5*blockDim.x] = tmp_re;
1313 
1314  tmp_im = I1.z * J1.w + I1.w * J1.z;
1315  tmp_im += I2.x * J2.y + I2.y * J2.x;
1316  tmp_im += I2.z * J2.w + I2.w * J2.z;
1317  accum_im[5*blockDim.x] = tmp_im;
1318 
1319  //12 component:
1320  tmp_re = I1.z * J3.x - I1.w * J3.y;
1321  tmp_re += I2.x * J3.z - I2.y * J3.w;
1322  tmp_re += I2.z * J4.x - I2.w * J4.y;
1323  accum_re[6*blockDim.x] = tmp_re;
1324 
1325  tmp_im = I1.z * J3.y + I1.w * J3.x;
1326  tmp_im += I2.x * J3.w + I2.y * J3.z;
1327  tmp_im += I2.z * J4.y + I2.w * J4.x;
1328  accum_im[6*blockDim.x] = tmp_im;
1329 
1330  //13 component:
1331  tmp_re = I1.z * J4.z - I1.w * J4.w;
1332  tmp_re += I2.x * J5.x - I2.y * J5.y;
1333  tmp_re += I2.z * J5.z - I2.w * J5.w;
1334  accum_re[7*blockDim.x] = tmp_re;
1335 
1336  tmp_im = I1.z * J4.w + I1.w * J4.z;
1337  tmp_im += I2.x * J5.y + I2.y * J5.x;
1338  tmp_im += I2.z * J5.w + I2.w * J5.z;
1339  accum_im[7*blockDim.x] = tmp_im;
1340 
1341  //20 component:
1342  tmp_re = I3.x * J0.x - I3.y * J0.y;
1343  tmp_re += I3.z * J0.z - I3.w * J0.w;
1344  tmp_re += I4.x * J1.x - I4.y * J1.y;
1345  accum_re[8*blockDim.x] = tmp_re;
1346 
1347  tmp_im = I3.x * J0.y + I3.y * J0.x;
1348  tmp_im += I3.z * J0.w + I3.w * J0.z;
1349  tmp_im += I4.x * J1.y + I4.y * J1.x;
1350  accum_im[8*blockDim.x] = tmp_im;
1351 
1352  //21 component:
1353  tmp_re = I3.x * J1.z - I3.y * J1.w;
1354  tmp_re += I3.z * J2.x - I3.w * J2.y;
1355  tmp_re += I4.x * J2.z - I4.y * J2.w;
1356  accum_re[9*blockDim.x] = tmp_re;
1357 
1358  tmp_im = I3.x * J1.w + I3.y * J1.z;
1359  tmp_im += I3.z * J2.y + I3.w * J2.x;
1360  tmp_im += I4.x * J2.w + I4.y * J2.z;
1361  accum_im[9*blockDim.x] = tmp_im;
1362 
1363  //22 component:
1364  tmp_re = I3.x * J3.x - I3.y * J3.y;
1365  tmp_re += I3.z * J3.z - I3.w * J3.w;
1366  tmp_re += I4.x * J4.x - I4.y * J4.y;
1367  accum_re[10*blockDim.x] = tmp_re;
1368 
1369  tmp_im = I3.x * J3.y + I3.y * J3.x;
1370  tmp_im += I3.z * J3.w + I3.w * J3.z;
1371  tmp_im += I4.x * J4.y + I4.y * J4.x;
1372  accum_im[10*blockDim.x] = tmp_im;
1373 
1374  //23 component:
1375  tmp_re = I3.x * J4.z - I3.y * J4.w;
1376  tmp_re += I3.z * J5.x - I3.w * J5.y;
1377  tmp_re += I4.x * J5.z - I4.y * J5.w;
1378  accum_re[11*blockDim.x] = tmp_re;
1379 
1380  tmp_im = I3.x * J4.w + I3.y * J4.z;
1381  tmp_im += I3.z * J5.y + I3.w * J5.x;
1382  tmp_im += I4.x * J5.w + I4.y * J5.z;
1383  accum_im[11*blockDim.x] = tmp_im;
1384 
1385  //30 component:
1386  tmp_re = I4.z * J0.x - I4.w * J0.y;
1387  tmp_re += I5.x * J0.z - I5.y * J0.w;
1388  tmp_re += I5.z * J1.x - I5.w * J1.y;
1389  accum_re[12*blockDim.x] = tmp_re;
1390 
1391  tmp_im = I4.z * J0.y + I4.w * J0.x;
1392  tmp_im += I5.x * J0.w + I5.y * J0.z;
1393  tmp_im += I5.z * J1.y + I5.w * J1.x;
1394  accum_im[12*blockDim.x] = tmp_im;
1395 
1396  //31 component:
1397  tmp_re = I4.z * J1.z - I4.w * J1.w;
1398  tmp_re += I5.x * J2.x - I5.y * J2.y;
1399  tmp_re += I5.z * J2.z - I5.w * J2.w;
1400  accum_re[13*blockDim.x] = tmp_re;
1401 
1402  tmp_im = I4.z * J1.w + I4.w * J1.z;
1403  tmp_im += I5.x * J2.y + I5.y * J2.x;
1404  tmp_im += I5.z * J2.w + I5.w * J2.z;
1405  accum_im[13*blockDim.x] = tmp_im;
1406 
1407  //32 component:
1408  tmp_re = I4.z * J3.x - I4.w * J3.y;
1409  tmp_re += I5.x * J3.z - I5.y * J3.w;
1410  tmp_re += I5.z * J4.x - I5.w * J4.y;
1411  accum_re[14*blockDim.x] = tmp_re;
1412 
1413  tmp_im = I4.z * J3.y + I4.w * J3.x;
1414  tmp_im += I5.x * J3.w + I5.y * J3.z;
1415  tmp_im += I5.z * J4.y + I5.w * J4.x;
1416  accum_im[14*blockDim.x] = tmp_im;
1417 
1418  //33 component:
1419  tmp_re = I4.z * J4.z - I4.w * J4.w;
1420  tmp_re += I5.x * J5.x - I5.y * J5.y;
1421  tmp_re += I5.z * J5.z - I5.w * J5.w;
1422  accum_re[15*blockDim.x] = tmp_re;
1423 
1424  tmp_im = I4.z * J4.w + I4.w * J4.z;
1425  tmp_im += I5.x * J5.y + I5.y * J5.x;
1426  tmp_im += I5.z * J5.w + I5.w * J5.z;
1427  accum_im[15*blockDim.x] = tmp_im;
1428 
1429  //Store output back to global buffer:
1430 
1431 
1432  /* CONTRACTION FULL VOLUME */
1433 
1434  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1435  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1436  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1437  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1438  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1439  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1440  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1441  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1442  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1443  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1444  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1445  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1446  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1447  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1448  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1449  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1450 
1451  return;
1452 }
1453 
1454 __global__ void contractKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
1455 {
1456  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1457  int outId = sid;
1458  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1459 
1460  if (sid >= param.threads)
1461  return;
1462 
1463  volatile float2 tmp;
1464  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1465 
1466  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1467  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1468 
1469  eutId = 2*sid;
1470  auxCoord1 = eutId / param.dc.X[0];
1471  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
1472  auxCoord2 = auxCoord1 / param.dc.X[1];
1473  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
1474  xCoord4 = auxCoord2 / param.dc.X[2];
1475  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
1476 
1477  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1478  xCoord1 += auxCoord1;
1479  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
1480 
1481  //Load the full input spinors:
1482 
1483  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1484  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1485 
1486  //compute in1^dag:
1487 
1488  I0.y = -I0.y;
1489  I0.w = -I0.w;
1490  I1.y = -I1.y;
1491  I1.w = -I1.w;
1492  I2.y = -I2.y;
1493  I2.w = -I2.w;
1494  I3.y = -I3.y;
1495  I3.w = -I3.w;
1496  I4.y = -I4.y;
1497  I4.w = -I4.w;
1498  I5.y = -I5.y;
1499  I5.w = -I5.w;
1500 
1501  //do products for first color component here:
1502  //00 component:
1503  tmp_re = I0.x * J0.x - I0.y * J0.y;
1504  tmp_re += I0.z * J0.z - I0.w * J0.w;
1505  tmp_re += I1.x * J1.x - I1.y * J1.y;
1506  accum_re[0*blockDim.x] = tmp_re;
1507 
1508  tmp_im = I0.x * J0.y + I0.y * J0.x;
1509  tmp_im += I0.z * J0.w + I0.w * J0.z;
1510  tmp_im += I1.x * J1.y + I1.y * J1.x;
1511  accum_im[0*blockDim.x] = tmp_im;
1512 
1513  //01 component:
1514  tmp_re = I0.x * J1.z - I0.y * J1.w;
1515  tmp_re += I0.z * J2.x - I0.w * J2.y;
1516  tmp_re += I1.x * J2.z - I1.y * J2.w;
1517  accum_re[1*blockDim.x] = tmp_re;
1518 
1519  tmp_im = I0.x * J1.w + I0.y * J1.z;
1520  tmp_im += I0.z * J2.y + I0.w * J2.x;
1521  tmp_im += I1.x * J2.w + I1.y * J2.z;
1522  accum_im[1*blockDim.x] = tmp_im;
1523 
1524  //02 component:
1525  tmp_re = I0.x * J3.x - I0.y * J3.y;
1526  tmp_re += I0.z * J3.z - I0.w * J3.w;
1527  tmp_re += I1.x * J4.x - I1.y * J4.y;
1528  accum_re[2*blockDim.x] = tmp_re;
1529 
1530  tmp_im = I0.x * J3.y + I0.y * J3.x;
1531  tmp_im += I0.z * J3.w + I0.w * J3.z;
1532  tmp_im += I1.x * J4.y + I1.y * J4.x;
1533  accum_im[2*blockDim.x] = tmp_im;
1534 
1535  //03 component:
1536  tmp_re = I0.x * J4.z - I0.y * J4.w;
1537  tmp_re += I0.z * J5.x - I0.w * J5.x;
1538  tmp_re += I1.x * J5.z - I1.y * J5.w;
1539 
1540  accum_re[3*blockDim.x] = tmp_re;
1541 
1542  tmp_im = I0.x * J4.w + I0.y * J4.z;
1543  tmp_im += I0.z * J5.y + I0.w * J5.y;
1544  tmp_im += I1.x * J5.w + I1.y * J5.z;
1545  accum_im[3*blockDim.x] = tmp_im;
1546 
1547  //10 component:
1548  tmp_re = I1.z * J0.x - I1.w * J0.y;
1549  tmp_re += I2.x * J0.z - I2.y * J0.w;
1550  tmp_re += I2.z * J1.x - I2.w * J1.y;
1551  accum_re[4*blockDim.x] = tmp_re;
1552 
1553  tmp_im = I1.z * J0.y + I1.w * J0.x;
1554  tmp_im += I2.x * J0.w + I2.y * J0.z;
1555  tmp_im += I2.z * J1.y + I2.w * J1.x;
1556  accum_im[4*blockDim.x] = tmp_im;
1557 
1558  //11 component:
1559  tmp_re = I1.z * J1.z - I1.w * J1.w;
1560  tmp_re += I2.x * J2.x - I2.y * J2.y;
1561  tmp_re += I2.z * J2.z - I2.w * J2.w;
1562  accum_re[5*blockDim.x] = tmp_re;
1563 
1564  tmp_im = I1.z * J1.w + I1.w * J1.z;
1565  tmp_im += I2.x * J2.y + I2.y * J2.x;
1566  tmp_im += I2.z * J2.w + I2.w * J2.z;
1567  accum_im[5*blockDim.x] = tmp_im;
1568 
1569  //12 component:
1570  tmp_re = I1.z * J3.x - I1.w * J3.y;
1571  tmp_re += I2.x * J3.z - I2.y * J3.w;
1572  tmp_re += I2.z * J4.x - I2.w * J4.y;
1573  accum_re[6*blockDim.x] = tmp_re;
1574 
1575  tmp_im = I1.z * J3.y + I1.w * J3.x;
1576  tmp_im += I2.x * J3.w + I2.y * J3.z;
1577  tmp_im += I2.z * J4.y + I2.w * J4.x;
1578  accum_im[6*blockDim.x] = tmp_im;
1579 
1580  //13 component:
1581  tmp_re = I1.z * J4.z - I1.w * J4.w;
1582  tmp_re += I2.x * J5.x - I2.y * J5.y;
1583  tmp_re += I2.z * J5.z - I2.w * J5.w;
1584  accum_re[7*blockDim.x] = tmp_re;
1585 
1586  tmp_im = I1.z * J4.w + I1.w * J4.z;
1587  tmp_im += I2.x * J5.y + I2.y * J5.x;
1588  tmp_im += I2.z * J5.w + I2.w * J5.z;
1589  accum_im[7*blockDim.x] = tmp_im;
1590 
1591  //20 component:
1592  tmp_re = I3.x * J0.x - I3.y * J0.y;
1593  tmp_re += I3.z * J0.z - I3.w * J0.w;
1594  tmp_re += I4.x * J1.x - I4.y * J1.y;
1595  accum_re[8*blockDim.x] = tmp_re;
1596 
1597  tmp_im = I3.x * J0.y + I3.y * J0.x;
1598  tmp_im += I3.z * J0.w + I3.w * J0.z;
1599  tmp_im += I4.x * J1.y + I4.y * J1.x;
1600  accum_im[8*blockDim.x] = tmp_im;
1601 
1602  //21 component:
1603  tmp_re = I3.x * J1.z - I3.y * J1.w;
1604  tmp_re += I3.z * J2.x - I3.w * J2.y;
1605  tmp_re += I4.x * J2.z - I4.y * J2.w;
1606  accum_re[9*blockDim.x] = tmp_re;
1607 
1608  tmp_im = I3.x * J1.w + I3.y * J1.z;
1609  tmp_im += I3.z * J2.y + I3.w * J2.x;
1610  tmp_im += I4.x * J2.w + I4.y * J2.z;
1611  accum_im[9*blockDim.x] = tmp_im;
1612 
1613  //22 component:
1614  tmp_re = I3.x * J3.x - I3.y * J3.y;
1615  tmp_re += I3.z * J3.z - I3.w * J3.w;
1616  tmp_re += I4.x * J4.x - I4.y * J4.y;
1617  accum_re[10*blockDim.x] = tmp_re;
1618 
1619  tmp_im = I3.x * J3.y + I3.y * J3.x;
1620  tmp_im += I3.z * J3.w + I3.w * J3.z;
1621  tmp_im += I4.x * J4.y + I4.y * J4.x;
1622  accum_im[10*blockDim.x] = tmp_im;
1623 
1624  //23 component:
1625  tmp_re = I3.x * J4.z - I3.y * J4.w;
1626  tmp_re += I3.z * J5.x - I3.w * J5.y;
1627  tmp_re += I4.x * J5.z - I4.y * J5.w;
1628  accum_re[11*blockDim.x] = tmp_re;
1629 
1630  tmp_im = I3.x * J4.w + I3.y * J4.z;
1631  tmp_im += I3.z * J5.y + I3.w * J5.x;
1632  tmp_im += I4.x * J5.w + I4.y * J5.z;
1633  accum_im[11*blockDim.x] = tmp_im;
1634 
1635  //30 component:
1636  tmp_re = I4.z * J0.x - I4.w * J0.y;
1637  tmp_re += I5.x * J0.z - I5.y * J0.w;
1638  tmp_re += I5.z * J1.x - I5.w * J1.y;
1639  accum_re[12*blockDim.x] = tmp_re;
1640 
1641  tmp_im = I4.z * J0.y + I4.w * J0.x;
1642  tmp_im += I5.x * J0.w + I5.y * J0.z;
1643  tmp_im += I5.z * J1.y + I5.w * J1.x;
1644  accum_im[12*blockDim.x] = tmp_im;
1645 
1646  //31 component:
1647  tmp_re = I4.z * J1.z - I4.w * J1.w;
1648  tmp_re += I5.x * J2.x - I5.y * J2.y;
1649  tmp_re += I5.z * J2.z - I5.w * J2.w;
1650  accum_re[13*blockDim.x] = tmp_re;
1651 
1652  tmp_im = I4.z * J1.w + I4.w * J1.z;
1653  tmp_im += I5.x * J2.y + I5.y * J2.x;
1654  tmp_im += I5.z * J2.w + I5.w * J2.z;
1655  accum_im[13*blockDim.x] = tmp_im;
1656 
1657  //32 component:
1658  tmp_re = I4.z * J3.x - I4.w * J3.y;
1659  tmp_re += I5.x * J3.z - I5.y * J3.w;
1660  tmp_re += I5.z * J4.x - I5.w * J4.y;
1661  accum_re[14*blockDim.x] = tmp_re;
1662 
1663  tmp_im = I4.z * J3.y + I4.w * J3.x;
1664  tmp_im += I5.x * J3.w + I5.y * J3.z;
1665  tmp_im += I5.z * J4.y + I5.w * J4.x;
1666  accum_im[14*blockDim.x] = tmp_im;
1667 
1668  //33 component:
1669  tmp_re = I4.z * J4.z - I4.w * J4.w;
1670  tmp_re += I5.x * J5.x - I5.y * J5.y;
1671  tmp_re += I5.z * J5.z - I5.w * J5.w;
1672  accum_re[15*blockDim.x] = tmp_re;
1673 
1674  tmp_im = I4.z * J4.w + I4.w * J4.z;
1675  tmp_im += I5.x * J5.y + I5.y * J5.x;
1676  tmp_im += I5.z * J5.w + I5.w * J5.z;
1677  accum_im[15*blockDim.x] = tmp_im;
1678 
1679  //Store output back to global buffer:
1680 
1681  /* CONTRACTION FULL VOLUME */
1682 
1683  out[outId + 0 *param.threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1684  out[outId + 1 *param.threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1685  out[outId + 2 *param.threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1686  out[outId + 3 *param.threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1687  out[outId + 4 *param.threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1688  out[outId + 5 *param.threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1689  out[outId + 6 *param.threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1690  out[outId + 7 *param.threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1691  out[outId + 8 *param.threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1692  out[outId + 9 *param.threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1693  out[outId + 10*param.threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1694  out[outId + 11*param.threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1695  out[outId + 12*param.threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1696  out[outId + 13*param.threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1697  out[outId + 14*param.threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1698  out[outId + 15*param.threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1699 
1700  return;
1701 }
1702 
1703 #undef SPINORTEX
1704 #undef INTERTEX
1705 
1706 #undef tmp_re
1707 #undef tmp_im
1708 
1709 #undef READ_SPINOR
1710 #undef READ_INTERMEDIATE_SPINOR
1711 
1712 #undef SPINOR_HOP
1713 
1714 #endif //_TWIST_QUDA_CONTRACT
__global__ void contractKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
dim3 dim3 blockDim
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
#define TOTAL_COMPONENTS
Definition: contract_core.h:11
QudaGaugeParam param
Definition: pack_test.cpp:17
#define SPINORTEX
#define READ_SPINOR
#define READ_INTERMEDIATE_SPINOR
__global__ void contractTsliceKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
int X[4]
Definition: quda.h:29
cpuColorSpinorField * out
#define tmp_re
Definition: contract_core.h:8
#define INTERTEX
#define tmp_im
Definition: contract_core.h:9
__global__ void contractGamma5Kernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core.h:65