QUDA  0.9.0
contract_core_plus.h
Go to the documentation of this file.
1 #ifndef _TWIST_QUDA_CONTRACT_PLUS
2 #define _TWIST_QUDA_CONTRACT_PLUS
3 
4 #define tmp_re tmp.x
5 #define tmp_im tmp.y
6 
7 #define TOTAL_COMPONENTS 16
8 
9 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
10  double2 J0 = spinor[sp_idx + 0*(stride)]; \
11  double2 J1 = spinor[sp_idx + 1*(stride)]; \
12  double2 J2 = spinor[sp_idx + 2*(stride)]; \
13  double2 J3 = spinor[sp_idx + 3*(stride)]; \
14  double2 J4 = spinor[sp_idx + 4*(stride)]; \
15  double2 J5 = spinor[sp_idx + 5*(stride)]; \
16  double2 J6 = spinor[sp_idx + 6*(stride)]; \
17  double2 J7 = spinor[sp_idx + 7*(stride)]; \
18  double2 J8 = spinor[sp_idx + 8*(stride)]; \
19  double2 J9 = spinor[sp_idx + 9*(stride)]; \
20  double2 J10 = spinor[sp_idx +10*(stride)]; \
21  double2 J11 = spinor[sp_idx +11*(stride)];
22 
23 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
24  double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
25  double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
26  double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
27  double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
28  double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
29  double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
30  double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
31  double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
32  double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
33  double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
34  double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
35  double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
36 
37 #ifdef DIRECT_ACCESS_WILSON_SPINOR
38  #define READ_SPINOR READ_SPINOR_DOUBLE
39  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
40  #define SPINORTEX in
41  #define INTERTEX in2
42 #else
43  #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
44  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
45 
46  #ifdef USE_TEXTURE_OBJECTS
47  #define SPINORTEX param.inTex
48  #define INTERTEX param.outTex
49  #else
50  #define SPINORTEX spinorTexDouble
51  #define INTERTEX interTexDouble
52  #endif // USE_TEXTURE_OBJECTS
53 
54 #define SPINOR_HOP 12
55 
56 #endif
57 
58 __global__ void contractGamma5PlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
59 {
60  int sid = blockIdx.x*blockDim.x + threadIdx.x;
61  int outId = sid;
62  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
63 
64  if (sid >= param.threads)
65  return;
66 
67  volatile double2 tmp;
68  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
69 
70  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
71  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
72 
73  eutId = 2*sid;
74  auxCoord1 = eutId / param.dc.X[0];
75  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
76  auxCoord2 = auxCoord1 / param.dc.X[1];
77  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
78  xCoord4 = auxCoord2 / param.dc.X[2];
79  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
80 
81  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
82  xCoord1 += auxCoord1;
83  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
84 
85  READ_SPINOR (SPINORTEX, myStride, sid, sid);
87 
88  //compute in1^dag * gamma5:
89 
90  tmp_re = +I0.x;
91  tmp_im = -I0.y;
92  I0.x = +I6.x;
93  I0.y = -I6.y;
94  I6.x = tmp_re;
95  I6.y = tmp_im;
96 
97  tmp_re = +I3.x;
98  tmp_im = -I3.y;
99  I3.x = +I9.x;
100  I3.y = -I9.y;
101  I9.x = tmp_re;
102  I9.y = tmp_im;
103 
104  tmp_re = +I1.x;
105  tmp_im = -I1.y;
106  I1.x = +I7.x;
107  I1.y = -I7.y;
108  I7.x = tmp_re;
109  I7.y = tmp_im;
110 
111  tmp_re = +I4.x;
112  tmp_im = -I4.y;
113  I4.x = +I10.x;
114  I4.y = -I10.y;
115  I10.x = tmp_re;
116  I10.y = tmp_im;
117 
118  tmp_re = +I2.x;
119  tmp_im = -I2.y;
120  I2.x = +I8.x;
121  I2.y = -I8.y;
122  I8.x = tmp_re;
123  I8.y = tmp_im;
124 
125  tmp_re = +I5.x;
126  tmp_im = -I5.y;
127  I5.x = +I11.x;
128  I5.y = -I11.y;
129  I11.x = tmp_re;
130  I11.y = tmp_im;
131 
132  //do products for all color component here:
133  //00 component:
134  tmp_re = I0.x * J0.x - I0.y * J0.y;
135  tmp_re += I1.x * J1.x - I1.y * J1.y;
136  tmp_re += I2.x * J2.x - I2.y * J2.y;
137  accum_re[0*blockDim.x] = tmp_re;
138 
139  tmp_im = I0.x * J0.y + I0.y * J0.x;
140  tmp_im += I1.x * J1.y + I1.y * J1.x;
141  tmp_im += I2.x * J2.y + I2.y * J2.x;
142  accum_im[0*blockDim.x] = tmp_im;
143 
144  //01 component:
145  tmp_re = I0.x * J3.x - I0.y * J3.y;
146  tmp_re += I1.x * J4.x - I1.y * J4.y;
147  tmp_re += I2.x * J5.x - I2.y * J5.y;
148  accum_re[1*blockDim.x] = tmp_re;
149 
150  tmp_im = I0.x * J3.y + I0.y * J3.x;
151  tmp_im += I1.x * J4.y + I1.y * J4.x;
152  tmp_im += I2.x * J5.y + I2.y * J5.x;
153  accum_im[1*blockDim.x] = tmp_im;
154 
155  //02 component:
156  tmp_re = I0.x * J6.x - I0.y * J6.y;
157  tmp_re += I1.x * J7.x - I1.y * J7.y;
158  tmp_re += I2.x * J8.x - I2.y * J8.y;
159  accum_re[2*blockDim.x] = tmp_re;
160 
161  tmp_im = I0.x * J6.y + I0.y * J6.x;
162  tmp_im += I1.x * J7.y + I1.y * J7.x;
163  tmp_im += I2.x * J8.y + I2.y * J8.x;
164  accum_im[2*blockDim.x] = tmp_im;
165 
166  //03 component:
167  tmp_re = I0.x * J9.x - I0.y * J9.y;
168  tmp_re += I1.x * J10.x - I1.y * J10.y;
169  tmp_re += I2.x * J11.x - I2.y * J11.y;
170  accum_re[3*blockDim.x] = tmp_re;
171 
172  tmp_im = I0.x * J9.y + I0.y * J9.x;
173  tmp_im += I1.x * J10.y + I1.y * J10.x;
174  tmp_im += I2.x * J11.y + I2.y * J11.x;
175  accum_im[3*blockDim.x] = tmp_im;
176 
177  //10 component:
178  tmp_re = I3.x * J0.x - I3.y * J0.y;
179  tmp_re += I4.x * J1.x - I4.y * J1.y;
180  tmp_re += I5.x * J2.x - I5.y * J2.y;
181  accum_re[ 4*blockDim.x] = tmp_re;
182 
183  tmp_im = I3.x * J0.y + I3.y * J0.x;
184  tmp_im += I4.x * J1.y + I4.y * J1.x;
185  tmp_im += I5.x * J2.y + I5.y * J2.x;
186  accum_im[ 4*blockDim.x] = tmp_im;
187 
188  //11 component:
189  tmp_re = I3.x * J3.x - I3.y * J3.y;
190  tmp_re += I4.x * J4.x - I4.y * J4.y;
191  tmp_re += I5.x * J5.x - I5.y * J5.y;
192  accum_re[ 5*blockDim.x] = tmp_re;
193 
194  tmp_im = I3.x * J3.y + I3.y * J3.x;
195  tmp_im += I4.x * J4.y + I4.y * J4.x;
196  tmp_im += I5.x * J5.y + I5.y * J5.x;
197  accum_im[ 5*blockDim.x] = tmp_im;
198 
199  //12 component:
200  tmp_re = I3.x * J6.x - I3.y * J6.y;
201  tmp_re += I4.x * J7.x - I4.y * J7.y;
202  tmp_re += I5.x * J8.x - I5.y * J8.y;
203  accum_re[ 6*blockDim.x] = tmp_re;
204 
205  tmp_im = I3.x * J6.y + I3.y * J6.x;
206  tmp_im += I4.x * J7.y + I4.y * J7.x;
207  tmp_im += I5.x * J8.y + I5.y * J8.x;
208  accum_im[ 6*blockDim.x] = tmp_im;
209 
210  //13 component:
211  tmp_re = I3.x * J9.x - I3.y * J9.y;
212  tmp_re += I4.x * J10.x - I4.y * J10.y;
213  tmp_re += I5.x * J11.x - I5.y * J11.y;
214  accum_re[ 7*blockDim.x] = tmp_re;
215 
216  tmp_im = I3.x * J9.y + I3.y * J9.x;
217  tmp_im += I4.x * J10.y + I4.y * J10.x;
218  tmp_im += I5.x * J11.y + I5.y * J11.x;
219  accum_im[ 7*blockDim.x] = tmp_im;
220 
221  //20 component:
222  tmp_re = I6.x * J0.x - I6.y * J0.y;
223  tmp_re += I7.x * J1.x - I7.y * J1.y;
224  tmp_re += I8.x * J2.x - I8.y * J2.y;
225  accum_re[ 8*blockDim.x] = tmp_re;
226 
227  tmp_im = I6.x * J0.y + I6.y * J0.x;
228  tmp_im += I7.x * J1.y + I7.y * J1.x;
229  tmp_im += I8.x * J2.y + I8.y * J2.x;
230  accum_im[ 8*blockDim.x] = tmp_im;
231 
232  //21 component:
233  tmp_re = I6.x * J3.x - I6.y * J3.y;
234  tmp_re += I7.x * J4.x - I7.y * J4.y;
235  tmp_re += I8.x * J5.x - I8.y * J5.y;
236  accum_re[ 9*blockDim.x] = tmp_re;
237 
238  tmp_im = I6.x * J3.y + I6.y * J3.x;
239  tmp_im += I7.x * J4.y + I7.y * J4.x;
240  tmp_im += I8.x * J5.y + I8.y * J5.x;
241  accum_im[ 9*blockDim.x] = tmp_im;
242 
243  //22 component:
244  tmp_re = I6.x * J6.x - I6.y * J6.y;
245  tmp_re += I7.x * J7.x - I7.y * J7.y;
246  tmp_re += I8.x * J8.x - I8.y * J8.y;
247  accum_re[10*blockDim.x] = tmp_re;
248 
249  tmp_im = I6.x * J6.y + I6.y * J6.x;
250  tmp_im += I7.x * J7.y + I7.y * J7.x;
251  tmp_im += I8.x * J8.y + I8.y * J8.x;
252  accum_im[10*blockDim.x] = tmp_im;
253 
254  //23 component:
255  tmp_re = I6.x * J9.x - I6.y * J9.y;
256  tmp_re += I7.x * J10.x - I7.y * J10.y;
257  tmp_re += I8.x * J11.x - I8.y * J11.y;
258  accum_re[11*blockDim.x] = tmp_re;
259 
260  tmp_im = I6.x * J9.y + I6.y * J9.x;
261  tmp_im += I7.x * J10.y + I7.y * J10.x;
262  tmp_im += I8.x * J11.y + I8.y * J11.x;
263  accum_im[11*blockDim.x] = tmp_im;
264 
265  //30 component:
266  tmp_re = I9.x * J0.x - I9.y * J0.y;
267  tmp_re += I10.x * J1.x - I10.y * J1.y;
268  tmp_re += I11.x * J2.x - I11.y * J2.y;
269  accum_re[12*blockDim.x] = tmp_re;
270 
271  tmp_im = I9.x * J0.y + I9.y * J0.x;
272  tmp_im += I10.x * J1.y + I10.y * J1.x;
273  tmp_im += I11.x * J2.y + I11.y * J2.x;
274  accum_im[12*blockDim.x] = tmp_im;
275 
276  //31 component:
277  tmp_re = I9.x * J3.x - I9.y * J3.y;
278  tmp_re += I10.x * J4.x - I10.y * J4.y;
279  tmp_re += I11.x * J5.x - I11.y * J5.y;
280  accum_re[13*blockDim.x] = tmp_re;
281 
282  tmp_im = I9.x * J3.y + I9.y * J3.x;
283  tmp_im += I10.x * J4.y + I10.y * J4.x;
284  tmp_im += I11.x * J5.y + I11.y * J5.x;
285  accum_im[13*blockDim.x] = tmp_im;
286 
287  //32 component:
288  tmp_re = I9.x * J6.x - I9.y * J6.y;
289  tmp_re += I10.x * J7.x - I10.y * J7.y;
290  tmp_re += I11.x * J8.x - I11.y * J8.y;
291  accum_re[14*blockDim.x] = tmp_re;
292 
293  tmp_im = I9.x * J6.y + I9.y * J6.x;
294  tmp_im += I10.x * J7.y + I10.y * J7.x;
295  tmp_im += I11.x * J8.y + I11.y * J8.x;
296  accum_im[14*blockDim.x] = tmp_im;
297 
298  //33 component:
299  tmp_re = I9.x * J9.x - I9.y * J9.y;
300  tmp_re += I10.x * J10.x - I10.y * J10.y;
301  tmp_re += I11.x * J11.x - I11.y * J11.y;
302  accum_re[15*blockDim.x] = tmp_re;
303 
304  tmp_im = I9.x * J9.y + I9.y * J9.x;
305  tmp_im += I10.x * J10.y + I10.y * J10.x;
306  tmp_im += I11.x * J11.y + I11.y * J11.x;
307  accum_im[15*blockDim.x] = tmp_im;
308 
309  //Store output back to global buffer:
310 
311 
312 /* CONTRACTION FULL VOLUME */
313 
314  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
315  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
316  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
317  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
318  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
319  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
320  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
321  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
322  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
323  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
324  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
325  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
326  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
327  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
328  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
329  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
330 
331  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
332  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
333  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
334  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
335  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
336  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
337  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
338  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
339  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
340  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
341  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
342  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
343  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
344  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
345  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
346  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
347 
348  return;
349 }
350 
351 //Perform trace in color space only and for a given tslice
352 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
353 __global__ void contractTslicePlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
354 {
355  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
356  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
357  int inId = sid + param.Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
358  int outId;
359  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
360 
361  if (sid >= param.threads) //param.threads == tslice volume
362  return;
363 
364  volatile double2 tmp;
365  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
366 
367  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
368  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
369 
370 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
371 
372  eutId = 2*inId;
373  auxCoord1 = eutId / param.dc.X[0];
374  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
375  auxCoord2 = auxCoord1 / param.dc.X[1];
376  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
377  xCoord4 = auxCoord2 / param.dc.X[2];
378  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
379 
380  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
381  xCoord1 += auxCoord1;
382  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
383 
384  READ_SPINOR (SPINORTEX, myStride, sid, sid);
386 
387  //compute in1^dag:
388 
389  I0.y = -I0.y;
390  I1.y = -I1.y;
391  I2.y = -I2.y;
392  I3.y = -I3.y;
393  I4.y = -I4.y;
394  I5.y = -I5.y;
395  I6.y = -I6.y;
396  I7.y = -I7.y;
397  I8.y = -I8.y;
398  I9.y = -I9.y;
399  I10.y = -I10.y;
400  I11.y = -I11.y;
401 
402  //do products for all color component here:
403  //00 component:
404  tmp_re = I0.x * J0.x - I0.y * J0.y;
405  tmp_re += I1.x * J1.x - I1.y * J1.y;
406  tmp_re += I2.x * J2.x - I2.y * J2.y;
407  accum_re[0*blockDim.x] = tmp_re;
408 
409  tmp_im = I0.x * J0.y + I0.y * J0.x;
410  tmp_im += I1.x * J1.y + I1.y * J1.x;
411  tmp_im += I2.x * J2.y + I2.y * J2.x;
412  accum_im[0*blockDim.x] = tmp_im;
413 
414  //01 component:
415  tmp_re = I0.x * J3.x - I0.y * J3.y;
416  tmp_re += I1.x * J4.x - I1.y * J4.y;
417  tmp_re += I2.x * J5.x - I2.y * J5.y;
418  accum_re[1*blockDim.x] = tmp_re;
419 
420  tmp_im = I0.x * J3.y + I0.y * J3.x;
421  tmp_im += I1.x * J4.y + I1.y * J4.x;
422  tmp_im += I2.x * J5.y + I2.y * J5.x;
423  accum_im[1*blockDim.x] = tmp_im;
424 
425  //02 component:
426  tmp_re = I0.x * J6.x - I0.y * J6.y;
427  tmp_re += I1.x * J7.x - I1.y * J7.y;
428  tmp_re += I2.x * J8.x - I2.y * J8.y;
429  accum_re[2*blockDim.x] = tmp_re;
430 
431  tmp_im = I0.x * J6.y + I0.y * J6.x;
432  tmp_im += I1.x * J7.y + I1.y * J7.x;
433  tmp_im += I2.x * J8.y + I2.y * J8.x;
434  accum_im[2*blockDim.x] = tmp_im;
435 
436  //03 component:
437  tmp_re = I0.x * J9.x - I0.y * J9.y;
438  tmp_re += I1.x * J10.x - I1.y * J10.y;
439  tmp_re += I2.x * J11.x - I2.y * J11.y;
440  accum_re[3*blockDim.x] = tmp_re;
441 
442  tmp_im = I0.x * J9.y + I0.y * J9.x;
443  tmp_im += I1.x * J10.y + I1.y * J10.x;
444  tmp_im += I2.x * J11.y + I2.y * J11.x;
445  accum_im[3*blockDim.x] = tmp_im;
446 
447  //10 component:
448  tmp_re = I3.x * J0.x - I3.y * J0.y;
449  tmp_re += I4.x * J1.x - I4.y * J1.y;
450  tmp_re += I5.x * J2.x - I5.y * J2.y;
451  accum_re[ 4*blockDim.x] = tmp_re;
452 
453  tmp_im = I3.x * J0.y + I3.y * J0.x;
454  tmp_im += I4.x * J1.y + I4.y * J1.x;
455  tmp_im += I5.x * J2.y + I5.y * J2.x;
456  accum_im[ 4*blockDim.x] = tmp_im;
457 
458  //11 component:
459  tmp_re = I3.x * J3.x - I3.y * J3.y;
460  tmp_re += I4.x * J4.x - I4.y * J4.y;
461  tmp_re += I5.x * J5.x - I5.y * J5.y;
462  accum_re[ 5*blockDim.x] = tmp_re;
463 
464  tmp_im = I3.x * J3.y + I3.y * J3.x;
465  tmp_im += I4.x * J4.y + I4.y * J4.x;
466  tmp_im += I5.x * J5.y + I5.y * J5.x;
467  accum_im[ 5*blockDim.x] = tmp_im;
468 
469  //12 component:
470  tmp_re = I3.x * J6.x - I3.y * J6.y;
471  tmp_re += I4.x * J7.x - I4.y * J7.y;
472  tmp_re += I5.x * J8.x - I5.y * J8.y;
473  accum_re[ 6*blockDim.x] = tmp_re;
474 
475  tmp_im = I3.x * J6.y + I3.y * J6.x;
476  tmp_im += I4.x * J7.y + I4.y * J7.x;
477  tmp_im += I5.x * J8.y + I5.y * J8.x;
478  accum_im[ 6*blockDim.x] = tmp_im;
479 
480  //13 component:
481  tmp_re = I3.x * J9.x - I3.y * J9.y;
482  tmp_re += I4.x * J10.x - I4.y * J10.y;
483  tmp_re += I5.x * J11.x - I5.y * J11.y;
484  accum_re[ 7*blockDim.x] = tmp_re;
485 
486  tmp_im = I3.x * J9.y + I3.y * J9.x;
487  tmp_im += I4.x * J10.y + I4.y * J10.x;
488  tmp_im += I5.x * J11.y + I5.y * J11.x;
489  accum_im[ 7*blockDim.x] = tmp_im;
490 
491  //20 component:
492  tmp_re = I6.x * J0.x - I6.y * J0.y;
493  tmp_re += I7.x * J1.x - I7.y * J1.y;
494  tmp_re += I8.x * J2.x - I8.y * J2.y;
495  accum_re[ 8*blockDim.x] = tmp_re;
496 
497  tmp_im = I6.x * J0.y + I6.y * J0.x;
498  tmp_im += I7.x * J1.y + I7.y * J1.x;
499  tmp_im += I8.x * J2.y + I8.y * J2.x;
500  accum_im[ 8*blockDim.x] = tmp_im;
501 
502  //21 component:
503  tmp_re = I6.x * J3.x - I6.y * J3.y;
504  tmp_re += I7.x * J4.x - I7.y * J4.y;
505  tmp_re += I8.x * J5.x - I8.y * J5.y;
506  accum_re[ 9*blockDim.x] = tmp_re;
507 
508  tmp_im = I6.x * J3.y + I6.y * J3.x;
509  tmp_im += I7.x * J4.y + I7.y * J4.x;
510  tmp_im += I8.x * J5.y + I8.y * J5.x;
511  accum_im[ 9*blockDim.x] = tmp_im;
512 
513  //22 component:
514  tmp_re = I6.x * J6.x - I6.y * J6.y;
515  tmp_re += I7.x * J7.x - I7.y * J7.y;
516  tmp_re += I8.x * J8.x - I8.y * J8.y;
517  accum_re[10*blockDim.x] = tmp_re;
518 
519  tmp_im = I6.x * J6.y + I6.y * J6.x;
520  tmp_im += I7.x * J7.y + I7.y * J7.x;
521  tmp_im += I8.x * J8.y + I8.y * J8.x;
522  accum_im[10*blockDim.x] = tmp_im;
523 
524  //23 component:
525  tmp_re = I6.x * J9.x - I6.y * J9.y;
526  tmp_re += I7.x * J10.x - I7.y * J10.y;
527  tmp_re += I8.x * J11.x - I8.y * J11.y;
528  accum_re[11*blockDim.x] = tmp_re;
529 
530  tmp_im = I6.x * J9.y + I6.y * J9.x;
531  tmp_im += I7.x * J10.y + I7.y * J10.x;
532  tmp_im += I8.x * J11.y + I8.y * J11.x;
533  accum_im[11*blockDim.x] = tmp_im;
534 
535  //30 component:
536  tmp_re = I9.x * J0.x - I9.y * J0.y;
537  tmp_re += I10.x * J1.x - I10.y * J1.y;
538  tmp_re += I11.x * J2.x - I11.y * J2.y;
539  accum_re[12*blockDim.x] = tmp_re;
540 
541  tmp_im = I9.x * J0.y + I9.y * J0.x;
542  tmp_im += I10.x * J1.y + I10.y * J1.x;
543  tmp_im += I11.x * J2.y + I11.y * J2.x;
544  accum_im[12*blockDim.x] = tmp_im;
545 
546  //31 component:
547  tmp_re = I9.x * J3.x - I9.y * J3.y;
548  tmp_re += I10.x * J4.x - I10.y * J4.y;
549  tmp_re += I11.x * J5.x - I11.y * J5.y;
550  accum_re[13*blockDim.x] = tmp_re;
551 
552  tmp_im = I9.x * J3.y + I9.y * J3.x;
553  tmp_im += I10.x * J4.y + I10.y * J4.x;
554  tmp_im += I11.x * J5.y + I11.y * J5.x;
555  accum_im[13*blockDim.x] = tmp_im;
556 
557  //32 component:
558  tmp_re = I9.x * J6.x - I9.y * J6.y;
559  tmp_re += I10.x * J7.x - I10.y * J7.y;
560  tmp_re += I11.x * J8.x - I11.y * J8.y;
561  accum_re[14*blockDim.x] = tmp_re;
562 
563  tmp_im = I9.x * J6.y + I9.y * J6.x;
564  tmp_im += I10.x * J7.y + I10.y * J7.x;
565  tmp_im += I11.x * J8.y + I11.y * J8.x;
566  accum_im[14*blockDim.x] = tmp_im;
567 
568  //33 component:
569  tmp_re = I9.x * J9.x - I9.y * J9.y;
570  tmp_re += I10.x * J10.x - I10.y * J10.y;
571  tmp_re += I11.x * J11.x - I11.y * J11.y;
572  accum_re[15*blockDim.x] = tmp_re;
573 
574  tmp_im = I9.x * J9.y + I9.y * J9.x;
575  tmp_im += I10.x * J10.y + I10.y * J10.x;
576  tmp_im += I11.x * J11.y + I11.y * J11.x;
577  accum_im[15*blockDim.x] = tmp_im;
578 
579  //Store output back to global buffer:
580 
581 
582 /* CONTRACTION FULL VOLUME */
583 
584  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
585  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
586  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
587  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
588  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
589  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
590  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
591  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
592  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
593  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
594  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
595  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
596  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
597  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
598  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
599  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
600 
601  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
602  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
603  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
604  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
605  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
606  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
607  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
608  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
609  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
610  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
611  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
612  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
613  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
614  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
615  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
616  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
617 
618  return;
619 }
620 
621 __global__ void contractPlusKernel (double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
622 {
623  int sid = blockIdx.x*blockDim.x + threadIdx.x;
624  int outId = sid;
625  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
626 
627  if (sid >= param.threads)
628  return;
629 
630  volatile double2 tmp;
631  extern __shared__ double sm[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
632 
633  volatile double *accum_re = sm + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
634  volatile double *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
635 
636  eutId = 2*sid;
637  auxCoord1 = eutId / param.dc.X[0];
638  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
639  auxCoord2 = auxCoord1 / param.dc.X[1];
640  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
641  xCoord4 = auxCoord2 / param.dc.X[2];
642  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
643 
644  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
645  xCoord1 += auxCoord1;
646  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
647 
648  READ_SPINOR (SPINORTEX, myStride, sid, sid);
650 
651  //compute in1^dag:
652 
653  I0.y = -I0.y;
654  I1.y = -I1.y;
655  I2.y = -I2.y;
656  I3.y = -I3.y;
657  I4.y = -I4.y;
658  I5.y = -I5.y;
659  I6.y = -I6.y;
660  I7.y = -I7.y;
661  I8.y = -I8.y;
662  I9.y = -I9.y;
663  I10.y = -I10.y;
664  I11.y = -I11.y;
665 
666  //do products for all color component here:
667  //00 component:
668  tmp_re = I0.x * J0.x - I0.y * J0.y;
669  tmp_re += I1.x * J1.x - I1.y * J1.y;
670  tmp_re += I2.x * J2.x - I2.y * J2.y;
671  accum_re[0*blockDim.x] = tmp_re;
672 
673  tmp_im = I0.x * J0.y + I0.y * J0.x;
674  tmp_im += I1.x * J1.y + I1.y * J1.x;
675  tmp_im += I2.x * J2.y + I2.y * J2.x;
676  accum_im[0*blockDim.x] = tmp_im;
677 
678  //01 component:
679  tmp_re = I0.x * J3.x - I0.y * J3.y;
680  tmp_re += I1.x * J4.x - I1.y * J4.y;
681  tmp_re += I2.x * J5.x - I2.y * J5.y;
682  accum_re[1*blockDim.x] = tmp_re;
683 
684  tmp_im = I0.x * J3.y + I0.y * J3.x;
685  tmp_im += I1.x * J4.y + I1.y * J4.x;
686  tmp_im += I2.x * J5.y + I2.y * J5.x;
687  accum_im[1*blockDim.x] = tmp_im;
688 
689  //02 component:
690  tmp_re = I0.x * J6.x - I0.y * J6.y;
691  tmp_re += I1.x * J7.x - I1.y * J7.y;
692  tmp_re += I2.x * J8.x - I2.y * J8.y;
693  accum_re[2*blockDim.x] = tmp_re;
694 
695  tmp_im = I0.x * J6.y + I0.y * J6.x;
696  tmp_im += I1.x * J7.y + I1.y * J7.x;
697  tmp_im += I2.x * J8.y + I2.y * J8.x;
698  accum_im[2*blockDim.x] = tmp_im;
699 
700  //03 component:
701  tmp_re = I0.x * J9.x - I0.y * J9.y;
702  tmp_re += I1.x * J10.x - I1.y * J10.y;
703  tmp_re += I2.x * J11.x - I2.y * J11.y;
704  accum_re[3*blockDim.x] = tmp_re;
705 
706  tmp_im = I0.x * J9.y + I0.y * J9.x;
707  tmp_im += I1.x * J10.y + I1.y * J10.x;
708  tmp_im += I2.x * J11.y + I2.y * J11.x;
709  accum_im[3*blockDim.x] = tmp_im;
710 
711  //10 component:
712  tmp_re = I3.x * J0.x - I3.y * J0.y;
713  tmp_re += I4.x * J1.x - I4.y * J1.y;
714  tmp_re += I5.x * J2.x - I5.y * J2.y;
715  accum_re[ 4*blockDim.x] = tmp_re;
716 
717  tmp_im = I3.x * J0.y + I3.y * J0.x;
718  tmp_im += I4.x * J1.y + I4.y * J1.x;
719  tmp_im += I5.x * J2.y + I5.y * J2.x;
720  accum_im[ 4*blockDim.x] = tmp_im;
721 
722  //11 component:
723  tmp_re = I3.x * J3.x - I3.y * J3.y;
724  tmp_re += I4.x * J4.x - I4.y * J4.y;
725  tmp_re += I5.x * J5.x - I5.y * J5.y;
726  accum_re[ 5*blockDim.x] = tmp_re;
727 
728  tmp_im = I3.x * J3.y + I3.y * J3.x;
729  tmp_im += I4.x * J4.y + I4.y * J4.x;
730  tmp_im += I5.x * J5.y + I5.y * J5.x;
731  accum_im[ 5*blockDim.x] = tmp_im;
732 
733  //12 component:
734  tmp_re = I3.x * J6.x - I3.y * J6.y;
735  tmp_re += I4.x * J7.x - I4.y * J7.y;
736  tmp_re += I5.x * J8.x - I5.y * J8.y;
737  accum_re[ 6*blockDim.x] = tmp_re;
738 
739  tmp_im = I3.x * J6.y + I3.y * J6.x;
740  tmp_im += I4.x * J7.y + I4.y * J7.x;
741  tmp_im += I5.x * J8.y + I5.y * J8.x;
742  accum_im[ 6*blockDim.x] = tmp_im;
743 
744  //13 component:
745  tmp_re = I3.x * J9.x - I3.y * J9.y;
746  tmp_re += I4.x * J10.x - I4.y * J10.y;
747  tmp_re += I5.x * J11.x - I5.y * J11.y;
748  accum_re[ 7*blockDim.x] = tmp_re;
749 
750  tmp_im = I3.x * J9.y + I3.y * J9.x;
751  tmp_im += I4.x * J10.y + I4.y * J10.x;
752  tmp_im += I5.x * J11.y + I5.y * J11.x;
753  accum_im[ 7*blockDim.x] = tmp_im;
754 
755  //20 component:
756  tmp_re = I6.x * J0.x - I6.y * J0.y;
757  tmp_re += I7.x * J1.x - I7.y * J1.y;
758  tmp_re += I8.x * J2.x - I8.y * J2.y;
759  accum_re[ 8*blockDim.x] = tmp_re;
760 
761  tmp_im = I6.x * J0.y + I6.y * J0.x;
762  tmp_im += I7.x * J1.y + I7.y * J1.x;
763  tmp_im += I8.x * J2.y + I8.y * J2.x;
764  accum_im[ 8*blockDim.x] = tmp_im;
765 
766  //21 component:
767  tmp_re = I6.x * J3.x - I6.y * J3.y;
768  tmp_re += I7.x * J4.x - I7.y * J4.y;
769  tmp_re += I8.x * J5.x - I8.y * J5.y;
770  accum_re[ 9*blockDim.x] = tmp_re;
771 
772  tmp_im = I6.x * J3.y + I6.y * J3.x;
773  tmp_im += I7.x * J4.y + I7.y * J4.x;
774  tmp_im += I8.x * J5.y + I8.y * J5.x;
775  accum_im[ 9*blockDim.x] = tmp_im;
776 
777  //22 component:
778  tmp_re = I6.x * J6.x - I6.y * J6.y;
779  tmp_re += I7.x * J7.x - I7.y * J7.y;
780  tmp_re += I8.x * J8.x - I8.y * J8.y;
781  accum_re[10*blockDim.x] = tmp_re;
782 
783  tmp_im = I6.x * J6.y + I6.y * J6.x;
784  tmp_im += I7.x * J7.y + I7.y * J7.x;
785  tmp_im += I8.x * J8.y + I8.y * J8.x;
786  accum_im[10*blockDim.x] = tmp_im;
787 
788  //23 component:
789  tmp_re = I6.x * J9.x - I6.y * J9.y;
790  tmp_re += I7.x * J10.x - I7.y * J10.y;
791  tmp_re += I8.x * J11.x - I8.y * J11.y;
792  accum_re[11*blockDim.x] = tmp_re;
793 
794  tmp_im = I6.x * J9.y + I6.y * J9.x;
795  tmp_im += I7.x * J10.y + I7.y * J10.x;
796  tmp_im += I8.x * J11.y + I8.y * J11.x;
797  accum_im[11*blockDim.x] = tmp_im;
798 
799  //30 component:
800  tmp_re = I9.x * J0.x - I9.y * J0.y;
801  tmp_re += I10.x * J1.x - I10.y * J1.y;
802  tmp_re += I11.x * J2.x - I11.y * J2.y;
803  accum_re[12*blockDim.x] = tmp_re;
804 
805  tmp_im = I9.x * J0.y + I9.y * J0.x;
806  tmp_im += I10.x * J1.y + I10.y * J1.x;
807  tmp_im += I11.x * J2.y + I11.y * J2.x;
808  accum_im[12*blockDim.x] = tmp_im;
809 
810  //31 component:
811  tmp_re = I9.x * J3.x - I9.y * J3.y;
812  tmp_re += I10.x * J4.x - I10.y * J4.y;
813  tmp_re += I11.x * J5.x - I11.y * J5.y;
814  accum_re[13*blockDim.x] = tmp_re;
815 
816  tmp_im = I9.x * J3.y + I9.y * J3.x;
817  tmp_im += I10.x * J4.y + I10.y * J4.x;
818  tmp_im += I11.x * J5.y + I11.y * J5.x;
819  accum_im[13*blockDim.x] = tmp_im;
820 
821  //32 component:
822  tmp_re = I9.x * J6.x - I9.y * J6.y;
823  tmp_re += I10.x * J7.x - I10.y * J7.y;
824  tmp_re += I11.x * J8.x - I11.y * J8.y;
825  accum_re[14*blockDim.x] = tmp_re;
826 
827  tmp_im = I9.x * J6.y + I9.y * J6.x;
828  tmp_im += I10.x * J7.y + I10.y * J7.x;
829  tmp_im += I11.x * J8.y + I11.y * J8.x;
830  accum_im[14*blockDim.x] = tmp_im;
831 
832  //33 component:
833  tmp_re = I9.x * J9.x - I9.y * J9.y;
834  tmp_re += I10.x * J10.x - I10.y * J10.y;
835  tmp_re += I11.x * J11.x - I11.y * J11.y;
836  accum_re[15*blockDim.x] = tmp_re;
837 
838  tmp_im = I9.x * J9.y + I9.y * J9.x;
839  tmp_im += I10.x * J10.y + I10.y * J10.x;
840  tmp_im += I11.x * J11.y + I11.y * J11.x;
841  accum_im[15*blockDim.x] = tmp_im;
842 
843 /* CONTRACTION FULL VOLUME */
844 
845  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
846  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
847  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
848  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
849  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
850  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
851  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
852  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
853  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
854  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
855  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
856  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
857  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
858  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
859  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
860  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
861 
862  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
863  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
864  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
865  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
866  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
867  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
868  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
869  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
870  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
871  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
872  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
873  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
874  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
875  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
876  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
877  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
878 
879  return;
880 }
881 
882 #undef SPINORTEX
883 #undef INTERTEX
884 
885 #undef READ_SPINOR
886 #undef READ_INTERMEDIATE_SPINOR
887 
888 #undef SPINOR_HOP
889 
890 
891 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
892  float4 I0 = spinor[sp_idx + 0*(stride)]; \
893  float4 I1 = spinor[sp_idx + 1*(stride)]; \
894  float4 I2 = spinor[sp_idx + 2*(stride)]; \
895  float4 I3 = spinor[sp_idx + 3*(stride)]; \
896  float4 I4 = spinor[sp_idx + 4*(stride)]; \
897  float4 I5 = spinor[sp_idx + 5*(stride)];
898 
899 
900 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
901  float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
902  float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
903  float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
904  float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
905  float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
906  float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
907 
908 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
909  float4 J0 = spinor[sp_idx + 0*(stride)]; \
910  float4 J1 = spinor[sp_idx + 1*(stride)]; \
911  float4 J2 = spinor[sp_idx + 2*(stride)]; \
912  float4 J3 = spinor[sp_idx + 3*(stride)]; \
913  float4 J4 = spinor[sp_idx + 4*(stride)]; \
914  float4 J5 = spinor[sp_idx + 5*(stride)];
915 
916 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
917  float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
918  float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
919  float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
920  float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
921  float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
922  float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
923 
924 
925 #ifdef DIRECT_ACCESS_WILSON_SPINOR
926  #define READ_SPINOR READ_SPINOR_SINGLE
927  #define SPINORTEX in
928 #else
929  #define READ_SPINOR READ_SPINOR_SINGLE_TEX
930 
931  #ifdef USE_TEXTURE_OBJECTS
932  #define SPINORTEX param.inTex
933  #else
934  #define SPINORTEX spinorTexSingle
935  #endif // USE_TEXTURE_OBJECTS
936 #endif
937 
938 #ifdef DIRECT_ACCESS_WILSON_INTER
939  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
940  #define INTERTEX in2
941 #else
942  #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
943 
944  #ifdef USE_TEXTURE_OBJECTS
945  #define INTERTEX param.outTex
946  #else
947  #define INTERTEX interTexSingle
948  #endif // USE_TEXTURE_OBJECTS
949 #endif
950 
951 #define SPINOR_HOP 6
952 
953 __global__ void contractGamma5PlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
954 {
955  int sid = blockIdx.x*blockDim.x + threadIdx.x;
956  int outId = sid;
957  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
958 
959  if (sid >= param.threads)
960  return;
961 
962  volatile float2 tmp;
963  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
964 
965  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
966  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
967 
968  eutId = 2*sid;
969  auxCoord1 = eutId / param.dc.X[0];
970  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
971  auxCoord2 = auxCoord1 / param.dc.X[1];
972  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
973  xCoord4 = auxCoord2 / param.dc.X[2];
974  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
975 
976  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
977  xCoord1 += auxCoord1;
978  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
979 
980  //Load the full input spinors:
981 
982  READ_SPINOR (SPINORTEX, myStride, sid, sid);
984 
985  //compute in1^dag * gamma5:
986 
987  //First color component
988 
989  tmp_re = +I0.x;
990  tmp_im = -I0.y;
991  I0.x = +I3.x;
992  I0.y = -I3.y;
993  I3.x = tmp_re;
994  I3.y = tmp_im;
995 
996  tmp_re = +I1.z;
997  tmp_im = -I1.w;
998  I1.z = +I4.z;
999  I1.w = -I4.w;
1000  I4.z = tmp_re;
1001  I4.w = tmp_im;
1002 
1003  //Second color component
1004 
1005  tmp_re = +I0.z;
1006  tmp_im = -I0.w;
1007  I0.z = +I3.z;
1008  I0.w = -I3.w;
1009  I3.z = tmp_re;
1010  I3.w = tmp_im;
1011 
1012  tmp_re = +I2.x;
1013  tmp_im = -I2.y;
1014  I2.x = +I5.x;
1015  I2.y = -I5.y;
1016  I5.x = tmp_re;
1017  I5.y = tmp_im;
1018 
1019  //Third color component
1020 
1021  tmp_re = +I1.x;
1022  tmp_im = -I1.y;
1023  I1.x = +I4.x;
1024  I1.y = -I4.y;
1025  I4.x = tmp_re;
1026  I4.y = tmp_im;
1027 
1028  tmp_re = +I2.z;
1029  tmp_im = -I2.w;
1030  I2.z = +I5.z;
1031  I2.w = -I5.w;
1032  I5.z = tmp_re;
1033  I5.w = tmp_im;
1034 
1035  //do products for first color component here:
1036 
1037  //00 component:
1038  tmp_re = I0.x * J0.x - I0.y * J0.y;
1039  tmp_re += I0.z * J0.z - I0.w * J0.w;
1040  tmp_re += I1.x * J1.x - I1.y * J1.y;
1041  accum_re[0*blockDim.x] = tmp_re;
1042 
1043  tmp_im = I0.x * J0.y + I0.y * J0.x;
1044  tmp_im += I0.z * J0.w + I0.w * J0.z;
1045  tmp_im += I1.x * J1.y + I1.y * J1.x;
1046  accum_im[0*blockDim.x] = tmp_im;
1047 
1048  //01 component:
1049  tmp_re = I0.x * J1.z - I0.y * J1.w;
1050  tmp_re += I0.z * J2.x - I0.w * J2.y;
1051  tmp_re += I1.x * J2.z - I1.y * J2.w;
1052  accum_re[1*blockDim.x] = tmp_re;
1053 
1054  tmp_im = I0.x * J1.w + I0.y * J1.z;
1055  tmp_im += I0.z * J2.y + I0.w * J2.x;
1056  tmp_im += I1.x * J2.w + I1.y * J2.z;
1057  accum_im[1*blockDim.x] = tmp_im;
1058 
1059  //02 component:
1060  tmp_re = I0.x * J3.x - I0.y * J3.y;
1061  tmp_re += I0.z * J3.z - I0.w * J3.w;
1062  tmp_re += I1.x * J4.x - I1.y * J4.y;
1063  accum_re[2*blockDim.x] = tmp_re;
1064 
1065  tmp_im = I0.x * J3.y + I0.y * J3.x;
1066  tmp_im += I0.z * J3.w + I0.w * J3.z;
1067  tmp_im += I1.x * J4.y + I1.y * J4.x;
1068  accum_im[2*blockDim.x] = tmp_im;
1069 
1070  //03 component:
1071  tmp_re = I0.x * J4.z - I0.y * J4.w;
1072  tmp_re += I0.z * J5.x - I0.w * J5.x;
1073  tmp_re += I1.x * J5.z - I1.y * J5.w;
1074 
1075  accum_re[3*blockDim.x] = tmp_re;
1076 
1077  tmp_im = I0.x * J4.w + I0.y * J4.z;
1078  tmp_im += I0.z * J5.y + I0.w * J5.y;
1079  tmp_im += I1.x * J5.w + I1.y * J5.z;
1080  accum_im[3*blockDim.x] = tmp_im;
1081 
1082  //10 component:
1083  tmp_re = I1.z * J0.x - I1.w * J0.y;
1084  tmp_re += I2.x * J0.z - I2.y * J0.w;
1085  tmp_re += I2.z * J1.x - I2.w * J1.y;
1086  accum_re[4*blockDim.x] = tmp_re;
1087 
1088  tmp_im = I1.z * J0.y + I1.w * J0.x;
1089  tmp_im += I2.x * J0.w + I2.y * J0.z;
1090  tmp_im += I2.z * J1.y + I2.w * J1.x;
1091  accum_im[4*blockDim.x] = tmp_im;
1092 
1093  //11 component:
1094  tmp_re = I1.z * J1.z - I1.w * J1.w;
1095  tmp_re += I2.x * J2.x - I2.y * J2.y;
1096  tmp_re += I2.z * J2.z - I2.w * J2.w;
1097  accum_re[5*blockDim.x] = tmp_re;
1098 
1099  tmp_im = I1.z * J1.w + I1.w * J1.z;
1100  tmp_im += I2.x * J2.y + I2.y * J2.x;
1101  tmp_im += I2.z * J2.w + I2.w * J2.z;
1102  accum_im[5*blockDim.x] = tmp_im;
1103 
1104  //12 component:
1105  tmp_re = I1.z * J3.x - I1.w * J3.y;
1106  tmp_re += I2.x * J3.z - I2.y * J3.w;
1107  tmp_re += I2.z * J4.x - I2.w * J4.y;
1108  accum_re[6*blockDim.x] = tmp_re;
1109 
1110  tmp_im = I1.z * J3.y + I1.w * J3.x;
1111  tmp_im += I2.x * J3.w + I2.y * J3.z;
1112  tmp_im += I2.z * J4.y + I2.w * J4.x;
1113  accum_im[6*blockDim.x] = tmp_im;
1114 
1115  //13 component:
1116  tmp_re = I1.z * J4.z - I1.w * J4.w;
1117  tmp_re += I2.x * J5.x - I2.y * J5.y;
1118  tmp_re += I2.z * J5.z - I2.w * J5.w;
1119  accum_re[7*blockDim.x] = tmp_re;
1120 
1121  tmp_im = I1.z * J4.w + I1.w * J4.z;
1122  tmp_im += I2.x * J5.y + I2.y * J5.x;
1123  tmp_im += I2.z * J5.w + I2.w * J5.z;
1124  accum_im[7*blockDim.x] = tmp_im;
1125 
1126  //20 component:
1127  tmp_re = I3.x * J0.x - I3.y * J0.y;
1128  tmp_re += I3.z * J0.z - I3.w * J0.w;
1129  tmp_re += I4.x * J1.x - I4.y * J1.y;
1130  accum_re[8*blockDim.x] = tmp_re;
1131 
1132  tmp_im = I3.x * J0.y + I3.y * J0.x;
1133  tmp_im += I3.z * J0.w + I3.w * J0.z;
1134  tmp_im += I4.x * J1.y + I4.y * J1.x;
1135  accum_im[8*blockDim.x] = tmp_im;
1136 
1137  //21 component:
1138  tmp_re = I3.x * J1.z - I3.y * J1.w;
1139  tmp_re += I3.z * J2.x - I3.w * J2.y;
1140  tmp_re += I4.x * J2.z - I4.y * J2.w;
1141  accum_re[9*blockDim.x] = tmp_re;
1142 
1143  tmp_im = I3.x * J1.w + I3.y * J1.z;
1144  tmp_im += I3.z * J2.y + I3.w * J2.x;
1145  tmp_im += I4.x * J2.w + I4.y * J2.z;
1146  accum_im[9*blockDim.x] = tmp_im;
1147 
1148  //22 component:
1149  tmp_re = I3.x * J3.x - I3.y * J3.y;
1150  tmp_re += I3.z * J3.z - I3.w * J3.w;
1151  tmp_re += I4.x * J4.x - I4.y * J4.y;
1152  accum_re[10*blockDim.x] = tmp_re;
1153 
1154  tmp_im = I3.x * J3.y + I3.y * J3.x;
1155  tmp_im += I3.z * J3.w + I3.w * J3.z;
1156  tmp_im += I4.x * J4.y + I4.y * J4.x;
1157  accum_im[10*blockDim.x] = tmp_im;
1158 
1159  //23 component:
1160  tmp_re = I3.x * J4.z - I3.y * J4.w;
1161  tmp_re += I3.z * J5.x - I3.w * J5.y;
1162  tmp_re += I4.x * J5.z - I4.y * J5.w;
1163  accum_re[11*blockDim.x] = tmp_re;
1164 
1165  tmp_im = I3.x * J4.w + I3.y * J4.z;
1166  tmp_im += I3.z * J5.y + I3.w * J5.x;
1167  tmp_im += I4.x * J5.w + I4.y * J5.z;
1168  accum_im[11*blockDim.x] = tmp_im;
1169 
1170  //30 component:
1171  tmp_re = I4.z * J0.x - I4.w * J0.y;
1172  tmp_re += I5.x * J0.z - I5.y * J0.w;
1173  tmp_re += I5.z * J1.x - I5.w * J1.y;
1174  accum_re[12*blockDim.x] = tmp_re;
1175 
1176  tmp_im = I4.z * J0.y + I4.w * J0.x;
1177  tmp_im += I5.x * J0.w + I5.y * J0.z;
1178  tmp_im += I5.z * J1.y + I5.w * J1.x;
1179  accum_im[12*blockDim.x] = tmp_im;
1180 
1181  //31 component:
1182  tmp_re = I4.z * J1.z - I4.w * J1.w;
1183  tmp_re += I5.x * J2.x - I5.y * J2.y;
1184  tmp_re += I5.z * J2.z - I5.w * J2.w;
1185  accum_re[13*blockDim.x] = tmp_re;
1186 
1187  tmp_im = I4.z * J1.w + I4.w * J1.z;
1188  tmp_im += I5.x * J2.y + I5.y * J2.x;
1189  tmp_im += I5.z * J2.w + I5.w * J2.z;
1190  accum_im[13*blockDim.x] = tmp_im;
1191 
1192  //32 component:
1193  tmp_re = I4.z * J3.x - I4.w * J3.y;
1194  tmp_re += I5.x * J3.z - I5.y * J3.w;
1195  tmp_re += I5.z * J4.x - I5.w * J4.y;
1196  accum_re[14*blockDim.x] = tmp_re;
1197 
1198  tmp_im = I4.z * J3.y + I4.w * J3.x;
1199  tmp_im += I5.x * J3.w + I5.y * J3.z;
1200  tmp_im += I5.z * J4.y + I5.w * J4.x;
1201  accum_im[14*blockDim.x] = tmp_im;
1202 
1203  //33 component:
1204  tmp_re = I4.z * J4.z - I4.w * J4.w;
1205  tmp_re += I5.x * J5.x - I5.y * J5.y;
1206  tmp_re += I5.z * J5.z - I5.w * J5.w;
1207  accum_re[15*blockDim.x] = tmp_re;
1208 
1209  tmp_im = I4.z * J4.w + I4.w * J4.z;
1210  tmp_im += I5.x * J5.y + I5.y * J5.x;
1211  tmp_im += I5.z * J5.w + I5.w * J5.z;
1212  accum_im[15*blockDim.x] = tmp_im;
1213 
1214 
1215 
1216  //Store output back to global buffer:
1217 
1218 
1219  /* CONTRACTION FULL VOLUME */
1220 
1221  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1222  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1223  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1224  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1225  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1226  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1227  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1228  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1229  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1230  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1231  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1232  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1233  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1234  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1235  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1236  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1237 
1238  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1239  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1240  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1241  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1242  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1243  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1244  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1245  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1246  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1247  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1248  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1249  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1250  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1251  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1252  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1253  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1254 
1255  return;
1256 }
1257 
1258 //Perform trace in color space only and for a given tslice
1259 //since the file is included in dslash_quda.h, no need to add dslash_constants.h file here (for, e.g., Vsh)
1260 __global__ void contractTslicePlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
1261 {
1262  int sid = blockIdx.x*blockDim.x + threadIdx.x; //number of threads is equal to Tslice volume
1263  //Adjust sid to correct tslice (exe domain must be Tslice volume!)
1264  int inId = sid + param.Vsh*Tslice; //Vsh - 3d space volume for the parity spinor (equale to exe domain!)
1265  int outId;
1266  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1267 
1268  if (sid >= param.threads) //param.threads == tslice volume
1269  return;
1270 
1271  volatile float2 tmp;
1272  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1273 
1274  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1275  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1276 
1277 //The output only for a given tslice (for the full tslice content, i.e., both parities!):
1278 
1279  eutId = 2*inId;
1280  auxCoord1 = eutId / param.dc.X[0];
1281  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
1282  auxCoord2 = auxCoord1 / param.dc.X[1];
1283  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
1284  xCoord4 = auxCoord2 / param.dc.X[2];
1285  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
1286 
1287  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1288  xCoord1 += auxCoord1;
1289  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
1290 
1291  //Load the full input spinors:
1292 
1293  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1294  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1295 
1296  //compute in1^dag:
1297 
1298  I0.y = -I0.y;
1299  I0.w = -I0.w;
1300  I1.y = -I1.y;
1301  I1.w = -I1.w;
1302  I2.y = -I2.y;
1303  I2.w = -I2.w;
1304  I3.y = -I3.y;
1305  I3.w = -I3.w;
1306  I4.y = -I4.y;
1307  I4.w = -I4.w;
1308  I5.y = -I5.y;
1309  I5.w = -I5.w;
1310 
1311  //do products for first color component here:
1312  //00 component:
1313  tmp_re = I0.x * J0.x - I0.y * J0.y;
1314  tmp_re += I0.z * J0.z - I0.w * J0.w;
1315  tmp_re += I1.x * J1.x - I1.y * J1.y;
1316  accum_re[0*blockDim.x] = tmp_re;
1317 
1318  tmp_im = I0.x * J0.y + I0.y * J0.x;
1319  tmp_im += I0.z * J0.w + I0.w * J0.z;
1320  tmp_im += I1.x * J1.y + I1.y * J1.x;
1321  accum_im[0*blockDim.x] = tmp_im;
1322 
1323  //01 component:
1324  tmp_re = I0.x * J1.z - I0.y * J1.w;
1325  tmp_re += I0.z * J2.x - I0.w * J2.y;
1326  tmp_re += I1.x * J2.z - I1.y * J2.w;
1327  accum_re[1*blockDim.x] = tmp_re;
1328 
1329  tmp_im = I0.x * J1.w + I0.y * J1.z;
1330  tmp_im += I0.z * J2.y + I0.w * J2.x;
1331  tmp_im += I1.x * J2.w + I1.y * J2.z;
1332  accum_im[1*blockDim.x] = tmp_im;
1333 
1334  //02 component:
1335  tmp_re = I0.x * J3.x - I0.y * J3.y;
1336  tmp_re += I0.z * J3.z - I0.w * J3.w;
1337  tmp_re += I1.x * J4.x - I1.y * J4.y;
1338  accum_re[2*blockDim.x] = tmp_re;
1339 
1340  tmp_im = I0.x * J3.y + I0.y * J3.x;
1341  tmp_im += I0.z * J3.w + I0.w * J3.z;
1342  tmp_im += I1.x * J4.y + I1.y * J4.x;
1343  accum_im[2*blockDim.x] = tmp_im;
1344 
1345  //03 component:
1346  tmp_re = I0.x * J4.z - I0.y * J4.w;
1347  tmp_re += I0.z * J5.x - I0.w * J5.x;
1348  tmp_re += I1.x * J5.z - I1.y * J5.w;
1349 
1350  accum_re[3*blockDim.x] = tmp_re;
1351 
1352  tmp_im = I0.x * J4.w + I0.y * J4.z;
1353  tmp_im += I0.z * J5.y + I0.w * J5.y;
1354  tmp_im += I1.x * J5.w + I1.y * J5.z;
1355  accum_im[3*blockDim.x] = tmp_im;
1356 
1357  //10 component:
1358  tmp_re = I1.z * J0.x - I1.w * J0.y;
1359  tmp_re += I2.x * J0.z - I2.y * J0.w;
1360  tmp_re += I2.z * J1.x - I2.w * J1.y;
1361  accum_re[4*blockDim.x] = tmp_re;
1362 
1363  tmp_im = I1.z * J0.y + I1.w * J0.x;
1364  tmp_im += I2.x * J0.w + I2.y * J0.z;
1365  tmp_im += I2.z * J1.y + I2.w * J1.x;
1366  accum_im[4*blockDim.x] = tmp_im;
1367 
1368  //11 component:
1369  tmp_re = I1.z * J1.z - I1.w * J1.w;
1370  tmp_re += I2.x * J2.x - I2.y * J2.y;
1371  tmp_re += I2.z * J2.z - I2.w * J2.w;
1372  accum_re[5*blockDim.x] = tmp_re;
1373 
1374  tmp_im = I1.z * J1.w + I1.w * J1.z;
1375  tmp_im += I2.x * J2.y + I2.y * J2.x;
1376  tmp_im += I2.z * J2.w + I2.w * J2.z;
1377  accum_im[5*blockDim.x] = tmp_im;
1378 
1379  //12 component:
1380  tmp_re = I1.z * J3.x - I1.w * J3.y;
1381  tmp_re += I2.x * J3.z - I2.y * J3.w;
1382  tmp_re += I2.z * J4.x - I2.w * J4.y;
1383  accum_re[6*blockDim.x] = tmp_re;
1384 
1385  tmp_im = I1.z * J3.y + I1.w * J3.x;
1386  tmp_im += I2.x * J3.w + I2.y * J3.z;
1387  tmp_im += I2.z * J4.y + I2.w * J4.x;
1388  accum_im[6*blockDim.x] = tmp_im;
1389 
1390  //13 component:
1391  tmp_re = I1.z * J4.z - I1.w * J4.w;
1392  tmp_re += I2.x * J5.x - I2.y * J5.y;
1393  tmp_re += I2.z * J5.z - I2.w * J5.w;
1394  accum_re[7*blockDim.x] = tmp_re;
1395 
1396  tmp_im = I1.z * J4.w + I1.w * J4.z;
1397  tmp_im += I2.x * J5.y + I2.y * J5.x;
1398  tmp_im += I2.z * J5.w + I2.w * J5.z;
1399  accum_im[7*blockDim.x] = tmp_im;
1400 
1401  //20 component:
1402  tmp_re = I3.x * J0.x - I3.y * J0.y;
1403  tmp_re += I3.z * J0.z - I3.w * J0.w;
1404  tmp_re += I4.x * J1.x - I4.y * J1.y;
1405  accum_re[8*blockDim.x] = tmp_re;
1406 
1407  tmp_im = I3.x * J0.y + I3.y * J0.x;
1408  tmp_im += I3.z * J0.w + I3.w * J0.z;
1409  tmp_im += I4.x * J1.y + I4.y * J1.x;
1410  accum_im[8*blockDim.x] = tmp_im;
1411 
1412  //21 component:
1413  tmp_re = I3.x * J1.z - I3.y * J1.w;
1414  tmp_re += I3.z * J2.x - I3.w * J2.y;
1415  tmp_re += I4.x * J2.z - I4.y * J2.w;
1416  accum_re[9*blockDim.x] = tmp_re;
1417 
1418  tmp_im = I3.x * J1.w + I3.y * J1.z;
1419  tmp_im += I3.z * J2.y + I3.w * J2.x;
1420  tmp_im += I4.x * J2.w + I4.y * J2.z;
1421  accum_im[9*blockDim.x] = tmp_im;
1422 
1423  //22 component:
1424  tmp_re = I3.x * J3.x - I3.y * J3.y;
1425  tmp_re += I3.z * J3.z - I3.w * J3.w;
1426  tmp_re += I4.x * J4.x - I4.y * J4.y;
1427  accum_re[10*blockDim.x] = tmp_re;
1428 
1429  tmp_im = I3.x * J3.y + I3.y * J3.x;
1430  tmp_im += I3.z * J3.w + I3.w * J3.z;
1431  tmp_im += I4.x * J4.y + I4.y * J4.x;
1432  accum_im[10*blockDim.x] = tmp_im;
1433 
1434  //23 component:
1435  tmp_re = I3.x * J4.z - I3.y * J4.w;
1436  tmp_re += I3.z * J5.x - I3.w * J5.y;
1437  tmp_re += I4.x * J5.z - I4.y * J5.w;
1438  accum_re[11*blockDim.x] = tmp_re;
1439 
1440  tmp_im = I3.x * J4.w + I3.y * J4.z;
1441  tmp_im += I3.z * J5.y + I3.w * J5.x;
1442  tmp_im += I4.x * J5.w + I4.y * J5.z;
1443  accum_im[11*blockDim.x] = tmp_im;
1444 
1445  //30 component:
1446  tmp_re = I4.z * J0.x - I4.w * J0.y;
1447  tmp_re += I5.x * J0.z - I5.y * J0.w;
1448  tmp_re += I5.z * J1.x - I5.w * J1.y;
1449  accum_re[12*blockDim.x] = tmp_re;
1450 
1451  tmp_im = I4.z * J0.y + I4.w * J0.x;
1452  tmp_im += I5.x * J0.w + I5.y * J0.z;
1453  tmp_im += I5.z * J1.y + I5.w * J1.x;
1454  accum_im[12*blockDim.x] = tmp_im;
1455 
1456  //31 component:
1457  tmp_re = I4.z * J1.z - I4.w * J1.w;
1458  tmp_re += I5.x * J2.x - I5.y * J2.y;
1459  tmp_re += I5.z * J2.z - I5.w * J2.w;
1460  accum_re[13*blockDim.x] = tmp_re;
1461 
1462  tmp_im = I4.z * J1.w + I4.w * J1.z;
1463  tmp_im += I5.x * J2.y + I5.y * J2.x;
1464  tmp_im += I5.z * J2.w + I5.w * J2.z;
1465  accum_im[13*blockDim.x] = tmp_im;
1466 
1467  //32 component:
1468  tmp_re = I4.z * J3.x - I4.w * J3.y;
1469  tmp_re += I5.x * J3.z - I5.y * J3.w;
1470  tmp_re += I5.z * J4.x - I5.w * J4.y;
1471  accum_re[14*blockDim.x] = tmp_re;
1472 
1473  tmp_im = I4.z * J3.y + I4.w * J3.x;
1474  tmp_im += I5.x * J3.w + I5.y * J3.z;
1475  tmp_im += I5.z * J4.y + I5.w * J4.x;
1476  accum_im[14*blockDim.x] = tmp_im;
1477 
1478  //33 component:
1479  tmp_re = I4.z * J4.z - I4.w * J4.w;
1480  tmp_re += I5.x * J5.x - I5.y * J5.y;
1481  tmp_re += I5.z * J5.z - I5.w * J5.w;
1482  accum_re[15*blockDim.x] = tmp_re;
1483 
1484  tmp_im = I4.z * J4.w + I4.w * J4.z;
1485  tmp_im += I5.x * J5.y + I5.y * J5.x;
1486  tmp_im += I5.z * J5.w + I5.w * J5.z;
1487  accum_im[15*blockDim.x] = tmp_im;
1488 
1489  //Store output back to global buffer:
1490 
1491 
1492  /* CONTRACTION FULL VOLUME */
1493 
1494  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1495  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1496  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1497  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1498  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1499  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1500  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1501  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1502  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1503  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1504  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1505  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1506  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1507  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1508  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1509  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1510 
1511  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1512  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1513  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1514  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1515  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1516  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1517  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1518  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1519  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1520  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1521  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1522  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1523  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1524  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1525  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1526  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1527 
1528  return;
1529 }
1530 
1531 __global__ void contractPlusKernel (float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
1532 {
1533  int sid = blockIdx.x*blockDim.x + threadIdx.x;
1534  int outId = sid;
1535  int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1536 
1537  if (sid >= param.threads)
1538  return;
1539 
1540  volatile float2 tmp;
1541  extern __shared__ float sms[]; //used for data accumulation: blockDim.x * 2 * 16 * sizeof(double)
1542 
1543  volatile float *accum_re = sms + threadIdx.x; //address it like idx*blockDim, where idx = 4*spinor_idx1 + spinor_idx2
1544  volatile float *accum_im = accum_re + TOTAL_COMPONENTS*blockDim.x;
1545 
1546  eutId = 2*sid;
1547  auxCoord1 = eutId / param.dc.X[0];
1548  xCoord1 = eutId - auxCoord1 * param.dc.X[0];
1549  auxCoord2 = auxCoord1 / param.dc.X[1];
1550  xCoord2 = auxCoord1 - auxCoord2 * param.dc.X[1];
1551  xCoord4 = auxCoord2 / param.dc.X[2];
1552  xCoord3 = auxCoord2 - xCoord4 * param.dc.X[2];
1553 
1554  auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1555  xCoord1 += auxCoord1;
1556  outId = xCoord1 + param.dc.X[0]*(xCoord2 + param.dc.X[1]*(xCoord3 + param.dc.X[2]*xCoord4)); //AQUI
1557 
1558  //Load the full input spinors:
1559 
1560  READ_SPINOR (SPINORTEX, myStride, sid, sid);
1561  READ_INTERMEDIATE_SPINOR (INTERTEX, myStride, sid, sid);
1562 
1563  //compute in1^dag:
1564 
1565  I0.y = -I0.y;
1566  I0.w = -I0.w;
1567  I1.y = -I1.y;
1568  I1.w = -I1.w;
1569  I2.y = -I2.y;
1570  I2.w = -I2.w;
1571  I3.y = -I3.y;
1572  I3.w = -I3.w;
1573  I4.y = -I4.y;
1574  I4.w = -I4.w;
1575  I5.y = -I5.y;
1576  I5.w = -I5.w;
1577 
1578  //do products for first color component here:
1579  //00 component:
1580  tmp_re = I0.x * J0.x - I0.y * J0.y;
1581  tmp_re += I0.z * J0.z - I0.w * J0.w;
1582  tmp_re += I1.x * J1.x - I1.y * J1.y;
1583  accum_re[0*blockDim.x] = tmp_re;
1584 
1585  tmp_im = I0.x * J0.y + I0.y * J0.x;
1586  tmp_im += I0.z * J0.w + I0.w * J0.z;
1587  tmp_im += I1.x * J1.y + I1.y * J1.x;
1588  accum_im[0*blockDim.x] = tmp_im;
1589 
1590  //01 component:
1591  tmp_re = I0.x * J1.z - I0.y * J1.w;
1592  tmp_re += I0.z * J2.x - I0.w * J2.y;
1593  tmp_re += I1.x * J2.z - I1.y * J2.w;
1594  accum_re[1*blockDim.x] = tmp_re;
1595 
1596  tmp_im = I0.x * J1.w + I0.y * J1.z;
1597  tmp_im += I0.z * J2.y + I0.w * J2.x;
1598  tmp_im += I1.x * J2.w + I1.y * J2.z;
1599  accum_im[1*blockDim.x] = tmp_im;
1600 
1601  //02 component:
1602  tmp_re = I0.x * J3.x - I0.y * J3.y;
1603  tmp_re += I0.z * J3.z - I0.w * J3.w;
1604  tmp_re += I1.x * J4.x - I1.y * J4.y;
1605  accum_re[2*blockDim.x] = tmp_re;
1606 
1607  tmp_im = I0.x * J3.y + I0.y * J3.x;
1608  tmp_im += I0.z * J3.w + I0.w * J3.z;
1609  tmp_im += I1.x * J4.y + I1.y * J4.x;
1610  accum_im[2*blockDim.x] = tmp_im;
1611 
1612  //03 component:
1613  tmp_re = I0.x * J4.z - I0.y * J4.w;
1614  tmp_re += I0.z * J5.x - I0.w * J5.x;
1615  tmp_re += I1.x * J5.z - I1.y * J5.w;
1616 
1617  accum_re[3*blockDim.x] = tmp_re;
1618 
1619  tmp_im = I0.x * J4.w + I0.y * J4.z;
1620  tmp_im += I0.z * J5.y + I0.w * J5.y;
1621  tmp_im += I1.x * J5.w + I1.y * J5.z;
1622  accum_im[3*blockDim.x] = tmp_im;
1623 
1624  //10 component:
1625  tmp_re = I1.z * J0.x - I1.w * J0.y;
1626  tmp_re += I2.x * J0.z - I2.y * J0.w;
1627  tmp_re += I2.z * J1.x - I2.w * J1.y;
1628  accum_re[4*blockDim.x] = tmp_re;
1629 
1630  tmp_im = I1.z * J0.y + I1.w * J0.x;
1631  tmp_im += I2.x * J0.w + I2.y * J0.z;
1632  tmp_im += I2.z * J1.y + I2.w * J1.x;
1633  accum_im[4*blockDim.x] = tmp_im;
1634 
1635  //11 component:
1636  tmp_re = I1.z * J1.z - I1.w * J1.w;
1637  tmp_re += I2.x * J2.x - I2.y * J2.y;
1638  tmp_re += I2.z * J2.z - I2.w * J2.w;
1639  accum_re[5*blockDim.x] = tmp_re;
1640 
1641  tmp_im = I1.z * J1.w + I1.w * J1.z;
1642  tmp_im += I2.x * J2.y + I2.y * J2.x;
1643  tmp_im += I2.z * J2.w + I2.w * J2.z;
1644  accum_im[5*blockDim.x] = tmp_im;
1645 
1646  //12 component:
1647  tmp_re = I1.z * J3.x - I1.w * J3.y;
1648  tmp_re += I2.x * J3.z - I2.y * J3.w;
1649  tmp_re += I2.z * J4.x - I2.w * J4.y;
1650  accum_re[6*blockDim.x] = tmp_re;
1651 
1652  tmp_im = I1.z * J3.y + I1.w * J3.x;
1653  tmp_im += I2.x * J3.w + I2.y * J3.z;
1654  tmp_im += I2.z * J4.y + I2.w * J4.x;
1655  accum_im[6*blockDim.x] = tmp_im;
1656 
1657  //13 component:
1658  tmp_re = I1.z * J4.z - I1.w * J4.w;
1659  tmp_re += I2.x * J5.x - I2.y * J5.y;
1660  tmp_re += I2.z * J5.z - I2.w * J5.w;
1661  accum_re[7*blockDim.x] = tmp_re;
1662 
1663  tmp_im = I1.z * J4.w + I1.w * J4.z;
1664  tmp_im += I2.x * J5.y + I2.y * J5.x;
1665  tmp_im += I2.z * J5.w + I2.w * J5.z;
1666  accum_im[7*blockDim.x] = tmp_im;
1667 
1668  //20 component:
1669  tmp_re = I3.x * J0.x - I3.y * J0.y;
1670  tmp_re += I3.z * J0.z - I3.w * J0.w;
1671  tmp_re += I4.x * J1.x - I4.y * J1.y;
1672  accum_re[8*blockDim.x] = tmp_re;
1673 
1674  tmp_im = I3.x * J0.y + I3.y * J0.x;
1675  tmp_im += I3.z * J0.w + I3.w * J0.z;
1676  tmp_im += I4.x * J1.y + I4.y * J1.x;
1677  accum_im[8*blockDim.x] = tmp_im;
1678 
1679  //21 component:
1680  tmp_re = I3.x * J1.z - I3.y * J1.w;
1681  tmp_re += I3.z * J2.x - I3.w * J2.y;
1682  tmp_re += I4.x * J2.z - I4.y * J2.w;
1683  accum_re[9*blockDim.x] = tmp_re;
1684 
1685  tmp_im = I3.x * J1.w + I3.y * J1.z;
1686  tmp_im += I3.z * J2.y + I3.w * J2.x;
1687  tmp_im += I4.x * J2.w + I4.y * J2.z;
1688  accum_im[9*blockDim.x] = tmp_im;
1689 
1690  //22 component:
1691  tmp_re = I3.x * J3.x - I3.y * J3.y;
1692  tmp_re += I3.z * J3.z - I3.w * J3.w;
1693  tmp_re += I4.x * J4.x - I4.y * J4.y;
1694  accum_re[10*blockDim.x] = tmp_re;
1695 
1696  tmp_im = I3.x * J3.y + I3.y * J3.x;
1697  tmp_im += I3.z * J3.w + I3.w * J3.z;
1698  tmp_im += I4.x * J4.y + I4.y * J4.x;
1699  accum_im[10*blockDim.x] = tmp_im;
1700 
1701  //23 component:
1702  tmp_re = I3.x * J4.z - I3.y * J4.w;
1703  tmp_re += I3.z * J5.x - I3.w * J5.y;
1704  tmp_re += I4.x * J5.z - I4.y * J5.w;
1705  accum_re[11*blockDim.x] = tmp_re;
1706 
1707  tmp_im = I3.x * J4.w + I3.y * J4.z;
1708  tmp_im += I3.z * J5.y + I3.w * J5.x;
1709  tmp_im += I4.x * J5.w + I4.y * J5.z;
1710  accum_im[11*blockDim.x] = tmp_im;
1711 
1712  //30 component:
1713  tmp_re = I4.z * J0.x - I4.w * J0.y;
1714  tmp_re += I5.x * J0.z - I5.y * J0.w;
1715  tmp_re += I5.z * J1.x - I5.w * J1.y;
1716  accum_re[12*blockDim.x] = tmp_re;
1717 
1718  tmp_im = I4.z * J0.y + I4.w * J0.x;
1719  tmp_im += I5.x * J0.w + I5.y * J0.z;
1720  tmp_im += I5.z * J1.y + I5.w * J1.x;
1721  accum_im[12*blockDim.x] = tmp_im;
1722 
1723  //31 component:
1724  tmp_re = I4.z * J1.z - I4.w * J1.w;
1725  tmp_re += I5.x * J2.x - I5.y * J2.y;
1726  tmp_re += I5.z * J2.z - I5.w * J2.w;
1727  accum_re[13*blockDim.x] = tmp_re;
1728 
1729  tmp_im = I4.z * J1.w + I4.w * J1.z;
1730  tmp_im += I5.x * J2.y + I5.y * J2.x;
1731  tmp_im += I5.z * J2.w + I5.w * J2.z;
1732  accum_im[13*blockDim.x] = tmp_im;
1733 
1734  //32 component:
1735  tmp_re = I4.z * J3.x - I4.w * J3.y;
1736  tmp_re += I5.x * J3.z - I5.y * J3.w;
1737  tmp_re += I5.z * J4.x - I5.w * J4.y;
1738  accum_re[14*blockDim.x] = tmp_re;
1739 
1740  tmp_im = I4.z * J3.y + I4.w * J3.x;
1741  tmp_im += I5.x * J3.w + I5.y * J3.z;
1742  tmp_im += I5.z * J4.y + I5.w * J4.x;
1743  accum_im[14*blockDim.x] = tmp_im;
1744 
1745  //33 component:
1746  tmp_re = I4.z * J4.z - I4.w * J4.w;
1747  tmp_re += I5.x * J5.x - I5.y * J5.y;
1748  tmp_re += I5.z * J5.z - I5.w * J5.w;
1749  accum_re[15*blockDim.x] = tmp_re;
1750 
1751  tmp_im = I4.z * J4.w + I4.w * J4.z;
1752  tmp_im += I5.x * J5.y + I5.y * J5.x;
1753  tmp_im += I5.z * J5.w + I5.w * J5.z;
1754  accum_im[15*blockDim.x] = tmp_im;
1755 
1756  //Store output back to global buffer:
1757 
1758  /* CONTRACTION FULL VOLUME */
1759 
1760  out[outId + 0 *param.threads*2].x += accum_re[ 0*blockDim.x];
1761  out[outId + 1 *param.threads*2].x += accum_re[ 1*blockDim.x];
1762  out[outId + 2 *param.threads*2].x += accum_re[ 2*blockDim.x];
1763  out[outId + 3 *param.threads*2].x += accum_re[ 3*blockDim.x];
1764  out[outId + 4 *param.threads*2].x += accum_re[ 4*blockDim.x];
1765  out[outId + 5 *param.threads*2].x += accum_re[ 5*blockDim.x];
1766  out[outId + 6 *param.threads*2].x += accum_re[ 6*blockDim.x];
1767  out[outId + 7 *param.threads*2].x += accum_re[ 7*blockDim.x];
1768  out[outId + 8 *param.threads*2].x += accum_re[ 8*blockDim.x];
1769  out[outId + 9 *param.threads*2].x += accum_re[ 9*blockDim.x];
1770  out[outId + 10*param.threads*2].x += accum_re[10*blockDim.x];
1771  out[outId + 11*param.threads*2].x += accum_re[11*blockDim.x];
1772  out[outId + 12*param.threads*2].x += accum_re[12*blockDim.x];
1773  out[outId + 13*param.threads*2].x += accum_re[13*blockDim.x];
1774  out[outId + 14*param.threads*2].x += accum_re[14*blockDim.x];
1775  out[outId + 15*param.threads*2].x += accum_re[15*blockDim.x];
1776 
1777  out[outId + 0 *param.threads*2].y += accum_im[ 0*blockDim.x];
1778  out[outId + 1 *param.threads*2].y += accum_im[ 1*blockDim.x];
1779  out[outId + 2 *param.threads*2].y += accum_im[ 2*blockDim.x];
1780  out[outId + 3 *param.threads*2].y += accum_im[ 3*blockDim.x];
1781  out[outId + 4 *param.threads*2].y += accum_im[ 4*blockDim.x];
1782  out[outId + 5 *param.threads*2].y += accum_im[ 5*blockDim.x];
1783  out[outId + 6 *param.threads*2].y += accum_im[ 6*blockDim.x];
1784  out[outId + 7 *param.threads*2].y += accum_im[ 7*blockDim.x];
1785  out[outId + 8 *param.threads*2].y += accum_im[ 8*blockDim.x];
1786  out[outId + 9 *param.threads*2].y += accum_im[ 9*blockDim.x];
1787  out[outId + 10*param.threads*2].y += accum_im[10*blockDim.x];
1788  out[outId + 11*param.threads*2].y += accum_im[11*blockDim.x];
1789  out[outId + 12*param.threads*2].y += accum_im[12*blockDim.x];
1790  out[outId + 13*param.threads*2].y += accum_im[13*blockDim.x];
1791  out[outId + 14*param.threads*2].y += accum_im[14*blockDim.x];
1792  out[outId + 15*param.threads*2].y += accum_im[15*blockDim.x];
1793 
1794  return;
1795 }
1796 
1797 #undef SPINORTEX
1798 #undef INTERTEX
1799 
1800 #undef tmp_re
1801 #undef tmp_im
1802 
1803 #undef READ_SPINOR
1804 #undef READ_INTERMEDIATE_SPINOR
1805 
1806 #undef SPINOR_HOP
1807 
1808 #endif //_TWIST_QUDA_CONTRACT_PLUS
dim3 dim3 blockDim
__global__ void contractTslicePlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
#define SPINORTEX
#define TOTAL_COMPONENTS
#define tmp_re
#define READ_INTERMEDIATE_SPINOR
QudaGaugeParam param
Definition: pack_test.cpp:17
#define INTERTEX
int X[4]
Definition: quda.h:29
__global__ void contractGamma5PlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
#define tmp_im
cpuColorSpinorField * out
__global__ void contractPlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
#define READ_SPINOR