1 #ifndef _TWIST_QUDA_CONTRACT
2 #define _TWIST_QUDA_CONTRACT
7 #define TOTAL_COMPONENTS 16
9 #if (__COMPUTE_CAPABILITY__ >= 130)
11 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
12 double2 J0 = spinor[sp_idx + 0*(stride)]; \
13 double2 J1 = spinor[sp_idx + 1*(stride)]; \
14 double2 J2 = spinor[sp_idx + 2*(stride)]; \
15 double2 J3 = spinor[sp_idx + 3*(stride)]; \
16 double2 J4 = spinor[sp_idx + 4*(stride)]; \
17 double2 J5 = spinor[sp_idx + 5*(stride)]; \
18 double2 J6 = spinor[sp_idx + 6*(stride)]; \
19 double2 J7 = spinor[sp_idx + 7*(stride)]; \
20 double2 J8 = spinor[sp_idx + 8*(stride)]; \
21 double2 J9 = spinor[sp_idx + 9*(stride)]; \
22 double2 J10 = spinor[sp_idx +10*(stride)]; \
23 double2 J11 = spinor[sp_idx +11*(stride)];
25 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
26 double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
27 double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
28 double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
29 double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
30 double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
31 double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
32 double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
33 double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
34 double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
35 double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
36 double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
37 double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
39 #ifdef DIRECT_ACCESS_WILSON_SPINOR
40 #define READ_SPINOR READ_SPINOR_DOUBLE
41 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
45 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
46 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
48 #ifdef USE_TEXTURE_OBJECTS
49 #define SPINORTEX param.inTex
50 #define INTERTEX param.outTex
52 #define SPINORTEX spinorTexDouble
53 #define INTERTEX interTexDouble
54 #endif // USE_TEXTURE_OBJECTS
62 int sid = blockIdx.x*blockDim.x + threadIdx.x;
64 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
70 extern __shared__
double sm[];
72 volatile double *accum_re = sm + threadIdx.x;
76 auxCoord1 = eutId /
X1;
77 xCoord1 = eutId - auxCoord1 *
X1;
78 auxCoord2 = auxCoord1 /
X2;
79 xCoord2 = auxCoord1 - auxCoord2 *
X2;
80 xCoord4 = auxCoord2 /
X3;
81 xCoord3 = auxCoord2 - xCoord4 *
X3;
83 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
85 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
136 tmp_re = I0.x * J0.x - I0.y * J0.y;
137 tmp_re += I1.x * J1.x - I1.y * J1.y;
138 tmp_re += I2.x * J2.x - I2.y * J2.y;
139 accum_re[0*blockDim.x] =
tmp_re;
141 tmp_im = I0.x * J0.y + I0.y * J0.x;
142 tmp_im += I1.x * J1.y + I1.y * J1.x;
143 tmp_im += I2.x * J2.y + I2.y * J2.x;
144 accum_im[0*blockDim.x] =
tmp_im;
147 tmp_re = I0.x * J3.x - I0.y * J3.y;
148 tmp_re += I1.x * J4.x - I1.y * J4.y;
149 tmp_re += I2.x * J5.x - I2.y * J5.y;
150 accum_re[1*blockDim.x] =
tmp_re;
152 tmp_im = I0.x * J3.y + I0.y * J3.x;
153 tmp_im += I1.x * J4.y + I1.y * J4.x;
154 tmp_im += I2.x * J5.y + I2.y * J5.x;
155 accum_im[1*blockDim.x] =
tmp_im;
158 tmp_re = I0.x * J6.x - I0.y * J6.y;
159 tmp_re += I1.x * J7.x - I1.y * J7.y;
160 tmp_re += I2.x * J8.x - I2.y * J8.y;
161 accum_re[2*blockDim.x] =
tmp_re;
163 tmp_im = I0.x * J6.y + I0.y * J6.x;
164 tmp_im += I1.x * J7.y + I1.y * J7.x;
165 tmp_im += I2.x * J8.y + I2.y * J8.x;
166 accum_im[2*blockDim.x] =
tmp_im;
169 tmp_re = I0.x * J9.x - I0.y * J9.y;
170 tmp_re += I1.x * J10.x - I1.y * J10.y;
171 tmp_re += I2.x * J11.x - I2.y * J11.y;
172 accum_re[3*blockDim.x] =
tmp_re;
174 tmp_im = I0.x * J9.y + I0.y * J9.x;
175 tmp_im += I1.x * J10.y + I1.y * J10.x;
176 tmp_im += I2.x * J11.y + I2.y * J11.x;
177 accum_im[3*blockDim.x] =
tmp_im;
180 tmp_re = I3.x * J0.x - I3.y * J0.y;
181 tmp_re += I4.x * J1.x - I4.y * J1.y;
182 tmp_re += I5.x * J2.x - I5.y * J2.y;
183 accum_re[ 4*blockDim.x] =
tmp_re;
185 tmp_im = I3.x * J0.y + I3.y * J0.x;
186 tmp_im += I4.x * J1.y + I4.y * J1.x;
187 tmp_im += I5.x * J2.y + I5.y * J2.x;
188 accum_im[ 4*blockDim.x] =
tmp_im;
191 tmp_re = I3.x * J3.x - I3.y * J3.y;
192 tmp_re += I4.x * J4.x - I4.y * J4.y;
193 tmp_re += I5.x * J5.x - I5.y * J5.y;
194 accum_re[ 5*blockDim.x] =
tmp_re;
196 tmp_im = I3.x * J3.y + I3.y * J3.x;
197 tmp_im += I4.x * J4.y + I4.y * J4.x;
198 tmp_im += I5.x * J5.y + I5.y * J5.x;
199 accum_im[ 5*blockDim.x] =
tmp_im;
202 tmp_re = I3.x * J6.x - I3.y * J6.y;
203 tmp_re += I4.x * J7.x - I4.y * J7.y;
204 tmp_re += I5.x * J8.x - I5.y * J8.y;
205 accum_re[ 6*blockDim.x] =
tmp_re;
207 tmp_im = I3.x * J6.y + I3.y * J6.x;
208 tmp_im += I4.x * J7.y + I4.y * J7.x;
209 tmp_im += I5.x * J8.y + I5.y * J8.x;
210 accum_im[ 6*blockDim.x] =
tmp_im;
213 tmp_re = I3.x * J9.x - I3.y * J9.y;
214 tmp_re += I4.x * J10.x - I4.y * J10.y;
215 tmp_re += I5.x * J11.x - I5.y * J11.y;
216 accum_re[ 7*blockDim.x] =
tmp_re;
218 tmp_im = I3.x * J9.y + I3.y * J9.x;
219 tmp_im += I4.x * J10.y + I4.y * J10.x;
220 tmp_im += I5.x * J11.y + I5.y * J11.x;
221 accum_im[ 7*blockDim.x] =
tmp_im;
224 tmp_re = I6.x * J0.x - I6.y * J0.y;
225 tmp_re += I7.x * J1.x - I7.y * J1.y;
226 tmp_re += I8.x * J2.x - I8.y * J2.y;
227 accum_re[ 8*blockDim.x] =
tmp_re;
229 tmp_im = I6.x * J0.y + I6.y * J0.x;
230 tmp_im += I7.x * J1.y + I7.y * J1.x;
231 tmp_im += I8.x * J2.y + I8.y * J2.x;
232 accum_im[ 8*blockDim.x] =
tmp_im;
235 tmp_re = I6.x * J3.x - I6.y * J3.y;
236 tmp_re += I7.x * J4.x - I7.y * J4.y;
237 tmp_re += I8.x * J5.x - I8.y * J5.y;
238 accum_re[ 9*blockDim.x] =
tmp_re;
240 tmp_im = I6.x * J3.y + I6.y * J3.x;
241 tmp_im += I7.x * J4.y + I7.y * J4.x;
242 tmp_im += I8.x * J5.y + I8.y * J5.x;
243 accum_im[ 9*blockDim.x] =
tmp_im;
246 tmp_re = I6.x * J6.x - I6.y * J6.y;
247 tmp_re += I7.x * J7.x - I7.y * J7.y;
248 tmp_re += I8.x * J8.x - I8.y * J8.y;
249 accum_re[10*blockDim.x] =
tmp_re;
251 tmp_im = I6.x * J6.y + I6.y * J6.x;
252 tmp_im += I7.x * J7.y + I7.y * J7.x;
253 tmp_im += I8.x * J8.y + I8.y * J8.x;
254 accum_im[10*blockDim.x] =
tmp_im;
257 tmp_re = I6.x * J9.x - I6.y * J9.y;
258 tmp_re += I7.x * J10.x - I7.y * J10.y;
259 tmp_re += I8.x * J11.x - I8.y * J11.y;
260 accum_re[11*blockDim.x] =
tmp_re;
262 tmp_im = I6.x * J9.y + I6.y * J9.x;
263 tmp_im += I7.x * J10.y + I7.y * J10.x;
264 tmp_im += I8.x * J11.y + I8.y * J11.x;
265 accum_im[11*blockDim.x] =
tmp_im;
268 tmp_re = I9.x * J0.x - I9.y * J0.y;
269 tmp_re += I10.x * J1.x - I10.y * J1.y;
270 tmp_re += I11.x * J2.x - I11.y * J2.y;
271 accum_re[12*blockDim.x] =
tmp_re;
273 tmp_im = I9.x * J0.y + I9.y * J0.x;
274 tmp_im += I10.x * J1.y + I10.y * J1.x;
275 tmp_im += I11.x * J2.y + I11.y * J2.x;
276 accum_im[12*blockDim.x] =
tmp_im;
279 tmp_re = I9.x * J3.x - I9.y * J3.y;
280 tmp_re += I10.x * J4.x - I10.y * J4.y;
281 tmp_re += I11.x * J5.x - I11.y * J5.y;
282 accum_re[13*blockDim.x] =
tmp_re;
284 tmp_im = I9.x * J3.y + I9.y * J3.x;
285 tmp_im += I10.x * J4.y + I10.y * J4.x;
286 tmp_im += I11.x * J5.y + I11.y * J5.x;
287 accum_im[13*blockDim.x] =
tmp_im;
290 tmp_re = I9.x * J6.x - I9.y * J6.y;
291 tmp_re += I10.x * J7.x - I10.y * J7.y;
292 tmp_re += I11.x * J8.x - I11.y * J8.y;
293 accum_re[14*blockDim.x] =
tmp_re;
295 tmp_im = I9.x * J6.y + I9.y * J6.x;
296 tmp_im += I10.x * J7.y + I10.y * J7.x;
297 tmp_im += I11.x * J8.y + I11.y * J8.x;
298 accum_im[14*blockDim.x] =
tmp_im;
301 tmp_re = I9.x * J9.x - I9.y * J9.y;
302 tmp_re += I10.x * J10.x - I10.y * J10.y;
303 tmp_re += I11.x * J11.x - I11.y * J11.y;
304 accum_re[15*blockDim.x] =
tmp_re;
306 tmp_im = I9.x * J9.y + I9.y * J9.x;
307 tmp_im += I10.x * J10.y + I10.y * J10.x;
308 tmp_im += I11.x * J11.y + I11.y * J11.x;
309 accum_im[15*blockDim.x] =
tmp_im;
316 out[outId + 0 *param.
threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
317 out[outId + 1 *param.
threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
318 out[outId + 2 *param.
threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
319 out[outId + 3 *param.
threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
320 out[outId + 4 *param.
threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
321 out[outId + 5 *param.
threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
322 out[outId + 6 *param.
threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
323 out[outId + 7 *param.
threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
324 out[outId + 8 *param.
threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
325 out[outId + 9 *param.
threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
326 out[outId + 10*param.
threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
327 out[outId + 11*param.
threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
328 out[outId + 12*param.
threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
329 out[outId + 13*param.
threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
330 out[outId + 14*param.
threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
331 out[outId + 15*param.
threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
340 int sid = blockIdx.x*blockDim.x + threadIdx.x;
342 int inId = sid +
Vsh*Tslice;
344 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
349 volatile double2
tmp;
350 extern __shared__
double sm[];
352 volatile double *accum_re = sm + threadIdx.x;
358 auxCoord1 = eutId /
X1;
359 xCoord1 = eutId - auxCoord1 *
X1;
360 auxCoord2 = auxCoord1 /
X2;
361 xCoord2 = auxCoord1 - auxCoord2 *
X2;
362 xCoord4 = auxCoord2 /
X3;
363 xCoord3 = auxCoord2 - xCoord4 *
X3;
365 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
366 xCoord1 += auxCoord1;
367 outId = xCoord1 + X1*(xCoord2 + X2*xCoord3);
389 tmp_re = I0.x * J0.x - I0.y * J0.y;
390 tmp_re += I1.x * J1.x - I1.y * J1.y;
391 tmp_re += I2.x * J2.x - I2.y * J2.y;
392 accum_re[0*blockDim.x] =
tmp_re;
394 tmp_im = I0.x * J0.y + I0.y * J0.x;
395 tmp_im += I1.x * J1.y + I1.y * J1.x;
396 tmp_im += I2.x * J2.y + I2.y * J2.x;
397 accum_im[0*blockDim.x] =
tmp_im;
400 tmp_re = I0.x * J3.x - I0.y * J3.y;
401 tmp_re += I1.x * J4.x - I1.y * J4.y;
402 tmp_re += I2.x * J5.x - I2.y * J5.y;
403 accum_re[1*blockDim.x] =
tmp_re;
405 tmp_im = I0.x * J3.y + I0.y * J3.x;
406 tmp_im += I1.x * J4.y + I1.y * J4.x;
407 tmp_im += I2.x * J5.y + I2.y * J5.x;
408 accum_im[1*blockDim.x] =
tmp_im;
411 tmp_re = I0.x * J6.x - I0.y * J6.y;
412 tmp_re += I1.x * J7.x - I1.y * J7.y;
413 tmp_re += I2.x * J8.x - I2.y * J8.y;
414 accum_re[2*blockDim.x] =
tmp_re;
416 tmp_im = I0.x * J6.y + I0.y * J6.x;
417 tmp_im += I1.x * J7.y + I1.y * J7.x;
418 tmp_im += I2.x * J8.y + I2.y * J8.x;
419 accum_im[2*blockDim.x] =
tmp_im;
422 tmp_re = I0.x * J9.x - I0.y * J9.y;
423 tmp_re += I1.x * J10.x - I1.y * J10.y;
424 tmp_re += I2.x * J11.x - I2.y * J11.y;
425 accum_re[3*blockDim.x] =
tmp_re;
427 tmp_im = I0.x * J9.y + I0.y * J9.x;
428 tmp_im += I1.x * J10.y + I1.y * J10.x;
429 tmp_im += I2.x * J11.y + I2.y * J11.x;
430 accum_im[3*blockDim.x] =
tmp_im;
433 tmp_re = I3.x * J0.x - I3.y * J0.y;
434 tmp_re += I4.x * J1.x - I4.y * J1.y;
435 tmp_re += I5.x * J2.x - I5.y * J2.y;
436 accum_re[ 4*blockDim.x] =
tmp_re;
438 tmp_im = I3.x * J0.y + I3.y * J0.x;
439 tmp_im += I4.x * J1.y + I4.y * J1.x;
440 tmp_im += I5.x * J2.y + I5.y * J2.x;
441 accum_im[ 4*blockDim.x] =
tmp_im;
444 tmp_re = I3.x * J3.x - I3.y * J3.y;
445 tmp_re += I4.x * J4.x - I4.y * J4.y;
446 tmp_re += I5.x * J5.x - I5.y * J5.y;
447 accum_re[ 5*blockDim.x] =
tmp_re;
449 tmp_im = I3.x * J3.y + I3.y * J3.x;
450 tmp_im += I4.x * J4.y + I4.y * J4.x;
451 tmp_im += I5.x * J5.y + I5.y * J5.x;
452 accum_im[ 5*blockDim.x] =
tmp_im;
455 tmp_re = I3.x * J6.x - I3.y * J6.y;
456 tmp_re += I4.x * J7.x - I4.y * J7.y;
457 tmp_re += I5.x * J8.x - I5.y * J8.y;
458 accum_re[ 6*blockDim.x] =
tmp_re;
460 tmp_im = I3.x * J6.y + I3.y * J6.x;
461 tmp_im += I4.x * J7.y + I4.y * J7.x;
462 tmp_im += I5.x * J8.y + I5.y * J8.x;
463 accum_im[ 6*blockDim.x] =
tmp_im;
466 tmp_re = I3.x * J9.x - I3.y * J9.y;
467 tmp_re += I4.x * J10.x - I4.y * J10.y;
468 tmp_re += I5.x * J11.x - I5.y * J11.y;
469 accum_re[ 7*blockDim.x] =
tmp_re;
471 tmp_im = I3.x * J9.y + I3.y * J9.x;
472 tmp_im += I4.x * J10.y + I4.y * J10.x;
473 tmp_im += I5.x * J11.y + I5.y * J11.x;
474 accum_im[ 7*blockDim.x] =
tmp_im;
477 tmp_re = I6.x * J0.x - I6.y * J0.y;
478 tmp_re += I7.x * J1.x - I7.y * J1.y;
479 tmp_re += I8.x * J2.x - I8.y * J2.y;
480 accum_re[ 8*blockDim.x] =
tmp_re;
482 tmp_im = I6.x * J0.y + I6.y * J0.x;
483 tmp_im += I7.x * J1.y + I7.y * J1.x;
484 tmp_im += I8.x * J2.y + I8.y * J2.x;
485 accum_im[ 8*blockDim.x] =
tmp_im;
488 tmp_re = I6.x * J3.x - I6.y * J3.y;
489 tmp_re += I7.x * J4.x - I7.y * J4.y;
490 tmp_re += I8.x * J5.x - I8.y * J5.y;
491 accum_re[ 9*blockDim.x] =
tmp_re;
493 tmp_im = I6.x * J3.y + I6.y * J3.x;
494 tmp_im += I7.x * J4.y + I7.y * J4.x;
495 tmp_im += I8.x * J5.y + I8.y * J5.x;
496 accum_im[ 9*blockDim.x] =
tmp_im;
499 tmp_re = I6.x * J6.x - I6.y * J6.y;
500 tmp_re += I7.x * J7.x - I7.y * J7.y;
501 tmp_re += I8.x * J8.x - I8.y * J8.y;
502 accum_re[10*blockDim.x] =
tmp_re;
504 tmp_im = I6.x * J6.y + I6.y * J6.x;
505 tmp_im += I7.x * J7.y + I7.y * J7.x;
506 tmp_im += I8.x * J8.y + I8.y * J8.x;
507 accum_im[10*blockDim.x] =
tmp_im;
510 tmp_re = I6.x * J9.x - I6.y * J9.y;
511 tmp_re += I7.x * J10.x - I7.y * J10.y;
512 tmp_re += I8.x * J11.x - I8.y * J11.y;
513 accum_re[11*blockDim.x] =
tmp_re;
515 tmp_im = I6.x * J9.y + I6.y * J9.x;
516 tmp_im += I7.x * J10.y + I7.y * J10.x;
517 tmp_im += I8.x * J11.y + I8.y * J11.x;
518 accum_im[11*blockDim.x] =
tmp_im;
521 tmp_re = I9.x * J0.x - I9.y * J0.y;
522 tmp_re += I10.x * J1.x - I10.y * J1.y;
523 tmp_re += I11.x * J2.x - I11.y * J2.y;
524 accum_re[12*blockDim.x] =
tmp_re;
526 tmp_im = I9.x * J0.y + I9.y * J0.x;
527 tmp_im += I10.x * J1.y + I10.y * J1.x;
528 tmp_im += I11.x * J2.y + I11.y * J2.x;
529 accum_im[12*blockDim.x] =
tmp_im;
532 tmp_re = I9.x * J3.x - I9.y * J3.y;
533 tmp_re += I10.x * J4.x - I10.y * J4.y;
534 tmp_re += I11.x * J5.x - I11.y * J5.y;
535 accum_re[13*blockDim.x] =
tmp_re;
537 tmp_im = I9.x * J3.y + I9.y * J3.x;
538 tmp_im += I10.x * J4.y + I10.y * J4.x;
539 tmp_im += I11.x * J5.y + I11.y * J5.x;
540 accum_im[13*blockDim.x] =
tmp_im;
543 tmp_re = I9.x * J6.x - I9.y * J6.y;
544 tmp_re += I10.x * J7.x - I10.y * J7.y;
545 tmp_re += I11.x * J8.x - I11.y * J8.y;
546 accum_re[14*blockDim.x] =
tmp_re;
548 tmp_im = I9.x * J6.y + I9.y * J6.x;
549 tmp_im += I10.x * J7.y + I10.y * J7.x;
550 tmp_im += I11.x * J8.y + I11.y * J8.x;
551 accum_im[14*blockDim.x] =
tmp_im;
554 tmp_re = I9.x * J9.x - I9.y * J9.y;
555 tmp_re += I10.x * J10.x - I10.y * J10.y;
556 tmp_re += I11.x * J11.x - I11.y * J11.y;
557 accum_re[15*blockDim.x] =
tmp_re;
559 tmp_im = I9.x * J9.y + I9.y * J9.x;
560 tmp_im += I10.x * J10.y + I10.y * J10.x;
561 tmp_im += I11.x * J11.y + I11.y * J11.x;
562 accum_im[15*blockDim.x] =
tmp_im;
569 out[outId + 0 *param.
threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
570 out[outId + 1 *param.
threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
571 out[outId + 2 *param.
threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
572 out[outId + 3 *param.
threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
573 out[outId + 4 *param.
threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
574 out[outId + 5 *param.
threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
575 out[outId + 6 *param.
threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
576 out[outId + 7 *param.
threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
577 out[outId + 8 *param.
threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
578 out[outId + 9 *param.
threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
579 out[outId + 10*param.
threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
580 out[outId + 11*param.
threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
581 out[outId + 12*param.
threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
582 out[outId + 13*param.
threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
583 out[outId + 14*param.
threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
584 out[outId + 15*param.
threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
589 __global__
void contractKernel (double2 *out, double2 *in1, double2 *in2,
int myStride,
const int Parity,
const DslashParam param)
591 int sid = blockIdx.x*blockDim.x + threadIdx.x;
593 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
598 volatile double2
tmp;
599 extern __shared__
double sm[];
601 volatile double *accum_re = sm + threadIdx.x;
605 auxCoord1 = eutId /
X1;
606 xCoord1 = eutId - auxCoord1 *
X1;
607 auxCoord2 = auxCoord1 /
X2;
608 xCoord2 = auxCoord1 - auxCoord2 *
X2;
609 xCoord4 = auxCoord2 /
X3;
610 xCoord3 = auxCoord2 - xCoord4 *
X3;
612 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
613 xCoord1 += auxCoord1;
614 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
636 tmp_re = I0.x * J0.x - I0.y * J0.y;
637 tmp_re += I1.x * J1.x - I1.y * J1.y;
638 tmp_re += I2.x * J2.x - I2.y * J2.y;
639 accum_re[0*blockDim.x] =
tmp_re;
641 tmp_im = I0.x * J0.y + I0.y * J0.x;
642 tmp_im += I1.x * J1.y + I1.y * J1.x;
643 tmp_im += I2.x * J2.y + I2.y * J2.x;
644 accum_im[0*blockDim.x] =
tmp_im;
647 tmp_re = I0.x * J3.x - I0.y * J3.y;
648 tmp_re += I1.x * J4.x - I1.y * J4.y;
649 tmp_re += I2.x * J5.x - I2.y * J5.y;
650 accum_re[1*blockDim.x] =
tmp_re;
652 tmp_im = I0.x * J3.y + I0.y * J3.x;
653 tmp_im += I1.x * J4.y + I1.y * J4.x;
654 tmp_im += I2.x * J5.y + I2.y * J5.x;
655 accum_im[1*blockDim.x] =
tmp_im;
658 tmp_re = I0.x * J6.x - I0.y * J6.y;
659 tmp_re += I1.x * J7.x - I1.y * J7.y;
660 tmp_re += I2.x * J8.x - I2.y * J8.y;
661 accum_re[2*blockDim.x] =
tmp_re;
663 tmp_im = I0.x * J6.y + I0.y * J6.x;
664 tmp_im += I1.x * J7.y + I1.y * J7.x;
665 tmp_im += I2.x * J8.y + I2.y * J8.x;
666 accum_im[2*blockDim.x] =
tmp_im;
669 tmp_re = I0.x * J9.x - I0.y * J9.y;
670 tmp_re += I1.x * J10.x - I1.y * J10.y;
671 tmp_re += I2.x * J11.x - I2.y * J11.y;
672 accum_re[3*blockDim.x] =
tmp_re;
674 tmp_im = I0.x * J9.y + I0.y * J9.x;
675 tmp_im += I1.x * J10.y + I1.y * J10.x;
676 tmp_im += I2.x * J11.y + I2.y * J11.x;
677 accum_im[3*blockDim.x] =
tmp_im;
680 tmp_re = I3.x * J0.x - I3.y * J0.y;
681 tmp_re += I4.x * J1.x - I4.y * J1.y;
682 tmp_re += I5.x * J2.x - I5.y * J2.y;
683 accum_re[ 4*blockDim.x] =
tmp_re;
685 tmp_im = I3.x * J0.y + I3.y * J0.x;
686 tmp_im += I4.x * J1.y + I4.y * J1.x;
687 tmp_im += I5.x * J2.y + I5.y * J2.x;
688 accum_im[ 4*blockDim.x] =
tmp_im;
691 tmp_re = I3.x * J3.x - I3.y * J3.y;
692 tmp_re += I4.x * J4.x - I4.y * J4.y;
693 tmp_re += I5.x * J5.x - I5.y * J5.y;
694 accum_re[ 5*blockDim.x] =
tmp_re;
696 tmp_im = I3.x * J3.y + I3.y * J3.x;
697 tmp_im += I4.x * J4.y + I4.y * J4.x;
698 tmp_im += I5.x * J5.y + I5.y * J5.x;
699 accum_im[ 5*blockDim.x] =
tmp_im;
702 tmp_re = I3.x * J6.x - I3.y * J6.y;
703 tmp_re += I4.x * J7.x - I4.y * J7.y;
704 tmp_re += I5.x * J8.x - I5.y * J8.y;
705 accum_re[ 6*blockDim.x] =
tmp_re;
707 tmp_im = I3.x * J6.y + I3.y * J6.x;
708 tmp_im += I4.x * J7.y + I4.y * J7.x;
709 tmp_im += I5.x * J8.y + I5.y * J8.x;
710 accum_im[ 6*blockDim.x] =
tmp_im;
713 tmp_re = I3.x * J9.x - I3.y * J9.y;
714 tmp_re += I4.x * J10.x - I4.y * J10.y;
715 tmp_re += I5.x * J11.x - I5.y * J11.y;
716 accum_re[ 7*blockDim.x] =
tmp_re;
718 tmp_im = I3.x * J9.y + I3.y * J9.x;
719 tmp_im += I4.x * J10.y + I4.y * J10.x;
720 tmp_im += I5.x * J11.y + I5.y * J11.x;
721 accum_im[ 7*blockDim.x] =
tmp_im;
724 tmp_re = I6.x * J0.x - I6.y * J0.y;
725 tmp_re += I7.x * J1.x - I7.y * J1.y;
726 tmp_re += I8.x * J2.x - I8.y * J2.y;
727 accum_re[ 8*blockDim.x] =
tmp_re;
729 tmp_im = I6.x * J0.y + I6.y * J0.x;
730 tmp_im += I7.x * J1.y + I7.y * J1.x;
731 tmp_im += I8.x * J2.y + I8.y * J2.x;
732 accum_im[ 8*blockDim.x] =
tmp_im;
735 tmp_re = I6.x * J3.x - I6.y * J3.y;
736 tmp_re += I7.x * J4.x - I7.y * J4.y;
737 tmp_re += I8.x * J5.x - I8.y * J5.y;
738 accum_re[ 9*blockDim.x] =
tmp_re;
740 tmp_im = I6.x * J3.y + I6.y * J3.x;
741 tmp_im += I7.x * J4.y + I7.y * J4.x;
742 tmp_im += I8.x * J5.y + I8.y * J5.x;
743 accum_im[ 9*blockDim.x] =
tmp_im;
746 tmp_re = I6.x * J6.x - I6.y * J6.y;
747 tmp_re += I7.x * J7.x - I7.y * J7.y;
748 tmp_re += I8.x * J8.x - I8.y * J8.y;
749 accum_re[10*blockDim.x] =
tmp_re;
751 tmp_im = I6.x * J6.y + I6.y * J6.x;
752 tmp_im += I7.x * J7.y + I7.y * J7.x;
753 tmp_im += I8.x * J8.y + I8.y * J8.x;
754 accum_im[10*blockDim.x] =
tmp_im;
757 tmp_re = I6.x * J9.x - I6.y * J9.y;
758 tmp_re += I7.x * J10.x - I7.y * J10.y;
759 tmp_re += I8.x * J11.x - I8.y * J11.y;
760 accum_re[11*blockDim.x] =
tmp_re;
762 tmp_im = I6.x * J9.y + I6.y * J9.x;
763 tmp_im += I7.x * J10.y + I7.y * J10.x;
764 tmp_im += I8.x * J11.y + I8.y * J11.x;
765 accum_im[11*blockDim.x] =
tmp_im;
768 tmp_re = I9.x * J0.x - I9.y * J0.y;
769 tmp_re += I10.x * J1.x - I10.y * J1.y;
770 tmp_re += I11.x * J2.x - I11.y * J2.y;
771 accum_re[12*blockDim.x] =
tmp_re;
773 tmp_im = I9.x * J0.y + I9.y * J0.x;
774 tmp_im += I10.x * J1.y + I10.y * J1.x;
775 tmp_im += I11.x * J2.y + I11.y * J2.x;
776 accum_im[12*blockDim.x] =
tmp_im;
779 tmp_re = I9.x * J3.x - I9.y * J3.y;
780 tmp_re += I10.x * J4.x - I10.y * J4.y;
781 tmp_re += I11.x * J5.x - I11.y * J5.y;
782 accum_re[13*blockDim.x] =
tmp_re;
784 tmp_im = I9.x * J3.y + I9.y * J3.x;
785 tmp_im += I10.x * J4.y + I10.y * J4.x;
786 tmp_im += I11.x * J5.y + I11.y * J5.x;
787 accum_im[13*blockDim.x] =
tmp_im;
790 tmp_re = I9.x * J6.x - I9.y * J6.y;
791 tmp_re += I10.x * J7.x - I10.y * J7.y;
792 tmp_re += I11.x * J8.x - I11.y * J8.y;
793 accum_re[14*blockDim.x] =
tmp_re;
795 tmp_im = I9.x * J6.y + I9.y * J6.x;
796 tmp_im += I10.x * J7.y + I10.y * J7.x;
797 tmp_im += I11.x * J8.y + I11.y * J8.x;
798 accum_im[14*blockDim.x] =
tmp_im;
801 tmp_re = I9.x * J9.x - I9.y * J9.y;
802 tmp_re += I10.x * J10.x - I10.y * J10.y;
803 tmp_re += I11.x * J11.x - I11.y * J11.y;
804 accum_re[15*blockDim.x] =
tmp_re;
806 tmp_im = I9.x * J9.y + I9.y * J9.x;
807 tmp_im += I10.x * J10.y + I10.y * J10.x;
808 tmp_im += I11.x * J11.y + I11.y * J11.x;
809 accum_im[15*blockDim.x] =
tmp_im;
813 out[outId + 0 *param.
threads*2] = make_double2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
814 out[outId + 1 *param.
threads*2] = make_double2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
815 out[outId + 2 *param.
threads*2] = make_double2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
816 out[outId + 3 *param.
threads*2] = make_double2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
817 out[outId + 4 *param.
threads*2] = make_double2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
818 out[outId + 5 *param.
threads*2] = make_double2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
819 out[outId + 6 *param.
threads*2] = make_double2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
820 out[outId + 7 *param.
threads*2] = make_double2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
821 out[outId + 8 *param.
threads*2] = make_double2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
822 out[outId + 9 *param.
threads*2] = make_double2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
823 out[outId + 10*param.
threads*2] = make_double2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
824 out[outId + 11*param.
threads*2] = make_double2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
825 out[outId + 12*param.
threads*2] = make_double2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
826 out[outId + 13*param.
threads*2] = make_double2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
827 out[outId + 14*param.
threads*2] = make_double2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
828 out[outId + 15*param.
threads*2] = make_double2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
837 #undef READ_INTERMEDIATE_SPINOR
841 #endif // (__CUDA_ARCH__ >= 130)
844 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
845 float4 I0 = spinor[sp_idx + 0*(stride)]; \
846 float4 I1 = spinor[sp_idx + 1*(stride)]; \
847 float4 I2 = spinor[sp_idx + 2*(stride)]; \
848 float4 I3 = spinor[sp_idx + 3*(stride)]; \
849 float4 I4 = spinor[sp_idx + 4*(stride)]; \
850 float4 I5 = spinor[sp_idx + 5*(stride)];
853 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
854 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
855 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
856 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
857 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
858 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
859 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
861 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
862 float4 J0 = spinor[sp_idx + 0*(stride)]; \
863 float4 J1 = spinor[sp_idx + 1*(stride)]; \
864 float4 J2 = spinor[sp_idx + 2*(stride)]; \
865 float4 J3 = spinor[sp_idx + 3*(stride)]; \
866 float4 J4 = spinor[sp_idx + 4*(stride)]; \
867 float4 J5 = spinor[sp_idx + 5*(stride)];
869 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
870 float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
871 float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
872 float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
873 float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
874 float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
875 float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
878 #ifdef DIRECT_ACCESS_WILSON_SPINOR
879 #define READ_SPINOR READ_SPINOR_SINGLE
882 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
884 #ifdef USE_TEXTURE_OBJECTS
885 #define SPINORTEX param.inTex
887 #define SPINORTEX spinorTexSingle
888 #endif // USE_TEXTURE_OBJECTS
891 #ifdef DIRECT_ACCESS_WILSON_INTER
892 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
895 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
897 #ifdef USE_TEXTURE_OBJECTS
898 #define INTERTEX param.outTex
900 #define INTERTEX interTexSingle
901 #endif // USE_TEXTURE_OBJECTS
908 int sid = blockIdx.x*blockDim.x + threadIdx.x;
910 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
916 extern __shared__
float sms[];
918 volatile float *accum_re = sms + threadIdx.x;
922 auxCoord1 = eutId /
X1;
923 xCoord1 = eutId - auxCoord1 *
X1;
924 auxCoord2 = auxCoord1 /
X2;
925 xCoord2 = auxCoord1 - auxCoord2 *
X2;
926 xCoord4 = auxCoord2 /
X3;
927 xCoord3 = auxCoord2 - xCoord4 *
X3;
929 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
930 xCoord1 += auxCoord1;
931 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
991 tmp_re = I0.x * J0.x - I0.y * J0.y;
992 tmp_re += I0.z * J0.z - I0.w * J0.w;
993 tmp_re += I1.x * J1.x - I1.y * J1.y;
994 accum_re[0*blockDim.x] =
tmp_re;
996 tmp_im = I0.x * J0.y + I0.y * J0.x;
997 tmp_im += I0.z * J0.w + I0.w * J0.z;
998 tmp_im += I1.x * J1.y + I1.y * J1.x;
999 accum_im[0*blockDim.x] =
tmp_im;
1002 tmp_re = I0.x * J1.z - I0.y * J1.w;
1003 tmp_re += I0.z * J2.x - I0.w * J2.y;
1004 tmp_re += I1.x * J2.z - I1.y * J2.w;
1005 accum_re[1*blockDim.x] =
tmp_re;
1007 tmp_im = I0.x * J1.w + I0.y * J1.z;
1008 tmp_im += I0.z * J2.y + I0.w * J2.x;
1009 tmp_im += I1.x * J2.w + I1.y * J2.z;
1010 accum_im[1*blockDim.x] =
tmp_im;
1013 tmp_re = I0.x * J3.x - I0.y * J3.y;
1014 tmp_re += I0.z * J3.z - I0.w * J3.w;
1015 tmp_re += I1.x * J4.x - I1.y * J4.y;
1016 accum_re[2*blockDim.x] =
tmp_re;
1018 tmp_im = I0.x * J3.y + I0.y * J3.x;
1019 tmp_im += I0.z * J3.w + I0.w * J3.z;
1020 tmp_im += I1.x * J4.y + I1.y * J4.x;
1021 accum_im[2*blockDim.x] =
tmp_im;
1024 tmp_re = I0.x * J4.z - I0.y * J4.w;
1025 tmp_re += I0.z * J5.x - I0.w * J5.x;
1026 tmp_re += I1.x * J5.z - I1.y * J5.w;
1028 accum_re[3*blockDim.x] =
tmp_re;
1030 tmp_im = I0.x * J4.w + I0.y * J4.z;
1031 tmp_im += I0.z * J5.y + I0.w * J5.y;
1032 tmp_im += I1.x * J5.w + I1.y * J5.z;
1033 accum_im[3*blockDim.x] =
tmp_im;
1036 tmp_re = I1.z * J0.x - I1.w * J0.y;
1037 tmp_re += I2.x * J0.z - I2.y * J0.w;
1038 tmp_re += I2.z * J1.x - I2.w * J1.y;
1039 accum_re[4*blockDim.x] =
tmp_re;
1041 tmp_im = I1.z * J0.y + I1.w * J0.x;
1042 tmp_im += I2.x * J0.w + I2.y * J0.z;
1043 tmp_im += I2.z * J1.y + I2.w * J1.x;
1044 accum_im[4*blockDim.x] =
tmp_im;
1047 tmp_re = I1.z * J1.z - I1.w * J1.w;
1048 tmp_re += I2.x * J2.x - I2.y * J2.y;
1049 tmp_re += I2.z * J2.z - I2.w * J2.w;
1050 accum_re[5*blockDim.x] =
tmp_re;
1052 tmp_im = I1.z * J1.w + I1.w * J1.z;
1053 tmp_im += I2.x * J2.y + I2.y * J2.x;
1054 tmp_im += I2.z * J2.w + I2.w * J2.z;
1055 accum_im[5*blockDim.x] =
tmp_im;
1058 tmp_re = I1.z * J3.x - I1.w * J3.y;
1059 tmp_re += I2.x * J3.z - I2.y * J3.w;
1060 tmp_re += I2.z * J4.x - I2.w * J4.y;
1061 accum_re[6*blockDim.x] =
tmp_re;
1063 tmp_im = I1.z * J3.y + I1.w * J3.x;
1064 tmp_im += I2.x * J3.w + I2.y * J3.z;
1065 tmp_im += I2.z * J4.y + I2.w * J4.x;
1066 accum_im[6*blockDim.x] =
tmp_im;
1069 tmp_re = I1.z * J4.z - I1.w * J4.w;
1070 tmp_re += I2.x * J5.x - I2.y * J5.y;
1071 tmp_re += I2.z * J5.z - I2.w * J5.w;
1072 accum_re[7*blockDim.x] =
tmp_re;
1074 tmp_im = I1.z * J4.w + I1.w * J4.z;
1075 tmp_im += I2.x * J5.y + I2.y * J5.x;
1076 tmp_im += I2.z * J5.w + I2.w * J5.z;
1077 accum_im[7*blockDim.x] =
tmp_im;
1080 tmp_re = I3.x * J0.x - I3.y * J0.y;
1081 tmp_re += I3.z * J0.z - I3.w * J0.w;
1082 tmp_re += I4.x * J1.x - I4.y * J1.y;
1083 accum_re[8*blockDim.x] =
tmp_re;
1085 tmp_im = I3.x * J0.y + I3.y * J0.x;
1086 tmp_im += I3.z * J0.w + I3.w * J0.z;
1087 tmp_im += I4.x * J1.y + I4.y * J1.x;
1088 accum_im[8*blockDim.x] =
tmp_im;
1091 tmp_re = I3.x * J1.z - I3.y * J1.w;
1092 tmp_re += I3.z * J2.x - I3.w * J2.y;
1093 tmp_re += I4.x * J2.z - I4.y * J2.w;
1094 accum_re[9*blockDim.x] =
tmp_re;
1096 tmp_im = I3.x * J1.w + I3.y * J1.z;
1097 tmp_im += I3.z * J2.y + I3.w * J2.x;
1098 tmp_im += I4.x * J2.w + I4.y * J2.z;
1099 accum_im[9*blockDim.x] =
tmp_im;
1102 tmp_re = I3.x * J3.x - I3.y * J3.y;
1103 tmp_re += I3.z * J3.z - I3.w * J3.w;
1104 tmp_re += I4.x * J4.x - I4.y * J4.y;
1105 accum_re[10*blockDim.x] =
tmp_re;
1107 tmp_im = I3.x * J3.y + I3.y * J3.x;
1108 tmp_im += I3.z * J3.w + I3.w * J3.z;
1109 tmp_im += I4.x * J4.y + I4.y * J4.x;
1110 accum_im[10*blockDim.x] =
tmp_im;
1113 tmp_re = I3.x * J4.z - I3.y * J4.w;
1114 tmp_re += I3.z * J5.x - I3.w * J5.y;
1115 tmp_re += I4.x * J5.z - I4.y * J5.w;
1116 accum_re[11*blockDim.x] =
tmp_re;
1118 tmp_im = I3.x * J4.w + I3.y * J4.z;
1119 tmp_im += I3.z * J5.y + I3.w * J5.x;
1120 tmp_im += I4.x * J5.w + I4.y * J5.z;
1121 accum_im[11*blockDim.x] =
tmp_im;
1124 tmp_re = I4.z * J0.x - I4.w * J0.y;
1125 tmp_re += I5.x * J0.z - I5.y * J0.w;
1126 tmp_re += I5.z * J1.x - I5.w * J1.y;
1127 accum_re[12*blockDim.x] =
tmp_re;
1129 tmp_im = I4.z * J0.y + I4.w * J0.x;
1130 tmp_im += I5.x * J0.w + I5.y * J0.z;
1131 tmp_im += I5.z * J1.y + I5.w * J1.x;
1132 accum_im[12*blockDim.x] =
tmp_im;
1135 tmp_re = I4.z * J1.z - I4.w * J1.w;
1136 tmp_re += I5.x * J2.x - I5.y * J2.y;
1137 tmp_re += I5.z * J2.z - I5.w * J2.w;
1138 accum_re[13*blockDim.x] =
tmp_re;
1140 tmp_im = I4.z * J1.w + I4.w * J1.z;
1141 tmp_im += I5.x * J2.y + I5.y * J2.x;
1142 tmp_im += I5.z * J2.w + I5.w * J2.z;
1143 accum_im[13*blockDim.x] =
tmp_im;
1146 tmp_re = I4.z * J3.x - I4.w * J3.y;
1147 tmp_re += I5.x * J3.z - I5.y * J3.w;
1148 tmp_re += I5.z * J4.x - I5.w * J4.y;
1149 accum_re[14*blockDim.x] =
tmp_re;
1151 tmp_im = I4.z * J3.y + I4.w * J3.x;
1152 tmp_im += I5.x * J3.w + I5.y * J3.z;
1153 tmp_im += I5.z * J4.y + I5.w * J4.x;
1154 accum_im[14*blockDim.x] =
tmp_im;
1157 tmp_re = I4.z * J4.z - I4.w * J4.w;
1158 tmp_re += I5.x * J5.x - I5.y * J5.y;
1159 tmp_re += I5.z * J5.z - I5.w * J5.w;
1160 accum_re[15*blockDim.x] =
tmp_re;
1162 tmp_im = I4.z * J4.w + I4.w * J4.z;
1163 tmp_im += I5.x * J5.y + I5.y * J5.x;
1164 tmp_im += I5.z * J5.w + I5.w * J5.z;
1165 accum_im[15*blockDim.x] =
tmp_im;
1174 out[outId + 0 *param.
threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1175 out[outId + 1 *param.
threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1176 out[outId + 2 *param.
threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1177 out[outId + 3 *param.
threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1178 out[outId + 4 *param.
threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1179 out[outId + 5 *param.
threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1180 out[outId + 6 *param.
threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1181 out[outId + 7 *param.
threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1182 out[outId + 8 *param.
threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1183 out[outId + 9 *param.
threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1184 out[outId + 10*param.
threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1185 out[outId + 11*param.
threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1186 out[outId + 12*param.
threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1187 out[outId + 13*param.
threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1188 out[outId + 14*param.
threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1189 out[outId + 15*param.
threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1198 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1200 int inId = sid +
Vsh*Tslice;
1202 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1207 volatile float2
tmp;
1208 extern __shared__
float sms[];
1210 volatile float *accum_re = sms + threadIdx.x;
1216 auxCoord1 = eutId /
X1;
1217 xCoord1 = eutId - auxCoord1 *
X1;
1218 auxCoord2 = auxCoord1 /
X2;
1219 xCoord2 = auxCoord1 - auxCoord2 *
X2;
1220 xCoord4 = auxCoord2 /
X3;
1225 xCoord3 = auxCoord2 - xCoord4 *
X3;
1227 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1228 xCoord1 += auxCoord1;
1229 outId = xCoord1 + X1*(xCoord2 + X2*xCoord3);
1253 tmp_re = I0.x * J0.x - I0.y * J0.y;
1254 tmp_re += I0.z * J0.z - I0.w * J0.w;
1255 tmp_re += I1.x * J1.x - I1.y * J1.y;
1256 accum_re[0*blockDim.x] =
tmp_re;
1258 tmp_im = I0.x * J0.y + I0.y * J0.x;
1259 tmp_im += I0.z * J0.w + I0.w * J0.z;
1260 tmp_im += I1.x * J1.y + I1.y * J1.x;
1261 accum_im[0*blockDim.x] =
tmp_im;
1264 tmp_re = I0.x * J1.z - I0.y * J1.w;
1265 tmp_re += I0.z * J2.x - I0.w * J2.y;
1266 tmp_re += I1.x * J2.z - I1.y * J2.w;
1267 accum_re[1*blockDim.x] =
tmp_re;
1269 tmp_im = I0.x * J1.w + I0.y * J1.z;
1270 tmp_im += I0.z * J2.y + I0.w * J2.x;
1271 tmp_im += I1.x * J2.w + I1.y * J2.z;
1272 accum_im[1*blockDim.x] =
tmp_im;
1275 tmp_re = I0.x * J3.x - I0.y * J3.y;
1276 tmp_re += I0.z * J3.z - I0.w * J3.w;
1277 tmp_re += I1.x * J4.x - I1.y * J4.y;
1278 accum_re[2*blockDim.x] =
tmp_re;
1280 tmp_im = I0.x * J3.y + I0.y * J3.x;
1281 tmp_im += I0.z * J3.w + I0.w * J3.z;
1282 tmp_im += I1.x * J4.y + I1.y * J4.x;
1283 accum_im[2*blockDim.x] =
tmp_im;
1286 tmp_re = I0.x * J4.z - I0.y * J4.w;
1287 tmp_re += I0.z * J5.x - I0.w * J5.x;
1288 tmp_re += I1.x * J5.z - I1.y * J5.w;
1290 accum_re[3*blockDim.x] =
tmp_re;
1292 tmp_im = I0.x * J4.w + I0.y * J4.z;
1293 tmp_im += I0.z * J5.y + I0.w * J5.y;
1294 tmp_im += I1.x * J5.w + I1.y * J5.z;
1295 accum_im[3*blockDim.x] =
tmp_im;
1298 tmp_re = I1.z * J0.x - I1.w * J0.y;
1299 tmp_re += I2.x * J0.z - I2.y * J0.w;
1300 tmp_re += I2.z * J1.x - I2.w * J1.y;
1301 accum_re[4*blockDim.x] =
tmp_re;
1303 tmp_im = I1.z * J0.y + I1.w * J0.x;
1304 tmp_im += I2.x * J0.w + I2.y * J0.z;
1305 tmp_im += I2.z * J1.y + I2.w * J1.x;
1306 accum_im[4*blockDim.x] =
tmp_im;
1309 tmp_re = I1.z * J1.z - I1.w * J1.w;
1310 tmp_re += I2.x * J2.x - I2.y * J2.y;
1311 tmp_re += I2.z * J2.z - I2.w * J2.w;
1312 accum_re[5*blockDim.x] =
tmp_re;
1314 tmp_im = I1.z * J1.w + I1.w * J1.z;
1315 tmp_im += I2.x * J2.y + I2.y * J2.x;
1316 tmp_im += I2.z * J2.w + I2.w * J2.z;
1317 accum_im[5*blockDim.x] =
tmp_im;
1320 tmp_re = I1.z * J3.x - I1.w * J3.y;
1321 tmp_re += I2.x * J3.z - I2.y * J3.w;
1322 tmp_re += I2.z * J4.x - I2.w * J4.y;
1323 accum_re[6*blockDim.x] =
tmp_re;
1325 tmp_im = I1.z * J3.y + I1.w * J3.x;
1326 tmp_im += I2.x * J3.w + I2.y * J3.z;
1327 tmp_im += I2.z * J4.y + I2.w * J4.x;
1328 accum_im[6*blockDim.x] =
tmp_im;
1331 tmp_re = I1.z * J4.z - I1.w * J4.w;
1332 tmp_re += I2.x * J5.x - I2.y * J5.y;
1333 tmp_re += I2.z * J5.z - I2.w * J5.w;
1334 accum_re[7*blockDim.x] =
tmp_re;
1336 tmp_im = I1.z * J4.w + I1.w * J4.z;
1337 tmp_im += I2.x * J5.y + I2.y * J5.x;
1338 tmp_im += I2.z * J5.w + I2.w * J5.z;
1339 accum_im[7*blockDim.x] =
tmp_im;
1342 tmp_re = I3.x * J0.x - I3.y * J0.y;
1343 tmp_re += I3.z * J0.z - I3.w * J0.w;
1344 tmp_re += I4.x * J1.x - I4.y * J1.y;
1345 accum_re[8*blockDim.x] =
tmp_re;
1347 tmp_im = I3.x * J0.y + I3.y * J0.x;
1348 tmp_im += I3.z * J0.w + I3.w * J0.z;
1349 tmp_im += I4.x * J1.y + I4.y * J1.x;
1350 accum_im[8*blockDim.x] =
tmp_im;
1353 tmp_re = I3.x * J1.z - I3.y * J1.w;
1354 tmp_re += I3.z * J2.x - I3.w * J2.y;
1355 tmp_re += I4.x * J2.z - I4.y * J2.w;
1356 accum_re[9*blockDim.x] =
tmp_re;
1358 tmp_im = I3.x * J1.w + I3.y * J1.z;
1359 tmp_im += I3.z * J2.y + I3.w * J2.x;
1360 tmp_im += I4.x * J2.w + I4.y * J2.z;
1361 accum_im[9*blockDim.x] =
tmp_im;
1364 tmp_re = I3.x * J3.x - I3.y * J3.y;
1365 tmp_re += I3.z * J3.z - I3.w * J3.w;
1366 tmp_re += I4.x * J4.x - I4.y * J4.y;
1367 accum_re[10*blockDim.x] =
tmp_re;
1369 tmp_im = I3.x * J3.y + I3.y * J3.x;
1370 tmp_im += I3.z * J3.w + I3.w * J3.z;
1371 tmp_im += I4.x * J4.y + I4.y * J4.x;
1372 accum_im[10*blockDim.x] =
tmp_im;
1375 tmp_re = I3.x * J4.z - I3.y * J4.w;
1376 tmp_re += I3.z * J5.x - I3.w * J5.y;
1377 tmp_re += I4.x * J5.z - I4.y * J5.w;
1378 accum_re[11*blockDim.x] =
tmp_re;
1380 tmp_im = I3.x * J4.w + I3.y * J4.z;
1381 tmp_im += I3.z * J5.y + I3.w * J5.x;
1382 tmp_im += I4.x * J5.w + I4.y * J5.z;
1383 accum_im[11*blockDim.x] =
tmp_im;
1386 tmp_re = I4.z * J0.x - I4.w * J0.y;
1387 tmp_re += I5.x * J0.z - I5.y * J0.w;
1388 tmp_re += I5.z * J1.x - I5.w * J1.y;
1389 accum_re[12*blockDim.x] =
tmp_re;
1391 tmp_im = I4.z * J0.y + I4.w * J0.x;
1392 tmp_im += I5.x * J0.w + I5.y * J0.z;
1393 tmp_im += I5.z * J1.y + I5.w * J1.x;
1394 accum_im[12*blockDim.x] =
tmp_im;
1397 tmp_re = I4.z * J1.z - I4.w * J1.w;
1398 tmp_re += I5.x * J2.x - I5.y * J2.y;
1399 tmp_re += I5.z * J2.z - I5.w * J2.w;
1400 accum_re[13*blockDim.x] =
tmp_re;
1402 tmp_im = I4.z * J1.w + I4.w * J1.z;
1403 tmp_im += I5.x * J2.y + I5.y * J2.x;
1404 tmp_im += I5.z * J2.w + I5.w * J2.z;
1405 accum_im[13*blockDim.x] =
tmp_im;
1408 tmp_re = I4.z * J3.x - I4.w * J3.y;
1409 tmp_re += I5.x * J3.z - I5.y * J3.w;
1410 tmp_re += I5.z * J4.x - I5.w * J4.y;
1411 accum_re[14*blockDim.x] =
tmp_re;
1413 tmp_im = I4.z * J3.y + I4.w * J3.x;
1414 tmp_im += I5.x * J3.w + I5.y * J3.z;
1415 tmp_im += I5.z * J4.y + I5.w * J4.x;
1416 accum_im[14*blockDim.x] =
tmp_im;
1419 tmp_re = I4.z * J4.z - I4.w * J4.w;
1420 tmp_re += I5.x * J5.x - I5.y * J5.y;
1421 tmp_re += I5.z * J5.z - I5.w * J5.w;
1422 accum_re[15*blockDim.x] =
tmp_re;
1424 tmp_im = I4.z * J4.w + I4.w * J4.z;
1425 tmp_im += I5.x * J5.y + I5.y * J5.x;
1426 tmp_im += I5.z * J5.w + I5.w * J5.z;
1427 accum_im[15*blockDim.x] =
tmp_im;
1434 out[outId + 0 *param.
threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1435 out[outId + 1 *param.
threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1436 out[outId + 2 *param.
threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1437 out[outId + 3 *param.
threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1438 out[outId + 4 *param.
threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1439 out[outId + 5 *param.
threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1440 out[outId + 6 *param.
threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1441 out[outId + 7 *param.
threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1442 out[outId + 8 *param.
threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1443 out[outId + 9 *param.
threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1444 out[outId + 10*param.
threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1445 out[outId + 11*param.
threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1446 out[outId + 12*param.
threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1447 out[outId + 13*param.
threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1448 out[outId + 14*param.
threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1449 out[outId + 15*param.
threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1456 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1458 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1463 volatile float2
tmp;
1464 extern __shared__
float sms[];
1466 volatile float *accum_re = sms + threadIdx.x;
1470 auxCoord1 = eutId /
X1;
1471 xCoord1 = eutId - auxCoord1 *
X1;
1472 auxCoord2 = auxCoord1 /
X2;
1473 xCoord2 = auxCoord1 - auxCoord2 *
X2;
1474 xCoord4 = auxCoord2 /
X3;
1475 xCoord3 = auxCoord2 - xCoord4 *
X3;
1477 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1478 xCoord1 += auxCoord1;
1479 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
1503 tmp_re = I0.x * J0.x - I0.y * J0.y;
1504 tmp_re += I0.z * J0.z - I0.w * J0.w;
1505 tmp_re += I1.x * J1.x - I1.y * J1.y;
1506 accum_re[0*blockDim.x] =
tmp_re;
1508 tmp_im = I0.x * J0.y + I0.y * J0.x;
1509 tmp_im += I0.z * J0.w + I0.w * J0.z;
1510 tmp_im += I1.x * J1.y + I1.y * J1.x;
1511 accum_im[0*blockDim.x] =
tmp_im;
1514 tmp_re = I0.x * J1.z - I0.y * J1.w;
1515 tmp_re += I0.z * J2.x - I0.w * J2.y;
1516 tmp_re += I1.x * J2.z - I1.y * J2.w;
1517 accum_re[1*blockDim.x] =
tmp_re;
1519 tmp_im = I0.x * J1.w + I0.y * J1.z;
1520 tmp_im += I0.z * J2.y + I0.w * J2.x;
1521 tmp_im += I1.x * J2.w + I1.y * J2.z;
1522 accum_im[1*blockDim.x] =
tmp_im;
1525 tmp_re = I0.x * J3.x - I0.y * J3.y;
1526 tmp_re += I0.z * J3.z - I0.w * J3.w;
1527 tmp_re += I1.x * J4.x - I1.y * J4.y;
1528 accum_re[2*blockDim.x] =
tmp_re;
1530 tmp_im = I0.x * J3.y + I0.y * J3.x;
1531 tmp_im += I0.z * J3.w + I0.w * J3.z;
1532 tmp_im += I1.x * J4.y + I1.y * J4.x;
1533 accum_im[2*blockDim.x] =
tmp_im;
1536 tmp_re = I0.x * J4.z - I0.y * J4.w;
1537 tmp_re += I0.z * J5.x - I0.w * J5.x;
1538 tmp_re += I1.x * J5.z - I1.y * J5.w;
1540 accum_re[3*blockDim.x] =
tmp_re;
1542 tmp_im = I0.x * J4.w + I0.y * J4.z;
1543 tmp_im += I0.z * J5.y + I0.w * J5.y;
1544 tmp_im += I1.x * J5.w + I1.y * J5.z;
1545 accum_im[3*blockDim.x] =
tmp_im;
1548 tmp_re = I1.z * J0.x - I1.w * J0.y;
1549 tmp_re += I2.x * J0.z - I2.y * J0.w;
1550 tmp_re += I2.z * J1.x - I2.w * J1.y;
1551 accum_re[4*blockDim.x] =
tmp_re;
1553 tmp_im = I1.z * J0.y + I1.w * J0.x;
1554 tmp_im += I2.x * J0.w + I2.y * J0.z;
1555 tmp_im += I2.z * J1.y + I2.w * J1.x;
1556 accum_im[4*blockDim.x] =
tmp_im;
1559 tmp_re = I1.z * J1.z - I1.w * J1.w;
1560 tmp_re += I2.x * J2.x - I2.y * J2.y;
1561 tmp_re += I2.z * J2.z - I2.w * J2.w;
1562 accum_re[5*blockDim.x] =
tmp_re;
1564 tmp_im = I1.z * J1.w + I1.w * J1.z;
1565 tmp_im += I2.x * J2.y + I2.y * J2.x;
1566 tmp_im += I2.z * J2.w + I2.w * J2.z;
1567 accum_im[5*blockDim.x] =
tmp_im;
1570 tmp_re = I1.z * J3.x - I1.w * J3.y;
1571 tmp_re += I2.x * J3.z - I2.y * J3.w;
1572 tmp_re += I2.z * J4.x - I2.w * J4.y;
1573 accum_re[6*blockDim.x] =
tmp_re;
1575 tmp_im = I1.z * J3.y + I1.w * J3.x;
1576 tmp_im += I2.x * J3.w + I2.y * J3.z;
1577 tmp_im += I2.z * J4.y + I2.w * J4.x;
1578 accum_im[6*blockDim.x] =
tmp_im;
1581 tmp_re = I1.z * J4.z - I1.w * J4.w;
1582 tmp_re += I2.x * J5.x - I2.y * J5.y;
1583 tmp_re += I2.z * J5.z - I2.w * J5.w;
1584 accum_re[7*blockDim.x] =
tmp_re;
1586 tmp_im = I1.z * J4.w + I1.w * J4.z;
1587 tmp_im += I2.x * J5.y + I2.y * J5.x;
1588 tmp_im += I2.z * J5.w + I2.w * J5.z;
1589 accum_im[7*blockDim.x] =
tmp_im;
1592 tmp_re = I3.x * J0.x - I3.y * J0.y;
1593 tmp_re += I3.z * J0.z - I3.w * J0.w;
1594 tmp_re += I4.x * J1.x - I4.y * J1.y;
1595 accum_re[8*blockDim.x] =
tmp_re;
1597 tmp_im = I3.x * J0.y + I3.y * J0.x;
1598 tmp_im += I3.z * J0.w + I3.w * J0.z;
1599 tmp_im += I4.x * J1.y + I4.y * J1.x;
1600 accum_im[8*blockDim.x] =
tmp_im;
1603 tmp_re = I3.x * J1.z - I3.y * J1.w;
1604 tmp_re += I3.z * J2.x - I3.w * J2.y;
1605 tmp_re += I4.x * J2.z - I4.y * J2.w;
1606 accum_re[9*blockDim.x] =
tmp_re;
1608 tmp_im = I3.x * J1.w + I3.y * J1.z;
1609 tmp_im += I3.z * J2.y + I3.w * J2.x;
1610 tmp_im += I4.x * J2.w + I4.y * J2.z;
1611 accum_im[9*blockDim.x] =
tmp_im;
1614 tmp_re = I3.x * J3.x - I3.y * J3.y;
1615 tmp_re += I3.z * J3.z - I3.w * J3.w;
1616 tmp_re += I4.x * J4.x - I4.y * J4.y;
1617 accum_re[10*blockDim.x] =
tmp_re;
1619 tmp_im = I3.x * J3.y + I3.y * J3.x;
1620 tmp_im += I3.z * J3.w + I3.w * J3.z;
1621 tmp_im += I4.x * J4.y + I4.y * J4.x;
1622 accum_im[10*blockDim.x] =
tmp_im;
1625 tmp_re = I3.x * J4.z - I3.y * J4.w;
1626 tmp_re += I3.z * J5.x - I3.w * J5.y;
1627 tmp_re += I4.x * J5.z - I4.y * J5.w;
1628 accum_re[11*blockDim.x] =
tmp_re;
1630 tmp_im = I3.x * J4.w + I3.y * J4.z;
1631 tmp_im += I3.z * J5.y + I3.w * J5.x;
1632 tmp_im += I4.x * J5.w + I4.y * J5.z;
1633 accum_im[11*blockDim.x] =
tmp_im;
1636 tmp_re = I4.z * J0.x - I4.w * J0.y;
1637 tmp_re += I5.x * J0.z - I5.y * J0.w;
1638 tmp_re += I5.z * J1.x - I5.w * J1.y;
1639 accum_re[12*blockDim.x] =
tmp_re;
1641 tmp_im = I4.z * J0.y + I4.w * J0.x;
1642 tmp_im += I5.x * J0.w + I5.y * J0.z;
1643 tmp_im += I5.z * J1.y + I5.w * J1.x;
1644 accum_im[12*blockDim.x] =
tmp_im;
1647 tmp_re = I4.z * J1.z - I4.w * J1.w;
1648 tmp_re += I5.x * J2.x - I5.y * J2.y;
1649 tmp_re += I5.z * J2.z - I5.w * J2.w;
1650 accum_re[13*blockDim.x] =
tmp_re;
1652 tmp_im = I4.z * J1.w + I4.w * J1.z;
1653 tmp_im += I5.x * J2.y + I5.y * J2.x;
1654 tmp_im += I5.z * J2.w + I5.w * J2.z;
1655 accum_im[13*blockDim.x] =
tmp_im;
1658 tmp_re = I4.z * J3.x - I4.w * J3.y;
1659 tmp_re += I5.x * J3.z - I5.y * J3.w;
1660 tmp_re += I5.z * J4.x - I5.w * J4.y;
1661 accum_re[14*blockDim.x] =
tmp_re;
1663 tmp_im = I4.z * J3.y + I4.w * J3.x;
1664 tmp_im += I5.x * J3.w + I5.y * J3.z;
1665 tmp_im += I5.z * J4.y + I5.w * J4.x;
1666 accum_im[14*blockDim.x] =
tmp_im;
1669 tmp_re = I4.z * J4.z - I4.w * J4.w;
1670 tmp_re += I5.x * J5.x - I5.y * J5.y;
1671 tmp_re += I5.z * J5.z - I5.w * J5.w;
1672 accum_re[15*blockDim.x] =
tmp_re;
1674 tmp_im = I4.z * J4.w + I4.w * J4.z;
1675 tmp_im += I5.x * J5.y + I5.y * J5.x;
1676 tmp_im += I5.z * J5.w + I5.w * J5.z;
1677 accum_im[15*blockDim.x] =
tmp_im;
1683 out[outId + 0 *param.
threads*2] = make_float2(accum_re[ 0*blockDim.x], accum_im[ 0*blockDim.x]);
1684 out[outId + 1 *param.
threads*2] = make_float2(accum_re[ 1*blockDim.x], accum_im[ 1*blockDim.x]);
1685 out[outId + 2 *param.
threads*2] = make_float2(accum_re[ 2*blockDim.x], accum_im[ 2*blockDim.x]);
1686 out[outId + 3 *param.
threads*2] = make_float2(accum_re[ 3*blockDim.x], accum_im[ 3*blockDim.x]);
1687 out[outId + 4 *param.
threads*2] = make_float2(accum_re[ 4*blockDim.x], accum_im[ 4*blockDim.x]);
1688 out[outId + 5 *param.
threads*2] = make_float2(accum_re[ 5*blockDim.x], accum_im[ 5*blockDim.x]);
1689 out[outId + 6 *param.
threads*2] = make_float2(accum_re[ 6*blockDim.x], accum_im[ 6*blockDim.x]);
1690 out[outId + 7 *param.
threads*2] = make_float2(accum_re[ 7*blockDim.x], accum_im[ 7*blockDim.x]);
1691 out[outId + 8 *param.
threads*2] = make_float2(accum_re[ 8*blockDim.x], accum_im[ 8*blockDim.x]);
1692 out[outId + 9 *param.
threads*2] = make_float2(accum_re[ 9*blockDim.x], accum_im[ 9*blockDim.x]);
1693 out[outId + 10*param.
threads*2] = make_float2(accum_re[10*blockDim.x], accum_im[10*blockDim.x]);
1694 out[outId + 11*param.
threads*2] = make_float2(accum_re[11*blockDim.x], accum_im[11*blockDim.x]);
1695 out[outId + 12*param.
threads*2] = make_float2(accum_re[12*blockDim.x], accum_im[12*blockDim.x]);
1696 out[outId + 13*param.
threads*2] = make_float2(accum_re[13*blockDim.x], accum_im[13*blockDim.x]);
1697 out[outId + 14*param.
threads*2] = make_float2(accum_re[14*blockDim.x], accum_im[14*blockDim.x]);
1698 out[outId + 15*param.
threads*2] = make_float2(accum_re[15*blockDim.x], accum_im[15*blockDim.x]);
1710 #undef READ_INTERMEDIATE_SPINOR
1714 #endif //_TWIST_QUDA_CONTRACT
__global__ void contractTsliceKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
cudaColorSpinorField * tmp
__global__ void contractKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
#define READ_INTERMEDIATE_SPINOR
cpuColorSpinorField * out
__global__ void contractGamma5Kernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)