1 #ifndef _TWIST_QUDA_CONTRACT_MINUS
2 #define _TWIST_QUDA_CONTRACT_MINUS
7 #define TOTAL_COMPONENTS 16
9 #if (__COMPUTE_CAPABILITY__ >= 130)
11 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \
12 double2 J0 = spinor[sp_idx + 0*(stride)]; \
13 double2 J1 = spinor[sp_idx + 1*(stride)]; \
14 double2 J2 = spinor[sp_idx + 2*(stride)]; \
15 double2 J3 = spinor[sp_idx + 3*(stride)]; \
16 double2 J4 = spinor[sp_idx + 4*(stride)]; \
17 double2 J5 = spinor[sp_idx + 5*(stride)]; \
18 double2 J6 = spinor[sp_idx + 6*(stride)]; \
19 double2 J7 = spinor[sp_idx + 7*(stride)]; \
20 double2 J8 = spinor[sp_idx + 8*(stride)]; \
21 double2 J9 = spinor[sp_idx + 9*(stride)]; \
22 double2 J10 = spinor[sp_idx +10*(stride)]; \
23 double2 J11 = spinor[sp_idx +11*(stride)];
25 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \
26 double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \
27 double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \
28 double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \
29 double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \
30 double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \
31 double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \
32 double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \
33 double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \
34 double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \
35 double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \
36 double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \
37 double2 J11 = fetch_double2((spinor), sp_idx +11*(stride));
39 #ifdef DIRECT_ACCESS_WILSON_SPINOR
40 #define READ_SPINOR READ_SPINOR_DOUBLE
41 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE
45 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX
46 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX
48 #ifdef USE_TEXTURE_OBJECTS
49 #define SPINORTEX param.inTex
50 #define INTERTEX param.outTex
52 #define SPINORTEX spinorTexDouble
53 #define INTERTEX interTexDouble
54 #endif // USE_TEXTURE_OBJECTS
62 int sid = blockIdx.x*blockDim.x + threadIdx.x;
64 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
70 extern __shared__
double sm[];
72 volatile double *accum_re = sm + threadIdx.x;
76 auxCoord1 = eutId /
X1;
77 xCoord1 = eutId - auxCoord1 *
X1;
78 auxCoord2 = auxCoord1 /
X2;
79 xCoord2 = auxCoord1 - auxCoord2 *
X2;
80 xCoord4 = auxCoord2 /
X3;
81 xCoord3 = auxCoord2 - xCoord4 *
X3;
83 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
85 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
136 tmp_re = I0.x * J0.x - I0.y * J0.y;
137 tmp_re += I1.x * J1.x - I1.y * J1.y;
138 tmp_re += I2.x * J2.x - I2.y * J2.y;
139 accum_re[0*blockDim.x] =
tmp_re;
141 tmp_im = I0.x * J0.y + I0.y * J0.x;
142 tmp_im += I1.x * J1.y + I1.y * J1.x;
143 tmp_im += I2.x * J2.y + I2.y * J2.x;
144 accum_im[0*blockDim.x] =
tmp_im;
147 tmp_re = I0.x * J3.x - I0.y * J3.y;
148 tmp_re += I1.x * J4.x - I1.y * J4.y;
149 tmp_re += I2.x * J5.x - I2.y * J5.y;
150 accum_re[1*blockDim.x] =
tmp_re;
152 tmp_im = I0.x * J3.y + I0.y * J3.x;
153 tmp_im += I1.x * J4.y + I1.y * J4.x;
154 tmp_im += I2.x * J5.y + I2.y * J5.x;
155 accum_im[1*blockDim.x] =
tmp_im;
158 tmp_re = I0.x * J6.x - I0.y * J6.y;
159 tmp_re += I1.x * J7.x - I1.y * J7.y;
160 tmp_re += I2.x * J8.x - I2.y * J8.y;
161 accum_re[2*blockDim.x] =
tmp_re;
163 tmp_im = I0.x * J6.y + I0.y * J6.x;
164 tmp_im += I1.x * J7.y + I1.y * J7.x;
165 tmp_im += I2.x * J8.y + I2.y * J8.x;
166 accum_im[2*blockDim.x] =
tmp_im;
169 tmp_re = I0.x * J9.x - I0.y * J9.y;
170 tmp_re += I1.x * J10.x - I1.y * J10.y;
171 tmp_re += I2.x * J11.x - I2.y * J11.y;
172 accum_re[3*blockDim.x] =
tmp_re;
174 tmp_im = I0.x * J9.y + I0.y * J9.x;
175 tmp_im += I1.x * J10.y + I1.y * J10.x;
176 tmp_im += I2.x * J11.y + I2.y * J11.x;
177 accum_im[3*blockDim.x] =
tmp_im;
180 tmp_re = I3.x * J0.x - I3.y * J0.y;
181 tmp_re += I4.x * J1.x - I4.y * J1.y;
182 tmp_re += I5.x * J2.x - I5.y * J2.y;
183 accum_re[ 4*blockDim.x] =
tmp_re;
185 tmp_im = I3.x * J0.y + I3.y * J0.x;
186 tmp_im += I4.x * J1.y + I4.y * J1.x;
187 tmp_im += I5.x * J2.y + I5.y * J2.x;
188 accum_im[ 4*blockDim.x] =
tmp_im;
191 tmp_re = I3.x * J3.x - I3.y * J3.y;
192 tmp_re += I4.x * J4.x - I4.y * J4.y;
193 tmp_re += I5.x * J5.x - I5.y * J5.y;
194 accum_re[ 5*blockDim.x] =
tmp_re;
196 tmp_im = I3.x * J3.y + I3.y * J3.x;
197 tmp_im += I4.x * J4.y + I4.y * J4.x;
198 tmp_im += I5.x * J5.y + I5.y * J5.x;
199 accum_im[ 5*blockDim.x] =
tmp_im;
202 tmp_re = I3.x * J6.x - I3.y * J6.y;
203 tmp_re += I4.x * J7.x - I4.y * J7.y;
204 tmp_re += I5.x * J8.x - I5.y * J8.y;
205 accum_re[ 6*blockDim.x] =
tmp_re;
207 tmp_im = I3.x * J6.y + I3.y * J6.x;
208 tmp_im += I4.x * J7.y + I4.y * J7.x;
209 tmp_im += I5.x * J8.y + I5.y * J8.x;
210 accum_im[ 6*blockDim.x] =
tmp_im;
213 tmp_re = I3.x * J9.x - I3.y * J9.y;
214 tmp_re += I4.x * J10.x - I4.y * J10.y;
215 tmp_re += I5.x * J11.x - I5.y * J11.y;
216 accum_re[ 7*blockDim.x] =
tmp_re;
218 tmp_im = I3.x * J9.y + I3.y * J9.x;
219 tmp_im += I4.x * J10.y + I4.y * J10.x;
220 tmp_im += I5.x * J11.y + I5.y * J11.x;
221 accum_im[ 7*blockDim.x] =
tmp_im;
224 tmp_re = I6.x * J0.x - I6.y * J0.y;
225 tmp_re += I7.x * J1.x - I7.y * J1.y;
226 tmp_re += I8.x * J2.x - I8.y * J2.y;
227 accum_re[ 8*blockDim.x] =
tmp_re;
229 tmp_im = I6.x * J0.y + I6.y * J0.x;
230 tmp_im += I7.x * J1.y + I7.y * J1.x;
231 tmp_im += I8.x * J2.y + I8.y * J2.x;
232 accum_im[ 8*blockDim.x] =
tmp_im;
235 tmp_re = I6.x * J3.x - I6.y * J3.y;
236 tmp_re += I7.x * J4.x - I7.y * J4.y;
237 tmp_re += I8.x * J5.x - I8.y * J5.y;
238 accum_re[ 9*blockDim.x] =
tmp_re;
240 tmp_im = I6.x * J3.y + I6.y * J3.x;
241 tmp_im += I7.x * J4.y + I7.y * J4.x;
242 tmp_im += I8.x * J5.y + I8.y * J5.x;
243 accum_im[ 9*blockDim.x] =
tmp_im;
246 tmp_re = I6.x * J6.x - I6.y * J6.y;
247 tmp_re += I7.x * J7.x - I7.y * J7.y;
248 tmp_re += I8.x * J8.x - I8.y * J8.y;
249 accum_re[10*blockDim.x] =
tmp_re;
251 tmp_im = I6.x * J6.y + I6.y * J6.x;
252 tmp_im += I7.x * J7.y + I7.y * J7.x;
253 tmp_im += I8.x * J8.y + I8.y * J8.x;
254 accum_im[10*blockDim.x] =
tmp_im;
257 tmp_re = I6.x * J9.x - I6.y * J9.y;
258 tmp_re += I7.x * J10.x - I7.y * J10.y;
259 tmp_re += I8.x * J11.x - I8.y * J11.y;
260 accum_re[11*blockDim.x] =
tmp_re;
262 tmp_im = I6.x * J9.y + I6.y * J9.x;
263 tmp_im += I7.x * J10.y + I7.y * J10.x;
264 tmp_im += I8.x * J11.y + I8.y * J11.x;
265 accum_im[11*blockDim.x] =
tmp_im;
268 tmp_re = I9.x * J0.x - I9.y * J0.y;
269 tmp_re += I10.x * J1.x - I10.y * J1.y;
270 tmp_re += I11.x * J2.x - I11.y * J2.y;
271 accum_re[12*blockDim.x] =
tmp_re;
273 tmp_im = I9.x * J0.y + I9.y * J0.x;
274 tmp_im += I10.x * J1.y + I10.y * J1.x;
275 tmp_im += I11.x * J2.y + I11.y * J2.x;
276 accum_im[12*blockDim.x] =
tmp_im;
279 tmp_re = I9.x * J3.x - I9.y * J3.y;
280 tmp_re += I10.x * J4.x - I10.y * J4.y;
281 tmp_re += I11.x * J5.x - I11.y * J5.y;
282 accum_re[13*blockDim.x] =
tmp_re;
284 tmp_im = I9.x * J3.y + I9.y * J3.x;
285 tmp_im += I10.x * J4.y + I10.y * J4.x;
286 tmp_im += I11.x * J5.y + I11.y * J5.x;
287 accum_im[13*blockDim.x] =
tmp_im;
290 tmp_re = I9.x * J6.x - I9.y * J6.y;
291 tmp_re += I10.x * J7.x - I10.y * J7.y;
292 tmp_re += I11.x * J8.x - I11.y * J8.y;
293 accum_re[14*blockDim.x] =
tmp_re;
295 tmp_im = I9.x * J6.y + I9.y * J6.x;
296 tmp_im += I10.x * J7.y + I10.y * J7.x;
297 tmp_im += I11.x * J8.y + I11.y * J8.x;
298 accum_im[14*blockDim.x] =
tmp_im;
301 tmp_re = I9.x * J9.x - I9.y * J9.y;
302 tmp_re += I10.x * J10.x - I10.y * J10.y;
303 tmp_re += I11.x * J11.x - I11.y * J11.y;
304 accum_re[15*blockDim.x] =
tmp_re;
306 tmp_im = I9.x * J9.y + I9.y * J9.x;
307 tmp_im += I10.x * J10.y + I10.y * J10.x;
308 tmp_im += I11.x * J11.y + I11.y * J11.x;
309 accum_im[15*blockDim.x] =
tmp_im;
316 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
317 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
318 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
319 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
320 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
321 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
322 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
323 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
324 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
325 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
326 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
327 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
328 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
329 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
330 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
331 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
333 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
334 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
335 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
336 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
337 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
338 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
339 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
340 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
341 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
342 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
343 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
344 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
345 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
346 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
347 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
348 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
357 int sid = blockIdx.x*blockDim.x + threadIdx.x;
359 int inId = sid +
Vsh*Tslice;
361 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
366 volatile double2
tmp;
367 extern __shared__
double sm[];
369 volatile double *accum_re = sm + threadIdx.x;
375 auxCoord1 = eutId /
X1;
376 xCoord1 = eutId - auxCoord1 *
X1;
377 auxCoord2 = auxCoord1 /
X2;
378 xCoord2 = auxCoord1 - auxCoord2 *
X2;
379 xCoord4 = auxCoord2 /
X3;
380 xCoord3 = auxCoord2 - xCoord4 *
X3;
382 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
383 xCoord1 += auxCoord1;
384 outId = xCoord1 + X1*(xCoord2 + X2*xCoord3);
406 tmp_re = I0.x * J0.x - I0.y * J0.y;
407 tmp_re += I1.x * J1.x - I1.y * J1.y;
408 tmp_re += I2.x * J2.x - I2.y * J2.y;
409 accum_re[0*blockDim.x] =
tmp_re;
411 tmp_im = I0.x * J0.y + I0.y * J0.x;
412 tmp_im += I1.x * J1.y + I1.y * J1.x;
413 tmp_im += I2.x * J2.y + I2.y * J2.x;
414 accum_im[0*blockDim.x] =
tmp_im;
417 tmp_re = I0.x * J3.x - I0.y * J3.y;
418 tmp_re += I1.x * J4.x - I1.y * J4.y;
419 tmp_re += I2.x * J5.x - I2.y * J5.y;
420 accum_re[1*blockDim.x] =
tmp_re;
422 tmp_im = I0.x * J3.y + I0.y * J3.x;
423 tmp_im += I1.x * J4.y + I1.y * J4.x;
424 tmp_im += I2.x * J5.y + I2.y * J5.x;
425 accum_im[1*blockDim.x] =
tmp_im;
428 tmp_re = I0.x * J6.x - I0.y * J6.y;
429 tmp_re += I1.x * J7.x - I1.y * J7.y;
430 tmp_re += I2.x * J8.x - I2.y * J8.y;
431 accum_re[2*blockDim.x] =
tmp_re;
433 tmp_im = I0.x * J6.y + I0.y * J6.x;
434 tmp_im += I1.x * J7.y + I1.y * J7.x;
435 tmp_im += I2.x * J8.y + I2.y * J8.x;
436 accum_im[2*blockDim.x] =
tmp_im;
439 tmp_re = I0.x * J9.x - I0.y * J9.y;
440 tmp_re += I1.x * J10.x - I1.y * J10.y;
441 tmp_re += I2.x * J11.x - I2.y * J11.y;
442 accum_re[3*blockDim.x] =
tmp_re;
444 tmp_im = I0.x * J9.y + I0.y * J9.x;
445 tmp_im += I1.x * J10.y + I1.y * J10.x;
446 tmp_im += I2.x * J11.y + I2.y * J11.x;
447 accum_im[3*blockDim.x] =
tmp_im;
450 tmp_re = I3.x * J0.x - I3.y * J0.y;
451 tmp_re += I4.x * J1.x - I4.y * J1.y;
452 tmp_re += I5.x * J2.x - I5.y * J2.y;
453 accum_re[ 4*blockDim.x] =
tmp_re;
455 tmp_im = I3.x * J0.y + I3.y * J0.x;
456 tmp_im += I4.x * J1.y + I4.y * J1.x;
457 tmp_im += I5.x * J2.y + I5.y * J2.x;
458 accum_im[ 4*blockDim.x] =
tmp_im;
461 tmp_re = I3.x * J3.x - I3.y * J3.y;
462 tmp_re += I4.x * J4.x - I4.y * J4.y;
463 tmp_re += I5.x * J5.x - I5.y * J5.y;
464 accum_re[ 5*blockDim.x] =
tmp_re;
466 tmp_im = I3.x * J3.y + I3.y * J3.x;
467 tmp_im += I4.x * J4.y + I4.y * J4.x;
468 tmp_im += I5.x * J5.y + I5.y * J5.x;
469 accum_im[ 5*blockDim.x] =
tmp_im;
472 tmp_re = I3.x * J6.x - I3.y * J6.y;
473 tmp_re += I4.x * J7.x - I4.y * J7.y;
474 tmp_re += I5.x * J8.x - I5.y * J8.y;
475 accum_re[ 6*blockDim.x] =
tmp_re;
477 tmp_im = I3.x * J6.y + I3.y * J6.x;
478 tmp_im += I4.x * J7.y + I4.y * J7.x;
479 tmp_im += I5.x * J8.y + I5.y * J8.x;
480 accum_im[ 6*blockDim.x] =
tmp_im;
483 tmp_re = I3.x * J9.x - I3.y * J9.y;
484 tmp_re += I4.x * J10.x - I4.y * J10.y;
485 tmp_re += I5.x * J11.x - I5.y * J11.y;
486 accum_re[ 7*blockDim.x] =
tmp_re;
488 tmp_im = I3.x * J9.y + I3.y * J9.x;
489 tmp_im += I4.x * J10.y + I4.y * J10.x;
490 tmp_im += I5.x * J11.y + I5.y * J11.x;
491 accum_im[ 7*blockDim.x] =
tmp_im;
494 tmp_re = I6.x * J0.x - I6.y * J0.y;
495 tmp_re += I7.x * J1.x - I7.y * J1.y;
496 tmp_re += I8.x * J2.x - I8.y * J2.y;
497 accum_re[ 8*blockDim.x] =
tmp_re;
499 tmp_im = I6.x * J0.y + I6.y * J0.x;
500 tmp_im += I7.x * J1.y + I7.y * J1.x;
501 tmp_im += I8.x * J2.y + I8.y * J2.x;
502 accum_im[ 8*blockDim.x] =
tmp_im;
505 tmp_re = I6.x * J3.x - I6.y * J3.y;
506 tmp_re += I7.x * J4.x - I7.y * J4.y;
507 tmp_re += I8.x * J5.x - I8.y * J5.y;
508 accum_re[ 9*blockDim.x] =
tmp_re;
510 tmp_im = I6.x * J3.y + I6.y * J3.x;
511 tmp_im += I7.x * J4.y + I7.y * J4.x;
512 tmp_im += I8.x * J5.y + I8.y * J5.x;
513 accum_im[ 9*blockDim.x] =
tmp_im;
516 tmp_re = I6.x * J6.x - I6.y * J6.y;
517 tmp_re += I7.x * J7.x - I7.y * J7.y;
518 tmp_re += I8.x * J8.x - I8.y * J8.y;
519 accum_re[10*blockDim.x] =
tmp_re;
521 tmp_im = I6.x * J6.y + I6.y * J6.x;
522 tmp_im += I7.x * J7.y + I7.y * J7.x;
523 tmp_im += I8.x * J8.y + I8.y * J8.x;
524 accum_im[10*blockDim.x] =
tmp_im;
527 tmp_re = I6.x * J9.x - I6.y * J9.y;
528 tmp_re += I7.x * J10.x - I7.y * J10.y;
529 tmp_re += I8.x * J11.x - I8.y * J11.y;
530 accum_re[11*blockDim.x] =
tmp_re;
532 tmp_im = I6.x * J9.y + I6.y * J9.x;
533 tmp_im += I7.x * J10.y + I7.y * J10.x;
534 tmp_im += I8.x * J11.y + I8.y * J11.x;
535 accum_im[11*blockDim.x] =
tmp_im;
538 tmp_re = I9.x * J0.x - I9.y * J0.y;
539 tmp_re += I10.x * J1.x - I10.y * J1.y;
540 tmp_re += I11.x * J2.x - I11.y * J2.y;
541 accum_re[12*blockDim.x] =
tmp_re;
543 tmp_im = I9.x * J0.y + I9.y * J0.x;
544 tmp_im += I10.x * J1.y + I10.y * J1.x;
545 tmp_im += I11.x * J2.y + I11.y * J2.x;
546 accum_im[12*blockDim.x] =
tmp_im;
549 tmp_re = I9.x * J3.x - I9.y * J3.y;
550 tmp_re += I10.x * J4.x - I10.y * J4.y;
551 tmp_re += I11.x * J5.x - I11.y * J5.y;
552 accum_re[13*blockDim.x] =
tmp_re;
554 tmp_im = I9.x * J3.y + I9.y * J3.x;
555 tmp_im += I10.x * J4.y + I10.y * J4.x;
556 tmp_im += I11.x * J5.y + I11.y * J5.x;
557 accum_im[13*blockDim.x] =
tmp_im;
560 tmp_re = I9.x * J6.x - I9.y * J6.y;
561 tmp_re += I10.x * J7.x - I10.y * J7.y;
562 tmp_re += I11.x * J8.x - I11.y * J8.y;
563 accum_re[14*blockDim.x] =
tmp_re;
565 tmp_im = I9.x * J6.y + I9.y * J6.x;
566 tmp_im += I10.x * J7.y + I10.y * J7.x;
567 tmp_im += I11.x * J8.y + I11.y * J8.x;
568 accum_im[14*blockDim.x] =
tmp_im;
571 tmp_re = I9.x * J9.x - I9.y * J9.y;
572 tmp_re += I10.x * J10.x - I10.y * J10.y;
573 tmp_re += I11.x * J11.x - I11.y * J11.y;
574 accum_re[15*blockDim.x] =
tmp_re;
576 tmp_im = I9.x * J9.y + I9.y * J9.x;
577 tmp_im += I10.x * J10.y + I10.y * J10.x;
578 tmp_im += I11.x * J11.y + I11.y * J11.x;
579 accum_im[15*blockDim.x] =
tmp_im;
586 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
587 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
588 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
589 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
590 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
591 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
592 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
593 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
594 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
595 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
596 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
597 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
598 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
599 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
600 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
601 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
603 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
604 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
605 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
606 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
607 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
608 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
609 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
610 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
611 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
612 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
613 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
614 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
615 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
616 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
617 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
618 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
625 int sid = blockIdx.x*blockDim.x + threadIdx.x;
627 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
632 volatile double2
tmp;
633 extern __shared__
double sm[];
635 volatile double *accum_re = sm + threadIdx.x;
639 auxCoord1 = eutId /
X1;
640 xCoord1 = eutId - auxCoord1 *
X1;
641 auxCoord2 = auxCoord1 /
X2;
642 xCoord2 = auxCoord1 - auxCoord2 *
X2;
643 xCoord4 = auxCoord2 /
X3;
644 xCoord3 = auxCoord2 - xCoord4 *
X3;
646 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
647 xCoord1 += auxCoord1;
648 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
670 tmp_re = I0.x * J0.x - I0.y * J0.y;
671 tmp_re += I1.x * J1.x - I1.y * J1.y;
672 tmp_re += I2.x * J2.x - I2.y * J2.y;
673 accum_re[0*blockDim.x] =
tmp_re;
675 tmp_im = I0.x * J0.y + I0.y * J0.x;
676 tmp_im += I1.x * J1.y + I1.y * J1.x;
677 tmp_im += I2.x * J2.y + I2.y * J2.x;
678 accum_im[0*blockDim.x] =
tmp_im;
681 tmp_re = I0.x * J3.x - I0.y * J3.y;
682 tmp_re += I1.x * J4.x - I1.y * J4.y;
683 tmp_re += I2.x * J5.x - I2.y * J5.y;
684 accum_re[1*blockDim.x] =
tmp_re;
686 tmp_im = I0.x * J3.y + I0.y * J3.x;
687 tmp_im += I1.x * J4.y + I1.y * J4.x;
688 tmp_im += I2.x * J5.y + I2.y * J5.x;
689 accum_im[1*blockDim.x] =
tmp_im;
692 tmp_re = I0.x * J6.x - I0.y * J6.y;
693 tmp_re += I1.x * J7.x - I1.y * J7.y;
694 tmp_re += I2.x * J8.x - I2.y * J8.y;
695 accum_re[2*blockDim.x] =
tmp_re;
697 tmp_im = I0.x * J6.y + I0.y * J6.x;
698 tmp_im += I1.x * J7.y + I1.y * J7.x;
699 tmp_im += I2.x * J8.y + I2.y * J8.x;
700 accum_im[2*blockDim.x] =
tmp_im;
703 tmp_re = I0.x * J9.x - I0.y * J9.y;
704 tmp_re += I1.x * J10.x - I1.y * J10.y;
705 tmp_re += I2.x * J11.x - I2.y * J11.y;
706 accum_re[3*blockDim.x] =
tmp_re;
708 tmp_im = I0.x * J9.y + I0.y * J9.x;
709 tmp_im += I1.x * J10.y + I1.y * J10.x;
710 tmp_im += I2.x * J11.y + I2.y * J11.x;
711 accum_im[3*blockDim.x] =
tmp_im;
714 tmp_re = I3.x * J0.x - I3.y * J0.y;
715 tmp_re += I4.x * J1.x - I4.y * J1.y;
716 tmp_re += I5.x * J2.x - I5.y * J2.y;
717 accum_re[ 4*blockDim.x] =
tmp_re;
719 tmp_im = I3.x * J0.y + I3.y * J0.x;
720 tmp_im += I4.x * J1.y + I4.y * J1.x;
721 tmp_im += I5.x * J2.y + I5.y * J2.x;
722 accum_im[ 4*blockDim.x] =
tmp_im;
725 tmp_re = I3.x * J3.x - I3.y * J3.y;
726 tmp_re += I4.x * J4.x - I4.y * J4.y;
727 tmp_re += I5.x * J5.x - I5.y * J5.y;
728 accum_re[ 5*blockDim.x] =
tmp_re;
730 tmp_im = I3.x * J3.y + I3.y * J3.x;
731 tmp_im += I4.x * J4.y + I4.y * J4.x;
732 tmp_im += I5.x * J5.y + I5.y * J5.x;
733 accum_im[ 5*blockDim.x] =
tmp_im;
736 tmp_re = I3.x * J6.x - I3.y * J6.y;
737 tmp_re += I4.x * J7.x - I4.y * J7.y;
738 tmp_re += I5.x * J8.x - I5.y * J8.y;
739 accum_re[ 6*blockDim.x] =
tmp_re;
741 tmp_im = I3.x * J6.y + I3.y * J6.x;
742 tmp_im += I4.x * J7.y + I4.y * J7.x;
743 tmp_im += I5.x * J8.y + I5.y * J8.x;
744 accum_im[ 6*blockDim.x] =
tmp_im;
747 tmp_re = I3.x * J9.x - I3.y * J9.y;
748 tmp_re += I4.x * J10.x - I4.y * J10.y;
749 tmp_re += I5.x * J11.x - I5.y * J11.y;
750 accum_re[ 7*blockDim.x] =
tmp_re;
752 tmp_im = I3.x * J9.y + I3.y * J9.x;
753 tmp_im += I4.x * J10.y + I4.y * J10.x;
754 tmp_im += I5.x * J11.y + I5.y * J11.x;
755 accum_im[ 7*blockDim.x] =
tmp_im;
758 tmp_re = I6.x * J0.x - I6.y * J0.y;
759 tmp_re += I7.x * J1.x - I7.y * J1.y;
760 tmp_re += I8.x * J2.x - I8.y * J2.y;
761 accum_re[ 8*blockDim.x] =
tmp_re;
763 tmp_im = I6.x * J0.y + I6.y * J0.x;
764 tmp_im += I7.x * J1.y + I7.y * J1.x;
765 tmp_im += I8.x * J2.y + I8.y * J2.x;
766 accum_im[ 8*blockDim.x] =
tmp_im;
769 tmp_re = I6.x * J3.x - I6.y * J3.y;
770 tmp_re += I7.x * J4.x - I7.y * J4.y;
771 tmp_re += I8.x * J5.x - I8.y * J5.y;
772 accum_re[ 9*blockDim.x] =
tmp_re;
774 tmp_im = I6.x * J3.y + I6.y * J3.x;
775 tmp_im += I7.x * J4.y + I7.y * J4.x;
776 tmp_im += I8.x * J5.y + I8.y * J5.x;
777 accum_im[ 9*blockDim.x] =
tmp_im;
780 tmp_re = I6.x * J6.x - I6.y * J6.y;
781 tmp_re += I7.x * J7.x - I7.y * J7.y;
782 tmp_re += I8.x * J8.x - I8.y * J8.y;
783 accum_re[10*blockDim.x] =
tmp_re;
785 tmp_im = I6.x * J6.y + I6.y * J6.x;
786 tmp_im += I7.x * J7.y + I7.y * J7.x;
787 tmp_im += I8.x * J8.y + I8.y * J8.x;
788 accum_im[10*blockDim.x] =
tmp_im;
791 tmp_re = I6.x * J9.x - I6.y * J9.y;
792 tmp_re += I7.x * J10.x - I7.y * J10.y;
793 tmp_re += I8.x * J11.x - I8.y * J11.y;
794 accum_re[11*blockDim.x] =
tmp_re;
796 tmp_im = I6.x * J9.y + I6.y * J9.x;
797 tmp_im += I7.x * J10.y + I7.y * J10.x;
798 tmp_im += I8.x * J11.y + I8.y * J11.x;
799 accum_im[11*blockDim.x] =
tmp_im;
802 tmp_re = I9.x * J0.x - I9.y * J0.y;
803 tmp_re += I10.x * J1.x - I10.y * J1.y;
804 tmp_re += I11.x * J2.x - I11.y * J2.y;
805 accum_re[12*blockDim.x] =
tmp_re;
807 tmp_im = I9.x * J0.y + I9.y * J0.x;
808 tmp_im += I10.x * J1.y + I10.y * J1.x;
809 tmp_im += I11.x * J2.y + I11.y * J2.x;
810 accum_im[12*blockDim.x] =
tmp_im;
813 tmp_re = I9.x * J3.x - I9.y * J3.y;
814 tmp_re += I10.x * J4.x - I10.y * J4.y;
815 tmp_re += I11.x * J5.x - I11.y * J5.y;
816 accum_re[13*blockDim.x] =
tmp_re;
818 tmp_im = I9.x * J3.y + I9.y * J3.x;
819 tmp_im += I10.x * J4.y + I10.y * J4.x;
820 tmp_im += I11.x * J5.y + I11.y * J5.x;
821 accum_im[13*blockDim.x] =
tmp_im;
824 tmp_re = I9.x * J6.x - I9.y * J6.y;
825 tmp_re += I10.x * J7.x - I10.y * J7.y;
826 tmp_re += I11.x * J8.x - I11.y * J8.y;
827 accum_re[14*blockDim.x] =
tmp_re;
829 tmp_im = I9.x * J6.y + I9.y * J6.x;
830 tmp_im += I10.x * J7.y + I10.y * J7.x;
831 tmp_im += I11.x * J8.y + I11.y * J8.x;
832 accum_im[14*blockDim.x] =
tmp_im;
835 tmp_re = I9.x * J9.x - I9.y * J9.y;
836 tmp_re += I10.x * J10.x - I10.y * J10.y;
837 tmp_re += I11.x * J11.x - I11.y * J11.y;
838 accum_re[15*blockDim.x] =
tmp_re;
840 tmp_im = I9.x * J9.y + I9.y * J9.x;
841 tmp_im += I10.x * J10.y + I10.y * J10.x;
842 tmp_im += I11.x * J11.y + I11.y * J11.x;
843 accum_im[15*blockDim.x] =
tmp_im;
847 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
848 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
849 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
850 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
851 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
852 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
853 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
854 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
855 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
856 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
857 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
858 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
859 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
860 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
861 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
862 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
864 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
865 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
866 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
867 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
868 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
869 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
870 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
871 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
872 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
873 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
874 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
875 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
876 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
877 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
878 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
879 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
888 #undef READ_INTERMEDIATE_SPINOR
892 #endif // (__CUDA_ARCH__ >= 130)
895 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
896 float4 I0 = spinor[sp_idx + 0*(stride)]; \
897 float4 I1 = spinor[sp_idx + 1*(stride)]; \
898 float4 I2 = spinor[sp_idx + 2*(stride)]; \
899 float4 I3 = spinor[sp_idx + 3*(stride)]; \
900 float4 I4 = spinor[sp_idx + 4*(stride)]; \
901 float4 I5 = spinor[sp_idx + 5*(stride)];
904 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
905 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
906 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
907 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
908 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
909 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
910 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
912 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \
913 float4 J0 = spinor[sp_idx + 0*(stride)]; \
914 float4 J1 = spinor[sp_idx + 1*(stride)]; \
915 float4 J2 = spinor[sp_idx + 2*(stride)]; \
916 float4 J3 = spinor[sp_idx + 3*(stride)]; \
917 float4 J4 = spinor[sp_idx + 4*(stride)]; \
918 float4 J5 = spinor[sp_idx + 5*(stride)];
920 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \
921 float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \
922 float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \
923 float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \
924 float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \
925 float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \
926 float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride));
929 #ifdef DIRECT_ACCESS_WILSON_SPINOR
930 #define READ_SPINOR READ_SPINOR_SINGLE
933 #define READ_SPINOR READ_SPINOR_SINGLE_TEX
935 #ifdef USE_TEXTURE_OBJECTS
936 #define SPINORTEX param.inTex
938 #define SPINORTEX spinorTexSingle
939 #endif // USE_TEXTURE_OBJECTS
942 #ifdef DIRECT_ACCESS_WILSON_INTER
943 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE
946 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX
948 #ifdef USE_TEXTURE_OBJECTS
949 #define INTERTEX param.outTex
951 #define INTERTEX interTexSingle
952 #endif // USE_TEXTURE_OBJECTS
959 int sid = blockIdx.x*blockDim.x + threadIdx.x;
961 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
967 extern __shared__
float sms[];
969 volatile float *accum_re = sms + threadIdx.x;
973 auxCoord1 = eutId /
X1;
974 xCoord1 = eutId - auxCoord1 *
X1;
975 auxCoord2 = auxCoord1 /
X2;
976 xCoord2 = auxCoord1 - auxCoord2 *
X2;
977 xCoord4 = auxCoord2 /
X3;
978 xCoord3 = auxCoord2 - xCoord4 *
X3;
980 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
981 xCoord1 += auxCoord1;
982 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
1042 tmp_re = I0.x * J0.x - I0.y * J0.y;
1043 tmp_re += I0.z * J0.z - I0.w * J0.w;
1044 tmp_re += I1.x * J1.x - I1.y * J1.y;
1045 accum_re[0*blockDim.x] =
tmp_re;
1047 tmp_im = I0.x * J0.y + I0.y * J0.x;
1048 tmp_im += I0.z * J0.w + I0.w * J0.z;
1049 tmp_im += I1.x * J1.y + I1.y * J1.x;
1050 accum_im[0*blockDim.x] =
tmp_im;
1053 tmp_re = I0.x * J1.z - I0.y * J1.w;
1054 tmp_re += I0.z * J2.x - I0.w * J2.y;
1055 tmp_re += I1.x * J2.z - I1.y * J2.w;
1056 accum_re[1*blockDim.x] =
tmp_re;
1058 tmp_im = I0.x * J1.w + I0.y * J1.z;
1059 tmp_im += I0.z * J2.y + I0.w * J2.x;
1060 tmp_im += I1.x * J2.w + I1.y * J2.z;
1061 accum_im[1*blockDim.x] =
tmp_im;
1064 tmp_re = I0.x * J3.x - I0.y * J3.y;
1065 tmp_re += I0.z * J3.z - I0.w * J3.w;
1066 tmp_re += I1.x * J4.x - I1.y * J4.y;
1067 accum_re[2*blockDim.x] =
tmp_re;
1069 tmp_im = I0.x * J3.y + I0.y * J3.x;
1070 tmp_im += I0.z * J3.w + I0.w * J3.z;
1071 tmp_im += I1.x * J4.y + I1.y * J4.x;
1072 accum_im[2*blockDim.x] =
tmp_im;
1075 tmp_re = I0.x * J4.z - I0.y * J4.w;
1076 tmp_re += I0.z * J5.x - I0.w * J5.x;
1077 tmp_re += I1.x * J5.z - I1.y * J5.w;
1079 accum_re[3*blockDim.x] =
tmp_re;
1081 tmp_im = I0.x * J4.w + I0.y * J4.z;
1082 tmp_im += I0.z * J5.y + I0.w * J5.y;
1083 tmp_im += I1.x * J5.w + I1.y * J5.z;
1084 accum_im[3*blockDim.x] =
tmp_im;
1087 tmp_re = I1.z * J0.x - I1.w * J0.y;
1088 tmp_re += I2.x * J0.z - I2.y * J0.w;
1089 tmp_re += I2.z * J1.x - I2.w * J1.y;
1090 accum_re[4*blockDim.x] =
tmp_re;
1092 tmp_im = I1.z * J0.y + I1.w * J0.x;
1093 tmp_im += I2.x * J0.w + I2.y * J0.z;
1094 tmp_im += I2.z * J1.y + I2.w * J1.x;
1095 accum_im[4*blockDim.x] =
tmp_im;
1098 tmp_re = I1.z * J1.z - I1.w * J1.w;
1099 tmp_re += I2.x * J2.x - I2.y * J2.y;
1100 tmp_re += I2.z * J2.z - I2.w * J2.w;
1101 accum_re[5*blockDim.x] =
tmp_re;
1103 tmp_im = I1.z * J1.w + I1.w * J1.z;
1104 tmp_im += I2.x * J2.y + I2.y * J2.x;
1105 tmp_im += I2.z * J2.w + I2.w * J2.z;
1106 accum_im[5*blockDim.x] =
tmp_im;
1109 tmp_re = I1.z * J3.x - I1.w * J3.y;
1110 tmp_re += I2.x * J3.z - I2.y * J3.w;
1111 tmp_re += I2.z * J4.x - I2.w * J4.y;
1112 accum_re[6*blockDim.x] =
tmp_re;
1114 tmp_im = I1.z * J3.y + I1.w * J3.x;
1115 tmp_im += I2.x * J3.w + I2.y * J3.z;
1116 tmp_im += I2.z * J4.y + I2.w * J4.x;
1117 accum_im[6*blockDim.x] =
tmp_im;
1120 tmp_re = I1.z * J4.z - I1.w * J4.w;
1121 tmp_re += I2.x * J5.x - I2.y * J5.y;
1122 tmp_re += I2.z * J5.z - I2.w * J5.w;
1123 accum_re[7*blockDim.x] =
tmp_re;
1125 tmp_im = I1.z * J4.w + I1.w * J4.z;
1126 tmp_im += I2.x * J5.y + I2.y * J5.x;
1127 tmp_im += I2.z * J5.w + I2.w * J5.z;
1128 accum_im[7*blockDim.x] =
tmp_im;
1131 tmp_re = I3.x * J0.x - I3.y * J0.y;
1132 tmp_re += I3.z * J0.z - I3.w * J0.w;
1133 tmp_re += I4.x * J1.x - I4.y * J1.y;
1134 accum_re[8*blockDim.x] =
tmp_re;
1136 tmp_im = I3.x * J0.y + I3.y * J0.x;
1137 tmp_im += I3.z * J0.w + I3.w * J0.z;
1138 tmp_im += I4.x * J1.y + I4.y * J1.x;
1139 accum_im[8*blockDim.x] =
tmp_im;
1142 tmp_re = I3.x * J1.z - I3.y * J1.w;
1143 tmp_re += I3.z * J2.x - I3.w * J2.y;
1144 tmp_re += I4.x * J2.z - I4.y * J2.w;
1145 accum_re[9*blockDim.x] =
tmp_re;
1147 tmp_im = I3.x * J1.w + I3.y * J1.z;
1148 tmp_im += I3.z * J2.y + I3.w * J2.x;
1149 tmp_im += I4.x * J2.w + I4.y * J2.z;
1150 accum_im[9*blockDim.x] =
tmp_im;
1153 tmp_re = I3.x * J3.x - I3.y * J3.y;
1154 tmp_re += I3.z * J3.z - I3.w * J3.w;
1155 tmp_re += I4.x * J4.x - I4.y * J4.y;
1156 accum_re[10*blockDim.x] =
tmp_re;
1158 tmp_im = I3.x * J3.y + I3.y * J3.x;
1159 tmp_im += I3.z * J3.w + I3.w * J3.z;
1160 tmp_im += I4.x * J4.y + I4.y * J4.x;
1161 accum_im[10*blockDim.x] =
tmp_im;
1164 tmp_re = I3.x * J4.z - I3.y * J4.w;
1165 tmp_re += I3.z * J5.x - I3.w * J5.y;
1166 tmp_re += I4.x * J5.z - I4.y * J5.w;
1167 accum_re[11*blockDim.x] =
tmp_re;
1169 tmp_im = I3.x * J4.w + I3.y * J4.z;
1170 tmp_im += I3.z * J5.y + I3.w * J5.x;
1171 tmp_im += I4.x * J5.w + I4.y * J5.z;
1172 accum_im[11*blockDim.x] =
tmp_im;
1175 tmp_re = I4.z * J0.x - I4.w * J0.y;
1176 tmp_re += I5.x * J0.z - I5.y * J0.w;
1177 tmp_re += I5.z * J1.x - I5.w * J1.y;
1178 accum_re[12*blockDim.x] =
tmp_re;
1180 tmp_im = I4.z * J0.y + I4.w * J0.x;
1181 tmp_im += I5.x * J0.w + I5.y * J0.z;
1182 tmp_im += I5.z * J1.y + I5.w * J1.x;
1183 accum_im[12*blockDim.x] =
tmp_im;
1186 tmp_re = I4.z * J1.z - I4.w * J1.w;
1187 tmp_re += I5.x * J2.x - I5.y * J2.y;
1188 tmp_re += I5.z * J2.z - I5.w * J2.w;
1189 accum_re[13*blockDim.x] =
tmp_re;
1191 tmp_im = I4.z * J1.w + I4.w * J1.z;
1192 tmp_im += I5.x * J2.y + I5.y * J2.x;
1193 tmp_im += I5.z * J2.w + I5.w * J2.z;
1194 accum_im[13*blockDim.x] =
tmp_im;
1197 tmp_re = I4.z * J3.x - I4.w * J3.y;
1198 tmp_re += I5.x * J3.z - I5.y * J3.w;
1199 tmp_re += I5.z * J4.x - I5.w * J4.y;
1200 accum_re[14*blockDim.x] =
tmp_re;
1202 tmp_im = I4.z * J3.y + I4.w * J3.x;
1203 tmp_im += I5.x * J3.w + I5.y * J3.z;
1204 tmp_im += I5.z * J4.y + I5.w * J4.x;
1205 accum_im[14*blockDim.x] =
tmp_im;
1208 tmp_re = I4.z * J4.z - I4.w * J4.w;
1209 tmp_re += I5.x * J5.x - I5.y * J5.y;
1210 tmp_re += I5.z * J5.z - I5.w * J5.w;
1211 accum_re[15*blockDim.x] =
tmp_re;
1213 tmp_im = I4.z * J4.w + I4.w * J4.z;
1214 tmp_im += I5.x * J5.y + I5.y * J5.x;
1215 tmp_im += I5.z * J5.w + I5.w * J5.z;
1216 accum_im[15*blockDim.x] =
tmp_im;
1225 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
1226 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
1227 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
1228 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
1229 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
1230 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
1231 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
1232 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
1233 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
1234 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
1235 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
1236 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
1237 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
1238 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
1239 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
1240 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
1242 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
1243 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
1244 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
1245 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
1246 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
1247 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
1248 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
1249 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
1250 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
1251 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
1252 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
1253 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
1254 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
1255 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
1256 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
1257 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
1266 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1268 int inId = sid +
Vsh*Tslice;
1270 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1275 volatile float2
tmp;
1276 extern __shared__
float sms[];
1278 volatile float *accum_re = sms + threadIdx.x;
1284 auxCoord1 = eutId /
X1;
1285 xCoord1 = eutId - auxCoord1 *
X1;
1286 auxCoord2 = auxCoord1 /
X2;
1287 xCoord2 = auxCoord1 - auxCoord2 *
X2;
1288 xCoord4 = auxCoord2 /
X3;
1289 xCoord3 = auxCoord2 - xCoord4 *
X3;
1291 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1292 xCoord1 += auxCoord1;
1293 outId = xCoord1 + X1*(xCoord2 + X2*xCoord3);
1317 tmp_re = I0.x * J0.x - I0.y * J0.y;
1318 tmp_re += I0.z * J0.z - I0.w * J0.w;
1319 tmp_re += I1.x * J1.x - I1.y * J1.y;
1320 accum_re[0*blockDim.x] =
tmp_re;
1322 tmp_im = I0.x * J0.y + I0.y * J0.x;
1323 tmp_im += I0.z * J0.w + I0.w * J0.z;
1324 tmp_im += I1.x * J1.y + I1.y * J1.x;
1325 accum_im[0*blockDim.x] =
tmp_im;
1328 tmp_re = I0.x * J1.z - I0.y * J1.w;
1329 tmp_re += I0.z * J2.x - I0.w * J2.y;
1330 tmp_re += I1.x * J2.z - I1.y * J2.w;
1331 accum_re[1*blockDim.x] =
tmp_re;
1333 tmp_im = I0.x * J1.w + I0.y * J1.z;
1334 tmp_im += I0.z * J2.y + I0.w * J2.x;
1335 tmp_im += I1.x * J2.w + I1.y * J2.z;
1336 accum_im[1*blockDim.x] =
tmp_im;
1339 tmp_re = I0.x * J3.x - I0.y * J3.y;
1340 tmp_re += I0.z * J3.z - I0.w * J3.w;
1341 tmp_re += I1.x * J4.x - I1.y * J4.y;
1342 accum_re[2*blockDim.x] =
tmp_re;
1344 tmp_im = I0.x * J3.y + I0.y * J3.x;
1345 tmp_im += I0.z * J3.w + I0.w * J3.z;
1346 tmp_im += I1.x * J4.y + I1.y * J4.x;
1347 accum_im[2*blockDim.x] =
tmp_im;
1350 tmp_re = I0.x * J4.z - I0.y * J4.w;
1351 tmp_re += I0.z * J5.x - I0.w * J5.x;
1352 tmp_re += I1.x * J5.z - I1.y * J5.w;
1354 accum_re[3*blockDim.x] =
tmp_re;
1356 tmp_im = I0.x * J4.w + I0.y * J4.z;
1357 tmp_im += I0.z * J5.y + I0.w * J5.y;
1358 tmp_im += I1.x * J5.w + I1.y * J5.z;
1359 accum_im[3*blockDim.x] =
tmp_im;
1362 tmp_re = I1.z * J0.x - I1.w * J0.y;
1363 tmp_re += I2.x * J0.z - I2.y * J0.w;
1364 tmp_re += I2.z * J1.x - I2.w * J1.y;
1365 accum_re[4*blockDim.x] =
tmp_re;
1367 tmp_im = I1.z * J0.y + I1.w * J0.x;
1368 tmp_im += I2.x * J0.w + I2.y * J0.z;
1369 tmp_im += I2.z * J1.y + I2.w * J1.x;
1370 accum_im[4*blockDim.x] =
tmp_im;
1373 tmp_re = I1.z * J1.z - I1.w * J1.w;
1374 tmp_re += I2.x * J2.x - I2.y * J2.y;
1375 tmp_re += I2.z * J2.z - I2.w * J2.w;
1376 accum_re[5*blockDim.x] =
tmp_re;
1378 tmp_im = I1.z * J1.w + I1.w * J1.z;
1379 tmp_im += I2.x * J2.y + I2.y * J2.x;
1380 tmp_im += I2.z * J2.w + I2.w * J2.z;
1381 accum_im[5*blockDim.x] =
tmp_im;
1384 tmp_re = I1.z * J3.x - I1.w * J3.y;
1385 tmp_re += I2.x * J3.z - I2.y * J3.w;
1386 tmp_re += I2.z * J4.x - I2.w * J4.y;
1387 accum_re[6*blockDim.x] =
tmp_re;
1389 tmp_im = I1.z * J3.y + I1.w * J3.x;
1390 tmp_im += I2.x * J3.w + I2.y * J3.z;
1391 tmp_im += I2.z * J4.y + I2.w * J4.x;
1392 accum_im[6*blockDim.x] =
tmp_im;
1395 tmp_re = I1.z * J4.z - I1.w * J4.w;
1396 tmp_re += I2.x * J5.x - I2.y * J5.y;
1397 tmp_re += I2.z * J5.z - I2.w * J5.w;
1398 accum_re[7*blockDim.x] =
tmp_re;
1400 tmp_im = I1.z * J4.w + I1.w * J4.z;
1401 tmp_im += I2.x * J5.y + I2.y * J5.x;
1402 tmp_im += I2.z * J5.w + I2.w * J5.z;
1403 accum_im[7*blockDim.x] =
tmp_im;
1406 tmp_re = I3.x * J0.x - I3.y * J0.y;
1407 tmp_re += I3.z * J0.z - I3.w * J0.w;
1408 tmp_re += I4.x * J1.x - I4.y * J1.y;
1409 accum_re[8*blockDim.x] =
tmp_re;
1411 tmp_im = I3.x * J0.y + I3.y * J0.x;
1412 tmp_im += I3.z * J0.w + I3.w * J0.z;
1413 tmp_im += I4.x * J1.y + I4.y * J1.x;
1414 accum_im[8*blockDim.x] =
tmp_im;
1417 tmp_re = I3.x * J1.z - I3.y * J1.w;
1418 tmp_re += I3.z * J2.x - I3.w * J2.y;
1419 tmp_re += I4.x * J2.z - I4.y * J2.w;
1420 accum_re[9*blockDim.x] =
tmp_re;
1422 tmp_im = I3.x * J1.w + I3.y * J1.z;
1423 tmp_im += I3.z * J2.y + I3.w * J2.x;
1424 tmp_im += I4.x * J2.w + I4.y * J2.z;
1425 accum_im[9*blockDim.x] =
tmp_im;
1428 tmp_re = I3.x * J3.x - I3.y * J3.y;
1429 tmp_re += I3.z * J3.z - I3.w * J3.w;
1430 tmp_re += I4.x * J4.x - I4.y * J4.y;
1431 accum_re[10*blockDim.x] =
tmp_re;
1433 tmp_im = I3.x * J3.y + I3.y * J3.x;
1434 tmp_im += I3.z * J3.w + I3.w * J3.z;
1435 tmp_im += I4.x * J4.y + I4.y * J4.x;
1436 accum_im[10*blockDim.x] =
tmp_im;
1439 tmp_re = I3.x * J4.z - I3.y * J4.w;
1440 tmp_re += I3.z * J5.x - I3.w * J5.y;
1441 tmp_re += I4.x * J5.z - I4.y * J5.w;
1442 accum_re[11*blockDim.x] =
tmp_re;
1444 tmp_im = I3.x * J4.w + I3.y * J4.z;
1445 tmp_im += I3.z * J5.y + I3.w * J5.x;
1446 tmp_im += I4.x * J5.w + I4.y * J5.z;
1447 accum_im[11*blockDim.x] =
tmp_im;
1450 tmp_re = I4.z * J0.x - I4.w * J0.y;
1451 tmp_re += I5.x * J0.z - I5.y * J0.w;
1452 tmp_re += I5.z * J1.x - I5.w * J1.y;
1453 accum_re[12*blockDim.x] =
tmp_re;
1455 tmp_im = I4.z * J0.y + I4.w * J0.x;
1456 tmp_im += I5.x * J0.w + I5.y * J0.z;
1457 tmp_im += I5.z * J1.y + I5.w * J1.x;
1458 accum_im[12*blockDim.x] =
tmp_im;
1461 tmp_re = I4.z * J1.z - I4.w * J1.w;
1462 tmp_re += I5.x * J2.x - I5.y * J2.y;
1463 tmp_re += I5.z * J2.z - I5.w * J2.w;
1464 accum_re[13*blockDim.x] =
tmp_re;
1466 tmp_im = I4.z * J1.w + I4.w * J1.z;
1467 tmp_im += I5.x * J2.y + I5.y * J2.x;
1468 tmp_im += I5.z * J2.w + I5.w * J2.z;
1469 accum_im[13*blockDim.x] =
tmp_im;
1472 tmp_re = I4.z * J3.x - I4.w * J3.y;
1473 tmp_re += I5.x * J3.z - I5.y * J3.w;
1474 tmp_re += I5.z * J4.x - I5.w * J4.y;
1475 accum_re[14*blockDim.x] =
tmp_re;
1477 tmp_im = I4.z * J3.y + I4.w * J3.x;
1478 tmp_im += I5.x * J3.w + I5.y * J3.z;
1479 tmp_im += I5.z * J4.y + I5.w * J4.x;
1480 accum_im[14*blockDim.x] =
tmp_im;
1483 tmp_re = I4.z * J4.z - I4.w * J4.w;
1484 tmp_re += I5.x * J5.x - I5.y * J5.y;
1485 tmp_re += I5.z * J5.z - I5.w * J5.w;
1486 accum_re[15*blockDim.x] =
tmp_re;
1488 tmp_im = I4.z * J4.w + I4.w * J4.z;
1489 tmp_im += I5.x * J5.y + I5.y * J5.x;
1490 tmp_im += I5.z * J5.w + I5.w * J5.z;
1491 accum_im[15*blockDim.x] =
tmp_im;
1498 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
1499 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
1500 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
1501 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
1502 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
1503 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
1504 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
1505 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
1506 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
1507 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
1508 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
1509 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
1510 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
1511 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
1512 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
1513 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
1515 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
1516 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
1517 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
1518 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
1519 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
1520 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
1521 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
1522 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
1523 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
1524 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
1525 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
1526 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
1527 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
1528 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
1529 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
1530 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
1537 int sid = blockIdx.x*blockDim.x + threadIdx.x;
1539 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1544 volatile float2
tmp;
1545 extern __shared__
float sms[];
1547 volatile float *accum_re = sms + threadIdx.x;
1551 auxCoord1 = eutId /
X1;
1552 xCoord1 = eutId - auxCoord1 *
X1;
1553 auxCoord2 = auxCoord1 /
X2;
1554 xCoord2 = auxCoord1 - auxCoord2 *
X2;
1555 xCoord4 = auxCoord2 /
X3;
1556 xCoord3 = auxCoord2 - xCoord4 *
X3;
1558 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1559 xCoord1 += auxCoord1;
1560 outId = xCoord1 + X1*(xCoord2 + X2*(xCoord3 + X3*xCoord4));
1584 tmp_re = I0.x * J0.x - I0.y * J0.y;
1585 tmp_re += I0.z * J0.z - I0.w * J0.w;
1586 tmp_re += I1.x * J1.x - I1.y * J1.y;
1587 accum_re[0*blockDim.x] =
tmp_re;
1589 tmp_im = I0.x * J0.y + I0.y * J0.x;
1590 tmp_im += I0.z * J0.w + I0.w * J0.z;
1591 tmp_im += I1.x * J1.y + I1.y * J1.x;
1592 accum_im[0*blockDim.x] =
tmp_im;
1595 tmp_re = I0.x * J1.z - I0.y * J1.w;
1596 tmp_re += I0.z * J2.x - I0.w * J2.y;
1597 tmp_re += I1.x * J2.z - I1.y * J2.w;
1598 accum_re[1*blockDim.x] =
tmp_re;
1600 tmp_im = I0.x * J1.w + I0.y * J1.z;
1601 tmp_im += I0.z * J2.y + I0.w * J2.x;
1602 tmp_im += I1.x * J2.w + I1.y * J2.z;
1603 accum_im[1*blockDim.x] =
tmp_im;
1606 tmp_re = I0.x * J3.x - I0.y * J3.y;
1607 tmp_re += I0.z * J3.z - I0.w * J3.w;
1608 tmp_re += I1.x * J4.x - I1.y * J4.y;
1609 accum_re[2*blockDim.x] =
tmp_re;
1611 tmp_im = I0.x * J3.y + I0.y * J3.x;
1612 tmp_im += I0.z * J3.w + I0.w * J3.z;
1613 tmp_im += I1.x * J4.y + I1.y * J4.x;
1614 accum_im[2*blockDim.x] =
tmp_im;
1617 tmp_re = I0.x * J4.z - I0.y * J4.w;
1618 tmp_re += I0.z * J5.x - I0.w * J5.x;
1619 tmp_re += I1.x * J5.z - I1.y * J5.w;
1621 accum_re[3*blockDim.x] =
tmp_re;
1623 tmp_im = I0.x * J4.w + I0.y * J4.z;
1624 tmp_im += I0.z * J5.y + I0.w * J5.y;
1625 tmp_im += I1.x * J5.w + I1.y * J5.z;
1626 accum_im[3*blockDim.x] =
tmp_im;
1629 tmp_re = I1.z * J0.x - I1.w * J0.y;
1630 tmp_re += I2.x * J0.z - I2.y * J0.w;
1631 tmp_re += I2.z * J1.x - I2.w * J1.y;
1632 accum_re[4*blockDim.x] =
tmp_re;
1634 tmp_im = I1.z * J0.y + I1.w * J0.x;
1635 tmp_im += I2.x * J0.w + I2.y * J0.z;
1636 tmp_im += I2.z * J1.y + I2.w * J1.x;
1637 accum_im[4*blockDim.x] =
tmp_im;
1640 tmp_re = I1.z * J1.z - I1.w * J1.w;
1641 tmp_re += I2.x * J2.x - I2.y * J2.y;
1642 tmp_re += I2.z * J2.z - I2.w * J2.w;
1643 accum_re[5*blockDim.x] =
tmp_re;
1645 tmp_im = I1.z * J1.w + I1.w * J1.z;
1646 tmp_im += I2.x * J2.y + I2.y * J2.x;
1647 tmp_im += I2.z * J2.w + I2.w * J2.z;
1648 accum_im[5*blockDim.x] =
tmp_im;
1651 tmp_re = I1.z * J3.x - I1.w * J3.y;
1652 tmp_re += I2.x * J3.z - I2.y * J3.w;
1653 tmp_re += I2.z * J4.x - I2.w * J4.y;
1654 accum_re[6*blockDim.x] =
tmp_re;
1656 tmp_im = I1.z * J3.y + I1.w * J3.x;
1657 tmp_im += I2.x * J3.w + I2.y * J3.z;
1658 tmp_im += I2.z * J4.y + I2.w * J4.x;
1659 accum_im[6*blockDim.x] =
tmp_im;
1662 tmp_re = I1.z * J4.z - I1.w * J4.w;
1663 tmp_re += I2.x * J5.x - I2.y * J5.y;
1664 tmp_re += I2.z * J5.z - I2.w * J5.w;
1665 accum_re[7*blockDim.x] =
tmp_re;
1667 tmp_im = I1.z * J4.w + I1.w * J4.z;
1668 tmp_im += I2.x * J5.y + I2.y * J5.x;
1669 tmp_im += I2.z * J5.w + I2.w * J5.z;
1670 accum_im[7*blockDim.x] =
tmp_im;
1673 tmp_re = I3.x * J0.x - I3.y * J0.y;
1674 tmp_re += I3.z * J0.z - I3.w * J0.w;
1675 tmp_re += I4.x * J1.x - I4.y * J1.y;
1676 accum_re[8*blockDim.x] =
tmp_re;
1678 tmp_im = I3.x * J0.y + I3.y * J0.x;
1679 tmp_im += I3.z * J0.w + I3.w * J0.z;
1680 tmp_im += I4.x * J1.y + I4.y * J1.x;
1681 accum_im[8*blockDim.x] =
tmp_im;
1684 tmp_re = I3.x * J1.z - I3.y * J1.w;
1685 tmp_re += I3.z * J2.x - I3.w * J2.y;
1686 tmp_re += I4.x * J2.z - I4.y * J2.w;
1687 accum_re[9*blockDim.x] =
tmp_re;
1689 tmp_im = I3.x * J1.w + I3.y * J1.z;
1690 tmp_im += I3.z * J2.y + I3.w * J2.x;
1691 tmp_im += I4.x * J2.w + I4.y * J2.z;
1692 accum_im[9*blockDim.x] =
tmp_im;
1695 tmp_re = I3.x * J3.x - I3.y * J3.y;
1696 tmp_re += I3.z * J3.z - I3.w * J3.w;
1697 tmp_re += I4.x * J4.x - I4.y * J4.y;
1698 accum_re[10*blockDim.x] =
tmp_re;
1700 tmp_im = I3.x * J3.y + I3.y * J3.x;
1701 tmp_im += I3.z * J3.w + I3.w * J3.z;
1702 tmp_im += I4.x * J4.y + I4.y * J4.x;
1703 accum_im[10*blockDim.x] =
tmp_im;
1706 tmp_re = I3.x * J4.z - I3.y * J4.w;
1707 tmp_re += I3.z * J5.x - I3.w * J5.y;
1708 tmp_re += I4.x * J5.z - I4.y * J5.w;
1709 accum_re[11*blockDim.x] =
tmp_re;
1711 tmp_im = I3.x * J4.w + I3.y * J4.z;
1712 tmp_im += I3.z * J5.y + I3.w * J5.x;
1713 tmp_im += I4.x * J5.w + I4.y * J5.z;
1714 accum_im[11*blockDim.x] =
tmp_im;
1717 tmp_re = I4.z * J0.x - I4.w * J0.y;
1718 tmp_re += I5.x * J0.z - I5.y * J0.w;
1719 tmp_re += I5.z * J1.x - I5.w * J1.y;
1720 accum_re[12*blockDim.x] =
tmp_re;
1722 tmp_im = I4.z * J0.y + I4.w * J0.x;
1723 tmp_im += I5.x * J0.w + I5.y * J0.z;
1724 tmp_im += I5.z * J1.y + I5.w * J1.x;
1725 accum_im[12*blockDim.x] =
tmp_im;
1728 tmp_re = I4.z * J1.z - I4.w * J1.w;
1729 tmp_re += I5.x * J2.x - I5.y * J2.y;
1730 tmp_re += I5.z * J2.z - I5.w * J2.w;
1731 accum_re[13*blockDim.x] =
tmp_re;
1733 tmp_im = I4.z * J1.w + I4.w * J1.z;
1734 tmp_im += I5.x * J2.y + I5.y * J2.x;
1735 tmp_im += I5.z * J2.w + I5.w * J2.z;
1736 accum_im[13*blockDim.x] =
tmp_im;
1739 tmp_re = I4.z * J3.x - I4.w * J3.y;
1740 tmp_re += I5.x * J3.z - I5.y * J3.w;
1741 tmp_re += I5.z * J4.x - I5.w * J4.y;
1742 accum_re[14*blockDim.x] =
tmp_re;
1744 tmp_im = I4.z * J3.y + I4.w * J3.x;
1745 tmp_im += I5.x * J3.w + I5.y * J3.z;
1746 tmp_im += I5.z * J4.y + I5.w * J4.x;
1747 accum_im[14*blockDim.x] =
tmp_im;
1750 tmp_re = I4.z * J4.z - I4.w * J4.w;
1751 tmp_re += I5.x * J5.x - I5.y * J5.y;
1752 tmp_re += I5.z * J5.z - I5.w * J5.w;
1753 accum_re[15*blockDim.x] =
tmp_re;
1755 tmp_im = I4.z * J4.w + I4.w * J4.z;
1756 tmp_im += I5.x * J5.y + I5.y * J5.x;
1757 tmp_im += I5.z * J5.w + I5.w * J5.z;
1758 accum_im[15*blockDim.x] =
tmp_im;
1764 out[outId + 0 *param.
threads*2].x -= accum_re[ 0*blockDim.x];
1765 out[outId + 1 *param.
threads*2].x -= accum_re[ 1*blockDim.x];
1766 out[outId + 2 *param.
threads*2].x -= accum_re[ 2*blockDim.x];
1767 out[outId + 3 *param.
threads*2].x -= accum_re[ 3*blockDim.x];
1768 out[outId + 4 *param.
threads*2].x -= accum_re[ 4*blockDim.x];
1769 out[outId + 5 *param.
threads*2].x -= accum_re[ 5*blockDim.x];
1770 out[outId + 6 *param.
threads*2].x -= accum_re[ 6*blockDim.x];
1771 out[outId + 7 *param.
threads*2].x -= accum_re[ 7*blockDim.x];
1772 out[outId + 8 *param.
threads*2].x -= accum_re[ 8*blockDim.x];
1773 out[outId + 9 *param.
threads*2].x -= accum_re[ 9*blockDim.x];
1774 out[outId + 10*param.
threads*2].x -= accum_re[10*blockDim.x];
1775 out[outId + 11*param.
threads*2].x -= accum_re[11*blockDim.x];
1776 out[outId + 12*param.
threads*2].x -= accum_re[12*blockDim.x];
1777 out[outId + 13*param.
threads*2].x -= accum_re[13*blockDim.x];
1778 out[outId + 14*param.
threads*2].x -= accum_re[14*blockDim.x];
1779 out[outId + 15*param.
threads*2].x -= accum_re[15*blockDim.x];
1781 out[outId + 0 *param.
threads*2].y -= accum_im[ 0*blockDim.x];
1782 out[outId + 1 *param.
threads*2].y -= accum_im[ 1*blockDim.x];
1783 out[outId + 2 *param.
threads*2].y -= accum_im[ 2*blockDim.x];
1784 out[outId + 3 *param.
threads*2].y -= accum_im[ 3*blockDim.x];
1785 out[outId + 4 *param.
threads*2].y -= accum_im[ 4*blockDim.x];
1786 out[outId + 5 *param.
threads*2].y -= accum_im[ 5*blockDim.x];
1787 out[outId + 6 *param.
threads*2].y -= accum_im[ 6*blockDim.x];
1788 out[outId + 7 *param.
threads*2].y -= accum_im[ 7*blockDim.x];
1789 out[outId + 8 *param.
threads*2].y -= accum_im[ 8*blockDim.x];
1790 out[outId + 9 *param.
threads*2].y -= accum_im[ 9*blockDim.x];
1791 out[outId + 10*param.
threads*2].y -= accum_im[10*blockDim.x];
1792 out[outId + 11*param.
threads*2].y -= accum_im[11*blockDim.x];
1793 out[outId + 12*param.
threads*2].y -= accum_im[12*blockDim.x];
1794 out[outId + 13*param.
threads*2].y -= accum_im[13*blockDim.x];
1795 out[outId + 14*param.
threads*2].y -= accum_im[14*blockDim.x];
1796 out[outId + 15*param.
threads*2].y -= accum_im[15*blockDim.x];
1808 #undef READ_INTERMEDIATE_SPINOR
1812 #endif //_TWIST_QUDA_CONTRACT_MINUS
__global__ void contractGamma5MinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
__global__ void contractMinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Parity, const DslashParam param)
cudaColorSpinorField * tmp
cpuColorSpinorField * out
#define READ_INTERMEDIATE_SPINOR
__global__ void contractTsliceMinusKernel(float2 *out, float4 *in1, float4 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)