5 #ifndef _TWIST_QUDA_CONTRACT 6 #define _TWIST_QUDA_CONTRACT 11 #define TOTAL_COMPONENTS 16 13 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \ 14 double2 J0 = spinor[sp_idx + 0*(stride)]; \ 15 double2 J1 = spinor[sp_idx + 1*(stride)]; \ 16 double2 J2 = spinor[sp_idx + 2*(stride)]; \ 17 double2 J3 = spinor[sp_idx + 3*(stride)]; \ 18 double2 J4 = spinor[sp_idx + 4*(stride)]; \ 19 double2 J5 = spinor[sp_idx + 5*(stride)]; \ 20 double2 J6 = spinor[sp_idx + 6*(stride)]; \ 21 double2 J7 = spinor[sp_idx + 7*(stride)]; \ 22 double2 J8 = spinor[sp_idx + 8*(stride)]; \ 23 double2 J9 = spinor[sp_idx + 9*(stride)]; \ 24 double2 J10 = spinor[sp_idx +10*(stride)]; \ 25 double2 J11 = spinor[sp_idx +11*(stride)]; 27 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \ 28 double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \ 29 double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \ 30 double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \ 31 double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \ 32 double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \ 33 double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \ 34 double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \ 35 double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \ 36 double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \ 37 double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \ 38 double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \ 39 double2 J11 = fetch_double2((spinor), sp_idx +11*(stride)); 41 #ifdef DIRECT_ACCESS_WILSON_SPINOR 42 #define READ_SPINOR READ_SPINOR_DOUBLE 43 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE 47 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX 48 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX 50 #ifdef USE_TEXTURE_OBJECTS 51 #define SPINORTEX param.inTex 52 #define INTERTEX param.outTex 54 #define SPINORTEX spinorTexDouble 55 #define 
INTERTEX interTexDouble 56 #endif // USE_TEXTURE_OBJECTS 69 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
75 extern __shared__
double sm[];
77 volatile double *accum_re = sm + threadIdx.x;
// Integer quotient of eutId by param.dc.X[0]; the statement that follows
// recovers xCoord1 as the matching remainder (eutId - auxCoord1 * X[0]).
81 auxCoord1 = eutId /
param.dc.
X[0];
82 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
83 auxCoord2 = auxCoord1 /
param.dc.
X[1];
84 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
85 xCoord4 = auxCoord2 /
param.dc.
X[2];
86 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
88 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
90 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
141 tmp_re = I0.x * J0.x - I0.y * J0.y;
142 tmp_re += I1.x * J1.x - I1.y * J1.y;
143 tmp_re += I2.x * J2.x - I2.y * J2.y;
146 tmp_im = I0.x * J0.y + I0.y * J0.x;
147 tmp_im += I1.x * J1.y + I1.y * J1.x;
148 tmp_im += I2.x * J2.y + I2.y * J2.x;
152 tmp_re = I0.x * J3.x - I0.y * J3.y;
153 tmp_re += I1.x * J4.x - I1.y * J4.y;
154 tmp_re += I2.x * J5.x - I2.y * J5.y;
157 tmp_im = I0.x * J3.y + I0.y * J3.x;
158 tmp_im += I1.x * J4.y + I1.y * J4.x;
159 tmp_im += I2.x * J5.y + I2.y * J5.x;
163 tmp_re = I0.x * J6.x - I0.y * J6.y;
164 tmp_re += I1.x * J7.x - I1.y * J7.y;
165 tmp_re += I2.x * J8.x - I2.y * J8.y;
168 tmp_im = I0.x * J6.y + I0.y * J6.x;
169 tmp_im += I1.x * J7.y + I1.y * J7.x;
170 tmp_im += I2.x * J8.y + I2.y * J8.x;
174 tmp_re = I0.x * J9.x - I0.y * J9.y;
175 tmp_re += I1.x * J10.x - I1.y * J10.y;
176 tmp_re += I2.x * J11.x - I2.y * J11.y;
179 tmp_im = I0.x * J9.y + I0.y * J9.x;
180 tmp_im += I1.x * J10.y + I1.y * J10.x;
181 tmp_im += I2.x * J11.y + I2.y * J11.x;
185 tmp_re = I3.x * J0.x - I3.y * J0.y;
186 tmp_re += I4.x * J1.x - I4.y * J1.y;
187 tmp_re += I5.x * J2.x - I5.y * J2.y;
190 tmp_im = I3.x * J0.y + I3.y * J0.x;
191 tmp_im += I4.x * J1.y + I4.y * J1.x;
192 tmp_im += I5.x * J2.y + I5.y * J2.x;
196 tmp_re = I3.x * J3.x - I3.y * J3.y;
197 tmp_re += I4.x * J4.x - I4.y * J4.y;
198 tmp_re += I5.x * J5.x - I5.y * J5.y;
201 tmp_im = I3.x * J3.y + I3.y * J3.x;
202 tmp_im += I4.x * J4.y + I4.y * J4.x;
203 tmp_im += I5.x * J5.y + I5.y * J5.x;
207 tmp_re = I3.x * J6.x - I3.y * J6.y;
208 tmp_re += I4.x * J7.x - I4.y * J7.y;
209 tmp_re += I5.x * J8.x - I5.y * J8.y;
212 tmp_im = I3.x * J6.y + I3.y * J6.x;
213 tmp_im += I4.x * J7.y + I4.y * J7.x;
214 tmp_im += I5.x * J8.y + I5.y * J8.x;
218 tmp_re = I3.x * J9.x - I3.y * J9.y;
219 tmp_re += I4.x * J10.x - I4.y * J10.y;
220 tmp_re += I5.x * J11.x - I5.y * J11.y;
223 tmp_im = I3.x * J9.y + I3.y * J9.x;
224 tmp_im += I4.x * J10.y + I4.y * J10.x;
225 tmp_im += I5.x * J11.y + I5.y * J11.x;
229 tmp_re = I6.x * J0.x - I6.y * J0.y;
230 tmp_re += I7.x * J1.x - I7.y * J1.y;
231 tmp_re += I8.x * J2.x - I8.y * J2.y;
234 tmp_im = I6.x * J0.y + I6.y * J0.x;
235 tmp_im += I7.x * J1.y + I7.y * J1.x;
236 tmp_im += I8.x * J2.y + I8.y * J2.x;
240 tmp_re = I6.x * J3.x - I6.y * J3.y;
241 tmp_re += I7.x * J4.x - I7.y * J4.y;
242 tmp_re += I8.x * J5.x - I8.y * J5.y;
245 tmp_im = I6.x * J3.y + I6.y * J3.x;
246 tmp_im += I7.x * J4.y + I7.y * J4.x;
247 tmp_im += I8.x * J5.y + I8.y * J5.x;
251 tmp_re = I6.x * J6.x - I6.y * J6.y;
252 tmp_re += I7.x * J7.x - I7.y * J7.y;
253 tmp_re += I8.x * J8.x - I8.y * J8.y;
256 tmp_im = I6.x * J6.y + I6.y * J6.x;
257 tmp_im += I7.x * J7.y + I7.y * J7.x;
258 tmp_im += I8.x * J8.y + I8.y * J8.x;
262 tmp_re = I6.x * J9.x - I6.y * J9.y;
263 tmp_re += I7.x * J10.x - I7.y * J10.y;
264 tmp_re += I8.x * J11.x - I8.y * J11.y;
267 tmp_im = I6.x * J9.y + I6.y * J9.x;
268 tmp_im += I7.x * J10.y + I7.y * J10.x;
269 tmp_im += I8.x * J11.y + I8.y * J11.x;
273 tmp_re = I9.x * J0.x - I9.y * J0.y;
274 tmp_re += I10.x * J1.x - I10.y * J1.y;
275 tmp_re += I11.x * J2.x - I11.y * J2.y;
278 tmp_im = I9.x * J0.y + I9.y * J0.x;
279 tmp_im += I10.x * J1.y + I10.y * J1.x;
280 tmp_im += I11.x * J2.y + I11.y * J2.x;
284 tmp_re = I9.x * J3.x - I9.y * J3.y;
285 tmp_re += I10.x * J4.x - I10.y * J4.y;
286 tmp_re += I11.x * J5.x - I11.y * J5.y;
289 tmp_im = I9.x * J3.y + I9.y * J3.x;
290 tmp_im += I10.x * J4.y + I10.y * J4.x;
291 tmp_im += I11.x * J5.y + I11.y * J5.x;
295 tmp_re = I9.x * J6.x - I9.y * J6.y;
296 tmp_re += I10.x * J7.x - I10.y * J7.y;
297 tmp_re += I11.x * J8.x - I11.y * J8.y;
300 tmp_im = I9.x * J6.y + I9.y * J6.x;
301 tmp_im += I10.x * J7.y + I10.y * J7.x;
302 tmp_im += I11.x * J8.y + I11.y * J8.x;
306 tmp_re = I9.x * J9.x - I9.y * J9.y;
307 tmp_re += I10.x * J10.x - I10.y * J10.y;
308 tmp_re += I11.x * J11.x - I11.y * J11.y;
311 tmp_im = I9.x * J9.y + I9.y * J9.x;
312 tmp_im += I10.x * J10.y + I10.y * J10.x;
313 tmp_im += I11.x * J11.y + I11.y * J11.x;
349 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
354 volatile double2
tmp;
355 extern __shared__
double sm[];
357 volatile double *accum_re = sm + threadIdx.x;
363 auxCoord1 = eutId /
param.dc.
X[0];
364 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
365 auxCoord2 = auxCoord1 /
param.dc.
X[1];
366 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
367 xCoord4 = auxCoord2 /
param.dc.
X[2];
368 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
370 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
371 xCoord1 += auxCoord1;
372 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
394 tmp_re = I0.x * J0.x - I0.y * J0.y;
395 tmp_re += I1.x * J1.x - I1.y * J1.y;
396 tmp_re += I2.x * J2.x - I2.y * J2.y;
399 tmp_im = I0.x * J0.y + I0.y * J0.x;
400 tmp_im += I1.x * J1.y + I1.y * J1.x;
401 tmp_im += I2.x * J2.y + I2.y * J2.x;
405 tmp_re = I0.x * J3.x - I0.y * J3.y;
406 tmp_re += I1.x * J4.x - I1.y * J4.y;
407 tmp_re += I2.x * J5.x - I2.y * J5.y;
410 tmp_im = I0.x * J3.y + I0.y * J3.x;
411 tmp_im += I1.x * J4.y + I1.y * J4.x;
412 tmp_im += I2.x * J5.y + I2.y * J5.x;
416 tmp_re = I0.x * J6.x - I0.y * J6.y;
417 tmp_re += I1.x * J7.x - I1.y * J7.y;
418 tmp_re += I2.x * J8.x - I2.y * J8.y;
421 tmp_im = I0.x * J6.y + I0.y * J6.x;
422 tmp_im += I1.x * J7.y + I1.y * J7.x;
423 tmp_im += I2.x * J8.y + I2.y * J8.x;
427 tmp_re = I0.x * J9.x - I0.y * J9.y;
428 tmp_re += I1.x * J10.x - I1.y * J10.y;
429 tmp_re += I2.x * J11.x - I2.y * J11.y;
432 tmp_im = I0.x * J9.y + I0.y * J9.x;
433 tmp_im += I1.x * J10.y + I1.y * J10.x;
434 tmp_im += I2.x * J11.y + I2.y * J11.x;
438 tmp_re = I3.x * J0.x - I3.y * J0.y;
439 tmp_re += I4.x * J1.x - I4.y * J1.y;
440 tmp_re += I5.x * J2.x - I5.y * J2.y;
443 tmp_im = I3.x * J0.y + I3.y * J0.x;
444 tmp_im += I4.x * J1.y + I4.y * J1.x;
445 tmp_im += I5.x * J2.y + I5.y * J2.x;
449 tmp_re = I3.x * J3.x - I3.y * J3.y;
450 tmp_re += I4.x * J4.x - I4.y * J4.y;
451 tmp_re += I5.x * J5.x - I5.y * J5.y;
454 tmp_im = I3.x * J3.y + I3.y * J3.x;
455 tmp_im += I4.x * J4.y + I4.y * J4.x;
456 tmp_im += I5.x * J5.y + I5.y * J5.x;
460 tmp_re = I3.x * J6.x - I3.y * J6.y;
461 tmp_re += I4.x * J7.x - I4.y * J7.y;
462 tmp_re += I5.x * J8.x - I5.y * J8.y;
465 tmp_im = I3.x * J6.y + I3.y * J6.x;
466 tmp_im += I4.x * J7.y + I4.y * J7.x;
467 tmp_im += I5.x * J8.y + I5.y * J8.x;
471 tmp_re = I3.x * J9.x - I3.y * J9.y;
472 tmp_re += I4.x * J10.x - I4.y * J10.y;
473 tmp_re += I5.x * J11.x - I5.y * J11.y;
476 tmp_im = I3.x * J9.y + I3.y * J9.x;
477 tmp_im += I4.x * J10.y + I4.y * J10.x;
478 tmp_im += I5.x * J11.y + I5.y * J11.x;
482 tmp_re = I6.x * J0.x - I6.y * J0.y;
483 tmp_re += I7.x * J1.x - I7.y * J1.y;
484 tmp_re += I8.x * J2.x - I8.y * J2.y;
487 tmp_im = I6.x * J0.y + I6.y * J0.x;
488 tmp_im += I7.x * J1.y + I7.y * J1.x;
489 tmp_im += I8.x * J2.y + I8.y * J2.x;
493 tmp_re = I6.x * J3.x - I6.y * J3.y;
494 tmp_re += I7.x * J4.x - I7.y * J4.y;
495 tmp_re += I8.x * J5.x - I8.y * J5.y;
498 tmp_im = I6.x * J3.y + I6.y * J3.x;
499 tmp_im += I7.x * J4.y + I7.y * J4.x;
500 tmp_im += I8.x * J5.y + I8.y * J5.x;
504 tmp_re = I6.x * J6.x - I6.y * J6.y;
505 tmp_re += I7.x * J7.x - I7.y * J7.y;
506 tmp_re += I8.x * J8.x - I8.y * J8.y;
509 tmp_im = I6.x * J6.y + I6.y * J6.x;
510 tmp_im += I7.x * J7.y + I7.y * J7.x;
511 tmp_im += I8.x * J8.y + I8.y * J8.x;
515 tmp_re = I6.x * J9.x - I6.y * J9.y;
516 tmp_re += I7.x * J10.x - I7.y * J10.y;
517 tmp_re += I8.x * J11.x - I8.y * J11.y;
520 tmp_im = I6.x * J9.y + I6.y * J9.x;
521 tmp_im += I7.x * J10.y + I7.y * J10.x;
522 tmp_im += I8.x * J11.y + I8.y * J11.x;
526 tmp_re = I9.x * J0.x - I9.y * J0.y;
527 tmp_re += I10.x * J1.x - I10.y * J1.y;
528 tmp_re += I11.x * J2.x - I11.y * J2.y;
531 tmp_im = I9.x * J0.y + I9.y * J0.x;
532 tmp_im += I10.x * J1.y + I10.y * J1.x;
533 tmp_im += I11.x * J2.y + I11.y * J2.x;
537 tmp_re = I9.x * J3.x - I9.y * J3.y;
538 tmp_re += I10.x * J4.x - I10.y * J4.y;
539 tmp_re += I11.x * J5.x - I11.y * J5.y;
542 tmp_im = I9.x * J3.y + I9.y * J3.x;
543 tmp_im += I10.x * J4.y + I10.y * J4.x;
544 tmp_im += I11.x * J5.y + I11.y * J5.x;
548 tmp_re = I9.x * J6.x - I9.y * J6.y;
549 tmp_re += I10.x * J7.x - I10.y * J7.y;
550 tmp_re += I11.x * J8.x - I11.y * J8.y;
553 tmp_im = I9.x * J6.y + I9.y * J6.x;
554 tmp_im += I10.x * J7.y + I10.y * J7.x;
555 tmp_im += I11.x * J8.y + I11.y * J8.x;
559 tmp_re = I9.x * J9.x - I9.y * J9.y;
560 tmp_re += I10.x * J10.x - I10.y * J10.y;
561 tmp_re += I11.x * J11.x - I11.y * J11.y;
564 tmp_im = I9.x * J9.y + I9.y * J9.x;
565 tmp_im += I10.x * J10.y + I10.y * J10.x;
566 tmp_im += I11.x * J11.y + I11.y * J11.x;
598 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
603 volatile double2
tmp;
604 extern __shared__
double sm[];
606 volatile double *accum_re = sm + threadIdx.x;
610 auxCoord1 = eutId /
param.dc.
X[0];
611 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
612 auxCoord2 = auxCoord1 /
param.dc.
X[1];
613 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
614 xCoord4 = auxCoord2 /
param.dc.
X[2];
615 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
617 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
618 xCoord1 += auxCoord1;
619 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
641 tmp_re = I0.x * J0.x - I0.y * J0.y;
642 tmp_re += I1.x * J1.x - I1.y * J1.y;
643 tmp_re += I2.x * J2.x - I2.y * J2.y;
646 tmp_im = I0.x * J0.y + I0.y * J0.x;
647 tmp_im += I1.x * J1.y + I1.y * J1.x;
648 tmp_im += I2.x * J2.y + I2.y * J2.x;
652 tmp_re = I0.x * J3.x - I0.y * J3.y;
653 tmp_re += I1.x * J4.x - I1.y * J4.y;
654 tmp_re += I2.x * J5.x - I2.y * J5.y;
657 tmp_im = I0.x * J3.y + I0.y * J3.x;
658 tmp_im += I1.x * J4.y + I1.y * J4.x;
659 tmp_im += I2.x * J5.y + I2.y * J5.x;
663 tmp_re = I0.x * J6.x - I0.y * J6.y;
664 tmp_re += I1.x * J7.x - I1.y * J7.y;
665 tmp_re += I2.x * J8.x - I2.y * J8.y;
668 tmp_im = I0.x * J6.y + I0.y * J6.x;
669 tmp_im += I1.x * J7.y + I1.y * J7.x;
670 tmp_im += I2.x * J8.y + I2.y * J8.x;
674 tmp_re = I0.x * J9.x - I0.y * J9.y;
675 tmp_re += I1.x * J10.x - I1.y * J10.y;
676 tmp_re += I2.x * J11.x - I2.y * J11.y;
679 tmp_im = I0.x * J9.y + I0.y * J9.x;
680 tmp_im += I1.x * J10.y + I1.y * J10.x;
681 tmp_im += I2.x * J11.y + I2.y * J11.x;
685 tmp_re = I3.x * J0.x - I3.y * J0.y;
686 tmp_re += I4.x * J1.x - I4.y * J1.y;
687 tmp_re += I5.x * J2.x - I5.y * J2.y;
690 tmp_im = I3.x * J0.y + I3.y * J0.x;
691 tmp_im += I4.x * J1.y + I4.y * J1.x;
692 tmp_im += I5.x * J2.y + I5.y * J2.x;
696 tmp_re = I3.x * J3.x - I3.y * J3.y;
697 tmp_re += I4.x * J4.x - I4.y * J4.y;
698 tmp_re += I5.x * J5.x - I5.y * J5.y;
701 tmp_im = I3.x * J3.y + I3.y * J3.x;
702 tmp_im += I4.x * J4.y + I4.y * J4.x;
703 tmp_im += I5.x * J5.y + I5.y * J5.x;
707 tmp_re = I3.x * J6.x - I3.y * J6.y;
708 tmp_re += I4.x * J7.x - I4.y * J7.y;
709 tmp_re += I5.x * J8.x - I5.y * J8.y;
712 tmp_im = I3.x * J6.y + I3.y * J6.x;
713 tmp_im += I4.x * J7.y + I4.y * J7.x;
714 tmp_im += I5.x * J8.y + I5.y * J8.x;
718 tmp_re = I3.x * J9.x - I3.y * J9.y;
719 tmp_re += I4.x * J10.x - I4.y * J10.y;
720 tmp_re += I5.x * J11.x - I5.y * J11.y;
723 tmp_im = I3.x * J9.y + I3.y * J9.x;
724 tmp_im += I4.x * J10.y + I4.y * J10.x;
725 tmp_im += I5.x * J11.y + I5.y * J11.x;
729 tmp_re = I6.x * J0.x - I6.y * J0.y;
730 tmp_re += I7.x * J1.x - I7.y * J1.y;
731 tmp_re += I8.x * J2.x - I8.y * J2.y;
734 tmp_im = I6.x * J0.y + I6.y * J0.x;
735 tmp_im += I7.x * J1.y + I7.y * J1.x;
736 tmp_im += I8.x * J2.y + I8.y * J2.x;
740 tmp_re = I6.x * J3.x - I6.y * J3.y;
741 tmp_re += I7.x * J4.x - I7.y * J4.y;
742 tmp_re += I8.x * J5.x - I8.y * J5.y;
745 tmp_im = I6.x * J3.y + I6.y * J3.x;
746 tmp_im += I7.x * J4.y + I7.y * J4.x;
747 tmp_im += I8.x * J5.y + I8.y * J5.x;
751 tmp_re = I6.x * J6.x - I6.y * J6.y;
752 tmp_re += I7.x * J7.x - I7.y * J7.y;
753 tmp_re += I8.x * J8.x - I8.y * J8.y;
756 tmp_im = I6.x * J6.y + I6.y * J6.x;
757 tmp_im += I7.x * J7.y + I7.y * J7.x;
758 tmp_im += I8.x * J8.y + I8.y * J8.x;
762 tmp_re = I6.x * J9.x - I6.y * J9.y;
763 tmp_re += I7.x * J10.x - I7.y * J10.y;
764 tmp_re += I8.x * J11.x - I8.y * J11.y;
767 tmp_im = I6.x * J9.y + I6.y * J9.x;
768 tmp_im += I7.x * J10.y + I7.y * J10.x;
769 tmp_im += I8.x * J11.y + I8.y * J11.x;
773 tmp_re = I9.x * J0.x - I9.y * J0.y;
774 tmp_re += I10.x * J1.x - I10.y * J1.y;
775 tmp_re += I11.x * J2.x - I11.y * J2.y;
778 tmp_im = I9.x * J0.y + I9.y * J0.x;
779 tmp_im += I10.x * J1.y + I10.y * J1.x;
780 tmp_im += I11.x * J2.y + I11.y * J2.x;
784 tmp_re = I9.x * J3.x - I9.y * J3.y;
785 tmp_re += I10.x * J4.x - I10.y * J4.y;
786 tmp_re += I11.x * J5.x - I11.y * J5.y;
789 tmp_im = I9.x * J3.y + I9.y * J3.x;
790 tmp_im += I10.x * J4.y + I10.y * J4.x;
791 tmp_im += I11.x * J5.y + I11.y * J5.x;
795 tmp_re = I9.x * J6.x - I9.y * J6.y;
796 tmp_re += I10.x * J7.x - I10.y * J7.y;
797 tmp_re += I11.x * J8.x - I11.y * J8.y;
800 tmp_im = I9.x * J6.y + I9.y * J6.x;
801 tmp_im += I10.x * J7.y + I10.y * J7.x;
802 tmp_im += I11.x * J8.y + I11.y * J8.x;
806 tmp_re = I9.x * J9.x - I9.y * J9.y;
807 tmp_re += I10.x * J10.x - I10.y * J10.y;
808 tmp_re += I11.x * J11.x - I11.y * J11.y;
811 tmp_im = I9.x * J9.y + I9.y * J9.x;
812 tmp_im += I10.x * J10.y + I10.y * J10.x;
813 tmp_im += I11.x * J11.y + I11.y * J11.x;
842 #undef READ_INTERMEDIATE_SPINOR 848 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \ 849 float4 I0 = spinor[sp_idx + 0*(stride)]; \ 850 float4 I1 = spinor[sp_idx + 1*(stride)]; \ 851 float4 I2 = spinor[sp_idx + 2*(stride)]; \ 852 float4 I3 = spinor[sp_idx + 3*(stride)]; \ 853 float4 I4 = spinor[sp_idx + 4*(stride)]; \ 854 float4 I5 = spinor[sp_idx + 5*(stride)]; 857 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \ 858 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \ 859 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \ 860 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \ 861 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \ 862 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \ 863 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); 865 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \ 866 float4 J0 = spinor[sp_idx + 0*(stride)]; \ 867 float4 J1 = spinor[sp_idx + 1*(stride)]; \ 868 float4 J2 = spinor[sp_idx + 2*(stride)]; \ 869 float4 J3 = spinor[sp_idx + 3*(stride)]; \ 870 float4 J4 = spinor[sp_idx + 4*(stride)]; \ 871 float4 J5 = spinor[sp_idx + 5*(stride)]; 873 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \ 874 float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \ 875 float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \ 876 float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \ 877 float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \ 878 float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \ 879 float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); 882 #ifdef DIRECT_ACCESS_WILSON_SPINOR 883 #define READ_SPINOR READ_SPINOR_SINGLE 886 #define READ_SPINOR READ_SPINOR_SINGLE_TEX 888 #ifdef USE_TEXTURE_OBJECTS 889 #define SPINORTEX param.inTex 891 #define SPINORTEX spinorTexSingle 892 #endif // USE_TEXTURE_OBJECTS 895 #ifdef 
DIRECT_ACCESS_WILSON_INTER 896 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE 899 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX 901 #ifdef USE_TEXTURE_OBJECTS 902 #define INTERTEX param.outTex 904 #define INTERTEX interTexSingle 905 #endif // USE_TEXTURE_OBJECTS 914 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
920 extern __shared__
float sms[];
922 volatile float *accum_re = sms + threadIdx.x;
926 auxCoord1 = eutId /
param.dc.
X[0];
927 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
928 auxCoord2 = auxCoord1 /
param.dc.
X[1];
929 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
930 xCoord4 = auxCoord2 /
param.dc.
X[2];
931 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
933 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
934 xCoord1 += auxCoord1;
935 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
995 tmp_re = I0.x * J0.x - I0.y * J0.y;
996 tmp_re += I0.z * J0.z - I0.w * J0.w;
997 tmp_re += I1.x * J1.x - I1.y * J1.y;
1000 tmp_im = I0.x * J0.y + I0.y * J0.x;
1001 tmp_im += I0.z * J0.w + I0.w * J0.z;
1002 tmp_im += I1.x * J1.y + I1.y * J1.x;
1006 tmp_re = I0.x * J1.z - I0.y * J1.w;
1007 tmp_re += I0.z * J2.x - I0.w * J2.y;
1008 tmp_re += I1.x * J2.z - I1.y * J2.w;
1011 tmp_im = I0.x * J1.w + I0.y * J1.z;
1012 tmp_im += I0.z * J2.y + I0.w * J2.x;
1013 tmp_im += I1.x * J2.w + I1.y * J2.z;
1017 tmp_re = I0.x * J3.x - I0.y * J3.y;
1018 tmp_re += I0.z * J3.z - I0.w * J3.w;
1019 tmp_re += I1.x * J4.x - I1.y * J4.y;
1022 tmp_im = I0.x * J3.y + I0.y * J3.x;
1023 tmp_im += I0.z * J3.w + I0.w * J3.z;
1024 tmp_im += I1.x * J4.y + I1.y * J4.x;
// Sum of three complex products, each float4 carrying two (re, im) pairs as
// (x, y) and (z, w):
//   (I0.x, I0.y) * (J4.z, J4.w)
// + (I0.z, I0.w) * (J5.x, J5.y)
// + (I1.x, I1.y) * (J5.z, J5.w)
// Real part is a.re*b.re - a.im*b.im, imaginary part is a.re*b.im + a.im*b.re.
// The middle term previously multiplied I0.w by the wrong half of the J5 pair
// (J5.x in the real part, J5.y in the imaginary part); it now pairs re/im the
// same way the sibling groups built from I3/I4 and the same J operands do.
1028 tmp_re = I0.x * J4.z - I0.y * J4.w;
1029 tmp_re += I0.z * J5.x - I0.w * J5.y;
1030 tmp_re += I1.x * J5.z - I1.y * J5.w;
1034 tmp_im = I0.x * J4.w + I0.y * J4.z;
1035 tmp_im += I0.z * J5.y + I0.w * J5.x;
1036 tmp_im += I1.x * J5.w + I1.y * J5.z;
1040 tmp_re = I1.z * J0.x - I1.w * J0.y;
1041 tmp_re += I2.x * J0.z - I2.y * J0.w;
1042 tmp_re += I2.z * J1.x - I2.w * J1.y;
1045 tmp_im = I1.z * J0.y + I1.w * J0.x;
1046 tmp_im += I2.x * J0.w + I2.y * J0.z;
1047 tmp_im += I2.z * J1.y + I2.w * J1.x;
1051 tmp_re = I1.z * J1.z - I1.w * J1.w;
1052 tmp_re += I2.x * J2.x - I2.y * J2.y;
1053 tmp_re += I2.z * J2.z - I2.w * J2.w;
1056 tmp_im = I1.z * J1.w + I1.w * J1.z;
1057 tmp_im += I2.x * J2.y + I2.y * J2.x;
1058 tmp_im += I2.z * J2.w + I2.w * J2.z;
1062 tmp_re = I1.z * J3.x - I1.w * J3.y;
1063 tmp_re += I2.x * J3.z - I2.y * J3.w;
1064 tmp_re += I2.z * J4.x - I2.w * J4.y;
1067 tmp_im = I1.z * J3.y + I1.w * J3.x;
1068 tmp_im += I2.x * J3.w + I2.y * J3.z;
1069 tmp_im += I2.z * J4.y + I2.w * J4.x;
1073 tmp_re = I1.z * J4.z - I1.w * J4.w;
1074 tmp_re += I2.x * J5.x - I2.y * J5.y;
1075 tmp_re += I2.z * J5.z - I2.w * J5.w;
1078 tmp_im = I1.z * J4.w + I1.w * J4.z;
1079 tmp_im += I2.x * J5.y + I2.y * J5.x;
1080 tmp_im += I2.z * J5.w + I2.w * J5.z;
1084 tmp_re = I3.x * J0.x - I3.y * J0.y;
1085 tmp_re += I3.z * J0.z - I3.w * J0.w;
1086 tmp_re += I4.x * J1.x - I4.y * J1.y;
1089 tmp_im = I3.x * J0.y + I3.y * J0.x;
1090 tmp_im += I3.z * J0.w + I3.w * J0.z;
1091 tmp_im += I4.x * J1.y + I4.y * J1.x;
1095 tmp_re = I3.x * J1.z - I3.y * J1.w;
1096 tmp_re += I3.z * J2.x - I3.w * J2.y;
1097 tmp_re += I4.x * J2.z - I4.y * J2.w;
1100 tmp_im = I3.x * J1.w + I3.y * J1.z;
1101 tmp_im += I3.z * J2.y + I3.w * J2.x;
1102 tmp_im += I4.x * J2.w + I4.y * J2.z;
1106 tmp_re = I3.x * J3.x - I3.y * J3.y;
1107 tmp_re += I3.z * J3.z - I3.w * J3.w;
1108 tmp_re += I4.x * J4.x - I4.y * J4.y;
1111 tmp_im = I3.x * J3.y + I3.y * J3.x;
1112 tmp_im += I3.z * J3.w + I3.w * J3.z;
1113 tmp_im += I4.x * J4.y + I4.y * J4.x;
1117 tmp_re = I3.x * J4.z - I3.y * J4.w;
1118 tmp_re += I3.z * J5.x - I3.w * J5.y;
1119 tmp_re += I4.x * J5.z - I4.y * J5.w;
1122 tmp_im = I3.x * J4.w + I3.y * J4.z;
1123 tmp_im += I3.z * J5.y + I3.w * J5.x;
1124 tmp_im += I4.x * J5.w + I4.y * J5.z;
1128 tmp_re = I4.z * J0.x - I4.w * J0.y;
1129 tmp_re += I5.x * J0.z - I5.y * J0.w;
1130 tmp_re += I5.z * J1.x - I5.w * J1.y;
1133 tmp_im = I4.z * J0.y + I4.w * J0.x;
1134 tmp_im += I5.x * J0.w + I5.y * J0.z;
1135 tmp_im += I5.z * J1.y + I5.w * J1.x;
1139 tmp_re = I4.z * J1.z - I4.w * J1.w;
1140 tmp_re += I5.x * J2.x - I5.y * J2.y;
1141 tmp_re += I5.z * J2.z - I5.w * J2.w;
1144 tmp_im = I4.z * J1.w + I4.w * J1.z;
1145 tmp_im += I5.x * J2.y + I5.y * J2.x;
1146 tmp_im += I5.z * J2.w + I5.w * J2.z;
1150 tmp_re = I4.z * J3.x - I4.w * J3.y;
1151 tmp_re += I5.x * J3.z - I5.y * J3.w;
1152 tmp_re += I5.z * J4.x - I5.w * J4.y;
1155 tmp_im = I4.z * J3.y + I4.w * J3.x;
1156 tmp_im += I5.x * J3.w + I5.y * J3.z;
1157 tmp_im += I5.z * J4.y + I5.w * J4.x;
1161 tmp_re = I4.z * J4.z - I4.w * J4.w;
1162 tmp_re += I5.x * J5.x - I5.y * J5.y;
1163 tmp_re += I5.z * J5.z - I5.w * J5.w;
1166 tmp_im = I4.z * J4.w + I4.w * J4.z;
1167 tmp_im += I5.x * J5.y + I5.y * J5.x;
1168 tmp_im += I5.z * J5.w + I5.w * J5.z;
1206 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1211 volatile float2
tmp;
1212 extern __shared__
float sms[];
1214 volatile float *accum_re = sms + threadIdx.x;
1220 auxCoord1 = eutId /
param.dc.
X[0];
1221 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
1222 auxCoord2 = auxCoord1 /
param.dc.
X[1];
1223 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
1224 xCoord4 = auxCoord2 /
param.dc.
X[2];
1225 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
1227 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1228 xCoord1 += auxCoord1;
1229 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
1253 tmp_re = I0.x * J0.x - I0.y * J0.y;
1254 tmp_re += I0.z * J0.z - I0.w * J0.w;
1255 tmp_re += I1.x * J1.x - I1.y * J1.y;
1258 tmp_im = I0.x * J0.y + I0.y * J0.x;
1259 tmp_im += I0.z * J0.w + I0.w * J0.z;
1260 tmp_im += I1.x * J1.y + I1.y * J1.x;
1264 tmp_re = I0.x * J1.z - I0.y * J1.w;
1265 tmp_re += I0.z * J2.x - I0.w * J2.y;
1266 tmp_re += I1.x * J2.z - I1.y * J2.w;
1269 tmp_im = I0.x * J1.w + I0.y * J1.z;
1270 tmp_im += I0.z * J2.y + I0.w * J2.x;
1271 tmp_im += I1.x * J2.w + I1.y * J2.z;
1275 tmp_re = I0.x * J3.x - I0.y * J3.y;
1276 tmp_re += I0.z * J3.z - I0.w * J3.w;
1277 tmp_re += I1.x * J4.x - I1.y * J4.y;
1280 tmp_im = I0.x * J3.y + I0.y * J3.x;
1281 tmp_im += I0.z * J3.w + I0.w * J3.z;
1282 tmp_im += I1.x * J4.y + I1.y * J4.x;
// Sum of three complex products, each float4 carrying two (re, im) pairs as
// (x, y) and (z, w):
//   (I0.x, I0.y) * (J4.z, J4.w)
// + (I0.z, I0.w) * (J5.x, J5.y)
// + (I1.x, I1.y) * (J5.z, J5.w)
// Real part is a.re*b.re - a.im*b.im, imaginary part is a.re*b.im + a.im*b.re.
// The middle term previously multiplied I0.w by the wrong half of the J5 pair
// (J5.x in the real part, J5.y in the imaginary part); it now pairs re/im the
// same way the sibling groups built from I3/I4 and the same J operands do.
1286 tmp_re = I0.x * J4.z - I0.y * J4.w;
1287 tmp_re += I0.z * J5.x - I0.w * J5.y;
1288 tmp_re += I1.x * J5.z - I1.y * J5.w;
1292 tmp_im = I0.x * J4.w + I0.y * J4.z;
1293 tmp_im += I0.z * J5.y + I0.w * J5.x;
1294 tmp_im += I1.x * J5.w + I1.y * J5.z;
1298 tmp_re = I1.z * J0.x - I1.w * J0.y;
1299 tmp_re += I2.x * J0.z - I2.y * J0.w;
1300 tmp_re += I2.z * J1.x - I2.w * J1.y;
1303 tmp_im = I1.z * J0.y + I1.w * J0.x;
1304 tmp_im += I2.x * J0.w + I2.y * J0.z;
1305 tmp_im += I2.z * J1.y + I2.w * J1.x;
1309 tmp_re = I1.z * J1.z - I1.w * J1.w;
1310 tmp_re += I2.x * J2.x - I2.y * J2.y;
1311 tmp_re += I2.z * J2.z - I2.w * J2.w;
1314 tmp_im = I1.z * J1.w + I1.w * J1.z;
1315 tmp_im += I2.x * J2.y + I2.y * J2.x;
1316 tmp_im += I2.z * J2.w + I2.w * J2.z;
1320 tmp_re = I1.z * J3.x - I1.w * J3.y;
1321 tmp_re += I2.x * J3.z - I2.y * J3.w;
1322 tmp_re += I2.z * J4.x - I2.w * J4.y;
1325 tmp_im = I1.z * J3.y + I1.w * J3.x;
1326 tmp_im += I2.x * J3.w + I2.y * J3.z;
1327 tmp_im += I2.z * J4.y + I2.w * J4.x;
1331 tmp_re = I1.z * J4.z - I1.w * J4.w;
1332 tmp_re += I2.x * J5.x - I2.y * J5.y;
1333 tmp_re += I2.z * J5.z - I2.w * J5.w;
1336 tmp_im = I1.z * J4.w + I1.w * J4.z;
1337 tmp_im += I2.x * J5.y + I2.y * J5.x;
1338 tmp_im += I2.z * J5.w + I2.w * J5.z;
1342 tmp_re = I3.x * J0.x - I3.y * J0.y;
1343 tmp_re += I3.z * J0.z - I3.w * J0.w;
1344 tmp_re += I4.x * J1.x - I4.y * J1.y;
1347 tmp_im = I3.x * J0.y + I3.y * J0.x;
1348 tmp_im += I3.z * J0.w + I3.w * J0.z;
1349 tmp_im += I4.x * J1.y + I4.y * J1.x;
1353 tmp_re = I3.x * J1.z - I3.y * J1.w;
1354 tmp_re += I3.z * J2.x - I3.w * J2.y;
1355 tmp_re += I4.x * J2.z - I4.y * J2.w;
1358 tmp_im = I3.x * J1.w + I3.y * J1.z;
1359 tmp_im += I3.z * J2.y + I3.w * J2.x;
1360 tmp_im += I4.x * J2.w + I4.y * J2.z;
1364 tmp_re = I3.x * J3.x - I3.y * J3.y;
1365 tmp_re += I3.z * J3.z - I3.w * J3.w;
1366 tmp_re += I4.x * J4.x - I4.y * J4.y;
1369 tmp_im = I3.x * J3.y + I3.y * J3.x;
1370 tmp_im += I3.z * J3.w + I3.w * J3.z;
1371 tmp_im += I4.x * J4.y + I4.y * J4.x;
1375 tmp_re = I3.x * J4.z - I3.y * J4.w;
1376 tmp_re += I3.z * J5.x - I3.w * J5.y;
1377 tmp_re += I4.x * J5.z - I4.y * J5.w;
1380 tmp_im = I3.x * J4.w + I3.y * J4.z;
1381 tmp_im += I3.z * J5.y + I3.w * J5.x;
1382 tmp_im += I4.x * J5.w + I4.y * J5.z;
1386 tmp_re = I4.z * J0.x - I4.w * J0.y;
1387 tmp_re += I5.x * J0.z - I5.y * J0.w;
1388 tmp_re += I5.z * J1.x - I5.w * J1.y;
1391 tmp_im = I4.z * J0.y + I4.w * J0.x;
1392 tmp_im += I5.x * J0.w + I5.y * J0.z;
1393 tmp_im += I5.z * J1.y + I5.w * J1.x;
1397 tmp_re = I4.z * J1.z - I4.w * J1.w;
1398 tmp_re += I5.x * J2.x - I5.y * J2.y;
1399 tmp_re += I5.z * J2.z - I5.w * J2.w;
1402 tmp_im = I4.z * J1.w + I4.w * J1.z;
1403 tmp_im += I5.x * J2.y + I5.y * J2.x;
1404 tmp_im += I5.z * J2.w + I5.w * J2.z;
1408 tmp_re = I4.z * J3.x - I4.w * J3.y;
1409 tmp_re += I5.x * J3.z - I5.y * J3.w;
1410 tmp_re += I5.z * J4.x - I5.w * J4.y;
1413 tmp_im = I4.z * J3.y + I4.w * J3.x;
1414 tmp_im += I5.x * J3.w + I5.y * J3.z;
1415 tmp_im += I5.z * J4.y + I5.w * J4.x;
1419 tmp_re = I4.z * J4.z - I4.w * J4.w;
1420 tmp_re += I5.x * J5.x - I5.y * J5.y;
1421 tmp_re += I5.z * J5.z - I5.w * J5.w;
1424 tmp_im = I4.z * J4.w + I4.w * J4.z;
1425 tmp_im += I5.x * J5.y + I5.y * J5.x;
1426 tmp_im += I5.z * J5.w + I5.w * J5.z;
1458 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1463 volatile float2
tmp;
1464 extern __shared__
float sms[];
1466 volatile float *accum_re = sms + threadIdx.x;
1470 auxCoord1 = eutId /
param.dc.
X[0];
1471 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
1472 auxCoord2 = auxCoord1 /
param.dc.
X[1];
1473 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
1474 xCoord4 = auxCoord2 /
param.dc.
X[2];
1475 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
1477 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
1478 xCoord1 += auxCoord1;
1479 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
1503 tmp_re = I0.x * J0.x - I0.y * J0.y;
1504 tmp_re += I0.z * J0.z - I0.w * J0.w;
1505 tmp_re += I1.x * J1.x - I1.y * J1.y;
1508 tmp_im = I0.x * J0.y + I0.y * J0.x;
1509 tmp_im += I0.z * J0.w + I0.w * J0.z;
1510 tmp_im += I1.x * J1.y + I1.y * J1.x;
1514 tmp_re = I0.x * J1.z - I0.y * J1.w;
1515 tmp_re += I0.z * J2.x - I0.w * J2.y;
1516 tmp_re += I1.x * J2.z - I1.y * J2.w;
1519 tmp_im = I0.x * J1.w + I0.y * J1.z;
1520 tmp_im += I0.z * J2.y + I0.w * J2.x;
1521 tmp_im += I1.x * J2.w + I1.y * J2.z;
1525 tmp_re = I0.x * J3.x - I0.y * J3.y;
1526 tmp_re += I0.z * J3.z - I0.w * J3.w;
1527 tmp_re += I1.x * J4.x - I1.y * J4.y;
1530 tmp_im = I0.x * J3.y + I0.y * J3.x;
1531 tmp_im += I0.z * J3.w + I0.w * J3.z;
1532 tmp_im += I1.x * J4.y + I1.y * J4.x;
// Sum of three complex products, each float4 carrying two (re, im) pairs as
// (x, y) and (z, w):
//   (I0.x, I0.y) * (J4.z, J4.w)
// + (I0.z, I0.w) * (J5.x, J5.y)
// + (I1.x, I1.y) * (J5.z, J5.w)
// Real part is a.re*b.re - a.im*b.im, imaginary part is a.re*b.im + a.im*b.re.
// The middle term previously multiplied I0.w by the wrong half of the J5 pair
// (J5.x in the real part, J5.y in the imaginary part); it now pairs re/im the
// same way the sibling groups built from I3/I4 and the same J operands do.
1536 tmp_re = I0.x * J4.z - I0.y * J4.w;
1537 tmp_re += I0.z * J5.x - I0.w * J5.y;
1538 tmp_re += I1.x * J5.z - I1.y * J5.w;
1542 tmp_im = I0.x * J4.w + I0.y * J4.z;
1543 tmp_im += I0.z * J5.y + I0.w * J5.x;
1544 tmp_im += I1.x * J5.w + I1.y * J5.z;
1548 tmp_re = I1.z * J0.x - I1.w * J0.y;
1549 tmp_re += I2.x * J0.z - I2.y * J0.w;
1550 tmp_re += I2.z * J1.x - I2.w * J1.y;
1553 tmp_im = I1.z * J0.y + I1.w * J0.x;
1554 tmp_im += I2.x * J0.w + I2.y * J0.z;
1555 tmp_im += I2.z * J1.y + I2.w * J1.x;
1559 tmp_re = I1.z * J1.z - I1.w * J1.w;
1560 tmp_re += I2.x * J2.x - I2.y * J2.y;
1561 tmp_re += I2.z * J2.z - I2.w * J2.w;
1564 tmp_im = I1.z * J1.w + I1.w * J1.z;
1565 tmp_im += I2.x * J2.y + I2.y * J2.x;
1566 tmp_im += I2.z * J2.w + I2.w * J2.z;
1570 tmp_re = I1.z * J3.x - I1.w * J3.y;
1571 tmp_re += I2.x * J3.z - I2.y * J3.w;
1572 tmp_re += I2.z * J4.x - I2.w * J4.y;
1575 tmp_im = I1.z * J3.y + I1.w * J3.x;
1576 tmp_im += I2.x * J3.w + I2.y * J3.z;
1577 tmp_im += I2.z * J4.y + I2.w * J4.x;
1581 tmp_re = I1.z * J4.z - I1.w * J4.w;
1582 tmp_re += I2.x * J5.x - I2.y * J5.y;
1583 tmp_re += I2.z * J5.z - I2.w * J5.w;
1586 tmp_im = I1.z * J4.w + I1.w * J4.z;
1587 tmp_im += I2.x * J5.y + I2.y * J5.x;
1588 tmp_im += I2.z * J5.w + I2.w * J5.z;
1592 tmp_re = I3.x * J0.x - I3.y * J0.y;
1593 tmp_re += I3.z * J0.z - I3.w * J0.w;
1594 tmp_re += I4.x * J1.x - I4.y * J1.y;
1597 tmp_im = I3.x * J0.y + I3.y * J0.x;
1598 tmp_im += I3.z * J0.w + I3.w * J0.z;
1599 tmp_im += I4.x * J1.y + I4.y * J1.x;
1603 tmp_re = I3.x * J1.z - I3.y * J1.w;
1604 tmp_re += I3.z * J2.x - I3.w * J2.y;
1605 tmp_re += I4.x * J2.z - I4.y * J2.w;
1608 tmp_im = I3.x * J1.w + I3.y * J1.z;
1609 tmp_im += I3.z * J2.y + I3.w * J2.x;
1610 tmp_im += I4.x * J2.w + I4.y * J2.z;
1614 tmp_re = I3.x * J3.x - I3.y * J3.y;
1615 tmp_re += I3.z * J3.z - I3.w * J3.w;
1616 tmp_re += I4.x * J4.x - I4.y * J4.y;
1619 tmp_im = I3.x * J3.y + I3.y * J3.x;
1620 tmp_im += I3.z * J3.w + I3.w * J3.z;
1621 tmp_im += I4.x * J4.y + I4.y * J4.x;
1625 tmp_re = I3.x * J4.z - I3.y * J4.w;
1626 tmp_re += I3.z * J5.x - I3.w * J5.y;
1627 tmp_re += I4.x * J5.z - I4.y * J5.w;
1630 tmp_im = I3.x * J4.w + I3.y * J4.z;
1631 tmp_im += I3.z * J5.y + I3.w * J5.x;
1632 tmp_im += I4.x * J5.w + I4.y * J5.z;
1636 tmp_re = I4.z * J0.x - I4.w * J0.y;
1637 tmp_re += I5.x * J0.z - I5.y * J0.w;
1638 tmp_re += I5.z * J1.x - I5.w * J1.y;
1641 tmp_im = I4.z * J0.y + I4.w * J0.x;
1642 tmp_im += I5.x * J0.w + I5.y * J0.z;
1643 tmp_im += I5.z * J1.y + I5.w * J1.x;
1647 tmp_re = I4.z * J1.z - I4.w * J1.w;
1648 tmp_re += I5.x * J2.x - I5.y * J2.y;
1649 tmp_re += I5.z * J2.z - I5.w * J2.w;
1652 tmp_im = I4.z * J1.w + I4.w * J1.z;
1653 tmp_im += I5.x * J2.y + I5.y * J2.x;
1654 tmp_im += I5.z * J2.w + I5.w * J2.z;
1658 tmp_re = I4.z * J3.x - I4.w * J3.y;
1659 tmp_re += I5.x * J3.z - I5.y * J3.w;
1660 tmp_re += I5.z * J4.x - I5.w * J4.y;
1663 tmp_im = I4.z * J3.y + I4.w * J3.x;
1664 tmp_im += I5.x * J3.w + I5.y * J3.z;
1665 tmp_im += I5.z * J4.y + I5.w * J4.x;
1669 tmp_re = I4.z * J4.z - I4.w * J4.w;
1670 tmp_re += I5.x * J5.x - I5.y * J5.y;
1671 tmp_re += I5.z * J5.z - I5.w * J5.w;
1674 tmp_im = I4.z * J4.w + I4.w * J4.z;
1675 tmp_im += I5.x * J5.y + I5.y * J5.x;
1676 tmp_im += I5.z * J5.w + I5.w * J5.z;
1710 #undef READ_INTERMEDIATE_SPINOR 1714 #endif //_TWIST_QUDA_CONTRACT __global__ void contractKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
cudaColorSpinorField * tmp
#define READ_INTERMEDIATE_SPINOR
__global__ void contractTsliceKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
cpuColorSpinorField * out
__global__ void contractGamma5Kernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)