1 #ifndef _TWIST_QUDA_CONTRACT_PLUS 2 #define _TWIST_QUDA_CONTRACT_PLUS 7 #define TOTAL_COMPONENTS 16 9 #define READ_INTERMEDIATE_SPINOR_DOUBLE(spinor, stride, sp_idx, norm_idx) \ 10 double2 J0 = spinor[sp_idx + 0*(stride)]; \ 11 double2 J1 = spinor[sp_idx + 1*(stride)]; \ 12 double2 J2 = spinor[sp_idx + 2*(stride)]; \ 13 double2 J3 = spinor[sp_idx + 3*(stride)]; \ 14 double2 J4 = spinor[sp_idx + 4*(stride)]; \ 15 double2 J5 = spinor[sp_idx + 5*(stride)]; \ 16 double2 J6 = spinor[sp_idx + 6*(stride)]; \ 17 double2 J7 = spinor[sp_idx + 7*(stride)]; \ 18 double2 J8 = spinor[sp_idx + 8*(stride)]; \ 19 double2 J9 = spinor[sp_idx + 9*(stride)]; \ 20 double2 J10 = spinor[sp_idx +10*(stride)]; \ 21 double2 J11 = spinor[sp_idx +11*(stride)]; 23 #define READ_INTERMEDIATE_SPINOR_DOUBLE_TEX(spinor, stride, sp_idx, norm_idx) \ 24 double2 J0 = fetch_double2((spinor), sp_idx + 0*(stride)); \ 25 double2 J1 = fetch_double2((spinor), sp_idx + 1*(stride)); \ 26 double2 J2 = fetch_double2((spinor), sp_idx + 2*(stride)); \ 27 double2 J3 = fetch_double2((spinor), sp_idx + 3*(stride)); \ 28 double2 J4 = fetch_double2((spinor), sp_idx + 4*(stride)); \ 29 double2 J5 = fetch_double2((spinor), sp_idx + 5*(stride)); \ 30 double2 J6 = fetch_double2((spinor), sp_idx + 6*(stride)); \ 31 double2 J7 = fetch_double2((spinor), sp_idx + 7*(stride)); \ 32 double2 J8 = fetch_double2((spinor), sp_idx + 8*(stride)); \ 33 double2 J9 = fetch_double2((spinor), sp_idx + 9*(stride)); \ 34 double2 J10 = fetch_double2((spinor), sp_idx +10*(stride)); \ 35 double2 J11 = fetch_double2((spinor), sp_idx +11*(stride)); 37 #ifdef DIRECT_ACCESS_WILSON_SPINOR 38 #define READ_SPINOR READ_SPINOR_DOUBLE 39 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE 43 #define READ_SPINOR READ_SPINOR_DOUBLE_TEX 44 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_DOUBLE_TEX 46 #ifdef USE_TEXTURE_OBJECTS 47 #define SPINORTEX param.inTex 48 #define INTERTEX param.outTex 50 #define SPINORTEX spinorTexDouble 51 
#define INTERTEX interTexDouble 52 #endif // USE_TEXTURE_OBJECTS 62 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
68 extern __shared__
double sm[];
70 volatile double *accum_re = sm + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], then
// rebuild a linear index outId from them. eutId is assigned outside the
// lines visible here; the locals are declared int earlier in this file.
// Quotient of eutId by X[0]; carried into the next division step.
74 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
75 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
76 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
77 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
78 xCoord4 = auxCoord2 /
param.dc.
X[2];
79 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
81 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// NOTE(review): the other kernel fragments in this file add auxCoord1 to
// xCoord1 before forming outId; the corresponding statement is not visible
// here (the embedded numbering jumps from 81 to 83) -- confirm it is present
// in the real file.
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
83 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
134 tmp_re = I0.x * J0.x - I0.y * J0.y;
135 tmp_re += I1.x * J1.x - I1.y * J1.y;
136 tmp_re += I2.x * J2.x - I2.y * J2.y;
139 tmp_im = I0.x * J0.y + I0.y * J0.x;
140 tmp_im += I1.x * J1.y + I1.y * J1.x;
141 tmp_im += I2.x * J2.y + I2.y * J2.x;
145 tmp_re = I0.x * J3.x - I0.y * J3.y;
146 tmp_re += I1.x * J4.x - I1.y * J4.y;
147 tmp_re += I2.x * J5.x - I2.y * J5.y;
150 tmp_im = I0.x * J3.y + I0.y * J3.x;
151 tmp_im += I1.x * J4.y + I1.y * J4.x;
152 tmp_im += I2.x * J5.y + I2.y * J5.x;
156 tmp_re = I0.x * J6.x - I0.y * J6.y;
157 tmp_re += I1.x * J7.x - I1.y * J7.y;
158 tmp_re += I2.x * J8.x - I2.y * J8.y;
161 tmp_im = I0.x * J6.y + I0.y * J6.x;
162 tmp_im += I1.x * J7.y + I1.y * J7.x;
163 tmp_im += I2.x * J8.y + I2.y * J8.x;
167 tmp_re = I0.x * J9.x - I0.y * J9.y;
168 tmp_re += I1.x * J10.x - I1.y * J10.y;
169 tmp_re += I2.x * J11.x - I2.y * J11.y;
172 tmp_im = I0.x * J9.y + I0.y * J9.x;
173 tmp_im += I1.x * J10.y + I1.y * J10.x;
174 tmp_im += I2.x * J11.y + I2.y * J11.x;
178 tmp_re = I3.x * J0.x - I3.y * J0.y;
179 tmp_re += I4.x * J1.x - I4.y * J1.y;
180 tmp_re += I5.x * J2.x - I5.y * J2.y;
183 tmp_im = I3.x * J0.y + I3.y * J0.x;
184 tmp_im += I4.x * J1.y + I4.y * J1.x;
185 tmp_im += I5.x * J2.y + I5.y * J2.x;
189 tmp_re = I3.x * J3.x - I3.y * J3.y;
190 tmp_re += I4.x * J4.x - I4.y * J4.y;
191 tmp_re += I5.x * J5.x - I5.y * J5.y;
194 tmp_im = I3.x * J3.y + I3.y * J3.x;
195 tmp_im += I4.x * J4.y + I4.y * J4.x;
196 tmp_im += I5.x * J5.y + I5.y * J5.x;
200 tmp_re = I3.x * J6.x - I3.y * J6.y;
201 tmp_re += I4.x * J7.x - I4.y * J7.y;
202 tmp_re += I5.x * J8.x - I5.y * J8.y;
205 tmp_im = I3.x * J6.y + I3.y * J6.x;
206 tmp_im += I4.x * J7.y + I4.y * J7.x;
207 tmp_im += I5.x * J8.y + I5.y * J8.x;
211 tmp_re = I3.x * J9.x - I3.y * J9.y;
212 tmp_re += I4.x * J10.x - I4.y * J10.y;
213 tmp_re += I5.x * J11.x - I5.y * J11.y;
216 tmp_im = I3.x * J9.y + I3.y * J9.x;
217 tmp_im += I4.x * J10.y + I4.y * J10.x;
218 tmp_im += I5.x * J11.y + I5.y * J11.x;
222 tmp_re = I6.x * J0.x - I6.y * J0.y;
223 tmp_re += I7.x * J1.x - I7.y * J1.y;
224 tmp_re += I8.x * J2.x - I8.y * J2.y;
227 tmp_im = I6.x * J0.y + I6.y * J0.x;
228 tmp_im += I7.x * J1.y + I7.y * J1.x;
229 tmp_im += I8.x * J2.y + I8.y * J2.x;
233 tmp_re = I6.x * J3.x - I6.y * J3.y;
234 tmp_re += I7.x * J4.x - I7.y * J4.y;
235 tmp_re += I8.x * J5.x - I8.y * J5.y;
238 tmp_im = I6.x * J3.y + I6.y * J3.x;
239 tmp_im += I7.x * J4.y + I7.y * J4.x;
240 tmp_im += I8.x * J5.y + I8.y * J5.x;
244 tmp_re = I6.x * J6.x - I6.y * J6.y;
245 tmp_re += I7.x * J7.x - I7.y * J7.y;
246 tmp_re += I8.x * J8.x - I8.y * J8.y;
249 tmp_im = I6.x * J6.y + I6.y * J6.x;
250 tmp_im += I7.x * J7.y + I7.y * J7.x;
251 tmp_im += I8.x * J8.y + I8.y * J8.x;
255 tmp_re = I6.x * J9.x - I6.y * J9.y;
256 tmp_re += I7.x * J10.x - I7.y * J10.y;
257 tmp_re += I8.x * J11.x - I8.y * J11.y;
260 tmp_im = I6.x * J9.y + I6.y * J9.x;
261 tmp_im += I7.x * J10.y + I7.y * J10.x;
262 tmp_im += I8.x * J11.y + I8.y * J11.x;
266 tmp_re = I9.x * J0.x - I9.y * J0.y;
267 tmp_re += I10.x * J1.x - I10.y * J1.y;
268 tmp_re += I11.x * J2.x - I11.y * J2.y;
271 tmp_im = I9.x * J0.y + I9.y * J0.x;
272 tmp_im += I10.x * J1.y + I10.y * J1.x;
273 tmp_im += I11.x * J2.y + I11.y * J2.x;
277 tmp_re = I9.x * J3.x - I9.y * J3.y;
278 tmp_re += I10.x * J4.x - I10.y * J4.y;
279 tmp_re += I11.x * J5.x - I11.y * J5.y;
282 tmp_im = I9.x * J3.y + I9.y * J3.x;
283 tmp_im += I10.x * J4.y + I10.y * J4.x;
284 tmp_im += I11.x * J5.y + I11.y * J5.x;
288 tmp_re = I9.x * J6.x - I9.y * J6.y;
289 tmp_re += I10.x * J7.x - I10.y * J7.y;
290 tmp_re += I11.x * J8.x - I11.y * J8.y;
293 tmp_im = I9.x * J6.y + I9.y * J6.x;
294 tmp_im += I10.x * J7.y + I10.y * J7.x;
295 tmp_im += I11.x * J8.y + I11.y * J8.x;
299 tmp_re = I9.x * J9.x - I9.y * J9.y;
300 tmp_re += I10.x * J10.x - I10.y * J10.y;
301 tmp_re += I11.x * J11.x - I11.y * J11.y;
304 tmp_im = I9.x * J9.y + I9.y * J9.x;
305 tmp_im += I10.x * J10.y + I10.y * J10.x;
306 tmp_im += I11.x * J11.y + I11.y * J11.x;
359 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
364 volatile double2
tmp;
365 extern __shared__
double sm[];
367 volatile double *accum_re = sm + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], apply a
// parity-dependent offset to xCoord1, and rebuild a linear index outId.
// eutId is assigned outside the lines visible here.
// Quotient of eutId by X[0]; carried into the next division step.
373 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
374 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
375 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
376 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
377 xCoord4 = auxCoord2 /
param.dc.
X[2];
378 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
380 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// Shift xCoord1 by that bit (0 or 1).
// NOTE(review): presumably this selects the site of the requested parity
// within a checkerboarded pair -- confirm against the code that sets eutId.
381 xCoord1 += auxCoord1;
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
382 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
404 tmp_re = I0.x * J0.x - I0.y * J0.y;
405 tmp_re += I1.x * J1.x - I1.y * J1.y;
406 tmp_re += I2.x * J2.x - I2.y * J2.y;
409 tmp_im = I0.x * J0.y + I0.y * J0.x;
410 tmp_im += I1.x * J1.y + I1.y * J1.x;
411 tmp_im += I2.x * J2.y + I2.y * J2.x;
415 tmp_re = I0.x * J3.x - I0.y * J3.y;
416 tmp_re += I1.x * J4.x - I1.y * J4.y;
417 tmp_re += I2.x * J5.x - I2.y * J5.y;
420 tmp_im = I0.x * J3.y + I0.y * J3.x;
421 tmp_im += I1.x * J4.y + I1.y * J4.x;
422 tmp_im += I2.x * J5.y + I2.y * J5.x;
426 tmp_re = I0.x * J6.x - I0.y * J6.y;
427 tmp_re += I1.x * J7.x - I1.y * J7.y;
428 tmp_re += I2.x * J8.x - I2.y * J8.y;
431 tmp_im = I0.x * J6.y + I0.y * J6.x;
432 tmp_im += I1.x * J7.y + I1.y * J7.x;
433 tmp_im += I2.x * J8.y + I2.y * J8.x;
437 tmp_re = I0.x * J9.x - I0.y * J9.y;
438 tmp_re += I1.x * J10.x - I1.y * J10.y;
439 tmp_re += I2.x * J11.x - I2.y * J11.y;
442 tmp_im = I0.x * J9.y + I0.y * J9.x;
443 tmp_im += I1.x * J10.y + I1.y * J10.x;
444 tmp_im += I2.x * J11.y + I2.y * J11.x;
448 tmp_re = I3.x * J0.x - I3.y * J0.y;
449 tmp_re += I4.x * J1.x - I4.y * J1.y;
450 tmp_re += I5.x * J2.x - I5.y * J2.y;
453 tmp_im = I3.x * J0.y + I3.y * J0.x;
454 tmp_im += I4.x * J1.y + I4.y * J1.x;
455 tmp_im += I5.x * J2.y + I5.y * J2.x;
459 tmp_re = I3.x * J3.x - I3.y * J3.y;
460 tmp_re += I4.x * J4.x - I4.y * J4.y;
461 tmp_re += I5.x * J5.x - I5.y * J5.y;
464 tmp_im = I3.x * J3.y + I3.y * J3.x;
465 tmp_im += I4.x * J4.y + I4.y * J4.x;
466 tmp_im += I5.x * J5.y + I5.y * J5.x;
470 tmp_re = I3.x * J6.x - I3.y * J6.y;
471 tmp_re += I4.x * J7.x - I4.y * J7.y;
472 tmp_re += I5.x * J8.x - I5.y * J8.y;
475 tmp_im = I3.x * J6.y + I3.y * J6.x;
476 tmp_im += I4.x * J7.y + I4.y * J7.x;
477 tmp_im += I5.x * J8.y + I5.y * J8.x;
481 tmp_re = I3.x * J9.x - I3.y * J9.y;
482 tmp_re += I4.x * J10.x - I4.y * J10.y;
483 tmp_re += I5.x * J11.x - I5.y * J11.y;
486 tmp_im = I3.x * J9.y + I3.y * J9.x;
487 tmp_im += I4.x * J10.y + I4.y * J10.x;
488 tmp_im += I5.x * J11.y + I5.y * J11.x;
492 tmp_re = I6.x * J0.x - I6.y * J0.y;
493 tmp_re += I7.x * J1.x - I7.y * J1.y;
494 tmp_re += I8.x * J2.x - I8.y * J2.y;
497 tmp_im = I6.x * J0.y + I6.y * J0.x;
498 tmp_im += I7.x * J1.y + I7.y * J1.x;
499 tmp_im += I8.x * J2.y + I8.y * J2.x;
503 tmp_re = I6.x * J3.x - I6.y * J3.y;
504 tmp_re += I7.x * J4.x - I7.y * J4.y;
505 tmp_re += I8.x * J5.x - I8.y * J5.y;
508 tmp_im = I6.x * J3.y + I6.y * J3.x;
509 tmp_im += I7.x * J4.y + I7.y * J4.x;
510 tmp_im += I8.x * J5.y + I8.y * J5.x;
514 tmp_re = I6.x * J6.x - I6.y * J6.y;
515 tmp_re += I7.x * J7.x - I7.y * J7.y;
516 tmp_re += I8.x * J8.x - I8.y * J8.y;
519 tmp_im = I6.x * J6.y + I6.y * J6.x;
520 tmp_im += I7.x * J7.y + I7.y * J7.x;
521 tmp_im += I8.x * J8.y + I8.y * J8.x;
525 tmp_re = I6.x * J9.x - I6.y * J9.y;
526 tmp_re += I7.x * J10.x - I7.y * J10.y;
527 tmp_re += I8.x * J11.x - I8.y * J11.y;
530 tmp_im = I6.x * J9.y + I6.y * J9.x;
531 tmp_im += I7.x * J10.y + I7.y * J10.x;
532 tmp_im += I8.x * J11.y + I8.y * J11.x;
536 tmp_re = I9.x * J0.x - I9.y * J0.y;
537 tmp_re += I10.x * J1.x - I10.y * J1.y;
538 tmp_re += I11.x * J2.x - I11.y * J2.y;
541 tmp_im = I9.x * J0.y + I9.y * J0.x;
542 tmp_im += I10.x * J1.y + I10.y * J1.x;
543 tmp_im += I11.x * J2.y + I11.y * J2.x;
547 tmp_re = I9.x * J3.x - I9.y * J3.y;
548 tmp_re += I10.x * J4.x - I10.y * J4.y;
549 tmp_re += I11.x * J5.x - I11.y * J5.y;
552 tmp_im = I9.x * J3.y + I9.y * J3.x;
553 tmp_im += I10.x * J4.y + I10.y * J4.x;
554 tmp_im += I11.x * J5.y + I11.y * J5.x;
558 tmp_re = I9.x * J6.x - I9.y * J6.y;
559 tmp_re += I10.x * J7.x - I10.y * J7.y;
560 tmp_re += I11.x * J8.x - I11.y * J8.y;
563 tmp_im = I9.x * J6.y + I9.y * J6.x;
564 tmp_im += I10.x * J7.y + I10.y * J7.x;
565 tmp_im += I11.x * J8.y + I11.y * J8.x;
569 tmp_re = I9.x * J9.x - I9.y * J9.y;
570 tmp_re += I10.x * J10.x - I10.y * J10.y;
571 tmp_re += I11.x * J11.x - I11.y * J11.y;
574 tmp_im = I9.x * J9.y + I9.y * J9.x;
575 tmp_im += I10.x * J10.y + I10.y * J10.x;
576 tmp_im += I11.x * J11.y + I11.y * J11.x;
625 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
630 volatile double2
tmp;
631 extern __shared__
double sm[];
633 volatile double *accum_re = sm + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], apply a
// parity-dependent offset to xCoord1, and rebuild a linear index outId.
// eutId is assigned outside the lines visible here.
// Quotient of eutId by X[0]; carried into the next division step.
637 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
638 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
639 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
640 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
641 xCoord4 = auxCoord2 /
param.dc.
X[2];
642 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
644 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// Shift xCoord1 by that bit (0 or 1).
// NOTE(review): presumably this selects the site of the requested parity
// within a checkerboarded pair -- confirm against the code that sets eutId.
645 xCoord1 += auxCoord1;
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
646 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
668 tmp_re = I0.x * J0.x - I0.y * J0.y;
669 tmp_re += I1.x * J1.x - I1.y * J1.y;
670 tmp_re += I2.x * J2.x - I2.y * J2.y;
673 tmp_im = I0.x * J0.y + I0.y * J0.x;
674 tmp_im += I1.x * J1.y + I1.y * J1.x;
675 tmp_im += I2.x * J2.y + I2.y * J2.x;
679 tmp_re = I0.x * J3.x - I0.y * J3.y;
680 tmp_re += I1.x * J4.x - I1.y * J4.y;
681 tmp_re += I2.x * J5.x - I2.y * J5.y;
684 tmp_im = I0.x * J3.y + I0.y * J3.x;
685 tmp_im += I1.x * J4.y + I1.y * J4.x;
686 tmp_im += I2.x * J5.y + I2.y * J5.x;
690 tmp_re = I0.x * J6.x - I0.y * J6.y;
691 tmp_re += I1.x * J7.x - I1.y * J7.y;
692 tmp_re += I2.x * J8.x - I2.y * J8.y;
695 tmp_im = I0.x * J6.y + I0.y * J6.x;
696 tmp_im += I1.x * J7.y + I1.y * J7.x;
697 tmp_im += I2.x * J8.y + I2.y * J8.x;
701 tmp_re = I0.x * J9.x - I0.y * J9.y;
702 tmp_re += I1.x * J10.x - I1.y * J10.y;
703 tmp_re += I2.x * J11.x - I2.y * J11.y;
706 tmp_im = I0.x * J9.y + I0.y * J9.x;
707 tmp_im += I1.x * J10.y + I1.y * J10.x;
708 tmp_im += I2.x * J11.y + I2.y * J11.x;
712 tmp_re = I3.x * J0.x - I3.y * J0.y;
713 tmp_re += I4.x * J1.x - I4.y * J1.y;
714 tmp_re += I5.x * J2.x - I5.y * J2.y;
717 tmp_im = I3.x * J0.y + I3.y * J0.x;
718 tmp_im += I4.x * J1.y + I4.y * J1.x;
719 tmp_im += I5.x * J2.y + I5.y * J2.x;
723 tmp_re = I3.x * J3.x - I3.y * J3.y;
724 tmp_re += I4.x * J4.x - I4.y * J4.y;
725 tmp_re += I5.x * J5.x - I5.y * J5.y;
728 tmp_im = I3.x * J3.y + I3.y * J3.x;
729 tmp_im += I4.x * J4.y + I4.y * J4.x;
730 tmp_im += I5.x * J5.y + I5.y * J5.x;
734 tmp_re = I3.x * J6.x - I3.y * J6.y;
735 tmp_re += I4.x * J7.x - I4.y * J7.y;
736 tmp_re += I5.x * J8.x - I5.y * J8.y;
739 tmp_im = I3.x * J6.y + I3.y * J6.x;
740 tmp_im += I4.x * J7.y + I4.y * J7.x;
741 tmp_im += I5.x * J8.y + I5.y * J8.x;
745 tmp_re = I3.x * J9.x - I3.y * J9.y;
746 tmp_re += I4.x * J10.x - I4.y * J10.y;
747 tmp_re += I5.x * J11.x - I5.y * J11.y;
750 tmp_im = I3.x * J9.y + I3.y * J9.x;
751 tmp_im += I4.x * J10.y + I4.y * J10.x;
752 tmp_im += I5.x * J11.y + I5.y * J11.x;
756 tmp_re = I6.x * J0.x - I6.y * J0.y;
757 tmp_re += I7.x * J1.x - I7.y * J1.y;
758 tmp_re += I8.x * J2.x - I8.y * J2.y;
761 tmp_im = I6.x * J0.y + I6.y * J0.x;
762 tmp_im += I7.x * J1.y + I7.y * J1.x;
763 tmp_im += I8.x * J2.y + I8.y * J2.x;
767 tmp_re = I6.x * J3.x - I6.y * J3.y;
768 tmp_re += I7.x * J4.x - I7.y * J4.y;
769 tmp_re += I8.x * J5.x - I8.y * J5.y;
772 tmp_im = I6.x * J3.y + I6.y * J3.x;
773 tmp_im += I7.x * J4.y + I7.y * J4.x;
774 tmp_im += I8.x * J5.y + I8.y * J5.x;
778 tmp_re = I6.x * J6.x - I6.y * J6.y;
779 tmp_re += I7.x * J7.x - I7.y * J7.y;
780 tmp_re += I8.x * J8.x - I8.y * J8.y;
783 tmp_im = I6.x * J6.y + I6.y * J6.x;
784 tmp_im += I7.x * J7.y + I7.y * J7.x;
785 tmp_im += I8.x * J8.y + I8.y * J8.x;
789 tmp_re = I6.x * J9.x - I6.y * J9.y;
790 tmp_re += I7.x * J10.x - I7.y * J10.y;
791 tmp_re += I8.x * J11.x - I8.y * J11.y;
794 tmp_im = I6.x * J9.y + I6.y * J9.x;
795 tmp_im += I7.x * J10.y + I7.y * J10.x;
796 tmp_im += I8.x * J11.y + I8.y * J11.x;
800 tmp_re = I9.x * J0.x - I9.y * J0.y;
801 tmp_re += I10.x * J1.x - I10.y * J1.y;
802 tmp_re += I11.x * J2.x - I11.y * J2.y;
805 tmp_im = I9.x * J0.y + I9.y * J0.x;
806 tmp_im += I10.x * J1.y + I10.y * J1.x;
807 tmp_im += I11.x * J2.y + I11.y * J2.x;
811 tmp_re = I9.x * J3.x - I9.y * J3.y;
812 tmp_re += I10.x * J4.x - I10.y * J4.y;
813 tmp_re += I11.x * J5.x - I11.y * J5.y;
816 tmp_im = I9.x * J3.y + I9.y * J3.x;
817 tmp_im += I10.x * J4.y + I10.y * J4.x;
818 tmp_im += I11.x * J5.y + I11.y * J5.x;
822 tmp_re = I9.x * J6.x - I9.y * J6.y;
823 tmp_re += I10.x * J7.x - I10.y * J7.y;
824 tmp_re += I11.x * J8.x - I11.y * J8.y;
827 tmp_im = I9.x * J6.y + I9.y * J6.x;
828 tmp_im += I10.x * J7.y + I10.y * J7.x;
829 tmp_im += I11.x * J8.y + I11.y * J8.x;
833 tmp_re = I9.x * J9.x - I9.y * J9.y;
834 tmp_re += I10.x * J10.x - I10.y * J10.y;
835 tmp_re += I11.x * J11.x - I11.y * J11.y;
838 tmp_im = I9.x * J9.y + I9.y * J9.x;
839 tmp_im += I10.x * J10.y + I10.y * J10.x;
840 tmp_im += I11.x * J11.y + I11.y * J11.x;
886 #undef READ_INTERMEDIATE_SPINOR 891 #define READ_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \ 892 float4 I0 = spinor[sp_idx + 0*(stride)]; \ 893 float4 I1 = spinor[sp_idx + 1*(stride)]; \ 894 float4 I2 = spinor[sp_idx + 2*(stride)]; \ 895 float4 I3 = spinor[sp_idx + 3*(stride)]; \ 896 float4 I4 = spinor[sp_idx + 4*(stride)]; \ 897 float4 I5 = spinor[sp_idx + 5*(stride)]; 900 #define READ_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \ 901 float4 I0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \ 902 float4 I1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \ 903 float4 I2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \ 904 float4 I3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \ 905 float4 I4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \ 906 float4 I5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); 908 #define READ_INTERMEDIATE_SPINOR_SINGLE(spinor, stride, sp_idx, norm_idx) \ 909 float4 J0 = spinor[sp_idx + 0*(stride)]; \ 910 float4 J1 = spinor[sp_idx + 1*(stride)]; \ 911 float4 J2 = spinor[sp_idx + 2*(stride)]; \ 912 float4 J3 = spinor[sp_idx + 3*(stride)]; \ 913 float4 J4 = spinor[sp_idx + 4*(stride)]; \ 914 float4 J5 = spinor[sp_idx + 5*(stride)]; 916 #define READ_INTERMEDIATE_SPINOR_SINGLE_TEX(spinor, stride, sp_idx, norm_idx) \ 917 float4 J0 = TEX1DFETCH(float4, (spinor), sp_idx + 0*(stride)); \ 918 float4 J1 = TEX1DFETCH(float4, (spinor), sp_idx + 1*(stride)); \ 919 float4 J2 = TEX1DFETCH(float4, (spinor), sp_idx + 2*(stride)); \ 920 float4 J3 = TEX1DFETCH(float4, (spinor), sp_idx + 3*(stride)); \ 921 float4 J4 = TEX1DFETCH(float4, (spinor), sp_idx + 4*(stride)); \ 922 float4 J5 = TEX1DFETCH(float4, (spinor), sp_idx + 5*(stride)); 925 #ifdef DIRECT_ACCESS_WILSON_SPINOR 926 #define READ_SPINOR READ_SPINOR_SINGLE 929 #define READ_SPINOR READ_SPINOR_SINGLE_TEX 931 #ifdef USE_TEXTURE_OBJECTS 932 #define SPINORTEX param.inTex 934 #define SPINORTEX spinorTexSingle 935 #endif // USE_TEXTURE_OBJECTS 938 #ifdef 
DIRECT_ACCESS_WILSON_INTER 939 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE 942 #define READ_INTERMEDIATE_SPINOR READ_INTERMEDIATE_SPINOR_SINGLE_TEX 944 #ifdef USE_TEXTURE_OBJECTS 945 #define INTERTEX param.outTex 947 #define INTERTEX interTexSingle 948 #endif // USE_TEXTURE_OBJECTS 957 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
963 extern __shared__
float sms[];
965 volatile float *accum_re = sms + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], apply a
// parity-dependent offset to xCoord1, and rebuild a linear index outId.
// eutId is assigned outside the lines visible here.
// Quotient of eutId by X[0]; carried into the next division step.
969 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
970 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
971 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
972 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
973 xCoord4 = auxCoord2 /
param.dc.
X[2];
974 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
976 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// Shift xCoord1 by that bit (0 or 1).
// NOTE(review): presumably this selects the site of the requested parity
// within a checkerboarded pair -- confirm against the code that sets eutId.
977 xCoord1 += auxCoord1;
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
978 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
1038 tmp_re = I0.x * J0.x - I0.y * J0.y;
1039 tmp_re += I0.z * J0.z - I0.w * J0.w;
1040 tmp_re += I1.x * J1.x - I1.y * J1.y;
1043 tmp_im = I0.x * J0.y + I0.y * J0.x;
1044 tmp_im += I0.z * J0.w + I0.w * J0.z;
1045 tmp_im += I1.x * J1.y + I1.y * J1.x;
1049 tmp_re = I0.x * J1.z - I0.y * J1.w;
1050 tmp_re += I0.z * J2.x - I0.w * J2.y;
1051 tmp_re += I1.x * J2.z - I1.y * J2.w;
1054 tmp_im = I0.x * J1.w + I0.y * J1.z;
1055 tmp_im += I0.z * J2.y + I0.w * J2.x;
1056 tmp_im += I1.x * J2.w + I1.y * J2.z;
1060 tmp_re = I0.x * J3.x - I0.y * J3.y;
1061 tmp_re += I0.z * J3.z - I0.w * J3.w;
1062 tmp_re += I1.x * J4.x - I1.y * J4.y;
1065 tmp_im = I0.x * J3.y + I0.y * J3.x;
1066 tmp_im += I0.z * J3.w + I0.w * J3.z;
1067 tmp_im += I1.x * J4.y + I1.y * J4.x;
// One contraction term: the sum of three complex products between
// (I0.xy, I0.zw, I1.xy) and (J4.zw, J5.xy, J5.zw). The code treats each
// (x,y) and (z,w) pair of a float4 as the (real, imaginary) parts of one
// complex number, and every product follows
//   re = a.re*b.re - a.im*b.im,   im = a.re*b.im + a.im*b.re.
// NOTE(review): presumably these pairs are the colour components of one spin
// component of each spinor -- confirm against the READ_SPINOR macros.
// Real part.
1071 tmp_re = I0.x * J4.z - I0.y * J4.w;
// Fix: the subtracted product must pair the imaginary parts I0.w and J5.y;
// the original multiplied I0.w by J5.x (the real part) by mistake.
1072 tmp_re += I0.z * J5.x - I0.w * J5.y;
1073 tmp_re += I1.x * J5.z - I1.y * J5.w;
// Imaginary part.
1077 tmp_im = I0.x * J4.w + I0.y * J4.z;
// Fix: the second product must be I0.w (imaginary) times J5.x (real);
// the original used J5.y in both products.
1078 tmp_im += I0.z * J5.y + I0.w * J5.x;
1079 tmp_im += I1.x * J5.w + I1.y * J5.z;
1083 tmp_re = I1.z * J0.x - I1.w * J0.y;
1084 tmp_re += I2.x * J0.z - I2.y * J0.w;
1085 tmp_re += I2.z * J1.x - I2.w * J1.y;
1088 tmp_im = I1.z * J0.y + I1.w * J0.x;
1089 tmp_im += I2.x * J0.w + I2.y * J0.z;
1090 tmp_im += I2.z * J1.y + I2.w * J1.x;
1094 tmp_re = I1.z * J1.z - I1.w * J1.w;
1095 tmp_re += I2.x * J2.x - I2.y * J2.y;
1096 tmp_re += I2.z * J2.z - I2.w * J2.w;
1099 tmp_im = I1.z * J1.w + I1.w * J1.z;
1100 tmp_im += I2.x * J2.y + I2.y * J2.x;
1101 tmp_im += I2.z * J2.w + I2.w * J2.z;
1105 tmp_re = I1.z * J3.x - I1.w * J3.y;
1106 tmp_re += I2.x * J3.z - I2.y * J3.w;
1107 tmp_re += I2.z * J4.x - I2.w * J4.y;
1110 tmp_im = I1.z * J3.y + I1.w * J3.x;
1111 tmp_im += I2.x * J3.w + I2.y * J3.z;
1112 tmp_im += I2.z * J4.y + I2.w * J4.x;
1116 tmp_re = I1.z * J4.z - I1.w * J4.w;
1117 tmp_re += I2.x * J5.x - I2.y * J5.y;
1118 tmp_re += I2.z * J5.z - I2.w * J5.w;
1121 tmp_im = I1.z * J4.w + I1.w * J4.z;
1122 tmp_im += I2.x * J5.y + I2.y * J5.x;
1123 tmp_im += I2.z * J5.w + I2.w * J5.z;
1127 tmp_re = I3.x * J0.x - I3.y * J0.y;
1128 tmp_re += I3.z * J0.z - I3.w * J0.w;
1129 tmp_re += I4.x * J1.x - I4.y * J1.y;
1132 tmp_im = I3.x * J0.y + I3.y * J0.x;
1133 tmp_im += I3.z * J0.w + I3.w * J0.z;
1134 tmp_im += I4.x * J1.y + I4.y * J1.x;
1138 tmp_re = I3.x * J1.z - I3.y * J1.w;
1139 tmp_re += I3.z * J2.x - I3.w * J2.y;
1140 tmp_re += I4.x * J2.z - I4.y * J2.w;
1143 tmp_im = I3.x * J1.w + I3.y * J1.z;
1144 tmp_im += I3.z * J2.y + I3.w * J2.x;
1145 tmp_im += I4.x * J2.w + I4.y * J2.z;
1149 tmp_re = I3.x * J3.x - I3.y * J3.y;
1150 tmp_re += I3.z * J3.z - I3.w * J3.w;
1151 tmp_re += I4.x * J4.x - I4.y * J4.y;
1154 tmp_im = I3.x * J3.y + I3.y * J3.x;
1155 tmp_im += I3.z * J3.w + I3.w * J3.z;
1156 tmp_im += I4.x * J4.y + I4.y * J4.x;
1160 tmp_re = I3.x * J4.z - I3.y * J4.w;
1161 tmp_re += I3.z * J5.x - I3.w * J5.y;
1162 tmp_re += I4.x * J5.z - I4.y * J5.w;
1165 tmp_im = I3.x * J4.w + I3.y * J4.z;
1166 tmp_im += I3.z * J5.y + I3.w * J5.x;
1167 tmp_im += I4.x * J5.w + I4.y * J5.z;
1171 tmp_re = I4.z * J0.x - I4.w * J0.y;
1172 tmp_re += I5.x * J0.z - I5.y * J0.w;
1173 tmp_re += I5.z * J1.x - I5.w * J1.y;
1176 tmp_im = I4.z * J0.y + I4.w * J0.x;
1177 tmp_im += I5.x * J0.w + I5.y * J0.z;
1178 tmp_im += I5.z * J1.y + I5.w * J1.x;
1182 tmp_re = I4.z * J1.z - I4.w * J1.w;
1183 tmp_re += I5.x * J2.x - I5.y * J2.y;
1184 tmp_re += I5.z * J2.z - I5.w * J2.w;
1187 tmp_im = I4.z * J1.w + I4.w * J1.z;
1188 tmp_im += I5.x * J2.y + I5.y * J2.x;
1189 tmp_im += I5.z * J2.w + I5.w * J2.z;
1193 tmp_re = I4.z * J3.x - I4.w * J3.y;
1194 tmp_re += I5.x * J3.z - I5.y * J3.w;
1195 tmp_re += I5.z * J4.x - I5.w * J4.y;
1198 tmp_im = I4.z * J3.y + I4.w * J3.x;
1199 tmp_im += I5.x * J3.w + I5.y * J3.z;
1200 tmp_im += I5.z * J4.y + I5.w * J4.x;
1204 tmp_re = I4.z * J4.z - I4.w * J4.w;
1205 tmp_re += I5.x * J5.x - I5.y * J5.y;
1206 tmp_re += I5.z * J5.z - I5.w * J5.w;
1209 tmp_im = I4.z * J4.w + I4.w * J4.z;
1210 tmp_im += I5.x * J5.y + I5.y * J5.x;
1211 tmp_im += I5.z * J5.w + I5.w * J5.z;
1266 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1271 volatile float2
tmp;
1272 extern __shared__
float sms[];
1274 volatile float *accum_re = sms + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], apply a
// parity-dependent offset to xCoord1, and rebuild a linear index outId.
// eutId is assigned outside the lines visible here.
// Quotient of eutId by X[0]; carried into the next division step.
1280 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
1281 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
1282 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
1283 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
1284 xCoord4 = auxCoord2 /
param.dc.
X[2];
1285 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
1287 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// Shift xCoord1 by that bit (0 or 1).
// NOTE(review): presumably this selects the site of the requested parity
// within a checkerboarded pair -- confirm against the code that sets eutId.
1288 xCoord1 += auxCoord1;
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
1289 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
1313 tmp_re = I0.x * J0.x - I0.y * J0.y;
1314 tmp_re += I0.z * J0.z - I0.w * J0.w;
1315 tmp_re += I1.x * J1.x - I1.y * J1.y;
1318 tmp_im = I0.x * J0.y + I0.y * J0.x;
1319 tmp_im += I0.z * J0.w + I0.w * J0.z;
1320 tmp_im += I1.x * J1.y + I1.y * J1.x;
1324 tmp_re = I0.x * J1.z - I0.y * J1.w;
1325 tmp_re += I0.z * J2.x - I0.w * J2.y;
1326 tmp_re += I1.x * J2.z - I1.y * J2.w;
1329 tmp_im = I0.x * J1.w + I0.y * J1.z;
1330 tmp_im += I0.z * J2.y + I0.w * J2.x;
1331 tmp_im += I1.x * J2.w + I1.y * J2.z;
1335 tmp_re = I0.x * J3.x - I0.y * J3.y;
1336 tmp_re += I0.z * J3.z - I0.w * J3.w;
1337 tmp_re += I1.x * J4.x - I1.y * J4.y;
1340 tmp_im = I0.x * J3.y + I0.y * J3.x;
1341 tmp_im += I0.z * J3.w + I0.w * J3.z;
1342 tmp_im += I1.x * J4.y + I1.y * J4.x;
// One contraction term: the sum of three complex products between
// (I0.xy, I0.zw, I1.xy) and (J4.zw, J5.xy, J5.zw). The code treats each
// (x,y) and (z,w) pair of a float4 as the (real, imaginary) parts of one
// complex number, and every product follows
//   re = a.re*b.re - a.im*b.im,   im = a.re*b.im + a.im*b.re.
// NOTE(review): presumably these pairs are the colour components of one spin
// component of each spinor -- confirm against the READ_SPINOR macros.
// Real part.
1346 tmp_re = I0.x * J4.z - I0.y * J4.w;
// Fix: the subtracted product must pair the imaginary parts I0.w and J5.y;
// the original multiplied I0.w by J5.x (the real part) by mistake.
1347 tmp_re += I0.z * J5.x - I0.w * J5.y;
1348 tmp_re += I1.x * J5.z - I1.y * J5.w;
// Imaginary part.
1352 tmp_im = I0.x * J4.w + I0.y * J4.z;
// Fix: the second product must be I0.w (imaginary) times J5.x (real);
// the original used J5.y in both products.
1353 tmp_im += I0.z * J5.y + I0.w * J5.x;
1354 tmp_im += I1.x * J5.w + I1.y * J5.z;
1358 tmp_re = I1.z * J0.x - I1.w * J0.y;
1359 tmp_re += I2.x * J0.z - I2.y * J0.w;
1360 tmp_re += I2.z * J1.x - I2.w * J1.y;
1363 tmp_im = I1.z * J0.y + I1.w * J0.x;
1364 tmp_im += I2.x * J0.w + I2.y * J0.z;
1365 tmp_im += I2.z * J1.y + I2.w * J1.x;
1369 tmp_re = I1.z * J1.z - I1.w * J1.w;
1370 tmp_re += I2.x * J2.x - I2.y * J2.y;
1371 tmp_re += I2.z * J2.z - I2.w * J2.w;
1374 tmp_im = I1.z * J1.w + I1.w * J1.z;
1375 tmp_im += I2.x * J2.y + I2.y * J2.x;
1376 tmp_im += I2.z * J2.w + I2.w * J2.z;
1380 tmp_re = I1.z * J3.x - I1.w * J3.y;
1381 tmp_re += I2.x * J3.z - I2.y * J3.w;
1382 tmp_re += I2.z * J4.x - I2.w * J4.y;
1385 tmp_im = I1.z * J3.y + I1.w * J3.x;
1386 tmp_im += I2.x * J3.w + I2.y * J3.z;
1387 tmp_im += I2.z * J4.y + I2.w * J4.x;
1391 tmp_re = I1.z * J4.z - I1.w * J4.w;
1392 tmp_re += I2.x * J5.x - I2.y * J5.y;
1393 tmp_re += I2.z * J5.z - I2.w * J5.w;
1396 tmp_im = I1.z * J4.w + I1.w * J4.z;
1397 tmp_im += I2.x * J5.y + I2.y * J5.x;
1398 tmp_im += I2.z * J5.w + I2.w * J5.z;
1402 tmp_re = I3.x * J0.x - I3.y * J0.y;
1403 tmp_re += I3.z * J0.z - I3.w * J0.w;
1404 tmp_re += I4.x * J1.x - I4.y * J1.y;
1407 tmp_im = I3.x * J0.y + I3.y * J0.x;
1408 tmp_im += I3.z * J0.w + I3.w * J0.z;
1409 tmp_im += I4.x * J1.y + I4.y * J1.x;
1413 tmp_re = I3.x * J1.z - I3.y * J1.w;
1414 tmp_re += I3.z * J2.x - I3.w * J2.y;
1415 tmp_re += I4.x * J2.z - I4.y * J2.w;
1418 tmp_im = I3.x * J1.w + I3.y * J1.z;
1419 tmp_im += I3.z * J2.y + I3.w * J2.x;
1420 tmp_im += I4.x * J2.w + I4.y * J2.z;
1424 tmp_re = I3.x * J3.x - I3.y * J3.y;
1425 tmp_re += I3.z * J3.z - I3.w * J3.w;
1426 tmp_re += I4.x * J4.x - I4.y * J4.y;
1429 tmp_im = I3.x * J3.y + I3.y * J3.x;
1430 tmp_im += I3.z * J3.w + I3.w * J3.z;
1431 tmp_im += I4.x * J4.y + I4.y * J4.x;
1435 tmp_re = I3.x * J4.z - I3.y * J4.w;
1436 tmp_re += I3.z * J5.x - I3.w * J5.y;
1437 tmp_re += I4.x * J5.z - I4.y * J5.w;
1440 tmp_im = I3.x * J4.w + I3.y * J4.z;
1441 tmp_im += I3.z * J5.y + I3.w * J5.x;
1442 tmp_im += I4.x * J5.w + I4.y * J5.z;
1446 tmp_re = I4.z * J0.x - I4.w * J0.y;
1447 tmp_re += I5.x * J0.z - I5.y * J0.w;
1448 tmp_re += I5.z * J1.x - I5.w * J1.y;
1451 tmp_im = I4.z * J0.y + I4.w * J0.x;
1452 tmp_im += I5.x * J0.w + I5.y * J0.z;
1453 tmp_im += I5.z * J1.y + I5.w * J1.x;
1457 tmp_re = I4.z * J1.z - I4.w * J1.w;
1458 tmp_re += I5.x * J2.x - I5.y * J2.y;
1459 tmp_re += I5.z * J2.z - I5.w * J2.w;
1462 tmp_im = I4.z * J1.w + I4.w * J1.z;
1463 tmp_im += I5.x * J2.y + I5.y * J2.x;
1464 tmp_im += I5.z * J2.w + I5.w * J2.z;
1468 tmp_re = I4.z * J3.x - I4.w * J3.y;
1469 tmp_re += I5.x * J3.z - I5.y * J3.w;
1470 tmp_re += I5.z * J4.x - I5.w * J4.y;
1473 tmp_im = I4.z * J3.y + I4.w * J3.x;
1474 tmp_im += I5.x * J3.w + I5.y * J3.z;
1475 tmp_im += I5.z * J4.y + I5.w * J4.x;
1479 tmp_re = I4.z * J4.z - I4.w * J4.w;
1480 tmp_re += I5.x * J5.x - I5.y * J5.y;
1481 tmp_re += I5.z * J5.z - I5.w * J5.w;
1484 tmp_im = I4.z * J4.w + I4.w * J4.z;
1485 tmp_im += I5.x * J5.y + I5.y * J5.x;
1486 tmp_im += I5.z * J5.w + I5.w * J5.z;
1535 int eutId, xCoord1, xCoord2, xCoord3, xCoord4, auxCoord1, auxCoord2;
1540 volatile float2
tmp;
1541 extern __shared__
float sms[];
1543 volatile float *accum_re = sms + threadIdx.x;
// Decode the linear index eutId into four coordinates (xCoord1..xCoord4) by
// successive division by the extents param.dc.X[0], X[1] and X[2], apply a
// parity-dependent offset to xCoord1, and rebuild a linear index outId.
// eutId is assigned outside the lines visible here.
// Quotient of eutId by X[0]; carried into the next division step.
1547 auxCoord1 = eutId /
param.dc.
X[0];
// Remainder of eutId with respect to X[0].
1548 xCoord1 = eutId - auxCoord1 *
param.dc.
X[0];
// Quotient of the previous quotient by X[1].
1549 auxCoord2 = auxCoord1 /
param.dc.
X[1];
// Remainder with respect to X[1].
1550 xCoord2 = auxCoord1 - auxCoord2 *
param.dc.
X[1];
// The last division by X[2] yields xCoord4 (quotient) and xCoord3 (remainder).
1551 xCoord4 = auxCoord2 /
param.dc.
X[2];
1552 xCoord3 = auxCoord2 - xCoord4 *
param.dc.
X[2];
// auxCoord1 is reused: it now holds the low bit of
// Parity + xCoord4 + xCoord3 + xCoord2.
1554 auxCoord1 = (Parity + xCoord4 + xCoord3 + xCoord2) & 1;
// Shift xCoord1 by that bit (0 or 1).
// NOTE(review): presumably this selects the site of the requested parity
// within a checkerboarded pair -- confirm against the code that sets eutId.
1555 xCoord1 += auxCoord1;
// Linear index with xCoord1 varying fastest:
// outId = xCoord1 + X[0]*(xCoord2 + X[1]*(xCoord3 + X[2]*xCoord4)).
1556 outId = xCoord1 +
param.dc.
X[0]*(xCoord2 +
param.dc.
X[1]*(xCoord3 +
param.dc.
X[2]*xCoord4));
1580 tmp_re = I0.x * J0.x - I0.y * J0.y;
1581 tmp_re += I0.z * J0.z - I0.w * J0.w;
1582 tmp_re += I1.x * J1.x - I1.y * J1.y;
1585 tmp_im = I0.x * J0.y + I0.y * J0.x;
1586 tmp_im += I0.z * J0.w + I0.w * J0.z;
1587 tmp_im += I1.x * J1.y + I1.y * J1.x;
1591 tmp_re = I0.x * J1.z - I0.y * J1.w;
1592 tmp_re += I0.z * J2.x - I0.w * J2.y;
1593 tmp_re += I1.x * J2.z - I1.y * J2.w;
1596 tmp_im = I0.x * J1.w + I0.y * J1.z;
1597 tmp_im += I0.z * J2.y + I0.w * J2.x;
1598 tmp_im += I1.x * J2.w + I1.y * J2.z;
1602 tmp_re = I0.x * J3.x - I0.y * J3.y;
1603 tmp_re += I0.z * J3.z - I0.w * J3.w;
1604 tmp_re += I1.x * J4.x - I1.y * J4.y;
1607 tmp_im = I0.x * J3.y + I0.y * J3.x;
1608 tmp_im += I0.z * J3.w + I0.w * J3.z;
1609 tmp_im += I1.x * J4.y + I1.y * J4.x;
// One contraction term: the sum of three complex products between
// (I0.xy, I0.zw, I1.xy) and (J4.zw, J5.xy, J5.zw). The code treats each
// (x,y) and (z,w) pair of a float4 as the (real, imaginary) parts of one
// complex number, and every product follows
//   re = a.re*b.re - a.im*b.im,   im = a.re*b.im + a.im*b.re.
// NOTE(review): presumably these pairs are the colour components of one spin
// component of each spinor -- confirm against the READ_SPINOR macros.
// Real part.
1613 tmp_re = I0.x * J4.z - I0.y * J4.w;
// Fix: the subtracted product must pair the imaginary parts I0.w and J5.y;
// the original multiplied I0.w by J5.x (the real part) by mistake.
1614 tmp_re += I0.z * J5.x - I0.w * J5.y;
1615 tmp_re += I1.x * J5.z - I1.y * J5.w;
// Imaginary part.
1619 tmp_im = I0.x * J4.w + I0.y * J4.z;
// Fix: the second product must be I0.w (imaginary) times J5.x (real);
// the original used J5.y in both products.
1620 tmp_im += I0.z * J5.y + I0.w * J5.x;
1621 tmp_im += I1.x * J5.w + I1.y * J5.z;
1625 tmp_re = I1.z * J0.x - I1.w * J0.y;
1626 tmp_re += I2.x * J0.z - I2.y * J0.w;
1627 tmp_re += I2.z * J1.x - I2.w * J1.y;
1630 tmp_im = I1.z * J0.y + I1.w * J0.x;
1631 tmp_im += I2.x * J0.w + I2.y * J0.z;
1632 tmp_im += I2.z * J1.y + I2.w * J1.x;
1636 tmp_re = I1.z * J1.z - I1.w * J1.w;
1637 tmp_re += I2.x * J2.x - I2.y * J2.y;
1638 tmp_re += I2.z * J2.z - I2.w * J2.w;
1641 tmp_im = I1.z * J1.w + I1.w * J1.z;
1642 tmp_im += I2.x * J2.y + I2.y * J2.x;
1643 tmp_im += I2.z * J2.w + I2.w * J2.z;
1647 tmp_re = I1.z * J3.x - I1.w * J3.y;
1648 tmp_re += I2.x * J3.z - I2.y * J3.w;
1649 tmp_re += I2.z * J4.x - I2.w * J4.y;
1652 tmp_im = I1.z * J3.y + I1.w * J3.x;
1653 tmp_im += I2.x * J3.w + I2.y * J3.z;
1654 tmp_im += I2.z * J4.y + I2.w * J4.x;
1658 tmp_re = I1.z * J4.z - I1.w * J4.w;
1659 tmp_re += I2.x * J5.x - I2.y * J5.y;
1660 tmp_re += I2.z * J5.z - I2.w * J5.w;
1663 tmp_im = I1.z * J4.w + I1.w * J4.z;
1664 tmp_im += I2.x * J5.y + I2.y * J5.x;
1665 tmp_im += I2.z * J5.w + I2.w * J5.z;
1669 tmp_re = I3.x * J0.x - I3.y * J0.y;
1670 tmp_re += I3.z * J0.z - I3.w * J0.w;
1671 tmp_re += I4.x * J1.x - I4.y * J1.y;
1674 tmp_im = I3.x * J0.y + I3.y * J0.x;
1675 tmp_im += I3.z * J0.w + I3.w * J0.z;
1676 tmp_im += I4.x * J1.y + I4.y * J1.x;
1680 tmp_re = I3.x * J1.z - I3.y * J1.w;
1681 tmp_re += I3.z * J2.x - I3.w * J2.y;
1682 tmp_re += I4.x * J2.z - I4.y * J2.w;
1685 tmp_im = I3.x * J1.w + I3.y * J1.z;
1686 tmp_im += I3.z * J2.y + I3.w * J2.x;
1687 tmp_im += I4.x * J2.w + I4.y * J2.z;
1691 tmp_re = I3.x * J3.x - I3.y * J3.y;
1692 tmp_re += I3.z * J3.z - I3.w * J3.w;
1693 tmp_re += I4.x * J4.x - I4.y * J4.y;
1696 tmp_im = I3.x * J3.y + I3.y * J3.x;
1697 tmp_im += I3.z * J3.w + I3.w * J3.z;
1698 tmp_im += I4.x * J4.y + I4.y * J4.x;
1702 tmp_re = I3.x * J4.z - I3.y * J4.w;
1703 tmp_re += I3.z * J5.x - I3.w * J5.y;
1704 tmp_re += I4.x * J5.z - I4.y * J5.w;
1707 tmp_im = I3.x * J4.w + I3.y * J4.z;
1708 tmp_im += I3.z * J5.y + I3.w * J5.x;
1709 tmp_im += I4.x * J5.w + I4.y * J5.z;
1713 tmp_re = I4.z * J0.x - I4.w * J0.y;
1714 tmp_re += I5.x * J0.z - I5.y * J0.w;
1715 tmp_re += I5.z * J1.x - I5.w * J1.y;
1718 tmp_im = I4.z * J0.y + I4.w * J0.x;
1719 tmp_im += I5.x * J0.w + I5.y * J0.z;
1720 tmp_im += I5.z * J1.y + I5.w * J1.x;
1724 tmp_re = I4.z * J1.z - I4.w * J1.w;
1725 tmp_re += I5.x * J2.x - I5.y * J2.y;
1726 tmp_re += I5.z * J2.z - I5.w * J2.w;
1729 tmp_im = I4.z * J1.w + I4.w * J1.z;
1730 tmp_im += I5.x * J2.y + I5.y * J2.x;
1731 tmp_im += I5.z * J2.w + I5.w * J2.z;
1735 tmp_re = I4.z * J3.x - I4.w * J3.y;
1736 tmp_re += I5.x * J3.z - I5.y * J3.w;
1737 tmp_re += I5.z * J4.x - I5.w * J4.y;
1740 tmp_im = I4.z * J3.y + I4.w * J3.x;
1741 tmp_im += I5.x * J3.w + I5.y * J3.z;
1742 tmp_im += I5.z * J4.y + I5.w * J4.x;
1746 tmp_re = I4.z * J4.z - I4.w * J4.w;
1747 tmp_re += I5.x * J5.x - I5.y * J5.y;
1748 tmp_re += I5.z * J5.z - I5.w * J5.w;
1751 tmp_im = I4.z * J4.w + I4.w * J4.z;
1752 tmp_im += I5.x * J5.y + I5.y * J5.x;
1753 tmp_im += I5.z * J5.w + I5.w * J5.z;
1804 #undef READ_INTERMEDIATE_SPINOR 1808 #endif //_TWIST_QUDA_CONTRACT_PLUS
__global__ void contractTslicePlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
cudaColorSpinorField * tmp
#define READ_INTERMEDIATE_SPINOR
__global__ void contractGamma5PlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
cpuColorSpinorField * out
__global__ void contractPlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)