__device__ float4 operator*(const float &x, const float4 &y)

__device__ double2 operator*(const double &x, const double2 &y)
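// The bodies of these scalar-times-vector overloads are elided in this listing.
// A minimal sketch of the usual component-wise definition (an assumption, not
// necessarily the verbatim implementation):
__device__ float4 operator*(const float &x, const float4 &y)
{
  // scale each of the four packed floats
  return make_float4(x * y.x, x * y.y, x * y.z, x * y.w);
}

__device__ double2 operator*(const double &x, const double2 &y)
{
  // scale the real and imaginary parts of the packed complex double
  return make_double2(x * y.x, x * y.y);
}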
#define tmp0_re tmp0.x
#define tmp0_im tmp0.y
#define tmp1_re tmp1.x
#define tmp1_im tmp1.y
#define tmp2_re tmp2.x
#define tmp2_im tmp2.y
#define tmp3_re tmp3.x
#define tmp3_im tmp3.y
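// tmpN_re / tmpN_im alias the real and imaginary parts of the four volatile
// double2 temporaries declared in the double-precision kernel below.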
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexDouble
#endif
#if (__COMPUTE_CAPABILITY__ >= 130)

#ifdef GPU_TWISTED_MASS_DIRAC
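// Degenerate twisted-mass twist, double precision (the kernel signature is
// elided in this listing). Per the arithmetic below, each upper spin component
// is combined with its lower partner as tmp = upper + i*a*lower (and
// lower + i*a*upper), i.e. the twist term (1 + i*a*gamma5) in this chiral basis.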
int sid = blockIdx.x*blockDim.x + threadIdx.x;
if (sid >= param.threads) return;

#ifndef FERMI_NO_DBLE_TEX
double2 I0 = in[sid + 0 * param.sp_stride];
double2 I1 = in[sid + 1 * param.sp_stride];
double2 I2 = in[sid + 2 * param.sp_stride];
double2 I3 = in[sid + 3 * param.sp_stride];
double2 I4 = in[sid + 4 * param.sp_stride];
double2 I5 = in[sid + 5 * param.sp_stride];
double2 I6 = in[sid + 6 * param.sp_stride];
double2 I7 = in[sid + 7 * param.sp_stride];
double2 I8 = in[sid + 8 * param.sp_stride];
double2 I9 = in[sid + 9 * param.sp_stride];
double2 I10 = in[sid + 10 * param.sp_stride];
double2 I11 = in[sid + 11 * param.sp_stride];
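// One site is twelve double2 values (4 spin x 3 colour complex components),
// with component n living at offset n * param.sp_stride from the site index.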
volatile double2 tmp0, tmp1, tmp2, tmp3;
tmp0_re = I1.x - a * I7.y;
tmp0_im = I1.y + a * I7.x;

tmp2_re = I7.x - a * I1.y;
tmp2_im = I7.y + a * I1.x;

tmp1_re = I4.x - a * I10.y;
tmp1_im = I4.y + a * I10.x;

tmp3_re = I10.x - a * I4.y;
tmp3_im = I10.y + a * I4.x;

tmp0_re = I2.x - a * I8.y;
tmp0_im = I2.y + a * I8.x;

tmp2_re = I8.x - a * I2.y;
tmp2_im = I8.y + a * I2.x;

tmp1_re = I5.x - a * I11.y;
tmp1_im = I5.y + a * I11.x;

tmp3_re = I11.x - a * I5.y;
tmp3_im = I11.y + a * I5.x;
spinor[sid + 10 * param.sp_stride] = I10;
spinor[sid + 11 * param.sp_stride] = I11;
__global__ void twistGamma5Kernel(double2 *spinor, float *null, const double a, const double b, const double c,
                                  const double2 *in, const float *null2, DslashParam param)
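// Non-degenerate (two-flavour) twist, double precision. Per the arithmetic
// below: out1 = c * [ (1 - i*a*gamma5) psi1 + b * psi2 ] and
//        out2 = c * [ (1 + i*a*gamma5) psi2 + b * psi1 ],
// processed one upper/lower pair of spinor components at a time.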
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
int sid = blockIdx.x * blockDim.x + threadIdx.x;
if (sid >= param.threads) return;
double2 accum1_0, accum1_1;
double2 accum2_0, accum2_1;

int flv1_idx = sid;  // first-flavour index (elided in the listing; implied by the second-flavour offset)
int flv2_idx = sid + param.fl_stride;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 0 * param.sp_stride];
tmp1 = in[flv1_idx + 6 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 0 * param.sp_stride];
tmp1 = in[flv2_idx + 6 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 0 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 6 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 0 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 6 * param.sp_stride] = c * accum2_1;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 3 * param.sp_stride];
tmp1 = in[flv1_idx + 9 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 3 * param.sp_stride];
tmp1 = in[flv2_idx + 9 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 3 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 9 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 3 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 9 * param.sp_stride] = c * accum2_1;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 1 * param.sp_stride];
tmp1 = in[flv1_idx + 7 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 1 * param.sp_stride];
tmp1 = in[flv2_idx + 7 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 1 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 7 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 1 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 7 * param.sp_stride] = c * accum2_1;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 4 * param.sp_stride];
tmp1 = in[flv1_idx + 10 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 4 * param.sp_stride];
tmp1 = in[flv2_idx + 10 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 4 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 10 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 4 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 10 * param.sp_stride] = c * accum2_1;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 2 * param.sp_stride];
tmp1 = in[flv1_idx + 8 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 2 * param.sp_stride];
tmp1 = in[flv2_idx + 8 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 2 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 8 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 2 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 8 * param.sp_stride] = c * accum2_1;
#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv1_idx + 5 * param.sp_stride];
tmp1 = in[flv1_idx + 11 * param.sp_stride];

accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;

#ifndef FERMI_NO_DBLE_TEX

tmp0 = in[flv2_idx + 5 * param.sp_stride];
tmp1 = in[flv2_idx + 11 * param.sp_stride];

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;

spinor[flv1_idx + 5 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 11 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 5 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 11 * param.sp_stride] = c * accum2_1;
#endif // (__COMPUTE_CAPABILITY__ >= 130)
#define tmp0_re tmp0.x
#define tmp0_im tmp0.y
#define tmp1_re tmp0.z
#define tmp1_im tmp0.w
#define tmp2_re tmp1.x
#define tmp2_im tmp1.y
#define tmp3_re tmp1.z
#define tmp3_im tmp1.w
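// In single and half precision the temporaries are float4, so each float4
// packs two complex numbers and the re/im aliases are remapped accordingly.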
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#else
#define SPINORTEX spinorTexSingle
#endif
__global__ void twistGamma5Kernel(float4 *spinor, float *null, float a, float b,
                                  const float4 *in, const float *null2, DslashParam param)
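// Degenerate twist, single precision: six float4 values per site (I0..I5, two
// complex components each), the same (1 + i*a*gamma5) mixing as the double
// kernel; the loads and stores are mostly elided in this listing.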
#ifdef GPU_TWISTED_MASS_DIRAC
int sid = blockIdx.x*blockDim.x + threadIdx.x;
if (sid >= param.threads) return;
volatile float4 tmp0, tmp1;
tmp0_re = I0.z - a * I3.w;
tmp0_im = I0.w + a * I3.z;

tmp1_re = I2.x - a * I5.y;
tmp1_im = I2.y + a * I5.x;

tmp2_re = I3.z - a * I0.w;
tmp2_im = I3.w + a * I0.z;

tmp3_re = I5.x - a * I2.y;
tmp3_im = I5.y + a * I2.x;

tmp0_re = I1.x - a * I4.y;
tmp0_im = I1.y + a * I4.x;

tmp1_re = I2.z - a * I5.w;
tmp1_im = I2.w + a * I5.z;

tmp2_re = I4.x - a * I1.y;
tmp2_im = I4.y + a * I1.x;

tmp3_re = I5.z - a * I2.w;
tmp3_im = I5.w + a * I2.z;
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
int sid = blockIdx.x * blockDim.x + threadIdx.x;
if (sid >= param.threads) return;

float4 accum1_0, accum1_1;
float4 accum2_0, accum2_1;

int flv1_idx = sid;  // first-flavour index (elided in the listing; implied by the second-flavour offset)
int flv2_idx = sid + param.fl_stride;
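// Non-degenerate twist, single precision (kernel signature elided above). The
// same flavour-doublet combination as the double2 version, but each float4
// carries two complex components, so three load/compute/store blocks cover the
// full site.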
accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;
accum1_0.z = tmp0.z + a * tmp1.w;
accum1_0.w = tmp0.w - a * tmp1.z;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;
accum2_0.z = b * tmp0.z;
accum2_0.w = b * tmp0.w;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;
accum1_1.z = tmp1.z + a * tmp0.w;
accum1_1.w = tmp1.w - a * tmp0.z;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;
accum2_1.z = b * tmp1.z;
accum2_1.w = b * tmp1.w;

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;
accum2_0.z += tmp0.z - a * tmp1.w;
accum2_0.w += tmp0.w + a * tmp1.z;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;
accum1_0.z += b * tmp0.z;
accum1_0.w += b * tmp0.w;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;
accum2_1.z += tmp1.z - a * tmp0.w;
accum2_1.w += tmp1.w + a * tmp0.z;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;
accum1_1.z += b * tmp1.z;
accum1_1.w += b * tmp1.w;

spinor[flv1_idx + 0 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 3 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 0 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 3 * param.sp_stride] = c * accum2_1;
accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;
accum1_0.z = tmp0.z + a * tmp1.w;
accum1_0.w = tmp0.w - a * tmp1.z;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;
accum2_0.z = b * tmp0.z;
accum2_0.w = b * tmp0.w;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;
accum1_1.z = tmp1.z + a * tmp0.w;
accum1_1.w = tmp1.w - a * tmp0.z;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;
accum2_1.z = b * tmp1.z;
accum2_1.w = b * tmp1.w;

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;
accum2_0.z += tmp0.z - a * tmp1.w;
accum2_0.w += tmp0.w + a * tmp1.z;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;
accum1_0.z += b * tmp0.z;
accum1_0.w += b * tmp0.w;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;
accum2_1.z += tmp1.z - a * tmp0.w;
accum2_1.w += tmp1.w + a * tmp0.z;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;
accum1_1.z += b * tmp1.z;
accum1_1.w += b * tmp1.w;

spinor[flv1_idx + 1 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 4 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 1 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 4 * param.sp_stride] = c * accum2_1;
accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;
accum1_0.z = tmp0.z + a * tmp1.w;
accum1_0.w = tmp0.w - a * tmp1.z;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;
accum2_0.z = b * tmp0.z;
accum2_0.w = b * tmp0.w;

accum1_1.x = tmp1.x + a * tmp0.y;
accum1_1.y = tmp1.y - a * tmp0.x;
accum1_1.z = tmp1.z + a * tmp0.w;
accum1_1.w = tmp1.w - a * tmp0.z;

accum2_1.x = b * tmp1.x;
accum2_1.y = b * tmp1.y;
accum2_1.z = b * tmp1.z;
accum2_1.w = b * tmp1.w;

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;
accum2_0.z += tmp0.z - a * tmp1.w;
accum2_0.w += tmp0.w + a * tmp1.z;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;
accum1_0.z += b * tmp0.z;
accum1_0.w += b * tmp0.w;

accum2_1.x += tmp1.x - a * tmp0.y;
accum2_1.y += tmp1.y + a * tmp0.x;
accum2_1.z += tmp1.z - a * tmp0.w;
accum2_1.w += tmp1.w + a * tmp0.z;

accum1_1.x += b * tmp1.x;
accum1_1.y += b * tmp1.y;
accum1_1.z += b * tmp1.z;
accum1_1.w += b * tmp1.w;

spinor[flv1_idx + 2 * param.sp_stride] = c * accum1_0;
spinor[flv1_idx + 5 * param.sp_stride] = c * accum1_1;
spinor[flv2_idx + 2 * param.sp_stride] = c * accum2_0;
spinor[flv2_idx + 5 * param.sp_stride] = c * accum2_1;
#ifdef USE_TEXTURE_OBJECTS
#define SPINORTEX param.inTex
#define SPINORTEXNORM param.inTexNorm
#else
#define SPINORTEX spinorTexHalf
#define SPINORTEXNORM spinorTexHalfNorm
#endif
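// Half precision stores each spinor component as a short4 plus a per-site
// float norm, so these kernels also carry a norm texture/array.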
__global__ void twistGamma5Kernel(short4 *spinor, float *spinorNorm, float a, float b,
                                  const short4 *in, const float *inNorm, DslashParam param)
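// Degenerate twist, half precision: the short4 components are loaded and
// converted to float4 (elided here), twisted exactly as in the single-precision
// kernel, then renormalised: the per-site maximum is written to spinorNorm and
// the components are presumably rescaled in the elided lines (cf. the MAX_SHORT
// rescale in the non-degenerate kernel below) before the short4 stores.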
#ifdef GPU_TWISTED_MASS_DIRAC
int sid = blockIdx.x*blockDim.x + threadIdx.x;
if (sid >= param.threads) return;
volatile float4 tmp0, tmp1;
tmp0_re = I0.z - a * I3.w;
tmp0_im = I0.w + a * I3.z;

tmp1_re = I2.x - a * I5.y;
tmp1_im = I2.y + a * I5.x;

tmp2_re = I3.z - a * I0.w;
tmp2_im = I3.w + a * I0.z;

tmp3_re = I5.x - a * I2.y;
tmp3_im = I5.y + a * I2.x;

tmp0_re = I1.x - a * I4.y;
tmp0_im = I1.y + a * I4.x;

tmp1_re = I2.z - a * I5.w;
tmp1_im = I2.w + a * I5.z;

tmp2_re = I4.x - a * I1.y;
tmp2_im = I4.y + a * I1.x;

tmp3_re = I5.z - a * I2.w;
tmp3_im = I5.w + a * I2.z;
float c0 = fmaxf(fabsf(I0.x), fabsf(I0.y));
float c1 = fmaxf(fabsf(I0.z), fabsf(I0.w));
float c2 = fmaxf(fabsf(I1.x), fabsf(I1.y));
float c3 = fmaxf(fabsf(I1.z), fabsf(I1.w));
float c4 = fmaxf(fabsf(I2.x), fabsf(I2.y));
float c5 = fmaxf(fabsf(I2.z), fabsf(I2.w));
float c6 = fmaxf(fabsf(I3.x), fabsf(I3.y));
float c7 = fmaxf(fabsf(I3.z), fabsf(I3.w));
float c8 = fmaxf(fabsf(I4.x), fabsf(I4.y));
float c9 = fmaxf(fabsf(I4.z), fabsf(I4.w));
float c10 = fmaxf(fabsf(I5.x), fabsf(I5.y));
float c11 = fmaxf(fabsf(I5.z), fabsf(I5.w));
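// c0..c11 hold the per-component absolute maxima; the elided lines reduce them
// pairwise (only the c5 step survives below) so that c0 ends up as the per-site
// maximum written to spinorNorm.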
c5 = fmaxf(c10, c11);

spinorNorm[sid] = c0;
spinor[sid+0*(param.sp_stride)] = make_short4((short)I0.x, (short)I0.y, (short)I0.z, (short)I0.w);
spinor[sid+1*(param.sp_stride)] = make_short4((short)I1.x, (short)I1.y, (short)I1.z, (short)I1.w);
spinor[sid+2*(param.sp_stride)] = make_short4((short)I2.x, (short)I2.y, (short)I2.z, (short)I2.w);
spinor[sid+3*(param.sp_stride)] = make_short4((short)I3.x, (short)I3.y, (short)I3.z, (short)I3.w);
spinor[sid+4*(param.sp_stride)] = make_short4((short)I4.x, (short)I4.y, (short)I4.z, (short)I4.w);
spinor[sid+5*(param.sp_stride)] = make_short4((short)I5.x, (short)I5.y, (short)I5.z, (short)I5.w);
__global__ void twistGamma5Kernel(short4* spinor, float *spinorNorm, float a, float b, float c,
                                  const short4 *in, const float *inNorm, DslashParam param)
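// Non-degenerate twist, half precision: the same two-flavour combination as
// above, accumulated into six float4 per flavour, then renormalised per flavour
// (spinorNorm[flv1_idx], spinorNorm[flv2_idx]) and packed back into short4.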
#ifdef GPU_NDEG_TWISTED_MASS_DIRAC
int sid = blockIdx.x * blockDim.x + threadIdx.x;
if (sid >= param.threads) return;

int flv1_idx = sid;  // first-flavour index (elided in the listing; implied by the second-flavour offset)
int flv2_idx = sid + param.fl_stride;
float4 accum1_0, accum1_1, accum1_2, accum1_3, accum1_4, accum1_5;
float4 accum2_0, accum2_1, accum2_2, accum2_3, accum2_4, accum2_5;
accum1_0.x = tmp0.x + a * tmp1.y;
accum1_0.y = tmp0.y - a * tmp1.x;
accum1_0.z = tmp0.z + a * tmp1.w;
accum1_0.w = tmp0.w - a * tmp1.z;

accum2_0.x = b * tmp0.x;
accum2_0.y = b * tmp0.y;
accum2_0.z = b * tmp0.z;
accum2_0.w = b * tmp0.w;

accum1_3.x = tmp1.x + a * tmp0.y;
accum1_3.y = tmp1.y - a * tmp0.x;
accum1_3.z = tmp1.z + a * tmp0.w;
accum1_3.w = tmp1.w - a * tmp0.z;

accum2_3.x = b * tmp1.x;
accum2_3.y = b * tmp1.y;
accum2_3.z = b * tmp1.z;
accum2_3.w = b * tmp1.w;

accum2_0.x += tmp0.x - a * tmp1.y;
accum2_0.y += tmp0.y + a * tmp1.x;
accum2_0.z += tmp0.z - a * tmp1.w;
accum2_0.w += tmp0.w + a * tmp1.z;

accum1_0.x += b * tmp0.x;
accum1_0.y += b * tmp0.y;
accum1_0.z += b * tmp0.z;
accum1_0.w += b * tmp0.w;

accum2_3.x += tmp1.x - a * tmp0.y;
accum2_3.y += tmp1.y + a * tmp0.x;
accum2_3.z += tmp1.z - a * tmp0.w;
accum2_3.w += tmp1.w + a * tmp0.z;

accum1_3.x += b * tmp1.x;
accum1_3.y += b * tmp1.y;
accum1_3.z += b * tmp1.z;
accum1_3.w += b * tmp1.w;
float c1_0 = fmaxf(fabsf(accum1_0.x), fabsf(accum1_0.y));
float c1_1 = fmaxf(fabsf(accum1_0.z), fabsf(accum1_0.w));
float c1_6 = fmaxf(fabsf(accum1_3.x), fabsf(accum1_3.y));
float c1_7 = fmaxf(fabsf(accum1_3.z), fabsf(accum1_3.w));

float c2_0 = fmaxf(fabsf(accum2_0.x), fabsf(accum2_0.y));
float c2_1 = fmaxf(fabsf(accum2_0.z), fabsf(accum2_0.w));
float c2_6 = fmaxf(fabsf(accum2_3.x), fabsf(accum2_3.y));
float c2_7 = fmaxf(fabsf(accum2_3.z), fabsf(accum2_3.w));
accum1_1.x = tmp0.x + a * tmp1.y;
accum1_1.y = tmp0.y - a * tmp1.x;
accum1_1.z = tmp0.z + a * tmp1.w;
accum1_1.w = tmp0.w - a * tmp1.z;

accum2_1.x = b * tmp0.x;
accum2_1.y = b * tmp0.y;
accum2_1.z = b * tmp0.z;
accum2_1.w = b * tmp0.w;

accum1_4.x = tmp1.x + a * tmp0.y;
accum1_4.y = tmp1.y - a * tmp0.x;
accum1_4.z = tmp1.z + a * tmp0.w;
accum1_4.w = tmp1.w - a * tmp0.z;

accum2_4.x = b * tmp1.x;
accum2_4.y = b * tmp1.y;
accum2_4.z = b * tmp1.z;
accum2_4.w = b * tmp1.w;

accum2_1.x += tmp0.x - a * tmp1.y;
accum2_1.y += tmp0.y + a * tmp1.x;
accum2_1.z += tmp0.z - a * tmp1.w;
accum2_1.w += tmp0.w + a * tmp1.z;

accum1_1.x += b * tmp0.x;
accum1_1.y += b * tmp0.y;
accum1_1.z += b * tmp0.z;
accum1_1.w += b * tmp0.w;

accum2_4.x += tmp1.x - a * tmp0.y;
accum2_4.y += tmp1.y + a * tmp0.x;
accum2_4.z += tmp1.z - a * tmp0.w;
accum2_4.w += tmp1.w + a * tmp0.z;

accum1_4.x += b * tmp1.x;
accum1_4.y += b * tmp1.y;
accum1_4.z += b * tmp1.z;
accum1_4.w += b * tmp1.w;
float c1_2 = fmaxf(fabsf(accum1_1.x), fabsf(accum1_1.y));
float c1_3 = fmaxf(fabsf(accum1_1.z), fabsf(accum1_1.w));
float c1_8 = fmaxf(fabsf(accum1_4.x), fabsf(accum1_4.y));
float c1_9 = fmaxf(fabsf(accum1_4.z), fabsf(accum1_4.w));

float c2_2 = fmaxf(fabsf(accum2_1.x), fabsf(accum2_1.y));
float c2_3 = fmaxf(fabsf(accum2_1.z), fabsf(accum2_1.w));
float c2_8 = fmaxf(fabsf(accum2_4.x), fabsf(accum2_4.y));
float c2_9 = fmaxf(fabsf(accum2_4.z), fabsf(accum2_4.w));
accum1_2.x = tmp0.x + a * tmp1.y;
accum1_2.y = tmp0.y - a * tmp1.x;
accum1_2.z = tmp0.z + a * tmp1.w;
accum1_2.w = tmp0.w - a * tmp1.z;

accum2_2.x = b * tmp0.x;
accum2_2.y = b * tmp0.y;
accum2_2.z = b * tmp0.z;
accum2_2.w = b * tmp0.w;

accum1_5.x = tmp1.x + a * tmp0.y;
accum1_5.y = tmp1.y - a * tmp0.x;
accum1_5.z = tmp1.z + a * tmp0.w;
accum1_5.w = tmp1.w - a * tmp0.z;

accum2_5.x = b * tmp1.x;
accum2_5.y = b * tmp1.y;
accum2_5.z = b * tmp1.z;
accum2_5.w = b * tmp1.w;

accum2_2.x += tmp0.x - a * tmp1.y;
accum2_2.y += tmp0.y + a * tmp1.x;
accum2_2.z += tmp0.z - a * tmp1.w;
accum2_2.w += tmp0.w + a * tmp1.z;

accum1_2.x += b * tmp0.x;
accum1_2.y += b * tmp0.y;
accum1_2.z += b * tmp0.z;
accum1_2.w += b * tmp0.w;

accum2_5.x += tmp1.x - a * tmp0.y;
accum2_5.y += tmp1.y + a * tmp0.x;
accum2_5.z += tmp1.z - a * tmp0.w;
accum2_5.w += tmp1.w + a * tmp0.z;

accum1_5.x += b * tmp1.x;
accum1_5.y += b * tmp1.y;
accum1_5.z += b * tmp1.z;
accum1_5.w += b * tmp1.w;
float c1_4 = fmaxf(fabsf(accum1_2.x), fabsf(accum1_2.y));
float c1_5 = fmaxf(fabsf(accum1_2.z), fabsf(accum1_2.w));
float c1_10 = fmaxf(fabsf(accum1_5.x), fabsf(accum1_5.y));
float c1_11 = fmaxf(fabsf(accum1_5.z), fabsf(accum1_5.w));

float c2_4 = fmaxf(fabsf(accum2_2.x), fabsf(accum2_2.y));
float c2_5 = fmaxf(fabsf(accum2_2.z), fabsf(accum2_2.w));
float c2_10 = fmaxf(fabsf(accum2_5.x), fabsf(accum2_5.y));
float c2_11 = fmaxf(fabsf(accum2_5.z), fabsf(accum2_5.w));
c1_0 = fmaxf(c1_0, c1_1);
c1_1 = fmaxf(c1_2, c1_3);
c1_2 = fmaxf(c1_4, c1_5);
c1_3 = fmaxf(c1_6, c1_7);
c1_4 = fmaxf(c1_8, c1_9);
c1_5 = fmaxf(c1_10, c1_11);
c1_0 = fmaxf(c1_0, c1_1);
c1_1 = fmaxf(c1_2, c1_3);
c1_2 = fmaxf(c1_4, c1_5);
c1_0 = fmaxf(c1_0, c1_1);
c1_0 = fmaxf(c1_0, c1_2);

spinorNorm[flv1_idx] = c1_0;
float scale = __fdividef(MAX_SHORT, c1_0);
accum1_0 = scale * accum1_0;
accum1_1 = scale * accum1_1;
accum1_2 = scale * accum1_2;
accum1_3 = scale * accum1_3;
accum1_4 = scale * accum1_4;
accum1_5 = scale * accum1_5;
c2_0 = fmaxf(c2_0, c2_1);
c2_1 = fmaxf(c2_2, c2_3);
c2_2 = fmaxf(c2_4, c2_5);
c2_3 = fmaxf(c2_6, c2_7);
c2_4 = fmaxf(c2_8, c2_9);
c2_5 = fmaxf(c2_10, c2_11);
c2_0 = fmaxf(c2_0, c2_1);
c2_1 = fmaxf(c2_2, c2_3);
c2_2 = fmaxf(c2_4, c2_5);
c2_0 = fmaxf(c2_0, c2_1);
c2_0 = fmaxf(c2_0, c2_2);

spinorNorm[flv2_idx] = c2_0;
scale = __fdividef(MAX_SHORT, c2_0);  // elided in the listing; implied by the flavour-1 branch above
accum2_0 = scale * accum2_0;
accum2_1 = scale * accum2_1;
accum2_2 = scale * accum2_2;
accum2_3 = scale * accum2_3;
accum2_4 = scale * accum2_4;
accum2_5 = scale * accum2_5;
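// Both flavours are now rescaled to the short range; pack the twelve float4
// accumulators back into short4 spinor components.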
spinor[flv1_idx+0*(param.sp_stride)] = make_short4((short)accum1_0.x, (short)accum1_0.y, (short)accum1_0.z, (short)accum1_0.w);
spinor[flv1_idx+1*(param.sp_stride)] = make_short4((short)accum1_1.x, (short)accum1_1.y, (short)accum1_1.z, (short)accum1_1.w);
spinor[flv1_idx+2*(param.sp_stride)] = make_short4((short)accum1_2.x, (short)accum1_2.y, (short)accum1_2.z, (short)accum1_2.w);
spinor[flv1_idx+3*(param.sp_stride)] = make_short4((short)accum1_3.x, (short)accum1_3.y, (short)accum1_3.z, (short)accum1_3.w);
spinor[flv1_idx+4*(param.sp_stride)] = make_short4((short)accum1_4.x, (short)accum1_4.y, (short)accum1_4.z, (short)accum1_4.w);
spinor[flv1_idx+5*(param.sp_stride)] = make_short4((short)accum1_5.x, (short)accum1_5.y, (short)accum1_5.z, (short)accum1_5.w);

spinor[flv2_idx+0*(param.sp_stride)] = make_short4((short)accum2_0.x, (short)accum2_0.y, (short)accum2_0.z, (short)accum2_0.w);
spinor[flv2_idx+1*(param.sp_stride)] = make_short4((short)accum2_1.x, (short)accum2_1.y, (short)accum2_1.z, (short)accum2_1.w);
spinor[flv2_idx+2*(param.sp_stride)] = make_short4((short)accum2_2.x, (short)accum2_2.y, (short)accum2_2.z, (short)accum2_2.w);
spinor[flv2_idx+3*(param.sp_stride)] = make_short4((short)accum2_3.x, (short)accum2_3.y, (short)accum2_3.z, (short)accum2_3.w);
spinor[flv2_idx+4*(param.sp_stride)] = make_short4((short)accum2_4.x, (short)accum2_4.y, (short)accum2_4.z, (short)accum2_4.w);
spinor[flv2_idx+5*(param.sp_stride)] = make_short4((short)accum2_5.x, (short)accum2_5.y, (short)accum2_5.z, (short)accum2_5.w);
#undef SPINORTEXNORM