4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
10 #else // Open64 compiler
11 #define VOLATILE volatile
15 #define spinorFloat double
41 #define spinorFloat float
66 #endif // SPINOR_DOUBLE
109 #endif // GAUGE_DOUBLE
112 #define gT00_re (+g00_re)
113 #define gT00_im (-g00_im)
114 #define gT01_re (+g10_re)
115 #define gT01_im (-g10_im)
116 #define gT02_re (+g20_re)
117 #define gT02_im (-g20_im)
118 #define gT10_re (+g01_re)
119 #define gT10_im (-g01_im)
120 #define gT11_re (+g11_re)
121 #define gT11_im (-g11_im)
122 #define gT12_re (+g21_re)
123 #define gT12_im (-g21_im)
124 #define gT20_re (+g02_re)
125 #define gT20_im (-g02_im)
126 #define gT21_re (+g12_re)
127 #define gT21_im (-g12_im)
128 #define gT22_re (+g22_re)
129 #define gT22_im (-g22_im)
188 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
190 #endif // MULTI_GPU half precision
199 sid = blockIdx.x*blockDim.x + threadIdx.x;
235 sid = blockIdx.x*blockDim.x + threadIdx.x;
239 const int face_volume = (
param.threads >> 1);
240 const int face_num = (
sid >= face_volume);
247 #if (DD_PREC==2) // half precision
252 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
660 const int sp_idx = (
x1==0 ?
X+
X1m1 :
X-1) >> 1;
666 const int ga_idx =
sp_idx;
1019 const int ga_idx =
sid;
1374 const int ga_idx =
sp_idx;
1727 const int ga_idx =
sid;
2082 const int ga_idx =
sp_idx;
2435 const int ga_idx =
sid;
2899 const int ga_idx =
sp_idx;
3349 incomplete = incomplete || (
param.commDim[3] && (
x4==0 ||
x4==
X4m1));
3351 incomplete = incomplete || (
param.commDim[2] && (
x3==0 ||
x3==
X3m1));
3353 incomplete = incomplete || (
param.commDim[1] && (
x2==0 ||
x2==
X2m1));
3355 incomplete = incomplete || (
param.commDim[0] && (
x1==0 ||
x1==
X1m1));
3371 x1_re = 0.0, x1_im = 0.0;
3372 y1_re = 0.0, y1_im = 0.0;
3373 x2_re = 0.0, x2_im = 0.0;
3374 y2_re = 0.0, y2_im = 0.0;
3401 o1_00_re = x1_re; o1_00_im = x1_im;
3402 o1_20_re = y1_re; o1_20_im = y1_im;
3404 o2_00_re = x2_re; o2_00_im = x2_im;
3405 o2_20_re = y2_re; o2_20_im = y2_im;
3431 o1_10_re = x1_re; o1_10_im = x1_im;
3432 o1_30_re = y1_re; o1_30_im = y1_im;
3434 o2_10_re = x2_re; o2_10_im = x2_im;
3435 o2_30_re = y2_re; o2_30_im = y2_im;
3461 o1_01_re = x1_re; o1_01_im = x1_im;
3462 o1_21_re = y1_re; o1_21_im = y1_im;
3464 o2_01_re = x2_re; o2_01_im = x2_im;
3465 o2_21_re = y2_re; o2_21_im = y2_im;
3491 o1_11_re = x1_re; o1_11_im = x1_im;
3492 o1_31_re = y1_re; o1_31_im = y1_im;
3494 o2_11_re = x2_re; o2_11_im = x2_im;
3495 o2_31_re = y2_re; o2_31_im = y2_im;
3521 o1_02_re = x1_re; o1_02_im = x1_im;
3522 o1_22_re = y1_re; o1_22_im = y1_im;
3524 o2_02_re = x2_re; o2_02_im = x2_im;
3525 o2_22_re = y2_re; o2_22_im = y2_im;
3551 o1_12_re = x1_re; o1_12_im = x1_im;
3552 o1_32_re = y1_re; o1_32_im = y1_im;
3554 o2_12_re = x2_re; o2_12_im = x2_im;
3555 o2_32_re = y2_re; o2_32_im = y2_im;
3613 #ifdef SPINOR_DOUBLE
3615 #define acc_00_re accum0.x
3616 #define acc_00_im accum0.y
3617 #define acc_01_re accum1.x
3618 #define acc_01_im accum1.y
3619 #define acc_02_re accum2.x
3620 #define acc_02_im accum2.y
3621 #define acc_10_re accum3.x
3622 #define acc_10_im accum3.y
3623 #define acc_11_re accum4.x
3624 #define acc_11_im accum4.y
3625 #define acc_12_re accum5.x
3626 #define acc_12_im accum5.y
3627 #define acc_20_re accum6.x
3628 #define acc_20_im accum6.y
3629 #define acc_21_re accum7.x
3630 #define acc_21_im accum7.y
3631 #define acc_22_re accum8.x
3632 #define acc_22_im accum8.y
3633 #define acc_30_re accum9.x
3634 #define acc_30_im accum9.y
3635 #define acc_31_re accum10.x
3636 #define acc_31_im accum10.y
3637 #define acc_32_re accum11.x
3638 #define acc_32_im accum11.y
3641 #define acc_00_re accum0.x
3642 #define acc_00_im accum0.y
3643 #define acc_01_re accum0.z
3644 #define acc_01_im accum0.w
3645 #define acc_02_re accum1.x
3646 #define acc_02_im accum1.y
3647 #define acc_10_re accum1.z
3648 #define acc_10_im accum1.w
3649 #define acc_11_re accum2.x
3650 #define acc_11_im accum2.y
3651 #define acc_12_re accum2.z
3652 #define acc_12_im accum2.w
3653 #define acc_20_re accum3.x
3654 #define acc_20_im accum3.y
3655 #define acc_21_re accum3.z
3656 #define acc_21_im accum3.w
3657 #define acc_22_re accum4.x
3658 #define acc_22_im accum4.y
3659 #define acc_30_re accum4.z
3660 #define acc_30_im accum4.w
3661 #define acc_31_re accum5.x
3662 #define acc_31_im accum5.y
3663 #define acc_32_re accum5.z
3664 #define acc_32_im accum5.w
3666 #endif // SPINOR_DOUBLE
3669 READ_ACCUM(ACCUMTEX,
param.sp_stride)
3696 ASSN_ACCUM(ACCUMTEX,
param.sp_stride,
param.fl_stride)
3751 #ifdef SPINOR_DOUBLE
3753 #define acc1_00_re flv1_accum0.x
3754 #define acc1_00_im flv1_accum0.y
3755 #define acc1_01_re flv1_accum1.x
3756 #define acc1_01_im flv1_accum1.y
3757 #define acc1_02_re flv1_accum2.x
3758 #define acc1_02_im flv1_accum2.y
3759 #define acc1_10_re flv1_accum3.x
3760 #define acc1_10_im flv1_accum3.y
3761 #define acc1_11_re flv1_accum4.x
3762 #define acc1_11_im flv1_accum4.y
3763 #define acc1_12_re flv1_accum5.x
3764 #define acc1_12_im flv1_accum5.y
3765 #define acc1_20_re flv1_accum6.x
3766 #define acc1_20_im flv1_accum6.y
3767 #define acc1_21_re flv1_accum7.x
3768 #define acc1_21_im flv1_accum7.y
3769 #define acc1_22_re flv1_accum8.x
3770 #define acc1_22_im flv1_accum8.y
3771 #define acc1_30_re flv1_accum9.x
3772 #define acc1_30_im flv1_accum9.y
3773 #define acc1_31_re flv1_accum10.x
3774 #define acc1_31_im flv1_accum10.y
3775 #define acc1_32_re flv1_accum11.x
3776 #define acc1_32_im flv1_accum11.y
3778 #define acc2_00_re flv2_accum0.x
3779 #define acc2_00_im flv2_accum0.y
3780 #define acc2_01_re flv2_accum1.x
3781 #define acc2_01_im flv2_accum1.y
3782 #define acc2_02_re flv2_accum2.x
3783 #define acc2_02_im flv2_accum2.y
3784 #define acc2_10_re flv2_accum3.x
3785 #define acc2_10_im flv2_accum3.y
3786 #define acc2_11_re flv2_accum4.x
3787 #define acc2_11_im flv2_accum4.y
3788 #define acc2_12_re flv2_accum5.x
3789 #define acc2_12_im flv2_accum5.y
3790 #define acc2_20_re flv2_accum6.x
3791 #define acc2_20_im flv2_accum6.y
3792 #define acc2_21_re flv2_accum7.x
3793 #define acc2_21_im flv2_accum7.y
3794 #define acc2_22_re flv2_accum8.x
3795 #define acc2_22_im flv2_accum8.y
3796 #define acc2_30_re flv2_accum9.x
3797 #define acc2_30_im flv2_accum9.y
3798 #define acc2_31_re flv2_accum10.x
3799 #define acc2_31_im flv2_accum10.y
3800 #define acc2_32_re flv2_accum11.x
3801 #define acc2_32_im flv2_accum11.y
3805 #define acc1_00_re flv1_accum0.x
3806 #define acc1_00_im flv1_accum0.y
3807 #define acc1_01_re flv1_accum0.z
3808 #define acc1_01_im flv1_accum0.w
3809 #define acc1_02_re flv1_accum1.x
3810 #define acc1_02_im flv1_accum1.y
3811 #define acc1_10_re flv1_accum1.z
3812 #define acc1_10_im flv1_accum1.w
3813 #define acc1_11_re flv1_accum2.x
3814 #define acc1_11_im flv1_accum2.y
3815 #define acc1_12_re flv1_accum2.z
3816 #define acc1_12_im flv1_accum2.w
3817 #define acc1_20_re flv1_accum3.x
3818 #define acc1_20_im flv1_accum3.y
3819 #define acc1_21_re flv1_accum3.z
3820 #define acc1_21_im flv1_accum3.w
3821 #define acc1_22_re flv1_accum4.x
3822 #define acc1_22_im flv1_accum4.y
3823 #define acc1_30_re flv1_accum4.z
3824 #define acc1_30_im flv1_accum4.w
3825 #define acc1_31_re flv1_accum5.x
3826 #define acc1_31_im flv1_accum5.y
3827 #define acc1_32_re flv1_accum5.z
3828 #define acc1_32_im flv1_accum5.w
3830 #define acc2_00_re flv2_accum0.x
3831 #define acc2_00_im flv2_accum0.y
3832 #define acc2_01_re flv2_accum0.z
3833 #define acc2_01_im flv2_accum0.w
3834 #define acc2_02_re flv2_accum1.x
3835 #define acc2_02_im flv2_accum1.y
3836 #define acc2_10_re flv2_accum1.z
3837 #define acc2_10_im flv2_accum1.w
3838 #define acc2_11_re flv2_accum2.x
3839 #define acc2_11_im flv2_accum2.y
3840 #define acc2_12_re flv2_accum2.z
3841 #define acc2_12_im flv2_accum2.w
3842 #define acc2_20_re flv2_accum3.x
3843 #define acc2_20_im flv2_accum3.y
3844 #define acc2_21_re flv2_accum3.z
3845 #define acc2_21_im flv2_accum3.w
3846 #define acc2_22_re flv2_accum4.x
3847 #define acc2_22_im flv2_accum4.y
3848 #define acc2_30_re flv2_accum4.z
3849 #define acc2_30_im flv2_accum4.w
3850 #define acc2_31_re flv2_accum5.x
3851 #define acc2_31_im flv2_accum5.y
3852 #define acc2_32_re flv2_accum5.z
3853 #define acc2_32_im flv2_accum5.w
3855 #endif // SPINOR_DOUBLE
3858 READ_ACCUM_FLAVOR(ACCUMTEX,
param.sp_stride,
param.fl_stride)
3865 x1_re = 0.0, x1_im = 0.0;
3866 y1_re = 0.0, y1_im = 0.0;
3867 x2_re = 0.0, x2_im = 0.0;
3868 y2_re = 0.0, y2_im = 0.0;
3872 x1_re = acc1_00_re - a *acc1_20_im;
3873 x1_im = acc1_00_im + a *acc1_20_re;
3874 x2_re = b * acc1_00_re;
3875 x2_im = b * acc1_00_im;
3877 y1_re = acc1_20_re - a *acc1_00_im;
3878 y1_im = acc1_20_im + a *acc1_00_re;
3879 y2_re = b * acc1_20_re;
3880 y2_im = b * acc1_20_im;
3884 x2_re += acc2_00_re + a *acc2_20_im;
3885 x2_im += acc2_00_im - a *acc2_20_re;
3886 x1_re += b * acc2_00_re;
3887 x1_im += b * acc2_00_im;
3889 y2_re += acc2_20_re + a *acc2_00_im;
3890 y2_im += acc2_20_im - a *acc2_00_re;
3891 y1_re += b * acc2_20_re;
3892 y1_im += b * acc2_20_im;
3895 acc1_00_re = x1_re; acc1_00_im = x1_im;
3896 acc1_20_re = y1_re; acc1_20_im = y1_im;
3898 acc2_00_re = x2_re; acc2_00_im = x2_im;
3899 acc2_20_re = y2_re; acc2_20_im = y2_im;
3902 x1_re = acc1_10_re - a *acc1_30_im;
3903 x1_im = acc1_10_im + a *acc1_30_re;
3904 x2_re = b * acc1_10_re;
3905 x2_im = b * acc1_10_im;
3907 y1_re = acc1_30_re - a *acc1_10_im;
3908 y1_im = acc1_30_im + a *acc1_10_re;
3909 y2_re = b * acc1_30_re;
3910 y2_im = b * acc1_30_im;
3914 x2_re += acc2_10_re + a *acc2_30_im;
3915 x2_im += acc2_10_im - a *acc2_30_re;
3916 x1_re += b * acc2_10_re;
3917 x1_im += b * acc2_10_im;
3919 y2_re += acc2_30_re + a *acc2_10_im;
3920 y2_im += acc2_30_im - a *acc2_10_re;
3921 y1_re += b * acc2_30_re;
3922 y1_im += b * acc2_30_im;
3925 acc1_10_re = x1_re; acc1_10_im = x1_im;
3926 acc1_30_re = y1_re; acc1_30_im = y1_im;
3928 acc2_10_re = x2_re; acc2_10_im = x2_im;
3929 acc2_30_re = y2_re; acc2_30_im = y2_im;
3932 x1_re = acc1_01_re - a *acc1_21_im;
3933 x1_im = acc1_01_im + a *acc1_21_re;
3934 x2_re = b * acc1_01_re;
3935 x2_im = b * acc1_01_im;
3937 y1_re = acc1_21_re - a *acc1_01_im;
3938 y1_im = acc1_21_im + a *acc1_01_re;
3939 y2_re = b * acc1_21_re;
3940 y2_im = b * acc1_21_im;
3944 x2_re += acc2_01_re + a *acc2_21_im;
3945 x2_im += acc2_01_im - a *acc2_21_re;
3946 x1_re += b * acc2_01_re;
3947 x1_im += b * acc2_01_im;
3949 y2_re += acc2_21_re + a *acc2_01_im;
3950 y2_im += acc2_21_im - a *acc2_01_re;
3951 y1_re += b * acc2_21_re;
3952 y1_im += b * acc2_21_im;
3955 acc1_01_re = x1_re; acc1_01_im = x1_im;
3956 acc1_21_re = y1_re; acc1_21_im = y1_im;
3958 acc2_01_re = x2_re; acc2_01_im = x2_im;
3959 acc2_21_re = y2_re; acc2_21_im = y2_im;
3962 x1_re = acc1_11_re - a *acc1_31_im;
3963 x1_im = acc1_11_im + a *acc1_31_re;
3964 x2_re = b * acc1_11_re;
3965 x2_im = b * acc1_11_im;
3967 y1_re = acc1_31_re - a *acc1_11_im;
3968 y1_im = acc1_31_im + a *acc1_11_re;
3969 y2_re = b * acc1_31_re;
3970 y2_im = b * acc1_31_im;
3974 x2_re += acc2_11_re + a *acc2_31_im;
3975 x2_im += acc2_11_im - a *acc2_31_re;
3976 x1_re += b * acc2_11_re;
3977 x1_im += b * acc2_11_im;
3979 y2_re += acc2_31_re + a *acc2_11_im;
3980 y2_im += acc2_31_im - a *acc2_11_re;
3981 y1_re += b * acc2_31_re;
3982 y1_im += b * acc2_31_im;
3985 acc1_11_re = x1_re; acc1_11_im = x1_im;
3986 acc1_31_re = y1_re; acc1_31_im = y1_im;
3988 acc2_11_re = x2_re; acc2_11_im = x2_im;
3989 acc2_31_re = y2_re; acc2_31_im = y2_im;
3992 x1_re = acc1_02_re - a *acc1_22_im;
3993 x1_im = acc1_02_im + a *acc1_22_re;
3994 x2_re = b * acc1_02_re;
3995 x2_im = b * acc1_02_im;
3997 y1_re = acc1_22_re - a *acc1_02_im;
3998 y1_im = acc1_22_im + a *acc1_02_re;
3999 y2_re = b * acc1_22_re;
4000 y2_im = b * acc1_22_im;
4004 x2_re += acc2_02_re + a *acc2_22_im;
4005 x2_im += acc2_02_im - a *acc2_22_re;
4006 x1_re += b * acc2_02_re;
4007 x1_im += b * acc2_02_im;
4009 y2_re += acc2_22_re + a *acc2_02_im;
4010 y2_im += acc2_22_im - a *acc2_02_re;
4011 y1_re += b * acc2_22_re;
4012 y1_im += b * acc2_22_im;
4015 acc1_02_re = x1_re; acc1_02_im = x1_im;
4016 acc1_22_re = y1_re; acc1_22_im = y1_im;
4018 acc2_02_re = x2_re; acc2_02_im = x2_im;
4019 acc2_22_re = y2_re; acc2_22_im = y2_im;
4022 x1_re = acc1_12_re - a *acc1_32_im;
4023 x1_im = acc1_12_im + a *acc1_32_re;
4024 x2_re = b * acc1_12_re;
4025 x2_im = b * acc1_12_im;
4027 y1_re = acc1_32_re - a *acc1_12_im;
4028 y1_im = acc1_32_im + a *acc1_12_re;
4029 y2_re = b * acc1_32_re;
4030 y2_im = b * acc1_32_im;
4034 x2_re += acc2_12_re + a *acc2_32_im;
4035 x2_im += acc2_12_im - a *acc2_32_re;
4036 x1_re += b * acc2_12_re;
4037 x1_im += b * acc2_12_im;
4039 y2_re += acc2_32_re + a *acc2_12_im;
4040 y2_im += acc2_32_im - a *acc2_12_re;
4041 y1_re += b * acc2_32_re;
4042 y1_im += b * acc2_32_im;
4045 acc1_12_re = x1_re; acc1_12_im = x1_im;
4046 acc1_32_re = y1_re; acc1_32_im = y1_im;
4048 acc2_12_re = x2_re; acc2_12_im = x2_im;
4049 acc2_32_re = y2_re; acc2_32_im = y2_im;
4152 #endif//DSLASH_TWIST
4154 #endif // DSLASH_XPAY
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o2_11_re
VOLATILE spinorFloat o2_12_im
VOLATILE spinorFloat o1_02_im
VOLATILE spinorFloat o1_32_re
VOLATILE spinorFloat o2_11_im
VOLATILE spinorFloat o2_12_re
VOLATILE spinorFloat o1_10_re
VOLATILE spinorFloat o1_00_re
VOLATILE spinorFloat o1_11_re
__constant__ int X3X2X1mX2X1
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o1_21_im
VOLATILE spinorFloat o2_01_re
VOLATILE spinorFloat o2_32_re
VOLATILE spinorFloat o1_20_re
VOLATILE spinorFloat o1_00_im
VOLATILE spinorFloat o2_21_re
VOLATILE spinorFloat o1_20_im
VOLATILE spinorFloat o1_21_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
VOLATILE spinorFloat o2_00_re
VOLATILE spinorFloat o2_00_im
VOLATILE spinorFloat o1_22_re
VOLATILE spinorFloat o2_30_im
VOLATILE spinorFloat o1_12_re
VOLATILE spinorFloat o1_31_im
VOLATILE spinorFloat o2_01_im
VOLATILE spinorFloat o2_10_im
VOLATILE spinorFloat o1_22_im
VOLATILE spinorFloat o2_31_re
VOLATILE spinorFloat o2_02_im
VOLATILE spinorFloat o1_30_re
__constant__ int gauge_fixed
VOLATILE spinorFloat o2_22_re
VOLATILE spinorFloat o1_01_im
__constant__ int X4X3X2X1mX3X2X1
VOLATILE spinorFloat o1_32_im
VOLATILE spinorFloat o2_31_im
VOLATILE spinorFloat o1_12_im
VOLATILE spinorFloat o1_10_im
__constant__ int ga_stride
VOLATILE spinorFloat o1_30_im
VOLATILE spinorFloat o1_11_im
VOLATILE spinorFloat o2_30_re
RECONSTRUCT_GAUGE_MATRIX(0)
VOLATILE spinorFloat o2_21_im
VOLATILE spinorFloat o1_02_re
VOLATILE spinorFloat o2_20_re
VOLATILE spinorFloat o2_20_im
VOLATILE spinorFloat o2_22_im
VOLATILE spinorFloat o1_31_re
o1_00_im *o1_01_re *o1_01_im *o1_02_re *o1_02_im *o1_10_re *o1_10_im *o1_11_re *o1_11_im *o1_12_re *o1_12_im *o1_20_re *o1_20_im *o1_21_re *o1_21_im *o1_22_re *o1_22_im *o1_30_re *o1_30_im *o1_31_re *o1_31_im *o1_32_re *o1_32_im *o2_00_re *o2_00_im *o2_01_re *o2_01_im *o2_02_re *o2_02_im *o2_10_re *o2_10_im *o2_11_re *o2_11_im *o2_12_re *o2_12_im *o2_20_re *o2_20_im *o2_21_re *o2_21_im *o2_22_re *o2_22_im *o2_30_re *o2_30_im *o2_31_re *o2_31_im *o2_32_re *o2_32_im * WRITE_FLAVOR_SPINOR()
VOLATILE spinorFloat o1_01_re
VOLATILE spinorFloat o2_10_re
VOLATILE spinorFloat o2_32_im
__constant__ int X4X3X2X1hmX3X2X1h
VOLATILE spinorFloat o2_02_re
READ_GAUGE_MATRIX(G, GAUGE0TEX, 0, ga_idx, ga_stride)