4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
10 #else // Open64 compiler
11 #define VOLATILE volatile
16 #define spinorFloat double
42 #define spinorFloat float
67 #endif // SPINOR_DOUBLE
110 #endif // GAUGE_DOUBLE
113 #define gT00_re (+g00_re)
114 #define gT00_im (-g00_im)
115 #define gT01_re (+g10_re)
116 #define gT01_im (-g10_im)
117 #define gT02_re (+g20_re)
118 #define gT02_im (-g20_im)
119 #define gT10_re (+g01_re)
120 #define gT10_im (-g01_im)
121 #define gT11_re (+g11_re)
122 #define gT11_im (-g11_im)
123 #define gT12_re (+g21_re)
124 #define gT12_im (-g21_im)
125 #define gT20_re (+g02_re)
126 #define gT20_im (-g02_im)
127 #define gT21_re (+g12_re)
128 #define gT21_im (-g12_im)
129 #define gT22_re (+g22_re)
130 #define gT22_im (-g22_im)
189 #if (DD_PREC==2) // half precision
191 #endif // MULTI_GPU half precision
209 sid = blockIdx.x*blockDim.x + threadIdx.x;
245 sid = blockIdx.x*blockDim.x + threadIdx.x;
250 const int face_volume = ((
param.threadDimMapUpper[
dim] -
param.threadDimMapLower[
dim]) >> 1);
252 const int face_num = (
sid >= face_volume);
261 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4,
face_idx, face_volume,
dim, face_num,
param.parity,
dims);
264 for(
int dir=0; dir<4; ++dir){
265 active = active || isActive(dim,dir,+1,x1,x2,x3,x4,
param.commDim,
param.
X);
317 const int sp_idx = face_idx +
param.ghostOffset[0];
522 const int fl_idx = sp_idx + ghostFace[0];
655 if (isActive(dim,0,-1,x1,x2,x3,x4,
param.commDim,
param.
X) && x1==0)
664 const int sp_idx = face_idx +
param.ghostOffset[0];
666 sp_norm_idx = face_idx +
param.ghostNormOffset[0];
869 const int fl_idx = sp_idx + ghostFace[0];
1011 const int sp_idx = face_idx +
param.ghostOffset[1];
1216 const int fl_idx = sp_idx + ghostFace[1];
1358 const int sp_idx = face_idx +
param.ghostOffset[1];
1360 sp_norm_idx = face_idx +
param.ghostNormOffset[1];
1563 const int fl_idx = sp_idx + ghostFace[1];
1705 const int sp_idx = face_idx +
param.ghostOffset[2];
1910 const int fl_idx = sp_idx + ghostFace[2];
2052 const int sp_idx = face_idx +
param.ghostOffset[2];
2054 sp_norm_idx = face_idx +
param.ghostNormOffset[2];
2257 const int fl_idx = sp_idx + ghostFace[2];
2399 const int sp_idx = face_idx +
param.ghostOffset[3];
2506 const int fl_idx = sp_idx + ghostFace[3];
2724 const int fl_idx = sp_idx + ghostFace[3];
2855 const int sp_idx = face_idx +
param.ghostOffset[3];
2857 sp_norm_idx = face_idx +
param.ghostNormOffset[3];
2962 const int fl_idx = sp_idx + ghostFace[3];
3180 const int fl_idx = sp_idx + ghostFace[3];
3308 incomplete = incomplete || (
param.commDim[3] && (x4==0 || x4==
X4m1));
3310 incomplete = incomplete || (
param.commDim[2] && (x3==0 || x3==
X3m1));
3312 incomplete = incomplete || (
param.commDim[1] && (x2==0 || x2==
X2m1));
3314 incomplete = incomplete || (
param.commDim[0] && (x1==0 || x1==
X1m1));
3330 x1_re = 0.0, x1_im = 0.0;
3331 y1_re = 0.0, y1_im = 0.0;
3332 x2_re = 0.0, x2_im = 0.0;
3333 y2_re = 0.0, y2_im = 0.0;
3360 o1_00_re = x1_re; o1_00_im = x1_im;
3361 o1_20_re = y1_re; o1_20_im = y1_im;
3363 o2_00_re = x2_re; o2_00_im = x2_im;
3364 o2_20_re = y2_re; o2_20_im = y2_im;
3390 o1_10_re = x1_re; o1_10_im = x1_im;
3391 o1_30_re = y1_re; o1_30_im = y1_im;
3393 o2_10_re = x2_re; o2_10_im = x2_im;
3394 o2_30_re = y2_re; o2_30_im = y2_im;
3420 o1_01_re = x1_re; o1_01_im = x1_im;
3421 o1_21_re = y1_re; o1_21_im = y1_im;
3423 o2_01_re = x2_re; o2_01_im = x2_im;
3424 o2_21_re = y2_re; o2_21_im = y2_im;
3450 o1_11_re = x1_re; o1_11_im = x1_im;
3451 o1_31_re = y1_re; o1_31_im = y1_im;
3453 o2_11_re = x2_re; o2_11_im = x2_im;
3454 o2_31_re = y2_re; o2_31_im = y2_im;
3480 o1_02_re = x1_re; o1_02_im = x1_im;
3481 o1_22_re = y1_re; o1_22_im = y1_im;
3483 o2_02_re = x2_re; o2_02_im = x2_im;
3484 o2_22_re = y2_re; o2_22_im = y2_im;
3510 o1_12_re = x1_re; o1_12_im = x1_im;
3511 o1_32_re = y1_re; o1_32_im = y1_im;
3513 o2_12_re = x2_re; o2_12_im = x2_im;
3514 o2_32_re = y2_re; o2_32_im = y2_im;
3572 #ifdef SPINOR_DOUBLE
3574 #define acc_00_re accum0.x
3575 #define acc_00_im accum0.y
3576 #define acc_01_re accum1.x
3577 #define acc_01_im accum1.y
3578 #define acc_02_re accum2.x
3579 #define acc_02_im accum2.y
3580 #define acc_10_re accum3.x
3581 #define acc_10_im accum3.y
3582 #define acc_11_re accum4.x
3583 #define acc_11_im accum4.y
3584 #define acc_12_re accum5.x
3585 #define acc_12_im accum5.y
3586 #define acc_20_re accum6.x
3587 #define acc_20_im accum6.y
3588 #define acc_21_re accum7.x
3589 #define acc_21_im accum7.y
3590 #define acc_22_re accum8.x
3591 #define acc_22_im accum8.y
3592 #define acc_30_re accum9.x
3593 #define acc_30_im accum9.y
3594 #define acc_31_re accum10.x
3595 #define acc_31_im accum10.y
3596 #define acc_32_re accum11.x
3597 #define acc_32_im accum11.y
3600 #define acc_00_re accum0.x
3601 #define acc_00_im accum0.y
3602 #define acc_01_re accum0.z
3603 #define acc_01_im accum0.w
3604 #define acc_02_re accum1.x
3605 #define acc_02_im accum1.y
3606 #define acc_10_re accum1.z
3607 #define acc_10_im accum1.w
3608 #define acc_11_re accum2.x
3609 #define acc_11_im accum2.y
3610 #define acc_12_re accum2.z
3611 #define acc_12_im accum2.w
3612 #define acc_20_re accum3.x
3613 #define acc_20_im accum3.y
3614 #define acc_21_re accum3.z
3615 #define acc_21_im accum3.w
3616 #define acc_22_re accum4.x
3617 #define acc_22_im accum4.y
3618 #define acc_30_re accum4.z
3619 #define acc_30_im accum4.w
3620 #define acc_31_re accum5.x
3621 #define acc_31_im accum5.y
3622 #define acc_32_re accum5.z
3623 #define acc_32_im accum5.w
3625 #endif // SPINOR_DOUBLE
3628 READ_ACCUM(ACCUMTEX,
param.sp_stride)
3655 ASSN_ACCUM(ACCUMTEX,
param.sp_stride,
param.fl_stride)
3710 #ifdef SPINOR_DOUBLE
3712 #define acc1_00_re flv1_accum0.x
3713 #define acc1_00_im flv1_accum0.y
3714 #define acc1_01_re flv1_accum1.x
3715 #define acc1_01_im flv1_accum1.y
3716 #define acc1_02_re flv1_accum2.x
3717 #define acc1_02_im flv1_accum2.y
3718 #define acc1_10_re flv1_accum3.x
3719 #define acc1_10_im flv1_accum3.y
3720 #define acc1_11_re flv1_accum4.x
3721 #define acc1_11_im flv1_accum4.y
3722 #define acc1_12_re flv1_accum5.x
3723 #define acc1_12_im flv1_accum5.y
3724 #define acc1_20_re flv1_accum6.x
3725 #define acc1_20_im flv1_accum6.y
3726 #define acc1_21_re flv1_accum7.x
3727 #define acc1_21_im flv1_accum7.y
3728 #define acc1_22_re flv1_accum8.x
3729 #define acc1_22_im flv1_accum8.y
3730 #define acc1_30_re flv1_accum9.x
3731 #define acc1_30_im flv1_accum9.y
3732 #define acc1_31_re flv1_accum10.x
3733 #define acc1_31_im flv1_accum10.y
3734 #define acc1_32_re flv1_accum11.x
3735 #define acc1_32_im flv1_accum11.y
3737 #define acc2_00_re flv2_accum0.x
3738 #define acc2_00_im flv2_accum0.y
3739 #define acc2_01_re flv2_accum1.x
3740 #define acc2_01_im flv2_accum1.y
3741 #define acc2_02_re flv2_accum2.x
3742 #define acc2_02_im flv2_accum2.y
3743 #define acc2_10_re flv2_accum3.x
3744 #define acc2_10_im flv2_accum3.y
3745 #define acc2_11_re flv2_accum4.x
3746 #define acc2_11_im flv2_accum4.y
3747 #define acc2_12_re flv2_accum5.x
3748 #define acc2_12_im flv2_accum5.y
3749 #define acc2_20_re flv2_accum6.x
3750 #define acc2_20_im flv2_accum6.y
3751 #define acc2_21_re flv2_accum7.x
3752 #define acc2_21_im flv2_accum7.y
3753 #define acc2_22_re flv2_accum8.x
3754 #define acc2_22_im flv2_accum8.y
3755 #define acc2_30_re flv2_accum9.x
3756 #define acc2_30_im flv2_accum9.y
3757 #define acc2_31_re flv2_accum10.x
3758 #define acc2_31_im flv2_accum10.y
3759 #define acc2_32_re flv2_accum11.x
3760 #define acc2_32_im flv2_accum11.y
3764 #define acc1_00_re flv1_accum0.x
3765 #define acc1_00_im flv1_accum0.y
3766 #define acc1_01_re flv1_accum0.z
3767 #define acc1_01_im flv1_accum0.w
3768 #define acc1_02_re flv1_accum1.x
3769 #define acc1_02_im flv1_accum1.y
3770 #define acc1_10_re flv1_accum1.z
3771 #define acc1_10_im flv1_accum1.w
3772 #define acc1_11_re flv1_accum2.x
3773 #define acc1_11_im flv1_accum2.y
3774 #define acc1_12_re flv1_accum2.z
3775 #define acc1_12_im flv1_accum2.w
3776 #define acc1_20_re flv1_accum3.x
3777 #define acc1_20_im flv1_accum3.y
3778 #define acc1_21_re flv1_accum3.z
3779 #define acc1_21_im flv1_accum3.w
3780 #define acc1_22_re flv1_accum4.x
3781 #define acc1_22_im flv1_accum4.y
3782 #define acc1_30_re flv1_accum4.z
3783 #define acc1_30_im flv1_accum4.w
3784 #define acc1_31_re flv1_accum5.x
3785 #define acc1_31_im flv1_accum5.y
3786 #define acc1_32_re flv1_accum5.z
3787 #define acc1_32_im flv1_accum5.w
3789 #define acc2_00_re flv2_accum0.x
3790 #define acc2_00_im flv2_accum0.y
3791 #define acc2_01_re flv2_accum0.z
3792 #define acc2_01_im flv2_accum0.w
3793 #define acc2_02_re flv2_accum1.x
3794 #define acc2_02_im flv2_accum1.y
3795 #define acc2_10_re flv2_accum1.z
3796 #define acc2_10_im flv2_accum1.w
3797 #define acc2_11_re flv2_accum2.x
3798 #define acc2_11_im flv2_accum2.y
3799 #define acc2_12_re flv2_accum2.z
3800 #define acc2_12_im flv2_accum2.w
3801 #define acc2_20_re flv2_accum3.x
3802 #define acc2_20_im flv2_accum3.y
3803 #define acc2_21_re flv2_accum3.z
3804 #define acc2_21_im flv2_accum3.w
3805 #define acc2_22_re flv2_accum4.x
3806 #define acc2_22_im flv2_accum4.y
3807 #define acc2_30_re flv2_accum4.z
3808 #define acc2_30_im flv2_accum4.w
3809 #define acc2_31_re flv2_accum5.x
3810 #define acc2_31_im flv2_accum5.y
3811 #define acc2_32_re flv2_accum5.z
3812 #define acc2_32_im flv2_accum5.w
3814 #endif // SPINOR_DOUBLE
3817 READ_ACCUM_FLAVOR(ACCUMTEX,
param.sp_stride,
param.fl_stride)
3824 x1_re = 0.0, x1_im = 0.0;
3825 y1_re = 0.0, y1_im = 0.0;
3826 x2_re = 0.0, x2_im = 0.0;
3827 y2_re = 0.0, y2_im = 0.0;
3831 x1_re = acc1_00_re + a *acc1_20_im;
3832 x1_im = acc1_00_im - a *acc1_20_re;
3833 x2_re = b * acc1_00_re;
3834 x2_im = b * acc1_00_im;
3836 y1_re = acc1_20_re + a *acc1_00_im;
3837 y1_im = acc1_20_im - a *acc1_00_re;
3838 y2_re = b * acc1_20_re;
3839 y2_im = b * acc1_20_im;
3843 x2_re += acc2_00_re - a *acc2_20_im;
3844 x2_im += acc2_00_im + a *acc2_20_re;
3845 x1_re += b * acc2_00_re;
3846 x1_im += b * acc2_00_im;
3848 y2_re += acc2_20_re - a *acc2_00_im;
3849 y2_im += acc2_20_im + a *acc2_00_re;
3850 y1_re += b * acc2_20_re;
3851 y1_im += b * acc2_20_im;
3854 acc1_00_re = x1_re; acc1_00_im = x1_im;
3855 acc1_20_re = y1_re; acc1_20_im = y1_im;
3857 acc2_00_re = x2_re; acc2_00_im = x2_im;
3858 acc2_20_re = y2_re; acc2_20_im = y2_im;
3861 x1_re = acc1_10_re + a *acc1_30_im;
3862 x1_im = acc1_10_im - a *acc1_30_re;
3863 x2_re = b * acc1_10_re;
3864 x2_im = b * acc1_10_im;
3866 y1_re = acc1_30_re + a *acc1_10_im;
3867 y1_im = acc1_30_im - a *acc1_10_re;
3868 y2_re = b * acc1_30_re;
3869 y2_im = b * acc1_30_im;
3873 x2_re += acc2_10_re - a *acc2_30_im;
3874 x2_im += acc2_10_im + a *acc2_30_re;
3875 x1_re += b * acc2_10_re;
3876 x1_im += b * acc2_10_im;
3878 y2_re += acc2_30_re - a *acc2_10_im;
3879 y2_im += acc2_30_im + a *acc2_10_re;
3880 y1_re += b * acc2_30_re;
3881 y1_im += b * acc2_30_im;
3884 acc1_10_re = x1_re; acc1_10_im = x1_im;
3885 acc1_30_re = y1_re; acc1_30_im = y1_im;
3887 acc2_10_re = x2_re; acc2_10_im = x2_im;
3888 acc2_30_re = y2_re; acc2_30_im = y2_im;
3891 x1_re = acc1_01_re + a *acc1_21_im;
3892 x1_im = acc1_01_im - a *acc1_21_re;
3893 x2_re = b * acc1_01_re;
3894 x2_im = b * acc1_01_im;
3896 y1_re = acc1_21_re + a *acc1_01_im;
3897 y1_im = acc1_21_im - a *acc1_01_re;
3898 y2_re = b * acc1_21_re;
3899 y2_im = b * acc1_21_im;
3903 x2_re += acc2_01_re - a *acc2_21_im;
3904 x2_im += acc2_01_im + a *acc2_21_re;
3905 x1_re += b * acc2_01_re;
3906 x1_im += b * acc2_01_im;
3908 y2_re += acc2_21_re - a *acc2_01_im;
3909 y2_im += acc2_21_im + a *acc2_01_re;
3910 y1_re += b * acc2_21_re;
3911 y1_im += b * acc2_21_im;
3914 acc1_01_re = x1_re; acc1_01_im = x1_im;
3915 acc1_21_re = y1_re; acc1_21_im = y1_im;
3917 acc2_01_re = x2_re; acc2_01_im = x2_im;
3918 acc2_21_re = y2_re; acc2_21_im = y2_im;
3921 x1_re = acc1_11_re + a *acc1_31_im;
3922 x1_im = acc1_11_im - a *acc1_31_re;
3923 x2_re = b * acc1_11_re;
3924 x2_im = b * acc1_11_im;
3926 y1_re = acc1_31_re + a *acc1_11_im;
3927 y1_im = acc1_31_im - a *acc1_11_re;
3928 y2_re = b * acc1_31_re;
3929 y2_im = b * acc1_31_im;
3933 x2_re += acc2_11_re - a *acc2_31_im;
3934 x2_im += acc2_11_im + a *acc2_31_re;
3935 x1_re += b * acc2_11_re;
3936 x1_im += b * acc2_11_im;
3938 y2_re += acc2_31_re - a *acc2_11_im;
3939 y2_im += acc2_31_im + a *acc2_11_re;
3940 y1_re += b * acc2_31_re;
3941 y1_im += b * acc2_31_im;
3944 acc1_11_re = x1_re; acc1_11_im = x1_im;
3945 acc1_31_re = y1_re; acc1_31_im = y1_im;
3947 acc2_11_re = x2_re; acc2_11_im = x2_im;
3948 acc2_31_re = y2_re; acc2_31_im = y2_im;
3951 x1_re = acc1_02_re + a *acc1_22_im;
3952 x1_im = acc1_02_im - a *acc1_22_re;
3953 x2_re = b * acc1_02_re;
3954 x2_im = b * acc1_02_im;
3956 y1_re = acc1_22_re + a *acc1_02_im;
3957 y1_im = acc1_22_im - a *acc1_02_re;
3958 y2_re = b * acc1_22_re;
3959 y2_im = b * acc1_22_im;
3963 x2_re += acc2_02_re - a *acc2_22_im;
3964 x2_im += acc2_02_im + a *acc2_22_re;
3965 x1_re += b * acc2_02_re;
3966 x1_im += b * acc2_02_im;
3968 y2_re += acc2_22_re - a *acc2_02_im;
3969 y2_im += acc2_22_im + a *acc2_02_re;
3970 y1_re += b * acc2_22_re;
3971 y1_im += b * acc2_22_im;
3974 acc1_02_re = x1_re; acc1_02_im = x1_im;
3975 acc1_22_re = y1_re; acc1_22_im = y1_im;
3977 acc2_02_re = x2_re; acc2_02_im = x2_im;
3978 acc2_22_re = y2_re; acc2_22_im = y2_im;
3981 x1_re = acc1_12_re + a *acc1_32_im;
3982 x1_im = acc1_12_im - a *acc1_32_re;
3983 x2_re = b * acc1_12_re;
3984 x2_im = b * acc1_12_im;
3986 y1_re = acc1_32_re + a *acc1_12_im;
3987 y1_im = acc1_32_im - a *acc1_12_re;
3988 y2_re = b * acc1_32_re;
3989 y2_im = b * acc1_32_im;
3993 x2_re += acc2_12_re - a *acc2_32_im;
3994 x2_im += acc2_12_im + a *acc2_32_re;
3995 x1_re += b * acc2_12_re;
3996 x1_im += b * acc2_12_im;
3998 y2_re += acc2_32_re - a *acc2_12_im;
3999 y2_im += acc2_32_im + a *acc2_12_re;
4000 y1_re += b * acc2_32_re;
4001 y1_im += b * acc2_32_im;
4004 acc1_12_re = x1_re; acc1_12_im = x1_im;
4005 acc1_32_re = y1_re; acc1_32_im = y1_im;
4007 acc2_12_re = x2_re; acc2_12_im = x2_im;
4008 acc2_32_re = y2_re; acc2_32_im = y2_im;
4111 #endif//DSLASH_TWIST
4113 #endif // DSLASH_XPAY
VOLATILE spinorFloat o2_30_im
VOLATILE spinorFloat o2_01_im
VOLATILE spinorFloat o1_12_im
VOLATILE spinorFloat o2_30_re
VOLATILE spinorFloat o2_32_im
VOLATILE spinorFloat o1_20_re
VOLATILE spinorFloat o1_32_im
VOLATILE spinorFloat o2_00_re
#define READ_INTERMEDIATE_SPINOR
VOLATILE spinorFloat o1_10_im
VOLATILE spinorFloat o1_11_im
VOLATILE spinorFloat o2_21_re
VOLATILE spinorFloat o2_10_re
VOLATILE spinorFloat o1_12_re
VOLATILE spinorFloat o2_22_re
o1_00_im *o1_01_re *o1_01_im *o1_02_re *o1_02_im *o1_10_re *o1_10_im *o1_11_re *o1_11_im *o1_12_re *o1_12_im *o1_20_re *o1_20_im *o1_21_re *o1_21_im *o1_22_re *o1_22_im *o1_30_re *o1_30_im *o1_31_re *o1_31_im *o1_32_re *o1_32_im *o2_00_re *o2_00_im *o2_01_re *o2_01_im *o2_02_re *o2_02_im *o2_10_re *o2_10_im *o2_11_re *o2_11_im *o2_12_re *o2_12_im *o2_20_re *o2_20_im *o2_21_re *o2_21_im *o2_22_re *o2_22_im *o2_30_re *o2_30_im *o2_31_re *o2_31_im *o2_32_re *o2_32_im * WRITE_FLAVOR_SPINOR()
VOLATILE spinorFloat o2_10_im
VOLATILE spinorFloat o2_02_re
VOLATILE spinorFloat o2_32_re
VOLATILE spinorFloat o1_10_re
__constant__ int ghostFace[QUDA_MAX_DIM+1]
#define RECONSTRUCT_GAUGE_MATRIX
VOLATILE spinorFloat o1_01_im
VOLATILE spinorFloat o2_11_re
VOLATILE spinorFloat o1_00_re
VOLATILE spinorFloat o2_20_im
VOLATILE spinorFloat o2_12_re
coordsFromIndex< EVEN_X >(X, x1, x2, x3, x4, sid, param.parity, dims)
VOLATILE spinorFloat o2_31_im
VOLATILE spinorFloat o1_02_im
VOLATILE spinorFloat o2_11_im
__constant__ int gauge_fixed
VOLATILE spinorFloat o1_01_re
VOLATILE spinorFloat o1_11_re
VOLATILE spinorFloat o1_21_im
VOLATILE spinorFloat o2_02_im
VOLATILE spinorFloat o1_31_im
VOLATILE spinorFloat o2_31_re
__constant__ int ga_stride
VOLATILE spinorFloat o2_22_im
VOLATILE spinorFloat o1_22_im
VOLATILE spinorFloat o2_00_im
VOLATILE spinorFloat o1_30_re
VOLATILE spinorFloat o1_00_im
VOLATILE spinorFloat o2_20_re
VOLATILE spinorFloat o1_21_re
VOLATILE spinorFloat o1_02_re
VOLATILE spinorFloat o1_30_im
VOLATILE spinorFloat o2_12_im
#define READ_GAUGE_MATRIX
VOLATILE spinorFloat o1_32_re
__constant__ int X4X3X2X1hmX3X2X1h
VOLATILE spinorFloat o1_20_im
VOLATILE spinorFloat o1_22_re
VOLATILE spinorFloat o2_21_im
VOLATILE spinorFloat o2_01_re
VOLATILE spinorFloat o1_31_re