4 #define SHARED_TMNDEG_FLOATS_PER_THREAD 0
8 #if ((CUDA_VERSION >= 4010) && (__COMPUTE_CAPABILITY__ >= 200)) // NVVM compiler
10 #else // Open64 compiler
11 #define VOLATILE volatile
15 #define spinorFloat double
41 #define spinorFloat float
66 #endif // SPINOR_DOUBLE
109 #endif // GAUGE_DOUBLE
112 #define gT00_re (+g00_re)
113 #define gT00_im (-g00_im)
114 #define gT01_re (+g10_re)
115 #define gT01_im (-g10_im)
116 #define gT02_re (+g20_re)
117 #define gT02_im (-g20_im)
118 #define gT10_re (+g01_re)
119 #define gT10_im (-g01_im)
120 #define gT11_re (+g11_re)
121 #define gT11_im (-g11_im)
122 #define gT12_re (+g21_re)
123 #define gT12_im (-g21_im)
124 #define gT20_re (+g02_re)
125 #define gT20_im (-g02_im)
126 #define gT21_re (+g12_re)
127 #define gT21_im (-g12_im)
128 #define gT22_re (+g22_re)
129 #define gT22_im (-g22_im)
188 #if (defined MULTI_GPU) && (DD_PREC==2) // half precision
190 #endif // MULTI_GPU half precision
199 sid = blockIdx.x*blockDim.x + threadIdx.x;
234 sid = blockIdx.x*blockDim.x + threadIdx.x;
237 const int dim =
static_cast<int>(kernel_type);
238 const int face_volume = (
param.threads >> 1);
239 const int face_num = (
sid >= face_volume);
240 face_idx =
sid - face_num*face_volume;
246 #if (DD_PREC==2) // half precision
247 sp_norm_idx =
sid +
param.ghostNormOffset[
static_cast<int>(kernel_type)];
250 coordsFromFaceIndex<1>(
X,
sid,
x1,
x2,
x3,
x4, face_idx, face_volume, dim, face_num,
param.parity);
304 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
510 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
656 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
658 const int sp_idx = (
x1==0 ?
X+
X1m1 :
X-1) >> 1;
664 const int ga_idx = sp_idx;
866 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
1012 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1017 const int ga_idx =
sid;
1057 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1215 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1218 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
1364 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1372 const int ga_idx = sp_idx;
1413 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1571 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1574 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
1720 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
1725 const int ga_idx =
sid;
1765 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1923 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
1926 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
2072 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
2080 const int ga_idx = sp_idx;
2121 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2279 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2282 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
2428 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
2433 const int ga_idx =
sid;
2469 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2533 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2536 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
2605 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2751 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2754 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
2889 face_idx +
param.ghostOffset[
static_cast<int>(kernel_type)];
2897 const int ga_idx = sp_idx;
2934 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
2998 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
3001 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
3070 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
3216 const int sp_stride_pad =
FLAVORS*
ghostFace[
static_cast<int>(kernel_type)];
3219 const int fl_idx = sp_idx +
ghostFace[
static_cast<int>(kernel_type)];
3345 switch(kernel_type) {
3347 incomplete = incomplete || (
param.commDim[3] && (
x4==0 ||
x4==
X4m1));
3349 incomplete = incomplete || (
param.commDim[2] && (
x3==0 ||
x3==
X3m1));
3351 incomplete = incomplete || (
param.commDim[1] && (
x2==0 ||
x2==
X2m1));
3353 incomplete = incomplete || (
param.commDim[0] && (
x1==0 ||
x1==
X1m1));
3368 x1_re = 0.0, x1_im = 0.0;
3369 y1_re = 0.0, y1_im = 0.0;
3370 x2_re = 0.0, x2_im = 0.0;
3371 y2_re = 0.0, y2_im = 0.0;
3398 o1_00_re = x1_re; o1_00_im = x1_im;
3399 o1_20_re = y1_re; o1_20_im = y1_im;
3401 o2_00_re = x2_re; o2_00_im = x2_im;
3402 o2_20_re = y2_re; o2_20_im = y2_im;
3428 o1_10_re = x1_re; o1_10_im = x1_im;
3429 o1_30_re = y1_re; o1_30_im = y1_im;
3431 o2_10_re = x2_re; o2_10_im = x2_im;
3432 o2_30_re = y2_re; o2_30_im = y2_im;
3458 o1_01_re = x1_re; o1_01_im = x1_im;
3459 o1_21_re = y1_re; o1_21_im = y1_im;
3461 o2_01_re = x2_re; o2_01_im = x2_im;
3462 o2_21_re = y2_re; o2_21_im = y2_im;
3488 o1_11_re = x1_re; o1_11_im = x1_im;
3489 o1_31_re = y1_re; o1_31_im = y1_im;
3491 o2_11_re = x2_re; o2_11_im = x2_im;
3492 o2_31_re = y2_re; o2_31_im = y2_im;
3518 o1_02_re = x1_re; o1_02_im = x1_im;
3519 o1_22_re = y1_re; o1_22_im = y1_im;
3521 o2_02_re = x2_re; o2_02_im = x2_im;
3522 o2_22_re = y2_re; o2_22_im = y2_im;
3548 o1_12_re = x1_re; o1_12_im = x1_im;
3549 o1_32_re = y1_re; o1_32_im = y1_im;
3551 o2_12_re = x2_re; o2_12_im = x2_im;
3552 o2_32_re = y2_re; o2_32_im = y2_im;
3612 #ifdef SPINOR_DOUBLE
3638 o1_00_re = c*o1_00_re + accum0.x;
3662 #endif // SPINOR_DOUBLE
3669 #ifdef SPINOR_DOUBLE
3695 o2_00_re = c*o2_00_re + accum0.x;
3719 #endif // SPINOR_DOUBLE
3723 #endif // DSLASH_XPAY