#define DSLASH_SHARED_FLOATS_PER_THREAD 0

#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif

#ifdef SPINOR_DOUBLE
#define spinorFloat double
#else
#define spinorFloat float
#endif // SPINOR_DOUBLE

#endif // GAUGE_DOUBLE
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
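// The gT macros alias the Hermitian conjugate of the SU(3) link matrix,
// gT_ij = conj(g_ji), so the daggered link used in backward gathers costs
// no extra registers.  A minimal sketch of one row of U^dagger acting on a
// color vector (o0 and i0..i2 are illustrative names, not kernel variables):
//
//   o0_re = gT00_re*i0_re - gT00_im*i0_im
//         + gT01_re*i1_re - gT01_im*i1_im
//         + gT02_re*i2_re - gT02_im*i2_im;
//   o0_im = gT00_re*i0_im + gT00_im*i0_re
//         + gT01_re*i1_im + gT01_im*i1_re
//         + gT02_re*i2_im + gT02_im*i2_re;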
#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif
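// SHARED_STRIDE spaces each thread's spinor components in shared memory so
// the threads of a half-warp land in distinct 4-byte banks (the stride is
// halved in double precision, where every element spans two banks).  A
// hedged sketch of the usual strided layout; s_data and NREG are
// illustrative, not symbols defined in this file:
//
//   extern __shared__ char s_data[];
//   VOLATILE spinorFloat *s = (spinorFloat*)s_data
//       + NREG*SHARED_STRIDE*(threadIdx.x/SHARED_STRIDE)
//       + (threadIdx.x % SHARED_STRIDE);
//   // component k of this thread then lives at s[k*SHARED_STRIDE]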
#if (DD_PREC==2) // half precision
int sp_norm_idx;
#endif // half precision
int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
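// sid flattens the launch grid into a single thread index; with
// blockDim = (B,1,1) and gridDim = (Gx,Gy,1) this reduces to
// sid = (blockIdx.y*Gx + blockIdx.x)*B + threadIdx.x.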
faceVolume[0] = (X2*X3*X4)>>1;
faceVolume[1] = (X1*X3*X4)>>1;
faceVolume[2] = (X1*X2*X4)>>1;
faceVolume[3] = (X1*X2*X3)>>1;
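// Each entry is the volume of the 3D slice orthogonal to that direction,
// halved (>>1) for the even/odd checkerboard; e.g. on a 16^3 x 32 lattice,
// faceVolume[0] = (16*16*32)>>1 = 4096 sites per parity.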
dim = dimFromDWFaceIndex(sid, param);

const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.Ls >> 1);
const int face_num = (sid >= face_volume);
face_idx = sid - face_num*face_volume;
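// Threads for each dimension are laid out as [backward face | forward face],
// each face_volume entries long, so e.g. sid = face_volume + 7 selects the
// forward face (face_num = 1) at offset face_idx = 7 within that face.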
coordsFromDWFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);

for(int dir=0; dir<4; ++dir){
  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
}

X = 2*sid + (boundaryCrossing + param.parity) % 2;
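// X is the full (non-checkerboarded) linear site index: the parity index
// sid is doubled, and the low bit is set when the accumulated coordinate
// sum (boundaryCrossing) plus param.parity is odd.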
#if (DD_PREC==0) // temporary hack
const int sp_idx = face_idx + param.ghostOffset[0];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[0];
#endif // half precision
if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0 )
{
  const int sp_idx = face_idx + param.ghostOffset[0];
#if (DD_PREC==2) // half precision
  sp_norm_idx = face_idx + param.ghostNormOffset[0];
#endif // half precision
const int sp_idx = face_idx + param.ghostOffset[1];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[1];
#endif // half precision

const int ga_idx = sid % Vh;
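// The gauge field is 4D while the domain-wall spinor is 5D: all Ls slices
// in the fifth dimension share the same links, so the gauge index wraps
// the thread index back into the 4D half-volume Vh.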
if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0 )
{
  const int sp_idx = face_idx + param.ghostOffset[1];
#if (DD_PREC==2) // half precision
  sp_norm_idx = face_idx + param.ghostNormOffset[1];
#endif // half precision
const int sp_idx = face_idx + param.ghostOffset[2];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[2];
#endif // half precision

const int ga_idx = sid % Vh;
if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0 )
{
  const int sp_idx = face_idx + param.ghostOffset[2];
#if (DD_PREC==2) // half precision
  sp_norm_idx = face_idx + param.ghostNormOffset[2];
#endif // half precision
if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1 )
{
  const int sp_idx = face_idx + param.ghostOffset[3];
#if (DD_PREC==2) // half precision
  sp_norm_idx = face_idx + param.ghostNormOffset[3];
#endif // half precision

  const int ga_idx = sid % Vh;
  a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
  a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
  a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
  b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
  b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
  b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
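  // t_proj_scale carries the normalization of the temporal projector
  // (1 +/- gamma_4)/2 that was applied when the half spinor was packed
  // into the ghost zone; in this gamma basis the projection only selects
  // spin components, so unpacking the boundary contribution reduces to
  // the uniform rescaling above.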
  a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
  a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
  a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
  b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
  b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
  b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0 )
{
  const int sp_idx = face_idx + param.ghostOffset[3];
#if (DD_PREC==2) // half precision
  sp_norm_idx = face_idx + param.ghostNormOffset[3];
#endif // half precision
  a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
  a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
  a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
  b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
  b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
  b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
  a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
  a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
  a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
  b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
  b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
  b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
#ifdef SPINOR_DOUBLE
#endif // SPINOR_DOUBLE

#endif // DSLASH_XPAY

#undef SHARED_STRIDE
#define READ_INTERMEDIATE_SPINOR

__constant__ int ghostFace[QUDA_MAX_DIM+1];

#define RECONSTRUCT_GAUGE_MATRIX

__constant__ int gauge_fixed;

#define ASSN_GAUGE_MATRIX

__constant__ int ga_stride;
__constant__ int X4X3X2X1hmX3X2X1h;
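// These __constant__ symbols are bound from the host before launch.  A
// hedged host-side sketch (gauge_stride and ghost_face are illustrative
// host variables, not names from this file):
//
//   cudaMemcpyToSymbol(ga_stride, &gauge_stride, sizeof(int));
//   cudaMemcpyToSymbol(ghostFace, ghost_face, (QUDA_MAX_DIM+1)*sizeof(int));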