#define DSLASH_SHARED_FLOATS_PER_THREAD 0

#if (CUDA_VERSION >= 4010)
#define VOLATILE
#else
#define VOLATILE volatile
#endif

// input spinor
#ifdef SPINOR_DOUBLE
#define spinorFloat double
// ... double-precision spinor register definitions elided ...
#define mdwf_b5 mdwf_b5_d
#define mdwf_c5 mdwf_c5_d
#else
#define spinorFloat float
// ... single-precision spinor register definitions elided ...
#define mdwf_b5 mdwf_b5_f
#define mdwf_c5 mdwf_c5_f
#endif // SPINOR_DOUBLE

// gauge link
#ifdef GAUGE_DOUBLE
// ... double-precision gauge register definitions elided ...
#else
// ... single-precision gauge register definitions elided ...
#endif // GAUGE_DOUBLE
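// Hermitian conjugate of the gauge link: gT_{ij} = conj(g_{ji}), used when
// hopping in the negative directions.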
#define gT00_re (+g00_re)
#define gT00_im (-g00_im)
#define gT01_re (+g10_re)
#define gT01_im (-g10_im)
#define gT02_re (+g20_re)
#define gT02_im (-g20_im)
#define gT10_re (+g01_re)
#define gT10_im (-g01_im)
#define gT11_re (+g11_re)
#define gT11_im (-g11_im)
#define gT12_re (+g21_re)
#define gT12_im (-g21_im)
#define gT20_re (+g02_re)
#define gT20_im (-g02_im)
#define gT21_re (+g12_re)
#define gT21_im (-g12_im)
#define gT22_re (+g22_re)
#define gT22_im (-g22_im)
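// For context, a minimal sketch (illustrative only, not this file's actual
// macro body) of how the transposed components enter a color matrix-vector
// product: the first color component of U^dagger acting on (i00, i01, i02),
// using the standard complex multiply-accumulate.
//
//   spinorFloat A0_re = gT00_re*i00_re - gT00_im*i00_im
//                     + gT01_re*i01_re - gT01_im*i01_im
//                     + gT02_re*i02_re - gT02_im*i02_im;
//   spinorFloat A0_im = gT00_re*i00_im + gT00_im*i00_re
//                     + gT01_re*i01_im + gT01_im*i01_re
//                     + gT02_re*i02_im + gT02_im*i02_re;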
#ifdef SPINOR_DOUBLE
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 16 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 8 // to avoid bank conflicts on G80 and GT200
#endif
#else
#if (__COMPUTE_CAPABILITY__ >= 200)
#define SHARED_STRIDE 32 // to avoid bank conflicts on Fermi
#else
#define SHARED_STRIDE 16 // to avoid bank conflicts on G80 and GT200
#endif
#endif
#if (DD_PREC==2) // half precision
int sp_norm_idx; // ghost-zone norm index, assigned per direction below
#endif // half precision
int sid = ((blockIdx.y*blockDim.y + threadIdx.y)*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x;
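// flatten the 2-d grid of 2-d thread blocks into a single site index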
faceVolume[0] = (X2*X3*X4)>>1;
faceVolume[1] = (X1*X3*X4)>>1;
faceVolume[2] = (X1*X2*X4)>>1;
faceVolume[3] = (X1*X2*X3)>>1;
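// each face volume is the product of the three dimensions orthogonal to that
// face, halved because only one checkerboard parity is stored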
dim = dimFromDWFaceIndex(sid, param);

const int face_volume = ((param.threadDimMapUpper[dim] - param.threadDimMapLower[dim])*param.Ls >> 1);

const int face_num = (sid >= face_volume);
face_idx = sid - face_num*face_volume;
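// face_num selects between the two opposite faces of dimension dim;
// face_idx is the site index within the selected face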
coordsFromDW4DFaceIndex<1>(sid, x1, x2, x3, x4, xs, face_idx, face_volume, dim, face_num, param.parity, dims);
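// recover the 4-d coordinates (x1..x4) and fifth-dimension index xs of the
// site this thread updates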
for(int dir=0; dir<4; ++dir){
  active = active || isActive(dim,dir,+1,x1,x2,x3,x4,param.commDim,param.X);
}
X = 2*sid + (boundaryCrossing + param.parity) % 2;
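// recover the full-lattice linear index from the checkerboard index: which
// element of the even-odd pair this site is depends on the number of
// dimension-boundary crossings plus the lattice parity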
#if (DD_PREC==0) // temporal hack
// ... precision-dependent setup elided ...
#endif
const int sp_idx = face_idx + param.ghostOffset[0];
#if (DD_PREC==2) // half precision
// ... half-precision norm-index setup elided ...
#endif
if (isActive(dim,0,-1,x1,x2,x3,x4,param.commDim,param.X) && x1==0)
const int sp_idx = face_idx + param.ghostOffset[0];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[0];
#endif
const int sp_idx = face_idx + param.ghostOffset[1];
#if (DD_PREC==2) // half precision
// ... half-precision norm-index setup elided ...
#endif

const int ga_idx = sid % Vh;
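// the gauge field is four-dimensional, so all Ls fifth-dimension slices
// share the same links: reduce the site index modulo the 4-d checkerboard
// volume Vh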
if (isActive(dim,1,-1,x1,x2,x3,x4,param.commDim,param.X) && x2==0)
const int sp_idx = face_idx + param.ghostOffset[1];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[1];
#endif
const int sp_idx = face_idx + param.ghostOffset[2];
#if (DD_PREC==2) // half precision
// ... half-precision norm-index setup elided ...
#endif

const int ga_idx = sid % Vh;
if (isActive(dim,2,-1,x1,x2,x3,x4,param.commDim,param.X) && x3==0)
const int sp_idx = face_idx + param.ghostOffset[2];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[2];
#endif
if (isActive(dim,3,+1,x1,x2,x3,x4,param.commDim,param.X) && x4==X4m1)
const int sp_idx = face_idx + param.ghostOffset[3];
#if (DD_PREC==2) // half precision
// ... half-precision norm-index setup elided ...
#endif

const int ga_idx = sid % Vh;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
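// temporal direction: the ghost spinor arrives already spin-projected, so
// only the overall rescaling by t_proj_scale remains to be applied here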
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
if (isActive(dim,3,-1,x1,x2,x3,x4,param.commDim,param.X) && x4==0)
const int sp_idx = face_idx + param.ghostOffset[3];
#if (DD_PREC==2) // half precision
sp_norm_idx = face_idx + param.ghostNormOffset[3];
#endif
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
a0_re = t_proj_scale*i00_re;  a0_im = t_proj_scale*i00_im;
a1_re = t_proj_scale*i01_re;  a1_im = t_proj_scale*i01_im;
a2_re = t_proj_scale*i02_re;  a2_im = t_proj_scale*i02_im;
b0_re = t_proj_scale*i10_re;  b0_im = t_proj_scale*i10_im;
b1_re = t_proj_scale*i11_re;  b1_im = t_proj_scale*i11_im;
b2_re = t_proj_scale*i12_re;  b2_im = t_proj_scale*i12_im;
#ifdef DSLASH_XPAY
READ_ACCUM(ACCUMTEX, param.sp_stride)

#ifdef SPINOR_DOUBLE
// ... xpay accumulation (double- and single-precision branches) elided ...
#endif // SPINOR_DOUBLE
#endif // DSLASH_XPAY

#undef SHARED_STRIDE
// Macros and __constant__ symbols referenced by this core file and defined
// elsewhere in the library:
//   READ_INTERMEDIATE_SPINOR, RECONSTRUCT_GAUGE_MATRIX, ASSN_GAUGE_MATRIX
//   __constant__ int ghostFace[QUDA_MAX_DIM+1];
//   __constant__ double coeff;
//   __constant__ int gauge_fixed;
//   __constant__ int ga_stride;
//   __constant__ int X4X3X2X1hmX3X2X1h;
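// For orientation, a minimal sketch of the multi-inclusion pattern used with
// core files like this one (the file name and the DD_PREC==0 meaning are
// assumptions, not taken from this source; DD_PREC==2 is half precision per
// the conditionals above): the including translation unit fixes the
// precision macros, includes the core to stamp out one kernel body, then
// resets the macros before the next precision.
//
//   #define DD_PREC 0            // assumed convention: 0=double, 2=half
//   #define SPINOR_DOUBLE
//   #define GAUGE_DOUBLE
//   #include "dw_dslash_core.h"  // hypothetical core-file name
//   #undef GAUGE_DOUBLE
//   #undef SPINOR_DOUBLE
//   #undef DD_PREC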