#if (__COMPUTE_CAPABILITY__ >= 300 || __CUDA_ARCH__ >= 300)
#include <generics/shfl.h>
#endif

#if __COMPUTE_CAPABILITY__ >= 300 && CUDA_VERSION >= 7050
#define DOT_PRODUCT_SPLIT
#endif

template <typename Float, typename yFloat, typename ghostFloat,
          int coarseSpin, int coarseColor, QudaFieldOrder csOrder, QudaGaugeFieldOrder gOrder>
struct DslashCoarseArg {
  typedef colorspinor::FieldOrderCB<Float, coarseSpin, coarseColor, 1, csOrder, Float, ghostFloat> F;
  typedef gauge::FieldOrder<Float, coarseColor*coarseSpin, coarseSpin, gOrder, true, yFloat> G;
  typedef gauge::FieldOrder<Float, coarseColor*coarseSpin, coarseSpin, gOrder, true, yFloat> GY;

  // ... field members elided in this excerpt ...

  DslashCoarseArg(ColorSpinorField &out, const ColorSpinorField &inA, const ColorSpinorField &inB,
                  const GaugeField &Y, const GaugeField &X, Float kappa, int parity)
    : /* ... */ X(const_cast<GaugeField&>(X)), kappa(kappa), parity(parity),
      nParity(out.SiteSubset()), nFace(1), X0h(((3 - nParity) * out.X(0)) / 2),
      dim{ (3 - nParity) * out.X(0), out.X(1), out.X(2), out.X(3), out.Ndim() == 5 ? out.X(4) : 1 },
      /* ... remaining initializers and constructor body elided in this excerpt ... */
};

/**
   Helper function to determine if we should do halo computation
*/
template <DslashType type>
static __host__ __device__ bool doHalo() {
  // ... body elided in this excerpt ...
}

/**
   Helper function to determine if we should do interior computation
*/
template <DslashType type>
static __host__ __device__ bool doBulk() {
  // ... body elided in this excerpt ...
}
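
// A minimal sketch (not part of this excerpt) of what these two predicates typically reduce to,
// assuming DslashType distinguishes interior, exterior (halo) and full applications; the
// enumerator names below are illustrative rather than quoted from the original header:
//
//   template <DslashType type> static __host__ __device__ bool doHalo() {
//     return (type == DSLASH_EXTERIOR || type == DSLASH_FULL);
//   }
//
//   template <DslashType type> static __host__ __device__ bool doBulk() {
//     return (type == DSLASH_INTERIOR || type == DSLASH_FULL);
//   }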

// dynamic shared memory, used below to stage per-thread partial sums (shared_sum)
extern __shared__ float s[];

template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_stride,
          int thread_dir, int thread_dim, bool dagger, DslashType type, typename Arg>
__device__ __host__ inline void applyDslash(complex<Float> out[], Arg &arg, int x_cb, int src_idx,
                                             int parity, int s_row, int color_block, int color_offset)
{
  const int their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;

  int coord[5];
  getCoordsCB(coord, x_cb, arg.dim, arg.X0h, parity); // recover the site coordinates from the checkerboard index

  // ...

  complex<Float> *shared_sum = (complex<Float>*)s;

  // forward gather - compute the forward neighbor offset for the spinor fetch
  // (per-direction thread guards present in the full source are elided in this excerpt)
  for (int d = thread_dim; d < nDim; d += dim_stride) // loop over dimension
  {
    const int fwd_idx = linkIndexP1(coord, arg.dim, d);

    if ( arg.commDim[d] && (coord[d] + arg.nFace >= arg.dim[d]) ) {
      if (doHalo<type>()) {
        int ghost_idx = ghostFaceIndex<1, 5>(coord, arg.dim, d, arg.nFace);

        for (int color_local = 0; color_local < Mc; color_local++) { // Mc colors per thread
          int c_row = color_block + color_local; // global color row index
          int row = s_row*Nc + c_row;

          for (int s_col = 0; s_col < Ns; s_col++) {

            for (int c_col = 0; c_col < Nc; c_col += color_stride) { // columns split over color_stride lanes
              int col = s_col*Nc + c_col + color_offset;

              if (!dagger)
                out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                  * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              else
                out[color_local] += arg.Y(d, parity, x_cb, row, col)
                  * arg.inA.Ghost(d, 1, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
            }
          }
        }
      }
    }
    else if (doBulk<type>()) {

      for (int color_local = 0; color_local < Mc; color_local++) {
        int c_row = color_block + color_local;
        int row = s_row*Nc + c_row;

        for (int s_col = 0; s_col < Ns; s_col++) {

          for (int c_col = 0; c_col < Nc; c_col += color_stride) {
            int col = s_col*Nc + c_col + color_offset;

            if (!dagger)
              out[color_local] += arg.Y(d+4, parity, x_cb, row, col)
                * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
            else
              out[color_local] += arg.Y(d, parity, x_cb, row, col)
                * arg.inA(their_spinor_parity, fwd_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
          }
        }
      }
    }
  } // nDim

#if defined(__CUDA_ARCH__)
  // non-master threads stage their forward partial sums in shared memory for the master
  // thread to combine below
  if (thread_dim > 0) {

    for (int color_local = 0; color_local < Mc; color_local++) {
      shared_sum[((color_local*blockDim.z + threadIdx.z)*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
    }
  }
#endif

  // backward gather - compute the backward offset for the spinor and gauge fetch
  // (per-direction thread guards present in the full source are elided in this excerpt)
  for (int d = thread_dim; d < nDim; d += dim_stride) // loop over dimension
  {
    const int back_idx = linkIndexM1(coord, arg.dim, d);
    const int gauge_idx = back_idx;

    if ( arg.commDim[d] && (coord[d] - arg.nFace < 0) ) {
      if (doHalo<type>()) {
        const int ghost_idx = ghostFaceIndex<0, 5>(coord, arg.dim, d, arg.nFace);

        for (int color_local = 0; color_local < Mc; color_local++) {
          int c_row = color_block + color_local;
          int row = s_row*Nc + c_row;

          for (int s_col = 0; s_col < Ns; s_col++)

            for (int c_col = 0; c_col < Nc; c_col += color_stride) {
              int col = s_col*Nc + c_col + color_offset;

              if (!dagger)
                out[color_local] += conj(arg.Y.Ghost(d, 1-parity, ghost_idx, col, row))
                  * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
              else
                out[color_local] += conj(arg.Y.Ghost(d+4, 1-parity, ghost_idx, col, row))
                  * arg.inA.Ghost(d, 0, their_spinor_parity, ghost_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
            }
        }
      }
    }
    else if (doBulk<type>()) {

      for (int color_local = 0; color_local < Mc; color_local++) {
        int c_row = color_block + color_local;
        int row = s_row*Nc + c_row;

        for (int s_col = 0; s_col < Ns; s_col++)

          for (int c_col = 0; c_col < Nc; c_col += color_stride) {
            int col = s_col*Nc + c_col + color_offset;

            if (!dagger)
              out[color_local] += conj(arg.Y(d, 1-parity, gauge_idx, col, row))
                * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
            else
              out[color_local] += conj(arg.Y(d+4, 1-parity, gauge_idx, col, row))
                * arg.inA(their_spinor_parity, back_idx + src_idx*arg.volumeCB, s_col, c_col+color_offset);
          }
      }
    }
  } // nDim

#if defined(__CUDA_ARCH__)
  // stage the backward partial sums in shared memory for the master thread to combine
  for (int color_local = 0; color_local < Mc; color_local++) {
    shared_sum[((color_local*blockDim.z + threadIdx.z)*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x] = out[color_local];
  }
#endif

#ifdef __CUDA_ARCH__ // CUDA path has to recombine the forward and backward results
  __syncthreads(); // ensure all staged partial sums are visible

  if (thread_dim == 0 && thread_dir == 0) {

    // gather the remaining forward partial sums (when the dimensions are split over threads)
    for (int d = 1; d < dim_stride; d++) {

      for (int color_local = 0; color_local < Mc; color_local++) {
        out[color_local] +=
          shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 0)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
      }
    }

    // gather all of the backward partial sums
    for (int d = 0; d < dim_stride; d++) {

      for (int color_local = 0; color_local < Mc; color_local++) {
        out[color_local] +=
          shared_sum[(((color_local*blockDim.z/(2*dim_stride) + threadIdx.z/(2*dim_stride)) * 2 * dim_stride + d * 2 + 1)*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x];
      }
    }

    // apply the overall -kappa factor
    for (int color_local = 0; color_local < Mc; color_local++) out[color_local] *= -arg.kappa;
  }
#else // !__CUDA_ARCH__
  for (int color_local = 0; color_local < Mc; color_local++) out[color_local] *= -arg.kappa;
#endif
}
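
// Hedged sketch (not part of the original file): the shared_sum indexing above addresses one
// complex<Float> slot per (color_local, threadIdx.z, threadIdx.y, threadIdx.x), so a launch
// needs roughly this much dynamic shared memory. The helper below is illustrative only; its
// name and placement are not taken from the original source.
#include <cstddef>

template <typename Float>
constexpr size_t coarseDslashSharedBytes(int Mc, int block_x, int block_y, int block_z)
{
  // Mc accumulators per thread, each a complex value (two Float words)
  return static_cast<size_t>(Mc) * block_x * block_y * block_z * 2 * sizeof(Float);
}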

template <typename Float, int Ns, int Nc, int Mc, int color_stride, bool dagger, typename Arg>
__device__ __host__ inline void applyClover(complex<Float> out[], Arg &arg, int x_cb, int src_idx,
                                             int parity, int s, int color_block, int color_offset)
{
  const int spinor_parity = (arg.nParity == 2) ? parity : 0;

  // apply the coarse clover (X) term for the Mc colors handled by this thread
  for (int color_local = 0; color_local < Mc; color_local++) {
    int c = color_block + color_local; // global color index
    int row = s*Nc + c;

    for (int s_col = 0; s_col < Ns; s_col++)

      for (int c_col = 0; c_col < Nc; c_col += color_stride) {
        int col = s_col*Nc + c_col + color_offset;

        if (!dagger)
          out[color_local] += arg.X(0, parity, x_cb, row, col)
            * arg.inB(spinor_parity, x_cb + src_idx*arg.volumeCB, s_col, c_col+color_offset);
        else
          out[color_local] += conj(arg.X(0, parity, x_cb, col, row))
            * arg.inB(spinor_parity, x_cb + src_idx*arg.volumeCB, s_col, c_col+color_offset);
      }
  }
}
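
// Summary of what the routines above compute, inferred from the accessors used rather than
// quoted from the original source: per site x and per coarse spin/color component,
//
//   out(x) = -kappa * sum_d [ Y(d+4, x) * in(x + d_hat)  +  Y(d, x - d_hat)^dagger * in(x - d_hat) ]
//            + X(x) * in(x)                     (clover term, applied only when doBulk<type>())
//
// with the d and d+4 link slots (and the transposition of X) exchanged when dagger is set.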

template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split,
          bool dslash, bool clover, bool dagger, DslashType type, int dir, int dim, typename Arg>
__device__ __host__ inline void coarseDslash(Arg &arg, int x_cb, int src_idx, int parity, int s,
                                             int color_block, int color_offset)
{
  vector_type<complex<Float>, Mc> out; // Mc complex accumulators for this thread (zero-initialized)
  if (dslash) applyDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dir,dim,dagger,type>(out.data, arg, x_cb, src_idx, parity, s, color_block, color_offset);

  // the clover term is applied once only, by the dir==0, dim==0 threads, and only on the bulk
  if (doBulk<type>() && clover && dir==0 && dim==0) applyClover<Float,Ns,Nc,Mc,color_stride,dagger>(out.data, arg, x_cb, src_idx, parity, s, color_block, color_offset);

  if (dir==0 && dim==0) {
    const int my_spinor_parity = (arg.nParity == 2) ? parity : 0;

#if __CUDA_ARCH__ >= 300 // only have warp shuffle on Kepler and above
    // reduce the color_stride-split partial sums back onto the color_offset==0 lanes
    for (int color_local = 0; color_local < Mc; color_local++) {
      constexpr int warp_size = 32;
      for (int offset = warp_size/2; offset >= warp_size/color_stride; offset /= 2)
#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000)
#define WARP_CONVERGED 0xffffffff // we know the warp should be converged here
        out[color_local] += __shfl_down_sync(WARP_CONVERGED, out[color_local], offset);
#else
        out[color_local] += __shfl_down(out[color_local], offset);
#endif
    }
#endif // __CUDA_ARCH__ >= 300

    for (int color_local = 0; color_local < Mc; color_local++) {
      int c = color_block + color_local; // global color index
      if (color_offset == 0) {
        // on the interior we overwrite the output, on the halo we accumulate into it
        if (doBulk<type>()) arg.out(my_spinor_parity, x_cb + src_idx*arg.volumeCB, s, c) = out[color_local];
        else                arg.out(my_spinor_parity, x_cb + src_idx*arg.volumeCB, s, c) += out[color_local];
      }
    }
  }
}
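
// Hedged, self-contained illustration (not part of the original file) of the warp-shuffle
// reduction pattern used above: color_stride lanes hold partial sums for the same site,
// spaced warp_size/color_stride lanes apart, and __shfl_down_sync folds them onto the lanes
// with color_offset == 0. All names below are illustrative.
#include <cstdio>
#include <cuda_runtime.h>

template <int color_stride>
__global__ void reducePartialSums(const float *partial, float *site_sum)
{
  constexpr int warp_size = 32;
  constexpr unsigned full_mask = 0xffffffff; // the warp is converged at this point

  float v = partial[threadIdx.x];

  // same loop shape as in coarseDslash: fold the strided lane groups together
  for (int offset = warp_size / 2; offset >= warp_size / color_stride; offset /= 2)
    v += __shfl_down_sync(full_mask, v, offset);

  constexpr int vector_site_width = warp_size / color_stride;
  if (threadIdx.x < vector_site_width) // the color_offset == 0 lanes now hold the full sums
    site_sum[threadIdx.x] = v;
}

int main()
{
  constexpr int color_stride = 4, warp_size = 32;
  float h_in[warp_size], h_out[warp_size / color_stride];
  for (int i = 0; i < warp_size; i++) h_in[i] = 1.0f; // every site sum should equal color_stride

  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  reducePartialSums<color_stride><<<1, warp_size>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int i = 0; i < warp_size / color_stride; i++) printf("site %d sum = %f\n", i, h_out[i]);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}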

// host (CPU) variant: plain loops over parity, source index, sites, spins and color blocks
template <typename Float, int nDim, int Ns, int Nc, int Mc,
          bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
void coarseDslash(Arg arg)
{
  // the fine-grained thread-splitting parameters are trivial on the CPU
  const int color_stride = 1;
  const int color_offset = 0;
  const int dim_thread_split = 1;
  const int dir = 0;
  const int dim = 0;

  for (int parity_ = 0; parity_ < arg.nParity; parity_++) {
    // for full fields take the parity from the loop, otherwise use the setting in arg
    int parity = (arg.nParity == 2) ? parity_ : arg.parity;

    for (int src_idx = 0; src_idx < arg.dim[4]; src_idx++) {

      for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) { // 4-d checkerboard volume
        for (int s = 0; s < 2; s++) {
          for (int color_block = 0; color_block < Nc; color_block += Mc) { // Mc == Nc means all colors in one pass
            coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,dir,dim>
              (arg, x_cb, src_idx, parity, s, color_block, color_offset);
          }
        }
      }
    }
  }
}

// GPU kernel for applying the coarse Dslash to a vector
template <typename Float, int nDim, int Ns, int Nc, int Mc, int color_stride, int dim_thread_split,
          bool dslash, bool clover, bool dagger, DslashType type, typename Arg>
__global__ void coarseDslashKernel(Arg arg)
{
  constexpr int warp_size = 32;
  const int lane_id = threadIdx.x % warp_size;
  const int warp_id = threadIdx.x / warp_size;
  const int vector_site_width = warp_size / color_stride;

  // color_stride lanes cooperate on each site, so each warp covers warp_size/color_stride sites
  int x_cb = blockIdx.x*(blockDim.x/color_stride) + warp_id*(warp_size/color_stride) + lane_id % vector_site_width;

  const int color_offset = lane_id / vector_site_width;

#if 0 // disable multi-src since this has a measurable impact on single-src performance
  int paritySrc = blockDim.y*blockIdx.y + threadIdx.y;
  if (paritySrc >= arg.nParity * arg.dim[4]) return;
  const int src_idx = (arg.nParity == 2) ? paritySrc / 2 : paritySrc;
  const int parity = (arg.nParity == 2) ? paritySrc % 2 : arg.parity;
#else
  const int src_idx = 0;
  const int parity = (arg.nParity == 2) ? blockDim.y*blockIdx.y + threadIdx.y : arg.parity;
#endif

  // the z thread dimension packs (spin, color block, split dimension, direction)
  int sMd = blockDim.z*blockIdx.z + threadIdx.z;
  int dir = sMd & 1;
  int sMdim = sMd >> 1;
  int dim = sMdim % dim_thread_split;
  int sM = sMdim / dim_thread_split;
  int s = sM / (Nc/Mc);
  int color_block = (sM % (Nc/Mc)) * Mc;

  if (x_cb >= arg.volumeCB) return;

  if (dir == 0) {
    if (dim == 0)      coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,0,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
  } else if (dir == 1) {
    if (dim == 0)      coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,0>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 1) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,1>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 2) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,2>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
    else if (dim == 3) coarseDslash<Float,nDim,Ns,Nc,Mc,color_stride,dim_thread_split,dslash,clover,dagger,type,1,3>(arg, x_cb, src_idx, parity, s, color_block, color_offset);
  }
}
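
// Hedged, self-contained check (not part of the original file) of the z-thread index packing
// decoded in coarseDslashKernel above: sMd packs (spin s, color block, split dimension, direction) as
//   sMd = ((s*(Nc/Mc) + color_block/Mc)*dim_thread_split + dim)*2 + dir
// and the decode recovers each field exactly as the kernel does. Parameter values are illustrative.
#include <cassert>

int main()
{
  constexpr int Nc = 24, Mc = 8;      // illustrative coarse color count / colors per thread
  constexpr int dim_thread_split = 2; // illustrative dimension split

  for (int s = 0; s < 2; s++)
    for (int color_block = 0; color_block < Nc; color_block += Mc)
      for (int dim = 0; dim < dim_thread_split; dim++)
        for (int dir = 0; dir < 2; dir++) {
          int sMd = ((s*(Nc/Mc) + color_block/Mc)*dim_thread_split + dim)*2 + dir;

          // decode, mirroring coarseDslashKernel
          int dir_out = sMd & 1;
          int sMdim = sMd >> 1;
          int dim_out = sMdim % dim_thread_split;
          int sM = sMdim / dim_thread_split;
          int s_out = sM / (Nc/Mc);
          int color_block_out = (sM % (Nc/Mc)) * Mc;

          assert(dir_out == dir && dim_out == dim && s_out == s && color_block_out == color_block);
        }
  return 0;
}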