8 template<
typename Output,
typename Input>
13 const usigned
int ghostOffset;
16 const unsigned int dir;
22 const unsigned int X[4],
23 const unsigned int ghostOffset,
24 const unsigned int parity,
25 const unsigned int dir,
28 const Output& out) : length(length),
30 ghostOffset(ghostOffset),
32 parity(parity), dir(dir), shift(shift), in(in), out(out)
34 for(
int i=0; i<4; ++i) this->X[i] = X[i];
39 template<IndexType
idxType,
typename Int>
40 __device__ __forceinline__
63 return (((t*
X3 + z)*
X2 + y)*
X1 + x) >> 1;
67 template <
typename FloatN,
int N,
typename Output,
typename Input>
70 int shift[4] = {0,0,0,0};
71 shift[arg.dir] = arg.shift;
73 unsigned int idx = blockIdx.x*(blockDim.x) + threadIdx.x;
74 unsigned int gridSize = gridDim.x*blockDim.x;
77 while(idx<arg.length){
78 const int new_idx =
neighborIndex(idx, shift, arg.partitioned, arg.parity);
82 arg.in.load(x, new_idx);
92 template<
typename FloatN,
int N,
typename Output,
typename Input>
95 unsigned int idx = blockIdx.x*(blockDim.x) + threadIdx.x;
96 unsigned int gridSize = gridDim.x*blockDim.x;
99 unsigned int coord[4];
100 while(idx<arg.length){
103 coordsFromIndex<1>(coord, idx, arg.X, arg.dir, arg.parity);
105 unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<3,3>(arg.X, coord, arg.dir, arg.shift);
107 arg.in.load(x, ghost_idx);
108 arg.out.save(x, idx);
117 template<
typename Output,
typename Input>
132 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
133 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
134 const unsigned int max_shared = 16384;
136 const int threads = arg.
length;
139 param.
block.x += step;
140 if(param.
block.x > max_threads || sharedBytesPerThread()*param.
block.x > max_shared){
141 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
142 param.
block.x = ((param.
block.x+step-1)/step)*step;
143 if(param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
156 : arg(arg), location(location) {}
167 errorQuda(
"ShiftColorSpinorField is not yet implemented on the CPU\n");
173 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
174 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
175 const int threads = arg.
length;
177 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
178 param.
block.x = ((param.
block.x+step-1) / step) * step;
179 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
181 param.
shared_bytes = sharedBytesPerThread()*param.
block.x > sharedBytesPerBlock(param) ?
182 sharedBytesPerThread()*param.
block.x : sharedBytesPerBlock(param);
187 initTuneParam(param);
190 long long flops()
const {
return 0; }
191 long long bytes()
const {
return 0; }
194 std::stringstream vol, aux;
199 aux <<
"threads=" << 2*arg.
in.volumeCB <<
",prec=" <<
sizeof(
Complex)/2;
200 aux <<
"stride=" << arg.
in.stride;
201 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
210 errorQuda(
"destination field is the same as source field\n");
217 errorQuda(
"Spinor fields do not have matching subsets\n");
232 const int offset = (shift>0) ? 0 : 1;
237 if(src.
Nspin() == 1){
248 face->gather(src,
dagger, 2*dim+offset, 1);
253 shiftColorSpinor.apply(0);
258 cudaError_t eventQuery = cudaEventQuery(
gatherEnd);
259 if(eventQuery == cudaSuccess){
260 face->commsStart(2*dim + offset);
268 if(face->commsQuery(2*dim + offset)){
269 face->scatter(src,
dagger, 2*dim+offset, 1);
275 shiftColorSpinor.apply(1);
280 errorQuda(
"Only staggered fermions are currently supported\n");
283 if(src.
Nspin() == 1 ){
289 errorQuda(
"Only staggered fermions are currently supported\n");
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
int commDimPartitioned(int dir)
ShiftColorSpinorField(const ShiftColorSpinorField< Output, Input > &arg, QudaFieldLocation location)
void apply(const cudaStream_t &stream)
cudaDeviceProp deviceProp
cudaError_t qudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
Wrapper around cudaEventRecord or cuEventRecord.
QudaVerbosity getVerbosity()
const unsigned int length
void shiftColorSpinorField(cudaColorSpinorField &dst, const cudaColorSpinorField &src, const unsigned int parity, const unsigned int dim, const int shift)
virtual void initTuneParam(TuneParam ¶m) const
const ColorSpinorField & Even() const
const ColorSpinorField & Odd() const
__global__ void shiftColorSpinorFieldKernel(ShiftQuarkArg< Output, Input > arg)
void defaultTuneParam(TuneParam ¶m) const
__global__ void shiftColorSpinorFieldExternalKernel(ShiftQuarkArg< Output, Input > arg)
int sharedBytesPerThread() const
const unsigned int parity
ShiftColorSpinorFieldArg(const unsigned int length, const unsigned int X[4], const unsigned int ghostOffset, const unsigned int parity, const unsigned int dir, const int shift, const Input &in, const Output &out)
bool advanceGridDim(TuneParam ¶m) const
static __device__ __forceinline__ void coordsFromIndex(int &idx, T *x, int &cb_idx, const Param ¶m)
Compute coordinates from index into the checkerboard (used by the interior Dslash kernels)...
QudaSiteSubset SiteSubset() const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
int sharedBytesPerBlock(const TuneParam &) cont
std::complex< double > Complex
ShiftColorSpinorFieldArg< Output, Input > arg
enum QudaFieldLocation_s QudaFieldLocation
bool advanceBlockDim(TuneParam ¶m) const
cudaEvent_t scatterEnd[Nstream]
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
cudaError_t qudaEventRecord(cudaEvent_t &event, cudaStream_t stream=0)
Wrapper around cudaEventRecord or cuEventRecord.
virtual ~ShiftColorSpinorField()
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
cudaEvent_t gatherEnd[Nstream]