// Kernel-argument struct passed by value to the shift kernels.  Bundles
// the local lattice geometry, the ghost-zone offset, the requested
// translation (parity / direction / signed shift) and the input/output
// field accessors.
// NOTE(review): reconstructed from a garbled extraction; the member list
// is inferred from the constructor signature and from the fields the
// kernels read (length, X, ghostOffset, parity, dir, shift, partitioned,
// in, out) — confirm against the original file.
template<typename Output, typename Input>
struct ShiftColorSpinorFieldArg {
  const unsigned int length;       // number of checkerboard sites to process
  unsigned int X[4];               // local lattice dimensions
  const unsigned int ghostOffset;  // BUG FIX: original declared this "const usigned int"
  const unsigned int parity;       // checkerboard parity of the output sites
  const unsigned int dir;          // shift direction (0..3)
  const int shift;                 // signed shift distance along dir
  bool partitioned[4];             // which dimensions are partitioned across ranks
  Input in;                        // source-field accessor
  Output out;                      // destination-field accessor

  ShiftColorSpinorFieldArg(const unsigned int length,
                           const unsigned int X[4],
                           const unsigned int ghostOffset,
                           const unsigned int parity,
                           const unsigned int dir,
                           const int shift,
                           const Input& in,
                           const Output& out)
    : length(length), ghostOffset(ghostOffset),
      parity(parity), dir(dir), shift(shift), in(in), out(out)
  {
    for(int i=0; i<4; ++i) this->X[i] = X[i];
    // NOTE(review): the initialization of partitioned[] is not visible in
    // this chunk; defaulting to "not partitioned" here.  The original most
    // likely queried commDimPartitioned(i) — confirm before use.
    for(int i=0; i<4; ++i) this->partitioned[i] = false;
  }
};
// Return the checkerboard index of the site displaced by shift[] from
// the site whose checkerboard index is cb_idx, or -1 when the displaced
// site falls outside the local lattice extents X1..X4 (constants defined
// elsewhere in QUDA).
// NOTE(review): the `partitioned` parameter is unused in the visible
// body; the original may have guarded the range checks with it — confirm
// against the full file.
template<IndexType idxType, typename Int>
__device__ __forceinline__
int neighborIndex(const unsigned int& cb_idx, const int (&shift)[4],
                  const bool (&partitioned)[4], const unsigned int& parity)
{
  int full_idx;
  int x, y, z, t;

  // Recover the full 4-d coordinates of this checkerboard site.
  coordsFromIndex(full_idx, x, y, z, t, cb_idx, parity);

  // A displaced coordinate outside the local volume is signalled with -1
  // so the caller can skip it (such sites are serviced from the ghost
  // zone by the "external" kernel).
  if( (x+shift[0])<0 || (x+shift[0])>=X1 ) return -1;
  if( (y+shift[1])<0 || (y+shift[1])>=X2 ) return -1;
  if( (z+shift[2])<0 || (z+shift[2])>=X3 ) return -1;
  // BUG FIX: the original tested (z+shift[3]) against X4 here, i.e. it
  // re-checked the z coordinate against the t extent and never bounds-
  // checked t at all.  The fourth shift component belongs to t.
  if( (t+shift[3])<0 || (t+shift[3])>=X4 ) return -1;

  // Periodic wrap of each displaced coordinate; the "+ Xn" keeps the
  // dividend non-negative when the shift is negative, so % is safe.
  x = shift[0] ? (x + shift[0] + X1) % X1 : x;
  y = shift[1] ? (y + shift[1] + X2) % X2 : y;
  z = shift[2] ? (z + shift[2] + X3) % X3 : z;
  t = shift[3] ? (t + shift[3] + X4) % X4 : t;

  // Lexicographic full-lattice index, folded to a checkerboard index.
  return (((t*X3 + z)*X2 + y)*X1 + x) >> 1;
}
// Fragment of the interior-shift kernel
// (shiftColorSpinorFieldKernel(ShiftQuarkArg<Output, Input> arg)).
// Each thread walks the output sites and pulls the value from the
// neighbouring site displaced by arg.shift along arg.dir.
// NOTE(review): this is a garbled extraction — the kernel's signature
// line, the declaration of the register array `x`, the store into
// arg.out and the loop's stride increment are missing from this chunk.
// Restore them from the original file before compiling.
67 template <
typename FloatN,
int N,
typename Output,
typename Input>
// Build a 4-vector displacement: zero everywhere except the shift
// direction requested in the kernel arguments.
70 int shift[4] = {0,0,0,0};
71 shift[arg.dir] = arg.shift;
// Flat global thread id and total thread count (gridSize is presumably
// the grid-stride step; the increment itself is not visible here).
73 unsigned int idx = blockIdx.x*(blockDim.x) + threadIdx.x;
74 unsigned int gridSize = gridDim.x*blockDim.x;
77 while(idx<arg.length){
// neighborIndex returns -1 for sites whose neighbour lies outside the
// local volume; presumably such sites are skipped here — confirm.
78 const int new_idx =
neighborIndex(idx, shift, arg.partitioned, arg.parity);
// NOTE(review): `x` (presumably FloatN x[N]) is declared on a line not
// visible in this chunk — confirm.
82 arg.in.load(
x, new_idx);
// Fragment of the exterior-shift kernel
// (shiftColorSpinorFieldExternalKernel(ShiftQuarkArg<Output, Input> arg)).
// Services the sites whose shifted neighbour lives in the ghost zone:
// it recomputes the site's coordinates, maps them to a ghost-buffer
// index, loads from there and stores to the local site.
// NOTE(review): garbled extraction — the kernel signature line, the
// declaration of `x`, and the loop's stride increment are missing from
// this chunk.  Restore from the original file before compiling.
92 template<
typename FloatN,
int N,
typename Output,
typename Input>
// Flat global thread id and total thread count.
95 unsigned int idx = blockIdx.x*(blockDim.x) + threadIdx.x;
96 unsigned int gridSize = gridDim.x*blockDim.x;
// Scratch space for the site's 4-d coordinates.
99 unsigned int coord[4];
100 while(idx<arg.length){
// Recover the coordinates of this checkerboard site.
103 coordsFromIndex<1>(coord,
idx, arg.X, arg.dir, arg.parity);
// Index into the ghost buffer: base offset plus the position of this
// boundary site within the ghost zone for this direction/shift.
105 unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<3,3>(arg.X, coord, arg.dir, arg.shift);
// NOTE(review): `x` (presumably FloatN x[N]) is declared on a line not
// visible in this chunk — confirm.
107 arg.in.load(
x, ghost_idx);
108 arg.out.save(
x, idx);
// Template header of the ShiftColorSpinorField tuning class (a Tunable
// wrapper around the shift kernels; its class-declaration line is not
// visible in this chunk).
117 template<
typename Output,
typename Input>
// Tuning interface: the shift kernels use no per-thread shared memory.
int sharedBytesPerThread() const { return 0; }
// Tuning interface: the shift kernels use no per-block shared memory.
// BUG FIX: the original declared this method "cont" instead of "const".
int sharedBytesPerBlock(const TuneParam &) const { return 0; }
// Fragment of the autotuner's block-dimension advance: grow the block
// size by `step` until device limits are hit, then fall back to the
// minimum block size that still covers arg.length sites.
// NOTE(review): garbled extraction — the definition of `step`
// (presumably the warp size) and the function's return statements are
// on lines missing from this chunk; "¶m" below is an HTML-entity
// mangling of "&param".  Restore from the original file.
130 bool advanceBlockDim(
TuneParam ¶m)
const
// Device limits for a 1-D launch.
132 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
133 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
// Hard-coded shared-memory budget (16 KB) — presumably a conservative
// portable limit rather than a queried device property; confirm.
134 const unsigned int max_shared = 16384;
136 const int threads = arg.length;
// Try the next candidate block size.
139 param.
block.x += step;
140 if(param.
block.x > max_threads || sharedBytesPerThread()*param.
block.x > max_shared){
// Exhausted: reset to the smallest block count that covers all sites,
// rounded up to a multiple of step.
141 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
142 param.
block.x = ((param.
block.x+step-1)/step)*step;
143 if(param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
// Tail of the ShiftColorSpinorField constructor's member-initializer
// list (the constructor signature is on a line missing from this chunk).
156 : arg(arg), location(location) {}
// CPU branch of apply(): a host implementation does not exist, so the
// CPU field location is rejected outright.
167 errorQuda(
"ShiftColorSpinorField is not yet implemented on the CPU\n");
// Fragment of initTuneParam / defaultTuneParam: choose the starting
// launch configuration — the smallest block size (rounded up to a
// multiple of `step`) whose grid covers arg.length sites.
// NOTE(review): garbled extraction — the function signature and the
// definition of `step` are on lines missing from this chunk.
176 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
177 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
178 const int threads = arg.length;
// Minimum threads per block so that max_blocks blocks cover all sites.
180 param.
block = dim3((threads+max_blocks-1)/max_blocks, 1, 1);
// Round the block size up to a multiple of step.
181 param.
block.x = ((param.
block.x+step-1) / step) * step;
182 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
// Shared-memory request: the larger of the per-thread and per-block
// requirements (both are 0 for this kernel).
184 param.
shared_bytes = sharedBytesPerThread()*param.
block.x > sharedBytesPerBlock(param) ?
185 sharedBytesPerThread()*param.
block.x : sharedBytesPerBlock(param);
// Performance accounting is not implemented for this utility kernel;
// report zero flops and zero bytes to the tuner.
long long flops() const { return 0; }
long long bytes() const { return 0; }
// Fragment of tuneKey(): build the (volume, type-name, aux) triple the
// autotuner uses to identify this kernel instance in its cache.
// NOTE(review): garbled extraction — the lines that fill `vol` (the
// volume string) are missing from this chunk; also note the second
// `aux` insertion below starts with "stride=" and not ",stride=", which
// may be a missing separator — confirm against the original file.
197 std::stringstream vol,
aux;
// Thread count (both parities) and precision (bytes per real number,
// i.e. half the size of a Complex).
202 aux <<
"threads=" << 2*arg.in.volumeCB <<
",prec=" <<
sizeof(
Complex)/2;
203 aux <<
"stride=" << arg.in.stride;
204 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
// Fragments of the host driver shiftColorSpinorField(dst, src, parity,
// dim, shift): validate arguments, then overlap halo communication with
// the interior-shift kernel before running the exterior kernel.
// NOTE(review): garbled extraction — most of the function body (field
// checks, argument construction, kernel instantiation, event recording,
// polling loops) is on lines missing from this chunk.
// Guard: in-place shift is not supported.
213 errorQuda(
"destination field is the same as source field\n");
// Guard: src and dst must share the same site subset.
220 errorQuda(
"Spinor fields do not have matching subsets\n");
// Comms channel selection: forward shifts use the even channel of the
// (dim, direction) pair, backward shifts the odd one.
235 const int offset = (shift>0) ? 0 : 1;
// Only staggered (Nspin == 1) fields are handled.
240 if(src.
Nspin() == 1){
// Pack and gather the boundary slice needed by the neighbour rank.
251 face->gather(src,
dagger, 2*dim+offset, 1);
// Interior sites: argument 0 presumably selects the interior kernel —
// confirm against apply()'s definition.
256 shiftColorSpinor.apply(0);
// Poll the gather event; start communications once the gather is done.
261 cudaError_t eventQuery = cudaEventQuery(
gatherEnd);
262 if(eventQuery == cudaSuccess){
263 face->commsStart(2*dim + offset);
// Once comms complete, scatter the received halo into the ghost zone.
271 if(face->commsQuery(2*dim + offset)){
272 face->scatter(src,
dagger, 2*dim+offset, 1);
// Exterior sites: argument 1 presumably selects the ghost-zone kernel —
// confirm against apply()'s definition.
278 shiftColorSpinor.apply(1);
283 errorQuda(
"Only staggered fermions are currently supported\n");
// Same staggered-only restriction on the non-partitioned path.
286 if(src.
Nspin() == 1 ){
292 errorQuda(
"Only staggered fermions are currently supported\n");
__device__ __forceinline__ int neighborIndex(const unsigned int &cb_idx, const int(&shift)[4], const bool(&partitioned)[4], const unsigned int &parity)
ShiftColorSpinorField(const ShiftColorSpinorField< Output, Input > &arg, QudaFieldLocation location)
void apply(const cudaStream_t &stream)
int commDimPartitioned(int dir)
cudaDeviceProp deviceProp
QudaVerbosity getVerbosity()
const unsigned int length
std::complex< double > Complex
void shiftColorSpinorField(cudaColorSpinorField &dst, const cudaColorSpinorField &src, const unsigned int parity, const unsigned int dim, const int shift)
__global__ void const FloatN FloatM FloatM Float Float int threads
virtual void initTuneParam(TuneParam ¶m) const
__global__ void shiftColorSpinorFieldKernel(ShiftQuarkArg< Output, Input > arg)
cudaColorSpinorField & Odd() const
__global__ void shiftColorSpinorFieldExternalKernel(ShiftQuarkArg< Output, Input > arg)
const QudaFieldLocation location
cudaEvent_t packEnd[Nstream]
const unsigned int parity
FloatingPoint< float > Float
ShiftColorSpinorFieldArg(const unsigned int length, const unsigned int X[4], const unsigned int ghostOffset, const unsigned int parity, const unsigned int dir, const int shift, const Input &in, const Output &out)
void defaultTuneParam(TuneParam ¶m) const
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
cudaEvent_t gatherEnd[Nstream]
enum QudaFieldLocation_s QudaFieldLocation
QudaPrecision Precision() const
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
virtual ~ShiftColorSpinorField()
cudaEvent_t scatterEnd[Nstream]
QudaSiteSubset SiteSubset() const
cudaColorSpinorField & Even() const