QUDA  1.0.0
shift_quark_field.cu
#include <cstdio>
#include <cstdlib>
#include <cuda.h>
#include <quda_internal.h>

namespace quda {

  template<typename Output, typename Input>
  struct ShiftColorSpinorFieldArg {
    const unsigned int length;
    unsigned int X[4];
#ifdef MULTI_GPU
    const unsigned int ghostOffset; // depends on the direction
#endif
    const unsigned int parity;
    const unsigned int dir;
    bool partitioned[4];
    const int shift;
    Input in;
    Output out;
    ShiftColorSpinorFieldArg(const unsigned int length,
                             const unsigned int X[4],
                             const unsigned int ghostOffset,
                             const unsigned int parity,
                             const unsigned int dir,
                             const int shift,
                             const Input& in,
                             const Output& out) : length(length),
#ifdef MULTI_GPU
      ghostOffset(ghostOffset),
#endif
      parity(parity), dir(dir), shift(shift), in(in), out(out)
    {
      for(int i=0; i<4; ++i) this->X[i] = X[i];
      for(int i=0; i<4; ++i) partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };

  template<IndexType idxType, typename Int>
  __device__ __forceinline__
  int neighborIndex(const unsigned int& cb_idx, const int (&shift)[4], const bool (&partitioned)[4], const unsigned int& parity){

    int idx;
    Int x, y, z, t;

    coordsFromIndex(idx, x, y, z, t, cb_idx, parity);

#ifdef MULTI_GPU
    // a shift that crosses a partitioned boundary lands in the ghost zone
    if(partitioned[0])
      if( (x+shift[0])<0 || (x+shift[0])>=X1 ) return -1;
    if(partitioned[1])
      if( (y+shift[1])<0 || (y+shift[1])>=X2 ) return -1;
    if(partitioned[2])
      if( (z+shift[2])<0 || (z+shift[2])>=X3 ) return -1;
    if(partitioned[3])
      if( (t+shift[3])<0 || (t+shift[3])>=X4 ) return -1;
#endif

    // apply the shift with periodic wrap-around in each direction
    x = shift[0] ? (x + shift[0] + X1) % X1 : x;
    y = shift[1] ? (y + shift[1] + X2) % X2 : y;
    z = shift[2] ? (z + shift[2] + X3) % X3 : z;
    t = shift[3] ? (t + shift[3] + X4) % X4 : t;
    // lexicographic full-lattice index, halved to give the checkerboard index
    return (((t*X3 + z)*X2 + y)*X1 + x) >> 1;
  }
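
As a sanity check on the index arithmetic above, the periodic wrap and checkerboard packing can be exercised on the host. The following is a hypothetical standalone analogue (lattice dimensions passed as an array in place of the globals X1..X4, single node, so no ghost-zone test); a sketch, not part of this file:

  #include <cstdio>

  // host-side analogue of the wrap/checkerboard math in neighborIndex
  static int neighborIndexHost(int x, int y, int z, int t,
                               const int (&shift)[4], const int (&X)[4]) {
    x = (x + shift[0] + X[0]) % X[0]; // periodic wrap in each direction
    y = (y + shift[1] + X[1]) % X[1];
    z = (z + shift[2] + X[2]) % X[2];
    t = (t + shift[3] + X[3]) % X[3];
    // lexicographic full-lattice index, halved to give the checkerboard index
    return (((t*X[2] + z)*X[1] + y)*X[0] + x) >> 1;
  }

  int main() {
    const int X[4] = {8, 8, 8, 16};    // hypothetical 8^3 x 16 local lattice
    const int shift[4] = {0, 0, 0, 1}; // shift by +1 in t
    printf("%d\n", neighborIndexHost(0, 0, 0, 15, shift, X)); // t wraps to 0: prints 0
    return 0;
  }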


  template <typename FloatN, int N, typename Output, typename Input>
  __global__ void shiftColorSpinorFieldKernel(ShiftColorSpinorFieldArg<Output,Input> arg){

    int shift[4] = {0,0,0,0};
    shift[arg.dir] = arg.shift;

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int gridSize = gridDim.x*blockDim.x;

    FloatN x[N];
    while(idx<arg.length){
      // EVEN_X/int are given explicitly since they cannot be deduced from the arguments
      const int new_idx = neighborIndex<EVEN_X,int>(idx, shift, arg.partitioned, arg.parity);
#ifdef MULTI_GPU
      if(new_idx >= 0){ // a negative index means the neighbor is off-node
#endif
        arg.in.load(x, new_idx);
        arg.out.save(x, idx);
#ifdef MULTI_GPU
      }
#endif
      idx += gridSize;
    }
    return;
  }
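
Both kernels here use a grid-stride loop, so one launch of any geometry covers all arg.length sites. A generic standalone illustration of the pattern (plain CUDA; none of these names come from QUDA):

  #include <cstdio>

  // each thread handles elements idx, idx+gridSize, idx+2*gridSize, ...
  __global__ void copyKernel(const float *in, float *out, unsigned int n) {
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int gridSize = gridDim.x*blockDim.x;
    while (idx < n) {
      out[idx] = in[idx];
      idx += gridSize;
    }
  }

  int main() {
    const unsigned int n = 1 << 20;
    float *in, *out;
    cudaMalloc(&in, n*sizeof(float));
    cudaMalloc(&out, n*sizeof(float));
    copyKernel<<<64, 256>>>(in, out, n); // 16384 threads stride over 2^20 elements
    cudaDeviceSynchronize();
    cudaFree(in); cudaFree(out);
    printf("done\n");
    return 0;
  }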

  template<typename FloatN, int N, typename Output, typename Input>
  __global__ void shiftColorSpinorFieldExternalKernel(ShiftColorSpinorFieldArg<Output,Input> arg){

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int gridSize = gridDim.x*blockDim.x;

    FloatN x[N];
    unsigned int coord[4];
    while(idx<arg.length){

      // compute the coordinates in the ghost zone
      coordsFromIndex<1>(coord, idx, arg.X, arg.dir, arg.parity);

      unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<3,3>(arg.X, coord, arg.dir, arg.shift);

      arg.in.load(x, ghost_idx);
      arg.out.save(x, idx);

      idx += gridSize;
    }

    return;
  }

  template<typename FloatN, int N, typename Output, typename Input>
  class ShiftColorSpinorField : public Tunable {
    // FloatN and N (the register word type and count used by the kernels) are
    // assumed template parameters, added so the kernel launch in apply() is
    // well-formed.

    private:
      ShiftColorSpinorFieldArg<Output,Input> arg; // assumed member; required by the constructor
      const QudaFieldLocation location;           // assumed member; required by the constructor
      const int *X; // pointer to lattice dimensions

      int sharedBytesPerThread() const { return 0; }
      int sharedBytesPerBlock(const TuneParam &) const { return 0; }

      // don't tune the grid dimension
      bool advanceGridDim(TuneParam &param) const { return false; }

      bool advanceBlockDim(TuneParam &param) const
      {
        const unsigned int max_threads = deviceProp.maxThreadsDim[0];
        const unsigned int max_blocks = deviceProp.maxGridSize[0];
        const unsigned int max_shared = 16384;
        const int step = deviceProp.warpSize;
        const int threads = arg.length;
        bool ret;

        param.block.x += step;
        if(param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared){
          param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough given the limit on gridDim
          param.block.x = ((param.block.x+step-1)/step)*step; // round up to a whole number of warps
          if(param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
          ret = false;
        }else{
          ret = true;
        }
        param.grid = dim3((threads+param.block.x-1)/param.block.x,1,1);
        return ret;
      }

    public:
      ShiftColorSpinorField(ShiftColorSpinorFieldArg<Output,Input> &arg,
                            QudaFieldLocation location)
        : arg(arg), location(location) {}
      virtual ~ShiftColorSpinorField() {}

      void apply(const cudaStream_t &stream){
        if(location == QUDA_CUDA_FIELD_LOCATION){
          TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
          shiftColorSpinorFieldKernel<FloatN,N,Output,Input><<<tp.grid,tp.block,tp.shared_bytes,stream>>>(arg);
#ifdef MULTI_GPU
          // TODO: perform the halo communication and call the exterior kernel
#endif
        }else{ // run the CPU code
          errorQuda("ShiftColorSpinorField is not yet implemented on the CPU\n");
        }
      } // apply

      virtual void initTuneParam(TuneParam &param) const
      {
        const unsigned int max_threads = deviceProp.maxThreadsDim[0];
        const unsigned int max_blocks = deviceProp.maxGridSize[0];
        const int threads = arg.length;
        const int step = deviceProp.warpSize;
        param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
        param.block.x = ((param.block.x+step-1)/step)*step; // round up to the nearest multiple of "step"
        if(param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
        param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
        param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
          sharedBytesPerThread()*param.block.x : sharedBytesPerBlock(param);
      }

      /** sets default values for when tuning is disabled */
      virtual void defaultTuneParam(TuneParam &param) const {
        initTuneParam(param);
      }

      long long flops() const { return 0; } // FIXME
      long long bytes() const { return 0; } // FIXME

      TuneKey tuneKey() const {
        std::stringstream vol, aux;
        vol << X[0] << "x";
        vol << X[1] << "x";
        vol << X[2] << "x";
        vol << X[3];
        aux << "threads=" << 2*arg.in.volumeCB << ",prec=" << sizeof(Complex)/2;
        aux << ",stride=" << arg.in.stride;
        return TuneKey(vol.str(), typeid(*this).name(), aux.str());
      }
  };
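
The launch geometry chosen by initTuneParam and advanceBlockDim is plain round-up-to-multiple arithmetic. A minimal standalone illustration with hypothetical numbers (not QUDA API):

  #include <cstdio>

  int main() {
    const int threads = 5000;     // stand-in for arg.length
    const int max_blocks = 65535; // stand-in for deviceProp.maxGridSize[0]
    const int step = 32;          // warp size
    int block = (threads + max_blocks - 1) / max_blocks; // smallest block given the grid limit
    block = ((block + step - 1) / step) * step;          // round up to a whole number of warps
    const int grid = (threads + block - 1) / block;      // blocks needed to cover all threads
    printf("block=%d grid=%d\n", block, grid);           // prints block=32 grid=157
    return 0;
  }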


  // Should really have a parity
  void shiftColorSpinorField(cudaColorSpinorField &dst, const cudaColorSpinorField &src, const unsigned int parity, const unsigned int dim, const int shift) {

    if(&src == &dst){
      errorQuda("destination field is the same as source field\n");
      return;
    }

    if(src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());

    if(src.SiteSubset() != dst.SiteSubset())
      errorQuda("Spinor fields do not have matching subsets\n");

    if(src.SiteSubset() == QUDA_FULL_SITE_SUBSET){
      if(shift&1){ // odd shifts swap parities
        shiftColorSpinorField(dst.Even(), src.Odd(), 0, dim, shift);
        shiftColorSpinorField(dst.Odd(), src.Even(), 1, dim, shift);
      }else{
        shiftColorSpinorField(dst.Even(), src.Even(), 0, dim, shift);
        shiftColorSpinorField(dst.Odd(), src.Odd(), 1, dim, shift);
      }
      return;
    }

#ifdef MULTI_GPU
    const int dir = (shift>0) ? QUDA_BACKWARDS : QUDA_FORWARDS; // pack the start of the field if shift is positive
    const int offset = (shift>0) ? 0 : 1;
#endif

    if(dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION){
      if(src.Nspin() == 1){
        Spinor<double2, double2, double2, 3, 1> dst_spinor(dst); // reconstructed by analogy with the single-precision branch below
        // (the construction of src_tex and of the shiftColorSpinor tuning object is elided in this listing)
        ShiftColorSpinorFieldArg arg(src.Volume(), parity, dim, shift, dst_spinor, src_tex);

#ifdef MULTI_GPU
        if(commDimPartitioned(dim) && dim!=3){
          face->pack(src, 1-parity, dagger, dim, dir, streams); // pack in stream[1]
          qudaStreamWaitEvent(streams[1], packEnd, 0); // wait for pack to end in stream[1]
          face->gather(src, dagger, 2*dim+offset, 1); // copy packed data from device buffer to host, in stream[1]
          qudaEventRecord(gatherEnd, streams[1]); // record the completion of face->gather
        }
#endif

        shiftColorSpinor.apply(0); // shift the field in the interior region

#ifdef MULTI_GPU
        if(commDimPartitioned(dim) && dim!=3){
          while(1){
            cudaError_t eventQuery = cudaEventQuery(gatherEnd);
            if(eventQuery == cudaSuccess){
              face->commsStart(2*dim + offset); // if argument is even, send backwards, else send forwards
              break;
            }
          }

          // after communication, load data back on to device
          // do this in stream[1]
          while(1){
            if(face->commsQuery(2*dim + offset)){
              face->scatter(src, dagger, 2*dim+offset, 1);
              break;
            }
          } // while(1)

          shiftColorSpinor.apply(1); // update the sites that border the halo
        }
#endif

      }else{
        errorQuda("Only staggered fermions are currently supported\n");
      }
    }else if(dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION){
      if(src.Nspin() == 1){
        Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
        ShiftColorSpinorFieldArg arg(src.Volume(), parity, dim, shift, dst_spinor, src_tex);
      }else{
        errorQuda("Only staggered fermions are currently supported\n");
      }
    }
    return;
  }


} // namespace quda

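A hedged usage sketch: assuming shiftColorSpinorField is visible from a header and that dst and src are compatible, already-allocated staggered (nSpin = 1) single-parity fields, a one-site shift in the positive t direction might look as follows (hypothetical call site, not from this file):

  #include <color_spinor_field.h>

  void shiftExample(quda::cudaColorSpinorField &dst,
                    const quda::cudaColorSpinorField &src) {
    // dim 3 is the t direction; parity 0 selects the even checkerboard
    quda::shiftColorSpinorField(dst, src, /*parity=*/0, /*dim=*/3, /*shift=*/+1);
  }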