QUDA  v1.1.0
A library for QCD on GPUs
shift_quark_field.cu
#include <cstdio>
#include <cstdlib>
#include <quda_internal.h>
#include <tune_quda.h>          // Tunable, TuneParam, tuneLaunch
#include <color_spinor_field.h> // cudaColorSpinorField

namespace quda {

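  /**
     Parameter struct for the shift kernels: carries the lattice
     dimensions, the checkerboard parity, the shift direction and
     distance, and the input/output field accessors.
  */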
  template<typename Output, typename Input>
  struct ShiftColorSpinorFieldArg {
    const unsigned int length;
    unsigned int X[4];
#ifdef MULTI_GPU
    const unsigned int ghostOffset; // depends on the direction
#endif
    const unsigned int parity;
    const unsigned int dir;
    bool partitioned[4];
    const int shift;
    Input in;
    Output out;
    ShiftColorSpinorFieldArg(const unsigned int length,
                             const unsigned int X[4],
                             const unsigned int ghostOffset,
                             const unsigned int parity,
                             const unsigned int dir,
                             const int shift,
                             const Input& in,
                             const Output& out) : length(length),
#ifdef MULTI_GPU
      ghostOffset(ghostOffset),
#endif
      parity(parity), dir(dir), shift(shift), in(in), out(out)
    {
      for(int i=0; i<4; ++i) this->X[i] = X[i];
      for(int i=0; i<4; ++i) partitioned[i] = commDimPartitioned(i) ? true : false;
    }
  };

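  /**
     Map the checkerboard index cb_idx to the checkerboard index of the
     site displaced by shift, with periodic wrap-around in each local
     dimension.  Returns -1 if the displaced site lies off-rank in a
     partitioned dimension (its data arrives via the ghost zone).
  */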
  template<IndexType idxType, typename Int>
  __device__ __forceinline__
  int neighborIndex(const unsigned int& cb_idx, const int (&shift)[4], const bool (&partitioned)[4], const unsigned int& parity){

    int full_idx;
    Int x, y, z, t;

    coordsFromIndex<idxType>(full_idx, x, y, z, t, cb_idx, parity);

#ifdef MULTI_GPU
    // a site whose shifted image falls outside the local volume in a
    // partitioned dimension lives on a neighboring rank
    if(partitioned[0])
      if( (x+shift[0])<0 || (x+shift[0])>=X1) return -1;
    if(partitioned[1])
      if( (y+shift[1])<0 || (y+shift[1])>=X2) return -1;
    if(partitioned[2])
      if( (z+shift[2])<0 || (z+shift[2])>=X3) return -1;
    if(partitioned[3])
      if( (t+shift[3])<0 || (t+shift[3])>=X4) return -1;
#endif

    x = shift[0] ? (x + shift[0] + X1) % X1 : x;
    y = shift[1] ? (y + shift[1] + X2) % X2 : y;
    z = shift[2] ? (z + shift[2] + X3) % X3 : z;
    t = shift[3] ? (t + shift[3] + X4) % X4 : t;
    return (((t*X3 + z)*X2 + y)*X1 + x) >> 1;
  }

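  /**
     Interior kernel: a grid-stride loop over the destination sites;
     each thread loads the spinor from the shifted source site and
     saves it to the destination site, skipping sites whose source is
     off-rank.
  */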
  template <typename FloatN, int N, typename Arg>
  __global__ void shiftColorSpinorFieldKernel(Arg arg)
  {
    int shift[4] = {0,0,0,0};
    shift[arg.dir] = arg.shift;

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int gridSize = gridDim.x*blockDim.x;

    FloatN x[N];
    while(idx<arg.length){
      const int new_idx = neighborIndex<EVEN_X,int>(idx, shift, arg.partitioned, arg.parity); // EVEN_X index ordering assumed
#ifdef MULTI_GPU
      if(new_idx >= 0){ // a negative index flags an off-rank neighbor
#endif
        arg.in.load(x, new_idx);
        arg.out.save(x, idx);
#ifdef MULTI_GPU
      }
#endif
      idx += gridSize;
    }
  }

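  /**
     Exterior kernel: for boundary sites the shifted source lives in a
     neighboring rank's ghost zone, so load from the ghost buffer at
     ghostOffset instead of from the local volume.
  */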
  template<typename FloatN, int N, typename Arg>
  __global__ void shiftColorSpinorFieldExternalKernel(Arg arg)
  {
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int gridSize = gridDim.x*blockDim.x;

    FloatN x[N];
    unsigned int coord[4];
    while(idx<arg.length){

      // compute the coordinates in the ghost zone
      coordsFromIndex<1>(coord, idx, arg.X, arg.dir, arg.parity);

      unsigned int ghost_idx = arg.ghostOffset + ghostIndexFromCoords<3,3>(arg.X, coord, arg.dir, arg.shift);

      arg.in.load(x, ghost_idx);
      arg.out.save(x, idx);

      idx += gridSize;
    }
  }

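  /**
     Tunable host-side wrapper that autotunes the launch geometry and
     launches the shift kernel.
  */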
  template<typename FloatN, int N, typename Output, typename Input>
  class ShiftColorSpinorField : public Tunable {
    ShiftColorSpinorFieldArg<Output,Input> arg;
    const QudaFieldLocation location;

    unsigned int sharedBytesPerThread() const { return 0; }
    unsigned int sharedBytesPerBlock(const TuneParam &) const { return 0; }

    // don't tune the grid dimension
    bool advanceGridDim(TuneParam &param) const { return false; }

    bool advanceBlockDim(TuneParam &param) const
    {
      const unsigned int max_threads = deviceProp.maxThreadsDim[0];
      const unsigned int max_blocks = deviceProp.maxGridSize[0];
      const unsigned int max_shared = 16384;
      const int step = deviceProp.warpSize;
      const int threads = arg.length;
      bool ret;

      param.block.x += step;
      if(param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared){
        param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough given the limit on gridDim
        param.block.x = ((param.block.x+step-1)/step)*step; // round up to the nearest multiple of the warp size
        if(param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
        ret = false;
      }else{
        ret = true;
      }
      param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
      return ret;
    }

  public:
    ShiftColorSpinorField(const ShiftColorSpinorFieldArg<Output,Input> &arg,
                          QudaFieldLocation location)
      : arg(arg), location(location) {}
    virtual ~ShiftColorSpinorField() {}

    void apply(const qudaStream_t &stream){
      if(location == QUDA_CUDA_FIELD_LOCATION){
        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
        qudaLaunchKernel(shiftColorSpinorFieldKernel<FloatN,N,decltype(arg)>, tp, stream, arg);
#ifdef MULTI_GPU
        // in the multi-GPU case the caller must also exchange the ghost
        // zones and launch shiftColorSpinorFieldExternalKernel on the
        // boundary sites
#endif
      }else{ // run the CPU code
        errorQuda("ShiftColorSpinorField is not yet implemented on the CPU\n");
      }
    } // apply

    virtual void initTuneParam(TuneParam &param) const
    {
      const unsigned int max_threads = deviceProp.maxThreadsDim[0];
      const unsigned int max_blocks = deviceProp.maxGridSize[0];
      const int threads = arg.length;
      const int step = deviceProp.warpSize;
      param.block = dim3((threads+max_blocks-1)/max_blocks, 1, 1); // ensure the blockDim is large enough, given the limit on gridDim
      param.block.x = ((param.block.x+step-1) / step) * step; // round up to the nearest "step"
      if (param.block.x > max_threads) errorQuda("Local lattice volume is too large for device");
      param.grid = dim3((threads+param.block.x-1)/param.block.x, 1, 1);
      param.shared_bytes = sharedBytesPerThread()*param.block.x > sharedBytesPerBlock(param) ?
        sharedBytesPerThread()*param.block.x : sharedBytesPerBlock(param);
    }

    /** sets default values for when tuning is disabled */
    void defaultTuneParam(TuneParam &param) const {
      initTuneParam(param);
    }

    long long flops() const { return 0; } // FIXME
    long long bytes() const { return 0; } // FIXME

    TuneKey tuneKey() const {
      std::stringstream vol, aux;
      vol << arg.X[0] << "x";
      vol << arg.X[1] << "x";
      vol << arg.X[2] << "x";
      vol << arg.X[3];
      aux << "threads=" << 2*arg.in.volumeCB << ",prec=" << sizeof(Complex)/2;
      aux << ",stride=" << arg.in.stride;
      return TuneKey(vol.str(), typeid(*this).name(), aux.str());
    }
  };


  void shiftColorSpinorField(cudaColorSpinorField &dst, const cudaColorSpinorField &src, const unsigned int parity, const unsigned int dim, const int shift) {

    if(&src == &dst){
      errorQuda("destination field is the same as source field\n");
      return;
    }

    if(src.Nspin() != 1 && src.Nspin() != 4) errorQuda("nSpin(%d) not supported\n", src.Nspin());

    if(src.SiteSubset() != dst.SiteSubset())
      errorQuda("Spinor fields do not have matching subsets\n");

    if(src.SiteSubset() == QUDA_FULL_SITE_SUBSET){
      // an odd shift swaps the parities, an even shift preserves them
      if(shift&1){
        shiftColorSpinorField(dst.Even(), src.Odd(), 0, dim, shift);
        shiftColorSpinorField(dst.Odd(), src.Even(), 1, dim, shift);
      }else{
        shiftColorSpinorField(dst.Even(), src.Even(), 0, dim, shift);
        shiftColorSpinorField(dst.Odd(), src.Odd(), 1, dim, shift);
      }
      return;
    }

#ifdef MULTI_GPU
    const int dir = (shift>0) ? QUDA_BACKWARDS : QUDA_FORWARDS; // pack the start of the field if shift is positive
    const int offset = (shift>0) ? 0 : 1;
#endif

    if(dst.Precision() == QUDA_DOUBLE_PRECISION && src.Precision() == QUDA_DOUBLE_PRECISION){
      if(src.Nspin() == 1){
        const int dagger = 0; // assumption: a plain shift is non-daggered, which the pack routines take as a flag
        Spinor<double2, double2, double2, 3, 0, 0> src_tex(src);
        Spinor<double2, double2, double2, 3, 1> dst_spinor(dst);
        ShiftColorSpinorFieldArg<decltype(dst_spinor), decltype(src_tex)>
          arg(src.Volume(), reinterpret_cast<const unsigned int*>(src.X()),
              0 /* ghostOffset: not yet computed, the exterior path is incomplete */,
              parity, dim, shift, src_tex, dst_spinor);
        ShiftColorSpinorField<double2, 3, decltype(dst_spinor), decltype(src_tex)>
          shiftColorSpinor(arg, QUDA_CUDA_FIELD_LOCATION);

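        // Multi-GPU pipeline: pack the boundary and gather it to the host in
        // stream[1], overlap the interior shift with the MPI exchange, then
        // scatter the received ghost zone back to the device and update the
        // boundary.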
#ifdef MULTI_GPU
        if(commDimPartitioned(dim) && dim!=3){
          face->pack(src, 1-parity, dagger, dim, dir, streams); // pack in stream[1]
          qudaEventRecord(packEnd, streams[1]);
          qudaStreamWaitEvent(streams[1], packEnd, 0); // wait for pack to end in stream[1]
          face->gather(src, dagger, 2*dim+offset, 1); // copy packed data from device buffer to host in stream[1]
          qudaEventRecord(gatherEnd, streams[1]); // record the completion of face->gather
        }
#endif


        shiftColorSpinor.apply(streams[0]); // shift the field in the interior region

#ifdef MULTI_GPU
        if(commDimPartitioned(dim) && dim!=3){
          // busy-wait until the gather has completed, then start the exchange
          while(1){
            cudaError_t eventQuery = cudaEventQuery(gatherEnd);
            if(eventQuery == cudaSuccess){
              face->commsStart(2*dim + offset); // if argument is even, send backwards, else send forwards
              break;
            }
          }

          // after communication, load the data back onto the device in stream[1]
          while(1){
            if(face->commsQuery(2*dim + offset)){
              face->scatter(src, dagger, 2*dim+offset, 1);
              break;
            }
          } // while(1)
          qudaEventRecord(scatterEnd, streams[1]);
          qudaStreamWaitEvent(streams[1], scatterEnd, 0);
          shiftColorSpinor.apply(streams[1]);
        }
#endif

      }else{
        errorQuda("Only staggered fermions are currently supported\n");
      }
    }else if(dst.Precision() == QUDA_SINGLE_PRECISION && src.Precision() == QUDA_SINGLE_PRECISION){
      if(src.Nspin() == 1){
        Spinor<float2, float2, float2, 3, 0, 0> src_tex(src);
        Spinor<float2, float2, float2, 3, 1> dst_spinor(dst);
        ShiftColorSpinorFieldArg<decltype(dst_spinor), decltype(src_tex)>
          arg(src.Volume(), reinterpret_cast<const unsigned int*>(src.X()),
              0 /* ghostOffset: not yet computed, the exterior path is incomplete */,
              parity, dim, shift, src_tex, dst_spinor);
        ShiftColorSpinorField<float2, 3, decltype(dst_spinor), decltype(src_tex)>
          shiftColorSpinor(arg, QUDA_CUDA_FIELD_LOCATION);
        shiftColorSpinor.apply(streams[0]); // interior shift only; comms not yet implemented here
      }else{
        errorQuda("Only staggered fermions are currently supported\n");
      }
    }else{
      errorQuda("Precision %d not supported\n", dst.Precision());
    }
    return;
  }
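
  // Usage sketch (hypothetical; ColorSpinorParam setup omitted):
  //
  //   cudaColorSpinorField dst(param), src(param); // staggered parity fields
  //   // shift src by one site in the positive x direction, writing into dst
  //   shiftColorSpinorField(dst, src, /*parity=*/0, /*dim=*/0, /*shift=*/+1);
  //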

} // namespace quda