QUDA v0.7.0
A library for QCD on GPUs
multi_reduce_core.h
#ifdef SSTEP

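// Parameter struct passed by value to the multi-reduction kernel: N sets of
// input/output spinor accessors, the reduction functor, and the device
// buffers used for the per-block partial results and the final results.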
template <int N, typename ReduceType, typename SpinorX, typename SpinorY,
          typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
struct MultiReduceArg {

  SpinorX X[N];
  SpinorY Y[N];
  SpinorZ Z[N];
  SpinorW W[N];
  SpinorV V[N];
  Reducer r;
  ReduceType *partial;
  ReduceType *complete;
  const int length;
  MultiReduceArg(SpinorX X[N], SpinorY Y[N], SpinorZ Z[N], SpinorW W[N], SpinorV V[N],
                 Reducer r, ReduceType *partial, ReduceType *complete, int length)
    : r(r), partial(partial), complete(complete), length(length) {

    for(int i=0; i<N; ++i){
      this->X[i] = X[i];
      this->Y[i] = Y[i];
      this->Z[i] = Z[i];
      this->W[i] = W[i];
      this->V[i] = V[i];
    }
  }
};


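// Kernel computing N independent reductions in a single launch.  Each thread
// accumulates its grid-stride portion of every reduction, each block then
// reduces through shared memory and writes its partial sum, and the last
// block to finish (tracked via the global "count" counter) combines the
// per-block partials into arg.complete[i].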
template<int block_size, int N, typename ReduceType, typename ReduceSimpleType,
         typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ,
         typename SpinorW, typename SpinorV, typename Reducer>
__global__ void multiReduceKernel(MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg){

  unsigned int tid = threadIdx.x;
  unsigned int gridSize = gridDim.x*blockDim.x;

  ReduceType sum[N];

  for(int i=0; i<N; ++i){
    zero(sum[i]);
    unsigned int id = blockIdx.x*blockDim.x + threadIdx.x;
    FloatN x[M], y[M], z[M], w[M], v[M];
    while(id < arg.length){

      arg.X[i].load(x, id);
      arg.Y[i].load(y, id);
      arg.Z[i].load(z, id);
      arg.W[i].load(w, id);
      arg.V[i].load(v, id);
#if (__COMPUTE_CAPABILITY__ >= 200)
      arg.r.pre();
#endif
#pragma unroll
      for (int j=0; j<M; j++) arg.r(sum[i], x[j], y[j], z[j], w[j], v[j]);

#if (__COMPUTE_CAPABILITY__ >= 200)
      arg.r.post(sum[i]);
#endif
      arg.X[i].save(x, id);
      arg.Y[i].save(y, id);
      arg.Z[i].save(z, id);
      arg.W[i].save(w, id);
      arg.V[i].save(v, id);

      id += gridSize;
    } // loop over id
  } // loop over i

  extern __shared__ ReduceSimpleType sdata[];
  ReduceSimpleType *s = sdata + tid;

  // Copy data into shared memory
  for(int i=0; i<N; ++i){
    if(tid >= warpSize) copytoshared(s, 0, sum[i], block_size);
    __syncthreads();

    // now reduce using the first warp only
    if(tid < warpSize){
      for(int j=warpSize; j<block_size; j+=warpSize) add<ReduceType>(sum[i], s, j, block_size);
      warpReduce<block_size>(s, sum[i]);

      // write result for this block to global memory
      if(tid == 0){
        ReduceType tmp;
        copyfromshared(tmp, s, 0, block_size);
        arg.partial[i*gridDim.x + blockIdx.x] = tmp;
      }
    }
  } // loop over i

  if(tid==0){
    __threadfence(); // flush result
    unsigned int value = atomicInc(&count, gridDim.x);

    isLastBlockDone = (value == (gridDim.x-1));
  }
  __syncthreads();

  // Finish the reduction if last block
  if(isLastBlockDone){
    for(int i=0; i<N; ++i){
      unsigned int id = threadIdx.x;

      zero(sum[i]);
      while(id < gridDim.x){
        sum[i] += arg.partial[i*gridDim.x + id];
        id += block_size;
      }
    } // loop over i

    extern __shared__ ReduceSimpleType sdata[];
    ReduceSimpleType *s = sdata + tid;

    for(int i=0; i<N; ++i){
      if(tid >= warpSize) copytoshared(s, 0, sum[i], block_size);
      __syncthreads();

      if(tid < warpSize){
        for(int j=warpSize; j<block_size; j+=warpSize){ add<ReduceType>(sum[i], s, j, block_size); }
        warpReduce<block_size>(s, sum[i]);

        if(tid == 0){
          ReduceType tmp;
          copyfromshared(tmp, s, 0, block_size);
          arg.complete[i] = tmp;
        }
      }
    } // loop over i
    if(threadIdx.x == 0) count = 0;
  } // isLastBlockDone
} // multiReduceKernel

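// Host-side launcher: issues the kernel with the tuned launch parameters,
// retrieves the N completed results from the device (polling the event when
// mapped host memory is available, otherwise copying), and finally reduces
// the double components across processes via reduceDoubleArray().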
template<int N, typename doubleN, typename ReduceType, typename ReduceSimpleType, typename FloatN,
         int M, typename SpinorX, typename SpinorY, typename SpinorZ, typename SpinorW, typename SpinorV, typename Reducer>
void multiReduceLaunch(doubleN result[],
                       MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> &arg,
                       const TuneParam &tp, const cudaStream_t &stream){

  if(tp.grid.x > REDUCE_MAX_BLOCKS)
    errorQuda("Grid size %d greater than maximum %d\n", tp.grid.x, REDUCE_MAX_BLOCKS);

  LAUNCH_KERNEL(multiReduceKernel, tp, stream, arg, N, ReduceType, ReduceSimpleType, FloatN, M);

#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
  if(deviceProp.canMapHostMemory){
    cudaEventRecord(reduceEnd, stream);
    while(cudaSuccess != cudaEventQuery(reduceEnd)) {}
  } else
#endif
  { cudaMemcpy(h_reduce, hd_reduce, sizeof(ReduceType)*N, cudaMemcpyDeviceToHost); }

  memset(result, 0, N*sizeof(doubleN));
  for(int i=0; i<N; ++i) result[i] += ((ReduceType*)h_reduce)[i]; // Need to check this

  const int Nreduce = N*(sizeof(doubleN)/sizeof(double));

  reduceDoubleArray((double*)result, Nreduce);
}

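// Tunable wrapper around the multi-reduction kernel.  It owns the argument
// struct, backs up the field data around autotuning (preTune/postTune), and
// reports flop and byte counts to the tuner.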
template<int N, typename doubleN, typename ReduceType, typename ReduceSimpleType,
         typename FloatN, int M, typename SpinorX, typename SpinorY, typename SpinorZ,
         typename SpinorW, typename SpinorV, typename Reducer>
class MultiReduceCuda : public Tunable {

private:
  mutable MultiReduceArg<N,ReduceType,SpinorX,SpinorY,SpinorZ,SpinorW,SpinorV,Reducer> arg;

  doubleN *result;

  // host pointers used for backing up fields when tuning
  char *X_h[N], *Y_h[N], *Z_h[N], *W_h[N], *V_h[N];
  char *Xnorm_h[N], *Ynorm_h[N], *Znorm_h[N], *Wnorm_h[N], *Vnorm_h[N];

  unsigned int sharedBytesPerThread() const { return sizeof(ReduceType); }

  // When there is only one warp per block, we need to allocate two warps'
  // worth of shared memory so that we don't index shared memory out of bounds
  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
    int warpSize = 32;
    return 2*warpSize*sizeof(ReduceType);
  }

  virtual bool advanceSharedBytes(TuneParam &param) const
  {
    TuneParam next(param);
    advanceBlockDim(next); // to get the next blockDim
    int nthreads = next.block.x * next.block.y * next.block.z;
    param.shared_bytes = sharedBytesPerThread()*nthreads > sharedBytesPerBlock(param) ?
      sharedBytesPerThread()*nthreads : sharedBytesPerBlock(param);
    return false;
  }

public:
  MultiReduceCuda(doubleN result[], SpinorX X[], SpinorY Y[],
                  SpinorZ Z[], SpinorW W[], SpinorV V[], Reducer &r, int length) :
    arg(X, Y, Z, W, V, r, (ReduceType*)d_reduce, (ReduceType*)hd_reduce, length), result(result) {
    for(int i=0; i<N; ++i){
      X_h[i] = 0;
      Y_h[i] = 0;
      Z_h[i] = 0;
      W_h[i] = 0;
      V_h[i] = 0;

      Xnorm_h[i] = 0;
      Ynorm_h[i] = 0;
      Znorm_h[i] = 0;
      Wnorm_h[i] = 0;
      Vnorm_h[i] = 0;
    }
  }

  virtual ~MultiReduceCuda(){}

  inline TuneKey tuneKey() const {
    return TuneKey(blasStrings.vol_str, typeid(arg.r).name(), blasStrings.aux_str);
  }

  void apply(const cudaStream_t &stream){
    TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
    multiReduceLaunch<N,doubleN,ReduceType,ReduceSimpleType,FloatN,M>(result, arg, tp, stream);
  }

#define BYTES(X) ( arg.X.Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M*arg.X.Stride() )
#define NORM_BYTES(X) ( (arg.X.Precision() == QUDA_HALF_PRECISION) ? sizeof(float)*arg.length : 0 )

  void preTune() {
    for(int i=0; i<N; ++i){
      arg.X[i].save(&X_h[i], &Xnorm_h[i], BYTES(X[i]), NORM_BYTES(X[i]));
      arg.Y[i].save(&Y_h[i], &Ynorm_h[i], BYTES(Y[i]), NORM_BYTES(Y[i]));
      arg.Z[i].save(&Z_h[i], &Znorm_h[i], BYTES(Z[i]), NORM_BYTES(Z[i]));
      arg.W[i].save(&W_h[i], &Wnorm_h[i], BYTES(W[i]), NORM_BYTES(W[i]));
      arg.V[i].save(&V_h[i], &Vnorm_h[i], BYTES(V[i]), NORM_BYTES(V[i]));
    }
  }

  void postTune() {
    for(int i=0; i<N; ++i){
      arg.X[i].load(&X_h[i], &Xnorm_h[i], BYTES(X[i]), NORM_BYTES(X[i]));
      arg.Y[i].load(&Y_h[i], &Ynorm_h[i], BYTES(Y[i]), NORM_BYTES(Y[i]));
      arg.Z[i].load(&Z_h[i], &Znorm_h[i], BYTES(Z[i]), NORM_BYTES(Z[i]));
      arg.W[i].load(&W_h[i], &Wnorm_h[i], BYTES(W[i]), NORM_BYTES(W[i]));
      arg.V[i].load(&V_h[i], &Vnorm_h[i], BYTES(V[i]), NORM_BYTES(V[i]));
    }
  }
#undef BYTES
#undef NORM_BYTES

  // Need to check this!
  long long flops() const { return arg.r.flops()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*arg.length*M; }
  long long bytes() const {
    size_t bytes = N*arg.X[0].Precision()*(sizeof(FloatN)/sizeof(((FloatN*)0)->x))*M;
    if (arg.X[0].Precision() == QUDA_HALF_PRECISION) bytes += sizeof(float);
    return arg.r.streams()*bytes*arg.length; }
  int tuningIter() const { return 3; }
};
// NB - need to change d_reduce and hd_reduce

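// Front-end entry point: dispatches on precision and spin structure,
// constructs the Spinor accessors and the reducer, and runs the tuned
// multi-reduction.  Full (even+odd) fields are handled by recursing on the
// two parities and summing the results.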
template<int N, typename doubleN, typename ReduceType, typename ReduceSimpleType,
         template <typename ReducerType, typename Float, typename FloatN> class Reducer,
         int writeX, int writeY, int writeZ, int writeW, int writeV, bool siteUnroll>
void multiReduceCuda(doubleN result[], const double2& a, const double2& b,
                     std::vector<cudaColorSpinorField*>& x, std::vector<cudaColorSpinorField*>& y,
                     std::vector<cudaColorSpinorField*>& z, std::vector<cudaColorSpinorField*>& w,
                     std::vector<cudaColorSpinorField*>& v){

  if(x[0]->SiteSubset() == QUDA_FULL_SITE_SUBSET){
    doubleN evenResult[N];
    doubleN oddResult[N];
    std::vector<cudaColorSpinorField> xp; xp.reserve(N);
    std::vector<cudaColorSpinorField> yp; yp.reserve(N);
    std::vector<cudaColorSpinorField> zp; zp.reserve(N);
    std::vector<cudaColorSpinorField> wp; wp.reserve(N);
    std::vector<cudaColorSpinorField> vp; vp.reserve(N);

    std::vector<cudaColorSpinorField*> xpp; xpp.reserve(N);
    std::vector<cudaColorSpinorField*> ypp; ypp.reserve(N);
    std::vector<cudaColorSpinorField*> zpp; zpp.reserve(N);
    std::vector<cudaColorSpinorField*> wpp; wpp.reserve(N);
    std::vector<cudaColorSpinorField*> vpp; vpp.reserve(N);

    for(int i=0; i<N; ++i){
      xp.push_back(x[i]->Even());
      yp.push_back(y[i]->Even());
      zp.push_back(z[i]->Even());
      wp.push_back(w[i]->Even());
      vp.push_back(v[i]->Even());

      xpp.push_back(&xp[i]);
      ypp.push_back(&yp[i]);
      zpp.push_back(&zp[i]);
      wpp.push_back(&wp[i]);
      vpp.push_back(&vp[i]);
    }

    multiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, Reducer, writeX,
                    writeY, writeZ, writeW, writeV, siteUnroll>
      (evenResult, a, b, xpp, ypp, zpp, wpp, vpp);

    // Reuse the scratch vectors for the odd parities: clear them first so the
    // pointer lists refer to the odd halves rather than appending to the even ones.
    xp.clear(); yp.clear(); zp.clear(); wp.clear(); vp.clear();
    xpp.clear(); ypp.clear(); zpp.clear(); wpp.clear(); vpp.clear();

    for(int i=0; i<N; ++i){
      xp.push_back(x[i]->Odd());
      yp.push_back(y[i]->Odd());
      zp.push_back(z[i]->Odd());
      wp.push_back(w[i]->Odd());
      vp.push_back(v[i]->Odd());

      xpp.push_back(&xp[i]);
      ypp.push_back(&yp[i]);
      zpp.push_back(&zp[i]);
      wpp.push_back(&wp[i]);
      vpp.push_back(&vp[i]);
    }

    multiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, Reducer, writeX,
                    writeY, writeZ, writeW, writeV, siteUnroll>
      (oddResult, a, b, xpp, ypp, zpp, wpp, vpp);

    for(int i=0; i<N; ++i) result[i] = evenResult[i] + oddResult[i];
    return;
  }

  for(int i=0; i<N; ++i){

    checkSpinor( (*(x[i])), (*(y[i])) );
    checkSpinor( (*(x[i])), (*(z[i])) );
    checkSpinor( (*(x[i])), (*(w[i])) );
    checkSpinor( (*(x[i])), (*(v[i])) );

    if(!x[i]->isNative()){
      warningQuda("Reductions on non-native fields are not supported\n");
      memset(result, 0, N*sizeof(doubleN));
      return;
    }
  }

  memset(result, 0, N*sizeof(doubleN));

  blasStrings.vol_str = x[0]->VolString();
  blasStrings.aux_str = x[0]->AuxString();

  int reduce_length = siteUnroll ? x[0]->RealLength() : x[0]->Length();

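  // The branches below differ only in the FloatN vector type, the storage
  // type, and M, the number of FloatN-sized chunks processed per site when
  // siteUnroll is enabled (e.g. 12 double2 per Wilson site, 3 double2 per
  // staggered site).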
  if(x[0]->Precision() == QUDA_DOUBLE_PRECISION){
    if(x[0]->Nspin() == 4){ // wilson
      const int M = siteUnroll ? 12 : 1; // determines how much work per thread to do

      Spinor<double2, double2, double2, M, writeX> X[N];
      Spinor<double2, double2, double2, M, writeY> Y[N];
      Spinor<double2, double2, double2, M, writeZ> Z[N];
      Spinor<double2, double2, double2, M, writeW> W[N];
      Spinor<double2, double2, double2, M, writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }
      Reducer<ReduceType, double2, double2> r(a,b);
      MultiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, double2, M,
        Spinor<double2, double2, double2, M, writeX>, Spinor<double2, double2, double2, M, writeY>,
        Spinor<double2, double2, double2, M, writeZ>, Spinor<double2, double2, double2, M, writeW>,
        Spinor<double2, double2, double2, M, writeV>, Reducer<ReduceType, double2, double2> >
        reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());

    }else if(x[0]->Nspin() == 1){ // staggered

      const int M = siteUnroll ? 3 : 1; // determines how much work per thread to do

      Spinor<double2, double2, double2, M, writeX> X[N];
      Spinor<double2, double2, double2, M, writeY> Y[N];
      Spinor<double2, double2, double2, M, writeZ> Z[N];
      Spinor<double2, double2, double2, M, writeW> W[N];
      Spinor<double2, double2, double2, M, writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }

      Reducer<ReduceType, double2, double2> r(a,b);
      MultiReduceCuda<N, doubleN, ReduceType, ReduceSimpleType, double2, M,
        Spinor<double2, double2, double2, M, writeX>, Spinor<double2, double2, double2, M, writeY>,
        Spinor<double2, double2, double2, M, writeZ>, Spinor<double2, double2, double2, M, writeW>,
        Spinor<double2, double2, double2, M, writeV>, Reducer<ReduceType, double2, double2> >
        reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));

      reduce.apply(*getBlasStream());
    } else { errorQuda("ERROR: nSpin=%d is not supported\n", x[0]->Nspin()); }

  }else if(x[0]->Precision() == QUDA_SINGLE_PRECISION){
    if(x[0]->Nspin() == 4){ // wilson
      const int M = siteUnroll ? 6 : 1; // determines how much work per thread to do

      Spinor<float4,float4,float4,M,writeX> X[N];
      Spinor<float4,float4,float4,M,writeY> Y[N];
      Spinor<float4,float4,float4,M,writeZ> Z[N];
      Spinor<float4,float4,float4,M,writeW> W[N];
      Spinor<float4,float4,float4,M,writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }

      Reducer<ReduceType, float2, float4> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
      MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float4,M,
        Spinor<float4,float4,float4,M,writeX>, Spinor<float4,float4,float4,M,writeY>,
        Spinor<float4,float4,float4,M,writeZ>, Spinor<float4,float4,float4,M,writeW>,
        Spinor<float4,float4,float4,M,writeV>, Reducer<ReduceType, float2, float4> >
        reduce(result, X, Y, Z, W, V, r, reduce_length/(4*M));
      reduce.apply(*getBlasStream());
    }else if(x[0]->Nspin() == 1){ // staggered

      const int M = siteUnroll ? 3 : 1;

      Spinor<float2,float2,float2,M,writeX> X[N];
      Spinor<float2,float2,float2,M,writeY> Y[N];
      Spinor<float2,float2,float2,M,writeZ> Z[N];
      Spinor<float2,float2,float2,M,writeW> W[N];
      Spinor<float2,float2,float2,M,writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }

      Reducer<ReduceType, float2, float2> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
      MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float2,M,
        Spinor<float2,float2,float2,M,writeX>, Spinor<float2,float2,float2,M,writeY>,
        Spinor<float2,float2,float2,M,writeZ>, Spinor<float2,float2,float2,M,writeW>,
        Spinor<float2,float2,float2,M,writeV>, Reducer<ReduceType, float2, float2> >
        reduce(result, X, Y, Z, W, V, r, reduce_length/(2*M));
      reduce.apply(*getBlasStream());
    }
  }else{ // half precision
    if(x[0]->Nspin() == 4){ // wilson

      Spinor<float4,float4,short4,6,writeX> X[N];
      Spinor<float4,float4,short4,6,writeY> Y[N];
      Spinor<float4,float4,short4,6,writeZ> Z[N];
      Spinor<float4,float4,short4,6,writeW> W[N];
      Spinor<float4,float4,short4,6,writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }

      Reducer<ReduceType, float2, float4> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
      MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float4,6,
        Spinor<float4,float4,short4,6,writeX>, Spinor<float4,float4,short4,6,writeY>,
        Spinor<float4,float4,short4,6,writeZ>, Spinor<float4,float4,short4,6,writeW>,
        Spinor<float4,float4,short4,6,writeV>, Reducer<ReduceType, float2, float4> >
        reduce(result, X, Y, Z, W, V, r, y[0]->Volume());
      reduce.apply(*getBlasStream());
    }else if(x[0]->Nspin() == 1){ // staggered

      Spinor<float2,float2,short2,3,writeX> X[N];
      Spinor<float2,float2,short2,3,writeY> Y[N];
      Spinor<float2,float2,short2,3,writeZ> Z[N];
      Spinor<float2,float2,short2,3,writeW> W[N];
      Spinor<float2,float2,short2,3,writeV> V[N];

      for(int i=0; i<N; ++i){
        X[i].set(*x[i]);
        Y[i].set(*y[i]);
        Z[i].set(*z[i]);
        W[i].set(*w[i]);
        V[i].set(*v[i]);
      }

      Reducer<ReduceType, float2, float2> r(make_float2(a.x,a.y), make_float2(b.x,b.y));
      MultiReduceCuda<N,doubleN,ReduceType,ReduceSimpleType,float2,3,
        Spinor<float2,float2,short2,3,writeX>, Spinor<float2,float2,short2,3,writeY>,
        Spinor<float2,float2,short2,3,writeZ>, Spinor<float2,float2,short2,3,writeW>,
        Spinor<float2,float2,short2,3,writeV>, Reducer<ReduceType, float2, float2> >
        reduce(result, X, Y, Z, W, V, r, y[0]->Volume());
      reduce.apply(*getBlasStream());

    }else{ errorQuda("ERROR: nSpin=%d is not supported\n", x[0]->Nspin()); }
  }

  for(int i=0; i<N; ++i){
    blas_bytes += Reducer<ReduceType,double2,double2>::streams()*(unsigned long long)x[i]->RealLength()*x[i]->Precision();
    blas_flops += Reducer<ReduceType,double2,double2>::flops()*(unsigned long long)x[i]->RealLength();
  }
  checkCudaError();
  return;
}

#endif // SSTEP