cub_helper.cuh
#pragma once
#include <quda_constants.h>
#include <float_vector.h>
#include <comm_quda.h>
#include <blas_quda.h>

using namespace quda;

// ensures we use shfl_sync and not shfl when compiling with clang
#if defined(__clang__) && defined(__CUDA__) && CUDA_VERSION >= 9000
#define CUB_USE_COOPERATIVE_GROUPS
#endif

#include <cub/block/block_reduce.cuh>

#if __COMPUTE_CAPABILITY__ >= 300
#include <generics/shfl.h>
#endif

// Helper types and functions for performing block- and grid-level reductions using CUB.

namespace quda {

  /**
     Simple fixed-length array wrapper used as the data type for
     multi-component reductions.
  */
  template <typename scalar, int n>
  struct vector_type {
    scalar data[n];
    __device__ __host__ inline scalar& operator[](int i) { return data[i]; }
    __device__ __host__ inline const scalar& operator[](int i) const { return data[i]; }
    __device__ __host__ inline static constexpr int size() { return n; }
    __device__ __host__ inline void operator+=(const vector_type &a) {
#pragma unroll
      for (int i=0; i<n; i++) data[i] += a[i];
    }
    __device__ __host__ inline void operator=(const vector_type &a) {
#pragma unroll
      for (int i=0; i<n; i++) data[i] = a[i];
    }
    __device__ __host__ vector_type() {
#pragma unroll
      for (int i=0; i<n; i++) zero(data[i]);
    }
  };

  template <typename scalar, int n>
  __device__ __host__ inline void zero(vector_type<scalar,n> &v) {
#pragma unroll
    for (int i=0; i<n; i++) zero(v.data[i]);
  }

  template <typename scalar, int n>
  __device__ __host__ inline vector_type<scalar,n> operator+(const vector_type<scalar,n> &a, const vector_type<scalar,n> &b) {
    vector_type<scalar,n> c;
#pragma unroll
    for (int i=0; i<n; i++) c[i] = a[i] + b[i];
    return c;
  }
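
  // Illustrative usage sketch (not part of the header): vector_type provides a
  // zero-initialized, component-wise accumulator, e.g. for reducing a norm and
  // an inner product in one pass. The variable names below are hypothetical.
  //
  //   vector_type<double, 2> local;                  // zeroed by the constructor
  //   local[0] += v * v;                             // accumulate a norm
  //   local[1] += v * w;                             // accumulate an inner product
  //   vector_type<double, 2> total = local + local;  // component-wise addition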

  /**
     Argument base class that provides the device, mapped-host and
     host buffers used to stage a reduction result.
  */
  template <typename T>
  struct ReduceArg {
    T *partial;
    T *result_d;
    T *result_h;
    ReduceArg() :
      partial(static_cast<T*>(blas::getDeviceReduceBuffer())),
      result_d(static_cast<T*>(blas::getMappedHostReduceBuffer())),
      result_h(static_cast<T*>(blas::getHostReduceBuffer()))
    {
      // write reduction to GPU memory if asynchronous
      if (commAsyncReduction()) result_d = partial;
    }

  };
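
  // Illustrative sketch (hypothetical names, not part of the header): kernel
  // argument structs are expected to inherit from ReduceArg<T>, so the
  // base-class constructor wires up the partial, device and host reduction
  // buffers.
  //
  //   struct MyNormArg : public ReduceArg<double> {
  //     const double *v;
  //     int size;
  //     MyNormArg(const double *v, int size) : ReduceArg<double>(), v(v), size(size) {}
  //   };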

#ifdef QUAD_SUM
  __device__ __host__ inline void zero(doubledouble &x) { x.a.x = 0.0; x.a.y = 0.0; }
  __device__ __host__ inline void zero(doubledouble2 &x) { zero(x.x); zero(x.y); }
  __device__ __host__ inline void zero(doubledouble3 &x) { zero(x.x); zero(x.y); zero(x.z); }
#endif

  __device__ unsigned int count[QUDA_MAX_MULTI_REDUCE] = { };
  __shared__ bool isLastBlockDone;

  /**
     Generic reduction of a per-thread value, first over the thread
     block and then over all blocks, with a global counter used to
     determine which block performs the final accumulation.
  */
  template <int block_size_x, int block_size_y, typename T, bool do_sum=true, typename Reducer=cub::Sum>
  __device__ inline void reduce2d(ReduceArg<T> arg, const T &in, const int idx=0) {

    typedef cub::BlockReduce<T, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
    __shared__ typename BlockReduce::TempStorage cub_tmp;

    Reducer r;
    T aggregate = (do_sum ? BlockReduce(cub_tmp).Sum(in) : BlockReduce(cub_tmp).Reduce(in, r));

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[idx], gridDim.x);

      // determine if last block
      isLastBlockDone = (value == (gridDim.x-1));
    }

    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      unsigned int i = threadIdx.y*block_size_x + threadIdx.x;
      T sum;
      zero(sum);
      while (i<gridDim.x) {
        sum = r(sum, arg.partial[idx*gridDim.x + i]);
        //sum += arg.partial[idx*gridDim.x + i];
        i += block_size_x*block_size_y;
      }

      sum = (do_sum ? BlockReduce(cub_tmp).Sum(sum) : BlockReduce(cub_tmp).Reduce(sum, r));

      // write out the final reduced value
      if (threadIdx.y*block_size_x + threadIdx.x == 0) {
        arg.result_d[idx] = sum;
        count[idx] = 0; // set to zero for next time
      }
    }
  }

  template <int block_size, typename T, bool do_sum = true, typename Reducer = cub::Sum>
  __device__ inline void reduce(ReduceArg<T> arg, const T &in, const int idx=0) { reduce2d<block_size, 1, T, do_sum, Reducer>(arg, in, idx); }
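
  // Illustrative sketch (hypothetical kernel and argument names): a grid-stride
  // kernel accumulates a per-thread partial value and hands it to reduce2d; the
  // block_size template argument must match the launched blockDim.x.
  //
  //   template <int block_size, typename Arg>
  //   __global__ void normKernel(Arg arg) {
  //     double local = 0.0;
  //     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < arg.size; i += gridDim.x * blockDim.x)
  //       local += arg.v[i] * arg.v[i];
  //     reduce2d<block_size, 1>(arg, local); // final result lands in arg.result_d[0]
  //   }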


  __shared__ volatile bool isLastWarpDone[16];

#if __COMPUTE_CAPABILITY__ >= 300

  /**
     Reduction of a per-thread value using warp shuffles, with the final
     inter-block accumulation performed by the last block to finish.
  */
  template <typename T>
  __device__ inline void warp_reduce(ReduceArg<T> arg, const T &in, const int idx=0) {

    const int warp_size = 32;
    T aggregate = in;
#pragma unroll
    for (int offset = warp_size/2; offset > 0; offset /= 2) aggregate += __shfl_down(aggregate, offset);

    if (threadIdx.x == 0) {
      arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[idx], gridDim.x);

      // determine if last block
      if (threadIdx.y == 0) isLastBlockDone = (value == (gridDim.x-1));
    }

    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      unsigned int i = threadIdx.x;
      T sum;
      zero(sum);
      while (i<gridDim.x) {
        sum += arg.partial[idx*gridDim.x + i];
        i += warp_size;
      }

#pragma unroll
      for (int offset = warp_size/2; offset > 0; offset /= 2) sum += __shfl_down(sum, offset);

      // write out the final reduced value
      if (threadIdx.x == 0) {
        arg.result_d[idx] = sum;
        count[idx] = 0; // set to zero for next time
      }
    }
  }
#endif // __COMPUTE_CAPABILITY__ >= 300

  /**
     Functor that performs a component-wise sum of two vector_type values.
  */
  template <typename T>
  struct reduce_vector {
    __device__ __host__ inline T operator()(const T &a, const T &b) {
      T sum;
      for (int i=0; i<sum.size(); i++) sum[i] = a[i] + b[i];
      return sum;
    }
  };

  /**
     Reduction over the x dimension of the grid that produces a separate
     result for each threadIdx.y index: the per-row values are first
     packed into a vector of length block_size_y and then reduced as a
     single vector quantity.
  */
  template <int block_size_x, int block_size_y, typename T>
  __device__ inline void reduceRow(ReduceArg<T> arg, const T &in) {

    typedef vector_type<T,block_size_y> vector;
    typedef cub::BlockReduce<vector, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
    constexpr int n_word = sizeof(T) / sizeof(int);

    __shared__ union {
      typename BlockReduce::TempStorage cub;
      int exchange[n_word*block_size_x*block_size_y];
    } shared;

    // first move all data at y>0 to the y=0 slice and pack it in a vector of length block_size_y
    if (threadIdx.y > 0) {
      for (int i=0; i<n_word; i++)
        shared.exchange[(i * block_size_y + threadIdx.y)*block_size_x + threadIdx.x] = reinterpret_cast<const int*>(&in)[i];
    }

    __syncthreads();

    vector data;

    if (threadIdx.y == 0) {
      data[0] = in;
      for (int y=1; y<block_size_y; y++)
        for (int i=0; i<n_word; i++)
          reinterpret_cast<int*>(&data[y])[i] = shared.exchange[(i * block_size_y + y)*block_size_x + threadIdx.x];
    }

    __syncthreads();

    reduce_vector<vector> reducer;

    vector aggregate = BlockReduce(shared.cub).Reduce(data, reducer, block_size_x);

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      reinterpret_cast<vector*>(arg.partial)[blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[0], gridDim.x);

      // determine if last block
      isLastBlockDone = (value == (gridDim.x-1));
    }

    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      vector sum;
      if (threadIdx.y == 0) { // only use x-row to do final reduction since we've only allocated space for this
        unsigned int i = threadIdx.x;
        while (i < gridDim.x) {
          sum += reinterpret_cast<vector*>(arg.partial)[i];
          i += block_size_x;
        }
      }

      sum = BlockReduce(shared.cub).Reduce(sum, reducer, block_size_x);

      // write out the final reduced value
      if (threadIdx.y*block_size_x + threadIdx.x == 0) {
        reinterpret_cast<vector*>(arg.result_d)[0] = sum;
        count[0] = 0; // set to zero for next time
      }
    }
  }
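
  // Illustrative sketch (hypothetical names): reduceRow is intended for kernels
  // launched with a (block_size_x, block_size_y) thread block where each
  // threadIdx.y row accumulates an independent quantity; after the reduction the
  // first block_size_y entries of the result buffer hold one value per row.
  //
  //   template <int block_size_x, int block_size_y, typename Arg>
  //   __global__ void rowReduceKernel(Arg arg) {
  //     double local = rowContribution(arg, threadIdx.y); // hypothetical helper
  //     reduceRow<block_size_x, block_size_y>(arg, local);
  //   }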

} // namespace quda