QUDA  0.9.0
cub_helper.cuh
Go to the documentation of this file.
1 #pragma once
2 #include <float_vector.h>
3 
4 using namespace quda;
5 #include <cub/cub.cuh>
6 
7 #if __COMPUTE_CAPABILITY__ >= 300
8 #include <generics/shfl.h>
9 #endif
10 
19 namespace quda {
20 
24  template <typename T>
25  struct Summ {
26  __host__ __device__ __forceinline__ T operator() (const T &a, const T &b){
27  return a + b;
28  }
29  };
30 
34  template <>
35  struct Summ<double2>{
36  __host__ __device__ __forceinline__ double2 operator() (const double2 &a, const double2 &b){
37  return make_double2(a.x + b.x, a.y + b.y);
38  }
39  };
40 
44  template <>
45  struct Summ<double3>{
46  __host__ __device__ __forceinline__ double3 operator() (const double3 &a, const double3 &b){
47  return make_double3(a.x + b.x, a.y + b.y, a.z + b.z);
48  }
49  };
50 
54  template <>
55  struct Summ<double4>{
56  __host__ __device__ __forceinline__ double4 operator() (const double4 &a, const double4 &b){
57  return make_double4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
58  }
59  };
60 
61 
65  template <typename scalar, int n>
66  struct vector_type {
68  __device__ __host__ inline scalar& operator[](int i) { return data[i]; }
69  __device__ __host__ inline const scalar& operator[](int i) const { return data[i]; }
70  __device__ __host__ inline static constexpr int size() { return n; }
71  __device__ __host__ inline void operator+=(const vector_type &a) {
72 #pragma unroll
73  for (int i=0; i<n; i++) data[i] += a[i];
74  }
75  __device__ __host__ vector_type() {
76 #pragma unroll
77  for (int i=0; i<n; i++) zero(data[i]);
78  }
79  };
80 
81  template<typename scalar, int n>
82  __device__ __host__ inline void zero(vector_type<scalar,n> &v) {
83 #pragma unroll
84  for (int i=0; i<n; i++) zero(v.data[i]);
85  }
86 
87  template<typename scalar, int n>
88  __device__ __host__ inline vector_type<scalar,n> operator+(const vector_type<scalar,n> &a, const vector_type<scalar,n> &b) {
90 #pragma unroll
91  for (int i=0; i<n; i++) c[i] = a[i] + b[i];
92  return c;
93  }
94 
95 
96  template <typename T>
97  struct ReduceArg {
98  T *partial;
102  partial(static_cast<T*>(blas::getDeviceReduceBuffer())),
103  result_d(static_cast<T*>(blas::getMappedHostReduceBuffer())),
104  result_h(static_cast<T*>(blas::getHostReduceBuffer()))
105  {
106  // write reduction to GPU memory if asynchronous
108  }
109 
110  };
111 
#ifdef QUAD_SUM
  // Zero-initialization overloads for the double-double (quad-precision
  // emulation) types, so they can be used as the scalar of vector_type
  // and as the accumulator T in the reductions below.
  // NOTE(review): assumes doubledouble stores its value pair in x.a.{x,y} —
  // the type is defined elsewhere; confirm against its declaration.
  __device__ __host__ inline void zero(doubledouble &x) { x.a.x = 0.0; x.a.y = 0.0; }
  __device__ __host__ inline void zero(doubledouble2 &x) { zero(x.x); zero(x.y); }
  __device__ __host__ inline void zero(doubledouble3 &x) { zero(x.x); zero(x.y); zero(x.z); }
#endif
117 
  // Global counter of completed thread blocks, one slot per concurrent
  // reduction; zero-initialized, and reset to zero by the last block of
  // each reduction so the next launch starts clean.
  __device__ unsigned int count[QUDA_MAX_MULTI_REDUCE] = { };
  // Block-wide flag: set by a single thread after atomically incrementing
  // count[], then read by all threads after the subsequent __syncthreads()
  // to decide whether this block performs the final inter-block reduction.
  __shared__ bool isLastBlockDone;
120 
  /**
     @brief Two-dimensional block/grid reduction of @p in over all
     threads and all blocks.  Each block first reduces its own values
     with cub::BlockReduce; the last block to finish then sums the
     per-block partials from arg.partial and writes the final value to
     arg.result_d[idx] ("last block" pattern).
     @param arg Holds the partial and result buffers
     @param in  This thread's contribution to the reduction
     @param idx Which of the concurrent reductions this call belongs to
     (indexes count[] and strides the partial buffer)
     Preconditions: launched with blockDim = (block_size_x, block_size_y);
     count[idx] must be zero on entry (it is reset below on exit).
  */
  template <int block_size_x, int block_size_y, typename T>
  __device__ inline void reduce2d(ReduceArg<T> arg, const T &in, const int idx=0) {

    typedef cub::BlockReduce<T, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
    __shared__ typename BlockReduce::TempStorage cub_tmp;

    // intra-block reduction; result is valid in thread (0,0) only
    T aggregate = BlockReduce(cub_tmp).Sum(in);

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[idx], gridDim.x);

      // determine if last block
      isLastBlockDone = (value == (gridDim.x-1));
    }

    // makes isLastBlockDone visible to the whole block (and separates the
    // two uses of cub_tmp, as CUB requires between reuses of TempStorage)
    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      // each thread strides over the per-block partials of reduction idx
      unsigned int i = threadIdx.y*block_size_x + threadIdx.x;
      T sum;
      zero(sum);
      while (i<gridDim.x) {
        sum += arg.partial[idx*gridDim.x + i];
        i += block_size_x*block_size_y;
      }

      // second intra-block reduction over the gathered partials
      sum = BlockReduce(cub_tmp).Sum(sum);

      // write out the final reduced value
      if (threadIdx.y*block_size_x + threadIdx.x == 0) {
        arg.result_d[idx] = sum;
        count[idx] = 0; // set to zero for next time
      }
    }
  }
161 
162  template <int block_size, typename T>
163  __device__ inline void reduce(ReduceArg<T> arg, const T &in, const int idx=0) { reduce2d<block_size, 1, T>(arg, in, idx); }
164 
165 
  // Per-warp completion flags (one per warp, up to 16 warps/block).
  // NOTE(review): not referenced by any code visible in this file —
  // presumably used by a warp-granularity variant elsewhere; confirm
  // before removing.
  __shared__ volatile bool isLastWarpDone[16];
167 
#if __COMPUTE_CAPABILITY__ >= 300

  /**
     @brief Warp-granularity grid reduction: each warp (one warp per
     block row in x) reduces its 32 values with shuffle instructions,
     writes its partial to arg.partial, and the last block to finish
     performs the final inter-block sum ("last block" pattern, as in
     reduce2d above).
     @param arg Holds the partial and result buffers
     @param in  This thread's contribution to the reduction
     @param idx Which of the concurrent reductions this call belongs to
     Preconditions: blockDim.x == 32 (full warp in x).
     NOTE(review): uses the legacy mask-less __shfl_down (via
     generics/shfl.h for non-primitive T); this intrinsic is removed on
     Volta+ and would need __shfl_down_sync with an explicit mask there.
  */
  template <typename T>
  __device__ inline void warp_reduce(ReduceArg<T> arg, const T &in, const int idx=0) {

    const int warp_size = 32;
    T aggregate = in;
    // butterfly shuffle reduction; result valid in lane 0 only
#pragma unroll
    for (int offset = warp_size/2; offset > 0; offset /= 2) aggregate += __shfl_down(aggregate, offset);

    if (threadIdx.x == 0) {
      arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[idx], gridDim.x);

      // determine if last warp
      // NOTE(review): only the y==0 warp's lane 0 sets the shared flag —
      // presumably idx differs per warp row so each row's partial slot is
      // distinct; confirm against the callers.
      if (threadIdx.y == 0) isLastBlockDone = (value == (gridDim.x-1));
    }

    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      // each lane strides over the per-block partials of reduction idx
      unsigned int i = threadIdx.x;
      T sum;
      zero(sum);
      while (i<gridDim.x) {
        sum += arg.partial[idx*gridDim.x + i];
        i += warp_size;
      }

      // second shuffle reduction over the gathered partials
#pragma unroll
      for (int offset = warp_size/2; offset > 0; offset /= 2) sum += __shfl_down(sum, offset);

      // write out the final reduced value
      if (threadIdx.x == 0) {
        arg.result_d[idx] = sum;
        count[idx] = 0; // set to zero for next time
      }
    }
  }
#endif // __COMPUTE_CAPABILITY__ >= 300
219 
223  template <typename T>
224  struct reduce_vector {
225  __device__ __host__ inline T operator()(const T &a, const T &b) {
226  T sum;
227  for (int i=0; i<sum.size(); i++) sum[i] = a[i] + b[i];
228  return sum;
229  }
230  };
231 
  /**
     @brief Grid reduction over the x dimension only: each of the
     block_size_y rows produces its own independent result.  The block
     first packs all rows' values into per-thread vectors of length
     block_size_y on the y==0 slice (exchanged word-by-word through
     shared memory), reduces those vectors with cub::BlockReduce, and
     the last block to finish combines the per-block vector partials
     and writes the block_size_y results to arg.result_d ("last block"
     pattern, counter slot 0).
     @param arg Holds the partial and result buffers (each sized for
     gridDim.x vectors / block_size_y results respectively)
     @param in  This thread's contribution to its row's reduction
     Preconditions: launched with blockDim = (block_size_x, block_size_y);
     count[0] must be zero on entry (reset below on exit); sizeof(T)
     must be a multiple of sizeof(int) for the word-wise exchange.
  */
  template <int block_size_x, int block_size_y, typename T>
  __device__ inline void reduceRow(ReduceArg<T> arg, const T &in) {

    typedef vector_type<T,block_size_y> vector;
    typedef cub::BlockReduce<vector, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
    // number of 32-bit words per T, for the shared-memory exchange
    constexpr int n_word = sizeof(T) / sizeof(int);

    // cub scratch and the exchange buffer share storage; the
    // __syncthreads() calls below separate their uses
    __shared__ union {
      typename BlockReduce::TempStorage cub;
      int exchange[n_word*block_size_x*block_size_y];
    } shared;

    // first move all data at y>0 to y=0 slice and pack in a vector of length block_size_y
    if (threadIdx.y > 0) {
      for (int i=0; i<n_word; i++)
        shared.exchange[(i * block_size_y + threadIdx.y)*block_size_x + threadIdx.x] = reinterpret_cast<const int*>(&in)[i];
    }

    __syncthreads();

    vector data; // zero-initialized by the vector_type constructor

    if (threadIdx.y == 0) {
      data[0] = in; // this thread's own row needs no exchange
      for (int y=1; y<block_size_y; y++)
        for (int i=0; i<n_word; i++)
          reinterpret_cast<int*>(&data[y])[i] = shared.exchange[(i * block_size_y + y)*block_size_x + threadIdx.x];
    }

    // exchange reads done; shared storage may now be reused by cub
    __syncthreads();

    reduce_vector<vector> reducer;

    // intra-block reduction over the block_size_x valid threads;
    // result valid in thread (0,0) only
    vector aggregate = BlockReduce(shared.cub).Reduce(data, reducer, block_size_x);

    if (threadIdx.x == 0 && threadIdx.y == 0) {
      reinterpret_cast<vector*>(arg.partial)[blockIdx.x] = aggregate;
      __threadfence(); // flush result

      // increment global block counter
      unsigned int value = atomicInc(&count[0], gridDim.x);

      // determine if last block
      isLastBlockDone = (value == (gridDim.x-1));
    }

    __syncthreads();

    // finish the reduction if last block
    if (isLastBlockDone) {
      vector sum; // zero-initialized; y>0 threads contribute zero below
      if (threadIdx.y == 0) { // only use x-row to do final reduction since we've only allocated space for this
        unsigned int i = threadIdx.x;
        while (i < gridDim.x) {
          sum += reinterpret_cast<vector*>(arg.partial)[i];
          i += block_size_x;
        }
      }

      sum = BlockReduce(shared.cub).Reduce(sum, reducer, block_size_x);

      // write out the final reduced value
      if (threadIdx.y*block_size_x + threadIdx.x == 0) {
        reinterpret_cast<vector*>(arg.result_d)[0] = sum;
        count[0] = 0; // set to zero for next time
      }
    }
  }
300 
301 } // namespace quda
void * getHostReduceBuffer()
Definition: reduce_quda.cu:75
bool commAsyncReduction()
__device__ static __host__ constexpr int size()
Definition: cub_helper.cuh:70
__device__ void reduce2d(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:122
#define QUDA_MAX_MULTI_REDUCE
Maximum number of simultaneous reductions that can take place. This number may be increased if needed...
__device__ __host__ scalar & operator[](int i)
Definition: cub_helper.cuh:68
void * getMappedHostReduceBuffer()
Definition: reduce_quda.cu:74
__shared__ volatile bool isLastWarpDone[16]
Definition: cub_helper.cuh:166
__device__ void reduceRow(ReduceArg< T > arg, const T &in)
Definition: cub_helper.cuh:233
size_t size_t offset
#define b
__shared__ bool isLastBlockDone
Definition: cub_helper.cuh:119
__device__ __host__ void operator+=(const vector_type &a)
Definition: cub_helper.cuh:71
void exchange(void **ghost, void **sendbuf, int nFace=1) const
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
Definition: cub_helper.cuh:26
__host__ __device__ void sum(double &a, double &b)
__device__ __host__ const scalar & operator[](int i) const
Definition: cub_helper.cuh:69
cpuColorSpinorField * in
__device__ __host__ vector_type()
Definition: cub_helper.cuh:75
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
Definition: cub_helper.cuh:163
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
Definition: color_spinor.h:885
void * getDeviceReduceBuffer()
Definition: reduce_quda.cu:73
__device__ __host__ T operator()(const T &a, const T &b)
Definition: cub_helper.cuh:225
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
const void * c
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
#define a
__syncthreads()
__device__ __host__ void zero(vector_type< scalar, n > &v)
Definition: cub_helper.cuh:82