#if defined(__clang__) && defined(__CUDA__) && CUDA_VERSION >= 9000
#define CUB_USE_COOPERATIVE_GROUPS
#endif

#include <cub/block/block_reduce.cuh>

#if __COMPUTE_CAPABILITY__ >= 300
#include <generics/shfl.h>
#endif

template <typename scalar, int n>
struct vector_type {
  scalar data[n];
  __device__ __host__ inline scalar& operator[](int i) { return data[i]; }
  __device__ __host__ inline const scalar& operator[](int i) const { return data[i]; }
  __device__ __host__ inline static constexpr int size() { return n; }
  __device__ __host__ inline void operator+=(const vector_type &a) {
    for (int i=0; i<n; i++) data[i] += a[i];
  }
  __device__ __host__ inline void operator=(const vector_type &a) {
    for (int i=0; i<n; i++) data[i] = a[i];
  }
  __device__ __host__ vector_type() {
    for (int i=0; i<n; i++) zero(data[i]);
  }
};
template <typename scalar, int n>
__device__ __host__ inline void zero(vector_type<scalar, n> &v) {
  for (int i=0; i<n; i++) zero(v.data[i]);
}
template <typename scalar, int n>
__device__ __host__ inline vector_type<scalar, n> operator+(const vector_type<scalar, n> &a, const vector_type<scalar, n> &b) {
  vector_type<scalar, n> c;
  for (int i=0; i<n; i++) c[i] = a[i] + b[i];
  return c;
}
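// Usage sketch (illustrative only; combine_example is not part of the original
// header): shows how the zero-initializing constructor, operator[], operator+= and
// operator+ above compose when building a fixed-size accumulator.
template <typename scalar, int n>
__device__ __host__ inline vector_type<scalar, n>
combine_example(const vector_type<scalar, n> &a, const vector_type<scalar, n> &b)
{
  vector_type<scalar, n> c; // every component is zeroed by the constructor
  c += a;                   // element-wise accumulation
  c = c + b;                // element-wise addition followed by copy assignment
  return c;
}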
// global block counters, one per concurrent reduction; zero-initialized
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE] = { };

template <int block_size_x, int block_size_y, typename T, bool do_sum=true, typename Reducer=cub::Sum>
__device__ inline void reduce2d(ReduceArg<T> arg, const T &in, const int idx=0) {

  typedef cub::BlockReduce<T, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
  __shared__ typename BlockReduce::TempStorage cub_tmp;

  Reducer r;
  T aggregate = (do_sum ? BlockReduce(cub_tmp).Sum(in) : BlockReduce(cub_tmp).Reduce(in, r));

  __shared__ bool isLastBlockDone;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
    __threadfence(); // flush the partial result before signalling completion

    // increment the global block counter for this reduction
    unsigned int value = atomicInc(&count[idx], gridDim.x);

    // determine whether this block is the last to finish
    isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // the last block to finish reduces over all per-block partial results
  if (isLastBlockDone) {
    unsigned int i = threadIdx.y*block_size_x + threadIdx.x;
    T sum;
    zero(sum);
    while (i < gridDim.x) {
      sum = r(sum, arg.partial[idx*gridDim.x + i]);
      i += block_size_x*block_size_y;
    }

    sum = (do_sum ? BlockReduce(cub_tmp).Sum(sum) : BlockReduce(cub_tmp).Reduce(sum, r));

    // write out the final reduced value
    if (threadIdx.y*block_size_x + threadIdx.x == 0) {
      arg.result_d[idx] = sum;
      count[idx] = 0; // reset the counter for the next reduction
    }
  }
}
template <int block_size, typename T, bool do_sum = true, typename Reducer = cub::Sum>
__device__ inline void reduce(ReduceArg<T> arg, const T &in, const int idx=0) {
  reduce2d<block_size, 1, T, do_sum, Reducer>(arg, in, idx);
}
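// Usage sketch (illustrative only): a hypothetical norm-squared kernel built on
// reduce(). Arg is assumed to derive from ReduceArg<double> and to carry a device
// pointer v and an element count n; neither is defined in this header. The kernel
// is launched with block_size threads per block (blockDim.y == 1).
template <int block_size, typename Arg>
__global__ void norm2_example_kernel(Arg arg)
{
  double x = 0.0; // per-thread partial sum of squares
  for (unsigned int i = blockIdx.x*block_size + threadIdx.x; i < arg.n; i += gridDim.x*block_size)
    x += arg.v[i]*arg.v[i];
  // block-level cub reduction, then the last block writes the grid total to arg.result_d[0]
  reduce<block_size, double>(arg, x);
}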
#if __COMPUTE_CAPABILITY__ >= 300

template <typename T>
__device__ inline void warp_reduce(ReduceArg<T> arg, const T &in, const int idx=0) {

  const int warp_size = 32;
  T aggregate = in;
  for (int offset = warp_size/2; offset > 0; offset /= 2) aggregate += __shfl_down(aggregate, offset);

  __shared__ bool isLastBlockDone;
  if (threadIdx.x == 0) {
    arg.partial[idx*gridDim.x + blockIdx.x] = aggregate;
    __threadfence(); // flush the partial result before signalling completion

    // increment the global block counter for this reduction
    unsigned int value = atomicInc(&count[idx], gridDim.x);

    // determine whether this block is the last to finish
    if (threadIdx.y == 0) isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // the last block to finish reduces over all per-block partial results
  if (isLastBlockDone) {
    unsigned int i = threadIdx.x;
    T sum;
    zero(sum);
    while (i < gridDim.x) {
      sum += arg.partial[idx*gridDim.x + i];
      i += warp_size;
    }

    for (int offset = warp_size/2; offset > 0; offset /= 2) sum += __shfl_down(sum, offset);

    // write out the final reduced value
    if (threadIdx.x == 0) {
      arg.result_d[idx] = sum;
      count[idx] = 0; // reset the counter for the next reduction
    }
  }
}
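// Usage sketch (illustrative only): warp_reduce expects one warp per block, so the
// hypothetical kernel below is launched with 32 threads per block. Arg is again
// assumed to derive from ReduceArg<double> and to carry v and n.
template <typename Arg>
__global__ void warp_sum_example_kernel(Arg arg)
{
  double x = 0.0; // lane i accumulates elements i, i+32, i+64, ...
  for (unsigned int i = blockIdx.x*32 + threadIdx.x; i < arg.n; i += gridDim.x*32)
    x += arg.v[i];
  // warp-level shuffle reduction, then the last block writes the grid total to arg.result_d[0]
  warp_reduce<double>(arg, x);
}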
#endif // __COMPUTE_CAPABILITY__ >= 300

template <typename T>
struct reduce_vector {
  __device__ __host__ inline T operator()(const T &a, const T &b) {
    T sum;
    for (int i=0; i<sum.size(); i++) sum[i] = a[i] + b[i];
    return sum;
  }
};
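// Usage sketch (illustrative only): a multi-component sum in a single pass, passing
// reduce_vector as the Reducer of reduce2d with do_sum=false. Arg is assumed to
// derive from ReduceArg<vector_type<double,2> > and to carry device pointers a, b
// and an element count n.
template <int block_size, typename Arg>
__global__ void multi_sum_example_kernel(Arg arg)
{
  typedef vector_type<double, 2> vec2;
  vec2 x; // both components zero-initialized by the constructor
  for (unsigned int i = blockIdx.x*block_size + threadIdx.x; i < arg.n; i += gridDim.x*block_size) {
    x[0] += arg.a[i];
    x[1] += arg.b[i];
  }
  // the final vec2 total is written to arg.result_d[0]
  reduce2d<block_size, 1, vec2, false, reduce_vector<vec2> >(arg, x);
}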
template <int block_size_x, int block_size_y, typename T>
__device__ inline void reduceRow(ReduceArg<T> arg, const T &in) {

  typedef vector_type<T, block_size_y> vector;
  typedef cub::BlockReduce<vector, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
  constexpr int n_word = sizeof(T) / sizeof(int);

  // the cub scratch space shares storage with the buffer used to gather the y slices
  __shared__ union {
    typename BlockReduce::TempStorage cub;
    int exchange[n_word*block_size_x*block_size_y];
  } shared;

  // first move all data at y>0 to the y=0 slice
  if (threadIdx.y > 0) {
    for (int i=0; i<n_word; i++)
      shared.exchange[(i * block_size_y + threadIdx.y)*block_size_x + threadIdx.x] = reinterpret_cast<const int*>(&in)[i];
  }

  __syncthreads();

  vector data;

  if (threadIdx.y == 0) {
    data[0] = in;
    for (int y=1; y<block_size_y; y++)
      for (int i=0; i<n_word; i++)
        reinterpret_cast<int*>(&data[y])[i] = shared.exchange[(i * block_size_y + y)*block_size_x + threadIdx.x];
  }

  __syncthreads();

  reduce_vector<vector> reducer;
  vector aggregate = BlockReduce(shared.cub).Reduce(data, reducer, block_size_x);

  __shared__ bool isLastBlockDone;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    reinterpret_cast<vector*>(arg.partial)[blockIdx.x] = aggregate;
    __threadfence(); // flush the partial result before signalling completion

    // increment the global block counter
    unsigned int value = atomicInc(&count[0], gridDim.x);

    // determine whether this block is the last to finish
    isLastBlockDone = (value == (gridDim.x-1));
  }

  __syncthreads();

  // the last block to finish reduces over all per-block partial results
  if (isLastBlockDone) {
    vector sum;
    if (threadIdx.y == 0) { // only the y=0 row participates in the final gather
      unsigned int i = threadIdx.x;
      while (i < gridDim.x) {
        sum += reinterpret_cast<vector*>(arg.partial)[i];
        i += block_size_x;
      }
    }

    sum = BlockReduce(shared.cub).Reduce(sum, reducer, block_size_x);

    // write out the final reduced value
    if (threadIdx.y*block_size_x + threadIdx.x == 0) {
      reinterpret_cast<vector*>(arg.result_d)[0] = sum;
      count[0] = 0; // reset the counter for the next reduction
    }
  }
}
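// Usage sketch (illustrative only): reduceRow keeps the block_size_y rows separate,
// e.g. to accumulate block_size_y independent sums in one kernel. The hypothetical
// Arg is assumed to derive from ReduceArg<double> and to carry an array of device
// pointers v[block_size_y] and an element count n; the kernel must be launched with
// a (block_size_x, block_size_y) thread block.
template <int block_size_x, int block_size_y, typename Arg>
__global__ void row_sum_example_kernel(Arg arg)
{
  const int row = threadIdx.y; // each y index accumulates an independent quantity
  double x = 0.0;
  for (unsigned int i = blockIdx.x*block_size_x + threadIdx.x; i < arg.n; i += gridDim.x*block_size_x)
    x += arg.v[row][i];
  // the result is a vector_type<double, block_size_y> written to arg.result_d
  reduceRow<block_size_x, block_size_y, double>(arg, x);
}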