7 #if __COMPUTE_CAPABILITY__ >= 300 8 #include <generics/shfl.h> 26 __host__ __device__ __forceinline__ T
operator() (
const T &
a,
const T &
b){
36 __host__ __device__ __forceinline__ double2
operator() (
const double2 &
a,
const double2 &
b){
37 return make_double2(
a.x +
b.x,
a.y +
b.y);
46 __host__ __device__ __forceinline__ double3
operator() (
const double3 &
a,
const double3 &
b){
47 return make_double3(
a.x +
b.x,
a.y +
b.y,
a.z +
b.z);
56 __host__ __device__ __forceinline__ double4
operator() (
const double4 &
a,
const double4 &
b){
57 return make_double4(
a.x +
b.x,
a.y +
b.y,
a.z +
b.z,
a.w +
b.w);
65 template <
typename scalar,
int n>
70 __device__ __host__
inline static constexpr
int size() {
return n; }
81 template<
typename scalar,
int n>
87 template<
typename scalar,
int n>
121 template <
int block_size_x,
int block_size_y,
typename T>
124 typedef cub::BlockReduce<T, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
125 __shared__
typename BlockReduce::TempStorage cub_tmp;
127 T aggregate = BlockReduce(cub_tmp).Sum(
in);
129 if (threadIdx.x == 0 && threadIdx.y == 0) {
144 unsigned int i = threadIdx.y*block_size_x + threadIdx.x;
149 i += block_size_x*block_size_y;
152 sum = BlockReduce(cub_tmp).Sum(
sum);
155 if (threadIdx.y*block_size_x + threadIdx.x == 0) {
162 template <
int block_size,
typename T>
168 #if __COMPUTE_CAPABILITY__ >= 300 177 template <
typename T>
180 const int warp_size = 32;
185 if (threadIdx.x == 0) {
200 unsigned int i = threadIdx.x;
212 if (threadIdx.x == 0) {
218 #endif // __COMPUTE_CAPABILITY__ >= 300 223 template <
typename T>
232 template <
int block_size_x,
int block_size_y,
typename T>
236 typedef cub::BlockReduce<vector, block_size_x, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_size_y> BlockReduce;
237 constexpr
int n_word =
sizeof(T) /
sizeof(
int);
240 typename BlockReduce::TempStorage cub;
241 int exchange[n_word*block_size_x*block_size_y];
245 if (threadIdx.y > 0) {
246 for (
int i=0;
i<n_word;
i++)
247 shared.exchange[(
i * block_size_y + threadIdx.y)*block_size_x + threadIdx.x] =
reinterpret_cast<const int*
>(&
in)[
i];
254 if (threadIdx.y == 0) {
256 for (
int y=1;
y<block_size_y;
y++)
257 for (
int i=0;
i<n_word;
i++)
258 reinterpret_cast<int*>(&data[
y])[
i] = shared.
exchange[(
i * block_size_y +
y)*block_size_x + threadIdx.x];
265 vector aggregate = BlockReduce(shared.cub).Reduce(data, reducer, block_size_x);
267 if (threadIdx.x == 0 && threadIdx.y == 0) {
268 reinterpret_cast<vector*
>(
arg.partial)[blockIdx.x] = aggregate;
283 if (threadIdx.y == 0) {
284 unsigned int i = threadIdx.x;
286 sum +=
reinterpret_cast<vector*
>(
arg.partial)[
i];
291 sum = BlockReduce(shared.cub).Reduce(
sum, reducer, block_size_x);
294 if (threadIdx.y*block_size_x + threadIdx.x == 0) {
295 reinterpret_cast<vector*
>(
arg.result_d)[0] =
sum;
void * getHostReduceBuffer()
bool commAsyncReduction()
__device__ static __host__ constexpr int size()
__device__ void reduce2d(ReduceArg< T > arg, const T &in, const int idx=0)
#define QUDA_MAX_MULTI_REDUCE
Maximum number of simultaneous reductions that can take place. This number may be increased if needed...
__device__ __host__ scalar & operator[](int i)
void * getMappedHostReduceBuffer()
__shared__ volatile bool isLastWarpDone[16]
__device__ void reduceRow(ReduceArg< T > arg, const T &in)
__shared__ bool isLastBlockDone
__device__ __host__ void operator+=(const vector_type &a)
void exchange(void **ghost, void **sendbuf, int nFace=1) const
__host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
__host__ __device__ void sum(double &a, double &b)
__device__ __host__ const scalar & operator[](int i) const
__device__ __host__ vector_type()
__device__ void reduce(ReduceArg< T > arg, const T &in, const int idx=0)
__device__ __host__ ColorSpinor< Float, Nc, Ns > operator+(const ColorSpinor< Float, Nc, Ns > &x, const ColorSpinor< Float, Nc, Ns > &y)
ColorSpinor addition operator.
void * getDeviceReduceBuffer()
__device__ __host__ T operator()(const T &a, const T &b)
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
__device__ __host__ void zero(vector_type< scalar, n > &v)