#include <reduce_helper.h>
|
| ReduceArg (int n_reduce=1) |
|
template<typename host_t , typename device_t = host_t> |
void | complete (std::vector< host_t > &result, const qudaStream_t stream=0, bool reset=false) |
| Finalize the reduction, returning the computed reduction into result. With heterogeneous atomics this means we poll the atomics until their value differs from the init_value. The alternate legacy path posts an event after the kernel and then polls on completion of the event. More...
|
|
template<typename host_t , typename device_t = host_t> |
void | complete (host_t &result, const qudaStream_t stream=0, bool reset=false) |
| Overload providing a simple reference interface. More...
|
|
template<int block_size_x, int block_size_y, bool do_sum = true, typename Reducer = cub::Sum> |
__device__ void | reduce2d (const T &in, const int idx=0) |
| Generic reduction function that reduces block-distributed data "in" per thread to a single value. This is the legacy variant which require explicit host-device synchronization to signal the completion of the reduction to the host. More...
|
|
template<int block_size, bool do_sum = true, typename Reducer = cub::Sum> |
__device__ void | reduce (const T &in, const int idx=0) |
|
template<typename T>
struct quda::ReduceArg< T >
Definition at line 58 of file reduce_helper.h.
◆ ReduceArg()
◆ complete() [1/2]
template<typename T >
template<typename host_t , typename device_t = host_t>
Overload providing a simple reference interface.
- Parameters
-
[out] | result | The reduction result is copied here |
[in] | stream | The stream on which we the reduction is being done |
[in] | reset | Whether to reset the atomics after the reduction has completed; required if the same ReduceArg instance will be used for multiple reductions. |
Definition at line 169 of file reduce_helper.h.
◆ complete() [2/2]
template<typename T >
template<typename host_t , typename device_t = host_t>
Finalize the reduction, returning the computed reduction into result. With heterogeneous atomics this means we poll the atomics until their value differs from the init_value. The alternate legacy path posts an event after the kernel and then polls on completion of the event.
- Parameters
-
[out] | result | The reduction result is copied here |
[in] | stream | The stream on which we the reduction is being done |
[in] | reset | Whether to reset the atomics after the reduction has completed; required if the same aReducearg instance will be used for multiple reductions. |
Definition at line 125 of file reduce_helper.h.
◆ reduce()
template<typename T >
template<int block_size, bool do_sum = true, typename Reducer = cub::Sum>
__device__ void quda::ReduceArg< T >::reduce |
( |
const T & |
in, |
|
|
const int |
idx = 0 |
|
) |
| |
|
inline |
◆ reduce2d()
template<typename T >
template<int block_size_x, int block_size_y, bool do_sum = true, typename Reducer = cub::Sum>
__device__ void quda::ReduceArg< T >::reduce2d |
( |
const T & |
in, |
|
|
const int |
idx = 0 |
|
) |
| |
|
inline |
Generic reduction function that reduces block-distributed data "in" per thread to a single value. This is the legacy variant which require explicit host-device synchronization to signal the completion of the reduction to the host.
- Parameters
-
in | The input per-thread data to be reduced |
idx | In the case of multiple reductions, idx identifies which reduction this thread block corresponds to. Typically idx will be constant along constant blockIdx.y and blockIdx.z. |
Definition at line 257 of file reduce_helper.h.
◆ launch_error
The documentation for this struct was generated from the following file: