template <typename T> struct plus {
  __device__ __host__ T operator()(T a, T b) { return a + b; }
};

template <typename T> struct maximum {
  __device__ __host__ T operator()(T a, T b) { return a > b ? a : b; }
};

template <typename T> struct minimum {
  __device__ __host__ T operator()(T a, T b) { return a < b ? a : b; }
};
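
// The reducers above are binary functors. transform_reduce additionally takes
// a unary "transformer" applied to each element before it is folded in. A
// minimal sketch of such a transformer (illustrative name, not from this file):
template <typename T> struct square {
  __device__ __host__ T operator()(T a) { return a * a; }
};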
template <typename reduce_t, typename T, typename count_t, typename transformer, typename reducer>
struct TransformReduceArg : public ReduceArg<reduce_t> {
  // ... launch constants and members (v, n_items, n_batch, init, result, h, r) elided ...

  TransformReduceArg(const std::vector<T *> &v, count_t n_items, transformer h, reduce_t init, reducer r) /* : ... */
  {
    // copy the batch of input pointers into the kernel argument struct
    for (size_t j = 0; j < v.size(); j++) this->v[j] = v[j];
  }
};
// Host (CPU) path: sequential transform-reduce over each batch entry
template <typename Arg> void transform_reduce(Arg &arg)
{
  using reduce_t = decltype(arg.init);

  for (int j = 0; j < arg.n_batch; j++) {
    auto v = arg.v[j];
    reduce_t r_ = arg.init;
    for (decltype(arg.n_items) i = 0; i < arg.n_items; i++) {
      auto v_ = arg.h(v[i]); // apply the transformer
      r_ = arg.r(r_, v_);    // fold with the reducer
    }
    arg.result[j] = r_;
  }
}
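
// For intuition: per batch entry, the host path computes the same value as
// C++17 std::transform_reduce (a reference sketch, not part of QUDA):
#include <numeric>
template <typename Arg> auto host_reference(const Arg &arg, int j)
{
  auto v = arg.v[j];
  return std::transform_reduce(v, v + arg.n_items, arg.init, arg.r, arg.h);
}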
template <typename Arg> __launch_bounds__(Arg::block_size) __global__ void transform_reduce_kernel(Arg arg)
{
  using count_t = decltype(arg.n_items);
  using reduce_t = decltype(arg.init);

  count_t i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y; // batch index (grid.y = v.size(), set in initTuneParam below)
  auto v = arg.v[j];
  reduce_t r_ = arg.init;

  while (i < arg.n_items) { // grid-stride loop over the items
    auto v_ = arg.h(v[i]);
    r_ = arg.r(r_, v_);
    i += blockDim.x * gridDim.x;
  }

  // inter-block reduction provided by ReduceArg, one result per batch entry
  arg.template reduce<Arg::block_size, false, decltype(arg.r)>(r_, j);
}
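
// Illustrative launch of the kernel above (a sketch: the real dispatch goes
// through QUDA's autotuner via tuneLaunch and qudaLaunchKernel; max_blocks is
// a hypothetical cap on the x-dimension, launch_sketch is not a QUDA function):
#include <algorithm>
template <typename Arg> void launch_sketch(Arg &arg, unsigned max_blocks, cudaStream_t stream)
{
  dim3 block(Arg::block_size, 1, 1);
  unsigned gx = std::min<unsigned>((arg.n_items + block.x - 1) / block.x, max_blocks);
  dim3 grid(gx, arg.n_batch, 1); // grid.y indexes the batch, matching blockIdx.y in the kernel
  transform_reduce_kernel<<<grid, block, 0, stream>>>(arg);
}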
template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
class TransformReduce : Tunable
{
  // ...
  std::vector<reduce_t> &result;
  const std::vector<T *> &v;
  // ...

  // no shared-memory tuning: the reduction uses ReduceArg's workspace
  bool tuneSharedBytes() const { return false; }
  unsigned int sharedBytesPerThread() const { return 0; }
  unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }

  void initTuneParam(TuneParam &param) const
  {
    // ...
    param.grid.y = v.size(); // one grid row per batch entry
  }

public:
  TransformReduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
                  transformer &h, reduce_t init, reducer &r) /* : member initializers elided */
  {
    // ...
    strcpy(aux, "batch_size="); // record the batch size in the tuning key
    // ...
  }

  void apply(const qudaStream_t &stream)
  {
    Arg arg(v, n_items, h, init, r);
    // ... run the CPU or GPU path depending on location ...
    for (size_t j = 0; j < result.size(); j++) result[j] = arg.result[j];
  }

  TuneKey tuneKey() const
  {
    // ...
    return TuneKey(count, typeid(*this).name(), aux);
  }

  long long flops() const { return 0; }
  long long bytes() const { return v.size() * n_items * sizeof(T); }
};
template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
void transform_reduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
                      transformer h, reduce_t init, reducer r)
{
  if (result.size() != v.size())
    errorQuda("result %lu and input %lu set sizes do not match", result.size(), v.size());
  TransformReduce<reduce_t, T, I, transformer, reducer> reduce(location, result, v, n_items, h, init, r);
}
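
// Usage sketch for the batched interface (hypothetical device pointers d_a,
// d_b; 'square' is the illustrative transformer sketched near the top):
//
//   std::vector<double> result(2);
//   std::vector<const double *> ptrs = {d_a, d_b};
//   transform_reduce(QUDA_CUDA_FIELD_LOCATION, result, ptrs, n, square<double>(), 0.0, maximum<double>());
//   // result[0] = max of a_i^2, result[1] = max of b_i^2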
template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
reduce_t transform_reduce(QudaFieldLocation location, const T *v, I n_items, transformer h, reduce_t init, reducer r)
{
  std::vector<reduce_t> result = {0.0};
  std::vector<const T *> v_ = {v};
  transform_reduce(location, result, v_, n_items, h, init, r);
  return result[0];
}
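
// e.g. a squared 2-norm via the single-input overload (sketch, using the
// illustrative 'square' functor from above):
//
//   double nrm2 = transform_reduce(QUDA_CUDA_FIELD_LOCATION, d_v, n, square<double>(), 0.0, plus<double>());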
template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
void reduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
            reduce_t init, reducer r)
{
  // ... forwards to transform_reduce with an identity transformer ...
}
template <typename reduce_t, typename T, typename I, typename reducer>
reduce_t reduce(QudaFieldLocation location, const T *v, I n_items, reduce_t init, reducer r)
{
  std::vector<reduce_t> result = {0.0};
  std::vector<const T *> v_ = {v};
  // ... forwards to the batched reduce and returns result[0] ...
}
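
// e.g. a plain sum with no transformer (sketch):
//
//   double total = reduce(QUDA_CUDA_FIELD_LOCATION, d_v, n, 0.0, plus<double>());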