QUDA  v1.1.0
A library for QCD on GPUs
transform_reduce.h
Go to the documentation of this file.
1 #pragma once
2 #include <typeinfo>
3 
4 #include <reduce_helper.h>
5 #include <uint_to_char.h>
6 #include <tune_quda.h>
7 
15 namespace quda
16 {
17 
18  template <typename T> struct plus {
19  __device__ __host__ T operator()(T a, T b) { return a + b; }
20  };
21 
22  template <typename T> struct maximum {
23  __device__ __host__ T operator()(T a, T b) { return a > b ? a : b; }
24  };
25 
26  template <typename T> struct minimum {
27  __device__ __host__ T operator()(T a, T b) { return a < b ? a : b; }
28  };
29 
30  template <typename T> struct identity {
31  __device__ __host__ T operator()(T a) { return a; }
32  };
33 
 /**
    Kernel argument struct for a batched transform-reduce: holds the input
    pointers, the transform and reduction functors, and (on the host path)
    the per-batch results.
  */
 template <typename reduce_t, typename T, typename count_t, typename transformer, typename reducer>
 struct TransformReduceArg : public ReduceArg<reduce_t> {
   static constexpr int block_size = 512; // thread-block size the kernel is compiled for
   static constexpr int n_batch_max = 8;  // fixed capacity of the batch arrays below
   const T *v[n_batch_max];               // input vectors, one pointer per batch element
   // NOTE(review): a count_t n_items member (element count per vector) is referenced
   // by the constructor and kernels but is not visible in this listing — confirm in
   // the full source.
   int n_batch;                  // number of input vectors in use (<= n_batch_max)
   reduce_t init;                // initial value the reduction starts from
   reduce_t result[n_batch_max]; // per-batch reduction results (host path)
   transformer h;                // element-wise transform applied before reduction
   reducer r;                    // binary reduction operator
   TransformReduceArg(const std::vector<T *> &v, count_t n_items, transformer h, reduce_t init, reducer r) :
     ReduceArg<reduce_t>(v.size()),
     n_batch(v.size()),
     init(init),
     h(h),
     r(r)
   {
     // guard against overflowing the fixed-size pointer array
     if (n_batch > n_batch_max) errorQuda("Requested batch %d greater than max supported %d", n_batch, n_batch_max);
     for (size_t j = 0; j < v.size(); j++) this->v[j] = v[j];
   }
 };
57 
58  template <typename Arg> void transform_reduce(Arg &arg)
59  {
60  using count_t = decltype(arg.n_items);
61  using reduce_t = decltype(arg.init);
62 
63  for (int j = 0; j < arg.n_batch; j++) {
64  auto v = arg.v[j];
65  reduce_t r_ = arg.init;
66  for (count_t i = 0; i < arg.n_items; i++) {
67  auto v_ = arg.h(v[i]);
68  r_ = arg.r(r_, v_);
69  }
70  arg.result[j] = r_;
71  }
72  }
73 
74  template <typename Arg> __launch_bounds__(Arg::block_size) __global__ void transform_reduce_kernel(Arg arg)
75  {
76  using count_t = decltype(arg.n_items);
77  using reduce_t = decltype(arg.init);
78 
79  count_t i = blockIdx.x * blockDim.x + threadIdx.x;
80  int j = blockIdx.y;
81  auto v = arg.v[j];
82  reduce_t r_ = arg.init;
83 
84  while (i < arg.n_items) {
85  auto v_ = arg.h(v[i]);
86  r_ = arg.r(r_, v_);
87  i += blockDim.x * gridDim.x;
88  }
89 
90  arg.template reduce<Arg::block_size, false, decltype(arg.r)>(r_, j);
91  }
92 
 /**
    Tunable host-side driver for the batched transform-reduce.
  */
 template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
 // NOTE(review): the class declaration line (presumably
 // "class TransformReduce : Tunable" plus an Arg type alias) is not visible
 // in this listing — the body below references Arg and inherited Tunable
 // members (aux); confirm against the full source.
 {
   QudaFieldLocation location;    // where the reduction runs (device or host)
   std::vector<reduce_t> &result; // output: one reduced value per input vector
   const std::vector<T *> &v;     // input vectors to reduce
   I n_items;                     // number of elements in each input vector
   transformer &h;                // element-wise transform applied before reduction
   reduce_t init;                 // initial value of the reduction
   reducer &r;                    // binary reduction operator

   // shared-memory usage is fixed by the reduction helper, so don't tune it
   bool tuneSharedBytes() const { return false; }
   unsigned int sharedBytesPerThread() const { return 0; }
   unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
   // the kernel is compiled for exactly Arg::block_size threads, so pin the
   // tunable block-size range to that single value
   int blockMin() const { return Arg::block_size; }
   unsigned int maxBlockSize(const TuneParam &param) const { return Arg::block_size; }
110 
111  bool advanceTuneParam(TuneParam &param) const // only do autotuning if we have device fields
112  {
113  return location == QUDA_CUDA_FIELD_LOCATION ? Tunable::advanceTuneParam(param) : false;
114  }
115 
   void initTuneParam(TuneParam &param) const
   {
     // NOTE(review): a call to Tunable::initTuneParam(param) on the preceding
     // line is not visible in this listing — confirm in the full source.
     // one grid row per batch element: batch index maps to blockIdx.y
     param.grid.y = v.size();
   }
121 
 public:
   /**
      @brief Constructor: records the problem description, builds the
      autotuning aux string, and immediately performs the reduction.
      @param location Compute location (QUDA_CUDA_FIELD_LOCATION or QUDA_CPU_FIELD_LOCATION)
      @param result Output vector, one reduced value per input vector
      @param v Input vectors to reduce
      @param n_items Number of elements in each input vector
      @param h Element-wise transform applied before reduction
      @param init Initial value of the reduction
      @param r Binary reduction operator
   */
   TransformReduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
                   transformer &h, reduce_t init, reducer &r) :
     location(location),
     result(result),
     v(v),
     n_items(n_items),
     h(h),
     init(init),
     r(r)
   {
     strcpy(aux, "batch_size="); // aux is inherited tuning metadata
     u32toa(aux + 11, v.size()); // 11 == strlen("batch_size=")
     if (location == QUDA_CPU_FIELD_LOCATION) strcat(aux, ",cpu");
     apply(0); // run the reduction now, on stream 0
   }
138 
   /**
      @brief Execute the reduction on the given stream: launch the device
      kernel (with autotuned launch parameters) or fall back to the host
      implementation, then gather the per-batch results into this->result.
   */
   void apply(const qudaStream_t &stream)
   {
     TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
     Arg arg(v, n_items, h, init, r);

     if (location == QUDA_CUDA_FIELD_LOCATION) {
       arg.launch_error = qudaLaunchKernel(transform_reduce_kernel<Arg>, tp, stream, arg);
       arg.complete(result, stream); // finalize and copy results back to host
     } else {
       // NOTE(review): the host-path computation call (presumably
       // transform_reduce(arg)) on the preceding line is not visible in this
       // listing — confirm in the full source.
       for (size_t j = 0; j < result.size(); j++) result[j] = arg.result[j];
     }
   }
152 
153  TuneKey tuneKey() const
154  {
155  char count[16];
156  u32toa(count, n_items);
157  return TuneKey(count, typeid(*this).name(), aux);
158  }
159 
   long long flops() const { return 0; } // just care about bandwidth
   // every element of every batch vector is read exactly once
   long long bytes() const { return v.size() * n_items * sizeof(T); }
 };
163 
177  template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
178  void transform_reduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
179  transformer h, reduce_t init, reducer r)
180  {
181  if (result.size() != v.size())
182  errorQuda("result %lu and input %lu set sizes do not match", result.size(), v.size());
183  TransformReduce<reduce_t, T, I, transformer, reducer> reduce(location, result, v, n_items, h, init, r);
184  }
185 
199  template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
200  reduce_t transform_reduce(QudaFieldLocation location, const T *v, I n_items, transformer h, reduce_t init, reducer r)
201  {
202  std::vector<reduce_t> result = {0.0};
203  std::vector<const T *> v_ = {v};
204  transform_reduce(location, result, v_, n_items, h, init, r);
205  return result[0];
206  }
207 
220  template <typename reduce_t, typename T, typename I, typename transformer, typename reducer>
221  void reduce(QudaFieldLocation location, std::vector<reduce_t> &result, const std::vector<T *> &v, I n_items,
222  reduce_t init, reducer r)
223  {
224  transform_reduce(location, result, v, n_items, identity<T>(), init, r);
225  }
226 
239  template <typename reduce_t, typename T, typename I, typename reducer>
240  reduce_t reduce(QudaFieldLocation location, const T *v, I n_items, reduce_t init, reducer r)
241  {
242  std::vector<reduce_t> result = {0.0};
243  std::vector<const T *> v_ = {v};
244  transform_reduce(location, result, v_, n_items, identity<T>(), init, r);
245  return result[0];
246  }
247 } // namespace quda
long long bytes() const
long long flops() const
TuneKey tuneKey() const
TransformReduce(QudaFieldLocation location, std::vector< reduce_t > &result, const std::vector< T * > &v, I n_items, transformer &h, reduce_t init, reducer &r)
void apply(const qudaStream_t &stream)
char aux[TuneKey::aux_n]
Definition: tune_quda.h:269
virtual bool advanceTuneParam(TuneParam &param) const
Definition: tune_quda.h:363
virtual void initTuneParam(TuneParam &param) const
Definition: tune_quda.h:332
@ QUDA_CUDA_FIELD_LOCATION
Definition: enum_quda.h:326
@ QUDA_CPU_FIELD_LOCATION
Definition: enum_quda.h:325
enum QudaFieldLocation_s QudaFieldLocation
void init()
Create the BLAS context.
void transform_reduce(Arg &arg)
__launch_bounds__(Arg::block_size) __global__ void transform_reduce_kernel(Arg arg)
TuneParam tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:677
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
void u32toa(char *buffer, uint32_t value)
Definition: uint_to_char.h:45
void reduce(QudaFieldLocation location, std::vector< reduce_t > &result, const std::vector< T * > &v, I n_items, reduce_t init, reducer r)
QUDA implementation providing thrust::reduce-like functionality. Improves upon thrust's implementation for QUDA's batched use case.
qudaStream_t * stream
qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream)
Wrapper around cudaLaunchKernel.
Definition: quda_api.cpp:57
QudaGaugeParam param
Definition: pack_test.cpp:18
cudaStream_t qudaStream_t
Definition: quda_api.h:9
unsigned int count_t
Definition: reduce_helper.h:15
TransformReduceArg(const std::vector< T * > &v, count_t n_items, transformer h, reduce_t init, reducer r)
static constexpr int n_batch_max
const T * v[n_batch_max]
static constexpr int block_size
reduce_t result[n_batch_max]
__device__ __host__ T operator()(T a)
__device__ __host__ T operator()(T a, T b)
__device__ __host__ T operator()(T a, T b)
__device__ __host__ T operator()(T a, T b)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting the QUDA_ENABLE_TUNING environment variable.
Definition: util_quda.cpp:52
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:21
#define errorQuda(...)
Definition: util_quda.h:120