QUDA v1.1.0
A library for QCD on GPUs
reduce_helper.cu
#include <quda_internal.h>
#include <malloc_quda.h>
#include <reduce_helper.h>
#include <tune_quda.h>

// These buffers are used by the reduction kernels
static device_reduce_t *d_reduce = nullptr;
static device_reduce_t *h_reduce = nullptr;
static device_reduce_t *hd_reduce = nullptr;

static count_t *reduce_count = nullptr;
static cudaEvent_t reduceEnd;

namespace quda
{

  namespace reducer
  {

    // FIXME need to dynamically resize these
    void *get_device_buffer() { return d_reduce; }
    void *get_mapped_buffer() { return hd_reduce; }
    void *get_host_buffer() { return h_reduce; }
    count_t *get_count() { return reduce_count; }
    cudaEvent_t &get_event() { return reduceEnd; }

    size_t buffer_size()
    {
      /* we have these different reductions to cater for:

         - regular reductions (reduce_quda.cu), where we are reducing to
           a single vector type (max length 4 presently), with a
           grid-stride loop and a maximum number of blocks = 2 x SM count

         - multi-reductions, where we are reducing to a matrix of
           QUDA_MAX_MULTI_REDUCE vectors (max length 4), with a
           grid-stride loop and a maximum number of blocks = 2 x SM count
      */
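
      /* For a concrete sense of scale (illustrative numbers only, not a
         statement about any particular build): if device_reduce_t were
         double (8 bytes), max_n_reduce() returned 32, and the device had
         16 SMs, then reduce_size = 4 * 8 = 32 bytes, max_multi_reduce =
         32 * 32 = 1024 bytes, max_reduce_blocks = 2 * 16 = 32, giving
         bytes = 32 * max(32, 1024) = 32768, i.e. a 32 KiB buffer. */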

      int reduce_size = 4 * sizeof(device_reduce_t);
      int max_reduce = reduce_size;
      int max_multi_reduce = max_n_reduce() * reduce_size;
      int max_reduce_blocks = 2 * deviceProp.multiProcessorCount;

      // reduction buffer size
      size_t bytes = max_reduce_blocks * std::max(max_reduce, max_multi_reduce);
      return bytes;
    }

    // we need to use a placement-new constructor to initialize the atomic counters
    template <typename T> __global__ void init_count(T *counter)
    {
      for (int i = 0; i < max_n_reduce(); i++) new (counter + i) T {0};
    }
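
    /* Counters like these typically drive the standard "last block done"
       two-pass reduction pattern: each block writes its partial result,
       issues a memory fence, and atomically increments the counter; the
       block that sees the old value equal to gridDim.x - 1 knows every
       partial is visible and performs the final reduction. An illustrative
       sketch of that pattern (not QUDA's actual kernel code; fetch_add is
       assumed to be provided by count_t):

         __threadfence();                 // make this block's partial visible
         __shared__ bool is_last;
         if (threadIdx.x == 0) is_last = (counter->fetch_add(1) == gridDim.x - 1);
         __syncthreads();
         if (is_last) { ... combine the per-block partials ... }
    */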

    void init()
    {
      auto bytes = buffer_size();
      if (!d_reduce) d_reduce = (device_reduce_t *)device_malloc(bytes);

      // these arrays are actually oversized currently (they only need to be device_reduce_t x 3)

      // if the device supports host-mapped memory, then use a host-mapped array for the reduction
      if (!h_reduce) {
        h_reduce = (device_reduce_t *)mapped_malloc(bytes);
        hd_reduce = (device_reduce_t *)get_mapped_device_pointer(h_reduce); // set the matching device pointer

#ifdef HETEROGENEOUS_ATOMIC
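        // with heterogeneous atomics enabled, the mapped buffer holds system-scope
        // atomic objects (e.g., cuda::atomic with system thread scope), which must
        // be constructed with placement new rather than simply zeroed with memset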
        using system_atomic_t = device_reduce_t;
        size_t n_reduce = bytes / sizeof(system_atomic_t);
        auto *atomic_buf = reinterpret_cast<system_atomic_t *>(h_reduce); // FIXME
        for (size_t i = 0; i < n_reduce; i++) new (atomic_buf + i) system_atomic_t {0}; // placement-new constructor
#else
        memset(h_reduce, 0, bytes); // ensure that valgrind doesn't report h_reduce as uninitialised
#endif
      }

      if (!reduce_count) {
        reduce_count = static_cast<count_t *>(device_malloc(max_n_reduce() * sizeof(decltype(*reduce_count))));
        TuneParam tp;
        tp.grid = dim3(1, 1, 1);
        tp.block = dim3(1, 1, 1);

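        // a single-thread launch is sufficient here, since init_count itself
        // loops over all max_n_reduce() counters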
        qudaLaunchKernel(init_count<count_t>, tp, 0, reduce_count);
      }
87 
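      // the event is used only for synchronization, so disable timing to reduce overhead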
      cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);

      checkCudaError();
    }

    void destroy()
    {
      cudaEventDestroy(reduceEnd);

      if (reduce_count) {
        device_free(reduce_count);
        reduce_count = nullptr;
      }
      if (d_reduce) {
        device_free(d_reduce);
        d_reduce = nullptr;
      }
      if (h_reduce) {
        host_free(h_reduce);
        h_reduce = nullptr;
      }
      hd_reduce = nullptr;
    }

  } // namespace reducer
} // namespace quda
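
/* Illustrative lifecycle (a sketch, not part of this file): the reducer is
   initialized once, its buffers are handed to the reduction kernels via the
   accessors above, and everything is released at shutdown:

     quda::reducer::init();                          // allocate buffers and counters
     void *dev = quda::reducer::get_device_buffer(); // partial results land here
     void *hd = quda::reducer::get_mapped_buffer();  // device alias of the host buffer
     // ... launch reduction kernels, synchronize on get_event() ...
     quda::reducer::destroy();                       // free buffers, destroy the event
*/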