1 #include <quda_internal.h>
2 #include <malloc_quda.h>
3 #include <reduce_helper.h>
// These are used for reduction kernels
static device_reduce_t *d_reduce = nullptr;  // device-memory reduction buffer (device_malloc'd lazily in init)
static device_reduce_t *h_reduce = nullptr;  // host-mapped reduction buffer (mapped_malloc'd in init)
static device_reduce_t *hd_reduce = nullptr; // device pointer aliasing h_reduce (matching mapped device pointer)
static count_t *reduce_count = nullptr;      // device-side atomic counters, constructed by the init_count kernel
static cudaEvent_t reduceEnd;                // CUDA event, created with timing disabled
20 // FIXME need to dynamically resize these
21 void *get_device_buffer() { return d_reduce; }
22 void *get_mapped_buffer() { return hd_reduce; }
23 void *get_host_buffer() { return h_reduce; }
24 count_t *get_count() { return reduce_count; }
25 cudaEvent_t &get_event() { return reduceEnd; }
29 /* we have these different reductions to cater for:
- regular reductions (reduce_quda.cu) where we are reducing to a
32 single vector type (max length 4 presently), and a
33 grid-stride loop with max number of blocks = 2 x SM count
- multi-reductions where we are reducing to a matrix of size
  QUDA_MAX_MULTI_REDUCE of vectors (max length 4),
37 and a grid-stride loop with maximum number of blocks = 2 x
// bytes in a single reduction vector: vectors have max length 4 at present
int reduce_size = 4 * sizeof(device_reduce_t);
// regular reductions need a single vector per block
int max_reduce = reduce_size;
// multi-reductions need max_n_reduce() vectors per block
int max_multi_reduce = max_n_reduce() * reduce_size;
// grid-stride loops use at most 2 blocks per SM (see comment above)
int max_reduce_blocks = 2 * deviceProp.multiProcessorCount;

// reduction buffer size
// NOTE(review): the multiply is done in int and only then widened to size_t —
// fine for realistic sizes, but confirm no overflow risk for large max_n_reduce()
size_t bytes = max_reduce_blocks * std::max(max_reduce, max_multi_reduce);
// need to use placement new constructor to initialize the atomic counters
// (launched with grid = block = dim3(1,1,1) at the call site below, so a
// single thread constructs all max_n_reduce() counters sequentially)
template <typename T> __global__ void init_count(T *counter)
  for (int i = 0; i < max_n_reduce(); i++) new (counter + i) T {0};
// total reduction-buffer size in bytes
auto bytes = buffer_size();
// allocate the device-memory reduction buffer once (idempotent)
if (!d_reduce) d_reduce = (device_reduce_t *)device_malloc(bytes);

// these arrays are actually oversized currently (only needs to be device_reduce_t x 3)

// if the device supports host-mapped memory then use a host-mapped array for the reduction
h_reduce = (device_reduce_t *)mapped_malloc(bytes);
hd_reduce = (device_reduce_t *)get_mapped_device_pointer(h_reduce); // set the matching device pointer

#ifdef HETEROGENEOUS_ATOMIC
// reinterpret the host-mapped buffer as an array of system-scope atomics and
// construct every element in place with value 0
using system_atomic_t = device_reduce_t;
size_t n_reduce = bytes / sizeof(system_atomic_t);
auto *atomic_buf = reinterpret_cast<system_atomic_t *>(h_reduce); // FIXME
for (size_t i = 0; i < n_reduce; i++) new (atomic_buf + i) system_atomic_t {0}; // placement new constructor
memset(h_reduce, 0, bytes); // added to ensure that valgrind doesn't report h_reduce is uninitialised

// allocate the device-side atomic counters and construct them via the
// single-threaded init_count kernel
reduce_count = static_cast<count_t *>(device_malloc(max_n_reduce() * sizeof(decltype(*reduce_count))));
tp.grid = dim3(1, 1, 1);
tp.block = dim3(1, 1, 1);

qudaLaunchKernel(init_count<count_t>, tp, 0, reduce_count);

// NOTE(review): return status of cudaEventCreateWithFlags is not checked — confirm intentional
cudaEventCreateWithFlags(&reduceEnd, cudaEventDisableTiming);
// NOTE(review): return status of cudaEventDestroy is not checked — confirm intentional
cudaEventDestroy(reduceEnd);

// release the atomic counters and mark them unallocated
device_free(reduce_count);
reduce_count = nullptr;

// NOTE(review): d_reduce is freed but not reset to nullptr in the lines visible
// here — confirm the reset happens nearby, else init()'s `if (!d_reduce)` guard
// would reuse a dangling pointer on re-initialization
device_free(d_reduce);
112 } // namespace reducer