QUDA  0.9.0
malloc.cpp
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <map>
5 #include <unistd.h> // for getpagesize()
6 #include <execinfo.h> // for backtrace
7 #include <quda_internal.h>
8 
9 #ifdef USE_QDPJIT
10 #include "qdp_quda.h"
11 #endif
12 
13 namespace quda {
14 
15  enum AllocType {
21  };
22 
23  class MemAlloc {
24 
25  public:
26  std::string func;
27  std::string file;
28  int line;
29  size_t size;
30  size_t base_size;
31 
33  : line(-1), size(0), base_size(0) { }
34 
35  MemAlloc(std::string func, std::string file, int line)
36  : func(func), file(file), line(line), size(0), base_size(0) { }
37 
39  if (&a != this) {
40  func = a.func;
41  file = a.file;
42  line = a.line;
43  size = a.size;
44  base_size = a.base_size;
45  }
46  return *this;
47  }
48  };
49 
50 
51  static std::map<void *, MemAlloc> alloc[N_ALLOC_TYPE];
52  static long total_bytes[N_ALLOC_TYPE] = {0};
53  static long max_total_bytes[N_ALLOC_TYPE] = {0};
56 
58 
60 
62 
64 
65  static void print_trace (void) {
66  void *array[10];
67  size_t size;
68  char **strings;
69  size = backtrace (array, 10);
70  strings = backtrace_symbols (array, size);
71  printfQuda("Obtained %zd stack frames.\n", size);
72  for (size_t i=0; i<size; i++) printfQuda("%s\n", strings[i]);
73  free(strings);
74  }
75 
76  static void print_alloc_header()
77  {
78  printfQuda("Type Pointer Size Location\n");
79  printfQuda("----------------------------------------------------------\n");
80  }
81 
82 
83  static void print_alloc(AllocType type)
84  {
85  const char *type_str[] = {"Device", "Host ", "Pinned", "Mapped"};
86  std::map<void *, MemAlloc>::iterator entry;
87 
88  for (entry = alloc[type].begin(); entry != alloc[type].end(); entry++) {
89  void *ptr = entry->first;
90  MemAlloc a = entry->second;
91  printfQuda("%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (unsigned long) a.base_size,
92  a.func.c_str(), a.file.c_str(), a.line);
93  }
94  }
95 
96 
97  static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr)
98  {
99  total_bytes[type] += a.base_size;
100  if (total_bytes[type] > max_total_bytes[type]) {
101  max_total_bytes[type] = total_bytes[type];
102  }
103  if (type != DEVICE) {
104  total_host_bytes += a.base_size;
107  }
108  }
109  if (type == PINNED || type == MAPPED) {
110  total_pinned_bytes += a.base_size;
113  }
114  }
115  alloc[type][ptr] = a;
116  }
117 
118 
119  static void track_free(const AllocType &type, void *ptr)
120  {
121  size_t size = alloc[type][ptr].base_size;
122  total_bytes[type] -= size;
123  if (type != DEVICE) {
125  }
126  if (type == PINNED || type == MAPPED) {
128  }
129  alloc[type].erase(ptr);
130  }
131 
132 
139  static void *aligned_malloc(MemAlloc &a, size_t size)
140  {
141  void *ptr = nullptr;
142 
143  a.size = size;
144 
145 #if (CUDA_VERSION > 4000) && 0 // we need to manually align to page boundaries to allow us to bind a texture to mapped memory
146  a.base_size = size;
147  ptr = malloc(size);
148  if (!ptr ) {
149 #else
150  static int page_size = 2*getpagesize();
151  a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size
152  int align = posix_memalign(&ptr, page_size, a.base_size);
153  if (!ptr || align != 0) {
154 #endif
155  printfQuda("ERROR: Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line, a.func.c_str());
156  errorQuda("Aborting");
157  }
158  return ptr;
159  }
160 
161 
167  void *device_malloc_(const char *func, const char *file, int line, size_t size)
168  {
169  MemAlloc a(func, file, line);
170  void *ptr;
171 
172  a.size = a.base_size = size;
173 
174  cudaError_t err = cudaMalloc(&ptr, size);
175  if (err != cudaSuccess) {
176  printfQuda("ERROR: Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
177  errorQuda("Aborting");
178  }
180 #ifdef HOST_DEBUG
181  cudaMemset(ptr, 0xff, size);
182 #endif
183  return ptr;
184  }
185 
186 
194  void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
195  {
196  MemAlloc a(func, file, line);
197  void *ptr;
198 
199  a.size = a.base_size = size;
200 
201  CUresult err = cuMemAlloc((CUdeviceptr*)&ptr, size);
202  if (err != CUDA_SUCCESS) {
203  printfQuda("ERROR: Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
204  errorQuda("Aborting");
205  }
207 #ifdef HOST_DEBUG
208  cudaMemset(ptr, 0xff, size);
209 #endif
210  return ptr;
211  }
212 
213 
219  void *safe_malloc_(const char *func, const char *file, int line, size_t size)
220  {
221  MemAlloc a(func, file, line);
222  a.size = a.base_size = size;
223 
224  void *ptr = malloc(size);
225  if (!ptr) {
226  printfQuda("ERROR: Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func);
227  errorQuda("Aborting");
228  }
229  track_malloc(HOST, a, ptr);
230 #ifdef HOST_DEBUG
231  memset(ptr, 0xff, size);
232 #endif
233  return ptr;
234  }
235 
236 
246  void *pinned_malloc_(const char *func, const char *file, int line, size_t size)
247  {
248  MemAlloc a(func, file, line);
249  void *ptr = aligned_malloc(a, size);
250 
251  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterDefault);
252  if (err != cudaSuccess) {
253  printfQuda("ERROR: Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
254  errorQuda("Aborting");
255  }
257 #ifdef HOST_DEBUG
258  memset(ptr, 0xff, a.base_size);
259 #endif
260  return ptr;
261  }
262 
263 
269  void *mapped_malloc_(const char *func, const char *file, int line, size_t size)
270  {
271  MemAlloc a(func, file, line);
272  void *ptr = aligned_malloc(a, size);
273 
274  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped);
275  if (err != cudaSuccess) {
276  printfQuda("ERROR: Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func);
277  errorQuda("Aborting");
278  }
280 #ifdef HOST_DEBUG
281  memset(ptr, 0xff, a.base_size);
282 #endif
283  return ptr;
284  }
285 
286 
292  void device_free_(const char *func, const char *file, int line, void *ptr)
293  {
294  if (!ptr) {
295  printfQuda("ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func);
296  errorQuda("Aborting");
297  }
298  if (!alloc[DEVICE].count(ptr)) {
299  printfQuda("ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
300  errorQuda("Aborting");
301  }
302  cudaError_t err = cudaFree(ptr);
303  if (err != cudaSuccess) {
304  printfQuda("ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
305  errorQuda("Aborting");
306  }
308  }
309 
310 
316  void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
317  {
318  if (!ptr) {
319  printfQuda("ERROR: Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func);
320  errorQuda("Aborting");
321  }
322  if (!alloc[DEVICE].count(ptr)) {
323  printfQuda("ERROR: Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
324  errorQuda("Aborting");
325  }
326  CUresult err = cuMemFree((CUdeviceptr)ptr);
327  if (err != CUDA_SUCCESS) {
328  printfQuda("ERROR: Failed to free device memory (%s:%d in %s())\n", file, line, func);
329  errorQuda("Aborting");
330  }
332  }
333 
334 
340  void host_free_(const char *func, const char *file, int line, void *ptr)
341  {
342  if (!ptr) {
343  printfQuda("ERROR: Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func);
344  errorQuda("Aborting");
345  }
346  if (alloc[HOST].count(ptr)) {
347  track_free(HOST, ptr);
348  } else if (alloc[PINNED].count(ptr)) {
349  cudaError_t err = cudaHostUnregister(ptr);
350  if (err != cudaSuccess) {
351  printfQuda("ERROR: Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func);
352  errorQuda("Aborting");
353  }
355  } else if (alloc[MAPPED].count(ptr)) {
356  cudaError_t err = cudaHostUnregister(ptr);
357  if (err != cudaSuccess) {
358  printfQuda("ERROR: Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
359  errorQuda("Aborting");
360  }
362  } else {
363  printfQuda("ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
364  print_trace();
365  errorQuda("Aborting");
366  }
367  free(ptr);
368  }
369 
370 
372  {
373  printfQuda("Device memory used = %.1f MB\n", max_total_bytes[DEVICE] / (double)(1<<20));
374  printfQuda("Page-locked host memory used = %.1f MB\n", max_total_pinned_bytes / (double)(1<<20));
375  printfQuda("Total host memory used >= %.1f MB\n", max_total_host_bytes / (double)(1<<20));
376  }
377 
378 
380  {
381  if (!alloc[DEVICE].empty() || !alloc[HOST].empty() || !alloc[PINNED].empty() || !alloc[MAPPED].empty()) {
382  warningQuda("The following internal memory allocations were not freed.");
383  printfQuda("\n");
386  print_alloc(HOST);
389  printfQuda("\n");
390  }
391  }
392 
393 
394  namespace pool {
395 
399  static std::multimap<size_t, void *> pinnedCache;
400 
404  static std::map<void *, size_t> pinnedSize;
405 
409  static std::multimap<size_t, void *> deviceCache;
410 
414  static std::map<void *, size_t> deviceSize;
415 
416  static bool pool_init = false;
417 
419  static bool device_memory_pool = true;
420 
422  static bool pinned_memory_pool = true;
423 
424  void init() {
425  if (!pool_init) {
426  // device memory pool
427  char *enable_device_pool = getenv("QUDA_ENABLE_DEVICE_MEMORY_POOL");
428  if (!enable_device_pool || strcmp(enable_device_pool,"0")!=0) {
429  warningQuda("Using device memory pool allocator");
430  device_memory_pool = true;
431  } else {
432  warningQuda("Not using device memory pool allocator");
433  device_memory_pool = false;
434  }
435 
436  // pinned memory pool
437  char *enable_pinned_pool = getenv("QUDA_ENABLE_PINNED_MEMORY_POOL");
438  if (!enable_pinned_pool || strcmp(enable_pinned_pool,"0")!=0) {
439  warningQuda("Using pinned memory pool allocator");
440  pinned_memory_pool = true;
441  } else {
442  warningQuda("Not using pinned memory pool allocator");
443  pinned_memory_pool = false;
444  }
445  pool_init = true;
446  }
447  }
448 
449  void* pinned_malloc_(const char *func, const char *file, int line, size_t nbytes)
450  {
451  void *ptr = nullptr;
452  if (pinned_memory_pool) {
453  std::multimap<size_t, void *>::iterator it;
454 
455  if (pinnedCache.empty()) {
456  ptr = quda::pinned_malloc_(func, file, line, nbytes);
457  } else {
458  it = pinnedCache.lower_bound(nbytes);
459  if (it != pinnedCache.end()) { // sufficiently large allocation found
460  nbytes = it->first;
461  ptr = it->second;
462  pinnedCache.erase(it);
463  } else { // sacrifice the smallest cached allocation
464  it = pinnedCache.begin();
465  ptr = it->second;
466  pinnedCache.erase(it);
467  host_free(ptr);
468  ptr = quda::pinned_malloc_(func, file, line, nbytes);
469  }
470  }
471  pinnedSize[ptr] = nbytes;
472  } else {
473  ptr = quda::pinned_malloc_(func, file, line, nbytes);
474  }
475  return ptr;
476  }
477 
478  void pinned_free_(const char *func, const char *file, int line, void *ptr)
479  {
480  if (pinned_memory_pool) {
481  if (!pinnedSize.count(ptr)) {
482  errorQuda("Attempt to free invalid pointer");
483  }
484  pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
485  pinnedSize.erase(ptr);
486  } else {
487  quda::host_free_(func, file, line, ptr);
488  }
489  }
490 
491  void* device_malloc_(const char *func, const char *file, int line, size_t nbytes)
492  {
493  void *ptr = nullptr;
494  if (device_memory_pool) {
495  std::multimap<size_t, void *>::iterator it;
496 
497  if (deviceCache.empty()) {
498  ptr = quda::device_malloc_(func, file, line, nbytes);
499  } else {
500  it = deviceCache.lower_bound(nbytes);
501  if (it != deviceCache.end()) { // sufficiently large allocation found
502  nbytes = it->first;
503  ptr = it->second;
504  deviceCache.erase(it);
505  } else { // sacrifice the smallest cached allocation
506  it = deviceCache.begin();
507  ptr = it->second;
508  deviceCache.erase(it);
509  quda::device_free_(func, file, line, ptr);
510  ptr = quda::device_malloc_(func, file, line, nbytes);
511  }
512  }
513  deviceSize[ptr] = nbytes;
514  } else {
515  ptr = quda::device_malloc_(func, file, line, nbytes);
516  }
517  return ptr;
518  }
519 
520  void device_free_(const char *func, const char *file, int line, void *ptr)
521  {
522  if (device_memory_pool) {
523  if (!deviceSize.count(ptr)) {
524  errorQuda("Attempt to free invalid pointer");
525  }
526  deviceCache.insert(std::make_pair(deviceSize[ptr], ptr));
527  deviceSize.erase(ptr);
528  } else {
529  quda::device_free_(func, file, line, ptr);
530  }
531  }
532 
534  {
535  if (pinned_memory_pool) {
536  std::multimap<size_t, void *>::iterator it;
537  for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
538  void *ptr = it->second;
539  host_free(ptr);
540  }
541  pinnedCache.clear();
542  }
543  }
544 
546  {
547  if (device_memory_pool) {
548  std::multimap<size_t, void *>::iterator it;
549  for (it = deviceCache.begin(); it != deviceCache.end(); it++) {
550  void *ptr = it->second;
551  device_free(ptr);
552  }
553  deviceCache.clear();
554  }
555  }
556 
557  } // namespace pool
558 
559 } // namespace quda
static long max_total_host_bytes
Definition: malloc.cpp:54
long device_allocated_peak()
Definition: malloc.cpp:57
std::string file
Definition: malloc.cpp:27
void free(void *)
static std::map< void *, size_t > deviceSize
Definition: malloc.cpp:414
static std::multimap< size_t, void * > deviceCache
Definition: malloc.cpp:409
void * safe_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:219
static void print_alloc_header()
Definition: malloc.cpp:76
static long max_total_pinned_bytes
Definition: malloc.cpp:55
const void * func
static std::multimap< size_t, void * > pinnedCache
Definition: malloc.cpp:399
#define errorQuda(...)
Definition: util_quda.h:90
void * device_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:167
#define host_free(ptr)
Definition: malloc_quda.h:59
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
Definition: malloc.cpp:51
int posix_memalign(void **__memptr, size_t __alignment, size_t __size) __attribute__((availability(macosx
void assertAllMemFree()
Definition: malloc.cpp:379
static void print_alloc(AllocType type)
Definition: malloc.cpp:83
void * device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:194
static std::map< void *, size_t > pinnedSize
Definition: malloc.cpp:404
void device_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
Definition: malloc.cpp:520
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
Allocate pinned-memory. If a free pre-existing allocation exists reuse this.
Definition: malloc.cpp:449
void device_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:292
MemAlloc & operator=(const MemAlloc &a)
Definition: malloc.cpp:38
std::string func
Definition: malloc.cpp:26
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:533
static void track_free(const AllocType &type, void *ptr)
Definition: malloc.cpp:119
int strcmp(const char *__s1, const char *__s2)
static __inline__ T * entry
void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:316
void init()
Initialize the memory pool allocator.
Definition: malloc.cpp:424
MemAlloc(std::string func, std::string file, int line)
Definition: malloc.cpp:35
void host_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:340
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
static long max_total_bytes[N_ALLOC_TYPE]
Definition: malloc.cpp:53
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:545
static map::iterator it
Definition: tune.cpp:91
#define warningQuda(...)
Definition: util_quda.h:101
static void print_trace(void)
Definition: malloc.cpp:65
static void * aligned_malloc(MemAlloc &a, size_t size)
Definition: malloc.cpp:139
cudaError_t err
long host_allocated_peak()
Definition: malloc.cpp:63
long mapped_allocated_peak()
Definition: malloc.cpp:61
static bool pinned_memory_pool
Definition: malloc.cpp:422
const void * ptr
void * mapped_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:269
static bool device_memory_pool
Definition: malloc.cpp:419
static long total_pinned_bytes
Definition: malloc.cpp:55
void * memset(void *__b, int __c, size_t __len)
long pinned_allocated_peak()
Definition: malloc.cpp:59
void * device_malloc_(const char *func, const char *file, int line, size_t size)
Allocate device-memory. If free pre-existing allocation exists reuse this.
Definition: malloc.cpp:491
void printPeakMemUsage()
Definition: malloc.cpp:371
#define printfQuda(...)
Definition: util_quda.h:84
static long total_bytes[N_ALLOC_TYPE]
Definition: malloc.cpp:52
void pinned_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
Definition: malloc.cpp:478
struct cudaExtent unsigned int cudaArray_t array
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:246
size_t base_size
Definition: malloc.cpp:30
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:118
char * getenv(const char *)
static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr)
Definition: malloc.cpp:97
#define a
static long total_host_bytes
Definition: malloc.cpp:54
static bool pool_init
Definition: malloc.cpp:416
#define device_free(ptr)
Definition: malloc_quda.h:57
size_t size
Definition: malloc.cpp:29