QUDA  1.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
malloc.cpp
Go to the documentation of this file.
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4 #include <map>
5 #include <unistd.h> // for getpagesize()
6 #include <execinfo.h> // for backtrace
7 #include <quda_internal.h>
8 
9 #ifdef USE_QDPJIT
10 #include "qdp_quda.h"
11 #include "qdp_config.h"
12 #endif
13 
14 namespace quda {
15 
16  enum AllocType {
23  };
24 
25  class MemAlloc {
26 
27  public:
28  std::string func;
29  std::string file;
30  int line;
31  size_t size;
32  size_t base_size;
33 
35  : line(-1), size(0), base_size(0) { }
36 
37  MemAlloc(std::string func, std::string file, int line)
38  : func(func), file(file), line(line), size(0), base_size(0) { }
39 
41  if (&a != this) {
42  func = a.func;
43  file = a.file;
44  line = a.line;
45  size = a.size;
46  base_size = a.base_size;
47  }
48  return *this;
49  }
50  };
51 
52 
53  static std::map<void *, MemAlloc> alloc[N_ALLOC_TYPE];
54  static long total_bytes[N_ALLOC_TYPE] = {0};
55  static long max_total_bytes[N_ALLOC_TYPE] = {0};
58 
59  long device_allocated_peak() { return max_total_bytes[DEVICE]; }
60 
61  long pinned_allocated_peak() { return max_total_bytes[PINNED]; }
62 
63  long mapped_allocated_peak() { return max_total_bytes[MAPPED]; }
64 
65  long host_allocated_peak() { return max_total_bytes[HOST]; }
66 
67  static void print_trace (void) {
68  void *array[10];
69  size_t size;
70  char **strings;
71  size = backtrace (array, 10);
72  strings = backtrace_symbols (array, size);
73  printfQuda("Obtained %zd stack frames.\n", size);
74  for (size_t i=0; i<size; i++) printfQuda("%s\n", strings[i]);
75  free(strings);
76  }
77 
78  static void print_alloc_header()
79  {
80  printfQuda("Type Pointer Size Location\n");
81  printfQuda("----------------------------------------------------------\n");
82  }
83 
84 
85  static void print_alloc(AllocType type)
86  {
87  const char *type_str[] = {"Device", "Device Pinned", "Host ", "Pinned", "Mapped"};
88  std::map<void *, MemAlloc>::iterator entry;
89 
90  for (entry = alloc[type].begin(); entry != alloc[type].end(); entry++) {
91  void *ptr = entry->first;
92  MemAlloc a = entry->second;
93  printfQuda("%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (unsigned long) a.base_size,
94  a.func.c_str(), a.file.c_str(), a.line);
95  }
96  }
97 
98 
99  static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr)
100  {
101  total_bytes[type] += a.base_size;
102  if (total_bytes[type] > max_total_bytes[type]) {
103  max_total_bytes[type] = total_bytes[type];
104  }
105  if (type != DEVICE && type != DEVICE_PINNED) {
106  total_host_bytes += a.base_size;
107  if (total_host_bytes > max_total_host_bytes) {
108  max_total_host_bytes = total_host_bytes;
109  }
110  }
111  if (type == PINNED || type == MAPPED) {
112  total_pinned_bytes += a.base_size;
113  if (total_pinned_bytes > max_total_pinned_bytes) {
114  max_total_pinned_bytes = total_pinned_bytes;
115  }
116  }
117  alloc[type][ptr] = a;
118  }
119 
120 
121  static void track_free(const AllocType &type, void *ptr)
122  {
123  size_t size = alloc[type][ptr].base_size;
124  total_bytes[type] -= size;
125  if (type != DEVICE && type != DEVICE_PINNED) {
126  total_host_bytes -= size;
127  }
128  if (type == PINNED || type == MAPPED) {
129  total_pinned_bytes -= size;
130  }
131  alloc[type].erase(ptr);
132  }
133 
134 
141  static void *aligned_malloc(MemAlloc &a, size_t size)
142  {
143  void *ptr = nullptr;
144 
145  a.size = size;
146 
147 #if (CUDA_VERSION > 4000) && 0 // we need to manually align to page boundaries to allow us to bind a texture to mapped memory
148  a.base_size = size;
149  ptr = malloc(size);
150  if (!ptr ) {
151 #else
152  static int page_size = 2*getpagesize();
153  a.base_size = ((size + page_size - 1) / page_size) * page_size; // round up to the nearest multiple of page_size
154  int align = posix_memalign(&ptr, page_size, a.base_size);
155  if (!ptr || align != 0) {
156 #endif
157  errorQuda("Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line,
158  a.func.c_str());
159  }
160  return ptr;
161  }
162 
163 
169  void *device_malloc_(const char *func, const char *file, int line, size_t size)
170  {
171  #ifndef QDP_USE_CUDA_MANAGED_MEMORY
172  MemAlloc a(func, file, line);
173  void *ptr;
174 
175  a.size = a.base_size = size;
176 
177  cudaError_t err = cudaMalloc(&ptr, size);
178  if (err != cudaSuccess) {
179  errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
180  }
181  track_malloc(DEVICE, a, ptr);
182 #ifdef HOST_DEBUG
183  cudaMemset(ptr, 0xff, size);
184 #endif
185  return ptr;
186 #else
187  // when QDO uses managed memory we can bypass the QDP memory manager
188  return device_pinned_malloc_(func, file, line, size);
189 #endif
190  }
191 
192 
200  void *device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
201  {
202  if (!comm_peer2peer_present()) return device_malloc_(func, file, line, size);
203 
204  MemAlloc a(func, file, line);
205  void *ptr;
206 
207  a.size = a.base_size = size;
208 
209  CUresult err = cuMemAlloc((CUdeviceptr*)&ptr, size);
210  if (err != CUDA_SUCCESS) {
211  errorQuda("Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
212  }
213  track_malloc(DEVICE_PINNED, a, ptr);
214 #ifdef HOST_DEBUG
215  cudaMemset(ptr, 0xff, size);
216 #endif
217  return ptr;
218  }
219 
220 
226  void *safe_malloc_(const char *func, const char *file, int line, size_t size)
227  {
228  MemAlloc a(func, file, line);
229  a.size = a.base_size = size;
230 
231  void *ptr = malloc(size);
232  if (!ptr) { errorQuda("Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); }
233  track_malloc(HOST, a, ptr);
234 #ifdef HOST_DEBUG
235  memset(ptr, 0xff, size);
236 #endif
237  return ptr;
238  }
239 
240 
250  void *pinned_malloc_(const char *func, const char *file, int line, size_t size)
251  {
252  MemAlloc a(func, file, line);
253  void *ptr = aligned_malloc(a, size);
254 
255  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterDefault);
256  if (err != cudaSuccess) {
257  errorQuda("Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
258  }
259  track_malloc(PINNED, a, ptr);
260 #ifdef HOST_DEBUG
261  memset(ptr, 0xff, a.base_size);
262 #endif
263  return ptr;
264  }
265 
266 #define HOST_ALLOC // this needs to be set presently on P9
267 
273  void *mapped_malloc_(const char *func, const char *file, int line, size_t size)
274  {
275  MemAlloc a(func, file, line);
276 
277 #ifdef HOST_ALLOC
278  void *ptr;
279  cudaError_t err = cudaHostAlloc(&ptr, size, cudaHostRegisterMapped | cudaHostRegisterPortable);
280  if (err != cudaSuccess) { errorQuda("cudaHostAlloc failed of size %zu (%s:%d in %s())\n", size, file, line, func); }
281 #else
282  void *ptr = aligned_malloc(a, size);
283  cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped);
284  if (err != cudaSuccess) {
285  errorQuda("Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func);
286  }
287 #endif
288  track_malloc(MAPPED, a, ptr);
289 #ifdef HOST_DEBUG
290  memset(ptr, 0xff, a.base_size);
291 #endif
292  return ptr;
293  }
294 
295 
301  void device_free_(const char *func, const char *file, int line, void *ptr)
302  {
303 #ifndef QDP_USE_CUDA_MANAGED_MEMORY
304  if (!ptr) { errorQuda("Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
305  if (!alloc[DEVICE].count(ptr)) {
306  errorQuda("Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
307  }
308  cudaError_t err = cudaFree(ptr);
309  if (err != cudaSuccess) { errorQuda("Failed to free device memory (%s:%d in %s())\n", file, line, func); }
310  track_free(DEVICE, ptr);
311 #else
312  device_pinned_free_(func, file, line, ptr);
313 #endif
314  }
315 
316 
322  void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
323  {
324  if (!comm_peer2peer_present()) {
325  device_free_(func, file, line, ptr);
326  return;
327  }
328 
329  if (!ptr) { errorQuda("Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
330  if (!alloc[DEVICE_PINNED].count(ptr)) {
331  errorQuda("Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
332  }
333  CUresult err = cuMemFree((CUdeviceptr)ptr);
334  if (err != CUDA_SUCCESS) { printfQuda("Failed to free device memory (%s:%d in %s())\n", file, line, func); }
336  }
337 
338 
344  void host_free_(const char *func, const char *file, int line, void *ptr)
345  {
346  if (!ptr) { errorQuda("Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); }
347  if (alloc[HOST].count(ptr)) {
348  track_free(HOST, ptr);
349  free(ptr);
350  } else if (alloc[PINNED].count(ptr)) {
351  cudaError_t err = cudaHostUnregister(ptr);
352  if (err != cudaSuccess) { errorQuda("Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); }
353  track_free(PINNED, ptr);
354  free(ptr);
355  } else if (alloc[MAPPED].count(ptr)) {
356 #ifdef HOST_ALLOC
357  cudaError_t err = cudaFreeHost(ptr);
358  if (err != cudaSuccess) { errorQuda("Failed to free host memory (%s:%d in %s())\n", file, line, func); }
359 #else
360  cudaError_t err = cudaHostUnregister(ptr);
361  if (err != cudaSuccess) {
362  errorQuda("Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
363  }
364  free(ptr);
365 #endif
366  track_free(MAPPED, ptr);
367  } else {
368  printfQuda("ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
369  print_trace();
370  errorQuda("Aborting");
371  }
372  }
373 
374 
376  {
377  printfQuda("Device memory used = %.1f MB\n", max_total_bytes[DEVICE] / (double)(1<<20));
378  printfQuda("Pinned device memory used = %.1f MB\n", max_total_bytes[DEVICE_PINNED] / (double)(1<<20));
379  printfQuda("Page-locked host memory used = %.1f MB\n", max_total_pinned_bytes / (double)(1<<20));
380  printfQuda("Total host memory used >= %.1f MB\n", max_total_host_bytes / (double)(1<<20));
381  }
382 
383 
385  {
386  if (!alloc[DEVICE].empty() || !alloc[DEVICE_PINNED].empty() || !alloc[HOST].empty() || !alloc[PINNED].empty() || !alloc[MAPPED].empty()) {
387  warningQuda("The following internal memory allocations were not freed.");
388  printfQuda("\n");
392  print_alloc(HOST);
395  printfQuda("\n");
396  }
397  }
398 
400 
401  CUpointer_attribute attribute[] = { CU_POINTER_ATTRIBUTE_MEMORY_TYPE };
402  CUmemorytype mem_type;
403  void *data[] = { &mem_type };
404  CUresult error = cuPointerGetAttributes(1, attribute, data, reinterpret_cast<CUdeviceptr>(ptr));
405  if (error != CUDA_SUCCESS) {
406  const char *string;
407  cuGetErrorString(error, &string);
408  errorQuda("cuPointerGetAttributes failed with error %s", string);
409  }
410 
411  // catch pointers that have not been created in CUDA
412  if (mem_type == 0) mem_type = CU_MEMORYTYPE_HOST;
413 
414  switch (mem_type) {
415  case CU_MEMORYTYPE_DEVICE:
416  case CU_MEMORYTYPE_UNIFIED:
418  case CU_MEMORYTYPE_HOST:
420  default:
421  errorQuda("Unknown memory type %d", mem_type);
423  }
424 
425  }
426 
427  namespace pool {
428 
432  static std::multimap<size_t, void *> pinnedCache;
433 
437  static std::map<void *, size_t> pinnedSize;
438 
442  static std::multimap<size_t, void *> deviceCache;
443 
447  static std::map<void *, size_t> deviceSize;
448 
449  static bool pool_init = false;
450 
452  static bool device_memory_pool = true;
453 
455  static bool pinned_memory_pool = true;
456 
457  void init() {
458  if (!pool_init) {
459  // device memory pool
460  char *enable_device_pool = getenv("QUDA_ENABLE_DEVICE_MEMORY_POOL");
461  if (!enable_device_pool || strcmp(enable_device_pool,"0")!=0) {
462  warningQuda("Using device memory pool allocator");
463  device_memory_pool = true;
464  } else {
465  warningQuda("Not using device memory pool allocator");
466  device_memory_pool = false;
467  }
468 
469  // pinned memory pool
470  char *enable_pinned_pool = getenv("QUDA_ENABLE_PINNED_MEMORY_POOL");
471  if (!enable_pinned_pool || strcmp(enable_pinned_pool,"0")!=0) {
472  warningQuda("Using pinned memory pool allocator");
473  pinned_memory_pool = true;
474  } else {
475  warningQuda("Not using pinned memory pool allocator");
476  pinned_memory_pool = false;
477  }
478  pool_init = true;
479  }
480  }
481 
482  void* pinned_malloc_(const char *func, const char *file, int line, size_t nbytes)
483  {
484  void *ptr = nullptr;
485  if (pinned_memory_pool) {
486  std::multimap<size_t, void *>::iterator it;
487 
488  if (pinnedCache.empty()) {
489  ptr = quda::pinned_malloc_(func, file, line, nbytes);
490  } else {
491  it = pinnedCache.lower_bound(nbytes);
492  if (it != pinnedCache.end()) { // sufficiently large allocation found
493  nbytes = it->first;
494  ptr = it->second;
495  pinnedCache.erase(it);
496  } else { // sacrifice the smallest cached allocation
497  it = pinnedCache.begin();
498  ptr = it->second;
499  pinnedCache.erase(it);
500  host_free(ptr);
501  ptr = quda::pinned_malloc_(func, file, line, nbytes);
502  }
503  }
504  pinnedSize[ptr] = nbytes;
505  } else {
506  ptr = quda::pinned_malloc_(func, file, line, nbytes);
507  }
508  return ptr;
509  }
510 
511  void pinned_free_(const char *func, const char *file, int line, void *ptr)
512  {
513  if (pinned_memory_pool) {
514  if (!pinnedSize.count(ptr)) {
515  errorQuda("Attempt to free invalid pointer");
516  }
517  pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
518  pinnedSize.erase(ptr);
519  } else {
520  quda::host_free_(func, file, line, ptr);
521  }
522  }
523 
524  void* device_malloc_(const char *func, const char *file, int line, size_t nbytes)
525  {
526  void *ptr = nullptr;
527  if (device_memory_pool) {
528  std::multimap<size_t, void *>::iterator it;
529 
530  if (deviceCache.empty()) {
531  ptr = quda::device_malloc_(func, file, line, nbytes);
532  } else {
533  it = deviceCache.lower_bound(nbytes);
534  if (it != deviceCache.end()) { // sufficiently large allocation found
535  nbytes = it->first;
536  ptr = it->second;
537  deviceCache.erase(it);
538  } else { // sacrifice the smallest cached allocation
539  it = deviceCache.begin();
540  ptr = it->second;
541  deviceCache.erase(it);
542  quda::device_free_(func, file, line, ptr);
543  ptr = quda::device_malloc_(func, file, line, nbytes);
544  }
545  }
546  deviceSize[ptr] = nbytes;
547  } else {
548  ptr = quda::device_malloc_(func, file, line, nbytes);
549  }
550  return ptr;
551  }
552 
553  void device_free_(const char *func, const char *file, int line, void *ptr)
554  {
555  if (device_memory_pool) {
556  if (!deviceSize.count(ptr)) {
557  errorQuda("Attempt to free invalid pointer");
558  }
559  deviceCache.insert(std::make_pair(deviceSize[ptr], ptr));
560  deviceSize.erase(ptr);
561  } else {
562  quda::device_free_(func, file, line, ptr);
563  }
564  }
565 
567  {
568  if (pinned_memory_pool) {
569  std::multimap<size_t, void *>::iterator it;
570  for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
571  void *ptr = it->second;
572  host_free(ptr);
573  }
574  pinnedCache.clear();
575  }
576  }
577 
579  {
580  if (device_memory_pool) {
581  std::multimap<size_t, void *>::iterator it;
582  for (it = deviceCache.begin(); it != deviceCache.end(); it++) {
583  void *ptr = it->second;
584  device_free(ptr);
585  }
586  deviceCache.clear();
587  }
588  }
589 
590  } // namespace pool
591 
592 } // namespace quda
static long max_total_host_bytes
Definition: malloc.cpp:56
long device_allocated_peak()
Definition: malloc.cpp:59
std::string file
Definition: malloc.cpp:29
static std::map< void *, size_t > deviceSize
Definition: malloc.cpp:447
static std::multimap< size_t, void * > deviceCache
Definition: malloc.cpp:442
void * safe_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:226
static void print_alloc_header()
Definition: malloc.cpp:78
static long max_total_pinned_bytes
Definition: malloc.cpp:57
static std::multimap< size_t, void * > pinnedCache
Definition: malloc.cpp:432
#define errorQuda(...)
Definition: util_quda.h:121
void * device_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:169
#define host_free(ptr)
Definition: malloc_quda.h:71
static std::map< void *, MemAlloc > alloc[N_ALLOC_TYPE]
Definition: malloc.cpp:53
void assertAllMemFree()
Definition: malloc.cpp:384
static void print_alloc(AllocType type)
Definition: malloc.cpp:85
void * device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:200
static std::map< void *, size_t > pinnedSize
Definition: malloc.cpp:437
void device_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:301
MemAlloc & operator=(const MemAlloc &a)
Definition: malloc.cpp:40
std::string func
Definition: malloc.cpp:28
void flush_pinned()
Free all outstanding pinned-memory allocations.
Definition: malloc.cpp:566
static void track_free(const AllocType &type, void *ptr)
Definition: malloc.cpp:121
void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:322
MemAlloc(std::string func, std::string file, int line)
Definition: malloc.cpp:37
void host_free_(const char *func, const char *file, int line, void *ptr)
Definition: malloc.cpp:344
static long max_total_bytes[N_ALLOC_TYPE]
Definition: malloc.cpp:55
void flush_device()
Free all outstanding device-memory allocations.
Definition: malloc.cpp:578
static map::iterator it
Definition: tune.cpp:109
#define warningQuda(...)
Definition: util_quda.h:133
static void print_trace(void)
Definition: malloc.cpp:67
static void * aligned_malloc(MemAlloc &a, size_t size)
Definition: malloc.cpp:141
long host_allocated_peak()
Definition: malloc.cpp:65
long mapped_allocated_peak()
Definition: malloc.cpp:63
void init()
Create the CUBLAS context.
Definition: blas_cublas.cu:31
static bool pinned_memory_pool
Definition: malloc.cpp:455
void * mapped_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:273
void * memset(void *s, int c, size_t n)
static bool device_memory_pool
Definition: malloc.cpp:452
static long total_pinned_bytes
Definition: malloc.cpp:57
QudaFieldLocation get_pointer_location(const void *ptr)
Definition: malloc.cpp:399
enum QudaFieldLocation_s QudaFieldLocation
long pinned_allocated_peak()
Definition: malloc.cpp:61
void printPeakMemUsage()
Definition: malloc.cpp:375
#define printfQuda(...)
Definition: util_quda.h:115
static long total_bytes[N_ALLOC_TYPE]
Definition: malloc.cpp:54
void pinned_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
Definition: malloc.cpp:511
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
Definition: malloc.cpp:250
bool comm_peer2peer_present()
Returns true if any peer-to-peer capability is present on this system (regardless of whether it has b...
size_t base_size
Definition: malloc.cpp:32
__device__ unsigned int count[QUDA_MAX_MULTI_REDUCE]
Definition: cub_helper.cuh:90
static void track_malloc(const AllocType &type, const MemAlloc &a, void *ptr)
Definition: malloc.cpp:99
static long total_host_bytes
Definition: malloc.cpp:56
static bool pool_init
Definition: malloc.cpp:449
#define device_free(ptr)
Definition: malloc_quda.h:69
size_t size
Definition: malloc.cpp:31