11 #include "qdp_config.h"
14 #ifdef QUDA_BACKWARDSCPP
15 #include "backward.hpp"
31 #ifdef QUDA_BACKWARDSCPP
32 backward::StackTrace st;
39 #ifdef QUDA_BACKWARDSCPP
53 #ifdef QUDA_BACKWARDSCPP
64 static long total_host_bytes, max_total_host_bytes;
65 static long total_pinned_bytes, max_total_pinned_bytes;
77 static void print_trace(
void)
82 size = backtrace(array, 10);
83 strings = backtrace_symbols(array, size);
84 printfQuda(
"Obtained %zd stack frames.\n", size);
85 for (
size_t i = 0; i < size; i++)
printfQuda(
"%s\n", strings[i]);
89 static void print_alloc_header()
92 printfQuda(
"----------------------------------------------------------\n");
97 const char *type_str[] = {
"Device",
"Device Pinned",
"Host ",
"Pinned",
"Mapped",
"Managed"};
98 std::map<void *, MemAlloc>::iterator entry;
100 for (entry = alloc[type].begin(); entry != alloc[type].end(); entry++) {
101 void *ptr = entry->first;
102 MemAlloc a = entry->second;
103 printfQuda(
"%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (
unsigned long)a.base_size, a.func.c_str(),
104 a.file.c_str(), a.line);
105 #ifdef QUDA_BACKWARDSCPP
114 static void track_malloc(
const AllocType &type,
const MemAlloc &a,
void *ptr)
116 total_bytes[type] += a.base_size;
117 if (total_bytes[type] > max_total_bytes[type]) { max_total_bytes[type] = total_bytes[type]; }
119 total_host_bytes += a.base_size;
120 if (total_host_bytes > max_total_host_bytes) { max_total_host_bytes = total_host_bytes; }
123 total_pinned_bytes += a.base_size;
124 if (total_pinned_bytes > max_total_pinned_bytes) { max_total_pinned_bytes = total_pinned_bytes; }
126 alloc[type][ptr] = a;
129 static void track_free(
const AllocType &type,
void *ptr)
131 size_t size = alloc[type][ptr].base_size;
132 total_bytes[type] -= size;
134 if (type ==
PINNED || type ==
MAPPED) { total_pinned_bytes -= size; }
135 alloc[type].erase(ptr);
144 static void *aligned_malloc(MemAlloc &a,
size_t size)
150 static int page_size = 2 * getpagesize();
151 a.base_size = ((size + page_size - 1) / page_size) * page_size;
152 int align = posix_memalign(&ptr, page_size, a.base_size);
153 if (!ptr || align != 0) {
154 errorQuda(
"Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line,
162 static bool managed =
false;
163 static bool init =
false;
166 char *enable_managed_memory = getenv(
"QUDA_ENABLE_MANAGED_MEMORY");
167 if (enable_managed_memory && strcmp(enable_managed_memory,
"1") == 0) {
168 warningQuda(
"Using managed memory for HIP allocations");
182 static bool prefetch =
false;
183 static bool init =
false;
187 char *enable_managed_prefetch = getenv(
"QUDA_ENABLE_MANAGED_PREFETCH");
188 if (enable_managed_prefetch && strcmp(enable_managed_prefetch,
"1") == 0) {
189 warningQuda(
"Enabling prefetch support for managed memory");
205 void *
device_malloc_(
const char *func,
const char *file,
int line,
size_t size)
209 #ifndef QDP_USE_CUDA_MANAGED_MEMORY
210 MemAlloc a(func, file, line);
213 a.size = a.base_size = size;
215 hipError_t err = hipMalloc(&ptr, size);
216 if (err != hipSuccess) {
217 errorQuda(
"Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
219 track_malloc(
DEVICE, a, ptr);
221 hipMemset(ptr, 0xff, size);
241 MemAlloc a(func, file, line);
244 a.size = a.base_size = size;
246 hipError_t err = hipMemAlloc((hipDeviceptr_t *)&ptr, size);
247 if (err != HIP_SUCCESS) {
248 errorQuda(
"Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
252 hipMemset(ptr, 0xff, size);
262 void *
safe_malloc_(
const char *func,
const char *file,
int line,
size_t size)
264 MemAlloc a(func, file, line);
265 a.size = a.base_size = size;
267 void *ptr = malloc(size);
268 if (!ptr) {
errorQuda(
"Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); }
269 track_malloc(
HOST, a, ptr);
285 void *
pinned_malloc_(
const char *func,
const char *file,
int line,
size_t size)
287 MemAlloc a(func, file, line);
288 void *ptr = aligned_malloc(a, size);
290 hipError_t err = hipHostRegister(ptr, a.base_size, hipHostRegisterDefault);
291 if (err != hipSuccess) {
292 errorQuda(
"Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
294 track_malloc(
PINNED, a, ptr);
296 memset(ptr, 0xff, a.base_size);
306 void *
mapped_malloc_(
const char *func,
const char *file,
int line,
size_t size)
308 MemAlloc a(func, file, line);
310 void *ptr = aligned_malloc(a, size);
311 hipError_t err = hipHostRegister(ptr, a.base_size, hipHostRegisterMapped | hipHostRegisterPortable);
312 if (err != hipSuccess) {
313 errorQuda(
"Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func);
316 track_malloc(
MAPPED, a, ptr);
318 memset(ptr, 0xff, a.base_size);
328 void *
managed_malloc_(
const char *func,
const char *file,
int line,
size_t size)
330 MemAlloc a(func, file, line);
333 a.size = a.base_size = size;
335 hipError_t err = hipMallocManaged(&ptr, size);
336 if (err != hipSuccess) {
337 errorQuda(
"Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func);
341 hipMemset(ptr, 0xff, size);
351 void device_free_(
const char *func,
const char *file,
int line,
void *ptr)
358 #ifndef QDP_USE_CUDA_MANAGED_MEMORY
359 if (!ptr) {
errorQuda(
"Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
360 if (!alloc[
DEVICE].count(ptr)) {
361 errorQuda(
"Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
363 hipError_t err = hipFree(ptr);
364 if (err != hipSuccess) {
errorQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
383 if (!ptr) {
errorQuda(
"Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
385 errorQuda(
"Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
387 hipError_t err = hipMemFree((hipDeviceptr_t)ptr);
388 if (err != HIP_SUCCESS) {
printfQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
397 void managed_free_(
const char *func,
const char *file,
int line,
void *ptr)
399 if (!ptr) {
errorQuda(
"Attempt to free NULL managed pointer (%s:%d in %s())\n", file, line, func); }
400 if (!alloc[
MANAGED].count(ptr)) {
401 errorQuda(
"Attempt to free invalid managed pointer (%s:%d in %s())\n", file, line, func);
403 hipError_t err = hipFree(ptr);
404 if (err != hipSuccess) {
errorQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
413 void host_free_(
const char *func,
const char *file,
int line,
void *ptr)
415 if (!ptr) {
errorQuda(
"Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); }
416 if (alloc[
HOST].count(ptr)) {
417 track_free(
HOST, ptr);
419 }
else if (alloc[
PINNED].count(ptr)) {
420 hipError_t err = hipHostUnregister(ptr);
421 if (err != hipSuccess) {
errorQuda(
"Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); }
424 }
else if (alloc[
MAPPED].count(ptr)) {
426 hipError_t err = hipFreeHost(ptr);
427 if (err != hipSuccess) {
errorQuda(
"Failed to free host memory (%s:%d in %s())\n", file, line, func); }
429 hipError_t err = hipHostUnregister(ptr);
430 if (err != hipSuccess) {
431 errorQuda(
"Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
437 printfQuda(
"ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
445 printfQuda(
"Device memory used = %.1f MB\n", max_total_bytes[
DEVICE] / (
double)(1 << 20));
447 printfQuda(
"Managed memory used = %.1f MB\n", max_total_bytes[
MANAGED] / (
double)(1 << 20));
448 printfQuda(
"Page-locked host memory used = %.1f MB\n", max_total_pinned_bytes / (
double)(1 << 20));
449 printfQuda(
"Total host memory used >= %.1f MB\n", max_total_host_bytes / (
double)(1 << 20));
455 || !alloc[
MAPPED].empty()) {
456 warningQuda(
"The following internal memory allocations were not freed.");
458 print_alloc_header();
498 auto error = hipHostGetDevicePointer(&device,
const_cast<void *
>(host), 0);
499 if (error != hipSuccess) {
500 errorQuda(
"hipHostGetDevicePointer failed with error %s (%s:%d in %s()", hipGetErrorString(error), file, line,
512 static std::multimap<size_t, void *> pinnedCache;
517 static std::map<void *, size_t> pinnedSize;
522 static std::multimap<size_t, void *> deviceCache;
527 static std::map<void *, size_t> deviceSize;
529 static bool pool_init =
false;
532 static bool device_memory_pool =
true;
535 static bool pinned_memory_pool =
true;
541 char *enable_device_pool = getenv(
"QUDA_ENABLE_DEVICE_MEMORY_POOL");
542 if (!enable_device_pool || strcmp(enable_device_pool,
"0") != 0) {
544 device_memory_pool =
true;
546 warningQuda(
"Not using device memory pool allocator");
547 device_memory_pool =
false;
551 char *enable_pinned_pool = getenv(
"QUDA_ENABLE_PINNED_MEMORY_POOL");
552 if (!enable_pinned_pool || strcmp(enable_pinned_pool,
"0") != 0) {
554 pinned_memory_pool =
true;
556 warningQuda(
"Not using pinned memory pool allocator");
557 pinned_memory_pool =
false;
563 void *
pinned_malloc_(
const char *func,
const char *file,
int line,
size_t nbytes)
566 if (pinned_memory_pool) {
567 std::multimap<size_t, void *>::iterator it;
569 if (pinnedCache.empty()) {
572 it = pinnedCache.lower_bound(nbytes);
573 if (it != pinnedCache.end()) {
576 pinnedCache.erase(it);
578 it = pinnedCache.begin();
580 pinnedCache.erase(it);
585 pinnedSize[ptr] = nbytes;
592 void pinned_free_(
const char *func,
const char *file,
int line,
void *ptr)
594 if (pinned_memory_pool) {
595 if (!pinnedSize.count(ptr)) {
errorQuda(
"Attempt to free invalid pointer"); }
596 pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
597 pinnedSize.erase(ptr);
603 void *
device_malloc_(
const char *func,
const char *file,
int line,
size_t nbytes)
606 if (device_memory_pool) {
607 std::multimap<size_t, void *>::iterator it;
609 if (deviceCache.empty()) {
612 it = deviceCache.lower_bound(nbytes);
613 if (it != deviceCache.end()) {
616 deviceCache.erase(it);
618 it = deviceCache.begin();
620 deviceCache.erase(it);
625 deviceSize[ptr] = nbytes;
632 void device_free_(
const char *func,
const char *file,
int line,
void *ptr)
634 if (device_memory_pool) {
635 if (!deviceSize.count(ptr)) {
errorQuda(
"Attempt to free invalid pointer"); }
636 deviceCache.insert(std::make_pair(deviceSize[ptr], ptr));
637 deviceSize.erase(ptr);
645 if (pinned_memory_pool) {
646 std::multimap<size_t, void *>::iterator it;
647 for (it = pinnedCache.begin(); it != pinnedCache.end(); it++) {
648 void *ptr = it->second;
657 if (device_memory_pool) {
658 std::multimap<size_t, void *>::iterator it;
659 for (it = deviceCache.begin(); it != deviceCache.end(); it++) {
660 void *ptr = it->second;
MemAlloc(std::string func, std::string file, int line)
MemAlloc & operator=(const MemAlloc &a)
bool comm_peer2peer_present()
Returns true if any peer-to-peer capability is present on this system (regardless of whether it has b...
void * memset(void *s, int c, size_t n)
enum QudaFieldLocation_s QudaFieldLocation
void init()
Create the BLAS context.
void init()
Initialize the memory pool allocator.
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
Allocate pinned-memory. If a free pre-existing allocation exists reuse this.
void flush_pinned()
Free all outstanding pinned-memory allocations.
void flush_device()
Free all outstanding device-memory allocations.
void pinned_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
void device_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
void * device_malloc_(const char *func, const char *file, int line, size_t size)
Allocate device-memory. If free pre-existing allocation exists reuse this.
void device_free_(const char *func, const char *file, int line, void *ptr)
void host_free_(const char *func, const char *file, int line, void *ptr)
void * device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
void * mapped_malloc_(const char *func, const char *file, int line, size_t size)
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
size_t host_allocated_peak()
void * get_mapped_device_pointer_(const char *func, const char *file, int line, const void *ptr)
void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
void managed_free_(const char *func, const char *file, int line, void *ptr)
size_t managed_allocated_peak()
size_t mapped_allocated_peak()
void * device_malloc_(const char *func, const char *file, int line, size_t size)
void * safe_malloc_(const char *func, const char *file, int line, size_t size)
bool use_managed_memory()
size_t pinned_allocated_peak()
size_t device_allocated_peak()
bool is_prefetch_enabled()
QudaFieldLocation get_pointer_location(const void *ptr)
void * managed_malloc_(const char *func, const char *file, int line, size_t size)
cudaDeviceProp deviceProp
bool getRankVerbosity()
This function returns true if the calling rank is enabled for verbosity (e.g., whether printQuda and ...