8 #include <shmem_helper.cuh>
12 #include "qdp_config.h"
15 #ifdef QUDA_BACKWARDSCPP
16 #include "backward.hpp"
33 #ifdef QUDA_BACKWARDSCPP
34 backward::StackTrace st;
41 #ifdef QUDA_BACKWARDSCPP
55 #ifdef QUDA_BACKWARDSCPP
66 static size_t total_host_bytes, max_total_host_bytes;
67 static size_t total_pinned_bytes, max_total_pinned_bytes;
89 static void print_trace(
void)
94 size = backtrace(array, 10);
95 strings = backtrace_symbols(array, size);
96 printfQuda(
"Obtained %zd stack frames.\n", size);
97 for (
size_t i = 0; i < size; i++)
printfQuda(
"%s\n", strings[i]);
101 static void print_alloc_header()
104 printfQuda(
"----------------------------------------------------------\n");
109 const char *type_str[] = {
"Device",
"Device Pinned",
"Host ",
"Pinned",
"Mapped",
"Managed",
"Shmem "};
111 for (
auto entry : alloc[type]) {
112 void *ptr = entry.first;
113 MemAlloc a = entry.second;
114 printfQuda(
"%s %15p %15lu %s(), %s:%d\n", type_str[type], ptr, (
unsigned long)a.base_size, a.func.c_str(),
115 a.file.c_str(), a.line);
116 #ifdef QUDA_BACKWARDSCPP
125 static void track_malloc(
const AllocType &type,
const MemAlloc &a,
void *ptr)
127 total_bytes[type] += a.base_size;
128 if (total_bytes[type] > max_total_bytes[type]) { max_total_bytes[type] = total_bytes[type]; }
130 total_host_bytes += a.base_size;
131 if (total_host_bytes > max_total_host_bytes) { max_total_host_bytes = total_host_bytes; }
134 total_pinned_bytes += a.base_size;
135 if (total_pinned_bytes > max_total_pinned_bytes) { max_total_pinned_bytes = total_pinned_bytes; }
137 alloc[type][ptr] = a;
140 static void track_free(
const AllocType &type,
void *ptr)
142 size_t size = alloc[type][ptr].base_size;
143 total_bytes[type] -= size;
145 if (type ==
PINNED || type ==
MAPPED) { total_pinned_bytes -= size; }
146 alloc[type].erase(ptr);
155 static void *aligned_malloc(MemAlloc &a,
size_t size)
161 #if (CUDA_VERSION > 4000) \
167 static int page_size = 2 * getpagesize();
168 a.base_size = ((size + page_size - 1) / page_size) * page_size;
169 int align = posix_memalign(&ptr, page_size, a.base_size);
170 if (!ptr || align != 0) {
172 errorQuda(
"Failed to allocate aligned host memory of size %zu (%s:%d in %s())\n", size, a.file.c_str(), a.line,
180 static bool managed =
false;
181 static bool init =
false;
184 char *enable_managed_memory = getenv(
"QUDA_ENABLE_MANAGED_MEMORY");
185 if (enable_managed_memory && strcmp(enable_managed_memory,
"1") == 0) {
186 warningQuda(
"Using managed memory for CUDA allocations");
200 static bool prefetch =
false;
201 static bool init =
false;
205 char *enable_managed_prefetch = getenv(
"QUDA_ENABLE_MANAGED_PREFETCH");
206 if (enable_managed_prefetch && strcmp(enable_managed_prefetch,
"1") == 0) {
207 warningQuda(
"Enabling prefetch support for managed memory");
227 #ifndef QDP_USE_CUDA_MANAGED_MEMORY
233 cudaError_t err = cudaMalloc(&ptr, size);
234 if (err != cudaSuccess) {
235 errorQuda(
"Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
237 track_malloc(
DEVICE, a, ptr);
264 CUresult err = cuMemAlloc((CUdeviceptr *)&ptr, size);
265 if (err != CUDA_SUCCESS) {
266 errorQuda(
"Failed to allocate device memory of size %zu (%s:%d in %s())\n", size, file, line, func);
280 void *
safe_malloc_(
const char *func,
const char *file,
int line,
size_t size)
285 void *ptr = malloc(size);
286 if (!ptr) {
errorQuda(
"Failed to allocate host memory of size %zu (%s:%d in %s())\n", size, file, line, func); }
287 track_malloc(
HOST, a, ptr);
306 void *ptr = aligned_malloc(a, size);
308 cudaError_t err = cudaHostRegister(ptr, a.
base_size, cudaHostRegisterDefault);
309 if (err != cudaSuccess) {
310 errorQuda(
"Failed to register pinned memory of size %zu (%s:%d in %s())\n", size, file, line, func);
312 track_malloc(
PINNED, a, ptr);
330 static int page_size = 2*getpagesize();
331 a.
base_size = ((size + page_size - 1) / page_size) * page_size;
333 cudaError_t err = cudaHostAlloc(&ptr, a.
base_size, cudaHostAllocMapped | cudaHostAllocPortable);
334 if (err != cudaSuccess) {
335 errorQuda(
"cudaHostAlloc failed of size %zu (%s:%d in %s())\n", size, file, line, func); }
338 void *ptr = aligned_malloc(a, size);
339 cudaError_t err = cudaHostRegister(ptr, a.base_size, cudaHostRegisterMapped | cudaHostRegisterPortable);
340 if (err != cudaSuccess) {
341 errorQuda(
"Failed to register host-mapped memory of size %zu (%s:%d in %s())\n", size, file, line, func);
344 track_malloc(
MAPPED, a, ptr);
346 memset(ptr, 0xff, a.base_size);
363 cudaError_t err = cudaMallocManaged(&ptr, size);
364 if (err != cudaSuccess) {
365 errorQuda(
"Failed to allocate managed memory of size %zu (%s:%d in %s())\n", size, file, line, func);
378 void *shmem_malloc_(
const char *func,
const char *file,
int line,
size_t size)
380 MemAlloc a(func, file, line);
382 a.size = a.base_size = size;
384 auto ptr = nvshmem_malloc(size);
385 if (ptr ==
nullptr) {
386 printfQuda(
"ERROR: Failed to allocate shmem memory of size %zu (%s:%d in %s())\n", size, file, line, func);
389 track_malloc(
SHMEM, a, ptr);
404 return shmem_malloc_(func, file, line, size);
415 void device_free_(
const char *func,
const char *file,
int line,
void *ptr)
422 #ifndef QDP_USE_CUDA_MANAGED_MEMORY
423 if (!ptr) {
errorQuda(
"Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
424 if (!alloc[
DEVICE].count(ptr)) {
425 errorQuda(
"Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
427 cudaError_t err = cudaFree(ptr);
428 if (err != cudaSuccess) {
errorQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
447 if (!ptr) {
errorQuda(
"Attempt to free NULL device pointer (%s:%d in %s())\n", file, line, func); }
449 errorQuda(
"Attempt to free invalid device pointer (%s:%d in %s())\n", file, line, func);
451 CUresult err = cuMemFree((CUdeviceptr)ptr);
452 if (err != CUDA_SUCCESS) {
printfQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
463 if (!ptr) {
errorQuda(
"Attempt to free NULL managed pointer (%s:%d in %s())\n", file, line, func); }
464 if (!alloc[
MANAGED].count(ptr)) {
465 errorQuda(
"Attempt to free invalid managed pointer (%s:%d in %s())\n", file, line, func);
467 cudaError_t err = cudaFree(ptr);
468 if (err != cudaSuccess) {
errorQuda(
"Failed to free device memory (%s:%d in %s())\n", file, line, func); }
477 void host_free_(
const char *func,
const char *file,
int line,
void *ptr)
479 if (!ptr) {
errorQuda(
"Attempt to free NULL host pointer (%s:%d in %s())\n", file, line, func); }
480 if (alloc[
HOST].count(ptr)) {
481 track_free(
HOST, ptr);
483 }
else if (alloc[
PINNED].count(ptr)) {
484 cudaError_t err = cudaHostUnregister(ptr);
485 if (err != cudaSuccess) {
errorQuda(
"Failed to unregister pinned memory (%s:%d in %s())\n", file, line, func); }
488 }
else if (alloc[
MAPPED].count(ptr)) {
490 cudaError_t err = cudaFreeHost(ptr);
491 if (err != cudaSuccess) {
errorQuda(
"Failed to free host memory (%s:%d in %s())\n", file, line, func); }
493 cudaError_t err = cudaHostUnregister(ptr);
494 if (err != cudaSuccess) {
495 errorQuda(
"Failed to unregister host-mapped memory (%s:%d in %s())\n", file, line, func);
501 printfQuda(
"ERROR: Attempt to free invalid host pointer (%s:%d in %s())\n", file, line, func);
511 void shmem_free_(
const char *func,
const char *file,
int line,
void *ptr)
514 printfQuda(
"ERROR: Attempt to free NULL shmem pointer (%s:%d in %s())\n", file, line, func);
517 if (!alloc[
SHMEM].count(ptr)) {
518 printfQuda(
"ERROR: Attempt to free invalid shmem pointer (%s:%d in %s())\n", file, line, func);
522 track_free(
SHMEM, ptr);
533 shmem_free_(func, file, line, ptr);
541 printfQuda(
"Device memory used = %.1f MiB\n", max_total_bytes[
DEVICE] / (
double)(1 << 20));
543 printfQuda(
"Managed memory used = %.1f MiB\n", max_total_bytes[
MANAGED] / (
double)(1 << 20));
544 printfQuda(
"Shmem memory used = %.1f MiB\n", max_total_bytes[
SHMEM] / (
double)(1 << 20));
545 printfQuda(
"Page-locked host memory used = %.1f MiB\n", max_total_pinned_bytes / (
double)(1 << 20));
546 printfQuda(
"Total host memory used >= %.1f MiB\n", max_total_host_bytes / (
double)(1 << 20));
552 || !alloc[
MAPPED].empty()) {
553 warningQuda(
"The following internal memory allocations were not freed.");
555 print_alloc_header();
569 CUpointer_attribute attribute[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE};
570 CUmemorytype mem_type;
571 void *data[] = {&mem_type};
572 CUresult error = cuPointerGetAttributes(1, attribute, data,
reinterpret_cast<CUdeviceptr
>(ptr));
573 if (error != CUDA_SUCCESS) {
575 cuGetErrorString(error, &
string);
576 errorQuda(
"cuPointerGetAttributes failed with error %s",
string);
580 if (mem_type == 0) mem_type = CU_MEMORYTYPE_HOST;
583 case CU_MEMORYTYPE_DEVICE:
593 auto error = cudaHostGetDevicePointer(&device,
const_cast<void *
>(host), 0);
594 if (error != cudaSuccess) {
595 errorQuda(
"cudaHostGetDevicePointer failed with error %s (%s:%d in %s()", cudaGetErrorString(error), file, line,
607 static std::multimap<size_t, void *> pinnedCache;
612 static std::map<void *, size_t> pinnedSize;
617 static std::multimap<size_t, void *> deviceCache;
622 static std::map<void *, size_t> deviceSize;
624 static bool pool_init =
false;
627 static bool device_memory_pool =
true;
630 static bool pinned_memory_pool =
true;
636 char *enable_device_pool = getenv(
"QUDA_ENABLE_DEVICE_MEMORY_POOL");
637 if (!enable_device_pool || strcmp(enable_device_pool,
"0") != 0) {
639 device_memory_pool =
true;
641 warningQuda(
"Not using device memory pool allocator");
642 device_memory_pool =
false;
646 char *enable_pinned_pool = getenv(
"QUDA_ENABLE_PINNED_MEMORY_POOL");
647 if (!enable_pinned_pool || strcmp(enable_pinned_pool,
"0") != 0) {
649 pinned_memory_pool =
true;
651 warningQuda(
"Not using pinned memory pool allocator");
652 pinned_memory_pool =
false;
656 #if defined(NVSHMEM_COMMS)
657 MPI_Comm
tmp = MPI_COMM_WORLD;
659 nvshmemx_init_attr_t attr;
660 attr.mpi_comm = &
tmp;
661 nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
665 void *
pinned_malloc_(
const char *func,
const char *file,
int line,
size_t nbytes)
668 if (pinned_memory_pool) {
669 if (pinnedCache.empty()) {
672 auto it = pinnedCache.lower_bound(nbytes);
673 if (it != pinnedCache.end()) {
676 pinnedCache.erase(it);
678 it = pinnedCache.begin();
680 pinnedCache.erase(it);
685 pinnedSize[ptr] = nbytes;
692 void pinned_free_(
const char *func,
const char *file,
int line,
void *ptr)
694 if (pinned_memory_pool) {
695 if (!pinnedSize.count(ptr)) {
errorQuda(
"Attempt to free invalid pointer"); }
696 pinnedCache.insert(std::make_pair(pinnedSize[ptr], ptr));
697 pinnedSize.erase(ptr);
703 void *
device_malloc_(
const char *func,
const char *file,
int line,
size_t nbytes)
706 if (device_memory_pool) {
707 if (deviceCache.empty()) {
710 auto it = deviceCache.lower_bound(nbytes);
711 if (it != deviceCache.end()) {
714 deviceCache.erase(it);
716 it = deviceCache.begin();
718 deviceCache.erase(it);
723 deviceSize[ptr] = nbytes;
730 void device_free_(
const char *func,
const char *file,
int line,
void *ptr)
732 if (device_memory_pool) {
733 if (!deviceSize.count(ptr)) {
errorQuda(
"Attempt to free invalid pointer"); }
734 deviceCache.insert(std::make_pair(deviceSize[ptr], ptr));
735 deviceSize.erase(ptr);
742 void *shmem_malloc_(
const char *func,
const char *file,
int line,
size_t nbytes)
744 return quda::shmem_malloc_(func, file, line, nbytes);
747 void shmem_free_(
const char *func,
const char *file,
int line,
void *ptr)
749 quda::shmem_free_(func, file, line, ptr);
755 if (pinned_memory_pool) {
756 for (
auto it : pinnedCache) {
host_free(it.second); }
763 if (device_memory_pool) {
764 for (
auto it : deviceCache) {
device_free(it.second); }
MemAlloc(std::string func, std::string file, int line)
MemAlloc & operator=(const MemAlloc &a)
bool comm_peer2peer_present()
Returns true if any peer-to-peer capability is present on this system (regardless of whether it has b...
void * memset(void *s, int c, size_t n)
cudaColorSpinorField * tmp
@ QUDA_CUDA_FIELD_LOCATION
@ QUDA_CPU_FIELD_LOCATION
@ QUDA_INVALID_FIELD_LOCATION
enum QudaFieldLocation_s QudaFieldLocation
void init()
Create the BLAS context.
void init()
Initialize the memory pool allocator.
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
Allocate pinned-memory. If a free pre-existing allocation exists reuse this.
void flush_pinned()
Free all outstanding pinned-memory allocations.
void flush_device()
Free all outstanding device-memory allocations.
void pinned_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
void device_free_(const char *func, const char *file, int line, void *ptr)
Virtual free of pinned-memory allocation.
void * device_malloc_(const char *func, const char *file, int line, size_t size)
Allocate device-memory. If free pre-existing allocation exists reuse this.
void device_free_(const char *func, const char *file, int line, void *ptr)
void host_free_(const char *func, const char *file, int line, void *ptr)
void * device_pinned_malloc_(const char *func, const char *file, int line, size_t size)
void * mapped_malloc_(const char *func, const char *file, int line, size_t size)
size_t mapped_allocated()
void * pinned_malloc_(const char *func, const char *file, int line, size_t size)
size_t device_allocated()
size_t host_allocated_peak()
void * get_mapped_device_pointer_(const char *func, const char *file, int line, const void *ptr)
void device_pinned_free_(const char *func, const char *file, int line, void *ptr)
size_t managed_allocated()
void managed_free_(const char *func, const char *file, int line, void *ptr)
size_t pinned_allocated()
size_t managed_allocated_peak()
void device_comms_pinned_free_(const char *func, const char *file, int line, void *ptr)
size_t mapped_allocated_peak()
void * device_malloc_(const char *func, const char *file, int line, size_t size)
void * safe_malloc_(const char *func, const char *file, int line, size_t size)
bool use_managed_memory()
size_t pinned_allocated_peak()
size_t device_allocated_peak()
bool is_prefetch_enabled()
QudaFieldLocation get_pointer_location(const void *ptr)
void * managed_malloc_(const char *func, const char *file, int line, size_t size)
void * device_comms_pinned_malloc_(const char *func, const char *file, int line, size_t size)
#define qudaMemset(ptr, value, count)
cudaDeviceProp deviceProp
bool getRankVerbosity()
This function returns true if the calling rank is enabled for verbosity (e.g., whether printQuda and ...