Merge branch 'main' into sb/reduce-allocs

author: Stefan Boberg <[email protected]> 2026-04-11 12:46:01 +0200
committer: GitHub Enterprise <[email protected]> 2026-04-11 12:46:01 +0200
commit: dc742b88d908d23e0c5c5d1d95994637658db2b2 (patch)
tree: 6fb25b88b64c92c503c239cf3cef497ed18ee172 /thirdparty/rpmalloc/rpmalloc.c
parent: Reduce short-lived heap allocations in zenhttp (diff)
parent: hub deprovision all (#938) (diff)
download: zen-sb/reduce-allocs.tar.xz
zen-sb/reduce-allocs.zip
1 files changed, 171 insertions, 50 deletions
diff --git a/thirdparty/rpmalloc/rpmalloc.c b/thirdparty/rpmalloc/rpmalloc.c
index 08cefe6dd..b8fe16a0a 100644
--- a/thirdparty/rpmalloc/rpmalloc.c
+++ b/thirdparty/rpmalloc/rpmalloc.c
@@ -57,6 +57,9 @@
 #endif
 
 #if PLATFORM_WINDOWS
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
 #include <windows.h>
 #include <fibersapi.h>
 static DWORD fls_key;
@@ -184,6 +187,12 @@ madvise(caddr_t, size_t, int);
 #define SPAN_SIZE (256 * 1024 * 1024)
 #define SPAN_MASK (~((uintptr_t)(SPAN_SIZE - 1)))
 
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef  MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t)-1) - SPAN_SIZE)
+#endif
+
 ////////////
 ///
 /// Utility macros
@@ -258,13 +267,13 @@ static inline size_t
 rpmalloc_clz(uintptr_t x) {
 #if ARCH_64BIT
 #if defined(_MSC_VER) && !defined(__clang__)
-	return (size_t)_lzcnt_u64(x);
+	return (size_t)__lzcnt64(x);
 #else
 	return (size_t)__builtin_clzll(x);
 #endif
 #else
 #if defined(_MSC_VER) && !defined(__clang__)
-	return (size_t)_lzcnt_u32(x);
+	return (size_t)__lzcnt32(x);
 #else
 	return (size_t)__builtin_clzl(x);
 #endif
@@ -279,9 +288,9 @@ wait_spin(void) {
 #else
 	_mm_pause();
 #endif
-#elif defined(__x86_64__) || defined(__i386__)
+#elif (defined(__x86_64__) || defined(__i386__)) && !defined(_M_ARM64EC)
 	__asm__ volatile("pause" ::: "memory");
-#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
+#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(_M_ARM64EC)
 	__asm__ volatile("yield" ::: "memory");
 #elif defined(__powerpc__) || defined(__powerpc64__)
 	// No idea if ever been compiled in such archs but ... as precaution
@@ -468,6 +477,9 @@ struct heap_t {
 	uint32_t offset;
 	//! Memory map size
 	size_t mapped_size;
+#if RPMALLOC_HEAP_STATISTICS
+	struct rpmalloc_heap_statistics_t stats;
+#endif
 };
 
 _Static_assert(sizeof(page_t) <= PAGE_HEADER_SIZE, "Invalid page header size");
@@ -530,10 +542,10 @@ static const size_class_t global_size_class[SIZE_CLASS_COUNT] = {
     LCLASS(262144), LCLASS(327680), LCLASS(393216), LCLASS(458752), LCLASS(524288)};
 
 //! Threshold number of pages for when free pages are decommitted
-static uint32_t global_page_free_overflow[4] = {16, 8, 2, 0};
+static uint32_t global_page_free_overflow[4] = {64, 16, 4, 0};
 
 //! Number of pages to retain when free page threshold overflows
-static uint32_t global_page_free_retain[4] = {4, 2, 1, 0};
+static uint32_t global_page_free_retain[4] = {16, 4, 2, 0};
 
 //! OS huge page support
 static int os_huge_pages;
@@ -719,6 +731,8 @@ os_mmap(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) {
 	// page to avoid saturating the OS commit limit
 #if ENABLE_DECOMMIT
 	DWORD do_commit = 0;
+	if (global_config.disable_decommit)
+	    do_commit = MEM_COMMIT;
 #else
 	DWORD do_commit = MEM_COMMIT;
 #endif
@@ -788,35 +802,29 @@ os_mmap(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) {
 		                                          page_mapped_current, memory_order_relaxed, memory_order_relaxed))
 			break;
 	}
-#if ENABLE_DECOMMIT
-	size_t page_active_current =
-	    atomic_fetch_add_explicit(&global_statistics.page_active, page_count, memory_order_relaxed) + page_count;
-	size_t page_active_peak = atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed);
-	while (page_active_current > page_active_peak) {
-		if (atomic_compare_exchange_weak_explicit(&global_statistics.page_active_peak, &page_active_peak,
-		                                          page_active_current, memory_order_relaxed, memory_order_relaxed))
-			break;
-	}
-#endif
 #endif
 	return ptr;
 }
 
-static void
+static int
 os_mcommit(void* address, size_t size) {
 #if ENABLE_DECOMMIT
-	if (global_config.disable_decommit)
-		return;
+	if (global_config.disable_decommit) {
+		return 0;
+	}
 #if PLATFORM_WINDOWS
 	if (!VirtualAlloc(address, size, MEM_COMMIT, PAGE_READWRITE)) {
+		if (global_memory_interface->map_fail_callback && global_memory_interface->map_fail_callback(size))
+			return os_mcommit(address, size);
 		rpmalloc_assert(0, "Failed to commit virtual memory block");
+		return 1;
 	}
 #else
-		/*
-		if (mprotect(address, size, PROT_READ | PROT_WRITE)) {
-		    rpmalloc_assert(0, "Failed to commit virtual memory block");
-		}
-		*/
+	/*
+	if (mprotect(address, size, PROT_READ | PROT_WRITE)) {
+		rpmalloc_assert(0, "Failed to commit virtual memory block");
+	}
+	*/
 #endif
 #if ENABLE_STATISTICS
 	size_t page_count = size / global_config.page_size;
@@ -833,23 +841,25 @@ os_mcommit(void* address, size_t size) {
 #endif
 	(void)sizeof(address);
 	(void)sizeof(size);
+	return 0;
 }
 
-static void
+static int
 os_mdecommit(void* address, size_t size) {
 #if ENABLE_DECOMMIT
 	if (global_config.disable_decommit)
-		return;
+		return 1;
 #if PLATFORM_WINDOWS
 	if (!VirtualFree(address, size, MEM_DECOMMIT)) {
 		rpmalloc_assert(0, "Failed to decommit virtual memory block");
+		return 1;
 	}
 #else
-		/*
-		if (mprotect(address, size, PROT_NONE)) {
-		    rpmalloc_assert(0, "Failed to decommit virtual memory block");
-		}
-		*/
+	/*
+	if (mprotect(address, size, PROT_NONE)) {
+		rpmalloc_assert(0, "Failed to decommit virtual memory block");
+	}
+	*/
 #if defined(MADV_DONTNEED)
 	if (madvise(address, size, MADV_DONTNEED)) {
 #elif defined(MADV_FREE_REUSABLE)
@@ -865,6 +875,7 @@ os_mdecommit(void* address, size_t size) {
 	if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
 #endif
 		rpmalloc_assert(0, "Failed to decommit virtual memory block");
+		return 1;
 	}
 #endif
 #if ENABLE_STATISTICS
@@ -879,6 +890,7 @@ os_mdecommit(void* address, size_t size) {
 	(void)sizeof(address);
 	(void)sizeof(size);
 #endif
+	return 0;
 }
 
 static void
@@ -986,19 +998,29 @@ page_decommit_memory_pages(page_t* page) {
 		return;
 	void* extra_page = pointer_offset(page, global_config.page_size);
 	size_t extra_page_size = page_get_size(page) - global_config.page_size;
-	global_memory_interface->memory_decommit(extra_page, extra_page_size);
+	if (global_memory_interface->memory_decommit(extra_page, extra_page_size) != 0)
+		return;
+#if RPMALLOC_HEAP_STATISTICS && ENABLE_DECOMMIT
+	if (page->heap)
+		page->heap->stats.committed_size -= extra_page_size;
+#endif
 	page->is_decommitted = 1;
 }
 
-static inline void
+static inline int
 page_commit_memory_pages(page_t* page) {
 	if (!page->is_decommitted)
-		return;
+		return 0;
 	void* extra_page = pointer_offset(page, global_config.page_size);
 	size_t extra_page_size = page_get_size(page) - global_config.page_size;
-	global_memory_interface->memory_commit(extra_page, extra_page_size);
+	if (global_memory_interface->memory_commit(extra_page, extra_page_size) != 0)
+		return 1;
 	page->is_decommitted = 0;
 #if ENABLE_DECOMMIT
+#if RPMALLOC_HEAP_STATISTICS
+	if (page->heap)
+		page->heap->stats.committed_size += extra_page_size;
+#endif
 #if !defined(__APPLE__)
 	// When page is recommitted, the blocks in the second memory page and forward
 	// will be zeroed out by OS - take advantage in zalloc/calloc calls and make sure
@@ -1008,6 +1030,7 @@ page_commit_memory_pages(page_t* page) {
 	page->is_zero = 1;
 #endif
 #endif
+	return 0;
 }
 
 static void
@@ -1090,7 +1113,7 @@ static NOINLINE void
 page_adopt_thread_free_block_list(page_t* page) {
 	if (page->local_free)
 		return;
-	unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_acquire);
+	unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
 	if (thread_free != 0) {
 		// Other threads can only replace with another valid list head, this will never change to 0 in other threads
 		while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &thread_free, 0, memory_order_acquire,
@@ -1243,8 +1266,13 @@ span_allocate_page(span_t* span) {
 
 #if ENABLE_DECOMMIT
 	// The first page is always committed on initial span map of memory
-	if (span->page_initialized)
-		global_memory_interface->memory_commit(page, span->page_size);
+	if (span->page_initialized) {
+		if (global_memory_interface->memory_commit(page, span->page_size) != 0)
+			return 0;
+#if RPMALLOC_HEAP_STATISTICS
+		heap->stats.committed_size += span->page_size;
+#endif
+	}
 #endif
 	++span->page_initialized;
 
@@ -1268,6 +1296,16 @@ span_allocate_page(span_t* span) {
 static NOINLINE void
 span_deallocate_block(span_t* span, page_t* page, void* block) {
 	if (UNEXPECTED(page->page_type == PAGE_HUGE)) {
+#if RPMALLOC_HEAP_STATISTICS
+		if (span->heap) {
+			span->heap->stats.mapped_size -= span->mapped_size;
+#if ENABLE_DECOMMIT
+			span->heap->stats.committed_size -= (size_t)span->page_count * (size_t)span->page_size; // widen before multiply
+#else
+			span->heap->stats.committed_size -= span->mapped_size; // without decommit the whole mapping counts as committed
+#endif
+		}
+#endif
 		global_memory_interface->memory_unmap(span, span->offset, span->mapped_size);
 		return;
 	}
@@ -1303,6 +1341,16 @@ block_deallocate(block_t* block) {
 	page_t* page = span_get_page_from_block(span, block);
 	const int is_thread_local = page_is_thread_heap(page);
 
+#if RPMALLOC_HEAP_STATISTICS
+	heap_t* heap = span->heap;
+	if (heap) {
+		if (span->page_type <= PAGE_LARGE)
+			heap->stats.allocated_size -= page->block_size;
+		else
+			heap->stats.allocated_size -= ((size_t)span->page_size * (size_t)span->page_count);
+	}
+#endif
+
 	// Optimized path for thread local free with non-huge block in page
 	// that has no aligned blocks
 	if (EXPECTED(is_thread_local != 0)) {
@@ -1373,7 +1421,8 @@ heap_allocate_new(void) {
 	size_t mapped_size = 0;
 	block_t* block = global_memory_interface->memory_map(heap_size, 0, &offset, &mapped_size);
 #if ENABLE_DECOMMIT
-	global_memory_interface->memory_commit(block, heap_size);
+	if (global_memory_interface->memory_commit(block, heap_size) != 0)
+		return 0;
 #endif
 	heap_t* heap = heap_initialize((void*)block);
 	heap->offset = (uint32_t)offset;
@@ -1442,7 +1491,7 @@ heap_page_free_decommit(heap_t* heap, uint32_t page_type, uint32_t page_retain_c
 	}
 }
 
-static inline void
+static inline int
 heap_make_free_page_available(heap_t* heap, uint32_t size_class, page_t* page) {
 	page->size_class = size_class;
 	page->block_size = global_size_class[size_class].block_size;
@@ -1463,8 +1512,9 @@ heap_make_free_page_available(heap_t* heap, uint32_t size_class, page_t* page) {
 	if (head)
 		head->prev = page;
 	heap->page_available[size_class] = page;
-	if (page->is_decommitted)
-		page_commit_memory_pages(page);
+	if (page->is_decommitted != 0)
+		return page_commit_memory_pages(page);
+	return 0;
 }
 
 //! Find or allocate a span for the given page type with the given size class
@@ -1478,6 +1528,9 @@ heap_get_span(heap_t* heap, page_type_t page_type) {
 	size_t offset = 0;
 	size_t mapped_size = 0;
 	span_t* span = global_memory_interface->memory_map(SPAN_SIZE, SPAN_SIZE, &offset, &mapped_size);
+#if RPMALLOC_HEAP_STATISTICS
+	heap->stats.mapped_size += mapped_size;
+#endif
 	if (EXPECTED(span != 0)) {
 		uint32_t page_count = 0;
 		uint32_t page_size = 0;
@@ -1496,7 +1549,15 @@ heap_get_span(heap_t* heap, page_type_t page_type) {
 			page_address_mask = LARGE_PAGE_MASK;
 		}
 #if ENABLE_DECOMMIT
-		global_memory_interface->memory_commit(span, page_size);
+		if (global_memory_interface->memory_commit(span, page_size) != 0)
+			return 0;
+#endif
+#if RPMALLOC_HEAP_STATISTICS
+#if ENABLE_DECOMMIT
+		heap->stats.committed_size += page_size;
+#else
+		heap->stats.committed_size += mapped_size;
+#endif
 #endif
 		span->heap = heap;
 		span->page_type = page_type;
@@ -1523,9 +1584,9 @@ heap_get_page_generic(heap_t* heap, uint32_t size_class) {
 	page_type_t page_type = get_page_type(size_class);
 
 	// Check if there is a free page from multithreaded deallocations
-	uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_acquire);
+	uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_relaxed);
 	if (UNEXPECTED(block_mt != 0)) {
-		while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_release,
+		while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_acquire,
 		                                              memory_order_relaxed)) {
 			wait_spin();
 		}
@@ -1547,7 +1608,8 @@ heap_get_page_generic(heap_t* heap, uint32_t size_class) {
 			rpmalloc_assert(heap->page_free_commit_count[page_type] > 0, "Free committed page count out of sync");
 			--heap->page_free_commit_count[page_type];
 		}
-		heap_make_free_page_available(heap, size_class, page);
+		if (heap_make_free_page_available(heap, size_class, page) != 0)
+			return 0;
 		return page;
 	}
 	rpmalloc_assert(heap->page_free_commit_count[page_type] == 0, "Free committed page count out of sync");
@@ -1565,7 +1627,8 @@ heap_get_page_generic(heap_t* heap, uint32_t size_class) {
 	span_t* span = heap_get_span(heap, page_type);
 	if (EXPECTED(span != 0)) {
 		page = span_allocate_page(span);
-		heap_make_free_page_available(page->heap, size_class, page);
+		if ((page == 0) || (heap_make_free_page_available(page->heap, size_class, page) != 0))
+			return 0; // span_allocate_page yields 0 when its commit fails; do not dereference it
 	}
 
 	return page;
@@ -1604,6 +1667,7 @@ heap_allocate_block_small_to_large(heap_t* heap, uint32_t size_class, unsigned i
 static NOINLINE RPMALLOC_ALLOCATOR void*
 heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) {
 	if (heap->id == 0) {
+		// Thread has not yet initialized, assign heap and try again
 		rpmalloc_initialize(0);
 		heap = get_thread_heap();
 	}
@@ -1614,7 +1678,16 @@ heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) {
 	if (block) {
 		span_t* span = block;
 #if ENABLE_DECOMMIT
-		global_memory_interface->memory_commit(span, alloc_size);
+		if (global_memory_interface->memory_commit(span, alloc_size) != 0)
+			return 0;
+#endif
+#if RPMALLOC_HEAP_STATISTICS
+		heap->stats.mapped_size += mapped_size;
+#if ENABLE_DECOMMIT
+		heap->stats.committed_size += alloc_size;
+#else
+		heap->stats.committed_size += mapped_size;
+#endif
 #endif
 		span->heap = heap;
 		span->page_type = PAGE_HUGE;
@@ -1635,6 +1708,9 @@ heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) {
 		void* ptr = pointer_offset(block, SPAN_HEADER_SIZE);
 		if (zero)
 			memset(ptr, 0, size);
+#if RPMALLOC_HEAP_STATISTICS
+		heap->stats.allocated_size += alloc_size; // presumably page_size * page_count, which block_deallocate subtracts - verify
+#endif
 		return ptr;
 	}
 	return 0;
@@ -1644,6 +1720,10 @@ static RPMALLOC_ALLOCATOR NOINLINE void*
 heap_allocate_block_generic(heap_t* heap, size_t size, unsigned int zero) {
 	uint32_t size_class = get_size_class(size);
 	if (EXPECTED(size_class < SIZE_CLASS_COUNT)) {
+#if RPMALLOC_HEAP_STATISTICS
+		heap->stats.allocated_size += global_size_class[size_class].block_size;
+#endif
+
 		block_t* block = heap_pop_local_free(heap, size_class);
 		if (EXPECTED(block != 0)) {
 			// Fast track with small block available in heap level local free list
@@ -1668,6 +1748,9 @@ heap_allocate_block(heap_t* heap, size_t size, unsigned int zero) {
 			// Fast track with small block available in heap level local free list
 			if (zero)
 				memset(block, 0, global_size_class[size_class].block_size);
+#if RPMALLOC_HEAP_STATISTICS
+			heap->stats.allocated_size += global_size_class[size_class].block_size;
+#endif
 			return block;
 		}
 	}
@@ -1901,7 +1984,7 @@ rprealloc(void* ptr, size_t size) {
 extern RPMALLOC_ALLOCATOR void*
 rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) {
 #if ENABLE_VALIDATE_ARGS
-	if ((size + alignment < size) || (alignment > _memory_page_size)) {
+	if ((size + alignment < size) || (alignment > SMALL_PAGE_SIZE)) {
 		errno = EINVAL;
 		return 0;
 	}
@@ -2210,6 +2293,21 @@ rpmalloc_dump_statistics(void* file) {
 #endif
 }
 
+void
+rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) {
+#if ENABLE_STATISTICS
+    stats->mapped = global_config.page_size * atomic_load_explicit(&global_statistics.page_mapped, memory_order_relaxed);
+    stats->mapped_peak = global_config.page_size * atomic_load_explicit(&global_statistics.page_mapped_peak, memory_order_relaxed);
+    stats->committed = global_config.page_size * atomic_load_explicit(&global_statistics.page_commit, memory_order_relaxed);
+    stats->decommitted = global_config.page_size * atomic_load_explicit(&global_statistics.page_decommit, memory_order_relaxed);
+    stats->active = global_config.page_size * atomic_load_explicit(&global_statistics.page_active, memory_order_relaxed);
+    stats->active_peak = global_config.page_size * atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed);
+    stats->heap_count = atomic_load_explicit(&global_statistics.heap_count, memory_order_relaxed);
+#else
+    memset(stats, 0, sizeof(rpmalloc_global_statistics_t));
+#endif
+}
+
 #if RPMALLOC_FIRST_CLASS_HEAPS
 
 rpmalloc_heap_t*
@@ -2253,6 +2351,17 @@ rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size
 }
 
 RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_zalloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+	return heap_allocate_block_aligned(heap, alignment, size, 1);
+}
+
+RPMALLOC_ALLOCATOR void*
 rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) {
 	size_t total;
 #if ENABLE_VALIDATE_ARGS
@@ -2312,7 +2421,7 @@ rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned in
 RPMALLOC_ALLOCATOR void*
 rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) {
 #if ENABLE_VALIDATE_ARGS
-	if ((size + alignment < size) || (alignment > _memory_page_size)) {
+	if ((size + alignment < size) || (alignment > SMALL_PAGE_SIZE)) {
 		errno = EINVAL;
 		return 0;
 	}
@@ -2332,6 +2441,18 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) {
 	heap_free_all(heap);
 }
 
+struct rpmalloc_heap_statistics_t
+rpmalloc_heap_statistics(rpmalloc_heap_t* heap) {
+#if RPMALLOC_HEAP_STATISTICS
+	if (heap) {
+		return heap->stats;
+	}
+#endif
+	(void)sizeof(heap);
+	struct rpmalloc_heap_statistics_t stats = {0};
+	return stats;
+}
+
 extern inline void
 rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) {
 	heap_t* prev_heap = get_thread_heap();
author	Stefan Boberg <[email protected]>	2026-04-11 12:46:01 +0200
committer	GitHub Enterprise <[email protected]>	2026-04-11 12:46:01 +0200
commit	dc742b88d908d23e0c5c5d1d95994637658db2b2 (patch)
tree	6fb25b88b64c92c503c239cf3cef497ed18ee172 /thirdparty/rpmalloc/rpmalloc.c
parent	Reduce short-lived heap allocations in zenhttp (diff)
parent	hub deprovision all (#938) (diff)
download	zen-sb/reduce-allocs.tar.xz zen-sb/reduce-allocs.zip