diff options
| author | Stefan Boberg <[email protected]> | 2025-09-17 12:48:38 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2025-09-17 12:48:38 +0200 |
| commit | 324d7ebca12909a91eb98c41ee73304ad7ee7ea6 (patch) | |
| tree | 9d256567ed94da881713c2b252b8fc4a64b1134b | |
| parent | Sorting oplog tree view by size would raise an error. (#497) (diff) | |
| download | zen-324d7ebca12909a91eb98c41ee73304ad7ee7ea6.tar.xz zen-324d7ebca12909a91eb98c41ee73304ad7ee7ea6.zip | |
rpmalloc fixes (#499)
* Fixed the rpmalloc build on Linux and macOS
* Updated rpmalloc from the develop branch on the advice of mjansson
* Enabled rpmalloc on all platforms
Note that this does not change any behaviour unless `--malloc=rpmalloc` is passed on the command line. The default is still `mimalloc`.
| -rw-r--r-- | CHANGELOG.md | 1 | ||||
| -rw-r--r-- | thirdparty/rpmalloc/rpmalloc.c | 115 | ||||
| -rw-r--r-- | thirdparty/rpmalloc/rpmalloc.h | 2 | ||||
| -rw-r--r-- | xmake.lua | 3 |
4 files changed, 69 insertions, 52 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b51c4bc6..39786b3b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - Improvement: Make exceptions handled in http request processing to warnings - Improvement: Revised project oplog in-memory representation which reduces load times and memory usage - Improvement: Updated README.md to state the required version vcpkg +- Improvement: Updated rpmalloc to latest from 'develop' stream, now also compiles on Linux/Mac - Bugfix: Self-hosted dashboard; sorting oplog by size in the tree view would raise an error ## 5.7.1 diff --git a/thirdparty/rpmalloc/rpmalloc.c b/thirdparty/rpmalloc/rpmalloc.c index 7aecfb0f4..08cefe6dd 100644 --- a/thirdparty/rpmalloc/rpmalloc.c +++ b/thirdparty/rpmalloc/rpmalloc.c @@ -21,6 +21,10 @@ #include <stdint.h> #include <stdatomic.h> +#if !defined(__has_builtin) +#define __has_builtin(b) 0 +#endif + #if defined(__clang__) #pragma clang diagnostic ignored "-Wunused-macros" #pragma clang diagnostic ignored "-Wunused-function" @@ -109,10 +113,6 @@ madvise(caddr_t, size_t, int); #define ARCH_32BIT 1 #endif -#if !defined(__has_builtin) -#define __has_builtin(b) 0 -#endif - #define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) @@ -210,7 +210,9 @@ madvise(caddr_t, size_t, int); } while (0) #endif -#if __has_builtin(__builtin_assume) +#if defined(_MSC_VER) +#define rpmalloc_assume(cond) __assume(cond) +#elif defined(__clang__) && __has_builtin(__builtin_assume) #define rpmalloc_assume(cond) __builtin_assume(cond) #elif defined(__GNUC__) #define rpmalloc_assume(cond) \ @@ -218,8 +220,6 @@ madvise(caddr_t, size_t, int); if (!__builtin_expect(cond, 0)) \ __builtin_unreachable(); \ } while (0) -#elif defined(_MSC_VER) -#define rpmalloc_assume(cond) __assume(cond) #else #define rpmalloc_assume(cond) 0 #endif @@ -305,8 +305,9 @@ wait_spin(void) { #define UNEXPECTED(x) x #endif -#if 
defined(__GNUC__) || defined(__clang__) +#if defined(__GNUC__) || defined(__clang__) +#ifdef __has_builtin #if __has_builtin(__builtin_memcpy_inline) #define memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) #else @@ -326,7 +327,10 @@ wait_spin(void) { memset(x, y, s); \ } while (0) #endif -#else +#endif +#endif + +#ifndef memcpy_const #define memcpy_const(x, y, s) memcpy(x, y, s) #define memset_const(x, y, s) memset(x, y, s) #endif @@ -548,10 +552,17 @@ static size_t os_page_size; #if defined(_MSC_VER) && !defined(__clang__) #define TLS_MODEL #define _Thread_local __declspec(thread) +#elif defined(__ANDROID__) +#if __ANDROID_API__ >= 29 && \ + ((defined(__clang__) && (__clang_major__ >= 17)) || (defined(__NDK_MAJOR__) && (__NDK_MAJOR__ >= 26))) +#define TLS_MODEL __attribute__((tls_model("local-dynamic"))) #else -// #define TLS_MODEL __attribute__((tls_model("initial-exec"))) #define TLS_MODEL #endif +#else +#define TLS_MODEL __attribute__((tls_model("initial-exec"))) +// #define TLS_MODEL +#endif static _Thread_local heap_t* global_thread_heap TLS_MODEL = &global_heap_fallback; static heap_t* @@ -565,38 +576,35 @@ static inline uintptr_t get_thread_id(void) { #if defined(_WIN32) return (uintptr_t)((void*)NtCurrentTeb()); -#else +#elif !defined(__APPLE__) && !defined(__CYGWIN__) && \ + ((defined(__clang__) && (__clang_major__ >= 7)) || ((defined(__GNUC__) && (__GNUC__ >= 5)))) && \ + (defined(__aarch64__) || defined(__x86_64__) || defined(__loongarch__)) // Unsure of other archs, needs testing void* thp = __builtin_thread_pointer(); return (uintptr_t)thp; +#else + uintptr_t tid; +#if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r"(tid) : :); +#elif defined(__x86_64__) +#if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r"(tid) : :); +#else + __asm__("movq %%fs:0, %0" : "=r"(tid) : :); +#endif +#elif defined(__arm__) + __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); +#elif defined(__aarch64__) +#if defined(__MACH__) + // tpidr_el0 likely 
unused, always return 0 on iOS + __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); +#else + __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); +#endif +#else + tid = (uintptr_t)&global_thread_heap; +#endif + return tid; #endif - /* - #elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) - uintptr_t tid; - #if defined(__i386__) - __asm__("movl %%gs:0, %0" : "=r"(tid) : :); - #elif defined(__x86_64__) - #if defined(__MACH__) - __asm__("movq %%gs:0, %0" : "=r"(tid) : :); - #else - __asm__("movq %%fs:0, %0" : "=r"(tid) : :); - #endif - #elif defined(__arm__) - __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); - #elif defined(__aarch64__) - #if defined(__MACH__) - // tpidr_el0 likely unused, always return 0 on iOS - __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); - #else - __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); - #endif - #else - #error This platform needs implementation of get_thread_id() - #endif - return tid; - #else - #error This platform needs implementation of get_thread_id() - #endif - */ } //! 
Set the current thread heap @@ -1044,7 +1052,7 @@ page_full_to_free_on_new_heap(page_t* page, heap_t* heap) { page->is_full = 0; page->is_free = 1; page->heap = heap; - atomic_store_explicit(&page->thread_free, 0, memory_order_relaxed); + atomic_store_explicit(&page->thread_free, 0, memory_order_release); page->next = heap->page_free[page->page_type]; heap->page_free[page->page_type] = page; if (++heap->page_free_commit_count[page->page_type] >= global_page_free_overflow[page->page_type]) @@ -1082,10 +1090,10 @@ static NOINLINE void page_adopt_thread_free_block_list(page_t* page) { if (page->local_free) return; - unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed); + unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_acquire); if (thread_free != 0) { // Other threads can only replace with another valid list head, this will never change to 0 in other threads - while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &thread_free, 0, memory_order_relaxed, + while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &thread_free, 0, memory_order_acquire, memory_order_relaxed)) wait_spin(); page->local_free_count = page_block_from_thread_free_list(page, thread_free, &page->local_free); @@ -1104,7 +1112,7 @@ page_put_thread_free_block(page_t* page, block_t* block) { uintptr_t prev_head = atomic_load_explicit(&heap->thread_free[page->page_type], memory_order_relaxed); block->next = (void*)prev_head; while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page->page_type], &prev_head, (uintptr_t)block, - memory_order_relaxed, memory_order_relaxed)) { + memory_order_release, memory_order_relaxed)) { block->next = (void*)prev_head; wait_spin(); } @@ -1115,7 +1123,7 @@ page_put_thread_free_block(page_t* page, block_t* block) { uint32_t list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1; uint64_t thread_free = 
page_block_to_thread_free_list(page, block_index, list_size); while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &prev_thread_free, thread_free, - memory_order_relaxed, memory_order_relaxed)) { + memory_order_release, memory_order_relaxed)) { list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1; thread_free = page_block_to_thread_free_list(page, block_index, list_size); wait_spin(); @@ -1170,7 +1178,7 @@ page_allocate_block(page_t* page, unsigned int zero) { unsigned int is_zero = 0; block_t* block = (page->local_free != 0) ? page_get_local_free_block(page) : 0; if (UNEXPECTED(block == 0)) { - if (atomic_load_explicit(&page->thread_free, memory_order_relaxed) != 0) { + if (atomic_load_explicit(&page->thread_free, memory_order_acquire) != 0) { page_adopt_thread_free_block_list(page); block = (page->local_free != 0) ? page_get_local_free_block(page) : 0; } @@ -1451,7 +1459,7 @@ heap_make_free_page_available(heap_t* heap, uint32_t size_class, page_t* page) { page_t* head = heap->page_available[size_class]; page->next = head; page->prev = 0; - atomic_store_explicit(&page->thread_free, 0, memory_order_relaxed); + atomic_store_explicit(&page->thread_free, 0, memory_order_release); if (head) head->prev = page; heap->page_available[size_class] = page; @@ -1515,9 +1523,9 @@ heap_get_page_generic(heap_t* heap, uint32_t size_class) { page_type_t page_type = get_page_type(size_class); // Check if there is a free page from multithreaded deallocations - uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_relaxed); + uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_acquire); if (UNEXPECTED(block_mt != 0)) { - while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_relaxed, + while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_release, memory_order_relaxed)) { 
wait_spin(); } @@ -1595,7 +1603,10 @@ heap_allocate_block_small_to_large(heap_t* heap, uint32_t size_class, unsigned i //! Generic allocation path from heap pages, spans or new mapping static NOINLINE RPMALLOC_ALLOCATOR void* heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) { - (void)sizeof(heap); + if (heap->id == 0) { + rpmalloc_initialize(0); + heap = get_thread_heap(); + } size_t alloc_size = get_page_aligned_size(size + SPAN_HEADER_SIZE); size_t offset = 0; size_t mapped_size = 0; @@ -1790,7 +1801,7 @@ heap_free_all(heap_t* heap) { heap->span_partial[itype] = 0; heap->page_free[itype] = 0; heap->page_free_commit_count[itype] = 0; - atomic_store_explicit(&heap->thread_free[itype], 0, memory_order_relaxed); + atomic_store_explicit(&heap->thread_free[itype], 0, memory_order_release); } for (int itype = 0; itype < 4; ++itype) { span_t* span = heap->span_used[itype]; @@ -2026,7 +2037,11 @@ rpmalloc_initialize(rpmalloc_interface_t* memory_interface) { if (global_config.enable_huge_pages) { #if PLATFORM_WINDOWS HANDLE token = 0; +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM) size_t large_page_minimum = GetLargePageMinimum(); +#else + size_t large_page_minimum = 2 * 1024 * 1024; +#endif if (large_page_minimum) OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); if (token) { diff --git a/thirdparty/rpmalloc/rpmalloc.h b/thirdparty/rpmalloc/rpmalloc.h index 2e67280f9..d11292fb1 100644 --- a/thirdparty/rpmalloc/rpmalloc.h +++ b/thirdparty/rpmalloc/rpmalloc.h @@ -353,7 +353,7 @@ rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_AT // also be strictly less than the span size (default 64KiB). RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + RPMALLOC_ATTRIB_ALLOC_SIZE2(3, 4); //! 
Reallocate the given block to at least the given size. The memory block MUST be allocated // by the same heap given to this function. @@ -121,6 +121,7 @@ end if is_os("linux") then add_cxxflags("-Wno-vla-cxx-extension") + add_defines("_GNU_SOURCE") end -- Turn use of undefined cpp macros into errors @@ -150,7 +151,7 @@ option_end() add_define_by_config("ZEN_USE_MIMALLOC", "zenmimalloc") option("zenrpmalloc") - set_default(is_os("windows")) + set_default(true) set_showmenu(true) set_description("Use rpmalloc for faster memory management") option_end() |