author     Stefan Boberg <[email protected]>   2024-11-25 09:56:23 +0100
committer  GitHub Enterprise <[email protected]>   2024-11-25 09:56:23 +0100
commit     8b8de92e51db4cc4c1727712c736dcba5f79d369 (patch)
tree       1f58edaaad389837a7652daebab246125762240e /thirdparty
parent     5.5.13 (diff)
Insights-compatible memory tracking (#214)
This change introduces support for tracing memory allocation activity. The code is ported from UE5, and Unreal Insights can be used to analyze the output. This is currently only fully supported on Windows, but will be extended to Mac/Linux in the near future.

To activate full memory tracking, pass `--trace=memory` on the command line alongside `--tracehost=<ip>` or `--tracefile=<path>`. For finer control over how much detail is traced, you can instead pass some combination of `callstack`, `memtag`, and `memalloc`; in practice, `--trace=memory` is an alias for `--trace=callstack,memtag,memalloc`. For convenience we also support `--trace=memory_light`, which omits call stacks.

This change also introduces multiple memory allocators, which may be selected via the command-line option `--malloc=<allocator>` (example invocations below):

* `mimalloc` - mimalloc (default, same as before)
* `rpmalloc` - another high-performance allocator for multithreaded applications which may be a better option than mimalloc (to be evaluated). Due to toolchain limitations this is currently only supported on Windows.
* `stomp` - an allocator intended for use during development/debugging to help track down memory issues such as use-after-free or out-of-bounds access. Currently only supported on Windows.
* `ansi` - fallback to the default system allocator
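
For illustration only, hypothetical invocations combining these flags (the executable name, host address, and file paths are placeholder assumptions, not taken from this change):

    zen --trace=memory --tracehost=10.0.0.5 --malloc=rpmalloc
    zen --trace=callstack,memtag --tracefile=C:\traces\server.utrace
    zen --trace=memory_light --tracefile=C:\traces\light.utrace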
Diffstat (limited to 'thirdparty')
-rw-r--r--  thirdparty/rpmalloc/malloc.c     367
-rw-r--r--  thirdparty/rpmalloc/rpmalloc.c  2341
-rw-r--r--  thirdparty/rpmalloc/rpmalloc.h   396
-rw-r--r--  thirdparty/rpmalloc/rpnew.h      111
4 files changed, 3215 insertions, 0 deletions
diff --git a/thirdparty/rpmalloc/malloc.c b/thirdparty/rpmalloc/malloc.c
new file mode 100644
index 000000000..835eff18e
--- /dev/null
+++ b/thirdparty/rpmalloc/malloc.c
@@ -0,0 +1,367 @@
+/* malloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson
+ *
+ * This library provides a cross-platform lock free thread caching malloc implementation in C11.
+ * The latest source code is always available at
+ *
+ * https://github.com/mjansson/rpmalloc
+ *
+ * This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
+ *
+ */
+
+/* clang-format off */
+
+//
+// This file provides overrides for the standard library malloc entry points for C and new/delete operators for C++
+// It also provides automatic initialization/finalization of process and threads
+//
+
+#if defined(__TINYC__)
+#include <sys/types.h>
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__))
+#pragma GCC visibility push(default)
+#endif
+
+#define USE_IMPLEMENT 1
+#define USE_INTERPOSE 0
+#define USE_ALIAS 0
+
+#if defined(__APPLE__)
+#undef USE_INTERPOSE
+#define USE_INTERPOSE 1
+
+typedef struct interpose_t {
+ void* new_func;
+ void* orig_func;
+} interpose_t;
+
+#define MAC_INTERPOSE_PAIR(newf, oldf) { (void*)newf, (void*)oldf }
+#define MAC_INTERPOSE_SINGLE(newf, oldf) \
+__attribute__((used)) static const interpose_t macinterpose##newf##oldf \
+__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf)
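+// For illustration (this macro is not used in this file; the table form below is
+// used instead), a hypothetical single override would look like:
+//   MAC_INTERPOSE_SINGLE(rpfree, free);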
+
+#endif
+
+#if !defined(_WIN32) && !defined(__APPLE__)
+#undef USE_IMPLEMENT
+#undef USE_ALIAS
+#define USE_IMPLEMENT 0
+#define USE_ALIAS 1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable : 4100)
+#undef malloc
+#undef free
+#undef calloc
+#endif
+
+#if ENABLE_OVERRIDE
+
+typedef struct rp_nothrow_t { int __dummy; } rp_nothrow_t;
+
+#if USE_INTERPOSE || USE_ALIAS
+
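+// Note: C++ aligned operator new receives (size, align) while rpaligned_alloc
+// takes (align, size); the *_reverse wrappers below swap the arguments.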
+static void* rpmalloc_nothrow(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+static void* rpaligned_alloc_reverse(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+static void* rpaligned_alloc_reverse_nothrow(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+static void rpfree_size(void* p, size_t size) { (void)sizeof(size); rpfree(p); }
+static void rpfree_aligned(void* p, size_t align) { (void)sizeof(align); rpfree(p); }
+static void rpfree_size_aligned(void* p, size_t size, size_t align) { (void)sizeof(size); (void)sizeof(align); rpfree(p); }
+
+#endif
+
+extern inline void* RPMALLOC_CDECL rpvalloc(size_t size) {
+ return rpaligned_alloc(os_page_size, size);
+}
+
+extern inline void* RPMALLOC_CDECL
+rppvalloc(size_t size) {
+ const size_t page_size = os_page_size;
+ const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size;
+#if ENABLE_VALIDATE_ARGS
+ if (aligned_size < size) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ return rpaligned_alloc(os_page_size, aligned_size);
+}
+
+extern inline void* RPMALLOC_CDECL
+rpreallocarray(void* ptr, size_t count, size_t size) {
+ size_t total;
+#if ENABLE_VALIDATE_ARGS
+#ifdef _MSC_VER
+ int err = SizeTMult(count, size, &total);
+ if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#else
+ int err = __builtin_umull_overflow(count, size, &total);
+ if (err || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+#else
+ total = count * size;
+#endif
+ return rprealloc(ptr, total);
+}
+
+#if USE_IMPLEMENT
+
+extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); }
+extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); }
+extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); }
+extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); }
+extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); }
+extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); }
+extern inline int RPMALLOC_CDECL posix_memalign(void** memptr, size_t alignment, size_t size) { return rpposix_memalign(memptr, alignment, size); }
+extern inline void RPMALLOC_CDECL free(void* ptr) { rpfree(ptr); }
+extern inline void RPMALLOC_CDECL cfree(void* ptr) { rpfree(ptr); }
+extern inline size_t RPMALLOC_CDECL malloc_usable_size(void* ptr) { return rpmalloc_usable_size(ptr); }
+extern inline size_t RPMALLOC_CDECL malloc_size(void* ptr) { return rpmalloc_usable_size(ptr); }
+extern inline void* RPMALLOC_CDECL valloc(size_t size) { return rpvalloc(size); }
+extern inline void* RPMALLOC_CDECL pvalloc(size_t size) { return rppvalloc(size); }
+extern inline void* RPMALLOC_CDECL reallocarray(void* ptr, size_t count, size_t size) { return rpreallocarray(ptr, count, size); }
+
+#ifdef _WIN32
+extern inline void* RPMALLOC_CDECL _malloc_base(size_t size) { return rpmalloc(size); }
+extern inline void RPMALLOC_CDECL _free_base(void* ptr) { rpfree(ptr); }
+extern inline void* RPMALLOC_CDECL _calloc_base(size_t count, size_t size) { return rpcalloc(count, size); }
+extern inline size_t RPMALLOC_CDECL _msize(void* ptr) { return rpmalloc_usable_size(ptr); }
+extern inline size_t RPMALLOC_CDECL _msize_base(void* ptr) { return rpmalloc_usable_size(ptr); }
+extern inline void* RPMALLOC_CDECL _realloc_base(void* ptr, size_t size) { return rprealloc(ptr, size); }
+#endif
+
+#ifdef _WIN32
+// For Windows, #include <rpnew.h> in one source file to get the C++ operator overrides implemented in your module
+#else
+// Overload the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling)
+// operators delete and delete[]
+#define RPDEFVIS __attribute__((visibility("default")))
+extern void _ZdlPv(void* p); void RPDEFVIS _ZdlPv(void* p) { rpfree(p); }
+extern void _ZdaPv(void* p); void RPDEFVIS _ZdaPv(void* p) { rpfree(p); }
+#if ARCH_64BIT
+// 64-bit operators new and new[], normal and aligned
+extern void* _Znwm(uint64_t size); void* RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); }
+extern void* _Znam(uint64_t size); void* RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); }
+extern void* _Znwmm(uint64_t size, uint64_t align); void* RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _Znamm(uint64_t size, uint64_t align); void* RPDEFVIS _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwmSt11align_val_t(uint64_t size, uint64_t align); void* RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnamSt11align_val_t(uint64_t size, uint64_t align); void* RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t); void* RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern void* _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t); void* RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+// 64-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvm(void* p, uint64_t size); void RPDEFVIS _ZdlPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdaPvm(void* p, uint64_t size); void RPDEFVIS _ZdaPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdlPvSt11align_val_t(void* p, uint64_t align); void RPDEFVIS _ZdlPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdaPvSt11align_val_t(void* p, uint64_t align); void RPDEFVIS _ZdaPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void RPDEFVIS _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
+extern void _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void RPDEFVIS _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
+#else
+// 32-bit operators new and new[], normal and aligned
+extern void* _Znwj(uint32_t size); void* RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); }
+extern void* _Znaj(uint32_t size); void* RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); }
+extern void* _Znwjj(uint32_t size, uint32_t align); void* RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
+extern void* _Znajj(uint32_t size, uint32_t align); void* RPDEFVIS _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwjSt11align_val_t(size_t size, size_t align); void* RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnajSt11align_val_t(size_t size, size_t align); void* RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); }
+extern void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); }
+extern void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+extern void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); }
+// 32-bit operators sized delete and delete[], normal and aligned
+extern void _ZdlPvj(void* p, uint32_t size); void RPDEFVIS _ZdlPvj(void* p, uint32_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdaPvj(void* p, uint32_t size); void RPDEFVIS _ZdaPvj(void* p, uint32_t size) { rpfree(p); (void)sizeof(size); }
+extern void _ZdlPvSt11align_val_t(void* p, uint32_t align); void RPDEFVIS _ZdlPvSt11align_val_t(void* p, uint32_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdaPvSt11align_val_t(void* p, uint32_t align); void RPDEFVIS _ZdaPvSt11align_val_t(void* p, uint32_t align) { rpfree(p); (void)sizeof(align); }
+extern void _ZdlPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void RPDEFVIS _ZdlPvjSt11align_val_t(void* p, uint32_t size, uint32_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
+extern void _ZdaPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void RPDEFVIS _ZdaPvjSt11align_val_t(void* p, uint32_t size, uint32_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); }
+#endif
+#endif
+#endif
+
+#if USE_INTERPOSE
+
+__attribute__((used)) static const interpose_t macinterpose_malloc[]
+__attribute__ ((section("__DATA, __interpose"))) = {
+ //new and new[]
+ MAC_INTERPOSE_PAIR(rpmalloc, _Znwm),
+ MAC_INTERPOSE_PAIR(rpmalloc, _Znam),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm),
+ MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t),
+ MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, _ZnwmSt11align_val_tRKSt9nothrow_t),
+ MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, _ZnamSt11align_val_tRKSt9nothrow_t),
+ //delete and delete[]
+ MAC_INTERPOSE_PAIR(rpfree, _ZdlPv),
+ MAC_INTERPOSE_PAIR(rpfree, _ZdaPv),
+ MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm),
+ MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm),
+ MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t),
+ MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t),
+ MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t),
+ MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t),
+ //libc entry points
+ MAC_INTERPOSE_PAIR(rpmalloc, malloc),
+ MAC_INTERPOSE_PAIR(rpmalloc, calloc),
+ MAC_INTERPOSE_PAIR(rprealloc, realloc),
+ MAC_INTERPOSE_PAIR(rprealloc, reallocf),
+ MAC_INTERPOSE_PAIR(rpvalloc, valloc),
+#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15
+ MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc),
+#endif
+ MAC_INTERPOSE_PAIR(rpmemalign, memalign),
+ MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign),
+ MAC_INTERPOSE_PAIR(rpfree, free),
+ MAC_INTERPOSE_PAIR(rpfree, cfree),
+ MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size),
+ MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)
+};
+
+#endif
+
+#if USE_ALIAS
+
+#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default")));
+
+// Alias the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling)
+
+// operators delete and delete[]
+void _ZdlPv(void* p) RPALIAS(rpfree)
+void _ZdaPv(void* p) RPALIAS(rpfree)
+
+#if ARCH_64BIT
+// 64-bit operators new and new[], normal and aligned
+void* _Znwm(uint64_t size) RPALIAS(rpmalloc)
+void* _Znam(uint64_t size) RPALIAS(rpmalloc)
+void* _Znwmm(uint64_t size, uint64_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _Znamm(uint64_t size, uint64_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnwmSt11align_val_t(size_t size, size_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnamSt11align_val_t(size_t size, size_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t) RPALIAS(rpmalloc_nothrow)
+void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t) RPALIAS(rpmalloc_nothrow)
+void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) RPALIAS(rpaligned_alloc_reverse_nothrow)
+void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) RPALIAS(rpaligned_alloc_reverse_nothrow)
+// 64-bit operators delete and delete[], sized and aligned
+void _ZdlPvm(void* p, size_t n) RPALIAS(rpfree_size)
+void _ZdaPvm(void* p, size_t n) RPALIAS(rpfree_size)
+void _ZdlPvSt11align_val_t(void* p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdaPvSt11align_val_t(void* p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a) RPALIAS(rpfree_size_aligned)
+void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a) RPALIAS(rpfree_size_aligned)
+#else
+// 32-bit operators new and new[], normal and aligned
+void* _Znwj(uint32_t size) RPALIAS(rpmalloc)
+void* _Znaj(uint32_t size) RPALIAS(rpmalloc)
+void* _Znwjj(uint32_t size, uint32_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _Znajj(uint32_t size, uint32_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnwjSt11align_val_t(size_t size, size_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnajSt11align_val_t(size_t size, size_t align) RPALIAS(rpaligned_alloc_reverse)
+void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) RPALIAS(rpmalloc_nothrow)
+void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) RPALIAS(rpmalloc_nothrow)
+void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) RPALIAS(rpaligned_alloc_reverse_nothrow)
+void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) RPALIAS(rpaligned_alloc_reverse_nothrow)
+// 32-bit operators delete and delete[], sized and aligned
+void _ZdlPvj(void* p, size_t n) RPALIAS(rpfree_size)
+void _ZdaPvj(void* p, size_t n) RPALIAS(rpfree_size)
+void _ZdlPvSt11align_val_t(void* p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdaPvSt11align_val_t(void* p, size_t a) RPALIAS(rpfree_aligned)
+void _ZdlPvjSt11align_val_t(void* p, size_t n, size_t a) RPALIAS(rpfree_size_aligned)
+void _ZdaPvjSt11align_val_t(void* p, size_t n, size_t a) RPALIAS(rpfree_size_aligned)
+#endif
+
+void* malloc(size_t size) RPALIAS(rpmalloc)
+void* calloc(size_t count, size_t size) RPALIAS(rpcalloc)
+void* realloc(void* ptr, size_t size) RPALIAS(rprealloc)
+void* reallocf(void* ptr, size_t size) RPALIAS(rprealloc)
+void* aligned_alloc(size_t alignment, size_t size) RPALIAS(rpaligned_alloc)
+void* memalign(size_t alignment, size_t size) RPALIAS(rpmemalign)
+int posix_memalign(void** memptr, size_t alignment, size_t size) RPALIAS(rpposix_memalign)
+void free(void* ptr) RPALIAS(rpfree)
+void cfree(void* ptr) RPALIAS(rpfree)
+void* reallocarray(void* ptr, size_t count, size_t size) RPALIAS(rpreallocarray)
+void* valloc(size_t size) RPALIAS(rpvalloc)
+void* pvalloc(size_t size) RPALIAS(rppvalloc)
+#if defined(__ANDROID__) || defined(__FreeBSD__)
+size_t malloc_usable_size(const void* ptr) RPALIAS(rpmalloc_usable_size)
+#else
+size_t malloc_usable_size(void* ptr) RPALIAS(rpmalloc_usable_size)
+#endif
+size_t malloc_size(void* ptr) RPALIAS(rpmalloc_usable_size)
+
+// end USE_ALIAS
+#endif
+
+#if defined(__GLIBC__) && defined(__linux__) && 0
+
+void* __libc_malloc(size_t size) RPALIAS(rpmalloc)
+void* __libc_calloc(size_t count, size_t size) RPALIAS(rpcalloc)
+void* __libc_realloc(void* p, size_t size) RPALIAS(rprealloc)
+void __libc_free(void* p) RPALIAS(rpfree)
+void __libc_cfree(void* p) RPALIAS(rpfree)
+
+extern void* __libc_valloc(size_t size);
+extern void* __libc_pvalloc(size_t size);
+
+void*
+__libc_valloc(size_t size) {
+ return valloc(size);
+}
+
+void*
+__libc_pvalloc(size_t size) {
+ return pvalloc(size);
+}
+void* __libc_memalign(size_t align, size_t size) RPALIAS(rpmemalign)
+
+int __posix_memalign(void** p, size_t align, size_t size) RPALIAS(rpposix_memalign)
+
+#endif
+
+// end ENABLE_OVERRIDE
+#endif
+
+#if ENABLE_DYNAMIC_LINK
+
+#ifdef _WIN32
+
+extern __declspec(dllexport) BOOL WINAPI
+DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved);
+
+extern __declspec(dllexport) BOOL WINAPI
+DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) {
+ (void)sizeof(reserved);
+ (void)sizeof(instance);
+ if (reason == DLL_PROCESS_ATTACH)
+ rpmalloc_initialize(0);
+ else if (reason == DLL_PROCESS_DETACH)
+ rpmalloc_finalize();
+ else if (reason == DLL_THREAD_ATTACH)
+ rpmalloc_thread_initialize();
+ else if (reason == DLL_THREAD_DETACH)
+ rpmalloc_thread_finalize();
+ return TRUE;
+}
+
+#endif
+
+// end ENABLE_DYNAMIC_LINK
+#endif
+
+#if (defined(__GNUC__) || defined(__clang__))
+#pragma GCC visibility pop
+#endif
+
+/* clang-format on */
diff --git a/thirdparty/rpmalloc/rpmalloc.c b/thirdparty/rpmalloc/rpmalloc.c
new file mode 100644
index 000000000..7aecfb0f4
--- /dev/null
+++ b/thirdparty/rpmalloc/rpmalloc.c
@@ -0,0 +1,2341 @@
+/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias
+ * Jansson
+ *
+ * This library provides a cross-platform lock free thread caching malloc
+ * implementation in C11. The latest source code is always available at
+ *
+ * https://github.com/mjansson/rpmalloc
+ *
+ * This library is put in the public domain; you can redistribute it and/or
+ * modify it without any restrictions.
+ *
+ */
+
+#include "rpmalloc.h"
+
+#include <errno.h>
+#include <string.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdatomic.h>
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wunused-macros"
+#pragma clang diagnostic ignored "-Wunused-function"
+#if __has_warning("-Wreserved-identifier")
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+#if __has_warning("-Wstatic-in-inline")
+#pragma clang diagnostic ignored "-Wstatic-in-inline"
+#endif
+#if __has_warning("-Wunsafe-buffer-usage")
+#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
+#endif
+#elif defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-macros"
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64)
+#define PLATFORM_WINDOWS 1
+#define PLATFORM_POSIX 0
+#else
+#define PLATFORM_WINDOWS 0
+#define PLATFORM_POSIX 1
+#endif
+
+#if defined(_MSC_VER)
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE __attribute__((noinline))
+#endif
+
+#if PLATFORM_WINDOWS
+#include <windows.h>
+#include <fibersapi.h>
+static DWORD fls_key;
+#endif
+#if PLATFORM_POSIX
+#include <sys/mman.h>
+#include <sched.h>
+#include <unistd.h>
+#include <pthread.h>
+static pthread_key_t pthread_key;
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#define MAP_HUGETLB MAP_ALIGNED_SUPER
+#ifndef PROT_MAX
+#define PROT_MAX(f) 0
+#endif
+#else
+#define PROT_MAX(f) 0
+#endif
+#ifdef __sun
+extern int
+madvise(caddr_t, size_t, int);
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+#endif
+
+#if defined(__linux__) || defined(__ANDROID__)
+#include <sys/prctl.h>
+#if !defined(PR_SET_VMA)
+#define PR_SET_VMA 0x53564d41
+#define PR_SET_VMA_ANON_NAME 0
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+#include <mach/mach_vm.h>
+#include <mach/vm_statistics.h>
+#endif
+#include <pthread.h>
+#endif
+#if defined(__HAIKU__) || defined(__TINYC__)
+#include <pthread.h>
+#endif
+
+#include <limits.h>
+#if (INTPTR_MAX > INT32_MAX)
+#define ARCH_64BIT 1
+#define ARCH_32BIT 0
+#else
+#define ARCH_64BIT 0
+#define ARCH_32BIT 1
+#endif
+
+#if !defined(__has_builtin)
+#define __has_builtin(b) 0
+#endif
+
+#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second))
+
+////////////
+///
+/// Build time configurable limits
+///
+//////
+
+#ifndef ENABLE_VALIDATE_ARGS
+//! Enable validation of args to public entry points
+#define ENABLE_VALIDATE_ARGS 0
+#endif
+#ifndef ENABLE_ASSERTS
+//! Enable asserts
+#define ENABLE_ASSERTS 0
+#endif
+#ifndef ENABLE_UNMAP
+//! Enable unmapping memory pages
+#define ENABLE_UNMAP 1
+#endif
+#ifndef ENABLE_DECOMMIT
+//! Enable decommitting memory pages
+#define ENABLE_DECOMMIT 1
+#endif
+#ifndef ENABLE_DYNAMIC_LINK
+//! Enable building as dynamic library
+#define ENABLE_DYNAMIC_LINK 0
+#endif
+#ifndef ENABLE_OVERRIDE
+//! Enable standard library malloc/free/new/delete overrides
+#define ENABLE_OVERRIDE 1
+#endif
+#ifndef ENABLE_STATISTICS
+//! Enable statistics
+#define ENABLE_STATISTICS 0
+#endif
+
+////////////
+///
+/// Built in size configurations
+///
+//////
+
+#define PAGE_HEADER_SIZE 128
+#define SPAN_HEADER_SIZE PAGE_HEADER_SIZE
+
+#define SMALL_GRANULARITY 16
+
+#define SMALL_BLOCK_SIZE_LIMIT (4 * 1024)
+#define MEDIUM_BLOCK_SIZE_LIMIT (256 * 1024)
+#define LARGE_BLOCK_SIZE_LIMIT (8 * 1024 * 1024)
+
+#define SMALL_SIZE_CLASS_COUNT 73
+#define MEDIUM_SIZE_CLASS_COUNT 24
+#define LARGE_SIZE_CLASS_COUNT 20
+#define SIZE_CLASS_COUNT (SMALL_SIZE_CLASS_COUNT + MEDIUM_SIZE_CLASS_COUNT + LARGE_SIZE_CLASS_COUNT)
+
+#define SMALL_PAGE_SIZE_SHIFT 16
+#define SMALL_PAGE_SIZE (1 << SMALL_PAGE_SIZE_SHIFT)
+#define SMALL_PAGE_MASK (~((uintptr_t)SMALL_PAGE_SIZE - 1))
+#define MEDIUM_PAGE_SIZE_SHIFT 22
+#define MEDIUM_PAGE_SIZE (1 << MEDIUM_PAGE_SIZE_SHIFT)
+#define MEDIUM_PAGE_MASK (~((uintptr_t)MEDIUM_PAGE_SIZE - 1))
+#define LARGE_PAGE_SIZE_SHIFT 26
+#define LARGE_PAGE_SIZE (1 << LARGE_PAGE_SIZE_SHIFT)
+#define LARGE_PAGE_MASK (~((uintptr_t)LARGE_PAGE_SIZE - 1))
+
+#define SPAN_SIZE (256 * 1024 * 1024)
+#define SPAN_MASK (~((uintptr_t)(SPAN_SIZE - 1)))
+
+////////////
+///
+/// Utility macros
+///
+//////
+
+#if ENABLE_ASSERTS
+#undef NDEBUG
+#if defined(_MSC_VER) && !defined(_DEBUG)
+#define _DEBUG
+#endif
+#include <assert.h>
+#define RPMALLOC_TOSTRING_M(x) #x
+#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x)
+#define rpmalloc_assert(truth, message) \
+ do { \
+ if (!(truth)) { \
+ assert((truth) && message); \
+ } \
+ } while (0)
+#else
+#define rpmalloc_assert(truth, message) \
+ do { \
+ } while (0)
+#endif
+
+#if __has_builtin(__builtin_assume)
+#define rpmalloc_assume(cond) __builtin_assume(cond)
+#elif defined(__GNUC__)
+#define rpmalloc_assume(cond) \
+ do { \
+ if (!__builtin_expect(cond, 0)) \
+ __builtin_unreachable(); \
+ } while (0)
+#elif defined(_MSC_VER)
+#define rpmalloc_assume(cond) __assume(cond)
+#else
+#define rpmalloc_assume(cond) 0
+#endif
+
+////////////
+///
+/// Statistics
+///
+//////
+
+#if ENABLE_STATISTICS
+
+typedef struct rpmalloc_statistics_t {
+ atomic_size_t page_mapped;
+ atomic_size_t page_mapped_peak;
+ atomic_size_t page_commit;
+ atomic_size_t page_decommit;
+ atomic_size_t page_active;
+ atomic_size_t page_active_peak;
+ atomic_size_t heap_count;
+} rpmalloc_statistics_t;
+
+static rpmalloc_statistics_t global_statistics;
+
+#else
+
+#endif
+
+////////////
+///
+/// Low level abstractions
+///
+//////
+
+static inline size_t
+rpmalloc_clz(uintptr_t x) {
+#if ARCH_64BIT
+#if defined(_MSC_VER) && !defined(__clang__)
+ return (size_t)_lzcnt_u64(x);
+#else
+ return (size_t)__builtin_clzll(x);
+#endif
+#else
+#if defined(_MSC_VER) && !defined(__clang__)
+ return (size_t)_lzcnt_u32(x);
+#else
+ return (size_t)__builtin_clzl(x);
+#endif
+#endif
+}
+
+static inline void
+wait_spin(void) {
+#if defined(_MSC_VER)
+#if defined(_M_ARM64)
+ __yield();
+#else
+ _mm_pause();
+#endif
+#elif defined(__x86_64__) || defined(__i386__)
+ __asm__ volatile("pause" ::: "memory");
+#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7)
+ __asm__ volatile("yield" ::: "memory");
+#elif defined(__powerpc__) || defined(__powerpc64__)
+	// Untested on these architectures, but included as a precaution
+ __asm__ volatile("or 27,27,27");
+#elif defined(__sparc__)
+ __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0");
+#else
+ struct timespec ts = {0};
+ nanosleep(&ts, 0);
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#define EXPECTED(x) __builtin_expect((x), 1)
+#define UNEXPECTED(x) __builtin_expect((x), 0)
+
+#else
+
+#define EXPECTED(x) x
+#define UNEXPECTED(x) x
+
+#endif
+#if defined(__GNUC__) || defined(__clang__)
+
+#if __has_builtin(__builtin_memcpy_inline)
+#define memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s)
+#else
+#define memcpy_const(x, y, s) \
+ do { \
+ _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), "len must be a constant integer"); \
+ memcpy(x, y, s); \
+ } while (0)
+#endif
+
+#if __has_builtin(__builtin_memset_inline)
+#define memset_const(x, y, s) __builtin_memset_inline(x, y, s)
+#else
+#define memset_const(x, y, s) \
+ do { \
+ _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), "len must be a constant integer"); \
+ memset(x, y, s); \
+ } while (0)
+#endif
+#else
+#define memcpy_const(x, y, s) memcpy(x, y, s)
+#define memset_const(x, y, s) memset(x, y, s)
+#endif
+
+////////////
+///
+/// Data types
+///
+//////
+
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Memory page
+typedef struct page_t page_t;
+//! Memory block
+typedef struct block_t block_t;
+//! Size class for a memory block
+typedef struct size_class_t size_class_t;
+
+//! Memory page type
+typedef enum page_type_t {
+ PAGE_SMALL, // 64KiB
+ PAGE_MEDIUM, // 4MiB
+ PAGE_LARGE, // 64MiB
+ PAGE_HUGE
+} page_type_t;
+
+//! Block size class
+struct size_class_t {
+ //! Size of blocks in this class
+ uint32_t block_size;
+ //! Number of blocks in each chunk
+ uint32_t block_count;
+};
+
+//! A memory block
+struct block_t {
+ //! Next block in list
+ block_t* next;
+};
+
+//! A page contains blocks of a given size
+struct page_t {
+ //! Size class of blocks
+ uint32_t size_class;
+ //! Block size
+ uint32_t block_size;
+ //! Block count
+ uint32_t block_count;
+ //! Block initialized count
+ uint32_t block_initialized;
+ //! Block used count
+ uint32_t block_used;
+ //! Page type
+ page_type_t page_type;
+ //! Flag set if part of heap full list
+ uint32_t is_full : 1;
+ //! Flag set if part of heap free list
+ uint32_t is_free : 1;
+	//! Flag set if blocks are zero initialized
+ uint32_t is_zero : 1;
+ //! Flag set if memory pages have been decommitted
+ uint32_t is_decommitted : 1;
+ //! Flag set if containing aligned blocks
+ uint32_t has_aligned_block : 1;
+ //! Fast combination flag for either huge, fully allocated or has aligned blocks
+ uint32_t generic_free : 1;
+ //! Local free list count
+ uint32_t local_free_count;
+ //! Local free list
+ block_t* local_free;
+ //! Owning heap
+ heap_t* heap;
+ //! Next page in list
+ page_t* next;
+ //! Previous page in list
+ page_t* prev;
+ //! Multithreaded free list, block index is in low 32 bit, list count is high 32 bit
+ atomic_ullong thread_free;
+};
+
+//! A span contains pages of a given type
+struct span_t {
+ //! Page header
+ page_t page;
+ //! Owning heap
+ heap_t* heap;
+ //! Page address mask
+ uintptr_t page_address_mask;
+ //! Number of pages initialized
+ uint32_t page_initialized;
+ //! Number of pages in use
+ uint32_t page_count;
+ //! Number of bytes per page
+ uint32_t page_size;
+ //! Page type
+ page_type_t page_type;
+ //! Offset to start of mapped memory region
+ uint32_t offset;
+ //! Mapped size
+ uint64_t mapped_size;
+ //! Next span in list
+ span_t* next;
+};
+
+// Control structure for a heap, either a thread heap or a first class heap if enabled
+struct heap_t {
+ //! Owning thread ID
+ uintptr_t owner_thread;
+ //! Heap local free list for small size classes
+ block_t* local_free[SIZE_CLASS_COUNT];
+ //! Available non-full pages for each size class
+ page_t* page_available[SIZE_CLASS_COUNT];
+ //! Free pages for each page type
+ page_t* page_free[3];
+	//! Free but still committed page count for each page type
+ uint32_t page_free_commit_count[3];
+ //! Multithreaded free list
+ atomic_uintptr_t thread_free[3];
+ //! Available partially initialized spans for each page type
+ span_t* span_partial[3];
+ //! Spans in full use for each page type
+ span_t* span_used[4];
+ //! Next heap in queue
+ heap_t* next;
+ //! Previous heap in queue
+ heap_t* prev;
+ //! Heap ID
+ uint32_t id;
+ //! Finalization state flag
+ uint32_t finalize;
+ //! Memory map region offset
+ uint32_t offset;
+ //! Memory map size
+ size_t mapped_size;
+};
+
+_Static_assert(sizeof(page_t) <= PAGE_HEADER_SIZE, "Invalid page header size");
+_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "Invalid span header size");
+_Static_assert(sizeof(heap_t) <= 4096, "Invalid heap size");
+
+////////////
+///
+/// Global data
+///
+//////
+
+//! Fallback heap
+static RPMALLOC_CACHE_ALIGNED heap_t global_heap_fallback;
+//! Default heap
+static heap_t* global_heap_default = &global_heap_fallback;
+//! Available heaps
+static heap_t* global_heap_queue;
+//! In use heaps
+static heap_t* global_heap_used;
+//! Lock for heap queue
+static atomic_uintptr_t global_heap_lock;
+//! Heap ID counter
+static atomic_uint global_heap_id = 1;
+//! Initialized flag
+static int global_rpmalloc_initialized;
+//! Memory interface
+static rpmalloc_interface_t* global_memory_interface;
+//! Default memory interface
+static rpmalloc_interface_t global_memory_interface_default;
+//! Current configuration
+static rpmalloc_config_t global_config = {0};
+//! Main thread ID
+static uintptr_t global_main_thread_id;
+
+//! Size classes
+#define SCLASS(n) \
+ { (n * SMALL_GRANULARITY), (SMALL_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) }
+#define MCLASS(n) \
+ { (n * SMALL_GRANULARITY), (MEDIUM_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) }
+#define LCLASS(n) \
+ { (n * SMALL_GRANULARITY), (LARGE_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) }
+static const size_class_t global_size_class[SIZE_CLASS_COUNT] = {
+ SCLASS(1), SCLASS(1), SCLASS(2), SCLASS(3), SCLASS(4), SCLASS(5), SCLASS(6),
+ SCLASS(7), SCLASS(8), SCLASS(9), SCLASS(10), SCLASS(11), SCLASS(12), SCLASS(13),
+ SCLASS(14), SCLASS(15), SCLASS(16), SCLASS(17), SCLASS(18), SCLASS(19), SCLASS(20),
+ SCLASS(21), SCLASS(22), SCLASS(23), SCLASS(24), SCLASS(25), SCLASS(26), SCLASS(27),
+ SCLASS(28), SCLASS(29), SCLASS(30), SCLASS(31), SCLASS(32), SCLASS(33), SCLASS(34),
+ SCLASS(35), SCLASS(36), SCLASS(37), SCLASS(38), SCLASS(39), SCLASS(40), SCLASS(41),
+ SCLASS(42), SCLASS(43), SCLASS(44), SCLASS(45), SCLASS(46), SCLASS(47), SCLASS(48),
+ SCLASS(49), SCLASS(50), SCLASS(51), SCLASS(52), SCLASS(53), SCLASS(54), SCLASS(55),
+ SCLASS(56), SCLASS(57), SCLASS(58), SCLASS(59), SCLASS(60), SCLASS(61), SCLASS(62),
+ SCLASS(63), SCLASS(64), SCLASS(80), SCLASS(96), SCLASS(112), SCLASS(128), SCLASS(160),
+ SCLASS(192), SCLASS(224), SCLASS(256), MCLASS(320), MCLASS(384), MCLASS(448), MCLASS(512),
+ MCLASS(640), MCLASS(768), MCLASS(896), MCLASS(1024), MCLASS(1280), MCLASS(1536), MCLASS(1792),
+ MCLASS(2048), MCLASS(2560), MCLASS(3072), MCLASS(3584), MCLASS(4096), MCLASS(5120), MCLASS(6144),
+ MCLASS(7168), MCLASS(8192), MCLASS(10240), MCLASS(12288), MCLASS(14336), MCLASS(16384), LCLASS(20480),
+ LCLASS(24576), LCLASS(28672), LCLASS(32768), LCLASS(40960), LCLASS(49152), LCLASS(57344), LCLASS(65536),
+ LCLASS(81920), LCLASS(98304), LCLASS(114688), LCLASS(131072), LCLASS(163840), LCLASS(196608), LCLASS(229376),
+ LCLASS(262144), LCLASS(327680), LCLASS(393216), LCLASS(458752), LCLASS(524288)};
+
+//! Threshold number of pages for when free pages are decommitted
+static uint32_t global_page_free_overflow[4] = {16, 8, 2, 0};
+
+//! Number of pages to retain when free page threshold overflows
+static uint32_t global_page_free_retain[4] = {4, 2, 1, 0};
+
+//! OS huge page support
+static int os_huge_pages;
+//! OS memory map granularity
+static size_t os_map_granularity;
+//! OS memory page size
+static size_t os_page_size;
+
+////////////
+///
+/// Thread local heap and ID
+///
+//////
+
+//! Current thread heap
+#if defined(_MSC_VER) && !defined(__clang__)
+#define TLS_MODEL
+#define _Thread_local __declspec(thread)
+#else
+// #define TLS_MODEL __attribute__((tls_model("initial-exec")))
+#define TLS_MODEL
+#endif
+static _Thread_local heap_t* global_thread_heap TLS_MODEL = &global_heap_fallback;
+
+static heap_t*
+heap_allocate(int first_class);
+
+static void
+heap_page_free_decommit(heap_t* heap, uint32_t page_type, uint32_t page_retain_count);
+
+//! Fast thread ID
+static inline uintptr_t
+get_thread_id(void) {
+#if defined(_WIN32)
+ return (uintptr_t)((void*)NtCurrentTeb());
+#else
+ void* thp = __builtin_thread_pointer();
+ return (uintptr_t)thp;
+#endif
+ /*
+ #elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__)
+ uintptr_t tid;
+ #if defined(__i386__)
+ __asm__("movl %%gs:0, %0" : "=r"(tid) : :);
+ #elif defined(__x86_64__)
+ #if defined(__MACH__)
+ __asm__("movq %%gs:0, %0" : "=r"(tid) : :);
+ #else
+ __asm__("movq %%fs:0, %0" : "=r"(tid) : :);
+ #endif
+ #elif defined(__arm__)
+ __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid));
+ #elif defined(__aarch64__)
+ #if defined(__MACH__)
+ // tpidr_el0 likely unused, always return 0 on iOS
+ __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid));
+ #else
+ __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid));
+ #endif
+ #else
+ #error This platform needs implementation of get_thread_id()
+ #endif
+ return tid;
+ #else
+ #error This platform needs implementation of get_thread_id()
+ #endif
+ */
+}
+
+//! Set the current thread heap
+static void
+set_thread_heap(heap_t* heap) {
+ global_thread_heap = heap;
+ if (heap && (heap->id != 0)) {
+ rpmalloc_assert(heap->id != 0, "Default heap being used");
+ heap->owner_thread = get_thread_id();
+ }
+#if PLATFORM_WINDOWS
+ FlsSetValue(fls_key, heap);
+#else
+ pthread_setspecific(pthread_key, heap);
+#endif
+}
+
+static heap_t*
+get_thread_heap_allocate(void) {
+ heap_t* heap = heap_allocate(0);
+ set_thread_heap(heap);
+ return heap;
+}
+
+//! Get the current thread heap
+static inline heap_t*
+get_thread_heap(void) {
+ return global_thread_heap;
+}
+
+//! Get the size class from given size in bytes for tiny blocks (below 16 times the minimum granularity)
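+//! (For illustration: a 40-byte request maps to class (40 + 15) / 16 = 3, i.e. 48-byte blocks)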
+static inline uint32_t
+get_size_class_tiny(size_t size) {
+ return (((uint32_t)size + (SMALL_GRANULARITY - 1)) / SMALL_GRANULARITY);
+}
+
+//! Get the size class from given size in bytes
+static inline uint32_t
+get_size_class(size_t size) {
+ uintptr_t minblock_count = (size + (SMALL_GRANULARITY - 1)) / SMALL_GRANULARITY;
+	// For sizes up to 64 times the minimum granularity (i.e. 1024 bytes) the size class is equal to the number of
+	// such blocks
+ if (size <= (SMALL_GRANULARITY * 64)) {
+ rpmalloc_assert(global_size_class[minblock_count].block_size >= size, "Size class misconfiguration");
+ return (uint32_t)(minblock_count ? minblock_count : 1);
+ }
+ --minblock_count;
+	// Calculate position of most significant bit, since minblock_count is now guaranteed to be >= 64 this position
+	// is guaranteed to be >= 6
+#if ARCH_64BIT
+ const uint32_t most_significant_bit = (uint32_t)(63 - (int)rpmalloc_clz(minblock_count));
+#else
+ const uint32_t most_significant_bit = (uint32_t)(31 - (int)rpmalloc_clz(minblock_count));
+#endif
+	// Class sizes are of the bit format [..]000xxx000[..] where we already have the position of the most significant
+	// bit; now calculate the subclass from the remaining two bits
+ const uint32_t subclass_bits = (minblock_count >> (most_significant_bit - 2)) & 0x03;
+ const uint32_t class_idx = (uint32_t)((most_significant_bit << 2) + subclass_bits) + 41;
+ rpmalloc_assert((class_idx >= SIZE_CLASS_COUNT) || (global_size_class[class_idx].block_size >= size),
+ "Size class misconfiguration");
+ rpmalloc_assert((class_idx >= SIZE_CLASS_COUNT) || (global_size_class[class_idx - 1].block_size < size),
+ "Size class misconfiguration");
+ return class_idx;
+}
+
+static inline page_type_t
+get_page_type(uint32_t size_class) {
+ if (size_class < SMALL_SIZE_CLASS_COUNT)
+ return PAGE_SMALL;
+ else if (size_class < (SMALL_SIZE_CLASS_COUNT + MEDIUM_SIZE_CLASS_COUNT))
+ return PAGE_MEDIUM;
+ else if (size_class < SIZE_CLASS_COUNT)
+ return PAGE_LARGE;
+ return PAGE_HUGE;
+}
+
+static inline size_t
+get_page_aligned_size(size_t size) {
+ size_t unalign = size % global_config.page_size;
+ if (unalign)
+ size += global_config.page_size - unalign;
+ return size;
+}
+
+////////////
+///
+/// OS entry points
+///
+//////
+
+static void
+os_set_page_name(void* address, size_t size) {
+#if defined(__linux__) || defined(__ANDROID__)
+ const char* name = os_huge_pages ? global_config.huge_page_name : global_config.page_name;
+ if ((address == MAP_FAILED) || !name)
+ return;
+	// If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails
+	// (e.g. invalid name), this is essentially a no-op.
+ (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, (uintptr_t)name);
+#else
+ (void)sizeof(size);
+ (void)sizeof(address);
+#endif
+}
+
+static void*
+os_mmap(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) {
+ size_t map_size = size + alignment;
+#if PLATFORM_WINDOWS
+ // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses
+ // are actually accessed". But if we enable decommit it's better to not immediately commit and instead commit per
+ // page to avoid saturating the OS commit limit
+#if ENABLE_DECOMMIT
+ DWORD do_commit = 0;
+#else
+ DWORD do_commit = MEM_COMMIT;
+#endif
+ void* ptr =
+ VirtualAlloc(0, map_size, (os_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | do_commit, PAGE_READWRITE);
+#else
+ int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
+#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+ int fd = (int)VM_MAKE_TAG(240U);
+ if (os_huge_pages)
+ fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+ void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, fd, 0);
+#elif defined(MAP_HUGETLB)
+ void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE),
+ (os_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0);
+#if defined(MADV_HUGEPAGE)
+	// In some configurations, huge page allocations might fail, so we fall back
+	// to normal allocations and promote the region to a transparent huge page
+ if ((ptr == MAP_FAILED || !ptr) && os_huge_pages) {
+ ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (ptr && ptr != MAP_FAILED) {
+ int prm = madvise(ptr, size, MADV_HUGEPAGE);
+ (void)prm;
+ rpmalloc_assert((prm == 0), "Failed to promote the page to transparent huge page");
+ }
+ }
+#endif
+ os_set_page_name(ptr, map_size);
+#elif defined(MAP_ALIGNED)
+ const size_t align = (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1));
+ void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, (os_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0);
+#elif defined(MAP_ALIGN)
+ caddr_t base = (os_huge_pages ? (caddr_t)(4 << 20) : 0);
+ void* ptr = mmap(base, map_size, PROT_READ | PROT_WRITE, (os_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0);
+#else
+ void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, -1, 0);
+#endif
+ if (ptr == MAP_FAILED)
+ ptr = 0;
+#endif
+ if (!ptr) {
+ if (global_memory_interface->map_fail_callback) {
+ if (global_memory_interface->map_fail_callback(map_size))
+ return os_mmap(size, alignment, offset, mapped_size);
+ } else {
+ rpmalloc_assert(ptr != 0, "Failed to map more virtual memory");
+ }
+ return 0;
+ }
+ if (alignment) {
+ size_t padding = ((uintptr_t)ptr & (uintptr_t)(alignment - 1));
+ if (padding)
+ padding = alignment - padding;
+ rpmalloc_assert(padding <= alignment, "Internal failure in padding");
+ rpmalloc_assert(!(padding % 8), "Internal failure in padding");
+ ptr = pointer_offset(ptr, padding);
+ *offset = padding;
+ }
+ *mapped_size = map_size;
+#if ENABLE_STATISTICS
+ size_t page_count = map_size / global_config.page_size;
+ size_t page_mapped_current =
+ atomic_fetch_add_explicit(&global_statistics.page_mapped, page_count, memory_order_relaxed) + page_count;
+ size_t page_mapped_peak = atomic_load_explicit(&global_statistics.page_mapped_peak, memory_order_relaxed);
+ while (page_mapped_current > page_mapped_peak) {
+ if (atomic_compare_exchange_weak_explicit(&global_statistics.page_mapped_peak, &page_mapped_peak,
+ page_mapped_current, memory_order_relaxed, memory_order_relaxed))
+ break;
+ }
+#if ENABLE_DECOMMIT
+ size_t page_active_current =
+ atomic_fetch_add_explicit(&global_statistics.page_active, page_count, memory_order_relaxed) + page_count;
+ size_t page_active_peak = atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed);
+ while (page_active_current > page_active_peak) {
+ if (atomic_compare_exchange_weak_explicit(&global_statistics.page_active_peak, &page_active_peak,
+ page_active_current, memory_order_relaxed, memory_order_relaxed))
+ break;
+ }
+#endif
+#endif
+ return ptr;
+}
+
+static void
+os_mcommit(void* address, size_t size) {
+#if ENABLE_DECOMMIT
+ if (global_config.disable_decommit)
+ return;
+#if PLATFORM_WINDOWS
+ if (!VirtualAlloc(address, size, MEM_COMMIT, PAGE_READWRITE)) {
+ rpmalloc_assert(0, "Failed to commit virtual memory block");
+ }
+#else
+ /*
+ if (mprotect(address, size, PROT_READ | PROT_WRITE)) {
+ rpmalloc_assert(0, "Failed to commit virtual memory block");
+ }
+ */
+#endif
+#if ENABLE_STATISTICS
+ size_t page_count = size / global_config.page_size;
+ atomic_fetch_add_explicit(&global_statistics.page_commit, page_count, memory_order_relaxed);
+ size_t page_active_current =
+ atomic_fetch_add_explicit(&global_statistics.page_active, page_count, memory_order_relaxed) + page_count;
+ size_t page_active_peak = atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed);
+ while (page_active_current > page_active_peak) {
+ if (atomic_compare_exchange_weak_explicit(&global_statistics.page_active_peak, &page_active_peak,
+ page_active_current, memory_order_relaxed, memory_order_relaxed))
+ break;
+ }
+#endif
+#endif
+ (void)sizeof(address);
+ (void)sizeof(size);
+}
+
+static void
+os_mdecommit(void* address, size_t size) {
+#if ENABLE_DECOMMIT
+ if (global_config.disable_decommit)
+ return;
+#if PLATFORM_WINDOWS
+ if (!VirtualFree(address, size, MEM_DECOMMIT)) {
+ rpmalloc_assert(0, "Failed to decommit virtual memory block");
+ }
+#else
+ /*
+ if (mprotect(address, size, PROT_NONE)) {
+ rpmalloc_assert(0, "Failed to decommit virtual memory block");
+ }
+ */
+#if defined(MADV_DONTNEED)
+ if (madvise(address, size, MADV_DONTNEED)) {
+#elif defined(MADV_FREE_REUSABLE)
+ int ret;
+ while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN))
+ errno = 0;
+ if ((ret == -1) && (errno != 0)) {
+#elif defined(MADV_PAGEOUT)
+ if (madvise(address, size, MADV_PAGEOUT)) {
+#elif defined(MADV_FREE)
+ if (madvise(address, size, MADV_FREE)) {
+#else
+ if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
+#endif
+ rpmalloc_assert(0, "Failed to decommit virtual memory block");
+ }
+#endif
+#if ENABLE_STATISTICS
+ size_t page_count = size / global_config.page_size;
+ atomic_fetch_add_explicit(&global_statistics.page_decommit, page_count, memory_order_relaxed);
+ size_t page_active_current =
+ atomic_fetch_sub_explicit(&global_statistics.page_active, page_count, memory_order_relaxed);
+ rpmalloc_assert(page_active_current >= page_count, "Decommit counter out of sync");
+ (void)sizeof(page_active_current);
+#endif
+#else
+ (void)sizeof(address);
+ (void)sizeof(size);
+#endif
+}
+
+static void
+os_munmap(void* address, size_t offset, size_t mapped_size) {
+ (void)sizeof(mapped_size);
+ address = pointer_offset(address, -(int32_t)offset);
+#if ENABLE_UNMAP
+#if PLATFORM_WINDOWS
+ if (!VirtualFree(address, 0, MEM_RELEASE)) {
+ rpmalloc_assert(0, "Failed to unmap virtual memory block");
+ }
+#else
+ if (munmap(address, mapped_size))
+ rpmalloc_assert(0, "Failed to unmap virtual memory block");
+#endif
+#if ENABLE_STATISTICS
+ size_t page_count = mapped_size / global_config.page_size;
+ atomic_fetch_sub_explicit(&global_statistics.page_mapped, page_count, memory_order_relaxed);
+ atomic_fetch_sub_explicit(&global_statistics.page_active, page_count, memory_order_relaxed);
+#endif
+#endif
+}
+
+////////////
+///
+/// Page interface
+///
+//////
+
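+// Note: spans are SPAN_SIZE (256MiB) aligned, so masking any address within a
+// span with SPAN_MASK recovers the owning span header (see page_get_span below).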
+static inline span_t*
+page_get_span(page_t* page) {
+ return (span_t*)((uintptr_t)page & SPAN_MASK);
+}
+
+static inline size_t
+page_get_size(page_t* page) {
+ if (page->page_type == PAGE_SMALL)
+ return SMALL_PAGE_SIZE;
+ else if (page->page_type == PAGE_MEDIUM)
+ return MEDIUM_PAGE_SIZE;
+ else if (page->page_type == PAGE_LARGE)
+ return LARGE_PAGE_SIZE;
+ else
+ return page_get_span(page)->page_size;
+}
+
+static inline int
+page_is_thread_heap(page_t* page) {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+ return (!page->heap->owner_thread || (page->heap->owner_thread == get_thread_id()));
+#else
+ return (page->heap->owner_thread == get_thread_id());
+#endif
+}
+
+static inline block_t*
+page_block_start(page_t* page) {
+ return pointer_offset(page, PAGE_HEADER_SIZE);
+}
+
+static inline block_t*
+page_block(page_t* page, uint32_t block_index) {
+ return pointer_offset(page, PAGE_HEADER_SIZE + (page->block_size * block_index));
+}
+
+static inline uint32_t
+page_block_index(page_t* page, block_t* block) {
+ block_t* block_first = page_block_start(page);
+ return (uint32_t)pointer_diff(block, block_first) / page->block_size;
+}
+
+static inline uint32_t
+page_block_from_thread_free_list(page_t* page, uint64_t token, block_t** block) {
+ uint32_t block_index = (uint32_t)(token & 0xFFFFFFFFULL);
+ uint32_t list_count = (uint32_t)((token >> 32ULL) & 0xFFFFFFFFULL);
+ *block = list_count ? page_block(page, block_index) : 0;
+ return list_count;
+}
+
+static inline uint64_t
+page_block_to_thread_free_list(page_t* page, uint32_t block_index, uint32_t list_count) {
+ (void)sizeof(page);
+ return ((uint64_t)list_count << 32ULL) | (uint64_t)block_index;
+}
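+
+// For illustration: block_index 5 with list_count 3 packs into the 64-bit token
+// 0x0000000300000005; page_block_from_thread_free_list reverses this encoding.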
+
+static inline block_t*
+page_block_realign(page_t* page, block_t* block) {
+ void* blocks_start = page_block_start(page);
+ uint32_t block_offset = (uint32_t)pointer_diff(block, blocks_start);
+ return pointer_offset(block, -(int32_t)(block_offset % page->block_size));
+}
+
+static block_t*
+page_get_local_free_block(page_t* page) {
+ block_t* block = page->local_free;
+ page->local_free = block->next;
+ --page->local_free_count;
+ ++page->block_used;
+ return block;
+}
+
+static inline void
+page_decommit_memory_pages(page_t* page) {
+ if (page->is_decommitted)
+ return;
+ void* extra_page = pointer_offset(page, global_config.page_size);
+ size_t extra_page_size = page_get_size(page) - global_config.page_size;
+ global_memory_interface->memory_decommit(extra_page, extra_page_size);
+ page->is_decommitted = 1;
+}
+
+static inline void
+page_commit_memory_pages(page_t* page) {
+ if (!page->is_decommitted)
+ return;
+ void* extra_page = pointer_offset(page, global_config.page_size);
+ size_t extra_page_size = page_get_size(page) - global_config.page_size;
+ global_memory_interface->memory_commit(extra_page, extra_page_size);
+ page->is_decommitted = 0;
+#if ENABLE_DECOMMIT
+#if !defined(__APPLE__)
+	// When a page is recommitted, the blocks in the second memory page and onward
+	// will be zeroed out by the OS - take advantage of this in zalloc/calloc calls
+	// and make sure blocks in the first page are zeroed out
+ void* first_page = pointer_offset(page, PAGE_HEADER_SIZE);
+ memset(first_page, 0, global_config.page_size - PAGE_HEADER_SIZE);
+ page->is_zero = 1;
+#endif
+#endif
+}
+
+static void
+page_available_to_free(page_t* page) {
+ rpmalloc_assert(page->is_full == 0, "Page full flag internal failure");
+ rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure");
+ heap_t* heap = page->heap;
+ if (heap->page_available[page->size_class] == page) {
+ heap->page_available[page->size_class] = page->next;
+ } else {
+ page->prev->next = page->next;
+ if (page->next)
+ page->next->prev = page->prev;
+ }
+ page->is_free = 1;
+ page->is_zero = 0;
+ page->next = heap->page_free[page->page_type];
+ heap->page_free[page->page_type] = page;
+ if (++heap->page_free_commit_count[page->page_type] >= global_page_free_overflow[page->page_type])
+ heap_page_free_decommit(heap, page->page_type, global_page_free_retain[page->page_type]);
+}
+
+static void
+page_full_to_available(page_t* page) {
+ rpmalloc_assert(page->is_full == 1, "Page full flag internal failure");
+ rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure");
+ heap_t* heap = page->heap;
+ page->next = heap->page_available[page->size_class];
+ if (page->next)
+ page->next->prev = page;
+ heap->page_available[page->size_class] = page;
+ page->is_full = 0;
+ if (page->has_aligned_block == 0)
+ page->generic_free = 0;
+}
+
+static void
+page_full_to_free_on_new_heap(page_t* page, heap_t* heap) {
+ rpmalloc_assert(heap->id, "Page full to free on default heap");
+ rpmalloc_assert(page->is_full == 1, "Page full flag internal failure");
+ rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure");
+ page->is_full = 0;
+ page->is_free = 1;
+ page->heap = heap;
+ atomic_store_explicit(&page->thread_free, 0, memory_order_relaxed);
+ page->next = heap->page_free[page->page_type];
+ heap->page_free[page->page_type] = page;
+ if (++heap->page_free_commit_count[page->page_type] >= global_page_free_overflow[page->page_type])
+ heap_page_free_decommit(heap, page->page_type, global_page_free_retain[page->page_type]);
+}
+
+static void
+page_available_to_full(page_t* page) {
+ heap_t* heap = page->heap;
+ if (heap->page_available[page->size_class] == page) {
+ heap->page_available[page->size_class] = page->next;
+ } else {
+ page->prev->next = page->next;
+ if (page->next)
+ page->next->prev = page->prev;
+ }
+ page->is_full = 1;
+ page->is_zero = 0;
+ page->generic_free = 1;
+}
+
+static inline void
+page_put_local_free_block(page_t* page, block_t* block) {
+ block->next = page->local_free;
+ page->local_free = block;
+ ++page->local_free_count;
+ if (UNEXPECTED(--page->block_used == 0)) {
+ page_available_to_free(page);
+ } else if (UNEXPECTED(page->is_full != 0)) {
+ page_full_to_available(page);
+ }
+}
+
+static NOINLINE void
+page_adopt_thread_free_block_list(page_t* page) {
+ if (page->local_free)
+ return;
+ unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
+ if (thread_free != 0) {
+		// Other threads can only replace this with another valid list head; it will never be set to 0 by another thread
+ while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &thread_free, 0, memory_order_relaxed,
+ memory_order_relaxed))
+ wait_spin();
+ page->local_free_count = page_block_from_thread_free_list(page, thread_free, &page->local_free);
+ rpmalloc_assert(page->local_free_count <= page->block_used, "Page thread free list count internal failure");
+ page->block_used -= page->local_free_count;
+ }
+}
+
+static NOINLINE void
+page_put_thread_free_block(page_t* page, block_t* block) {
+ atomic_thread_fence(memory_order_acquire);
+ if (page->is_full) {
+ // Page is full, put the block in the heap thread free list instead, otherwise
+ // the heap will not pick up the free blocks until a thread local free happens
+ heap_t* heap = page->heap;
+ uintptr_t prev_head = atomic_load_explicit(&heap->thread_free[page->page_type], memory_order_relaxed);
+ block->next = (void*)prev_head;
+ while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page->page_type], &prev_head, (uintptr_t)block,
+ memory_order_relaxed, memory_order_relaxed)) {
+ block->next = (void*)prev_head;
+ wait_spin();
+ }
+ } else {
+ unsigned long long prev_thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
+ uint32_t block_index = page_block_index(page, block);
+ rpmalloc_assert(page_block(page, block_index) == block, "Block pointer is not aligned to start of block");
+ uint32_t list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1;
+ uint64_t thread_free = page_block_to_thread_free_list(page, block_index, list_size);
+ while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &prev_thread_free, thread_free,
+ memory_order_relaxed, memory_order_relaxed)) {
+ list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1;
+ thread_free = page_block_to_thread_free_list(page, block_index, list_size);
+ wait_spin();
+ }
+ }
+}
+
+static void
+page_push_local_free_to_heap(page_t* page) {
+ // Push the page free list as the fast track list of free blocks for heap
+ page->heap->local_free[page->size_class] = page->local_free;
+ page->block_used += page->local_free_count;
+ page->local_free = 0;
+ page->local_free_count = 0;
+}
+
+static NOINLINE void*
+page_initialize_blocks(page_t* page) {
+ rpmalloc_assert(page->block_initialized < page->block_count, "Block initialization internal failure");
+ block_t* block = page_block(page, page->block_initialized);
+ ++page->block_initialized;
+ ++page->block_used;
+
+ if ((page->page_type == PAGE_SMALL) && (page->block_size < (global_config.page_size >> 1))) {
+ // Link blocks into the free list up to the next memory page boundary
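+ // For example, with a hypothetical 4KiB system page and 64 byte blocks, the
+ // first allocation in a memory page initializes one block and links up to the
+ // remaining 63 blocks of that same page into the local free list, so later
+ // allocations are plain list pops that never touch cold memory pages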
+ void* memory_page_start = (void*)((uintptr_t)block & ~(uintptr_t)(global_config.page_size - 1));
+ void* memory_page_next = pointer_offset(memory_page_start, global_config.page_size);
+ block_t* free_block = pointer_offset(block, page->block_size);
+ block_t* first_block = free_block;
+ block_t* last_block = free_block;
+ uint32_t list_count = 0;
+ uint32_t max_list_count = page->block_count - page->block_initialized;
+ while (((void*)free_block < memory_page_next) && (list_count < max_list_count)) {
+ last_block = free_block;
+ free_block->next = pointer_offset(free_block, page->block_size);
+ free_block = free_block->next;
+ ++list_count;
+ }
+ if (list_count) {
+ last_block->next = 0;
+ page->local_free = first_block;
+ page->block_initialized += list_count;
+ page->local_free_count = list_count;
+ }
+ }
+
+ return block;
+}
+
+static inline RPMALLOC_ALLOCATOR void*
+page_allocate_block(page_t* page, unsigned int zero) {
+ unsigned int is_zero = 0;
+ block_t* block = (page->local_free != 0) ? page_get_local_free_block(page) : 0;
+ if (UNEXPECTED(block == 0)) {
+ if (atomic_load_explicit(&page->thread_free, memory_order_relaxed) != 0) {
+ page_adopt_thread_free_block_list(page);
+ block = (page->local_free != 0) ? page_get_local_free_block(page) : 0;
+ }
+ if (block == 0) {
+ block = page_initialize_blocks(page);
+ is_zero = page->is_zero;
+ }
+ }
+
+ rpmalloc_assert(page->block_used <= page->block_count, "Page block use counter out of sync");
+ if (page->local_free && !page->heap->local_free[page->size_class])
+ page_push_local_free_to_heap(page);
+
+ // The page might have become full when the free list was pushed to the heap local
+ // free list; check if there is a thread free list to adopt
+ if (page->block_used == page->block_count)
+ page_adopt_thread_free_block_list(page);
+
+ if (page->block_used == page->block_count) {
+ // Page is now fully utilized
+ rpmalloc_assert(!page->is_full, "Page block use counter out of sync with full flag");
+ page_available_to_full(page);
+ }
+
+ if (zero) {
+ if (!is_zero)
+ memset(block, 0, page->block_size);
+ else
+ *(uintptr_t*)block = 0;
+ }
+
+ return block;
+}
+
+////////////
+///
+/// Span interface
+///
+//////
+
+static inline int
+span_is_thread_heap(span_t* span) {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+ return (!span->heap->owner_thread || (span->heap->owner_thread == get_thread_id()));
+#else
+ return (span->heap->owner_thread == get_thread_id());
+#endif
+}
+
+static inline page_t*
+span_get_page_from_block(span_t* span, void* block) {
+ return (page_t*)((uintptr_t)block & span->page_address_mask);
+}
+
+//! Allocate and initialize the next page from the given span
+static inline page_t*
+span_allocate_page(span_t* span) {
+ // Allocate path, initialize a new chunk of memory for a page in the given span
+ rpmalloc_assert(span->page_initialized < span->page_count, "Page initialization internal failure");
+ heap_t* heap = span->heap;
+ page_t* page = pointer_offset(span, span->page_size * span->page_initialized);
+
+#if ENABLE_DECOMMIT
+ // The first page is always committed on initial span map of memory
+ if (span->page_initialized)
+ global_memory_interface->memory_commit(page, span->page_size);
+#endif
+ ++span->page_initialized;
+
+ page->page_type = span->page_type;
+ page->is_zero = 1;
+ page->heap = heap;
+ rpmalloc_assert(page_is_thread_heap(page), "Page owner thread mismatch");
+
+ if (span->page_initialized == span->page_count) {
+ // Span fully utilized
+ rpmalloc_assert(span == heap->span_partial[span->page_type], "Span partial tracking out of sync");
+ heap->span_partial[span->page_type] = 0;
+
+ span->next = heap->span_used[span->page_type];
+ heap->span_used[span->page_type] = span;
+ }
+
+ return page;
+}
+
+static NOINLINE void
+span_deallocate_block(span_t* span, page_t* page, void* block) {
+ if (UNEXPECTED(page->page_type == PAGE_HUGE)) {
+ global_memory_interface->memory_unmap(span, span->offset, span->mapped_size);
+ return;
+ }
+
+ if (page->has_aligned_block) {
+ // Realign pointer to block start
+ block = page_block_realign(page, block);
+ }
+
+ int is_thread_local = page_is_thread_heap(page);
+ if (EXPECTED(is_thread_local != 0)) {
+ page_put_local_free_block(page, block);
+ } else {
+ // Multithreaded deallocation, push to deferred deallocation list.
+ page_put_thread_free_block(page, block);
+ }
+}
+
+////////////
+///
+/// Block interface
+///
+//////
+
+static inline span_t*
+block_get_span(block_t* block) {
+ return (span_t*)((uintptr_t)block & SPAN_MASK);
+}
+
+static inline void
+block_deallocate(block_t* block) {
+ span_t* span = (span_t*)((uintptr_t)block & SPAN_MASK);
+ page_t* page = span_get_page_from_block(span, block);
+ const int is_thread_local = page_is_thread_heap(page);
+
+ // Optimized path for thread local free with non-huge block in page
+ // that has no aligned blocks
+ if (EXPECTED(is_thread_local != 0)) {
+ if (EXPECTED(page->generic_free == 0)) {
+ // Page is not huge, not full and has no aligned block - fast path
+ block->next = page->local_free;
+ page->local_free = block;
+ ++page->local_free_count;
+ if (UNEXPECTED(--page->block_used == 0))
+ page_available_to_free(page);
+ } else {
+ span_deallocate_block(span, page, block);
+ }
+ } else {
+ span_deallocate_block(span, page, block);
+ }
+}
+
+static inline size_t
+block_usable_size(block_t* block) {
+ span_t* span = (span_t*)((uintptr_t)block & SPAN_MASK);
+ if (EXPECTED(span->page_type <= PAGE_LARGE)) {
+ page_t* page = span_get_page_from_block(span, block);
+ void* blocks_start = pointer_offset(page, PAGE_HEADER_SIZE);
+ return page->block_size - ((size_t)pointer_diff(block, blocks_start) % page->block_size);
+ } else {
+ return ((size_t)span->page_size * (size_t)span->page_count) - (size_t)pointer_diff(block, span);
+ }
+}
+
+////////////
+///
+/// Heap interface
+///
+//////
+
+static inline void
+heap_lock_acquire(void) {
+ uintptr_t lock = 0;
+ uintptr_t this_lock = get_thread_id();
+ while (!atomic_compare_exchange_strong(&global_heap_lock, &lock, this_lock)) {
+ lock = 0;
+ wait_spin();
+ }
+}
+
+static inline void
+heap_lock_release(void) {
+ rpmalloc_assert((uintptr_t)atomic_load_explicit(&global_heap_lock, memory_order_relaxed) == get_thread_id(),
+ "Bad heap lock");
+ atomic_store_explicit(&global_heap_lock, 0, memory_order_release);
+}
+
+static inline heap_t*
+heap_initialize(void* block) {
+ heap_t* heap = block;
+ memset_const(heap, 0, sizeof(heap_t));
+ heap->id = 1 + atomic_fetch_add_explicit(&global_heap_id, 1, memory_order_relaxed);
+ return heap;
+}
+
+static heap_t*
+heap_allocate_new(void) {
+ if (!global_config.page_size)
+ rpmalloc_initialize(0);
+ size_t heap_size = get_page_aligned_size(sizeof(heap_t));
+ size_t offset = 0;
+ size_t mapped_size = 0;
+ block_t* block = global_memory_interface->memory_map(heap_size, 0, &offset, &mapped_size);
+#if ENABLE_DECOMMIT
+ global_memory_interface->memory_commit(block, heap_size);
+#endif
+ heap_t* heap = heap_initialize((void*)block);
+ heap->offset = (uint32_t)offset;
+ heap->mapped_size = mapped_size;
+#if ENABLE_STATISTICS
+ atomic_fetch_add_explicit(&global_statistics.heap_count, 1, memory_order_relaxed);
+#endif
+ return heap;
+}
+
+static void
+heap_unmap(heap_t* heap) {
+ global_memory_interface->memory_unmap(heap, heap->offset, heap->mapped_size);
+}
+
+static heap_t*
+heap_allocate(int first_class) {
+ heap_t* heap = 0;
+ if (!first_class) {
+ heap_lock_acquire();
+ heap = global_heap_queue;
+ global_heap_queue = heap ? heap->next : 0;
+ heap_lock_release();
+ }
+ if (!heap)
+ heap = heap_allocate_new();
+ if (heap) {
+ uintptr_t current_thread_id = get_thread_id();
+ heap_lock_acquire();
+ heap->next = global_heap_used;
+ heap->prev = 0;
+ if (global_heap_used)
+ global_heap_used->prev = heap;
+ global_heap_used = heap;
+ heap_lock_release();
+ heap->owner_thread = current_thread_id;
+ }
+ return heap;
+}
+
+static inline void
+heap_release(heap_t* heap) {
+ heap_lock_acquire();
+ if (heap->prev)
+ heap->prev->next = heap->next;
+ if (heap->next)
+ heap->next->prev = heap->prev;
+ if (global_heap_used == heap)
+ global_heap_used = heap->next;
+ heap->next = global_heap_queue;
+ global_heap_queue = heap;
+ heap_lock_release();
+}
+
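+//! Decommit committed free pages beyond the retain count for the given page type.
+// Called when the number of committed free pages crosses the overflow threshold;
+// the first page_retain_count pages in the free list stay committed as a warm
+// cache and the rest have their memory decommitted. As an illustrative example,
+// with a threshold of 16 and a retain count of 4, freeing a 17th page would
+// decommit all but the first 4 pages in the list.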
+static void
+heap_page_free_decommit(heap_t* heap, uint32_t page_type, uint32_t page_retain_count) {
+ page_t* page = heap->page_free[page_type];
+ while (page && page_retain_count) {
+ page = page->next;
+ --page_retain_count;
+ }
+ while (page && (page->is_decommitted == 0)) {
+ page_decommit_memory_pages(page);
+ --heap->page_free_commit_count[page_type];
+ page = page->next;
+ }
+}
+
+static inline void
+heap_make_free_page_available(heap_t* heap, uint32_t size_class, page_t* page) {
+ page->size_class = size_class;
+ page->block_size = global_size_class[size_class].block_size;
+ page->block_count = global_size_class[size_class].block_count;
+ page->block_used = 0;
+ page->block_initialized = 0;
+ page->local_free = 0;
+ page->local_free_count = 0;
+ page->is_full = 0;
+ page->is_free = 0;
+ page->has_aligned_block = 0;
+ page->generic_free = 0;
+ page->heap = heap;
+ page_t* head = heap->page_available[size_class];
+ page->next = head;
+ page->prev = 0;
+ atomic_store_explicit(&page->thread_free, 0, memory_order_relaxed);
+ if (head)
+ head->prev = page;
+ heap->page_available[size_class] = page;
+ if (page->is_decommitted)
+ page_commit_memory_pages(page);
+}
+
+//! Find or allocate a span for the given page type
+static inline span_t*
+heap_get_span(heap_t* heap, page_type_t page_type) {
+ // Fast path, available span for given page type
+ if (EXPECTED(heap->span_partial[page_type] != 0))
+ return heap->span_partial[page_type];
+
+ // Fallback path, map more memory
+ size_t offset = 0;
+ size_t mapped_size = 0;
+ span_t* span = global_memory_interface->memory_map(SPAN_SIZE, SPAN_SIZE, &offset, &mapped_size);
+ if (EXPECTED(span != 0)) {
+ uint32_t page_count = 0;
+ uint32_t page_size = 0;
+ uintptr_t page_address_mask = 0;
+ if (page_type == PAGE_SMALL) {
+ page_count = SPAN_SIZE / SMALL_PAGE_SIZE;
+ page_size = SMALL_PAGE_SIZE;
+ page_address_mask = SMALL_PAGE_MASK;
+ } else if (page_type == PAGE_MEDIUM) {
+ page_count = SPAN_SIZE / MEDIUM_PAGE_SIZE;
+ page_size = MEDIUM_PAGE_SIZE;
+ page_address_mask = MEDIUM_PAGE_MASK;
+ } else {
+ page_count = SPAN_SIZE / LARGE_PAGE_SIZE;
+ page_size = LARGE_PAGE_SIZE;
+ page_address_mask = LARGE_PAGE_MASK;
+ }
+#if ENABLE_DECOMMIT
+ global_memory_interface->memory_commit(span, page_size);
+#endif
+ span->heap = heap;
+ span->page_type = page_type;
+ span->page_count = page_count;
+ span->page_size = page_size;
+ span->page_address_mask = page_address_mask;
+ span->offset = (uint32_t)offset;
+ span->mapped_size = mapped_size;
+
+ heap->span_partial[page_type] = span;
+ }
+
+ return span;
+}
+
+static page_t*
+heap_get_page(heap_t* heap, uint32_t size_class);
+
+static void
+block_deallocate(block_t* block);
+
+static page_t*
+heap_get_page_generic(heap_t* heap, uint32_t size_class) {
+ page_type_t page_type = get_page_type(size_class);
+
+ // Check if there is a free page from multithreaded deallocations
+ uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_relaxed);
+ if (UNEXPECTED(block_mt != 0)) {
+ while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_relaxed,
+ memory_order_relaxed)) {
+ wait_spin();
+ }
+ block_t* block = (void*)block_mt;
+ while (block) {
+ block_t* next_block = block->next;
+ block_deallocate(block);
+ block = next_block;
+ }
+ // Retry after processing deferred thread frees
+ return heap_get_page(heap, size_class);
+ }
+
+ // Check if there is a free page
+ page_t* page = heap->page_free[page_type];
+ if (EXPECTED(page != 0)) {
+ heap->page_free[page_type] = page->next;
+ if (page->is_decommitted == 0) {
+ rpmalloc_assert(heap->page_free_commit_count[page_type] > 0, "Free committed page count out of sync");
+ --heap->page_free_commit_count[page_type];
+ }
+ heap_make_free_page_available(heap, size_class, page);
+ return page;
+ }
+ rpmalloc_assert(heap->page_free_commit_count[page_type] == 0, "Free committed page count out of sync");
+
+ if (heap->id == 0) {
+ // Thread has not yet initialized, assign heap and try again
+ rpmalloc_initialize(0);
+ return heap_get_page(get_thread_heap(), size_class);
+ }
+
+ // Fallback path, find or allocate a span for the given size class.
+ // If the thread was not initialized, the heap for the new span
+ // will be different from the local heap variable in this scope
+ // (which is the default heap) - so use the page's heap instead
+ span_t* span = heap_get_span(heap, page_type);
+ if (EXPECTED(span != 0)) {
+ page = span_allocate_page(span);
+ heap_make_free_page_available(page->heap, size_class, page);
+ }
+
+ return page;
+}
+
+//! Find or allocate a page for the given size class
+static page_t*
+heap_get_page(heap_t* heap, uint32_t size_class) {
+ // Fast path, available page for given size class
+ page_t* page = heap->page_available[size_class];
+ if (EXPECTED(page != 0))
+ return page;
+ return heap_get_page_generic(heap, size_class);
+}
+
+//! Pop a block from the heap local free list
+static inline RPMALLOC_ALLOCATOR void*
+heap_pop_local_free(heap_t* heap, uint32_t size_class) {
+ block_t** free_list = heap->local_free + size_class;
+ block_t* block = *free_list;
+ if (EXPECTED(block != 0))
+ *free_list = block->next;
+ return block;
+}
+
+//! Generic allocation path from heap pages, spans or new mapping
+static NOINLINE RPMALLOC_ALLOCATOR void*
+heap_allocate_block_small_to_large(heap_t* heap, uint32_t size_class, unsigned int zero) {
+ page_t* page = heap_get_page(heap, size_class);
+ if (EXPECTED(page != 0))
+ return page_allocate_block(page, zero);
+ return 0;
+}
+
+//! Allocation path for huge blocks, mapped directly from the OS as a dedicated span
+static NOINLINE RPMALLOC_ALLOCATOR void*
+heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) {
+ (void)sizeof(heap);
+ size_t alloc_size = get_page_aligned_size(size + SPAN_HEADER_SIZE);
+ size_t offset = 0;
+ size_t mapped_size = 0;
+ void* block = global_memory_interface->memory_map(alloc_size, SPAN_SIZE, &offset, &mapped_size);
+ if (block) {
+ span_t* span = block;
+#if ENABLE_DECOMMIT
+ global_memory_interface->memory_commit(span, alloc_size);
+#endif
+ span->heap = heap;
+ span->page_type = PAGE_HUGE;
+ span->page_size = (uint32_t)global_config.page_size;
+ span->page_count = (uint32_t)(alloc_size / global_config.page_size);
+ span->page_address_mask = LARGE_PAGE_MASK;
+ span->offset = (uint32_t)offset;
+ span->mapped_size = mapped_size;
+ span->page.heap = heap;
+ span->page.is_full = 1;
+ span->page.generic_free = 1;
+ span->page.page_type = PAGE_HUGE;
+ // Keep track of span if first class heap
+ if (!heap->owner_thread) {
+ span->next = heap->span_used[PAGE_HUGE];
+ heap->span_used[PAGE_HUGE] = span;
+ }
+ void* ptr = pointer_offset(block, SPAN_HEADER_SIZE);
+ if (zero)
+ memset(ptr, 0, size);
+ return ptr;
+ }
+ return 0;
+}
+
+static RPMALLOC_ALLOCATOR NOINLINE void*
+heap_allocate_block_generic(heap_t* heap, size_t size, unsigned int zero) {
+ uint32_t size_class = get_size_class(size);
+ if (EXPECTED(size_class < SIZE_CLASS_COUNT)) {
+ block_t* block = heap_pop_local_free(heap, size_class);
+ if (EXPECTED(block != 0)) {
+ // Fast track with small block available in heap level local free list
+ if (zero)
+ memset(block, 0, global_size_class[size_class].block_size);
+ return block;
+ }
+
+ return heap_allocate_block_small_to_large(heap, size_class, zero);
+ }
+
+ return heap_allocate_block_huge(heap, size, zero);
+}
+
+//! Find or allocate a block of the given size
+static inline RPMALLOC_ALLOCATOR void*
+heap_allocate_block(heap_t* heap, size_t size, unsigned int zero) {
+ if (size <= (SMALL_GRANULARITY * 64)) {
+ uint32_t size_class = get_size_class_tiny(size);
+ block_t* block = heap_pop_local_free(heap, size_class);
+ if (EXPECTED(block != 0)) {
+ // Fast track with small block available in heap level local free list
+ if (zero)
+ memset(block, 0, global_size_class[size_class].block_size);
+ return block;
+ }
+ }
+ return heap_allocate_block_generic(heap, size, zero);
+}
+
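+//! Allocate an aligned block by over-allocating size + alignment bytes and
+// rounding the returned pointer up to the next alignment boundary. Pages
+// containing such realigned blocks are flagged so deallocation can realign
+// interior pointers back to the block start (see page_block_realign)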
+static RPMALLOC_ALLOCATOR void*
+heap_allocate_block_aligned(heap_t* heap, size_t alignment, size_t size, unsigned int zero) {
+ if (alignment <= SMALL_GRANULARITY)
+ return heap_allocate_block(heap, size, zero);
+
+#if ENABLE_VALIDATE_ARGS
+ if ((size + alignment) < size) {
+ errno = EINVAL;
+ return 0;
+ }
+ if (alignment & (alignment - 1)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ if (alignment >= RPMALLOC_MAX_ALIGNMENT) {
+ errno = EINVAL;
+ return 0;
+ }
+
+ size_t align_mask = alignment - 1;
+ block_t* block = heap_allocate_block(heap, size + alignment, zero);
+ if ((uintptr_t)block & align_mask) {
+ block = (void*)(((uintptr_t)block & ~(uintptr_t)align_mask) + alignment);
+ // Mark as having aligned blocks
+ span_t* span = block_get_span(block);
+ page_t* page = span_get_page_from_block(span, block);
+ page->has_aligned_block = 1;
+ page->generic_free = 1;
+ }
+ return block;
+}
+
+static void*
+heap_reallocate_block(heap_t* heap, void* block, size_t size, size_t old_size, unsigned int flags) {
+ if (block) {
+ // Grab the span using guaranteed span alignment
+ span_t* span = block_get_span(block);
+ if (EXPECTED(span->page_type <= PAGE_LARGE)) {
+ // Normal sized block
+ page_t* page = span_get_page_from_block(span, block);
+ void* blocks_start = pointer_offset(page, PAGE_HEADER_SIZE);
+ uint32_t block_offset = (uint32_t)pointer_diff(block, blocks_start);
+ uint32_t block_idx = block_offset / page->block_size;
+ void* block_origin = pointer_offset(blocks_start, (size_t)block_idx * page->block_size);
+ if (!old_size)
+ old_size = (size_t)((ptrdiff_t)page->block_size - pointer_diff(block, block_origin));
+ if ((size_t)page->block_size >= size) {
+ // Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+ if ((block != block_origin) && !(flags & RPMALLOC_NO_PRESERVE))
+ memmove(block_origin, block, old_size);
+ return block_origin;
+ }
+ } else {
+ // Huge block
+ void* block_start = pointer_offset(span, SPAN_HEADER_SIZE);
+ if (!old_size)
+ old_size = ((size_t)span->page_size * (size_t)span->page_count) - SPAN_HEADER_SIZE;
+ if ((size < old_size) && (size > LARGE_BLOCK_SIZE_LIMIT)) {
+ // Still fits in block and still huge, never mind trying to save memory,
+ // but preserve data if alignment changed
+ if ((block_start != block) && !(flags & RPMALLOC_NO_PRESERVE))
+ memmove(block_start, block, old_size);
+ return block_start;
+ }
+ }
+ } else {
+ old_size = 0;
+ }
+
+ if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+ return 0;
+
+ // Size is greater than the block size, or shrinking saves enough memory to warrant a resize; allocate a new
+ // block and deallocate the old. Avoid hysteresis by over-allocating when the increase is small (below 37.5%)
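+ // For example, growing a 4096 byte block to 4200 bytes computes
+ // lower_bound = 4096 + 1024 + 512 = 5632 (a 37.5% increase) and requests
+ // 5632 bytes, so a run of small incremental grows does not reallocate on every call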
+ size_t lower_bound = old_size + (old_size >> 2) + (old_size >> 3);
+ size_t new_size = (size > lower_bound) ? size : ((size > old_size) ? lower_bound : size);
+ void* old_block = block;
+ block = heap_allocate_block(heap, new_size, 0);
+ if (block && old_block) {
+ if (!(flags & RPMALLOC_NO_PRESERVE))
+ memcpy(block, old_block, old_size < new_size ? old_size : new_size);
+ block_deallocate(old_block);
+ }
+
+ return block;
+}
+
+static void*
+heap_reallocate_block_aligned(heap_t* heap, void* block, size_t alignment, size_t size, size_t old_size,
+ unsigned int flags) {
+ if (alignment <= SMALL_GRANULARITY)
+ return heap_reallocate_block(heap, block, size, old_size, flags);
+
+ int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+ size_t usable_size = (block ? block_usable_size(block) : 0);
+ if ((usable_size >= size) && !((uintptr_t)block & (alignment - 1))) {
+ if (no_alloc || (size >= (usable_size / 2)))
+ return block;
+ }
+ // Aligned alloc marks span as having aligned blocks
+ void* old_block = block;
+ block = (!no_alloc ? heap_allocate_block_aligned(heap, alignment, size, 0) : 0);
+ if (EXPECTED(block != 0)) {
+ if (!(flags & RPMALLOC_NO_PRESERVE) && old_block) {
+ if (!old_size)
+ old_size = usable_size;
+ memcpy(block, old_block, old_size < size ? old_size : size);
+ }
+ if (EXPECTED(old_block != 0))
+ block_deallocate(old_block);
+ }
+ return block;
+}
+
+static void
+heap_free_all(heap_t* heap) {
+ for (int itype = 0; itype < 3; ++itype) {
+ span_t* span = heap->span_partial[itype];
+ while (span) {
+ span_t* span_next = span->next;
+ global_memory_interface->memory_unmap(span, span->offset, span->mapped_size);
+ span = span_next;
+ }
+ heap->span_partial[itype] = 0;
+ heap->page_free[itype] = 0;
+ heap->page_free_commit_count[itype] = 0;
+ atomic_store_explicit(&heap->thread_free[itype], 0, memory_order_relaxed);
+ }
+ for (int itype = 0; itype < 4; ++itype) {
+ span_t* span = heap->span_used[itype];
+ while (span) {
+ span_t* span_next = span->next;
+ global_memory_interface->memory_unmap(span, span->offset, span->mapped_size);
+ span = span_next;
+ }
+ heap->span_used[itype] = 0;
+ }
+ memset(heap->local_free, 0, sizeof(heap->local_free));
+ memset(heap->page_available, 0, sizeof(heap->page_available));
+
+#if ENABLE_STATISTICS
+ // TODO: Fix
+#endif
+}
+
+////////////
+///
+/// Extern interface
+///
+//////
+
+int
+rpmalloc_is_thread_initialized(void) {
+ return (get_thread_heap() != global_heap_default) ? 1 : 0;
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc(size_t size) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block(heap, size, 0);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpzalloc(size_t size) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block(heap, size, 1);
+}
+
+extern inline void
+rpfree(void* ptr) {
+ if (UNEXPECTED(ptr == 0))
+ return;
+ block_deallocate(ptr);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpcalloc(size_t num, size_t size) {
+ size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+ int err = SizeTMult(num, size, &total);
+ if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#else
+ int err = __builtin_umull_overflow(num, size, &total);
+ if (err || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+#else
+ total = num * size;
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block(heap, total, 1);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rprealloc(void* ptr, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return ptr;
+ }
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_reallocate_block(heap, ptr, size, 0, 0);
+}
+
+extern RPMALLOC_ALLOCATOR void*
+rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+ if ((size + alignment < size) || (alignment > global_config.page_size)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_reallocate_block_aligned(heap, ptr, alignment, size, oldsize, flags);
+}
+
+extern RPMALLOC_ALLOCATOR void*
+rpaligned_alloc(size_t alignment, size_t size) {
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block_aligned(heap, alignment, size, 0);
+}
+
+extern RPMALLOC_ALLOCATOR void*
+rpaligned_zalloc(size_t alignment, size_t size) {
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block_aligned(heap, alignment, size, 1);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) {
+ size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+ int err = SizeTMult(num, size, &total);
+ if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#else
+ int err = __builtin_umull_overflow(num, size, &total);
+ if (err || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+#else
+ total = num * size;
+#endif
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block_aligned(heap, alignment, total, 1);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmemalign(size_t alignment, size_t size) {
+ heap_t* heap = get_thread_heap();
+ return heap_allocate_block_aligned(heap, alignment, size, 0);
+}
+
+extern inline int
+rpposix_memalign(void** memptr, size_t alignment, size_t size) {
+ heap_t* heap = get_thread_heap();
+ if (memptr)
+ *memptr = heap_allocate_block_aligned(heap, alignment, size, 0);
+ else
+ return EINVAL;
+ return *memptr ? 0 : ENOMEM;
+}
+
+extern inline size_t
+rpmalloc_usable_size(void* ptr) {
+ return (ptr ? block_usable_size(ptr) : 0);
+}
+
+////////////
+///
+/// Initialization and finalization
+///
+//////
+
+static void
+rpmalloc_thread_destructor(void* value) {
+ // If this is called on main thread assume it means rpmalloc_finalize
+ // has not been called and shutdown is forced (through _exit) or unclean
+ if (get_thread_id() == global_main_thread_id)
+ return;
+ if (value)
+ rpmalloc_thread_finalize();
+}
+
+extern int
+rpmalloc_initialize_config(rpmalloc_interface_t* memory_interface, rpmalloc_config_t* config) {
+ if (global_rpmalloc_initialized) {
+ rpmalloc_thread_initialize();
+ if (config)
+ *config = global_config;
+ return 0;
+ }
+
+ if (config)
+ global_config = *config;
+
+ int result = rpmalloc_initialize(memory_interface);
+
+ if (config)
+ *config = global_config;
+
+ return result;
+}
+
+extern int
+rpmalloc_initialize(rpmalloc_interface_t* memory_interface) {
+ if (global_rpmalloc_initialized) {
+ rpmalloc_thread_initialize();
+ return 0;
+ }
+
+ global_rpmalloc_initialized = 1;
+
+ global_memory_interface = memory_interface ? memory_interface : &global_memory_interface_default;
+ if (!global_memory_interface->memory_map || !global_memory_interface->memory_unmap) {
+ global_memory_interface->memory_map = os_mmap;
+ global_memory_interface->memory_commit = os_mcommit;
+ global_memory_interface->memory_decommit = os_mdecommit;
+ global_memory_interface->memory_unmap = os_munmap;
+ }
+
+#if PLATFORM_WINDOWS
+ SYSTEM_INFO system_info;
+ memset(&system_info, 0, sizeof(system_info));
+ GetSystemInfo(&system_info);
+ os_map_granularity = system_info.dwAllocationGranularity;
+#else
+ os_map_granularity = (size_t)sysconf(_SC_PAGESIZE);
+#endif
+
+#if PLATFORM_WINDOWS
+ os_page_size = system_info.dwPageSize;
+#else
+ os_page_size = os_map_granularity;
+#endif
+ if (global_config.enable_huge_pages) {
+#if PLATFORM_WINDOWS
+ HANDLE token = 0;
+ size_t large_page_minimum = GetLargePageMinimum();
+ if (large_page_minimum)
+ OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+ if (token) {
+ LUID luid;
+ if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
+ TOKEN_PRIVILEGES token_privileges;
+ memset(&token_privileges, 0, sizeof(token_privileges));
+ token_privileges.PrivilegeCount = 1;
+ token_privileges.Privileges[0].Luid = luid;
+ token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+ if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
+ if (GetLastError() == ERROR_SUCCESS)
+ os_huge_pages = 1;
+ }
+ }
+ CloseHandle(token);
+ }
+ if (os_huge_pages) {
+ if (large_page_minimum > os_page_size)
+ os_page_size = large_page_minimum;
+ if (large_page_minimum > os_map_granularity)
+ os_map_granularity = large_page_minimum;
+ }
+#elif defined(__linux__)
+ size_t huge_page_size = 0;
+ FILE* meminfo = fopen("/proc/meminfo", "r");
+ if (meminfo) {
+ char line[128];
+ while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
+ line[sizeof(line) - 1] = 0;
+ if (strstr(line, "Hugepagesize:"))
+ huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
+ }
+ fclose(meminfo);
+ }
+ if (huge_page_size) {
+ os_huge_pages = 1;
+ os_page_size = huge_page_size;
+ os_map_granularity = huge_page_size;
+ }
+#elif defined(__FreeBSD__)
+ int rc;
+ size_t sz = sizeof(rc);
+
+ if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) {
+ os_huge_pages = 1;
+ os_page_size = 2 * 1024 * 1024;
+ os_map_granularity = os_page_size;
+ }
+#elif defined(__APPLE__) || defined(__NetBSD__)
+ os_huge_pages = 1;
+ os_page_size = 2 * 1024 * 1024;
+ os_map_granularity = os_page_size;
+#endif
+ } else {
+ os_huge_pages = 0;
+ }
+
+ global_config.enable_huge_pages = os_huge_pages;
+
+ if (!memory_interface || (global_config.page_size < os_page_size))
+ global_config.page_size = os_page_size;
+
+ if (global_config.enable_huge_pages || global_config.page_size > (256 * 1024))
+ global_config.disable_decommit = 1;
+
+#if defined(__linux__) || defined(__ANDROID__)
+ if (global_config.disable_thp)
+ (void)prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);
+#endif
+
+#ifdef _WIN32
+ fls_key = FlsAlloc(&rpmalloc_thread_destructor);
+#else
+ pthread_key_create(&pthread_key, rpmalloc_thread_destructor);
+#endif
+
+ global_main_thread_id = get_thread_id();
+
+ rpmalloc_thread_initialize();
+
+ return 0;
+}
+
+extern const rpmalloc_config_t*
+rpmalloc_config(void) {
+ return &global_config;
+}
+
+extern void
+rpmalloc_finalize(void) {
+ rpmalloc_thread_finalize();
+
+ if (global_config.unmap_on_finalize) {
+ heap_t* heap = global_heap_queue;
+ global_heap_queue = 0;
+ while (heap) {
+ heap_t* heap_next = heap->next;
+ heap_free_all(heap);
+ heap_unmap(heap);
+ heap = heap_next;
+ }
+ heap = global_heap_used;
+ global_heap_used = 0;
+ while (heap) {
+ heap_t* heap_next = heap->next;
+ heap_free_all(heap);
+ heap_unmap(heap);
+ heap = heap_next;
+ }
+#if ENABLE_STATISTICS
+ memset(&global_statistics, 0, sizeof(global_statistics));
+#endif
+ }
+
+#ifdef _WIN32
+ FlsFree(fls_key);
+ fls_key = 0;
+#else
+ pthread_key_delete(pthread_key);
+ pthread_key = 0;
+#endif
+
+ global_main_thread_id = 0;
+ global_rpmalloc_initialized = 0;
+}
+
+extern void
+rpmalloc_thread_initialize(void) {
+ if (get_thread_heap() == global_heap_default)
+ get_thread_heap_allocate();
+}
+
+extern void
+rpmalloc_thread_finalize(void) {
+ heap_t* heap = get_thread_heap();
+ if (heap != global_heap_default) {
+ heap_release(heap);
+ set_thread_heap(global_heap_default);
+ }
+}
+
+extern void
+rpmalloc_thread_collect(void) {
+}
+
+void
+rpmalloc_dump_statistics(void* file) {
+#if ENABLE_STATISTICS
+ fprintf(file, "Mapped pages: %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_mapped, memory_order_relaxed));
+ fprintf(file, "Mapped pages (peak): %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_mapped_peak, memory_order_relaxed));
+ fprintf(file, "Active pages: %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_active, memory_order_relaxed));
+ fprintf(file, "Active pages (peak): %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed));
+ fprintf(file, "Pages committed: %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_commit, memory_order_relaxed));
+ fprintf(file, "Pages decommitted: %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.page_decommit, memory_order_relaxed));
+ fprintf(file, "Heaps created: %llu\n",
+ (unsigned long long)atomic_load_explicit(&global_statistics.heap_count, memory_order_relaxed));
+#else
+ (void)sizeof(file);
+#endif
+}
+
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+rpmalloc_heap_t*
+rpmalloc_heap_acquire(void) {
+ // Must be a pristine heap from newly mapped memory pages, or else memory blocks
+ // could already be allocated from the heap, which would (wrongly) be released when
+ // the heap is cleared with rpmalloc_heap_free_all(). Heaps from the dedicated
+ // orphan list are also guaranteed to be pristine and can be used.
+ heap_t* heap = heap_allocate(1);
+ rpmalloc_assume(heap != 0);
+ heap->owner_thread = 0;
+ return heap;
+}
+
+void
+rpmalloc_heap_release(rpmalloc_heap_t* heap) {
+ if (heap)
+ heap_release(heap);
+}
+
+RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ return heap_allocate_block(heap, size, 0);
+}
+
+RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ return heap_allocate_block_aligned(heap, alignment, size, 0);
+}
+
+RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) {
+ size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+ int err = SizeTMult(num, size, &total);
+ if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#else
+ int err = __builtin_umull_overflow(num, size, &total);
+ if (err || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+#else
+ total = num * size;
+#endif
+ return heap_allocate_block(heap, total, 1);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) {
+ size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+ int err = SizeTMult(num, size, &total);
+ if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#else
+ int err = __builtin_umull_overflow(num, size, &total);
+ if (err || (total >= MAX_ALLOC_SIZE)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+#else
+ total = num * size;
+#endif
+ return heap_allocate_block_aligned(heap, alignment, total, 1);
+}
+
+RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+ if (size >= MAX_ALLOC_SIZE) {
+ errno = EINVAL;
+ return ptr;
+ }
+#endif
+ return heap_reallocate_block(heap, ptr, size, 0, flags);
+}
+
+RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+ if ((size + alignment < size) || (alignment > global_config.page_size)) {
+ errno = EINVAL;
+ return 0;
+ }
+#endif
+ return heap_reallocate_block_aligned(heap, ptr, alignment, size, 0, flags);
+}
+
+void
+rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) {
+ (void)sizeof(heap);
+ block_deallocate(ptr);
+}
+
+//! Free all memory allocated by the heap
+void
+rpmalloc_heap_free_all(rpmalloc_heap_t* heap) {
+ heap_free_all(heap);
+}
+
+extern inline void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) {
+ heap_t* prev_heap = get_thread_heap();
+ if (prev_heap != heap) {
+ set_thread_heap(heap);
+ if (prev_heap)
+ heap_release(prev_heap);
+ }
+}
+
+rpmalloc_heap_t*
+rpmalloc_get_heap_for_ptr(void* ptr) {
+ // Grab the span, and then the heap from the span
+ span_t* span = (span_t*)((uintptr_t)ptr & SPAN_MASK);
+ if (span)
+ return span_get_page_from_block(span, ptr)->heap;
+ return 0;
+}
+
+#endif
+
+#include "malloc.c"
diff --git a/thirdparty/rpmalloc/rpmalloc.h b/thirdparty/rpmalloc/rpmalloc.h
new file mode 100644
index 000000000..2e67280f9
--- /dev/null
+++ b/thirdparty/rpmalloc/rpmalloc.h
@@ -0,0 +1,396 @@
+/* rpmalloc.h - Memory allocator - Public Domain - 2016-2024 Mattias Jansson
+ *
+ * This library provides a cross-platform lock free thread caching malloc
+ * implementation in C11. The latest source code is always available at
+ *
+ * https://github.com/mjansson/rpmalloc
+ *
+ * This library is put in the public domain; you can redistribute it and/or
+ * modify it without any restrictions.
+ *
+ */
+
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RPMALLOC_CACHE_LINE_SIZE 64
+#if defined(__clang__) || defined(__GNUC__)
+#define RPMALLOC_EXPORT __attribute__((visibility("default")))
+#define RPMALLOC_RESTRICT __restrict
+#define RPMALLOC_ALLOCATOR
+#define RPMALLOC_CACHE_ALIGNED __attribute__((aligned(RPMALLOC_CACHE_LINE_SIZE)))
+#if (defined(__clang_major__) && (__clang_major__ < 4)) || (!defined(__clang_major__) && defined(__GNUC__))
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#else
+#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size)))
+#endif
+#define RPMALLOC_CDECL
+#elif defined(_MSC_VER)
+#define RPMALLOC_EXPORT
+#define RPMALLOC_RESTRICT __declspec(restrict)
+#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict)
+#define RPMALLOC_CACHE_ALIGNED __declspec(align(RPMALLOC_CACHE_LINE_SIZE))
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL __cdecl
+#else
+#define RPMALLOC_EXPORT
+#define RPMALLOC_ALLOCATOR
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL
+#endif
+
+#define RPMALLOC_MAX_ALIGNMENT (256 * 1024)
+
+//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions).
+#ifndef RPMALLOC_FIRST_CLASS_HEAPS
+#define RPMALLOC_FIRST_CLASS_HEAPS 0
+#endif
+
+//! Flag to rpaligned_realloc to not preserve content in reallocation
+#define RPMALLOC_NO_PRESERVE 1
+//! Flag to rpaligned_realloc to fail and return a null pointer if the grow cannot be done in-place,
+// in which case the original pointer is still valid (just like a call to realloc which fails to allocate
+// a new block).
+#define RPMALLOC_GROW_OR_FAIL 2
+
+typedef struct rpmalloc_global_statistics_t {
+ //! Current amount of virtual memory mapped, all of which might not have been committed (only if
+ //! ENABLE_STATISTICS=1)
+ size_t mapped;
+ //! Peak amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1)
+ size_t mapped_peak;
+ //! Current amount of memory in global caches for small and medium sizes (<32KiB)
+ size_t cached;
+ //! Current amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by
+ //! default (only if ENABLE_STATISTICS=1)
+ size_t huge_alloc;
+ //! Peak amount of memory allocated in huge allocations, i.e. larger than LARGE_SIZE_LIMIT which is 2MiB by default
+ //! (only if ENABLE_STATISTICS=1)
+ size_t huge_alloc_peak;
+ //! Total amount of memory mapped since initialization (only if ENABLE_STATISTICS=1)
+ size_t mapped_total;
+ //! Total amount of memory unmapped since initialization (only if ENABLE_STATISTICS=1)
+ size_t unmapped_total;
+} rpmalloc_global_statistics_t;
+
+typedef struct rpmalloc_thread_statistics_t {
+ //! Current number of bytes available in thread size class caches for small and medium sizes (<32KiB)
+ size_t sizecache;
+ //! Current number of bytes available in thread span caches for small and medium sizes (<32KiB)
+ size_t spancache;
+ //! Total number of bytes transitioned from thread cache to global cache (only if ENABLE_STATISTICS=1)
+ size_t thread_to_global;
+ //! Total number of bytes transitioned from global cache to thread cache (only if ENABLE_STATISTICS=1)
+ size_t global_to_thread;
+ //! Per span count statistics (only if ENABLE_STATISTICS=1)
+ struct {
+ //! Currently used number of spans
+ size_t current;
+ //! High water mark of spans used
+ size_t peak;
+ //! Number of spans transitioned to global cache
+ size_t to_global;
+ //! Number of spans transitioned from global cache
+ size_t from_global;
+ //! Number of spans transitioned to thread cache
+ size_t to_cache;
+ //! Number of spans transitioned from thread cache
+ size_t from_cache;
+ //! Number of spans transitioned to reserved state
+ size_t to_reserved;
+ //! Number of spans transitioned from reserved state
+ size_t from_reserved;
+ //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
+ size_t map_calls;
+ } span_use[64];
+ //! Per size class statistics (only if ENABLE_STATISTICS=1)
+ struct {
+ //! Current number of allocations
+ size_t alloc_current;
+ //! Peak number of allocations
+ size_t alloc_peak;
+ //! Total number of allocations
+ size_t alloc_total;
+ //! Total number of frees
+ size_t free_total;
+ //! Number of spans transitioned to cache
+ size_t spans_to_cache;
+ //! Number of spans transitioned from cache
+ size_t spans_from_cache;
+ //! Number of spans transitioned from reserved state
+ size_t spans_from_reserved;
+ //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
+ size_t map_calls;
+ } size_use[128];
+} rpmalloc_thread_statistics_t;
+
+typedef struct rpmalloc_interface_t {
+ //! Map memory pages for the given number of bytes. The returned address MUST be aligned to the given alignment,
+ //! which will always be either 0 or the span size. The function can store an alignment offset in the offset
+ //! variable in case it performs alignment and the returned pointer is offset from the actual start of the memory
+ //! region due to this alignment. This alignment offset will be passed to the memory unmap function. The mapped size
+ //! can be stored in the mapped_size variable, which will also be passed to the memory unmap function as the release
+ //! parameter once the entire mapped region is ready to be released. If you set a memory_map function, you must also
+ //! set a memory_unmap function or else the default implementation will be used for both. This function must be
+ //! thread safe; it can be called by multiple threads simultaneously.
+ void* (*memory_map)(size_t size, size_t alignment, size_t* offset, size_t* mapped_size);
+ //! Commit a range of memory pages
+ void (*memory_commit)(void* address, size_t size);
+ //! Decommit a range of memory pages
+ void (*memory_decommit)(void* address, size_t size);
+ //! Unmap the memory pages starting at address and spanning the given number of bytes. If you set a memory_unmap
+ //! function, you must also set a memory_map function or else the default implementation will be used for both. This
+ //! function must be thread safe; it can be called by multiple threads simultaneously.
+ void (*memory_unmap)(void* address, size_t offset, size_t mapped_size);
+ //! Called when a call to map memory pages fails (out of memory). If this callback is not set or returns zero the
+ //! library will return a null pointer in the allocation call. If this callback returns non-zero the map call will
+ //! be retried. The argument passed is the number of bytes that was requested in the map call. Only used if the
+ //! default system memory map function is used (memory_map callback is not set).
+ int (*map_fail_callback)(size_t size);
+ //! Called when an assert fails, if asserts are enabled. Will use the standard assert() if this is not set.
+ void (*error_callback)(const char* message);
+} rpmalloc_interface_t;
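+
+//! Illustrative sketch (not part of the library): a minimal custom memory
+// interface backed by POSIX mmap/munmap. It over-allocates to satisfy the
+// requested alignment and round-trips the offset and mapped_size values into
+// the unmap callback. The my_* names are placeholders, and the memory_commit
+// and memory_decommit callbacks are omitted for brevity (they are needed when
+// decommit is enabled).
+#if 0
+#include <stdint.h>
+#include <sys/mman.h>
+
+static void*
+my_memory_map(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) {
+ // Over-allocate so any requested span alignment can be satisfied
+ size_t map_size = size + alignment;
+ char* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (ptr == MAP_FAILED)
+ return 0;
+ char* aligned = ptr;
+ if (alignment)
+ aligned = (char*)(((uintptr_t)ptr + (alignment - 1)) & ~((uintptr_t)alignment - 1));
+ // Remember how far the returned pointer is from the mapping start, and the
+ // full mapped size, so the matching unmap call can release the whole region
+ *offset = (size_t)(aligned - ptr);
+ *mapped_size = map_size;
+ return aligned;
+}
+
+static void
+my_memory_unmap(void* address, size_t offset, size_t mapped_size) {
+ munmap((char*)address - offset, mapped_size);
+}
+#endif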
+
+typedef struct rpmalloc_config_t {
+ //! Size of memory pages. The page size MUST be a power of two. All memory mapping
+ // requests to memory_map will be made with size set to a multiple of the page size.
+ // Set to 0 to use the OS default page size.
+ size_t page_size;
+ //! Enable use of large/huge pages. If this flag is set to non-zero and page size is
+ // zero, the allocator will try to enable huge pages and auto detect the configuration.
+ // If this is set to non-zero and page_size is also non-zero, the allocator will
+ // assume huge pages have been configured and enabled prior to initializing the
+ // allocator.
+ // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
+ // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
+ int enable_huge_pages;
+ //! Disable decommitting unused pages when allocator determines the memory pressure
+ // is low and there is enough active pages cached. If set to 1, keep all pages committed.
+ int disable_decommit;
+ //! Name used for allocated pages on systems that support naming, to distinguish among anonymous regions.
+ const char* page_name;
+ //! Name used for allocated huge pages on systems that support naming, to distinguish among anonymous regions.
+ const char* huge_page_name;
+ //! Unmap all memory on finalize if set to 1. Normally you can let the OS unmap all pages
+ // when process exits, but if using rpmalloc in a dynamic library you might want to unmap
+ // all pages when the dynamic library unloads to avoid process memory leaks and bloat.
+ int unmap_on_finalize;
+#if defined(__linux__) || defined(__ANDROID__)
+ //! Allows disabling the Transparent Huge Page feature on Linux on a per-process basis,
+ // rather than enabling/disabling it system-wide (via /sys/kernel/mm/transparent_hugepage/enabled).
+ // This can improve performance and reduce allocation overhead in some contexts,
+ // although THP is usually enabled by default.
+ int disable_thp;
+#endif
+} rpmalloc_config_t;
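+
+//! Illustrative sketch: initializing with a custom configuration, e.g. when
+// rpmalloc is used inside a dynamic library and all pages should be unmapped
+// on finalize. The function name is a placeholder.
+#if 0
+static void
+initialize_for_dynamic_library(void) {
+ rpmalloc_config_t config = {0};
+ config.unmap_on_finalize = 1;
+ rpmalloc_initialize_config(0, &config);
+}
+#endif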
+
+//! Initialize allocator
+RPMALLOC_EXPORT int
+rpmalloc_initialize(rpmalloc_interface_t* memory_interface);
+
+//! Initialize allocator
+RPMALLOC_EXPORT int
+rpmalloc_initialize_config(rpmalloc_interface_t* memory_interface, rpmalloc_config_t* config);
+
+//! Get allocator configuration
+RPMALLOC_EXPORT const rpmalloc_config_t*
+rpmalloc_config(void);
+
+//! Finalize allocator
+RPMALLOC_EXPORT void
+rpmalloc_finalize(void);
+
+//! Initialize allocator for calling thread
+RPMALLOC_EXPORT void
+rpmalloc_thread_initialize(void);
+
+//! Finalize allocator for calling thread
+RPMALLOC_EXPORT void
+rpmalloc_thread_finalize(void);
+
+//! Perform deferred deallocations pending for the calling thread heap
+RPMALLOC_EXPORT void
+rpmalloc_thread_collect(void);
+
+//! Query if allocator is initialized for calling thread
+RPMALLOC_EXPORT int
+rpmalloc_is_thread_initialized(void);
+
+//! Get per-thread statistics
+RPMALLOC_EXPORT void
+rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats);
+
+//! Get global statistics
+RPMALLOC_EXPORT void
+rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats);
+
+//! Dump all statistics in human readable format to file (should be a FILE*)
+RPMALLOC_EXPORT void
+rpmalloc_dump_statistics(void* file);
+
+//! Allocate a memory block of at least the given size
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1);
+
+//! Allocate a zero initialized memory block of at least the given size
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpzalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1);
+
+//! Allocate a memory block of at least the given size and zero initialize it
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2);
+
+//! Reallocate the given block to at least the given size
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rprealloc(void* ptr, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Reallocate the given block to at least the given size and alignment,
+// with optional control flags (see RPMALLOC_NO_PRESERVE).
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) RPMALLOC_ATTRIB_MALLOC
+ RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Allocate a memory block of at least the given size and alignment.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size and alignment.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_zalloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size and alignment, and zero initialize it.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Allocate a memory block of at least the given size and alignment.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size and alignment.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT int
+rpposix_memalign(void** memptr, size_t alignment, size_t size);
+
+//! Free the given memory block
+RPMALLOC_EXPORT void
+rpfree(void* ptr);
+
+//! Query the usable size of the given memory block (from given pointer to the end of block)
+RPMALLOC_EXPORT size_t
+rpmalloc_usable_size(void* ptr);
+
+//! Dummy empty function for forcing linker symbol inclusion
+RPMALLOC_EXPORT void
+rpmalloc_linker_reference(void);
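+
+//! Illustrative sketch (not part of the API): typical use of the basic entry
+// points. Passing a null memory interface to rpmalloc_initialize selects the
+// default OS backend.
+#if 0
+#include <string.h>
+
+int
+main(void) {
+ rpmalloc_initialize(0);
+ void* block = rpmalloc(128);
+ // The usable size is at least the requested size, possibly more
+ memset(block, 0, rpmalloc_usable_size(block));
+ block = rprealloc(block, 4096);
+ rpfree(block);
+ rpmalloc_finalize();
+ return 0;
+}
+#endif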
+
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+//! Heap type
+typedef struct heap_t rpmalloc_heap_t;
+
+//! Acquire a new heap. Will reuse existing released heaps, or allocate memory for a new heap
+// if none is available. The heap API is implemented under the strict assumption that only a
+// single thread will call heap functions for a given heap at any given time; no functions are thread safe.
+RPMALLOC_EXPORT rpmalloc_heap_t*
+rpmalloc_heap_acquire(void);
+
+//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the
+//! heap).
+// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer.
+RPMALLOC_EXPORT void
+rpmalloc_heap_release(rpmalloc_heap_t* heap);
+
+//! Allocate a memory block of at least the given size using the given heap.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size using the given heap. The returned
+// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC
+ RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC
+ RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned
+// block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of
+// sizeof(void*), and should ideally be less than memory page size. A caveat of rpmalloc internals is that this must
+// also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC
+ RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+// by the same heap given to this function.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC
+ RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+// by the same heap given to this function. The returned block will have the requested alignment.
+// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be
+// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than
+// the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size,
+ unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4);
+
+//! Free the given memory block from the given heap. The memory block MUST be allocated
+// by the same heap given to this function.
+RPMALLOC_EXPORT void
+rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr);
+
+//! Free all memory allocated by the heap
+RPMALLOC_EXPORT void
+rpmalloc_heap_free_all(rpmalloc_heap_t* heap);
+
+//! Set the given heap as the current heap for the calling thread. A heap MUST only be the current heap
+// for a single thread; a heap can never be shared between multiple threads. The previous
+// current heap for the calling thread is released and made available for reuse by other threads.
+RPMALLOC_EXPORT void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap);
+
+//! Returns which heap the given pointer is allocated on
+RPMALLOC_EXPORT rpmalloc_heap_t*
+rpmalloc_get_heap_for_ptr(void* ptr);
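+
+//! Illustrative sketch: using a first class heap as an arena where all blocks
+// are released in a single call instead of being freed individually. Requires
+// building with RPMALLOC_FIRST_CLASS_HEAPS=1; the function name is a placeholder.
+#if 0
+static void
+heap_arena_example(void) {
+ rpmalloc_heap_t* heap = rpmalloc_heap_acquire();
+ void* a = rpmalloc_heap_alloc(heap, 64);
+ void* b = rpmalloc_heap_aligned_alloc(heap, 256, 1024);
+ (void)a;
+ (void)b;
+ // Free every block owned by the heap, then recycle the heap for reuse
+ rpmalloc_heap_free_all(heap);
+ rpmalloc_heap_release(heap);
+}
+#endif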
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/thirdparty/rpmalloc/rpnew.h b/thirdparty/rpmalloc/rpnew.h
new file mode 100644
index 000000000..75d381a3f
--- /dev/null
+++ b/thirdparty/rpmalloc/rpnew.h
@@ -0,0 +1,111 @@
+
+#ifdef __cplusplus
+
+#include <new>
+#include <rpmalloc.h>
+
+#ifndef __CRTDECL
+#define __CRTDECL
+#endif
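+
+// Note that these are non-inline definitions that replace the global operator
+// new/delete for the whole binary, so this header should be included from
+// exactly one translation unit to avoid duplicate symbol errors.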
+
+extern void __CRTDECL
+operator delete(void* p) noexcept {
+ rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p) noexcept {
+ rpfree(p);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size) noexcept(false) {
+ return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size) noexcept(false) {
+ return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, const std::nothrow_t& tag) noexcept {
+ (void)sizeof(tag);
+ return rpmalloc(size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, const std::nothrow_t& tag) noexcept {
+ (void)sizeof(tag);
+ return rpmalloc(size);
+}
+
+#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+
+extern void __CRTDECL
+operator delete(void* p, std::size_t size) noexcept {
+ (void)sizeof(size);
+ rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::size_t size) noexcept {
+ (void)sizeof(size);
+ rpfree(p);
+}
+
+#endif
+
+#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
+
+extern void __CRTDECL
+operator delete(void* p, std::align_val_t align) noexcept {
+ (void)sizeof(align);
+ rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::align_val_t align) noexcept {
+ (void)sizeof(align);
+ rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete(void* p, std::size_t size, std::align_val_t align) noexcept {
+ (void)sizeof(size);
+ (void)sizeof(align);
+ rpfree(p);
+}
+
+extern void __CRTDECL
+operator delete[](void* p, std::size_t size, std::align_val_t align) noexcept {
+ (void)sizeof(size);
+ (void)sizeof(align);
+ rpfree(p);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, std::align_val_t align) noexcept(false) {
+ return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, std::align_val_t align) noexcept(false) {
+ return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void* __CRTDECL
+operator new(std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept {
+ (void)sizeof(tag);
+ return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void* __CRTDECL
+operator new[](std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept {
+ (void)sizeof(tag);
+ return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+#endif
+
+#endif