Diffstat (limited to 'thirdparty/rpmalloc/rpmalloc.c')
| -rw-r--r-- | thirdparty/rpmalloc/rpmalloc.c | 2341 |
1 file changed, 2341 insertions, 0 deletions
diff --git a/thirdparty/rpmalloc/rpmalloc.c b/thirdparty/rpmalloc/rpmalloc.c new file mode 100644 index 000000000..7aecfb0f4 --- /dev/null +++ b/thirdparty/rpmalloc/rpmalloc.c @@ -0,0 +1,2341 @@ +/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias + * Jansson + * + * This library provides a cross-platform lock free thread caching malloc + * implementation in C11. The latest source code is always available at + * + * https://github.com/mjansson/rpmalloc + * + * This library is put in the public domain; you can redistribute it and/or + * modify it without any restrictions. + * + */ + +#include "rpmalloc.h" + +#include <errno.h> +#include <string.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdatomic.h> + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#if __has_warning("-Wstatic-in-inline") +#pragma clang diagnostic ignored "-Wstatic-in-inline" +#endif +#if __has_warning("-Wunsafe-buffer-usage") +#pragma clang diagnostic ignored "-Wunsafe-buffer-usage" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) +#define PLATFORM_WINDOWS 1 +#define PLATFORM_POSIX 0 +#else +#define PLATFORM_WINDOWS 0 +#define PLATFORM_POSIX 1 +#endif + +#if defined(_MSC_VER) +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE __attribute__((noinline)) +#endif + +#if PLATFORM_WINDOWS +#include <windows.h> +#include <fibersapi.h> +static DWORD fls_key; +#endif +#if PLATFORM_POSIX +#include <sys/mman.h> +#include <sched.h> +#include <unistd.h> +#include <pthread.h> +static pthread_key_t pthread_key; +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#define MAP_HUGETLB MAP_ALIGNED_SUPER +#ifndef PROT_MAX +#define PROT_MAX(f) 0 +#endif +#else +#define PROT_MAX(f) 0 +#endif +#ifdef __sun +extern int +madvise(caddr_t, size_t, int); +#endif +#ifndef MAP_UNINITIALIZED +#define MAP_UNINITIALIZED 0 +#endif +#endif + +#if defined(__linux__) || defined(__ANDROID__) +#include <sys/prctl.h> +#if !defined(PR_SET_VMA) +#define PR_SET_VMA 0x53564d41 +#define PR_SET_VMA_ANON_NAME 0 +#endif +#endif +#if defined(__APPLE__) +#include <TargetConditionals.h> +#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR +#include <mach/mach_vm.h> +#include <mach/vm_statistics.h> +#endif +#include <pthread.h> +#endif +#if defined(__HAIKU__) || defined(__TINYC__) +#include <pthread.h> +#endif + +#include <limits.h> +#if (INTPTR_MAX > INT32_MAX) +#define ARCH_64BIT 1 +#define ARCH_32BIT 0 +#else +#define ARCH_64BIT 0 +#define ARCH_32BIT 1 +#endif + +#if !defined(__has_builtin) +#define __has_builtin(b) 0 +#endif + +#define pointer_offset(ptr, ofs) (void*)((char*)(ptr) + (ptrdiff_t)(ofs)) +#define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) + +//////////// +/// +/// Build time configurable limits +/// +////// + +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_UNMAP +//! Enable unmapping memory pages +#define ENABLE_UNMAP 1 +#endif +#ifndef ENABLE_DECOMMIT +//! 
Enable decommitting memory pages +#define ENABLE_DECOMMIT 1 +#endif +#ifndef ENABLE_DYNAMIC_LINK +//! Enable building as dynamic library +#define ENABLE_DYNAMIC_LINK 0 +#endif +#ifndef ENABLE_OVERRIDE +//! Enable standard library malloc/free/new/delete overrides +#define ENABLE_OVERRIDE 1 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics +#define ENABLE_STATISTICS 0 +#endif + +//////////// +/// +/// Built in size configurations +/// +////// + +#define PAGE_HEADER_SIZE 128 +#define SPAN_HEADER_SIZE PAGE_HEADER_SIZE + +#define SMALL_GRANULARITY 16 + +#define SMALL_BLOCK_SIZE_LIMIT (4 * 1024) +#define MEDIUM_BLOCK_SIZE_LIMIT (256 * 1024) +#define LARGE_BLOCK_SIZE_LIMIT (8 * 1024 * 1024) + +#define SMALL_SIZE_CLASS_COUNT 73 +#define MEDIUM_SIZE_CLASS_COUNT 24 +#define LARGE_SIZE_CLASS_COUNT 20 +#define SIZE_CLASS_COUNT (SMALL_SIZE_CLASS_COUNT + MEDIUM_SIZE_CLASS_COUNT + LARGE_SIZE_CLASS_COUNT) + +#define SMALL_PAGE_SIZE_SHIFT 16 +#define SMALL_PAGE_SIZE (1 << SMALL_PAGE_SIZE_SHIFT) +#define SMALL_PAGE_MASK (~((uintptr_t)SMALL_PAGE_SIZE - 1)) +#define MEDIUM_PAGE_SIZE_SHIFT 22 +#define MEDIUM_PAGE_SIZE (1 << MEDIUM_PAGE_SIZE_SHIFT) +#define MEDIUM_PAGE_MASK (~((uintptr_t)MEDIUM_PAGE_SIZE - 1)) +#define LARGE_PAGE_SIZE_SHIFT 26 +#define LARGE_PAGE_SIZE (1 << LARGE_PAGE_SIZE_SHIFT) +#define LARGE_PAGE_MASK (~((uintptr_t)LARGE_PAGE_SIZE - 1)) + +#define SPAN_SIZE (256 * 1024 * 1024) +#define SPAN_MASK (~((uintptr_t)(SPAN_SIZE - 1))) + +//////////// +/// +/// Utility macros +/// +////// + +#if ENABLE_ASSERTS +#undef NDEBUG +#if defined(_MSC_VER) && !defined(_DEBUG) +#define _DEBUG +#endif +#include <assert.h> +#define RPMALLOC_TOSTRING_M(x) #x +#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) +#define rpmalloc_assert(truth, message) \ + do { \ + if (!(truth)) { \ + assert((truth) && message); \ + } \ + } while (0) +#else +#define rpmalloc_assert(truth, message) \ + do { \ + } while (0) +#endif + +#if __has_builtin(__builtin_assume) +#define rpmalloc_assume(cond) __builtin_assume(cond) +#elif defined(__GNUC__) +#define rpmalloc_assume(cond) \ + do { \ + if (!__builtin_expect(cond, 0)) \ + __builtin_unreachable(); \ + } while (0) +#elif defined(_MSC_VER) +#define rpmalloc_assume(cond) __assume(cond) +#else +#define rpmalloc_assume(cond) 0 +#endif + +//////////// +/// +/// Statistics +/// +////// + +#if ENABLE_STATISTICS + +typedef struct rpmalloc_statistics_t { + atomic_size_t page_mapped; + atomic_size_t page_mapped_peak; + atomic_size_t page_commit; + atomic_size_t page_decommit; + atomic_size_t page_active; + atomic_size_t page_active_peak; + atomic_size_t heap_count; +} rpmalloc_statistics_t; + +static rpmalloc_statistics_t global_statistics; + +#else + +#endif + +//////////// +/// +/// Low level abstractions +/// +////// + +static inline size_t +rpmalloc_clz(uintptr_t x) { +#if ARCH_64BIT +#if defined(_MSC_VER) && !defined(__clang__) + return (size_t)_lzcnt_u64(x); +#else + return (size_t)__builtin_clzll(x); +#endif +#else +#if defined(_MSC_VER) && !defined(__clang__) + return (size_t)_lzcnt_u32(x); +#else + return (size_t)__builtin_clzl(x); +#endif +#endif +} + +static inline void +wait_spin(void) { +#if defined(_MSC_VER) +#if defined(_M_ARM64) + __yield(); +#else + _mm_pause(); +#endif +#elif defined(__x86_64__) || defined(__i386__) + __asm__ volatile("pause" ::: "memory"); +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) + __asm__ volatile("yield" ::: "memory"); +#elif defined(__powerpc__) || defined(__powerpc64__) + // No idea if ever been compiled in such 
archs but ... as precaution + __asm__ volatile("or 27,27,27"); +#elif defined(__sparc__) + __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); +#else + struct timespec ts = {0}; + nanosleep(&ts, 0); +#endif +} + +#if defined(__GNUC__) || defined(__clang__) + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 0) + +#else + +#define EXPECTED(x) x +#define UNEXPECTED(x) x + +#endif +#if defined(__GNUC__) || defined(__clang__) + +#if __has_builtin(__builtin_memcpy_inline) +#define memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) +#else +#define memcpy_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), "len must be a constant integer"); \ + memcpy(x, y, s); \ + } while (0) +#endif + +#if __has_builtin(__builtin_memset_inline) +#define memset_const(x, y, s) __builtin_memset_inline(x, y, s) +#else +#define memset_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), "len must be a constant integer"); \ + memset(x, y, s); \ + } while (0) +#endif +#else +#define memcpy_const(x, y, s) memcpy(x, y, s) +#define memset_const(x, y, s) memset(x, y, s) +#endif + +//////////// +/// +/// Data types +/// +////// + +//! A memory heap, per thread +typedef struct heap_t heap_t; +//! Span of memory pages +typedef struct span_t span_t; +//! Memory page +typedef struct page_t page_t; +//! Memory block +typedef struct block_t block_t; +//! Size class for a memory block +typedef struct size_class_t size_class_t; + +//! Memory page type +typedef enum page_type_t { + PAGE_SMALL, // 64KiB + PAGE_MEDIUM, // 4MiB + PAGE_LARGE, // 64MiB + PAGE_HUGE +} page_type_t; + +//! Block size class +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint32_t block_count; +}; + +//! A memory block +struct block_t { + //! Next block in list + block_t* next; +}; + +//! A page contains blocks of a given size +struct page_t { + //! Size class of blocks + uint32_t size_class; + //! Block size + uint32_t block_size; + //! Block count + uint32_t block_count; + //! Block initialized count + uint32_t block_initialized; + //! Block used count + uint32_t block_used; + //! Page type + page_type_t page_type; + //! Flag set if part of heap full list + uint32_t is_full : 1; + //! Flag set if part of heap free list + uint32_t is_free : 1; + //! Flag set if blocks are zero initialied + uint32_t is_zero : 1; + //! Flag set if memory pages have been decommitted + uint32_t is_decommitted : 1; + //! Flag set if containing aligned blocks + uint32_t has_aligned_block : 1; + //! Fast combination flag for either huge, fully allocated or has aligned blocks + uint32_t generic_free : 1; + //! Local free list count + uint32_t local_free_count; + //! Local free list + block_t* local_free; + //! Owning heap + heap_t* heap; + //! Next page in list + page_t* next; + //! Previous page in list + page_t* prev; + //! Multithreaded free list, block index is in low 32 bit, list count is high 32 bit + atomic_ullong thread_free; +}; + +//! A span contains pages of a given type +struct span_t { + //! Page header + page_t page; + //! Owning heap + heap_t* heap; + //! Page address mask + uintptr_t page_address_mask; + //! Number of pages initialized + uint32_t page_initialized; + //! Number of pages in use + uint32_t page_count; + //! Number of bytes per page + uint32_t page_size; + //! Page type + page_type_t page_type; + //! 
Offset to start of mapped memory region + uint32_t offset; + //! Mapped size + uint64_t mapped_size; + //! Next span in list + span_t* next; +}; + +// Control structure for a heap, either a thread heap or a first class heap if enabled +struct heap_t { + //! Owning thread ID + uintptr_t owner_thread; + //! Heap local free list for small size classes + block_t* local_free[SIZE_CLASS_COUNT]; + //! Available non-full pages for each size class + page_t* page_available[SIZE_CLASS_COUNT]; + //! Free pages for each page type + page_t* page_free[3]; + //! Free but still committed page count for each page tyoe + uint32_t page_free_commit_count[3]; + //! Multithreaded free list + atomic_uintptr_t thread_free[3]; + //! Available partially initialized spans for each page type + span_t* span_partial[3]; + //! Spans in full use for each page type + span_t* span_used[4]; + //! Next heap in queue + heap_t* next; + //! Previous heap in queue + heap_t* prev; + //! Heap ID + uint32_t id; + //! Finalization state flag + uint32_t finalize; + //! Memory map region offset + uint32_t offset; + //! Memory map size + size_t mapped_size; +}; + +_Static_assert(sizeof(page_t) <= PAGE_HEADER_SIZE, "Invalid page header size"); +_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "Invalid span header size"); +_Static_assert(sizeof(heap_t) <= 4096, "Invalid heap size"); + +//////////// +/// +/// Global data +/// +////// + +//! Fallback heap +static RPMALLOC_CACHE_ALIGNED heap_t global_heap_fallback; +//! Default heap +static heap_t* global_heap_default = &global_heap_fallback; +//! Available heaps +static heap_t* global_heap_queue; +//! In use heaps +static heap_t* global_heap_used; +//! Lock for heap queue +static atomic_uintptr_t global_heap_lock; +//! Heap ID counter +static atomic_uint global_heap_id = 1; +//! Initialized flag +static int global_rpmalloc_initialized; +//! Memory interface +static rpmalloc_interface_t* global_memory_interface; +//! Default memory interface +static rpmalloc_interface_t global_memory_interface_default; +//! Current configuration +static rpmalloc_config_t global_config = {0}; +//! Main thread ID +static uintptr_t global_main_thread_id; + +//! 
Size classes +#define SCLASS(n) \ + { (n * SMALL_GRANULARITY), (SMALL_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) } +#define MCLASS(n) \ + { (n * SMALL_GRANULARITY), (MEDIUM_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) } +#define LCLASS(n) \ + { (n * SMALL_GRANULARITY), (LARGE_PAGE_SIZE - PAGE_HEADER_SIZE) / (n * SMALL_GRANULARITY) } +static const size_class_t global_size_class[SIZE_CLASS_COUNT] = { + SCLASS(1), SCLASS(1), SCLASS(2), SCLASS(3), SCLASS(4), SCLASS(5), SCLASS(6), + SCLASS(7), SCLASS(8), SCLASS(9), SCLASS(10), SCLASS(11), SCLASS(12), SCLASS(13), + SCLASS(14), SCLASS(15), SCLASS(16), SCLASS(17), SCLASS(18), SCLASS(19), SCLASS(20), + SCLASS(21), SCLASS(22), SCLASS(23), SCLASS(24), SCLASS(25), SCLASS(26), SCLASS(27), + SCLASS(28), SCLASS(29), SCLASS(30), SCLASS(31), SCLASS(32), SCLASS(33), SCLASS(34), + SCLASS(35), SCLASS(36), SCLASS(37), SCLASS(38), SCLASS(39), SCLASS(40), SCLASS(41), + SCLASS(42), SCLASS(43), SCLASS(44), SCLASS(45), SCLASS(46), SCLASS(47), SCLASS(48), + SCLASS(49), SCLASS(50), SCLASS(51), SCLASS(52), SCLASS(53), SCLASS(54), SCLASS(55), + SCLASS(56), SCLASS(57), SCLASS(58), SCLASS(59), SCLASS(60), SCLASS(61), SCLASS(62), + SCLASS(63), SCLASS(64), SCLASS(80), SCLASS(96), SCLASS(112), SCLASS(128), SCLASS(160), + SCLASS(192), SCLASS(224), SCLASS(256), MCLASS(320), MCLASS(384), MCLASS(448), MCLASS(512), + MCLASS(640), MCLASS(768), MCLASS(896), MCLASS(1024), MCLASS(1280), MCLASS(1536), MCLASS(1792), + MCLASS(2048), MCLASS(2560), MCLASS(3072), MCLASS(3584), MCLASS(4096), MCLASS(5120), MCLASS(6144), + MCLASS(7168), MCLASS(8192), MCLASS(10240), MCLASS(12288), MCLASS(14336), MCLASS(16384), LCLASS(20480), + LCLASS(24576), LCLASS(28672), LCLASS(32768), LCLASS(40960), LCLASS(49152), LCLASS(57344), LCLASS(65536), + LCLASS(81920), LCLASS(98304), LCLASS(114688), LCLASS(131072), LCLASS(163840), LCLASS(196608), LCLASS(229376), + LCLASS(262144), LCLASS(327680), LCLASS(393216), LCLASS(458752), LCLASS(524288)}; + +//! Threshold number of pages for when free pages are decommitted +static uint32_t global_page_free_overflow[4] = {16, 8, 2, 0}; + +//! Number of pages to retain when free page threshold overflows +static uint32_t global_page_free_retain[4] = {4, 2, 1, 0}; + +//! OS huge page support +static int os_huge_pages; +//! OS memory map granularity +static size_t os_map_granularity; +//! OS memory page size +static size_t os_page_size; + +//////////// +/// +/// Thread local heap and ID +/// +////// + +//! Current thread heap +#if defined(_MSC_VER) && !defined(__clang__) +#define TLS_MODEL +#define _Thread_local __declspec(thread) +#else +// #define TLS_MODEL __attribute__((tls_model("initial-exec"))) +#define TLS_MODEL +#endif +static _Thread_local heap_t* global_thread_heap TLS_MODEL = &global_heap_fallback; + +static heap_t* +heap_allocate(int first_class); + +static void +heap_page_free_decommit(heap_t* heap, uint32_t page_type, uint32_t page_retain_count); + +//! 
Fast thread ID +static inline uintptr_t +get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)((void*)NtCurrentTeb()); +#else + void* thp = __builtin_thread_pointer(); + return (uintptr_t)thp; +#endif + /* + #elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) + uintptr_t tid; + #if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r"(tid) : :); + #elif defined(__x86_64__) + #if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r"(tid) : :); + #else + __asm__("movq %%fs:0, %0" : "=r"(tid) : :); + #endif + #elif defined(__arm__) + __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); + #elif defined(__aarch64__) + #if defined(__MACH__) + // tpidr_el0 likely unused, always return 0 on iOS + __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); + #else + __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); + #endif + #else + #error This platform needs implementation of get_thread_id() + #endif + return tid; + #else + #error This platform needs implementation of get_thread_id() + #endif + */ +} + +//! Set the current thread heap +static void +set_thread_heap(heap_t* heap) { + global_thread_heap = heap; + if (heap && (heap->id != 0)) { + rpmalloc_assert(heap->id != 0, "Default heap being used"); + heap->owner_thread = get_thread_id(); + } +#if PLATFORM_WINDOWS + FlsSetValue(fls_key, heap); +#else + pthread_setspecific(pthread_key, heap); +#endif +} + +static heap_t* +get_thread_heap_allocate(void) { + heap_t* heap = heap_allocate(0); + set_thread_heap(heap); + return heap; +} + +//! Get the current thread heap +static inline heap_t* +get_thread_heap(void) { + return global_thread_heap; +} + +//! Get the size class from given size in bytes for tiny blocks (below 16 times the minimum granularity) +static inline uint32_t +get_size_class_tiny(size_t size) { + return (((uint32_t)size + (SMALL_GRANULARITY - 1)) / SMALL_GRANULARITY); +} + +//! Get the size class from given size in bytes +static inline uint32_t +get_size_class(size_t size) { + uintptr_t minblock_count = (size + (SMALL_GRANULARITY - 1)) / SMALL_GRANULARITY; + // For sizes up to 64 times the minimum granularity (i.e 1024 bytes) the size class is equal to number of such + // blocks + if (size <= (SMALL_GRANULARITY * 64)) { + rpmalloc_assert(global_size_class[minblock_count].block_size >= size, "Size class misconfiguration"); + return (uint32_t)(minblock_count ? minblock_count : 1); + } + --minblock_count; + // Calculate position of most significant bit, since minblock_count now guaranteed to be > 64 this position is + // guaranteed to be >= 6 +#if ARCH_64BIT + const uint32_t most_significant_bit = (uint32_t)(63 - (int)rpmalloc_clz(minblock_count)); +#else + const uint32_t most_significant_bit = (uint32_t)(31 - (int)rpmalloc_clz(minblock_count)); +#endif + // Class sizes are of the bit format [..]000xxx000[..] 
where we already have the position of the most significant + // bit, now calculate the subclass from the remaining two bits + const uint32_t subclass_bits = (minblock_count >> (most_significant_bit - 2)) & 0x03; + const uint32_t class_idx = (uint32_t)((most_significant_bit << 2) + subclass_bits) + 41; + rpmalloc_assert((class_idx >= SIZE_CLASS_COUNT) || (global_size_class[class_idx].block_size >= size), + "Size class misconfiguration"); + rpmalloc_assert((class_idx >= SIZE_CLASS_COUNT) || (global_size_class[class_idx - 1].block_size < size), + "Size class misconfiguration"); + return class_idx; +} + +static inline page_type_t +get_page_type(uint32_t size_class) { + if (size_class < SMALL_SIZE_CLASS_COUNT) + return PAGE_SMALL; + else if (size_class < (SMALL_SIZE_CLASS_COUNT + MEDIUM_SIZE_CLASS_COUNT)) + return PAGE_MEDIUM; + else if (size_class < SIZE_CLASS_COUNT) + return PAGE_LARGE; + return PAGE_HUGE; +} + +static inline size_t +get_page_aligned_size(size_t size) { + size_t unalign = size % global_config.page_size; + if (unalign) + size += global_config.page_size - unalign; + return size; +} + +//////////// +/// +/// OS entry points +/// +////// + +static void +os_set_page_name(void* address, size_t size) { +#if defined(__linux__) || defined(__ANDROID__) + const char* name = os_huge_pages ? global_config.huge_page_name : global_config.page_name; + if ((address == MAP_FAILED) || !name) + return; + // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails + // (e.g. invalid name) it is a no-op basically. + (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, (uintptr_t)name); +#else + (void)sizeof(size); + (void)sizeof(address); +#endif +} + +static void* +os_mmap(size_t size, size_t alignment, size_t* offset, size_t* mapped_size) { + size_t map_size = size + alignment; +#if PLATFORM_WINDOWS + // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses + // are actually accessed". But if we enable decommit it's better to not immediately commit and instead commit per + // page to avoid saturating the OS commit limit +#if ENABLE_DECOMMIT + DWORD do_commit = 0; +#else + DWORD do_commit = MEM_COMMIT; +#endif + void* ptr = + VirtualAlloc(0, map_size, (os_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | do_commit, PAGE_READWRITE); +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR + int fd = (int)VM_MAKE_TAG(240U); + if (os_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, fd, 0); +#elif defined(MAP_HUGETLB) + void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), + (os_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +#if defined(MADV_HUGEPAGE) + // In some configurations, huge pages allocations might fail thus + // we fallback to normal allocations and promote the region as transparent huge page + if ((ptr == MAP_FAILED || !ptr) && os_huge_pages) { + ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr && ptr != MAP_FAILED) { + int prm = madvise(ptr, size, MADV_HUGEPAGE); + (void)prm; + rpmalloc_assert((prm == 0), "Failed to promote the page to transparent huge page"); + } + } +#endif + os_set_page_name(ptr, map_size); +#elif defined(MAP_ALIGNED) + const size_t align = (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); + void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, (os_huge_pages ? 
MAP_ALIGNED(align) : 0) | flags, -1, 0); +#elif defined(MAP_ALIGN) + caddr_t base = (os_huge_pages ? (caddr_t)(4 << 20) : 0); + void* ptr = mmap(base, map_size, PROT_READ | PROT_WRITE, (os_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); +#else + void* ptr = mmap(0, map_size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (ptr == MAP_FAILED) + ptr = 0; +#endif + if (!ptr) { + if (global_memory_interface->map_fail_callback) { + if (global_memory_interface->map_fail_callback(map_size)) + return os_mmap(size, alignment, offset, mapped_size); + } else { + rpmalloc_assert(ptr != 0, "Failed to map more virtual memory"); + } + return 0; + } + if (alignment) { + size_t padding = ((uintptr_t)ptr & (uintptr_t)(alignment - 1)); + if (padding) + padding = alignment - padding; + rpmalloc_assert(padding <= alignment, "Internal failure in padding"); + rpmalloc_assert(!(padding % 8), "Internal failure in padding"); + ptr = pointer_offset(ptr, padding); + *offset = padding; + } + *mapped_size = map_size; +#if ENABLE_STATISTICS + size_t page_count = map_size / global_config.page_size; + size_t page_mapped_current = + atomic_fetch_add_explicit(&global_statistics.page_mapped, page_count, memory_order_relaxed) + page_count; + size_t page_mapped_peak = atomic_load_explicit(&global_statistics.page_mapped_peak, memory_order_relaxed); + while (page_mapped_current > page_mapped_peak) { + if (atomic_compare_exchange_weak_explicit(&global_statistics.page_mapped_peak, &page_mapped_peak, + page_mapped_current, memory_order_relaxed, memory_order_relaxed)) + break; + } +#if ENABLE_DECOMMIT + size_t page_active_current = + atomic_fetch_add_explicit(&global_statistics.page_active, page_count, memory_order_relaxed) + page_count; + size_t page_active_peak = atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed); + while (page_active_current > page_active_peak) { + if (atomic_compare_exchange_weak_explicit(&global_statistics.page_active_peak, &page_active_peak, + page_active_current, memory_order_relaxed, memory_order_relaxed)) + break; + } +#endif +#endif + return ptr; +} + +static void +os_mcommit(void* address, size_t size) { +#if ENABLE_DECOMMIT + if (global_config.disable_decommit) + return; +#if PLATFORM_WINDOWS + if (!VirtualAlloc(address, size, MEM_COMMIT, PAGE_READWRITE)) { + rpmalloc_assert(0, "Failed to commit virtual memory block"); + } +#else + /* + if (mprotect(address, size, PROT_READ | PROT_WRITE)) { + rpmalloc_assert(0, "Failed to commit virtual memory block"); + } + */ +#endif +#if ENABLE_STATISTICS + size_t page_count = size / global_config.page_size; + atomic_fetch_add_explicit(&global_statistics.page_commit, page_count, memory_order_relaxed); + size_t page_active_current = + atomic_fetch_add_explicit(&global_statistics.page_active, page_count, memory_order_relaxed) + page_count; + size_t page_active_peak = atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed); + while (page_active_current > page_active_peak) { + if (atomic_compare_exchange_weak_explicit(&global_statistics.page_active_peak, &page_active_peak, + page_active_current, memory_order_relaxed, memory_order_relaxed)) + break; + } +#endif +#endif + (void)sizeof(address); + (void)sizeof(size); +} + +static void +os_mdecommit(void* address, size_t size) { +#if ENABLE_DECOMMIT + if (global_config.disable_decommit) + return; +#if PLATFORM_WINDOWS + if (!VirtualFree(address, size, MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to decommit virtual memory block"); + } +#else + /* + if (mprotect(address, 
size, PROT_NONE)) { + rpmalloc_assert(0, "Failed to decommit virtual memory block"); + } + */ +#if defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to decommit virtual memory block"); + } +#endif +#if ENABLE_STATISTICS + size_t page_count = size / global_config.page_size; + atomic_fetch_add_explicit(&global_statistics.page_decommit, page_count, memory_order_relaxed); + size_t page_active_current = + atomic_fetch_sub_explicit(&global_statistics.page_active, page_count, memory_order_relaxed); + rpmalloc_assert(page_active_current >= page_count, "Decommit counter out of sync"); + (void)sizeof(page_active_current); +#endif +#else + (void)sizeof(address); + (void)sizeof(size); +#endif +} + +static void +os_munmap(void* address, size_t offset, size_t mapped_size) { + (void)sizeof(mapped_size); + address = pointer_offset(address, -(int32_t)offset); +#if ENABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, 0, MEM_RELEASE)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (munmap(address, mapped_size)) + rpmalloc_assert(0, "Failed to unmap virtual memory block"); +#endif +#if ENABLE_STATISTICS + size_t page_count = mapped_size / global_config.page_size; + atomic_fetch_sub_explicit(&global_statistics.page_mapped, page_count, memory_order_relaxed); + atomic_fetch_sub_explicit(&global_statistics.page_active, page_count, memory_order_relaxed); +#endif +#endif +} + +//////////// +/// +/// Page interface +/// +////// + +static inline span_t* +page_get_span(page_t* page) { + return (span_t*)((uintptr_t)page & SPAN_MASK); +} + +static inline size_t +page_get_size(page_t* page) { + if (page->page_type == PAGE_SMALL) + return SMALL_PAGE_SIZE; + else if (page->page_type == PAGE_MEDIUM) + return MEDIUM_PAGE_SIZE; + else if (page->page_type == PAGE_LARGE) + return LARGE_PAGE_SIZE; + else + return page_get_span(page)->page_size; +} + +static inline int +page_is_thread_heap(page_t* page) { +#if RPMALLOC_FIRST_CLASS_HEAPS + return (!page->heap->owner_thread || (page->heap->owner_thread == get_thread_id())); +#else + return (page->heap->owner_thread == get_thread_id()); +#endif +} + +static inline block_t* +page_block_start(page_t* page) { + return pointer_offset(page, PAGE_HEADER_SIZE); +} + +static inline block_t* +page_block(page_t* page, uint32_t block_index) { + return pointer_offset(page, PAGE_HEADER_SIZE + (page->block_size * block_index)); +} + +static inline uint32_t +page_block_index(page_t* page, block_t* block) { + block_t* block_first = page_block_start(page); + return (uint32_t)pointer_diff(block, block_first) / page->block_size; +} + +static inline uint32_t +page_block_from_thread_free_list(page_t* page, uint64_t token, block_t** block) { + uint32_t block_index = (uint32_t)(token & 0xFFFFFFFFULL); + uint32_t list_count = (uint32_t)((token >> 32ULL) & 0xFFFFFFFFULL); + *block = list_count ? 
page_block(page, block_index) : 0; + return list_count; +} + +static inline uint64_t +page_block_to_thread_free_list(page_t* page, uint32_t block_index, uint32_t list_count) { + (void)sizeof(page); + return ((uint64_t)list_count << 32ULL) | (uint64_t)block_index; +} + +static inline block_t* +page_block_realign(page_t* page, block_t* block) { + void* blocks_start = page_block_start(page); + uint32_t block_offset = (uint32_t)pointer_diff(block, blocks_start); + return pointer_offset(block, -(int32_t)(block_offset % page->block_size)); +} + +static block_t* +page_get_local_free_block(page_t* page) { + block_t* block = page->local_free; + page->local_free = block->next; + --page->local_free_count; + ++page->block_used; + return block; +} + +static inline void +page_decommit_memory_pages(page_t* page) { + if (page->is_decommitted) + return; + void* extra_page = pointer_offset(page, global_config.page_size); + size_t extra_page_size = page_get_size(page) - global_config.page_size; + global_memory_interface->memory_decommit(extra_page, extra_page_size); + page->is_decommitted = 1; +} + +static inline void +page_commit_memory_pages(page_t* page) { + if (!page->is_decommitted) + return; + void* extra_page = pointer_offset(page, global_config.page_size); + size_t extra_page_size = page_get_size(page) - global_config.page_size; + global_memory_interface->memory_commit(extra_page, extra_page_size); + page->is_decommitted = 0; +#if ENABLE_DECOMMIT +#if !defined(__APPLE__) + // When page is recommitted, the blocks in the second memory page and forward + // will be zeroed out by OS - take advantage in zalloc/calloc calls and make sure + // blocks in first page is zeroed out + void* first_page = pointer_offset(page, PAGE_HEADER_SIZE); + memset(first_page, 0, global_config.page_size - PAGE_HEADER_SIZE); + page->is_zero = 1; +#endif +#endif +} + +static void +page_available_to_free(page_t* page) { + rpmalloc_assert(page->is_full == 0, "Page full flag internal failure"); + rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure"); + heap_t* heap = page->heap; + if (heap->page_available[page->size_class] == page) { + heap->page_available[page->size_class] = page->next; + } else { + page->prev->next = page->next; + if (page->next) + page->next->prev = page->prev; + } + page->is_free = 1; + page->is_zero = 0; + page->next = heap->page_free[page->page_type]; + heap->page_free[page->page_type] = page; + if (++heap->page_free_commit_count[page->page_type] >= global_page_free_overflow[page->page_type]) + heap_page_free_decommit(heap, page->page_type, global_page_free_retain[page->page_type]); +} + +static void +page_full_to_available(page_t* page) { + rpmalloc_assert(page->is_full == 1, "Page full flag internal failure"); + rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure"); + heap_t* heap = page->heap; + page->next = heap->page_available[page->size_class]; + if (page->next) + page->next->prev = page; + heap->page_available[page->size_class] = page; + page->is_full = 0; + if (page->has_aligned_block == 0) + page->generic_free = 0; +} + +static void +page_full_to_free_on_new_heap(page_t* page, heap_t* heap) { + rpmalloc_assert(heap->id, "Page full to free on default heap"); + rpmalloc_assert(page->is_full == 1, "Page full flag internal failure"); + rpmalloc_assert(page->is_decommitted == 0, "Page decommitted flag internal failure"); + page->is_full = 0; + page->is_free = 1; + page->heap = heap; + atomic_store_explicit(&page->thread_free, 0, 
memory_order_relaxed); + page->next = heap->page_free[page->page_type]; + heap->page_free[page->page_type] = page; + if (++heap->page_free_commit_count[page->page_type] >= global_page_free_overflow[page->page_type]) + heap_page_free_decommit(heap, page->page_type, global_page_free_retain[page->page_type]); +} + +static void +page_available_to_full(page_t* page) { + heap_t* heap = page->heap; + if (heap->page_available[page->size_class] == page) { + heap->page_available[page->size_class] = page->next; + } else { + page->prev->next = page->next; + if (page->next) + page->next->prev = page->prev; + } + page->is_full = 1; + page->is_zero = 0; + page->generic_free = 1; +} + +static inline void +page_put_local_free_block(page_t* page, block_t* block) { + block->next = page->local_free; + page->local_free = block; + ++page->local_free_count; + if (UNEXPECTED(--page->block_used == 0)) { + page_available_to_free(page); + } else if (UNEXPECTED(page->is_full != 0)) { + page_full_to_available(page); + } +} + +static NOINLINE void +page_adopt_thread_free_block_list(page_t* page) { + if (page->local_free) + return; + unsigned long long thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed); + if (thread_free != 0) { + // Other threads can only replace with another valid list head, this will never change to 0 in other threads + while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &thread_free, 0, memory_order_relaxed, + memory_order_relaxed)) + wait_spin(); + page->local_free_count = page_block_from_thread_free_list(page, thread_free, &page->local_free); + rpmalloc_assert(page->local_free_count <= page->block_used, "Page thread free list count internal failure"); + page->block_used -= page->local_free_count; + } +} + +static NOINLINE void +page_put_thread_free_block(page_t* page, block_t* block) { + atomic_thread_fence(memory_order_acquire); + if (page->is_full) { + // Page is full, put the block in the heap thread free list instead, otherwise + // the heap will not pick up the free blocks until a thread local free happens + heap_t* heap = page->heap; + uintptr_t prev_head = atomic_load_explicit(&heap->thread_free[page->page_type], memory_order_relaxed); + block->next = (void*)prev_head; + while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page->page_type], &prev_head, (uintptr_t)block, + memory_order_relaxed, memory_order_relaxed)) { + block->next = (void*)prev_head; + wait_spin(); + } + } else { + unsigned long long prev_thread_free = atomic_load_explicit(&page->thread_free, memory_order_relaxed); + uint32_t block_index = page_block_index(page, block); + rpmalloc_assert(page_block(page, block_index) == block, "Block pointer is not aligned to start of block"); + uint32_t list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1; + uint64_t thread_free = page_block_to_thread_free_list(page, block_index, list_size); + while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &prev_thread_free, thread_free, + memory_order_relaxed, memory_order_relaxed)) { + list_size = page_block_from_thread_free_list(page, prev_thread_free, &block->next) + 1; + thread_free = page_block_to_thread_free_list(page, block_index, list_size); + wait_spin(); + } + } +} + +static void +page_push_local_free_to_heap(page_t* page) { + // Push the page free list as the fast track list of free blocks for heap + page->heap->local_free[page->size_class] = page->local_free; + page->block_used += page->local_free_count; + page->local_free = 0; + 
page->local_free_count = 0; +} + +static NOINLINE void* +page_initialize_blocks(page_t* page) { + rpmalloc_assert(page->block_initialized < page->block_count, "Block initialization internal failure"); + block_t* block = page_block(page, page->block_initialized); + ++page->block_initialized; + ++page->block_used; + + if ((page->page_type == PAGE_SMALL) && (page->block_size < (global_config.page_size >> 1))) { + // Link up until next memory page in free list + void* memory_page_start = (void*)((uintptr_t)block & ~(uintptr_t)(global_config.page_size - 1)); + void* memory_page_next = pointer_offset(memory_page_start, global_config.page_size); + block_t* free_block = pointer_offset(block, page->block_size); + block_t* first_block = free_block; + block_t* last_block = free_block; + uint32_t list_count = 0; + uint32_t max_list_count = page->block_count - page->block_initialized; + while (((void*)free_block < memory_page_next) && (list_count < max_list_count)) { + last_block = free_block; + free_block->next = pointer_offset(free_block, page->block_size); + free_block = free_block->next; + ++list_count; + } + if (list_count) { + last_block->next = 0; + page->local_free = first_block; + page->block_initialized += list_count; + page->local_free_count = list_count; + } + } + + return block; +} + +static inline RPMALLOC_ALLOCATOR void* +page_allocate_block(page_t* page, unsigned int zero) { + unsigned int is_zero = 0; + block_t* block = (page->local_free != 0) ? page_get_local_free_block(page) : 0; + if (UNEXPECTED(block == 0)) { + if (atomic_load_explicit(&page->thread_free, memory_order_relaxed) != 0) { + page_adopt_thread_free_block_list(page); + block = (page->local_free != 0) ? page_get_local_free_block(page) : 0; + } + if (block == 0) { + block = page_initialize_blocks(page); + is_zero = page->is_zero; + } + } + + rpmalloc_assert(page->block_used <= page->block_count, "Page block use counter out of sync"); + if (page->local_free && !page->heap->local_free[page->size_class]) + page_push_local_free_to_heap(page); + + // The page might be full when free list has been pushed to heap local free list, + // check if there is a thread free list to adopt + if (page->block_used == page->block_count) + page_adopt_thread_free_block_list(page); + + if (page->block_used == page->block_count) { + // Page is now fully utilized + rpmalloc_assert(!page->is_full, "Page block use counter out of sync with full flag"); + page_available_to_full(page); + } + + if (zero) { + if (!is_zero) + memset(block, 0, page->block_size); + else + *(uintptr_t*)block = 0; + } + + return block; +} + +//////////// +/// +/// Span interface +/// +////// + +static inline int +span_is_thread_heap(span_t* span) { +#if RPMALLOC_FIRST_CLASS_HEAPS + return (!span->heap->owner_thread || (span->heap->owner_thread == get_thread_id())); +#else + return (span->heap->owner_thread == get_thread_id()); +#endif +} + +static inline page_t* +span_get_page_from_block(span_t* span, void* block) { + return (page_t*)((uintptr_t)block & span->page_address_mask); +} + +//! 
Find or allocate a page from the given span +static inline page_t* +span_allocate_page(span_t* span) { + // Allocate path, initialize a new chunk of memory for a page in the given span + rpmalloc_assert(span->page_initialized < span->page_count, "Page initialization internal failure"); + heap_t* heap = span->heap; + page_t* page = pointer_offset(span, span->page_size * span->page_initialized); + +#if ENABLE_DECOMMIT + // The first page is always committed on initial span map of memory + if (span->page_initialized) + global_memory_interface->memory_commit(page, span->page_size); +#endif + ++span->page_initialized; + + page->page_type = span->page_type; + page->is_zero = 1; + page->heap = heap; + rpmalloc_assert(page_is_thread_heap(page), "Page owner thread mismatch"); + + if (span->page_initialized == span->page_count) { + // Span fully utilized + rpmalloc_assert(span == heap->span_partial[span->page_type], "Span partial tracking out of sync"); + heap->span_partial[span->page_type] = 0; + + span->next = heap->span_used[span->page_type]; + heap->span_used[span->page_type] = span; + } + + return page; +} + +static NOINLINE void +span_deallocate_block(span_t* span, page_t* page, void* block) { + if (UNEXPECTED(page->page_type == PAGE_HUGE)) { + global_memory_interface->memory_unmap(span, span->offset, span->mapped_size); + return; + } + + if (page->has_aligned_block) { + // Realign pointer to block start + block = page_block_realign(page, block); + } + + int is_thread_local = page_is_thread_heap(page); + if (EXPECTED(is_thread_local != 0)) { + page_put_local_free_block(page, block); + } else { + // Multithreaded deallocation, push to deferred deallocation list. + page_put_thread_free_block(page, block); + } +} + +//////////// +/// +/// Block interface +/// +////// + +static inline span_t* +block_get_span(block_t* block) { + return (span_t*)((uintptr_t)block & SPAN_MASK); +} + +static inline void +block_deallocate(block_t* block) { + span_t* span = (span_t*)((uintptr_t)block & SPAN_MASK); + page_t* page = span_get_page_from_block(span, block); + const int is_thread_local = page_is_thread_heap(page); + + // Optimized path for thread local free with non-huge block in page + // that has no aligned blocks + if (EXPECTED(is_thread_local != 0)) { + if (EXPECTED(page->generic_free == 0)) { + // Page is not huge, not full and has no aligned block - fast path + block->next = page->local_free; + page->local_free = block; + ++page->local_free_count; + if (UNEXPECTED(--page->block_used == 0)) + page_available_to_free(page); + } else { + span_deallocate_block(span, page, block); + } + } else { + span_deallocate_block(span, page, block); + } +} + +static inline size_t +block_usable_size(block_t* block) { + span_t* span = (span_t*)((uintptr_t)block & SPAN_MASK); + if (EXPECTED(span->page_type <= PAGE_LARGE)) { + page_t* page = span_get_page_from_block(span, block); + void* blocks_start = pointer_offset(page, PAGE_HEADER_SIZE); + return page->block_size - ((size_t)pointer_diff(block, blocks_start) % page->block_size); + } else { + return ((size_t)span->page_size * (size_t)span->page_count) - (size_t)pointer_diff(block, span); + } +} + +//////////// +/// +/// Heap interface +/// +////// + +static inline void +heap_lock_acquire(void) { + uintptr_t lock = 0; + uintptr_t this_lock = get_thread_id(); + while (!atomic_compare_exchange_strong(&global_heap_lock, &lock, this_lock)) { + lock = 0; + wait_spin(); + } +} + +static inline void +heap_lock_release(void) { + 
rpmalloc_assert((uintptr_t)atomic_load_explicit(&global_heap_lock, memory_order_relaxed) == get_thread_id(), + "Bad heap lock"); + atomic_store_explicit(&global_heap_lock, 0, memory_order_release); +} + +static inline heap_t* +heap_initialize(void* block) { + heap_t* heap = block; + memset_const(heap, 0, sizeof(heap_t)); + heap->id = 1 + atomic_fetch_add_explicit(&global_heap_id, 1, memory_order_relaxed); + return heap; +} + +static heap_t* +heap_allocate_new(void) { + if (!global_config.page_size) + rpmalloc_initialize(0); + size_t heap_size = get_page_aligned_size(sizeof(heap_t)); + size_t offset = 0; + size_t mapped_size = 0; + block_t* block = global_memory_interface->memory_map(heap_size, 0, &offset, &mapped_size); +#if ENABLE_DECOMMIT + global_memory_interface->memory_commit(block, heap_size); +#endif + heap_t* heap = heap_initialize((void*)block); + heap->offset = (uint32_t)offset; + heap->mapped_size = mapped_size; +#if ENABLE_STATISTICS + atomic_fetch_add_explicit(&global_statistics.heap_count, 1, memory_order_relaxed); +#endif + return heap; +} + +static void +heap_unmap(heap_t* heap) { + global_memory_interface->memory_unmap(heap, heap->offset, heap->mapped_size); +} + +static heap_t* +heap_allocate(int first_class) { + heap_t* heap = 0; + if (!first_class) { + heap_lock_acquire(); + heap = global_heap_queue; + global_heap_queue = heap ? heap->next : 0; + heap_lock_release(); + } + if (!heap) + heap = heap_allocate_new(); + if (heap) { + uintptr_t current_thread_id = get_thread_id(); + heap_lock_acquire(); + heap->next = global_heap_used; + heap->prev = 0; + if (global_heap_used) + global_heap_used->prev = heap; + global_heap_used = heap; + heap_lock_release(); + heap->owner_thread = current_thread_id; + } + return heap; +} + +static inline void +heap_release(heap_t* heap) { + heap_lock_acquire(); + if (heap->prev) + heap->prev->next = heap->next; + if (heap->next) + heap->next->prev = heap->prev; + if (global_heap_used == heap) + global_heap_used = heap->next; + heap->next = global_heap_queue; + global_heap_queue = heap; + heap_lock_release(); +} + +static void +heap_page_free_decommit(heap_t* heap, uint32_t page_type, uint32_t page_retain_count) { + page_t* page = heap->page_free[page_type]; + while (page && page_retain_count) { + page = page->next; + --page_retain_count; + } + while (page && (page->is_decommitted == 0)) { + page_decommit_memory_pages(page); + --heap->page_free_commit_count[page_type]; + page = page->next; + } +} + +static inline void +heap_make_free_page_available(heap_t* heap, uint32_t size_class, page_t* page) { + page->size_class = size_class; + page->block_size = global_size_class[size_class].block_size; + page->block_count = global_size_class[size_class].block_count; + page->block_used = 0; + page->block_initialized = 0; + page->local_free = 0; + page->local_free_count = 0; + page->is_full = 0; + page->is_free = 0; + page->has_aligned_block = 0; + page->generic_free = 0; + page->heap = heap; + page_t* head = heap->page_available[size_class]; + page->next = head; + page->prev = 0; + atomic_store_explicit(&page->thread_free, 0, memory_order_relaxed); + if (head) + head->prev = page; + heap->page_available[size_class] = page; + if (page->is_decommitted) + page_commit_memory_pages(page); +} + +//! 
Find or allocate a span for the given page type with the given size class +static inline span_t* +heap_get_span(heap_t* heap, page_type_t page_type) { + // Fast path, available span for given page type + if (EXPECTED(heap->span_partial[page_type] != 0)) + return heap->span_partial[page_type]; + + // Fallback path, map more memory + size_t offset = 0; + size_t mapped_size = 0; + span_t* span = global_memory_interface->memory_map(SPAN_SIZE, SPAN_SIZE, &offset, &mapped_size); + if (EXPECTED(span != 0)) { + uint32_t page_count = 0; + uint32_t page_size = 0; + uintptr_t page_address_mask = 0; + if (page_type == PAGE_SMALL) { + page_count = SPAN_SIZE / SMALL_PAGE_SIZE; + page_size = SMALL_PAGE_SIZE; + page_address_mask = SMALL_PAGE_MASK; + } else if (page_type == PAGE_MEDIUM) { + page_count = SPAN_SIZE / MEDIUM_PAGE_SIZE; + page_size = MEDIUM_PAGE_SIZE; + page_address_mask = MEDIUM_PAGE_MASK; + } else { + page_count = SPAN_SIZE / LARGE_PAGE_SIZE; + page_size = LARGE_PAGE_SIZE; + page_address_mask = LARGE_PAGE_MASK; + } +#if ENABLE_DECOMMIT + global_memory_interface->memory_commit(span, page_size); +#endif + span->heap = heap; + span->page_type = page_type; + span->page_count = page_count; + span->page_size = page_size; + span->page_address_mask = page_address_mask; + span->offset = (uint32_t)offset; + span->mapped_size = mapped_size; + + heap->span_partial[page_type] = span; + } + + return span; +} + +static page_t* +heap_get_page(heap_t* heap, uint32_t size_class); + +static void +block_deallocate(block_t* block); + +static page_t* +heap_get_page_generic(heap_t* heap, uint32_t size_class) { + page_type_t page_type = get_page_type(size_class); + + // Check if there is a free page from multithreaded deallocations + uintptr_t block_mt = atomic_load_explicit(&heap->thread_free[page_type], memory_order_relaxed); + if (UNEXPECTED(block_mt != 0)) { + while (!atomic_compare_exchange_weak_explicit(&heap->thread_free[page_type], &block_mt, 0, memory_order_relaxed, + memory_order_relaxed)) { + wait_spin(); + } + block_t* block = (void*)block_mt; + while (block) { + block_t* next_block = block->next; + block_deallocate(block); + block = next_block; + } + // Retry after processing deferred thread frees + return heap_get_page(heap, size_class); + } + + // Check if there is a free page + page_t* page = heap->page_free[page_type]; + if (EXPECTED(page != 0)) { + heap->page_free[page_type] = page->next; + if (page->is_decommitted == 0) { + rpmalloc_assert(heap->page_free_commit_count[page_type] > 0, "Free committed page count out of sync"); + --heap->page_free_commit_count[page_type]; + } + heap_make_free_page_available(heap, size_class, page); + return page; + } + rpmalloc_assert(heap->page_free_commit_count[page_type] == 0, "Free committed page count out of sync"); + + if (heap->id == 0) { + // Thread has not yet initialized, assign heap and try again + rpmalloc_initialize(0); + return heap_get_page(get_thread_heap(), size_class); + } + + // Fallback path, find or allocate span for given size class + // If thread was not initialized, the heap for the new span + // will be different from the local heap variable in this scope + // (which is the default heap) - so use span page heap instead + span_t* span = heap_get_span(heap, page_type); + if (EXPECTED(span != 0)) { + page = span_allocate_page(span); + heap_make_free_page_available(page->heap, size_class, page); + } + + return page; +} + +//! 
Find or allocate a page for the given size class +static page_t* +heap_get_page(heap_t* heap, uint32_t size_class) { + // Fast path, available page for given size class + page_t* page = heap->page_available[size_class]; + if (EXPECTED(page != 0)) + return page; + return heap_get_page_generic(heap, size_class); +} + +//! Pop a block from the heap local free list +static inline RPMALLOC_ALLOCATOR void* +heap_pop_local_free(heap_t* heap, uint32_t size_class) { + block_t** free_list = heap->local_free + size_class; + block_t* block = *free_list; + if (EXPECTED(block != 0)) + *free_list = block->next; + return block; +} + +//! Generic allocation path from heap pages, spans or new mapping +static NOINLINE RPMALLOC_ALLOCATOR void* +heap_allocate_block_small_to_large(heap_t* heap, uint32_t size_class, unsigned int zero) { + page_t* page = heap_get_page(heap, size_class); + if (EXPECTED(page != 0)) + return page_allocate_block(page, zero); + return 0; +} + +//! Generic allocation path from heap pages, spans or new mapping +static NOINLINE RPMALLOC_ALLOCATOR void* +heap_allocate_block_huge(heap_t* heap, size_t size, unsigned int zero) { + (void)sizeof(heap); + size_t alloc_size = get_page_aligned_size(size + SPAN_HEADER_SIZE); + size_t offset = 0; + size_t mapped_size = 0; + void* block = global_memory_interface->memory_map(alloc_size, SPAN_SIZE, &offset, &mapped_size); + if (block) { + span_t* span = block; +#if ENABLE_DECOMMIT + global_memory_interface->memory_commit(span, alloc_size); +#endif + span->heap = heap; + span->page_type = PAGE_HUGE; + span->page_size = (uint32_t)global_config.page_size; + span->page_count = (uint32_t)(alloc_size / global_config.page_size); + span->page_address_mask = LARGE_PAGE_MASK; + span->offset = (uint32_t)offset; + span->mapped_size = mapped_size; + span->page.heap = heap; + span->page.is_full = 1; + span->page.generic_free = 1; + span->page.page_type = PAGE_HUGE; + // Keep track of span if first class heap + if (!heap->owner_thread) { + span->next = heap->span_used[PAGE_HUGE]; + heap->span_used[PAGE_HUGE] = span; + } + void* ptr = pointer_offset(block, SPAN_HEADER_SIZE); + if (zero) + memset(ptr, 0, size); + return ptr; + } + return 0; +} + +static RPMALLOC_ALLOCATOR NOINLINE void* +heap_allocate_block_generic(heap_t* heap, size_t size, unsigned int zero) { + uint32_t size_class = get_size_class(size); + if (EXPECTED(size_class < SIZE_CLASS_COUNT)) { + block_t* block = heap_pop_local_free(heap, size_class); + if (EXPECTED(block != 0)) { + // Fast track with small block available in heap level local free list + if (zero) + memset(block, 0, global_size_class[size_class].block_size); + return block; + } + + return heap_allocate_block_small_to_large(heap, size_class, zero); + } + + return heap_allocate_block_huge(heap, size, zero); +} + +//! 
Find or allocate a block of the given size +static inline RPMALLOC_ALLOCATOR void* +heap_allocate_block(heap_t* heap, size_t size, unsigned int zero) { + if (size <= (SMALL_GRANULARITY * 64)) { + uint32_t size_class = get_size_class_tiny(size); + block_t* block = heap_pop_local_free(heap, size_class); + if (EXPECTED(block != 0)) { + // Fast track with small block available in heap level local free list + if (zero) + memset(block, 0, global_size_class[size_class].block_size); + return block; + } + } + return heap_allocate_block_generic(heap, size, zero); +} + +static RPMALLOC_ALLOCATOR void* +heap_allocate_block_aligned(heap_t* heap, size_t alignment, size_t size, unsigned int zero) { + if (alignment <= SMALL_GRANULARITY) + return heap_allocate_block(heap, size, zero); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + if (alignment >= RPMALLOC_MAX_ALIGNMENT) { + errno = EINVAL; + return 0; + } + + size_t align_mask = alignment - 1; + block_t* block = heap_allocate_block(heap, size + alignment, zero); + if ((uintptr_t)block & align_mask) { + block = (void*)(((uintptr_t)block & ~(uintptr_t)align_mask) + alignment); + // Mark as having aligned blocks + span_t* span = block_get_span(block); + page_t* page = span_get_page_from_block(span, block); + page->has_aligned_block = 1; + page->generic_free = 1; + } + return block; +} + +static void* +heap_reallocate_block(heap_t* heap, void* block, size_t size, size_t old_size, unsigned int flags) { + if (block) { + // Grab the span using guaranteed span alignment + span_t* span = block_get_span(block); + if (EXPECTED(span->page_type <= PAGE_LARGE)) { + // Normal sized block + page_t* page = span_get_page_from_block(span, block); + void* blocks_start = pointer_offset(page, PAGE_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(block, blocks_start); + uint32_t block_idx = block_offset / page->block_size; + void* block_origin = pointer_offset(blocks_start, (size_t)block_idx * page->block_size); + if (!old_size) + old_size = (size_t)((ptrdiff_t)page->block_size - pointer_diff(block, block_origin)); + if ((size_t)page->block_size >= size) { + // Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((block != block_origin) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block_origin, block, old_size); + return block_origin; + } + } else { + // Huge block + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE); + if (!old_size) + old_size = ((size_t)span->page_size * (size_t)span->page_count) - SPAN_HEADER_SIZE; + if ((size < old_size) && (size > LARGE_BLOCK_SIZE_LIMIT)) { + // Still fits in block and still huge, never mind trying to save memory, + // but preserve data if alignment changed + if ((block_start != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block_start, block, old_size); + return block_start; + } + } + } else { + old_size = 0; + } + + if (!!(flags & RPMALLOC_GROW_OR_FAIL)) + return 0; + + // Size is greater than block size or saves enough memory to resize, need to allocate a new block + // and deallocate the old. Avoid hysteresis by overallocating if increase is small (below 37%) + size_t lower_bound = old_size + (old_size >> 2) + (old_size >> 3); + size_t new_size = (size > lower_bound) ? size : ((size > old_size) ? 
lower_bound : size); + void* old_block = block; + block = heap_allocate_block(heap, new_size, 0); + if (block && old_block) { + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, old_block, old_size < new_size ? old_size : new_size); + block_deallocate(old_block); + } + + return block; +} + +static void* +heap_reallocate_block_aligned(heap_t* heap, void* block, size_t alignment, size_t size, size_t old_size, + unsigned int flags) { + if (alignment <= SMALL_GRANULARITY) + return heap_reallocate_block(heap, block, size, old_size, flags); + + int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); + size_t usable_size = (block ? block_usable_size(block) : 0); + if ((usable_size >= size) && !((uintptr_t)block & (alignment - 1))) { + if (no_alloc || (size >= (usable_size / 2))) + return block; + } + // Aligned alloc marks span as having aligned blocks + void* old_block = block; + block = (!no_alloc ? heap_allocate_block_aligned(heap, alignment, size, 0) : 0); + if (EXPECTED(block != 0)) { + if (!(flags & RPMALLOC_NO_PRESERVE) && old_block) { + if (!old_size) + old_size = usable_size; + memcpy(block, old_block, old_size < size ? old_size : size); + } + if (EXPECTED(old_block != 0)) + block_deallocate(old_block); + } + return block; +} + +static void +heap_free_all(heap_t* heap) { + for (int itype = 0; itype < 3; ++itype) { + span_t* span = heap->span_partial[itype]; + while (span) { + span_t* span_next = span->next; + global_memory_interface->memory_unmap(span, span->offset, span->mapped_size); + span = span_next; + } + heap->span_partial[itype] = 0; + heap->page_free[itype] = 0; + heap->page_free_commit_count[itype] = 0; + atomic_store_explicit(&heap->thread_free[itype], 0, memory_order_relaxed); + } + for (int itype = 0; itype < 4; ++itype) { + span_t* span = heap->span_used[itype]; + while (span) { + span_t* span_next = span->next; + global_memory_interface->memory_unmap(span, span->offset, span->mapped_size); + span = span_next; + } + heap->span_used[itype] = 0; + } + memset(heap->local_free, 0, sizeof(heap->local_free)); + memset(heap->page_available, 0, sizeof(heap->page_available)); + +#if ENABLE_STATISTICS + // TODO: Fix +#endif +} + +//////////// +/// +/// Extern interface +/// +////// + +int +rpmalloc_is_thread_initialized(void) { + return (get_thread_heap() != global_heap_default) ? 
1 : 0; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return heap_allocate_block(heap, size, 0); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpzalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return heap_allocate_block(heap, size, 1); +} + +extern inline void +rpfree(void* ptr) { + if (UNEXPECTED(ptr == 0)) + return; + block_deallocate(ptr); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + return heap_allocate_block(heap, total, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rprealloc(void* ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t* heap = get_thread_heap(); + return heap_reallocate_block(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t* heap = get_thread_heap(); + return heap_reallocate_block_aligned(heap, ptr, alignment, size, oldsize, flags); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_alloc(size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + return heap_allocate_block_aligned(heap, alignment, size, 0); +} + +extern RPMALLOC_ALLOCATOR void* +rpaligned_zalloc(size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + return heap_allocate_block_aligned(heap, alignment, size, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t* heap = get_thread_heap(); + return heap_allocate_block_aligned(heap, alignment, total, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmemalign(size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + return heap_allocate_block_aligned(heap, alignment, size, 0); +} + +extern inline int +rpposix_memalign(void** memptr, size_t alignment, size_t size) { + heap_t* heap = get_thread_heap(); + if (memptr) + *memptr = heap_allocate_block_aligned(heap, alignment, size, 0); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t +rpmalloc_usable_size(void* ptr) { + return (ptr ? 
block_usable_size(ptr) : 0); +} + +//////////// +/// +/// Initialization and finalization +/// +////// + +static void +rpmalloc_thread_destructor(void* value) { + // If this is called on main thread assume it means rpmalloc_finalize + // has not been called and shutdown is forced (through _exit) or unclean + if (get_thread_id() == global_main_thread_id) + return; + if (value) + rpmalloc_thread_finalize(); +} + +extern int +rpmalloc_initialize_config(rpmalloc_interface_t* memory_interface, rpmalloc_config_t* config) { + if (global_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + if (config) + *config = global_config; + return 0; + } + + if (config) + global_config = *config; + + int result = rpmalloc_initialize(memory_interface); + + if (config) + *config = global_config; + + return result; +} + +extern int +rpmalloc_initialize(rpmalloc_interface_t* memory_interface) { + if (global_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + + global_rpmalloc_initialized = 1; + + global_memory_interface = memory_interface ? memory_interface : &global_memory_interface_default; + if (!global_memory_interface->memory_map || !global_memory_interface->memory_unmap) { + global_memory_interface->memory_map = os_mmap; + global_memory_interface->memory_commit = os_mcommit; + global_memory_interface->memory_decommit = os_mdecommit; + global_memory_interface->memory_unmap = os_munmap; + } + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + os_map_granularity = system_info.dwAllocationGranularity; +#else + os_map_granularity = (size_t)sysconf(_SC_PAGESIZE); +#endif + +#if PLATFORM_WINDOWS + os_page_size = system_info.dwPageSize; +#else + os_page_size = os_map_granularity; +#endif + if (global_config.enable_huge_pages) { +#if PLATFORM_WINDOWS + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + os_huge_pages = 1; + } + } + CloseHandle(token); + } + if (os_huge_pages) { + if (large_page_minimum > os_page_size) + os_page_size = large_page_minimum; + if (large_page_minimum > os_map_granularity) + os_map_granularity = large_page_minimum; + } +#elif defined(__linux__) + size_t huge_page_size = 0; + FILE* meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + os_huge_pages = 1; + os_page_size = huge_page_size; + os_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) { + os_huge_pages = 1; + os_page_size = 2 * 1024 * 1024; + os_map_granularity = os_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + os_huge_pages = 1; + 
os_page_size = 2 * 1024 * 1024; + os_map_granularity = os_page_size; +#endif + } else { + os_huge_pages = 0; + } + + global_config.enable_huge_pages = os_huge_pages; + + if (!memory_interface || (global_config.page_size < os_page_size)) + global_config.page_size = os_page_size; + + if (global_config.enable_huge_pages || global_config.page_size > (256 * 1024)) + global_config.disable_decommit = 1; + +#if defined(__linux__) || defined(__ANDROID__) + if (global_config.disable_thp) + (void)prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0); +#endif + +#ifdef _WIN32 + fls_key = FlsAlloc(&rpmalloc_thread_destructor); +#else + pthread_key_create(&pthread_key, rpmalloc_thread_destructor); +#endif + + global_main_thread_id = get_thread_id(); + + rpmalloc_thread_initialize(); + + return 0; +} + +extern const rpmalloc_config_t* +rpmalloc_config(void) { + return &global_config; +} + +extern void +rpmalloc_finalize(void) { + rpmalloc_thread_finalize(); + + if (global_config.unmap_on_finalize) { + heap_t* heap = global_heap_queue; + global_heap_queue = 0; + while (heap) { + heap_t* heap_next = heap->next; + heap_free_all(heap); + heap_unmap(heap); + heap = heap_next; + } + heap = global_heap_used; + global_heap_used = 0; + while (heap) { + heap_t* heap_next = heap->next; + heap_free_all(heap); + heap_unmap(heap); + heap = heap_next; + } +#if ENABLE_STATISTICS + memset(&global_statistics, 0, sizeof(global_statistics)); +#endif + } + +#ifdef _WIN32 + FlsFree(fls_key); + fls_key = 0; +#else + pthread_key_delete(pthread_key); + pthread_key = 0; +#endif + + global_main_thread_id = 0; + global_rpmalloc_initialized = 0; +} + +extern void +rpmalloc_thread_initialize(void) { + if (get_thread_heap() == global_heap_default) + get_thread_heap_allocate(); +} + +extern void +rpmalloc_thread_finalize(void) { + heap_t* heap = get_thread_heap(); + if (heap != global_heap_default) { + heap_release(heap); + set_thread_heap(global_heap_default); + } +} + +extern void +rpmalloc_thread_collect(void) { +} + +void +rpmalloc_dump_statistics(void* file) { +#if ENABLE_STATISTICS + fprintf(file, "Mapped pages: %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_mapped, memory_order_relaxed)); + fprintf(file, "Mapped pages (peak): %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_mapped_peak, memory_order_relaxed)); + fprintf(file, "Active pages: %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_active, memory_order_relaxed)); + fprintf(file, "Active pages (peak): %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_active_peak, memory_order_relaxed)); + fprintf(file, "Pages committed: %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_commit, memory_order_relaxed)); + fprintf(file, "Pages decommitted: %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.page_decommit, memory_order_relaxed)); + fprintf(file, "Heaps created: %llu\n", + (unsigned long long)atomic_load_explicit(&global_statistics.heap_count, memory_order_relaxed)); +#else + (void)sizeof(file); +#endif +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +rpmalloc_heap_t* +rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory blocks + // could already be allocated from the heap which would (wrongly) be released when + // heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be + // pristine from the dedicated orphan list can be used. 
+ heap_t* heap = heap_allocate(1); + rpmalloc_assume(heap != 0); + heap->owner_thread = 0; + return heap; +} + +void +rpmalloc_heap_release(rpmalloc_heap_t* heap) { + if (heap) + heap_release(heap); +} + +RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return heap_allocate_block(heap, size, 0); +} + +RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return heap_allocate_block_aligned(heap, alignment, size, 0); +} + +RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + return heap_allocate_block(heap, total, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + return heap_allocate_block_aligned(heap, alignment, total, 1); +} + +RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return heap_reallocate_block(heap, ptr, size, 0, flags); +} + +RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return heap_reallocate_block_aligned(heap, ptr, alignment, size, 0, flags); +} + +void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { + (void)sizeof(heap); + block_deallocate(ptr); +} + +//! Free all memory allocated by the heap +void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { + heap_free_all(heap); +} + +extern inline void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) { + heap_t* prev_heap = get_thread_heap(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + heap_release(prev_heap); + } +} + +rpmalloc_heap_t* +rpmalloc_get_heap_for_ptr(void* ptr) { + // Grab the span, and then the heap from the span + span_t* span = (span_t*)((uintptr_t)ptr & SPAN_MASK); + if (span) + return span_get_page_from_block(span, ptr)->heap; + return 0; +} + +#endif + +#include "malloc.c" |
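One detail of heap_reallocate_block() above is worth spelling out with numbers (illustrative values only): when a grown block no longer fits its current allocation, the new allocation is padded up to lower_bound = old_size + old_size/4 + old_size/8, i.e. roughly 1.375x the old size. With old_size = 1024 that is 1024 + 256 + 128 = 1408 bytes, so growth requests that no longer fit but stay at or below 1408 bytes all result in a 1408-byte allocation, while larger requests are served exactly. This is the "below 37%" hysteresis guard in the comment: a sequence of slightly larger reallocations does not trigger a new allocation and copy on every call.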
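For reference, a minimal sketch of how the extern interface above is typically driven. It is not part of the upstream sources: it assumes rpmalloc.h is on the include path, runs on the thread that performs initialization, and keeps error handling to a bare minimum, using only the rp* entry points defined in this file.

#include "rpmalloc.h"

#include <stdio.h>
#include <string.h>

int
main(void) {
	// Process-wide setup; passing 0 selects the default OS memory interface.
	// The initializing thread is registered implicitly; other threads call
	// rpmalloc_thread_initialize() before their first allocation.
	if (rpmalloc_initialize(0) != 0)
		return 1;

	void* block = rpmalloc(128);
	if (block) {
		memset(block, 0, 128);
		// Usable size is at least the requested size (rounded up to the size class)
		printf("usable size: %zu\n", rpmalloc_usable_size(block));
		// Reuses the current block when the new size still fits, otherwise
		// allocates a new block and copies the old contents over
		block = rprealloc(block, 4096);
		rpfree(block);
	}

	// Per-thread teardown, then process-wide teardown
	rpmalloc_thread_finalize();
	rpmalloc_finalize();
	return 0;
}

rpzalloc/rpcalloc and the rpaligned_* variants follow the same pattern, with zeroing and alignment handled inside heap_allocate_block() and heap_allocate_block_aligned().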
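The RPMALLOC_FIRST_CLASS_HEAPS block above exposes explicitly managed heaps. A hedged sketch of the intended pattern, assuming the allocator has already been initialized as in the previous example; the transient-node loop and the helper name are purely illustrative.

#include "rpmalloc.h"

// Allocate a batch of short-lived blocks from a dedicated heap so they can
// all be released in a single sweep instead of being freed one by one
static void
build_and_discard(void) {
	rpmalloc_heap_t* heap = rpmalloc_heap_acquire();
	for (int i = 0; i < 64; ++i) {
		void* node = rpmalloc_heap_alloc(heap, 256);
		(void)node;  // ...link into some transient structure...
	}
	// Free every block still owned by the heap, then hand the heap back
	rpmalloc_heap_free_all(heap);
	rpmalloc_heap_release(heap);
}

Alternatively, rpmalloc_heap_thread_set_current() installs such a heap as the calling thread's implicit heap, after which the plain rpmalloc() entry point allocates from it.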