From 6be10b61227725efbac4714fc71b6fd3ea093989 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Sat, 30 Jul 2022 13:29:57 +0200 Subject: [PATCH] Update rpmalloc to 1.4.4. --- public/client/tracy_rpmalloc.cpp | 4472 ++++++++++++++++++------------ public/client/tracy_rpmalloc.hpp | 118 +- 2 files changed, 2867 insertions(+), 1723 deletions(-) diff --git a/public/client/tracy_rpmalloc.cpp b/public/client/tracy_rpmalloc.cpp index fbfd74a0..03b185f4 100644 --- a/public/client/tracy_rpmalloc.cpp +++ b/public/client/tracy_rpmalloc.cpp @@ -1,6 +1,6 @@ #ifdef TRACY_ENABLE -/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson +/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias Jansson * * This library provides a cross-platform lock free thread caching malloc implementation in C11. * The latest source code is always available at @@ -13,7 +13,23 @@ #include "tracy_rpmalloc.hpp" +//////////// +/// /// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 47 @@ -47,59 +63,46 @@ #define ENABLE_PRELOAD 0 #endif #ifndef DISABLE_UNMAP -//! Disable unmapping memory pages +//! Disable unmapping memory pages (also enables unlimited cache) #define DISABLE_UNMAP 0 #endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif #ifndef DEFAULT_SPAN_MAP_COUNT //! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) #define DEFAULT_SPAN_MAP_COUNT 64 #endif - -#if ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited thread and global cache -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_UNLIMITED_THREAD_CACHE -//! Unlimited cache disables any thread cache limitations -#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_THREAD_CACHE -#ifndef THREAD_CACHE_MULTIPLIER -//! Multiplier for thread cache (cache limit will be span release count multiplied by this value) -#define THREAD_CACHE_MULTIPLIER 16 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#endif -#endif - -#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE -#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE -//! Unlimited cache disables any global cache limitations -#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_GLOBAL_CACHE -//! Multiplier for global cache (cache limit will be span release count multiplied by this value) -#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) -#endif -#else -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE -# undef ENABLE_ADAPTIVE_THREAD_CACHE -# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#ifndef GLOBAL_CACHE_MULTIPLIER +//! 
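/* Editor's sketch (illustration only, not part of the upstream patch): the
   "4MiB" figure quoted for DEFAULT_SPAN_MAP_COUNT follows from the default
   64KiB span size defined later in this file: 64 spans per map call times
   64KiB per span is 4MiB of address space per call to the mapper. The
   EXAMPLE_* names are hypothetical stand-ins for those constants. */
#define EXAMPLE_DEFAULT_SPAN_SIZE (64 * 1024)   /* _memory_default_span_size */
#define EXAMPLE_DEFAULT_SPAN_MAP_COUNT 64       /* DEFAULT_SPAN_MAP_COUNT */
_Static_assert((EXAMPLE_DEFAULT_SPAN_SIZE * EXAMPLE_DEFAULT_SPAN_MAP_COUNT) == (4 * 1024 * 1024),
               "default map call covers 4MiB of address space");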
Multiplier for global cache +#define GLOBAL_CACHE_MULTIPLIER 8 #endif #if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -# error Must use global cache if unmap is disabled +#error Must use global cache if unmap is disabled #endif -#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +#if DISABLE_UNMAP +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 1 +#endif + +#if !ENABLE_GLOBAL_CACHE +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 +#endif + +#if !ENABLE_THREAD_CACHE +#undef ENABLE_ADAPTIVE_THREAD_CACHE +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif + +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) # define PLATFORM_WINDOWS 1 # define PLATFORM_POSIX 0 #else @@ -107,13 +110,15 @@ # define PLATFORM_POSIX 1 #endif -#define _Static_assert static_assert - /// Platform and arch specifics -#ifndef FORCEINLINE -# if defined(_MSC_VER) && !defined(__clang__) +#if defined(_MSC_VER) && !defined(__clang__) +# pragma warning (disable: 5105) +# ifndef FORCEINLINE # define FORCEINLINE inline __forceinline -# else +# endif +# define _Static_assert static_assert +#else +# ifndef FORCEINLINE # define FORCEINLINE inline __attribute__((__always_inline__)) # endif #endif @@ -123,27 +128,62 @@ # endif # include # if ENABLE_VALIDATE_ARGS -# include +# include # endif #else # include # include # include -# if defined(__APPLE__) -# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR -# include +# include +# if defined(__linux__) || defined(__ANDROID__) +# include +# if !defined(PR_SET_VMA) +# define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 # endif +# endif +# if defined(__APPLE__) +# include +# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR +# include # include +# endif # include # endif -# if defined(__HAIKU__) -# include +# if defined(__HAIKU__) || defined(__TINYC__) # include # endif #endif #include #include +#include + +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include +static DWORD fls_key; +#endif + +#if PLATFORM_POSIX +# include +# include +# ifdef __FreeBSD__ +# include +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# ifndef PROT_MAX +# define PROT_MAX(f) 0 +# endif +# else +# define PROT_MAX(f) 0 +# endif +# ifdef __sun +extern int madvise(caddr_t, size_t, int); +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include #if ENABLE_ASSERTS # undef NDEBUG @@ -151,47 +191,134 @@ # define _DEBUG # endif # include +#define RPMALLOC_TOSTRING_M(x) #x +#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) +#define rpmalloc_assert(truth, message) \ + do { \ + if (!(truth)) { \ + if (_memory_config.error_callback) { \ + _memory_config.error_callback( \ + message " (" RPMALLOC_TOSTRING(truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ + } else { \ + assert((truth) && message); \ + } \ + } \ + } while (0) #else -# undef assert -# define assert(x) do {} while(0) +# define rpmalloc_assert(truth, message) do {} while(0) #endif #if ENABLE_STATISTICS # include #endif -#include +////// +/// +/// Atomic access abstraction (since MSVC does not do C11 yet) +/// +////// + +#if defined(_MSC_VER) && !defined(__clang__) namespace tracy { -typedef std::atomic atomic32_t; -typedef std::atomic atomic64_t; -typedef std::atomic atomicptr_t; +typedef volatile long atomic32_t; +typedef volatile long long atomic64_t; +typedef volatile void* atomicptr_t; -#define atomic_thread_fence_acquire() std::atomic_thread_fence(std::memory_order_acquire) -#define atomic_thread_fence_release() 
std::atomic_thread_fence(std::memory_order_release) +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } +static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); } +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); } +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; } +static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; } +static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } +static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return (void*)InterlockedExchangePointer((void* volatile*)dst, val); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } -static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } -static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); } -static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return std::atomic_fetch_add_explicit(val, 1, std::memory_order_relaxed) + 1; } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE -static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } -#endif -static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return std::atomic_fetch_add_explicit(val, add, std::memory_order_relaxed) + add; } -static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return std::atomic_load_explicit(src, std::memory_order_relaxed); } -static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { std::atomic_store_explicit(dst, val, std::memory_order_relaxed); } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return std::atomic_compare_exchange_weak_explicit(dst, &ref, val, std::memory_order_release, std::memory_order_acquire); } +#define EXPECTED(x) (x) +#define UNEXPECTED(x) (x) + +} -#if defined(_MSC_VER) && !defined(__clang__) -# define EXPECTED(x) (x) -# define UNEXPECTED(x) (x) #else -# define EXPECTED(x) __builtin_expect((x), 1) -# define UNEXPECTED(x) __builtin_expect((x), 0) + +#include + +namespace tracy +{ + +typedef volatile _Atomic(int32_t) atomic32_t; +typedef volatile _Atomic(int64_t) atomic64_t; +typedef volatile _Atomic(void*) atomicptr_t; + +static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } +static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } +static FORCEINLINE 
int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); } +static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } +static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } +static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return atomic_exchange_explicit(dst, val, memory_order_acquire); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); } + +#define EXPECTED(x) __builtin_expect((x), 1) +#define UNEXPECTED(x) __builtin_expect((x), 0) + +} + #endif +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not enabled) +/// +////// + +#if ENABLE_STATISTICS +# define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +# define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +# define _rpmalloc_stat_add(counter, value) atomic_add32(counter, (int32_t)(value)) +# define _rpmalloc_stat_add64(counter, value) atomic_add64(counter, (int64_t)(value)) +# define _rpmalloc_stat_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _rpmalloc_stat_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ +} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _rpmalloc_stat_inc(counter) do {} while(0) +# define _rpmalloc_stat_dec(counter) do {} while(0) +# define _rpmalloc_stat_add(counter, value) do {} while(0) +# define _rpmalloc_stat_add64(counter, value) do {} while(0) +# define _rpmalloc_stat_add_peak(counter, value, peak) do {} while (0) +# define _rpmalloc_stat_sub(counter, value) do {} while(0) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do {} while(0) +# define 
_rpmalloc_stat_inc_free(heap, class_idx) do {} while(0) +#endif + + +/// /// Preconfigured limits and sizes -//! Granularity of a small allocation block +/// + +//! Granularity of a small allocation block (must be power of two) #define SMALL_GRANULARITY 16 //! Small granularity shift count #define SMALL_GRANULARITY_SHIFT 4 @@ -208,13 +335,24 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref //! Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) //! Number of large block size classes -#define LARGE_CLASS_COUNT 32 +#define LARGE_CLASS_COUNT 63 //! Maximum size of a medium block #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY) -#define SPAN_HEADER_SIZE 96 +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) +#define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 400 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 100 +//! Number of spans to transfer between thread and global cache for large spans +#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6 + +_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two"); +_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two"); #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -227,11 +365,20 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define INVALID_POINTER ((void*)((uintptr_t)-1)) +#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT +#define SIZE_CLASS_HUGE ((uint32_t)-1) + +//////////// +/// /// Data types +/// +////// + +namespace tracy +{ + //! A memory heap, per thread typedef struct heap_t heap_t; -//! Heap spans per size class -typedef struct heap_class_t heap_class_t; //! Span of memory pages typedef struct span_t span_t; //! Span list @@ -249,28 +396,32 @@ typedef struct global_cache_t global_cache_t; #define SPAN_FLAG_SUBSPAN 2U //! Flag indicating span has blocks with increased alignment #define SPAN_FLAG_ALIGNED_BLOCKS 4U +//! Flag indicating an unmapped master span +#define SPAN_FLAG_UNMAPPED_MASTER 8U #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS struct span_use_t { //! Current number of spans used (actually used, not in cache) atomic32_t current; //! High water mark of spans used - uint32_t high; + atomic32_t high; #if ENABLE_STATISTICS + //! Number of spans in deferred list + atomic32_t spans_deferred; //! Number of spans transitioned to global cache - uint32_t spans_to_global; + atomic32_t spans_to_global; //! Number of spans transitioned from global cache - uint32_t spans_from_global; + atomic32_t spans_from_global; //! Number of spans transitioned to thread cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from thread cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned to reserved state - uint32_t spans_to_reserved; + atomic32_t spans_to_reserved; //! 
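/* Editor's sketch of the peak-tracking pattern used by _rpmalloc_stat_add_peak
   above (and by the "high" water-mark fields in this struct): bump the counter
   atomically, then raise the peak with a plain store instead of a CAS loop.
   The race on the peak is deliberate and benign, since statistics are
   advisory. Standalone C11 equivalent with hypothetical names: */
#include <stdatomic.h>
static void stat_add_peak_example(atomic_int* counter, int value, int* peak) {
	int cur = atomic_fetch_add_explicit(counter, value, memory_order_relaxed) + value;
	if (cur > *peak)
		*peak = cur; /* unsynchronized on purpose, a slightly stale peak is tolerated */
}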
Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of raw memory map calls - uint32_t spans_map_calls; + atomic32_t spans_map_calls; #endif }; typedef struct span_use_t span_use_t; @@ -283,64 +434,59 @@ struct size_class_use_t { //! Peak number of allocations int32_t alloc_peak; //! Total number of allocations - int32_t alloc_total; + atomic32_t alloc_total; //! Total number of frees atomic32_t free_total; //! Number of spans in use - uint32_t spans_current; + atomic32_t spans_current; //! Number of spans transitioned to cache - uint32_t spans_peak; + int32_t spans_peak; //! Number of spans transitioned to cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of spans mapped - uint32_t spans_map_calls; + atomic32_t spans_map_calls; + int32_t unused; }; typedef struct size_class_use_t size_class_use_t; #endif -typedef enum span_state_t { - SPAN_STATE_ACTIVE = 0, - SPAN_STATE_PARTIAL, - SPAN_STATE_FULL -} span_state_t; - -//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, -//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single -//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first -//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans -//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire -//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released -//in the same call to release the virtual memory range, but individual subranges can be decommitted individually -//to reduce physical memory use). +// A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +// or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +// span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +// (super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +// that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +// superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +// in the same call to release the virtual memory range, but individual subranges can be decommitted individually +// to reduce physical memory use). struct span_t { //! Free list void* free_list; - //! State - uint32_t state; - //! Used count when not active (not including deferred free list) - uint32_t used_count; - //! Block count + //! Total block count of size class uint32_t block_count; //! Size class uint32_t size_class; //! Index of last block initialized in free list uint32_t free_list_limit; - //! Span list size when part of a cache list, or size of deferred free list when partial/full - uint32_t list_size; + //! 
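/* Editor's sketch: how a small request size rounds to its block size under
   the SMALL_GRANULARITY 16 rule above; the size-class index is then derived
   from the rounded size via SMALL_GRANULARITY_SHIFT. Hypothetical helper,
   shown only for the rounding arithmetic: */
#include <stddef.h>
#include <stdint.h>
static uint32_t small_block_size_example(size_t size) {
	/* round up to the next 16-byte boundary: 1 -> 16, 24 -> 32, 32 -> 32 */
	return (uint32_t)((size + 15) & ~(size_t)15);
}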
Number of used blocks remaining when in partial state + uint32_t used_count; //! Deferred free list atomicptr_t free_list_deferred; + //! Size of deferred free list, or list of spans when part of a cache list + uint32_t list_size; //! Size of a block uint32_t block_size; //! Flags and counters uint32_t flags; //! Number of spans uint32_t span_count; - //! Total span counter for master spans, distance for subspans - uint32_t total_spans_or_distance; + //! Total span counter for master spans + uint32_t total_spans; + //! Offset from master span for subspans + uint32_t offset_from_master; //! Remaining span counter, for master spans atomic32_t remaining_spans; //! Alignment offset @@ -354,51 +500,87 @@ struct span_t { }; _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); -struct heap_class_t { +struct span_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_CACHE]; +}; +typedef struct span_cache_t span_cache_t; + +struct span_large_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_LARGE_CACHE]; +}; +typedef struct span_large_cache_t span_large_cache_t; + +struct heap_size_class_t { //! Free list of active span void* free_list; - //! Double linked list of partially used spans with free blocks for each size class. - // Current active span is at head of list. Previous span pointer in head points to tail span of list. + //! Double linked list of partially used spans with free blocks. + // Previous span pointer in head points to tail span of list. span_t* partial_span; + //! Early level cache of fully free spans + span_t* cache; }; +typedef struct heap_size_class_t heap_size_class_t; +// Control structure for a heap, either a thread heap or a first class heap if enabled struct heap_t { - //! Active and semi-used span data per size class - heap_class_t span_class[SIZE_CLASS_COUNT]; + //! Owning thread ID + uintptr_t owner_thread; + //! Free lists for each size class + heap_size_class_t size_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE - //! List of free spans (single linked list) - span_t* span_cache[LARGE_CLASS_COUNT]; - //! List of deferred free spans of class 0 (single linked list) - atomicptr_t span_cache_deferred; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; + //! Arrays of fully freed spans, single span + span_cache_t span_cache; #endif + //! List of deferred free spans (single linked list) + atomicptr_t span_free_deferred; + //! Number of full spans + size_t full_span_count; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans span_t* span_reserve_master; //! Number of mapped but unused spans - size_t spans_reserved; + uint32_t spans_reserved; + //! Child count + atomic32_t child_count; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list heap_t* next_orphan; - //! Memory pages alignment offset - size_t align_offset; //! Heap ID int32_t id; + //! Finalization state flag + int finalize; + //! Master heap owning the memory pages + heap_t* master_heap; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size class. + // Previous span pointer in head points to tail span of list. + span_t* full_span[SIZE_CLASS_COUNT]; + //! 
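/* Editor's sketch of the allocation fast path implied by heap_size_class_t:
   free blocks are linked through their own first bytes, so popping one is a
   single load plus a store, with no header lookup. Standalone hypothetical
   types, not rpmalloc's own: */
typedef struct size_class_bucket_example {
	void* free_list; /* singly linked free blocks; next pointer lives in the block */
} size_class_bucket_example;
static void* alloc_fast_path_example(size_class_bucket_example* bucket) {
	void* block = bucket->free_list;
	if (block)
		bucket->free_list = *(void**)block; /* unlink by following the embedded next */
	return block; /* null means: fall back to partial spans or the caches */
}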
Double linked list of large and huge spans allocated by this heap + span_t* large_huge_span; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif #if ENABLE_STATISTICS - //! Number of bytes transitioned thread -> global - size_t thread_to_global; - //! Number of bytes transitioned global -> thread - size_t global_to_thread; //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; + //! Number of bytes transitioned thread -> global + atomic64_t thread_to_global; + //! Number of bytes transitioned global -> thread + atomic64_t global_to_thread; #endif }; +// Size class for defining a block size bucket struct size_class_t { //! Size of blocks in this class uint32_t block_size; @@ -410,17 +592,37 @@ struct size_class_t { _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); struct global_cache_t { - //! Cache list pointer - atomicptr_t cache; - //! Cache size - atomic32_t size; - //! ABA counter - atomic32_t counter; + //! Cache lock + atomic32_t lock; + //! Cache count + uint32_t count; +#if ENABLE_STATISTICS + //! Insert count + size_t insert_count; + //! Extract count + size_t extract_count; +#endif + //! Cached spans + span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; + //! Unlimited cache overflow + span_t* overflow; }; +//////////// +/// /// Global data +/// +////// + +//! Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + //! Initialized flag static int _rpmalloc_initialized; +//! Main thread ID +static uintptr_t _rpmalloc_main_thread_id; //! Configuration static rpmalloc_config_t _memory_config; //! Memory page size @@ -437,17 +639,15 @@ static size_t _memory_span_size_shift; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; #else -//! Hardwired span size (64KiB) -#define _memory_span_size (64 * 1024) -#define _memory_span_size_shift 16 -#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask #endif //! Number of spans to map in each map call static size_t _memory_span_map_count; -//! Number of spans to release from thread cache to global cache (single spans) -static size_t _memory_span_release_count; -//! Number of spans to release from thread cache to global cache (large multiple spans) -static size_t _memory_span_release_count_large; +//! Number of spans to keep reserved in each heap +static size_t _memory_heap_reserve_count; //! Global size classes static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; //! Run-time size limit of medium blocks @@ -460,21 +660,37 @@ static int _memory_huge_pages; //! Global span cache static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif +//! Global reserved spans +static span_t* _memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t* _memory_global_reserve_master; //! All heaps -static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +static heap_t* _memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; //! 
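/* Editor's sketch: global_cache_t above (like _memory_global_lock just below)
   is guarded by a plain 0/1 spin lock, replacing the lock-free tagged list of
   earlier versions. Standalone C11 equivalent of the acquire/release pair;
   the real loop also calls _rpmalloc_spin() between attempts: */
#include <stdatomic.h>
static atomic_int global_lock_example;
static void global_lock_acquire_example(void) {
	int expected = 0;
	while (!atomic_compare_exchange_weak_explicit(&global_lock_example, &expected, 1,
	        memory_order_acquire, memory_order_relaxed))
		expected = 0; /* CAS wrote back the observed value; reset and retry */
}
static void global_lock_release_example(void) {
	atomic_store_explicit(&global_lock_example, 0, memory_order_release);
}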
Orphaned heaps -static atomicptr_t _memory_orphan_heaps; -//! Running orphan counter to avoid ABA issues in linked list -static atomic32_t _memory_orphan_counter; +static heap_t* _memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t* _memory_first_class_orphan_heaps; +#endif #if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; //! Active heap count static atomic32_t _memory_active_heaps; //! Number of currently mapped memory pages static atomic32_t _mapped_pages; //! Peak number of concurrently mapped memory pages static int32_t _mapped_pages_peak; -//! Number of currently unused spans -static atomic32_t _reserved_spans; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; //! Running counter of total number of mapped memory pages since start static atomic32_t _mapped_total; //! Running counter of total number of unmapped memory pages since start @@ -487,15 +703,25 @@ static atomic32_t _huge_pages_current; static int32_t _huge_pages_peak; #endif +//////////// +/// +/// Thread local heap and ID +/// +////// + //! Current thread heap -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) static pthread_key_t _memory_thread_heap; #else # ifdef _MSC_VER # define _Thread_local __declspec(thread) # define TLS_MODEL # else -# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# ifndef __HAIKU__ +# define TLS_MODEL __attribute__((tls_model("initial-exec"))) +# else +# define TLS_MODEL +# endif # if !defined(__clang__) && defined(__GNUC__) # define _Thread_local __thread # endif @@ -526,93 +752,355 @@ get_thread_heap(void) { #endif } +//! Fast thread ID +static inline uintptr_t +get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)((void*)NtCurrentTeb()); +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) + uintptr_t tid; +# if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); +# elif defined(__x86_64__) +# if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); +# else + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); +# endif +# elif defined(__arm__) + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); +# elif defined(__aarch64__) +# if defined(__MACH__) + // tpidr_el0 likely unused, always return 0 on iOS + __asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tid)); +# else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid)); +# endif +# else + tid = (uintptr_t)((void*)get_thread_heap_raw()); +# endif + return tid; +#else + return (uintptr_t)((void*)get_thread_heap_raw()); +#endif +} + //! Set the current thread heap static void set_thread_heap(heap_t* heap) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__) pthread_setspecific(_memory_thread_heap, heap); #else _memory_thread_heap = heap; #endif + if (heap) + heap->owner_thread = get_thread_id(); } -//! Default implementation to map more virtual memory -static void* -_memory_map_os(size_t size, size_t* offset); +//! Set main thread ID +extern void +rpmalloc_set_main_thread(void); + +void +rpmalloc_set_main_thread(void) { + _rpmalloc_main_thread_id = get_thread_id(); +} -//! 
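/* Editor's sketch of the portable fallback in get_thread_id() above: when no
   thread-register read is available, any thread-local address serves as a
   stable per-thread identifier, which is all the owner_thread bookkeeping
   needs. Standalone hypothetical equivalent: */
#include <stdint.h>
static _Thread_local int thread_probe_example;
static uintptr_t thread_id_fallback_example(void) {
	return (uintptr_t)&thread_probe_example; /* unique among live threads */
}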
Default implementation to unmap virtual memory static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); - -//! Lookup a memory heap from heap ID -static heap_t* -_memory_heap_lookup(int32_t id) { - uint32_t list_idx = id % HEAP_ARRAY_SIZE; - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap && (heap->id != id)) - heap = heap->next_heap; - return heap; +_rpmalloc_spin(void) { +#if defined(_MSC_VER) + _mm_pause(); +#elif defined(__x86_64__) || defined(__i386__) + __asm__ volatile("pause" ::: "memory"); +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) + __asm__ volatile("yield" ::: "memory"); +#elif defined(__powerpc__) || defined(__powerpc64__) + // No idea if ever been compiled in such archs but ... as precaution + __asm__ volatile("or 27,27,27"); +#elif defined(__sparc__) + __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); +#else + struct timespec ts = {0}; + nanosleep(&ts, 0); +#endif } -#if ENABLE_STATISTICS -# define _memory_statistics_inc(counter, value) counter += value -# define _memory_statistics_dec(counter, value) counter -= value -# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) -# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) -# define _memory_statistics_inc_alloc(heap, class_idx) do { \ - int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - heap->size_class_use[class_idx].alloc_total++; \ -} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ -} while(0) -#else -# define _memory_statistics_inc(counter, value) do {} while(0) -# define _memory_statistics_dec(counter, value) do {} while(0) -# define _memory_statistics_add(atomic_counter, value) do {} while(0) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) -# define _memory_statistics_sub(atomic_counter, value) do {} while(0) -# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +static void NTAPI +_rpmalloc_thread_destructor(void* value) { +#if ENABLE_OVERRIDE + // If this is called on main thread it means rpmalloc_finalize + // has not been called and shutdown is forced (through _exit) or unclean + if (get_thread_id() == _rpmalloc_main_thread_id) + return; +#endif + if (value) + rpmalloc_thread_finalize(1); +} #endif + +//////////// +/// +/// Low level memory map/unmap +/// +////// + static void -_memory_heap_cache_insert(heap_t* heap, span_t* span); +_rpmalloc_set_name(void* address, size_t size) { +#if defined(__linux__) || defined(__ANDROID__) + const char *name = _memory_huge_pages ? _memory_config.huge_page_name : _memory_config.page_name; + if (address == MAP_FAILED || !name) + return; + // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails + // (e.g. invalid name) it is a no-op basically. 
+ (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, (uintptr_t)name); +#else + (void)sizeof(size); + (void)sizeof(address); +#endif +} + //! Map more virtual memory +// size is number of bytes to map +// offset receives the offset in bytes from start of mapped region +// returns address to start of mapped region to use static void* -_memory_map(size_t size, size_t* offset) { - assert(!(size % _memory_page_size)); - assert(size >= _memory_page_size); - _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); - _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); - return _memory_config.memory_map(size, offset); +_rpmalloc_mmap(size_t size, size_t* offset) { + rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); + void* address = _memory_config.memory_map(size, offset); + if (EXPECTED(address != 0)) { + _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); + } + return address; } //! Unmap virtual memory +// address is the memory address to unmap, as returned from _memory_map +// size is the number of bytes to unmap, which might be less than full region for a partial unmap +// offset is the offset in bytes to the actual mapped region, as set by _memory_map +// release is set to 0 for partial unmap, or size of entire range for a full unmap static void -_memory_unmap(void* address, size_t size, size_t offset, size_t release) { - assert(!release || (release >= size)); - assert(!release || (release >= _memory_page_size)); +_rpmalloc_unmap(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); if (release) { - assert(!(release % _memory_page_size)); - _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); + _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); } _memory_config.memory_unmap(address, size, offset, release); } +//! Default implementation to map new pages to virtual memory +static void* +_rpmalloc_mmap_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? 
MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else { + rpmalloc_assert(ptr, "Failed to map virtual memory block"); + } + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# if defined(MADV_HUGEPAGE) + // In some configurations, huge pages allocations might fail thus + // we fallback to normal allocations and promote the region as transparent huge page + if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { + ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr && ptr != MAP_FAILED) { + int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); + (void)prm; + rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); + } + } +# endif + _rpmalloc_set_name(ptr, size + padding); +# elif defined(MAP_ALIGNED) + const size_t align = (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); +# elif defined(MAP_ALIGN) + caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); + void* ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else if (errno != ENOMEM) { + rpmalloc_assert((ptr != MAP_FAILED) && ptr, "Failed to map virtual memory block"); + } + return 0; + } +#endif + _rpmalloc_stat_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + rpmalloc_assert(final_padding <= _memory_span_size, "Internal failure in padding"); + rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); + rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + rpmalloc_assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask), "Internal failure in padding"); + return ptr; +} + +//! 
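/* Editor's sketch of the padding logic in _rpmalloc_mmap_os above: map one
   extra span of bytes, advance the returned pointer to the next span
   boundary, and record how far it moved, stored divided by 8 (hence the
   "offset <<= 3" in the unmap path). One way to compute the step, assuming
   span_mask = ~(span_size - 1) as with _memory_span_mask: */
#include <stddef.h>
#include <stdint.h>
static void* align_to_span_example(void* mapped, uintptr_t span_mask, size_t* offset) {
	uintptr_t p = (uintptr_t)mapped;
	size_t padding = (size_t)((0 - p) & ~span_mask); /* bytes up to the next boundary */
	*offset = padding >> 3; /* compressed offset; padding is always 8-byte aligned */
	return (void*)(p + padding);
}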
Default implementation to unmap pages from virtual memory +static void +_rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { + rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), "Invalid unmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); + if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) { + //Padding is always one span size + release += _memory_span_size; + } + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t* span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new heap allocation, not thread safe) +static void +_rpmalloc_global_set_reserved_spans(span_t* master, span_t* reserve, size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void +_rpmalloc_span_double_link_list_add(span_t** head, span_t* span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void +_rpmalloc_span_double_link_list_pop_head(span_t** head, span_t* span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! 
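/* Editor's sketch: how the double-linked-list helpers in this section compose,
   using a hypothetical node with the same next/prev shape as span_t: */
typedef struct span_node_example {
	struct span_node_example* next;
	struct span_node_example* prev;
} span_node_example;
static void list_demo_example(void) {
	span_node_example a = {0, 0}, b = {0, 0};
	span_node_example* head = 0;
	/* add at head, as _rpmalloc_span_double_link_list_add does: */
	if (head) head->prev = &a;
	a.next = head; head = &a;
	if (head) head->prev = &b;
	b.next = head; head = &b;  /* list is now b -> a */
	/* pop head, as _rpmalloc_span_double_link_list_pop_head does: */
	head = head->next;         /* list is now just a */
	(void)head;
}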
Remove a span from double linked list +static void +_rpmalloc_span_double_link_list_remove(span_t** head, span_t* span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t* next_span = span->next; + span_t* prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + + +//////////// +/// +/// Span control +/// +////// + +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span); + +static void +_rpmalloc_heap_finalize(heap_t* heap); + +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count); + //! Declare the span to be a subspan and store distance from master span and span count static void -_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { - assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), "Span master pointer and/or flag mismatch"); if (subspan != master) { subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); subspan->align_offset = 0; } subspan->span_count = (uint32_t)span_count; @@ -620,496 +1108,178 @@ _memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size //! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) static span_t* -_memory_map_from_reserve(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) { //Update the heap span reserve span_t* span = heap->span_reserve; heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= span_count; + heap->spans_reserved -= (uint32_t)span_count; - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); return span; } //! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size static size_t -_memory_map_align_span_count(size_t span_count) { +_rpmalloc_span_align_count(size_t span_count) { size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) - request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); return request_count; } -//! Store the given spans as reserve in the given heap -static void -_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = reserve_span_count; -} - //! 
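/* Editor's sketch of _rpmalloc_span_align_count above, parametrized so the
   rounding is easy to exercise in isolation: the request is raised to at
   least the configured map count, then padded toward a multiple of it
   whenever spans are smaller than a page (huge-page configurations), so map
   calls stay page-aligned: */
#include <stddef.h>
static size_t span_align_count_example(size_t span_count, size_t map_count,
                                       size_t span_size, size_t page_size) {
	size_t request = (span_count > map_count) ? span_count : map_count;
	if ((page_size > span_size) && ((request * span_size) % page_size))
		request += map_count - (request % map_count);
	return request;
}
/* e.g. span_align_count_example(3, 64, 64 * 1024, 4096) == 64 */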
Setup a newly mapped span static void -_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { - span->total_spans_or_distance = (uint32_t)total_span_count; +_rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; span->span_count = (uint32_t)span_count; span->align_offset = (uint32_t)align_offset; span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } -//! Map a akigned set of spans, taking configured mapping granularity and the page size into account +static void +_rpmalloc_span_unmap(span_t* span); + +//! Map an aligned set of spans, taking configured mapping granularity and the page size into account static span_t* -_memory_map_aligned_span_count(heap_t* heap, size_t span_count) { +_rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { //If we already have some, but not enough, reserved spans, release those to heap cache and map a new //full set of spans. Otherwise we would waste memory if page size > span size (huge pages) - size_t aligned_span_count = _memory_map_align_span_count(span_count); + size_t aligned_span_count = _rpmalloc_span_align_count(span_count); size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(aligned_span_count * _memory_span_size, &align_offset); if (!span) return 0; - _memory_span_initialize(span, aligned_span_count, span_count, align_offset); - _memory_statistics_add(&_reserved_spans, aligned_span_count); + _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset); + _rpmalloc_stat_inc(&_master_spans); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1); + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls); if (aligned_span_count > span_count) { + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + size_t reserved_count = aligned_span_count - span_count; if (heap->spans_reserved) { - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); - _memory_heap_cache_insert(heap, heap->span_reserve); + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); + _rpmalloc_heap_cache_insert(heap, heap->span_reserve); } - _memory_heap_set_reserved_spans(heap, span, (span_t*)pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count); + if (reserved_count > _memory_heap_reserve_count) { + // If huge pages or eager spam map count, the global reserve spin lock is held by caller, _rpmalloc_span_map + rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1, "Global spin lock not held as expected"); + size_t remain_count = reserved_count - _memory_heap_reserve_count; + reserved_count = _memory_heap_reserve_count; + span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size); + if (_memory_global_reserve) { + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, _memory_global_reserve, _memory_global_reserve_count); + _rpmalloc_span_unmap(_memory_global_reserve); + } + _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count); + } + 
_rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } return span; } //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* -_memory_map_spans(heap_t* heap, size_t span_count) { +_rpmalloc_span_map(heap_t* heap, size_t span_count) { if (span_count <= heap->spans_reserved) - return _memory_map_from_reserve(heap, span_count); - return _memory_map_aligned_span_count(heap, span_count); + return _rpmalloc_span_map_from_reserve(heap, span_count); + span_t* span = 0; + int use_global_reserve = (_memory_page_size > _memory_span_size) || (_memory_span_map_count > _memory_heap_reserve_count); + if (use_global_reserve) { + // If huge pages, make sure only one thread maps more memory to avoid bloat + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (_memory_global_reserve_count >= span_count) { + size_t reserve_count = (!heap->spans_reserved ? _memory_heap_reserve_count : span_count); + if (_memory_global_reserve_count < reserve_count) + reserve_count = _memory_global_reserve_count; + span = _rpmalloc_global_get_reserved_spans(reserve_count); + if (span) { + if (reserve_count > span_count) { + span_t* reserved_span = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, reserved_span, reserve_count - span_count); + } + // Already marked as subspan in _rpmalloc_global_get_reserved_spans + span->span_count = (uint32_t)span_count; + } + } + } + if (!span) + span = _rpmalloc_span_map_aligned_count(heap, span_count); + if (use_global_reserve) + atomic_store32_release(&_memory_global_lock, 0); + return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) static void -_memory_unmap_span(span_t* span) { - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); +_rpmalloc_span_unmap(span_t* span) { + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (span_t*)(pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); - assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(master->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? 
span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); + rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); size_t span_count = span->span_count; if (!is_master) { //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) - assert(span->align_offset == 0); - if (_memory_span_size >= _memory_page_size) { - _memory_unmap(span, span_count * _memory_span_size, 0, 0); - _memory_statistics_sub(&_reserved_spans, span_count); - } + rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); + if (_memory_span_size >= _memory_page_size) + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); } else { //Special double flag to denote an unmapped master //It must be kept in memory since span header must be used - span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; + _rpmalloc_stat_add(&_unmapped_master_spans, 1); } if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span - assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted"); size_t unmap_count = master->span_count; if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans_or_distance; - _memory_statistics_sub(&_reserved_spans, unmap_count); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_stat_sub(&_unmapped_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); } } -#if ENABLE_THREAD_CACHE - -//! Unmap a single linked list of spans -static void -_memory_unmap_span_list(span_t* span) { - size_t list_size = span->list_size; - for (size_t ispan = 0; ispan < list_size; ++ispan) { - span_t* next_span = span->next; - _memory_unmap_span(span); - span = next_span; - } - assert(!span); -} - -//! Add span to head of single linked span list -static size_t -_memory_span_list_push(span_t** head, span_t* span) { - span->next = *head; - if (*head) - span->list_size = (*head)->list_size + 1; - else - span->list_size = 1; - *head = span; - return span->list_size; -} - -//! Remove span from head of single linked span list, returns the new list head -static span_t* -_memory_span_list_pop(span_t** head) { - span_t* span = *head; - span_t* next_span = 0; - if (span->list_size > 1) { - assert(span->next); - next_span = span->next; - assert(next_span); - next_span->list_size = span->list_size - 1; - } - *head = next_span; - return span; -} - -//! 
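/* Editor's sketch: the legacy cache list removed in this hunk kept the list
   length in the head span (list_size), making push O(1) and the cache-limit
   check a single compare. Minimal equivalent with a hypothetical node: */
typedef struct cache_node_example {
	struct cache_node_example* next;
	unsigned list_size; /* meaningful in the head node only */
} cache_node_example;
static unsigned cache_push_example(cache_node_example** head, cache_node_example* n) {
	n->next = *head;
	n->list_size = *head ? (*head)->list_size + 1 : 1;
	*head = n;
	return n->list_size; /* caller compares this against the cache limit */
}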
Split a single linked span list -static span_t* -_memory_span_list_split(span_t* span, size_t limit) { - span_t* next = 0; - if (limit < 2) - limit = 2; - if (span->list_size > limit) { - uint32_t list_size = 1; - span_t* last = span; - next = span->next; - while (list_size < limit) { - last = next; - next = next->next; - ++list_size; - } - last->next = 0; - assert(next); - next->list_size = span->list_size - list_size; - span->list_size = list_size; - span->prev = 0; - } - return next; -} - -#endif - -//! Add a span to partial span double linked list at the head -static void -_memory_span_partial_list_add(span_t** head, span_t* span) { - if (*head) { - span->next = *head; - //Maintain pointer to tail span - span->prev = (*head)->prev; - (*head)->prev = span; - } else { - span->next = 0; - span->prev = span; - } - *head = span; -} - -//! Add a span to partial span double linked list at the tail -static void -_memory_span_partial_list_add_tail(span_t** head, span_t* span) { - span->next = 0; - if (*head) { - span_t* tail = (*head)->prev; - tail->next = span; - span->prev = tail; - //Maintain pointer to tail span - (*head)->prev = span; - } else { - span->prev = span; - *head = span; - } -} - -//! Pop head span from partial span double linked list -static void -_memory_span_partial_list_pop_head(span_t** head) { - span_t* span = *head; - *head = span->next; - if (*head) { - //Maintain pointer to tail span - (*head)->prev = span->prev; - } -} - -//! Remove a span from partial span double linked list -static void -_memory_span_partial_list_remove(span_t** head, span_t* span) { - if (UNEXPECTED(*head == span)) { - _memory_span_partial_list_pop_head(head); - } else { - span_t* next_span = span->next; - span_t* prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) { - next_span->prev = prev_span; - } else { - //Update pointer to tail span - (*head)->prev = prev_span; - } - } -} - -#if ENABLE_GLOBAL_CACHE - -//! Insert the given list of memory page spans in the global cache -static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { - assert((span->list_size == 1) || (span->next != 0)); - int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit - if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { -#if !ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_unmap_span_list(span); - atomic_add32(&cache->size, -list_size); - return; -#endif - } - void* current_cache, *new_cache; - do { - current_cache = atomic_load_ptr(&cache->cache); - span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); -} - -//! 
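
The partial-span list helpers being removed here use a compact trick worth noting: the head node's prev pointer doubles as a tail pointer, so both head and tail insertion are O(1) with only two link fields per node. A standalone sketch with an illustrative node type (not the patch's span_t):

#include <stddef.h>

typedef struct node_t {
	struct node_t* next;
	struct node_t* prev; /* in the head node this caches the list tail */
} node_t;

static void
list_push_head(node_t** head, node_t* node) {
	if (*head) {
		node->next = *head;
		node->prev = (*head)->prev; /* inherit the cached tail pointer */
		(*head)->prev = node;
	} else {
		node->next = NULL;
		node->prev = node; /* a single node is its own tail */
	}
	*head = node;
}

Tail insertion then reads the tail straight out of (*head)->prev, exactly as _memory_span_partial_list_add_tail above does.
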
Extract a number of memory page spans from the global cache -static span_t* -_memory_cache_extract(global_cache_t* cache) { - uintptr_t span_ptr; - do { - void* global_span = atomic_load_ptr(&cache->cache); - span_ptr = (uintptr_t)global_span & _memory_span_mask; - if (span_ptr) { - span_t* span = (span_t*)span_ptr; - //By accessing the span ptr before it is swapped out of list we assume that a contending thread - //does not manage to traverse the span to being unmapped before we access it - void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { - atomic_add32(&cache->size, -(int32_t)span->list_size); - return span; - } - } - } while (span_ptr); - return 0; -} - -//! Finalize a global cache, only valid from allocator finalization (not thread safe) -static void -_memory_cache_finalize(global_cache_t* cache) { - void* current_cache = atomic_load_ptr(&cache->cache); - span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - while (span) { - span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); - atomic_add32(&cache->size, -(int32_t)span->list_size); - _memory_unmap_span_list(span); - span = skip_span; - } - assert(!atomic_load32(&cache->size)); - atomic_store_ptr(&cache->cache, 0); - atomic_store32(&cache->size, 0); -} - -//! Insert the given list of memory page spans in the global cache -static void -_memory_global_cache_insert(span_t* span) { - size_t span_count = span->span_count; -#if ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); -#else - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); -#endif -} - -//! Extract a number of memory page spans from the global cache for large blocks -static span_t* -_memory_global_cache_extract(size_t span_count) { - span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); - assert(!span || (span->span_count == span_count)); - return span; -} - -#endif - -#if ENABLE_THREAD_CACHE -//! Adopt the deferred span cache list -static void -_memory_heap_cache_adopt_deferred(heap_t* heap) { - atomic_thread_fence_acquire(); - span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - if (!span) - return; - do { - span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); - while (span) { - span_t* next_span = span->next; - _memory_span_list_push(&heap->span_cache[0], span); -#if ENABLE_STATISTICS - atomic_decr32(&heap->span_use[span->span_count - 1].current); - ++heap->size_class_use[span->size_class].spans_to_cache; - --heap->size_class_use[span->size_class].spans_current; -#endif - span = next_span; - } -} -#endif - -//! Insert a single span into thread heap cache, releasing to global cache if overflow -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span) { -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - size_t idx = span_count - 1; - _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); - if (!idx) - _memory_heap_cache_adopt_deferred(heap); -#if ENABLE_UNLIMITED_THREAD_CACHE - _memory_span_list_push(&heap->span_cache[idx], span); -#else - const size_t release_count = (!idx ? 
_memory_span_release_count : _memory_span_release_count_large); - size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); - if (current_cache_size <= release_count) - return; - const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; - if (current_cache_size <= hard_limit) { -#if ENABLE_ADAPTIVE_THREAD_CACHE - //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) - const size_t high_mark = heap->span_use[idx].high; - const size_t min_limit = (high_mark >> 2) + release_count + 1; - if (current_cache_size < min_limit) - return; -#else - return; -#endif - } - heap->span_cache[idx] = _memory_span_list_split(span, release_count); - assert(span->list_size == release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_to_global += span->list_size; -#endif -#if ENABLE_GLOBAL_CACHE - _memory_global_cache_insert(span); -#else - _memory_unmap_span_list(span); -#endif -#endif -#else - (void)sizeof(heap); - _memory_unmap_span(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t* -_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_THREAD_CACHE - size_t idx = span_count - 1; - if (!idx) - _memory_heap_cache_adopt_deferred(heap); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->span_use[idx].spans_from_cache++; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -static span_t* -_memory_heap_reserved_extract(heap_t* heap, size_t span_count) { - if (heap->spans_reserved >= span_count) - return _memory_map_spans(heap, span_count); - return 0; -} - -//! Extract a span from the global cache -static span_t* -_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_GLOBAL_CACHE - size_t idx = span_count - 1; - heap->span_cache[idx] = _memory_global_cache_extract(span_count); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); - } -#endif - return 0; -} - -//! 
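
The adaptive branch being deleted above keeps at least a quarter of the observed high-water mark in the thread cache before overflowing to the global cache: with release_count = 16 and a high mark of 64 spans, anything under (64 >> 2) + 16 + 1 = 33 cached spans stays put. A standalone sketch of that decision, with illustrative names:

#include <stddef.h>

/* Nonzero when a thread cache holding current_size spans should spill
 * a batch to the global cache (mirrors the removed heuristic). */
static int
cache_should_release(size_t current_size, size_t release_count,
                     size_t high_water_mark, size_t multiplier) {
	if (current_size <= release_count)
		return 0; /* under the soft limit: always keep */
	if (current_size > release_count * multiplier)
		return 1; /* hard limit exceeded: must spill */
	/* adaptive band: retain 25% of the high water mark plus slack */
	return current_size >= ((high_water_mark >> 2) + release_count + 1);
}
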
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory -static span_t* -_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > heap->span_use[idx].high) - heap->span_use[idx].high = current_count; -#if ENABLE_STATISTICS - uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; - if (spans_current > heap->size_class_use[class_idx].spans_peak) - heap->size_class_use[class_idx].spans_peak = spans_current; -#endif -#endif - span_t* span = _memory_heap_thread_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - span = _memory_heap_reserved_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); - return span; - } - span = _memory_heap_global_cache_extract(heap, span_count); - if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); - return span; - } - //Final fallback, map in more virtual memory - span = _memory_map_spans(heap, span_count); - _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); - return span; -} - //! Move the span (used for small or medium allocations) to the heap thread cache static void -_memory_span_release_to_cache(heap_t* heap, span_t* span) { - heap_class_t* heap_class = heap->span_class + span->size_class; - assert(heap_class->partial_span != span); - if (span->state == SPAN_STATE_PARTIAL) - _memory_span_partial_list_remove(&heap_class->partial_span, span); +_rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { + rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); + rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, "Invalid span size class"); + rpmalloc_assert(span->span_count == 1, "Invalid span count"); #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif - _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); - _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); - _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); - _memory_heap_cache_insert(heap, span); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; + } else { + _rpmalloc_span_unmap(span); + } } //! Initialize a (partial) free list up to next system memory page, while reserving the first block //! 
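
The rewritten _rpmalloc_span_release_to_cache above introduces a one-element cache per size class: the most recently freed span sits in heap->size_class[...].cache, and only the span it displaces drops into the general thread cache. A standalone sketch of that single-slot pattern (the types and the spill callback are illustrative):

/* One-slot cache: the hottest span stays in *slot; the previous
 * occupant is demoted through spill(). */
typedef struct span span; /* opaque in this sketch */

static void
one_slot_cache_insert(span** slot, span* incoming, void (*spill)(span*)) {
	if (*slot)
		spill(*slot); /* demote the older span */
	*slot = incoming; /* newest free is the likeliest to be reused */
}

The allocation side (_rpmalloc_heap_extract_new_span, later in this hunk) drains this slot first, which keeps the common free-then-allocate sequence on a two-pointer fast path.
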
as allocated, returning number of blocks in list static uint32_t -free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, - uint32_t block_count, uint32_t block_size) { - assert(block_count); +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, uint32_t block_count, uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); *first_block = block_start; if (block_count > 1) { void* free_block = pointer_offset(block_start, block_size); - void* block_end = pointer_offset(block_start, block_size * block_count); + void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); //If block size is less than half a memory page, bound init to next memory page boundary if (block_size < (_memory_page_size >> 1)) { void* page_end = pointer_offset(page_start, _memory_page_size); @@ -1132,75 +1302,799 @@ free_list_partial_init(void** list, void** first_block, void* page_start, void* return block_count; } -//! Initialize an unused span (from cache or mapped) to be new active span +//! Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list static void* -_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { - assert(span->span_count == 1); +_rpmalloc_span_initialize_new(heap_t* heap, heap_size_class_t* heap_size_class, span_t* span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); size_class_t* size_class = _memory_size_class + class_idx; span->size_class = class_idx; span->heap = heap; span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_count = size_class->block_count; span->block_size = size_class->block_size; - span->state = SPAN_STATE_ACTIVE; + span->block_count = size_class->block_count; span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); //Setup free list. Only initialize one system page worth of free blocks in list void* block; - span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, + span->free_list_limit = free_list_partial_init(&heap_size_class->free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); - atomic_store_ptr(&span->free_list_deferred, 0); - span->list_size = 0; - atomic_thread_fence_release(); - - _memory_span_partial_list_add(&heap_class->partial_span, span); + //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized + if (span->free_list_limit < span->block_count) { + _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span); + span->used_count = span->free_list_limit; + } else { +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + span->used_count = span->block_count; + } return block; } -//! Promote a partially used span (from heap used list) to be new active span static void -_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_PARTIAL); - assert(span->block_count == _memory_size_class[span->size_class].block_count); - //Move data to heap size class and set span as active - heap_class->free_list = span->free_list; - span->state = SPAN_STATE_ACTIVE; - span->free_list = 0; - assert(heap_class->free_list); -} - -//! 
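
free_list_partial_init above deliberately threads at most one page worth of blocks into the free list, so a fresh span does not touch (and fault in) its entire range up front. The core linking step, reduced to a standalone sketch (hypothetical helper; count must be at least 1):

#include <stddef.h>

/* Thread `count` fixed-size blocks starting at `first` into a singly
 * linked free list; each block's first word points at the next block. */
static void*
link_blocks(void* first, size_t block_size, size_t count) {
	char* block = (char*)first;
	for (size_t i = 1; i < count; ++i) {
		*(void**)block = block + block_size;
		block += block_size;
	}
	*(void**)block = NULL; /* terminate the list */
	return first;
}
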
Mark span as full (from active) -static void -_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_ACTIVE); - assert(span == heap_class->partial_span); - _memory_span_partial_list_pop_head(&heap_class->partial_span); - span->used_count = span->block_count; - span->state = SPAN_STATE_FULL; - span->free_list = 0; -} - -//! Move span from full to partial state -static void -_memory_span_set_full_partial(heap_t* heap, span_t* span) { - assert(span->state == SPAN_STATE_FULL); - heap_class_t* heap_class = &heap->span_class[span->size_class]; - span->state = SPAN_STATE_PARTIAL; - _memory_span_partial_list_add_tail(&heap_class->partial_span, span); -} - -static void* -_memory_span_extract_deferred(span_t* span) { - void* free_list; +_rpmalloc_span_extract_free_list_deferred(span_t* span) { + // We need acquire semantics on the CAS operation since we are interested in the list size + // Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency do { - free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (span->free_list == INVALID_POINTER); + span->used_count -= span->list_size; span->list_size = 0; - atomic_store_ptr(&span->free_list_deferred, 0); - atomic_thread_fence_release(); - return free_list; + atomic_store_ptr_release(&span->free_list_deferred, 0); } +static int +_rpmalloc_span_is_fully_utilized(span_t* span) { + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span free list corrupted"); + return !span->free_list && (span->free_list_limit >= span->block_count); +} + +static int +_rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { + void* free_list = heap->size_class[iclass].free_list; + span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask); + if (span == class_span) { + // Adopt the heap class free list back into the span free list + void* block = span->free_list; + void* last_block = 0; + while (block) { + last_block = block; + block = *((void**)block); + } + uint32_t free_count = 0; + block = free_list; + while (block) { + ++free_count; + block = *((void**)block); + } + if (last_block) { + *((void**)last_block) = free_list; + } else { + span->free_list = free_list; + } + heap->size_class[iclass].free_list = 0; + span->used_count -= free_count; + } + //If this assert triggers you have memory leaks + rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected"); + if (span->list_size == span->used_count) { + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); + // This function only used for spans in double linked lists + if (list_head) + _rpmalloc_span_double_link_list_remove(list_head, span); + _rpmalloc_span_unmap(span); + return 1; + } + return 0; +} + + +//////////// +/// +/// Global cache +/// +////// + +#if ENABLE_GLOBAL_CACHE + +//! 
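
_rpmalloc_span_extract_free_list_deferred above claims the whole cross-thread free list in one atomic exchange, parking a sentinel in the slot so concurrent producers back off until the adoption finishes. A standalone C11 sketch of that protocol; the sentinel value and names are illustrative stand-ins for INVALID_POINTER:

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

#define LIST_BUSY ((void*)(uintptr_t)-1) /* sentinel, akin to INVALID_POINTER */

/* Consumer: atomically take the entire deferred list, leaving the slot
 * empty afterwards; retries while another party holds the sentinel. */
static void*
deferred_list_take(_Atomic(void*)* slot) {
	void* list;
	do {
		list = atomic_exchange_explicit(slot, LIST_BUSY, memory_order_acquire);
	} while (list == LIST_BUSY);
	atomic_store_explicit(slot, NULL, memory_order_release);
	return list;
}

The acquire ordering on the exchange is what makes the subsequent read of the list size safe, as the comment in the patch notes.
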
Finalize a global cache
+static void
+_rpmalloc_global_cache_finalize(global_cache_t* cache) {
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		_rpmalloc_spin();
+
+	for (size_t ispan = 0; ispan < cache->count; ++ispan)
+		_rpmalloc_span_unmap(cache->span[ispan]);
+	cache->count = 0;
+
+	while (cache->overflow) {
+		span_t* span = cache->overflow;
+		cache->overflow = span->next;
+		_rpmalloc_span_unmap(span);
+	}
+
+	atomic_store32_release(&cache->lock, 0);
+}
+
+static void
+_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) {
+	const size_t cache_limit = (span_count == 1) ?
+		GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE :
+		GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t insert_count = count;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		_rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+	cache->insert_count += count;
+#endif
+	if ((cache->count + insert_count) > cache_limit)
+		insert_count = cache_limit - cache->count;
+
+	memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count);
+	cache->count += (uint32_t)insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+	while (insert_count < count) {
+#else
+	// Enable unlimited cache if huge pages, or we will leak since it is unlikely that an entire huge page
+	// will be unmapped, and we're unable to partially decommit a huge page
+	while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
+#endif
+		span_t* current_span = span[insert_count++];
+		current_span->next = cache->overflow;
+		cache->overflow = current_span;
+	}
+	atomic_store32_release(&cache->lock, 0);
+
+	span_t* keep = 0;
+	for (size_t ispan = insert_count; ispan < count; ++ispan) {
+		span_t* current_span = span[ispan];
+		// Keep master spans that have remaining subspans to avoid dangling them
+		if ((current_span->flags & SPAN_FLAG_MASTER) &&
+		    (atomic_load32(&current_span->remaining_spans) > (int32_t)current_span->span_count)) {
+			current_span->next = keep;
+			keep = current_span;
+		} else {
+			_rpmalloc_span_unmap(current_span);
+		}
+	}
+
+	if (keep) {
+		while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+			_rpmalloc_spin();
+
+		size_t islot = 0;
+		while (keep) {
+			for (; islot < cache->count; ++islot) {
+				span_t* current_span = cache->span[islot];
+				if (!(current_span->flags & SPAN_FLAG_MASTER) || ((current_span->flags & SPAN_FLAG_MASTER) &&
+				    (atomic_load32(&current_span->remaining_spans) <= (int32_t)current_span->span_count))) {
+					_rpmalloc_span_unmap(current_span);
+					cache->span[islot] = keep;
+					break;
+				}
+			}
+			if (islot == cache->count)
+				break;
+			keep = keep->next;
+		}
+
+		if (keep) {
+			span_t* tail = keep;
+			while (tail->next)
+				tail = tail->next;
+			tail->next = cache->overflow;
+			cache->overflow = keep;
+		}
+
+		atomic_store32_release(&cache->lock, 0);
+	}
+}
+
+static size_t
+_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) {
+	global_cache_t* cache = &_memory_span_cache[span_count - 1];
+
+	size_t extract_count = 0;
+	while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+		_rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+	cache->extract_count += count;
+#endif
+	size_t want = count - extract_count;
+	if (want > cache->count)
+		want = cache->count;
+
+	memcpy(span + extract_count, cache->span + (cache->count - want), sizeof(span_t*) * want);
+	cache->count -= (uint32_t)want;
+	extract_count += want;
+
+	while ((extract_count < count) && cache->overflow) {
+		span_t* current_span = 
cache->overflow; + span[extract_count++] = current_span; + cache->overflow = current_span->next; + } + +#if ENABLE_ASSERTS + for (size_t ispan = 0; ispan < extract_count; ++ispan) { + assert(span[ispan]->span_count == span_count); + } +#endif + + atomic_store32_release(&cache->lock, 0); + + return extract_count; +} + +#endif + +//////////// +/// +/// Heap control +/// +////// + +static void _rpmalloc_deallocate_huge(span_t*); + +//! Store the given spans as reserve in the given heap +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use +static void +_rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { + span_t* span = (span_t*)((void*)atomic_exchange_ptr_acquire(&heap->span_free_deferred, 0)); + while (span) { + span_t* next_span = (span_t*)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void +_rpmalloc_heap_unmap(heap_t* heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t* span = (span_t*)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void +_rpmalloc_heap_global_finalize(heap_t* heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || 
heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + //Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t* list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if overflow +static void +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t* span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t* span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! 
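
The insert path above treats the thread cache as a bounded stack and, on overflow, ships a fixed batch of the newest spans to the global cache in one locked operation instead of trickling them out one at a time. A standalone sketch with placeholder capacity and batch sizes standing in for the MAX_THREAD_SPAN_CACHE / THREAD_SPAN_CACHE_TRANSFER limits used above:

#include <stddef.h>

enum { CACHE_MAX = 400, CACHE_TRANSFER = 64 }; /* placeholders */

typedef struct bounded_cache_t {
	size_t count;
	void*  item[CACHE_MAX];
} bounded_cache_t;

static void
bounded_cache_insert(bounded_cache_t* cache, void* span,
                     void (*spill)(void** batch, size_t n)) {
	cache->item[cache->count++] = span;
	if (cache->count == CACHE_MAX) {
		const size_t keep = CACHE_MAX - CACHE_TRANSFER;
		spill(cache->item + keep, CACHE_TRANSFER); /* ship newest batch */
		cache->count = keep;
	}
}

Batching like this amortizes the global cache lock over a whole transfer instead of paying for it per span.
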
Extract the given number of spans from the different cache levels +static span_t* +_rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t* +_rpmalloc_heap_thread_cache_deferred_extract(heap_t* heap, size_t span_count) { + span_t* span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t* +_rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! Extract a span from the global cache +static span_t* +_rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t* span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void +_rpmalloc_inc_span_statistics(heap_t* heap, size_t span_count, uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory +static span_t* +_rpmalloc_heap_extract_new_span(heap_t* heap, heap_size_class_t* heap_size_class, size_t span_count, uint32_t class_idx) { + span_t* span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = (heap->span_cache.count ? 
heap->span_cache.span[--heap->span_cache.count] : 0);
+		_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+		return span;
+	}
+#endif
+	(void)sizeof(class_idx);
+	// Allow 50% overhead to increase cache hits
+	size_t base_span_count = span_count;
+	size_t limit_span_count = (span_count > 2) ? (span_count + (span_count >> 1)) : span_count;
+	if (limit_span_count > LARGE_CLASS_COUNT)
+		limit_span_count = LARGE_CLASS_COUNT;
+	do {
+		span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_reserved_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		span = _rpmalloc_heap_global_cache_extract(heap, span_count);
+		if (EXPECTED(span != 0)) {
+			_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+			_rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+			return span;
+		}
+		++span_count;
+	} while (span_count <= limit_span_count);
+	//Final fallback, map in more virtual memory
+	span = _rpmalloc_span_map(heap, base_span_count);
+	_rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
+	_rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
+	return span;
+}
+
+static void
+_rpmalloc_heap_initialize(heap_t* heap) {
+	memset(heap, 0, sizeof(heap_t));
+	//Get a new heap ID
+	heap->id = 1 + atomic_incr32(&_memory_heap_id);
+
+	//Link in heap in heap ID map
+	size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE;
+	heap->next_heap = _memory_heaps[list_idx];
+	_memory_heaps[list_idx] = heap;
+}
+
+static void
+_rpmalloc_heap_orphan(heap_t* heap, int first_class) {
+	heap->owner_thread = (uintptr_t)-1;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	heap_t** heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
+#else
+	(void)sizeof(first_class);
+	heap_t** heap_list = &_memory_orphan_heaps;
+#endif
+	heap->next_orphan = *heap_list;
+	*heap_list = heap;
+}
+
+//! Allocate a new heap from newly mapped memory pages
+static heap_t*
+_rpmalloc_heap_allocate_new(void) {
+	// Map in pages for 16 heaps. If page size is greater than required size for this, map a page and
+	// use first part for heaps and remaining part for spans for allocations. 
Adds a lot of complexity, + // but saves a lot of memory on systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t* span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid reallocating a huge page just to serve new heaps + size_t possible_heap_count = (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t*)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t* heap = (heap_t*)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count : remain_count); + span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t*)pointer_offset(remain_span, reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t* +_rpmalloc_heap_extract_orphan(heap_t** heap_list) { + heap_t* heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
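
_rpmalloc_heap_allocate_new above amortizes heap creation by carving one mapped span into a whole batch of heap objects: the first one is returned to the caller and the rest are pre-linked as orphans for future threads. The carving step as a standalone sketch (the names and the adopt callback are illustrative):

#include <stddef.h>

/* Split one mapped block into `count` objects of `obj_size` bytes after
 * a `header` byte prefix; the first object is returned, the rest are
 * handed to adopt() (rpmalloc links them on an orphan list). */
static void*
carve_block(void* block, size_t header, size_t obj_size, size_t count,
            void (*adopt)(void*)) {
	char* first = (char*)block + header;
	for (size_t i = 1; i < count; ++i)
		adopt(first + i * obj_size);
	return first;
}
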
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t* +_rpmalloc_heap_allocate(int first_class) { + heap_t* heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +static void +_rpmalloc_heap_release(void* heapptr, int first_class, int release_cache) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + //Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void +_rpmalloc_heap_release_raw(void* heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void +_rpmalloc_heap_release_raw_fc(void* heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void +_rpmalloc_heap_finalize(heap_t* heap) { + if (heap->spans_reserved) { + span_t* span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t* span = heap->size_class[iclass].partial_span; + while (span) { + span_t* next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t* class_span = (span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask); + span_t** list = 0; +#if 
RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), "Heaps still active during finalization"); +} + + +//////////// +/// +/// Allocation entry points +/// +////// + //! Pop first block from a free list static void* free_list_pop(void** list) { @@ -1211,84 +2105,85 @@ free_list_pop(void** list) { //! Allocate a small/medium sized memory block from the given heap static void* -_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { - heap_class_t* heap_class = &heap->span_class[class_idx]; - void* block; - - span_t* active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - assert(active_span->state == SPAN_STATE_ACTIVE); - assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); - //Swap in free list if not empty - if (active_span->free_list) { - heap_class->free_list = active_span->free_list; - active_span->free_list = 0; - return free_list_pop(&heap_class->free_list); - } - //If the span did not fully initialize free list, link up another page worth of blocks - if (active_span->free_list_limit < active_span->block_count) { - void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); - active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, +_rpmalloc_allocate_from_heap_fallback(heap_t* heap, heap_size_class_t* heap_size_class, uint32_t class_idx) { + span_t* span = heap_size_class->partial_span; + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == _memory_size_class[span->size_class].block_count, "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), "Internal failure"); + void* block; + if (span->free_list) { + //Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + //If the span did not fully initialize free list, link up another page worth of blocks + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init(&heap_size_class->free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, - active_span->block_count - active_span->free_list_limit, active_span->block_size); + span->block_count - span->free_list_limit, span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, "Span block count corrupted"); + span->used_count = span->free_list_limit; + + //Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + _rpmalloc_span_extract_free_list_deferred(span); + + //If span is still not fully utilized keep it in partial 
list and early return block + if (!_rpmalloc_span_is_fully_utilized(span)) return block; - } - //Swap in deferred free list - atomic_thread_fence_acquire(); - if (atomic_load_ptr(&active_span->free_list_deferred)) { - heap_class->free_list = _memory_span_extract_deferred(active_span); - return free_list_pop(&heap_class->free_list); - } - //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) - assert(!heap_class->free_list); - assert(active_span->free_list_limit >= active_span->block_count); - _memory_span_set_active_full(heap_class, active_span); + //The span is fully utilized, unlink from partial list and add to fully utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; } - assert(!heap_class->free_list); - - //Try promoting a semi-used span to active - active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - _memory_span_set_partial_active(heap_class, active_span); - return free_list_pop(&heap_class->free_list); - } - assert(!heap_class->free_list); - assert(!heap_class->partial_span); //Find a span in one of the cache levels - active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + //Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, class_idx); + } - //Mark span as owned by this heap and set base data, return first block - return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); + return 0; } //! Allocate a small sized memory block from the given heap static void* -_memory_allocate_small(heap_t* heap, size_t size) { +_rpmalloc_allocate_small(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! 
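
Small-size lookup above is pure arithmetic: round the request up to the allocation granularity and shift, with no search involved. Assuming the usual 16-byte granularity (the actual SMALL_GRANULARITY constants are defined earlier in the file), a 24-byte request maps to class (24 + 15) >> 4 = 2, the 32-byte class. A sketch under that assumption:

#include <stddef.h>
#include <stdint.h>

#define GRANULARITY_SHIFT 4                  /* assumed: 16-byte steps */
#define GRANULARITY (1u << GRANULARITY_SHIFT)

static inline uint32_t
small_class_index(size_t size) {
	return (uint32_t)((size + (GRANULARITY - 1)) >> GRANULARITY_SHIFT);
}
/* small_class_index(1) == 1 (16 B class), small_class_index(24) == 2 (32 B class) */
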
Allocate a medium sized memory block from the given heap static void* -_memory_allocate_medium(heap_t* heap, size_t size) { +_rpmalloc_allocate_medium(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + heap_size_class_t* heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, class_idx); } //! Allocate a large sized memory block from the given heap static void* -_memory_allocate_large(heap_t* heap, size_t size) { +_rpmalloc_allocate_large(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT //the span_count is guaranteed to be <= LARGE_CLASS_COUNT @@ -1296,928 +2191,71 @@ _memory_allocate_large(heap_t* heap, size_t size) { size_t span_count = size >> _memory_span_size_shift; if (size & (_memory_span_size - 1)) ++span_count; - size_t idx = span_count - 1; //Find a span in one of the cache levels - span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + span_t* span = _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; //Mark span as owned by this heap and set base data - assert(span->span_count == span_count); - span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; span->heap = heap; - atomic_thread_fence_release(); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } //! 
Allocate a huge block by mapping memory pages directly static void* -_memory_allocate_huge(size_t size) { +_rpmalloc_allocate_huge(heap_t* heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); if (!span) return span; + //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } -//! Allocate a block larger than medium size -static void* -_memory_allocate_oversized(heap_t* heap, size_t size) { - if (size <= LARGE_SIZE_LIMIT) - return _memory_allocate_large(heap, size); - return _memory_allocate_huge(size); -} - //! Allocate a block of the given size static void* -_memory_allocate(heap_t* heap, size_t size) { +_rpmalloc_allocate(heap_t* heap, size_t size) { + _rpmalloc_stat_add64(&_allocation_counter, 1); if (EXPECTED(size <= SMALL_SIZE_LIMIT)) - return _memory_allocate_small(heap, size); + return _rpmalloc_allocate_small(heap, size); else if (size <= _memory_medium_size_limit) - return _memory_allocate_medium(heap, size); - return _memory_allocate_oversized(heap, size); + return _rpmalloc_allocate_medium(heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _rpmalloc_allocate_large(heap, size); + return _rpmalloc_allocate_huge(heap, size); } -//! Allocate a new heap -static heap_t* -_memory_allocate_heap(void) { - void* raw_heap; - void* next_raw_heap; - uintptr_t orphan_counter; - heap_t* heap; - heap_t* next_heap; - //Try getting an orphaned heap - atomic_thread_fence_acquire(); - do { - raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); - if (!heap) - break; - next_heap = heap->next_orphan; - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - - if (!heap) { - //Map in pages for a new heap - size_t align_offset = 0; - heap = (heap_t*)_memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); - if (!heap) - return heap; - memset((char*)heap, 0, sizeof(heap_t)); - heap->align_offset = align_offset; - - //Get a new heap ID - do { - heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; - } while (!heap->id); - - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); - } - - return heap; -} - -//! 
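
The dispatcher in _rpmalloc_allocate above is a three-comparison tier check; everything else hangs off which tier the size lands in. A standalone sketch with placeholder limits (the real small/medium/large limits derive from granularity, span size and page size elsewhere in the file):

#include <stddef.h>

typedef enum { TIER_SMALL, TIER_MEDIUM, TIER_LARGE, TIER_HUGE } tier_t;

enum {
	SMALL_LIMIT  = 2048,           /* placeholder values, not rpmalloc's */
	MEDIUM_LIMIT = 32768,
	LARGE_LIMIT  = 2 * 1024 * 1024
};

static tier_t
size_tier(size_t size) {
	if (size <= SMALL_LIMIT)  return TIER_SMALL;  /* per-class free lists */
	if (size <= MEDIUM_LIMIT) return TIER_MEDIUM; /* merged size classes  */
	if (size <= LARGE_LIMIT)  return TIER_LARGE;  /* one or more spans    */
	return TIER_HUGE;                             /* direct page mapping  */
}
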
Deallocate the given small/medium memory block in the current thread local heap -static void -_memory_deallocate_direct(span_t* span, void* block) { - assert(span->heap == get_thread_heap_raw()); - uint32_t state = span->state; - //Add block to free list - *((void**)block) = span->free_list; - span->free_list = block; - if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) - return; - uint32_t used = --span->used_count; - uint32_t free = span->list_size; - if (UNEXPECTED(used == free)) - _memory_span_release_to_cache(span->heap, span); - else if (UNEXPECTED(state == SPAN_STATE_FULL)) - _memory_span_set_full_partial(span->heap, span); -} - -//! Put the block in the deferred free list of the owning span -static void -_memory_deallocate_defer(span_t* span, void* block) { - atomic_thread_fence_acquire(); - if (span->state == SPAN_STATE_FULL) { - if ((span->list_size + 1) == span->block_count) { - //Span will be completely freed by deferred deallocations, no other thread can - //currently touch it. Safe to move to owner heap deferred cache - span_t* last_head; - heap_t* heap = span->heap; - do { - last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - span->next = last_head; - } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); - return; - } - } - - void* free_list; - do { - atomic_thread_fence_acquire(); - free_list = atomic_load_ptr(&span->free_list_deferred); - *((void**)block) = free_list; - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); - ++span->list_size; - atomic_store_ptr(&span->free_list_deferred, block); -} - -static void -_memory_deallocate_small_or_medium(span_t* span, void* p) { - _memory_statistics_inc_free(span->heap, span->size_class); - if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { - //Realign pointer to block start - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); - } - //Check if block belongs to this heap or if deallocation should be deferred - if (span->heap == get_thread_heap_raw()) - _memory_deallocate_direct(span, p); - else - _memory_deallocate_defer(span, p); -} - -//! 
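
The deferred path above is the producer half of a multi-producer free list: a foreign thread links its block in with a CAS loop and never touches the owner heap's structures directly. A standalone C11 sketch of that push (simplified; the patch's protocol additionally tracks a list size and a busy sentinel):

#include <stdatomic.h>
#include <stddef.h>

/* Push `block` onto a lock-free list; the block's first word is reused
 * as the next pointer, so blocks need no extra metadata. */
static void
deferred_free_push(_Atomic(void*)* list, void* block) {
	void* head = atomic_load_explicit(list, memory_order_relaxed);
	do {
		*(void**)block = head;
	} while (!atomic_compare_exchange_weak_explicit(
	             list, &head, block,
	             memory_order_release, memory_order_relaxed));
}
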
-//! Deallocate the given large memory block to the current heap
-static void
-_memory_deallocate_large(span_t* span) {
-	//Decrease counter
-	assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1));
-	assert(span->size_class >= SIZE_CLASS_COUNT);
-	assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT);
-	assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN));
-	assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN));
-	//Large blocks can always be deallocated and transferred between heaps
-	//Investigate if it is better to defer large spans as well through span_cache_deferred,
-	//possibly with some heuristics to pick either scheme at runtime per deallocation
-	heap_t* heap = get_thread_heap();
-	if (!heap) return;
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-	size_t idx = span->span_count - 1;
-	atomic_decr32(&span->heap->span_use[idx].current);
-#endif
-	if ((span->span_count > 1) && !heap->spans_reserved) {
-		heap->span_reserve = span;
-		heap->spans_reserved = span->span_count;
-		if (span->flags & SPAN_FLAG_MASTER) {
-			heap->span_reserve_master = span;
-		} else { //SPAN_FLAG_SUBSPAN
-			uint32_t distance = span->total_spans_or_distance;
-			span_t* master = (span_t*)pointer_offset(span, -(int32_t)(distance * _memory_span_size));
-			heap->span_reserve_master = master;
-			assert(master->flags & SPAN_FLAG_MASTER);
-			assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count);
-		}
-		_memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1);
-	} else {
-		//Insert into cache list
-		_memory_heap_cache_insert(heap, span);
-	}
-}
-
-//! Deallocate the given huge span
-static void
-_memory_deallocate_huge(span_t* span) {
-	//Oversized allocation, page count is stored in span_count
-	size_t num_pages = span->span_count;
-	_memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size);
-	_memory_statistics_sub(&_huge_pages_current, num_pages);
-}
-
-//! Deallocate the given block
-static void
-_memory_deallocate(void* p) {
-	//Grab the span (always at start of span, using span alignment)
-	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
-	if (UNEXPECTED(!span))
-		return;
-	if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
-		_memory_deallocate_small_or_medium(span, p);
-	else if (span->size_class != (uint32_t)-1)
-		_memory_deallocate_large(span);
-	else
-		_memory_deallocate_huge(span);
-}
-
-//! Reallocate the given block to the given size
-static void*
-_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) {
-	if (p) {
-		//Grab the span using guaranteed span alignment
-		span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
-		if (span->heap) {
-			if (span->size_class < SIZE_CLASS_COUNT) {
-				//Small/medium sized block
-				assert(span->span_count == 1);
-				void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-				uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
-				uint32_t block_idx = block_offset / span->block_size;
-				void* block = pointer_offset(blocks_start, block_idx * span->block_size);
-				if (!oldsize)
-					oldsize = span->block_size - (uint32_t)pointer_diff(p, block);
-				if ((size_t)span->block_size >= size) {
-					//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
-					if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-						memmove(block, p, oldsize);
-					return block;
-				}
-			} else {
-				//Large block
-				size_t total_size = size + SPAN_HEADER_SIZE;
-				size_t num_spans = total_size >> _memory_span_size_shift;
-				if (total_size & (_memory_span_mask - 1))
-					++num_spans;
-				size_t current_spans = span->span_count;
-				assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1));
-				void* block = pointer_offset(span, SPAN_HEADER_SIZE);
-				if (!oldsize)
-					oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-				if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) {
-					//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
-					if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-						memmove(block, p, oldsize);
-					return block;
-				}
-			}
-		} else {
-			//Oversized block
-			size_t total_size = size + SPAN_HEADER_SIZE;
-			size_t num_pages = total_size >> _memory_page_size_shift;
-			if (total_size & (_memory_page_size - 1))
-				++num_pages;
-			//Page count is stored in span_count
-			size_t current_pages = span->span_count;
-			void* block = pointer_offset(span, SPAN_HEADER_SIZE);
-			if (!oldsize)
-				oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
-			if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
-				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
-				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
-					memmove(block, p, oldsize);
-				return block;
-			}
-		}
-	} else {
-		oldsize = 0;
-	}
-
-	//Size is greater than block size, need to allocate a new block and deallocate the old
-	heap_t* heap = get_thread_heap();
-	//Avoid hysteresis by overallocating if increase is small (below 37%)
-	size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
-	size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
-	void* block = _memory_allocate(heap, new_size);
-	if (p && block) {
-		if (!(flags & RPMALLOC_NO_PRESERVE))
-			memcpy(block, p, oldsize < new_size ? oldsize : new_size);
-		_memory_deallocate(p);
-	}
-
-	return block;
-}
-
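/* Editor's note: worked example (outside the patch) of the 37% hysteresis rule
   in _memory_reallocate above: lower_bound = oldsize + oldsize/4 + oldsize/8
   = 1.375 * oldsize, so growing an 8000-byte block to 9000 bytes actually
   allocates 11000 bytes, avoiding a remap on every small increment. */
static size_t
example_grow_size(size_t oldsize, size_t size) {
	size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
	/* e.g. oldsize 8000 -> lower_bound 11000; requested size 9000 -> 11000 */
	return (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
}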
-//! Get the usable size of the given block
-static size_t
-_memory_usable_size(void* p) {
-	//Grab the span using guaranteed span alignment
-	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
-	if (span->heap) {
-		//Small/medium block
-		if (span->size_class < SIZE_CLASS_COUNT) {
-			void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
-			return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size);
-		}
-
-		//Large block
-		size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1;
-		return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
-	}
-
-	//Oversized block, page count is stored in span_count
-	size_t current_pages = span->span_count;
-	return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
-}
-
-//! Adjust and optimize the size class properties for the given class
-static void
-_memory_adjust_size_class(size_t iclass) {
-	size_t block_size = _memory_size_class[iclass].block_size;
-	size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size;
-
-	_memory_size_class[iclass].block_count = (uint16_t)block_count;
-	_memory_size_class[iclass].class_idx = (uint16_t)iclass;
-
-	//Check if previous size classes can be merged
-	size_t prevclass = iclass;
-	while (prevclass > 0) {
-		--prevclass;
-		//A class can be merged if number of pages and number of blocks are equal
-		if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
-			memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
-		else
-			break;
-	}
-}
-
-extern thread_local bool RpThreadShutdown;
-
-static void
-_memory_heap_finalize(void* heapptr) {
-	heap_t* heap = (heap_t*)heapptr;
-	if (!heap)
-		return;
-	RpThreadShutdown = true;
-	//Release thread cache spans back to global cache
-#if ENABLE_THREAD_CACHE
-	_memory_heap_cache_adopt_deferred(heap);
-	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-		span_t* span = heap->span_cache[iclass];
-#if ENABLE_GLOBAL_CACHE
-		while (span) {
-			assert(span->span_count == (iclass + 1));
-			size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large);
-			span_t* next = _memory_span_list_split(span, (uint32_t)release_count);
-#if ENABLE_STATISTICS
-			heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size;
-			heap->span_use[iclass].spans_to_global += span->list_size;
-#endif
-			_memory_global_cache_insert(span);
-			span = next;
-		}
-#else
-		if (span)
-			_memory_unmap_span_list(span);
-#endif
-		heap->span_cache[iclass] = 0;
-	}
-#endif
-
-	//Orphan the heap
-	void* raw_heap;
-	uintptr_t orphan_counter;
-	heap_t* last_heap;
-	do {
-		last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps);
-		heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF);
-		orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter);
-		raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF));
-	} while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap));
-
-	set_thread_heap(0);
-
-#if ENABLE_STATISTICS
-	atomic_decr32(&_memory_active_heaps);
-	assert(atomic_load32(&_memory_active_heaps) >= 0);
-#endif
-}
-
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-#include <fibersapi.h>
-static DWORD fls_key;
-static void NTAPI
-rp_thread_destructor(void* value) {
-	if (value)
-		rpmalloc_thread_finalize();
-}
-#endif
-
-#if PLATFORM_POSIX
-#  include <sys/mman.h>
-#  include <sched.h>
-#  ifdef __FreeBSD__
-#    include <sys/sysctl.h>
-#    define MAP_HUGETLB MAP_ALIGNED_SUPER
-#  endif
-#  ifndef MAP_UNINITIALIZED
-#    define MAP_UNINITIALIZED 0
-#  endif
-#endif
-#include <errno.h>
-
-//! Initialize the allocator and setup global data
-TRACY_API int
-rpmalloc_initialize(void) {
-	if (_rpmalloc_initialized) {
-		rpmalloc_thread_initialize();
-		return 0;
-	}
-	memset(&_memory_config, 0, sizeof(rpmalloc_config_t));
-	return rpmalloc_initialize_config(0);
-}
-
-int
-rpmalloc_initialize_config(const rpmalloc_config_t* config) {
-	if (_rpmalloc_initialized) {
-		rpmalloc_thread_initialize();
-		return 0;
-	}
-	_rpmalloc_initialized = 1;
-
-	if (config)
-		memcpy(&_memory_config, config, sizeof(rpmalloc_config_t));
-
-	if (!_memory_config.memory_map || !_memory_config.memory_unmap) {
-		_memory_config.memory_map = _memory_map_os;
-		_memory_config.memory_unmap = _memory_unmap_os;
-	}
-
-#if RPMALLOC_CONFIGURABLE
-	_memory_page_size = _memory_config.page_size;
-#else
-	_memory_page_size = 0;
-#endif
-	_memory_huge_pages = 0;
-	_memory_map_granularity = _memory_page_size;
-	if (!_memory_page_size) {
-#if PLATFORM_WINDOWS
-		SYSTEM_INFO system_info;
-		memset(&system_info, 0, sizeof(system_info));
-		GetSystemInfo(&system_info);
-		_memory_page_size = system_info.dwPageSize;
-		_memory_map_granularity = system_info.dwAllocationGranularity;
-		if (config && config->enable_huge_pages) {
-			HANDLE token = 0;
-			size_t large_page_minimum = GetLargePageMinimum();
-			if (large_page_minimum)
-				OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
-			if (token) {
-				LUID luid;
-				if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
-					TOKEN_PRIVILEGES token_privileges;
-					memset(&token_privileges, 0, sizeof(token_privileges));
-					token_privileges.PrivilegeCount = 1;
-					token_privileges.Privileges[0].Luid = luid;
-					token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
-					if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
-						DWORD err = GetLastError();
-						if (err == ERROR_SUCCESS) {
-							_memory_huge_pages = 1;
-							_memory_page_size = large_page_minimum;
-							_memory_map_granularity = large_page_minimum;
-						}
-					}
-				}
-				CloseHandle(token);
-			}
-		}
-#else
-		_memory_page_size = (size_t)sysconf(_SC_PAGESIZE);
-		_memory_map_granularity = _memory_page_size;
-		if (config && config->enable_huge_pages) {
-#if defined(__linux__)
-			size_t huge_page_size = 0;
-			FILE* meminfo = fopen("/proc/meminfo", "r");
-			if (meminfo) {
-				char line[128];
-				while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
-					line[sizeof(line) - 1] = 0;
-					if (strstr(line, "Hugepagesize:"))
-						huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
-				}
-				fclose(meminfo);
-			}
-			if (huge_page_size) {
-				_memory_huge_pages = 1;
-				_memory_page_size = huge_page_size;
-				_memory_map_granularity = huge_page_size;
-			}
-#elif defined(__FreeBSD__)
-			int rc;
-			size_t sz = sizeof(rc);
-
-			if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) {
-				_memory_huge_pages = 1;
-				_memory_page_size = 2 * 1024 * 1024;
-				_memory_map_granularity = _memory_page_size;
-			}
-#elif defined(__APPLE__)
-			_memory_huge_pages = 1;
-			_memory_page_size = 2 * 1024 * 1024;
-			_memory_map_granularity = _memory_page_size;
-#endif
-		}
-#endif
-	} else {
-		if (config && config->enable_huge_pages)
-			_memory_huge_pages = 1;
-	}
-
-	//The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF)
-	if (_memory_page_size < 512)
-		_memory_page_size = 512;
-	if (_memory_page_size > (64 * 1024 * 1024))
-		_memory_page_size = (64 * 1024 * 1024);
-	_memory_page_size_shift = 0;
-	size_t page_size_bit = _memory_page_size;
-	while (page_size_bit != 1) {
-		++_memory_page_size_shift;
-		page_size_bit >>= 1;
-	}
-	_memory_page_size = ((size_t)1 << _memory_page_size_shift);
-
-#if RPMALLOC_CONFIGURABLE
-	size_t span_size = _memory_config.span_size;
-	if (!span_size)
-		span_size = (64 * 1024);
-	if (span_size > (256 * 1024))
-		span_size = (256 * 1024);
-	_memory_span_size = 4096;
-	_memory_span_size_shift = 12;
-	while (_memory_span_size < span_size) {
-		_memory_span_size <<= 1;
-		++_memory_span_size_shift;
-	}
-	_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
-#endif
-
-	_memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT);
-	if ((_memory_span_size * _memory_span_map_count) < _memory_page_size)
-		_memory_span_map_count = (_memory_page_size / _memory_span_size);
-	if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size))
-		_memory_span_map_count = (_memory_page_size / _memory_span_size);
-
-	_memory_config.page_size = _memory_page_size;
-	_memory_config.span_size = _memory_span_size;
-	_memory_config.span_map_count = _memory_span_map_count;
-	_memory_config.enable_huge_pages = _memory_huge_pages;
-
-	_memory_span_release_count = (_memory_span_map_count > 4 ? ((_memory_span_map_count < 64) ? _memory_span_map_count : 64) : 4);
-	_memory_span_release_count_large = (_memory_span_release_count > 8 ? (_memory_span_release_count / 4) : 2);
-
-#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
-	if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize))
-		return -1;
-#endif
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-	fls_key = FlsAlloc(&rp_thread_destructor);
-#endif
-
-	atomic_store32(&_memory_heap_id, 0);
-	atomic_store32(&_memory_orphan_counter, 0);
-#if ENABLE_STATISTICS
-	atomic_store32(&_memory_active_heaps, 0);
-	atomic_store32(&_reserved_spans, 0);
-	atomic_store32(&_mapped_pages, 0);
-	_mapped_pages_peak = 0;
-	atomic_store32(&_mapped_total, 0);
-	atomic_store32(&_unmapped_total, 0);
-	atomic_store32(&_mapped_pages_os, 0);
-	atomic_store32(&_huge_pages_current, 0);
-	_huge_pages_peak = 0;
-#endif
-
-	//Setup all small and medium size classes
-	size_t iclass = 0;
-	_memory_size_class[iclass].block_size = SMALL_GRANULARITY;
-	_memory_adjust_size_class(iclass);
-	for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) {
-		size_t size = iclass * SMALL_GRANULARITY;
-		_memory_size_class[iclass].block_size = (uint32_t)size;
-		_memory_adjust_size_class(iclass);
-	}
-	//At least two blocks per span, then fall back to large allocations
-	_memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1;
-	if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT)
-		_memory_medium_size_limit = MEDIUM_SIZE_LIMIT;
-	for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) {
-		size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY);
-		if (size > _memory_medium_size_limit)
-			break;
-		_memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size;
-		_memory_adjust_size_class(SMALL_CLASS_COUNT + iclass);
-	}
-
-	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx)
-		atomic_store_ptr(&_memory_heaps[list_idx], 0);
-
-	//Initialize this thread
-	rpmalloc_thread_initialize();
-	return 0;
-}
-
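/* Editor's note: small illustration (not part of the patch) of the page-size
   normalization performed in rpmalloc_initialize_config above: the value is
   clamped and then reduced to its floor power of two by counting shifts down
   to the highest set bit, e.g. a reported page size of 5000 yields shift 12
   and a normalized page size of 4096. */
static size_t
example_normalize_page_size(size_t page_size) {
	size_t shift = 0;
	size_t bit = page_size;
	while (bit != 1) { /* find the highest set bit */
		++shift;
		bit >>= 1;
	}
	return (size_t)1 << shift; /* floor power of two */
}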
-//! Finalize the allocator
-TRACY_API void
-rpmalloc_finalize(void) {
-	atomic_thread_fence_acquire();
-
-	rpmalloc_thread_finalize();
-	//rpmalloc_dump_statistics(stderr);
-
-	//Free all thread caches
-	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]);
-		while (heap) {
-			if (heap->spans_reserved) {
-				span_t* span = _memory_map_spans(heap, heap->spans_reserved);
-				_memory_unmap_span(span);
-			}
-
-			for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-				heap_class_t* heap_class = heap->span_class + iclass;
-				span_t* span = heap_class->partial_span;
-				while (span) {
-					span_t* next = span->next;
-					if (span->state == SPAN_STATE_ACTIVE) {
-						uint32_t used_blocks = span->block_count;
-						if (span->free_list_limit < span->block_count)
-							used_blocks = span->free_list_limit;
-						uint32_t free_blocks = 0;
-						void* block = heap_class->free_list;
-						while (block) {
-							++free_blocks;
-							block = *((void**)block);
-						}
-						block = span->free_list;
-						while (block) {
-							++free_blocks;
-							block = *((void**)block);
-						}
-						if (used_blocks == (free_blocks + span->list_size))
-							_memory_heap_cache_insert(heap, span);
-					} else {
-						if (span->used_count == span->list_size)
-							_memory_heap_cache_insert(heap, span);
-					}
-					span = next;
-				}
-			}
-
-#if ENABLE_THREAD_CACHE
-			//Free span caches (other thread might have deferred after the thread using this heap finalized)
-			_memory_heap_cache_adopt_deferred(heap);
-			for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-				if (heap->span_cache[iclass])
-					_memory_unmap_span_list(heap->span_cache[iclass]);
-			}
-#endif
-			heap_t* next_heap = heap->next_heap;
-			size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size;
-			_memory_unmap(heap, heap_size, heap->align_offset, heap_size);
-			heap = next_heap;
-		}
-	}
-
-#if ENABLE_GLOBAL_CACHE
-	//Free global caches
-	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
-		_memory_cache_finalize(&_memory_span_cache[iclass]);
-#endif
-
-	atomic_store_ptr(&_memory_orphan_heaps, 0);
-	atomic_thread_fence_release();
-
-#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
-	pthread_key_delete(_memory_thread_heap);
-#endif
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-	FlsFree(fls_key);
-#endif
-
-#if ENABLE_STATISTICS
-	//If you hit these asserts you probably have memory leaks or double frees in your code
-	assert(!atomic_load32(&_mapped_pages));
-	assert(!atomic_load32(&_reserved_spans));
-	assert(!atomic_load32(&_mapped_pages_os));
-#endif
-
-	_rpmalloc_initialized = 0;
-}
-
-//! Initialize thread, assign heap
-TRACY_API void
-rpmalloc_thread_initialize(void) {
-	if (!get_thread_heap_raw()) {
-		heap_t* heap = _memory_allocate_heap();
-		if (heap) {
-			atomic_thread_fence_acquire();
-#if ENABLE_STATISTICS
-			atomic_incr32(&_memory_active_heaps);
-#endif
-			set_thread_heap(heap);
-#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
-			FlsSetValue(fls_key, heap);
-#endif
-		}
-	}
-}
-
-//! Finalize thread, orphan heap
-TRACY_API void
-rpmalloc_thread_finalize(void) {
-	heap_t* heap = get_thread_heap_raw();
-	if (heap)
-		_memory_heap_finalize(heap);
-}
-
-int
-rpmalloc_is_thread_initialized(void) {
-	return (get_thread_heap_raw() != 0) ? 1 : 0;
-}
-
-const rpmalloc_config_t*
-rpmalloc_config(void) {
-	return &_memory_config;
-}
-
-//! Map new pages to virtual memory
-static void*
-_memory_map_os(size_t size, size_t* offset) {
-	//Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity
-	size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0;
-	assert(size >= _memory_page_size);
-#if PLATFORM_WINDOWS
-	//Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed"
-	void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
-	if (!ptr) {
-		assert(!"Failed to map virtual memory block");
-		return 0;
-	}
-#else
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
-#  if defined(__APPLE__)
-	int fd = (int)VM_MAKE_TAG(240U);
-	if (_memory_huge_pages)
-		fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
-#  elif defined(MAP_HUGETLB)
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0);
-#  else
-	void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
-#  endif
-	if ((ptr == MAP_FAILED) || !ptr) {
-		assert("Failed to map virtual memory block" == 0);
-		return 0;
-	}
-#endif
-#if ENABLE_STATISTICS
-	atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift));
-#endif
-	if (padding) {
-		size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
-		assert(final_padding <= _memory_span_size);
-		assert(final_padding <= padding);
-		assert(!(final_padding % 8));
-		ptr = pointer_offset(ptr, final_padding);
-		*offset = final_padding >> 3;
-	}
-	assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask));
-	return ptr;
-}
-
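/* Editor's note: simplified sketch (not the patch's exact bookkeeping) of the
   over-allocate-and-align scheme in _memory_map_os above: mapping an extra
   span guarantees a span-aligned address exists inside the mapping; the
   distance skipped is stored in 8-byte units so unmapping can recover the
   raw base address later. */
static void*
example_align_to_span(void* raw, size_t span_size, size_t* offset) {
	size_t misalign = (size_t)((uintptr_t)raw & (uintptr_t)(span_size - 1));
	size_t padding = misalign ? (span_size - misalign) : 0; /* bytes to skip */
	*offset = padding >> 3; /* stored divided by 8, as in the code above */
	return (void*)((char*)raw + padding);
}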
-//! Unmap pages from virtual memory
-static void
-_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) {
-	assert(release || (offset == 0));
-	assert(!release || (release >= _memory_page_size));
-	assert(size >= _memory_page_size);
-	if (release && offset) {
-		offset <<= 3;
-		address = pointer_offset(address, -(int32_t)offset);
-#if PLATFORM_POSIX
-		//Padding is always one span size
-		release += _memory_span_size;
-#endif
-	}
-#if !DISABLE_UNMAP
-#if PLATFORM_WINDOWS
-	if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) {
-		assert(!"Failed to unmap virtual memory block");
-	}
-#else
-	if (release) {
-		if (munmap(address, release)) {
-			assert("Failed to unmap virtual memory block" == 0);
-		}
-	}
-	else {
-#if defined(POSIX_MADV_FREE)
-		if (posix_madvise(address, size, POSIX_MADV_FREE))
-#endif
-#if defined(POSIX_MADV_DONTNEED)
-		if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) {
-			assert("Failed to madvise virtual memory block as free" == 0);
-		}
-#endif
-	}
-#endif
-#endif
-#if ENABLE_STATISTICS
-	if (release)
-		atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift));
-#endif
-}
-
-// Extern interface
-
-TRACY_API RPMALLOC_ALLOCATOR void*
-rpmalloc(size_t size) {
-#if ENABLE_VALIDATE_ARGS
-	if (size >= MAX_ALLOC_SIZE) {
-		errno = EINVAL;
-		return 0;
-	}
-#endif
-	heap_t* heap = get_thread_heap();
-	return _memory_allocate(heap, size);
-}
-
-TRACY_API void
-rpfree(void* ptr) {
-	_memory_deallocate(ptr);
-}
-
-extern inline RPMALLOC_ALLOCATOR void*
-rpcalloc(size_t num, size_t size) {
-	size_t total;
-#if ENABLE_VALIDATE_ARGS
-#if PLATFORM_WINDOWS
-	int err = SizeTMult(num, size, &total);
-	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
-		errno = EINVAL;
-		return 0;
-	}
-#else
-	int err = __builtin_umull_overflow(num, size, &total);
-	if (err || (total >= MAX_ALLOC_SIZE)) {
-		errno = EINVAL;
-		return 0;
-	}
-#endif
-#else
-	total = num * size;
-#endif
-	heap_t* heap = get_thread_heap();
-	void* block = _memory_allocate(heap, total);
-	memset(block, 0, total);
-	return block;
-}
-
-TRACY_API RPMALLOC_ALLOCATOR void*
-rprealloc(void* ptr, size_t size) {
-#if ENABLE_VALIDATE_ARGS
-	if (size >= MAX_ALLOC_SIZE) {
-		errno = EINVAL;
-		return ptr;
-	}
-#endif
-	return _memory_reallocate(ptr, size, 0, 0);
-}
-
-extern RPMALLOC_ALLOCATOR void*
-rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize,
-                  unsigned int flags) {
-#if ENABLE_VALIDATE_ARGS
-	if ((size + alignment < size) || (alignment > _memory_page_size)) {
-		errno = EINVAL;
-		return 0;
-	}
-#endif
-	void* block;
-	if (alignment > 32) {
-		size_t usablesize = _memory_usable_size(ptr);
-		if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1)))
-			return ptr;
-
-		block = rpaligned_alloc(alignment, size);
-		if (ptr) {
-			if (!oldsize)
-				oldsize = usablesize;
-			if (!(flags & RPMALLOC_NO_PRESERVE))
-				memcpy(block, ptr, oldsize < size ? oldsize : size);
-			rpfree(ptr);
-		}
-		//Mark as having aligned blocks
-		span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask);
-		span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
-	} else {
-		block = _memory_reallocate(ptr, size, oldsize, flags);
-	}
-	return block;
-}
-
 extern RPMALLOC_ALLOCATOR void*
-rpaligned_alloc(size_t alignment, size_t size) {
-	if (alignment <= 16)
-		return rpmalloc(size);
+_rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) {
+	if (alignment <= SMALL_GRANULARITY)
+		return _rpmalloc_allocate(heap, size);
 
 #if ENABLE_VALIDATE_ARGS
 	if ((size + alignment) < size) {
@@ -2230,15 +2268,26 @@ rpaligned_alloc(size_t alignment, size_t size) {
 	}
 #endif
 
+	if ((alignment <= SPAN_HEADER_SIZE) && (size < _memory_medium_size_limit)) {
+		// If alignment is less than or equal to the span header size (which is a power of two),
+		// and the size rounded up to a multiple of the span header size is no more than size + alignment,
+		// then the natural alignment of blocks provides the requested alignment
+		size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE;
+		rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE), "Failed alignment calculation");
+		if (multiple_size <= (size + alignment))
+			return _rpmalloc_allocate(heap, multiple_size);
+	}
+
 	void* ptr = 0;
 	size_t align_mask = alignment - 1;
-	if (alignment < _memory_page_size) {
-		ptr = rpmalloc(size + alignment);
-		if ((uintptr_t)ptr & align_mask)
+	if (alignment <= _memory_page_size) {
+		ptr = _rpmalloc_allocate(heap, size + alignment);
+		if ((uintptr_t)ptr & align_mask) {
 			ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-		//Mark as having aligned blocks
-		span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask);
-		span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
+			//Mark as having aligned blocks
+			span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask);
+			span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
+		}
 		return ptr;
 	}
 
@@ -2282,7 +2331,7 @@ retry:
 	align_offset = 0;
 	mapped_size = num_pages * _memory_page_size;
 
-	span = (span_t*)_memory_map(mapped_size, &align_offset);
+	span = (span_t*)_rpmalloc_mmap(mapped_size, &align_offset);
 	if (!span) {
 		errno = ENOMEM;
 		return 0;
@@ -2295,7 +2344,7 @@ retry:
 	if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) {
-		_memory_unmap(span, mapped_size, align_offset, mapped_size);
+		_rpmalloc_unmap(span, mapped_size, align_offset, mapped_size);
 		++num_pages;
 		if (num_pages > limit_pages) {
 			errno = EINVAL;
 			return 0;
@@ -2305,14 +2354,774 @@ retry:
 
 	//Store page count in span_count
-	span->size_class = (uint32_t)-1;
+	span->size_class = SIZE_CLASS_HUGE;
 	span->span_count = (uint32_t)num_pages;
 	span->align_offset = (uint32_t)align_offset;
-	_memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
+	span->heap = heap;
+	_rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak);
+
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_add(&heap->large_huge_span, span);
+#endif
+	++heap->full_span_count;
+
+	_rpmalloc_stat_add64(&_allocation_counter, 1);
 
 	return ptr;
 }
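/* Editor's note: worked example (outside the patch) of the natural-alignment
   shortcut in _rpmalloc_aligned_allocate above. The macro value here is
   illustrative; SPAN_HEADER_SIZE is assumed to be a power of two. */
#define EXAMPLE_HEADER_SIZE 128
static size_t
example_round_to_header_multiple(size_t size) {
	if (!size)
		return EXAMPLE_HEADER_SIZE;
	return (size + (EXAMPLE_HEADER_SIZE - 1)) & ~(size_t)(EXAMPLE_HEADER_SIZE - 1);
}
/* A 100-byte request with 64-byte alignment rounds to 128; blocks in the
   128-byte size class start on 128-byte boundaries, so the requested
   alignment comes for free and no padded over-allocation is needed. */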
+
+////////////
+///
+/// Deallocation entry points
+///
+//////
+
+//! Deallocate the given small/medium memory block in the current thread local heap
+static void
+_rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) {
+	heap_t* heap = span->heap;
+	rpmalloc_assert(heap->owner_thread == get_thread_id() || !heap->owner_thread || heap->finalize, "Internal failure");
+	//Add block to free list
+	if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) {
+		span->used_count = span->block_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+		_rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span);
+#endif
+		_rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span);
+		--heap->full_span_count;
+	}
+	*((void**)block) = span->free_list;
+	--span->used_count;
+	span->free_list = block;
+	if (UNEXPECTED(span->used_count == span->list_size)) {
+		// If there are no used blocks it is guaranteed that no other external thread is accessing the span
+		if (span->used_count) {
+			// Make sure we have synchronized the deferred list and list size by using acquire semantics
+			// and guarantee that no external thread is accessing span concurrently
+			void* free_list;
+			do {
+				free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+			} while (free_list == INVALID_POINTER);
+			atomic_store_ptr_release(&span->free_list_deferred, free_list);
+		}
+		_rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span);
+		_rpmalloc_span_release_to_cache(heap, span);
+	}
+}
+
+static void
+_rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) {
+	if (span->size_class != SIZE_CLASS_HUGE)
+		_rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred);
+	//This list does not need ABA protection, no mutable side state
+	do {
+		span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred);
+	} while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list));
+}
+
+//! Put the block in the deferred free list of the owning span
+static void
+_rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) {
+	// The memory ordering here is a bit tricky, to avoid having to ABA protect
+	// the deferred free list to avoid desynchronization of list and list size
+	// we need to have acquire semantics on successful CAS of the pointer to
+	// guarantee the list_size variable validity + release semantics on pointer store
+	void* free_list;
+	do {
+		free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+	} while (free_list == INVALID_POINTER);
+	*((void**)block) = free_list;
+	uint32_t free_count = ++span->list_size;
+	int all_deferred_free = (free_count == span->block_count);
+	atomic_store_ptr_release(&span->free_list_deferred, block);
+	if (all_deferred_free) {
+		// Span was completely freed by this block. Due to the INVALID_POINTER spin lock
+		// no other thread can reach this state simultaneously on this span.
+		// Safe to move to owner heap deferred cache
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+	}
+}
+
+static void
+_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) {
+	_rpmalloc_stat_inc_free(span->heap, span->size_class);
+	if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) {
+		//Realign pointer to block start
+		void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+		uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
+		p = pointer_offset(p, -(int32_t)(block_offset % span->block_size));
+	}
+	//Check if block belongs to this heap or if deallocation should be deferred
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (!defer)
+		_rpmalloc_deallocate_direct_small_or_medium(span, p);
+	else
+		_rpmalloc_deallocate_defer_small_or_medium(span, p);
+}
+
+//! Deallocate the given large memory block to the current heap
+static void
+_rpmalloc_deallocate_large(span_t* span) {
+	rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class");
+	rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted");
+	rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN), "Span flag corrupted");
+	//We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (defer) {
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+		return;
+	}
+	rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
+	--span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+	//Decrease counter
+	size_t idx = span->span_count - 1;
+	atomic_decr32(&span->heap->span_use[idx].current);
+#endif
+	heap_t* heap = span->heap;
+	rpmalloc_assert(heap, "No thread heap");
+#if ENABLE_THREAD_CACHE
+	const int set_as_reserved = ((span->span_count > 1) && (heap->span_cache.count == 0) && !heap->finalize && !heap->spans_reserved);
+#else
+	const int set_as_reserved = ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved);
+#endif
+	if (set_as_reserved) {
+		heap->span_reserve = span;
+		heap->spans_reserved = span->span_count;
+		if (span->flags & SPAN_FLAG_MASTER) {
+			heap->span_reserve_master = span;
+		} else { //SPAN_FLAG_SUBSPAN
+			span_t* master = (span_t*)pointer_offset(span, -(intptr_t)((size_t)span->offset_from_master * _memory_span_size));
+			heap->span_reserve_master = master;
+			rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted");
+			rpmalloc_assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count, "Master span count corrupted");
+		}
+		_rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved);
+	} else {
+		//Insert into cache list
+		_rpmalloc_heap_cache_insert(heap, span);
+	}
+}
+
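/* Editor's note: minimal C11 sketch (not from the patch) of the sentinel-lock
   pattern used on free_list_deferred above: swapping in INVALID_POINTER with
   acquire semantics both locks the list head and synchronizes list_size; the
   release store publishes the new head and unlocks. Names are illustrative. */
#include <stdatomic.h>
#include <stdint.h>
#define EXAMPLE_INVALID ((void*)((uintptr_t)-1))
typedef struct { _Atomic(void*) head; unsigned size; } example_deferred_list_t;
static void
example_deferred_push(example_deferred_list_t* list, void* block) {
	void* head;
	do { /* spin while another thread holds the sentinel */
		head = atomic_exchange_explicit(&list->head, EXAMPLE_INVALID, memory_order_acquire);
	} while (head == EXAMPLE_INVALID);
	*((void**)block) = head; /* link in front of the old head */
	++list->size;            /* safe: protected by the sentinel */
	atomic_store_explicit(&list->head, block, memory_order_release);
}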
+//! Deallocate the given huge span
+static void
+_rpmalloc_deallocate_huge(span_t* span) {
+	rpmalloc_assert(span->heap, "No span heap");
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#else
+	int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize);
+#endif
+	if (defer) {
+		_rpmalloc_deallocate_defer_free_span(span->heap, span);
+		return;
+	}
+	rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted");
+	--span->heap->full_span_count;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span);
+#endif
+
+	//Oversized allocation, page count is stored in span_count
+	size_t num_pages = span->span_count;
+	_rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size);
+	_rpmalloc_stat_sub(&_huge_pages_current, num_pages);
+}
+
+//! Deallocate the given block
+static void
+_rpmalloc_deallocate(void* p) {
+	_rpmalloc_stat_add64(&_deallocation_counter, 1);
+	//Grab the span (always at start of span, using span alignment)
+	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
+	if (UNEXPECTED(!span))
+		return;
+	if (EXPECTED(span->size_class < SIZE_CLASS_COUNT))
+		_rpmalloc_deallocate_small_or_medium(span, p);
+	else if (span->size_class == SIZE_CLASS_LARGE)
+		_rpmalloc_deallocate_large(span);
+	else
+		_rpmalloc_deallocate_huge(span);
+}
+
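/* Editor's note: illustration (not part of the patch) of the constant-time
   owner lookup in _rpmalloc_deallocate above: spans are mapped at addresses
   aligned to the span size, so masking off the low bits of any block pointer
   yields its span header; no lookup table or per-block header is needed. */
#include <stdint.h>
static void*
example_span_of(void* p, uintptr_t span_size) {
	uintptr_t span_mask = ~(span_size - 1); /* span_size is a power of two */
	return (void*)((uintptr_t)p & span_mask);
}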
+////////////
+///
+/// Reallocation entry points
+///
+//////
+
+static size_t
+_rpmalloc_usable_size(void* p);
+
+//! Reallocate the given block to the given size
+static void*
+_rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) {
+	if (p) {
+		//Grab the span using guaranteed span alignment
+		span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
+		if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) {
+			//Small/medium sized block
+			rpmalloc_assert(span->span_count == 1, "Span counter corrupted");
+			void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+			uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start);
+			uint32_t block_idx = block_offset / span->block_size;
+			void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size);
+			if (!oldsize)
+				oldsize = (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block));
+			if ((size_t)span->block_size >= size) {
+				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+					memmove(block, p, oldsize);
+				return block;
+			}
+		} else if (span->size_class == SIZE_CLASS_LARGE) {
+			//Large block
+			size_t total_size = size + SPAN_HEADER_SIZE;
+			size_t num_spans = total_size >> _memory_span_size_shift;
+			if (total_size & (_memory_span_mask - 1))
+				++num_spans;
+			size_t current_spans = span->span_count;
+			void* block = pointer_offset(span, SPAN_HEADER_SIZE);
+			if (!oldsize)
+				oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+			if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) {
+				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+					memmove(block, p, oldsize);
+				return block;
+			}
+		} else {
+			//Oversized block
+			size_t total_size = size + SPAN_HEADER_SIZE;
+			size_t num_pages = total_size >> _memory_page_size_shift;
+			if (total_size & (_memory_page_size - 1))
+				++num_pages;
+			//Page count is stored in span_count
+			size_t current_pages = span->span_count;
+			void* block = pointer_offset(span, SPAN_HEADER_SIZE);
+			if (!oldsize)
+				oldsize = (current_pages * _memory_page_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+			if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
+				//Still fits in block, never mind trying to save memory, but preserve data if alignment changed
+				if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+					memmove(block, p, oldsize);
+				return block;
+			}
+		}
+	} else {
+		oldsize = 0;
+	}
+
+	if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+		return 0;
+
+	//Size is greater than block size, need to allocate a new block and deallocate the old
+	//Avoid hysteresis by overallocating if increase is small (below 37%)
+	size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
+	size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
+	void* block = _rpmalloc_allocate(heap, new_size);
+	if (p && block) {
+		if (!(flags & RPMALLOC_NO_PRESERVE))
+			memcpy(block, p, oldsize < new_size ? oldsize : new_size);
+		_rpmalloc_deallocate(p);
+	}
+
+	return block;
+}
+
+static void*
+_rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize,
+                             unsigned int flags) {
+	if (alignment <= SMALL_GRANULARITY)
+		return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
+
+	int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+	size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
+	if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
+		if (no_alloc || (size >= (usablesize / 2)))
+			return ptr;
+	}
+	// Aligned alloc marks span as having aligned blocks
+	void* block = (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
+	if (EXPECTED(block != 0)) {
+		if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
+			if (!oldsize)
+				oldsize = usablesize;
+			memcpy(block, ptr, oldsize < size ? oldsize : size);
+		}
+		_rpmalloc_deallocate(ptr);
+	}
+	return block;
+}
+
+
+////////////
+///
+/// Initialization, finalization and utility
+///
+//////
+
+//! Get the usable size of the given block
+static size_t
+_rpmalloc_usable_size(void* p) {
+	//Grab the span using guaranteed span alignment
+	span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask);
+	if (span->size_class < SIZE_CLASS_COUNT) {
+		//Small/medium block
+		void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+		return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size);
+	}
+	if (span->size_class == SIZE_CLASS_LARGE) {
+		//Large block
+		size_t current_spans = span->span_count;
+		return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
+	}
+	//Oversized block, page count is stored in span_count
+	size_t current_pages = span->span_count;
+	return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
+}
+
+//! Adjust and optimize the size class properties for the given class
+static void
+_rpmalloc_adjust_size_class(size_t iclass) {
+	size_t block_size = _memory_size_class[iclass].block_size;
+	size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size;
+
+	_memory_size_class[iclass].block_count = (uint16_t)block_count;
+	_memory_size_class[iclass].class_idx = (uint16_t)iclass;
+
+	//Check if previous size classes can be merged
+	if (iclass >= SMALL_CLASS_COUNT) {
+		size_t prevclass = iclass;
+		while (prevclass > 0) {
+			--prevclass;
+			//A class can be merged if number of pages and number of blocks are equal
+			if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count)
+				memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass]));
+			else
+				break;
+		}
+	}
+}
+
+//! Initialize the allocator and setup global data
+extern inline int
+rpmalloc_initialize(void) {
+	if (_rpmalloc_initialized) {
+		rpmalloc_thread_initialize();
+		return 0;
+	}
+	return rpmalloc_initialize_config(0);
+}
+
+int
+rpmalloc_initialize_config(const rpmalloc_config_t* config) {
+	if (_rpmalloc_initialized) {
+		rpmalloc_thread_initialize();
+		return 0;
+	}
+	_rpmalloc_initialized = 1;
+
+	if (config)
+		memcpy(&_memory_config, config, sizeof(rpmalloc_config_t));
+	else
+		memset(&_memory_config, 0, sizeof(rpmalloc_config_t));
+
+	if (!_memory_config.memory_map || !_memory_config.memory_unmap) {
+		_memory_config.memory_map = _rpmalloc_mmap_os;
+		_memory_config.memory_unmap = _rpmalloc_unmap_os;
+	}
+
+#if PLATFORM_WINDOWS
+	SYSTEM_INFO system_info;
+	memset(&system_info, 0, sizeof(system_info));
+	GetSystemInfo(&system_info);
+	_memory_map_granularity = system_info.dwAllocationGranularity;
+#else
+	_memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE);
+#endif
+
+#if RPMALLOC_CONFIGURABLE
+	_memory_page_size = _memory_config.page_size;
+#else
+	_memory_page_size = 0;
+#endif
+	_memory_huge_pages = 0;
+	if (!_memory_page_size) {
+#if PLATFORM_WINDOWS
+		_memory_page_size = system_info.dwPageSize;
+#else
+		_memory_page_size = _memory_map_granularity;
+		if (_memory_config.enable_huge_pages) {
+#if defined(__linux__)
+			size_t huge_page_size = 0;
+			FILE* meminfo = fopen("/proc/meminfo", "r");
+			if (meminfo) {
+				char line[128];
+				while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
+					line[sizeof(line) - 1] = 0;
+					if (strstr(line, "Hugepagesize:"))
+						huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
+				}
+				fclose(meminfo);
+			}
+			if (huge_page_size) {
+				_memory_huge_pages = 1;
+				_memory_page_size = huge_page_size;
+				_memory_map_granularity = huge_page_size;
+			}
+#elif defined(__FreeBSD__)
+			int rc;
+			size_t sz = sizeof(rc);
+
+			if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && rc == 1) {
+				_memory_huge_pages = 1;
+				_memory_page_size = 2 * 1024 * 1024;
+				_memory_map_granularity = _memory_page_size;
+			}
+#elif defined(__APPLE__) || defined(__NetBSD__)
+			_memory_huge_pages = 1;
+			_memory_page_size = 2 * 1024 * 1024;
+			_memory_map_granularity = _memory_page_size;
+#endif
+		}
+#endif
+	} else {
+		if (_memory_config.enable_huge_pages)
+			_memory_huge_pages = 1;
+	}
+
+#if PLATFORM_WINDOWS
+	if (_memory_config.enable_huge_pages) {
+		HANDLE token = 0;
+		size_t large_page_minimum = GetLargePageMinimum();
+		if (large_page_minimum)
+			OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+		if (token) {
+			LUID luid;
+			if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) {
+				TOKEN_PRIVILEGES token_privileges;
+				memset(&token_privileges, 0, sizeof(token_privileges));
+				token_privileges.PrivilegeCount = 1;
+				token_privileges.Privileges[0].Luid = luid;
+				token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+				if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) {
+					if (GetLastError() == ERROR_SUCCESS)
+						_memory_huge_pages = 1;
+				}
+			}
+			CloseHandle(token);
+		}
+		if (_memory_huge_pages) {
+			if (large_page_minimum > _memory_page_size)
+				_memory_page_size = large_page_minimum;
+			if (large_page_minimum > _memory_map_granularity)
+				_memory_map_granularity = large_page_minimum;
+		}
+	}
+#endif
+
+	size_t min_span_size = 256;
+	size_t max_page_size;
+#if UINTPTR_MAX > 0xFFFFFFFF
+	max_page_size = 4096ULL * 1024ULL * 1024ULL;
+#else
+	max_page_size = 4 * 1024 * 1024;
+#endif
+	if (_memory_page_size < min_span_size)
+		_memory_page_size = min_span_size;
+	if (_memory_page_size > max_page_size)
+		_memory_page_size = max_page_size;
+	_memory_page_size_shift = 0;
+	size_t page_size_bit = _memory_page_size;
+	while (page_size_bit != 1) {
+		++_memory_page_size_shift;
+		page_size_bit >>= 1;
+	}
+	_memory_page_size = ((size_t)1 << _memory_page_size_shift);
+
+#if RPMALLOC_CONFIGURABLE
+	if (!_memory_config.span_size) {
+		_memory_span_size = _memory_default_span_size;
+		_memory_span_size_shift = _memory_default_span_size_shift;
+		_memory_span_mask = _memory_default_span_mask;
+	} else {
+		size_t span_size = _memory_config.span_size;
+		if (span_size > (256 * 1024))
+			span_size = (256 * 1024);
+		_memory_span_size = 4096;
+		_memory_span_size_shift = 12;
+		while (_memory_span_size < span_size) {
+			_memory_span_size <<= 1;
+			++_memory_span_size_shift;
+		}
+		_memory_span_mask = ~(uintptr_t)(_memory_span_size - 1);
+	}
+#endif
+
+	_memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT);
+	if ((_memory_span_size * _memory_span_map_count) < _memory_page_size)
+		_memory_span_map_count = (_memory_page_size / _memory_span_size);
+	if ((_memory_page_size >= _memory_span_size) && ((_memory_span_map_count * _memory_span_size) % _memory_page_size))
+		_memory_span_map_count = (_memory_page_size / _memory_span_size);
+	_memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) ? DEFAULT_SPAN_MAP_COUNT : _memory_span_map_count;
+
+	_memory_config.page_size = _memory_page_size;
+	_memory_config.span_size = _memory_span_size;
+	_memory_config.span_map_count = _memory_span_map_count;
+	_memory_config.enable_huge_pages = _memory_huge_pages;
+
+#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || defined(__TINYC__)
+	if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc))
+		return -1;
+#endif
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	fls_key = FlsAlloc(&_rpmalloc_thread_destructor);
+#endif
+
+	//Setup all small and medium size classes
+	size_t iclass = 0;
+	_memory_size_class[iclass].block_size = SMALL_GRANULARITY;
+	_rpmalloc_adjust_size_class(iclass);
+	for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) {
+		size_t size = iclass * SMALL_GRANULARITY;
+		_memory_size_class[iclass].block_size = (uint32_t)size;
+		_rpmalloc_adjust_size_class(iclass);
+	}
+	//At least two blocks per span, then fall back to large allocations
+	_memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1;
+	if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT)
+		_memory_medium_size_limit = MEDIUM_SIZE_LIMIT;
+	for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) {
+		size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY);
+		if (size > _memory_medium_size_limit)
+			break;
+		_memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size;
+		_rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass);
+	}
+
+	_memory_orphan_heaps = 0;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+	_memory_first_class_orphan_heaps = 0;
+#endif
+#if ENABLE_STATISTICS
+	atomic_store32(&_memory_active_heaps, 0);
+	atomic_store32(&_mapped_pages, 0);
+	_mapped_pages_peak = 0;
+	atomic_store32(&_master_spans, 0);
+	atomic_store32(&_mapped_total, 0);
+	atomic_store32(&_unmapped_total, 0);
+	atomic_store32(&_mapped_pages_os, 0);
+	atomic_store32(&_huge_pages_current, 0);
+	_huge_pages_peak = 0;
+#endif
+	memset(_memory_heaps, 0, sizeof(_memory_heaps));
+	atomic_store32_release(&_memory_global_lock, 0);
+
+	//Initialize this thread
+	rpmalloc_thread_initialize();
+	return 0;
+}
+
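/* Editor's note: standalone sketch of the /proc/meminfo probe used by
   rpmalloc_initialize_config above; the kernel reports a line such as
   "Hugepagesize:    2048 kB", so the number after the label is parsed and
   scaled from KiB to bytes. The function name is illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static size_t
example_default_huge_page_size(void) {
	size_t huge_page_size = 0;
	FILE* meminfo = fopen("/proc/meminfo", "r");
	if (meminfo) {
		char line[128];
		while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) {
			line[sizeof(line) - 1] = 0;
			if (strstr(line, "Hugepagesize:"))
				huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024;
		}
		fclose(meminfo);
	}
	return huge_page_size; /* 0 when no huge page size is reported */
}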
+//! Finalize the allocator
+void
+rpmalloc_finalize(void) {
+	rpmalloc_thread_finalize(1);
+	//rpmalloc_dump_statistics(stdout);
+
+	if (_memory_global_reserve) {
+		atomic_add32(&_memory_global_reserve_master->remaining_spans, -(int32_t)_memory_global_reserve_count);
+		_memory_global_reserve_master = 0;
+		_memory_global_reserve_count = 0;
+		_memory_global_reserve = 0;
+	}
+	atomic_store32_release(&_memory_global_lock, 0);
+
+	//Free all thread caches and fully free spans
+	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
+		heap_t* heap = _memory_heaps[list_idx];
+		while (heap) {
+			heap_t* next_heap = heap->next_heap;
+			heap->finalize = 1;
+			_rpmalloc_heap_global_finalize(heap);
+			heap = next_heap;
+		}
+	}
+
+#if ENABLE_GLOBAL_CACHE
+	//Free global caches
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass)
+		_rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]);
+#endif
+
+#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD
+	pthread_key_delete(_memory_thread_heap);
+#endif
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	FlsFree(fls_key);
+	fls_key = 0;
+#endif
+#if ENABLE_STATISTICS
+	//If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code
+	rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected");
+	rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, "Memory leak detected");
+#endif
+
+	_rpmalloc_initialized = 0;
+}
+
+//! Initialize thread, assign heap
+extern inline void
+rpmalloc_thread_initialize(void) {
+	if (!get_thread_heap_raw()) {
+		heap_t* heap = _rpmalloc_heap_allocate(0);
+		if (heap) {
+			_rpmalloc_stat_inc(&_memory_active_heaps);
+			set_thread_heap(heap);
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+			FlsSetValue(fls_key, heap);
+#endif
+		}
+	}
+}
+
+//! Finalize thread, orphan heap
+void
+rpmalloc_thread_finalize(int release_caches) {
+	heap_t* heap = get_thread_heap_raw();
+	if (heap)
+		_rpmalloc_heap_release_raw(heap, release_caches);
+	set_thread_heap(0);
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+	FlsSetValue(fls_key, 0);
+#endif
+}
+
+int
+rpmalloc_is_thread_initialized(void) {
+	return (get_thread_heap_raw() != 0) ? 1 : 0;
+}
+
+const rpmalloc_config_t*
+rpmalloc_config(void) {
+	return &_memory_config;
+}
+
+// Extern interface
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpmalloc(size_t size) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_allocate(heap, size);
+}
+
+extern inline void
+rpfree(void* ptr) {
+	_rpmalloc_deallocate(ptr);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpcalloc(size_t num, size_t size) {
+	size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+	int err = SizeTMult(num, size, &total);
+	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#else
+	int err = __builtin_umull_overflow(num, size, &total);
+	if (err || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+#else
+	total = num * size;
+#endif
+	heap_t* heap = get_thread_heap();
+	void* block = _rpmalloc_allocate(heap, total);
+	if (block)
+		memset(block, 0, total);
+	return block;
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rprealloc(void* ptr, size_t size) {
+#if ENABLE_VALIDATE_ARGS
+	if (size >= MAX_ALLOC_SIZE) {
+		errno = EINVAL;
+		return ptr;
+	}
+#endif
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_reallocate(heap, ptr, size, 0, 0);
+}
+
+extern RPMALLOC_ALLOCATOR void*
+rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize,
+                  unsigned int flags) {
+#if ENABLE_VALIDATE_ARGS
+	if ((size + alignment < size) || (alignment > _memory_page_size)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags);
+}
+
+extern RPMALLOC_ALLOCATOR void*
+rpaligned_alloc(size_t alignment, size_t size) {
+	heap_t* heap = get_thread_heap();
+	return _rpmalloc_aligned_allocate(heap, alignment, size);
+}
+
+extern inline RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) {
+	size_t total;
+#if ENABLE_VALIDATE_ARGS
+#if PLATFORM_WINDOWS
+	int err = SizeTMult(num, size, &total);
+	if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#else
+	int err = __builtin_umull_overflow(num, size, &total);
+	if (err || (total >= MAX_ALLOC_SIZE)) {
+		errno = EINVAL;
+		return 0;
+	}
+#endif
+#else
+	total = num * size;
+#endif
+	void* block = rpaligned_alloc(alignment, total);
+	if (block)
+		memset(block, 0, total);
+	return block;
+}
+
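/* Editor's note: minimal sketch (outside the patch) of the overflow guard used
   by rpcalloc and rpaligned_calloc above on the non-Windows path:
   __builtin_umull_overflow reports whether num * size wrapped, so a request
   like (SIZE_MAX / 2, 4) fails with EINVAL instead of returning a block that
   is silently too small. */
#include <errno.h>
#include <stddef.h>
static size_t
example_checked_total(size_t num, size_t size) {
	size_t total;
	if (__builtin_umull_overflow(num, size, &total)) { /* GCC/Clang builtin */
		errno = EINVAL;
		return 0; /* caller must treat 0 as failure */
	}
	return total;
}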
_rpmalloc_usable_size(ptr) : 0); } extern inline void @@ -2345,13 +3154,13 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { size_class_t* size_class = _memory_size_class + iclass; - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; + span_t* span = heap->size_class[iclass].partial_span; while (span) { - atomic_thread_fence_acquire(); size_t free_count = span->list_size; - if (span->state == SPAN_STATE_PARTIAL) - free_count += (size_class->block_count - span->used_count); + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); stats->sizecache = free_count * size_class->block_size; span = span->next; } @@ -2359,38 +3168,46 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; - //TODO: Incorrect, for deferred lists the size is NOT stored in list_size - if (deferred_list) - stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + stats->spancache = span_cache->count * (iclass + 1) * _memory_span_size; } #endif + + span_t* deferred = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache = (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t*)deferred->free_list; + } + #if ENABLE_STATISTICS - stats->thread_to_global = heap->thread_to_global; - stats->global_to_thread = heap->global_to_thread; + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; - stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; - stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; - stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; - stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; - stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; - stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; - stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + stats->span_use[iclass].peak = (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = 
(size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].alloc_total = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; - stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; - stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; - stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + stats->size_use[iclass].spans_to_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); } #endif } @@ -2407,94 +3224,319 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; #endif #if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; - } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size; #endif } +#if ENABLE_STATISTICS + +static void +_memory_heap_dump_statistics(heap_t* heap, void* file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * _memory_span_size) / (size_t)(1024 * 1024), + 
+			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024),
+			atomic_load32(&heap->size_class_use[iclass].spans_map_calls));
+	}
+	fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n");
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls))
+			continue;
+		fprintf(file, "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
+			atomic_load32(&heap->span_use[iclass].current),
+			atomic_load32(&heap->span_use[iclass].high),
+			atomic_load32(&heap->span_use[iclass].spans_deferred),
+			((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+#if ENABLE_THREAD_CACHE
+			(unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+#else
+			0, (size_t)0, (size_t)0,
+#endif
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+			((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
+			atomic_load32(&heap->span_use[iclass].spans_map_calls));
+	}
+	fprintf(file, "Full spans: %zu\n", heap->full_span_count);
+	fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n");
+	fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024));
+}
+
+#endif
+
 void
 rpmalloc_dump_statistics(void* file) {
 #if ENABLE_STATISTICS
-	//If you hit this assert, you still have active threads or forgot to finalize some thread(s)
-	assert(atomic_load32(&_memory_active_heaps) == 0);
-
 	for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) {
-		heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]);
+		heap_t* heap = _memory_heaps[list_idx];
 		while (heap) {
-			fprintf(file, "Heap %d stats:\n", heap->id);
-			fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n");
-			for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
-				if (!heap->size_class_use[iclass].alloc_total) {
-					assert(!atomic_load32(&heap->size_class_use[iclass].free_total));
-					assert(!heap->size_class_use[iclass].spans_map_calls);
+			int need_dump = 0;
+			for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) {
+				if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) {
+					rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].free_total), "Heap statistics counter mismatch");
+					rpmalloc_assert(!atomic_load32(&heap->size_class_use[iclass].spans_map_calls), "Heap statistics counter mismatch");
 					continue;
 				}
-				fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass,
-					atomic_load32(&heap->size_class_use[iclass].alloc_current),
-					heap->size_class_use[iclass].alloc_peak,
-					heap->size_class_use[iclass].alloc_total,
-					atomic_load32(&heap->size_class_use[iclass].free_total),
-					_memory_size_class[iclass].block_size,
-					_memory_size_class[iclass].block_count,
-					heap->size_class_use[iclass].spans_current,
-					heap->size_class_use[iclass].spans_peak,
-					((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024),
-					heap->size_class_use[iclass].spans_map_calls);
+				need_dump = 1;
 			}
-			fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n");
-			for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
-				if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls)
+			for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) {
+				if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls))
 					continue;
-				fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1),
-					atomic_load32(&heap->span_use[iclass].current),
-					heap->span_use[iclass].high,
-					((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0,
-					((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024),
-					heap->span_use[iclass].spans_map_calls);
+				need_dump = 1;
 			}
-			fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n");
-			fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024));
+			if (need_dump)
+				_memory_heap_dump_statistics(heap, file);
 			heap = heap->next_heap;
 		}
 	}
-	fprintf(file, "Global stats:\n");
 	size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size;
 	size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size;
 	fprintf(file, "HugeCurrentMiB HugePeakMiB\n");
 	fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), huge_peak / (size_t)(1024 * 1024));
 
+	fprintf(file, "GlobalCacheMiB\n");
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		global_cache_t* cache = _memory_span_cache + iclass;
+		size_t global_cache = (size_t)cache->count * iclass * _memory_span_size;
+
+		size_t global_overflow_cache = 0;
+		span_t* span = cache->overflow;
+		while (span) {
+			global_overflow_cache += iclass * _memory_span_size;
+			span = span->next;
+		}
+		if (global_cache || global_overflow_cache || cache->insert_count || cache->extract_count)
+			fprintf(file, "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", iclass + 1, global_cache / (size_t)(1024 * 1024), global_overflow_cache / (size_t)(1024 * 1024), cache->insert_count, cache->extract_count);
+	}
+
 	size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size;
 	size_t mapped_os = (size_t)atomic_load32(&_mapped_pages_os) * _memory_page_size;
 	size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size;
 	size_t mapped_total = (size_t)atomic_load32(&_mapped_total) * _memory_page_size;
 	size_t unmapped_total = (size_t)atomic_load32(&_unmapped_total) * _memory_page_size;
-	size_t reserved_total = (size_t)atomic_load32(&_reserved_spans) * _memory_span_size;
-	fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB ReservedTotalMiB\n");
-	fprintf(file, "%9zu %11zu %13zu %14zu %16zu %16zu\n",
+	fprintf(file, "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n");
+	fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n",
 		mapped / (size_t)(1024 * 1024),
 		mapped_os / (size_t)(1024 * 1024),
 		mapped_peak / (size_t)(1024 * 1024),
 		mapped_total / (size_t)(1024 * 1024),
-		unmapped_total / (size_t)(1024 * 1024),
-		reserved_total / (size_t)(1024 * 1024));
+		unmapped_total / (size_t)(1024 * 1024));
 	fprintf(file, "\n");
-#else
+#if 0
+	int64_t allocated = atomic_load64(&_allocation_counter);
+	int64_t deallocated = atomic_load64(&_deallocation_counter);
+	fprintf(file, "Allocation count: %lli\n", allocated);
+	fprintf(file, "Deallocation count: %lli\n", deallocated);
+	fprintf(file, "Current allocations: %lli\n", (allocated - deallocated));
fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t* +rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory blocks + // could already be allocated from the heap which would (wrongly) be released when + // heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be + // pristine from the dedicated orphan list can be used. + heap_t* heap = _rpmalloc_heap_allocate(1); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void +rpmalloc_heap_release(rpmalloc_heap_t* heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { + span_t* span; + span_t* next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + 
+			_rpmalloc_heap_cache_insert(heap, span);
+			span = next_span;
+		}
+	}
+	memset(heap->size_class, 0, sizeof(heap->size_class));
+	memset(heap->full_span, 0, sizeof(heap->full_span));
+
+	span = heap->large_huge_span;
+	while (span) {
+		next_span = span->next;
+		if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE))
+			_rpmalloc_deallocate_huge(span);
+		else
+			_rpmalloc_heap_cache_insert(heap, span);
+		span = next_span;
+	}
+	heap->large_huge_span = 0;
+	heap->full_span_count = 0;
+
+#if ENABLE_THREAD_CACHE
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		span_cache_t* span_cache;
+		if (!iclass)
+			span_cache = &heap->span_cache;
+		else
+			span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1));
+		if (!span_cache->count)
+			continue;
+#if ENABLE_GLOBAL_CACHE
+		_rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count);
+		_rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count);
+#else
+		for (size_t ispan = 0; ispan < span_cache->count; ++ispan)
+			_rpmalloc_span_unmap(span_cache->span[ispan]);
+#endif
+		span_cache->count = 0;
+	}
+#endif
+
+#if ENABLE_STATISTICS
+	for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) {
+		atomic_store32(&heap->size_class_use[iclass].alloc_current, 0);
+		atomic_store32(&heap->size_class_use[iclass].spans_current, 0);
+	}
+	for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) {
+		atomic_store32(&heap->span_use[iclass].current, 0);
+	}
+#endif
+}
+
+extern inline void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) {
+	heap_t* prev_heap = get_thread_heap_raw();
+	if (prev_heap != heap) {
+		set_thread_heap(heap);
+		if (prev_heap)
+			rpmalloc_heap_release(prev_heap);
+	}
+}
+
+#endif
+
 }
 
 #endif
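The hunks above rework the statistics plumbing, so before moving on to the header changes, here is a minimal caller-side sketch (not part of the patch) of how these entry points are typically driven. It assumes a build with TRACY_ENABLE, that the allocator and the calling thread have already been initialized, and that most counters are only populated when building with ENABLE_STATISTICS=1; the function name report_allocator_usage is illustrative only.

    #include <cstdio>
    #include <cstring>
    #include "tracy_rpmalloc.hpp"

    void report_allocator_usage() {
        tracy::rpmalloc_thread_statistics_t tstats;
        memset(&tstats, 0, sizeof(tstats));
        tracy::rpmalloc_thread_statistics(&tstats);
        // sizecache/spancache: bytes held in this thread's free-block and span caches
        printf("thread caches: %zu block bytes, %zu span bytes\n",
               tstats.sizecache, tstats.spancache);

        tracy::rpmalloc_global_statistics_t gstats;
        memset(&gstats, 0, sizeof(gstats));
        tracy::rpmalloc_global_statistics(&gstats);
        printf("global cache: %zu bytes, mapped: %zu bytes\n",
               gstats.cached, gstats.mapped);

        // Human-readable dump of the tables built by _memory_heap_dump_statistics();
        // the void* parameter is a FILE*.
        tracy::rpmalloc_dump_statistics(stdout);
    }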
diff --git a/public/client/tracy_rpmalloc.hpp b/public/client/tracy_rpmalloc.hpp
index ef92db18..51216a21 100644
--- a/public/client/tracy_rpmalloc.hpp
+++ b/public/client/tracy_rpmalloc.hpp
@@ -20,11 +20,12 @@ namespace tracy
 #if defined(__clang__) || defined(__GNUC__)
 # define RPMALLOC_EXPORT __attribute__((visibility("default")))
 # define RPMALLOC_ALLOCATOR
-# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
-# if defined(__clang_major__) && (__clang_major__ < 4)
+# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
+# define RPMALLOC_ATTRIB_MALLOC
 #  define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
 #  define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
 # else
+# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
 #  define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
 #  define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size)))
 # endif
@@ -45,13 +46,24 @@ namespace tracy
 # define RPMALLOC_CDECL
 #endif
 
-//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes
+//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
+// a very small overhead due to some size calculations not being compile time constants
 #ifndef RPMALLOC_CONFIGURABLE
 #define RPMALLOC_CONFIGURABLE 0
 #endif
 
+//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions).
+// Will introduce a very small overhead to track fully allocated spans in heaps
+#ifndef RPMALLOC_FIRST_CLASS_HEAPS
+#define RPMALLOC_FIRST_CLASS_HEAPS 0
+#endif
+
 //! Flag to rpaligned_realloc to not preserve content in reallocation
 #define RPMALLOC_NO_PRESERVE 1
+//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place,
+// in which case the original pointer is still valid (just like a call to realloc which fails to allocate
+// a new block).
+#define RPMALLOC_GROW_OR_FAIL 2
 
 typedef struct rpmalloc_global_statistics_t {
 	//! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1)
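The new RPMALLOC_GROW_OR_FAIL flag composes with rpaligned_realloc(), declared further down in this header. A minimal sketch (not part of the patch) of the fallback pattern the comment above describes; the helper name is illustrative only:

    #include "tracy_rpmalloc.hpp"

    // Grow a block in place if possible, otherwise keep using the old block.
    void* try_grow_in_place(void* block, size_t alignment, size_t new_size, size_t old_size) {
        void* grown = tracy::rpaligned_realloc(block, alignment, new_size, old_size,
                                               RPMALLOC_GROW_OR_FAIL);
        // On failure the call returns null and `block` is still valid, exactly
        // like a realloc() that failed to allocate a new block. The caller can
        // then fall back to an explicit alloc + copy + free.
        return grown ? grown : block;
    }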
@@ -99,7 +111,7 @@ typedef struct rpmalloc_thread_statistics_t {
 		size_t from_reserved;
 		//! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls)
 		size_t map_calls;
-	} span_use[32];
+	} span_use[64];
 	//! Per size class statistics (only if ENABLE_STATISTICS=1)
 	struct {
 		//! Current number of allocations
@@ -131,7 +143,8 @@ typedef struct rpmalloc_config_t {
 	// larger than 65535 (storable in an uint16_t), if it is you must use natural
 	// alignment to shift it into 16 bits. If you set a memory_map function, you
 	// must also set a memory_unmap function or else the default implementation will
-	// be used for both.
+	// be used for both. This function must be thread safe, it can be called by
+	// multiple threads simultaneously.
 	void* (*memory_map)(size_t size, size_t* offset);
 	//! Unmap the memory pages starting at address and spanning the given number of bytes.
 	// If release is set to non-zero, the unmap is for an entire span range as returned by
@@ -139,8 +152,18 @@ typedef struct rpmalloc_config_t {
 	// release argument holds the size of the entire span range. If release is set to 0,
 	// the unmap is a partial decommit of a subset of the mapped memory range.
 	// If you set a memory_unmap function, you must also set a memory_map function or
-	// else the default implementation will be used for both.
+	// else the default implementation will be used for both. This function must be thread
+	// safe, it can be called by multiple threads simultaneously.
 	void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release);
+	//! Called when an assert fails, if asserts are enabled. Will use the standard assert()
+	// if this is not set.
+	void (*error_callback)(const char* message);
+	//! Called when a call to map memory pages fails (out of memory). If this callback is
+	// not set or returns zero the library will return a null pointer in the allocation
+	// call. If this callback returns non-zero the map call will be retried. The argument
+	// passed is the number of bytes that was requested in the map call. Only used if
+	// the default system memory map function is used (memory_map callback is not set).
+	int (*map_fail_callback)(size_t size);
 	//! Size of memory pages. The page size MUST be a power of two. All memory mapping
 	// requests to memory_map will be made with size set to a multiple of the page size.
 	// Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system page size is used.
@@ -163,6 +186,10 @@ typedef struct rpmalloc_config_t {
 	// For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
 	// For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
 	int enable_huge_pages;
+	//! Names for the allocated pages and huge allocated pages, on systems that
+	// support it, to be able to distinguish among anonymous regions.
+	const char *page_name;
+	const char *huge_page_name;
 } rpmalloc_config_t;
 
 //! Initialize allocator with default configuration
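The new rpmalloc_config_t callbacks and page names above can be wired up as in the following sketch (not part of the patch). It assumes the usual rpmalloc_initialize_config() entry point, which is not visible in these hunks, and the callback and string values are illustrative only:

    #include <cstdio>
    #include <cstring>
    #include "tracy_rpmalloc.hpp"

    static void on_error(const char* message) {
        fprintf(stderr, "rpmalloc assert: %s\n", message);  // only fires when ENABLE_ASSERTS=1
    }

    static int on_map_fail(size_t size) {
        fprintf(stderr, "rpmalloc: mapping %zu bytes failed\n", size);
        return 0;  // zero = do not retry; the allocation call returns null
    }

    int init_allocator() {
        tracy::rpmalloc_config_t config;
        memset(&config, 0, sizeof(config));     // unset fields fall back to defaults
        config.error_callback = on_error;
        config.map_fail_callback = on_map_fail; // only used with the default OS mapper
        config.page_name = "myapp-pages";       // anonymous-region names, where supported
        config.huge_page_name = "myapp-hugepages";
        return tracy::rpmalloc_initialize_config(&config);
    }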
@@ -187,7 +214,7 @@ rpmalloc_thread_initialize(void);
 
 //! Finalize allocator for calling thread
 TRACY_API void
-rpmalloc_thread_finalize(void);
+rpmalloc_thread_finalize(int release_caches);
 
 //! Perform deferred deallocations pending for the calling thread heap
 RPMALLOC_EXPORT void
@@ -240,6 +267,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsi
 RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
 rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
 
+//! Allocate a memory block of at least the given size and alignment, and zero initialize it.
+// Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB)
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
 //! Allocate a memory block of at least the given size and alignment.
 // Alignment must be a power of two and a multiple of sizeof(void*),
 // and should ideally be less than memory page size. A caveat of rpmalloc
 // internals is that this must also be strictly less than the span size (default 64KiB)
@@ -252,10 +286,78 @@ rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB
 // and should ideally be less than memory page size. A caveat of rpmalloc
 // internals is that this must also be strictly less than the span size (default 64KiB)
 RPMALLOC_EXPORT int
-rpposix_memalign(void **memptr, size_t alignment, size_t size);
+rpposix_memalign(void** memptr, size_t alignment, size_t size);
 
 //! Query the usable size of the given memory block (from given pointer to the end of block)
 RPMALLOC_EXPORT size_t
 rpmalloc_usable_size(void* ptr);
 
+#if RPMALLOC_FIRST_CLASS_HEAPS
+
+//! Heap type
+typedef struct heap_t rpmalloc_heap_t;
+
+//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap
+// if none available. Heap API is implemented with the strict assumption that only one single
+// thread will call heap functions for a given heap at any given time, no functions are thread safe.
+RPMALLOC_EXPORT rpmalloc_heap_t*
+rpmalloc_heap_acquire(void);
+
+//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap).
+// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer.
+RPMALLOC_EXPORT void
+rpmalloc_heap_release(rpmalloc_heap_t* heap);
+
+//! Allocate a memory block of at least the given size using the given heap.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2);
+
+//! Allocate a memory block of at least the given size using the given heap. The returned
+// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned
+// block will have the requested alignment. Alignment must either be zero, or a power of two and a multiple of sizeof(void*),
+// and should ideally be less than memory page size. A caveat of rpmalloc
+// internals is that this must also be strictly less than the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+// by the same heap given to this function.
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3);
+
+//! Reallocate the given block to at least the given size. The memory block MUST be allocated
+// by the same heap given to this function. The returned block will have the requested alignment.
+// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be
+// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than
+// the span size (default 64KiB).
+RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void*
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4);
+
+//! Free the given memory block from the given heap. The memory block MUST be allocated
+// by the same heap given to this function.
+RPMALLOC_EXPORT void
+rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr);
+
+//! Free all memory allocated by the heap
+RPMALLOC_EXPORT void
+rpmalloc_heap_free_all(rpmalloc_heap_t* heap);
+
+//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap
+// for a single thread, a heap can never be shared between multiple threads. The previous
+// current heap for the calling thread is released to be reused by other threads.
+RPMALLOC_EXPORT void
+rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap);
+
+#endif
+
 }
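For reference, a sketch (not part of the patch) of the first class heap API declared above, used arena-style. It requires building with RPMALLOC_FIRST_CLASS_HEAPS=1 (the default above is 0), and per the comments above a heap must only ever be used by one thread at a time; the function name and sizes below are illustrative only:

    #include "tracy_rpmalloc.hpp"

    void build_transient_state() {
        tracy::rpmalloc_heap_t* heap = tracy::rpmalloc_heap_acquire();

        void* nodes = tracy::rpmalloc_heap_calloc(heap, 1024, 64);    // zero initialized
        void* scratch = tracy::rpmalloc_heap_alloc(heap, 256 * 1024);
        // ... use the blocks; individual frees are allowed but optional:
        tracy::rpmalloc_heap_free(heap, scratch);

        // Release every block still owned by the heap in one call, then hand
        // the (now pristine) heap back for reuse by other threads.
        tracy::rpmalloc_heap_free_all(heap);
        tracy::rpmalloc_heap_release(heap);
        (void)nodes;  // reclaimed by rpmalloc_heap_free_all
    }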