From ad5ed7425599aa5f48d2ce3296e6dc358d4e1d69 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Sat, 18 Apr 2026 01:13:15 -0700
Subject: [PATCH 1/7] GH-148937: fix for free-threaded GC (RSS based defer)

Asking the OS for the process memory usage doesn't work well given how
mimalloc works. It does not promptly return memory to the OS and so the
memory doesn't drop after cyclic trash is freed.

Instead of asking the OS, use mimalloc APIs to compute how much memory is
being used by all mimalloc arenas. We need to stop the world to do this but
usually we can avoid doing a collection. So, from a performance perspective,
this is worth it.
---
 Include/internal/pycore_interp_structs.h |  11 +-
 Python/gc_free_threading.c               | 271 +++++++----------------
 2 files changed, 82 insertions(+), 200 deletions(-)

diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h
index cccfe3565db6e07..3dd9aa5430a6418 100644
--- a/Include/internal/pycore_interp_structs.h
+++ b/Include/internal/pycore_interp_structs.h
@@ -260,15 +260,14 @@ struct _gc_runtime_state {
     /* True if gc.freeze() has been used. */
     int freeze_active;
 
-    /* Memory usage of the process (RSS + swap) after last GC. */
-    Py_ssize_t last_mem;
+    /* Sum of area->used*area->block_size across all mimalloc heaps after last
+       GC, in KB. Updated under stop-the-world so the measurement is accurate
+       even when OS pages are being reused. */
+    Py_ssize_t last_gc_used;
 
     /* This accumulates the new object count whenever collection is deferred
-       due to the RSS increase condition not being meet. Reset on collection. */
+       due to memory usage not increasing enough. Reset on collection. */
     Py_ssize_t deferred_count;
-
-    /* Mutex held for gc_should_collect_mem_usage(). */
-    PyMutex mutex;
 #endif
 };
 
diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c
index 4b46ca04f56b201..25de084b1203ae9 100644
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@@ -17,30 +17,7 @@
 
 #include "pydtrace.h"
 
-// Platform-specific includes for get_process_mem_usage().
-#ifdef _WIN32
-  #include
-  #include   // For GetProcessMemoryInfo
-#elif defined(__linux__)
-  #include   // For sysconf, getpid
-#elif defined(__APPLE__)
-  #include
-  #include   // Required for TASK_VM_INFO
-  #include   // For sysconf, getpid
-#elif defined(__FreeBSD__)
-  #include
-  #include
-  #include   // Requires sys/user.h for kinfo_proc definition
-  #include
-  #include   // For sysconf, getpid
-  #include   // For O_RDONLY
-  #include   // For _POSIX2_LINE_MAX
-#elif defined(__OpenBSD__)
-  #include
-  #include
-  #include   // For kinfo_proc
-  #include   // For sysconf, getpid
-#endif
+#include "pycore_mimalloc.h"      // mi_heap_visit_blocks()
 
 // enable the "mark alive" pass of GC
 #define GC_ENABLE_MARK_ALIVE 1
@@ -2016,188 +1993,93 @@ cleanup_worklist(struct worklist *worklist)
     }
 }
 
-// Return the memory usage (typically RSS + swap) of the process, in units of
-// KB. Returns -1 if this operation is not supported or on failure.
-static Py_ssize_t
-get_process_mem_usage(void)
-{
-#ifdef _WIN32
-    // Windows implementation using GetProcessMemoryInfo
-    // Returns WorkingSetSize + PagefileUsage
-    PROCESS_MEMORY_COUNTERS pmc;
-    HANDLE hProcess = GetCurrentProcess();
-    if (NULL == hProcess) {
-        // Should not happen for the current process
-        return -1;
-    }
-
-    // GetProcessMemoryInfo returns non-zero on success
-    if (GetProcessMemoryInfo(hProcess, &pmc, sizeof(pmc))) {
-        // Values are in bytes, convert to KB.
- return (Py_ssize_t)((pmc.WorkingSetSize + pmc.PagefileUsage) / 1024); - } - else { - return -1; - } +// Visitor for get_all_mimalloc_used_kb(): called once per heap area. +struct count_used_area_args { + Py_ssize_t total_bytes; +}; -#elif __linux__ - FILE* fp = fopen("/proc/self/status", "r"); - if (fp == NULL) { - return -1; +static bool +count_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, + void *block, size_t block_size, void *arg) +{ + if (block == NULL) { + // Called once per area when visit_all_blocks=false. + ((struct count_used_area_args *)arg)->total_bytes += + (Py_ssize_t)(area->used * area->block_size); } + return true; +} - char line_buffer[256]; - long long rss_kb = -1; - long long swap_kb = -1; - - while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) { - if (rss_kb == -1 && strncmp(line_buffer, "VmRSS:", 6) == 0) { - sscanf(line_buffer + 6, "%lld", &rss_kb); - } - else if (swap_kb == -1 && strncmp(line_buffer, "VmSwap:", 7) == 0) { - sscanf(line_buffer + 7, "%lld", &swap_kb); +// Return the total bytes in use across all mimalloc heaps for all threads, in +// KB. Requires the world to be stopped so heap structures are stable. +static Py_ssize_t +get_all_mimalloc_used_kb(PyInterpreterState *interp) +{ + assert(interp->stoptheworld.world_stopped); + struct count_used_area_args args = {0}; + HEAD_LOCK(&_PyRuntime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + struct _mimalloc_thread_state *m = &((_PyThreadStateImpl *)p)->mimalloc; + if (!_Py_atomic_load_int(&m->initialized)) { + continue; } - if (rss_kb != -1 && swap_kb != -1) { - break; // Found both + for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) { + mi_heap_visit_blocks(&m->heaps[h], false, + count_used_area_visitor, &args); } } - fclose(fp); - - if (rss_kb != -1 && swap_kb != -1) { - return (Py_ssize_t)(rss_kb + swap_kb); - } - return -1; - -#elif defined(__APPLE__) - // --- MacOS (Darwin) --- - // Returns phys_footprint (RAM + compressed memory) - task_vm_info_data_t vm_info; - mach_msg_type_number_t count = TASK_VM_INFO_COUNT; - kern_return_t kerr; - - kerr = task_info(mach_task_self(), TASK_VM_INFO, (task_info_t)&vm_info, &count); - if (kerr != KERN_SUCCESS) { - return -1; - } - // phys_footprint is in bytes. Convert to KB. - return (Py_ssize_t)(vm_info.phys_footprint / 1024); - -#elif defined(__FreeBSD__) - // NOTE: Returns RSS only. Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - // Using /dev/null for vmcore avoids needing dump file. - // NULL for kernel file uses running kernel. - char errbuf[_POSIX2_LINE_MAX]; // For kvm error messages - kvm_t *kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf); - if (kd == NULL) { - return -1; - } - - // KERN_PROC_PID filters for the specific process ID - // n_procs will contain the number of processes returned (should be 1 or 0) - pid_t pid = getpid(); - int n_procs; - struct kinfo_proc *kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &n_procs); - if (kp == NULL) { - kvm_close(kd); - return -1; - } - - Py_ssize_t rss_kb = -1; - if (n_procs > 0) { - // kp[0] contains the info for our process - // ki_rssize is in pages. Convert to KB. - rss_kb = (Py_ssize_t)kp->ki_rssize * page_size_kb; - } - else { - // Process with PID not found, shouldn't happen for self. - rss_kb = -1; - } - - kvm_close(kd); - return rss_kb; - -#elif defined(__OpenBSD__) - // NOTE: Returns RSS only. 
Per-process swap usage isn't readily available - long page_size_kb = sysconf(_SC_PAGESIZE) / 1024; - if (page_size_kb <= 0) { - return -1; - } - - struct kinfo_proc kp; - pid_t pid = getpid(); - int mib[6]; - size_t len = sizeof(kp); - - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PID; - mib[3] = pid; - mib[4] = sizeof(struct kinfo_proc); // size of the structure we want - mib[5] = 1; // want 1 structure back - if (sysctl(mib, 6, &kp, &len, NULL, 0) == -1) { - return -1; - } - - if (len > 0) { - // p_vm_rssize is in pages on OpenBSD. Convert to KB. - return (Py_ssize_t)kp.p_vm_rssize * page_size_kb; - } - else { - // Process info not returned - return -1; - } -#else - // Unsupported platform - return -1; -#endif + mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; + // Only GC page tags are supported by _mi_abandoned_pool_visit_blocks. + _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC, false, + count_used_area_visitor, &args); + _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC_PRE, false, + count_used_area_visitor, &args); + HEAD_UNLOCK(&_PyRuntime); + return args.total_bytes / 1024; } +// Decide whether memory usage has grown enough to warrant a collection. +// Stops the world to measure mimalloc heap usage accurately; OS-level RSS +// is unreliable since mimalloc reuses pages without returning them. static bool -gc_should_collect_mem_usage(GCState *gcstate) +gc_should_collect_mem_usage(PyThreadState *tstate) { - Py_ssize_t mem = get_process_mem_usage(); - if (mem < 0) { - // Reading process memory usage is not support or failed. - return true; - } + PyInterpreterState *interp = tstate->interp; + GCState *gcstate = &interp->gc; int threshold = gcstate->young.threshold; - Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count); - if (deferred > threshold * 40) { - // Too many new container objects since last GC, even though memory use - // might not have increased much. This is intended to avoid resource - // exhaustion if some objects consume resources but don't result in a - // memory usage increase. We use 40x as the factor here because older - // versions of Python would do full collections after roughly every - // 70,000 new container objects. + + if (gcstate->deferred_count > threshold * 40) { + // Too many new container objects since last GC, even though memory + // use might not have increased much. This avoids resource + // exhaustion if some objects consume resources but don't result in + // a memory usage increase. We use 40x here because older versions + // of Python would do full collections after roughly every 70,000 + // new container objects. return true; } - Py_ssize_t last_mem = _Py_atomic_load_ssize_relaxed(&gcstate->last_mem); - Py_ssize_t mem_threshold = Py_MAX(last_mem / 10, 128); - if ((mem - last_mem) > mem_threshold) { - // The process memory usage has increased too much, do a collection. + _PyEval_StopTheWorld(interp); + Py_ssize_t used = get_all_mimalloc_used_kb(interp); + Py_ssize_t last = gcstate->last_gc_used; + Py_ssize_t mem_threshold = Py_MAX(last / 10, 128); + if ((used - last) > mem_threshold) { + // Heap usage has grown enough, collect. + _PyEval_StartTheWorld(interp); return true; } - else { - // The memory usage has not increased enough, defer the collection and - // clear the young object count so we don't check memory usage again - // on the next call to gc_should_collect(). 
- PyMutex_Lock(&gcstate->mutex); - int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0); - _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count, - gcstate->deferred_count + young_count); - PyMutex_Unlock(&gcstate->mutex); - return false; - } + // Memory usage has not grown enough. Defer the collection, rolling the + // young count into deferred_count so we don't keep checking on every + // call to gc_should_collect(). + int young_count = gcstate->young.count; + gcstate->young.count = 0; + gcstate->deferred_count += young_count; + _PyEval_StartTheWorld(interp); + return false; } static bool -gc_should_collect(GCState *gcstate) +gc_should_collect(PyThreadState *tstate) { + GCState *gcstate = &tstate->interp->gc; int count = _Py_atomic_load_int_relaxed(&gcstate->young.count); int threshold = gcstate->young.threshold; int gc_enabled = _Py_atomic_load_int_relaxed(&gcstate->enabled); @@ -2214,7 +2096,7 @@ gc_should_collect(GCState *gcstate) // objects. return false; } - return gc_should_collect_mem_usage(gcstate); + return gc_should_collect_mem_usage(tstate); } static void @@ -2231,7 +2113,7 @@ record_allocation(PyThreadState *tstate) _Py_atomic_add_int(&gcstate->young.count, (int)gc->alloc_count); gc->alloc_count = 0; - if (gc_should_collect(gcstate) && + if (gc_should_collect(tstate) && !_Py_atomic_load_int_relaxed(&gcstate->collecting)) { _Py_ScheduleGC(tstate); @@ -2379,10 +2261,11 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, // to be freed. delete_garbage(state); - // Store the current memory usage, can be smaller now if breaking cycles - // freed some memory. - Py_ssize_t last_mem = get_process_mem_usage(); - _Py_atomic_store_ssize_relaxed(&state->gcstate->last_mem, last_mem); + // Record mimalloc heap usage as the baseline for the next collection's + // growth check. Stop-the-world so the heap structures are stable. + _PyEval_StopTheWorld(interp); + state->gcstate->last_gc_used = get_all_mimalloc_used_kb(interp); + _PyEval_StartTheWorld(interp); // Append objects with legacy finalizers to the "gc.garbage" list. handle_legacy_finalizers(state); @@ -2423,7 +2306,7 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) return 0; } - if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(gcstate)) { + if (reason == _Py_GC_REASON_HEAP && !gc_should_collect(tstate)) { // Don't collect if the threshold is not exceeded. _Py_atomic_store_int(&gcstate->collecting, 0); return 0; From 05d50506c42eb3349ca724700e2dabf827b9afb5 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 23 Apr 2026 18:35:10 -0700 Subject: [PATCH 2/7] Move gc_should_collect_mem_usage() call. It's probably better to call this inside of gc_collect_main(). That way, we are not doing the STW from inside _PyObject_GC_Link() function. This should have no significant performance impact since we hit this only after the young object count hits the threshold. --- Python/gc_free_threading.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 25de084b1203ae9..8f7ecab64e33ac2 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2048,6 +2048,11 @@ gc_should_collect_mem_usage(PyThreadState *tstate) GCState *gcstate = &interp->gc; int threshold = gcstate->young.threshold; + if (gcstate->old[0].threshold == 0) { + // A few tests rely on immediate scheduling of the GC so we ignore the + // extra conditions if generations[1].threshold is set to zero. 
+ return true; + } if (gcstate->deferred_count > threshold * 40) { // Too many new container objects since last GC, even though memory // use might not have increased much. This avoids resource @@ -2096,7 +2101,7 @@ gc_should_collect(PyThreadState *tstate) // objects. return false; } - return gc_should_collect_mem_usage(tstate); + return true; } static void @@ -2311,6 +2316,10 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) _Py_atomic_store_int(&gcstate->collecting, 0); return 0; } + if (reason == _Py_GC_REASON_HEAP && !gc_should_collect_mem_usage(tstate)) { + _Py_atomic_store_int(&gcstate->collecting, 0); + return 0; + } gcstate->frame = tstate->current_frame; assert(generation >= 0 && generation < NUM_GENERATIONS); From 515e4c4b73e05369dbe3d9371e81de1dbc853f69 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 23 Apr 2026 18:39:45 -0700 Subject: [PATCH 3/7] Add blurb. --- .../2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst new file mode 100644 index 000000000000000..523792372bc8e5b --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-23-18-39-41.gh-issue-148937.yp--1l.rst @@ -0,0 +1,3 @@ +Fix a bug in the free-threaded GC that caused collections to be deferred too +long. This would result in excess memory usage since cyclic trash was not +freed quickly enough. From 7f654e10f3e226678f8fdc1d986f0e42ebb1ca00 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Wed, 29 Apr 2026 12:38:49 -0700 Subject: [PATCH 4/7] Compute mimalloc memory usage based on full pages. This avoids using STW in exchange for less accurate memory usage estimates. --- Include/internal/mimalloc/mimalloc/types.h | 4 + Include/internal/pycore_interp_structs.h | 9 +- Objects/mimalloc/init.c | 1 + Objects/mimalloc/page.c | 14 +++ Objects/obmalloc.c | 53 +++++++++ Python/gc_free_threading.c | 125 ++++++++------------- 6 files changed, 121 insertions(+), 85 deletions(-) diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h index 286e7bf668312db..87ee42f67b27e57 100644 --- a/Include/internal/mimalloc/mimalloc/types.h +++ b/Include/internal/mimalloc/mimalloc/types.h @@ -516,6 +516,10 @@ typedef struct mi_abandoned_pool_s { // in order to prevent resetting/decommitting segment memory if it might // still be read. mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0 + + // Total bytes (block_size * capacity) of pages currently in MI_BIN_FULL + // state whose pool association is this pool. + mi_decl_cache_align _Atomic(intptr_t) full_page_bytes; // = 0 } mi_abandoned_pool_t; diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 3dd9aa5430a6418..a6c9db5a2151868 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -260,14 +260,15 @@ struct _gc_runtime_state { /* True if gc.freeze() has been used. */ int freeze_active; - /* Sum of area->used*area->block_size across all mimalloc heaps after last - GC, in KB. Updated under stop-the-world so the measurement is accurate - even when OS pages are being reused. */ - Py_ssize_t last_gc_used; + /* Estimate of the number of bytes used by mimalloc after last GC. 
*/ + Py_ssize_t last_heap_bytes; /* This accumulates the new object count whenever collection is deferred due to memory usage not increasing enough. Reset on collection. */ Py_ssize_t deferred_count; + + /* Mutex held for gc_should_collect_mem_usage(). */ + PyMutex mutex; #endif }; diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c index 81b241063ff40fc..64411bf1c77fdd3 100644 --- a/Objects/mimalloc/init.c +++ b/Objects/mimalloc/init.c @@ -103,6 +103,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // next false, 0, + 0, 0 }; diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c index ded59f8eb1ccaac..ca71246c988ca3a 100644 --- a/Objects/mimalloc/page.c +++ b/Objects/mimalloc/page.c @@ -360,6 +360,10 @@ void _mi_page_unfull(mi_page_t* page) { mi_assert_internal(mi_page_is_in_full(page)); if (!mi_page_is_in_full(page)) return; +#ifdef Py_GIL_DISABLED + _PyMem_mi_page_full_dec(page); +#endif + mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL]; mi_page_set_in_full(page, false); // to get the right queue @@ -374,6 +378,9 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(!mi_page_is_in_full(page)); if (mi_page_is_in_full(page)) return; +#ifdef Py_GIL_DISABLED + _PyMem_mi_page_full_inc(page); +#endif mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } @@ -435,6 +442,13 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { #ifdef Py_GIL_DISABLED mi_assert_internal(page->qsbr_goal == 0); mi_assert_internal(page->qsbr_node.next == NULL); + // Defensive: a full page whose last block is freed locally goes through + // _mi_page_retire -> _PyMem_mi_page_maybe_free -> _mi_page_free without + // ever calling _mi_page_unfull, so the per-thread full-page counter must + // be decremented here to maintain the invariant. + if (mi_page_is_in_full(page)) { + _PyMem_mi_page_full_dec(page); + } #endif // remove from the page list diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index 1809bd30451327b..ded77087680003a 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -22,6 +22,8 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page); static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force); static void _PyMem_mi_page_reclaimed(mi_page_t *page); static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap); +static void _PyMem_mi_page_full_inc(mi_page_t *page); +static void _PyMem_mi_page_full_dec(mi_page_t *page); # include "pycore_mimalloc.h" # include "mimalloc/static.c" # include "mimalloc/internal.h" // for stats @@ -223,6 +225,57 @@ _PyMem_mi_page_reclaimed(mi_page_t *page) #endif } +// Hooks called from mimalloc page-state transitions to maintain +// mi_abandoned_pool_t::full_page_bytes -- bytes (block_size * capacity) of +// pages currently in MI_BIN_FULL state whose pool association is that pool. +// Page weight uses the same formula as should_advance_qsbr_for_page above; +// capacity is stable while a page is in the full queue (extend_free is only +// called on non-full queues), so inc and dec see the same value. +// +// The pool a page counts toward is heap->tld->segments.abandoned, which for a +// Python tstate-bound heap is &interp->mimalloc.abandoned_pool, and for +// mimalloc's auto-created default heap is _mi_abandoned_default. 
Pages do +// not cross pools (mimalloc reclaim only pulls from the reclaiming heap's +// own pool), so the counter stays valid across abandon/reclaim without any +// hand-off -- abandon and reclaim therefore have no hooks of their own. +// +// The hooks fire only on slow paths: mi_page_to_full / _mi_page_unfull / +// in-full _mi_page_free. gc_get_heap_bytes() in gc_free_threading.c reads the +// per-interp pool plus _mi_abandoned_default to get a stop-the-world-free +// memory-pressure proxy. +#ifdef Py_GIL_DISABLED +static inline Py_ssize_t +_PyMem_mi_page_size(mi_page_t *page) +{ + return (Py_ssize_t)(mi_page_block_size(page) * (size_t)page->capacity); +} + +static inline Py_ssize_t * +_PyMem_mi_page_pool_full_bytes(mi_page_t *page) +{ + return (Py_ssize_t *) + &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes; +} +#endif + +static void +_PyMem_mi_page_full_inc(mi_page_t *page) +{ +#ifdef Py_GIL_DISABLED + _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page), + _PyMem_mi_page_size(page)); +#endif +} + +static void +_PyMem_mi_page_full_dec(mi_page_t *page) +{ +#ifdef Py_GIL_DISABLED + _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page), + -_PyMem_mi_page_size(page)); +#endif +} + static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap) { diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 8f7ecab64e33ac2..5ae23d875a60a64 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -17,7 +17,15 @@ #include "pydtrace.h" -#include "pycore_mimalloc.h" // mi_heap_visit_blocks() +// Minimum growth in mimalloc heap bytes (estimated from full pages) since the +// last GC. +#define GC_HEAP_BYTES_MIN_DELTA (512 * 1024) + +// Maximum number of "young" objects before we stop deferring collection due +// to heap not growing enough. With the default threshold, this is (40*2000) +// net new objects. This is set to 40x because older versions of Python would +// do full collections after roughly every 70,000 new container objects. +#define GC_MAX_DEFER_FACTOR 40 // enable the "mark alive" pass of GC #define GC_ENABLE_MARK_ALIVE 1 @@ -1993,92 +2001,53 @@ cleanup_worklist(struct worklist *worklist) } } -// Visitor for get_all_mimalloc_used_kb(): called once per heap area. -struct count_used_area_args { - Py_ssize_t total_bytes; -}; - -static bool -count_used_area_visitor(const mi_heap_t *heap, const mi_heap_area_t *area, - void *block, size_t block_size, void *arg) -{ - if (block == NULL) { - // Called once per area when visit_all_blocks=false. - ((struct count_used_area_args *)arg)->total_bytes += - (Py_ssize_t)(area->used * area->block_size); - } - return true; -} - -// Return the total bytes in use across all mimalloc heaps for all threads, in -// KB. Requires the world to be stopped so heap structures are stable. +// Return an estimate, in bytes, of how much memory is being used. 
static Py_ssize_t -get_all_mimalloc_used_kb(PyInterpreterState *interp) +gc_get_heap_bytes(PyInterpreterState *interp) { - assert(interp->stoptheworld.world_stopped); - struct count_used_area_args args = {0}; - HEAD_LOCK(&_PyRuntime); - _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { - struct _mimalloc_thread_state *m = &((_PyThreadStateImpl *)p)->mimalloc; - if (!_Py_atomic_load_int(&m->initialized)) { - continue; - } - for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) { - mi_heap_visit_blocks(&m->heaps[h], false, - count_used_area_visitor, &args); - } - } - mi_abandoned_pool_t *pool = &interp->mimalloc.abandoned_pool; - // Only GC page tags are supported by _mi_abandoned_pool_visit_blocks. - _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC, false, - count_used_area_visitor, &args); - _mi_abandoned_pool_visit_blocks(pool, _Py_MIMALLOC_HEAP_GC_PRE, false, - count_used_area_visitor, &args); - HEAD_UNLOCK(&_PyRuntime); - return args.total_bytes / 1024; + // Computed from mimalloc full-page byte counters maintained on each + // abandoned pool (see _PyMem_mi_page_full_inc/dec in Objects/obmalloc.c). + Py_ssize_t total = _Py_atomic_load_ssize_relaxed( + (Py_ssize_t *)&interp->mimalloc.abandoned_pool.full_page_bytes); + total += _Py_atomic_load_ssize_relaxed( + (Py_ssize_t *)&_mi_abandoned_default.full_page_bytes); + return total; } // Decide whether memory usage has grown enough to warrant a collection. -// Stops the world to measure mimalloc heap usage accurately; OS-level RSS -// is unreliable since mimalloc reuses pages without returning them. static bool gc_should_collect_mem_usage(PyThreadState *tstate) { PyInterpreterState *interp = tstate->interp; GCState *gcstate = &interp->gc; int threshold = gcstate->young.threshold; - - if (gcstate->old[0].threshold == 0) { - // A few tests rely on immediate scheduling of the GC so we ignore the - // extra conditions if generations[1].threshold is set to zero. - return true; - } - if (gcstate->deferred_count > threshold * 40) { + Py_ssize_t deferred = _Py_atomic_load_ssize_relaxed(&gcstate->deferred_count); + if (deferred > threshold * GC_MAX_DEFER_FACTOR) { // Too many new container objects since last GC, even though memory - // use might not have increased much. This avoids resource - // exhaustion if some objects consume resources but don't result in - // a memory usage increase. We use 40x here because older versions - // of Python would do full collections after roughly every 70,000 - // new container objects. + // use might not have increased much. This avoids resource exhaustion + // if some objects consume resources but don't result in a memory + // usage increase. return true; } - _PyEval_StopTheWorld(interp); - Py_ssize_t used = get_all_mimalloc_used_kb(interp); - Py_ssize_t last = gcstate->last_gc_used; - Py_ssize_t mem_threshold = Py_MAX(last / 10, 128); - if ((used - last) > mem_threshold) { - // Heap usage has grown enough, collect. - _PyEval_StartTheWorld(interp); + Py_ssize_t cur = gc_get_heap_bytes(interp); + Py_ssize_t last = _Py_atomic_load_ssize_relaxed(&gcstate->last_heap_bytes); + // Require 20% increase in full mimalloc pages. + Py_ssize_t delta = Py_MAX(last / 5, GC_HEAP_BYTES_MIN_DELTA); + if ((cur - last) > delta) { + // Heap has grown enough, collect. return true; } - // Memory usage has not grown enough. Defer the collection, rolling the - // young count into deferred_count so we don't keep checking on every - // call to gc_should_collect(). 
- int young_count = gcstate->young.count; - gcstate->young.count = 0; - gcstate->deferred_count += young_count; - _PyEval_StartTheWorld(interp); - return false; + else { + // Memory usage has not grown enough. Defer the collection, rolling the + // young count into deferred_count so we don't keep checking on every + // call to gc_should_collect(). + PyMutex_Lock(&gcstate->mutex); + int young_count = _Py_atomic_exchange_int(&gcstate->young.count, 0); + _Py_atomic_store_ssize_relaxed(&gcstate->deferred_count, + gcstate->deferred_count + young_count); + PyMutex_Unlock(&gcstate->mutex); + return false; + } } static bool @@ -2101,7 +2070,7 @@ gc_should_collect(PyThreadState *tstate) // objects. return false; } - return true; + return gc_should_collect_mem_usage(tstate); } static void @@ -2266,11 +2235,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, // to be freed. delete_garbage(state); - // Record mimalloc heap usage as the baseline for the next collection's - // growth check. Stop-the-world so the heap structures are stable. - _PyEval_StopTheWorld(interp); - state->gcstate->last_gc_used = get_all_mimalloc_used_kb(interp); - _PyEval_StartTheWorld(interp); + // Record the current heap bytes estimate as new baseline. + Py_ssize_t last_heap_bytes = gc_get_heap_bytes(interp); + _Py_atomic_store_ssize_relaxed(&state->gcstate->last_heap_bytes, last_heap_bytes); // Append objects with legacy finalizers to the "gc.garbage" list. handle_legacy_finalizers(state); @@ -2316,10 +2283,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) _Py_atomic_store_int(&gcstate->collecting, 0); return 0; } - if (reason == _Py_GC_REASON_HEAP && !gc_should_collect_mem_usage(tstate)) { - _Py_atomic_store_int(&gcstate->collecting, 0); - return 0; - } gcstate->frame = tstate->current_frame; assert(generation >= 0 && generation < NUM_GENERATIONS); From 14b9696d31f7ae78a3ac1b9e33cbcc248d3be5d5 Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Thu, 30 Apr 2026 08:19:56 -0700 Subject: [PATCH 5/7] Avoid warning of unused functions. 
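
The full-page accounting helpers are only used in the free-threaded build,
so wrap both their forward declarations and their definitions in a single
#ifdef Py_GIL_DISABLED block rather than guarding each function body.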
---
 Objects/obmalloc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index ded77087680003a..cd974e406d2e141 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -22,8 +22,10 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page);
 static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force);
 static void _PyMem_mi_page_reclaimed(mi_page_t *page);
 static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap);
+#ifdef Py_GIL_DISABLED
 static void _PyMem_mi_page_full_inc(mi_page_t *page);
 static void _PyMem_mi_page_full_dec(mi_page_t *page);
+#endif
 # include "pycore_mimalloc.h"
 # include "mimalloc/static.c"
 # include "mimalloc/internal.h"  // for stats
@@ -256,25 +258,21 @@ _PyMem_mi_page_pool_full_bytes(mi_page_t *page)
     return (Py_ssize_t *)
         &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes;
 }
-#endif
 
 static void
 _PyMem_mi_page_full_inc(mi_page_t *page)
 {
-#ifdef Py_GIL_DISABLED
     _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
                          _PyMem_mi_page_size(page));
-#endif
 }
 
 static void
 _PyMem_mi_page_full_dec(mi_page_t *page)
 {
-#ifdef Py_GIL_DISABLED
     _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page),
                          -_PyMem_mi_page_size(page));
-#endif
 }
+#endif
 
 static void
 _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap)

From 801737fd85a5fa919840fdcbdbb826fa5672b768 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer
Date: Sat, 2 May 2026 08:02:39 -0700
Subject: [PATCH 6/7] Store full_page_bytes in mi_heap_t.

This should avoid memory contention.

Avoid casting *intptr_t to *Py_ssize_t.

Include large and huge pages in count (promote eagerly to MI_BIN_FULL).

Add a comment noting that abandoned pages can potentially be lost (their
byte count never being subtracted).
---
 Include/internal/mimalloc/mimalloc/types.h |  11 +-
 Include/internal/pycore_gc.h               |   2 +
 Include/internal/pycore_mimalloc.h         |   5 +
 Lib/test/test_gc.py                        |  54 +++++++-
 Modules/_testinternalcapi.c                |   7 +
 Objects/mimalloc/heap.c                    |  13 ++
 Objects/mimalloc/init.c                    |   5 +-
 Objects/mimalloc/page-queue.c              |   6 +-
 Objects/mimalloc/page.c                    | 148 +++++++++++++++++++--
 Objects/obmalloc.c                         |  51 -------
 Python/gc_free_threading.c                 |  57 ++++++--
 11 files changed, 276 insertions(+), 83 deletions(-)

diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h
index 87ee42f67b27e57..178825ab2904a1a 100644
--- a/Include/internal/mimalloc/mimalloc/types.h
+++ b/Include/internal/mimalloc/mimalloc/types.h
@@ -517,9 +517,11 @@ typedef struct mi_abandoned_pool_s {
   // still be read.
   mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0
 
-  // Total bytes (block_size * capacity) of pages currently in MI_BIN_FULL
-  // state whose pool association is this pool.
+#if MI_FULL_PAGE_BYTES
+  // Bytes (block_size * capacity) of full pages currently abandoned to this
+  // pool.
   mi_decl_cache_align _Atomic(intptr_t) full_page_bytes; // = 0
+#endif
 } mi_abandoned_pool_t;
 
 
@@ -592,6 +594,11 @@ struct mi_heap_s {
   uint8_t tag; // custom identifier for this heap
   uint8_t debug_offset; // number of bytes to preserve when filling freed or uninitialized memory
   bool page_use_qsbr; // should freeing pages be delayed using QSBR
+#if MI_FULL_PAGE_BYTES
+  // Bytes (block_size * capacity) of pages currently in MI_BIN_FULL state
+  // owned by this heap.
+  _Atomic(intptr_t) full_page_bytes;
+#endif
 };
 
diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h
index e105677cd2e674a..345f2f51519698e 100644
--- a/Include/internal/pycore_gc.h
+++ b/Include/internal/pycore_gc.h
@@ -337,6 +337,8 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar
 #ifdef Py_GIL_DISABLED
 extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
                                            gcvisitobjects_t callback, void *arg);
+// Estimate of bytes allocated by mimalloc.
+PyAPI_FUNC(Py_ssize_t) _PyGC_GetHeapBytes(PyInterpreterState *interp);
 #endif
 
 #ifdef __cplusplus
diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h
index d870d01beb702c0..733d37d1ffd53dc 100644
--- a/Include/internal/pycore_mimalloc.h
+++ b/Include/internal/pycore_mimalloc.h
@@ -36,6 +36,11 @@ typedef enum {
 #  define MI_TSAN 1
 #endif
 
+#ifdef Py_GIL_DISABLED
+// Track full-page byte totals on each mi_heap_t and mi_abandoned_pool_t.
+#  define MI_FULL_PAGE_BYTES 1
+#endif
+
 #ifdef __cplusplus
 extern "C++" {
 #endif
diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py
index 88d265cbc21709d..0469476e58e931f 100644
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@@ -1271,8 +1271,56 @@ def test():
 
         assert_python_ok("-c", code_inside_function)
 
-    @unittest.skipUnless(Py_GIL_DISABLED, "requires free-threaded GC")
-    @unittest.skipIf(_testinternalcapi is None, "requires _testinternalcapi")
+
+@unittest.skipUnless(Py_GIL_DISABLED, "requires free-threaded GC")
+@unittest.skipIf(_testinternalcapi is None, "requires _testinternalcapi")
+class FreeThreadingTests(unittest.TestCase):
+    # Tests that are specific to the free-threading GC.
+
+    def test_gc_heap_bytes_large_allocs(self):
+        # The free-threaded GC threshold uses _PyGC_GetHeapBytes(), which
+        # sums mimalloc's full_page_bytes counters. Large/huge pages
+        # (>MI_MEDIUM_OBJ_SIZE_MAX, MI_BIN_HUGE) get eagerly promoted to
+        # MI_BIN_FULL by `_mi_malloc_generic` -- without that, mimalloc
+        # would never count these pages, and a cycle holding a large
+        # buffer would not register as memory pressure.
+        gc.collect()
+        baseline = _testinternalcapi.get_gc_heap_bytes()
+        size = 1 << 20  # 1 MiB
+        k = 5
+        data = [bytearray(size) for _ in range(k)]
+        after_alloc = _testinternalcapi.get_gc_heap_bytes()
+        # All k pages should be counted. Page size rounds up the request,
+        # so the increase should be at least k * size.
+        self.assertGreaterEqual(after_alloc - baseline, k * size)
+        del data
+        gc.collect()
+        after_free = _testinternalcapi.get_gc_heap_bytes()
+        # Freeing the lone block in each huge page un-fulls it. Allow some
+        # slop for unrelated allocations triggered by gc.collect().
+        self.assertLess(abs(after_free - baseline), size)
+
+    def test_gc_heap_bytes_many_small_allocs(self):
+        # Filling small pages should also bump the counter. Small/medium
+        # transitions are lazy (only when a page actually becomes full), so
+        # use enough allocations to fill many pages.
+        gc.collect()
+        baseline = _testinternalcapi.get_gc_heap_bytes()
+        n = 100_000
+        objs = [bytes(4) for _ in range(n)]
+        after_alloc = _testinternalcapi.get_gc_heap_bytes()
+        self.assertGreater(after_alloc - baseline, 1 << 20)
+        del objs
+        gc.collect()
+        after_free = _testinternalcapi.get_gc_heap_bytes()
+        # Should drop substantially once the pages empty out.
+ self.assertLess(after_free - baseline, (after_alloc - baseline) // 2) + + def test_gc_heap_bytes_nonneg(self): + # Counter is intptr_t and only increases or decreases via paired + # hooks; it must never go negative. + self.assertGreaterEqual(_testinternalcapi.get_gc_heap_bytes(), 0) + def test_tuple_untrack_counts(self): # This ensures that the free-threaded GC is counting untracked tuples # in the "long_lived_total" count. This is required to avoid diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5319d9c7a4819be..20e5b510ffd9df5 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -2636,6 +2636,12 @@ get_long_lived_total(PyObject *self, PyObject *Py_UNUSED(ignored)) return PyLong_FromInt64(PyInterpreterState_Get()->gc.long_lived_total); } +static PyObject * +get_gc_heap_bytes(PyObject *self, PyObject *Py_UNUSED(ignored)) +{ + return PyLong_FromSsize_t(_PyGC_GetHeapBytes(PyInterpreterState_Get())); +} + #endif static PyObject * @@ -3001,6 +3007,7 @@ static PyMethodDef module_functions[] = { {"get_tlbc", get_tlbc, METH_O, NULL}, {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, {"get_long_lived_total", get_long_lived_total, METH_NOARGS}, + {"get_gc_heap_bytes", get_gc_heap_bytes, METH_NOARGS}, #endif #ifdef _Py_TIER2 {"uop_symbols_test", _Py_uop_symbols_test, METH_NOARGS}, diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c index 5fbfb82baa02040..c4ac30cde26f1b4 100644 --- a/Objects/mimalloc/heap.c +++ b/Objects/mimalloc/heap.c @@ -270,6 +270,11 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); heap->thread_delayed_free = NULL; heap->page_count = 0; +#if MI_FULL_PAGE_BYTES + // All pages have been removed (destroyed, or transferred via + // mi_heap_absorb which already moved the bytes to the destination heap). + mi_atomic_store_relaxed(&heap->full_page_bytes, (intptr_t)0); +#endif } // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. @@ -427,6 +432,14 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { } mi_assert_internal(from->page_count == 0); +#if MI_FULL_PAGE_BYTES + // The page-state hooks didn't fire for these transfers, so move the + // full_page_bytes accounting in bulk. mi_heap_reset_pages(from) below + // will zero `from->full_page_bytes`. + intptr_t bytes = mi_atomic_load_relaxed(&from->full_page_bytes); + mi_atomic_addi(&heap->full_page_bytes, bytes); +#endif + // and do outstanding delayed frees in the `from` heap // note: be careful here as the `heap` field in all those pages no longer point to `from`, // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c index 64411bf1c77fdd3..2ab0cb414347df2 100644 --- a/Objects/mimalloc/init.c +++ b/Objects/mimalloc/init.c @@ -104,7 +104,10 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { false, 0, 0, - 0 + 0, +#if MI_FULL_PAGE_BYTES + MI_ATOMIC_VAR_INIT(0), // full_page_bytes +#endif }; #define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) diff --git a/Objects/mimalloc/page-queue.c b/Objects/mimalloc/page-queue.c index cb54b3740196e97..d343f9fab196675 100644 --- a/Objects/mimalloc/page-queue.c +++ b/Objects/mimalloc/page-queue.c @@ -151,7 +151,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* uint8_t bin = (mi_page_is_in_full(page) ? 
MI_BIN_FULL : mi_bin(page->xblock_size)); mi_assert_internal(bin <= MI_BIN_FULL); mi_page_queue_t* pq = &heap->pages[bin]; - mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size); + mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size); return pq; } @@ -264,7 +264,9 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) || (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) || (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) || - (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to))); + (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)) || + (mi_page_queue_is_huge(from) && mi_page_queue_is_full(to)) || + (mi_page_queue_is_full(from) && mi_page_queue_is_huge(to))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; diff --git a/Objects/mimalloc/page.c b/Objects/mimalloc/page.c index ca71246c988ca3a..f8891d375eee085 100644 --- a/Objects/mimalloc/page.c +++ b/Objects/mimalloc/page.c @@ -255,6 +255,78 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } +/* ----------------------------------------------------------- + Full-page byte accounting (MI_FULL_PAGE_BYTES) + + Maintain `mi_heap_t.full_page_bytes` (bytes of MI_BIN_FULL pages owned by + the heap) and `mi_abandoned_pool_t.full_page_bytes` (bytes of MI_BIN_FULL + pages currently abandoned to that pool). Page weight is + `mi_page_block_size(page) * page->capacity`. Capacity is stable while a + page is in the full queue (`mi_page_extend_free` only runs on non-full + queues), so inc and dec see the same value. + + State machine: + to-full : heap += size + from-full : heap -= size + abandon a full : heap -= size; pool += size + reclaim a full : pool -= size; heap += size + free a full : heap -= size + + The in_full bit is unconditionally cleared by `mi_page_queue_remove`, so + `_mi_page_abandon` re-sets it after queue_remove to preserve the "this + page's bytes were transferred to the pool" marker through abandonment. + `_mi_page_reclaim` then routes such pages straight to MI_BIN_FULL, so + `mi_page_queue_push` keeps the bit set; subsequent unfull/free fires the + matching dec. + + Large/huge pages (block_size > MI_MEDIUM_OBJ_SIZE_MAX) are 1-block pages + in MI_BIN_HUGE; mimalloc never walks that queue on a subsequent alloc, so + it would never call `mi_page_to_full` on them. `_mi_malloc_generic` + therefore eagerly calls `mi_page_to_full` on a freshly-filled huge page + (see the MI_FULL_PAGE_BYTES block at the bottom of that function). + Inc/dec then proceed identically to small/medium pages. + + Known minor leak: if a page abandoned-while-full later becomes empty and + then freed, the +size we added on abandon is never subtracted. +----------------------------------------------------------- */ + +#if MI_FULL_PAGE_BYTES +static inline intptr_t mi_page_full_size(mi_page_t* page) { + return (intptr_t)(mi_page_block_size(page) * (size_t)page->capacity); +} + +static void mi_page_full_inc(mi_page_t* page) { + mi_atomic_addi(&mi_page_heap(page)->full_page_bytes, mi_page_full_size(page)); +} + +static void mi_page_full_dec(mi_page_t* page) { + mi_atomic_addi(&mi_page_heap(page)->full_page_bytes, -mi_page_full_size(page)); +} + +// Called from `_mi_page_abandon` *before* the page's heap pointer is cleared. 
+// Transfers the page's bytes from its heap to the pool that will own the +// abandoned page. No-op if the page is not currently in MI_BIN_FULL. +static void mi_page_full_abandon(mi_page_t* page) { + if (!mi_page_is_in_full(page)) return; + intptr_t bytes = mi_page_full_size(page); + mi_heap_t* heap = mi_page_heap(page); + mi_atomic_addi(&heap->full_page_bytes, -bytes); + mi_atomic_addi(&heap->tld->segments.abandoned->full_page_bytes, bytes); +} + +// Called from `_mi_page_reclaim` when a page abandoned-while-full is +// returning to a heap. in_full=true here means "this page's bytes are +// currently in the pool counter from abandon". Transfer them: pool -= size, +// new-heap += size. The caller routes the page directly into MI_BIN_FULL, +// so the in_full bit (and matching dec hook on free/unfull) survives. +static void mi_page_full_reclaim(mi_page_t* page) { + if (!mi_page_is_in_full(page)) return; + intptr_t bytes = mi_page_full_size(page); + mi_heap_t* heap = mi_page_heap(page); + mi_atomic_addi(&heap->tld->segments.abandoned->full_page_bytes, -bytes); + mi_atomic_addi(&heap->full_page_bytes, bytes); +} +#endif // MI_FULL_PAGE_BYTES /* ----------------------------------------------------------- @@ -271,8 +343,24 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); #endif - // TODO: push on full queue immediately if it is full? - mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); + mi_page_queue_t* pq; +#if MI_FULL_PAGE_BYTES + // If the page was abandoned full (in_full preserved as marker), route + // it directly to MI_BIN_FULL. Pushing to the size-bucket queue would + // rely on a later alloc walking that queue to promote it via + // mi_page_to_full -- which happens for small/medium bins but never for + // MI_BIN_HUGE, so a reclaimed full huge page would otherwise leave the + // pool counter without re-crediting any heap. mi_page_full_reclaim + // does the pool-to-heap transfer. + if (mi_page_is_in_full(page)) { + pq = &heap->pages[MI_BIN_FULL]; + } else { + pq = mi_page_queue(heap, mi_page_block_size(page)); + } + mi_page_full_reclaim(page); +#else + pq = mi_page_queue(heap, mi_page_block_size(page)); +#endif mi_page_queue_push(heap, pq, page); _PyMem_mi_page_reclaimed(page); mi_assert_expensive(_mi_page_is_valid(page)); @@ -360,8 +448,8 @@ void _mi_page_unfull(mi_page_t* page) { mi_assert_internal(mi_page_is_in_full(page)); if (!mi_page_is_in_full(page)) return; -#ifdef Py_GIL_DISABLED - _PyMem_mi_page_full_dec(page); +#if MI_FULL_PAGE_BYTES + mi_page_full_dec(page); #endif mi_heap_t* heap = mi_page_heap(page); @@ -378,8 +466,8 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(!mi_page_is_in_full(page)); if (mi_page_is_in_full(page)) return; -#ifdef Py_GIL_DISABLED - _PyMem_mi_page_full_inc(page); +#if MI_FULL_PAGE_BYTES + mi_page_full_inc(page); #endif mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page,false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set @@ -398,6 +486,13 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* pheap = mi_page_heap(page); +#if MI_FULL_PAGE_BYTES + // Capture in_full while the heap pointer is still valid; transfer the + // bytes from heap counter to pool counter. Must run before + // mi_page_queue_remove, which clears the in_full bit unconditionally. 
+ bool was_in_full = mi_page_is_in_full(page); + mi_page_full_abandon(page); +#endif #ifdef Py_GIL_DISABLED if (page->qsbr_node.next != NULL) { // remove from QSBR queue, but keep the goal @@ -413,6 +508,15 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_page_set_heap(page, NULL); +#if MI_FULL_PAGE_BYTES + // Preserve the in_full marker through abandonment so `_mi_page_reclaim`'s + // `mi_page_full_reclaim` call can transfer the bytes back to the + // reclaiming heap. Nothing reads in_full on a heap-less page. + if (was_in_full) { + mi_page_set_in_full(page, true); + } +#endif + #if (MI_DEBUG>1) && !MI_TRACK_ENABLED // check there are no references left.. for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { @@ -442,12 +546,16 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { #ifdef Py_GIL_DISABLED mi_assert_internal(page->qsbr_goal == 0); mi_assert_internal(page->qsbr_node.next == NULL); - // Defensive: a full page whose last block is freed locally goes through +#endif +#if MI_FULL_PAGE_BYTES + // A full page whose last block is freed locally goes through // _mi_page_retire -> _PyMem_mi_page_maybe_free -> _mi_page_free without - // ever calling _mi_page_unfull, so the per-thread full-page counter must - // be decremented here to maintain the invariant. + // ever calling _mi_page_unfull, so the heap's full_page_bytes counter + // must be decremented here to maintain the invariant. `heap` is non-NULL + // for any page reaching _mi_page_free (abandoned pages take the + // segment-level cleanup path instead). if (mi_page_is_in_full(page)) { - _PyMem_mi_page_full_dec(page); + mi_page_full_dec(page); } #endif @@ -977,14 +1085,28 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc) + void* p; if mi_unlikely(zero && page->xblock_size == 0) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. - void* p = _mi_page_malloc(heap, page, size, false); + p = _mi_page_malloc(heap, page, size, false); mi_assert_internal(p != NULL); _mi_memzero_aligned(p, mi_page_usable_block_size(page)); - return p; } else { - return _mi_page_malloc(heap, page, size, zero); + p = _mi_page_malloc(heap, page, size, zero); + } + +#if MI_FULL_PAGE_BYTES + // Eagerly promote a freshly-filled huge page (1 block per page, in + // MI_BIN_HUGE) to MI_BIN_FULL so its bytes get counted. See the + // "Full-page byte accounting" comment block above. 
+ if (p != NULL && !mi_page_immediate_available(page)) { + mi_page_queue_t* page_pq = mi_page_queue_of(page); + if (mi_page_queue_is_huge(page_pq) && !mi_page_is_in_full(page)) { + mi_page_to_full(page, page_pq); + } } +#endif + + return p; } diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c index cd974e406d2e141..1809bd30451327b 100644 --- a/Objects/obmalloc.c +++ b/Objects/obmalloc.c @@ -22,10 +22,6 @@ static bool _PyMem_mi_page_is_safe_to_free(mi_page_t *page); static bool _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force); static void _PyMem_mi_page_reclaimed(mi_page_t *page); static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap); -#ifdef Py_GIL_DISABLED -static void _PyMem_mi_page_full_inc(mi_page_t *page); -static void _PyMem_mi_page_full_dec(mi_page_t *page); -#endif # include "pycore_mimalloc.h" # include "mimalloc/static.c" # include "mimalloc/internal.h" // for stats @@ -227,53 +223,6 @@ _PyMem_mi_page_reclaimed(mi_page_t *page) #endif } -// Hooks called from mimalloc page-state transitions to maintain -// mi_abandoned_pool_t::full_page_bytes -- bytes (block_size * capacity) of -// pages currently in MI_BIN_FULL state whose pool association is that pool. -// Page weight uses the same formula as should_advance_qsbr_for_page above; -// capacity is stable while a page is in the full queue (extend_free is only -// called on non-full queues), so inc and dec see the same value. -// -// The pool a page counts toward is heap->tld->segments.abandoned, which for a -// Python tstate-bound heap is &interp->mimalloc.abandoned_pool, and for -// mimalloc's auto-created default heap is _mi_abandoned_default. Pages do -// not cross pools (mimalloc reclaim only pulls from the reclaiming heap's -// own pool), so the counter stays valid across abandon/reclaim without any -// hand-off -- abandon and reclaim therefore have no hooks of their own. -// -// The hooks fire only on slow paths: mi_page_to_full / _mi_page_unfull / -// in-full _mi_page_free. gc_get_heap_bytes() in gc_free_threading.c reads the -// per-interp pool plus _mi_abandoned_default to get a stop-the-world-free -// memory-pressure proxy. -#ifdef Py_GIL_DISABLED -static inline Py_ssize_t -_PyMem_mi_page_size(mi_page_t *page) -{ - return (Py_ssize_t)(mi_page_block_size(page) * (size_t)page->capacity); -} - -static inline Py_ssize_t * -_PyMem_mi_page_pool_full_bytes(mi_page_t *page) -{ - return (Py_ssize_t *) - &mi_page_heap(page)->tld->segments.abandoned->full_page_bytes; -} - -static void -_PyMem_mi_page_full_inc(mi_page_t *page) -{ - _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page), - _PyMem_mi_page_size(page)); -} - -static void -_PyMem_mi_page_full_dec(mi_page_t *page) -{ - _Py_atomic_add_ssize(_PyMem_mi_page_pool_full_bytes(page), - -_PyMem_mi_page_size(page)); -} -#endif - static void _PyMem_mi_heap_collect_qsbr(mi_heap_t *heap) { diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 5ae23d875a60a64..af24508d11e346f 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -17,6 +17,10 @@ #include "pydtrace.h" +// Declared in mimalloc/internal.h only at function scope; we read its +// full_page_bytes counter from _PyGC_GetHeapBytes(). +extern mi_heap_t _mi_heap_main; + // Minimum growth in mimalloc heap bytes (estimated from full pages) since the // last GC. #define GC_HEAP_BYTES_MIN_DELTA (512 * 1024) @@ -2002,16 +2006,45 @@ cleanup_worklist(struct worklist *worklist) } // Return an estimate, in bytes, of how much memory is being used. 
-static Py_ssize_t -gc_get_heap_bytes(PyInterpreterState *interp) -{ - // Computed from mimalloc full-page byte counters maintained on each - // abandoned pool (see _PyMem_mi_page_full_inc/dec in Objects/obmalloc.c). - Py_ssize_t total = _Py_atomic_load_ssize_relaxed( - (Py_ssize_t *)&interp->mimalloc.abandoned_pool.full_page_bytes); - total += _Py_atomic_load_ssize_relaxed( - (Py_ssize_t *)&_mi_abandoned_default.full_page_bytes); - return total; +// +// Computed from mimalloc full-page byte counters: each mi_heap_t and +// mi_abandoned_pool_t carries a `full_page_bytes` field maintained by the +// page-state helpers in Objects/mimalloc/page.c. We sum: +// - per-tstate heaps for this interpreter (live full pages) +// - the interpreter's abandoned pool (full pages between abandon and reclaim) +// - _mi_heap_main (default heap on the main thread, used pre-tstate and +// for non-Python threads) +// - _mi_abandoned_default (full pages abandoned from default heaps) +// Per-thread auto-default heaps used by non-Python threads are not +// enumerated; their bytes show up in _mi_abandoned_default once the OS +// thread exits. This is acceptable because almost all FT-Python allocation +// routes through tstate-bound heaps. +Py_ssize_t +_PyGC_GetHeapBytes(PyInterpreterState *interp) +{ + // `full_page_bytes` is `_Atomic(intptr_t)`; cast to `intptr_t *` to + // strip the qualifier for the CPython atomic helpers. The mimalloc-side + // writes use `mi_atomic_addi` directly on the `_Atomic(intptr_t)` field; + // the cast is only needed for the read side. + intptr_t total = _Py_atomic_load_intptr_relaxed( + (intptr_t *)&interp->mimalloc.abandoned_pool.full_page_bytes); + total += _Py_atomic_load_intptr_relaxed( + (intptr_t *)&_mi_abandoned_default.full_page_bytes); + total += _Py_atomic_load_intptr_relaxed( + (intptr_t *)&_mi_heap_main.full_page_bytes); + HEAD_LOCK(&_PyRuntime); + _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { + _PyThreadStateImpl *t = (_PyThreadStateImpl *)p; + if (!_Py_atomic_load_int(&t->mimalloc.initialized)) { + continue; + } + for (int h = 0; h < _Py_MIMALLOC_HEAP_COUNT; h++) { + total += _Py_atomic_load_intptr_relaxed( + (intptr_t *)&t->mimalloc.heaps[h].full_page_bytes); + } + } + HEAD_UNLOCK(&_PyRuntime); + return (Py_ssize_t)total; } // Decide whether memory usage has grown enough to warrant a collection. @@ -2029,7 +2062,7 @@ gc_should_collect_mem_usage(PyThreadState *tstate) // usage increase. return true; } - Py_ssize_t cur = gc_get_heap_bytes(interp); + Py_ssize_t cur = _PyGC_GetHeapBytes(interp); Py_ssize_t last = _Py_atomic_load_ssize_relaxed(&gcstate->last_heap_bytes); // Require 20% increase in full mimalloc pages. Py_ssize_t delta = Py_MAX(last / 5, GC_HEAP_BYTES_MIN_DELTA); @@ -2236,7 +2269,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state, delete_garbage(state); // Record the current heap bytes estimate as new baseline. - Py_ssize_t last_heap_bytes = gc_get_heap_bytes(interp); + Py_ssize_t last_heap_bytes = _PyGC_GetHeapBytes(interp); _Py_atomic_store_ssize_relaxed(&state->gcstate->last_heap_bytes, last_heap_bytes); // Append objects with legacy finalizers to the "gc.garbage" list. From 057f8625f244420b707a09284fab52a938594e9c Mon Sep 17 00:00:00 2001 From: Neil Schemenauer Date: Sun, 3 May 2026 11:37:10 -0700 Subject: [PATCH 7/7] Use _mi_heap_main_get(). 
--- Python/gc_free_threading.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index af24508d11e346f..9f0b80a85bab75f 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -17,10 +17,6 @@ #include "pydtrace.h" -// Declared in mimalloc/internal.h only at function scope; we read its -// full_page_bytes counter from _PyGC_GetHeapBytes(). -extern mi_heap_t _mi_heap_main; - // Minimum growth in mimalloc heap bytes (estimated from full pages) since the // last GC. #define GC_HEAP_BYTES_MIN_DELTA (512 * 1024) @@ -2006,32 +2002,27 @@ cleanup_worklist(struct worklist *worklist) } // Return an estimate, in bytes, of how much memory is being used. -// -// Computed from mimalloc full-page byte counters: each mi_heap_t and -// mi_abandoned_pool_t carries a `full_page_bytes` field maintained by the -// page-state helpers in Objects/mimalloc/page.c. We sum: -// - per-tstate heaps for this interpreter (live full pages) -// - the interpreter's abandoned pool (full pages between abandon and reclaim) -// - _mi_heap_main (default heap on the main thread, used pre-tstate and -// for non-Python threads) -// - _mi_abandoned_default (full pages abandoned from default heaps) -// Per-thread auto-default heaps used by non-Python threads are not -// enumerated; their bytes show up in _mi_abandoned_default once the OS -// thread exits. This is acceptable because almost all FT-Python allocation -// routes through tstate-bound heaps. Py_ssize_t _PyGC_GetHeapBytes(PyInterpreterState *interp) { - // `full_page_bytes` is `_Atomic(intptr_t)`; cast to `intptr_t *` to - // strip the qualifier for the CPython atomic helpers. The mimalloc-side - // writes use `mi_atomic_addi` directly on the `_Atomic(intptr_t)` field; - // the cast is only needed for the read side. + // Computed from mimalloc full-page byte counters: each mi_heap_t and + // mi_abandoned_pool_t carries a `full_page_bytes` field. + // Sum: + // - per-tstate heaps for this interpreter (live full pages) + // - the interpreter's abandoned pool (full pages between abandon and reclaim) + // - _mi_heap_main (default heap on the main thread, used pre-tstate and + // for non-Python threads) + // - _mi_abandoned_default (full pages abandoned from default heaps) + // Per-thread auto-default heaps used by non-Python threads are not + // enumerated; their bytes show up in _mi_abandoned_default once the OS + // thread exits. This should be acceptable because almost all Python + // allocation is done by tstate-bound heaps. intptr_t total = _Py_atomic_load_intptr_relaxed( (intptr_t *)&interp->mimalloc.abandoned_pool.full_page_bytes); total += _Py_atomic_load_intptr_relaxed( (intptr_t *)&_mi_abandoned_default.full_page_bytes); total += _Py_atomic_load_intptr_relaxed( - (intptr_t *)&_mi_heap_main.full_page_bytes); + (intptr_t *)&_mi_heap_main_get()->full_page_bytes); HEAD_LOCK(&_PyRuntime); _Py_FOR_EACH_TSTATE_UNLOCKED(interp, p) { _PyThreadStateImpl *t = (_PyThreadStateImpl *)p;