11 changes: 11 additions & 0 deletions Include/internal/mimalloc/mimalloc/types.h
@@ -516,6 +516,12 @@ typedef struct mi_abandoned_pool_s {
// in order to prevent resetting/decommitting segment memory if it might
// still be read.
mi_decl_cache_align _Atomic(size_t) abandoned_readers; // = 0

#if MI_FULL_PAGE_BYTES
// Bytes (block_size * capacity) of full pages currently abandoned to this
// pool.
mi_decl_cache_align _Atomic(intptr_t) full_page_bytes; // = 0
Contributor
I'm worried about contention here because all full/not-full operations modify this shared variable. Repeated allocation/deallocation of a single block can cause the containing page to be repeatedly marked as full/not-full.

I'd prefer we do the counting in per-thread state instead of here, which is effectively per-interpreter state. We can add the total to a per-interpreter counter when the thread exits. That means gc_should_collect_mem_usage would need to loop over all thread states to get an estimate of the allocated bytes, but I think that's a worthwhile tradeoff.
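A rough sketch of the scheme proposed above, for illustration only — the struct layout, field names, and helpers here are all hypothetical and not part of this PR:

```c
/* Hypothetical illustration of per-thread counting with a fold-in on thread
 * exit; none of these names exist in CPython or in this PR. */
#include <stddef.h>

typedef struct thread_state_sketch {
    intptr_t full_page_bytes;             /* updated only by its own thread */
    struct thread_state_sketch *next;
} thread_state_sketch;

typedef struct {
    thread_state_sketch *threads_head;    /* live thread states */
    intptr_t exited_threads_bytes;        /* totals folded in at thread exit */
} interp_sketch;

/* On thread exit, fold the thread's running total into the interpreter. */
static void
fold_on_thread_exit(interp_sketch *interp, thread_state_sketch *tstate)
{
    interp->exited_threads_bytes += tstate->full_page_bytes;
    tstate->full_page_bytes = 0;
}

/* gc_should_collect_mem_usage() would then estimate allocated bytes by
 * walking the live thread states and adding the exited-thread total. */
static intptr_t
estimate_allocated_bytes(const interp_sketch *interp)
{
    intptr_t total = interp->exited_threads_bytes;
    for (const thread_state_sketch *t = interp->threads_head; t != NULL; t = t->next) {
        total += t->full_page_bytes;
    }
    return total;
}
```

The walk over thread states would only happen when a collection is being considered, which is the tradeoff the comment refers to.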

Member Author
I added full_page_bytes to the mi_heap_t struct, which is per-thread. Putting it inside the Python thread state does work but it adds some extra complication.

#endif
} mi_abandoned_pool_t;


@@ -588,6 +594,11 @@ struct mi_heap_s {
uint8_t tag; // custom identifier for this heap
uint8_t debug_offset; // number of bytes to preserve when filling freed or uninitialized memory
bool page_use_qsbr; // should freeing pages be delayed using QSBR
#if MI_FULL_PAGE_BYTES
// Bytes (block_size * capacity) of pages currently in MI_BIN_FULL state
// owned by this heap.
_Atomic(intptr_t) full_page_bytes;
#endif
};


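These counters are meant to track, per heap and per abandoned pool, the bytes of pages currently sitting in the MI_BIN_FULL queue. The hooks that adjust them on full/not-full transitions are not shown in this section; a minimal sketch of such an adjustment, with an invented helper name, might look like this:

```c
/* Sketch only: the helper name and call sites are assumptions; only
 * full_page_bytes, mi_page_block_size(), page->capacity and mi_atomic_addi()
 * come from mimalloc / this PR. */
#include "mimalloc/internal.h"   /* path per CPython's vendored mimalloc */

#if MI_FULL_PAGE_BYTES
static inline void
heap_full_page_bytes_adjust(mi_heap_t *heap, const mi_page_t *page, intptr_t sign)
{
    /* "Bytes" means block_size * capacity, matching the field comments. */
    intptr_t bytes = (intptr_t)mi_page_block_size(page) * (intptr_t)page->capacity;
    mi_atomic_addi(&heap->full_page_bytes, sign * bytes);
}
#endif

/* Conceptually: when a page is enqueued onto the MI_BIN_FULL queue,
 *     heap_full_page_bytes_adjust(heap, page, +1);
 * and when it is moved back out (becomes not-full again),
 *     heap_full_page_bytes_adjust(heap, page, -1);
 * so the counter always reflects pages that are currently full. */
```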
2 changes: 2 additions & 0 deletions Include/internal/pycore_gc.h
@@ -337,6 +337,8 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar
#ifdef Py_GIL_DISABLED
extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
gcvisitobjects_t callback, void *arg);
// Estimate of bytes allocated by mimalloc.
PyAPI_FUNC(Py_ssize_t) _PyGC_GetHeapBytes(PyInterpreterState *interp);
#endif

#ifdef __cplusplus
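The implementation of `_PyGC_GetHeapBytes()` is not included in this section. Based on the fields added above, a plausible sketch — the thread-state iteration and the exact field paths are assumptions, not the PR's actual code — is to sum every thread's per-heap counters plus the interpreter's abandoned-pool counter:

```c
/* Sketch under assumptions: interp->threads.head, interp->mimalloc.abandoned_pool
 * and tstate->mimalloc.heaps[] are guesses at the relevant layout; whether the
 * world must be stopped while iterating is also not addressed here. */
Py_ssize_t
_PyGC_GetHeapBytes(PyInterpreterState *interp)
{
    intptr_t total = 0;

    /* Pages abandoned by exited threads no longer belong to any heap. */
    total += mi_atomic_load_relaxed(
        &interp->mimalloc.abandoned_pool.full_page_bytes);

    /* Live threads: add up the counters of each thread's mimalloc heaps. */
    for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
        _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)p;
        for (Py_ssize_t i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) {
            total += mi_atomic_load_relaxed(
                &tstate->mimalloc.heaps[i].full_page_bytes);
        }
    }
    return total > 0 ? (Py_ssize_t)total : 0;
}
```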
6 changes: 3 additions & 3 deletions Include/internal/pycore_interp_structs.h
@@ -260,11 +260,11 @@ struct _gc_runtime_state {
/* True if gc.freeze() has been used. */
int freeze_active;

/* Memory usage of the process (RSS + swap) after last GC. */
Py_ssize_t last_mem;
/* Estimate of the number of bytes used by mimalloc after last GC. */
Py_ssize_t last_heap_bytes;

/* This accumulates the new object count whenever collection is deferred
due to the RSS increase condition not being meet. Reset on collection. */
due to memory usage not increasing enough. Reset on collection. */
Py_ssize_t deferred_count;

/* Mutex held for gc_should_collect_mem_usage(). */
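`gc_should_collect_mem_usage()` itself is not shown here. A hedged sketch of how `last_heap_bytes` and `deferred_count` might drive the decision — the growth factor and the deferral cap below are invented, not the PR's actual policy:

```c
/* Sketch only: threshold values and the way new-object counts reach this
 * function are assumptions. */
#include <stdbool.h>

#define DEFERRED_CAP_SKETCH (1 << 18)   /* hypothetical limit on deferrals */

static bool
should_collect_mem_usage_sketch(struct _gc_runtime_state *gc,
                                PyInterpreterState *interp,
                                Py_ssize_t new_objects)
{
    Py_ssize_t heap_bytes = _PyGC_GetHeapBytes(interp);
    if (heap_bytes > gc->last_heap_bytes + gc->last_heap_bytes / 4) {
        /* The mimalloc estimate grew enough since the last GC: collect. */
        return true;
    }
    /* Not enough growth: defer, but accumulate the skipped object count so
     * that repeated deferrals eventually force a collection anyway. */
    gc->deferred_count += new_objects;
    return gc->deferred_count > DEFERRED_CAP_SKETCH;
}
```

After a collection runs, `last_heap_bytes` would be refreshed from `_PyGC_GetHeapBytes()` and `deferred_count` reset, matching the "Reset on collection" note above.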
5 changes: 5 additions & 0 deletions Include/internal/pycore_mimalloc.h
@@ -36,6 +36,11 @@ typedef enum {
# define MI_TSAN 1
#endif

#ifdef Py_GIL_DISABLED
// Track full-page byte totals on each mi_heap_t and mi_abandoned_pool_t.
# define MI_FULL_PAGE_BYTES 1
#endif

#ifdef __cplusplus
extern "C++" {
#endif
54 changes: 52 additions & 2 deletions Lib/test/test_gc.py
@@ -1271,8 +1271,58 @@ def test():
assert_python_ok("-c", code_inside_function)


@unittest.skipUnless(Py_GIL_DISABLED, "requires free-threaded GC")
@unittest.skipIf(_testinternalcapi is None, "requires _testinternalcapi")
class FreeThreadingTests(unittest.TestCase):
# Tests that are specific to the free-threading GC.

def test_gc_heap_bytes_large_allocs(self):
# The free-threaded GC threshold uses _PyGC_GetHeapBytes(), which
# sums mimalloc's full_page_bytes counters. Large/huge pages
# (>MI_MEDIUM_OBJ_SIZE_MAX, MI_BIN_HUGE) get eagerly promoted to
# MI_BIN_FULL by `_mi_malloc_generic` -- without that, mimalloc
# would never count these pages, and a cycle holding a large
# buffer would not register as memory pressure.
gc.collect()
baseline = _testinternalcapi.get_gc_heap_bytes()
size = 1 << 20 # 1 MiB
k = 5
data = [bytearray(size) for _ in range(k)]
after_alloc = _testinternalcapi.get_gc_heap_bytes()
# All k pages should be counted. Page size rounds up the request,
# so the increase should be at least k * size.
self.assertGreaterEqual(after_alloc - baseline, k * size)
del data
gc.collect()
after_free = _testinternalcapi.get_gc_heap_bytes()
# Freeing the lone block in each huge page un-fulls it. Allow some
# slop for unrelated allocations triggered by gc.collect().
self.assertLess(abs(after_free - baseline), size)

def test_gc_heap_bytes_many_small_allocs(self):
# Filling small pages should also bump the counter. Small/medium
# transitions are lazy (only when a page actually becomes full), so
# use enough allocations to fill many pages.
gc.collect()
baseline = _testinternalcapi.get_gc_heap_bytes()
n = 100_000
objs = [bytes(4) for _ in range(n)]
after_alloc = _testinternalcapi.get_gc_heap_bytes()
self.assertGreater(after_alloc - baseline, 1 << 20)
del objs
gc.collect()
after_free = _testinternalcapi.get_gc_heap_bytes()
# Should drop substantially once the pages empty out.
self.assertLess(after_free - baseline, (after_alloc - baseline) // 2)

def test_gc_heap_bytes_nonneg(self):
# Counter is intptr_t and only increases or decreases via paired
# hooks; it must never go negative.
self.assertGreaterEqual(_testinternalcapi.get_gc_heap_bytes(), 0)

def test_tuple_untrack_counts(self):
# This ensures that the free-threaded GC is counting untracked tuples
# in the "long_lived_total" count. This is required to avoid
@@ -0,0 +1,3 @@
Fix a bug in the free-threaded GC that caused collections to be deferred too
long. This would result in excess memory usage since cyclic trash was not
freed quickly enough.
7 changes: 7 additions & 0 deletions Modules/_testinternalcapi.c
@@ -2642,6 +2642,12 @@ get_long_lived_total(PyObject *self, PyObject *Py_UNUSED(ignored))
return PyLong_FromInt64(PyInterpreterState_Get()->gc.long_lived_total);
}

static PyObject *
get_gc_heap_bytes(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return PyLong_FromSsize_t(_PyGC_GetHeapBytes(PyInterpreterState_Get()));
}

#endif

static PyObject *
@@ -3007,6 +3013,7 @@ static PyMethodDef module_functions[] = {
{"get_tlbc", get_tlbc, METH_O, NULL},
{"get_tlbc_id", get_tlbc_id, METH_O, NULL},
{"get_long_lived_total", get_long_lived_total, METH_NOARGS},
{"get_gc_heap_bytes", get_gc_heap_bytes, METH_NOARGS},
#endif
#ifdef _Py_TIER2
{"uop_symbols_test", _Py_uop_symbols_test, METH_NOARGS},
13 changes: 13 additions & 0 deletions Objects/mimalloc/heap.c
@@ -270,6 +270,11 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
_mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL;
heap->page_count = 0;
#if MI_FULL_PAGE_BYTES
// All pages have been removed (destroyed, or transferred via
// mi_heap_absorb which already moved the bytes to the destination heap).
mi_atomic_store_relaxed(&heap->full_page_bytes, (intptr_t)0);
#endif
}

// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
@@ -427,6 +432,14 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
}
mi_assert_internal(from->page_count == 0);

#if MI_FULL_PAGE_BYTES
// The page-state hooks didn't fire for these transfers, so move the
// full_page_bytes accounting in bulk. mi_heap_reset_pages(from) below
// will zero `from->full_page_bytes`.
intptr_t bytes = mi_atomic_load_relaxed(&from->full_page_bytes);
mi_atomic_addi(&heap->full_page_bytes, bytes);
#endif

// and do outstanding delayed frees in the `from` heap
// note: be careful here as the `heap` field in all those pages no longer point to `from`,
// turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
6 changes: 5 additions & 1 deletion Objects/mimalloc/init.c
@@ -103,7 +103,11 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL, // next
false,
0,
0
0,
0,
#if MI_FULL_PAGE_BYTES
MI_ATOMIC_VAR_INIT(0), // full_page_bytes
#endif
};

#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
6 changes: 4 additions & 2 deletions Objects/mimalloc/page-queue.c
@@ -151,7 +151,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t*
uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
mi_assert_internal(bin <= MI_BIN_FULL);
mi_page_queue_t* pq = &heap->pages[bin];
mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
return pq;
}

@@ -264,7 +264,9 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro
(page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
(page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
(page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
(page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
(page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)) ||
(mi_page_queue_is_huge(from) && mi_page_queue_is_full(to)) ||
(mi_page_queue_is_full(from) && mi_page_queue_is_huge(to)));

mi_heap_t* heap = mi_page_heap(page);
if (page->prev != NULL) page->prev->next = page->next;