[Python-checkins] bpo-37448: Use radix tree for pymalloc address_in_range(). (GH-14474)

nascheme webhook-mailer at python.org
Mon Mar 29 22:51:40 EDT 2021


https://github.com/python/cpython/commit/85b6b70589c187639aeebc560d67e9cc04abb4d8
commit: 85b6b70589c187639aeebc560d67e9cc04abb4d8
branch: master
author: Neil Schemenauer <nas-github at arctrix.com>
committer: nascheme <nas-github at arctrix.com>
date: 2021-03-29T19:51:15-07:00
summary:

bpo-37448: Use radix tree for pymalloc address_in_range(). (GH-14474)

The radix tree approach is a relatively simple and memory sanitary
alternative to the old (slightly) unsanitary address_in_range().
To disable the radix tree map, set a preprocessor flag as follows:
-DWITH_PYMALLOC_RADIX_TREE=0.

Co-authored-by: Tim Peters <tim.peters at gmail.com>

files:
A Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst
M Objects/obmalloc.c
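
As a rough illustration of the scheme in the diff below (an editor's sketch,
not part of the patch): with the 64-bit constants defined in obmalloc.c
(ADDRESS_BITS=48, ARENA_BITS=20, INTERIOR_BITS=10, MAP_BOT_BITS=8), a pointer
splits into three radix-tree indices plus a 20-bit tail.  The pointer value
here is made up.

/* Sketch only: mirrors MAP_TOP_INDEX/MAP_MID_INDEX/MAP_BOT_INDEX for the
 * 64-bit configuration in the diff below. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t p = 0x00007f3a2c5e9d40ULL;               /* hypothetical heap pointer */
    unsigned top  = (unsigned)((p >> 38) & 0x3ff);    /* MAP_TOP_INDEX: 10 bits */
    unsigned mid  = (unsigned)((p >> 28) & 0x3ff);    /* MAP_MID_INDEX: 10 bits */
    unsigned bot  = (unsigned)((p >> 20) & 0xff);     /* MAP_BOT_INDEX:  8 bits */
    unsigned tail = (unsigned)(p & ((1u << 20) - 1)); /* offset within ideal arena */
    printf("top=%u mid=%u bot=%u tail=%#x\n", top, mid, bot, tail);
    return 0;
}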

diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst b/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst
new file mode 100644
index 0000000000000..fe771a50122f6
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-02-21-14-19-35.bpo-37448.btl7vO.rst	
@@ -0,0 +1,15 @@
+Add a radix tree based memory map to track in-use obmalloc arenas.  It is
+used to replace the old implementation of address_in_range().  The radix
+tree approach makes it easy to increase pool sizes beyond the OS page size.
+Boosting the pool and arena size allows obmalloc to handle a significantly
+higher percentage of requests from its ultra-fast paths.
+
+It also has the advantage of eliminating the memory unsanitary behavior of
+the previous address_in_range(). The old address_in_range() was marked with
+the annotations _Py_NO_SANITIZE_ADDRESS, _Py_NO_SANITIZE_THREAD, and
+_Py_NO_SANITIZE_MEMORY. Those annotations are no longer needed.
+
+To disable the radix tree map, set a preprocessor flag as follows:
+`-DWITH_PYMALLOC_RADIX_TREE=0`.
+
+Co-authored-by: Tim Peters <tim.peters at gmail.com>
diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 03d0e8e51264c..76ff6f9c99bc9 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -894,6 +894,22 @@ static int running_on_valgrind = -1;
 #endif
 #endif
 
+#if !defined(WITH_PYMALLOC_RADIX_TREE)
+/* Use a radix tree to track arena memory regions, for address_in_range().
+ * Enabled by default since it allows larger pool sizes.  Can be disabled
+ * using -DWITH_PYMALLOC_RADIX_TREE=0. */
+#define WITH_PYMALLOC_RADIX_TREE 1
+#endif
+
+#if SIZEOF_VOID_P > 4
+/* on 64-bit platforms use larger pools and arenas if we can */
+#define USE_LARGE_ARENAS
+#if WITH_PYMALLOC_RADIX_TREE
+/* large pools only supported if radix-tree is enabled */
+#define USE_LARGE_POOLS
+#endif
+#endif
+
 /*
  * The allocator sub-allocates <Big> blocks of memory (called arenas) aligned
  * on a page boundary. This is a reserved virtual address space for the
@@ -907,18 +923,34 @@ static int running_on_valgrind = -1;
  * Arenas are allocated with mmap() on systems supporting anonymous memory
  * mappings to reduce heap fragmentation.
  */
-#define ARENA_SIZE              (256 << 10)     /* 256KB */
+#ifdef USE_LARGE_ARENAS
+#define ARENA_BITS              20                    /* 1 MiB */
+#else
+#define ARENA_BITS              18                    /* 256 KiB */
+#endif
+#define ARENA_SIZE              (1 << ARENA_BITS)
+#define ARENA_SIZE_MASK         (ARENA_SIZE - 1)
 
 #ifdef WITH_MEMORY_LIMITS
 #define MAX_ARENAS              (SMALL_MEMORY_LIMIT / ARENA_SIZE)
 #endif
 
 /*
- * Size of the pools used for small blocks. Should be a power of 2,
- * between 1K and SYSTEM_PAGE_SIZE, that is: 1k, 2k, 4k.
+ * Size of the pools used for small blocks.  Must be a power of 2.
  */
-#define POOL_SIZE               SYSTEM_PAGE_SIZE        /* must be 2^N */
-#define POOL_SIZE_MASK          SYSTEM_PAGE_SIZE_MASK
+#ifdef USE_LARGE_POOLS
+#define POOL_BITS               14                  /* 16 KiB */
+#else
+#define POOL_BITS               12                  /* 4 KiB */
+#endif
+#define POOL_SIZE               (1 << POOL_BITS)
+#define POOL_SIZE_MASK          (POOL_SIZE - 1)
+
+#if !WITH_PYMALLOC_RADIX_TREE
+#if POOL_SIZE != SYSTEM_PAGE_SIZE
+#   error "pool size must be equal to system page size"
+#endif
+#endif
 
 #define MAX_POOLS_IN_ARENA  (ARENA_SIZE / POOL_SIZE)
 #if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE
@@ -1233,6 +1265,264 @@ _Py_GetAllocatedBlocks(void)
     return n;
 }
 
+#if WITH_PYMALLOC_RADIX_TREE
+/*==========================================================================*/
+/* radix tree for tracking arena usage
+
+   bit allocation for keys
+
+   64-bit pointers and 2^20 arena size:
+     16 -> ignored (POINTER_BITS - ADDRESS_BITS)
+     10 -> MAP_TOP
+     10 -> MAP_MID
+      8 -> MAP_BOT
+     20 -> ideal aligned arena
+   ----
+     64
+
+   32-bit pointers and 2^18 arena size:
+     14 -> MAP_BOT
+     18 -> ideal aligned arena
+   ----
+     32
+
+*/
+
+#if SIZEOF_VOID_P == 8
+
+/* number of bits in a pointer */
+#define POINTER_BITS 64
+
+/* Current 64-bit processors are limited to 48-bit virtual addresses.  For
+ * now, the top 17 bits of a pointer will all be copies of bit 47.  If that
+ * changes in the future, this must be adjusted upwards.
+ */
+#define ADDRESS_BITS 48
+
+/* use the top and mid layers of the radix tree */
+#define USE_INTERIOR_NODES
+
+#elif SIZEOF_VOID_P == 4
+
+#define POINTER_BITS 32
+#define ADDRESS_BITS 32
+
+#else
+
+ /* Currently this code works for 64-bit or 32-bit pointers only.  */
+#error "obmalloc radix tree requires 64-bit or 32-bit pointers."
+
+#endif /* SIZEOF_VOID_P */
+
+/* arena_coverage_t members require this to be true  */
+#if ARENA_BITS >= 32
+#   error "arena size must be < 2^32"
+#endif
+
+#ifdef USE_INTERIOR_NODES
+/* number of bits used for MAP_TOP and MAP_MID nodes */
+#define INTERIOR_BITS ((ADDRESS_BITS - ARENA_BITS + 2) / 3)
+#else
+#define INTERIOR_BITS 0
+#endif
+
+#define MAP_TOP_BITS INTERIOR_BITS
+#define MAP_TOP_LENGTH (1 << MAP_TOP_BITS)
+#define MAP_TOP_MASK (MAP_TOP_LENGTH - 1)
+
+#define MAP_MID_BITS INTERIOR_BITS
+#define MAP_MID_LENGTH (1 << MAP_MID_BITS)
+#define MAP_MID_MASK (MAP_MID_LENGTH - 1)
+
+#define MAP_BOT_BITS (ADDRESS_BITS - ARENA_BITS - 2*INTERIOR_BITS)
+#define MAP_BOT_LENGTH (1 << MAP_BOT_BITS)
+#define MAP_BOT_MASK (MAP_BOT_LENGTH - 1)
+
+#define MAP_BOT_SHIFT ARENA_BITS
+#define MAP_MID_SHIFT (MAP_BOT_BITS + MAP_BOT_SHIFT)
+#define MAP_TOP_SHIFT (MAP_MID_BITS + MAP_MID_SHIFT)
+
+#define AS_UINT(p) ((uintptr_t)(p))
+#define MAP_BOT_INDEX(p) ((AS_UINT(p) >> MAP_BOT_SHIFT) & MAP_BOT_MASK)
+#define MAP_MID_INDEX(p) ((AS_UINT(p) >> MAP_MID_SHIFT) & MAP_MID_MASK)
+#define MAP_TOP_INDEX(p) ((AS_UINT(p) >> MAP_TOP_SHIFT) & MAP_TOP_MASK)
+
+#if POINTER_BITS > ADDRESS_BITS
+/* Return the bits of a pointer above the address space.  Those bits should
+ * be the same for all valid pointers if ADDRESS_BITS is set correctly.
+ * Linux has support for 57-bit address space (Intel 5-level paging) but
+ * will not currently give those addresses to user space.
+ */
+#define HIGH_BITS(p) (AS_UINT(p) >> ADDRESS_BITS)
+#else
+#define HIGH_BITS(p) 0
+#endif
+
+
+/* This is the leaf of the radix tree.  See arena_map_mark_used() for the
+ * meaning of these members. */
+typedef struct {
+    int32_t tail_hi;
+    int32_t tail_lo;
+} arena_coverage_t;
+
+typedef struct arena_map_bot {
+    /* The members tail_hi and tail_lo are accessed together.  So, it is
+     * better to have them as an array of structs rather than as two
+     * separate arrays.
+     */
+    arena_coverage_t arenas[MAP_BOT_LENGTH];
+} arena_map_bot_t;
+
+#ifdef USE_INTERIOR_NODES
+typedef struct arena_map_mid {
+    struct arena_map_bot *ptrs[MAP_MID_LENGTH];
+} arena_map_mid_t;
+
+typedef struct arena_map_top {
+    struct arena_map_mid *ptrs[MAP_TOP_LENGTH];
+} arena_map_top_t;
+#endif
+
+/* The root of the radix tree.  Note that by initializing it like this, the
+ * memory should be in the BSS.  The OS will only memory map pages as the
+ * MAP_MID nodes get used (OS pages are demand loaded as needed).
+ */
+#ifdef USE_INTERIOR_NODES
+static arena_map_top_t arena_map_root;
+/* accounting for number of used interior nodes */
+static int arena_map_mid_count;
+static int arena_map_bot_count;
+#else
+static arena_map_bot_t arena_map_root;
+#endif
+
+/* Return a pointer to a bottom tree node; return NULL if it doesn't exist
+ * or cannot be created. */
+static arena_map_bot_t *
+arena_map_get(block *p, int create)
+{
+#ifdef USE_INTERIOR_NODES
+    /* sanity check that ADDRESS_BITS is correct */
+    assert(HIGH_BITS(p) == HIGH_BITS(&arena_map_root));
+    int i1 = MAP_TOP_INDEX(p);
+    if (arena_map_root.ptrs[i1] == NULL) {
+        if (!create) {
+            return NULL;
+        }
+        arena_map_mid_t *n = PyMem_RawCalloc(1, sizeof(arena_map_mid_t));
+        if (n == NULL) {
+            return NULL;
+        }
+        arena_map_root.ptrs[i1] = n;
+        arena_map_mid_count++;
+    }
+    int i2 = MAP_MID_INDEX(p);
+    if (arena_map_root.ptrs[i1]->ptrs[i2] == NULL) {
+        if (!create) {
+            return NULL;
+        }
+        arena_map_bot_t *n = PyMem_RawCalloc(1, sizeof(arena_map_bot_t));
+        if (n == NULL) {
+            return NULL;
+        }
+        arena_map_root.ptrs[i1]->ptrs[i2] = n;
+        arena_map_bot_count++;
+    }
+    return arena_map_root.ptrs[i1]->ptrs[i2];
+#else
+    return &arena_map_root;
+#endif
+}
+
+
+/* The radix tree only tracks arenas.  So, with the 1 MiB arenas used on
+ * 64-bit builds, the low 20 bits (ARENA_BITS) of an address are thrown
+ * away.  That reduces the space requirement of the tree compared to
+ * similar radix tree page-map schemes.  In exchange for slashing the
+ * space requirement, it needs more computation to check an address.
+ *
+ * Tracking coverage is done by "ideal" arena address.  It is easier to
+ * explain in decimal so let's say that the arena size is 100 bytes.
+ * Then, ideal addresses are 100, 200, 300, etc.  For checking if a
+ * pointer address is inside an actual arena, we have to check two ideal
+ * arena addresses.  E.g. if the pointer is 357, we need to check 200 and
+ * 300.  In the rare case that an arena is aligned in the ideal way
+ * (e.g. base address of arena is 200) then we only have to check one
+ * ideal address.
+ *
+ * The tree nodes for 200 and 300 both record coverage by that arena.
+ * There are two cases: the arena starts at a lower ideal address and
+ * extends into this block, or the arena starts in this block and extends to
+ * the next ideal arena.  The tail_lo and tail_hi members correspond to
+ * these two cases.
+ */
+
+
+/* Mark or unmark addresses covered by the arena. */
+static int
+arena_map_mark_used(uintptr_t arena_base, int is_used)
+{
+    /* sanity check that ADDRESS_BITS is correct */
+    assert(HIGH_BITS(arena_base) == HIGH_BITS(&arena_map_root));
+    arena_map_bot_t *n_hi = arena_map_get((block *)arena_base, is_used);
+    if (n_hi == NULL) {
+        assert(is_used); /* otherwise node should already exist */
+        return 0; /* failed to allocate space for node */
+    }
+    int i3 = MAP_BOT_INDEX((block *)arena_base);
+    int32_t tail = (int32_t)(arena_base & ARENA_SIZE_MASK);
+    if (tail == 0) {
+        /* is ideal arena address */
+        n_hi->arenas[i3].tail_hi = is_used ? -1 : 0;
+    }
+    else {
+        /* The arena_base address is not ideal (i.e. not aligned to the
+         * arena size) and so the arena potentially covers two MAP_BOT
+         * nodes.  Get the MAP_BOT node for the next ideal arena address.
+         * Note that it might be in different MAP_TOP and MAP_MID nodes as
+         * well, so we need to call arena_map_get() again (full traversal).
+         */
+        n_hi->arenas[i3].tail_hi = is_used ? tail : 0;
+        uintptr_t arena_base_next = arena_base + ARENA_SIZE;
+        /* If arena_base is a legit arena address, so is arena_base_next - 1
+         * (last address in arena).  If arena_base_next overflows then it
+         * must overflow to 0.  However, that would mean arena_base was
+         * "ideal" and we should not be in this case. */
+        assert(arena_base < arena_base_next);
+        arena_map_bot_t *n_lo = arena_map_get((block *)arena_base_next, is_used);
+        if (n_lo == NULL) {
+            assert(is_used); /* otherwise should already exist */
+            n_hi->arenas[i3].tail_hi = 0;
+            return 0; /* failed to allocate space for node */
+        }
+        int i3_next = MAP_BOT_INDEX(arena_base_next);
+        n_lo->arenas[i3_next].tail_lo = is_used ? tail : 0;
+    }
+    return 1;
+}
+
+/* Return true if 'p' is a pointer inside an obmalloc arena.
+ * _PyObject_Free() calls this so it needs to be very fast. */
+static int
+arena_map_is_used(block *p)
+{
+    arena_map_bot_t *n = arena_map_get(p, 0);
+    if (n == NULL) {
+        return 0;
+    }
+    int i3 = MAP_BOT_INDEX(p);
+    /* ARENA_BITS must be < 32 so that the tail is a non-negative int32_t. */
+    int32_t hi = n->arenas[i3].tail_hi;
+    int32_t lo = n->arenas[i3].tail_lo;
+    int32_t tail = (int32_t)(AS_UINT(p) & ARENA_SIZE_MASK);
+    return (tail < lo) || (tail >= hi && hi != 0);
+}
+
+/* end of radix tree logic */
+/*==========================================================================*/
+#endif /* WITH_PYMALLOC_RADIX_TREE */
+
 
 /* Allocate a new arena.  If we run out of memory, return NULL.  Else
  * allocate a new arena, and return the address of an arena_object
@@ -1302,6 +1592,15 @@ new_arena(void)
     unused_arena_objects = arenaobj->nextarena;
     assert(arenaobj->address == 0);
     address = _PyObject_Arena.alloc(_PyObject_Arena.ctx, ARENA_SIZE);
+#if WITH_PYMALLOC_RADIX_TREE
+    if (address != NULL) {
+        if (!arena_map_mark_used((uintptr_t)address, 1)) {
+            /* marking arena in radix tree failed, abort */
+            _PyObject_Arena.free(_PyObject_Arena.ctx, address, ARENA_SIZE);
+            address = NULL;
+        }
+    }
+#endif
     if (address == NULL) {
         /* The allocation failed: return NULL after putting the
          * arenaobj back.
@@ -1332,6 +1631,17 @@ new_arena(void)
 }
 
 
+
+#if WITH_PYMALLOC_RADIX_TREE
+/* Return true if and only if P is an address that was allocated by
+   pymalloc.  When the radix tree is used, the 'pool' argument is unused.
+ */
+static bool
+address_in_range(void *p, poolp pool)
+{
+    return arena_map_is_used(p);
+}
+#else
 /*
 address_in_range(P, POOL)
 
@@ -1423,6 +1733,7 @@ address_in_range(void *p, poolp pool)
         arenas[arenaindex].address != 0;
 }
 
+#endif /* !WITH_PYMALLOC_RADIX_TREE */
 
 /*==========================================================================*/
 
@@ -1768,6 +2079,11 @@ insert_to_freepool(poolp pool)
         ao->nextarena = unused_arena_objects;
         unused_arena_objects = ao;
 
+#if WITH_PYMALLOC_RADIX_TREE
+        /* mark arena region as not under control of obmalloc */
+        arena_map_mark_used(ao->address, 0);
+#endif
+
         /* Free the entire arena. */
         _PyObject_Arena.free(_PyObject_Arena.ctx,
                              (void *)ao->address, ARENA_SIZE);
@@ -2711,6 +3027,12 @@ _PyObject_DebugMallocStats(FILE *out)
     (void)printone(out, "# arenas reclaimed", ntimes_arena_allocated - narenas);
     (void)printone(out, "# arenas highwater mark", narenas_highwater);
     (void)printone(out, "# arenas allocated current", narenas);
+#ifdef USE_INTERIOR_NODES
+    (void)printone(out, "# arena map mid nodes", arena_map_mid_count);
+    (void)printone(out, "# arena map bot nodes", arena_map_bot_count);
+    fputc('\n', out);
+#endif
+
 
     PyOS_snprintf(buf, sizeof(buf),
                   "%zu arenas * %d bytes/arena",
@@ -2729,6 +3051,15 @@ _PyObject_DebugMallocStats(FILE *out)
     total += printone(out, "# bytes lost to pool headers", pool_header_bytes);
     total += printone(out, "# bytes lost to quantization", quantization);
     total += printone(out, "# bytes lost to arena alignment", arena_alignment);
+#if WITH_PYMALLOC_RADIX_TREE
+    total += printone(out, "# bytes lost to arena map root", sizeof(arena_map_root));
+#endif
+#ifdef USE_INTERIOR_NODES
+    total += printone(out, "# bytes lost to arena map mid",
+                      sizeof(arena_map_mid_t) * arena_map_mid_count);
+    total += printone(out, "# bytes lost to arena map bot",
+                      sizeof(arena_map_bot_t) * arena_map_bot_count);
+#endif
     (void)printone(out, "Total", total);
     return 1;
 }
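
A standalone sketch of the coverage bookkeeping (an editor's illustration,
not part of the patch; the names IDEAL_SIZE, leaves, mark_used and is_used
are invented for the example): it replays the decimal example from the
comment near arena_map_mark_used()/arena_map_is_used() above, with an
"arena size" of 100 and a single arena occupying [230, 330), so the ideal
blocks at 200 and 300 are both marked.

/* Editor's sketch, not part of the patch: decimal analogue of the
 * tail_hi/tail_lo scheme used by the radix tree leaves. */
#include <stdio.h>

#define IDEAL_SIZE 100                      /* stand-in for ARENA_SIZE */

typedef struct { int tail_hi; int tail_lo; } coverage;

/* one leaf slot per ideal address: index 2 -> 200, index 3 -> 300, ... */
static coverage leaves[10];

static void mark_used(int arena_base)
{
    int tail = arena_base % IDEAL_SIZE;
    if (tail == 0) {
        /* ideally aligned arena: covers exactly one block */
        leaves[arena_base / IDEAL_SIZE].tail_hi = -1;
    }
    else {
        /* arena starts at offset 'tail' in this block ... */
        leaves[arena_base / IDEAL_SIZE].tail_hi = tail;
        /* ... and spills into the next block up to offset 'tail' */
        leaves[arena_base / IDEAL_SIZE + 1].tail_lo = tail;
    }
}

static int is_used(int p)
{
    coverage c = leaves[p / IDEAL_SIZE];
    int tail = p % IDEAL_SIZE;
    return (tail < c.tail_lo) || (tail >= c.tail_hi && c.tail_hi != 0);
}

int main(void)
{
    mark_used(230);                         /* arena covers [230, 330) */
    printf("%d %d %d %d\n",
           is_used(250),    /* 1: offset 50 in block 200, >= tail_hi (30) */
           is_used(310),    /* 1: offset 10 in block 300, <  tail_lo (30) */
           is_used(210),    /* 0: below the arena base */
           is_used(357));   /* 0: past the arena end */
    return 0;
}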


