[pypy-commit] stmgc c7: re-introduce the alloc-pages for small-sized allocations (3x faster in simple duhton bench).

Mon Jan 27 13:54:59 CET 2014

Author: Remi Meier
Branch: c7
Changeset: r677:8e7c804b4bdc
Date: 2014-01-27 13:55 +0100
http://bitbucket.org/pypy/stmgc/changeset/8e7c804b4bdc/

Log:	re-introduce the alloc-pages for small-sized allocations (3x faster
	in simple duhton bench).

diff --git a/c7/core.c b/c7/core.c
--- a/c7/core.c
+++ b/c7/core.c
@@ -100,10 +100,15 @@
 
     /* privatize if SHARED_PAGE */
     uintptr_t pagenum2, pages;
-    _stm_chunk_pages((struct object_s*)REAL_ADDRESS(get_thread_base(0), obj),
-                     &pagenum2, &pages);
-    assert(pagenum == pagenum2);
-    assert(pages == (stmcb_size(real_address(obj)) +4095) / 4096);
+    if (obj->stm_flags & GCFLAG_SMALL) {
+        pagenum2 = pagenum;
+        pages = 1;
+    } else {
+        _stm_chunk_pages((struct object_s*)REAL_ADDRESS(get_thread_base(0), obj),
+                         &pagenum2, &pages);
+        assert(pagenum == pagenum2);
+        assert(pages == (stmcb_size(real_address(obj)) +4095) / 4096);
+    }
     for (pagenum2 += pages - 1; pagenum2 >= pagenum; pagenum2--)
         stm_pages_privatize(pagenum2);
 
diff --git a/c7/core.h b/c7/core.h
--- a/c7/core.h
+++ b/c7/core.h
@@ -37,6 +37,9 @@
     /* only used during collections to mark an obj as moved out of the
        generation it was in */
     GCFLAG_MOVED = (1 << 2),
+    /* objects smaller than one page and even smaller than
+       LARGE_OBJECT_WORDS * 8 bytes */
+    GCFLAG_SMALL = (1 << 3),
 };
 
 
@@ -47,6 +50,7 @@
 
 typedef TLPREFIX struct _thread_local1_s _thread_local1_t;
 typedef TLPREFIX struct object_s object_t;
+typedef TLPREFIX struct alloc_for_size_s alloc_for_size_t;
 typedef TLPREFIX struct read_marker_s read_marker_t;
 typedef TLPREFIX char localchar_t;
 typedef void* jmpbufptr_t[5];  /* for use with __builtin_setjmp() */
@@ -79,6 +83,11 @@
     uint8_t rm;
 };
 
+struct alloc_for_size_s {       /* allocation state for one small-object size class */
+    localchar_t *next;          /* where the next allocation of this class goes */
+    uint16_t start, stop;       /* only the lower 16 bits of the start/stop addresses */
+    bool flag_partial_page;     /* false right after a fresh page is reserved for this class */
+};
 
 struct _thread_local1_s {
     jmpbufptr_t *jmpbufptr;
@@ -94,6 +103,7 @@
     object_t **shadow_stack;
     object_t **shadow_stack_base;
 
+    struct alloc_for_size_s alloc[LARGE_OBJECT_WORDS];
     struct stm_list_s *uncommitted_objects;
 
     localchar_t *nursery_current;
diff --git a/c7/largemalloc.c b/c7/largemalloc.c
--- a/c7/largemalloc.c
+++ b/c7/largemalloc.c
@@ -105,6 +105,9 @@
 
 size_t _stm_data_size(struct object_s *data)
 {   /* Return the size in bytes of the object 'data'. */
+    if (data->stm_flags & GCFLAG_SMALL)   /* small objects are not allocated by large-malloc */
+        return stmcb_size(data); /* XXX: inefficient */
+    /* not GCFLAG_SMALL: take the size recorded in the chunk, masking out FLAG_SORTED */
     mchunk_t *chunk = data2chunk((char*)data);
     return chunk->size & ~FLAG_SORTED;
 }
@@ -120,7 +123,13 @@
     char *end = src + _stm_data_size((struct object_s*)REAL_ADDRESS(get_thread_base(0), obj));
     uintptr_t pagenum, num;
     struct object_s *t0_obj = (struct object_s*)REAL_ADDRESS(get_thread_base(0), _stm_tl_address(src));
-    _stm_chunk_pages(t0_obj, &pagenum, &num);
+
+    if (obj->stm_flags & GCFLAG_SMALL) {
+        pagenum = (uintptr_t)obj / 4096UL;
+        num = 1;
+    } else { 
+        _stm_chunk_pages(t0_obj, &pagenum, &num);
+    }
 
     while (src < end) {
         size_t to_copy = 4096UL - ((uintptr_t)src & 4095UL);
@@ -299,6 +308,8 @@
 
 void stm_large_free(object_t *tldata)
 {
+    assert(!(tldata->stm_flags & GCFLAG_SMALL));
+    
     while (__sync_lock_test_and_set(&alloc_lock, 1))
         spin_loop();
     
diff --git a/c7/nursery.c b/c7/nursery.c
--- a/c7/nursery.c
+++ b/c7/nursery.c
@@ -44,6 +44,58 @@
     return _stm_allocate_old(size);  /* XXX */
 }
 
+localchar_t *_stm_alloc_next_page(size_t size_class)
+{
+    /* Reserve one fresh page for objects of 'size_class * 8' bytes and
+       return its first slot.  May return uninitialized pages. */
+    /* 'alloc->next' points to where the next allocation should go.  The
+       present function is called instead when this next allocation is
+       equal to 'alloc->stop'.  As we know that 'start', 'next' and
+       'stop' are always nearby pointers, we play tricks and only store
+       the lower 16 bits of 'start' and 'stop', so that the three
+       variables plus some flags fit in 16 bytes.
+    */
+    uintptr_t page;                 /* page number returned by stm_pages_reserve() */
+    localchar_t *result;            /* address of the first slot in the new page */
+    alloc_for_size_t *alloc = &_STM_TL->alloc[size_class];
+    size_t size = size_class * 8;   /* slot size in bytes */
+
+    /* reserve a fresh new page (XXX: from the end!) */
+    page = stm_pages_reserve(1);
+
+    result = (localchar_t *)(page * 4096UL);   /* page number -> address */
+    alloc->start = (uintptr_t)result;          /* truncated to the lower 16 bits */
+    alloc->stop = alloc->start + (4096 / size) * size;   /* end of the last whole slot */
+    alloc->next = result + size;               /* the first slot is handed to the caller */
+    alloc->flag_partial_page = false;
+    return result;
+}
+
+object_t *stm_big_small_alloc_old(size_t size, bool *is_small)
+{   /* Allocate 'size' bytes; '*is_small' tells the caller which allocator was used. */
+    /* may return uninitialized objects */
+    object_t *result;
+    size_t size_class = size / 8;   /* NOTE(review): assumes 'size' is a multiple of 8 -- TODO confirm */
+    assert(size_class >= 2);        /* i.e. size >= 16 bytes */
+    /* size classes from LARGE_OBJECT_WORDS upwards go through stm_large_malloc() */
+    if (size_class >= LARGE_OBJECT_WORDS) {
+        result = stm_large_malloc(size);
+        *is_small = 0;
+    } else {
+        *is_small = 1;
+        alloc_for_size_t *alloc = &_STM_TL->alloc[size_class];
+        /* 'stop' holds only the lower 16 bits, so truncate 'next' before comparing */
+        if ((uint16_t)((uintptr_t)alloc->next) == alloc->stop) {
+            result = (object_t *)_stm_alloc_next_page(size_class);   /* 'next' reached 'stop': new page */
+        } else {
+            result = (object_t *)alloc->next;   /* hand out the slot at 'next' ... */
+            alloc->next += size;                /* ... and advance past it */
+        }
+    }
+    return result;
+}
+
+
 
 void trace_if_young(object_t **pobj)
 {
@@ -62,7 +114,8 @@
 
     /* move obj to somewhere else */
     size_t size = stmcb_size(real_address(*pobj));
-    object_t *moved = stm_large_malloc(size);
+    bool is_small;
+    object_t *moved = stm_big_small_alloc_old(size, &is_small);
 
     memcpy((void*)real_address(moved),
            (void*)real_address(*pobj),
@@ -70,6 +123,8 @@
 
     /* object is not committed yet */
     moved->stm_flags |= GCFLAG_NOT_COMMITTED;
+    if (is_small)              /* means, not allocated by large-malloc */
+        moved->stm_flags |= GCFLAG_SMALL;
     LIST_APPEND(_STM_TL->uncommitted_objects, moved);
     
     (*pobj)->stm_flags |= GCFLAG_MOVED;
@@ -189,6 +244,21 @@
     /* uncommitted objects */
     push_uncommitted_to_other_threads();
     stm_list_clear(_STM_TL->uncommitted_objects);
+
+    /* for small alloc classes, set the partial flag */
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL->alloc[j];
+        uint16_t start = alloc->start;
+        uint16_t cur = (uintptr_t)alloc->next;
+        
+        if (start == cur)
+            continue;           /* page full -> will be replaced automatically */
+        
+        alloc->start = cur;     /* next transaction has different 'start' to
+                                   reset in case of an abort */
+        alloc->flag_partial_page = 1;
+    }
 }
 
 void nursery_on_abort()
@@ -205,13 +275,26 @@
     _STM_TL->nursery_current = nursery_base;
 
 
+    /* reset the alloc-pages to the state at the start of the transaction */
+    long j;
+    for (j = 2; j < LARGE_OBJECT_WORDS; j++) {
+        alloc_for_size_t *alloc = &_STM_TL->alloc[j];
+        uint16_t num_allocated = ((uintptr_t)alloc->next) - alloc->start;
+        
+        if (num_allocated) {
+            /* forget about all non-committed objects */
+            alloc->next -= num_allocated;
+        }
+    }
+    
     /* free uncommitted objects */
     struct stm_list_s *uncommitted = _STM_TL->uncommitted_objects;
     
     STM_LIST_FOREACH(
         uncommitted,
         ({
-            stm_large_free(item);
+            if (!(item->stm_flags & GCFLAG_SMALL))
+                stm_large_free(item);
         }));
     
     stm_list_clear(uncommitted);