[pypy-commit] pypy stmgc-c7: import stmgc/a158a889e78b and remove the spinlock_acquire()

arigo noreply at buildbot.pypy.org
Sun Apr 13 18:41:58 CEST 2014


Author: Armin Rigo <arigo at tunes.org>
Branch: stmgc-c7
Changeset: r70614:06e3816b1e7b
Date: 2014-04-13 18:41 +0200
http://bitbucket.org/pypy/pypy/changeset/06e3816b1e7b/

Log:	import stmgc/a158a889e78b and remove the spinlock_acquire() function
	from stmgcintf.h

diff --git a/rpython/translator/c/src/mem.c b/rpython/translator/c/src/mem.c
--- a/rpython/translator/c/src/mem.c
+++ b/rpython/translator/c/src/mem.c
@@ -48,7 +48,7 @@
 // spinlock_acquire/spinlock_release defined in ../../stm/src_stm/stmgcintf.h
 static Signed pypy_debug_alloc_lock = 0;
 #else
-# define spinlock_acquire(lock, targetvalue)  /* nothing */
+# define spinlock_acquire(lock)               /* nothing */
 # define spinlock_release(lock)               /* nothing */
 #endif
 
@@ -58,7 +58,7 @@
   RPyAssert(p, "out of memory");
   p->addr = addr;
   p->funcname = funcname;
-  spinlock_acquire(pypy_debug_alloc_lock, '+');
+  spinlock_acquire(pypy_debug_alloc_lock);
   p->next = pypy_debug_alloc_list;
   pypy_debug_alloc_list = p;
   spinlock_release(pypy_debug_alloc_lock);
@@ -67,7 +67,7 @@
 int try_pypy_debug_alloc_stop(void *addr)
 {
   struct pypy_debug_alloc_s **p;
-  spinlock_acquire(pypy_debug_alloc_lock, '-');
+  spinlock_acquire(pypy_debug_alloc_lock);
   for (p = &pypy_debug_alloc_list; *p; p = &((*p)->next))
     if ((*p)->addr == addr)
       {
@@ -92,7 +92,7 @@
 {
   long count = 0;
   struct pypy_debug_alloc_s *p;
-  spinlock_acquire(pypy_debug_alloc_lock, 'R');
+  spinlock_acquire(pypy_debug_alloc_lock);
   for (p = pypy_debug_alloc_list; p; p = p->next)
     count++;
   if (count > 0)
diff --git a/rpython/translator/stm/src_stm/revision b/rpython/translator/stm/src_stm/revision
--- a/rpython/translator/stm/src_stm/revision
+++ b/rpython/translator/stm/src_stm/revision
@@ -1,1 +1,1 @@
-0492e398156b
+a158a889e78b
diff --git a/rpython/translator/stm/src_stm/stm/atomic.h b/rpython/translator/stm/src_stm/stm/atomic.h
--- a/rpython/translator/stm/src_stm/stm/atomic.h
+++ b/rpython/translator/stm/src_stm/stm/atomic.h
@@ -37,4 +37,12 @@
 #endif
 
 
+#define spinlock_acquire(lock)                                          \
+    do { if (LIKELY(__sync_lock_test_and_set(&(lock), 1) == 0)) break;  \
+         spin_loop(); } while (1)
+#define spinlock_release(lock)                                          \
+    do { assert((lock) == 1);                                           \
+         __sync_lock_release(&(lock)); } while (0)
+
+
 #endif  /* _STM_ATOMIC_H */
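
Note on the macros added above: they rely on GCC's __sync_lock_test_and_set() / __sync_lock_release() builtins, which give acquire/release semantics, and they take only the lock variable -- the old per-call-site 'targetvalue' tag is gone. A minimal standalone usage sketch follows; the names counter_lock and bump_counter are illustrative and not part of stmgc, and LIKELY() and the real spin_loop() are replaced here by plain equivalents:

#include <assert.h>

#define spin_loop()  __asm__ volatile("pause" ::: "memory")   /* x86 pause hint */

#define spinlock_acquire(lock)                                          \
    do { if (__sync_lock_test_and_set(&(lock), 1) == 0) break;          \
         spin_loop(); } while (1)
#define spinlock_release(lock)                                          \
    do { assert((lock) == 1);                                           \
         __sync_lock_release(&(lock)); } while (0)

static char counter_lock = 0;        /* 0 = free, 1 = held */
static long protected_counter = 0;

void bump_counter(void)
{
    spinlock_acquire(counter_lock);  /* spins until the test-and-set wins */
    protected_counter++;             /* critical section */
    spinlock_release(counter_lock);  /* store-release publishes the write */
}
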
diff --git a/rpython/translator/stm/src_stm/stm/core.c b/rpython/translator/stm/src_stm/stm/core.c
--- a/rpython/translator/stm/src_stm/stm/core.c
+++ b/rpython/translator/stm/src_stm/stm/core.c
@@ -9,6 +9,23 @@
     memset(write_locks, 0, sizeof(write_locks));
 }
 
+#ifdef NDEBUG
+#define EVENTUALLY(condition)    /* nothing */
+#else
+#define EVENTUALLY(condition)                                   \
+    {                                                           \
+        if (!(condition)) {                                     \
+            int _i;                                             \
+            for (_i = 1; _i <= NB_SEGMENTS; _i++)               \
+                spinlock_acquire(lock_pages_privatizing[_i]);   \
+            if (!(condition))                                   \
+                stm_fatalerror("fails: " #condition);           \
+            for (_i = 1; _i <= NB_SEGMENTS; _i++)               \
+                spinlock_release(lock_pages_privatizing[_i]);   \
+        }                                                       \
+    }
+#endif
+
 static void check_flag_write_barrier(object_t *obj)
 {
     /* check that all copies of the object, apart from mine, have the
@@ -22,12 +39,7 @@
         if (i == STM_SEGMENT->segment_num)
             continue;
         o1 = (struct object_s *)REAL_ADDRESS(get_segment_base(i), obj);
-        if (!(o1->stm_flags & GCFLAG_WRITE_BARRIER)) {
-            mutex_pages_lock();  /* try again... */
-            if (!(o1->stm_flags & GCFLAG_WRITE_BARRIER))
-                stm_fatalerror("missing GCFLAG_WRITE_BARRIER");
-            mutex_pages_unlock();
-        }
+        EVENTUALLY(o1->stm_flags & GCFLAG_WRITE_BARRIER);
     }
 #endif
 }
@@ -272,7 +284,6 @@
        with synchronize_object_now() but I don't completely see how to
        improve...
     */
-    assert(_has_mutex_pages());
     assert(!_is_young(obj));
 
     char *segment_base = get_segment_base(source_segment_num);
@@ -327,10 +338,7 @@
     /* Copy around the version of 'obj' that lives in our own segment.
        It is first copied into the shared pages, and then into other
        segments' own private pages.
-
-       This must be called with the mutex_pages_lock!
     */
-    assert(_has_mutex_pages());
     assert(!_is_young(obj));
     assert(obj->stm_flags & GCFLAG_WRITE_BARRIER);
 
@@ -374,9 +382,26 @@
                     memcpy(dst, src, copy_size);
             }
             else {
-                assert(memcmp(dst, src, copy_size) == 0);  /* same page */
+                EVENTUALLY(memcmp(dst, src, copy_size) == 0);  /* same page */
             }
 
+            /* Do a full memory barrier.  We must make sure that other
+               CPUs see the changes we did to the shared page ("S",
+               above) before we check the other segments below with
+               is_private_page().  Otherwise, we risk the following:
+               this CPU writes "S" but the writes are not visible yet;
+               then it checks is_private_page() and gets false, and does
+               nothing more; just afterwards another CPU sets its own
+               private_page bit and copies the page; but it risks doing
+               so before seeing the "S" writes.
+
+               XXX what is the cost of this?  If it's high, then we
+               should reorganize the code so that we buffer the second
+               parts and do them by bunch of N, after just one call to
+               __sync_synchronize()...
+            */
+            __sync_synchronize();
+
             for (i = 1; i <= NB_SEGMENTS; i++) {
                 if (i == myself)
                     continue;
@@ -393,7 +418,7 @@
                         memcpy(dst, src, copy_size);
                 }
                 else {
-                    assert(memcmp(dst, src, copy_size) == 0);  /* same page */
+                    EVENTUALLY(!memcmp(dst, src, copy_size));  /* same page */
                 }
             }
 
@@ -486,12 +511,10 @@
         major_collection_now_at_safe_point();
 
     /* synchronize overflow objects living in privatized pages */
-    mutex_pages_lock();
     push_overflow_objects_from_privatized_pages();
 
     /* synchronize modified old objects to other threads */
     push_modified_to_other_segments();
-    mutex_pages_unlock();
 
     /* update 'overflow_number' if needed */
     if (STM_PSEGMENT->overflow_number_has_been_used) {
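
The EVENTUALLY() macro introduced above handles assertions that can be transiently false while another segment is half-way through page_privatize(): the condition is checked racily first, and only re-checked as fatal once every lock_pages_privatizing[] spinlock is held, i.e. once no privatization can be in flight; in NDEBUG builds it compiles away entirely. The __sync_synchronize() full barrier then orders the writes to the shared copy before the is_private_page() checks, so no segment can privatize a page without seeing them. A generic sketch of the same debug-only pattern, with hypothetical names (N_WRITERS, writer_lock, EVENTUALLY_TRUE) and assuming the spinlock macros from stm/atomic.h:

#include <assert.h>

#define N_WRITERS 4
static char writer_lock[N_WRITERS + 1];   /* one spinlock per potential writer */

/* racy check first; if it fails, exclude all writers and re-check for real */
#define EVENTUALLY_TRUE(condition)                                  \
    do {                                                            \
        if (!(condition)) {                                         \
            int _i;                                                 \
            for (_i = 1; _i <= N_WRITERS; _i++)                     \
                spinlock_acquire(writer_lock[_i]);                  \
            assert(condition);  /* nobody can be mid-update now */  \
            for (_i = 1; _i <= N_WRITERS; _i++)                     \
                spinlock_release(writer_lock[_i]);                  \
        }                                                           \
    } while (0)
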
diff --git a/rpython/translator/stm/src_stm/stm/forksupport.c b/rpython/translator/stm/src_stm/stm/forksupport.c
--- a/rpython/translator/stm/src_stm/stm/forksupport.c
+++ b/rpython/translator/stm/src_stm/stm/forksupport.c
@@ -71,7 +71,6 @@
 
     s_mutex_lock();
     synchronize_all_threads(STOP_OTHERS_UNTIL_MUTEX_UNLOCK);
-    mutex_pages_lock();
 
     /* Make a new mmap at some other address, but of the same size as
        the standard mmap at stm_object_pages
@@ -167,7 +166,6 @@
     fork_big_copy = NULL;
     bool was_in_transaction = fork_was_in_transaction;
 
-    mutex_pages_unlock();
     s_mutex_unlock();
 
     if (!was_in_transaction) {
@@ -204,7 +202,6 @@
 
     /* this new process contains no other thread, so we can
        just release these locks early */
-    mutex_pages_unlock();
     s_mutex_unlock();
 
     /* Move the copy of the mmap over the old one, overwriting it
diff --git a/rpython/translator/stm/src_stm/stm/gcpage.c b/rpython/translator/stm/src_stm/stm/gcpage.c
--- a/rpython/translator/stm/src_stm/stm/gcpage.c
+++ b/rpython/translator/stm/src_stm/stm/gcpage.c
@@ -39,6 +39,7 @@
 
 static void grab_more_free_pages_for_small_allocations(void)
 {
+    abort();//XXX
     /* grab N (= GCPAGE_NUM_PAGES) pages out of the top addresses */
     uintptr_t decrease_by = GCPAGE_NUM_PAGES * 4096;
     if (uninitialized_page_stop - uninitialized_page_start <= decrease_by)
@@ -76,17 +77,22 @@
 }
 
 
+static int lock_growth_large = 0;
+
 static char *allocate_outside_nursery_large(uint64_t size)
 {
-    /* thread-safe: use the lock of pages.c to prevent any remapping
-       from occurring under our feet */
-    mutex_pages_lock();
-
     /* Allocate the object with largemalloc.c from the lower addresses. */
     char *addr = _stm_large_malloc(size);
     if (addr == NULL)
         stm_fatalerror("not enough memory!");
 
+    if (LIKELY(addr + size <= uninitialized_page_start)) {
+        return addr;
+    }
+
+    /* uncommon case: need to initialize some more pages */
+    spinlock_acquire(lock_growth_large);
+
     if (addr + size > uninitialized_page_start) {
         uintptr_t npages;
         npages = (addr + size - uninitialized_page_start) / 4096UL;
@@ -96,11 +102,10 @@
             stm_fatalerror("out of memory!");   /* XXX */
         }
         setup_N_pages(uninitialized_page_start, npages);
+        __sync_synchronize();
         uninitialized_page_start += npages * 4096UL;
     }
-
-    mutex_pages_unlock();
-
+    spinlock_release(lock_growth_large);
     return addr;
 }
 
@@ -256,7 +261,6 @@
        total_allocated by 4096. */
 
     long i;
-    mutex_pages_lock();
 
     for (i = 1; i <= NB_SEGMENTS; i++) {
         /* The 'modified_old_objects' list gives the list of objects
@@ -306,7 +310,6 @@
     for (i = 1; i <= NB_SEGMENTS; i++) {
         major_restore_private_bits_for_modified_objects(i);
     }
-    mutex_pages_unlock();
 }
 
 
@@ -465,9 +468,7 @@
 
 static void sweep_large_objects(void)
 {
-    mutex_pages_lock();
     _stm_largemalloc_sweep();
-    mutex_pages_unlock();
 }
 
 static void clean_write_locks(void)
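
allocate_outside_nursery_large() above no longer needs a global lock on its fast path: _stm_large_malloc() is internally synchronized now (see largemalloc.c below), and only the uncommon "need more initialized pages" case serializes on lock_growth_large, re-checking the condition under the lock and issuing a full barrier before publishing the new uninitialized_page_start. A generic sketch of that check / lock / re-check growth pattern, with hypothetical names (growth_lock, frontier, initialize_pages) and the spinlock macros from stm/atomic.h assumed:

#include <stdint.h>

static char growth_lock = 0;
static char *frontier;               /* end of the already-initialized region */

static void initialize_pages(char *start, uintptr_t npages);   /* assumed helper */

static char *finish_large_alloc(char *addr, uint64_t size)
{
    if (addr + size <= frontier)     /* common case: nothing to do, no lock */
        return addr;

    spinlock_acquire(growth_lock);   /* uncommon case: serialize the growth */
    if (addr + size > frontier) {    /* re-check: another thread may have grown it */
        uintptr_t npages = ((uintptr_t)(addr + size - frontier) + 4095) / 4096;
        initialize_pages(frontier, npages);
        __sync_synchronize();        /* make the page contents visible first... */
        frontier += npages * 4096;   /* ...then publish the new frontier */
    }
    spinlock_release(growth_lock);
    return addr;
}
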
diff --git a/rpython/translator/stm/src_stm/stm/largemalloc.c b/rpython/translator/stm/src_stm/stm/largemalloc.c
--- a/rpython/translator/stm/src_stm/stm/largemalloc.c
+++ b/rpython/translator/stm/src_stm/stm/largemalloc.c
@@ -107,20 +107,35 @@
 
 */
 
-static dlist_t largebins[N_BINS];
-static mchunk_t *first_chunk, *last_chunk;
+
+static struct {
+    int lock;
+    mchunk_t *first_chunk, *last_chunk;
+    dlist_t largebins[N_BINS];
+} lm __attribute__((aligned(64)));
+
+
+static void lm_lock(void)
+{
+    spinlock_acquire(lm.lock);
+}
+
+static void lm_unlock(void)
+{
+    spinlock_release(lm.lock);
+}
 
 
 static void insert_unsorted(mchunk_t *new)
 {
     size_t index = LAST_BIN_INDEX(new->size) ? N_BINS - 1
                                              : largebin_index(new->size);
-    new->d.next = &largebins[index];
-    new->d.prev = largebins[index].prev;
+    new->d.next = &lm.largebins[index];
+    new->d.prev = lm.largebins[index].prev;
     new->d.prev->next = &new->d;
     new->u.up = UU_UNSORTED;
     new->u.down = NULL;
-    largebins[index].prev = &new->d;
+    lm.largebins[index].prev = &new->d;
 }
 
 static int compare_chunks(const void *vchunk1, const void *vchunk2)
@@ -140,8 +155,8 @@
 
 static void really_sort_bin(size_t index)
 {
-    dlist_t *unsorted = largebins[index].prev;
-    dlist_t *end = &largebins[index];
+    dlist_t *unsorted = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
     dlist_t *scan = unsorted->prev;
     size_t count = 1;
     while (scan != end && data2chunk(scan)->u.up == UU_UNSORTED) {
@@ -177,7 +192,7 @@
         chunk1 = chunks[--count];
     }
     size_t search_size = chunk1->size;
-    dlist_t *head = largebins[index].next;
+    dlist_t *head = lm.largebins[index].next;
 
     while (1) {
         if (head == end || data2chunk(head)->size < search_size) {
@@ -219,8 +234,8 @@
 
 static void sort_bin(size_t index)
 {
-    dlist_t *last = largebins[index].prev;
-    if (last != &largebins[index] && data2chunk(last)->u.up == UU_UNSORTED)
+    dlist_t *last = lm.largebins[index].prev;
+    if (last != &lm.largebins[index] && data2chunk(last)->u.up == UU_UNSORTED)
         really_sort_bin(index);
 }
 
@@ -263,13 +278,15 @@
     if (request_size < MIN_ALLOC_SIZE)
         request_size = MIN_ALLOC_SIZE;
 
+    lm_lock();
+
     size_t index = largebin_index(request_size);
     sort_bin(index);
 
     /* scan through the chunks of current bin in reverse order
        to find the smallest that fits. */
-    dlist_t *scan = largebins[index].prev;
-    dlist_t *end = &largebins[index];
+    dlist_t *scan = lm.largebins[index].prev;
+    dlist_t *end = &lm.largebins[index];
     mchunk_t *mscan;
     while (scan != end) {
         mscan = data2chunk(scan);
@@ -287,16 +304,17 @@
        smallest item of the first non-empty bin, as it will be large
        enough. */
     while (++index < N_BINS) {
-        if (largebins[index].prev != &largebins[index]) {
+        if (lm.largebins[index].prev != &lm.largebins[index]) {
             /* non-empty bin. */
             sort_bin(index);
-            scan = largebins[index].prev;
+            scan = lm.largebins[index].prev;
             mscan = data2chunk(scan);
             goto found;
         }
     }
 
     /* not enough memory. */
+    lm_unlock();
     return NULL;
 
  found:
@@ -337,12 +355,13 @@
     mscan->prev_size = BOTH_CHUNKS_USED;
     increment_total_allocated(request_size + LARGE_MALLOC_OVERHEAD);
 
+    lm_unlock();
+
     return (char *)&mscan->d;
 }
 
-void _stm_large_free(char *data)
+static void _large_free(mchunk_t *chunk)
 {
-    mchunk_t *chunk = data2chunk(data);
     assert((chunk->size & (sizeof(char *) - 1)) == 0);
     assert(chunk->prev_size != THIS_CHUNK_FREE);
 
@@ -350,9 +369,12 @@
     increment_total_allocated(-(chunk->size + LARGE_MALLOC_OVERHEAD));
 
 #ifndef NDEBUG
-    assert(chunk->size >= sizeof(dlist_t));
-    assert(chunk->size <= (((char *)last_chunk) - (char *)data));
-    memset(data, 0xDE, chunk->size);
+    {
+        char *data = (char *)&chunk->d;
+        assert(chunk->size >= sizeof(dlist_t));
+        assert(chunk->size <= (((char *)lm.last_chunk) - data));
+        memset(data, 0xDE, chunk->size);
+    }
 #endif
 
     /* try to merge with the following chunk in memory */
@@ -409,10 +431,18 @@
     insert_unsorted(chunk);
 }
 
+void _stm_large_free(char *data)
+{
+    lm_lock();
+    _large_free(data2chunk(data));
+    lm_unlock();
+}
+
 
 void _stm_large_dump(void)
 {
-    char *data = ((char *)first_chunk) + 16;
+    lm_lock();
+    char *data = ((char *)lm.first_chunk) + 16;
     size_t prev_size_if_free = 0;
     fprintf(stderr, "\n");
     while (1) {
@@ -447,12 +477,13 @@
         data += 16;
     }
     fprintf(stderr, "\n  %p: end. ]\n\n", data - 8);
-    assert(data - 16 == (char *)last_chunk);
+    assert(data - 16 == (char *)lm.last_chunk);
+    lm_unlock();
 }
 
 char *_stm_largemalloc_data_start(void)
 {
-    return (char *)first_chunk;
+    return (char *)lm.first_chunk;
 }
 
 #ifdef STM_LARGEMALLOC_TEST
@@ -463,21 +494,23 @@
 {
     int i;
     for (i = 0; i < N_BINS; i++) {
-        largebins[i].prev = &largebins[i];
-        largebins[i].next = &largebins[i];
+        lm.largebins[i].prev = &lm.largebins[i];
+        lm.largebins[i].next = &lm.largebins[i];
     }
 
     assert(data_size >= 2 * sizeof(struct malloc_chunk));
     assert((data_size & 31) == 0);
-    first_chunk = (mchunk_t *)data_start;
-    first_chunk->prev_size = THIS_CHUNK_FREE;
-    first_chunk->size = data_size - 2 * CHUNK_HEADER_SIZE;
-    last_chunk = chunk_at_offset(first_chunk, data_size - CHUNK_HEADER_SIZE);
-    last_chunk->prev_size = first_chunk->size;
-    last_chunk->size = END_MARKER;
-    assert(last_chunk == next_chunk(first_chunk));
+    lm.first_chunk = (mchunk_t *)data_start;
+    lm.first_chunk->prev_size = THIS_CHUNK_FREE;
+    lm.first_chunk->size = data_size - 2 * CHUNK_HEADER_SIZE;
+    lm.last_chunk = chunk_at_offset(lm.first_chunk,
+                                    data_size - CHUNK_HEADER_SIZE);
+    lm.last_chunk->prev_size = lm.first_chunk->size;
+    lm.last_chunk->size = END_MARKER;
+    assert(lm.last_chunk == next_chunk(lm.first_chunk));
+    lm.lock = 0;
 
-    insert_unsorted(first_chunk);
+    insert_unsorted(lm.first_chunk);
 
 #ifdef STM_LARGEMALLOC_TEST
     _stm_largemalloc_keep = NULL;
@@ -486,57 +519,64 @@
 
 int _stm_largemalloc_resize_arena(size_t new_size)
 {
+    int result = 0;
+    lm_lock();
+
     if (new_size < 2 * sizeof(struct malloc_chunk))
-        return 0;
+        goto fail;
     OPT_ASSERT((new_size & 31) == 0);
 
     new_size -= CHUNK_HEADER_SIZE;
-    mchunk_t *new_last_chunk = chunk_at_offset(first_chunk, new_size);
-    mchunk_t *old_last_chunk = last_chunk;
-    size_t old_size = ((char *)old_last_chunk) - (char *)first_chunk;
+    mchunk_t *new_last_chunk = chunk_at_offset(lm.first_chunk, new_size);
+    mchunk_t *old_last_chunk = lm.last_chunk;
+    size_t old_size = ((char *)old_last_chunk) - (char *)lm.first_chunk;
 
     if (new_size < old_size) {
         /* check if there is enough free space at the end to allow
            such a reduction */
-        size_t lsize = last_chunk->prev_size;
+        size_t lsize = lm.last_chunk->prev_size;
         assert(lsize != THIS_CHUNK_FREE);
         if (lsize == BOTH_CHUNKS_USED)
-            return 0;
+            goto fail;
         lsize += CHUNK_HEADER_SIZE;
-        mchunk_t *prev_chunk = chunk_at_offset(last_chunk, -lsize);
+        mchunk_t *prev_chunk = chunk_at_offset(lm.last_chunk, -lsize);
         if (((char *)new_last_chunk) < ((char *)prev_chunk) +
                                        sizeof(struct malloc_chunk))
-            return 0;
+            goto fail;
 
         /* unlink the prev_chunk from the doubly-linked list */
         unlink_chunk(prev_chunk);
 
         /* reduce the prev_chunk */
-        assert(prev_chunk->size == last_chunk->prev_size);
+        assert(prev_chunk->size == lm.last_chunk->prev_size);
         prev_chunk->size = ((char*)new_last_chunk) - (char *)prev_chunk
                            - CHUNK_HEADER_SIZE;
 
         /* make a fresh-new last chunk */
         new_last_chunk->prev_size = prev_chunk->size;
         new_last_chunk->size = END_MARKER;
-        last_chunk = new_last_chunk;
-        assert(last_chunk == next_chunk(prev_chunk));
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(prev_chunk));
 
         insert_unsorted(prev_chunk);
     }
     else if (new_size > old_size) {
         /* make the new last chunk first, with only the extra size */
-        mchunk_t *old_last_chunk = last_chunk;
+        mchunk_t *old_last_chunk = lm.last_chunk;
         old_last_chunk->size = (new_size - old_size) - CHUNK_HEADER_SIZE;
         new_last_chunk->prev_size = BOTH_CHUNKS_USED;
         new_last_chunk->size = END_MARKER;
-        last_chunk = new_last_chunk;
-        assert(last_chunk == next_chunk(old_last_chunk));
+        lm.last_chunk = new_last_chunk;
+        assert(lm.last_chunk == next_chunk(old_last_chunk));
 
         /* then free the last_chunk (turn it from "used" to "free) */
-        _stm_large_free((char *)&old_last_chunk->d);
+        _large_free(old_last_chunk);
     }
-    return 1;
+
+    result = 1;
+ fail:
+    lm_unlock();
+    return result;
 }
 
 
@@ -551,15 +591,17 @@
 
 void _stm_largemalloc_sweep(void)
 {
-    /* This may be slightly optimized by inlining _stm_large_free() and
+    lm_lock();
+
+    /* This may be slightly optimized by inlining _large_free() and
        making cases, e.g. we might know already if the previous block
        was free or not.  It's probably not really worth it. */
-    mchunk_t *mnext, *chunk = first_chunk;
+    mchunk_t *mnext, *chunk = lm.first_chunk;
 
     if (chunk->prev_size == THIS_CHUNK_FREE)
         chunk = next_chunk(chunk);   /* go to the first non-free chunk */
 
-    while (chunk != last_chunk) {
+    while (chunk != lm.last_chunk) {
         /* here, the chunk we're pointing to is not free */
         assert(chunk->prev_size != THIS_CHUNK_FREE);
 
@@ -571,8 +613,10 @@
         /* use the callback to know if 'chunk' contains an object that
            survives or dies */
         if (!_largemalloc_sweep_keep(chunk)) {
-            _stm_large_free((char *)&chunk->d);     /* dies */
+            _large_free(chunk);     /* dies */
         }
         chunk = mnext;
     }
+
+    lm_unlock();
 }
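
The largemalloc changes above gather all of the allocator's mutable state (the bin heads and the chunk bounds) plus its own spinlock into a single 64-byte-aligned 'lm' struct, and every public entry point brackets its work with lm_lock()/lm_unlock(); internal paths such as the sweep loop call the unlocked _large_free() instead of _stm_large_free(), so the lock is never taken recursively. A generic sketch of that "public wrapper locks, internal worker assumes the lock" split, with hypothetical names (alloc_state, free_locked, public_free, public_sweep), again assuming the spinlock macros from stm/atomic.h:

static struct {
    int lock;
    long nfree;                      /* stand-in for the real allocator state */
} alloc_state __attribute__((aligned(64)));  /* keep lock + state on one cache line */

static void free_locked(void *chunk)      /* caller must hold alloc_state.lock */
{
    /* ... unlink 'chunk', merge with free neighbours, etc. ... */
    (void)chunk;
    alloc_state.nfree++;
}

void public_free(void *chunk)             /* public entry point: takes the lock */
{
    spinlock_acquire(alloc_state.lock);
    free_locked(chunk);
    spinlock_release(alloc_state.lock);
}

void public_sweep(void **dead, long n)    /* batch operation: lock once, not n times */
{
    long i;
    spinlock_acquire(alloc_state.lock);
    for (i = 0; i < n; i++)
        free_locked(dead[i]);             /* no recursive locking needed */
    spinlock_release(alloc_state.lock);
}
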
diff --git a/rpython/translator/stm/src_stm/stm/misc.c b/rpython/translator/stm/src_stm/stm/misc.c
--- a/rpython/translator/stm/src_stm/stm/misc.c
+++ b/rpython/translator/stm/src_stm/stm/misc.c
@@ -76,21 +76,6 @@
 
 uint64_t _stm_total_allocated(void)
 {
-    mutex_pages_lock();
-    uint64_t result = increment_total_allocated(0);
-    mutex_pages_unlock();
-    return result;
+    return increment_total_allocated(0);
 }
 #endif
-
-#ifdef STM_LARGEMALLOC_TEST
-void _stm_mutex_pages_lock(void)
-{
-    mutex_pages_lock();
-}
-
-void _stm_mutex_pages_unlock(void)
-{
-    mutex_pages_unlock();
-}
-#endif
diff --git a/rpython/translator/stm/src_stm/stm/nursery.c b/rpython/translator/stm/src_stm/stm/nursery.c
--- a/rpython/translator/stm/src_stm/stm/nursery.c
+++ b/rpython/translator/stm/src_stm/stm/nursery.c
@@ -196,9 +196,7 @@
                content); or add the object to 'large_overflow_objects'.
             */
             if (STM_PSEGMENT->minor_collect_will_commit_now) {
-                mutex_pages_lock();
                 synchronize_object_now(obj);
-                mutex_pages_unlock();
             }
             else
                 LIST_APPEND(STM_PSEGMENT->large_overflow_objects, obj);
@@ -234,20 +232,13 @@
 
     /* free any object left from 'young_outside_nursery' */
     if (!tree_is_cleared(pseg->young_outside_nursery)) {
-        bool locked = false;
         wlog_t *item;
+
         TREE_LOOP_FORWARD(*pseg->young_outside_nursery, item) {
             assert(!_is_in_nursery((object_t *)item->addr));
-            if (!locked) {
-                mutex_pages_lock();
-                locked = true;
-            }
             _stm_large_free(stm_object_pages + item->addr);
         } TREE_LOOP_END;
 
-        if (locked)
-            mutex_pages_unlock();
-
         tree_clear(pseg->young_outside_nursery);
     }
 
diff --git a/rpython/translator/stm/src_stm/stm/pages.c b/rpython/translator/stm/src_stm/stm/pages.c
--- a/rpython/translator/stm/src_stm/stm/pages.c
+++ b/rpython/translator/stm/src_stm/stm/pages.c
@@ -6,16 +6,12 @@
 
 /************************************************************/
 
-static union {
-    struct {
-        uint8_t mutex_pages;
-        volatile bool major_collection_requested;
-        uint64_t total_allocated;  /* keep track of how much memory we're
-                                      using, ignoring nurseries */
-        uint64_t total_allocated_bound;
-    };
-    char reserved[64];
-} pages_ctl __attribute__((aligned(64)));
+struct {
+    volatile bool major_collection_requested;
+    uint64_t total_allocated;  /* keep track of how much memory we're
+                                  using, ignoring nurseries */
+    uint64_t total_allocated_bound;
+} pages_ctl;
 
 
 static void setup_pages(void)
@@ -29,37 +25,15 @@
     memset(pages_privatized, 0, sizeof(pages_privatized));
 }
 
-static void mutex_pages_lock(void)
-{
-    if (__sync_lock_test_and_set(&pages_ctl.mutex_pages, 1) == 0)
-        return;
-
-    int previous = change_timing_state(STM_TIME_SPIN_LOOP);
-    while (__sync_lock_test_and_set(&pages_ctl.mutex_pages, 1) != 0) {
-        spin_loop();
-    }
-    change_timing_state(previous);
-}
-
-static void mutex_pages_unlock(void)
-{
-    __sync_lock_release(&pages_ctl.mutex_pages);
-}
-
-static bool _has_mutex_pages(void)
-{
-    return pages_ctl.mutex_pages != 0;
-}
-
 static uint64_t increment_total_allocated(ssize_t add_or_remove)
 {
-    assert(_has_mutex_pages());
-    pages_ctl.total_allocated += add_or_remove;
+    uint64_t ta = __sync_add_and_fetch(&pages_ctl.total_allocated,
+                                       add_or_remove);
 
-    if (pages_ctl.total_allocated >= pages_ctl.total_allocated_bound)
+    if (ta >= pages_ctl.total_allocated_bound)
         pages_ctl.major_collection_requested = true;
 
-    return pages_ctl.total_allocated;
+    return ta;
 }
 
 static bool is_major_collection_requested(void)
@@ -118,10 +92,12 @@
     /* call remap_file_pages() to make all pages in the range(pagenum,
        pagenum+count) refer to the same physical range of pages from
        segment 0. */
-    uintptr_t i;
-    assert(_has_mutex_pages());
+    dprintf(("pages_initialize_shared: 0x%ld - 0x%ld\n", pagenum,
+             pagenum + count));
+    assert(pagenum < NB_PAGES);
     if (count == 0)
         return;
+    uintptr_t i;
     for (i = 1; i <= NB_SEGMENTS; i++) {
         char *segment_base = get_segment_base(i);
         d_remap_file_pages(segment_base + pagenum * 4096UL,
@@ -131,14 +107,20 @@
 
 static void page_privatize(uintptr_t pagenum)
 {
-    if (is_private_page(STM_SEGMENT->segment_num, pagenum)) {
-        /* the page is already privatized */
+    /* check this thread's 'pages_privatized' bit */
+    uint64_t bitmask = 1UL << (STM_SEGMENT->segment_num - 1);
+    struct page_shared_s *ps = &pages_privatized[pagenum - PAGE_FLAG_START];
+    if (ps->by_segment & bitmask) {
+        /* the page is already privatized; nothing to do */
         return;
     }
 
-    /* lock, to prevent concurrent threads from looking up this thread's
-       'pages_privatized' bits in parallel */
-    mutex_pages_lock();
+#ifndef NDEBUG
+    spinlock_acquire(lock_pages_privatizing[STM_SEGMENT->segment_num]);
+#endif
+
+    /* add this thread's 'pages_privatized' bit */
+    __sync_fetch_and_add(&ps->by_segment, bitmask);
 
     /* "unmaps" the page to make the address space location correspond
        again to its underlying file offset (XXX later we should again
@@ -152,11 +134,9 @@
     /* copy the content from the shared (segment 0) source */
     pagecopy(new_page, stm_object_pages + pagenum * 4096UL);
 
-    /* add this thread's 'pages_privatized' bit */
-    uint64_t bitmask = 1UL << (STM_SEGMENT->segment_num - 1);
-    pages_privatized[pagenum - PAGE_FLAG_START].by_segment |= bitmask;
-
-    mutex_pages_unlock();
+#ifndef NDEBUG
+    spinlock_release(lock_pages_privatizing[STM_SEGMENT->segment_num]);
+#endif
 }
 
 static void _page_do_reshare(long segnum, uintptr_t pagenum)
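
With mutex_pages gone, pages.c keeps its bookkeeping lock-free: increment_total_allocated() is a single __sync_add_and_fetch(), and page_privatize() publishes its pages_privatized bit atomically *before* remapping and copying the page, with the surrounding lock_pages_privatizing spinlock compiled in only for debug builds (it exists purely so that EVENTUALLY() can exclude in-flight privatizations). A small sketch of the lock-free accounting, with hypothetical names (acct, account):

#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>      /* ssize_t */

static struct {
    volatile bool collection_requested;
    uint64_t total;
    uint64_t bound;
} acct;

static uint64_t account(ssize_t add_or_remove)
{
    uint64_t t = __sync_add_and_fetch(&acct.total, add_or_remove);
    if (t >= acct.bound)
        acct.collection_requested = true;  /* several threads may race here; harmless */
    return t;
}
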
diff --git a/rpython/translator/stm/src_stm/stm/pages.h b/rpython/translator/stm/src_stm/stm/pages.h
--- a/rpython/translator/stm/src_stm/stm/pages.h
+++ b/rpython/translator/stm/src_stm/stm/pages.h
@@ -35,6 +35,20 @@
 };
 
 static struct page_shared_s pages_privatized[PAGE_FLAG_END - PAGE_FLAG_START];
+/* Rules for concurrent access to this array, possibly with is_private_page():
+
+   - we clear bits only during major collection, when all threads are
+     synchronized anyway
+
+   - we set only the bit corresponding to our segment number, using
+     an atomic addition; and we do it _before_ we actually make the
+     page private.
+
+   - concurrently, other threads checking the bits might (rarely)
+     get the answer 'true' to is_private_page() even though it is not
+     actually private yet.  This inconsistency is in the direction
+     that we want for synchronize_object_now().
+*/
 
 static void pages_initialize_shared(uintptr_t pagenum, uintptr_t count);
 static void page_privatize(uintptr_t pagenum);
@@ -42,10 +56,6 @@
 static void _page_do_reshare(long segnum, uintptr_t pagenum);
 static void pages_setup_readmarkers_for_nursery(void);
 
-/* Note: don't ever do "mutex_pages_lock(); mutex_lock()" in that order */
-static void mutex_pages_lock(void);
-static void mutex_pages_unlock(void);
-static bool _has_mutex_pages(void) __attribute__((unused));
 static uint64_t increment_total_allocated(ssize_t add_or_remove);
 static bool is_major_collection_requested(void);
 static void force_major_collection_request(void);
@@ -64,4 +74,6 @@
         page_reshare(pagenum);
 }
 
-void _stm_mutex_pages_lock(void);
+#ifndef NDEBUG
+static char lock_pages_privatizing[NB_SEGMENTS + 1] = { 0 };
+#endif
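
The rules spelled out in the comment above boil down to an ordering argument: the writer sets its bit before it copies the page, so a concurrent reader can only err in the harmless direction -- it may treat a page as private slightly too early and push a copy into a segment that is about to overwrite the whole page anyway, but it can never skip a segment that already holds a private copy. A compressed sketch of that writer/reader pair, with hypothetical names (privatized_bits, writer_privatize, reader_sees_private):

#include <stdbool.h>
#include <stdint.h>

static uint64_t privatized_bits;    /* one bit per segment, as in pages_privatized */

static void writer_privatize(int segnum)
{
    uint64_t bit = 1UL << (segnum - 1);
    if (privatized_bits & bit)
        return;                     /* only we ever set our own bit */
    __sync_fetch_and_add(&privatized_bits, bit);   /* step 1: publish the bit */
    /* step 2: remap the page and pagecopy() the shared contents into it */
}

static bool reader_sees_private(int segnum)
{
    /* may already return true between steps 1 and 2: the safe direction */
    return (privatized_bits & (1UL << (segnum - 1))) != 0;
}
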
diff --git a/rpython/translator/stm/src_stm/stm/timing.c b/rpython/translator/stm/src_stm/stm/timing.c
--- a/rpython/translator/stm/src_stm/stm/timing.c
+++ b/rpython/translator/stm/src_stm/stm/timing.c
@@ -56,7 +56,6 @@
     "minor gc",
     "major gc",
     "sync pause",
-    "spin loop",
 };
 
 void stm_flush_timing(stm_thread_local_t *tl, int verbose)
diff --git a/rpython/translator/stm/src_stm/stmgc.h b/rpython/translator/stm/src_stm/stmgc.h
--- a/rpython/translator/stm/src_stm/stmgc.h
+++ b/rpython/translator/stm/src_stm/stmgc.h
@@ -71,7 +71,6 @@
     STM_TIME_MINOR_GC,
     STM_TIME_MAJOR_GC,
     STM_TIME_SYNC_PAUSE,
-    STM_TIME_SPIN_LOOP,
     _STM_TIME_N
 };
 
@@ -134,8 +133,6 @@
 object_t *_stm_enum_modified_old_objects(long index);
 object_t *_stm_enum_objects_pointing_to_nursery(long index);
 uint64_t _stm_total_allocated(void);
-void _stm_mutex_pages_lock(void);
-void _stm_mutex_pages_unlock(void);
 #endif
 
 #define _STM_GCFLAG_WRITE_BARRIER      0x01
diff --git a/rpython/translator/stm/src_stm/stmgcintf.h b/rpython/translator/stm/src_stm/stmgcintf.h
--- a/rpython/translator/stm/src_stm/stmgcintf.h
+++ b/rpython/translator/stm/src_stm/stmgcintf.h
@@ -6,7 +6,7 @@
 
 #include <errno.h>
 #include "stmgc.h"
-#include "stm/atomic.h"    /* for spin_loop() and write_fence() */
+#include "stm/atomic.h"    /* for spin_loop(), write_fence(), spinlock_xxx() */
 
 extern __thread struct stm_thread_local_s stm_thread_local;
 extern __thread long pypy_stm_ready_atomic;
@@ -101,25 +101,4 @@
 }
 
 
-#if 0    /* fprinting versions */
-# define spinlock_acquire(lock, targetvalue)                            \
-    do { if (__sync_bool_compare_and_swap(&(lock), 0, (targetvalue))) { \
-             dprintf(("<<< locked %d\n", (int)targetvalue));            \
-             break;                                                     \
-         }                                                              \
-         do { spin_loop(); } while (lock);                              \
-    } while (1)
-# define spinlock_release(lock)                                         \
-    do { dprintf(("unlocked >>>\n")); write_fence();                    \
-         assert((lock) != 0); (lock) = 0; } while (0)
-#else
-# define spinlock_acquire(lock, targetvalue)                                 \
-    do { if (__sync_bool_compare_and_swap(&(lock), 0, (targetvalue))) break; \
-         do { spin_loop(); } while (lock);                                   \
-    } while (1)
-# define spinlock_release(lock)                                 \
-    do { write_fence(); assert((lock) != 0); (lock) = 0; } while (0)
-#endif
-
-
 #endif  /* _RPY_STMGCINTF_H */
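
For reference, the block deleted above was the old project-local spinlock: a compare-and-swap from 0 to a per-call-site 'targetvalue' byte (handy for printf-debugging which path held the lock), released with an explicit write_fence(). Its replacement in stm/atomic.h drops the tag and uses the __sync_lock builtins instead. The two shapes side by side, as a sketch only (renamed old_/new_ so they do not clash with the real macros):

/* old style (removed): CAS 0 -> targetvalue, e.g. '+', '-' or 'R' */
#define old_spinlock_acquire(lock, targetvalue)                              \
    do { if (__sync_bool_compare_and_swap(&(lock), 0, (targetvalue))) break; \
         do { spin_loop(); } while (lock);                                   \
    } while (1)

/* new style (stm/atomic.h): test-and-set to 1, released with a store-release */
#define new_spinlock_acquire(lock)                                           \
    do { if (__sync_lock_test_and_set(&(lock), 1) == 0) break;               \
         spin_loop(); } while (1)
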

