[pypy-commit] pypy nogil-unsafe-2: Attempt to reduce false sharing between threads. Unclear results

arigo pypy.commits at gmail.com
Thu Aug 17 05:39:37 EDT 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: nogil-unsafe-2
Changeset: r92161:e40f8472eb81
Date: 2017-08-17 11:38 +0200
http://bitbucket.org/pypy/pypy/changeset/e40f8472eb81/

Log:	Attempt to reduce false sharing between threads. Unclear results
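
False sharing is the effect this commit targets: two threads repeatedly write to distinct objects that happen to occupy the same CPU cache line, so each write invalidates the line in the other core's cache, and both cores stall on coherence traffic even though no data is logically shared. (The new 128-byte cache_line_min covers two 64-byte x86 lines, presumably to also defeat adjacent-line prefetching.) The standalone C sketch below is not part of the changeset; it just demonstrates the effect by timing two threads bumping adjacent counters versus counters padded a cache line apart. The 64-byte line size is an assumption about typical x86 hardware.

    /* false_sharing_demo.c -- standalone demo, not part of the changeset.
       Build with: cc -O2 -pthread false_sharing_demo.c */
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    #define NITER 100000000L
    #define CACHE_LINE 64               /* assumed x86 line size */

    /* volatile keeps every increment as a real load/store in memory */
    static struct { volatile long a; volatile long b; } adjacent;
    static struct { volatile long a; char pad[CACHE_LINE];
                    volatile long b; } padded;

    static void *adj_a(void *x) { (void)x; for (long i = 0; i < NITER; i++) adjacent.a++; return NULL; }
    static void *adj_b(void *x) { (void)x; for (long i = 0; i < NITER; i++) adjacent.b++; return NULL; }
    static void *pad_a(void *x) { (void)x; for (long i = 0; i < NITER; i++) padded.a++; return NULL; }
    static void *pad_b(void *x) { (void)x; for (long i = 0; i < NITER; i++) padded.b++; return NULL; }

    static double run_pair(void *(*fa)(void *), void *(*fb)(void *))
    {
        pthread_t ta, tb;
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        pthread_create(&ta, NULL, fa, NULL);
        pthread_create(&tb, NULL, fb, NULL);
        pthread_join(ta, NULL);
        pthread_join(tb, NULL);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    }

    int main(void)
    {
        /* the adjacent pair is typically several times slower */
        printf("adjacent counters: %.2fs\n", run_pair(adj_a, adj_b));
        printf("padded counters:   %.2fs\n", run_pair(pad_a, pad_b));
        return 0;
    }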

diff --git a/rpython/memory/gc/incminimark.py b/rpython/memory/gc/incminimark.py
--- a/rpython/memory/gc/incminimark.py
+++ b/rpython/memory/gc/incminimark.py
@@ -280,7 +280,7 @@
         # "cache_line_min" is used to round the actual thread-local
         # blocks to a cache line, to avoid pointless cache conflicts.
         "tl_block_size": 131072,
-        "cache_line_min": 256,  # why not 64b?
+        "cache_line_min": 128,  # two cache lines on x86
         }
 
     def __init__(self, config,
@@ -313,6 +313,7 @@
         self.max_heap_size_already_raised = False
         self.max_delta = float(r_uint(-1))
         self.max_number_of_pinned_objects = 0      # computed later
+        self.collecting_roots_in_nursery = False
         #
         self.card_page_indices = card_page_indices
         if self.card_page_indices > 0:
@@ -1983,13 +1984,20 @@
         # see them.
         use_jit_frame_stoppers = not any_pinned_object_from_earlier
         #
+        self.collecting_roots_in_nursery = True
         self.root_walker.walk_roots(
             callback,     # stack roots
             callback,     # static in prebuilt non-gc
             None,         # static in prebuilt gc
             is_minor=use_jit_frame_stoppers)
+        self.collecting_roots_in_nursery = False
         debug_stop("gc-minor-walkroots")
 
+    def collected_roots_for_one_thread(self):
+        if self.collecting_roots_in_nursery:
+            self.collect_oldrefs_to_nursery()
+            self.ac.force_non_sharing_by_dummy_allocation(self.cache_line_min)
+
     def collect_cardrefs_to_nursery(self):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         oldlist = self.old_objects_with_cards_set
diff --git a/rpython/memory/gc/minimarkpage.py b/rpython/memory/gc/minimarkpage.py
--- a/rpython/memory/gc/minimarkpage.py
+++ b/rpython/memory/gc/minimarkpage.py
@@ -191,6 +191,30 @@
         return result
 
 
+    def force_non_sharing_by_dummy_allocation(self, alignment):
+        """Force a few bytes of memory to be lost, to ensure that
+        a CPU cache of size "alignment" would not cause false sharing
+        between objects allocated just before and objects allocated
+        just after the call to the present function.
+        """
+        size_class_max = self.small_request_threshold >> WORD_POWER_2
+        size_class = 1
+        while size_class <= size_class_max:
+            page = self.page_for_size[size_class]
+            if page != PAGE_NULL:
+                next_alloc = page.freeblock
+                allocation_start = llmemory.cast_ptr_to_adr(page) + self.hdrsize
+                if next_alloc != allocation_start:
+                    next_alloc = rffi.cast(lltype.Signed, next_alloc)
+                    rounded_up = (next_alloc + (alignment-1)) & ~(alignment-1)
+                    while next_alloc < rounded_up:
+                        self.malloc(size_class << WORD_POWER_2)
+                        if self.page_for_size[size_class] != page:
+                            break
+                        next_alloc = rffi.cast(lltype.Signed, page.freeblock)
+            size_class += 1
+
+
     def allocate_new_page(self, size_class):
         """Allocate and return a new page for the given size_class."""
         #
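
This padding hook is driven from shadowstack.py below: after each thread's stack roots have been traced and flushed to the nursery, the allocator is bumped forward to the next cache_line_min boundary, so survivors of different threads land on different cache lines. The key arithmetic is the classic power-of-two round-up (next_alloc + (alignment-1)) & ~(alignment-1). Below is a simplified, self-contained C sketch of the same idea for a single bump-pointer arena; the real code works per size class across minimarkpage pages, and every name in the sketch is hypothetical.

    /* bump_pad_sketch.c -- illustrative only; these names are
       hypothetical and do not exist in PyPy. */
    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    static _Alignas(128) char arena[4096];   /* toy bump-pointer arena */
    static uintptr_t next_alloc;

    static void *bump_malloc(size_t size)
    {
        void *p = (void *)next_alloc;
        next_alloc += size;
        assert(next_alloc <= (uintptr_t)arena + sizeof(arena));
        return p;
    }

    /* Waste bytes until the next allocation starts on an `alignment`
       boundary -- the same round-up that
       force_non_sharing_by_dummy_allocation performs per size class. */
    static void pad_to_alignment(uintptr_t alignment)  /* power of two */
    {
        uintptr_t rounded_up =
            (next_alloc + (alignment - 1)) & ~(alignment - 1);
        if (rounded_up > next_alloc)
            bump_malloc(rounded_up - next_alloc);      /* dummy block */
    }

    int main(void)
    {
        next_alloc = (uintptr_t)arena;
        bump_malloc(24);              /* thread A's promoted object */
        pad_to_alignment(128);        /* the new cache_line_min value */
        void *q = bump_malloc(24);    /* thread B's object: fresh line */
        printf("second object 128-aligned: %d\n",
               (int)(((uintptr_t)q & 127) == 0));
        return 0;
    }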
diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -113,6 +113,7 @@
             debug_print("walk_stack", base, top)
             walk_stack_root(self.invoke_collect_stack_root, collect_stack_root,
                 None, base, top, is_minor=False)
+            self.gcdata.gc.collected_roots_for_one_thread()
 
         self._walk_thread_stack = walk_thread_stack
 
diff --git a/rpython/translator/c/src/threadlocal.c b/rpython/translator/c/src/threadlocal.c
--- a/rpython/translator/c/src/threadlocal.c
+++ b/rpython/translator/c/src/threadlocal.c
@@ -11,32 +11,36 @@
 #include "src/thread.h"
 
 
-/* this is a spin-lock that must be acquired around each doubly-linked-list
+/* this is a reentrant lock that must be acquired around each doubly-linked-list
    manipulation (because such manipulations can occur without the GIL) */
-static long pypy_threadlocal_lock = 0;
+static pthread_mutex_t _rpy_threadlocal_lock;
 
 static int check_valid(void);
 
-int _RPython_ThreadLocals_AcquireTimeout(int max_wait_iterations) {
-    while (1) {
-        long old_value = pypy_lock_test_and_set(&pypy_threadlocal_lock, 1);
-        if (old_value == 0)
-            break;
-        /* busy loop */
-        if (max_wait_iterations == 0)
-            return -1;
-        if (max_wait_iterations > 0)
-            --max_wait_iterations;
+static void do_check(int result)
+{
+    if (result != 0) {
+        fprintf(stderr, "threadlocal.c got an unexpected mutex error\n");
+        exit(1);
     }
+}
+
+static void init_lock(void)
+{
+    pthread_mutexattr_t attr;
+    do_check(pthread_mutexattr_init(&attr)
+          || pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)
+          || pthread_mutex_init(&_rpy_threadlocal_lock, &attr)
+          || pthread_mutexattr_destroy(&attr));
+}
+
+void _RPython_ThreadLocals_Acquire(void) {
+    do_check(pthread_mutex_lock(&_rpy_threadlocal_lock));
     assert(check_valid());
-    return 0;
-}
-void _RPython_ThreadLocals_Acquire(void) {
-    _RPython_ThreadLocals_AcquireTimeout(-1);
 }
 void _RPython_ThreadLocals_Release(void) {
     assert(check_valid());
-    pypy_lock_release(&pypy_threadlocal_lock);
+    do_check(pthread_mutex_unlock(&_rpy_threadlocal_lock));
 }
 
 
@@ -73,6 +77,7 @@
 {
     /* assume that at most one pypy_threadlocal_s survived, the current one */
     struct pypy_threadlocal_s *cur;
+    init_lock();
     cur = (struct pypy_threadlocal_s *)_RPy_ThreadLocals_Get();
     if (cur && cur->ready == 42) {
         cur->next = cur->prev = &linkedlist_head;
@@ -81,7 +86,6 @@
     else {
         linkedlist_head.next = linkedlist_head.prev = &linkedlist_head;
     }
-    _RPython_ThreadLocals_Release();
 }
 
 
@@ -188,7 +192,7 @@
        a non-null thread-local value).  This is needed even in the
        case where we use '__thread' below, for the destructor.
     */
-    assert(pypy_threadlocal_lock == 0);
+    init_lock();
 #ifdef _WIN32
     pypy_threadlocal_key = TlsAlloc();
     if (pypy_threadlocal_key == TLS_OUT_OF_INDEXES)
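
Two details of the C side are worth spelling out: the old test-and-set spin-lock becomes a recursive (PTHREAD_MUTEX_RECURSIVE) pthread mutex, so the same thread may nest Acquire/Release pairs, and init_lock() is run both at program start and again in the child after fork(), the usual pragmatic trick that leaves the child with a fresh, unlocked mutex even if the parent forked while holding it. A minimal standalone sketch of the same pattern (not PyPy code; the error handling mirrors do_check above):

    /* recursive_mutex_sketch.c -- standalone illustration of the
       pattern adopted by this changeset. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock;

    static void do_check(int result)
    {
        if (result != 0) {
            fprintf(stderr, "unexpected mutex error\n");
            exit(1);
        }
    }

    static void init_lock(void)
    {
        pthread_mutexattr_t attr;
        /* '||' short-circuits on the first nonzero (error) result */
        do_check(pthread_mutexattr_init(&attr)
              || pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)
              || pthread_mutex_init(&lock, &attr)
              || pthread_mutexattr_destroy(&attr));
    }

    int main(void)
    {
        init_lock();
        do_check(pthread_mutex_lock(&lock));
        do_check(pthread_mutex_lock(&lock));    /* re-entry: OK, recursive */
        do_check(pthread_mutex_unlock(&lock));
        do_check(pthread_mutex_unlock(&lock));  /* now actually released */
        puts("nested lock/unlock worked");
        return 0;
    }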
diff --git a/rpython/translator/c/src/threadlocal.h b/rpython/translator/c/src/threadlocal.h
--- a/rpython/translator/c/src/threadlocal.h
+++ b/rpython/translator/c/src/threadlocal.h
@@ -21,7 +21,6 @@
 
 RPY_EXTERN void _RPython_ThreadLocals_Acquire(void);
 RPY_EXTERN void _RPython_ThreadLocals_Release(void);
-RPY_EXTERN int _RPython_ThreadLocals_AcquireTimeout(int max_wait_iterations);
 
 /* Must acquire/release the thread-local lock around a series of calls
    to the following function */

