[pypy-commit] pypy nogil-unsafe-2: Attempt to reduce false sharing between threads. Unclear results
arigo
pypy.commits at gmail.com
Thu Aug 17 05:39:37 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: nogil-unsafe-2
Changeset: r92161:e40f8472eb81
Date: 2017-08-17 11:38 +0200
http://bitbucket.org/pypy/pypy/changeset/e40f8472eb81/
Log: Attempt to reduce false sharing between threads. Unclear results
diff --git a/rpython/memory/gc/incminimark.py b/rpython/memory/gc/incminimark.py
--- a/rpython/memory/gc/incminimark.py
+++ b/rpython/memory/gc/incminimark.py
@@ -280,7 +280,7 @@
# "cache_line_min" is used to round the actual thread-local
# blocks to a cache line, to avoid pointless cache conflicts.
"tl_block_size": 131072,
- "cache_line_min": 256, # why not 64b?
+ "cache_line_min": 128, # two cache lines on x86
}
def __init__(self, config,
@@ -313,6 +313,7 @@
self.max_heap_size_already_raised = False
self.max_delta = float(r_uint(-1))
self.max_number_of_pinned_objects = 0 # computed later
+ self.collecting_roots_in_nursery = False
#
self.card_page_indices = card_page_indices
if self.card_page_indices > 0:
@@ -1983,13 +1984,20 @@
# see them.
use_jit_frame_stoppers = not any_pinned_object_from_earlier
#
+ self.collecting_roots_in_nursery = True
self.root_walker.walk_roots(
callback, # stack roots
callback, # static in prebuilt non-gc
None, # static in prebuilt gc
is_minor=use_jit_frame_stoppers)
+ self.collecting_roots_in_nursery = False
debug_stop("gc-minor-walkroots")
+ def collected_roots_for_one_thread(self):
+ if self.collecting_roots_in_nursery:
+ self.collect_oldrefs_to_nursery()
+ self.ac.force_non_sharing_by_dummy_allocation(self.cache_line_min)
+
def collect_cardrefs_to_nursery(self):
size_gc_header = self.gcheaderbuilder.size_gc_header
oldlist = self.old_objects_with_cards_set
diff --git a/rpython/memory/gc/minimarkpage.py b/rpython/memory/gc/minimarkpage.py
--- a/rpython/memory/gc/minimarkpage.py
+++ b/rpython/memory/gc/minimarkpage.py
@@ -191,6 +191,30 @@
return result
+ def force_non_sharing_by_dummy_allocation(self, alignment):
+ """Force a few bytes of memory to be lost, to ensure that
+ a CPU cache of size "alignment" would not cause false sharing
+ between objects allocated just before and objects allocated
+ just after the call to the present function.
+ """
+ size_class_max = self.small_request_threshold >> WORD_POWER_2
+ size_class = 1
+ while size_class <= size_class_max:
+ page = self.page_for_size[size_class]
+ if page != PAGE_NULL:
+ next_alloc = page.freeblock
+ allocation_start = llmemory.cast_ptr_to_adr(page) + self.hdrsize
+ if next_alloc != allocation_start:
+ next_alloc = rffi.cast(lltype.Signed, next_alloc)
+ rounded_up = (next_alloc + (alignment-1)) & ~(alignment-1)
+ while next_alloc < rounded_up:
+ self.malloc(size_class << WORD_POWER_2)
+ if self.page_for_size[size_class] != page:
+ break
+ next_alloc = rffi.cast(lltype.Signed, page.freeblock)
+ size_class += 1
+
+
def allocate_new_page(self, size_class):
"""Allocate and return a new page for the given size_class."""
#
diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -113,6 +113,7 @@
debug_print("walk_stack", base, top)
walk_stack_root(self.invoke_collect_stack_root, collect_stack_root,
None, base, top, is_minor=False)
+ self.gcdata.gc.collected_roots_for_one_thread()
self._walk_thread_stack = walk_thread_stack
diff --git a/rpython/translator/c/src/threadlocal.c b/rpython/translator/c/src/threadlocal.c
--- a/rpython/translator/c/src/threadlocal.c
+++ b/rpython/translator/c/src/threadlocal.c
@@ -11,32 +11,36 @@
#include "src/thread.h"
-/* this is a spin-lock that must be acquired around each doubly-linked-list
+/* this is a reentrant lock that must be acquired around each doubly-linked-list
manipulation (because such manipulations can occur without the GIL) */
-static long pypy_threadlocal_lock = 0;
+static pthread_mutex_t _rpy_threadlocal_lock;
static int check_valid(void);
-int _RPython_ThreadLocals_AcquireTimeout(int max_wait_iterations) {
- while (1) {
- long old_value = pypy_lock_test_and_set(&pypy_threadlocal_lock, 1);
- if (old_value == 0)
- break;
- /* busy loop */
- if (max_wait_iterations == 0)
- return -1;
- if (max_wait_iterations > 0)
- --max_wait_iterations;
+static void do_check(int result)
+{
+ if (result != 0) {
+ fprintf(stderr, "threadlocal.c got an unexpected mutex error\n");
+ exit(1);
}
+}
+
+static void init_lock(void)
+{
+ pthread_mutexattr_t attr;
+ do_check(pthread_mutexattr_init(&attr)
+ || pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)
+ || pthread_mutex_init(&_rpy_threadlocal_lock, &attr)
+ || pthread_mutexattr_destroy(&attr));
+}
+
+void _RPython_ThreadLocals_Acquire(void) {
+ do_check(pthread_mutex_lock(&_rpy_threadlocal_lock));
assert(check_valid());
- return 0;
-}
-void _RPython_ThreadLocals_Acquire(void) {
- _RPython_ThreadLocals_AcquireTimeout(-1);
}
void _RPython_ThreadLocals_Release(void) {
assert(check_valid());
- pypy_lock_release(&pypy_threadlocal_lock);
+ do_check(pthread_mutex_unlock(&_rpy_threadlocal_lock));
}
@@ -73,6 +77,7 @@
{
/* assume that at most one pypy_threadlocal_s survived, the current one */
struct pypy_threadlocal_s *cur;
+ init_lock();
cur = (struct pypy_threadlocal_s *)_RPy_ThreadLocals_Get();
if (cur && cur->ready == 42) {
cur->next = cur->prev = &linkedlist_head;
@@ -81,7 +86,6 @@
else {
linkedlist_head.next = linkedlist_head.prev = &linkedlist_head;
}
- _RPython_ThreadLocals_Release();
}
@@ -188,7 +192,7 @@
a non-null thread-local value). This is needed even in the
case where we use '__thread' below, for the destructor.
*/
- assert(pypy_threadlocal_lock == 0);
+ init_lock();
#ifdef _WIN32
pypy_threadlocal_key = TlsAlloc();
if (pypy_threadlocal_key == TLS_OUT_OF_INDEXES)
diff --git a/rpython/translator/c/src/threadlocal.h b/rpython/translator/c/src/threadlocal.h
--- a/rpython/translator/c/src/threadlocal.h
+++ b/rpython/translator/c/src/threadlocal.h
@@ -21,7 +21,6 @@
RPY_EXTERN void _RPython_ThreadLocals_Acquire(void);
RPY_EXTERN void _RPython_ThreadLocals_Release(void);
-RPY_EXTERN int _RPython_ThreadLocals_AcquireTimeout(int max_wait_iterations);
/* Must acquire/release the thread-local lock around a series of calls
to the following function */
More information about the pypy-commit
mailing list