[pypy-commit] pypy nogil-unsafe-2: (arigo, remi) implement a synchronisation scheme for safepoints (WIP)
Raemi
pypy.commits at gmail.com
Wed Mar 1 12:28:07 EST 2017
Author: Remi Meier <remi.meier at gmail.com>
Branch: nogil-unsafe-2
Changeset: r90444:65be98dc2aee
Date: 2017-03-01 18:27 +0100
http://bitbucket.org/pypy/pypy/changeset/65be98dc2aee/
Log: (arigo, remi) implement a synchronisation scheme for safepoints
(WIP)
diff --git a/rpython/memory/gc/incminimark.py b/rpython/memory/gc/incminimark.py
--- a/rpython/memory/gc/incminimark.py
+++ b/rpython/memory/gc/incminimark.py
@@ -72,7 +72,7 @@
from rpython.rlib.rarithmetic import LONG_BIT_SHIFT
from rpython.rlib.debug import ll_assert, debug_print, debug_start, debug_stop
from rpython.rlib.objectmodel import specialize, we_are_translated
-from rpython.rlib import rthread
+from rpython.rlib import rgil, rthread
from rpython.memory.gc.minimarkpage import out_of_memory
#
@@ -191,7 +191,7 @@
NURSERY_FREE = rthread.ThreadLocalField(llmemory.Address, 'nursery_free')
NURSERY_TOP = rthread.ThreadLocalField(llmemory.Address, 'nursery_top')
-NEXT_NUBLOCK = rthread.ThreadLocalField(llmemory.Address, 'next_nublock')
+
# ____________________________________________________________
@@ -438,11 +438,11 @@
self.old_objects_pointing_to_pinned = self.AddressStack()
self.updated_old_objects_pointing_to_pinned = False
#
- # Allocate lock(s)
- ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
- track_allocation=False)
- rthread.c_thread_lock_init(ll_lock)
- self.ll_lock = ll_lock
+ # # Allocate lock(s)
+ # ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
+ # track_allocation=False)
+ # rthread.c_thread_lock_init(ll_lock)
+ # self.ll_lock = ll_lock
#
# Allocate a nursery. In case of auto_nursery_size, start by
# allocating a very small nursery, enough to do things like look
@@ -650,9 +650,6 @@
get_nursery_top = staticmethod(NURSERY_TOP.getraw)
set_nursery_top = staticmethod(NURSERY_TOP.setraw)
- get_next_nublock = staticmethod(NEXT_NUBLOCK.getraw)
- set_next_nublock = staticmethod(NEXT_NUBLOCK.setraw)
-
@property
def nursery_top(self):
XXX # fix caller
@@ -859,7 +856,8 @@
major collection, and finally reserve totalsize bytes.
"""
- rthread.acquire_NOAUTO(self.ll_lock, 1)
+ # rthread.acquire_NOAUTO(self.ll_lock, 1)
+ rgil.enter_master_section()
minor_collection_count = 0
while True:
@@ -898,6 +896,8 @@
self.set_nursery_free(self.nursery_barriers.popleft())
self.set_nursery_top(self.nursery_barriers.popleft())
else:
+ rgil.master_request_safepoint()
+
minor_collection_count += 1
if minor_collection_count == 1:
self.minor_collection_with_major_progress()
@@ -936,7 +936,8 @@
self.set_nursery_free(self.get_nursery_top() -
self.debug_tiny_nursery)
#
- rthread.release_NOAUTO(self.ll_lock)
+ rgil.leave_master_section()
+ # rthread.release_NOAUTO(self.ll_lock)
return result
collect_and_reserve._dont_inline_ = True
diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -227,9 +227,11 @@
tl_shadowstack = rthread.ThreadLocalField(llmemory.Address,
'shadowstack')
+ tl_synclock = rthread.ThreadLocalField(lltype.Signed, 'synclock')
def thread_setup():
allocate_shadow_stack()
+ tl_synclock.get_or_make_raw()
def thread_run():
# If it's the first time we see this thread, allocate
diff --git a/rpython/rlib/rgil.py b/rpython/rlib/rgil.py
--- a/rpython/rlib/rgil.py
+++ b/rpython/rlib/rgil.py
@@ -22,7 +22,7 @@
_nowrapper=True, sandboxsafe=True,
compilation_info=eci)
-_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Signed,
+_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Void,
_nowrapper=True, sandboxsafe=True,
compilation_info=eci)
@@ -38,6 +38,20 @@
_nowrapper=True, sandboxsafe=True,
compilation_info=eci)
+enter_master_section = llexternal(
+ 'RPyGilEnterMasterSection', [], lltype.Void,
+ _nowrapper=True, sandboxsafe=True,
+ compilation_info=eci)
+
+leave_master_section = llexternal(
+ 'RPyGilLeaveMasterSection', [], lltype.Void,
+ _nowrapper=True, sandboxsafe=True,
+ compilation_info=eci)
+
+master_request_safepoint = llexternal(
+ 'RPyGilMasterRequestSafepoint', [], lltype.Void,
+ _nowrapper=True, sandboxsafe=True,
+ compilation_info=eci)
# ____________________________________________________________
@@ -133,10 +147,11 @@
# explicitly release the gil, in a way that tries to give more
# priority to other threads (as opposed to continuing to run in
# the same thread).
- if _gil_yield_thread():
- from rpython.rlib import rthread
- rthread.gc_thread_run()
- _after_thread_switch()
+ # if _gil_yield_thread():
+ # from rpython.rlib import rthread
+ # rthread.gc_thread_run()
+ # _after_thread_switch()
+ _gil_yield_thread()
yield_thread._gctransformer_hint_close_stack_ = True
yield_thread._dont_reach_me_in_del_ = True
yield_thread._dont_inline_ = True
diff --git a/rpython/translator/c/src/thread.c b/rpython/translator/c/src/thread.c
--- a/rpython/translator/c/src/thread.c
+++ b/rpython/translator/c/src/thread.c
@@ -9,11 +9,9 @@
#include "common_header.h"
#endif
-#ifdef PYPY_USE_ASMGCC
# include "common_header.h"
# include "structdef.h"
# include "forwarddecl.h"
-#endif
#ifdef _WIN32
#include "src/thread_nt.c"
diff --git a/rpython/translator/c/src/thread.h b/rpython/translator/c/src/thread.h
--- a/rpython/translator/c/src/thread.h
+++ b/rpython/translator/c/src/thread.h
@@ -30,8 +30,15 @@
#endif /* !_WIN32 */
RPY_EXTERN void RPyGilAllocate(void);
-RPY_EXTERN long RPyGilYieldThread(void);
-RPY_EXTERN void RPyGilAcquireSlowPath(long);
+RPY_EXTERN void RPyGilYieldThreadSlowPath(void);
+RPY_EXTERN void RPyGilAcquireSlowPath(void);
+RPY_EXTERN void RPyGilReleaseSlowPath(void);
+
+RPY_EXTERN void RPyGilEnterMasterSection(void);
+RPY_EXTERN void RPyGilLeaveMasterSection(void);
+RPY_EXTERN void RPyGilMasterRequestSafepoint(void);
+
+
#define RPyGilAcquire _RPyGilAcquire
#define RPyGilRelease _RPyGilRelease
#define RPyFetchFastGil _RPyFetchFastGil
@@ -43,21 +50,33 @@
#endif
//RPY_EXTERN long rpy_fastgil;
+#include "threadlocal.h"
-static inline void _RPyGilAcquire(void) {
-// long old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-// if (old_fastgil != 0)
-// RPyGilAcquireSlowPath(old_fastgil);
-}
-static inline void _RPyGilRelease(void) {
-// assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
-// pypy_lock_release(&rpy_fastgil);
-}
+#define _RPyGilAcquire() do { \
+ if (!__sync_bool_compare_and_swap( \
+ &RPY_THREADLOCALREF_GET(synclock), 0L, 1L)) \
+ RPyGilAcquireSlowPath(); \
+ } while (0)
+
+#define _RPyGilRelease() do { \
+ assert(RPY_THREADLOCALREF_GET(synclock) != 0L); \
+ if (!__sync_bool_compare_and_swap( \
+ &RPY_THREADLOCALREF_GET(synclock), 1L, 0L)) \
+ RPyGilReleaseSlowPath(); \
+ } while (0)
+
static inline long *_RPyFetchFastGil(void) {
abort();
// return &rpy_fastgil;
}
+#define RPyGilYieldThread() do { \
+ assert(RPY_THREADLOCALREF_GET(synclock) & 1L); \
+ if (RPY_THREADLOCALREF_GET(synclock) == 3L) { \
+ RPyGilYieldThreadSlowPath(); \
+ } \
+ } while (0)
+
typedef unsigned char rpy_spinlock_t;
static inline void rpy_spinlock_acquire(rpy_spinlock_t *p)
{
diff --git a/rpython/translator/c/src/thread_gil.c b/rpython/translator/c/src/thread_gil.c
--- a/rpython/translator/c/src/thread_gil.c
+++ b/rpython/translator/c/src/thread_gil.c
@@ -1,239 +1,142 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "threadlocal.h"
-/* Idea:
+static pthread_mutex_t master_mutex;
+static pthread_mutex_t sync_mutex;
+static pthread_cond_t sync_cond;
- - "The GIL" is a composite concept. There are two locks, and "the
- GIL is locked" when both are locked.
+static long counter_of_threes = 0;
- - The first lock is a simple global variable 'rpy_fastgil'. With
- shadowstack, we use the most portable definition: 0 means unlocked
- and != 0 means locked. With asmgcc, 0 means unlocked but only 1
- means locked. A different value means unlocked too, but the value
- is used by the JIT to contain the stack top for stack root scanning.
-
- - The second lock is a regular mutex. In the fast path, it is never
- unlocked. Remember that "the GIL is unlocked" means that either
- the first or the second lock is unlocked. It should never be the
- case that both are unlocked at the same time.
-
- - Let's call "thread 1" the thread with the GIL. Whenever it does an
- external function call, it sets 'rpy_fastgil' to 0 (unlocked).
- This is the cheapest way to release the GIL. When it returns from
- the function call, this thread attempts to atomically change
- 'rpy_fastgil' to 1. In the common case where it works, thread 1
- has got the GIL back and so continues to run.
-
- - Say "thread 2" is eagerly waiting for thread 1 to become blocked in
- some long-running call. Regularly, it checks if 'rpy_fastgil' is 0
- and tries to atomically change it to 1. If it succeeds, it means
- that the GIL was not previously locked. Thread 2 has now got the GIL.
-
- - If there are more than 2 threads, the rest is really sleeping by
- waiting on the 'mutex_gil_stealer' held by thread 2.
-
- - An additional mechanism is used for when thread 1 wants to
- explicitly yield the GIL to thread 2: it does so by releasing
- 'mutex_gil' (which is otherwise not released) but keeping the
- value of 'rpy_fastgil' to 1.
-*/
-
-
-/* The GIL is initially released; see pypy_main_function(), which calls
- RPyGilAcquire/RPyGilRelease. The point is that when building
- RPython libraries, they can be a collection of regular functions that
- also call RPyGilAcquire/RPyGilRelease; see test_standalone.TestShared.
-*/
-long rpy_fastgil = 0;
-static long rpy_waiting_threads = -42; /* GIL not initialized */
-static volatile int rpy_early_poll_n = 0;
-static mutex1_t mutex_gil_stealer;
-static mutex2_t mutex_gil;
+static long rpy_initialize = -42;
static void rpy_init_mutexes(void)
{
- mutex1_init(&mutex_gil_stealer);
- mutex2_init_locked(&mutex_gil);
- rpy_waiting_threads = 0;
+ int err = pthread_mutex_init(&master_mutex, NULL);
+ if (err)
+ abort();
+
+ err = pthread_mutex_init(&sync_mutex, NULL);
+ if (err)
+ abort();
+
+ err = pthread_cond_init(&sync_cond, NULL);
+ if (err)
+ abort();
+
+ counter_of_threes = 0; // XXX: fork?
+ rpy_initialize = 0;
}
void RPyGilAllocate(void)
{
-// if (rpy_waiting_threads < 0) {
-// assert(rpy_waiting_threads == -42);
-// rpy_init_mutexes();
+ if (rpy_initialize < 0) {
+ assert(rpy_initialize == -42);
+ rpy_init_mutexes();
#ifdef HAVE_PTHREAD_ATFORK
-// pthread_atfork(NULL, NULL, rpy_init_mutexes);
+ pthread_atfork(NULL, NULL, rpy_init_mutexes);
#endif
-// }
+ }
}
-static void check_and_save_old_fastgil(long old_fastgil)
+
+void RPyGilAcquireSlowPath(void)
{
- assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
+ assert(RPY_THREADLOCALREF_GET(synclock) == 2);
-#ifdef PYPY_USE_ASMGCC
- if (old_fastgil != 0) {
- /* this case only occurs from the JIT compiler */
- struct pypy_ASM_FRAMEDATA_HEAD0 *new =
- (struct pypy_ASM_FRAMEDATA_HEAD0 *)old_fastgil;
- struct pypy_ASM_FRAMEDATA_HEAD0 *root = &pypy_g_ASM_FRAMEDATA_HEAD;
- struct pypy_ASM_FRAMEDATA_HEAD0 *next = root->as_next;
- new->as_next = next;
- new->as_prev = root;
- root->as_next = new;
- next->as_prev = new;
- }
-#else
- assert(old_fastgil == 0);
-#endif
+ /* wait until the master leaves the safe point */
+ pthread_mutex_lock(&master_mutex);
+ RPY_THREADLOCALREF_GET(synclock) = 1;
+ pthread_mutex_unlock(&master_mutex);
}
-#define RPY_GIL_POKE_MIN 40
-#define RPY_GIL_POKE_MAX 400
+void RPyGilReleaseSlowPath(void)
+{
+ assert(RPY_THREADLOCALREF_GET(synclock) == 3);
-void RPyGilAcquireSlowPath(long old_fastgil)
+ pthread_mutex_lock(&sync_mutex);
+
+ /* we are one of the THREES that the master is waiting for. Decrease the
+ * counter and signal the master if we are the last. */
+ counter_of_threes--;
+ if (counter_of_threes == 0)
+ pthread_cond_signal(&sync_cond);
+
+ /* set to TWO, so that Acquire above will wait until the master is finished
+ * with its safe point */
+ RPY_THREADLOCALREF_GET(synclock) = 2;
+ pthread_mutex_unlock(&sync_mutex);
+ // continue without GIL
+}
+
+void RPyGilYieldThreadSlowPath(void)
{
- /* Acquires the GIL. This assumes that we already did:
+ RPyGilRelease();
+ RPyGilAcquire();
+}
- old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
- */
- if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
- /* The fastgil was not previously locked: success.
- 'mutex_gil' should still be locked at this point.
- */
- }
- else {
- /* Otherwise, another thread is busy with the GIL. */
- int n;
- long old_waiting_threads;
+void RPyGilEnterMasterSection(void)
+{
+ RPyGilRelease();
+ pthread_mutex_lock(&master_mutex);
+}
- if (rpy_waiting_threads < 0) {
- /* <arigo> I tried to have RPyGilAllocate() called from
- * here, but it fails occasionally on an example
- * (2.7/test/test_threading.py). I think what occurs is
- * that if one thread runs RPyGilAllocate(), it still
- * doesn't have the GIL; then the other thread might fork()
- * at precisely this moment, killing the first thread.
- */
- fprintf(stderr, "Fatal RPython error: a thread is trying to wait "
- "for the GIL, but the GIL was not initialized\n"
- "(For PyPy, see "
- "https://bitbucket.org/pypy/pypy/issues/2274)\n");
+void RPyGilLeaveMasterSection(void)
+{
+ pthread_mutex_unlock(&master_mutex);
+ RPyGilAcquire();
+}
+
+void RPyGilMasterRequestSafepoint(void)
+{
+ pthread_mutex_lock(&sync_mutex);
+ assert(counter_of_threes == 0);
+
+ /* signal all threads to enter safepoints */
+ OP_THREADLOCALREF_ACQUIRE(/* */);
+
+ struct pypy_threadlocal_s *t = NULL;
+ while (1) {
+ OP_THREADLOCALREF_ENUM(t, t);
+ if (t == NULL)
+ break;
+
+ retry:
+ switch (t->synclock) {
+ case 3:
+ assert(!"unexpected synclock=3 found");
abort();
- }
-
- /* Register me as one of the threads that is actively waiting
- for the GIL. The number of such threads is found in
- rpy_waiting_threads. */
- old_waiting_threads = atomic_increment(&rpy_waiting_threads);
-
- /* Early polling: before entering the waiting queue, we check
- a certain number of times if the GIL becomes free. The
- motivation for this is issue #2341. Note that we do this
- polling even if there are already other threads in the
- queue, and one of thesee threads is the stealer. This is
- because the stealer is likely sleeping right now. There
- are use cases where the GIL will really be released very
- soon after RPyGilAcquireSlowPath() is called, so it's worth
- always doing this check.
-
- To avoid falling into bad cases, we "randomize" the number
- of iterations: we loop N times, where N is choosen between
- RPY_GIL_POKE_MIN and RPY_GIL_POKE_MAX.
- */
- n = rpy_early_poll_n * 2 + 1;
- while (n >= RPY_GIL_POKE_MAX)
- n -= (RPY_GIL_POKE_MAX - RPY_GIL_POKE_MIN);
- rpy_early_poll_n = n;
- while (n >= 0) {
- n--;
- if (old_waiting_threads != rpy_waiting_threads) {
- /* If the number changed, it is because another thread
- entered or left this function. In that case, stop
- this loop: if another thread left it means the GIL
- has been acquired by that thread; if another thread
- entered there is no point in running the present
- loop twice. */
+ case 2:
+ /* thread running in C code, already knows we want a safepoint */
+ break;
+ case 0:
+ /* thread running in C code, make sure it checks for and enters
+ * the safepoint before acquiring the "gil" again */
+ if (__sync_bool_compare_and_swap(&t->synclock, 0, 2))
+ break;
+ goto retry;
+ case 1:
+ /* thread running normally, place request to enter safepoint */
+ if (__sync_bool_compare_and_swap(&t->synclock, 1, 3)) {
+ counter_of_threes++;
+ t->nursery_top = NULL;
break;
}
- RPy_YieldProcessor();
- RPy_CompilerMemoryBarrier();
+ goto retry;
+ }
+ }
+ OP_THREADLOCALREF_RELEASE(/* */);
- if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
- old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
- if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
- /* We got the gil before entering the waiting
- queue. In case there are other threads waiting
- for the GIL, wake up the stealer thread now and
- go to the waiting queue anyway, for fairness.
- This will fall through if there are no other
- threads waiting.
- */
- check_and_save_old_fastgil(old_fastgil);
- mutex2_unlock(&mutex_gil);
- break;
- }
- }
- }
+ /* wait until all THREES entered their safepoints */
+ while (counter_of_threes > 0) {
+ pthread_cond_wait(&sync_cond, &sync_mutex);
+ }
- /* Enter the waiting queue from the end. Assuming a roughly
- first-in-first-out order, this will nicely give the threads
- a round-robin chance.
- */
- mutex1_lock(&mutex_gil_stealer);
- mutex2_loop_start(&mutex_gil);
+ pthread_mutex_unlock(&sync_mutex);
- /* We are now the stealer thread. Steals! */
- while (1) {
- /* Busy-looping here. Try to look again if 'rpy_fastgil' is
- released.
- */
- if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
- old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
- if (!RPY_FASTGIL_LOCKED(old_fastgil))
- /* yes, got a non-held value! Now we hold it. */
- break;
- }
- /* Sleep for one interval of time. We may be woken up earlier
- if 'mutex_gil' is released.
- */
- if (mutex2_lock_timeout(&mutex_gil, 0.0001)) { /* 0.1 ms... */
- /* We arrive here if 'mutex_gil' was recently released
- and we just relocked it.
- */
- old_fastgil = 0;
- break;
- }
- /* Loop back. */
- }
- atomic_decrement(&rpy_waiting_threads);
- mutex2_loop_stop(&mutex_gil);
- mutex1_unlock(&mutex_gil_stealer);
- }
- check_and_save_old_fastgil(old_fastgil);
-}
-
-long RPyGilYieldThread(void)
-{
- /* can be called even before RPyGilAllocate(), but in this case,
- 'rpy_waiting_threads' will be -42. */
- assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
- if (rpy_waiting_threads <= 0)
- return 0;
-
- /* Explicitly release the 'mutex_gil'.
- */
- mutex2_unlock(&mutex_gil);
-
- /* Now nobody has got the GIL, because 'mutex_gil' is released (but
- rpy_fastgil is still locked). Call RPyGilAcquire(). It will
- enqueue ourselves at the end of the 'mutex_gil_stealer' queue.
- If there is no other waiting thread, it will fall through both
- its mutex_lock() and mutex_lock_timeout() now. But that's
- unlikely, because we tested above that 'rpy_waiting_threads > 0'.
- */
- RPyGilAcquire();
- return 1;
+ /* caller can continue; all threads in safepoints */
}
/********** for tests only **********/
More information about the pypy-commit
mailing list