[pypy-commit] pypy nogil-unsafe-2: (arigo, remi) implement a synchronisation scheme for safepoints (WIP)

Raemi pypy.commits at gmail.com
Wed Mar 1 12:28:07 EST 2017


Author: Remi Meier <remi.meier at gmail.com>
Branch: nogil-unsafe-2
Changeset: r90444:65be98dc2aee
Date: 2017-03-01 18:27 +0100
http://bitbucket.org/pypy/pypy/changeset/65be98dc2aee/

Log:	(arigo, remi) implement a synchronisation scheme for safepoints
	(WIP)

diff --git a/rpython/memory/gc/incminimark.py b/rpython/memory/gc/incminimark.py
--- a/rpython/memory/gc/incminimark.py
+++ b/rpython/memory/gc/incminimark.py
@@ -72,7 +72,7 @@
 from rpython.rlib.rarithmetic import LONG_BIT_SHIFT
 from rpython.rlib.debug import ll_assert, debug_print, debug_start, debug_stop
 from rpython.rlib.objectmodel import specialize, we_are_translated
-from rpython.rlib import rthread
+from rpython.rlib import rgil, rthread
 from rpython.memory.gc.minimarkpage import out_of_memory
 
 #
@@ -191,7 +191,7 @@
 
 NURSERY_FREE = rthread.ThreadLocalField(llmemory.Address, 'nursery_free')
 NURSERY_TOP  = rthread.ThreadLocalField(llmemory.Address, 'nursery_top')
-NEXT_NUBLOCK = rthread.ThreadLocalField(llmemory.Address, 'next_nublock')
+
 
 # ____________________________________________________________
 
@@ -438,11 +438,11 @@
         self.old_objects_pointing_to_pinned = self.AddressStack()
         self.updated_old_objects_pointing_to_pinned = False
         #
-        # Allocate lock(s)
-        ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
-                                track_allocation=False)
-        rthread.c_thread_lock_init(ll_lock)
-        self.ll_lock = ll_lock
+        # # Allocate lock(s)
+        # ll_lock = lltype.malloc(rthread.TLOCKP.TO, flavor='raw',
+        #                         track_allocation=False)
+        # rthread.c_thread_lock_init(ll_lock)
+        # self.ll_lock = ll_lock
         #
         # Allocate a nursery.  In case of auto_nursery_size, start by
         # allocating a very small nursery, enough to do things like look
@@ -650,9 +650,6 @@
     get_nursery_top = staticmethod(NURSERY_TOP.getraw)
     set_nursery_top = staticmethod(NURSERY_TOP.setraw)
 
-    get_next_nublock = staticmethod(NEXT_NUBLOCK.getraw)
-    set_next_nublock = staticmethod(NEXT_NUBLOCK.setraw)
-
     @property
     def nursery_top(self):
         XXX   # fix caller
@@ -859,7 +856,8 @@
         major collection, and finally reserve totalsize bytes.
         """
 
-        rthread.acquire_NOAUTO(self.ll_lock, 1)
+        # rthread.acquire_NOAUTO(self.ll_lock, 1)
+        rgil.enter_master_section()
 
         minor_collection_count = 0
         while True:
@@ -898,6 +896,8 @@
                 self.set_nursery_free(self.nursery_barriers.popleft())
                 self.set_nursery_top(self.nursery_barriers.popleft())
             else:
+                rgil.master_request_safepoint()
+
                 minor_collection_count += 1
                 if minor_collection_count == 1:
                     self.minor_collection_with_major_progress()
@@ -936,7 +936,8 @@
                 self.set_nursery_free(self.get_nursery_top() -
                                       self.debug_tiny_nursery)
         #
-        rthread.release_NOAUTO(self.ll_lock)
+        rgil.leave_master_section()
+        # rthread.release_NOAUTO(self.ll_lock)
         return result
     collect_and_reserve._dont_inline_ = True
 
diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -227,9 +227,11 @@
 
         tl_shadowstack = rthread.ThreadLocalField(llmemory.Address,
                                                   'shadowstack')
+        tl_synclock = rthread.ThreadLocalField(lltype.Signed, 'synclock')
 
         def thread_setup():
             allocate_shadow_stack()
+            tl_synclock.get_or_make_raw()
 
         def thread_run():
             # If it's the first time we see this thread, allocate
diff --git a/rpython/rlib/rgil.py b/rpython/rlib/rgil.py
--- a/rpython/rlib/rgil.py
+++ b/rpython/rlib/rgil.py
@@ -22,7 +22,7 @@
                            _nowrapper=True, sandboxsafe=True,
                            compilation_info=eci)
 
-_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Signed,
+_gil_yield_thread = llexternal('RPyGilYieldThread', [], lltype.Void,
                                _nowrapper=True, sandboxsafe=True,
                                compilation_info=eci)
 
@@ -38,6 +38,20 @@
                                _nowrapper=True, sandboxsafe=True,
                                compilation_info=eci)
 
+enter_master_section = llexternal(
+    'RPyGilEnterMasterSection', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
+
+leave_master_section = llexternal(
+    'RPyGilLeaveMasterSection', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
+
+master_request_safepoint = llexternal(
+    'RPyGilMasterRequestSafepoint', [], lltype.Void,
+    _nowrapper=True, sandboxsafe=True,
+    compilation_info=eci)
 # ____________________________________________________________
 
 
@@ -133,10 +147,11 @@
     # explicitly release the gil, in a way that tries to give more
     # priority to other threads (as opposed to continuing to run in
     # the same thread).
-    if _gil_yield_thread():
-        from rpython.rlib import rthread
-        rthread.gc_thread_run()
-        _after_thread_switch()
+    # if _gil_yield_thread():
+    #     from rpython.rlib import rthread
+    #     rthread.gc_thread_run()
+    #     _after_thread_switch()
+    _gil_yield_thread()
 yield_thread._gctransformer_hint_close_stack_ = True
 yield_thread._dont_reach_me_in_del_ = True
 yield_thread._dont_inline_ = True
diff --git a/rpython/translator/c/src/thread.c b/rpython/translator/c/src/thread.c
--- a/rpython/translator/c/src/thread.c
+++ b/rpython/translator/c/src/thread.c
@@ -9,11 +9,9 @@
 #include "common_header.h"
 #endif
 
-#ifdef PYPY_USE_ASMGCC
 # include "common_header.h"
 # include "structdef.h"
 # include "forwarddecl.h"
-#endif
 
 #ifdef _WIN32
 #include "src/thread_nt.c"
diff --git a/rpython/translator/c/src/thread.h b/rpython/translator/c/src/thread.h
--- a/rpython/translator/c/src/thread.h
+++ b/rpython/translator/c/src/thread.h
@@ -30,8 +30,15 @@
 #endif /* !_WIN32 */
 
 RPY_EXTERN void RPyGilAllocate(void);
-RPY_EXTERN long RPyGilYieldThread(void);
-RPY_EXTERN void RPyGilAcquireSlowPath(long);
+RPY_EXTERN void RPyGilYieldThreadSlowPath(void);
+RPY_EXTERN void RPyGilAcquireSlowPath(void);
+RPY_EXTERN void RPyGilReleaseSlowPath(void);
+
+RPY_EXTERN void RPyGilEnterMasterSection(void);
+RPY_EXTERN void RPyGilLeaveMasterSection(void);
+RPY_EXTERN void RPyGilMasterRequestSafepoint(void);
+
+
 #define RPyGilAcquire _RPyGilAcquire
 #define RPyGilRelease _RPyGilRelease
 #define RPyFetchFastGil _RPyFetchFastGil
@@ -43,21 +50,33 @@
 #endif
 
 //RPY_EXTERN long rpy_fastgil;
+#include "threadlocal.h"
 
-static inline void _RPyGilAcquire(void) {
-//    long old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-//    if (old_fastgil != 0)
-//        RPyGilAcquireSlowPath(old_fastgil);
-}
-static inline void _RPyGilRelease(void) {
-//    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
-//    pypy_lock_release(&rpy_fastgil);
-}
+#define _RPyGilAcquire() do { \
+        if (!__sync_bool_compare_and_swap(                  \
+                &RPY_THREADLOCALREF_GET(synclock), 0L, 1L)) \
+            RPyGilAcquireSlowPath();                        \
+    } while (0)
+
+#define _RPyGilRelease() do { \
+        assert(RPY_THREADLOCALREF_GET(synclock) != 0L); \
+    if (!__sync_bool_compare_and_swap(                  \
+            &RPY_THREADLOCALREF_GET(synclock), 1L, 0L)) \
+        RPyGilReleaseSlowPath();                        \
+    } while (0)
+
 static inline long *_RPyFetchFastGil(void) {
     abort();
 //    return &rpy_fastgil;
 }
 
+#define RPyGilYieldThread() do { \
+    assert(RPY_THREADLOCALREF_GET(synclock) & 1L); \
+    if (RPY_THREADLOCALREF_GET(synclock) == 3L) { \
+        RPyGilYieldThreadSlowPath(); \
+    } \
+    } while (0)
+
 typedef unsigned char rpy_spinlock_t;
 static inline void rpy_spinlock_acquire(rpy_spinlock_t *p)
 {
diff --git a/rpython/translator/c/src/thread_gil.c b/rpython/translator/c/src/thread_gil.c
--- a/rpython/translator/c/src/thread_gil.c
+++ b/rpython/translator/c/src/thread_gil.c
@@ -1,239 +1,142 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "threadlocal.h"
 
-/* Idea:
+static pthread_mutex_t master_mutex;
+static pthread_mutex_t sync_mutex;
+static pthread_cond_t  sync_cond;
 
-   - "The GIL" is a composite concept.  There are two locks, and "the
-     GIL is locked" when both are locked.
+static long counter_of_threes = 0;
 
-   - The first lock is a simple global variable 'rpy_fastgil'.  With
-     shadowstack, we use the most portable definition: 0 means unlocked
-     and != 0 means locked.  With asmgcc, 0 means unlocked but only 1
-     means locked.  A different value means unlocked too, but the value
-     is used by the JIT to contain the stack top for stack root scanning.
-
-   - The second lock is a regular mutex.  In the fast path, it is never
-     unlocked.  Remember that "the GIL is unlocked" means that either
-     the first or the second lock is unlocked.  It should never be the
-     case that both are unlocked at the same time.
-
-   - Let's call "thread 1" the thread with the GIL.  Whenever it does an
-     external function call, it sets 'rpy_fastgil' to 0 (unlocked).
-     This is the cheapest way to release the GIL.  When it returns from
-     the function call, this thread attempts to atomically change
-     'rpy_fastgil' to 1.  In the common case where it works, thread 1
-     has got the GIL back and so continues to run.
-
-   - Say "thread 2" is eagerly waiting for thread 1 to become blocked in
-     some long-running call.  Regularly, it checks if 'rpy_fastgil' is 0
-     and tries to atomically change it to 1.  If it succeeds, it means
-     that the GIL was not previously locked.  Thread 2 has now got the GIL.
-
-   - If there are more than 2 threads, the rest is really sleeping by
-     waiting on the 'mutex_gil_stealer' held by thread 2.
-
-   - An additional mechanism is used for when thread 1 wants to
-     explicitly yield the GIL to thread 2: it does so by releasing
-     'mutex_gil' (which is otherwise not released) but keeping the
-     value of 'rpy_fastgil' to 1.
-*/
-
-
-/* The GIL is initially released; see pypy_main_function(), which calls
-   RPyGilAcquire/RPyGilRelease.  The point is that when building
-   RPython libraries, they can be a collection of regular functions that
-   also call RPyGilAcquire/RPyGilRelease; see test_standalone.TestShared.
-*/
-long rpy_fastgil = 0;
-static long rpy_waiting_threads = -42;    /* GIL not initialized */
-static volatile int rpy_early_poll_n = 0;
-static mutex1_t mutex_gil_stealer;
-static mutex2_t mutex_gil;
+static long rpy_initialize = -42;
 
 
 static void rpy_init_mutexes(void)
 {
-    mutex1_init(&mutex_gil_stealer);
-    mutex2_init_locked(&mutex_gil);
-    rpy_waiting_threads = 0;
+    int err = pthread_mutex_init(&master_mutex, NULL);
+    if (err)
+        abort();
+
+    err = pthread_mutex_init(&sync_mutex, NULL);
+    if (err)
+        abort();
+
+    err = pthread_cond_init(&sync_cond, NULL);
+    if (err)
+        abort();
+
+    counter_of_threes = 0; // XXX: fork?
+    rpy_initialize = 0;
 }
 
 void RPyGilAllocate(void)
 {
-//    if (rpy_waiting_threads < 0) {
-//        assert(rpy_waiting_threads == -42);
-//        rpy_init_mutexes();
+    if (rpy_initialize < 0) {
+        assert(rpy_initialize == -42);
+        rpy_init_mutexes();
 #ifdef HAVE_PTHREAD_ATFORK
-//        pthread_atfork(NULL, NULL, rpy_init_mutexes);
+        pthread_atfork(NULL, NULL, rpy_init_mutexes);
 #endif
-//    }
+    }
 }
 
-static void check_and_save_old_fastgil(long old_fastgil)
+
+void RPyGilAcquireSlowPath(void)
 {
-    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
+    assert(RPY_THREADLOCALREF_GET(synclock) == 2);
 
-#ifdef PYPY_USE_ASMGCC
-    if (old_fastgil != 0) {
-        /* this case only occurs from the JIT compiler */
-        struct pypy_ASM_FRAMEDATA_HEAD0 *new =
-            (struct pypy_ASM_FRAMEDATA_HEAD0 *)old_fastgil;
-        struct pypy_ASM_FRAMEDATA_HEAD0 *root = &pypy_g_ASM_FRAMEDATA_HEAD;
-        struct pypy_ASM_FRAMEDATA_HEAD0 *next = root->as_next;
-        new->as_next = next;
-        new->as_prev = root;
-        root->as_next = new;
-        next->as_prev = new;
-    }
-#else
-    assert(old_fastgil == 0);
-#endif
+    /* wait until the master leaves the safepoint */
+    pthread_mutex_lock(&master_mutex);
+    RPY_THREADLOCALREF_GET(synclock) = 1;
+    pthread_mutex_unlock(&master_mutex);
 }
 
-#define RPY_GIL_POKE_MIN   40
-#define RPY_GIL_POKE_MAX  400
+void RPyGilReleaseSlowPath(void)
+{
+    assert(RPY_THREADLOCALREF_GET(synclock) == 3);
 
-void RPyGilAcquireSlowPath(long old_fastgil)
+    pthread_mutex_lock(&sync_mutex);
+
+    /* we are one of the THREES that the master is waiting for. Decrease the
+     * counter and signal the master if we are the last. */
+    counter_of_threes--;
+    if (counter_of_threes == 0)
+        pthread_cond_signal(&sync_cond);
+
+    /* set to TWO, so that Acquire above will wait until the master is finished
+     * with its safepoint */
+    RPY_THREADLOCALREF_GET(synclock) = 2;
+    pthread_mutex_unlock(&sync_mutex);
+    // continue without GIL
+}
+
+void RPyGilYieldThreadSlowPath(void)
 {
-    /* Acquires the GIL.  This assumes that we already did:
+    RPyGilRelease();
+    RPyGilAcquire();
+}
 
-          old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-     */
-    if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
-        /* The fastgil was not previously locked: success.
-           'mutex_gil' should still be locked at this point.
-        */
-    }
-    else {
-        /* Otherwise, another thread is busy with the GIL. */
-        int n;
-        long old_waiting_threads;
+void RPyGilEnterMasterSection(void)
+{
+    RPyGilRelease();
+    pthread_mutex_lock(&master_mutex);
+}
 
-        if (rpy_waiting_threads < 0) {
-            /* <arigo> I tried to have RPyGilAllocate() called from
-             * here, but it fails occasionally on an example
-             * (2.7/test/test_threading.py).  I think what occurs is
-             * that if one thread runs RPyGilAllocate(), it still
-             * doesn't have the GIL; then the other thread might fork()
-             * at precisely this moment, killing the first thread.
-             */
-            fprintf(stderr, "Fatal RPython error: a thread is trying to wait "
-                            "for the GIL, but the GIL was not initialized\n"
-                            "(For PyPy, see "
-                            "https://bitbucket.org/pypy/pypy/issues/2274)\n");
+void RPyGilLeaveMasterSection(void)
+{
+    pthread_mutex_unlock(&master_mutex);
+    RPyGilAcquire();
+}
+
+void RPyGilMasterRequestSafepoint(void)
+{
+    pthread_mutex_lock(&sync_mutex);
+    assert(counter_of_threes == 0);
+
+    /* signal all threads to enter safepoints */
+    OP_THREADLOCALREF_ACQUIRE(/* */);
+
+    struct pypy_threadlocal_s *t = NULL;
+    while (1) {
+        OP_THREADLOCALREF_ENUM(t, t);
+        if (t == NULL)
+            break;
+
+      retry:
+        switch (t->synclock) {
+        case 3:
+            assert(!"unexpected synclock=3 found");
             abort();
-        }
-
-        /* Register me as one of the threads that is actively waiting
-           for the GIL.  The number of such threads is found in
-           rpy_waiting_threads. */
-        old_waiting_threads = atomic_increment(&rpy_waiting_threads);
-
-        /* Early polling: before entering the waiting queue, we check
-           a certain number of times if the GIL becomes free.  The
-           motivation for this is issue #2341.  Note that we do this
-           polling even if there are already other threads in the
-           queue, and one of thesee threads is the stealer.  This is
-           because the stealer is likely sleeping right now.  There
-           are use cases where the GIL will really be released very
-           soon after RPyGilAcquireSlowPath() is called, so it's worth
-           always doing this check.
-
-           To avoid falling into bad cases, we "randomize" the number
-           of iterations: we loop N times, where N is choosen between
-           RPY_GIL_POKE_MIN and RPY_GIL_POKE_MAX.
-        */
-        n = rpy_early_poll_n * 2 + 1;
-        while (n >= RPY_GIL_POKE_MAX)
-            n -= (RPY_GIL_POKE_MAX - RPY_GIL_POKE_MIN);
-        rpy_early_poll_n = n;
-        while (n >= 0) {
-            n--;
-            if (old_waiting_threads != rpy_waiting_threads) {
-                /* If the number changed, it is because another thread 
-                   entered or left this function.  In that case, stop
-                   this loop: if another thread left it means the GIL
-                   has been acquired by that thread; if another thread 
-                   entered there is no point in running the present
-                   loop twice. */
+        case 2:
+            /* thread running in C code, already knows we want a safepoint */
+            break;
+        case 0:
+            /* thread running in C code, make sure it checks for and enters
+             * the safepoint before acquiring the "gil" again */
+            if (__sync_bool_compare_and_swap(&t->synclock, 0, 2))
+                break;
+            goto retry;
+        case 1:
+            /* thread running normally, place request to enter safepoint */
+            if (__sync_bool_compare_and_swap(&t->synclock, 1, 3)) {
+                counter_of_threes++;
+                t->nursery_top = NULL;
                 break;
             }
-            RPy_YieldProcessor();
-            RPy_CompilerMemoryBarrier();
+            goto retry;
+        }
+    }
+    OP_THREADLOCALREF_RELEASE(/* */);
 
-            if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
-                old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-                if (!RPY_FASTGIL_LOCKED(old_fastgil)) {
-                    /* We got the gil before entering the waiting
-                       queue.  In case there are other threads waiting
-                       for the GIL, wake up the stealer thread now and
-                       go to the waiting queue anyway, for fairness.
-                       This will fall through if there are no other
-                       threads waiting.
-                    */
-                    check_and_save_old_fastgil(old_fastgil);
-                    mutex2_unlock(&mutex_gil);
-                    break;
-                }
-            }
-        }
+    /* wait until all THREES entered their safepoints */
+    while (counter_of_threes > 0) {
+        pthread_cond_wait(&sync_cond, &sync_mutex);
+    }
 
-        /* Enter the waiting queue from the end.  Assuming a roughly
-           first-in-first-out order, this will nicely give the threads
-           a round-robin chance.
-        */
-        mutex1_lock(&mutex_gil_stealer);
-        mutex2_loop_start(&mutex_gil);
+    pthread_mutex_unlock(&sync_mutex);
 
-        /* We are now the stealer thread.  Steals! */
-        while (1) {
-            /* Busy-looping here.  Try to look again if 'rpy_fastgil' is
-               released.
-            */
-            if (!RPY_FASTGIL_LOCKED(rpy_fastgil)) {
-                old_fastgil = pypy_lock_test_and_set(&rpy_fastgil, 1);
-                if (!RPY_FASTGIL_LOCKED(old_fastgil))
-                    /* yes, got a non-held value!  Now we hold it. */
-                    break;
-            }
-            /* Sleep for one interval of time.  We may be woken up earlier
-               if 'mutex_gil' is released.
-            */
-            if (mutex2_lock_timeout(&mutex_gil, 0.0001)) {   /* 0.1 ms... */
-                /* We arrive here if 'mutex_gil' was recently released
-                   and we just relocked it.
-                 */
-                old_fastgil = 0;
-                break;
-            }
-            /* Loop back. */
-        }
-        atomic_decrement(&rpy_waiting_threads);
-        mutex2_loop_stop(&mutex_gil);
-        mutex1_unlock(&mutex_gil_stealer);
-    }
-    check_and_save_old_fastgil(old_fastgil);
-}
-
-long RPyGilYieldThread(void)
-{
-    /* can be called even before RPyGilAllocate(), but in this case,
-       'rpy_waiting_threads' will be -42. */
-    assert(RPY_FASTGIL_LOCKED(rpy_fastgil));
-    if (rpy_waiting_threads <= 0)
-        return 0;
-
-    /* Explicitly release the 'mutex_gil'.
-     */
-    mutex2_unlock(&mutex_gil);
-
-    /* Now nobody has got the GIL, because 'mutex_gil' is released (but
-       rpy_fastgil is still locked).  Call RPyGilAcquire().  It will
-       enqueue ourselves at the end of the 'mutex_gil_stealer' queue.
-       If there is no other waiting thread, it will fall through both
-       its mutex_lock() and mutex_lock_timeout() now.  But that's
-       unlikely, because we tested above that 'rpy_waiting_threads > 0'.
-     */
-    RPyGilAcquire();
-    return 1;
+    /* caller can continue; all threads in safepoints */
 }
 
 /********** for tests only **********/


More information about the pypy-commit mailing list