[pypy-commit] stmgc default: make use-gcc the new default
Raemi
noreply at buildbot.pypy.org
Wed Sep 16 10:49:39 CEST 2015
Author: Remi Meier <remi.meier at gmail.com>
Branch:
Changeset: r1961:6ca47dad66a6
Date: 2015-09-16 10:50 +0200
http://bitbucket.org/pypy/stmgc/changeset/6ca47dad66a6/
Log: make use-gcc the new default
diff too long, truncating to 2000 out of 2714 lines
diff --git a/c7/demo/Makefile b/c7/demo/Makefile
--- a/c7/demo/Makefile
+++ b/c7/demo/Makefile
@@ -19,18 +19,20 @@
COMMON = -I.. -pthread -lrt -g -Wall -Werror -DSTM_LARGEMALLOC_TEST
+CC = gcc-seg-gs
+
# note that 'build' is partially optimized but still contains all asserts
debug-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DSTM_DEBUGPRINT -DSTM_GC_NURSERY=128 -O0 \
+ $(CC) $(COMMON) -DSTM_DEBUGPRINT -DSTM_GC_NURSERY=128 -O0 \
$< -o debug-$* ../stmgc.c
build-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DSTM_GC_NURSERY=128 -O1 $< -o build-$* ../stmgc.c
+ $(CC) $(COMMON) -DSTM_GC_NURSERY=128 -O1 $< -o build-$* ../stmgc.c
release-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DNDEBUG -O2 $< -o release-$* ../stmgc.c
+ $(CC) $(COMMON) -DNDEBUG -O2 $< -o release-$* ../stmgc.c
release-htm-%: %.c ../../htm-c7/stmgc.? ../../htm-c7/htm.h
- clang $(COMMON) -O2 $< -o release-htm-$* ../../htm-c7/stmgc.c -DUSE_HTM
+ $(CC) $(COMMON) -O2 $< -o release-htm-$* ../../htm-c7/stmgc.c -DUSE_HTM
diff --git a/c7/demo/demo2.c b/c7/demo/demo2.c
--- a/c7/demo/demo2.c
+++ b/c7/demo/demo2.c
@@ -216,7 +216,7 @@
void teardown_list(void)
{
- STM_POP_ROOT_RET(stm_thread_local);
+ STM_POP_ROOT_DROP(stm_thread_local);
}
@@ -256,6 +256,7 @@
stm_rewind_jmp_leaveframe(&stm_thread_local, &rjbuf);
unregister_thread_local();
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
@@ -293,6 +294,7 @@
rewind_jmp_buf rjbuf;
status = sem_init(&done, 0, 0); assert(status == 0);
+ (void)status;
stm_setup();
stm_register_thread_local(&stm_thread_local);
diff --git a/c7/demo/demo_random.c b/c7/demo/demo_random.c
--- a/c7/demo/demo_random.c
+++ b/c7/demo/demo_random.c
@@ -412,6 +412,7 @@
stm_unregister_thread_local(&stm_thread_local);
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
diff --git a/c7/demo/demo_random2.c b/c7/demo/demo_random2.c
--- a/c7/demo/demo_random2.c
+++ b/c7/demo/demo_random2.c
@@ -435,6 +435,7 @@
stm_unregister_thread_local(&stm_thread_local);
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
diff --git a/c7/demo/test_shadowstack.c b/c7/demo/test_shadowstack.c
--- a/c7/demo/test_shadowstack.c
+++ b/c7/demo/test_shadowstack.c
@@ -54,7 +54,7 @@
then do a major collection. It should still be found by the
tracing logic. */
stm_start_transaction(&stm_thread_local);
- STM_POP_ROOT_RET(stm_thread_local);
+ STM_POP_ROOT_DROP(stm_thread_local);
STM_POP_ROOT(stm_thread_local, node);
assert(node->value == 129821);
STM_PUSH_ROOT(stm_thread_local, NULL);
diff --git a/c7/gdb/gdb_stm.py b/c7/gdb/gdb_stm.py
--- a/c7/gdb/gdb_stm.py
+++ b/c7/gdb/gdb_stm.py
@@ -77,34 +77,25 @@
'((struct stm_priv_segment_info_s *)(stm_object_pages+%d))%s'
% (get_segment_size() * segment_id + get_psegment_ofs(), field))
-def thread_to_segment_id(thread_id):
- base = int_(gdb.parse_and_eval('stm_object_pages'))
+def current_segment():
+ mytl = int_(gdb.parse_and_eval('&stm_thread_local'))
for j in range(1, get_nb_segments() + 1):
- #ti = get_psegment(j, '->pub.running_thread->creating_pthread[0]')
- ti = get_psegment(j, '->running_pthread')
- if int_(ti) == thread_id:
+ tl = get_psegment(j, '->pub.running_thread')
+ if int_(tl) == mytl:
ts = get_psegment(j, '->transaction_state')
if int_(ts) == 0:
print >> sys.stderr, "note: transaction_state == 0"
return j
- raise Exception("thread not found: %r" % (thread_id,))
+ raise Exception("no segment seems to be running this thread")
def interactive_segment_base(thread=None):
if thread is None:
- s = gdb.execute('info threads', False, True)
- i = s.find('\n* ')
- assert i >= 0
- fields = s[i+2:].split()
- assert fields[1] == 'Thread'
- assert fields[2].startswith('0x')
- thread_id = int(fields[2], 16)
- segment_id = thread_to_segment_id(thread_id)
+ segment_id = current_segment()
elif thread.type.code == gdb.TYPE_CODE_INT:
if 0 <= int_(thread) < 256:
segment_id = int_(thread)
else:
- thread_id = int_(thread)
- segment_id = thread_to_segment_id(thread_id)
+ raise TypeError("segment num not in range")
else:
raise TypeError("'thread' argument must be an int or not given")
return get_segment_base(segment_id)
diff --git a/c7/stm/core.c b/c7/stm/core.c
--- a/c7/stm/core.c
+++ b/c7/stm/core.c
@@ -45,7 +45,6 @@
#endif
}
-__attribute__((always_inline))
static void write_slowpath_overflow_obj(object_t *obj, bool mark_card)
{
/* An overflow object is an object from the same transaction, but
@@ -79,7 +78,6 @@
}
}
-__attribute__((always_inline))
static void write_slowpath_common(object_t *obj, bool mark_card)
{
assert(_seems_to_be_running_transaction());
@@ -223,6 +221,7 @@
check_flag_write_barrier(obj);
}
+__attribute__((flatten))
void _stm_write_slowpath(object_t *obj)
{
write_slowpath_common(obj, /*mark_card=*/false);
@@ -241,6 +240,7 @@
return (size >= _STM_MIN_CARD_OBJ_SIZE);
}
+__attribute__((flatten))
char _stm_write_slowpath_card_extra(object_t *obj)
{
/* the PyPy JIT calls this function directly if it finds that an
diff --git a/c7/stm/forksupport.c b/c7/stm/forksupport.c
--- a/c7/stm/forksupport.c
+++ b/c7/stm/forksupport.c
@@ -58,7 +58,7 @@
/* Make a new mmap at some other address, but of the same size as
the standard mmap at stm_object_pages
*/
- int big_copy_fd;
+ int big_copy_fd = -1;
char *big_copy = setup_mmap("stmgc's fork support", &big_copy_fd);
/* Copy all the data from the two ranges of objects (large, small)
diff --git a/c7/stm/fprintcolor.c b/c7/stm/fprintcolor.c
--- a/c7/stm/fprintcolor.c
+++ b/c7/stm/fprintcolor.c
@@ -1,3 +1,5 @@
+#include <stdarg.h>
+
/* ------------------------------------------------------------ */
#ifdef STM_DEBUGPRINT
/* ------------------------------------------------------------ */
diff --git a/c7/stmgc.h b/c7/stmgc.h
--- a/c7/stmgc.h
+++ b/c7/stmgc.h
@@ -20,7 +20,15 @@
#endif
-#define TLPREFIX __attribute__((address_space(256)))
+#ifdef __SEG_GS /* on a custom patched gcc */
+# define TLPREFIX __seg_gs
+# define _STM_RM_SUFFIX :8
+#elif defined(__clang__) /* on a clang, hopefully made bug-free */
+# define TLPREFIX __attribute__((address_space(256)))
+# define _STM_RM_SUFFIX /* nothing */
+#else
+# error "needs either a GCC with __seg_gs support, or a bug-freed clang"
+#endif
typedef TLPREFIX struct object_s object_t;
typedef TLPREFIX struct stm_segment_info_s stm_segment_info_t;
@@ -34,11 +42,11 @@
'STM_SEGMENT->transaction_read_version' if and only if the
object was read in the current transaction. The nurseries
also have corresponding read markers, but they are never used. */
- uint8_t rm;
+ unsigned char rm _STM_RM_SUFFIX;
};
struct stm_segment_info_s {
- uint8_t transaction_read_version;
+ unsigned int transaction_read_version;
int segment_num;
char *segment_base;
stm_char *nursery_current;
@@ -288,6 +296,7 @@
#define STM_PUSH_ROOT(tl, p) ((tl).shadowstack++->ss = (object_t *)(p))
#define STM_POP_ROOT(tl, p) ((p) = (typeof(p))((--(tl).shadowstack)->ss))
#define STM_POP_ROOT_RET(tl) ((--(tl).shadowstack)->ss)
+#define STM_POP_ROOT_DROP(tl) ((void)(--(tl).shadowstack))
/* Every thread needs to have a corresponding stm_thread_local_t
@@ -302,7 +311,12 @@
/* At some key places, like the entry point of the thread and in the
function with the interpreter's dispatch loop, you need to declare
- a local variable of type 'rewind_jmp_buf' and call these macros. */
+ a local variable of type 'rewind_jmp_buf' and call these macros.
+ IMPORTANT: a function in which you call stm_rewind_jmp_enterframe()
+ must never change the value of its own arguments! If they are
+ passed on the stack, gcc can change the value directly there, but
+ we're missing the logic to save/restore this part!
+*/
#define stm_rewind_jmp_enterprepframe(tl, rjbuf) \
rewind_jmp_enterprepframe(&(tl)->rjthread, rjbuf, (tl)->shadowstack)
#define stm_rewind_jmp_enterframe(tl, rjbuf) \
@@ -506,7 +520,7 @@
#define STM_POP_MARKER(tl) ({ \
object_t *_popped = STM_POP_ROOT_RET(tl); \
- STM_POP_ROOT_RET(tl); \
+ STM_POP_ROOT_DROP(tl); \
_popped; \
})
diff --git a/c7/test/common.py b/c7/test/common.py
--- a/c7/test/common.py
+++ b/c7/test/common.py
@@ -3,7 +3,7 @@
assert sys.maxint == 9223372036854775807, "requires a 64-bit environment"
# ----------
-os.environ['CC'] = 'clang'
+os.environ['CC'] = 'gcc-seg-gs'
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/c7/test/support.py b/c7/test/support.py
--- a/c7/test/support.py
+++ b/c7/test/support.py
@@ -478,7 +478,8 @@
],
undef_macros=['NDEBUG'],
include_dirs=[parent_dir],
- extra_compile_args=['-g', '-O0', '-Werror', '-ferror-limit=1'],
+ extra_compile_args=['-g', '-O0', '-Werror', #, '-ferror-limit=1', for clang
+ '-Wfatal-errors'], # for gcc
extra_link_args=['-g', '-lrt'],
force_generic_engine=True)
diff --git a/c7/test/test_list.py b/c7/test/test_list.py
--- a/c7/test/test_list.py
+++ b/c7/test/test_list.py
@@ -56,7 +56,7 @@
''', define_macros=[('STM_TESTS', '1')],
undef_macros=['NDEBUG'],
include_dirs=[parent_dir],
- extra_compile_args=['-g', '-O0', '-Werror', '-ferror-limit=1'],
+ extra_compile_args=['-g', '-O0', '-Werror'], #, '-ferror-limit=1'],
force_generic_engine=True)
# ____________________________________________________________
diff --git a/c7/test/test_rewind.c b/c7/test/test_rewind.c
--- a/c7/test/test_rewind.c
+++ b/c7/test/test_rewind.c
@@ -174,12 +174,26 @@
void foo(int *x) { ++*x; }
__attribute__((noinline))
-void f6(int a1, int a2, int a3, int a4, int a5, int a6, int a7,
- int a8, int a9, int a10, int a11, int a12, int a13)
+void f6(int c1, int c2, int c3, int c4, int c5, int c6, int c7,
+ int c8, int c9, int c10, int c11, int c12, int c13)
{
rewind_jmp_buf buf;
rewind_jmp_enterframe(&gthread, &buf, NULL);
+ int a1 = c1;
+ int a2 = c2;
+ int a3 = c3;
+ int a4 = c4;
+ int a5 = c5;
+ int a6 = c6;
+ int a7 = c7;
+ int a8 = c8;
+ int a9 = c9;
+ int a10 = c10;
+ int a11 = c11;
+ int a12 = c12;
+ int a13 = c13;
+
rewind_jmp_setjmp(&gthread, NULL);
gevent(a1); gevent(a2); gevent(a3); gevent(a4);
gevent(a5); gevent(a6); gevent(a7); gevent(a8);
diff --git a/c7/test/test_rewind.py b/c7/test/test_rewind.py
--- a/c7/test/test_rewind.py
+++ b/c7/test/test_rewind.py
@@ -1,11 +1,11 @@
import os
def run_test(opt):
- err = os.system("clang -g -O%s -Werror -DRJBUF_CUSTOM_MALLOC -I../stm"
+ err = os.system("gcc-seg-gs -g -O%s -Werror -DRJBUF_CUSTOM_MALLOC -I../stm"
" -o test_rewind_O%s test_rewind.c ../stm/rewind_setjmp.c"
% (opt, opt))
if err != 0:
- raise OSError("clang failed on test_rewind.c")
+ raise OSError("gcc-seg-gs failed on test_rewind.c")
for testnum in [1, 2, 3, 4, 5, 6, 7, "TL1", "TL2"]:
print '=== O%s: RUNNING TEST %s ===' % (opt, testnum)
err = os.system("./test_rewind_O%s %s" % (opt, testnum))
diff --git a/c8/TODO b/c8/TODO
--- a/c8/TODO
+++ b/c8/TODO
@@ -1,3 +1,10 @@
+- stm_identityhash spends a good time figuring out if an obj is prebuilt
+ (40% of its time). maybe after setup_prebuilt, we could defer the test
+ of GCFLAG_HAS_SHADOW in id_or_identityhash to after an address comparison.
+ I.e., after setup_prebuilt, all objs allocated there could be in some
+ area at the start of the heap, and can thus be classified by having
+ an address < some_barrier. (may include some non-prebuilt ones, but
+ that's ok) (also take care of small prebuilt objs)
- fix markers (e.g. become_inevitable doesn't seem to show up)
diff --git a/c8/demo/Makefile b/c8/demo/Makefile
--- a/c8/demo/Makefile
+++ b/c8/demo/Makefile
@@ -19,18 +19,20 @@
COMMON = -I.. -pthread -lrt -g -Wall -Werror -DSTM_LARGEMALLOC_TEST
+CC = gcc-seg-gs
+
# note that 'build' is partially optimized but still contains all asserts
debug-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DSTM_DEBUGPRINT -DSTM_GC_NURSERY=128 -O0 \
+ $(CC) $(COMMON) -DSTM_DEBUGPRINT -DSTM_GC_NURSERY=128 -O0 \
$< -o debug-$* ../stmgc.c
build-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DSTM_GC_NURSERY=128 -O1 $< -o build-$* ../stmgc.c
+ $(CC) $(COMMON) -DSTM_GC_NURSERY=128 -O1 $< -o build-$* ../stmgc.c
release-%: %.c ${H_FILES} ${C_FILES}
- clang $(COMMON) -DNDEBUG -O2 $< -o release-$* ../stmgc.c
+ $(CC) $(COMMON) -DNDEBUG -O2 $< -o release-$* ../stmgc.c
release-htm-%: %.c ../../htm-c7/stmgc.? ../../htm-c7/htm.h
- clang $(COMMON) -O2 $< -o release-htm-$* ../../htm-c7/stmgc.c -DUSE_HTM
+ $(CC) $(COMMON) -O2 $< -o release-htm-$* ../../htm-c7/stmgc.c -DUSE_HTM
diff --git a/c8/demo/demo2.c b/c8/demo/demo2.c
--- a/c8/demo/demo2.c
+++ b/c8/demo/demo2.c
@@ -214,7 +214,7 @@
void teardown_list(void)
{
- STM_POP_ROOT_RET(stm_thread_local);
+ STM_POP_ROOT_DROP(stm_thread_local);
}
@@ -255,6 +255,7 @@
stm_rewind_jmp_leaveframe(&stm_thread_local, &rjbuf);
unregister_thread_local();
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
@@ -292,6 +293,7 @@
rewind_jmp_buf rjbuf;
status = sem_init(&done, 0, 0); assert(status == 0);
+ (void)status;
stm_setup();
stm_register_thread_local(&stm_thread_local);
@@ -308,6 +310,7 @@
for (i = 1; i <= NTHREADS; i++) {
status = sem_wait(&done); assert(status == 0);
+ (void)status;
}
final_check();
diff --git a/c8/demo/demo_random.c b/c8/demo/demo_random.c
--- a/c8/demo/demo_random.c
+++ b/c8/demo/demo_random.c
@@ -463,6 +463,7 @@
stm_unregister_thread_local(&stm_thread_local);
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
diff --git a/c8/demo/demo_random2.c b/c8/demo/demo_random2.c
--- a/c8/demo/demo_random2.c
+++ b/c8/demo/demo_random2.c
@@ -485,6 +485,7 @@
stm_unregister_thread_local(&stm_thread_local);
status = sem_post(&done); assert(status == 0);
+ (void)status;
return NULL;
}
diff --git a/c8/demo/test_shadowstack.c b/c8/demo/test_shadowstack.c
--- a/c8/demo/test_shadowstack.c
+++ b/c8/demo/test_shadowstack.c
@@ -53,7 +53,7 @@
then do a major collection. It should still be found by the
tracing logic. */
stm_force_transaction_break(&stm_thread_local);
- STM_POP_ROOT_RET(stm_thread_local);
+ STM_POP_ROOT_DROP(stm_thread_local);
STM_POP_ROOT(stm_thread_local, node);
assert(node->value == 129821);
STM_PUSH_ROOT(stm_thread_local, NULL);
diff --git a/c8/stm/core.c b/c8/stm/core.c
--- a/c8/stm/core.c
+++ b/c8/stm/core.c
@@ -12,7 +12,7 @@
{
assert(undo->type != TYPE_POSITION_MARKER);
free(undo->backup);
- assert(undo->backup = (char*)-88);
+ assert(undo->backup = (char*)0xbb);
increment_total_allocated(-SLICE_SIZE(undo->slice));
}
@@ -119,6 +119,7 @@
dprintf(("copy_bk_objs_in_page_from(%d, %ld, %d)\n",
from_segnum, (long)pagenum, only_if_not_modified));
+ assert(modification_lock_check_rdlock(from_segnum));
struct list_s *list = get_priv_segment(from_segnum)->modified_old_objects;
struct stm_undo_s *undo = (struct stm_undo_s *)list->items;
struct stm_undo_s *end = (struct stm_undo_s *)(list->items + list->count);
@@ -238,6 +239,7 @@
addr >= stm_object_pages+TOTAL_MEMORY) {
/* actual segfault, unrelated to stmgc */
fprintf(stderr, "Segmentation fault: accessing %p\n", addr);
+ detect_shadowstack_overflow(addr);
abort();
}
@@ -269,7 +271,7 @@
/* ############# commit log ############# */
-void _dbg_print_commit_log()
+void _dbg_print_commit_log(void)
{
struct stm_commit_log_entry_s *cl = &commit_log_root;
@@ -306,7 +308,7 @@
}
}
-static void reset_modified_from_backup_copies(int segment_num); /* forward */
+static void reset_modified_from_backup_copies(int segment_num, object_t *only_obj); /* forward */
static bool _stm_validate(void)
{
@@ -407,6 +409,25 @@
if (LIKELY(!_stm_was_read(obj)))
continue;
+ /* check for NO_CONFLICT flag in seg0. While its data may
+ not be current there, the flag will be there and is
+ immutable. (we cannot check in my_segnum bc. we may
+ only have executed stm_read(o) but not touched its pages
+ yet -> they may be NO_ACCESS */
+ struct object_s *obj0 = (struct object_s *)REAL_ADDRESS(get_segment_base(0), obj);
+ if (obj0->stm_flags & GCFLAG_NO_CONFLICT) {
+ /* obj is noconflict and therefore shouldn't cause
+ an abort. However, from now on, we also assume
+ that an abort would not roll-back to what is in
+ the backup copy, as we don't trace the bkcpy
+ during major GCs.
+ We choose the approach to reset all our changes
+ to this obj here, so that we can throw away the
+ backup copy completely: */
+ reset_modified_from_backup_copies(my_segnum, obj);
+ continue;
+ }
+
/* conflict! */
dprintf(("_stm_validate() failed for obj %p\n", obj));
@@ -416,7 +437,7 @@
from the old (but unmodified) version to the newer
version.
*/
- reset_modified_from_backup_copies(my_segnum);
+ reset_modified_from_backup_copies(my_segnum, NULL);
timing_write_read_contention(cl->written, undo);
needs_abort = true;
break;
@@ -614,7 +635,8 @@
new = _create_commit_log_entry();
if (STM_PSEGMENT->transaction_state == TS_INEVITABLE) {
- assert(_stm_detached_inevitable_from_thread == 0); /* running it */
+ assert(_stm_detached_inevitable_from_thread == 0 /* running it */
+ || _stm_detached_inevitable_from_thread == -1); /* committing external */
old = STM_PSEGMENT->last_commit_log_entry;
new->rev_num = old->rev_num + 1;
@@ -868,7 +890,6 @@
_cards_cleared_in_object(get_priv_segment(STM_SEGMENT->segment_num), obj, false);
}
-__attribute__((always_inline))
static void write_slowpath_overflow_obj(object_t *obj, bool mark_card)
{
assert(obj->stm_flags & GCFLAG_WRITE_BARRIER);
@@ -927,7 +948,6 @@
release_privatization_lock(STM_SEGMENT->segment_num);
}
-__attribute__((always_inline))
static void write_slowpath_common(object_t *obj, bool mark_card)
{
assert(_seems_to_be_running_transaction());
@@ -1070,6 +1090,7 @@
obj, index, get_index_to_card_index(index), CARD_MARKED));
}
+__attribute__((flatten))
void _stm_write_slowpath(object_t *obj) {
write_slowpath_common(obj, /* mark_card */ false);
}
@@ -1336,6 +1357,16 @@
push_large_overflow_objects_to_other_segments();
/* push before validate. otherwise they are reachable too early */
+
+ /* before releasing _stm_detached_inevitable_from_thread, perform
+ the commit. Otherwise, the same thread whose (inev) transaction we try
+ to commit here may start a new one in another segment *but* w/o
+ the committed data from its previous inev transaction. (the
+ stm_validate() at the start of a new transaction is happy even
+ if there is an inevitable tx running) */
+ bool was_inev = STM_PSEGMENT->transaction_state == TS_INEVITABLE;
+ _validate_and_add_to_commit_log();
+
if (external) {
/* from this point on, unlink the original 'stm_thread_local_t *'
from its segment. Better do it as soon as possible, because
@@ -1347,17 +1378,16 @@
_stm_detached_inevitable_from_thread = 0;
}
- bool was_inev = STM_PSEGMENT->transaction_state == TS_INEVITABLE;
- _validate_and_add_to_commit_log();
if (!was_inev) {
assert(!external);
stm_rewind_jmp_forget(STM_SEGMENT->running_thread);
}
+ commit_finalizers();
+
/* XXX do we still need a s_mutex_lock() section here? */
s_mutex_lock();
- commit_finalizers();
/* update 'overflow_number' if needed */
if (STM_PSEGMENT->overflow_number_has_been_used) {
@@ -1388,7 +1418,7 @@
invoke_general_finalizers(tl);
}
-static void reset_modified_from_backup_copies(int segment_num)
+static void reset_modified_from_backup_copies(int segment_num, object_t *only_obj)
{
#pragma push_macro("STM_PSEGMENT")
#pragma push_macro("STM_SEGMENT")
@@ -1404,7 +1434,11 @@
for (; undo < end; undo++) {
if (undo->type == TYPE_POSITION_MARKER)
continue;
+
object_t *obj = undo->object;
+ if (only_obj != NULL && obj != only_obj)
+ continue;
+
char *dst = REAL_ADDRESS(pseg->pub.segment_base, obj);
memcpy(dst + SLICE_OFFSET(undo->slice),
@@ -1416,12 +1450,29 @@
SLICE_SIZE(undo->slice), undo->backup));
free_bk(undo);
+
+ if (only_obj != NULL) {
+ assert(IMPLY(only_obj != NULL,
+ (((struct object_s *)dst)->stm_flags
+ & (GCFLAG_NO_CONFLICT
+ | GCFLAG_WRITE_BARRIER
+ | GCFLAG_WB_EXECUTED))
+ == (GCFLAG_NO_CONFLICT | GCFLAG_WRITE_BARRIER)));
+ /* copy last element over this one */
+ end--;
+ list->count -= 3;
+ if (undo < end)
+ *undo = *end;
+ undo--; /* next itr */
+ }
}
- /* check that all objects have the GCFLAG_WRITE_BARRIER afterwards */
- check_all_write_barrier_flags(pseg->pub.segment_base, list);
+ if (only_obj == NULL) {
+ /* check that all objects have the GCFLAG_WRITE_BARRIER afterwards */
+ check_all_write_barrier_flags(pseg->pub.segment_base, list);
- list_clear(list);
+ list_clear(list);
+ }
#pragma pop_macro("STM_SEGMENT")
#pragma pop_macro("STM_PSEGMENT")
}
@@ -1457,7 +1508,7 @@
});
acquire_modification_lock_wr(segment_num);
- reset_modified_from_backup_copies(segment_num);
+ reset_modified_from_backup_copies(segment_num, NULL);
release_modification_lock_wr(segment_num);
_verify_cards_cleared_in_all_lists(pseg);
@@ -1629,6 +1680,16 @@
if (!_validate_and_turn_inevitable())
return;
}
+
+ /* There may be a concurrent commit of a detached Tx going on.
+ Here, we may be right after the _validate_and_add_to_commit_log
+ and before resetting _stm_detached_inevitable_from_thread to
+ 0. We have to wait for this to happen bc. otherwise, eg.
+ _stm_detach_inevitable_transaction is not safe to do yet */
+ while (_stm_detached_inevitable_from_thread == -1)
+ spin_loop();
+ assert(_stm_detached_inevitable_from_thread == 0);
+
soon_finished_or_inevitable_thread_segment();
STM_PSEGMENT->transaction_state = TS_INEVITABLE;
diff --git a/c8/stm/core.h b/c8/stm/core.h
--- a/c8/stm/core.h
+++ b/c8/stm/core.h
@@ -43,6 +43,7 @@
GCFLAG_CARDS_SET = _STM_GCFLAG_CARDS_SET,
GCFLAG_VISITED = 0x10,
GCFLAG_FINALIZATION_ORDERING = 0x20,
+ GCFLAG_NO_CONFLICT = _STM_GCFLAG_NO_CONFLICT,
/* All remaining bits of the 32-bit 'stm_flags' field are taken by
the "overflow number". This is a number that identifies the
"overflow objects" from the current transaction among all old
@@ -50,7 +51,7 @@
current transaction that have been flushed out of the nursery,
which occurs if the same transaction allocates too many objects.
*/
- GCFLAG_OVERFLOW_NUMBER_bit0 = 0x40 /* must be last */
+ GCFLAG_OVERFLOW_NUMBER_bit0 = 0x80 /* must be last */
};
#define SYNC_QUEUE_SIZE 31
@@ -267,14 +268,6 @@
return stm_object_pages + segment_num * (NB_PAGES * 4096UL);
}
-static inline long get_num_segment_containing_address(char *addr)
-{
- uintptr_t delta = addr - stm_object_pages;
- uintptr_t result = delta / (NB_PAGES * 4096UL);
- assert(result < NB_SEGMENTS);
- return result;
-}
-
static inline
struct stm_segment_info_s *get_segment(long segment_num) {
return (struct stm_segment_info_s *)REAL_ADDRESS(
diff --git a/c8/stm/detach.c b/c8/stm/detach.c
--- a/c8/stm/detach.c
+++ b/c8/stm/detach.c
@@ -122,6 +122,9 @@
dprintf(("reattach_transaction: commit detached from seg %d\n",
remote_seg_num));
+ assert(tl != old_tl);
+
+ // XXX: not sure if the next line is a good idea
tl->last_associated_segment_num = remote_seg_num;
ensure_gs_register(remote_seg_num);
commit_external_inevitable_transaction();
@@ -135,6 +138,7 @@
{
dprintf(("> stm_force_transaction_break()\n"));
assert(STM_SEGMENT->running_thread == tl);
+ assert(!stm_is_atomic(tl));
_stm_commit_transaction();
_stm_start_transaction(tl);
}
@@ -180,14 +184,9 @@
dprintf(("commit_fetched_detached_transaction from seg %d\n", segnum));
assert(segnum > 0);
- if (segnum != mysegnum) {
- set_gs_register(get_segment_base(segnum));
- }
+ ensure_gs_register(segnum);
commit_external_inevitable_transaction();
-
- if (segnum != mysegnum) {
- set_gs_register(get_segment_base(mysegnum));
- }
+ ensure_gs_register(mysegnum);
}
static void commit_detached_transaction_if_from(stm_thread_local_t *tl)
diff --git a/c8/stm/finalizer.c b/c8/stm/finalizer.c
--- a/c8/stm/finalizer.c
+++ b/c8/stm/finalizer.c
@@ -29,6 +29,9 @@
static void _commit_finalizers(void)
{
/* move finalizer lists to g_finalizers for major collections */
+ while (__sync_lock_test_and_set(&g_finalizers.lock, 1) != 0) {
+ spin_loop();
+ }
if (STM_PSEGMENT->finalizers->run_finalizers != NULL) {
/* copy 'STM_PSEGMENT->finalizers->run_finalizers' into
@@ -60,6 +63,8 @@
free(STM_PSEGMENT->finalizers);
STM_PSEGMENT->finalizers = NULL;
+
+ __sync_lock_release(&g_finalizers.lock);
}
static void abort_finalizers(struct stm_priv_segment_info_s *pseg)
@@ -309,7 +314,7 @@
{
assert(_finalization_state(obj) == 1);
/* The call will add GCFLAG_VISITED recursively, thus bump state 1->2 */
- mark_visit_possibly_new_object(obj, pseg);
+ mark_visit_possibly_overflow_object(obj, pseg);
}
static struct list_s *mark_finalize_step1(
@@ -389,7 +394,7 @@
static void deal_with_objects_with_finalizers(void)
{
/* for non-light finalizers */
-
+ assert(_has_mutex());
/* there is one 'objects_with_finalizers' list per segment.
Objects that die at a major collection running in the same
transaction as they were created will be put in the
@@ -431,7 +436,7 @@
if (f != NULL && f->run_finalizers != NULL) {
LIST_FOREACH_R(f->run_finalizers, object_t * /*item*/,
({
- mark_visit_possibly_new_object(item, pseg);
+ mark_visit_possibly_overflow_object(item, pseg);
}));
}
}
@@ -481,25 +486,39 @@
LIST_FREE(f->run_finalizers);
}
+/* XXX: according to translator.backendopt.finalizer, getfield_gc
+ for primitive types is a safe op in light finalizers.
+ I don't think that's correct in general (maybe if
+ getfield on *dying obj*).
+*/
+
static void _invoke_general_finalizers(stm_thread_local_t *tl)
{
/* called between transactions */
- static int lock = 0;
-
- if (__sync_lock_test_and_set(&lock, 1) != 0) {
- /* can't acquire the lock: someone else is likely already
- running this function, so don't wait. */
- return;
- }
-
rewind_jmp_buf rjbuf;
stm_rewind_jmp_enterframe(tl, &rjbuf);
_stm_start_transaction(tl);
+ /* XXX: become inevitable, bc. otherwise, we would need to keep
+ around the original g_finalizers.run_finalizers to restore it
+ in case of an abort. */
+ _stm_become_inevitable("finalizer-Tx");
- _execute_finalizers(&g_finalizers);
+ while (__sync_lock_test_and_set(&g_finalizers.lock, 1) != 0) {
+ /* somebody is adding more finalizers (_commit_finalizer()) */
+ spin_loop();
+ }
+ struct finalizers_s copy = g_finalizers;
+ assert(copy.running_next == NULL);
+ g_finalizers.run_finalizers = NULL;
+ /* others may add to g_finalizers again: */
+ __sync_lock_release(&g_finalizers.lock);
+
+ if (copy.run_finalizers != NULL) {
+ _execute_finalizers(&copy);
+ }
_stm_commit_transaction();
stm_rewind_jmp_leaveframe(tl, &rjbuf);
- __sync_lock_release(&lock);
+ LIST_FREE(copy.run_finalizers);
}
diff --git a/c8/stm/finalizer.h b/c8/stm/finalizer.h
--- a/c8/stm/finalizer.h
+++ b/c8/stm/finalizer.h
@@ -1,5 +1,7 @@
+/* see deal_with_objects_with_finalizers() for explanation of these fields */
struct finalizers_s {
+ long lock;
struct list_s *objects_with_finalizers;
uintptr_t count_non_young;
struct list_s *run_finalizers;
diff --git a/c8/stm/forksupport.c b/c8/stm/forksupport.c
--- a/c8/stm/forksupport.c
+++ b/c8/stm/forksupport.c
@@ -20,7 +20,7 @@
s_mutex_lock();
dprintf(("forksupport_prepare\n"));
- fprintf(stderr, "[forking: for now, this operation can take some time]\n");
+ //fprintf(stderr, "[forking: for now, this operation can take some time]\n");
stm_thread_local_t *this_tl = NULL;
stm_thread_local_t *tl = stm_all_thread_locals;
@@ -87,7 +87,7 @@
assert(tl->last_associated_segment_num == i);
assert(in_transaction(tl));
assert(pr->transaction_state != TS_INEVITABLE);
- set_gs_register(get_segment_base(i));
+ ensure_gs_register(i);
assert(STM_SEGMENT->segment_num == i);
s_mutex_lock();
@@ -155,7 +155,7 @@
int segnum = fork_this_tl->last_associated_segment_num;
assert(1 <= segnum && segnum < NB_SEGMENTS);
*_get_cpth(fork_this_tl) = pthread_self();
- set_gs_register(get_segment_base(segnum));
+ ensure_gs_register(segnum);
assert(STM_SEGMENT->segment_num == segnum);
if (!fork_was_in_transaction) {
diff --git a/c8/stm/fprintcolor.c b/c8/stm/fprintcolor.c
--- a/c8/stm/fprintcolor.c
+++ b/c8/stm/fprintcolor.c
@@ -1,3 +1,5 @@
+#include <stdarg.h>
+
/* ------------------------------------------------------------ */
#ifdef STM_DEBUGPRINT
/* ------------------------------------------------------------ */
diff --git a/c8/stm/gcpage.c b/c8/stm/gcpage.c
--- a/c8/stm/gcpage.c
+++ b/c8/stm/gcpage.c
@@ -341,7 +341,7 @@
}
-static void mark_visit_possibly_new_object(object_t *obj, struct stm_priv_segment_info_s *pseg)
+static void mark_visit_possibly_overflow_object(object_t *obj, struct stm_priv_segment_info_s *pseg)
{
/* if newly allocated object, we trace in segment_base, otherwise in
the sharing seg0 */
@@ -464,7 +464,7 @@
for (; modified < end; modified++) {
if (modified->type == TYPE_POSITION_MARKER &&
modified->type2 != TYPE_MODIFIED_HASHTABLE)
- mark_visit_possibly_new_object(modified->marker_object, pseg);
+ mark_visit_possibly_overflow_object(modified->marker_object, pseg);
}
}
}
@@ -503,11 +503,11 @@
struct stm_shadowentry_s *base = tl->shadowstack_base;
while (current-- != base) {
if ((((uintptr_t)current->ss) & 3) == 0) {
- mark_visit_possibly_new_object(current->ss, pseg);
+ mark_visit_possibly_overflow_object(current->ss, pseg);
}
}
- mark_visit_possibly_new_object(tl->thread_local_obj, pseg);
+ mark_visit_possibly_overflow_object(tl->thread_local_obj, pseg);
tl = tl->next;
} while (tl != stm_all_thread_locals);
@@ -517,7 +517,7 @@
assert(get_priv_segment(0)->transaction_state == TS_NONE);
for (i = 1; i < NB_SEGMENTS; i++) {
if (get_priv_segment(i)->transaction_state != TS_NONE) {
- mark_visit_possibly_new_object(
+ mark_visit_possibly_overflow_object(
get_priv_segment(i)->threadlocal_at_start_of_transaction,
get_priv_segment(i));
diff --git a/c8/stm/hashtable.c b/c8/stm/hashtable.c
--- a/c8/stm/hashtable.c
+++ b/c8/stm/hashtable.c
@@ -150,8 +150,9 @@
static void _stm_rehash_hashtable(stm_hashtable_t *hashtable,
uintptr_t biggercount,
- char *segment_base)
+ long segnum) /* segnum=-1 if no major GC */
{
+ char *segment_base = segnum == -1 ? NULL : get_segment_base(segnum);
dprintf(("rehash %p to size %ld, segment_base=%p\n",
hashtable, biggercount, segment_base));
@@ -175,22 +176,28 @@
stm_hashtable_entry_t *entry = table->items[j];
if (entry == NULL)
continue;
- if (segment_base != NULL) {
+
+ char *to_read_from = segment_base;
+ if (segnum != -1) {
/* -> compaction during major GC */
+ /* it's possible that we just created this entry, and it wasn't
+ touched in this segment yet. Then seg0 is up-to-date. */
+ to_read_from = get_page_status_in(segnum, (uintptr_t)entry / 4096UL) == PAGE_NO_ACCESS
+ ? stm_object_pages : to_read_from;
if (((struct stm_hashtable_entry_s *)
- REAL_ADDRESS(segment_base, entry))->object == NULL &&
- !_stm_was_read_by_anybody((object_t *)entry)) {
- dprintf((" removing dead %p\n", entry));
+ REAL_ADDRESS(to_read_from, entry))->object == NULL &&
+ !_stm_was_read_by_anybody((object_t *)entry)) {
+ dprintf((" removing dead %p at %ld\n", entry, j));
continue;
}
}
uintptr_t eindex;
- if (segment_base == NULL)
+ if (segnum == -1)
eindex = entry->index; /* read from STM_SEGMENT */
else
eindex = ((struct stm_hashtable_entry_s *)
- REAL_ADDRESS(segment_base, entry))->index;
+ REAL_ADDRESS(to_read_from, entry))->index;
dprintf((" insert_clean %p at index=%ld\n",
entry, eindex));
@@ -222,8 +229,10 @@
i = index & mask;
entry = VOLATILE_TABLE(table)->items[i];
if (entry != NULL) {
- if (entry->index == index)
+ if (entry->index == index) {
+ stm_read((object_t*)entry);
return entry; /* found at the first try */
+ }
uintptr_t perturb = index;
while (1) {
@@ -231,8 +240,10 @@
i &= mask;
entry = VOLATILE_TABLE(table)->items[i];
if (entry != NULL) {
- if (entry->index == index)
+ if (entry->index == index) {
+ stm_read((object_t*)entry);
return entry; /* found */
+ }
}
else
break;
@@ -285,7 +296,8 @@
if (rc > 6) {
/* we can only enter here once! If we allocate stuff, we may
run the GC, and so 'hashtableobj' might move afterwards. */
- if (_is_in_nursery(hashtableobj)) {
+ if (_is_in_nursery(hashtableobj)
+ && will_allocate_in_nursery(sizeof(stm_hashtable_entry_t))) {
/* this also means that the hashtable is from this
transaction and not visible to other segments yet, so
the new entry can be nursery-allocated. */
@@ -329,6 +341,7 @@
table->items[i] = entry;
write_fence(); /* make sure 'table->items' is written here */
VOLATILE_TABLE(table)->resize_counter = rc - 6; /* unlock */
+ stm_read((object_t*)entry);
return entry;
}
else {
@@ -339,7 +352,7 @@
biggercount *= 4;
else
biggercount *= 2;
- _stm_rehash_hashtable(hashtable, biggercount, /*segment_base=*/NULL);
+ _stm_rehash_hashtable(hashtable, biggercount, /*segnum=*/-1);
goto restart;
}
}
@@ -348,7 +361,7 @@
uintptr_t key)
{
stm_hashtable_entry_t *e = stm_hashtable_lookup(hobj, hashtable, key);
- stm_read((object_t *)e);
+ // stm_read((object_t *)e); - done in _lookup()
return e->object;
}
@@ -359,6 +372,9 @@
stm_write((object_t *)entry);
+ /* this restriction may be lifted, see test_new_entry_if_nursery_full: */
+ assert(IMPLY(_is_young((object_t *)entry), _is_young(hobj)));
+
uintptr_t i = list_count(STM_PSEGMENT->modified_old_objects);
if (i > 0 && list_item(STM_PSEGMENT->modified_old_objects, i - 3)
== (uintptr_t)entry) {
@@ -379,11 +395,13 @@
will make the other transaction check that it didn't
do any stm_hashtable_list() on the complete hashtable.
*/
+ acquire_modification_lock_wr(STM_SEGMENT->segment_num);
STM_PSEGMENT->modified_old_objects = list_append3(
STM_PSEGMENT->modified_old_objects,
TYPE_POSITION_MARKER, /* type1 */
TYPE_MODIFIED_HASHTABLE, /* type2 */
(uintptr_t)hobj); /* modif_hashtable */
+ release_modification_lock_wr(STM_SEGMENT->segment_num);
}
}
entry->object = nvalue;
@@ -420,7 +438,7 @@
}
long stm_hashtable_list(object_t *hobj, stm_hashtable_t *hashtable,
- stm_hashtable_entry_t **results)
+ stm_hashtable_entry_t * TLPREFIX *results)
{
/* Set the read marker. It will be left as long as we're running
the same transaction.
@@ -481,7 +499,7 @@
objects in the segment of the running transaction. Otherwise,
the base case is to read them all from segment zero.
*/
- long segnum = get_num_segment_containing_address((char *)hobj);
+ long segnum = get_segment_of_linear_address((char *)hobj);
if (!IS_OVERFLOW_OBJ(get_priv_segment(segnum), hobj))
segnum = 0;
@@ -495,7 +513,7 @@
assert(count <= table->mask + 1);
dprintf(("compact with %ld items:\n", num_entries_times_6 / 6));
- _stm_rehash_hashtable(hashtable, count, get_segment_base(segnum));
+ _stm_rehash_hashtable(hashtable, count, segnum);
}
table = hashtable->table;
diff --git a/c8/stm/locks.h b/c8/stm/locks.h
--- a/c8/stm/locks.h
+++ b/c8/stm/locks.h
@@ -10,11 +10,14 @@
of modification locks!
*/
-typedef struct {
- pthread_rwlock_t lock;
+typedef union {
+ struct {
+ pthread_rwlock_t lock;
#ifndef NDEBUG
- volatile bool write_locked;
+ volatile bool write_locked;
#endif
+ };
+ char _pad[64];
} modification_lock_t __attribute__((aligned(64)));
static modification_lock_t _modlocks[NB_SEGMENTS - 1];
diff --git a/c8/stm/marker.c b/c8/stm/marker.c
--- a/c8/stm/marker.c
+++ b/c8/stm/marker.c
@@ -73,12 +73,14 @@
/* -2 is not odd */
assert(marker.odd_number != (uintptr_t)TYPE_MODIFIED_HASHTABLE);
+ acquire_modification_lock_wr(STM_SEGMENT->segment_num);
STM_PSEGMENT->position_markers_last = list_count(list);
STM_PSEGMENT->modified_old_objects = list_append3(
list,
TYPE_POSITION_MARKER, /* type */
marker.odd_number, /* marker_odd_number */
(uintptr_t)marker.object); /* marker_object */
+ release_modification_lock_wr(STM_SEGMENT->segment_num);
}
static void timing_write_read_contention(struct stm_undo_s *start,
diff --git a/c8/stm/nursery.c b/c8/stm/nursery.c
--- a/c8/stm/nursery.c
+++ b/c8/stm/nursery.c
@@ -477,6 +477,7 @@
/* reset the nursery by zeroing it */
char *realnursery;
realnursery = REAL_ADDRESS(pseg->pub.segment_base, _stm_nursery_start);
+ (void)realnursery;
#if _STM_NURSERY_ZEROED
memset(realnursery, 0, nursery_used);
@@ -722,7 +723,7 @@
/* including the sharing seg0 */
for (i = 0; i < NB_SEGMENTS; i++) {
- set_gs_register(get_segment_base(i));
+ ensure_gs_register(i);
bool ok = _stm_validate();
assert(get_priv_segment(i)->last_commit_log_entry->next == NULL
@@ -768,7 +769,7 @@
}
}
- set_gs_register(get_segment_base(original_num));
+ ensure_gs_register(original_num);
}
@@ -777,8 +778,16 @@
char *realobj = REAL_ADDRESS(STM_SEGMENT->segment_base, obj);
size_t size = stmcb_size_rounded_up((struct object_s *)realobj);
- /* always gets outside as a large object for now (XXX?) */
- object_t *nobj = (object_t *)allocate_outside_nursery_large(size);
+ /* always gets outside */
+ object_t *nobj;
+ if (size > GC_LAST_SMALL_SIZE) {
+ /* case 1: object is not small enough.
+ Ask gcpage.c for an allocation via largemalloc. */
+ nobj = (object_t *)allocate_outside_nursery_large(size);
+ } else {
+ /* case "small enough" */
+ nobj = (object_t *)allocate_outside_nursery_small(size);
+ }
/* Initialize the shadow enough to be considered a valid gc object.
If the original object stays alive at the next minor collection,
diff --git a/c8/stm/nursery.h b/c8/stm/nursery.h
--- a/c8/stm/nursery.h
+++ b/c8/stm/nursery.h
@@ -27,6 +27,20 @@
get_priv_segment(other_segment_num)->safe_point != SP_RUNNING);
}
+static inline bool will_allocate_in_nursery(size_t size_rounded_up) {
+ OPT_ASSERT(size_rounded_up >= 16);
+ OPT_ASSERT((size_rounded_up & 7) == 0);
+
+ if (UNLIKELY(size_rounded_up >= _STM_FAST_ALLOC))
+ return false;
+
+ stm_char *p = STM_SEGMENT->nursery_current;
+ stm_char *end = p + size_rounded_up;
+ if (UNLIKELY((uintptr_t)end > STM_SEGMENT->nursery_end))
+ return false;
+ return true;
+}
+
#define must_abort() is_abort(STM_SEGMENT->nursery_end)
static object_t *find_shadow(object_t *obj);
diff --git a/c8/stm/pages.c b/c8/stm/pages.c
--- a/c8/stm/pages.c
+++ b/c8/stm/pages.c
@@ -28,7 +28,7 @@
uint64_t ta = __sync_add_and_fetch(&pages_ctl.total_allocated,
add_or_remove);
- if (ta >= pages_ctl.total_allocated_bound)
+ if (UNLIKELY(ta >= pages_ctl.total_allocated_bound))
pages_ctl.major_collection_requested = true;
return ta;
diff --git a/c8/stm/queue.c b/c8/stm/queue.c
--- a/c8/stm/queue.c
+++ b/c8/stm/queue.c
@@ -52,7 +52,7 @@
stm_queue_t *stm_queue_create(void)
{
- void *mem;
+ void *mem = NULL;
int result = posix_memalign(&mem, 64, sizeof(stm_queue_t));
assert(result == 0);
(void)result;
@@ -437,7 +437,7 @@
queue_entry_t *entry = seg->added_in_this_transaction;
while (entry != NULL) {
- mark_visit_possibly_new_object(entry->object, pseg);
+ mark_visit_possibly_overflow_object(entry->object, pseg);
entry = entry->next;
}
} TREE_LOOP_END;
diff --git a/c8/stm/setup.c b/c8/stm/setup.c
--- a/c8/stm/setup.c
+++ b/c8/stm/setup.c
@@ -183,12 +183,12 @@
teardown_modification_locks();
}
-static void _shadowstack_trap_page(char *start, int prot)
+static char *_shadowstack_trap_page(struct stm_shadowentry_s *base)
{
size_t bsize = STM_SHADOW_STACK_DEPTH * sizeof(struct stm_shadowentry_s);
- char *end = start + bsize + 4095;
+ char *end = ((char *)base) + bsize + 4095;
end -= (((uintptr_t)end) & 4095);
- mprotect(end, 4096, prot);
+ return end;
}
static void _init_shadow_stack(stm_thread_local_t *tl)
@@ -200,9 +200,9 @@
/* set up a trap page: if the shadowstack overflows, it will
crash in a clean segfault */
- _shadowstack_trap_page(start, PROT_NONE);
+ struct stm_shadowentry_s *s = (struct stm_shadowentry_s *)start;
+ mprotect(_shadowstack_trap_page(s), 4096, PROT_NONE);
- struct stm_shadowentry_s *s = (struct stm_shadowentry_s *)start;
tl->shadowstack = s;
tl->shadowstack_base = s;
STM_PUSH_ROOT(*tl, -1);
@@ -213,8 +213,8 @@
assert(tl->shadowstack > tl->shadowstack_base);
assert(tl->shadowstack_base->ss == (object_t *)-1);
- char *start = (char *)tl->shadowstack_base;
- _shadowstack_trap_page(start, PROT_READ | PROT_WRITE);
+ char *trap = _shadowstack_trap_page(tl->shadowstack_base);
+ mprotect(trap, 4096, PROT_READ | PROT_WRITE);
free(tl->shadowstack_base);
tl->shadowstack = NULL;
@@ -295,3 +295,19 @@
{
return tl->next != NULL;
}
+
+static void detect_shadowstack_overflow(char *addr)
+{
+ if (addr == NULL)
+ return;
+ stm_thread_local_t *tl = stm_all_thread_locals;
+ while (tl != NULL) {
+ char *trap = _shadowstack_trap_page(tl->shadowstack_base);
+ if (trap <= addr && addr <= trap + 4095) {
+ fprintf(stderr, "This is caused by a stack overflow.\n"
+ "Sorry, proper RuntimeError support is not implemented yet.\n");
+ return;
+ }
+ tl = tl->next;
+ }
+}
diff --git a/c8/stm/setup.h b/c8/stm/setup.h
--- a/c8/stm/setup.h
+++ b/c8/stm/setup.h
@@ -1,6 +1,7 @@
static void setup_mmap(char *reason);
static void setup_protection_settings(void);
static pthread_t *_get_cpth(stm_thread_local_t *);
+static void detect_shadowstack_overflow(char *);
#ifndef NDEBUG
static __thread long _stm_segfault_expected = 1;
diff --git a/c8/stm/smallmalloc.c b/c8/stm/smallmalloc.c
--- a/c8/stm/smallmalloc.c
+++ b/c8/stm/smallmalloc.c
@@ -8,9 +8,9 @@
typedef struct {
uint8_t sz;
-} fpsz_t;
+} full_page_size_t;
-static fpsz_t full_pages_object_size[PAGE_SMSIZE_END - PAGE_SMSIZE_START];
+static full_page_size_t full_pages_object_size[PAGE_SMSIZE_END - PAGE_SMSIZE_START];
/* ^^^ This array contains the size (in number of words) of the objects
in the given page, provided it's a "full page of small objects". It
is 0 if it's not such a page, if it's fully free, or if it's in
@@ -19,7 +19,7 @@
technically full yet, it will be very soon in this case).
*/
-static fpsz_t *get_fpsz(char *smallpage)
+static full_page_size_t *get_full_page_size(char *smallpage)
{
uintptr_t pagenum = (((char *)smallpage) - END_NURSERY_PAGE * 4096UL - stm_object_pages) / 4096;
/* <= PAGE_SMSIZE_END because we may ask for it when there is no
@@ -118,7 +118,7 @@
/* Succeeded: we have a page in 'smallpage' */
*fl = smallpage->next;
- get_fpsz((char *)smallpage)->sz = n;
+ get_full_page_size((char *)smallpage)->sz = n;
return (char *)smallpage;
}
@@ -126,12 +126,15 @@
Maybe we can pick one from free_uniform_pages.
*/
smallpage = free_uniform_pages;
- if (smallpage != NULL) {
+ if (LIKELY(smallpage != NULL)) {
if (UNLIKELY(!__sync_bool_compare_and_swap(&free_uniform_pages,
smallpage,
smallpage->nextpage)))
goto retry;
+ /* got a new page: */
+ increment_total_allocated(4096);
+
/* Succeeded: we have a page in 'smallpage', which is not
initialized so far, apart from the 'nextpage' field read
above. Initialize it.
@@ -153,7 +156,7 @@
*previous = NULL;
/* The first slot is immediately returned */
- get_fpsz((char *)smallpage)->sz = n;
+ get_full_page_size((char *)smallpage)->sz = n;
return (char *)smallpage;
}
@@ -174,8 +177,6 @@
struct small_free_loc_s *result = *fl;
- increment_total_allocated(size);
-
if (UNLIKELY(result == NULL)) {
char *addr = _allocate_small_slowpath(size);
((struct object_s*)addr)->stm_flags = 0;
@@ -270,7 +271,6 @@
}
else if (!_smallmalloc_sweep_keep(p)) {
/* the location should be freed now */
- increment_total_allocated(-szword*8);
#ifdef STM_TESTS
/* fill location with 0xdd in all segs except seg0 */
int j;
@@ -300,6 +300,7 @@
any_object_remaining = true;
}
}
+
if (!any_object_remaining) {
/* give page back to free_uniform_pages and thus make it
inaccessible from all other segments again (except seg0) */
@@ -311,9 +312,14 @@
((struct small_free_loc_s *)baseptr)->nextpage = free_uniform_pages;
free_uniform_pages = (struct small_free_loc_s *)baseptr;
+
+ /* gave the page back */
+ increment_total_allocated(-4096);
}
else if (!any_object_dying) {
- get_fpsz(baseptr)->sz = szword;
+ /* this is still a full page. only in this case we set the
+ full_page_size again: */
+ get_full_page_size(baseptr)->sz = szword;
}
else {
check_order_inside_small_page(page_free);
@@ -339,9 +345,9 @@
if (*fl != NULL) {
/* the entry in full_pages_object_size[] should already be
szword. We reset it to 0. */
- fpsz_t *fpsz = get_fpsz((char *)*fl);
- assert(fpsz->sz == szword);
- fpsz->sz = 0;
+ full_page_size_t *full_page_size = get_full_page_size((char *)*fl);
+ assert(full_page_size->sz == szword);
+ full_page_size->sz = 0;
sweep_small_page(getbaseptr(*fl), *fl, szword);
*fl = NULL;
}
@@ -351,7 +357,7 @@
while (page != NULL) {
/* for every page in small_page_lists: assert that the
corresponding full_pages_object_size[] entry is 0 */
- assert(get_fpsz((char *)page)->sz == 0);
+ assert(get_full_page_size((char *)page)->sz == 0);
nextpage = page->nextpage;
sweep_small_page(getbaseptr(page), page, szword);
page = nextpage;
@@ -361,10 +367,10 @@
/* process the really full pages, which are the ones which still
have a non-zero full_pages_object_size[] entry */
char *pageptr = uninitialized_page_stop;
- fpsz_t *fpsz_start = get_fpsz(pageptr);
- fpsz_t *fpsz_end = &full_pages_object_size[PAGE_SMSIZE_END -
- PAGE_SMSIZE_START];
- fpsz_t *fpsz;
+ full_page_size_t *fpsz_start = get_full_page_size(pageptr);
+ full_page_size_t *fpsz_end = &full_pages_object_size[PAGE_SMSIZE_END -
+ PAGE_SMSIZE_START];
+ full_page_size_t *fpsz;
for (fpsz = fpsz_start; fpsz < fpsz_end; fpsz++, pageptr += 4096) {
uint8_t sz = fpsz->sz;
if (sz != 0) {
diff --git a/c8/stm/sync.c b/c8/stm/sync.c
--- a/c8/stm/sync.c
+++ b/c8/stm/sync.c
@@ -66,7 +66,6 @@
static void ensure_gs_register(long segnum)
{
- /* XXX use this instead of set_gs_register() in many places */
if (STM_SEGMENT->segment_num != segnum) {
set_gs_register(get_segment_base(segnum));
assert(STM_SEGMENT->segment_num == segnum);
@@ -211,16 +210,12 @@
assert(_has_mutex());
assert(_is_tl_registered(tl));
- int num = tl->last_associated_segment_num - 1; // 0..NB_SEG-1
+ int num = tl->last_associated_segment_num - 1; // 0..NB_SEG-2
OPT_ASSERT(num >= 0);
if (sync_ctl.in_use1[num+1] == 0) {
/* fast-path: we can get the same segment number than the one
- we had before. The value stored in GS is still valid. */
-#ifdef STM_TESTS
- /* that can be optimized away, except during tests, because
- they use only one thread */
- set_gs_register(get_segment_base(num+1));
-#endif
+ we had before. The value stored in GS may still be valid. */
+ ensure_gs_register(num+1);
dprintf(("acquired same segment: %d\n", num+1));
goto got_num;
}
@@ -234,7 +229,7 @@
int old_num = tl->last_associated_segment_num;
dprintf(("acquired different segment: %d->%d\n", old_num, num+1));
tl->last_associated_segment_num = num+1;
- set_gs_register(get_segment_base(num+1));
+ ensure_gs_register(num+1);
dprintf((" %d->%d\n", old_num, num+1));
(void)old_num;
goto got_num;
@@ -313,14 +308,14 @@
void _stm_test_switch(stm_thread_local_t *tl)
{
assert(_stm_in_transaction(tl));
- set_gs_register(get_segment_base(tl->last_associated_segment_num));
+ ensure_gs_register(tl->last_associated_segment_num);
assert(STM_SEGMENT->running_thread == tl);
exec_local_finalizers();
}
void _stm_test_switch_segment(int segnum)
{
- set_gs_register(get_segment_base(segnum+1));
+ ensure_gs_register(segnum+1);
}
#if STM_TESTS
diff --git a/c8/stmgc.h b/c8/stmgc.h
--- a/c8/stmgc.h
+++ b/c8/stmgc.h
@@ -21,7 +21,15 @@
#endif
-#define TLPREFIX __attribute__((address_space(256)))
+#ifdef __SEG_GS /* on a custom patched gcc */
+# define TLPREFIX __seg_gs
+# define _STM_RM_SUFFIX :8
+#elif defined(__clang__) /* on a clang, hopefully made bug-free */
+# define TLPREFIX __attribute__((address_space(256)))
+# define _STM_RM_SUFFIX /* nothing */
+#else
+# error "needs either a GCC with __seg_gs support, or a bug-freed clang"
+#endif
typedef TLPREFIX struct object_s object_t;
typedef TLPREFIX struct stm_segment_info_s stm_segment_info_t;
@@ -35,18 +43,18 @@
'STM_SEGMENT->transaction_read_version' if and only if the
object was read in the current transaction. The nurseries
also have corresponding read markers, but they are never used. */
- uint8_t rm;
+ unsigned char rm _STM_RM_SUFFIX;
};
struct stm_segment_info_s {
- uint8_t transaction_read_version;
- uint8_t no_safe_point_here; /* set from outside, triggers an assert */
+ unsigned int transaction_read_version;
int segment_num;
char *segment_base;
stm_char *nursery_current;
stm_char *nursery_mark;
uintptr_t nursery_end;
struct stm_thread_local_s *running_thread;
+ uint8_t no_safe_point_here; /* set from outside, triggers an assert */
};
#define STM_SEGMENT ((stm_segment_info_t *)4352)
@@ -154,6 +162,7 @@
#endif
#define _STM_GCFLAG_WRITE_BARRIER 0x01
+#define _STM_GCFLAG_NO_CONFLICT 0x40
#define _STM_FAST_ALLOC (66*1024)
#define _STM_NSE_SIGNAL_ABORT 1
#define _STM_NSE_SIGNAL_MAX 2
@@ -357,6 +366,7 @@
#define STM_PUSH_ROOT(tl, p) ((tl).shadowstack++->ss = (object_t *)(p))
#define STM_POP_ROOT(tl, p) ((p) = (typeof(p))((--(tl).shadowstack)->ss))
#define STM_POP_ROOT_RET(tl) ((--(tl).shadowstack)->ss)
+#define STM_POP_ROOT_DROP(tl) ((void)(--(tl).shadowstack))
/* Every thread needs to have a corresponding stm_thread_local_t
structure. It may be a "__thread" global variable or something else.
@@ -370,7 +380,12 @@
/* At some key places, like the entry point of the thread and in the
function with the interpreter's dispatch loop, you need to declare
- a local variable of type 'rewind_jmp_buf' and call these macros. */
+ a local variable of type 'rewind_jmp_buf' and call these macros.
+ IMPORTANT: a function in which you call stm_rewind_jmp_enterframe()
+ must never change the value of its own arguments! If they are
+ passed on the stack, gcc can change the value directly there, but
+ we're missing the logic to save/restore this part!
+*/
#define stm_rewind_jmp_enterprepframe(tl, rjbuf) \
rewind_jmp_enterprepframe(&(tl)->rjthread, rjbuf, (tl)->shadowstack)
#define stm_rewind_jmp_enterframe(tl, rjbuf) \
@@ -657,7 +672,7 @@
#define STM_POP_MARKER(tl) ({ \
object_t *_popped = STM_POP_ROOT_RET(tl); \
- STM_POP_ROOT_RET(tl); \
+ STM_POP_ROOT_DROP(tl); \
_popped; \
})
@@ -711,6 +726,9 @@
stm_hashtable_t *stm_hashtable_create(void);
void stm_hashtable_free(stm_hashtable_t *);
+/* lookup returns a reference to an entry. This entry is only valid
+ in the current transaction and needs to be looked up again if there
+ may have been a break inbetween. */
stm_hashtable_entry_t *stm_hashtable_lookup(object_t *, stm_hashtable_t *,
uintptr_t key);
object_t *stm_hashtable_read(object_t *, stm_hashtable_t *, uintptr_t key);
@@ -719,8 +737,13 @@
void stm_hashtable_write_entry(object_t *hobj, stm_hashtable_entry_t *entry,
object_t *nvalue);
long stm_hashtable_length_upper_bound(stm_hashtable_t *);
+
+/* WARNING: stm_hashtable_list does not do a stm_write() on the 'results'
+ argument. 'results' may point inside an object. So if 'results' may be
+ a part of an old obj (which may have survived a minor GC), then make
+ sure to call stm_write() on the obj before calling this function. */
long stm_hashtable_list(object_t *, stm_hashtable_t *,
- stm_hashtable_entry_t **results);
+ stm_hashtable_entry_t * TLPREFIX *results);
extern uint32_t stm_hashtable_entry_userdata;
void stm_hashtable_tracefn(struct object_s *, stm_hashtable_t *,
void (object_t **));
@@ -758,13 +781,31 @@
void stm_queue_tracefn(stm_queue_t *queue, void trace(object_t **));
+
+/* stm_allocate_noconflict() allocates a special kind of object. Validation
+ will never detect conflicts on such an object. However, writes to it can
+ get lost. More precisely: every possible point for validation during a
+ transaction may import a committed version of such objs, thereby resetting
+ it or even contain not-yet-seen values from other (committed) transactions.
+ Hence, changes to such an obj that a transaction commits may or may not
+ propagate to other transactions. */
+__attribute__((always_inline))
+static inline object_t *stm_allocate_noconflict(ssize_t size_rounded_up)
+{
+ object_t *o = stm_allocate(size_rounded_up);
+ o->stm_flags |= _STM_GCFLAG_NO_CONFLICT;
+ return o;
+}
+
+
+
/* ==================== END ==================== */
-static void (*stmcb_expand_marker)(char *segment_base, uintptr_t odd_number,
+extern void (*stmcb_expand_marker)(char *segment_base, uintptr_t odd_number,
object_t *following_object,
char *outputbuf, size_t outputbufsize);
-static void (*stmcb_debug_print)(const char *cause, double time,
+extern void (*stmcb_debug_print)(const char *cause, double time,
const char *marker);
#endif
diff --git a/c8/test/common.py b/c8/test/common.py
--- a/c8/test/common.py
+++ b/c8/test/common.py
@@ -3,7 +3,7 @@
assert sys.maxint == 9223372036854775807, "requires a 64-bit environment"
# ----------
-os.environ['CC'] = 'clang'
+os.environ['CC'] = 'gcc-seg-gs'
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
diff --git a/c8/test/support.py b/c8/test/support.py
--- a/c8/test/support.py
+++ b/c8/test/support.py
@@ -12,6 +12,8 @@
#define _STM_FAST_ALLOC ...
#define _STM_CARD_SIZE ...
#define _STM_CARD_MARKED ...
+#define STM_GC_NURSERY ...
+#define SIZEOF_HASHTABLE_ENTRY ...
typedef struct {
...;
@@ -43,6 +45,7 @@
object_t *stm_allocate(ssize_t size_rounded_up);
object_t *stm_allocate_weakref(ssize_t size_rounded_up);
object_t *stm_allocate_with_finalizer(ssize_t size_rounded_up);
+object_t *stm_allocate_noconflict(ssize_t size_rounded_up);
/*void stm_write_card(); use _checked_stm_write_card() instead */
@@ -209,17 +212,21 @@
object_t *hashtable_read_result;
bool _check_hashtable_write(object_t *, stm_hashtable_t *, uintptr_t key,
object_t *nvalue, stm_thread_local_t *tl);
+stm_hashtable_entry_t *stm_hashtable_lookup(object_t *hashtableobj,
+ stm_hashtable_t *hashtable,
+ uintptr_t index);
long stm_hashtable_length_upper_bound(stm_hashtable_t *);
-long stm_hashtable_list(object_t *, stm_hashtable_t *,
- stm_hashtable_entry_t **results);
uint32_t stm_hashtable_entry_userdata;
void stm_hashtable_tracefn(struct object_s *, stm_hashtable_t *,
void trace(object_t **));
+long _stm_hashtable_list(object_t *o, stm_hashtable_t *h,
+ object_t *entries);
void _set_hashtable(object_t *obj, stm_hashtable_t *h);
stm_hashtable_t *_get_hashtable(object_t *obj);
uintptr_t _get_entry_index(stm_hashtable_entry_t *entry);
object_t *_get_entry_object(stm_hashtable_entry_t *entry);
+void *_get_hashtable_table(stm_hashtable_t *h);
typedef struct stm_queue_s stm_queue_t;
stm_queue_t *stm_queue_create(void);
@@ -256,6 +263,7 @@
typedef TLPREFIX struct myobj_s myobj_t;
#define SIZEOF_MYOBJ sizeof(struct myobj_s)
+#define SIZEOF_HASHTABLE_ENTRY sizeof(struct stm_hashtable_entry_s)
int _stm_get_flags(object_t *obj) {
return obj->stm_flags;
@@ -397,6 +405,21 @@
return entry->object;
}
+
+void *_get_hashtable_table(stm_hashtable_t *h) {
+ return *((void**)h);
+}
+
+long _stm_hashtable_list(object_t *o, stm_hashtable_t *h,
+ object_t *entries)
+{
+ if (entries != NULL)
+ return stm_hashtable_list(o, h,
+ (stm_hashtable_entry_t * TLPREFIX*)((stm_char*)entries+SIZEOF_MYOBJ));
+ return stm_hashtable_list(o, h, NULL);
+}
+
+
void _set_queue(object_t *obj, stm_queue_t *q)
{
stm_char *field_addr = ((stm_char*)obj);
@@ -570,11 +593,12 @@
('STM_NO_COND_WAIT', '1'),
('STM_DEBUGPRINT', '1'),
('_STM_NURSERY_ZEROED', '1'),
+ ('STM_GC_NURSERY', '128'), # KB
('GC_N_SMALL_REQUESTS', str(GC_N_SMALL_REQUESTS)), #check
],
undef_macros=['NDEBUG'],
include_dirs=[parent_dir],
- extra_compile_args=['-g', '-O0', '-Wall', '-ferror-limit=5'],
+ extra_compile_args=['-g', '-O0', '-Werror', '-Wall'], #, '-ferror-limit=5'],
extra_link_args=['-g', '-lrt'],
force_generic_engine=True)
@@ -590,7 +614,8 @@
CARD_MARKED = lib._STM_CARD_MARKED
CARD_MARKED_OLD = lib._stm_get_transaction_read_version
lib.stm_hashtable_entry_userdata = 421418
-
+NURSERY_SIZE = lib.STM_GC_NURSERY * 1024 # bytes
+SIZEOF_HASHTABLE_ENTRY = lib.SIZEOF_HASHTABLE_ENTRY
class Conflict(Exception):
pass
@@ -640,6 +665,18 @@
lib._set_type_id(o, tid)
return o
+def stm_allocate_noconflict(size):
+ o = lib.stm_allocate_noconflict(size)
+ tid = 42 + size
+ lib._set_type_id(o, tid)
+ return o
+
+def stm_allocate_noconflict_refs(n):
+ o = lib.stm_allocate_noconflict(HDR + n * WORD)
+ tid = 421420 + n
+ lib._set_type_id(o, tid)
+ return o
+
def stm_allocate_with_finalizer(size):
o = lib.stm_allocate_with_finalizer(size)
tid = 42 + size
@@ -652,14 +689,20 @@
lib._set_type_id(o, tid)
return o
+SIZEOF_HASHTABLE_OBJ = 16 + lib.SIZEOF_MYOBJ
def stm_allocate_hashtable():
o = lib.stm_allocate(16)
+ assert is_in_nursery(o)
tid = 421419
lib._set_type_id(o, tid)
h = lib.stm_hashtable_create()
lib._set_hashtable(o, h)
return o
+def hashtable_lookup(hto, ht, idx):
+ return ffi.cast("object_t*",
+ lib.stm_hashtable_lookup(hto, ht, idx))
+
def get_hashtable(o):
assert lib._get_type_id(o) == 421419
h = lib._get_hashtable(o)
@@ -897,6 +940,17 @@
def switch_to_segment(self, seg_num):
lib._stm_test_switch_segment(seg_num)
+ def push_roots(self, os):
+ for o in os:
+ self.push_root(o)
+ self._last_push_all = os
+
+ def pop_roots(self):
+ os = self._last_push_all
+ self._last_push_all = None
+ return list(reversed([self.pop_root() for _ in os]))
+
+
def push_root(self, o):
assert ffi.typeof(o) == ffi.typeof("object_t *")
tl = self.tls[self.current_thread]
diff --git a/c8/test/test_finalizer.py b/c8/test/test_finalizer.py
--- a/c8/test/test_finalizer.py
+++ b/c8/test/test_finalizer.py
@@ -139,6 +139,7 @@
self.expect_finalized([lp1], from_tlnum=0)
+
class TestRegularFinalizer(BaseTest):
expect_content_character = None
run_major_collect_in_finalizer = False
diff --git a/c8/test/test_gcpage.py b/c8/test/test_gcpage.py
--- a/c8/test/test_gcpage.py
+++ b/c8/test/test_gcpage.py
@@ -325,9 +325,10 @@
actual_big = (big + 15 ) & ~15
self.start_transaction()
- assert lib._stm_total_allocated() == 64 + (actual_big + LMO) # large malloc'd
+ # 4096 for 1 page of smallmalloc:
+ assert lib._stm_total_allocated() == 4096 + (actual_big + LMO) # large malloc'd
stm_major_collect()
- assert lib._stm_total_allocated() == 64 + (actual_big + LMO) # large malloc'd
+ assert lib._stm_total_allocated() == 4096 + (actual_big + LMO) # large malloc'd
self.commit_transaction()
def test_bug(self):
@@ -376,19 +377,19 @@
assert lib._stm_total_allocated() == 0
self.push_root(new)
stm_minor_collect()
- assert lib._stm_total_allocated() == 16
+ assert lib._stm_total_allocated() == 4096
new = self.pop_root()
assert not is_in_nursery(new)
stm_minor_collect()
- assert lib._stm_total_allocated() == 16
+ assert lib._stm_total_allocated() == 4096
stm_major_collect()
assert lib._stm_total_allocated() == 0
def test_mixed_major_collections(self):
import random
- obj_sizes = [16, 48, 1024, 1000*8]
+ obj_sizes = [1024, 1000*8]
self.start_transaction()
random.seed(123)
@@ -398,11 +399,7 @@
NOBJS = 100
for _ in range(NOBJS):
osize = random.choice(obj_sizes)
- is_small = osize <= GC_LAST_SMALL_SIZE
- if is_small:
- allocated += osize
- else:
- allocated += osize + LMO
+ allocated += osize + LMO
o = stm_allocate(osize)
self.push_root(o)
diff --git a/c8/test/test_hashtable.py b/c8/test/test_hashtable.py
--- a/c8/test/test_hashtable.py
+++ b/c8/test/test_hashtable.py
@@ -23,15 +23,17 @@
def htitems(o):
h = get_hashtable(o)
upper_bound = lib.stm_hashtable_length_upper_bound(h)
- entries = ffi.new("stm_hashtable_entry_t *[]", upper_bound)
- count = lib.stm_hashtable_list(o, h, entries)
+ entries = stm_allocate_refs(upper_bound)
+ count = lib._stm_hashtable_list(o, h, entries)
assert count <= upper_bound
- return [(lib._get_entry_index(entries[i]),
- lib._get_entry_object(entries[i])) for i in range(count)]
+
+ return [(lib._get_entry_index(ffi.cast("stm_hashtable_entry_t *", stm_get_ref(entries, i))),
+ lib._get_entry_object(ffi.cast("stm_hashtable_entry_t *", stm_get_ref(entries, i))))
+ for i in range(count)]
def htlen(o):
h = get_hashtable(o)
- count = lib.stm_hashtable_list(o, h, ffi.NULL)
+ count = lib._stm_hashtable_list(o, h, ffi.NULL)
return count
@@ -352,6 +354,111 @@
stm_major_collect() # to get rid of the hashtable object
+ def test_new_entry_if_nursery_full(self):
+ self.start_transaction()
+ tl0 = self.tls[self.current_thread]
+ # make sure we fill the nursery *exactly* so that
+ # the last entry allocation triggers a minor GC
+ # and needs to allocate preexisting outside the nursery:
+ SMALL = 24 + lib.SIZEOF_MYOBJ
+ assert (NURSERY_SIZE - SIZEOF_HASHTABLE_OBJ) % SMALL < SIZEOF_HASHTABLE_ENTRY
+ to_alloc = (NURSERY_SIZE - SIZEOF_HASHTABLE_OBJ) // SMALL
+ for i in range(to_alloc):
+ stm_allocate(SMALL)
+ h = self.allocate_hashtable()
+ assert is_in_nursery(h)
+ self.push_root(h)
+ # would trigger minor GC when allocating 'entry' in nursery:
+ entry = hashtable_lookup(h, get_hashtable(h), 123)
+ h = self.pop_root()
+ self.push_root(h)
+ assert is_in_nursery(h) # didn't trigger minor-gc, since entry allocated outside
+ assert not is_in_nursery(entry)
+ assert htget(h, 123) == ffi.NULL
+ htset(h, 123, h, tl0)
+
+ # stm_write(h) - the whole thing may be fixed also by ensuring
+ # the hashtable gets retraced in minor-GC if stm_hashtable_write_entry
+ # detects the 'entry' to be young (and hobj being old)
+
+ stm_minor_collect()
+ h = self.pop_root()
+ assert htget(h, 123) == h
+ entry2 = hashtable_lookup(h, get_hashtable(h), 123)
+ assert entry == entry2
+ assert not is_in_nursery(h)
+ assert not is_in_nursery(entry2)
+
+ # get rid of ht:
+ self.commit_transaction()
+ self.start_transaction()
+ stm_major_collect()
+ self.commit_transaction()
+
+ def test_dont_lose_entry(self):
+ self.start_transaction()
+ h = self.allocate_hashtable()
+ self.push_root(h)
+ stm_minor_collect()
+ h = self.pop_root()
+ self.push_root(h)
+ # produce entries:
+ K = 300
+ for i in range(K):
+ hashtable_lookup(h, get_hashtable(h), i)
+
+ table = lib._get_hashtable_table(get_hashtable(h))
+ entry = hashtable_lookup(h, get_hashtable(h), K)
+ self.push_root(entry)
+ stm_major_collect()
+ entry2 = hashtable_lookup(h, get_hashtable(h), K)
+ entry = self.pop_root()
+ assert table != lib._get_hashtable_table(get_hashtable(h)) # compacted
+ assert entry == entry2
+
+ # get rid of ht:
+ self.pop_root()
+ self.commit_transaction()
+ self.start_transaction()
+ stm_major_collect()
+ self.commit_transaction()
+
+ def test_empty_entry_not_kept_alive(self):
+ self.start_transaction()
+ h = self.allocate_hashtable()
+ self.push_root(h)
+ stm_minor_collect()
+ h = self.pop_root()
+ self.push_root(h)
More information about the pypy-commit
mailing list