[pypy-commit] pypy default: hg merge portable-threadlocal

arigo noreply at buildbot.pypy.org
Thu Nov 27 10:56:16 CET 2014


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r74747:adc6ab4ae74d
Date: 2014-11-27 10:56 +0100
http://bitbucket.org/pypy/pypy/changeset/adc6ab4ae74d/

Log:	hg merge portable-threadlocal

	Change the way thread-locals are read from the JIT: found a way to
	do it portably, by passing around the pointer to a thread-local
	structure from outside the JIT and all the way inside.

diff too long, truncating to 2000 out of 2070 lines

diff --git a/pypy/module/_ssl/thread_lock.py b/pypy/module/_ssl/thread_lock.py
--- a/pypy/module/_ssl/thread_lock.py
+++ b/pypy/module/_ssl/thread_lock.py
@@ -24,12 +24,19 @@
 
 separate_module_source = """
 #include <openssl/crypto.h>
+#ifndef _WIN32
+# include <pthread.h>
+#endif
 
 static unsigned int _ssl_locks_count = 0;
 static struct RPyOpaque_ThreadLock *_ssl_locks;
 
 static unsigned long _ssl_thread_id_function(void) {
-    return RPyThreadGetIdent();
+#ifdef _WIN32
+    return (unsigned long)GetCurrentThreadId();
+#else
+    return (unsigned long)pthread_self();
+#endif
 }
 
 static void _ssl_thread_locking_function(int mode, int n, const char *file,
diff --git a/pypy/module/cpyext/src/pythread.c b/pypy/module/cpyext/src/pythread.c
--- a/pypy/module/cpyext/src/pythread.c
+++ b/pypy/module/cpyext/src/pythread.c
@@ -1,11 +1,18 @@
 #include <Python.h>
+#ifndef _WIN32
+# include <pthread.h>
+#endif
 #include "pythread.h"
 #include "src/thread.h"
 
 long
 PyThread_get_thread_ident(void)
 {
-    return RPyThreadGetIdent();
+#ifdef _WIN32
+    return (long)GetCurrentThreadId();
+#else
+    return (long)pthread_self();
+#endif
 }
 
 PyThread_type_lock
diff --git a/pypy/module/pypyjit/test_pypy_c/model.py b/pypy/module/pypyjit/test_pypy_c/model.py
--- a/pypy/module/pypyjit/test_pypy_c/model.py
+++ b/pypy/module/pypyjit/test_pypy_c/model.py
@@ -184,10 +184,10 @@
         matcher = OpMatcher(ops)
         return matcher.match(expected_src, **kwds)
 
-    def match_by_id(self, id, expected_src, **kwds):
+    def match_by_id(self, id, expected_src, ignore_ops=[], **kwds):
         ops = list(self.ops_by_id(id, **kwds))
         matcher = OpMatcher(ops, id)
-        return matcher.match(expected_src)
+        return matcher.match(expected_src, ignore_ops=ignore_ops)
 
 class PartialTraceWithIds(TraceWithIds):
     def __init__(self, trace, is_entry_bridge=False):
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -82,7 +82,7 @@
         assert log.opnames(ops) == []
         #
         assert entry_bridge.match_by_id('call', """
-            p38 = call(ConstClass(_ll_0_threadlocalref_getter___), descr=<Callr . EF=1 OS=5>)
+            p38 = call(ConstClass(_ll_1_threadlocalref_get__Ptr_GcStruct_objectLlT_Signed), #, descr=<Callr . i EF=1 OS=5>)
             p39 = getfield_gc(p38, descr=<FieldP pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref .*>)
             i40 = force_token()
             p41 = getfield_gc_pure(p38, descr=<FieldP pypy.interpreter.executioncontext.ExecutionContext.inst_w_tracefunc .*>)
@@ -444,7 +444,7 @@
             p26 = getfield_gc(p7, descr=<FieldP pypy.objspace.std.dictmultiobject.W_DictMultiObject.inst_strategy .*>)
             guard_value(p26, ConstPtr(ptr27), descr=...)
             guard_not_invalidated(descr=...)
-            p29 = call(ConstClass(_ll_0_threadlocalref_getter___), descr=<Callr . EF=1 OS=5>)
+            p29 = call(ConstClass(_ll_1_threadlocalref_get__Ptr_GcStruct_objectLlT_Signed), #, descr=<Callr . i EF=1 OS=5>)
             p30 = getfield_gc(p29, descr=<FieldP pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref .*>)
             p31 = force_token()
             p32 = getfield_gc_pure(p29, descr=<FieldP pypy.interpreter.executioncontext.ExecutionContext.inst_w_tracefunc .*>)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_ffi.py b/pypy/module/pypyjit/test_pypy_c/test_ffi.py
--- a/pypy/module/pypyjit/test_pypy_c/test_ffi.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_ffi.py
@@ -199,21 +199,16 @@
         ldexp_addr, res = log.result
         assert res == 8.0 * 300
         loop, = log.loops_by_filename(self.filepath)
-        if 'ConstClass(ldexp)' in repr(loop):   # e.g. OS/X
-            ldexp_addr = 'ConstClass(ldexp)'
         assert loop.match_by_id('cfficall', """
-            ...
-            f1 = call_release_gil(..., descr=<Callf 8 fi EF=6 OS=62>)
-            ...
-        """)
-        ops = loop.ops_by_id('cfficall')
-        for name in ['raw_malloc', 'raw_free']:
-            assert name not in str(ops)
-        for name in ['raw_load', 'raw_store', 'getarrayitem_raw', 'setarrayitem_raw']:
-            assert name not in log.opnames(ops)
-        # so far just check that call_release_gil() is produced.
-        # later, also check that the arguments to call_release_gil()
-        # are constants
+            setarrayitem_raw(i69, 0, i95, descr=<ArrayS 4>)    # write 'errno'
+            p96 = force_token()
+            setfield_gc(p0, p96, descr=<FieldP pypy.interpreter.pyframe.PyFrame.vable_token .>)
+            f97 = call_release_gil(i59, 1.0, 3, descr=<Callf 8 fi EF=6 OS=62>)
+            guard_not_forced(descr=...)
+            guard_no_exception(descr=...)
+            i98 = getarrayitem_raw(i69, 0, descr=<ArrayS 4>)   # read 'errno'
+            setfield_gc(p65, i98, descr=<FieldS pypy.interpreter.executioncontext.ExecutionContext.inst__cffi_saved_errno .>)
+        """, ignore_ops=['guard_not_invalidated'])
 
     def test_cffi_call_guard_not_forced_fails(self):
         # this is the test_pypy_c equivalent of
diff --git a/rpython/annotator/bookkeeper.py b/rpython/annotator/bookkeeper.py
--- a/rpython/annotator/bookkeeper.py
+++ b/rpython/annotator/bookkeeper.py
@@ -65,6 +65,7 @@
         self.external_class_cache = {}      # cache of ExternalType classes
 
         self.needs_generic_instantiate = {}
+        self.thread_local_fields = set()
 
         delayed_imports()
 
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -497,9 +497,11 @@
         if self.cpu.supports_floats:
             mc.VPOP([reg.value for reg in r.callee_saved_vfp_registers],
                                                                     cond=cond)
-        # pop all callee saved registers and IP to keep the alignment
+        # pop all callee saved registers.  This pops 'pc' last.
+        # It also pops the threadlocal_addr back into 'r1', but it
+        # is not needed any more and will be discarded.
         mc.POP([reg.value for reg in r.callee_restored_registers] +
-                                                       [r.ip.value], cond=cond)
+                                                       [r.r1.value], cond=cond)
         mc.BKPT()
 
     def gen_func_prolog(self):
@@ -508,11 +510,16 @@
         if self.cpu.supports_floats:
             stack_size += len(r.callee_saved_vfp_registers) * 2 * WORD
 
-        # push all callee saved registers and IP to keep the alignment
+        # push all callee saved registers including lr; and push r1 as
+        # well, which contains the threadlocal_addr argument.  Note that
+        # we're pushing a total of 10 words, which keeps the stack aligned.
         self.mc.PUSH([reg.value for reg in r.callee_saved_registers] +
-                                                        [r.ip.value])
+                                                        [r.r1.value])
+        self.saved_threadlocal_addr = 0   # at offset 0 from location 'sp'
         if self.cpu.supports_floats:
             self.mc.VPUSH([reg.value for reg in r.callee_saved_vfp_registers])
+            self.saved_threadlocal_addr += (
+                len(r.callee_saved_vfp_registers) * 2 * WORD)
         assert stack_size % 8 == 0 # ensure we keep alignment
 
         # set fp to point to the JITFRAME
@@ -952,16 +959,11 @@
             regalloc._check_invariants()
         self.mc.mark_op(None)  # end of the loop
 
-    def regalloc_emit_llong(self, op, arglocs, fcond, regalloc):
+    def regalloc_emit_extra(self, op, arglocs, fcond, regalloc):
+        # for calls to a function with a specifically-supported OS_xxx
         effectinfo = op.getdescr().get_extra_info()
         oopspecindex = effectinfo.oopspecindex
-        asm_llong_operations[oopspecindex](self, op, arglocs, regalloc, fcond)
-        return fcond
-
-    def regalloc_emit_math(self, op, arglocs, fcond, regalloc):
-        effectinfo = op.getdescr().get_extra_info()
-        oopspecindex = effectinfo.oopspecindex
-        asm_math_operations[oopspecindex](self, op, arglocs, regalloc, fcond)
+        asm_extra_operations[oopspecindex](self, op, arglocs, regalloc, fcond)
         return fcond
 
     def patch_trace(self, faildescr, looptoken, bridge_addr, regalloc):
@@ -1150,6 +1152,14 @@
         else:
             assert 0, 'unsupported case'
 
+    def _mov_raw_sp_to_loc(self, prev_loc, loc, cond=c.AL):
+        if loc.is_core_reg():
+            # load a value from 'SP + n'
+            assert prev_loc.value <= 0xFFF     # not too far
+            self.load_reg(self.mc, loc, r.sp, prev_loc.value, cond=cond)
+        else:
+            assert 0, 'unsupported case'
+
     def regalloc_mov(self, prev_loc, loc, cond=c.AL):
         """Moves a value from a previous location to some other location"""
         if prev_loc.is_imm():
@@ -1163,7 +1173,7 @@
         elif prev_loc.is_vfp_reg():
             self._mov_vfp_reg_to_loc(prev_loc, loc, cond)
         elif prev_loc.is_raw_sp():
-            assert 0, 'raw sp locs are not supported as source loc'
+            self._mov_raw_sp_to_loc(prev_loc, loc, cond)
         else:
             assert 0, 'unsupported case'
     mov_loc_loc = regalloc_mov
@@ -1509,22 +1519,17 @@
 
 asm_operations = [notimplemented_op] * (rop._LAST + 1)
 asm_operations_with_guard = [notimplemented_op_with_guard] * (rop._LAST + 1)
-asm_llong_operations = {}
-asm_math_operations = {}
+asm_extra_operations = {}
 
 for name, value in ResOpAssembler.__dict__.iteritems():
     if name.startswith('emit_guard_'):
         opname = name[len('emit_guard_'):]
         num = getattr(rop, opname.upper())
         asm_operations_with_guard[num] = value
-    elif name.startswith('emit_op_llong_'):
-        opname = name[len('emit_op_llong_'):]
-        num = getattr(EffectInfo, 'OS_LLONG_' + opname.upper())
-        asm_llong_operations[num] = value
-    elif name.startswith('emit_op_math_'):
-        opname = name[len('emit_op_math_'):]
-        num = getattr(EffectInfo, 'OS_MATH_' + opname.upper())
-        asm_math_operations[num] = value
+    elif name.startswith('emit_opx_'):
+        opname = name[len('emit_opx_'):]
+        num = getattr(EffectInfo, 'OS_' + opname.upper())
+        asm_extra_operations[num] = value
     elif name.startswith('emit_op_'):
         opname = name[len('emit_op_'):]
         num = getattr(rop, opname.upper())
diff --git a/rpython/jit/backend/arm/locations.py b/rpython/jit/backend/arm/locations.py
--- a/rpython/jit/backend/arm/locations.py
+++ b/rpython/jit/backend/arm/locations.py
@@ -46,7 +46,7 @@
     def is_core_reg(self):
         return True
 
-    def as_key(self):
+    def as_key(self):       # 0 <= as_key <= 15
         return self.value
 
 
@@ -64,7 +64,7 @@
     def is_vfp_reg(self):
         return True
 
-    def as_key(self):
+    def as_key(self):            # 20 <= as_key <= 35
         return self.value + 20
 
     def is_float(self):
@@ -115,8 +115,8 @@
     def is_imm_float(self):
         return True
 
-    def as_key(self):
-        return self.value
+    def as_key(self):          # a real address + 1
+        return self.value | 1
 
     def is_float(self):
         return True
@@ -148,7 +148,7 @@
     def is_stack(self):
         return True
 
-    def as_key(self):
+    def as_key(self):                # an aligned word + 10000
         return self.position + 10000
 
     def is_float(self):
@@ -174,6 +174,9 @@
     def is_float(self):
         return self.type == FLOAT
 
+    def as_key(self):            # a word >= 1000, and < 1000 + size of SP frame
+        return self.value + 1000
+
 
 def imm(i):
     return ImmLocation(i)
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -19,7 +19,7 @@
 from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
 from rpython.jit.backend.arm.jump import remap_frame_layout
 from rpython.jit.backend.arm.regalloc import TempBox
-from rpython.jit.backend.arm.locations import imm
+from rpython.jit.backend.arm.locations import imm, RawSPStackLocation
 from rpython.jit.backend.llsupport import symbolic
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
 from rpython.jit.backend.llsupport.descr import InteriorFieldDescr
@@ -982,7 +982,9 @@
         return fcond
 
     def _call_assembler_emit_call(self, addr, argloc, resloc):
-        self.simple_call(addr, [argloc], result_loc=resloc)
+        ofs = self.saved_threadlocal_addr
+        threadlocal_loc = RawSPStackLocation(ofs, INT)
+        self.simple_call(addr, [argloc, threadlocal_loc], result_loc=resloc)
 
     def _call_assembler_emit_helper_call(self, addr, arglocs, resloc):
         self.simple_call(addr, arglocs, result_loc=resloc)
@@ -1108,7 +1110,7 @@
 
     emit_op_float_neg = gen_emit_unary_float_op('float_neg', 'VNEG')
     emit_op_float_abs = gen_emit_unary_float_op('float_abs', 'VABS')
-    emit_op_math_sqrt = gen_emit_unary_float_op('math_sqrt', 'VSQRT')
+    emit_opx_math_sqrt = gen_emit_unary_float_op('math_sqrt', 'VSQRT')
 
     emit_op_float_lt = gen_emit_float_cmp_op('float_lt', c.VFP_LT)
     emit_op_float_le = gen_emit_float_cmp_op('float_le', c.VFP_LE)
@@ -1142,13 +1144,13 @@
 
     # the following five instructions are only ARMv7;
     # regalloc.py won't call them at all on ARMv6
-    emit_op_llong_add = gen_emit_float_op('llong_add', 'VADD_i64')
-    emit_op_llong_sub = gen_emit_float_op('llong_sub', 'VSUB_i64')
-    emit_op_llong_and = gen_emit_float_op('llong_and', 'VAND_i64')
-    emit_op_llong_or = gen_emit_float_op('llong_or', 'VORR_i64')
-    emit_op_llong_xor = gen_emit_float_op('llong_xor', 'VEOR_i64')
+    emit_opx_llong_add = gen_emit_float_op('llong_add', 'VADD_i64')
+    emit_opx_llong_sub = gen_emit_float_op('llong_sub', 'VSUB_i64')
+    emit_opx_llong_and = gen_emit_float_op('llong_and', 'VAND_i64')
+    emit_opx_llong_or = gen_emit_float_op('llong_or', 'VORR_i64')
+    emit_opx_llong_xor = gen_emit_float_op('llong_xor', 'VEOR_i64')
 
-    def emit_op_llong_to_int(self, op, arglocs, regalloc, fcond):
+    def emit_opx_llong_to_int(self, op, arglocs, regalloc, fcond):
         loc = arglocs[0]
         res = arglocs[1]
         assert loc.is_vfp_reg()
@@ -1282,3 +1284,11 @@
             regalloc.rm.possibly_free_var(length_box)
         regalloc.rm.possibly_free_var(dstaddr_box)
         return fcond
+
+    def emit_opx_threadlocalref_get(self, op, arglocs, regalloc, fcond):
+        ofs0, res = arglocs
+        assert ofs0.is_imm()
+        ofs = self.saved_threadlocal_addr
+        self.load_reg(self.mc, res, r.sp, ofs)
+        self.load_reg(self.mc, res, res, ofs0.value)
+        return fcond
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -373,11 +373,8 @@
         return gcmap
 
     # ------------------------------------------------------------
-    def perform_llong(self, op, args, fcond):
-        return self.assembler.regalloc_emit_llong(op, args, fcond, self)
-
-    def perform_math(self, op, args, fcond):
-        return self.assembler.regalloc_emit_math(op, args, self, fcond)
+    def perform_extra(self, op, args, fcond):
+        return self.assembler.regalloc_emit_extra(op, args, fcond, self)
 
     def force_spill_var(self, var):
         if var.type == FLOAT:
@@ -558,15 +555,19 @@
                             EffectInfo.OS_LLONG_XOR):
                 if self.cpu.cpuinfo.arch_version >= 7:
                     args = self._prepare_llong_binop_xx(op, fcond)
-                    self.perform_llong(op, args, fcond)
+                    self.perform_extra(op, args, fcond)
                     return
             elif oopspecindex == EffectInfo.OS_LLONG_TO_INT:
                 args = self._prepare_llong_to_int(op, fcond)
-                self.perform_llong(op, args, fcond)
+                self.perform_extra(op, args, fcond)
                 return
             elif oopspecindex == EffectInfo.OS_MATH_SQRT:
-                args = self.prepare_op_math_sqrt(op, fcond)
-                self.perform_math(op, args, fcond)
+                args = self._prepare_op_math_sqrt(op, fcond)
+                self.perform_extra(op, args, fcond)
+                return
+            elif oopspecindex == EffectInfo.OS_THREADLOCALREF_GET:
+                args = self._prepare_threadlocalref_get(op, fcond)
+                self.perform_extra(op, args, fcond)
                 return
             #elif oopspecindex == EffectInfo.OS_MATH_READ_TIMESTAMP:
             #    ...
@@ -624,6 +625,11 @@
         res = self.force_allocate_reg(op.result)
         return [loc0, res]
 
+    def _prepare_threadlocalref_get(self, op, fcond):
+        ofs0 = imm(op.getarg(1).getint())
+        res = self.force_allocate_reg(op.result)
+        return [ofs0, res]
+
     def _prepare_guard(self, op, args=None):
         if args is None:
             args = []
@@ -1284,7 +1290,7 @@
     prepare_guard_float_ge = prepare_float_op(guard=True,
                             float_result=False, name='prepare_guard_float_ge')
 
-    def prepare_op_math_sqrt(self, op, fcond):
+    def _prepare_op_math_sqrt(self, op, fcond):
         loc = self.make_sure_var_in_reg(op.getarg(1))
         self.possibly_free_vars_for_op(op)
         self.free_temp_vars()
diff --git a/rpython/jit/backend/llsupport/llmodel.py b/rpython/jit/backend/llsupport/llmodel.py
--- a/rpython/jit/backend/llsupport/llmodel.py
+++ b/rpython/jit/backend/llsupport/llmodel.py
@@ -217,7 +217,13 @@
         return lltype.cast_opaque_ptr(llmemory.GCREF, frame)
 
     def make_execute_token(self, *ARGS):
-        FUNCPTR = lltype.Ptr(lltype.FuncType([llmemory.GCREF],
+        # The JIT backend must generate functions with the following
+        # signature: it takes the jitframe and the threadlocal_addr
+        # as arguments, and it returns the (possibly reallocated) jitframe.
+        # The backend can optimize OS_THREADLOCALREF_GET calls to return a
+        # field of this threadlocal_addr, but only if 'translate_support_code':
+        # in untranslated tests, threadlocal_addr is a dummy NULL.
+        FUNCPTR = lltype.Ptr(lltype.FuncType([llmemory.GCREF, llmemory.Address],
                                              llmemory.GCREF))
 
         lst = [(i, history.getkind(ARG)[0]) for i, ARG in enumerate(ARGS)]
@@ -249,8 +255,13 @@
                     else:
                         assert kind == history.REF
                         self.set_ref_value(ll_frame, num, arg)
+                if self.translate_support_code:
+                    ll_threadlocal_addr = llop.threadlocalref_addr(
+                        llmemory.Address)
+                else:
+                    ll_threadlocal_addr = llmemory.NULL
                 llop.gc_writebarrier(lltype.Void, ll_frame)
-                ll_frame = func(ll_frame)
+                ll_frame = func(ll_frame, ll_threadlocal_addr)
             finally:
                 if not self.translate_support_code:
                     LLInterpreter.current_interpreter = prev_interpreter
diff --git a/rpython/jit/backend/llsupport/test/ztranslation_test.py b/rpython/jit/backend/llsupport/test/ztranslation_test.py
--- a/rpython/jit/backend/llsupport/test/ztranslation_test.py
+++ b/rpython/jit/backend/llsupport/test/ztranslation_test.py
@@ -26,8 +26,6 @@
         # - profiler
         # - full optimizer
         # - floats neg and abs
-        # - threadlocalref_get
-        # - get_errno, set_errno
         # - llexternal with macro=True
 
         class Frame(object):
@@ -36,10 +34,6 @@
             def __init__(self, i):
                 self.i = i
 
-        class Foo(object):
-            pass
-        t = ThreadLocalReference(Foo)
-
         eci = ExternalCompilationInfo(post_include_bits=['''
 #define pypy_my_fabs(x)  fabs(x)
 '''])
@@ -74,9 +68,6 @@
                 k = myabs1(myabs2(j))
                 if k - abs(j):  raise ValueError
                 if k - abs(-j): raise ValueError
-                if t.get().nine != 9: raise ValueError
-                rposix.set_errno(total)
-                if rposix.get_errno() != total: raise ValueError
             return chr(total % 253)
         #
         class Virt2(object):
@@ -104,12 +95,8 @@
             return res
         #
         def main(i, j):
-            foo = Foo()
-            foo.nine = -(i + j)
-            t.set(foo)
             a_char = f(i, j)
             a_float = libffi_stuff(i, j)
-            keepalive_until_here(foo)
             return ord(a_char) * 10 + int(a_float)
         expected = main(40, -49)
         res = self.meta_interp(main, [40, -49])
@@ -121,6 +108,7 @@
 
     def test_direct_assembler_call_translates(self):
         """Test CALL_ASSEMBLER and the recursion limit"""
+        # - also tests threadlocalref_get
         from rpython.rlib.rstackovf import StackOverflow
 
         class Thing(object):
@@ -138,6 +126,10 @@
 
         somewhere_else = SomewhereElse()
 
+        class Foo(object):
+            pass
+        t = ThreadLocalReference(Foo)
+
         def change(newthing):
             somewhere_else.frame.thing = newthing
 
@@ -163,6 +155,7 @@
                     nextval = 13
                 frame.thing = Thing(nextval + 1)
                 i += 1
+                if t.get().nine != 9: raise ValueError
             return frame.thing.val
 
         driver2 = JitDriver(greens = [], reds = ['n'])
@@ -184,13 +177,24 @@
                 n = portal2(n)
         assert portal2(10) == -9
 
+        def setup(value):
+            foo = Foo()
+            foo.nine = value
+            t.set(foo)
+            return foo
+
         def mainall(codeno, bound):
-            return main(codeno) + main2(bound)
+            foo = setup(bound + 8)
+            result = main(codeno) + main2(bound)
+            keepalive_until_here(foo)
+            return result
 
+        tmp_obj = setup(9)
+        expected_1 = main(0)
         res = self.meta_interp(mainall, [0, 1], inline=True,
                                policy=StopAtXPolicy(change))
         print hex(res)
-        assert res & 255 == main(0)
+        assert res & 255 == expected_1
         bound = res & ~255
         assert 1024 <= bound <= 131072
         assert bound & (bound-1) == 0       # a power of two
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -34,10 +34,16 @@
     FRAME_FIXED_SIZE = 19
     PASS_ON_MY_FRAME = 15
     JITFRAME_FIXED_SIZE = 6 + 8 * 2 # 6 GPR + 8 XMM * 2 WORDS/float
+    # 'threadlocal_addr' is passed as 2nd argument on the stack,
+    # and it can be left here for when it is needed
+    THREADLOCAL_OFS = (FRAME_FIXED_SIZE + 2) * WORD
 else:
-    # rbp + rbx + r12 + r13 + r14 + r15 + 13 extra words = 19
+    # rbp + rbx + r12 + r13 + r14 + r15 + threadlocal + 12 extra words = 19
     FRAME_FIXED_SIZE = 19
-    PASS_ON_MY_FRAME = 13
+    PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
+    # 'threadlocal_addr' is passed as 2nd argument in %esi,
+    # and is moved into this frame location
+    THREADLOCAL_OFS = (FRAME_FIXED_SIZE - 1) * WORD
 
 assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -18,7 +18,7 @@
 from rpython.jit.backend.llsupport.regalloc import (get_scale, valid_addressing_size)
 from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, WORD, IS_X86_64,
                                        JITFRAME_FIXED_SIZE, IS_X86_32,
-                                       PASS_ON_MY_FRAME)
+                                       PASS_ON_MY_FRAME, THREADLOCAL_OFS)
 from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
     xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
     r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
@@ -730,6 +730,7 @@
         self.mc.SUB_ri(esp.value, FRAME_FIXED_SIZE * WORD)
         self.mc.MOV_sr(PASS_ON_MY_FRAME * WORD, ebp.value)
         if IS_X86_64:
+            self.mc.MOV_sr(THREADLOCAL_OFS, esi.value)
             self.mc.MOV_rr(ebp.value, edi.value)
         else:
             self.mc.MOV_rs(ebp.value, (FRAME_FIXED_SIZE + 1) * WORD)
@@ -1969,7 +1970,8 @@
         self._emit_guard_not_forced(guard_token)
 
     def _call_assembler_emit_call(self, addr, argloc, _):
-        self.simple_call(addr, [argloc])
+        threadlocal_loc = RawEspLoc(THREADLOCAL_OFS, INT)
+        self.simple_call(addr, [argloc, threadlocal_loc])
 
     def _call_assembler_emit_helper_call(self, addr, arglocs, result_loc):
         self.simple_call(addr, arglocs, result_loc)
@@ -2334,48 +2336,16 @@
         assert isinstance(reg, RegLoc)
         self.mc.MOV_rr(reg.value, ebp.value)
 
-    def threadlocalref_get(self, op, resloc):
-        # this function is only called on Linux
-        from rpython.jit.codewriter.jitcode import ThreadLocalRefDescr
-        from rpython.jit.backend.x86 import stmtlocal
+    def threadlocalref_get(self, offset, resloc):
+        # This loads the stack location THREADLOCAL_OFS into a
+        # register, and then reads the word at the given offset.
+        # It is only supported if 'translate_support_code' is
+        # true; otherwise, the original call to the piece of assembler
+        # was done with a dummy NULL value.
+        assert self.cpu.translate_support_code
         assert isinstance(resloc, RegLoc)
-        effectinfo = op.getdescr().get_extra_info()
-        assert effectinfo.extradescrs is not None
-        ed = effectinfo.extradescrs[0]
-        assert isinstance(ed, ThreadLocalRefDescr)
-        addr1 = rffi.cast(lltype.Signed, ed.get_tlref_addr())
-        # 'addr1' is the address is the current thread, but we assume that
-        # it is a thread-local at a constant offset from %fs/%gs.
-        addr0 = stmtlocal.threadlocal_base()
-        addr = addr1 - addr0
-        assert rx86.fits_in_32bits(addr)
-        mc = self.mc
-        mc.writechar(stmtlocal.SEGMENT_TL)     # prefix: %fs or %gs
-        mc.MOV_rj(resloc.value, addr)          # memory read
-
-    def get_set_errno(self, op, loc, issue_a_write):
-        # this function is only called on Linux
-        from rpython.jit.backend.x86 import stmtlocal
-        addr = stmtlocal.get_errno_tl()
-        assert rx86.fits_in_32bits(addr)
-        mc = self.mc
-        mc.writechar(stmtlocal.SEGMENT_TL)     # prefix: %fs or %gs
-        # !!important: the *next* instruction must be the one using 'addr'!!
-        if issue_a_write:
-            if isinstance(loc, RegLoc):
-                mc.MOV32_jr(addr, loc.value)       # memory write from reg
-            else:
-                assert isinstance(loc, ImmedLoc)
-                newvalue = loc.value
-                newvalue = rffi.cast(rffi.INT, newvalue)
-                newvalue = rffi.cast(lltype.Signed, newvalue)
-                mc.MOV32_ji(addr, newvalue)        # memory write immediate
-        else:
-            assert isinstance(loc, RegLoc)
-            if IS_X86_32:
-                mc.MOV_rj(loc.value, addr)         # memory read
-            elif IS_X86_64:
-                mc.MOVSX32_rj(loc.value, addr)     # memory read, sign-extend
+        self.mc.MOV_rs(resloc.value, THREADLOCAL_OFS)
+        self.mc.MOV_rm(resloc.value, (resloc.value, offset))
 
     def genop_discard_zero_array(self, op, arglocs):
         (base_loc, startindex_loc, bytes_loc,
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -699,29 +699,11 @@
         loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(1))
         self.perform_math(op, [loc0], loc0)
 
-    TLREF_SUPPORT = sys.platform.startswith('linux')
-    ERRNO_SUPPORT = sys.platform.startswith('linux')
-
     def _consider_threadlocalref_get(self, op):
-        if self.TLREF_SUPPORT:
+        if self.translate_support_code:
+            offset = op.getarg(1).getint()   # getarg(0) == 'threadlocalref_get'
             resloc = self.force_allocate_reg(op.result)
-            self.assembler.threadlocalref_get(op, resloc)
-        else:
-            self._consider_call(op)
-
-    def _consider_get_errno(self, op):
-        if self.ERRNO_SUPPORT:
-            resloc = self.force_allocate_reg(op.result)
-            self.assembler.get_set_errno(op, resloc, issue_a_write=False)
-        else:
-            self._consider_call(op)
-
-    def _consider_set_errno(self, op):
-        if self.ERRNO_SUPPORT:
-            # op.getarg(0) is the function set_errno; op.getarg(1) is
-            # the new errno value
-            loc0 = self.rm.make_sure_var_in_reg(op.getarg(1))
-            self.assembler.get_set_errno(op, loc0, issue_a_write=True)
+            self.assembler.threadlocalref_get(offset, resloc)
         else:
             self._consider_call(op)
 
@@ -804,10 +786,6 @@
                 return self._consider_math_sqrt(op)
             if oopspecindex == EffectInfo.OS_THREADLOCALREF_GET:
                 return self._consider_threadlocalref_get(op)
-            if oopspecindex == EffectInfo.OS_GET_ERRNO:
-                return self._consider_get_errno(op)
-            if oopspecindex == EffectInfo.OS_SET_ERRNO:
-                return self._consider_set_errno(op)
             if oopspecindex == EffectInfo.OS_MATH_READ_TIMESTAMP:
                 return self._consider_math_read_timestamp(op)
         self._consider_call(op)
diff --git a/rpython/jit/backend/x86/stmtlocal.py b/rpython/jit/backend/x86/stmtlocal.py
deleted file mode 100644
--- a/rpython/jit/backend/x86/stmtlocal.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.translator.tool.cbuild import ExternalCompilationInfo
-from rpython.jit.backend.x86.arch import WORD
-
-SEGMENT_FS = '\x64'
-SEGMENT_GS = '\x65'
-
-if WORD == 4:
-    SEGMENT_TL = SEGMENT_GS
-    _instruction = "movl %%gs:0, %0"
-else:
-    SEGMENT_TL = SEGMENT_FS
-    _instruction = "movq %%fs:0, %0"
-
-eci = ExternalCompilationInfo(post_include_bits=['''
-#define RPY_STM_JIT  1
-static long pypy__threadlocal_base(void)
-{
-    /* XXX ONLY LINUX WITH GCC/CLANG FOR NOW XXX */
-    long result;
-    asm("%s" : "=r"(result));
-    return result;
-}
-static long pypy__get_errno_tl(void)
-{
-    return ((long)&errno) - pypy__threadlocal_base();
-}
-''' % _instruction])
-
-
-threadlocal_base = rffi.llexternal(
-    'pypy__threadlocal_base',
-    [], lltype.Signed,
-    compilation_info=eci,
-    _nowrapper=True,
-    ) #transactionsafe=True)
-
-get_errno_tl = rffi.llexternal(
-    'pypy__get_errno_tl',
-    [], lltype.Signed,
-    compilation_info=eci,
-    _nowrapper=True,
-    ) #transactionsafe=True)
diff --git a/rpython/jit/codewriter/effectinfo.py b/rpython/jit/codewriter/effectinfo.py
--- a/rpython/jit/codewriter/effectinfo.py
+++ b/rpython/jit/codewriter/effectinfo.py
@@ -23,8 +23,6 @@
     OS_SHRINK_ARRAY             = 3    # rgc.ll_shrink_array
     OS_DICT_LOOKUP              = 4    # ll_dict_lookup
     OS_THREADLOCALREF_GET       = 5    # llop.threadlocalref_get
-    OS_GET_ERRNO                = 6    # rposix.get_errno
-    OS_SET_ERRNO                = 7    # rposix.set_errno
     OS_NOT_IN_TRACE             = 8    # for calls not recorded in the jit trace
     #
     OS_STR_CONCAT               = 22   # "stroruni.concat"
diff --git a/rpython/jit/codewriter/jitcode.py b/rpython/jit/codewriter/jitcode.py
--- a/rpython/jit/codewriter/jitcode.py
+++ b/rpython/jit/codewriter/jitcode.py
@@ -117,26 +117,6 @@
         raise NotImplementedError
 
 
-class ThreadLocalRefDescr(AbstractDescr):
-    # A special descr used as the extradescr in a call to a
-    # threadlocalref_get function.  If the backend supports it,
-    # it can use this 'get_tlref_addr()' to get the address *in the
-    # current thread* of the thread-local variable.  If, on the current
-    # platform, the "__thread" variables are implemented as an offset
-    # from some base register (e.g. %fs on x86-64), then the backend will
-    # immediately substract the current value of the base register.
-    # This gives an offset from the base register, and this can be
-    # written down in an assembler instruction to load the "__thread"
-    # variable from anywhere.
-
-    def __init__(self, opaque_id):
-        from rpython.rtyper.lltypesystem.lloperation import llop
-        from rpython.rtyper.lltypesystem import llmemory
-        def get_tlref_addr():
-            return llop.threadlocalref_getaddr(llmemory.Address, opaque_id)
-        self.get_tlref_addr = get_tlref_addr
-
-
 class LiveVarsInfo(object):
     def __init__(self, live_i, live_r, live_f):
         self.live_i = live_i
diff --git a/rpython/jit/codewriter/jtransform.py b/rpython/jit/codewriter/jtransform.py
--- a/rpython/jit/codewriter/jtransform.py
+++ b/rpython/jit/codewriter/jtransform.py
@@ -439,8 +439,6 @@
         elif oopspec_name.endswith('dict.lookup'):
             # also ordereddict.lookup
             prepare = self._handle_dict_lookup_call
-        elif oopspec_name.startswith('rposix.'):
-            prepare = self._handle_rposix_call
         else:
             prepare = self.prepare_builtin_call
         try:
@@ -1979,16 +1977,6 @@
         else:
             raise NotImplementedError(oopspec_name)
 
-    def _handle_rposix_call(self, op, oopspec_name, args):
-        if oopspec_name == 'rposix.get_errno':
-            return self._handle_oopspec_call(op, args, EffectInfo.OS_GET_ERRNO,
-                                             EffectInfo.EF_CANNOT_RAISE)
-        elif oopspec_name == 'rposix.set_errno':
-            return self._handle_oopspec_call(op, args, EffectInfo.OS_SET_ERRNO,
-                                             EffectInfo.EF_CANNOT_RAISE)
-        else:
-            raise NotImplementedError(oopspec_name)
-
     def rewrite_op_ll_read_timestamp(self, op):
         op1 = self.prepare_builtin_call(op, "ll_read_timestamp", [])
         return self.handle_residual_call(op1,
@@ -2005,16 +1993,15 @@
         return [op0, op1]
 
     def rewrite_op_threadlocalref_get(self, op):
-        from rpython.jit.codewriter.jitcode import ThreadLocalRefDescr
-        opaqueid = op.args[0].value
-        op1 = self.prepare_builtin_call(op, 'threadlocalref_getter', [],
-                                        extra=(opaqueid,),
-                                        extrakey=opaqueid._obj)
-        extradescr = ThreadLocalRefDescr(opaqueid)
+        # only supports RESTYPE being exactly one word.
+        RESTYPE = op.result.concretetype
+        assert (RESTYPE in (lltype.Signed, lltype.Unsigned, llmemory.Address)
+                or isinstance(RESTYPE, lltype.Ptr))
+        c_offset, = op.args
+        op1 = self.prepare_builtin_call(op, 'threadlocalref_get', [c_offset])
         return self.handle_residual_call(op1,
             oopspecindex=EffectInfo.OS_THREADLOCALREF_GET,
-            extraeffect=EffectInfo.EF_LOOPINVARIANT,
-            extradescr=[extradescr])
+            extraeffect=EffectInfo.EF_LOOPINVARIANT)
 
 # ____________________________________________________________
 
diff --git a/rpython/jit/codewriter/support.py b/rpython/jit/codewriter/support.py
--- a/rpython/jit/codewriter/support.py
+++ b/rpython/jit/codewriter/support.py
@@ -702,10 +702,9 @@
     build_ll_1_raw_free_no_track_allocation = (
         build_raw_free_builder(track_allocation=False))
 
-    def build_ll_0_threadlocalref_getter(opaqueid):
-        def _ll_0_threadlocalref_getter():
-            return llop.threadlocalref_get(rclass.OBJECTPTR, opaqueid)
-        return _ll_0_threadlocalref_getter
+    def _ll_1_threadlocalref_get(TP, offset):
+        return llop.threadlocalref_get(TP, offset)
+    _ll_1_threadlocalref_get.need_result_type = 'exact'   # don't deref
 
     def _ll_1_weakref_create(obj):
         return llop.weakref_create(llmemory.WeakRefPtr, obj)
@@ -818,8 +817,18 @@
     s_result = lltype_to_annotation(ll_res)
     impl = setup_extra_builtin(rtyper, oopspec_name, len(args_s), extra)
     if getattr(impl, 'need_result_type', False):
-        bk = rtyper.annotator.bookkeeper
-        args_s.insert(0, annmodel.SomePBC([bk.getdesc(deref(ll_res))]))
+        if hasattr(rtyper, 'annotator'):
+            bk = rtyper.annotator.bookkeeper
+            ll_restype = ll_res
+            if impl.need_result_type != 'exact':
+                ll_restype = deref(ll_restype)
+            desc = bk.getdesc(ll_restype)
+        else:
+            class TestingDesc(object):
+                knowntype = int
+                pyobj = None
+            desc = TestingDesc()
+        args_s.insert(0, annmodel.SomePBC([desc]))
     #
     if hasattr(rtyper, 'annotator'):  # regular case
         mixlevelann = MixLevelHelperAnnotator(rtyper)
diff --git a/rpython/jit/codewriter/test/test_jtransform.py b/rpython/jit/codewriter/test/test_jtransform.py
--- a/rpython/jit/codewriter/test/test_jtransform.py
+++ b/rpython/jit/codewriter/test/test_jtransform.py
@@ -148,9 +148,7 @@
              EI.OS_UNIEQ_LENGTHOK:       ([PUNICODE, PUNICODE], INT),
              EI.OS_RAW_MALLOC_VARSIZE_CHAR: ([INT], ARRAYPTR),
              EI.OS_RAW_FREE:             ([ARRAYPTR], lltype.Void),
-             EI.OS_THREADLOCALREF_GET:   ([], rclass.OBJECTPTR),
-             EI.OS_GET_ERRNO:            ([], INT),
-             EI.OS_SET_ERRNO:            ([INT], lltype.Void),
+             EI.OS_THREADLOCALREF_GET:   ([INT], INT),   # for example
             }
             argtypes = argtypes[oopspecindex]
             assert argtypes[0] == [v.concretetype for v in op.args[1:]]
@@ -159,9 +157,7 @@
                 assert extraeffect == EI.EF_ELIDABLE_CAN_RAISE
             elif oopspecindex == EI.OS_RAW_MALLOC_VARSIZE_CHAR:
                 assert extraeffect == EI.EF_CAN_RAISE
-            elif oopspecindex in (EI.OS_RAW_FREE,
-                                  EI.OS_GET_ERRNO,
-                                  EI.OS_SET_ERRNO):
+            elif oopspecindex == EI.OS_RAW_FREE:
                 assert extraeffect == EI.EF_CANNOT_RAISE
             elif oopspecindex == EI.OS_THREADLOCALREF_GET:
                 assert extraeffect == EI.EF_LOOPINVARIANT
@@ -1347,53 +1343,20 @@
     assert op2 is None
 
 def test_threadlocalref_get():
-    from rpython.rtyper import rclass
-    from rpython.rlib.rthread import ThreadLocalReference
+    from rpython.rlib.rthread import ThreadLocalField
+    tlfield = ThreadLocalField(lltype.Signed, 'foobar_test_')
     OS_THREADLOCALREF_GET = effectinfo.EffectInfo.OS_THREADLOCALREF_GET
-    class Foo: pass
-    t = ThreadLocalReference(Foo)
-    v2 = varoftype(rclass.OBJECTPTR)
-    c_opaqueid = const(t.opaque_id)
-    op = SpaceOperation('threadlocalref_get', [c_opaqueid], v2)
+    c = const(tlfield.offset)
+    v = varoftype(lltype.Signed)
+    op = SpaceOperation('threadlocalref_get', [c], v)
     tr = Transformer(FakeCPU(), FakeBuiltinCallControl())
     op0 = tr.rewrite_operation(op)
-    assert op0.opname == 'residual_call_r_r'
-    assert op0.args[0].value == 'threadlocalref_getter' # pseudo-function as str
-    assert op0.args[1] == ListOfKind("ref", [])
-    assert op0.args[2] == 'calldescr-%d' % OS_THREADLOCALREF_GET
-    assert op0.result == v2
-
-def test_get_errno():
-    # test that the oopspec is present and correctly transformed
-    from rpython.rlib import rposix
-    FUNC = lltype.FuncType([], lltype.Signed)
-    func = lltype.functionptr(FUNC, 'get_errno', _callable=rposix.get_errno)
-    v3 = varoftype(lltype.Signed)
-    op = SpaceOperation('direct_call', [const(func)], v3)
-    tr = Transformer(FakeCPU(), FakeBuiltinCallControl())
-    op1 = tr.rewrite_operation(op)
-    assert op1.opname == 'residual_call_r_i'
-    assert op1.args[0].value == func
-    assert op1.args[1] == ListOfKind('ref', [])
-    assert op1.args[2] == 'calldescr-%d' % effectinfo.EffectInfo.OS_GET_ERRNO
-    assert op1.result == v3
-
-def test_set_errno():
-    # test that the oopspec is present and correctly transformed
-    from rpython.rlib import rposix
-    FUNC = lltype.FuncType([lltype.Signed], lltype.Void)
-    func = lltype.functionptr(FUNC, 'set_errno', _callable=rposix.set_errno)
-    v1 = varoftype(lltype.Signed)
-    v3 = varoftype(lltype.Void)
-    op = SpaceOperation('direct_call', [const(func), v1], v3)
-    tr = Transformer(FakeCPU(), FakeBuiltinCallControl())
-    op1 = tr.rewrite_operation(op)
-    assert op1.opname == 'residual_call_ir_v'
-    assert op1.args[0].value == func
-    assert op1.args[1] == ListOfKind('int', [v1])
-    assert op1.args[2] == ListOfKind('ref', [])
-    assert op1.args[3] == 'calldescr-%d' % effectinfo.EffectInfo.OS_SET_ERRNO
-    assert op1.result == v3
+    assert op0.opname == 'residual_call_ir_i'
+    assert op0.args[0].value == 'threadlocalref_get' # pseudo-function as str
+    assert op0.args[1] == ListOfKind("int", [c])
+    assert op0.args[2] == ListOfKind("ref", [])
+    assert op0.args[3] == 'calldescr-%d' % OS_THREADLOCALREF_GET
+    assert op0.result == v
 
 def test_unknown_operation():
     op = SpaceOperation('foobar', [], varoftype(lltype.Void))
diff --git a/rpython/jit/metainterp/test/test_threadlocal.py b/rpython/jit/metainterp/test/test_threadlocal.py
--- a/rpython/jit/metainterp/test/test_threadlocal.py
+++ b/rpython/jit/metainterp/test/test_threadlocal.py
@@ -1,29 +1,20 @@
 import py
+from rpython.rlib import rthread
 from rpython.jit.metainterp.test.support import LLJitMixin
-from rpython.rlib.rthread import ThreadLocalReference
-from rpython.rlib.jit import dont_look_inside
+from rpython.rtyper.lltypesystem import lltype
+from rpython.rtyper.lltypesystem.lloperation import llop
 
 
 class ThreadLocalTest(object):
 
     def test_threadlocalref_get(self):
-        class Foo:
-            pass
-        t = ThreadLocalReference(Foo)
-        x = Foo()
-
-        @dont_look_inside
-        def setup():
-            t.set(x)
+        tlfield = rthread.ThreadLocalField(lltype.Signed, 'foobar_test_')
 
         def f():
-            setup()
-            if t.get() is x:
-                return 42
-            return -666
+            return tlfield.getraw()
 
         res = self.interp_operations(f, [])
-        assert res == 42
+        assert res == 0x544c    # magic value returned by llinterp
 
 
 class TestLLtype(ThreadLocalTest, LLJitMixin):
diff --git a/rpython/memory/gctransform/framework.py b/rpython/memory/gctransform/framework.py
--- a/rpython/memory/gctransform/framework.py
+++ b/rpython/memory/gctransform/framework.py
@@ -1080,6 +1080,9 @@
             assert not livevars, "live GC var around %s!" % (hop.spaceop,)
             hop.genop("direct_call", [self.root_walker.thread_run_ptr])
             self.pop_roots(hop, livevars)
+        else:
+            hop.rename("gc_thread_run")     # keep it around for c/gc.py,
+                                            # unless handled specially above
 
     def gct_gc_thread_start(self, hop):
         assert self.translator.config.translation.thread
@@ -1095,6 +1098,7 @@
             assert not livevars, "live GC var around %s!" % (hop.spaceop,)
             hop.genop("direct_call", [self.root_walker.thread_die_ptr])
             self.pop_roots(hop, livevars)
+        hop.rename("gc_thread_die")     # keep it around for c/gc.py
 
     def gct_gc_thread_before_fork(self, hop):
         if (self.translator.config.translation.thread
diff --git a/rpython/memory/gctransform/shadowstack.py b/rpython/memory/gctransform/shadowstack.py
--- a/rpython/memory/gctransform/shadowstack.py
+++ b/rpython/memory/gctransform/shadowstack.py
@@ -132,8 +132,12 @@
             gcdata.root_stack_top/root_stack_base is the one corresponding
             to the current thread.
             No GC operation here, e.g. no mallocs or storing in a dict!
+
+            Note that here specifically we don't call rthread.get_ident(),
+            but rthread.get_or_make_ident().  We are possibly in a fresh
+            new thread, so we need to be careful.
             """
-            tid = get_tid()
+            tid = rthread.get_or_make_ident()
             if gcdata.active_tid != tid:
                 switch_shadow_stacks(tid)
 
diff --git a/rpython/rlib/rposix.py b/rpython/rlib/rposix.py
--- a/rpython/rlib/rposix.py
+++ b/rpython/rlib/rposix.py
@@ -95,12 +95,19 @@
 # the default wrapper for set_errno is not suitable for use in critical places
 # like around GIL handling logic, so we provide our own wrappers.
 
-@jit.oopspec("rposix.get_errno()")
 def get_errno():
+    if jit.we_are_jitted():
+        from rpython.rlib import rthread
+        perrno = rthread.tlfield_p_errno.getraw()
+        return intmask(perrno[0])
     return intmask(_get_errno())
 
-@jit.oopspec("rposix.set_errno(errno)")
 def set_errno(errno):
+    if jit.we_are_jitted():
+        from rpython.rlib import rthread
+        perrno = rthread.tlfield_p_errno.getraw()
+        perrno[0] = rffi.cast(INT, errno)
+        return
     _set_errno(rffi.cast(INT, errno))
 
 if os.name == 'nt':
diff --git a/rpython/rlib/rstack.py b/rpython/rlib/rstack.py
--- a/rpython/rlib/rstack.py
+++ b/rpython/rlib/rstack.py
@@ -1,6 +1,6 @@
 """
 This file defines utilities for manipulating the stack in an
-RPython-compliant way, intended mostly for use by the Stackless PyPy.
+RPython-compliant way.  It is mainly about the stack_check() function.
 """
 
 import py
@@ -10,18 +10,11 @@
 from rpython.rlib import rgc
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rtyper.lltypesystem.lloperation import llop
-from rpython.translator import cdir
-from rpython.translator.tool.cbuild import ExternalCompilationInfo
 
 # ____________________________________________________________
 
-srcdir = py.path.local(cdir) / 'src'
-compilation_info = ExternalCompilationInfo(
-        includes=['src/stack.h'],
-        separate_module_files=[srcdir / 'stack.c', srcdir / 'threadlocal.c'])
-
 def llexternal(name, args, res, _callable=None):
-    return rffi.llexternal(name, args, res, compilation_info=compilation_info,
+    return rffi.llexternal(name, args, res,
                            sandboxsafe=True, _nowrapper=True,
                            _callable=_callable)
 
diff --git a/rpython/rlib/rthread.py b/rpython/rlib/rthread.py
--- a/rpython/rlib/rthread.py
+++ b/rpython/rlib/rthread.py
@@ -5,8 +5,10 @@
 from rpython.rlib import jit, rgc
 from rpython.rlib.debug import ll_assert
 from rpython.rlib.objectmodel import we_are_translated, specialize
+from rpython.rlib.objectmodel import CDefinedIntSymbolic
 from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.rtyper.tool import rffi_platform
+from rpython.rtyper.extregistry import ExtRegistryEntry
 
 class RThreadError(Exception):
     pass
@@ -40,8 +42,6 @@
                             releasegil=True)  # release the GIL, but most
                                               # importantly, reacquire it
                                               # around the callback
-c_thread_get_ident = llexternal('RPyThreadGetIdent', [], rffi.LONG,
-                                _nowrapper=True)    # always call directly
 
 TLOCKP = rffi.COpaquePtr('struct RPyOpaque_ThreadLock',
                           compilation_info=eci)
@@ -83,9 +83,16 @@
 
 # wrappers...
 
-@jit.loop_invariant
 def get_ident():
-    return rffi.cast(lltype.Signed, c_thread_get_ident())
+    if we_are_translated():
+        return tlfield_thread_ident.getraw()
+    else:
+        import thread
+        return thread.get_ident()
+
+def get_or_make_ident():
+    assert we_are_translated()
+    return tlfield_thread_ident.get_or_make_raw()
 
 @specialize.arg(0)
 def start_new_thread(x, y):
@@ -265,17 +272,40 @@
 # KEEP THE REFERENCE ALIVE, THE GC DOES NOT FOLLOW THEM SO FAR!
 # We use _make_sure_does_not_move() to make sure the pointer will not move.
 
-ecitl = ExternalCompilationInfo(
-    includes = ['src/threadlocal.h'],
-    separate_module_files = [translator_c_dir / 'src' / 'threadlocal.c'])
-ensure_threadlocal = rffi.llexternal_use_eci(ecitl)
 
-class ThreadLocalReference(object):
+class ThreadLocalField(object):
+    def __init__(self, FIELDTYPE, fieldname):
+        "NOT_RPYTHON: must be prebuilt"
+        self.FIELDTYPE = FIELDTYPE
+        self.fieldname = fieldname
+        offset = CDefinedIntSymbolic('RPY_TLOFS_%s' % self.fieldname,
+                                     default='?')
+        self.offset = offset
+
+        def getraw():
+            _threadlocalref_seeme(self)
+            return llop.threadlocalref_get(FIELDTYPE, offset)
+
+        def get_or_make_raw():
+            _threadlocalref_seeme(self)
+            addr = llop.threadlocalref_addr(llmemory.Address)
+            return llop.raw_load(FIELDTYPE, addr, offset)
+
+        def setraw(value):
+            _threadlocalref_seeme(self)
+            addr = llop.threadlocalref_addr(llmemory.Address)
+            llop.raw_store(lltype.Void, addr, offset, value)
+
+        self.getraw = getraw
+        self.get_or_make_raw = get_or_make_raw
+        self.setraw = setraw
+
+    def _freeze_(self):
+        return True
+
+
+class ThreadLocalReference(ThreadLocalField):
     _COUNT = 1
-    OPAQUEID = lltype.OpaqueType("ThreadLocalRef",
-                                 hints={"threadlocalref": True,
-                                        "external": "C",
-                                        "c_name": "RPyThreadStaticTLS"})
 
     def __init__(self, Cls):
         "NOT_RPYTHON: must be prebuilt"
@@ -284,15 +314,16 @@
         self.local = thread._local()      # <- NOT_RPYTHON
         unique_id = ThreadLocalReference._COUNT
         ThreadLocalReference._COUNT += 1
-        opaque_id = lltype.opaqueptr(ThreadLocalReference.OPAQUEID,
-                                     'tlref%d' % unique_id)
-        self.opaque_id = opaque_id
+        ThreadLocalField.__init__(self, lltype.Signed, 'tlref%d' % unique_id)
+        setraw = self.setraw
+        offset = self.offset
 
         def get():
             if we_are_translated():
                 from rpython.rtyper import rclass
                 from rpython.rtyper.annlowlevel import cast_base_ptr_to_instance
-                ptr = llop.threadlocalref_get(rclass.OBJECTPTR, opaque_id)
+                _threadlocalref_seeme(self)
+                ptr = llop.threadlocalref_get(rclass.OBJECTPTR, offset)
                 return cast_base_ptr_to_instance(Cls, ptr)
             else:
                 return getattr(self.local, 'value', None)
@@ -301,21 +332,34 @@
         def set(value):
             assert isinstance(value, Cls) or value is None
             if we_are_translated():
-                from rpython.rtyper.annlowlevel import cast_instance_to_base_ptr
+                from rpython.rtyper.annlowlevel import cast_instance_to_gcref
                 from rpython.rlib.rgc import _make_sure_does_not_move
                 from rpython.rlib.objectmodel import running_on_llinterp
-                ptr = cast_instance_to_base_ptr(value)
+                gcref = cast_instance_to_gcref(value)
                 if not running_on_llinterp:
-                    gcref = lltype.cast_opaque_ptr(llmemory.GCREF, ptr)
                     if gcref:
                         _make_sure_does_not_move(gcref)
-                llop.threadlocalref_set(lltype.Void, opaque_id, ptr)
-                ensure_threadlocal()
+                value = lltype.cast_ptr_to_int(gcref)
+                setraw(value)
             else:
                 self.local.value = value
 
         self.get = get
         self.set = set
 
-    def _freeze_(self):
-        return True
+
+tlfield_thread_ident = ThreadLocalField(lltype.Signed, "thread_ident")
+tlfield_p_errno = ThreadLocalField(rffi.CArrayPtr(rffi.INT), "p_errno")
+
+def _threadlocalref_seeme(field):
+    "NOT_RPYTHON"
+
+class _Entry(ExtRegistryEntry):
+    _about_ = _threadlocalref_seeme
+
+    def compute_result_annotation(self, s_field):
+        field = s_field.const
+        self.bookkeeper.thread_local_fields.add(field)
+
+    def specialize_call(self, hop):
+        hop.exception_cannot_occur()
diff --git a/rpython/rlib/test/test_rthread.py b/rpython/rlib/test/test_rthread.py
--- a/rpython/rlib/test/test_rthread.py
+++ b/rpython/rlib/test/test_rthread.py
@@ -47,6 +47,10 @@
     time.sleep(0.5)
     assert results == [True] * 15
 
+def test_get_ident():
+    import thread
+    assert get_ident() == thread.get_ident()
+
 
 class AbstractThreadTests(AbstractGCTestClass):
     use_threads = True
diff --git a/rpython/rtyper/llinterp.py b/rpython/rtyper/llinterp.py
--- a/rpython/rtyper/llinterp.py
+++ b/rpython/rtyper/llinterp.py
@@ -919,19 +919,14 @@
     def op_stack_current(self):
         return 0
 
-    def op_threadlocalref_set(self, key, value):
-        try:
-            d = self.llinterpreter.tlrefsdict
-        except AttributeError:
-            d = self.llinterpreter.tlrefsdict = {}
-        d[key._obj] = value
+    def op_threadlocalref_addr(self):
+        raise NotImplementedError("threadlocalref_addr")
 
-    def op_threadlocalref_get(self, key):
-        d = self.llinterpreter.tlrefsdict
-        return d[key._obj]
-
-    def op_threadlocalref_getaddr(self, key):
-        raise NotImplementedError("threadlocalref_getaddr")
+    def op_threadlocalref_get(self, offset):
+        if (type(offset) is CDefinedIntSymbolic and
+                offset.expr == 'RPY_TLOFS_foobar_test_'):   # used in tests
+            return 0x544c
+        raise NotImplementedError("threadlocalref_get")
 
     # __________________________________________________________
     # operations on addresses
@@ -978,6 +973,9 @@
             ll_p = rffi.cast(rffi.CArrayPtr(RESTYPE),
                              rffi.ptradd(ll_p, offset))
             value = ll_p[0]
+        ## elif getattr(addr, 'is_fake_thread_local_addr', False):
+        ##     assert type(offset) is CDefinedIntSymbolic
+        ##     value = self.llinterpreter.tlobj[offset.expr]
         else:
             assert offset.TYPE == RESTYPE
             value = getattr(addr, str(RESTYPE).lower())[offset.repeat]
@@ -998,6 +996,9 @@
             ll_p = rffi.cast(rffi.CArrayPtr(ARGTYPE),
                              rffi.ptradd(ll_p, offset))
             ll_p[0] = value
+        ## elif getattr(addr, 'is_fake_thread_local_addr', False):
+        ##     assert type(offset) is CDefinedIntSymbolic
+        ##     self.llinterpreter.tlobj[offset.expr] = value
         else:
             assert offset.TYPE == ARGTYPE
             getattr(addr, str(ARGTYPE).lower())[offset.repeat] = value
diff --git a/rpython/rtyper/lltypesystem/lloperation.py b/rpython/rtyper/lltypesystem/lloperation.py
--- a/rpython/rtyper/lltypesystem/lloperation.py
+++ b/rpython/rtyper/lltypesystem/lloperation.py
@@ -546,9 +546,8 @@
     'getslice':             LLOp(canraise=(Exception,)),
     'check_and_clear_exc':  LLOp(),
 
-    'threadlocalref_get':   LLOp(sideeffects=False),
-    'threadlocalref_getaddr': LLOp(sideeffects=False),
-    'threadlocalref_set':   LLOp(),
+    'threadlocalref_addr':  LLOp(sideeffects=False),  # get (or make) addr of tl
+    'threadlocalref_get':   LLOp(sideeffects=False),  # read field (no check)
 
     # __________ debugging __________
     'debug_view':           LLOp(),
diff --git a/rpython/rtyper/lltypesystem/test/test_llmemory.py b/rpython/rtyper/lltypesystem/test/test_llmemory.py
--- a/rpython/rtyper/lltypesystem/test/test_llmemory.py
+++ b/rpython/rtyper/lltypesystem/test/test_llmemory.py
@@ -649,3 +649,13 @@
     #assert cast_int_to_adr(i) == adr -- depends on ll2ctypes details
     i = cast_adr_to_int(NULL, mode="forced")
     assert is_valid_int(i) and i == 0
+
+def test_cast_gcref_to_int():
+    A = lltype.GcArray(Address)
+    def f():
+        ptr = lltype.malloc(A, 10)
+        gcref = lltype.cast_opaque_ptr(GCREF, ptr)
+        adr = lltype.cast_ptr_to_int(gcref)
+        assert adr == lltype.cast_ptr_to_int(ptr)
+    f()
+    interpret(f, [])
diff --git a/rpython/translator/c/funcgen.py b/rpython/translator/c/funcgen.py
--- a/rpython/translator/c/funcgen.py
+++ b/rpython/translator/c/funcgen.py
@@ -13,6 +13,7 @@
 from rpython.translator.backendopt.ssa import SSI_to_SSA
 from rpython.translator.backendopt.innerloop import find_inner_loops
 from rpython.tool.identity_dict import identity_dict
+from rpython.rlib.objectmodel import CDefinedIntSymbolic
 
 
 LOCALVAR = 'l_%s'
@@ -900,4 +901,21 @@
         else:
             return None    # use the default
 
+    def OP_THREADLOCALREF_GET(self, op):
+        typename = self.db.gettype(op.result.concretetype)
+        if isinstance(op.args[0], Constant):
+            assert isinstance(op.args[0].value, CDefinedIntSymbolic)
+            fieldname = op.args[0].value.expr
+            assert fieldname.startswith('RPY_TLOFS_')
+            fieldname = fieldname[10:]
+            return '%s = (%s)RPY_THREADLOCALREF_GET(%s);' % (
+                self.expr(op.result),
+                cdecl(typename, ''),
+                fieldname)
+        else:
+            return 'OP_THREADLOCALREF_GET_NONCONST(%s, %s, %s);' % (
+                cdecl(typename, ''),
+                self.expr(op.args[0]),
+                self.expr(op.result))
+
 assert not USESLOTS or '__dict__' not in dir(FunctionCodeGenerator)
diff --git a/rpython/translator/c/gc.py b/rpython/translator/c/gc.py
--- a/rpython/translator/c/gc.py
+++ b/rpython/translator/c/gc.py
@@ -71,13 +71,20 @@
         return ''
 
     def OP_GC_THREAD_RUN(self, funcgen, op):
-        return ''
+        # The gc transformer leaves this operation in the graphs
+        # in all cases except with framework+shadowstack.  In that
+        # case the operation is removed because redundant with
+        # rthread.get_or_make_ident().
+        return 'RPY_THREADLOCALREF_ENSURE();'
 
     def OP_GC_THREAD_START(self, funcgen, op):
         return ''
 
     def OP_GC_THREAD_DIE(self, funcgen, op):
-        return ''
+        # The gc transformer leaves this operation in the graphs
+        # (but may insert a call to a gcrootfinder-specific
+        # function just before).
+        return 'RPython_ThreadLocals_ThreadDie();'
 
     def OP_GC_THREAD_BEFORE_FORK(self, funcgen, op):
         return '%s = NULL;' % funcgen.expr(op.result)
diff --git a/rpython/translator/c/genc.py b/rpython/translator/c/genc.py
--- a/rpython/translator/c/genc.py
+++ b/rpython/translator/c/genc.py
@@ -703,8 +703,27 @@
     for node in structdeflist:
         for line in node.definition():
             print >> f, line
+    gen_threadlocal_structdef(f, database)
     print >> f, "#endif"
 
+def gen_threadlocal_structdef(f, database):
+    from rpython.translator.c.support import cdecl
+    print >> f
+    bk = database.translator.annotator.bookkeeper
+    fields = list(bk.thread_local_fields)
+    fields.sort(key=lambda field: field.fieldname)
+    for field in fields:
+        print >> f, ('#define RPY_TLOFS_%s  offsetof(' % field.fieldname +
+                     'struct pypy_threadlocal_s, %s)' % field.fieldname)
+    print >> f, 'struct pypy_threadlocal_s {'
+    print >> f, '\tint ready;'
+    print >> f, '\tchar *stack_end;'
+    for field in fields:
+        typename = database.gettype(field.FIELDTYPE)
+        print >> f, '\t%s;' % cdecl(typename, field.fieldname)
+    print >> f, '};'
+    print >> f
+
 def gen_forwarddecl(f, database):
     print >> f, '/***********************************************************/'
     print >> f, '/***  Forward declarations                               ***/'
@@ -730,6 +749,11 @@
     # generate the start-up code and put it into a function
     print >> f, 'char *RPython_StartupCode(void) {'
     print >> f, '\tchar *error = NULL;'
+
+    bk = database.translator.annotator.bookkeeper
+    if bk.thread_local_fields:
+        print >> f, '\tRPython_ThreadLocals_ProgramInit();'
+
     for line in database.gcpolicy.gc_startup_code():
         print >> f,"\t" + line
 
@@ -748,6 +772,7 @@
                 print >> f, '\tif (error) return error;'
             for line in lines:
                 print >> f, '\t'+line
+
     print >> f, '\treturn error;'
     print >> f, '}'
 
@@ -770,6 +795,8 @@
         srcdir / 'asm.c',
         srcdir / 'instrument.c',
         srcdir / 'int.c',
+        srcdir / 'stack.c',
+        srcdir / 'threadlocal.c',
     ]
     if _CYGWIN:
         files.append(srcdir / 'cygwin_wait.c')
diff --git a/rpython/translator/c/node.py b/rpython/translator/c/node.py
--- a/rpython/translator/c/node.py
+++ b/rpython/translator/c/node.py
@@ -966,30 +966,12 @@
                 args.append('0')
         yield 'RPyOpaque_SETUP_%s(%s);' % (T.tag, ', '.join(args))
 
-class ThreadLocalRefOpaqueNode(ContainerNode):
-    nodekind = 'tlrefopaque'
-
-    def basename(self):
-        return self.obj._name
-
-    def enum_dependencies(self):
-        return []
-
-    def initializationexpr(self, decoration=''):
-        return ['0']
-
-    def startupcode(self):
-        p = self.getptrname()
-        yield 'RPyThreadStaticTLS_Create(%s);' % (p,)
-
 
 def opaquenode_factory(db, T, obj):
     if T == RuntimeTypeInfo:
         return db.gcpolicy.rtti_node_factory()(db, T, obj)
     if T.hints.get("render_structure", False):
         return ExtType_OpaqueNode(db, T, obj)
-    if T.hints.get("threadlocalref", False):
-        return ThreadLocalRefOpaqueNode(db, T, obj)
     raise Exception("don't know about %r" % (T,))
 
 
diff --git a/rpython/translator/c/src/g_include.h b/rpython/translator/c/src/g_include.h
--- a/rpython/translator/c/src/g_include.h
+++ b/rpython/translator/c/src/g_include.h
@@ -19,6 +19,8 @@
 #include "src/address.h"
 #include "src/unichar.h"
 #include "src/llgroup.h"
+#include "src/stack.h"
+#include "src/threadlocal.h"
 
 #include "src/instrument.h"
 #include "src/asm.h"
diff --git a/rpython/translator/c/src/g_prerequisite.h b/rpython/translator/c/src/g_prerequisite.h
--- a/rpython/translator/c/src/g_prerequisite.h
+++ b/rpython/translator/c/src/g_prerequisite.h
@@ -23,6 +23,3 @@
 # define RPY_LENGTH0     1       /* array decl [0] are bad */
 # define RPY_DUMMY_VARLENGTH     /* nothing */
 #endif
-
-
-#include "src/threadlocal.h"
diff --git a/rpython/translator/c/src/stack.c b/rpython/translator/c/src/stack.c
--- a/rpython/translator/c/src/stack.c
+++ b/rpython/translator/c/src/stack.c
@@ -1,6 +1,8 @@
 /* Stack operation */
+#include "common_header.h"
+#include "structdef.h"       /* for struct pypy_threadlocal_s */
 #include <src/stack.h>
-#include <src/thread.h>
+#include <src/threadlocal.h>
 #include <stdio.h>
 
 
@@ -9,7 +11,6 @@
 char *_LLstacktoobig_stack_end = NULL;
 long _LLstacktoobig_stack_length = MAX_STACK_SIZE;
 char _LLstacktoobig_report_error = 1;
-static RPyThreadStaticTLS end_tls_key;
 
 void LL_stack_set_length_fraction(double fraction)
 {
@@ -20,6 +21,8 @@
 {
 	long diff, max_stack_size;
 	char *baseptr, *curptr = (char*)current;
+	char *tl;
+	struct pypy_threadlocal_s *tl1;
 
 	/* The stack_end variable is updated to match the current value
 	   if it is still 0 or if we later find a 'curptr' position
@@ -27,15 +30,9 @@
 	   thread-local storage, but we try to minimize its overhead by
 	   keeping a local copy in _LLstacktoobig_stack_end. */
 
-	if (_LLstacktoobig_stack_end == NULL) {
-		/* not initialized */
-		/* XXX We assume that initialization is performed early,
-		   when there is still only one thread running.  This
-		   allows us to ignore race conditions here */
-		RPyThreadStaticTLS_Create(&end_tls_key);
-	}
-
-	baseptr = (char *) RPyThreadStaticTLS_Get(end_tls_key);
+	OP_THREADLOCALREF_ADDR(tl);
+	tl1 = (struct pypy_threadlocal_s *)tl;
+	baseptr = tl1->stack_end;
 	max_stack_size = _LLstacktoobig_stack_length;
 	if (baseptr == NULL) {
 		/* first time we see this thread */
@@ -58,7 +55,7 @@
 
 	/* update the stack base pointer to the current value */
 	baseptr = curptr;
-	RPyThreadStaticTLS_Set(end_tls_key, baseptr);
+	tl1->stack_end = baseptr;
 	_LLstacktoobig_stack_end = baseptr;
 	return 0;
 }
diff --git a/rpython/translator/c/src/stack.h b/rpython/translator/c/src/stack.h
--- a/rpython/translator/c/src/stack.h
+++ b/rpython/translator/c/src/stack.h
@@ -2,14 +2,13 @@
 /************************************************************/
  /***  C header subsection: stack operations               ***/
 
+#include <src/precommondefs.h>
+
+
 #ifndef MAX_STACK_SIZE
 #    define MAX_STACK_SIZE (3 << 18)    /* 768 kb */
 #endif
 
-/* This include must be done in any case to initialise
- * the header dependencies early (winsock2, before windows.h).
- * It is needed to have RPyThreadStaticTLS, too. */
-#include "threadlocal.h"
 
 RPY_EXTERN char *_LLstacktoobig_stack_end;
 RPY_EXTERN long _LLstacktoobig_stack_length;
diff --git a/rpython/translator/c/src/support.h b/rpython/translator/c/src/support.h
--- a/rpython/translator/c/src/support.h
+++ b/rpython/translator/c/src/support.h
@@ -2,6 +2,9 @@
 /************************************************************/
  /***  C header subsection: support functions              ***/
 
+#ifndef _SRC_SUPPORT_H
+#define _SRC_SUPPORT_H
+
 #define RUNNING_ON_LLINTERP	0
 #define OP_JIT_RECORD_KNOWN_CLASS(i, c, r)  /* nothing */
 
@@ -65,3 +68,5 @@
 #  define RPyNLenItem(array, index)          ((array)->items[index])
 #  define RPyBareItem(array, index)          ((array)[index])
 #endif
+
+#endif  /* _SRC_SUPPORT_H */
diff --git a/rpython/translator/c/src/thread_nt.c b/rpython/translator/c/src/thread_nt.c
--- a/rpython/translator/c/src/thread_nt.c
+++ b/rpython/translator/c/src/thread_nt.c
@@ -26,15 +26,6 @@
 
 static long _pypythread_stacksize = 0;
 
-/*
- * Return the thread Id instead of an handle. The Id is said to uniquely
-   identify the thread in the system
- */
-long RPyThreadGetIdent()
-{
-  return GetCurrentThreadId();
-}
-
 static void
 bootstrap(void *call)
 {
@@ -42,7 +33,7 @@
 	/* copy callobj since other thread might free it before we're done */
 	void (*func)(void) = obj->func;
 
-	obj->id = RPyThreadGetIdent();
+	obj->id = GetCurrentThreadId();
 	ReleaseSemaphore(obj->done, 1, NULL);
 	func();
 }
diff --git a/rpython/translator/c/src/thread_nt.h b/rpython/translator/c/src/thread_nt.h
--- a/rpython/translator/c/src/thread_nt.h
+++ b/rpython/translator/c/src/thread_nt.h
@@ -13,8 +13,6 @@
 
 /* prototypes */
 RPY_EXTERN
-long RPyThreadGetIdent(void);
-RPY_EXTERN
 long RPyThreadStart(void (*func)(void));
 RPY_EXTERN
 int RPyThreadLockInit(struct RPyOpaque_ThreadLock *lock);
diff --git a/rpython/translator/c/src/thread_pthread.c b/rpython/translator/c/src/thread_pthread.c
--- a/rpython/translator/c/src/thread_pthread.c
+++ b/rpython/translator/c/src/thread_pthread.c
@@ -56,30 +56,6 @@
 # endif
 #endif
 
-/* XXX This implementation is considered (to quote Tim Peters) "inherently
-   hosed" because:
-     - It does not guarantee the promise that a non-zero integer is returned.
-     - The cast to long is inherently unsafe.
-     - It is not clear that the 'volatile' (for AIX?) and ugly casting in the
-       latter return statement (for Alpha OSF/1) are any longer necessary.
-*/
-long RPyThreadGetIdent(void)
-{
-	volatile pthread_t threadid;
-	/* Jump through some hoops for Alpha OSF/1 */
-	threadid = pthread_self();
-
-#ifdef __CYGWIN__
-	/* typedef __uint32_t pthread_t; */
-	return (long) threadid;
-#else
-	if (sizeof(pthread_t) <= sizeof(long))
-		return (long) threadid;
-	else
-		return (long) *(long *) &threadid;
-#endif
-}
-
 static long _pypythread_stacksize = 0;
 
 static void *bootstrap_pthread(void *func)
diff --git a/rpython/translator/c/src/thread_pthread.h b/rpython/translator/c/src/thread_pthread.h
--- a/rpython/translator/c/src/thread_pthread.h
+++ b/rpython/translator/c/src/thread_pthread.h
@@ -60,8 +60,6 @@
 /* prototypes */
 
 RPY_EXTERN
-long RPyThreadGetIdent(void);
-RPY_EXTERN
 long RPyThreadStart(void (*func)(void));
 RPY_EXTERN
 int RPyThreadLockInit(struct RPyOpaque_ThreadLock *lock);
diff --git a/rpython/translator/c/src/threadlocal.c b/rpython/translator/c/src/threadlocal.c
--- a/rpython/translator/c/src/threadlocal.c
+++ b/rpython/translator/c/src/threadlocal.c
@@ -1,28 +1,117 @@
+#include "common_header.h"
+#include "structdef.h"       /* for struct pypy_threadlocal_s */
 #include <stdio.h>
 #include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#ifndef _WIN32
+# include <pthread.h>
+#endif
 #include "src/threadlocal.h"
 
+
+static void _RPy_ThreadLocals_Init(void *p)
+{
+    memset(p, 0, sizeof(struct pypy_threadlocal_s));
+#ifdef RPY_TLOFS_p_errno
+    ((struct pypy_threadlocal_s *)p)->p_errno = &errno;
+#endif
+#ifdef RPY_TLOFS_thread_ident
+    ((struct pypy_threadlocal_s *)p)->thread_ident =
+#    ifdef _WIN32
+        GetCurrentThreadId();
+#    else
+        (long)pthread_self();    /* xxx This abuses pthread_self() by
+                  assuming it just returns an integer.  According to
+                  comments in CPython's source code, the platforms
+                  where it is not the case are rather old nowadays. */
+#    endif
+#endif
+    ((struct pypy_threadlocal_s *)p)->ready = 42;
+}
+
+
+/* ------------------------------------------------------------ */
+#ifdef USE___THREAD
+/* ------------------------------------------------------------ */
+
+
+/* in this situation, we always have one full 'struct pypy_threadlocal_s'
+   available, managed by gcc. */
+__thread struct pypy_threadlocal_s pypy_threadlocal;
+
+void RPython_ThreadLocals_ProgramInit(void)
+{
+    _RPy_ThreadLocals_Init(&pypy_threadlocal);
+}
+
+char *_RPython_ThreadLocals_Build(void)
+{
+    RPyAssert(pypy_threadlocal.ready == 0, "corrupted thread-local");
+    _RPy_ThreadLocals_Init(&pypy_threadlocal);
+    return (char *)&pypy_threadlocal;
+}
+
+void RPython_ThreadLocals_ThreadDie(void)
+{
+    memset(&pypy_threadlocal, 0xDD,
+           sizeof(struct pypy_threadlocal_s));  /* debug */
+    pypy_threadlocal.ready = 0;
+}
+
+
+/* ------------------------------------------------------------ */
+#else
+/* ------------------------------------------------------------ */
+
+
+/* this is the case where the 'struct pypy_threadlocal_s' is allocated
+   explicitly, with malloc()/free(), and attached to (a single) thread-
+   local key using the API of Windows or pthread. */
+
+pthread_key_t pypy_threadlocal_key;
+
+
+void RPython_ThreadLocals_ProgramInit(void)
+{
 #ifdef _WIN32
-
-void RPyThreadTLS_Create(RPyThreadTLS *result)
-{
-    *result = TlsAlloc();
-    if (*result == TLS_OUT_OF_INDEXES) {
+    pypy_threadlocal_key = TlsAlloc();
+    if (pypy_threadlocal_key == TLS_OUT_OF_INDEXES)
+#else
+    if (pthread_key_create(&pypy_threadlocal_key, NULL) != 0)
+#endif
+    {
         fprintf(stderr, "Internal RPython error: "
                         "out of thread-local storage indexes");
         abort();
     }
+    _RPython_ThreadLocals_Build();
 }
 
-#else
+char *_RPython_ThreadLocals_Build(void)
+{
+    void *p = malloc(sizeof(struct pypy_threadlocal_s));
+    if (!p) {
+        fprintf(stderr, "Internal RPython error: "
+                        "out of memory for the thread-local storage");
+        abort();
+    }
+    _RPy_ThreadLocals_Init(p);
+    _RPy_ThreadLocals_Set(p);
+    return (char *)p;
+}
 
-void RPyThreadTLS_Create(RPyThreadTLS *result)
+void RPython_ThreadLocals_ThreadDie(void)
 {
-    if (pthread_key_create(result, NULL) != 0) {
-        fprintf(stderr, "Internal RPython error: "
-                        "out of thread-local storage keys");
-        abort();
+    void *p = _RPy_ThreadLocals_Get();
+    if (p != NULL) {
+        _RPy_ThreadLocals_Set(NULL);
+        memset(p, 0xDD, sizeof(struct pypy_threadlocal_s));  /* debug */
+        free(p);
     }
 }
 
+
+/* ------------------------------------------------------------ */
 #endif
+/* ------------------------------------------------------------ */
diff --git a/rpython/translator/c/src/threadlocal.h b/rpython/translator/c/src/threadlocal.h
--- a/rpython/translator/c/src/threadlocal.h
+++ b/rpython/translator/c/src/threadlocal.h
@@ -2,51 +2,98 @@
 #ifndef _SRC_THREADLOCAL_H
 #define _SRC_THREADLOCAL_H
 
-#include <src/precommondefs.h>
+#include "src/precommondefs.h"
+#include "src/support.h"
 
 
+/* RPython_ThreadLocals_ProgramInit() is called once at program start-up. */
+RPY_EXTERN void RPython_ThreadLocals_ProgramInit(void);
+
+/* RPython_ThreadLocals_ThreadDie() is called in a thread that is about
+   to die. */
+RPY_EXTERN void RPython_ThreadLocals_ThreadDie(void);
+
+/* There are two llops: 'threadlocalref_addr' and 'threadlocalref_make'.
+   They both return the address of the thread-local structure (of the
+   C type 'struct pypy_threadlocal_s').  This header defines no
+   OP_THREADLOCALREF_MAKE(): it is OP_THREADLOCALREF_ADDR() that checks
+   if we have initialized this thread-local structure in the current
+   thread, and if not, calls the following helper. */
+RPY_EXTERN char *_RPython_ThreadLocals_Build(void);
+
+
+/* ------------------------------------------------------------ */
+#ifdef USE___THREAD
+/* ------------------------------------------------------------ */
+
+
+/* Use the '__thread' specifier, so far only on Linux */
+
+RPY_EXTERN __thread struct pypy_threadlocal_s pypy_threadlocal;
+
+#define OP_THREADLOCALREF_ADDR(r)               \
+    do {                                        \
+        r = (char *)&pypy_threadlocal;          \
+        if (pypy_threadlocal.ready != 42)       \
+            r = _RPython_ThreadLocals_Build();  \
+    } while (0)
+
+#define RPY_THREADLOCALREF_ENSURE()             \
+    if (pypy_threadlocal.ready != 42)           \
+        (void)_RPython_ThreadLocals_Build();
+
+#define RPY_THREADLOCALREF_GET(FIELD)   pypy_threadlocal.FIELD
+
+
+/* ------------------------------------------------------------ */
+#else
+/* ------------------------------------------------------------ */
+
+
+/* Don't use '__thread'. */
+
 #ifdef _WIN32
-
-#include <WinSock2.h>
-#include <windows.h>
-#define __thread __declspec(thread)
-typedef DWORD RPyThreadTLS;
-#define RPyThreadTLS_Get(key)		TlsGetValue(key)
-#define RPyThreadTLS_Set(key, value)	TlsSetValue(key, value)
-
+#  include <WinSock2.h>
+#  include <windows.h>
+#  define _RPy_ThreadLocals_Get()   TlsGetValue(pypy_threadlocal_key)
+#  define _RPy_ThreadLocals_Set(x)  TlsSetValue(pypy_threadlocal_key, x)
+typedef DWORD pthread_key_t;
 #else
-
-#include <pthread.h>
-typedef pthread_key_t RPyThreadTLS;
-#define RPyThreadTLS_Get(key)		pthread_getspecific(key)
-#define RPyThreadTLS_Set(key, value)	pthread_setspecific(key, value)
-
+#  include <pthread.h>
+#  define _RPy_ThreadLocals_Get()   pthread_getspecific(pypy_threadlocal_key)
+#  define _RPy_ThreadLocals_Set(x)  pthread_setspecific(pypy_threadlocal_key, x)
 #endif
 
+RPY_EXTERN pthread_key_t pypy_threadlocal_key;
 
-#ifdef USE___THREAD
 
-#define RPyThreadStaticTLS                  __thread void *
-#define RPyThreadStaticTLS_Create(tls)      (void)0
-#define RPyThreadStaticTLS_Get(tls)         tls
-#define RPyThreadStaticTLS_Set(tls, value)  tls = value
-#define OP_THREADLOCALREF_GETADDR(tlref, ptr)  ptr = tlref
+#define OP_THREADLOCALREF_ADDR(r)               \
+    do {                                        \
+        r = (char *)_RPy_ThreadLocals_Get();    \
+        if (!r)                                 \
+            r = _RPython_ThreadLocals_Build();  \
+    } while (0)
 
+#define RPY_THREADLOCALREF_ENSURE()             \
+    if (!_RPy_ThreadLocals_Get())               \
+        (void)_RPython_ThreadLocals_Build();
+
+#define RPY_THREADLOCALREF_GET(FIELD)           \
+    ((struct pypy_threadlocal_s *)_RPy_ThreadLocals_Get())->FIELD
+
+
+/* ------------------------------------------------------------ */
 #endif
+/* ------------------------------------------------------------ */
 
-#ifndef RPyThreadStaticTLS
 
-#define RPyThreadStaticTLS             RPyThreadTLS
-#define RPyThreadStaticTLS_Create(key) RPyThreadTLS_Create(key)
-#define RPyThreadStaticTLS_Get(key)    RPyThreadTLS_Get(key)
-#define RPyThreadStaticTLS_Set(key, value) RPyThreadTLS_Set(key, value)
-RPY_EXTERN void RPyThreadTLS_Create(RPyThreadTLS *result);
-
-#endif
-
-
-#define OP_THREADLOCALREF_SET(tlref, ptr, _) RPyThreadStaticTLS_Set(*tlref, ptr)
-#define OP_THREADLOCALREF_GET(tlref, ptr)   ptr = RPyThreadStaticTLS_Get(*tlref)
+/* only for the fall-back path in the JIT */
+#define OP_THREADLOCALREF_GET_NONCONST(RESTYPE, offset, r)      \
+    do {                                                        \
+        char *a;                                                \
+        OP_THREADLOCALREF_ADDR(a);                              \
+        r = *(RESTYPE *)(a + offset);                           \
+    } while (0)
 
 
 #endif /* _SRC_THREADLOCAL_H */
diff --git a/rpython/translator/c/test/test_standalone.py b/rpython/translator/c/test/test_standalone.py
--- a/rpython/translator/c/test/test_standalone.py
+++ b/rpython/translator/c/test/test_standalone.py
@@ -2,6 +2,7 @@
 import sys, os, re
 
 from rpython.config.translationoption import get_combined_translation_config
+from rpython.config.translationoption import SUPPORT__THREAD
 from rpython.rlib.objectmodel import keepalive_until_here
 from rpython.rlib.rarithmetic import r_longlong
 from rpython.rlib.debug import ll_assert, have_debug_prints, debug_flush
@@ -1026,11 +1027,12 @@
     gcrootfinder = 'shadowstack'
     config = None
 


More information about the pypy-commit mailing list