[pypy-commit] pypy faster-nested-scopes: merge default

cfbolz noreply at buildbot.pypy.org
Sun Aug 28 12:39:33 CEST 2011


Author: Carl Friedrich Bolz <cfbolz at gmx.de>
Branch: faster-nested-scopes
Changeset: r46861:e846103b0972
Date: 2011-08-27 22:01 +0200
http://bitbucket.org/pypy/pypy/changeset/e846103b0972/

Log:	merge default

diff too long, truncating to 10000 out of 12982 lines

diff --git a/lib_pypy/greenlet.py b/lib_pypy/greenlet.py
--- a/lib_pypy/greenlet.py
+++ b/lib_pypy/greenlet.py
@@ -1,1 +1,138 @@
-from _stackless import greenlet
+import _continuation, sys
+
+
+# ____________________________________________________________
+# Exceptions
+
+class GreenletExit(Exception):
+    """This special exception does not propagate to the parent greenlet; it
+can be used to kill a single greenlet."""
+
+error = _continuation.error
+
+# ____________________________________________________________
+# Helper function
+
+def getcurrent():
+    "Returns the current greenlet (i.e. the one which called this function)."
+    try:
+        return _tls.current
+    except AttributeError:
+        # first call in this thread: current == main
+        _green_create_main()
+        return _tls.current
+
+# ____________________________________________________________
+# The 'greenlet' class
+
+_continulet = _continuation.continulet
+
+class greenlet(_continulet):
+    getcurrent = staticmethod(getcurrent)
+    error = error
+    GreenletExit = GreenletExit
+    __main = False
+    __started = False
+
+    def __new__(cls, *args, **kwds):
+        self = _continulet.__new__(cls)
+        self.parent = getcurrent()
+        return self
+
+    def __init__(self, run=None, parent=None):
+        if run is not None:
+            self.run = run
+        if parent is not None:
+            self.parent = parent
+
+    def switch(self, *args):
+        "Switch execution to this greenlet, optionally passing the values "
+        "given as argument(s).  Returns the value passed when switching back."
+        return self.__switch(_continulet.switch, args)
+
+    def throw(self, typ=GreenletExit, val=None, tb=None):
+        "raise exception in greenlet, return value passed when switching back"
+        return self.__switch(_continulet.throw, typ, val, tb)
+
+    def __switch(target, unbound_method, *args):
+        current = getcurrent()
+        #
+        while not target:
+            if not target.__started:
+                _continulet.__init__(target, _greenlet_start, *args)
+                args = ()
+                target.__started = True
+                break
+            # already done, go to the parent instead
+            # (NB. infinite loop possible, but unlikely, unless you mess
+            # up the 'parent' explicitly.  Good enough, because a Ctrl-C
+            # will show that the program is caught in this loop here.)
+            target = target.parent
+        #
+        try:
+            if current.__main:
+                if target.__main:
+                    # switch from main to main
+                    if unbound_method == _continulet.throw:
+                        raise args[0], args[1], args[2]
+                    (args,) = args
+                else:
+                    # enter from main to target
+                    args = unbound_method(target, *args)
+            else:
+                if target.__main:
+                    # leave to go to target=main
+                    args = unbound_method(current, *args)
+                else:
+                    # switch from non-main to non-main
+                    args = unbound_method(current, *args, to=target)
+        except GreenletExit, e:
+            args = (e,)
+        finally:
+            _tls.current = current
+        #
+        if len(args) == 1:
+            return args[0]
+        else:
+            return args
+
+    def __nonzero__(self):
+        return self.__main or _continulet.is_pending(self)
+
+    @property
+    def dead(self):
+        return self.__started and not self
+
+    @property
+    def gr_frame(self):
+        raise NotImplementedError("attribute 'gr_frame' of greenlet objects")
+
+# ____________________________________________________________
+# Internal stuff
+
+try:
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
+
+_tls = _local()
+
+def _green_create_main():
+    # create the main greenlet for this thread
+    _tls.current = None
+    gmain = greenlet.__new__(greenlet)
+    gmain._greenlet__main = True
+    gmain._greenlet__started = True
+    assert gmain.parent is None
+    _tls.main = gmain
+    _tls.current = gmain
+
+def _greenlet_start(greenlet, args):
+    _tls.current = greenlet
+    try:
+        res = greenlet.run(*args)
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
+    return (res,)
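
The new lib_pypy/greenlet.py above reimplements the classic greenlet API as a
thin pure-Python layer over _continuation.continulet.  A minimal usage sketch
of the semantics it implements (illustrative only, not part of this changeset;
it assumes a PyPy built with the _continuation module):

    from greenlet import greenlet

    def add_one(x):
        # when run() returns, control switches back to the parent and the
        # return value becomes the result of the parent's switch() call
        return x + 1

    g = greenlet(run=add_one)
    print g.switch(41)      # starts the greenlet; prints 42
    print g.dead            # True: add_one has finished
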
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -33,7 +33,8 @@
      "struct", "_hashlib", "_md5", "_sha", "_minimal_curses", "cStringIO",
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "_bisect", "binascii", "_multiprocessing", '_warnings',
-     "_collections", "_multibytecodec", "micronumpy", "_ffi"]
+     "_collections", "_multibytecodec", "micronumpy", "_ffi",
+     "_continuation"]
 ))
 
 translation_modules = default_modules.copy()
@@ -99,6 +100,7 @@
     "_ssl"      : ["pypy.module._ssl.interp_ssl"],
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
+    "_continuation": ["pypy.rlib.rstacklet"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -28,10 +28,9 @@
 
 translation_optiondescription = OptionDescription(
         "translation", "Translation Options", [
-    BoolOption("stackless", "enable stackless features during compilation",
-               default=False, cmdline="--stackless",
-               requires=[("translation.type_system", "lltype"),
-                         ("translation.gcremovetypeptr", False)]),  # XXX?
+    BoolOption("continuation", "enable single-shot continuations",
+               default=False, cmdline="--continuation",
+               requires=[("translation.type_system", "lltype")]),
     ChoiceOption("type_system", "Type system to use when RTyping",
                  ["lltype", "ootype"], cmdline=None, default="lltype",
                  requires={
@@ -70,7 +69,8 @@
                      "statistics": [("translation.gctransformer", "framework")],
                      "generation": [("translation.gctransformer", "framework")],
                      "hybrid": [("translation.gctransformer", "framework")],
-                     "boehm": [("translation.gctransformer", "boehm")],
+                     "boehm": [("translation.gctransformer", "boehm"),
+                               ("translation.continuation", False)],  # breaks
                      "markcompact": [("translation.gctransformer", "framework")],
                      "minimark": [("translation.gctransformer", "framework")],
                      },
@@ -389,8 +389,6 @@
             config.translation.suggest(withsmallfuncsets=5)
         elif word == 'jit':
             config.translation.suggest(jit=True)
-            if config.translation.stackless:
-                raise NotImplementedError("JIT conflicts with stackless for now")
         elif word == 'removetypeptr':
             config.translation.suggest(gcremovetypeptr=True)
         else:
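
The translation option is renamed here from --stackless to --continuation; it
still requires the lltype type system and is now forced off by the Boehm GC.
A small sketch of what the command-line flag maps to (illustrative only; it
assumes the usual get_pypy_config() helper from pypy.config.pypyoption):

    from pypy.config.pypyoption import get_pypy_config

    config = get_pypy_config(translating=True)
    # --continuation on the translate.py command line simply sets this:
    config.translation.continuation = True   # needs type_system == "lltype"
    print config.translation.continuation    # True
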
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._continuation.txt
copy from pypy/doc/config/objspace.usemodules._stackless.txt
copy to pypy/doc/config/objspace.usemodules._continuation.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._continuation.txt
@@ -1,6 +1,4 @@
-Use the '_stackless' module. 
+Use the '_continuation' module. 
 
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Exposes the `continulet` app-level primitives.
+See also :config:`translation.continuation`.
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._stackless.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._stackless.txt
@@ -1,6 +1,1 @@
-Use the '_stackless' module. 
-
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Deprecated.
diff --git a/pypy/doc/config/translation.stackless.txt b/pypy/doc/config/translation.continuation.txt
rename from pypy/doc/config/translation.stackless.txt
rename to pypy/doc/config/translation.continuation.txt
--- a/pypy/doc/config/translation.stackless.txt
+++ b/pypy/doc/config/translation.continuation.txt
@@ -1,5 +1,2 @@
-Run the `stackless transform`_ on each generated graph, which enables the use
-of coroutines at RPython level and the "stackless" module when translating
-PyPy.
-
-.. _`stackless transform`: ../stackless.html
+Enable the use of a stackless-like primitive called "stacklet".
+In PyPy, this is exposed at app-level by the "_continuation" module.
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -704,7 +704,7 @@
 class TestPassThroughArguments_CALL_METHOD(TestPassThroughArguments):
 
     def setup_class(cls):
-        space = gettestobjspace(usemodules=('_stackless',), **{
+        space = gettestobjspace(usemodules=('itertools',), **{
             "objspace.opcodes.CALL_METHOD": True
             })
         cls.space = space
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -312,7 +312,7 @@
         token = history.getkind(getattr(S, fieldname))
         return self.getdescr(ofs, token[0], name=fieldname)
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         arg_types = []
         for ARG in ARGS:
             token = history.getkind(ARG)
@@ -326,7 +326,7 @@
         return self.getdescr(0, token[0], extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
         from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
@@ -522,7 +522,7 @@
         return FieldDescr.new(T1, fieldname)
 
     @staticmethod
-    def calldescrof(FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(FUNC, ARGS, RESULT, extrainfo):
         return StaticMethDescr.new(FUNC, ARGS, RESULT, extrainfo)
 
     @staticmethod
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -366,36 +366,92 @@
 
     def add_jit2gc_hooks(self, jit2gc):
         #
-        def collect_jit_stack_root(callback, gc, addr):
-            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
-                # common case
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return WORD
-            else:
-                # case of a MARKER followed by an assembler stack frame
-                follow_stack_frame_of_assembler(callback, gc, addr)
-                return 2 * WORD
+        # ---------------
+        # This is used to enumerate the shadowstack in the presence
+        # of the JIT.  It is also used by the stacklet support in
+        # rlib/_stacklet_shadowstack.  That's why it is written as
+        # an iterator that can also be used with a custom_trace.
         #
-        def follow_stack_frame_of_assembler(callback, gc, addr):
-            frame_addr = addr.signed[1]
-            addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
-            force_index = addr.signed[0]
-            if force_index < 0:
-                force_index = ~force_index
-            callshape = self._callshapes[force_index]
-            n = 0
-            while True:
-                offset = rffi.cast(lltype.Signed, callshape[n])
-                if offset == 0:
-                    break
-                addr = llmemory.cast_int_to_adr(frame_addr + offset)
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                n += 1
+        class RootIterator:
+            _alloc_flavor_ = "raw"
+
+            def next(iself, gc, next, range_highest):
+                # Return the "next" valid GC object's address.  This usually
+                # means just returning "next", until we reach "range_highest",
+                # except that we are skipping NULLs.  If "next" contains a
+                # MARKER instead, then we go into JIT-frame-lookup mode.
+                #
+                while True:
+                    #
+                    # If we are not iterating right now in a JIT frame
+                    if iself.frame_addr == 0:
+                        #
+                        # Look for the next shadowstack address that
+                        # contains a valid pointer
+                        while next != range_highest:
+                            if next.signed[0] == self.MARKER:
+                                break
+                            if gc.points_to_valid_gc_object(next):
+                                return next
+                            next += llmemory.sizeof(llmemory.Address)
+                        else:
+                            return llmemory.NULL     # done
+                        #
+                        # It's a JIT frame.  Save away 'next' for later, and
+                        # go into JIT-frame-exploring mode.
+                        next += llmemory.sizeof(llmemory.Address)
+                        frame_addr = next.signed[0]
+                        iself.saved_next = next
+                        iself.frame_addr = frame_addr
+                        addr = llmemory.cast_int_to_adr(frame_addr +
+                                                        self.force_index_ofs)
+                        addr = iself.translateptr(iself.context, addr)
+                        force_index = addr.signed[0]
+                        if force_index < 0:
+                            force_index = ~force_index
+                        # NB: the next line reads a still-alive _callshapes,
+                        # because we ensure that just before we called this
+                        # piece of assembler, we put on the (same) stack a
+                        # pointer to a loop_token that keeps the force_index
+                        # alive.
+                        callshape = self._callshapes[force_index]
+                    else:
+                        # Continuing to explore this JIT frame
+                        callshape = iself.callshape
+                    #
+                    # 'callshape' points to the next INT of the callshape.
+                    # If it's zero we are done with the JIT frame.
+                    while rffi.cast(lltype.Signed, callshape[0]) != 0:
+                        #
+                        # Non-zero: it's an offset inside the JIT frame.
+                        # Read it and increment 'callshape'.
+                        offset = rffi.cast(lltype.Signed, callshape[0])
+                        callshape = lltype.direct_ptradd(callshape, 1)
+                        addr = llmemory.cast_int_to_adr(iself.frame_addr +
+                                                        offset)
+                        addr = iself.translateptr(iself.context, addr)
+                        if gc.points_to_valid_gc_object(addr):
+                            #
+                            # The JIT frame contains a valid GC pointer at
+                            # this address (as opposed to NULL).  Save
+                            # 'callshape' for the next call, and return the
+                            # address.
+                            iself.callshape = callshape
+                            return addr
+                    #
+                    # Restore 'next' and loop back to the start.
+                    iself.frame_addr = 0
+                    next = iself.saved_next
+                    next += llmemory.sizeof(llmemory.Address)
+
+        # ---------------
         #
+        root_iterator = RootIterator()
+        root_iterator.frame_addr = 0
+        root_iterator.context = llmemory.NULL
+        root_iterator.translateptr = lambda context, addr: addr
         jit2gc.update({
-            'rootstackhook': collect_jit_stack_root,
+            'root_iterator': root_iterator,
             })
 
     def initialize(self):
@@ -550,7 +606,7 @@
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
             res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                  type_id, size, True,
+                                                  type_id, size,
                                                   has_finalizer, False)
             # In case the operation above failed, we are returning NULL
             # from this function to assembler.  There is also an RPython
@@ -575,7 +631,7 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 type_id, num_elem, self.array_basesize, itemsize,
-                self.array_length_ofs, True)
+                self.array_length_ofs)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -591,12 +647,12 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 str_type_id, length, str_basesize, str_itemsize,
-                str_ofs_length, True)
+                str_ofs_length)
         def malloc_unicode(length):
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                unicode_ofs_length, True)
+                unicode_ofs_length)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -622,7 +678,7 @@
             # also use it to allocate varsized objects.  The tid
             # and possibly the length are both set afterward.
             gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                        0, size, True, False, False)
+                                        0, size, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -254,10 +254,10 @@
         return ofs, size, sign
     unpack_arraydescr_size._always_inline_ = True
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         return get_call_descr(self.gc_ll_descr, ARGS, RESULT, extrainfo)
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo):
         from pypy.jit.backend.llsupport import ffisupport
         return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
                                                  extrainfo)
diff --git a/pypy/jit/backend/llsupport/test/test_gc.py b/pypy/jit/backend/llsupport/test/test_gc.py
--- a/pypy/jit/backend/llsupport/test/test_gc.py
+++ b/pypy/jit/backend/llsupport/test/test_gc.py
@@ -246,9 +246,8 @@
     def __init__(self):
         self.record = []
 
-    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size, can_collect,
+    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size,
                                   has_finalizer, contains_weakptr):
-        assert can_collect
         assert not contains_weakptr
         p = llmemory.raw_malloc(size)
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
@@ -258,8 +257,7 @@
         return p
 
     def do_malloc_varsize_clear(self, RESTYPE, type_id, length, size,
-                                itemsize, offset_to_length, can_collect):
-        assert can_collect
+                                itemsize, offset_to_length):
         p = llmemory.raw_malloc(size + itemsize * length)
         (p + offset_to_length).signed[0] = length
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -8,6 +8,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -96,7 +97,8 @@
             FUNC = self.FuncType(funcargs, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             ops = '[%s]\n' % arguments
@@ -148,7 +150,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -190,7 +193,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -268,7 +272,8 @@
                 else:
                     ARGS.append(lltype.Signed)
             FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+                EffectInfo.MOST_GENERAL)
             ops = '''
             [%s]
             f99 = call_assembler(%s, descr=called_looptoken)
@@ -337,7 +342,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -9,6 +9,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -445,7 +446,8 @@
             return chr(ord(c) + 1)
         FPTR = self.Ptr(self.FuncType([lltype.Char], lltype.Char))
         func_ptr = llhelper(FPTR, func)
-        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char)
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char,
+                                    EffectInfo.MOST_GENERAL)
         x = cpu.bh_call_i(self.get_funcbox(cpu, func_ptr).value,
                           calldescr, [ord('A')], None, None)
         assert x == ord('B')
@@ -458,7 +460,8 @@
                                           lltype.Float))
             func_ptr = llhelper(FPTR, func)
             FTP = deref(FPTR)
-            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT)
+            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             x = cpu.bh_call_f(self.get_funcbox(cpu, func_ptr).value,
                               calldescr,
                               [42], None, [longlong.getfloatstorage(3.5)])
@@ -486,13 +489,15 @@
             FUNC = deref(FPTR)
             funcbox = self.get_funcbox(cpu, func_ptr)
             # first, try it with the "normal" calldescr
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=calldescr)
             assert res.value == 2 * num
             # then, try it with the dynamic calldescr
-            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type)
+            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type,
+                                                    EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=dyn_calldescr)
@@ -507,7 +512,8 @@
             FUNC = self.FuncType([F] * 7 + [I] * 2 + [F] * 3, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = ([boxfloat(.1) for i in range(7)] +
                     [BoxInt(1), BoxInt(2), boxfloat(.2), boxfloat(.3),
@@ -529,7 +535,8 @@
 
         FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         args = range(16)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
@@ -552,7 +559,8 @@
             FPTR = self.Ptr(self.FuncType([TP] * nb_args, TP))
             func_ptr = llhelper(FPTR, func_ints)
             FUNC = deref(FPTR)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = [280-24*i for i in range(nb_args)]
             res = self.execute_operation(rop.CALL,
@@ -566,7 +574,8 @@
 
         FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
         res = self.execute_operation(rop.CALL, [funcbox, constfloat(1.5),
@@ -1589,7 +1598,8 @@
         '''
         FPTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
         fptr = llhelper(FPTR, func)
-        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                         EffectInfo.MOST_GENERAL)
 
         xtp = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
         xtp.subclassrange_min = 1
@@ -1807,7 +1817,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Void)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1850,7 +1861,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1895,7 +1907,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Float)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1941,7 +1954,8 @@
         cpu = self.cpu
         func_adr = llmemory.cast_ptr_to_adr(c_tolower.funcsym)
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
-        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint)
+        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint,
+                                            EffectInfo.MOST_GENERAL)
         i1 = BoxInt()
         i2 = BoxInt()
         tok = BoxInt()
@@ -1997,7 +2011,8 @@
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
         calldescr = cpu.calldescrof_dynamic([types.pointer, types_size_t,
                                              types_size_t, types.pointer],
-                                            types.void)
+                                            types.void,
+                                            EffectInfo.MOST_GENERAL)
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
@@ -2292,7 +2307,8 @@
         ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         for i in range(10):
             self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
@@ -2332,7 +2348,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2422,7 +2439,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2634,7 +2652,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
                                    calldescr, [value], None, None)
             assert x == expected, (
@@ -2667,7 +2686,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(self.cpu, f)
             res = self.execute_operation(rop.CALL, [funcbox, BoxInt(value)],
                                          'int', descr=calldescr)
@@ -2701,7 +2721,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         x = self.cpu.bh_call_f(self.get_funcbox(self.cpu, f).value,
                                calldescr, None, None, [value])
         assert x == expected
@@ -2728,7 +2749,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         res = self.execute_operation(rop.CALL, [funcbox, BoxFloat(value)],
                                      'float', descr=calldescr)
@@ -2756,7 +2778,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
         x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
@@ -2785,7 +2808,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
diff --git a/pypy/jit/backend/test/test_ll_random.py b/pypy/jit/backend/test/test_ll_random.py
--- a/pypy/jit/backend/test/test_ll_random.py
+++ b/pypy/jit/backend/test/test_ll_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxPtr, BoxInt
 from pypy.jit.metainterp.history import BasicFailDescr
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.llinterp import LLException
@@ -468,6 +469,10 @@
         exec code in d
         return subset, d['f'], vtableptr
 
+    def getcalldescr(self, builder, TP):
+        ef = EffectInfo.MOST_GENERAL
+        return builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT, ef)
+
 # 1. non raising call and guard_no_exception
 class CallOperation(BaseCallOperation):
     def produce_into(self, builder, r):
@@ -481,7 +486,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], None,
                           descr=BasicFailDescr())
@@ -501,7 +506,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         _, vtableptr = builder.get_random_structure_type_and_vtable(r)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(vtableptr), builder.cpu)
@@ -523,7 +528,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(exc), builder.cpu)
         op = ResOperation(rop.GUARD_EXCEPTION, [exc_box], BoxPtr(),
@@ -540,7 +545,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], BoxPtr(),
                           descr=BasicFailDescr())
@@ -559,7 +564,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         while True:
             _, vtableptr = builder.get_random_structure_type_and_vtable(r)
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -843,8 +843,8 @@
 
     def consider_call(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             if IS_X86_32:
                 # support for some of the llong operations,
                 # which only exist on x86-32
diff --git a/pypy/jit/backend/x86/test/test_gc_integration.py b/pypy/jit/backend/x86/test/test_gc_integration.py
--- a/pypy/jit/backend/x86/test/test_gc_integration.py
+++ b/pypy/jit/backend/x86/test/test_gc_integration.py
@@ -7,6 +7,7 @@
      BoxPtr, ConstPtr, TreeLoop
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.llsupport.gc import GcLLDescription
 from pypy.jit.backend.detect_cpu import getcpuclass
@@ -76,7 +77,8 @@
         for box in boxes:
             regalloc.rm.try_allocate_reg(box)
         TP = lltype.FuncType([], lltype.Signed)
-        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT,
+                                    EffectInfo.MOST_GENERAL)
         regalloc.rm._check_invariants()
         box = boxes[0]
         regalloc.position = 0
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -16,6 +16,7 @@
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
 from pypy.jit.codewriter import longlong
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.x86.rx86 import *
 
 def test_is_comparison_or_ovf_op():
@@ -92,7 +93,8 @@
     zd_addr = cpu.cast_int_to_adr(zero_division_tp)
     zero_division_error = llmemory.cast_adr_to_ptr(zd_addr,
                                             lltype.Ptr(rclass.OBJECT_VTABLE))
-    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                        EffectInfo.MOST_GENERAL)
 
     fdescr1 = BasicFailDescr(1)
     fdescr2 = BasicFailDescr(2)
@@ -115,9 +117,12 @@
     f2ptr = llhelper(F2PTR, f2)
     f10ptr = llhelper(F10PTR, f10)
 
-    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT)
-    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT)
-    f10_calldescr = cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT)
+    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f10_calldescr= cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
 
     namespace = locals().copy()
     type_system = 'lltype'
diff --git a/pypy/jit/codewriter/call.py b/pypy/jit/codewriter/call.py
--- a/pypy/jit/codewriter/call.py
+++ b/pypy/jit/codewriter/call.py
@@ -6,7 +6,7 @@
 from pypy.jit.codewriter import support
 from pypy.jit.codewriter.jitcode import JitCode
 from pypy.jit.codewriter.effectinfo import (VirtualizableAnalyzer,
-    QuasiImmutAnalyzer, CanReleaseGILAnalyzer, effectinfo_from_writeanalyze,
+    QuasiImmutAnalyzer, RandomEffectsAnalyzer, effectinfo_from_writeanalyze,
     EffectInfo, CallInfoCollection)
 from pypy.translator.simplify import get_funcobj, get_functype
 from pypy.rpython.lltypesystem import lltype, llmemory
@@ -31,7 +31,7 @@
             self.readwrite_analyzer = ReadWriteAnalyzer(translator)
             self.virtualizable_analyzer = VirtualizableAnalyzer(translator)
             self.quasiimmut_analyzer = QuasiImmutAnalyzer(translator)
-            self.canreleasegil_analyzer = CanReleaseGILAnalyzer(translator)
+            self.randomeffects_analyzer = RandomEffectsAnalyzer(translator)
         #
         for index, jd in enumerate(jitdrivers_sd):
             jd.index = index
@@ -187,7 +187,7 @@
             fnaddr = llmemory.cast_ptr_to_adr(fnptr)
         NON_VOID_ARGS = [ARG for ARG in FUNC.ARGS if ARG is not lltype.Void]
         calldescr = self.cpu.calldescrof(FUNC, tuple(NON_VOID_ARGS),
-                                         FUNC.RESULT)
+                                         FUNC.RESULT, EffectInfo.MOST_GENERAL)
         return (fnaddr, calldescr)
 
     def getcalldescr(self, op, oopspecindex=EffectInfo.OS_NONE,
@@ -219,9 +219,11 @@
                 assert not NON_VOID_ARGS, ("arguments not supported for "
                                            "loop-invariant function!")
         # build the extraeffect
-        can_release_gil = self.canreleasegil_analyzer.analyze(op)
-        # can_release_gil implies can_invalidate
-        can_invalidate = can_release_gil or self.quasiimmut_analyzer.analyze(op)
+        random_effects = self.randomeffects_analyzer.analyze(op)
+        if random_effects:
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+        # random_effects implies can_invalidate
+        can_invalidate = random_effects or self.quasiimmut_analyzer.analyze(op)
         if extraeffect is None:
             if self.virtualizable_analyzer.analyze(op):
                 extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
@@ -239,12 +241,10 @@
         #
         effectinfo = effectinfo_from_writeanalyze(
             self.readwrite_analyzer.analyze(op), self.cpu, extraeffect,
-            oopspecindex, can_invalidate, can_release_gil)
+            oopspecindex, can_invalidate)
         #
-        if oopspecindex != EffectInfo.OS_NONE:
-            assert effectinfo is not None
+        assert effectinfo is not None
         if elidable or loopinvariant:
-            assert effectinfo is not None
             assert extraeffect != EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
             # XXX this should also say assert not can_invalidate, but
             #     it can't because our analyzer is not good enough for now
@@ -264,8 +264,7 @@
 
     def calldescr_canraise(self, calldescr):
         effectinfo = calldescr.get_extra_info()
-        return (effectinfo is None or
-                effectinfo.extraeffect > EffectInfo.EF_CANNOT_RAISE)
+        return effectinfo.check_can_raise()
 
     def jitdriver_sd_from_portal_graph(self, graph):
         for jd in self.jitdrivers_sd:
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -15,6 +15,7 @@
     EF_ELIDABLE_CAN_RAISE              = 3 #elidable function (but can raise)
     EF_CAN_RAISE                       = 4 #normal function (can raise)
     EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE = 5 #can raise and force virtualizables
+    EF_RANDOM_EFFECTS                  = 6 #can do whatever
 
     # the 'oopspecindex' field is one of the following values:
     OS_NONE                     = 0    # normal case, no oopspec
@@ -80,17 +81,26 @@
                 write_descrs_fields, write_descrs_arrays,
                 extraeffect=EF_CAN_RAISE,
                 oopspecindex=OS_NONE,
-                can_invalidate=False, can_release_gil=False):
-        key = (frozenset(readonly_descrs_fields),
-               frozenset(readonly_descrs_arrays),
-               frozenset(write_descrs_fields),
-               frozenset(write_descrs_arrays),
+                can_invalidate=False):
+        key = (frozenset_or_none(readonly_descrs_fields),
+               frozenset_or_none(readonly_descrs_arrays),
+               frozenset_or_none(write_descrs_fields),
+               frozenset_or_none(write_descrs_arrays),
                extraeffect,
                oopspecindex,
-               can_invalidate,
-               can_release_gil)
+               can_invalidate)
         if key in cls._cache:
             return cls._cache[key]
+        if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+            assert readonly_descrs_fields is None
+            assert readonly_descrs_arrays is None
+            assert write_descrs_fields is None
+            assert write_descrs_arrays is None
+        else:
+            assert readonly_descrs_fields is not None
+            assert readonly_descrs_arrays is not None
+            assert write_descrs_fields is not None
+            assert write_descrs_arrays is not None
         result = object.__new__(cls)
         result.readonly_descrs_fields = readonly_descrs_fields
         result.readonly_descrs_arrays = readonly_descrs_arrays
@@ -104,11 +114,13 @@
             result.write_descrs_arrays = write_descrs_arrays
         result.extraeffect = extraeffect
         result.can_invalidate = can_invalidate
-        result.can_release_gil = can_release_gil
         result.oopspecindex = oopspecindex
         cls._cache[key] = result
         return result
 
+    def check_can_raise(self):
+        return self.extraeffect > self.EF_CANNOT_RAISE
+
     def check_can_invalidate(self):
         return self.can_invalidate
 
@@ -116,56 +128,71 @@
         return self.extraeffect >= self.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
 
     def has_random_effects(self):
-        return self.oopspecindex == self.OS_LIBFFI_CALL or self.can_release_gil
+        return self.extraeffect >= self.EF_RANDOM_EFFECTS
+
+
+def frozenset_or_none(x):
+    if x is None:
+        return None
+    return frozenset(x)
+
+EffectInfo.MOST_GENERAL = EffectInfo(None, None, None, None,
+                                     EffectInfo.EF_RANDOM_EFFECTS,
+                                     can_invalidate=True)
+
 
 def effectinfo_from_writeanalyze(effects, cpu,
                                  extraeffect=EffectInfo.EF_CAN_RAISE,
                                  oopspecindex=EffectInfo.OS_NONE,
-                                 can_invalidate=False,
-                                 can_release_gil=False):
+                                 can_invalidate=False):
     from pypy.translator.backendopt.writeanalyze import top_set
-    if effects is top_set:
-        return None
-    readonly_descrs_fields = []
-    readonly_descrs_arrays = []
-    write_descrs_fields = []
-    write_descrs_arrays = []
+    if effects is top_set or extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+        readonly_descrs_fields = None
+        readonly_descrs_arrays = None
+        write_descrs_fields = None
+        write_descrs_arrays = None
+        extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+    else:
+        readonly_descrs_fields = []
+        readonly_descrs_arrays = []
+        write_descrs_fields = []
+        write_descrs_arrays = []
 
-    def add_struct(descrs_fields, (_, T, fieldname)):
-        T = deref(T)
-        if consider_struct(T, fieldname):
-            descr = cpu.fielddescrof(T, fieldname)
-            descrs_fields.append(descr)
+        def add_struct(descrs_fields, (_, T, fieldname)):
+            T = deref(T)
+            if consider_struct(T, fieldname):
+                descr = cpu.fielddescrof(T, fieldname)
+                descrs_fields.append(descr)
 
-    def add_array(descrs_arrays, (_, T)):
-        ARRAY = deref(T)
-        if consider_array(ARRAY):
-            descr = cpu.arraydescrof(ARRAY)
-            descrs_arrays.append(descr)
+        def add_array(descrs_arrays, (_, T)):
+            ARRAY = deref(T)
+            if consider_array(ARRAY):
+                descr = cpu.arraydescrof(ARRAY)
+                descrs_arrays.append(descr)
 
-    for tup in effects:
-        if tup[0] == "struct":
-            add_struct(write_descrs_fields, tup)
-        elif tup[0] == "readstruct":
-            tupw = ("struct",) + tup[1:]
-            if tupw not in effects:
-                add_struct(readonly_descrs_fields, tup)
-        elif tup[0] == "array":
-            add_array(write_descrs_arrays, tup)
-        elif tup[0] == "readarray":
-            tupw = ("array",) + tup[1:]
-            if tupw not in effects:
-                add_array(readonly_descrs_arrays, tup)
-        else:
-            assert 0
+        for tup in effects:
+            if tup[0] == "struct":
+                add_struct(write_descrs_fields, tup)
+            elif tup[0] == "readstruct":
+                tupw = ("struct",) + tup[1:]
+                if tupw not in effects:
+                    add_struct(readonly_descrs_fields, tup)
+            elif tup[0] == "array":
+                add_array(write_descrs_arrays, tup)
+            elif tup[0] == "readarray":
+                tupw = ("array",) + tup[1:]
+                if tupw not in effects:
+                    add_array(readonly_descrs_arrays, tup)
+            else:
+                assert 0
+    #
     return EffectInfo(readonly_descrs_fields,
                       readonly_descrs_arrays,
                       write_descrs_fields,
                       write_descrs_arrays,
                       extraeffect,
                       oopspecindex,
-                      can_invalidate,
-                      can_release_gil)
+                      can_invalidate)
 
 def consider_struct(TYPE, fieldname):
     if fieldType(TYPE, fieldname) is lltype.Void:
@@ -201,12 +228,13 @@
     def analyze_simple_operation(self, op, graphinfo):
         return op.opname == 'jit_force_quasi_immutable'
 
-class CanReleaseGILAnalyzer(BoolGraphAnalyzer):
+class RandomEffectsAnalyzer(BoolGraphAnalyzer):
     def analyze_direct_call(self, graph, seen=None):
-        releases_gil = False
         if hasattr(graph, "func") and hasattr(graph.func, "_ptr"):
-            releases_gil = graph.func._ptr._obj.releases_gil
-        return releases_gil or super(CanReleaseGILAnalyzer, self).analyze_direct_call(graph, seen)
+            if graph.func._ptr._obj.random_effects_on_gcobjs:
+                return True
+        return super(RandomEffectsAnalyzer, self).analyze_direct_call(graph,
+                                                                      seen)
 
     def analyze_simple_operation(self, op, graphinfo):
         return False
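
The EF_* levels are ordered so that the new EF_RANDOM_EFFECTS implies every
weaker property the tracer checks for.  A small sketch of what that ordering
gives, using only the methods defined above (illustrative, not part of the
changeset):

    from pypy.jit.codewriter.effectinfo import EffectInfo

    ei = EffectInfo.MOST_GENERAL   # the all-None, random-effects EffectInfo
    assert ei.has_random_effects()                     # EF_RANDOM_EFFECTS >= EF_RANDOM_EFFECTS
    assert ei.check_forces_virtual_or_virtualizable()  # 6 >= 5
    assert ei.check_can_raise()                        # 6 > EF_CANNOT_RAISE
    assert ei.check_can_invalidate()                   # built with can_invalidate=True
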
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -1417,7 +1417,7 @@
             extraeffect = EffectInfo.EF_CANNOT_RAISE
         elif oopspec_name.startswith('libffi_call_'):
             oopspecindex = EffectInfo.OS_LIBFFI_CALL
-            extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
         else:
             assert False, 'unsupported oopspec: %s' % oopspec_name
         return self._handle_oopspec_call(op, args, oopspecindex, extraeffect)
diff --git a/pypy/jit/codewriter/test/test_call.py b/pypy/jit/codewriter/test/test_call.py
--- a/pypy/jit/codewriter/test/test_call.py
+++ b/pypy/jit/codewriter/test/test_call.py
@@ -191,4 +191,4 @@
     [block, _] = list(f_graph.iterblocks())
     [op] = block.operations
     call_descr = cc.getcalldescr(op)
-    assert call_descr.extrainfo.can_release_gil
\ No newline at end of file
+    assert call_descr.extrainfo.has_random_effects()
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -5,7 +5,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 
 class FakeCallDescr(AbstractDescr):
-    def __init__(self, FUNC, ARGS, RESULT, effectinfo=None):
+    def __init__(self, FUNC, ARGS, RESULT, effectinfo):
         self.FUNC = FUNC
         self.ARGS = ARGS
         self.RESULT = RESULT
diff --git a/pypy/jit/codewriter/test/test_flatten.py b/pypy/jit/codewriter/test/test_flatten.py
--- a/pypy/jit/codewriter/test/test_flatten.py
+++ b/pypy/jit/codewriter/test/test_flatten.py
@@ -50,7 +50,7 @@
     def __init__(self, rtyper):
         rtyper._builtin_func_for_spec_cache = FakeDict()
         self.rtyper = rtyper
-    def calldescrof(self, FUNC, ARGS, RESULT):
+    def calldescrof(self, FUNC, ARGS, RESULT, effectinfo):
         return FakeDescr()
     def fielddescrof(self, STRUCT, name):
         return FakeDescr()
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -19,7 +19,8 @@
         self.funcval = funcval
         self.opargs = []
         argtypes, restype = self._get_signature(funcval)
-        self.descr = cpu.calldescrof_dynamic(argtypes, restype)
+        self.descr = cpu.calldescrof_dynamic(argtypes, restype,
+                                             EffectInfo.MOST_GENERAL)
         # ^^^ may be None if unsupported
         self.prepare_op = prepare_op
         self.delayed_ops = []
@@ -195,9 +196,7 @@
 
     def _get_oopspec(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            return effectinfo.oopspecindex
-        return EffectInfo.OS_NONE
+        return effectinfo.oopspecindex
 
     def _get_funcval(self, op):
         return self.getvalue(op.getarg(1))
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -235,31 +235,33 @@
             opnum == rop.CALL_RELEASE_GIL or
             opnum == rop.CALL_ASSEMBLER):
             if opnum == rop.CALL_ASSEMBLER:
-                effectinfo = None
+                self._seen_guard_not_invalidated = False
             else:
                 effectinfo = op.getdescr().get_extra_info()
-            if effectinfo is None or effectinfo.check_can_invalidate():
-                self._seen_guard_not_invalidated = False
-            if effectinfo is not None and not effectinfo.has_random_effects():
-                # XXX we can get the wrong complexity here, if the lists
-                # XXX stored on effectinfo are large
-                for fielddescr in effectinfo.readonly_descrs_fields:
-                    self.force_lazy_setfield(fielddescr)
-                for arraydescr in effectinfo.readonly_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr)
-                for fielddescr in effectinfo.write_descrs_fields:
-                    self.force_lazy_setfield(fielddescr, can_cache=False)
-                for arraydescr in effectinfo.write_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr, can_cache=False)
-                if effectinfo.check_forces_virtual_or_virtualizable():
-                    vrefinfo = self.optimizer.metainterp_sd.virtualref_info
-                    self.force_lazy_setfield(vrefinfo.descr_forced)
-                    # ^^^ we only need to force this field; the other fields
-                    # of virtualref_info and virtualizable_info are not gcptrs.
-                return
+                if effectinfo.check_can_invalidate():
+                    self._seen_guard_not_invalidated = False
+                if not effectinfo.has_random_effects():
+                    self.force_from_effectinfo(effectinfo)
+                    return
         self.force_all_lazy_setfields_and_arrayitems()
         self.clean_caches()
 
+    def force_from_effectinfo(self, effectinfo):
+        # XXX we can get the wrong complexity here, if the lists
+        # XXX stored on effectinfo are large
+        for fielddescr in effectinfo.readonly_descrs_fields:
+            self.force_lazy_setfield(fielddescr)
+        for arraydescr in effectinfo.readonly_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr)
+        for fielddescr in effectinfo.write_descrs_fields:
+            self.force_lazy_setfield(fielddescr, can_cache=False)
+        for arraydescr in effectinfo.write_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr, can_cache=False)
+        if effectinfo.check_forces_virtual_or_virtualizable():
+            vrefinfo = self.optimizer.metainterp_sd.virtualref_info
+            self.force_lazy_setfield(vrefinfo.descr_forced)
+            # ^^^ we only need to force this field; the other fields
+            # of virtualref_info and virtualizable_info are not gcptrs.
 
     def turned_constant(self, value):
         assert value.is_constant()
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -433,11 +433,10 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
-            if oopspecindex == EffectInfo.OS_ARRAYCOPY:
-                if self._optimize_CALL_ARRAYCOPY(op):
-                    return
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex == EffectInfo.OS_ARRAYCOPY:
+            if self._optimize_CALL_ARRAYCOPY(op):
+                return
         self.emit_operation(op)
 
     def _optimize_CALL_ARRAYCOPY(self, op):
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
@@ -51,14 +51,18 @@
                              restype=types.sint)
         #
         def calldescr(cpu, FUNC, oopspecindex, extraeffect=None):
-            einfo = EffectInfo([], [], [], [], oopspecindex=oopspecindex,
+            if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+                f = None   # means "can force all" really
+            else:
+                f = []
+            einfo = EffectInfo(f, f, f, f, oopspecindex=oopspecindex,
                                extraeffect=extraeffect)
             return cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, einfo)
         #
         libffi_prepare =  calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PREPARE)
         libffi_push_arg = calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PUSH_ARG)
         libffi_call =     calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_CALL,
-                                 EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE)
+                                    EffectInfo.EF_RANDOM_EFFECTS)
     
     namespace = namespace.__dict__
 
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_util.py b/pypy/jit/metainterp/optimizeopt/test/test_util.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_util.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_util.py
@@ -167,7 +167,8 @@
     onedescr = cpu.fielddescrof(U, 'one')
 
     FUNC = lltype.FuncType([lltype.Signed], lltype.Signed)
-    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                     EffectInfo.MOST_GENERAL)
     nonwritedescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
                                     EffectInfo([], [], [], []))
     writeadescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
diff --git a/pypy/jit/metainterp/optimizeopt/vstring.py b/pypy/jit/metainterp/optimizeopt/vstring.py
--- a/pypy/jit/metainterp/optimizeopt/vstring.py
+++ b/pypy/jit/metainterp/optimizeopt/vstring.py
@@ -455,8 +455,8 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             for value, meth in opt_call_oopspec_ops:
                 if oopspecindex == value:      # a match with the OS_STR_xxx
                     if meth(self, op, mode_string):
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -1257,10 +1257,8 @@
         assert i == len(allboxes)
         #
         effectinfo = descr.get_extra_info()
-        if (effectinfo is None or
-                effectinfo.extraeffect ==
-                             effectinfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE or
-                assembler_call):
+        if (assembler_call or
+                effectinfo.check_forces_virtual_or_virtualizable()):
             # residual calls require attention to keep virtualizables in-sync
             self.metainterp.clear_exception()
             self.metainterp.vable_and_vrefs_before_residual_call()
@@ -1693,12 +1691,11 @@
             return
         if opnum == rop.CALL:
             effectinfo = descr.get_extra_info()
-            if effectinfo is not None:
-                ef = effectinfo.extraeffect
-                if ef == effectinfo.EF_LOOPINVARIANT or \
-                   ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
-                   ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
-                    return
+            ef = effectinfo.extraeffect
+            if ef == effectinfo.EF_LOOPINVARIANT or \
+               ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
+               ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
+                return
         if self.heap_cache:
             self.heap_cache.clear()
         if self.heap_array_cache:
diff --git a/pypy/jit/metainterp/test/test_compile.py b/pypy/jit/metainterp/test/test_compile.py
--- a/pypy/jit/metainterp/test/test_compile.py
+++ b/pypy/jit/metainterp/test/test_compile.py
@@ -190,7 +190,7 @@
     class FakeJitDriverSD:
         portal_runner_ptr = llhelper(lltype.Ptr(FUNC), ll_portal_runner)
         portal_runner_adr = llmemory.cast_ptr_to_adr(portal_runner_ptr)
-        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, None)
         portal_finishtoken = compile.DoneWithThisFrameDescrInt()
         num_red_args = 2
         result_type = INT
diff --git a/pypy/jit/metainterp/test/test_string.py b/pypy/jit/metainterp/test/test_string.py
--- a/pypy/jit/metainterp/test/test_string.py
+++ b/pypy/jit/metainterp/test/test_string.py
@@ -1,5 +1,6 @@
 import py
 from pypy.rlib.jit import JitDriver, dont_look_inside, we_are_jitted
+from pypy.rlib.debug import debug_print
 from pypy.jit.codewriter.policy import StopAtXPolicy
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin
@@ -521,7 +522,8 @@
         jitdriver = JitDriver(greens = ['g'], reds = ['m'])
         @dont_look_inside
         def escape(x):
-            print str(x)
+            # a plain "print" would call os.write() and release the gil
+            debug_print(str(x))
         def f(g, m):
             g = str(g)
             while m >= 0:
diff --git a/pypy/jit/metainterp/test/test_virtualstate.py b/pypy/jit/metainterp/test/test_virtualstate.py
--- a/pypy/jit/metainterp/test/test_virtualstate.py
+++ b/pypy/jit/metainterp/test/test_virtualstate.py
@@ -1,3 +1,4 @@
+from __future__ import with_statement
 import py
 from pypy.jit.metainterp.optimize import InvalidLoop
 from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateInfo, VStructStateInfo, \
diff --git a/pypy/jit/metainterp/warmspot.py b/pypy/jit/metainterp/warmspot.py
--- a/pypy/jit/metainterp/warmspot.py
+++ b/pypy/jit/metainterp/warmspot.py
@@ -21,6 +21,7 @@
 from pypy.jit.metainterp.jitdriver import JitDriverStaticData
 from pypy.jit.codewriter import support, codewriter, longlong
 from pypy.jit.codewriter.policy import JitPolicy
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_NAMES
 
 # ____________________________________________________________
@@ -746,7 +747,8 @@
         jd.portal_calldescr = self.cpu.calldescrof(
             jd._PTR_PORTAL_FUNCTYPE.TO,
             jd._PTR_PORTAL_FUNCTYPE.TO.ARGS,
-            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT)
+            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT,
+            EffectInfo.MOST_GENERAL)
 
         vinfo = jd.virtualizable_info
 
diff --git a/pypy/module/_continuation/__init__.py b/pypy/module/_continuation/__init__.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/__init__.py
@@ -0,0 +1,40 @@
+from pypy.interpreter.mixedmodule import MixedModule
+
+
+class Module(MixedModule):
+    """This module exposes 'one-shot continuation containers'.
+
+A 'continulet' object from this module is a container that stores a
+one-shot continuation.  It is similar in purpose to the 'f_back'
+attribute of frames, which points to where execution should continue
+after this frame finishes.  The difference is that it will be changed
+(often repeatedly) before the frame actually returns.
+
+To make a continulet object, call 'continulet' with a callable and
+optional extra arguments.  Later, the first time you switch() to the
+continulet, the callable is invoked with the same continulet object as
+the extra first argument.
+
+At this point, the one-shot continuation stored in the continulet points
+to the caller of switch().  When switch() is called again, this one-shot
+continuation is exchanged with the current one; it means that the caller
+of switch() is suspended, its continuation stored in the container, and
+the old continuation from the continulet object is resumed.
+
+Continulets are internally implemented using stacklets.  Stacklets
+are a bit more primitive (they are really one-shot continuations), but
+that idea only works in C, not in Python, notably because of exceptions.
+
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuations stored in two (or more) continulets.
+"""
+
+    appleveldefs = {
+        'error': 'app_continuation.error',
+        'generator': 'app_continuation.generator',
+    }
+
+    interpleveldefs = {
+        'continulet': 'interp_continuation.W_Continulet',
+        'permute': 'interp_continuation.permute',
+    }
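
The docstring above is the main description of the continulet API. As a quick
illustration, here is a minimal app-level sketch of the switch() protocol it
describes; the same behaviour is exercised by test_switch() in test_stacklet.py
further down in this diff:

    from _continuation import continulet

    def callback(c):
        # 'c' is the continulet itself, passed as the extra first argument
        res = c.switch('a')      # suspend; the caller of switch() resumes
        assert res == 'b'        # value passed in when the caller switches back
        return 'c'               # returning finishes the continulet

    c = continulet(callback)
    assert c.switch() == 'a'     # the first switch() starts the callback
    assert c.switch('b') == 'c'  # the second resumes it until it returns
    assert not c.is_pending()    # the continulet is now finished
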
diff --git a/pypy/module/_continuation/app_continuation.py b/pypy/module/_continuation/app_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/app_continuation.py
@@ -0,0 +1,35 @@
+
+class error(Exception):
+    "Usage error of the _continuation module."
+
+
+import _continuation
+
+
+class generator(object):
+
+    def __init__(self, callable):
+        self.__func__ = callable
+
+    def __get__(self, obj, type=None):
+        return generator(self.__func__.__get__(obj, type))
+
+    def __call__(self, *args, **kwds):
+        return genlet(self.__func__, *args, **kwds)
+
+
+class genlet(_continuation.continulet):
+
+    def __iter__(self):
+        return self
+
+    def next(self, value=None):
+        res = self.switch(value)
+        if self.is_pending():
+            return res
+        else:
+            if res is not None:
+                raise TypeError("_continuation.generator must return None")
+            raise StopIteration
+
+    send = next
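
The 'generator' decorator defined above builds an iterator out of a continulet:
each gen.switch(value) hands one item to the consumer, and falling off the end
of the function (returning None) ends the iteration. A small usage sketch,
matching test_simple() and test_must_return_None() in test_generator.py further
down in this diff:

    from _continuation import generator

    @generator
    def f(gen, n):
        gen.switch(n + 1)   # each switch() yields one value to the consumer
        gen.switch(n + 2)
        # falling off the end returns None, which turns into StopIteration

    g = f(10)               # calling the decorated function creates a genlet
    assert g.next() == 11
    assert g.next() == 12
    try:
        g.next()
    except StopIteration:
        pass
    else:
        assert False, "expected StopIteration"
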
diff --git a/pypy/module/_continuation/interp_continuation.py b/pypy/module/_continuation/interp_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/interp_continuation.py
@@ -0,0 +1,245 @@
+from pypy.rlib.rstacklet import StackletThread
+from pypy.rlib import jit
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.executioncontext import ExecutionContext
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.typedef import TypeDef
+from pypy.interpreter.gateway import interp2app
+
+
+class W_Continulet(Wrappable):
+    sthread = None
+
+    def __init__(self, space):
+        self.space = space
+        # states:
+        #  - not init'ed: self.sthread == None
+        #  - normal:      self.sthread != None, not is_empty_handle(self.h)
+        #  - finished:    self.sthread != None, is_empty_handle(self.h)
+
+    def check_sthread(self):
+        ec = self.space.getexecutioncontext()
+        if ec.stacklet_thread is not self.sthread:
+            start_state.clear()
+            raise geterror(self.space, "inter-thread support is missing")
+        return ec
+
+    def descr_init(self, w_callable, __args__):
+        if self.sthread is not None:
+            raise geterror(self.space, "continulet already __init__ialized")
+        start_state.origin = self
+        start_state.w_callable = w_callable
+        start_state.args = __args__
+        self.sthread = build_sthread(self.space)
+        try:
+            self.h = self.sthread.new(new_stacklet_callback)
+            if self.sthread.is_empty_handle(self.h):    # early return
+                raise MemoryError
+        except MemoryError:
+            self.sthread = None
+            start_state.clear()
+            raise getmemoryerror(self.space)
+
+    def switch(self, w_to):
+        to = self.space.interp_w(W_Continulet, w_to, can_be_None=True)
+        if to is not None:
+            if self is to:    # double-switch to myself: no-op
+                return get_result()
+            if to.sthread is None:
+                start_state.clear()
+                raise geterror(self.space, "continulet not initialized yet")
+        if self.sthread is None:
+            start_state.clear()
+            raise geterror(self.space, "continulet not initialized yet")
+        ec = self.check_sthread()
+        saved_topframeref = ec.topframeref
+        #
+        start_state.origin = self
+        if to is None:
+            # simple switch: going to self.h
+            start_state.destination = self
+        else:
+            # double switch: the final destination is to.h
+            start_state.destination = to
+        #
+        h = start_state.destination.h
+        sthread = self.sthread
+        if sthread.is_empty_handle(h):
+            start_state.clear()
+            raise geterror(self.space, "continulet already finished")
+        #
+        try:
+            do_switch(sthread, h)
+        except MemoryError:
+            start_state.clear()
+            raise getmemoryerror(self.space)
+        #
+        ec = sthread.ec
+        ec.topframeref = saved_topframeref
+        return get_result()
+
+    def descr_switch(self, w_value=None, w_to=None):
+        start_state.w_value = w_value
+        return self.switch(w_to)
+
+    def descr_throw(self, w_type, w_val=None, w_tb=None, w_to=None):
+        from pypy.interpreter.pytraceback import check_traceback
+        space = self.space
+        #
+        msg = "throw() third argument must be a traceback object"
+        if space.is_w(w_tb, space.w_None):
+            tb = None
+        else:
+            tb = check_traceback(space, w_tb, msg)
+        #
+        operr = OperationError(w_type, w_val, tb)
+        operr.normalize_exception(space)
+        start_state.w_value = None
+        start_state.propagate_exception = operr
+        return self.switch(w_to)
+
+    def descr_is_pending(self):
+        valid = (self.sthread is not None
+                 and not self.sthread.is_empty_handle(self.h))
+        return self.space.newbool(valid)
+
+
+def W_Continulet___new__(space, w_subtype, __args__):
+    r = space.allocate_instance(W_Continulet, w_subtype)
+    r.__init__(space)
+    return space.wrap(r)
+
+
+W_Continulet.typedef = TypeDef(
+    'continulet',
+    __module__ = '_continuation',
+    __new__     = interp2app(W_Continulet___new__),
+    __init__    = interp2app(W_Continulet.descr_init),
+    switch      = interp2app(W_Continulet.descr_switch),
+    throw       = interp2app(W_Continulet.descr_throw),
+    is_pending  = interp2app(W_Continulet.descr_is_pending),
+    )
+
+
+# ____________________________________________________________
+
+
+class State:
+    def __init__(self, space):
+        self.space = space
+        w_module = space.getbuiltinmodule('_continuation')
+        self.w_error = space.getattr(w_module, space.wrap('error'))
+        self.w_memoryerror = OperationError(space.w_MemoryError, space.w_None)
+
+def geterror(space, message):
+    cs = space.fromcache(State)
+    return OperationError(cs.w_error, space.wrap(message))
+
+def getmemoryerror(space):
+    cs = space.fromcache(State)
+    return cs.w_memoryerror
+
+# ____________________________________________________________
+
+
+class SThread(StackletThread):
+
+    def __init__(self, space, ec):
+        StackletThread.__init__(self, space.config)
+        self.space = space
+        self.ec = ec
+
+ExecutionContext.stacklet_thread = None
+
+# ____________________________________________________________
+
+
+class StartState:   # xxx a single global to pass around the function to start
+    def clear(self):
+        self.origin = None
+        self.destination = None
+        self.w_callable = None
+        self.args = None
+        self.w_value = None
+        self.propagate_exception = None
+start_state = StartState()
+start_state.clear()
+
+
+def new_stacklet_callback(h, arg):
+    self       = start_state.origin
+    w_callable = start_state.w_callable
+    args       = start_state.args
+    start_state.clear()
+    try:
+        do_switch(self.sthread, h)
+    except MemoryError:
+        return h       # oops!  do an early return in this case
+    #
+    space = self.space
+    try:
+        ec = self.sthread.ec
+        ec.topframeref = jit.vref_None
+
+        if start_state.propagate_exception is not None:
+            raise start_state.propagate_exception   # just propagate it further
+        if start_state.w_value is not space.w_None:
+            raise OperationError(space.w_TypeError, space.wrap(
+                "can't send non-None value to a just-started continulet"))
+
+        args = args.prepend(self.space.wrap(self))
+        w_result = space.call_args(w_callable, args)
+    except Exception, e:
+        start_state.propagate_exception = e
+    else:
+        start_state.w_value = w_result
+    start_state.origin = self
+    start_state.destination = self
+    return self.h
+
+
+def do_switch(sthread, h):
+    h = sthread.switch(h)
+    origin = start_state.origin
+    self = start_state.destination
+    start_state.origin = None
+    start_state.destination = None
+    self.h, origin.h = origin.h, h
+
+def get_result():
+    if start_state.propagate_exception:
+        e = start_state.propagate_exception
+        start_state.propagate_exception = None
+        raise e
+    w_value = start_state.w_value
+    start_state.w_value = None
+    return w_value
+
+def build_sthread(space):
+    ec = space.getexecutioncontext()
+    sthread = ec.stacklet_thread
+    if not sthread:
+        sthread = ec.stacklet_thread = SThread(space, ec)
+    return sthread
+
+# ____________________________________________________________
+
+def permute(space, args_w):
+    sthread = build_sthread(space)
+    #
+    contlist = []
+    for w_cont in args_w:
+        cont = space.interp_w(W_Continulet, w_cont)
+        if cont.sthread is not sthread:
+            if cont.sthread is None:
+                raise geterror(space, "got a non-initialized continulet")
+            else:
+                raise geterror(space, "inter-thread support is missing")
+        elif sthread.is_empty_handle(cont.h):
+            raise geterror(space, "got an already-finished continulet")
+        contlist.append(cont)
+    #
+    if len(contlist) > 1:
+        other = contlist[-1].h
+        for cont in contlist:
+            other, cont.h = cont.h, other
diff --git a/pypy/module/_continuation/test/__init__.py b/pypy/module/_continuation/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_continuation/test/support.py b/pypy/module/_continuation/test/support.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/support.py
@@ -0,0 +1,12 @@
+import py
+from pypy.conftest import gettestobjspace
+from pypy.rpython.tool.rffi_platform import CompilationError
+
+
+class BaseAppTest:
+    def setup_class(cls):
+        try:
+            import pypy.rlib.rstacklet
+        except CompilationError, e:
+            py.test.skip("cannot import rstacklet: %s" % e)
+        cls.space = gettestobjspace(usemodules=['_continuation'])
diff --git a/pypy/module/_continuation/test/test_generator.py b/pypy/module/_continuation/test/test_generator.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_generator.py
@@ -0,0 +1,70 @@
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestGenerator(BaseAppTest):
+
+    def test_simple(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        res = g.next()
+        assert res == 24
+        res = g.next()
+        assert res == 13
+        raises(StopIteration, g.next)
+
+    def test_iterator(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        res = list(f(10))
+        assert res == [11, 24, 13]
+        g = f(20)
+        assert iter(g) is g
+
+    def test_bound_method(self):
+        from _continuation import generator
+        #
+        class A(object):
+            def __init__(self, m):
+                self.m = m
+            #
+            @generator
+            def f(self, gen, n):
+                gen.switch(n - self.m)
+        #
+        a = A(10)
+        res = list(a.f(25))
+        assert res == [15]
+
+    def test_must_return_None(self):
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            return "foo"
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        raises(TypeError, g.next)
diff --git a/pypy/module/_continuation/test/test_stacklet.py b/pypy/module/_continuation/test/test_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_stacklet.py
@@ -0,0 +1,635 @@
+import os
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestStacklet(BaseAppTest):
+    def setup_class(cls):
+        BaseAppTest.setup_class.im_func(cls)
+        cls.w_translated = cls.space.wrap(
+            os.path.join(os.path.dirname(__file__),
+                         'test_translated.py'))
+
+    def test_new_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            pass
+        #
+        c = continulet(empty_callback)
+        assert type(c) is continulet
+
+    def test_call_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(1)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        assert seen == [1]
+
+    def test_no_double_init(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            pass
+        #
+        c = continulet(empty_callback)
+        raises(error, c.__init__, empty_callback)
+
+    def test_no_init_after_started(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            raises(error, c1.__init__, empty_callback)
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+
+    def test_no_init_after_finished(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        raises(error, c.__init__, empty_callback)
+
+    def test_propagate_exception(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(42)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(empty_callback)
+        raises(ValueError, c.switch)
+        assert seen == [42]
+
+    def test_callback_with_arguments(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1, *args, **kwds):
+            seen.append(c1)
+            seen.append(args)
+            seen.append(kwds)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback, 42, 43, foo=44, bar=45)
+        res = c.switch()
+        assert res == 42
+        assert seen == [c, (42, 43), {'foo': 44, 'bar': 45}]
+
+    def test_switch(self):
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            seen.append(1)
+            res = c.switch('a')
+            assert res == 'b'
+            seen.append(3)
+            return 'c'
+        #
+        seen = []
+        c = continulet(switchbackonce_callback)
+        seen.append(0)
+        res = c.switch()
+        assert res == 'a'
+        seen.append(2)
+        res = c.switch('b')
+        assert res == 'c'
+        assert seen == [0, 1, 2, 3]
+
+    def test_initial_switch_must_give_None(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            return 'ok'
+        #
+        c = continulet(empty_callback)
+        res = c.switch(None)
+        assert res == 'ok'
+        #
+        c = continulet(empty_callback)
+        raises(TypeError, c.switch, 'foo')  # "can't send non-None value"
+
+    def test_continuation_error(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c):
+            return 42
+        #
+        c = continulet(empty_callback)
+        c.switch()
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet already finished"
+
+    def test_not_initialized_yet(self):
+        from _continuation import continulet, error
+        c = continulet.__new__(continulet)
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_go_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(3)
+            return 4
+        #
+        def depth1(c):
+            seen.append(1)
+            c2 = continulet(depth2)
+            seen.append(2)
+            res = c2.switch()
+            seen.append(res)
+            return 5
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [0, 1, 2, 3, 4, 5]
+
+    def test_exception_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(2)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            try:
+                continulet(depth2).switch()
+            except ValueError:
+                seen.append(3)
+            return 4
+        #
+        seen = []
+        c = continulet(depth1)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [1, 2, 3, 4]
+
+    def test_exception_with_switch(self):
+        from _continuation import continulet
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        c.switch()
+        seen.append(2)
+        raises(ValueError, c.switch)
+        assert seen == [0, 1, 2, 3]
+
+    def test_is_pending(self):
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            assert c.is_pending()
+            res = c.switch('a')
+            assert res == 'b'
+            assert c.is_pending()
+            return 'c'
+        #
+        c = continulet.__new__(continulet)
+        assert not c.is_pending()
+        c.__init__(switchbackonce_callback)
+        assert c.is_pending()
+        res = c.switch()
+        assert res == 'a'
+        assert c.is_pending()
+        res = c.switch('b')
+        assert res == 'c'
+        assert not c.is_pending()
+
+    def test_switch_alternate(self):
+        from _continuation import continulet
+        #
+        def func_lower(c):
+            res = c.switch('a')
+            assert res == 'b'
+            res = c.switch('c')
+            assert res == 'd'
+            return 'e'
+        #
+        def func_upper(c):
+            res = c.switch('A')
+            assert res == 'B'
+            res = c.switch('C')
+            assert res == 'D'
+            return 'E'
+        #
+        c_lower = continulet(func_lower)
+        c_upper = continulet(func_upper)
+        res = c_lower.switch()
+        assert res == 'a'
+        res = c_upper.switch()
+        assert res == 'A'
+        res = c_lower.switch('b')
+        assert res == 'c'
+        res = c_upper.switch('B')
+        assert res == 'C'
+        res = c_lower.switch('d')
+        assert res == 'e'
+        res = c_upper.switch('D')
+        assert res == 'E'
+
+    def test_exception_with_switch_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(4)
+            c.switch()
+            seen.append(6)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            c2 = continulet(depth2)
+            c2.switch()
+            seen.append(5)
+            raises(ValueError, c2.switch)
+            assert not c2.is_pending()
+            seen.append(7)
+            assert c.is_pending()
+            raise KeyError
+        #
+        seen = []
+        c = continulet(depth1)
+        c.switch()
+        seen.append(2)
+        raises(KeyError, c.switch)
+        assert not c.is_pending()
+        assert seen == [1, 2, 3, 4, 5, 6, 7]
+
+    def test_random_switching(self):
+        from _continuation import continulet
+        #
+        def t1(c1):
+            return c1.switch()
+        def s1(c1, n):
+            assert n == 123
+            c2 = t1(c1)
+            return c1.switch('a') + 1
+        #
+        def s2(c2, c1):
+            res = c1.switch(c2)
+            assert res == 'a'
+            return c2.switch('b') + 2
+        #
+        def f():
+            c1 = continulet(s1, 123)
+            c2 = continulet(s2, c1)
+            c1.switch()
+            res = c2.switch()
+            assert res == 'b'
+            res = c1.switch(1000)
+            assert res == 1001
+            return c2.switch(2000)
+        #
+        res = f()
+        assert res == 2002
+
+    def test_f_back_is_None_for_now(self):
+        import sys
+        from _continuation import continulet
+        #
+        def g(c):
+            c.switch(sys._getframe(0))
+            c.switch(sys._getframe(0).f_back)
+            c.switch(sys._getframe(1))
+            c.switch(sys._getframe(1).f_back)
+            c.switch(sys._getframe(2))
+        def f(c):
+            g(c)
+        #
+        c = continulet(f)
+        f1 = c.switch()
+        assert f1.f_code.co_name == 'g'
+        f2 = c.switch()
+        assert f2.f_code.co_name == 'f'
+        f3 = c.switch()
+        assert f3.f_code.co_name == 'f'
+        f4 = c.switch()
+        assert f4 is None
+        raises(ValueError, c.switch)    # "call stack is not deep enough"
+
+    def test_traceback_is_complete(self):
+        import sys
+        from _continuation import continulet
+        #
+        def g():
+            raise KeyError
+        def f(c):
+            g()
+        #
+        def do(c):
+            c.switch()
+        #
+        c = continulet(f)
+        try:
+            do(c)
+        except KeyError:
+            tb = sys.exc_info()[2]
+        else:
+            raise AssertionError("should have raised!")
+        #
+        assert tb.tb_next.tb_frame.f_code.co_name == 'do'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'f'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'g'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_switch2_simple(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            assert res == 'b'
+            res = c2.switch('c', to=c1)
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('a')
+        assert res == 42
+
+    def test_switch2_pingpong(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'go'
+            for i in range(10):
+                res = c1.switch(i, to=c2)
+                assert res == 100 + i
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            for i in range(10):
+                assert res == i
+                res = c2.switch(100 + i, to=c1)
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('go')
+        assert res == 42
+
+    def test_switch2_more_complex(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch(to=c2)
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 41
+        def f2(c2):
+            res = c2.switch('a', to=c1)
+            assert res == 'b'
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 42
+        assert not c2.is_pending()    # finished by returning 42
+        res = c1.switch('c')
+        assert res == 41
+
+    def test_switch2_no_op(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('a', to=c1)
+            assert res == 'a'
+            return 42
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == 42
+
+    def test_switch2_immediately_away(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            print 'in f1'
+            return 'm'
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            print 'got there!'
+            assert res == 'a'
+            return None
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        assert c1.is_pending()
+        assert c2.is_pending()
+        print 'calling!'
+        res = c1.switch('a', to=c2)
+        print 'back'
+        assert res == 'm'
+
+    def test_switch2_immediately_away_corner_case(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            this_is_never_seen
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            assert res is None
+            return 'b'    # this goes back into the caller, which is f1,
+                          # but f1 didn't start yet, so a non-None value
+                          # has nowhere to go to...
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        raises(TypeError, c1.switch, to=c2)  # "can't send non-None value"
+
+    def test_switch2_not_initialized_yet(self):
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet.__new__(continulet)
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_switch2_already_finished(self):
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        def empty_callback(c):
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(empty_callback)
+        c2.switch()
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet already finished"
+
+    def test_throw(self):
+        import sys
+        from _continuation import continulet
+        #
+        def f1(c1):
+            try:
+                c1.switch()
+            except KeyError:
+                res = "got keyerror"
+            try:
+                c1.switch(res)
+            except IndexError, e:
+                pass
+            try:
+                c1.switch(e)
+            except IndexError, e2:
+                pass
+            try:
+                c1.switch(e2)
+            except IndexError:
+                c1.throw(*sys.exc_info())
+            should_never_reach_here
+        #
+        c1 = continulet(f1)
+        c1.switch()
+        res = c1.throw(KeyError)
+        assert res == "got keyerror"
+        class FooError(IndexError):
+            pass
+        foo = FooError()
+        res = c1.throw(foo)
+        assert res is foo
+        res = c1.throw(IndexError, foo)
+        assert res is foo
+        #
+        def main():
+            def do_raise():
+                raise foo
+            try:
+                do_raise()
+            except IndexError:
+                tb = sys.exc_info()[2]
+            try:
+                c1.throw(IndexError, foo, tb)
+            except IndexError:
+                tb = sys.exc_info()[2]
+            return tb
+        #
+        tb = main()
+        assert tb.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_frame.f_code.co_name == 'f1'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'do_raise'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_throw_to_starting(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        #
+        c1 = continulet(f1)
+        raises(IndexError, c1.throw, IndexError)
+
+    def test_throw2_simple(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        def f2(c2):
+            try:
+                c2.switch("ready")
+            except IndexError:
+                raise ValueError
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == "ready"
+        assert c1.is_pending()
+        assert c2.is_pending()
+        raises(ValueError, c1.throw, IndexError, to=c2)
+        assert not c1.is_pending()
+        assert not c2.is_pending()
+
+    def test_throw2_no_op(self):
+        from _continuation import continulet
+        #
+        def f1(c1):
+            raises(ValueError, c1.throw, ValueError, to=c1)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == "ok"
+
+    def test_permute(self):
+        from _continuation import continulet, permute
+        #
+        def f1(c1):
+            res = c1.switch()
+            assert res == "ok"
+            return "done"
+        #
+        def f2(c2):
+            permute(c1, c2)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        c1.switch()
+        res = c2.switch()
+        assert res == "done"
+
+    def test_various_depths(self):
+        skip("may fail on top of CPython")
+        # run it from test_translated, but not while being actually translated
+        d = {}
+        execfile(self.translated, d)
+        d['set_fast_mode']()
+        d['test_various_depths']()
diff --git a/pypy/module/_continuation/test/test_translated.py b/pypy/module/_continuation/test/test_translated.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_translated.py
@@ -0,0 +1,132 @@
+import py
+try:
+    import _continuation
+except ImportError:
+    py.test.skip("to run on top of a translated pypy-c")
+
+import sys, random
+
+# ____________________________________________________________
+
+STATUS_MAX = 50000
+CONTINULETS = 50
+
+def set_fast_mode():
+    global STATUS_MAX, CONTINULETS
+    STATUS_MAX = 100
+    CONTINULETS = 5
+
+# ____________________________________________________________
+
+class Done(Exception):
+    pass
+
+
+class Runner(object):
+
+    def __init__(self):
+        self.foobar = 12345
+        self.conts = {}     # {continulet: parent-or-None}
+        self.contlist = []
+
+    def run_test(self):
+        self.start_continulets()
+        self.n = 0
+        try:
+            while True:
+                self.do_switch(src=None)
+                assert self.target is None
+        except Done:
+            self.check_traceback(sys.exc_info()[2])
+
+    def do_switch(self, src):
+        assert src not in self.conts.values()
+        c = random.choice(self.contlist)
+        self.target = self.conts[c]
+        self.conts[c] = src
+        c.switch()
+        assert self.target is src
+
+    def run_continulet(self, c, i):
+        while True:
+            assert self.target is c
+            assert self.contlist[i] is c
+            self.do_switch(c)
+            assert self.foobar == 12345
+            self.n += 1
+            if self.n >= STATUS_MAX:
+                raise Done
+
+    def start_continulets(self, i=0):
+        c = _continuation.continulet(self.run_continulet, i)
+        self.contlist.append(c)
+        if i < CONTINULETS:
+            self.start_continulets(i + 1)
+            # ^^^ start each continulet with a different base stack
+        self.conts[c] = c   # initially (i.e. not started) they are all loops
+
+    def check_traceback(self, tb):
+        found = []
+        tb = tb.tb_next
+        while tb:
+            if tb.tb_frame.f_code.co_name != 'do_switch':
+                assert tb.tb_frame.f_code.co_name == 'run_continulet', (
+                    "got %r" % (tb.tb_frame.f_code.co_name,))
+                found.append(tb.tb_frame.f_locals['c'])
+            tb = tb.tb_next
+        found.reverse()
+        #
+        expected = []
+        c = self.target
+        while c is not None:
+            expected.append(c)
+            c = self.conts[c]
+        #
+        assert found == expected, "%r == %r" % (found, expected)
+
+# ____________________________________________________________
+
+class AppTestWrapper:
+    def setup_class(cls):
+        "Run test_various_depths() when we are run with 'pypy py.test -A'."
+        from pypy.conftest import option
+        if not option.runappdirect:
+            py.test.skip("meant only for -A run")
+
+    def test_single_threaded(self):
+        for i in range(20):
+            yield Runner().run_test,
+
+    def test_multi_threaded(self):
+        for i in range(5):
+            yield multithreaded_test,
+
+class ThreadTest(object):
+    def __init__(self, lock):
+        self.lock = lock
+        self.ok = False
+        lock.acquire()
+    def run(self):
+        try:
+            Runner().run_test()
+            self.ok = True
+        finally:
+            self.lock.release()
+
+def multithreaded_test():
+    try:
+        import thread
+    except ImportError:
+        py.test.skip("no threads")
+    ts = [ThreadTest(thread.allocate_lock()) for i in range(5)]
+    for t in ts:
+        thread.start_new_thread(t.run, ())
+    for t in ts:
+        t.lock.acquire()
+    for t in ts:
+        assert t.ok
+
+# ____________________________________________________________
+
+if __name__ == '__main__':
+    Runner().run_test()
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -178,8 +178,8 @@
 
     def descr_sort(self, space):
         size = self.find_size()
-	stack = [(0,size-1)]
-	first=0; last=size-1; splitpoint=first;
+        stack = [(0,size-1)]
+        first=0; last=size-1; splitpoint=first;
         while (len(stack) > 0):
             first, last = stack.pop()
             while last>first:
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -432,13 +432,13 @@
         a = [3.0,4.0,0.0,-1.0]
         b = array(a)
         a.sort()
-	b.sort()
+        b.sort()
         assert(len(a)==len(b))
         for i in range(len(a)):
             assert(a[i]==b[i])
-	a = array(list(reversed(range(6))))
-	b = array(range(6))
-	a.sort()
+        a = array(list(reversed(range(6))))
+        b = array(range(6))
+        a.sort()
         assert(len(a)==len(b))
         for i in range(len(a)):
             assert(a[i]==b[i])
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -25,6 +25,7 @@
                             'last_exception',
                             'lastblock',
                             'is_being_profiled',
+                            'w_globals',
                             ]
 
 JUMP_ABSOLUTE = opmap['JUMP_ABSOLUTE']
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -67,24 +67,14 @@
         assert log.opnames(ops) == ["guard_value",
                                     "getfield_gc", "guard_value",
                                     "getfield_gc", "guard_value",
-                                    "getfield_gc", "guard_nonnull_class"]
-        # LOAD_GLOBAL of OFFSET but in different function partially folded
-        # away
-        # XXX could be improved
+                                    "guard_not_invalidated"]
         ops = entry_bridge.ops_by_id('add', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["guard_value", "getfield_gc", "guard_value"]
+        assert log.opnames(ops) == ["guard_not_invalidated"]
         #
-        # two LOAD_GLOBAL of f, the second is folded away
         ops = entry_bridge.ops_by_id('call', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["getfield_gc", "guard_nonnull_class"]
+        assert log.opnames(ops) == []
         #
         assert entry_bridge.match_by_id('call', """
-            p29 = getfield_gc(ConstPtr(ptr28), descr=<GcPtrFieldDescr pypy.objspace.std.celldict.ModuleCell.inst_w_value .*>)
-            guard_nonnull_class(p29, ConstClass(Function), descr=...)
-            p33 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_code .*>)
-            guard_value(p33, ConstPtr(ptr34), descr=...)
-            p35 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_w_func_globals .*>)
-            p36 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_closure .*>)
             p38 = call(ConstClass(getexecutioncontext), descr=<GcPtrCallDescr>)
             p39 = getfield_gc(p38, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref .*>)
             i40 = force_token()
@@ -100,19 +90,16 @@
         # -----------------------------
         loop, = log.loops_by_id('call')
         assert loop.match("""
-            i12 = int_lt(i5, i6)
-            guard_true(i12, descr=...)
+            guard_not_invalidated(descr=...)
+            i9 = int_lt(i5, i6)
+            guard_true(i9, descr=...)
+            i10 = force_token()
+            i12 = int_add(i5, 1)
             i13 = force_token()
-            i15 = int_add(i5, 1)
-            i16 = int_add_ovf(i15, i7)
-            guard_no_overflow(descr=...)
-            i18 = force_token()
-            i20 = int_add_ovf(i16, 1)
-            guard_no_overflow(descr=...)
-            i21 = int_add_ovf(i20, i7)
+            i15 = int_add_ovf(i12, 1)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i21, i6, i7, p8, p9, p10, p11, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i15, i6, p7, p8, descr=<Loop0>)
         """)
 
     def test_method_call(self):
diff --git a/pypy/module/pypyjit/test_pypy_c/test_globals.py b/pypy/module/pypyjit/test_pypy_c/test_globals.py
--- a/pypy/module/pypyjit/test_pypy_c/test_globals.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_globals.py
@@ -20,11 +20,9 @@
             guard_value(p10, ConstPtr(ptr11), descr=...)
             p12 = getfield_gc(p10, descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p12, ConstPtr(ptr13), descr=...)
-            p15 = getfield_gc(ConstPtr(ptr14), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
-            guard_isnull(p15, descr=...)
             guard_not_invalidated(descr=...)
             p19 = getfield_gc(ConstPtr(p17), descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p19, ConstPtr(ptr20), descr=...)
             p22 = getfield_gc(ConstPtr(ptr21), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
             guard_nonnull(p22, descr=...)
-        """)
\ No newline at end of file
+        """)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_instance.py b/pypy/module/pypyjit/test_pypy_c/test_instance.py
--- a/pypy/module/pypyjit/test_pypy_c/test_instance.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_instance.py
@@ -52,7 +52,7 @@
             i10 = int_add_ovf(i5, i7)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i10, i6, p7, i7, p8, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i10, i6, i7, p8, descr=<Loop0>)
         """)
 
     def test_getattr_with_dynamic_attribute(self):
@@ -151,6 +151,7 @@
         assert loop.match_by_id('loadattr',
         '''
         guard_not_invalidated(descr=...)
+        i16 = arraylen_gc(p10, descr=<GcPtrArrayDescr>)
         i19 = call(ConstClass(ll_dict_lookup), _, _, _, descr=...)
         guard_no_exception(descr=...)
         i21 = int_and(i19, _)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_math.py b/pypy/module/pypyjit/test_pypy_c/test_math.py
--- a/pypy/module/pypyjit/test_pypy_c/test_math.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_math.py
@@ -47,6 +47,7 @@
         assert loop.match("""
             i2 = int_lt(i0, i1)
             guard_true(i2, descr=...)
+            guard_not_invalidated(descr=...)
             f1 = cast_int_to_float(i0)
             i3 = float_eq(f1, inf)
             i4 = float_eq(f1, -inf)
@@ -60,4 +61,32 @@
             i7 = int_add(i0, f1)
             --TICK--
             jump(..., descr=)
+        """)
+
+    def test_fmod(self):
+        def main(n):
+            import math
+
+            s = 0
+            while n > 0:
+                s += math.fmod(n, 2.0)
+                n -= 1
+            return s
+        log = self.run(main, [500])
+        assert log.result == main(500)
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match("""
+            i1 = int_gt(i0, 0)
+            guard_true(i1, descr=...)
+            f1 = cast_int_to_float(i0)
+            i2 = float_eq(f1, inf)
+            i3 = float_eq(f1, -inf)
+            i4 = int_or(i2, i3)
+            i5 = int_is_true(i4)
+            guard_false(i5, descr=...)
+            f2 = call(ConstClass(fmod), f1, 2.0, descr=<FloatCallDescr>)
+            f3 = float_add(f0, f2)
+            i6 = int_sub(i0, 1)
+            --TICK--
+            jump(..., descr=)
         """)
\ No newline at end of file
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -234,3 +234,18 @@
             return total
         #
         self.run_and_check(main, [])
+
+
+    def test_global(self):
+        log = self.run("""
+        i = 0
+        globalinc = 1
+        def main(n):
+            global i
+            while i < n:
+                l = globalinc # ID: globalread
+                i += l
+        """, [1000])
+
+        loop, = log.loops_by_id("globalread", is_entry_bridge=True)
+        assert len(loop.ops_by_id("globalread")) == 0
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -34,9 +34,9 @@
             i25 = unicodegetitem(p13, i19)
             p27 = newstr(1)
             strsetitem(p27, 0, i23)
-            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=<GcPtrCallDescr>)
+            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=...)
             guard_no_exception(descr=...)
-            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=<SignedCallDescr>)
+            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=...)
             guard_true(i32, descr=...)
             i34 = int_add(i6, 1)
             --TICK--
@@ -105,5 +105,5 @@
             i58 = int_add_ovf(i6, i57)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, p5, i58, i7, i8, p9, p10, descr=<Loop4>)
+            jump(p0, p1, p2, p3, p4, p5, i58, i7, descr=<Loop4>)
         """)
diff --git a/pypy/module/test_lib_pypy/test_greenlet.py b/pypy/module/test_lib_pypy/test_greenlet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/test_lib_pypy/test_greenlet.py
@@ -0,0 +1,233 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestGreenlet:
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_continuation'])
+
+    def test_simple(self):
+        from greenlet import greenlet
+        lst = []
+        def f():
+            lst.append(1)
+            greenlet.getcurrent().parent.switch()
+            lst.append(3)
+        g = greenlet(f)
+        lst.append(0)
+        g.switch()
+        lst.append(2)
+        g.switch()
+        lst.append(4)
+        assert lst == range(5)
+
+    def test_parent(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        assert gmain.parent is None
+        g = greenlet(lambda: None)
+        assert g.parent is gmain
+
+    def test_pass_around(self):
+        from greenlet import greenlet
+        seen = []
+        def f(x, y):
+            seen.append((x, y))
+            seen.append(greenlet.getcurrent().parent.switch())
+            seen.append(greenlet.getcurrent().parent.switch(42))
+            return 44, 'z'
+        g = greenlet(f)
+        seen.append(g.switch(40, 'x'))
+        seen.append(g.switch(41, 'y'))
+        seen.append(g.switch(43))
+        #
+        def f2():
+            return 45
+        g = greenlet(f2)
+        seen.append(g.switch())
+        #
+        def f3():
+            pass
+        g = greenlet(f3)
+        seen.append(g.switch())
+        #
+        assert seen == [(40, 'x'), (), (41, 'y'), 42, 43, (44, 'z'), 45, None]
+
+    def test_exception_simple(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            raise ValueError
+        #
+        g1 = greenlet(fmain)
+        raises(ValueError, g1.switch)
+
+    def test_dead(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            assert g1 and not g1.dead
+        #
+        g1 = greenlet(fmain)
+        assert not g1 and not g1.dead
+        g1.switch()
+        assert not g1 and g1.dead
+        #
+        gmain = greenlet.getcurrent()
+        assert gmain and not gmain.dead
+
+    def test_GreenletExit(self):
+        from greenlet import greenlet, GreenletExit
+        #
+        def fmain(*args):
+            raise GreenletExit(*args)
+        #
+        g1 = greenlet(fmain)
+        res = g1.switch('foo', 'bar')
+        assert isinstance(res, GreenletExit) and res.args == ('foo', 'bar')
+
+    def test_throw_1(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g = greenlet(f)
+        g.switch()
+        res = g.throw(ValueError)
+        assert res == "ok"
+
+    def test_throw_2(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            gmain.throw(ValueError)
+        #
+        g = greenlet(f)
+        raises(ValueError, g.switch)
+
+    def test_throw_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        raises(ValueError, gmain.throw, ValueError)
+
+    def test_throw_4(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            g2.throw(ValueError)
+        #
+        def f2():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.switch()
+        res = g1.switch()
+        assert res == "ok"
+
+    def test_nondefault_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_change_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_raises_through_parent_chain(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            raises(IndexError, g2.switch)
+            raise ValueError
+        #
+        def f2():
+            raise IndexError
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        raises(ValueError, g1.switch)
+
+    def test_switch_to_dead_1(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_2(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            return g2.switch()
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "ok"
+            res = gmain.switch("next step")
+            assert res == "goes to f1 instead"
+            return "all ok"
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "next step"
+        res = g2.switch("goes to f1 instead")
+        assert res == "all ok"
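
The tests above rely on one rule that is easy to miss: switching to a greenlet that has already finished falls through to its parent. A tiny stand-alone illustration of that rule (plain Python 2 with any greenlet implementation, e.g. lib_pypy/greenlet.py; not part of the test suite above):

    from greenlet import greenlet

    def child():
        return "done"

    g = greenlet(child)            # parent defaults to the current greenlet
    print g.switch()               # prints "done": child() returned
    print g.dead                   # prints True
    print g.switch("to parent")    # g is dead, so the switch goes to
                                   # g.parent (the main greenlet), i.e. back
                                   # here: prints "to parent"
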
diff --git a/pypy/module/thread/os_thread.py b/pypy/module/thread/os_thread.py
--- a/pypy/module/thread/os_thread.py
+++ b/pypy/module/thread/os_thread.py
@@ -15,11 +15,6 @@
 # * The start-up data (the app-level callable and arguments) is
 #   stored in the global bootstrapper object.
 #
-# * The GC is notified that a new thread is about to start; in the
-#   framework GC with shadow stacks, this allocates a fresh new shadow
-#   stack (but doesn't use it yet).  See gc_thread_prepare().  This
-#   has no effect in asmgcc.
-#
 # * The new thread is launched at RPython level using an rffi call
 #   to the C function RPyThreadStart() defined in
 #   translator/c/src/thread*.h.  This RPython thread will invoke the
@@ -33,8 +28,8 @@
 #   operation is called (this is all done by gil.after_external_call(),
 #   called from the rffi-generated wrapper).  The gc_thread_run()
 #   operation will automatically notice that the current thread id was
-#   not seen before, and start using the freshly prepared shadow stack.
-#   Again, this has no effect in asmgcc.
+#   not seen before, and (in shadowstack) it will allocate and use a
+#   fresh new stack.  Again, this has no effect in asmgcc.
 #
 # * Only then does bootstrap() really run.  The first thing it does
 #   is grab the start-up information (app-level callable and args)
@@ -180,7 +175,7 @@
     bootstrapper.acquire(space, w_callable, args)
     try:
         try:
-            thread.gc_thread_prepare()
+            thread.gc_thread_prepare()     # (this has no effect any more)
             ident = thread.start_new_thread(bootstrapper.bootstrap, ())
         except Exception, e:
             bootstrapper.release()     # normally called by the new thread
diff --git a/pypy/objspace/std/celldict.py b/pypy/objspace/std/celldict.py
--- a/pypy/objspace/std/celldict.py
+++ b/pypy/objspace/std/celldict.py
@@ -1,50 +1,57 @@
-""" A very simple cell dict implementation. The dictionary maps keys to cell.
-This ensures that the function (dict, key) -> cell is pure. By itself, this
-optimization is not helping at all, but in conjunction with the JIT it can
-speed up global lookups a lot."""
+""" A very simple cell dict implementation using a version tag. The dictionary
+maps keys to objects. If a specific key is changed a lot, a level of
+indirection is introduced to make the version tag change less often.
+"""
 
+from pypy.interpreter.baseobjspace import W_Root
 from pypy.objspace.std.dictmultiobject import IteratorImplementation
 from pypy.objspace.std.dictmultiobject import DictStrategy, _never_equal_to_string
 from pypy.objspace.std.dictmultiobject import ObjectDictStrategy
 from pypy.rlib import jit, rerased
 
-class ModuleCell(object):
+class VersionTag(object):
+    pass
+
+class ModuleCell(W_Root):
     def __init__(self, w_value=None):
         self.w_value = w_value
 
-    def invalidate(self):
-        w_value = self.w_value
-        self.w_value = None
-        return w_value
-
     def __repr__(self):
         return "<ModuleCell: %s>" % (self.w_value, )
 
+def unwrap_cell(w_value):
+    if isinstance(w_value, ModuleCell):
+        return w_value.w_value
+    return w_value
+
 class ModuleDictStrategy(DictStrategy):
 
     erase, unerase = rerased.new_erasing_pair("modulecell")
     erase = staticmethod(erase)
     unerase = staticmethod(unerase)
 
+    _immutable_fields_ = ["version?"]
+
     def __init__(self, space):
         self.space = space
+        self.version = VersionTag()
 
     def get_empty_storage(self):
        return self.erase({})
 
-    def getcell(self, w_dict, key, makenew):
-        if makenew or jit.we_are_jitted():
-            # when we are jitting, we always go through the pure function
-            # below, to ensure that we have no residual dict lookup
-            w_dict = jit.promote(w_dict)
-            self = jit.promote(self)
-            return self._getcell_makenew(w_dict, key)
+    def mutated(self):
+       self.version = VersionTag()
+
+    def getdictvalue_no_unwrapping(self, w_dict, key):
+        # NB: it's important to promote self here, so that the read of
+        # self.version below is a no-op, thanks to the quasi-immutable field
+        self = jit.promote(self)
+        return self._getdictvalue_no_unwrapping_pure(self.version, w_dict, key)
+
+    @jit.elidable_promote('0,1,2')
+    def _getdictvalue_no_unwrapping_pure(self, version, w_dict, key):
         return self.unerase(w_dict.dstorage).get(key, None)
 
-    @jit.elidable
-    def _getcell_makenew(self, w_dict, key):
-        return self.unerase(w_dict.dstorage).setdefault(key, ModuleCell())
-
     def setitem(self, w_dict, w_key, w_value):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
@@ -54,15 +61,24 @@
             w_dict.setitem(w_key, w_value)
 
     def setitem_str(self, w_dict, key, w_value):
-        self.getcell(w_dict, key, True).w_value = w_value
+        cell = self.getdictvalue_no_unwrapping(w_dict, key)
+        if isinstance(cell, ModuleCell):
+            cell.w_value = w_value
+            return
+        if cell is not None:
+            w_value = ModuleCell(w_value)
+        self.mutated()
+        self.unerase(w_dict.dstorage)[key] = w_value
 
     def setdefault(self, w_dict, w_key, w_default):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
-            cell = self.getcell(w_dict, space.str_w(w_key), True)
-            if cell.w_value is None:
-                cell.w_value = w_default
-            return cell.w_value
+            key = space.str_w(w_key)
+            w_result = self.getitem_str(w_dict, key)
+            if w_result is not None:
+                return w_result
+            self.setitem_str(w_dict, key, w_default)
+            return w_default
         else:
             self.switch_to_object_strategy(w_dict)
             return w_dict.setdefault(w_key, w_default)
@@ -72,14 +88,13 @@
         w_key_type = space.type(w_key)
         if space.is_w(w_key_type, space.w_str):
             key = space.str_w(w_key)
-            cell = self.getcell(w_dict, key, False)
-            if cell is None or cell.w_value is None:
-                raise KeyError
-            # note that we don't remove the cell from self.content, to make
-            # sure that a key that was found at any point in the dict, still
-            # maps to the same cell later (even if this cell no longer
-            # represents a key)
-            cell.invalidate()
+            dict_w = self.unerase(w_dict.dstorage)
+            try:
+                del dict_w[key]
+            except KeyError:
+                raise
+            else:
+                self.mutated()
         elif _never_equal_to_string(space, w_key_type):
             raise KeyError
         else:
@@ -87,12 +102,7 @@
             w_dict.delitem(w_key)
 
     def length(self, w_dict):
-        # inefficient, but do we care?
-        res = 0
-        for cell in self.unerase(w_dict.dstorage).itervalues():
-            if cell.w_value is not None:
-                res += 1
-        return res
+        return len(self.unerase(w_dict.dstorage))
 
     def getitem(self, w_dict, w_key):
         space = self.space
@@ -107,11 +117,8 @@
             return w_dict.getitem(w_key)
 
     def getitem_str(self, w_dict, key):
-        res = self.getcell(w_dict, key, False)
-        if res is None:
-            return None
-        # note that even if the res.w_value is None, the next line is fine
-        return res.w_value
+        w_res = self.getdictvalue_no_unwrapping(w_dict, key)
+        return unwrap_cell(w_res)
 
     def iter(self, w_dict):
         return ModuleDictIteratorImplementation(self.space, self, w_dict)
@@ -119,44 +126,34 @@
     def keys(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.wrap(key) for key, cell in iterator()
-                    if cell.w_value is not None]
+        return [space.wrap(key) for key, cell in iterator()]
 
     def values(self, w_dict):
         iterator = self.unerase(w_dict.dstorage).itervalues
-        return [cell.w_value for cell in iterator()
-                    if cell.w_value is not None]
+        return [unwrap_cell(cell) for cell in iterator()]
 
     def items(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.newtuple([space.wrap(key), cell.w_value])
-                    for (key, cell) in iterator()
-                        if cell.w_value is not None]
+        return [space.newtuple([space.wrap(key), unwrap_cell(cell)])
+                    for key, cell in iterator()]
 
     def clear(self, w_dict):
-        iterator = self.unerase(w_dict.dstorage).iteritems
-        for k, cell in iterator():
-            cell.invalidate()
+        self.unerase(w_dict.dstorage).clear()
+        self.mutated()
 
     def popitem(self, w_dict):
-        # This is O(n) if called repeatadly, you probably shouldn't be on a
-        # Module's dict though
-        for k, cell in self.unerase(w_dict.dstorage).iteritems():
-            if cell.w_value is not None:
-                w_value = cell.w_value
-                cell.invalidate()
-                return self.space.wrap(k), w_value
-        else:
-            raise KeyError
+        d = self.unerase(w_dict.dstorage)
+        key, w_value = d.popitem()
+        self.mutated()
+        return self.space.wrap(key), unwrap_cell(w_value)
 
     def switch_to_object_strategy(self, w_dict):
         d = self.unerase(w_dict.dstorage)
         strategy = self.space.fromcache(ObjectDictStrategy)
         d_new = strategy.unerase(strategy.get_empty_storage())
         for key, cell in d.iteritems():
-            if cell.w_value is not None:
-                d_new[self.space.wrap(key)] = cell.w_value
+            d_new[self.space.wrap(key)] = unwrap_cell(cell)
         w_dict.strategy = strategy
         w_dict.dstorage = strategy.erase(d_new)
 
@@ -168,7 +165,6 @@
 
     def next_entry(self):
         for key, cell in self.iterator:
-            if cell.w_value is not None:
-                return (self.space.wrap(key), cell.w_value)
+            return (self.space.wrap(key), unwrap_cell(cell))
         else:
             return None, None
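
To make the mechanism above easier to follow, here is a minimal plain-Python sketch of the same idea (illustrative only: VersionTag and ModuleCell mirror the RPython classes, but CellDict is an invented stand-in for ModuleDictStrategy plus its storage). Keys map to their value directly; a key that gets rebound is switched to a ModuleCell; and only changes to the mapping itself replace the version tag, which a JIT can then treat as a constant.

    class VersionTag(object):
        pass

    class ModuleCell(object):
        def __init__(self, value=None):
            self.value = value

    class CellDict(object):
        def __init__(self):
            self.storage = {}
            self.version = VersionTag()

        def mutated(self):
            self.version = VersionTag()

        def setitem(self, key, value):
            cell = self.storage.get(key)
            if isinstance(cell, ModuleCell):
                cell.value = value         # in-place update: version unchanged
                return
            if cell is not None:
                value = ModuleCell(value)  # second write: add the indirection
            self.mutated()                 # new key or new cell: new version
            self.storage[key] = value

        def getitem(self, key):
            # a JIT can cache the result of this lookup per (version, key)
            w = self.storage.get(key)
            if isinstance(w, ModuleCell):
                return w.value
            return w

    d = CellDict()
    d.setitem("a", 1); v1 = d.version
    d.setitem("a", 2); v2 = d.version      # cell introduced, version replaced
    d.setitem("a", 3); v3 = d.version      # cell mutated, version unchanged
    assert v1 is not v2 and v2 is v3
    assert d.getitem("a") == 3
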
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -38,7 +38,9 @@
         if space.config.objspace.std.withcelldict and module:
             from pypy.objspace.std.celldict import ModuleDictStrategy
             assert w_type is None
-            strategy = space.fromcache(ModuleDictStrategy)
+            # every module needs its own strategy, because the strategy stores
+            # the version tag
+            strategy = ModuleDictStrategy(space)
 
         elif instance or strdict or module:
             assert w_type is None
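
The comment above is the flip side of the version tag: since the strategy object now carries the tag, it can no longer be shared through space.fromcache(). Reusing the CellDict sketch shown after celldict.py above (so this snippet assumes that definition), each module dict keeping its own tag means a write into one module cannot invalidate cached global lookups of another:

    m1, m2 = CellDict(), CellDict()   # one version tag per module dict
    v = m1.version
    m2.setitem("x", 1)                # does not touch m1.version
    assert m1.version is v
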
diff --git a/pypy/objspace/std/test/test_celldict.py b/pypy/objspace/std/test/test_celldict.py
--- a/pypy/objspace/std/test/test_celldict.py
+++ b/pypy/objspace/std/test/test_celldict.py
@@ -2,42 +2,111 @@
 from pypy.conftest import gettestobjspace, option
 from pypy.objspace.std.dictmultiobject import W_DictMultiObject
 from pypy.objspace.std.celldict import ModuleCell, ModuleDictStrategy
-from pypy.objspace.std.test.test_dictmultiobject import FakeSpace
+from pypy.objspace.std.test.test_dictmultiobject import FakeSpace, \
+        BaseTestRDictImplementation, BaseTestDevolvedDictImplementation
 from pypy.interpreter import gateway
 
+from pypy.conftest import gettestobjspace, option
+
 space = FakeSpace()
 
 class TestCellDict(object):
-    def test_basic_property(self):
+    def test_basic_property_cells(self):
         strategy = ModuleDictStrategy(space)
         storage = strategy.get_empty_storage()
         d = W_DictMultiObject(space, strategy, storage)
 
-        # replace getcell with getcell from strategy
-        def f(key, makenew):
-            return strategy.getcell(d, key, makenew)
-        d.getcell = f
+        v1 = strategy.version
+        d.setitem("a", 1)
+        v2 = strategy.version
+        assert v1 is not v2
+        assert d.getitem("a") == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") == 1
 
-        d.setitem("a", 1)
-        assert d.getcell("a", False) is d.getcell("a", False)
-        acell = d.getcell("a", False)
-        d.setitem("b", 2)
-        assert d.getcell("b", False) is d.getcell("b", False)
-        assert d.getcell("c", True) is d.getcell("c", True)
+        d.setitem("a", 2)
+        v3 = strategy.version
+        assert v2 is not v3
+        assert d.getitem("a") == 2
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 2
 
-        assert d.getitem("a") == 1
-        assert d.getitem("b") == 2
+        d.setitem("a", 3)
+        v4 = strategy.version
+        assert v3 is v4
+        assert d.getitem("a") == 3
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 3
 
         d.delitem("a")
-        py.test.raises(KeyError, d.delitem, "a")
+        v5 = strategy.version
+        assert v5 is not v4
         assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") is None
 
-        d.clear()
-        assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 0
+class AppTestModuleDict(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
+
+    def w_impl_used(self, obj):
+        if option.runappdirect:
+            py.test.skip("__repr__ doesn't work on appdirect")
+        import __pypy__
+        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
+
+    def test_check_module_uses_module_dict(self):
+        m = type(__builtins__)("abc")
+        self.impl_used(m.__dict__)
+
+    def test_key_not_there(self):
+        d = type(__builtins__)("abc").__dict__
+        raises(KeyError, "d['def']")
+
+    def test_fallback_evil_key(self):
+        class F(object):
+            def __hash__(self):
+                return hash("s")
+            def __eq__(self, other):
+                return other == "s"
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        assert d["s"] == 12
+        assert d[F()] == d["s"]
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault("s", 12)
+        assert x == 12
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        del d[F()]
+
+        assert "s" not in d
+        assert F() not in d
+
+
+class TestModuleDictImplementation(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
+
+class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
 
 class AppTestCellDict(object):
     OPTIONS = {"objspace.std.withcelldict": True}
@@ -67,4 +136,4 @@
         d["a"] = 3
         del d["a"]
         d[object()] = 5
-        assert d.values() == [5]
\ No newline at end of file
+        assert d.values() == [5]
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -5,7 +5,6 @@
      W_DictMultiObject, setitem__DictMulti_ANY_ANY, getitem__DictMulti_ANY, \
      StringDictStrategy, ObjectDictStrategy
 
-from pypy.objspace.std.celldict import ModuleDictStrategy
 from pypy.conftest import gettestobjspace
 from pypy.conftest import option
 
@@ -731,52 +730,6 @@
                 set([('a', 1), ('b', 2), ('d', 4), ('e', 5)]))
 
 
-class AppTestModuleDict(object):
-    def setup_class(cls):
-        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
-        if option.runappdirect:
-            py.test.skip("__repr__ doesn't work on appdirect")
-
-    def w_impl_used(self, obj):
-        import __pypy__
-        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
-
-    def test_check_module_uses_module_dict(self):
-        m = type(__builtins__)("abc")
-        self.impl_used(m.__dict__)
-
-    def test_key_not_there(self):
-        d = type(__builtins__)("abc").__dict__
-        raises(KeyError, "d['def']")
-
-    def test_fallback_evil_key(self):
-        class F(object):
-            def __hash__(self):
-                return hash("s")
-            def __eq__(self, other):
-                return other == "s"
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        assert d["s"] == 12
-        assert d[F()] == d["s"]
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault("s", 12)
-        assert x == 12
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        del d[F()]
-
-        assert "s" not in d
-        assert F() not in d
-
 class AppTestStrategies(object):
     def setup_class(cls):
         if option.runappdirect:
@@ -1071,16 +1024,6 @@
 ##     ImplementionClass = MeasuringDictImplementation
 ##     DevolvedClass = MeasuringDictImplementation
 
-class TestModuleDictImplementation(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
-
 class BaseTestDevolvedDictImplementation(BaseTestRDictImplementation):
     def fill_impl(self):
         BaseTestRDictImplementation.fill_impl(self)
@@ -1092,15 +1035,6 @@
 class TestDevolvedStrDictImplementation(BaseTestDevolvedDictImplementation):
     StrategyClass = StringDictStrategy
 
-class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
 
 def test_module_uses_strdict():
     fakespace = FakeSpace()
diff --git a/pypy/pytest-A-stackless.cfg b/pypy/pytest-A-stackless.cfg
deleted file mode 100644
--- a/pypy/pytest-A-stackless.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-# run for some directories a file at a time
-
-def collect_one_testdir(testdirs, reldir, tests):
-    if (reldir.startswith('module/_stackless/') or
-        reldir.startswith('lib')):
-        testdirs.extend(tests)
-    else:     
-        testdirs.append(reldir)
-
-    
diff --git a/pypy/rlib/_rffi_stacklet.py b/pypy/rlib/_rffi_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_rffi_stacklet.py
@@ -0,0 +1,49 @@
+import py
+from pypy.tool.autopath import pypydir
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.rpython.tool import rffi_platform
+
+
+cdir = py.path.local(pypydir) / 'translator' / 'c'
+
+
+eci = ExternalCompilationInfo(
+    include_dirs = [cdir],
+    includes = ['src/stacklet/stacklet.h'],
+    separate_module_sources = ['#include "src/stacklet/stacklet.c"\n'],
+)
+rffi_platform.verify_eci(eci.convert_sources_to_files())
+
+def llexternal(name, args, result, **kwds):
+    return rffi.llexternal(name, args, result, compilation_info=eci,
+                           _nowrapper=True, **kwds)
+
+# ----- types -----
+
+handle = rffi.COpaquePtr(typedef='stacklet_handle', compilation_info=eci)
+thread_handle = rffi.COpaquePtr(typedef='stacklet_thread_handle',
+                                compilation_info=eci)
+run_fn = lltype.Ptr(lltype.FuncType([handle, llmemory.Address], handle))
+
+# ----- constants -----
+
+null_handle = lltype.nullptr(handle.TO)
+
+def is_empty_handle(h):
+    return rffi.cast(lltype.Signed, h) == -1
+
+# ----- functions -----
+
+newthread = llexternal('stacklet_newthread', [], thread_handle)
+deletethread = llexternal('stacklet_deletethread',[thread_handle], lltype.Void)
+
+new = llexternal('stacklet_new', [thread_handle, run_fn, llmemory.Address],
+                 handle, random_effects_on_gcobjs=True)
+switch = llexternal('stacklet_switch', [thread_handle, handle], handle,
+                    random_effects_on_gcobjs=True)
+destroy = llexternal('stacklet_destroy', [thread_handle, handle], lltype.Void)
+
+_translate_pointer = llexternal("_stacklet_translate_pointer",
+                                [handle, llmemory.Address],
+                                llmemory.Address)
diff --git a/pypy/rlib/_rsocket_rffi.py b/pypy/rlib/_rsocket_rffi.py
--- a/pypy/rlib/_rsocket_rffi.py
+++ b/pypy/rlib/_rsocket_rffi.py
@@ -489,10 +489,10 @@
 getnameinfo = external('getnameinfo', [sockaddr_ptr, socklen_t, CCHARP,
                        size_t, CCHARP, size_t, rffi.INT], rffi.INT)
 
-htonl = external('htonl', [rffi.UINT], rffi.UINT)
-htons = external('htons', [rffi.USHORT], rffi.USHORT)
-ntohl = external('ntohl', [rffi.UINT], rffi.UINT)
-ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT)
+htonl = external('htonl', [rffi.UINT], rffi.UINT, threadsafe=False)
+htons = external('htons', [rffi.USHORT], rffi.USHORT, threadsafe=False)
+ntohl = external('ntohl', [rffi.UINT], rffi.UINT, threadsafe=False)
+ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT, threadsafe=False)
 
 if _POSIX:
     inet_aton = external('inet_aton', [CCHARP, lltype.Ptr(in_addr)],
diff --git a/pypy/rlib/_stacklet_asmgcc.py b/pypy/rlib/_stacklet_asmgcc.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_asmgcc.py
@@ -0,0 +1,277 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.rpython.annlowlevel import llhelper
+
+
+_asmstackrootwalker = None    # BIG HACK: monkey-patched by asmgcroot.py
+_stackletrootwalker = None
+
+def get_stackletrootwalker():
+    # lazily called, to make the following imports lazy
+    global _stackletrootwalker
+    if _stackletrootwalker is not None:
+        return _stackletrootwalker
+
+    from pypy.rpython.memory.gctransform.asmgcroot import (
+        WALKFRAME, CALLEE_SAVED_REGS, INDEX_OF_EBP, sizeofaddr)
+
+    assert _asmstackrootwalker is not None, "should have been monkey-patched"
+    basewalker = _asmstackrootwalker
+
+    class StackletRootWalker(object):
+        _alloc_flavor_ = "raw"
+
+        enumerating = False
+
+        def setup(self, obj):
+            # initialization: read the SUSPSTACK object
+            p = llmemory.cast_adr_to_ptr(obj, lltype.Ptr(SUSPSTACK))
+            if not p.handle:
+                return False
+            self.context = p.handle
+            anchor = p.anchor
+            del p
+            self.curframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.otherframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.fill_initial_frame(self.curframe, anchor)
+            return True
+
+        def fill_initial_frame(self, curframe, initialframedata):
+            # Copy&paste :-(
+            initialframedata += 2*sizeofaddr
+            reg = 0
+            while reg < CALLEE_SAVED_REGS:
+                curframe.regs_stored_at[reg] = initialframedata+reg*sizeofaddr
+                reg += 1
+            retaddraddr = initialframedata + CALLEE_SAVED_REGS * sizeofaddr
+            retaddraddr = self.translateptr(retaddraddr)
+            curframe.frame_address = retaddraddr.address[0]
+
+        def teardown(self):
+            lltype.free(self.curframe, flavor='raw')
+            lltype.free(self.otherframe, flavor='raw')
+            self.context = lltype.nullptr(_c.handle.TO)
+            return llmemory.NULL
+
+        def next(self, obj, prev):
+            #
+            # Pointers to the stack can be "translated" or not:
+            #
+            #   * Non-translated pointers point to where the data would be
+            #     if the stack was installed and running.
+            #
+            #   * Translated pointers correspond to where the data
+            #     is now really in memory.
+            #
+            # Note that 'curframe' contains non-translated pointers, and
+            # of course the stack itself is full of non-translated pointers.
+            #
+            while True:
+                if not self.enumerating:
+                    if not prev:
+                        if not self.setup(obj):      # one-time initialization
+                            return llmemory.NULL
+                        prev = obj   # random value, but non-NULL
+                    callee = self.curframe
+                    retaddraddr = self.translateptr(callee.frame_address)
+                    retaddr = retaddraddr.address[0]
+                    basewalker.locate_caller_based_on_retaddr(retaddr)
+                    self.enumerating = True
+                #
+                # not really a loop, but kept this way for similarity
+                # with asmgcroot:
+                callee = self.curframe
+                ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP]
+                ebp_in_caller = self.translateptr(ebp_in_caller)
+                ebp_in_caller = ebp_in_caller.address[0]
+                while True:
+                    location = basewalker._shape_decompressor.next()
+                    if location == 0:
+                        break
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    # yield the translated addr of the next GCREF in the stack
+                    return self.translateptr(addr)
+                #
+                self.enumerating = False
+                caller = self.otherframe
+                reg = CALLEE_SAVED_REGS - 1
+                while reg >= 0:
+                    location = basewalker._shape_decompressor.next()
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    caller.regs_stored_at[reg] = addr   # non-translated
+                    reg -= 1
+
+                location = basewalker._shape_decompressor.next()
+                caller.frame_address = basewalker.getlocation(callee,
+                                                              ebp_in_caller,
+                                                              location)
+                # ^^^ non-translated
+                if caller.frame_address == llmemory.NULL:
+                    return self.teardown()    # completely done with this stack
+                #
+                self.otherframe = callee
+                self.curframe = caller
+                # loop back
+
+        def translateptr(self, addr):
+            return _c._translate_pointer(self.context, addr)
+
+    _stackletrootwalker = StackletRootWalker()
+    return _stackletrootwalker
+get_stackletrootwalker._annspecialcase_ = 'specialize:memo'
+
+
+def customtrace(obj, prev):
+    stackletrootwalker = get_stackletrootwalker()
+    return stackletrootwalker.next(obj, prev)
+
+
+SUSPSTACK = lltype.GcStruct('SuspStack',
+                            ('handle', _c.handle),
+                            ('anchor', llmemory.Address),
+                            rtti=True)
+NULL_SUSPSTACK = lltype.nullptr(SUSPSTACK)
+CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                  llmemory.Address)
+customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+lltype.attachRuntimeTypeInfo(SUSPSTACK, customtraceptr=customtraceptr)
+
+ASM_FRAMEDATA_HEAD_PTR = lltype.Ptr(lltype.ForwardReference())
+ASM_FRAMEDATA_HEAD_PTR.TO.become(lltype.Struct('ASM_FRAMEDATA_HEAD',
+        ('prev', ASM_FRAMEDATA_HEAD_PTR),
+        ('next', ASM_FRAMEDATA_HEAD_PTR)
+    ))
+alternateanchor = lltype.malloc(ASM_FRAMEDATA_HEAD_PTR.TO,
+                                immortal=True)
+alternateanchor.prev = alternateanchor
+alternateanchor.next = alternateanchor
+
+FUNCNOARG_P = lltype.Ptr(lltype.FuncType([], _c.handle))
+pypy_asm_stackwalk2 = rffi.llexternal('pypy_asm_stackwalk',
+                                      [FUNCNOARG_P,
+                                       ASM_FRAMEDATA_HEAD_PTR],
+                                      _c.handle, sandboxsafe=True,
+                                      _nowrapper=True)
+
+
+def _new_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and create a new
+    # stacklet with stacklet_new().  If this call fails, then we
+    # are just returning NULL.
+    _stack_just_closed()
+    return _c.new(gcrootfinder.thrd, llhelper(_c.run_fn, _new_runfn),
+                  llmemory.NULL)
+
+def _stack_just_closed():
+    # Immediately unlink the new stackanchor from the doubly-linked
+    # chained list.  When returning from pypy_asm_stackwalk2, the
+    # assembler code will try to unlink it again, which should be
+    # a no-op given that the doubly-linked list is empty.
+    stackanchor = llmemory.cast_ptr_to_adr(alternateanchor.next)
+    gcrootfinder.suspstack.anchor = stackanchor
+    alternateanchor.prev = alternateanchor
+    alternateanchor.next = alternateanchor
+
+def _new_runfn(h, _):
+    # Here, we are in a fresh new stacklet.
+    llop.gc_stack_bottom(lltype.Void)   # marker for trackgcroot.py
+    #
+    # There is a fresh suspstack object waiting on the gcrootfinder,
+    # so populate it with data that represents the parent suspended
+    # stacklet and detach the suspstack object from gcrootfinder.
+    suspstack = gcrootfinder.attach_handle_on_suspstack(h)
+    #
+    # Call the main function provided by the (RPython) user.
+    suspstack = gcrootfinder.runfn(suspstack, gcrootfinder.arg)
+    #
+    # Here, suspstack points to the target stacklet that we want to
+    # jump to next.  Read the 'handle' and forget about the
+    # suspstack object.
+    return _consume_suspstack(suspstack)
+
+def _consume_suspstack(suspstack):
+    h = suspstack.handle
+    ll_assert(bool(h), "_consume_suspstack: null handle")
+    suspstack.handle = _c.null_handle
+    return h
+
+def _switch_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and switch to this
+    # suspstack with stacklet_switch().  If this call fails, then we
+    # are just returning NULL.
+    oldanchor = gcrootfinder.suspstack.anchor
+    _stack_just_closed()
+    h = _consume_suspstack(gcrootfinder.suspstack)
+    #
+    # gcrootfinder.suspstack.anchor is left with the anchor of the
+    # previous place (i.e. before the call to switch()).
+    h2 = _c.switch(gcrootfinder.thrd, h)
+    #
+    if not h2:    # MemoryError: restore
+        gcrootfinder.suspstack.anchor = oldanchor
+        gcrootfinder.suspstack.handle = h
+    return h2
+
+
+class StackletGcRootFinder(object):
+    suspstack = NULL_SUSPSTACK
+
+    def new(self, thrd, callback, arg):
+        self.thrd = thrd._thrd
+        self.runfn = callback
+        self.arg = arg
+        # make a fresh new clean SUSPSTACK
+        newsuspstack = lltype.malloc(SUSPSTACK)
+        newsuspstack.handle = _c.null_handle
+        self.suspstack = newsuspstack
+        # Invoke '_new_callback' by closing the stack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _new_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def switch(self, thrd, suspstack):
+        self.thrd = thrd._thrd
+        self.suspstack = suspstack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _switch_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def attach_handle_on_suspstack(self, handle):
+        s = self.suspstack
+        self.suspstack = NULL_SUSPSTACK
+        ll_assert(bool(s.anchor), "s.anchor should not be null")
+        s.handle = handle
+        llop.gc_assume_young_pointers(lltype.Void, llmemory.cast_ptr_to_adr(s))
+        return s
+
+    def get_result_suspstack(self, h):
+        #
+        # Return from a new() or a switch(): 'h' is a handle, possibly
+        # an empty one, that says from where we switched to.
+        if not h:
+            raise MemoryError
+        elif _c.is_empty_handle(h):
+            return NULL_SUSPSTACK
+        else:
+            # This is a return that gave us a real handle.  Store it.
+            return self.attach_handle_on_suspstack(h)
+
+    def destroy(self, thrd, suspstack):
+        h = suspstack.handle
+        suspstack.handle = _c.null_handle
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(self, suspstack):
+        return not suspstack
+
+    def get_null_handle(self):
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
diff --git a/pypy/rlib/_stacklet_n_a.py b/pypy/rlib/_stacklet_n_a.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_n_a.py
@@ -0,0 +1,31 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.annlowlevel import llhelper
+from pypy.tool.staticmethods import StaticMethods
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        h = _c.new(thrd._thrd, llhelper(_c.run_fn, callback), arg)
+        if not h:
+            raise MemoryError
+        return h
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(thrd, h):
+        h = _c.switch(thrd._thrd, h)
+        if not h:
+            raise MemoryError
+        return h
+
+    def destroy(thrd, h):
+        _c.destroy(thrd._thrd, h)
+
+    is_empty_handle = _c.is_empty_handle
+
+    def get_null_handle():
+        return _c.null_handle
+
+
+gcrootfinder = StackletGcRootFinder    # class object
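
StackletGcRootFinder above is used as a plain namespace of functions, via the StaticMethods metaclass. A rough sketch of what such a metaclass is assumed to do (pypy.tool.staticmethods is not part of this diff, so this is an illustration rather than its actual source):

    import types

    class StaticMethodsSketch(type):
        # wrap every plain function defined in the class body in
        # staticmethod(), so the class object itself behaves like a
        # module-level bag of functions
        def __new__(cls, name, bases, dict_):
            for key, value in list(dict_.items()):
                if isinstance(value, types.FunctionType):
                    dict_[key] = staticmethod(value)
            return type.__new__(cls, name, bases, dict_)
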
diff --git a/pypy/rlib/_stacklet_shadowstack.py b/pypy/rlib/_stacklet_shadowstack.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_shadowstack.py
@@ -0,0 +1,108 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.annlowlevel import llhelper
+from pypy.rpython.lltypesystem import lltype, llmemory
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.tool.staticmethods import StaticMethods
+
+
+NULL_SUSPSTACK = lltype.nullptr(llmemory.GCREF.TO)
+
+
+def _new_callback(h, arg):
+    # We still have the old shadowstack active at this point; save it
+    # away, and start a fresh new one
+    oldsuspstack = gcrootfinder.oldsuspstack
+    llop.gc_save_current_state_away(lltype.Void,
+                                    oldsuspstack, h)
+    llop.gc_start_fresh_new_state(lltype.Void)
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    #
+    newsuspstack = gcrootfinder.callback(oldsuspstack, arg)
+    #
+    # Finishing this stacklet.
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = newsuspstack
+    h = llop.gc_shadowstackref_context(llmemory.Address, newsuspstack)
+    return llmemory.cast_adr_to_ptr(h, _c.handle)
+
+def prepare_old_suspstack():
+    if not gcrootfinder.oldsuspstack:   # else reuse the one still there
+        _allocate_old_suspstack()
+
+def _allocate_old_suspstack():
+    suspstack = llop.gc_shadowstackref_new(llmemory.GCREF)
+    gcrootfinder.oldsuspstack = suspstack
+_allocate_old_suspstack._dont_inline_ = True
+
+def get_result_suspstack(h):
+    # Now we are in the target, after the switch() or the new().
+    # Note that this whole module was carefully written in such a way as
+    # not to invoke pushing/popping things off the shadowstack at
+    # unexpected moments...
+    oldsuspstack = gcrootfinder.oldsuspstack
+    newsuspstack = gcrootfinder.newsuspstack
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = NULL_SUSPSTACK
+    if not h:
+        raise MemoryError
+    # We still have the old shadowstack active at this point; save it
+    # away, and restore the new one
+    if oldsuspstack:
+        ll_assert(not _c.is_empty_handle(h),"unexpected empty stacklet handle")
+        llop.gc_save_current_state_away(lltype.Void, oldsuspstack, h)
+    else:
+        ll_assert(_c.is_empty_handle(h),"unexpected non-empty stacklet handle")
+        llop.gc_forget_current_state(lltype.Void)
+    #
+    llop.gc_restore_state_from(lltype.Void, newsuspstack)
+    #
+    # From this point on, 'newsuspstack' is consumed and done, its
+    # shadow stack installed as the current one.  It should not be
+    # used any more.  For performance, we avoid it being deallocated
+    # by letting it be reused on the next switch.
+    gcrootfinder.oldsuspstack = newsuspstack
+    # Return.
+    return oldsuspstack
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        gcrootfinder.callback = callback
+        thread_handle = thrd._thrd
+        prepare_old_suspstack()
+        h = _c.new(thread_handle, llhelper(_c.run_fn, _new_callback), arg)
+        return get_result_suspstack(h)
+    new._dont_inline_ = True
+
+    def switch(thrd, suspstack):
+        # suspstack has a handle to target, i.e. where to switch to
+        ll_assert(suspstack != gcrootfinder.oldsuspstack,
+                  "stacklet: invalid use")
+        gcrootfinder.newsuspstack = suspstack
+        thread_handle = thrd._thrd
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        prepare_old_suspstack()
+        h = _c.switch(thread_handle, h)
+        return get_result_suspstack(h)
+    switch._dont_inline_ = True
+
+    def destroy(thrd, suspstack):
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        llop.gc_shadowstackref_destroy(lltype.Void, suspstack)
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(suspstack):
+        return not suspstack
+
+    def get_null_handle():
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
+gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+gcrootfinder.newsuspstack = NULL_SUSPSTACK
diff --git a/pypy/rlib/debug.py b/pypy/rlib/debug.py
--- a/pypy/rlib/debug.py
+++ b/pypy/rlib/debug.py
@@ -26,6 +26,7 @@
         llop.debug_print_traceback(lltype.Void)
     llop.debug_fatalerror(lltype.Void, msg)
 fatalerror._dont_inline_ = True
+fatalerror._annspecialcase_ = 'specialize:arg(1)'
 
 
 class DebugLog(list):
diff --git a/pypy/rlib/rcoroutine.py b/pypy/rlib/rcoroutine.py
--- a/pypy/rlib/rcoroutine.py
+++ b/pypy/rlib/rcoroutine.py
@@ -29,6 +29,11 @@
 The type of a switch is determined by the target's costate.
 """
 
+import py; py.test.skip("fixme: rewrite using rlib.rstacklet")
+# XXX ^^^ the reason it is not done is that pypy.rlib.rcoroutine
+# plus pypy/module/_stackless look like faaaaaar too much code
+# to me :-(
+
 from pypy.rlib.rstack import yield_current_frame_to_caller
 from pypy.rlib.objectmodel import we_are_translated
 
diff --git a/pypy/rlib/rgc.py b/pypy/rlib/rgc.py
--- a/pypy/rlib/rgc.py
+++ b/pypy/rlib/rgc.py
@@ -15,132 +15,8 @@
     pass
 
 # ____________________________________________________________
-# Framework GC features
-
-class GcPool(object):
-    pass
-
-def gc_swap_pool(newpool):
-    """Set newpool as the current pool (create one if newpool is None).
-    All malloc'ed objects are put into the current pool;this is a
-    way to separate objects depending on when they were allocated.
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-def gc_clone(gcobject, pool):
-    """Recursively clone the gcobject and everything it points to,
-    directly or indirectly -- but stops at objects that are not
-    in the specified pool.  Pool can be None to mean the current one.
-    A new pool is built to contain the copies.  Return (newobject, newpool).
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-# ____________________________________________________________
 # Annotation and specialization
 
-class GcPoolEntry(ExtRegistryEntry):
-    "Link GcPool to its Repr."
-    _type_ = GcPool
-
-    def get_repr(self, rtyper, s_pool):
-        config = rtyper.getconfig()
-        # if the gc policy doesn't support allocation pools, lltype
-        # pools as Void.
-        if config.translation.gc != 'marksweep':
-            from pypy.annotation.model import s_None
-            return rtyper.getrepr(s_None)
-        else:
-            from pypy.rpython.rmodel import SimplePointerRepr
-            from pypy.rpython.memory.gc.marksweep import X_POOL_PTR
-            return SimplePointerRepr(X_POOL_PTR)
-
-
-class SwapPoolFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_swap_pool()."
-    _about_ = gc_swap_pool
-
-    def compute_result_annotation(self, s_newpool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeExternalObject(GcPool)
-
-    def specialize_call(self, hop):
-        from pypy.annotation import model as annmodel
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-
-        opname = 'gc_x_swap_pool'
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # when the gc policy doesn't support pools, just return
-            # the argument (which is lltyped as Void anyway)
-            opname = 'same_as'
-            
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        vlist = hop.inputargs(r_pool_ptr)
-        return hop.genop(opname, vlist, resulttype = r_pool_ptr)
-
-def _raise():
-    raise RuntimeError
-
-class CloneFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_clone()."
-    _about_ = gc_clone
-
-    def compute_result_annotation(self, s_gcobject, s_pool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeTuple([s_gcobject,
-                                   annmodel.SomeExternalObject(GcPool)])
-
-    def specialize_call(self, hop):
-        from pypy.rpython.error import TyperError
-        from pypy.rpython.lltypesystem import rtuple
-        from pypy.annotation import model as annmodel
-        from pypy.rpython.memory.gc.marksweep import X_CLONE, X_CLONE_PTR
-
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # if the gc policy does not support allocation pools,
-            # gc_clone always raises RuntimeError
-            hop.exception_is_here()
-            hop.gendirectcall(_raise)
-            s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-            r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-            r_tuple = hop.r_result
-            v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-            return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
-        r_gcobject = hop.args_r[0]
-        if (not isinstance(r_gcobject.lowleveltype, lltype.Ptr) or
-            r_gcobject.lowleveltype.TO._gckind != 'gc'):
-            raise TyperError("gc_clone() can only clone a dynamically "
-                             "allocated object;\ngot %r" % (r_gcobject,))
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        r_tuple = hop.r_result
-
-        c_CLONE       = hop.inputconst(lltype.Void, X_CLONE)
-        c_flags       = hop.inputconst(lltype.Void, {'flavor': 'gc'})
-        c_gcobjectptr = hop.inputconst(lltype.Void, "gcobjectptr")
-        c_pool        = hop.inputconst(lltype.Void, "pool")
-
-        v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-        v_gcobjectptr = hop.genop('cast_opaque_ptr', [v_gcobject],
-                                  resulttype = llmemory.GCREF)
-        v_clonedata = hop.genop('malloc', [c_CLONE, c_flags],
-                                resulttype = X_CLONE_PTR)
-        hop.genop('setfield', [v_clonedata, c_gcobjectptr, v_gcobjectptr])
-        hop.genop('setfield', [v_clonedata, c_pool, v_pool])
-        hop.exception_is_here()
-        hop.genop('gc_x_clone', [v_clonedata])
-        v_gcobjectptr = hop.genop('getfield', [v_clonedata, c_gcobjectptr],
-                                  resulttype = llmemory.GCREF)
-        v_pool        = hop.genop('getfield', [v_clonedata, c_pool],
-                                  resulttype = r_pool_ptr)
-        v_gcobject = hop.genop('cast_opaque_ptr', [v_gcobjectptr],
-                               resulttype = r_tuple.items_r[0])
-        return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
 # Support for collection.
 
 class CollectEntry(ExtRegistryEntry):
diff --git a/pypy/rlib/rstack.py b/pypy/rlib/rstack.py
--- a/pypy/rlib/rstack.py
+++ b/pypy/rlib/rstack.py
@@ -14,25 +14,6 @@
 from pypy.rpython.controllerentry import Controller, SomeControlledInstance
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 
-def stack_unwind():
-    if we_are_translated():
-        return llop.stack_unwind(lltype.Void)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_capture():
-    if we_are_translated():
-        ptr = llop.stack_capture(OPAQUE_STATE_HEADER_PTR)
-        return frame_stack_top_controller.box(ptr)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_frames_depth():
-    if we_are_translated():
-        return llop.stack_frames_depth(lltype.Signed)
-    else:
-        return len(inspect.stack())
-
 # ____________________________________________________________
 
 compilation_info = ExternalCompilationInfo(includes=['src/stack.h'])
@@ -88,78 +69,6 @@
 @rgc.no_collect
 def stack_check_slowpath(current):
     if ord(_stack_too_big_slowpath(current)):
-        # Now we are sure that the stack is really too big.  Note that the
-        # stack_unwind implementation is different depending on if stackless
-        # is enabled. If it is it unwinds the stack, otherwise it simply
-        # raises a RuntimeError.
-        stack_unwind()
+        from pypy.rlib.rstackovf import _StackOverflow
+        raise _StackOverflow
 stack_check_slowpath._dont_inline_ = True
-
-# ____________________________________________________________
-
-def yield_current_frame_to_caller():
-    raise NotImplementedError("only works in translated versions")
-
-
-class frame_stack_top(object):
-    def switch(self):
-        raise NotImplementedError("only works in translated versions")
-
-
-class BoundSwitchOfFrameStackTop(object): pass
-class BoundSwitchOfFrameStackTopController(Controller):
-    knowntype = BoundSwitchOfFrameStackTop
-    def call(self, real_object):
-        from pypy.rpython.lltypesystem.lloperation import llop
-        ptr = llop.stack_switch(OPAQUE_STATE_HEADER_PTR, real_object)
-        return frame_stack_top_controller.box(ptr)
-
-
-class FrameStackTopController(Controller):
-    knowntype = frame_stack_top
-    can_be_None = True
-
-    def is_true(self, real_object):
-        return bool(real_object)
-
-    def get_switch(self, real_object):
-        return bound_switch_of_frame_stack_top_controller.box(real_object)
-
-    def convert(self, obj):
-        assert obj is None
-        return lltype.nullptr(OPAQUE_STATE_HEADER_PTR.TO)
-
-frame_stack_top_controller = FrameStackTopController()
-bound_switch_of_frame_stack_top_controller = BoundSwitchOfFrameStackTopController()
-OPAQUE_STATE_HEADER = lltype.GcOpaqueType("OPAQUE_STATE_HEADER", hints={"render_structure": True})
-OPAQUE_STATE_HEADER_PTR = lltype.Ptr(OPAQUE_STATE_HEADER)
-
-
-
-class FrameStackTopReturningFnEntry(ExtRegistryEntry):
-    def compute_result_annotation(self):
-        from pypy.annotation import model as annmodel
-        return SomeControlledInstance(annmodel.lltype_to_annotation(OPAQUE_STATE_HEADER_PTR), frame_stack_top_controller)
-
-
-class YieldCurrentFrameToCallerFnEntry(FrameStackTopReturningFnEntry):
-    _about_ = yield_current_frame_to_caller
-
-    def specialize_call(self, hop):
-        var = hop.genop("yield_current_frame_to_caller", [], hop.r_result.lowleveltype)
-        return var
-
-
-# ____________________________________________________________
-
-def get_stack_depth_limit():
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.get_stack_depth_limit(lltype.Signed)
-    raise RuntimeError("no stack depth limit in non-translated versions")
-
-def set_stack_depth_limit(limit):
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.set_stack_depth_limit(lltype.Void, limit)
-    raise RuntimeError("no stack depth limit in non-translated versions")
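
A minimal plain-Python sketch of what this change means for callers: a too-deep
stack now surfaces as an ordinary exception raised from stack_check_slowpath()
instead of a stack unwind.  The snippet below uses CPython's own recursion
error as a stand-in for _StackOverflow; the function names are made up for
illustration only.

    def recurse_forever(n=0):
        return recurse_forever(n + 1)

    def run_guarded():
        try:
            recurse_forever()
        except RuntimeError:   # RecursionError subclasses RuntimeError on py3
            return "stack overflow handled"

    assert run_guarded() == "stack overflow handled"
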
diff --git a/pypy/rlib/rstacklet.py b/pypy/rlib/rstacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/rstacklet.py
@@ -0,0 +1,58 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.lltypesystem import lltype, llmemory
+
+
+class StackletThread(object):
+
+    def __init__(self, config):
+        self._gcrootfinder = _getgcrootfinder(config)
+        self._thrd = _c.newthread()
+        if not self._thrd:
+            raise MemoryError
+        self._thrd_deleter = StackletThreadDeleter(self._thrd)
+
+    def new(self, callback, arg=llmemory.NULL):
+        return self._gcrootfinder.new(self, callback, arg)
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(self, stacklet):
+        return self._gcrootfinder.switch(self, stacklet)
+
+    def destroy(self, stacklet):
+        self._gcrootfinder.destroy(self, stacklet)
+
+    def is_empty_handle(self, stacklet):
+        # note that "being an empty handle" and being equal to
+        # "get_null_handle()" may be the same, or not; don't rely on it
+        return self._gcrootfinder.is_empty_handle(stacklet)
+
+    def get_null_handle(self):
+        return self._gcrootfinder.get_null_handle()
+
+
+class StackletThreadDeleter(object):
+    # quick hack: the __del__ lives on a separate object, so that if the
+    # main StackletThread ends up in random circular references,
+    # deletethread() is only called (on pypy) once all that circular
+    # reference mess has been collected.
+    def __init__(self, thrd):
+        self._thrd = thrd
+    def __del__(self):
+        thrd = self._thrd
+        if thrd:
+            self._thrd = lltype.nullptr(_c.thread_handle.TO)
+            _c.deletethread(thrd)
+
+# ____________________________________________________________
+
+def _getgcrootfinder(config):
+    if (config is None or
+        config.translation.gc in ('ref', 'boehm', 'none')):   # for tests
+        gcrootfinder = 'n/a'
+    else:
+        gcrootfinder = config.translation.gcrootfinder
+    gcrootfinder = gcrootfinder.replace('/', '_')
+    module = __import__('pypy.rlib._stacklet_%s' % gcrootfinder,
+                        None, None, ['__doc__'])
+    return module.gcrootfinder
+_getgcrootfinder._annspecialcase_ = 'specialize:memo'
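
A hedged sketch of the intended call sequence for StackletThread, based only on
the methods defined above and on the tests added further down.  It is wrapped
in a function and never executed at import time, because a real run needs a
translation config and the C stacklet support; the callback body is
illustrative only.

    def _demo_stacklet_usage(config):
        from pypy.rpython.lltypesystem import llmemory
        from pypy.rlib import rstacklet

        sthread = rstacklet.StackletThread(config)

        def callback(h, arg):
            # runs on the freshly created stacklet; 'h' refers back to the
            # stack that called new()
            h = sthread.switch(h)      # go back once, get resumed later
            return h                   # returning ends this stacklet

        h = sthread.new(callback, llmemory.NULL)
        if not sthread.is_empty_handle(h):
            h = sthread.switch(h)      # resume the callback; once it returns,
                                       # the handle we get back is empty
        assert sthread.is_empty_handle(h)
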
diff --git a/pypy/rlib/streamio.py b/pypy/rlib/streamio.py
--- a/pypy/rlib/streamio.py
+++ b/pypy/rlib/streamio.py
@@ -496,29 +496,24 @@
         if bufsize == -1:     # Get default from the class
             bufsize = self.bufsize
         self.bufsize = bufsize  # buffer size (hint only)
-        self.lines = []         # ready-made lines (sans "\n")
-        self.buf = ""           # raw data (may contain "\n")
-        # Invariant: readahead == "\n".join(self.lines + [self.buf])
-        # self.lines contains no "\n"
-        # self.buf may contain "\n"
+        self.buf = ""           # raw data
+        self.pos = 0
 
     def flush_buffers(self):
-        if self.lines or self.buf:
+        if self.buf:
             try:
                 self.do_seek(self.tell(), 0)
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
                 self.buf = ""
+                self.pos = 0
 
     def tell(self):
-        bytes = self.do_tell()  # This may fail
-        offset = len(self.buf)
-        for line in self.lines:
-            offset += len(line) + 1
-        assert bytes >= offset #, (locals(), self.__dict__)
-        return bytes - offset
+        tellpos = self.do_tell()  # This may fail
+        offset = len(self.buf) - self.pos
+        assert tellpos >= offset #, (locals(), self.__dict__)
+        return tellpos - offset
 
     def seek(self, offset, whence):
         # This may fail on the do_seek() or do_tell() call.
@@ -526,32 +521,25 @@
         # Nor on a seek to the very end.
         if whence == 0:
             self.do_seek(offset, 0)
-            self.lines = []
             self.buf = ""
+            self.pos = 0
             return
         if whence == 1:
+            currentsize = len(self.buf) - self.pos
             if offset < 0:
-                self.do_seek(self.tell() + offset, 0)
-                self.lines = []
-                self.buf = ""
+                if self.pos + offset >= 0:
+                    self.pos += offset
+                else:
+                    self.do_seek(self.tell() + offset, 0)
+                    self.pos = 0
+                    self.buf = ""
                 return
-            while self.lines:
-                line = self.lines[-1]
-                if offset <= len(line):
-                    intoffset = intmask(offset)
-                    assert intoffset >= 0
-                    self.lines[-1] = line[intoffset:]
-                    return
-                offset -= len(self.lines[-1]) - 1
-                self.lines.pop()
-            assert not self.lines
-            if offset <= len(self.buf):
-                intoffset = intmask(offset)
-                assert intoffset >= 0
-                self.buf = self.buf[intoffset:]
+            elif offset <= currentsize:
+                self.pos += offset
                 return
-            offset -= len(self.buf)
             self.buf = ""
+            self.pos = 0
+            offset -= currentsize
             try:
                 self.do_seek(offset, 1)
             except MyNotImplementedError:
@@ -564,18 +552,18 @@
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
+                self.pos = 0
                 self.buf = ""
                 return
             # Skip relative to EOF by reading and saving only just as
             # much as needed
             intoffset = offset2int(offset)
-            self.lines.reverse()
-            data = "\n".join(self.lines + [self.buf])
-            total = len(data)
-            buffers = [data]
-            self.lines = []
+            pos = self.pos
+            assert pos >= 0
+            buffers = [self.buf[pos:]]
+            total = len(buffers[0])
             self.buf = ""
+            self.pos = 0
             while 1:
                 data = self.do_read(self.bufsize)
                 if not data:
@@ -589,157 +577,101 @@
             if cutoff < 0:
                 raise StreamError("cannot seek back")
             if buffers:
+                assert cutoff >= 0
                 buffers[0] = buffers[0][cutoff:]
             self.buf = "".join(buffers)
-            self.lines = []
             return
+
         raise StreamError("whence should be 0, 1 or 2")
 
     def readall(self):
-        self.lines.reverse()
-        self.lines.append(self.buf)
-        more = ["\n".join(self.lines)]
-        self.lines = []
+        pos = self.pos
+        assert pos >= 0
+        chunks = [self.buf[pos:]]
         self.buf = ""
+        self.pos = 0
         bufsize = self.bufsize
         while 1:
             data = self.do_read(bufsize)
             if not data:
                 break
-            more.append(data)
+            chunks.append(data)
             bufsize = min(bufsize*2, self.bigsize)
-        return "".join(more)
+        return "".join(chunks)
 
-    def read(self, n):
+    def read(self, n=-1):
         assert isinstance(n, int)
-        assert n >= 0
-        if self.lines:
-            # See if this can be satisfied from self.lines[0]
-            line = self.lines[-1]
-            if len(line) >= n:
-                self.lines[-1] = line[n:]
-                return line[:n]
-
-            # See if this can be satisfied *without exhausting* self.lines
-            k = 0
-            i = 0
-            lgt = len(self.lines)
-            for linenum in range(lgt-1,-1,-1):
-                line = self.lines[linenum]
-                k += len(line)
-                if k >= n:
-                    lines = self.lines[linenum + 1:]
-                    data = self.lines[linenum]
-                    cutoff = len(data) - (k-n)
-                    assert cutoff >= 0
-                    lines.reverse()
-                    lines.append(data[:cutoff])
-                    del self.lines[linenum:]
-                    self.lines.append(data[cutoff:])
-                    return "\n".join(lines)
-                k += 1
-
-            # See if this can be satisfied from self.lines plus self.buf
-            if k + len(self.buf) >= n:
-                lines = self.lines
-                lines.reverse()
-                self.lines = []
-                cutoff = n - k
-                assert cutoff >= 0
-                lines.append(self.buf[:cutoff])
-                self.buf = self.buf[cutoff:]
-                return "\n".join(lines)
-
+        if n < 0:
+            return self.readall()
+        currentsize = len(self.buf) - self.pos
+        start = self.pos
+        assert start >= 0
+        if n <= currentsize:
+            stop = start + n
+            assert stop >= 0
+            result = self.buf[start:stop]
+            self.pos += n
+            return result
         else:
-            # See if this can be satisfied from self.buf
-            data = self.buf
-            k = len(data)
-            if k >= n:
-                cutoff = len(data) - (k-n)
-                assert cutoff >= 0
-                assert len(data) >= cutoff
-                self.buf = data[cutoff:]
-                return data[:cutoff]
-
-        lines = self.lines
-        lines.reverse()
-        self.lines = []
-        lines.append(self.buf)
-        self.buf = ""
-        data = "\n".join(lines)
-        more = [data]
-        k = len(data)
-        while k < n:
-            data = self.do_read(max(self.bufsize, n-k))
-            k += len(data)
-            more.append(data)
-            if not data:
-                break
-        cutoff = len(data) - (k-n)
-        assert cutoff >= 0
-        if len(data) <= cutoff:
-            self.buf = ""
-        else:
-            self.buf = data[cutoff:]
-            more[-1] = data[:cutoff]
-        return "".join(more)
-
-    # read_next_bunch is generally this, version below is slightly faster
-    #def _read_next_bunch(self):
-    #    self.lines = self.buf.split("\n")
-    #    self.buf = self.lines.pop()
-    #    self.lines.reverse()
-
-    def _read_next_bunch(self):
-        numlines = self.buf.count("\n")
-        self.lines = [None] * numlines
-        last = -1
-        num = numlines - 1
-        while True:
-            start = last + 1
-            assert start >= 0
-            next = self.buf.find("\n", start)
-            if next == -1:
-                if last != -1:
-                    self.buf = self.buf[start:]
-                break
-            assert next >= 0
-            self.lines[num] = self.buf[start:next]
-            last = next
-            num -= 1
+            chunks = [self.buf[start:]]
+            while 1:
+                self.buf = self.do_read(self.bufsize)
+                if not self.buf:
+                    self.pos = 0
+                    break
+                currentsize += len(self.buf)
+                if currentsize >= n:
+                    self.pos = len(self.buf) - (currentsize - n)
+                    stop = self.pos
+                    assert stop >= 0
+                    chunks.append(self.buf[:stop])
+                    break
+                chunks.append(self.buf)
+            return ''.join(chunks)
 
     def readline(self):
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        # This block is needed because read() can leave self.buf
-        # containing newlines
-        self._read_next_bunch()
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        if self.buf:
-            buf = [self.buf]
-        else:
-            buf = []
+        pos = self.pos
+        assert pos >= 0
+        i = self.buf.find("\n", pos)
+        start = self.pos
+        assert start >= 0
+        if i >= 0: # new line found
+            i += 1
+            result = self.buf[start:i]
+            self.pos = i
+            return result
+        temp = self.buf[start:]
+        # read one more buffer; most of the time a newline will be found in it
+        self.buf = self.do_read(self.bufsize)
+        i = self.buf.find("\n")
+        if i >= 0: # new line found
+            i += 1
+            result = temp + self.buf[:i]
+            self.pos = i
+            return result
+        if not self.buf:
+            self.pos = 0
+            return temp
+        # need to keep getting data until we find a new line
+        chunks = [temp, self.buf]
         while 1:
             self.buf = self.do_read(self.bufsize)
-            self._read_next_bunch()
-            if self.lines:
-                buf.append(self.lines.pop())
-                buf.append("\n")
+            if not self.buf:
+                self.pos = 0
                 break
-            if not self.buf:
+            i = self.buf.find("\n")
+            if i >= 0:
+                i += 1
+                chunks.append(self.buf[:i])
+                self.pos = i
                 break
-            buf.append(self.buf)
-
-        return "".join(buf)
+            chunks.append(self.buf)
+        return "".join(chunks)
 
     def peek(self):
-        if self.lines:
-            return self.lines[-1] + "\n"
-        else:
-            return self.buf
+        pos = self.pos
+        assert pos >= 0
+        return self.buf[pos:]
 
     write      = PassThrough("write",     flush_buffers=True)
     truncate   = PassThrough("truncate",  flush_buffers=True)
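
The rewrite above replaces the old self.lines list with a single self.buf plus
a self.pos cursor.  Below is a small standalone plain-Python model of the new
readline() logic, using an in-memory string as the underlying stream; the class
and data source are made up for illustration, only the buf/pos bookkeeping
mirrors the patch.

    class BufposReader(object):
        def __init__(self, data, bufsize=4):
            self.data = data      # stands in for the real underlying stream
            self.offset = 0
            self.bufsize = bufsize
            self.buf = ""         # raw data already read from the stream
            self.pos = 0          # how much of self.buf has been consumed

        def do_read(self, n):
            chunk = self.data[self.offset:self.offset + n]
            self.offset += len(chunk)
            return chunk

        def readline(self):
            i = self.buf.find("\n", self.pos)
            if i >= 0:                       # newline already buffered
                i += 1
                result = self.buf[self.pos:i]
                self.pos = i
                return result
            chunks = [self.buf[self.pos:]]   # keep the not-yet-consumed tail
            while 1:
                self.buf = self.do_read(self.bufsize)
                if not self.buf:             # EOF: return whatever we have
                    self.pos = 0
                    break
                i = self.buf.find("\n")
                if i >= 0:
                    i += 1
                    chunks.append(self.buf[:i])
                    self.pos = i
                    break
                chunks.append(self.buf)
            return "".join(chunks)

    r = BufposReader("ab\ncdefg\nhi")
    assert r.readline() == "ab\n"
    assert r.readline() == "cdefg\n"
    assert r.readline() == "hi"
    assert r.readline() == ""
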
diff --git a/pypy/rlib/test/test_rstacklet.py b/pypy/rlib/test/test_rstacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/test/test_rstacklet.py
@@ -0,0 +1,272 @@
+import gc
+import py
+from pypy.rpython.tool.rffi_platform import CompilationError
+try:
+    from pypy.rlib import rstacklet
+except CompilationError, e:
+    py.test.skip("cannot import rstacklet: %s" % e)
+
+from pypy.rlib import rrandom
+from pypy.rlib.rarithmetic import intmask
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.translator.c.test.test_standalone import StandaloneTests
+
+
+
+class Runner:
+    STATUSMAX = 5000
+    config = None
+
+    def init(self, seed):
+        self.sthread = rstacklet.StackletThread(self.config)
+        self.random = rrandom.Random(seed)
+
+    def done(self):
+        self.sthread = None
+        gc.collect(); gc.collect(); gc.collect()
+
+    TESTS = []
+    def here_is_a_test(fn, TESTS=TESTS):
+        TESTS.append((fn.__name__, fn))
+        return fn
+
+    @here_is_a_test
+    def test_new(self):
+        print 'start'
+        h = self.sthread.new(empty_callback, rffi.cast(llmemory.Address, 123))
+        print 'end', h
+        assert self.sthread.is_empty_handle(h)
+
+    def nextstatus(self, nextvalue):
+        print 'expected nextvalue to be %d, got %d' % (nextvalue,
+                                                       self.status + 1)
+        assert self.status + 1 == nextvalue
+        self.status = nextvalue
+
+    @here_is_a_test
+    def test_simple_switch(self):
+        self.status = 0
+        h = self.sthread.new(switchbackonce_callback,
+                             rffi.cast(llmemory.Address, 321))
+        assert not self.sthread.is_empty_handle(h)
+        self.nextstatus(2)
+        h = self.sthread.switch(h)
+        self.nextstatus(4)
+        print 'end', h
+        assert self.sthread.is_empty_handle(h)
+
+    @here_is_a_test
+    def test_various_depths(self):
+        self.tasks = [Task(i) for i in range(10)]
+        self.nextstep = -1
+        self.comefrom = -1
+        self.status = 0
+        while self.status < self.STATUSMAX or self.any_alive():
+            self.tasks[0].withdepth(self.random.genrand32() % 50)
+            assert len(self.tasks[0].lst) == 0
+
+    def any_alive(self):
+        for task in self.tasks:
+            if task.h:
+                return True
+        return False
+
+
+class FooObj:
+    def __init__(self, n, d, next=None):
+        self.n = n
+        self.d = d
+        self.next = next
+
+
+class Task:
+    def __init__(self, n):
+        self.n = n
+        self.h = runner.sthread.get_null_handle()
+        self.lst = []
+
+    def withdepth(self, d):
+        if d > 0:
+            foo = FooObj(self.n, d)
+            foo2 = FooObj(self.n + 100, d, foo)
+            self.lst.append(foo)
+            res = self.withdepth(d-1)
+            foo = self.lst.pop()
+            assert foo2.n == self.n + 100
+            assert foo2.d == d
+            assert foo2.next is foo
+            assert foo.n == self.n
+            assert foo.d == d
+            assert foo.next is None
+        else:
+            res = 0
+            n = intmask(runner.random.genrand32() % 10)
+            if n == self.n or (runner.status >= runner.STATUSMAX and
+                               not runner.tasks[n].h):
+                return 1
+
+            print "status == %d, self.n = %d" % (runner.status, self.n)
+            assert not self.h
+            assert runner.nextstep == -1
+            runner.status += 1
+            runner.nextstep = runner.status
+            runner.comefrom = self.n
+            runner.gointo = n
+            task = runner.tasks[n]
+            if not task.h:
+                # start a new stacklet
+                print "NEW", n
+                h = runner.sthread.new(variousstackdepths_callback,
+                                       rffi.cast(llmemory.Address, n))
+            else:
+                # switch to this stacklet
+                print "switch to", n
+                h = task.h
+                task.h = runner.sthread.get_null_handle()
+                h = runner.sthread.switch(h)
+
+            print "back in self.n = %d, coming from %d" % (self.n,
+                                                           runner.comefrom)
+            assert runner.nextstep == runner.status
+            runner.nextstep = -1
+            assert runner.gointo == self.n
+            assert runner.comefrom != self.n
+            assert not self.h
+            if runner.comefrom != -42:
+                assert 0 <= runner.comefrom < 10
+                task = runner.tasks[runner.comefrom]
+                assert not task.h
+                task.h = h
+            else:
+                assert runner.sthread.is_empty_handle(h)
+            runner.comefrom = -1
+            runner.gointo = -1
+        assert (res & (res-1)) == 0   # to prevent a tail-call to withdepth()
+        return res
+
+
+runner = Runner()
+
+
+def empty_callback(h, arg):
+    print 'in empty_callback:', h, arg
+    assert rffi.cast(lltype.Signed, arg) == 123
+    return h
+
+def switchbackonce_callback(h, arg):
+    print 'in switchbackonce_callback:', h, arg
+    assert rffi.cast(lltype.Signed, arg) == 321
+    runner.nextstatus(1)
+    assert not runner.sthread.is_empty_handle(h)
+    h = runner.sthread.switch(h)
+    runner.nextstatus(3)
+    assert not runner.sthread.is_empty_handle(h)
+    return h
+
+def variousstackdepths_callback(h, arg):
+    assert runner.nextstep == runner.status
+    runner.nextstep = -1
+    arg = rffi.cast(lltype.Signed, arg)
+    assert arg == runner.gointo
+    self = runner.tasks[arg]
+    assert self.n == runner.gointo
+    assert not self.h
+    assert 0 <= runner.comefrom < 10
+    task = runner.tasks[runner.comefrom]
+    assert not task.h
+    assert bool(h) and not runner.sthread.is_empty_handle(h)
+    task.h = h
+    runner.comefrom = -1
+    runner.gointo = -1
+
+    while self.withdepth(runner.random.genrand32() % 20) == 0:
+        assert len(self.lst) == 0
+
+    assert len(self.lst) == 0
+    assert not self.h
+    while 1:
+        n = intmask(runner.random.genrand32() % 10)
+        h = runner.tasks[n].h
+        if h:
+            break
+
+    assert not runner.sthread.is_empty_handle(h)
+    runner.tasks[n].h = runner.sthread.get_null_handle()
+    runner.comefrom = -42
+    runner.gointo = n
+    assert runner.nextstep == -1
+    runner.status += 1
+    runner.nextstep = runner.status
+    print "LEAVING %d to go to %d" % (self.n, n)
+    return h
+
+
+def entry_point(argv):
+    seed = 0
+    if len(argv) > 1:
+        seed = int(argv[1])
+    runner.init(seed)
+    for name, meth in Runner.TESTS:
+        print '-----', name, '-----'
+        meth(runner)
+    print '----- all done -----'
+    runner.done()
+    return 0
+
+
+class BaseTestStacklet(StandaloneTests):
+
+    def setup_class(cls):
+        from pypy.config.pypyoption import get_pypy_config
+        config = get_pypy_config(translating=True)
+        config.translation.gc = cls.gc
+        if cls.gcrootfinder is not None:
+            config.translation.continuation = True
+            config.translation.gcrootfinder = cls.gcrootfinder
+            GCROOTFINDER = cls.gcrootfinder
+        cls.config = config
+        cls.old_values = Runner.config, Runner.STATUSMAX
+        Runner.config = config
+        Runner.STATUSMAX = 25000
+
+    def teardown_class(cls):
+        Runner.config, Runner.STATUSMAX = cls.old_values
+
+    def test_demo1(self):
+        t, cbuilder = self.compile(entry_point)
+
+        for i in range(15):
+            if (i & 1) == 0:
+                env = {}
+            else:
+                env = {'PYPY_GC_NURSERY': '2k'}
+            print 'running %s/%s with arg=%d and env=%r' % (
+                self.gc, self.gcrootfinder, i, env)
+            data = cbuilder.cmdexec('%d' % i, env=env)
+            assert data.endswith("----- all done -----\n")
+            for name, meth in Runner.TESTS:
+                assert ('----- %s -----\n' % name) in data
+
+
+class DONTTestStackletBoehm(BaseTestStacklet):
+    # Boehm does not work well with stacklets, probably because the
+    # moved-away copies of the stack are parsed using a different
+    # selection logic than the real stack
+    gc = 'boehm'
+    gcrootfinder = None
+
+class TestStackletAsmGcc(BaseTestStacklet):
+    gc = 'minimark'
+    gcrootfinder = 'asmgcc'
+
+class TestStackletShadowStack(BaseTestStacklet):
+    gc = 'minimark'
+    gcrootfinder = 'shadowstack'
+
+
+def target(*args):
+    return entry_point, None
+
+if __name__ == '__main__':
+    import sys
+    sys.exit(entry_point(sys.argv))
diff --git a/pypy/rpython/extfuncregistry.py b/pypy/rpython/extfuncregistry.py
--- a/pypy/rpython/extfuncregistry.py
+++ b/pypy/rpython/extfuncregistry.py
@@ -44,32 +44,28 @@
        ('log10', [float], float),
        ('sin', [float], float),
        ('cos', [float], float),
+       ('atan2', [float, float], float),
+       ('hypot', [float, float], float),
+       ('frexp', [float], (float, int)),
+       ('ldexp', [float, int], float),
+       ('modf', [float], (float, float)),
+       ('fmod', [float, float], float),
+       ('pow', [float, float], float),
     ]),
 ]
 for module, methods in _register:
     for name, arg_types, return_type in methods:
         method_name = 'll_math_%s' % name
+        oofake = None
+        # Things with a tuple return type have a fake implementation
+        # (oofakeimpl) for RPython; check whether this method has one.
+        if hasattr(oo_math, method_name):
+            oofake = getattr(oo_math, method_name)
         register_external(getattr(module, name), arg_types, return_type,
                           export_name='ll_math.%s' % method_name,
                           sandboxsafe=True,
-                          llimpl=getattr(ll_math, method_name))
-
-
-complex_math_functions = [
-    ('frexp', [float],        (float, int)),
-    ('ldexp', [float, int],   float),
-    ('modf',  [float],        (float, float)),
-    ] + [(name, [float, float], float)
-         for name in 'atan2', 'fmod', 'hypot', 'pow']
-
-for name, args, res in complex_math_functions:
-    func = getattr(math, name)
-    llimpl = getattr(ll_math, 'll_math_%s' % name, None)
-    oofake = getattr(oo_math, 'll_math_%s' % name, None)
-    register_external(func, args, res, 'll_math.ll_math_%s' % name,
-                      llimpl=llimpl, oofakeimpl=oofake,
-                      sandboxsafe=True)
-
+                          llimpl=getattr(ll_math, method_name),
+                          oofakeimpl=oofake)
 
 # ___________________________
 # os.path functions
diff --git a/pypy/rpython/llinterp.py b/pypy/rpython/llinterp.py
--- a/pypy/rpython/llinterp.py
+++ b/pypy/rpython/llinterp.py
@@ -675,21 +675,6 @@
             #log.warn("op_indirect_call with graphs=None:", f)
         return self.op_direct_call(f, *args)
 
-    def op_adr_call(self, TGT, f, *inargs):
-        checkadr(f)
-        obj = self.llinterpreter.typer.type_system.deref(f.ref())
-        assert hasattr(obj, 'graph') # don't want to think about that
-        graph = obj.graph
-        args = []
-        for inarg, arg in zip(inargs, obj.graph.startblock.inputargs):
-            args.append(lltype._cast_whatever(arg.concretetype, inarg))
-        frame = self.newsubframe(graph, args)
-        result = frame.eval()
-        from pypy.translator.stackless.frame import storage_type
-        assert storage_type(lltype.typeOf(result)) == TGT
-        return lltype._cast_whatever(TGT, result)
-    op_adr_call.need_result_type = True
-
     def op_malloc(self, obj, flags):
         flavor = flags['flavor']
         zero = flags.get('zero', False)
@@ -840,10 +825,11 @@
 
     def op_gc_adr_of_nursery_top(self):
         raise NotImplementedError
-
     def op_gc_adr_of_nursery_free(self):
         raise NotImplementedError
 
+    def op_gc_adr_of_root_stack_base(self):
+        raise NotImplementedError
     def op_gc_adr_of_root_stack_top(self):
         raise NotImplementedError
 
@@ -894,6 +880,21 @@
     def op_gc_stack_bottom(self):
         pass       # marker for trackgcroot.py
 
+    def op_gc_shadowstackref_new(self):   # stacklet+shadowstack
+        raise NotImplementedError("gc_shadowstackref_new")
+    def op_gc_shadowstackref_context(self):
+        raise NotImplementedError("gc_shadowstackref_context")
+    def op_gc_shadowstackref_destroy(self):
+        raise NotImplementedError("gc_shadowstackref_destroy")
+    def op_gc_save_current_state_away(self):
+        raise NotImplementedError("gc_save_current_state_away")
+    def op_gc_forget_current_state(self):
+        raise NotImplementedError("gc_forget_current_state")
+    def op_gc_restore_state_from(self):
+        raise NotImplementedError("gc_restore_state_from")
+    def op_gc_start_fresh_new_state(self):
+        raise NotImplementedError("gc_start_fresh_new_state")
+
     def op_gc_get_type_info_group(self):
         raise NotImplementedError("gc_get_type_info_group")
 
@@ -930,27 +931,6 @@
     def op_get_write_barrier_from_array_failing_case(self):
         raise NotImplementedError("get_write_barrier_from_array_failing_case")
 
-    def op_yield_current_frame_to_caller(self):
-        raise NotImplementedError("yield_current_frame_to_caller")
-
-    def op_stack_frames_depth(self):
-        return len(self.llinterpreter.frame_stack)
-
-    def op_stack_switch(self, frametop):
-        raise NotImplementedError("stack_switch")
-
-    def op_stack_unwind(self):
-        raise NotImplementedError("stack_unwind")
-
-    def op_stack_capture(self):
-        raise NotImplementedError("stack_capture")
-
-    def op_get_stack_depth_limit(self):
-        raise NotImplementedError("get_stack_depth_limit")
-
-    def op_set_stack_depth_limit(self):
-        raise NotImplementedError("set_stack_depth_limit")
-
     def op_stack_current(self):
         return 0
 
@@ -1131,16 +1111,6 @@
         assert isinstance(x, (int, Symbolic))
         return bool(x)
 
-    # read frame var support
-
-    def op_get_frame_base(self):
-        self._obj0 = self        # hack
-        return llmemory.fakeaddress(self)
-
-    def op_frame_info(self, *vars):
-        pass
-    op_frame_info.specialform = True
-
     # hack for jit.codegen.llgraph
 
     def op_check_and_clear_exc(self):
diff --git a/pypy/rpython/lltypesystem/ll2ctypes.py b/pypy/rpython/lltypesystem/ll2ctypes.py
--- a/pypy/rpython/lltypesystem/ll2ctypes.py
+++ b/pypy/rpython/lltypesystem/ll2ctypes.py
@@ -1098,6 +1098,8 @@
     for i in range(len(FUNCTYPE.ARGS)):
         if FUNCTYPE.ARGS[i] is lltype.Void:
             void_arguments.append(i)
+    def callme(cargs):   # an extra indirection: workaround for rlib.rstacklet
+        return cfunc(*cargs)
     def invoke_via_ctypes(*argvalues):
         global _callback_exc_info
         cargs = []
@@ -1109,7 +1111,7 @@
                 cargs.append(cvalue)
         _callback_exc_info = None
         _restore_c_errno()
-        cres = cfunc(*cargs)
+        cres = callme(cargs)
         _save_c_errno()
         if _callback_exc_info:
             etype, evalue, etb = _callback_exc_info
diff --git a/pypy/rpython/lltypesystem/lloperation.py b/pypy/rpython/lltypesystem/lloperation.py
--- a/pypy/rpython/lltypesystem/lloperation.py
+++ b/pypy/rpython/lltypesystem/lloperation.py
@@ -9,7 +9,7 @@
 class LLOp(object):
 
     def __init__(self, sideeffects=True, canfold=False, canraise=(),
-                 pyobj=False, canunwindgc=False, canrun=False, oo=False,
+                 pyobj=False, canmallocgc=False, canrun=False, oo=False,
                  tryfold=False):
         # self.opname = ... (set afterwards)
 
@@ -36,12 +36,12 @@
         # The operation manipulates PyObjects
         self.pyobj = pyobj
 
-        # The operation can unwind the stack in a stackless gc build
-        self.canunwindgc = canunwindgc
-        if canunwindgc:
-            if (StackException not in self.canraise and
+        # The operation can do a GC malloc
+        self.canmallocgc = canmallocgc
+        if canmallocgc:
+            if (MemoryError not in self.canraise and
                 Exception not in self.canraise):
-                self.canraise += (StackException,)
+                self.canraise += (MemoryError,)
 
         # The operation can be run directly with __call__
         self.canrun = canrun or canfold
@@ -175,10 +175,6 @@
         return hop.genop(op.opname, args_v, resulttype=hop.r_result.lowleveltype)
 
 
-class StackException(Exception):
-    """Base for internal exceptions possibly used by the stackless
-    implementation."""
-
 # ____________________________________________________________
 #
 # This list corresponds to the operations implemented by the LLInterpreter.
@@ -356,10 +352,10 @@
 
     # __________ pointer operations __________
 
-    'malloc':               LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_varsize':       LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_nonmovable':    LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'malloc_nonmovable_varsize':LLOp(canraise=(MemoryError,),canunwindgc=True),
+    'malloc':               LLOp(canmallocgc=True),
+    'malloc_varsize':       LLOp(canmallocgc=True),
+    'malloc_nonmovable':    LLOp(canmallocgc=True),
+    'malloc_nonmovable_varsize':LLOp(canmallocgc=True),
     'shrink_array':         LLOp(canrun=True),
     'zero_gc_pointers_inside': LLOp(),
     'free':                 LLOp(),
@@ -414,7 +410,6 @@
     'adr_ne':               LLOp(canfold=True),
     'adr_gt':               LLOp(canfold=True),
     'adr_ge':               LLOp(canfold=True),
-    'adr_call':             LLOp(canraise=(Exception,)),
     'cast_ptr_to_adr':      LLOp(sideeffects=False),
     'cast_adr_to_ptr':      LLOp(canfold=True),
     'cast_adr_to_int':      LLOp(sideeffects=False),
@@ -436,8 +431,8 @@
     'jit_force_quasi_immutable': LLOp(canrun=True),
     'get_exception_addr':   LLOp(),
     'get_exc_value_addr':   LLOp(),
-    'do_malloc_fixedsize_clear':LLOp(canraise=(MemoryError,),canunwindgc=True),
-    'do_malloc_varsize_clear':  LLOp(canraise=(MemoryError,),canunwindgc=True),
+    'do_malloc_fixedsize_clear':LLOp(canmallocgc=True),
+    'do_malloc_varsize_clear':  LLOp(canmallocgc=True),
     'get_write_barrier_failing_case': LLOp(sideeffects=False),
     'get_write_barrier_from_array_failing_case': LLOp(sideeffects=False),
     'gc_get_type_info_group': LLOp(sideeffects=False),
@@ -445,7 +440,7 @@
 
     # __________ GC operations __________
 
-    'gc__collect':          LLOp(canunwindgc=True),
+    'gc__collect':          LLOp(canmallocgc=True),
     'gc_free':              LLOp(),
     'gc_fetch_exception':   LLOp(),
     'gc_restore_exception': LLOp(),
@@ -455,17 +450,12 @@
     'gc_pop_alive_pyobj':   LLOp(),
     'gc_reload_possibly_moved': LLOp(),
     # see rlib/objectmodel for gc_identityhash and gc_id
-    'gc_identityhash':      LLOp(canraise=(MemoryError,), sideeffects=False,
-                                 canunwindgc=True),
-    'gc_id':                LLOp(canraise=(MemoryError,), sideeffects=False),
-                                 # ^^^ but canunwindgc=False, as it is
-                                 # allocating non-GC structures only
+    'gc_identityhash':      LLOp(sideeffects=False, canmallocgc=True),
+    'gc_id':                LLOp(sideeffects=False, canmallocgc=True),
     'gc_obtain_free_space': LLOp(),
     'gc_set_max_heap_size': LLOp(),
     'gc_can_move'         : LLOp(sideeffects=False),
-    'gc_thread_prepare'   : LLOp(canraise=(MemoryError,)),
-                                 # ^^^ but canunwindgc=False, as it is
-                                 # allocating non-GC structures only
+    'gc_thread_prepare'   : LLOp(canmallocgc=True),
     'gc_thread_run'       : LLOp(),
     'gc_thread_start'     : LLOp(),
     'gc_thread_die'       : LLOp(),
@@ -473,7 +463,7 @@
     'gc_thread_after_fork': LLOp(),   # arguments: (result_of_fork, opaqueaddr)
     'gc_assume_young_pointers': LLOp(canrun=True),
     'gc_writebarrier_before_copy': LLOp(canrun=True),
-    'gc_heap_stats'       : LLOp(canunwindgc=True),
+    'gc_heap_stats'       : LLOp(canmallocgc=True),
 
     'gc_get_rpy_roots'    : LLOp(),
     'gc_get_rpy_referents': LLOp(),
@@ -489,50 +479,37 @@
     # ^^^ returns an address of nursery free pointer, for later modifications
     'gc_adr_of_nursery_top' : LLOp(),
     # ^^^ returns an address of pointer, since it can change at runtime
+    'gc_adr_of_root_stack_base': LLOp(),
     'gc_adr_of_root_stack_top': LLOp(),
-    # ^^^ returns the address of gcdata.root_stack_top (for shadowstack only)
-
-    # experimental operations in support of thread cloning, only
-    # implemented by the Mark&Sweep GC
-    'gc_x_swap_pool':       LLOp(canraise=(MemoryError,), canunwindgc=True),
-    'gc_x_clone':           LLOp(canraise=(MemoryError, RuntimeError),
-                                 canunwindgc=True),
-    'gc_x_size_header':     LLOp(),
+    # returns the address of gcdata.root_stack_base/top (for shadowstack only)
 
     # for asmgcroot support to get the address of various static structures
     # see translator/c/src/mem.h for the valid indices
     'gc_asmgcroot_static':  LLOp(sideeffects=False),
     'gc_stack_bottom':      LLOp(canrun=True),
 
-    # NOTE NOTE NOTE! don't forget *** canunwindgc=True *** for anything that
-    # can go through a stack unwind, in particular anything that mallocs!
+    # for stacklet+shadowstack support
+    'gc_shadowstackref_new':      LLOp(canmallocgc=True),
+    'gc_shadowstackref_context':  LLOp(),
+    'gc_shadowstackref_destroy':  LLOp(),
+    'gc_save_current_state_away': LLOp(),
+    'gc_forget_current_state':    LLOp(),
+    'gc_restore_state_from':      LLOp(),
+    'gc_start_fresh_new_state':   LLOp(),
+
+    # NOTE NOTE NOTE! don't forget *** canmallocgc=True *** for anything that
+    # can malloc a GC object.
 
     # __________ weakrefs __________
 
-    'weakref_create':       LLOp(canraise=(MemoryError,), sideeffects=False,
-                                 canunwindgc=True),
+    'weakref_create':       LLOp(sideeffects=False, canmallocgc=True),
     'weakref_deref':        LLOp(sideeffects=False),
     'cast_ptr_to_weakrefptr': LLOp(sideeffects=False), # no-op type hiding
     'cast_weakrefptr_to_ptr': LLOp(sideeffects=False), # no-op type revealing
 
-    # __________ stackless operation(s) __________
-
-    'yield_current_frame_to_caller': LLOp(canraise=(StackException,
-                                                    RuntimeError)),
-    #                               can always unwind, not just if stackless gc
-
-    'stack_frames_depth':   LLOp(sideeffects=False, canraise=(StackException,
-                                                              RuntimeError)),
-    'stack_switch':         LLOp(canraise=(StackException, RuntimeError)),
-    'stack_unwind':         LLOp(canraise=(StackException, RuntimeError)),
-    'stack_capture':        LLOp(canraise=(StackException, RuntimeError)),
-    'get_stack_depth_limit':LLOp(sideeffects=False),
-    'set_stack_depth_limit':LLOp(),
+    # __________ misc operations __________
 
     'stack_current':        LLOp(sideeffects=False),
-
-    # __________ misc operations __________
-
     'keepalive':            LLOp(),
     'same_as':              LLOp(canfold=True),
     'hint':                 LLOp(),
@@ -591,10 +568,6 @@
     'ooparse_int':          LLOp(oo=True, canraise=(ValueError,)),
     'ooparse_float':        LLOp(oo=True, canraise=(ValueError,)),
     'oounicode':            LLOp(oo=True, canraise=(UnicodeDecodeError,)),
-
-    # _____ read frame var support ___
-    'get_frame_base':       LLOp(sideeffects=False),
-    'frame_info':           LLOp(sideeffects=False),
 }
 # ***** Run test_lloperation after changes. *****
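
The canunwindgc flag is gone; canmallocgc now marks operations that may
allocate GC objects, and such operations automatically get MemoryError added
to their canraise tuple.  A stripped-down plain-Python stand-in of that
bookkeeping (only the two fields involved, not the real LLOp) is:

    class MiniLLOp(object):
        def __init__(self, canraise=(), canmallocgc=False):
            self.canraise = canraise
            self.canmallocgc = canmallocgc
            if canmallocgc:
                if (MemoryError not in self.canraise and
                    Exception not in self.canraise):
                    self.canraise += (MemoryError,)

    assert MiniLLOp(canmallocgc=True).canraise == (MemoryError,)
    assert MiniLLOp(canraise=(Exception,), canmallocgc=True).canraise == (Exception,)
    assert MiniLLOp().canraise == ()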
 
diff --git a/pypy/rpython/lltypesystem/lltype.py b/pypy/rpython/lltypesystem/lltype.py
--- a/pypy/rpython/lltypesystem/lltype.py
+++ b/pypy/rpython/lltypesystem/lltype.py
@@ -362,7 +362,8 @@
                                                 about=self)._obj
         Struct._install_extras(self, **kwds)
 
-    def _attach_runtime_type_info_funcptr(self, funcptr, destrptr):
+    def _attach_runtime_type_info_funcptr(self, funcptr, destrptr,
+                                          customtraceptr):
         if self._runtime_type_info is None:
             raise TypeError("attachRuntimeTypeInfo: %r must have been built "
                             "with the rtti=True argument" % (self,))
@@ -376,7 +377,7 @@
                 raise TypeError("expected a runtime type info function "
                                 "implementation, got: %s" % funcptr)
             self._runtime_type_info.query_funcptr = funcptr
-        if destrptr is not None :
+        if destrptr is not None:
             T = typeOf(destrptr)
             if (not isinstance(T, Ptr) or
                 not isinstance(T.TO, FuncType) or
@@ -386,6 +387,18 @@
                 raise TypeError("expected a destructor function "
                                 "implementation, got: %s" % destrptr)
             self._runtime_type_info.destructor_funcptr = destrptr
+        if customtraceptr is not None:
+            from pypy.rpython.lltypesystem import llmemory
+            T = typeOf(customtraceptr)
+            if (not isinstance(T, Ptr) or
+                not isinstance(T.TO, FuncType) or
+                len(T.TO.ARGS) != 2 or
+                T.TO.RESULT != llmemory.Address or
+                T.TO.ARGS[0] != llmemory.Address or
+                T.TO.ARGS[1] != llmemory.Address):
+                raise TypeError("expected a custom trace function "
+                                "implementation, got: %s" % customtraceptr)
+            self._runtime_type_info.custom_trace_funcptr = customtraceptr
 
 class GcStruct(RttiStruct):
     _gckind = 'gc'
@@ -2042,10 +2055,12 @@
         raise ValueError("only odd integers can be cast back to ptr")
     return _ptr(PTRTYPE, oddint, solid=True)
 
-def attachRuntimeTypeInfo(GCSTRUCT, funcptr=None, destrptr=None):
+def attachRuntimeTypeInfo(GCSTRUCT, funcptr=None, destrptr=None,
+                          customtraceptr=None):
     if not isinstance(GCSTRUCT, RttiStruct):
         raise TypeError, "expected a RttiStruct: %s" % GCSTRUCT
-    GCSTRUCT._attach_runtime_type_info_funcptr(funcptr, destrptr)
+    GCSTRUCT._attach_runtime_type_info_funcptr(funcptr, destrptr,
+                                               customtraceptr)
     return _ptr(Ptr(RuntimeTypeInfo), GCSTRUCT._runtime_type_info)
 
 def getRuntimeTypeInfo(GCSTRUCT):
diff --git a/pypy/rpython/lltypesystem/module/ll_math.py b/pypy/rpython/lltypesystem/module/ll_math.py
--- a/pypy/rpython/lltypesystem/module/ll_math.py
+++ b/pypy/rpython/lltypesystem/module/ll_math.py
@@ -223,22 +223,13 @@
 
 
 def ll_math_fmod(x, y):
-    if isinf(y):
-        if isinf(x):
-            raise ValueError("math domain error")
-        return x  # fmod(x, +/-Inf) returns x for finite x (or if x is a NaN).
+    if isinf(x) and not isnan(y):
+        raise ValueError("math domain error")
 
-    _error_reset()
-    r = math_fmod(x, y)
-    errno = rposix.get_errno()
-    if isnan(r):
-        if isnan(x) or isnan(y):
-            errno = 0
-        else:
-            errno = EDOM
-    if errno:
-        _likely_raise(errno, r)
-    return r
+    if y == 0:
+        raise ValueError("math domain error")
+
+    return math_fmod(x, y)
 
 
 def ll_math_hypot(x, y):
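
For reference, plain-Python math.fmod raises the same two "math domain error"
cases that ll_math_fmod now raises explicitly (infinite x, or y == 0), so the
simplified version matches CPython for the common cases:

    import math

    assert math.fmod(5.5, 2.0) == 1.5
    for bad_args in [(float("inf"), 2.0), (2.0, 0.0)]:
        try:
            math.fmod(*bad_args)
        except ValueError:
            pass            # "math domain error", as in the patch above
        else:
            raise AssertionError("expected ValueError for %r" % (bad_args,))
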
diff --git a/pypy/rpython/lltypesystem/rffi.py b/pypy/rpython/lltypesystem/rffi.py
--- a/pypy/rpython/lltypesystem/rffi.py
+++ b/pypy/rpython/lltypesystem/rffi.py
@@ -56,7 +56,7 @@
                sandboxsafe=False, threadsafe='auto',
                _nowrapper=False, calling_conv='c',
                oo_primitive=None, elidable_function=False,
-               macro=None):
+               macro=None, random_effects_on_gcobjs='auto'):
     """Build an external function that will invoke the C function 'name'
     with the given 'args' types and 'result' type.
 
@@ -112,13 +112,19 @@
         # sandboxsafe is a hint for "too-small-ness" (e.g. math functions).
         invoke_around_handlers = not sandboxsafe
 
+    if random_effects_on_gcobjs not in (False, True):
+        random_effects_on_gcobjs = (
+            invoke_around_handlers or   # because it can release the GIL
+            has_callback)               # because the callback can do it
+
     funcptr = lltype.functionptr(ext_type, name, external='C',
                                  compilation_info=compilation_info,
                                  _callable=_callable,
                                  _safe_not_sandboxed=sandboxsafe,
                                  _debugexc=True, # on top of llinterp
                                  canraise=False,
-                                 releases_gil=invoke_around_handlers,
+                                 random_effects_on_gcobjs=
+                                     random_effects_on_gcobjs,
                                  **kwds)
     if isinstance(_callable, ll2ctypes.LL2CtypesCallable):
         _callable.funcptr = funcptr
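
The new random_effects_on_gcobjs argument defaults to 'auto', which resolves to
True exactly when the call may touch GC objects behind the compiler's back:
either because it releases the GIL (invoke_around_handlers) or because it can
re-enter via a callback.  A minimal stand-in for that resolution, with the same
names as in the hunk above:

    def resolve_random_effects(random_effects_on_gcobjs,
                               invoke_around_handlers, has_callback):
        if random_effects_on_gcobjs not in (False, True):
            # 'auto' (anything non-boolean) falls back to the heuristic
            random_effects_on_gcobjs = invoke_around_handlers or has_callback
        return random_effects_on_gcobjs

    assert resolve_random_effects('auto', True, False) is True
    assert resolve_random_effects('auto', False, False) is False
    assert resolve_random_effects(False, True, True) is False
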
diff --git a/pypy/rpython/lltypesystem/test/test_lloperation.py b/pypy/rpython/lltypesystem/test/test_lloperation.py
--- a/pypy/rpython/lltypesystem/test/test_lloperation.py
+++ b/pypy/rpython/lltypesystem/test/test_lloperation.py
@@ -144,6 +144,4 @@
     for opname, llop in LL_OPERATIONS.items():
         if llop.canrun:
             continue
-        if opname.startswith('gc_x_'):
-            continue   # ignore experimental stuff
         assert opname in LL_INTERP_OPERATIONS
diff --git a/pypy/rpython/memory/gc/base.py b/pypy/rpython/memory/gc/base.py
--- a/pypy/rpython/memory/gc/base.py
+++ b/pypy/rpython/memory/gc/base.py
@@ -69,7 +69,10 @@
                             varsize_offsets_to_gcpointers_in_var_part,
                             weakpointer_offset,
                             member_index,
-                            is_rpython_class):
+                            is_rpython_class,
+                            has_custom_trace,
+                            get_custom_trace,
+                            fast_path_tracing):
         self.getfinalizer = getfinalizer
         self.is_varsize = is_varsize
         self.has_gcptr_in_varsize = has_gcptr_in_varsize
@@ -83,6 +86,9 @@
         self.weakpointer_offset = weakpointer_offset
         self.member_index = member_index
         self.is_rpython_class = is_rpython_class
+        self.has_custom_trace = has_custom_trace
+        self.get_custom_trace = get_custom_trace
+        self.fast_path_tracing = fast_path_tracing
 
     def get_member_index(self, type_id):
         return self.member_index(type_id)
@@ -145,13 +151,13 @@
             else:
                 malloc_varsize = self.malloc_varsize
             ref = malloc_varsize(typeid, length, size, itemsize,
-                                 offset_to_length, True)
+                                 offset_to_length)
         else:
             if zero or not hasattr(self, 'malloc_fixedsize'):
                 malloc_fixedsize = self.malloc_fixedsize_clear
             else:
                 malloc_fixedsize = self.malloc_fixedsize
-            ref = malloc_fixedsize(typeid, size, True, needs_finalizer,
+            ref = malloc_fixedsize(typeid, size, needs_finalizer,
                                    contains_weakptr)
         # lots of cast and reverse-cast around...
         return llmemory.cast_ptr_to_adr(ref)
@@ -181,16 +187,25 @@
         Typically, 'callback' is a bound method and 'arg' can be None.
         """
         typeid = self.get_type_id(obj)
-        if self.is_gcarrayofgcptr(typeid):
-            # a performance shortcut for GcArray(gcptr)
-            length = (obj + llmemory.gcarrayofptr_lengthoffset).signed[0]
-            item = obj + llmemory.gcarrayofptr_itemsoffset
-            while length > 0:
-                if self.points_to_valid_gc_object(item):
-                    callback(item, arg)
-                item += llmemory.gcarrayofptr_singleitemoffset
-                length -= 1
-            return
+        #
+        # First, look if we need more than the simple fixed-size tracing
+        if not self.fast_path_tracing(typeid):
+            #
+            # Yes.  Two cases: either we are just a GcArray(gcptr), for
+            # which we have a special case for performance, or we call
+            # the slow path version.
+            if self.is_gcarrayofgcptr(typeid):
+                length = (obj + llmemory.gcarrayofptr_lengthoffset).signed[0]
+                item = obj + llmemory.gcarrayofptr_itemsoffset
+                while length > 0:
+                    if self.points_to_valid_gc_object(item):
+                        callback(item, arg)
+                    item += llmemory.gcarrayofptr_singleitemoffset
+                    length -= 1
+                return
+            self._trace_slow_path(obj, callback, arg)
+        #
+        # Do the tracing on the fixed-size part of the object.
         offsets = self.offsets_to_gc_pointers(typeid)
         i = 0
         while i < len(offsets):
@@ -198,6 +213,10 @@
             if self.points_to_valid_gc_object(item):
                 callback(item, arg)
             i += 1
+    trace._annspecialcase_ = 'specialize:arg(2)'
+
+    def _trace_slow_path(self, obj, callback, arg):
+        typeid = self.get_type_id(obj)
         if self.has_gcptr_in_varsize(typeid):
             item = obj + self.varsize_offset_to_variable_part(typeid)
             length = (obj + self.varsize_offset_to_length(typeid)).signed[0]
@@ -212,7 +231,16 @@
                     j += 1
                 item += itemlength
                 length -= 1
-    trace._annspecialcase_ = 'specialize:arg(2)'
+        if self.has_custom_trace(typeid):
+            generator = self.get_custom_trace(typeid)
+            item = llmemory.NULL
+            while True:
+                item = generator(obj, item)
+                if not item:
+                    break
+                if self.points_to_valid_gc_object(item):
+                    callback(item, arg)
+    _trace_slow_path._annspecialcase_ = 'specialize:arg(2)'
 
     def trace_partial(self, obj, start, stop, callback, arg):
         """Like trace(), but only walk the array part, for indices in
@@ -317,7 +345,7 @@
                     break
                 obj = self.run_finalizers.popleft()
                 finalizer = self.getfinalizer(self.get_type_id(obj))
-                finalizer(obj)
+                finalizer(obj, llmemory.NULL)
         finally:
             self.finalizer_lock_count -= 1
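
The new _trace_slow_path() supports per-type custom tracers: the 'generator'
returned by get_custom_trace(typeid) is called repeatedly with the previously
returned item and yields the next GC-pointer location, or NULL to stop.  A
plain-Python model of that cursor-style protocol (the object layout is made up;
only the enumeration loop mirrors the patch):

    def custom_trace(obj, prev):
        # hypothetical tracer: walks a list of pointer locations one at a time
        fields = obj["gc_fields"]
        index = 0 if prev is None else fields.index(prev) + 1
        if index >= len(fields):
            return None            # plays the role of llmemory.NULL
        return fields[index]

    def trace_custom(obj, callback):
        item = None                # start from "NULL"
        while True:
            item = custom_trace(obj, item)
            if not item:
                break
            callback(item)

    obj = {"gc_fields": ["field_a", "field_b", "field_c"]}
    seen = []
    trace_custom(obj, seen.append)
    assert seen == ["field_a", "field_b", "field_c"]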
 
diff --git a/pypy/rpython/memory/gc/generation.py b/pypy/rpython/memory/gc/generation.py
--- a/pypy/rpython/memory/gc/generation.py
+++ b/pypy/rpython/memory/gc/generation.py
@@ -166,9 +166,9 @@
                 return False
         return self.nursery <= addr < self.nursery_top
 
-    def malloc_fixedsize_clear(self, typeid, size, can_collect,
+    def malloc_fixedsize_clear(self, typeid, size,
                                has_finalizer=False, contains_weakptr=False):
-        if (has_finalizer or not can_collect or
+        if (has_finalizer or
             (raw_malloc_usage(size) > self.lb_young_fixedsize and
              raw_malloc_usage(size) > self.largest_young_fixedsize)):
             # ^^^ we do two size comparisons; the first one appears redundant,
@@ -178,7 +178,6 @@
             ll_assert(not contains_weakptr, "wrong case for mallocing weakref")
             # "non-simple" case or object too big: don't use the nursery
             return SemiSpaceGC.malloc_fixedsize_clear(self, typeid, size,
-                                                      can_collect,
                                                       has_finalizer,
                                                       contains_weakptr)
         size_gc_header = self.gcheaderbuilder.size_gc_header
@@ -195,7 +194,7 @@
         return llmemory.cast_adr_to_ptr(result+size_gc_header, llmemory.GCREF)
 
     def malloc_varsize_clear(self, typeid, length, size, itemsize,
-                             offset_to_length, can_collect):
+                             offset_to_length):
         # Only use the nursery if there are not too many items.
         if not raw_malloc_usage(itemsize):
             too_many_items = False
@@ -214,8 +213,7 @@
             maxlength = maxlength_for_minimal_nursery << self.nursery_scale
             too_many_items = length > maxlength
 
-        if (not can_collect or
-            too_many_items or
+        if (too_many_items or
             (raw_malloc_usage(size) > self.lb_young_var_basesize and
              raw_malloc_usage(size) > self.largest_young_var_basesize)):
             # ^^^ we do two size comparisons; the first one appears redundant,
@@ -223,8 +221,7 @@
             #     it almost always folds down to False, which kills the
             #     second comparison as well.
             return SemiSpaceGC.malloc_varsize_clear(self, typeid, length, size,
-                                                    itemsize, offset_to_length,
-                                                    can_collect)
+                                                    itemsize, offset_to_length)
         # with the above checks we know now that totalsize cannot be more
         # than about half of the nursery size; in particular, the + and *
         # cannot overflow
diff --git a/pypy/rpython/memory/gc/hybrid.py b/pypy/rpython/memory/gc/hybrid.py
--- a/pypy/rpython/memory/gc/hybrid.py
+++ b/pypy/rpython/memory/gc/hybrid.py
@@ -129,11 +129,7 @@
     # 'large'.
 
     def malloc_varsize_clear(self, typeid, length, size, itemsize,
-                             offset_to_length, can_collect):
-        if not can_collect:
-            return SemiSpaceGC.malloc_varsize_clear(self, typeid, length, size,
-                                                    itemsize, offset_to_length,
-                                                    can_collect)
+                             offset_to_length):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         nonvarsize = size_gc_header + size
 
@@ -225,9 +221,9 @@
             totalsize)
         return result
 
-    def _check_rawsize_alloced(self, size_estimate, can_collect=True):
+    def _check_rawsize_alloced(self, size_estimate):
         self.large_objects_collect_trigger -= size_estimate
-        if can_collect and self.large_objects_collect_trigger < 0:
+        if self.large_objects_collect_trigger < 0:
             debug_start("gc-rawsize-collect")
             debug_print("allocated", (self._initial_trigger -
                                       self.large_objects_collect_trigger),
diff --git a/pypy/rpython/memory/gc/markcompact.py b/pypy/rpython/memory/gc/markcompact.py
--- a/pypy/rpython/memory/gc/markcompact.py
+++ b/pypy/rpython/memory/gc/markcompact.py
@@ -88,6 +88,9 @@
 
     def __init__(self, config, space_size=4096,
                  min_next_collect_after=128, **kwds):
+        import py
+        py.test.skip("the 'markcompact' gc needs fixing for custom tracers")
+        #
         MovingGCBase.__init__(self, config, **kwds)
         self.space_size = space_size
         self.min_next_collect_after = min_next_collect_after
@@ -177,14 +180,14 @@
         return llmemory.cast_adr_to_ptr(result+size_gc_header, llmemory.GCREF)
     _setup_object._always_inline_ = True
 
-    def malloc_fixedsize(self, typeid16, size, can_collect,
+    def malloc_fixedsize(self, typeid16, size,
                          has_finalizer=False, contains_weakptr=False):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         totalsize = size_gc_header + size
         result = self._get_memory(totalsize)
         return self._setup_object(result, typeid16, has_finalizer)
 
-    def malloc_fixedsize_clear(self, typeid16, size, can_collect,
+    def malloc_fixedsize_clear(self, typeid16, size,
                                has_finalizer=False, contains_weakptr=False):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         totalsize = size_gc_header + size
@@ -193,7 +196,7 @@
         return self._setup_object(result, typeid16, has_finalizer)
 
     def malloc_varsize_clear(self, typeid16, length, size, itemsize,
-                             offset_to_length, can_collect):
+                             offset_to_length):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         nonvarsize = size_gc_header + size
         totalsize = self._get_totalsize_var(nonvarsize, itemsize, length)
diff --git a/pypy/rpython/memory/gc/marksweep.py b/pypy/rpython/memory/gc/marksweep.py
--- a/pypy/rpython/memory/gc/marksweep.py
+++ b/pypy/rpython/memory/gc/marksweep.py
@@ -13,14 +13,14 @@
 
 import sys, os
 
-X_POOL = lltype.GcOpaqueType('gc.pool')
-X_POOL_PTR = lltype.Ptr(X_POOL)
-X_CLONE = lltype.GcStruct('CloneData', ('gcobjectptr', llmemory.GCREF),
-                                       ('pool',        X_POOL_PTR))
-X_CLONE_PTR = lltype.Ptr(X_CLONE)
+##X_POOL = lltype.GcOpaqueType('gc.pool')
+##X_POOL_PTR = lltype.Ptr(X_POOL)
+##X_CLONE = lltype.GcStruct('CloneData', ('gcobjectptr', llmemory.GCREF),
+##                                       ('pool',        X_POOL_PTR))
+##X_CLONE_PTR = lltype.Ptr(X_CLONE)
 
 FL_WITHHASH = 0x01
-FL_CURPOOL  = 0x02
+##FL_CURPOOL  = 0x02
 
 memoryError = MemoryError()
 class MarkSweepGC(GCBase):
@@ -92,10 +92,9 @@
     def write_free_statistics(self, typeid16, result):
         pass
 
-    def malloc_fixedsize(self, typeid16, size, can_collect,
+    def malloc_fixedsize(self, typeid16, size,
                          has_finalizer=False, contains_weakptr=False):
-        if can_collect:
-            self.maybe_collect()
+        self.maybe_collect()
         size_gc_header = self.gcheaderbuilder.size_gc_header
         try:
             tot_size = size_gc_header + size
@@ -128,10 +127,9 @@
         return llmemory.cast_adr_to_ptr(result, llmemory.GCREF)
     malloc_fixedsize._dont_inline_ = True
 
-    def malloc_fixedsize_clear(self, typeid16, size, can_collect,
+    def malloc_fixedsize_clear(self, typeid16, size,
                                has_finalizer=False, contains_weakptr=False):
-        if can_collect:
-            self.maybe_collect()
+        self.maybe_collect()
         size_gc_header = self.gcheaderbuilder.size_gc_header
         try:
             tot_size = size_gc_header + size
@@ -166,9 +164,8 @@
     malloc_fixedsize_clear._dont_inline_ = True
 
     def malloc_varsize(self, typeid16, length, size, itemsize,
-                       offset_to_length, can_collect):
-        if can_collect:
-            self.maybe_collect()
+                       offset_to_length):
+        self.maybe_collect()
         size_gc_header = self.gcheaderbuilder.size_gc_header
         try:
             fixsize = size_gc_header + size
@@ -200,9 +197,8 @@
     malloc_varsize._dont_inline_ = True
 
     def malloc_varsize_clear(self, typeid16, length, size, itemsize,
-                             offset_to_length, can_collect):
-        if can_collect:
-            self.maybe_collect()
+                             offset_to_length):
+        self.maybe_collect()
         size_gc_header = self.gcheaderbuilder.size_gc_header
         try:
             fixsize = size_gc_header + size
@@ -450,7 +446,7 @@
                 hdr.next = self.malloced_objects
                 self.malloced_objects = hdr
                 #llop.debug_view(lltype.Void, self.malloced_objects, self.malloced_objects_with_finalizer, size_gc_header)
-                finalizer(obj)
+                finalizer(obj, llmemory.NULL)
                 if not self.collect_in_progress: # another collection was caused?
                     debug_print("outer collect interrupted "
                                 "by recursive collect")
@@ -526,178 +522,10 @@
 
     # experimental support for thread cloning
     def x_swap_pool(self, newpool):
-        # Set newpool as the current pool (create one if newpool == NULL).
-        # All malloc'ed objects are put into the current pool;this is a
-        # way to separate objects depending on when they were allocated.
-        size_gc_header = self.gcheaderbuilder.size_gc_header
-        # invariant: each POOL GcStruct is at the _front_ of a linked list
-        # of malloced objects.
-        oldpool = self.curpool
-        #llop.debug_print(lltype.Void, 'x_swap_pool',
-        #                 lltype.cast_ptr_to_int(oldpool),
-        #                 lltype.cast_ptr_to_int(newpool))
-        if not oldpool:
-            # make a fresh pool object, which is automatically inserted at the
-            # front of the current list
-            oldpool = lltype.malloc(self.POOL)
-            addr = llmemory.cast_ptr_to_adr(oldpool)
-            addr -= size_gc_header
-            hdr = llmemory.cast_adr_to_ptr(addr, self.HDRPTR)
-            # put this new POOL object in the poolnodes list
-            node = lltype.malloc(self.POOLNODE, flavor="raw")
-            node.linkedlist = hdr
-            node.nextnode = self.poolnodes
-            self.poolnodes = node
-        else:
-            # manually insert oldpool at the front of the current list
-            addr = llmemory.cast_ptr_to_adr(oldpool)
-            addr -= size_gc_header
-            hdr = llmemory.cast_adr_to_ptr(addr, self.HDRPTR)
-            hdr.next = self.malloced_objects
-
-        newpool = lltype.cast_opaque_ptr(self.POOLPTR, newpool)
-        if newpool:
-            # newpool is at the front of the new linked list to install
-            addr = llmemory.cast_ptr_to_adr(newpool)
-            addr -= size_gc_header
-            hdr = llmemory.cast_adr_to_ptr(addr, self.HDRPTR)
-            self.malloced_objects = hdr.next
-            # invariant: now that objects in the hdr.next list are accessible
-            # through self.malloced_objects, make sure they are not accessible
-            # via poolnodes (which has a node pointing to newpool):
-            hdr.next = lltype.nullptr(self.HDR)
-        else:
-            # start a fresh new linked list
-            self.malloced_objects = lltype.nullptr(self.HDR)
-        self.curpool = newpool
-        return lltype.cast_opaque_ptr(X_POOL_PTR, oldpool)
+        raise NotImplementedError("old operation deprecated")
 
     def x_clone(self, clonedata):
-        # Recursively clone the gcobject and everything it points to,
-        # directly or indirectly -- but stops at objects that are not
-        # in the specified pool.  A new pool is built to contain the
-        # copies, and the 'gcobjectptr' and 'pool' fields of clonedata
-        # are adjusted to refer to the result.
-
-        # install a new pool into which all the mallocs go
-        curpool = self.x_swap_pool(lltype.nullptr(X_POOL))
-
-        size_gc_header = self.gcheaderbuilder.size_gc_header
-        oldobjects = self.AddressStack()
-        # if no pool specified, use the current pool as the 'source' pool
-        oldpool = clonedata.pool or curpool
-        oldpool = lltype.cast_opaque_ptr(self.POOLPTR, oldpool)
-        addr = llmemory.cast_ptr_to_adr(oldpool)
-        addr -= size_gc_header
-
-        hdr = llmemory.cast_adr_to_ptr(addr, self.HDRPTR)
-        hdr = hdr.next   # skip the POOL object itself
-        while hdr:
-            next = hdr.next
-            # mark all objects from malloced_list
-            hdr.flags = chr(ord(hdr.flags) | FL_CURPOOL)
-            hdr.next = lltype.nullptr(self.HDR)  # abused to point to the copy
-            oldobjects.append(llmemory.cast_ptr_to_adr(hdr))
-            hdr = next
-
-        # a stack of addresses of places that still points to old objects
-        # and that must possibly be fixed to point to a new copy
-        stack = self.AddressStack()
-        stack.append(llmemory.cast_ptr_to_adr(clonedata)
-                     + llmemory.offsetof(X_CLONE, 'gcobjectptr'))
-        while stack.non_empty():
-            gcptr_addr = stack.pop()
-            oldobj_addr = gcptr_addr.address[0]
-            if not oldobj_addr:
-                continue   # pointer is NULL
-            oldhdr = llmemory.cast_adr_to_ptr(oldobj_addr - size_gc_header,
-                                              self.HDRPTR)
-            if not (ord(oldhdr.flags) & FL_CURPOOL):
-                continue   # ignore objects that were not in the malloced_list
-            newhdr = oldhdr.next      # abused to point to the copy
-            if not newhdr:
-                typeid = oldhdr.typeid16
-                size = self.fixed_size(typeid)
-                # XXX! collect() at the beginning if the free heap is low
-                if self.is_varsize(typeid):
-                    itemsize = self.varsize_item_sizes(typeid)
-                    offset_to_length = self.varsize_offset_to_length(typeid)
-                    length = (oldobj_addr + offset_to_length).signed[0]
-                    newobj = self.malloc_varsize(typeid, length, size,
-                                                 itemsize, offset_to_length,
-                                                 False)
-                    size += length*itemsize
-                else:
-                    newobj = self.malloc_fixedsize(typeid, size, False)
-                    length = -1
-
-                newobj_addr = llmemory.cast_ptr_to_adr(newobj)
-
-                #llop.debug_print(lltype.Void, 'clone',
-                #                 llmemory.cast_adr_to_int(oldobj_addr),
-                #                 '->', llmemory.cast_adr_to_int(newobj_addr),
-                #                 'typeid', typeid,
-                #                 'length', length)
-
-                newhdr_addr = newobj_addr - size_gc_header
-                newhdr = llmemory.cast_adr_to_ptr(newhdr_addr, self.HDRPTR)
-
-                saved_id   = newhdr.typeid16  # XXX hack needed for genc
-                saved_flg1 = newhdr.mark
-                saved_flg2 = newhdr.flags
-                saved_next = newhdr.next      # where size_gc_header == 0
-                raw_memcopy(oldobj_addr, newobj_addr, size)
-                newhdr.typeid16 = saved_id
-                newhdr.mark     = saved_flg1
-                newhdr.flags    = saved_flg2
-                newhdr.next     = saved_next
-
-                offsets = self.offsets_to_gc_pointers(typeid)
-                i = 0
-                while i < len(offsets):
-                    pointer_addr = newobj_addr + offsets[i]
-                    stack.append(pointer_addr)
-                    i += 1
-
-                if length > 0:
-                    offsets = self.varsize_offsets_to_gcpointers_in_var_part(
-                        typeid)
-                    itemlength = self.varsize_item_sizes(typeid)
-                    offset = self.varsize_offset_to_variable_part(typeid)
-                    itembaseaddr = newobj_addr + offset
-                    i = 0
-                    while i < length:
-                        item = itembaseaddr + itemlength * i
-                        j = 0
-                        while j < len(offsets):
-                            pointer_addr = item + offsets[j]
-                            stack.append(pointer_addr)
-                            j += 1
-                        i += 1
-
-                oldhdr.next = newhdr
-            newobj_addr = llmemory.cast_ptr_to_adr(newhdr) + size_gc_header
-            gcptr_addr.address[0] = newobj_addr
-        stack.delete()
-
-        # re-create the original linked list
-        next = lltype.nullptr(self.HDR)
-        while oldobjects.non_empty():
-            hdr = llmemory.cast_adr_to_ptr(oldobjects.pop(), self.HDRPTR)
-            hdr.flags = chr(ord(hdr.flags) &~ FL_CURPOOL)  # reset the flag
-            hdr.next = next
-            next = hdr
-        oldobjects.delete()
-
-        # consistency check
-        addr = llmemory.cast_ptr_to_adr(oldpool)
-        addr -= size_gc_header
-        hdr = llmemory.cast_adr_to_ptr(addr, self.HDRPTR)
-        assert hdr.next == next
-
-        # build the new pool object collecting the new objects, and
-        # reinstall the pool that was current at the beginning of x_clone()
-        clonedata.pool = self.x_swap_pool(curpool)
+        raise NotImplementedError("old operation deprecated")
 
     def identityhash(self, obj):
         obj = llmemory.cast_ptr_to_adr(obj)
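Note the changed call in the sweep loop above: finalizer(obj, llmemory.NULL). Finalizers now share a calling convention with the custom tracers added in this merge, taking two addresses and returning an address; a plain finalizer ignores the second argument and returns NULL. A small sketch of that convention, with ordinary Python callables standing in for the llmemory helpers:

    NULL = 0  # stand-in for llmemory.NULL in this sketch

    # New-style callback: (object_address, extra_address) -> address.
    # A finalizer ignores 'extra' and returns NULL; a custom tracer would use
    # it as the previous position and return the next pointer location.
    def ll_finalizer(addr, ignored):
        print("finalizing object at", hex(addr))
        return NULL

    def sweep(dead_objects, finalizer):
        for addr in dead_objects:
            finalizer(addr, NULL)    # was finalizer(addr) before this merge

    sweep([0x1000, 0x2000], ll_finalizer)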
diff --git a/pypy/rpython/memory/gc/minimark.py b/pypy/rpython/memory/gc/minimark.py
--- a/pypy/rpython/memory/gc/minimark.py
+++ b/pypy/rpython/memory/gc/minimark.py
@@ -456,9 +456,8 @@
             debug_stop("gc-debug")
 
 
-    def malloc_fixedsize_clear(self, typeid, size, can_collect=True,
+    def malloc_fixedsize_clear(self, typeid, size,
                                needs_finalizer=False, contains_weakptr=False):
-        ll_assert(can_collect, "!can_collect")
         size_gc_header = self.gcheaderbuilder.size_gc_header
         totalsize = size_gc_header + size
         rawtotalsize = raw_malloc_usage(totalsize)
@@ -507,8 +506,7 @@
 
 
     def malloc_varsize_clear(self, typeid, length, size, itemsize,
-                             offset_to_length, can_collect):
-        ll_assert(can_collect, "!can_collect")
+                             offset_to_length):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         nonvarsize = size_gc_header + size
         #
@@ -1704,7 +1702,7 @@
             None)   # we don't need the static in all prebuilt gc objects
         #
         # If we are in an inner collection caused by a call to a finalizer,
-        # the 'run_finalizers' objects also need to kept alive.
+        # the 'run_finalizers' objects also need to be kept alive.
         self.run_finalizers.foreach(self._collect_obj,
                                     self.objects_to_trace)
 
diff --git a/pypy/rpython/memory/gc/semispace.py b/pypy/rpython/memory/gc/semispace.py
--- a/pypy/rpython/memory/gc/semispace.py
+++ b/pypy/rpython/memory/gc/semispace.py
@@ -92,14 +92,12 @@
     # This class only defines the malloc_{fixed,var}size_clear() methods
     # because the spaces are filled with zeroes in advance.
 
-    def malloc_fixedsize_clear(self, typeid16, size, can_collect,
+    def malloc_fixedsize_clear(self, typeid16, size,
                                has_finalizer=False, contains_weakptr=False):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         totalsize = size_gc_header + size
         result = self.free
         if raw_malloc_usage(totalsize) > self.top_of_space - result:
-            if not can_collect:
-                raise memoryError
             result = self.obtain_free_space(totalsize)
         llarena.arena_reserve(result, totalsize)
         self.init_gc_object(result, typeid16)
@@ -111,7 +109,7 @@
         return llmemory.cast_adr_to_ptr(result+size_gc_header, llmemory.GCREF)
 
     def malloc_varsize_clear(self, typeid16, length, size, itemsize,
-                             offset_to_length, can_collect):
+                             offset_to_length):
         size_gc_header = self.gcheaderbuilder.size_gc_header
         nonvarsize = size_gc_header + size
         try:
@@ -121,8 +119,6 @@
             raise memoryError
         result = self.free
         if raw_malloc_usage(totalsize) > self.top_of_space - result:
-            if not can_collect:
-                raise memoryError
             result = self.obtain_free_space(totalsize)
         llarena.arena_reserve(result, totalsize)
         self.init_gc_object(result, typeid16)
diff --git a/pypy/rpython/memory/gctransform/asmgcroot.py b/pypy/rpython/memory/gctransform/asmgcroot.py
--- a/pypy/rpython/memory/gctransform/asmgcroot.py
+++ b/pypy/rpython/memory/gctransform/asmgcroot.py
@@ -147,6 +147,11 @@
             self._extra_gcmapend    = lambda: llmemory.NULL
             self._extra_mark_sorted = lambda: True
 
+    def need_stacklet_support(self, gctransformer, getfn):
+        # stacklet support: BIG HACK for rlib.rstacklet
+        from pypy.rlib import _stacklet_asmgcc
+        _stacklet_asmgcc._asmstackrootwalker = self     # as a global! argh
+
     def need_thread_support(self, gctransformer, getfn):
         # Threads supported "out of the box" by the rest of the code.
         # The whole code in this function is only there to support
@@ -361,12 +366,13 @@
         # found!  Enumerate the GC roots in the caller frame
         #
         collect_stack_root = self.gcdata._gc_collect_stack_root
+        ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP].address[0]
         gc = self.gc
         while True:
             location = self._shape_decompressor.next()
             if location == 0:
                 break
-            addr = self.getlocation(callee, location)
+            addr = self.getlocation(callee, ebp_in_caller, location)
             if gc.points_to_valid_gc_object(addr):
                 collect_stack_root(gc, addr)
         #
@@ -376,12 +382,13 @@
         reg = CALLEE_SAVED_REGS - 1
         while reg >= 0:
             location = self._shape_decompressor.next()
-            addr = self.getlocation(callee, location)
+            addr = self.getlocation(callee, ebp_in_caller, location)
             caller.regs_stored_at[reg] = addr
             reg -= 1
 
         location = self._shape_decompressor.next()
-        caller.frame_address = self.getlocation(callee, location)
+        caller.frame_address = self.getlocation(callee, ebp_in_caller,
+                                                location)
         # we get a NULL marker to mean "I'm the frame
         # of the entry point, stop walking"
         return caller.frame_address != llmemory.NULL
@@ -429,7 +436,7 @@
             return
         llop.debug_fatalerror(lltype.Void, "cannot find gc roots!")
 
-    def getlocation(self, callee, location):
+    def getlocation(self, callee, ebp_in_caller, location):
         """Get the location in the 'caller' frame of a variable, based
         on the integer 'location' that describes it.  All locations are
         computed based on information saved by the 'callee'.
@@ -447,10 +454,8 @@
             esp_in_caller = callee.frame_address + sizeofaddr
             return esp_in_caller + offset
         elif kind == LOC_EBP_PLUS:    # in the caller stack frame at N(%ebp)
-            ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP].address[0]
             return ebp_in_caller + offset
         else:  # kind == LOC_EBP_MINUS:   at -N(%ebp)
-            ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP].address[0]
             return ebp_in_caller - offset
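The walker refactoring above reads the caller's saved %ebp once per frame (ebp_in_caller) and passes it into getlocation(), instead of re-fetching it from the callee's saved registers for every LOC_EBP_PLUS/LOC_EBP_MINUS location. A self-contained sketch of the hoisting, with a plain dict standing in for the callee frame description:

    # Illustrative only: the caller loads the saved %ebp once and hands it
    # down; getlocation() no longer reloads it on every call.
    LOC_ESP_PLUS, LOC_EBP_PLUS, LOC_EBP_MINUS = 0, 1, 2

    def getlocation(callee, ebp_in_caller, kind, offset):
        if kind == LOC_ESP_PLUS:
            return callee['esp_in_caller'] + offset
        elif kind == LOC_EBP_PLUS:
            return ebp_in_caller + offset      # no per-location reload
        else:  # LOC_EBP_MINUS
            return ebp_in_caller - offset

    def walk_frame(callee, locations):
        ebp_in_caller = callee['saved_ebp']    # read exactly once per frame
        return [getlocation(callee, ebp_in_caller, kind, ofs)
                for kind, ofs in locations]

    callee = {'saved_ebp': 0x1000, 'esp_in_caller': 0x2000}
    print(walk_frame(callee, [(LOC_EBP_PLUS, 8), (LOC_EBP_MINUS, 4)]))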
 
 
diff --git a/pypy/rpython/memory/gctransform/framework.py b/pypy/rpython/memory/gctransform/framework.py
--- a/pypy/rpython/memory/gctransform/framework.py
+++ b/pypy/rpython/memory/gctransform/framework.py
@@ -7,7 +7,7 @@
 from pypy.rpython.memory.gc import marksweep
 from pypy.rpython.memory.gcheader import GCHeaderBuilder
 from pypy.rlib.rarithmetic import ovfcheck
-from pypy.rlib import rstack, rgc
+from pypy.rlib import rgc
 from pypy.rlib.debug import ll_assert
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.translator.backendopt import graphanalyze
@@ -33,8 +33,6 @@
         except AttributeError:
             pass
         else:
-            if func is rstack.stack_check:
-                return self.translator.config.translation.stackless
             if getattr(func, '_gctransformer_hint_cannot_collect_', False):
                 return False
             if getattr(func, '_gctransformer_hint_close_stack_', False):
@@ -50,10 +48,10 @@
     def analyze_simple_operation(self, op, graphinfo):
         if op.opname in ('malloc', 'malloc_varsize'):
             flags = op.args[1].value
-            return flags['flavor'] == 'gc' and not flags.get('nocollect', False)
+            return flags['flavor'] == 'gc'
         else:
             return (op.opname in LL_OPERATIONS and
-                    LL_OPERATIONS[op.opname].canunwindgc)
+                    LL_OPERATIONS[op.opname].canmallocgc)
 
 def find_initializing_stores(collect_analyzer, graph):
     from pypy.objspace.flow.model import mkentrymap
@@ -134,8 +132,7 @@
     return result
 
 class FrameworkGCTransformer(GCTransformer):
-    use_stackless = False
-    root_stack_depth = 163840
+    root_stack_depth = None    # for tests to override
 
     def __init__(self, translator):
         from pypy.rpython.memory.gc.base import choose_gc_from_config
@@ -152,13 +149,8 @@
             # for regular translation: pick the GC from the config
             GCClass, GC_PARAMS = choose_gc_from_config(translator.config)
 
-        self.root_stack_jit_hook = None
         if hasattr(translator, '_jit2gc'):
             self.layoutbuilder = translator._jit2gc['layoutbuilder']
-            try:
-                self.root_stack_jit_hook = translator._jit2gc['rootstackhook']
-            except KeyError:
-                pass
         else:
             self.layoutbuilder = TransformerLayoutBuilder(translator, GCClass)
         self.layoutbuilder.transformer = self
@@ -265,7 +257,7 @@
             malloc_fixedsize_clear_meth,
             [s_gc, s_typeid16,
              annmodel.SomeInteger(nonneg=True),
-             annmodel.SomeBool(), annmodel.SomeBool(),
+             annmodel.SomeBool(),
              annmodel.SomeBool()], s_gcref,
             inline = False)
         if hasattr(GCClass, 'malloc_fixedsize'):
@@ -274,7 +266,7 @@
                 malloc_fixedsize_meth,
                 [s_gc, s_typeid16,
                  annmodel.SomeInteger(nonneg=True),
-                 annmodel.SomeBool(), annmodel.SomeBool(),
+                 annmodel.SomeBool(),
                  annmodel.SomeBool()], s_gcref,
                 inline = False)
         else:
@@ -283,12 +275,11 @@
 ##         self.malloc_varsize_ptr = getfn(
 ##             GCClass.malloc_varsize.im_func,
 ##             [s_gc] + [annmodel.SomeInteger(nonneg=True) for i in range(5)]
-##             + [annmodel.SomeBool(), annmodel.SomeBool()], s_gcref)
+##             + [annmodel.SomeBool()], s_gcref)
         self.malloc_varsize_clear_ptr = getfn(
             GCClass.malloc_varsize_clear.im_func,
             [s_gc, s_typeid16]
-            + [annmodel.SomeInteger(nonneg=True) for i in range(4)]
-            + [annmodel.SomeBool()], s_gcref)
+            + [annmodel.SomeInteger(nonneg=True) for i in range(4)], s_gcref)
         self.collect_ptr = getfn(GCClass.collect.im_func,
             [s_gc, annmodel.SomeInteger()], annmodel.s_None)
         self.can_move_ptr = getfn(GCClass.can_move.im_func,
@@ -342,13 +333,11 @@
                 malloc_fast_meth,
                 "malloc_fast")
             s_False = annmodel.SomeBool(); s_False.const = False
-            s_True  = annmodel.SomeBool(); s_True .const = True
             self.malloc_fast_ptr = getfn(
                 malloc_fast,
                 [s_gc, s_typeid16,
                  annmodel.SomeInteger(nonneg=True),
-                 s_True, s_False,
-                 s_False], s_gcref,
+                 s_False, s_False], s_gcref,
                 inline = True)
         else:
             self.malloc_fast_ptr = None
@@ -362,15 +351,13 @@
                 GCClass.malloc_varsize_clear.im_func,
                 "malloc_varsize_clear_fast")
             s_False = annmodel.SomeBool(); s_False.const = False
-            s_True  = annmodel.SomeBool(); s_True .const = True
             self.malloc_varsize_clear_fast_ptr = getfn(
                 malloc_varsize_clear_fast,
                 [s_gc, s_typeid16,
                  annmodel.SomeInteger(nonneg=True),
                  annmodel.SomeInteger(nonneg=True),
                  annmodel.SomeInteger(nonneg=True),
-                 annmodel.SomeInteger(nonneg=True),
-                 s_True], s_gcref,
+                 annmodel.SomeInteger(nonneg=True)], s_gcref,
                 inline = True)
         else:
             self.malloc_varsize_clear_fast_ptr = None
@@ -491,21 +478,9 @@
                                     [s_gc, annmodel.SomeInteger()],
                                     annmodel.SomeInteger())
 
-        # experimental gc_x_* operations
-        s_x_pool  = annmodel.SomePtr(marksweep.X_POOL_PTR)
-        s_x_clone = annmodel.SomePtr(marksweep.X_CLONE_PTR)
-        # the x_*() methods use some regular mallocs that must be
-        # transformed in the normal way
-        self.x_swap_pool_ptr = getfn(GCClass.x_swap_pool.im_func,
-                                     [s_gc, s_x_pool],
-                                     s_x_pool,
-                                     minimal_transform = False)
-        self.x_clone_ptr = getfn(GCClass.x_clone.im_func,
-                                 [s_gc, s_x_clone],
-                                 annmodel.s_None,
-                                 minimal_transform = False)
-
         # thread support
+        if translator.config.translation.continuation:
+            root_walker.need_stacklet_support(self, getfn)
         if translator.config.translation.thread:
             root_walker.need_thread_support(self, getfn)
 
@@ -547,8 +522,8 @@
     #    this method is attached to the instance and redirects to
     #    layoutbuilder.get_type_id().
 
-    def finalizer_funcptr_for_type(self, TYPE):
-        return self.layoutbuilder.finalizer_funcptr_for_type(TYPE)
+    def special_funcptr_for_type(self, TYPE):
+        return self.layoutbuilder.special_funcptr_for_type(TYPE)
 
     def gc_header_for(self, obj, needs_hash=False):
         hdr = self.gcdata.gc.gcheaderbuilder.header_of_object(obj)
@@ -682,7 +657,6 @@
     def gct_fv_gc_malloc(self, hop, flags, TYPE, *args):
         op = hop.spaceop
         flavor = flags['flavor']
-        c_can_collect = rmodel.inputconst(lltype.Bool, not flags.get('nocollect', False))
 
         PTRTYPE = op.result.concretetype
         assert PTRTYPE.TO == TYPE
@@ -691,21 +665,23 @@
         c_type_id = rmodel.inputconst(TYPE_ID, type_id)
         info = self.layoutbuilder.get_info(type_id)
         c_size = rmodel.inputconst(lltype.Signed, info.fixedsize)
-        has_finalizer = bool(self.finalizer_funcptr_for_type(TYPE))
+        kind_and_fptr = self.special_funcptr_for_type(TYPE)
+        has_finalizer = (kind_and_fptr is not None and
+                         kind_and_fptr[0] == "finalizer")
         c_has_finalizer = rmodel.inputconst(lltype.Bool, has_finalizer)
 
         if not op.opname.endswith('_varsize') and not flags.get('varsize'):
             #malloc_ptr = self.malloc_fixedsize_ptr
             zero = flags.get('zero', False)
             if (self.malloc_fast_ptr is not None and
-                c_can_collect.value and not c_has_finalizer.value and
+                not c_has_finalizer.value and
                 (self.malloc_fast_is_clearing or not zero)):
                 malloc_ptr = self.malloc_fast_ptr
             elif zero:
                 malloc_ptr = self.malloc_fixedsize_clear_ptr
             else:
                 malloc_ptr = self.malloc_fixedsize_ptr
-            args = [self.c_const_gc, c_type_id, c_size, c_can_collect,
+            args = [self.c_const_gc, c_type_id, c_size,
                     c_has_finalizer, rmodel.inputconst(lltype.Bool, False)]
         else:
             assert not c_has_finalizer.value
@@ -718,17 +694,15 @@
             if flags.get('nonmovable') and self.malloc_varsize_nonmovable_ptr:
                 # we don't have tests for such cases, let's fail
                 # explicitly
-                assert c_can_collect.value
                 malloc_ptr = self.malloc_varsize_nonmovable_ptr
                 args = [self.c_const_gc, c_type_id, v_length]
             else:
-                if (self.malloc_varsize_clear_fast_ptr is not None and
-                    c_can_collect.value):
+                if self.malloc_varsize_clear_fast_ptr is not None:
                     malloc_ptr = self.malloc_varsize_clear_fast_ptr
                 else:
                     malloc_ptr = self.malloc_varsize_clear_ptr
                 args = [self.c_const_gc, c_type_id, v_length, c_size,
-                        c_varitemsize, c_ofstolength, c_can_collect]
+                        c_varitemsize, c_ofstolength]
         livevars = self.push_roots(hop)
         v_result = hop.genop("direct_call", [malloc_ptr] + args,
                              resulttype=llmemory.GCREF)
@@ -768,8 +742,13 @@
                   resultvar=op.result)
 
     def gct_gc_assume_young_pointers(self, hop):
+        if not hasattr(self, 'assume_young_pointers_ptr'):
+            return
         op = hop.spaceop
         v_addr = op.args[0]
+        if v_addr.concretetype != llmemory.Address:
+            v_addr = hop.genop('cast_ptr_to_adr',
+                               [v_addr], resulttype=llmemory.Address)
         hop.genop("direct_call", [self.assume_young_pointers_ptr,
                                   self.c_const_gc, v_addr])
 
@@ -788,68 +767,90 @@
         hop.genop("direct_call", [self.get_member_index_ptr, self.c_const_gc,
                                   v_typeid], resultvar=op.result)
 
-    def gct_gc_adr_of_nursery_free(self, hop):
-        if getattr(self.gcdata.gc, 'nursery_free', None) is None:
-            raise NotImplementedError("gc_adr_of_nursery_free only for generational gcs")
+    def _gc_adr_of_gc_attr(self, hop, attrname):
+        if getattr(self.gcdata.gc, attrname, None) is None:
+            raise NotImplementedError("gc_adr_of_%s only for generational gcs"
+                                      % (attrname,))
         op = hop.spaceop
         ofs = llmemory.offsetof(self.c_const_gc.concretetype.TO,
-                                'inst_nursery_free')
+                                'inst_' + attrname)
         c_ofs = rmodel.inputconst(lltype.Signed, ofs)
         v_gc_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gc],
                              resulttype=llmemory.Address)
         hop.genop('adr_add', [v_gc_adr, c_ofs], resultvar=op.result)
 
+    def gct_gc_adr_of_nursery_free(self, hop):
+        self._gc_adr_of_gc_attr(hop, 'nursery_free')
     def gct_gc_adr_of_nursery_top(self, hop):
-        if getattr(self.gcdata.gc, 'nursery_top', None) is None:
-            raise NotImplementedError("gc_adr_of_nursery_top only for generational gcs")
-        op = hop.spaceop
-        ofs = llmemory.offsetof(self.c_const_gc.concretetype.TO,
-                                'inst_nursery_top')
-        c_ofs = rmodel.inputconst(lltype.Signed, ofs)
-        v_gc_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gc],
-                             resulttype=llmemory.Address)
-        hop.genop('adr_add', [v_gc_adr, c_ofs], resultvar=op.result)
+        self._gc_adr_of_gc_attr(hop, 'nursery_top')
 
-    def gct_gc_adr_of_root_stack_top(self, hop):
+    def _gc_adr_of_gcdata_attr(self, hop, attrname):
         op = hop.spaceop
         ofs = llmemory.offsetof(self.c_const_gcdata.concretetype.TO,
-                                'inst_root_stack_top')
+                                'inst_' + attrname)
         c_ofs = rmodel.inputconst(lltype.Signed, ofs)
         v_gcdata_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gcdata],
                                  resulttype=llmemory.Address)
         hop.genop('adr_add', [v_gcdata_adr, c_ofs], resultvar=op.result)
 
-    def gct_gc_x_swap_pool(self, hop):
+    def gct_gc_adr_of_root_stack_base(self, hop):
+        self._gc_adr_of_gcdata_attr(hop, 'root_stack_base')
+    def gct_gc_adr_of_root_stack_top(self, hop):
+        self._gc_adr_of_gcdata_attr(hop, 'root_stack_top')
+
+    def gct_gc_shadowstackref_new(self, hop):
         op = hop.spaceop
-        [v_malloced] = op.args
+        livevars = self.push_roots(hop)
+        hop.genop("direct_call", [self.root_walker.gc_shadowstackref_new_ptr],
+                  resultvar=op.result)
+        self.pop_roots(hop, livevars)
+
+    def gct_gc_shadowstackref_context(self, hop):
+        op = hop.spaceop
         hop.genop("direct_call",
-                  [self.x_swap_pool_ptr, self.c_const_gc, v_malloced],
+                  [self.root_walker.gc_shadowstackref_context_ptr, op.args[0]],
                   resultvar=op.result)
 
+    def gct_gc_shadowstackref_destroy(self, hop):
+        hop.genop("direct_call",
+                  [self.root_walker.gc_shadowstackref_destroy_ptr, op.args[0]])
+
+    def gct_gc_save_current_state_away(self, hop):
+        op = hop.spaceop
+        hop.genop("direct_call",
+                  [self.root_walker.gc_save_current_state_away_ptr,
+                   op.args[0], op.args[1]])
+
+    def gct_gc_forget_current_state(self, hop):
+        hop.genop("direct_call",
+                  [self.root_walker.gc_forget_current_state_ptr])
+
+    def gct_gc_restore_state_from(self, hop):
+        op = hop.spaceop
+        hop.genop("direct_call",
+                  [self.root_walker.gc_restore_state_from_ptr,
+                   op.args[0]])
+
+    def gct_gc_start_fresh_new_state(self, hop):
+        hop.genop("direct_call",
+                  [self.root_walker.gc_start_fresh_new_state_ptr])
+
+    def gct_gc_x_swap_pool(self, hop):
+        raise NotImplementedError("old operation deprecated")
     def gct_gc_x_clone(self, hop):
-        op = hop.spaceop
-        [v_clonedata] = op.args
-        hop.genop("direct_call",
-                  [self.x_clone_ptr, self.c_const_gc, v_clonedata],
-                  resultvar=op.result)
-
+        raise NotImplementedError("old operation deprecated")
     def gct_gc_x_size_header(self, hop):
-        op = hop.spaceop
-        c_result = rmodel.inputconst(lltype.Signed,
-                                     self.gcdata.gc.size_gc_header())
-        hop.genop("same_as",
-                  [c_result],
-                  resultvar=op.result)
+        raise NotImplementedError("old operation deprecated")
 
     def gct_do_malloc_fixedsize_clear(self, hop):
         # used by the JIT (see pypy.jit.backend.llsupport.gc)
         op = hop.spaceop
-        [v_typeid, v_size, v_can_collect,
+        [v_typeid, v_size,
          v_has_finalizer, v_contains_weakptr] = op.args
         livevars = self.push_roots(hop)
         hop.genop("direct_call",
                   [self.malloc_fixedsize_clear_ptr, self.c_const_gc,
-                   v_typeid, v_size, v_can_collect,
+                   v_typeid, v_size,
                    v_has_finalizer, v_contains_weakptr],
                   resultvar=op.result)
         self.pop_roots(hop, livevars)
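gct_gc_adr_of_nursery_free/top and gct_gc_adr_of_root_stack_base/top above are collapsed into the two helpers _gc_adr_of_gc_attr() and _gc_adr_of_gcdata_attr(): the address of an instance attribute is always computed as the object's address plus offsetof(STRUCT, 'inst_' + attrname). A rough ctypes sketch of that address-of-field pattern (just the idea, not the llmemory machinery):

    # Sketch: base address of the structure + offset of the named field,
    # factored into one helper instead of being duplicated per attribute.
    import ctypes

    class GCData(ctypes.Structure):
        _fields_ = [('root_stack_base', ctypes.c_void_p),
                    ('root_stack_top', ctypes.c_void_p)]

    def adr_of_attr(obj, attrname):
        return ctypes.addressof(obj) + getattr(type(obj), attrname).offset

    gcdata = GCData()
    assert (adr_of_attr(gcdata, 'root_stack_top') - ctypes.addressof(gcdata)
            == ctypes.sizeof(ctypes.c_void_p))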
@@ -858,12 +859,12 @@
         # used by the JIT (see pypy.jit.backend.llsupport.gc)
         op = hop.spaceop
         [v_typeid, v_length, v_size, v_itemsize,
-         v_offset_to_length, v_can_collect] = op.args
+         v_offset_to_length] = op.args
         livevars = self.push_roots(hop)
         hop.genop("direct_call",
                   [self.malloc_varsize_clear_ptr, self.c_const_gc,
                    v_typeid, v_length, v_size, v_itemsize,
-                   v_offset_to_length, v_can_collect],
+                   v_offset_to_length],
                   resultvar=op.result)
         self.pop_roots(hop, livevars)
 
@@ -911,8 +912,8 @@
         c_size = rmodel.inputconst(lltype.Signed, info.fixedsize)
         malloc_ptr = self.malloc_fixedsize_ptr
         c_has_finalizer = rmodel.inputconst(lltype.Bool, False)
-        c_has_weakptr = c_can_collect = rmodel.inputconst(lltype.Bool, True)
-        args = [self.c_const_gc, c_type_id, c_size, c_can_collect,
+        c_has_weakptr = rmodel.inputconst(lltype.Bool, True)
+        args = [self.c_const_gc, c_type_id, c_size,
                 c_has_finalizer, c_has_weakptr]
 
         # push and pop the current live variables *including* the argument
@@ -979,24 +980,28 @@
                                   v_size])
 
     def gct_gc_thread_prepare(self, hop):
-        assert self.translator.config.translation.thread
-        if hasattr(self.root_walker, 'thread_prepare_ptr'):
-            hop.genop("direct_call", [self.root_walker.thread_prepare_ptr])
+        pass   # no effect any more
 
     def gct_gc_thread_run(self, hop):
         assert self.translator.config.translation.thread
         if hasattr(self.root_walker, 'thread_run_ptr'):
+            livevars = self.push_roots(hop)
             hop.genop("direct_call", [self.root_walker.thread_run_ptr])
+            self.pop_roots(hop, livevars)
 
     def gct_gc_thread_start(self, hop):
         assert self.translator.config.translation.thread
         if hasattr(self.root_walker, 'thread_start_ptr'):
+            # only with asmgcc.  Note that this is actually called after
+            # the first gc_thread_run() in the new thread.
             hop.genop("direct_call", [self.root_walker.thread_start_ptr])
 
     def gct_gc_thread_die(self, hop):
         assert self.translator.config.translation.thread
         if hasattr(self.root_walker, 'thread_die_ptr'):
+            livevars = self.push_roots(hop)
             hop.genop("direct_call", [self.root_walker.thread_die_ptr])
+            self.pop_roots(hop, livevars)
 
     def gct_gc_thread_before_fork(self, hop):
         if (self.translator.config.translation.thread
@@ -1011,8 +1016,10 @@
     def gct_gc_thread_after_fork(self, hop):
         if (self.translator.config.translation.thread
             and hasattr(self.root_walker, 'thread_after_fork_ptr')):
+            livevars = self.push_roots(hop)
             hop.genop("direct_call", [self.root_walker.thread_after_fork_ptr]
                                      + hop.spaceop.args)
+            self.pop_roots(hop, livevars)
 
     def gct_gc_get_type_info_group(self, hop):
         return hop.cast_result(self.c_type_info_group)
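gct_gc_thread_run(), gct_gc_thread_die() and gct_gc_thread_after_fork() above now bracket their direct_call with push_roots()/pop_roots(): the corresponding helpers go through the full GC transform (minimal_transform=False in the shadowstack changes below) and can allocate, for example the thread_stacks dict, so GC references that are live across the call must be visible on the shadow stack while it runs. A toy sketch of that bracketing, with a plain list standing in for the real shadow stack:

    shadow_stack = []   # toy shadow stack visible to a hypothetical collector

    def push_roots(roots):
        shadow_stack.extend(roots)           # make the roots reachable
        return len(roots)

    def pop_roots(n):
        del shadow_stack[len(shadow_stack) - n:]

    def call_helper_that_may_collect(helper, live_roots):
        n = push_roots(live_roots)
        try:
            helper()                         # a collection here still sees them
        finally:
            pop_roots(n)

    call_helper_that_may_collect(lambda: None, ['obj1', 'obj2'])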
@@ -1246,28 +1253,39 @@
 
     def has_finalizer(self, TYPE):
         rtti = get_rtti(TYPE)
-        return rtti is not None and hasattr(rtti._obj, 'destructor_funcptr')
+        return rtti is not None and getattr(rtti._obj, 'destructor_funcptr',
+                                            None)
+
+    def has_custom_trace(self, TYPE):
+        rtti = get_rtti(TYPE)
+        return rtti is not None and getattr(rtti._obj, 'custom_trace_funcptr',
+                                            None)
 
     def make_finalizer_funcptr_for_type(self, TYPE):
-        if self.has_finalizer(TYPE):
-            rtti = get_rtti(TYPE)
-            destrptr = rtti._obj.destructor_funcptr
-            DESTR_ARG = lltype.typeOf(destrptr).TO.ARGS[0]
-        else:
-            destrptr = None
-            DESTR_ARG = None
+        if not self.has_finalizer(TYPE):
+            return None
+        rtti = get_rtti(TYPE)
+        destrptr = rtti._obj.destructor_funcptr
+        DESTR_ARG = lltype.typeOf(destrptr).TO.ARGS[0]
+        assert not type_contains_pyobjs(TYPE), "not implemented"
+        typename = TYPE.__name__
+        def ll_finalizer(addr, ignored):
+            v = llmemory.cast_adr_to_ptr(addr, DESTR_ARG)
+            ll_call_destructor(destrptr, v, typename)
+            return llmemory.NULL
+        fptr = self.transformer.annotate_finalizer(ll_finalizer,
+                [llmemory.Address, llmemory.Address], llmemory.Address)
+        return fptr
 
-        assert not type_contains_pyobjs(TYPE), "not implemented"
-        if destrptr:
-            typename = TYPE.__name__
-            def ll_finalizer(addr):
-                v = llmemory.cast_adr_to_ptr(addr, DESTR_ARG)
-                ll_call_destructor(destrptr, v, typename)
-            fptr = self.transformer.annotate_finalizer(ll_finalizer,
-                                                       [llmemory.Address],
-                                                       lltype.Void)
-        else:
-            fptr = lltype.nullptr(gctypelayout.GCData.FINALIZERTYPE.TO)
+    def make_custom_trace_funcptr_for_type(self, TYPE):
+        if not self.has_custom_trace(TYPE):
+            return None
+        rtti = get_rtti(TYPE)
+        fptr = rtti._obj.custom_trace_funcptr
+        if not hasattr(fptr._obj, 'graph'):
+            ll_func = fptr._obj._callable
+            fptr = self.transformer.annotate_finalizer(ll_func,
+                    [llmemory.Address, llmemory.Address], llmemory.Address)
         return fptr
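has_finalizer() and the new has_custom_trace() above now return the destructor or trace function pointer (or a false value) rather than a plain bool, and gct_fv_gc_malloc() earlier in this file dispatches on a (kind, funcptr) pair obtained from special_funcptr_for_type(); only the "finalizer" kind is visible in these hunks. A hedged sketch of one plausible shape for that dispatch (stand-in names; the real layoutbuilder code may differ):

    # Assumption-laden sketch: the rtti-like object carries at most one of
    # 'destructor_funcptr' / 'custom_trace_funcptr', and callers get back a
    # (kind, funcptr) pair or None.
    def special_funcptr_for_type(rtti):
        fptr = getattr(rtti, 'destructor_funcptr', None)
        if fptr is not None:
            return ("finalizer", fptr)
        fptr = getattr(rtti, 'custom_trace_funcptr', None)
        if fptr is not None:
            return ("custom_trace", fptr)    # assumed name for the other kind
        return None

    class RTTI(object):
        pass

    rtti = RTTI()
    rtti.destructor_funcptr = lambda addr, ignored: None
    kind_and_fptr = special_funcptr_for_type(rtti)
    has_finalizer = (kind_and_fptr is not None and
                     kind_and_fptr[0] == "finalizer")
    assert has_finalizer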
 
 
@@ -1333,6 +1351,10 @@
         if collect_stack_root:
             self.walk_stack_roots(collect_stack_root)     # abstract
 
+    def need_stacklet_support(self):
+        raise Exception("%s does not support stacklets" % (
+            self.__class__.__name__,))
+
     def need_thread_support(self, gctransformer, getfn):
         raise Exception("%s does not support threads" % (
             self.__class__.__name__,))
diff --git a/pypy/rpython/memory/gctransform/shadowstack.py b/pypy/rpython/memory/gctransform/shadowstack.py
--- a/pypy/rpython/memory/gctransform/shadowstack.py
+++ b/pypy/rpython/memory/gctransform/shadowstack.py
@@ -1,17 +1,16 @@
 from pypy.rpython.memory.gctransform.framework import BaseRootWalker
 from pypy.rpython.memory.gctransform.framework import sizeofaddr
+from pypy.rpython.annlowlevel import llhelper
+from pypy.rpython.lltypesystem import lltype, llmemory
 from pypy.rlib.debug import ll_assert
-from pypy.rpython.lltypesystem import llmemory
 from pypy.annotation import model as annmodel
 
 
 class ShadowStackRootWalker(BaseRootWalker):
     need_root_stack = True
-    collect_stacks_from_other_threads = None
 
     def __init__(self, gctransformer):
         BaseRootWalker.__init__(self, gctransformer)
-        self.rootstacksize = sizeofaddr * gctransformer.root_stack_depth
         # NB. 'self' is frozen, but we can use self.gcdata to store state
         gcdata = self.gcdata
 
@@ -27,13 +26,33 @@
             return top
         self.decr_stack = decr_stack
 
-        self.rootstackhook = gctransformer.root_stack_jit_hook
-        if self.rootstackhook is None:
-            def collect_stack_root(callback, gc, addr):
-                if gc.points_to_valid_gc_object(addr):
+        translator = gctransformer.translator
+        if (hasattr(translator, '_jit2gc') and
+                'root_iterator' in translator._jit2gc):
+            root_iterator = translator._jit2gc['root_iterator']
+            def jit_walk_stack_root(callback, addr, end):
+                root_iterator.context = llmemory.NULL
+                gc = self.gc
+                while True:
+                    addr = root_iterator.next(gc, addr, end)
+                    if addr == llmemory.NULL:
+                        return
                     callback(gc, addr)
-                return sizeofaddr
-            self.rootstackhook = collect_stack_root
+                    addr += sizeofaddr
+            self.rootstackhook = jit_walk_stack_root
+        else:
+            def default_walk_stack_root(callback, addr, end):
+                gc = self.gc
+                while addr != end:
+                    if gc.points_to_valid_gc_object(addr):
+                        callback(gc, addr)
+                    addr += sizeofaddr
+            self.rootstackhook = default_walk_stack_root
+
+        self.shadow_stack_pool = ShadowStackPool(gcdata)
+        rsd = gctransformer.root_stack_depth
+        if rsd is not None:
+            self.shadow_stack_pool.root_stack_depth = rsd
 
     def push_stack(self, addr):
         top = self.incr_stack(1)
@@ -43,26 +62,14 @@
         top = self.decr_stack(1)
         return top.address[0]
 
-    def allocate_stack(self):
-        return llmemory.raw_malloc(self.rootstacksize)
-
     def setup_root_walker(self):
-        stackbase = self.allocate_stack()
-        ll_assert(bool(stackbase), "could not allocate root stack")
-        self.gcdata.root_stack_top  = stackbase
-        self.gcdata.root_stack_base = stackbase
+        self.shadow_stack_pool.initial_setup()
         BaseRootWalker.setup_root_walker(self)
 
     def walk_stack_roots(self, collect_stack_root):
         gcdata = self.gcdata
-        gc = self.gc
-        rootstackhook = self.rootstackhook
-        addr = gcdata.root_stack_base
-        end = gcdata.root_stack_top
-        while addr != end:
-            addr += rootstackhook(collect_stack_root, gc, addr)
-        if self.collect_stacks_from_other_threads is not None:
-            self.collect_stacks_from_other_threads(collect_stack_root)
+        self.rootstackhook(collect_stack_root,
+                           gcdata.root_stack_base, gcdata.root_stack_top)
 
     def need_thread_support(self, gctransformer, getfn):
         from pypy.module.thread import ll_thread    # xxx fish
@@ -70,121 +77,84 @@
         from pypy.rpython.memory.support import copy_without_null_values
         gcdata = self.gcdata
         # the interfacing between the threads and the GC is done via
-        # three completely ad-hoc operations at the moment:
-        # gc_thread_prepare, gc_thread_run, gc_thread_die.
-        # See docstrings below.
+        # two completely ad-hoc operations at the moment:
+        # gc_thread_run and gc_thread_die.  See docstrings below.
 
-        def get_aid():
-            """Return the thread identifier, cast to an (opaque) address."""
-            return llmemory.cast_int_to_adr(ll_thread.get_ident())
+        shadow_stack_pool = self.shadow_stack_pool
+        SHADOWSTACKREF = get_shadowstackref(gctransformer)
+
+        # this is a dict {tid: SHADOWSTACKREF}, where the tid for the
+        # current thread may be missing so far
+        gcdata.thread_stacks = None
+
+        # Return the thread identifier, as an integer.
+        get_tid = ll_thread.get_ident
 
         def thread_setup():
-            """Called once when the program starts."""
-            aid = get_aid()
-            gcdata.main_thread = aid
-            gcdata.active_thread = aid
-            gcdata.thread_stacks = AddressDict()     # {aid: root_stack_top}
-            gcdata._fresh_rootstack = llmemory.NULL
-            gcdata.dead_threads_count = 0
-
-        def thread_prepare():
-            """Called just before thread.start_new_thread().  This
-            allocates a new shadow stack to be used by the future
-            thread.  If memory runs out, this raises a MemoryError
-            (which can be handled by the caller instead of just getting
-            ignored if it was raised in the newly starting thread).
-            """
-            if not gcdata._fresh_rootstack:
-                gcdata._fresh_rootstack = self.allocate_stack()
-                if not gcdata._fresh_rootstack:
-                    raise MemoryError
+            tid = get_tid()
+            gcdata.main_tid = tid
+            gcdata.active_tid = tid
 
         def thread_run():
             """Called whenever the current thread (re-)acquired the GIL.
             This should ensure that the shadow stack installed in
             gcdata.root_stack_top/root_stack_base is the one corresponding
             to the current thread.
+            No GC operation here, e.g. no mallocs or storing in a dict!
             """
-            aid = get_aid()
-            if gcdata.active_thread != aid:
-                switch_shadow_stacks(aid)
+            tid = get_tid()
+            if gcdata.active_tid != tid:
+                switch_shadow_stacks(tid)
 
         def thread_die():
             """Called just before the final GIL release done by a dying
             thread.  After a thread_die(), no more gc operation should
             occur in this thread.
             """
-            aid = get_aid()
-            if aid == gcdata.main_thread:
+            tid = get_tid()
+            if tid == gcdata.main_tid:
                 return   # ignore calls to thread_die() in the main thread
                          # (which can occur after a fork()).
-            gcdata.thread_stacks.setitem(aid, llmemory.NULL)
-            old = gcdata.root_stack_base
-            if gcdata._fresh_rootstack == llmemory.NULL:
-                gcdata._fresh_rootstack = old
+            # we need to switch somewhere else, so go to main_tid
+            gcdata.active_tid = gcdata.main_tid
+            thread_stacks = gcdata.thread_stacks
+            new_ref = thread_stacks[gcdata.active_tid]
+            try:
+                del thread_stacks[tid]
+            except KeyError:
+                pass
+            # no more GC operation from here -- switching shadowstack!
+            shadow_stack_pool.forget_current_state()
+            shadow_stack_pool.restore_state_from(new_ref)
+
+        def switch_shadow_stacks(new_tid):
+            # we have the wrong shadowstack right now, but it should not matter
+            thread_stacks = gcdata.thread_stacks
+            try:
+                if thread_stacks is None:
+                    gcdata.thread_stacks = thread_stacks = {}
+                    raise KeyError
+                new_ref = thread_stacks[new_tid]
+            except KeyError:
+                new_ref = lltype.nullptr(SHADOWSTACKREF)
+            try:
+                old_ref = thread_stacks[gcdata.active_tid]
+            except KeyError:
+                # first time we ask for a SHADOWSTACKREF for this active_tid
+                old_ref = shadow_stack_pool.allocate(SHADOWSTACKREF)
+                thread_stacks[gcdata.active_tid] = old_ref
+            #
+            # no GC operation from here -- switching shadowstack!
+            shadow_stack_pool.save_current_state_away(old_ref, llmemory.NULL)
+            if new_ref:
+                shadow_stack_pool.restore_state_from(new_ref)
             else:
-                llmemory.raw_free(old)
-            install_new_stack(gcdata.main_thread)
-            # from time to time, rehash the dictionary to remove
-            # old NULL entries
-            gcdata.dead_threads_count += 1
-            if (gcdata.dead_threads_count & 511) == 0:
-                copy = copy_without_null_values(gcdata.thread_stacks)
-                gcdata.thread_stacks.delete()
-                gcdata.thread_stacks = copy
-
-        def switch_shadow_stacks(new_aid):
-            save_away_current_stack()
-            install_new_stack(new_aid)
+                shadow_stack_pool.start_fresh_new_state()
+            # done
+            #
+            gcdata.active_tid = new_tid
         switch_shadow_stacks._dont_inline_ = True
 
-        def save_away_current_stack():
-            old_aid = gcdata.active_thread
-            # save root_stack_base on the top of the stack
-            self.push_stack(gcdata.root_stack_base)
-            # store root_stack_top into the dictionary
-            gcdata.thread_stacks.setitem(old_aid, gcdata.root_stack_top)
-
-        def install_new_stack(new_aid):
-            # look for the new stack top
-            top = gcdata.thread_stacks.get(new_aid, llmemory.NULL)
-            if top == llmemory.NULL:
-                # first time we see this thread.  It is an error if no
-                # fresh new stack is waiting.
-                base = gcdata._fresh_rootstack
-                gcdata._fresh_rootstack = llmemory.NULL
-                ll_assert(base != llmemory.NULL, "missing gc_thread_prepare")
-                gcdata.root_stack_top = base
-                gcdata.root_stack_base = base
-            else:
-                # restore the root_stack_base from the top of the stack
-                gcdata.root_stack_top = top
-                gcdata.root_stack_base = self.pop_stack()
-            # done
-            gcdata.active_thread = new_aid
-
-        def collect_stack(aid, stacktop, callback):
-            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
-                # collect all valid stacks from the dict (the entry
-                # corresponding to the current thread is not valid)
-                gc = self.gc
-                rootstackhook = self.rootstackhook
-                end = stacktop - sizeofaddr
-                addr = end.address[0]
-                while addr != end:
-                    addr += rootstackhook(callback, gc, addr)
-
-        def collect_more_stacks(callback):
-            ll_assert(get_aid() == gcdata.active_thread,
-                      "collect_more_stacks(): invalid active_thread")
-            gcdata.thread_stacks.foreach(collect_stack, callback)
-
-        def _free_if_not_current(aid, stacktop, _):
-            if stacktop != llmemory.NULL and aid != gcdata.active_thread:
-                end = stacktop - sizeofaddr
-                base = end.address[0]
-                llmemory.raw_free(base)
-
         def thread_after_fork(result_of_fork, opaqueaddr):
             # we don't need a thread_before_fork in this case, so
             # opaqueaddr == NULL.  This is called after fork().
@@ -192,28 +162,210 @@
                 # We are in the child process.  Assumes that only the
                 # current thread survived, so frees the shadow stacks
                 # of all the other ones.
-                gcdata.thread_stacks.foreach(_free_if_not_current, None)
-                # Clears the dict (including the current thread, which
-                # was an invalid entry anyway and will be recreated by
-                # the next call to save_away_current_stack()).
                 gcdata.thread_stacks.clear()
                 # Finally, reset the stored thread IDs, in case it
                 # changed because of fork().  Also change the main
                 # thread to the current one (because there is not any
                 # other left).
-                aid = get_aid()
-                gcdata.main_thread = aid
-                gcdata.active_thread = aid
+                tid = get_tid()
+                gcdata.main_tid = tid
+                gcdata.active_tid = tid
 
         self.thread_setup = thread_setup
-        self.thread_prepare_ptr = getfn(thread_prepare, [], annmodel.s_None)
         self.thread_run_ptr = getfn(thread_run, [], annmodel.s_None,
-                                    inline=True)
-        # no thread_start_ptr here
-        self.thread_die_ptr = getfn(thread_die, [], annmodel.s_None)
+                                    inline=True, minimal_transform=False)
+        self.thread_die_ptr = getfn(thread_die, [], annmodel.s_None,
+                                    minimal_transform=False)
         # no thread_before_fork_ptr here
         self.thread_after_fork_ptr = getfn(thread_after_fork,
                                            [annmodel.SomeInteger(),
                                             annmodel.SomeAddress()],
-                                           annmodel.s_None)
-        self.collect_stacks_from_other_threads = collect_more_stacks
+                                           annmodel.s_None,
+                                           minimal_transform=False)
+
+    def need_stacklet_support(self, gctransformer, getfn):
+        shadow_stack_pool = self.shadow_stack_pool
+        SHADOWSTACKREF = get_shadowstackref(gctransformer)
+
+        def gc_shadowstackref_new():
+            ssref = shadow_stack_pool.allocate(SHADOWSTACKREF)
+            return lltype.cast_opaque_ptr(llmemory.GCREF, ssref)
+
+        def gc_shadowstackref_context(gcref):
+            ssref = lltype.cast_opaque_ptr(lltype.Ptr(SHADOWSTACKREF), gcref)
+            return ssref.context
+
+        def gc_shadowstackref_destroy(gcref):
+            ssref = lltype.cast_opaque_ptr(lltype.Ptr(SHADOWSTACKREF), gcref)
+            shadow_stack_pool.destroy(ssref)
+
+        def gc_save_current_state_away(gcref, ncontext):
+            ssref = lltype.cast_opaque_ptr(lltype.Ptr(SHADOWSTACKREF), gcref)
+            shadow_stack_pool.save_current_state_away(ssref, ncontext)
+
+        def gc_forget_current_state():
+            shadow_stack_pool.forget_current_state()
+
+        def gc_restore_state_from(gcref):
+            ssref = lltype.cast_opaque_ptr(lltype.Ptr(SHADOWSTACKREF), gcref)
+            shadow_stack_pool.restore_state_from(ssref)
+
+        def gc_start_fresh_new_state():
+            shadow_stack_pool.start_fresh_new_state()
+
+        s_gcref = annmodel.SomePtr(llmemory.GCREF)
+        s_addr = annmodel.SomeAddress()
+        self.gc_shadowstackref_new_ptr = getfn(gc_shadowstackref_new,
+                                               [], s_gcref,
+                                               minimal_transform=False)
+        self.gc_shadowstackref_context_ptr = getfn(gc_shadowstackref_context,
+                                                   [s_gcref], s_addr,
+                                                   inline=True)
+        self.gc_shadowstackref_destroy_ptr = getfn(gc_shadowstackref_destroy,
+                                                   [s_gcref], annmodel.s_None,
+                                                   inline=True)
+        self.gc_save_current_state_away_ptr = getfn(gc_save_current_state_away,
+                                                    [s_gcref, s_addr],
+                                                    annmodel.s_None,
+                                                    inline=True)
+        self.gc_forget_current_state_ptr = getfn(gc_forget_current_state,
+                                                 [], annmodel.s_None,
+                                                 inline=True)
+        self.gc_restore_state_from_ptr = getfn(gc_restore_state_from,
+                                               [s_gcref], annmodel.s_None,
+                                               inline=True)
+        self.gc_start_fresh_new_state_ptr = getfn(gc_start_fresh_new_state,
+                                                  [], annmodel.s_None,
+                                                  inline=True)
+        # fish...
+        translator = gctransformer.translator
+        if hasattr(translator, '_jit2gc'):
+            from pypy.rlib._rffi_stacklet import _translate_pointer
+            root_iterator = translator._jit2gc['root_iterator']
+            root_iterator.translateptr = _translate_pointer
+
+# ____________________________________________________________
+
+class ShadowStackPool(object):
+    """Manages a pool of shadowstacks.  The MAX most recently used
+    shadowstacks are fully allocated and can be directly jumped into.
+    The rest are stored in a more virtual-memory-friendly way, i.e.
+    with just the right amount malloced.  Before they can run, they
+    must be copied into a full shadowstack.  XXX NOT IMPLEMENTED SO FAR!
+    """
+    _alloc_flavor_ = "raw"
+    root_stack_depth = 163840
+
+    #MAX = 20  not implemented yet
+
+    def __init__(self, gcdata):
+        self.unused_full_stack = llmemory.NULL
+        self.gcdata = gcdata
+
+    def initial_setup(self):
+        self._prepare_unused_stack()
+        self.start_fresh_new_state()
+
+    def allocate(self, SHADOWSTACKREF):
+        """Allocate an empty SHADOWSTACKREF object."""
+        return lltype.malloc(SHADOWSTACKREF, zero=True)
+
+    def save_current_state_away(self, shadowstackref, ncontext):
+        """Save the current state away into 'shadowstackref'.
+        This either works, or raises MemoryError and nothing is done.
+        To do a switch, first call save_current_state_away() or
+        forget_current_state(), and then call restore_state_from()
+        or start_fresh_new_state().
+        """
+        self._prepare_unused_stack()
+        shadowstackref.base = self.gcdata.root_stack_base
+        shadowstackref.top  = self.gcdata.root_stack_top
+        shadowstackref.context = ncontext
+        ll_assert(shadowstackref.base <= shadowstackref.top,
+                  "save_current_state_away: broken shadowstack")
+        #shadowstackref.fullstack = True
+        #
+        # cannot use llop.gc_assume_young_pointers() here, because
+        # we are in a minimally-transformed GC helper :-/
+        gc = self.gcdata.gc
+        if hasattr(gc.__class__, 'assume_young_pointers'):
+            shadowstackadr = llmemory.cast_ptr_to_adr(shadowstackref)
+            gc.assume_young_pointers(shadowstackadr)
+        #
+        self.gcdata.root_stack_top = llmemory.NULL  # to detect missing restore
+
+    def forget_current_state(self):
+        if self.unused_full_stack:
+            llmemory.raw_free(self.unused_full_stack)
+        self.unused_full_stack = self.gcdata.root_stack_base
+        self.gcdata.root_stack_top = llmemory.NULL  # to detect missing restore
+
+    def restore_state_from(self, shadowstackref):
+        ll_assert(bool(shadowstackref.base), "empty shadowstackref!")
+        ll_assert(shadowstackref.base <= shadowstackref.top,
+                  "restore_state_from: broken shadowstack")
+        self.gcdata.root_stack_base = shadowstackref.base
+        self.gcdata.root_stack_top  = shadowstackref.top
+        self.destroy(shadowstackref)
+
+    def start_fresh_new_state(self):
+        self.gcdata.root_stack_base = self.unused_full_stack
+        self.gcdata.root_stack_top  = self.unused_full_stack
+        self.unused_full_stack = llmemory.NULL
+
+    def destroy(self, shadowstackref):
+        shadowstackref.base = llmemory.NULL
+        shadowstackref.top = llmemory.NULL
+        shadowstackref.context = llmemory.NULL
+
+    def _prepare_unused_stack(self):
+        if self.unused_full_stack == llmemory.NULL:
+            root_stack_size = sizeofaddr * self.root_stack_depth
+            self.unused_full_stack = llmemory.raw_malloc(root_stack_size)
+            if self.unused_full_stack == llmemory.NULL:
+                raise MemoryError
+
+
+def get_shadowstackref(gctransformer):
+    if hasattr(gctransformer, '_SHADOWSTACKREF'):
+        return gctransformer._SHADOWSTACKREF
+
+    SHADOWSTACKREFPTR = lltype.Ptr(lltype.GcForwardReference())
+    SHADOWSTACKREF = lltype.GcStruct('ShadowStackRef',
+                                     ('base', llmemory.Address),
+                                     ('top', llmemory.Address),
+                                     ('context', llmemory.Address),
+                                     #('fullstack', lltype.Bool),
+                                     rtti=True)
+    SHADOWSTACKREFPTR.TO.become(SHADOWSTACKREF)
+
+    translator = gctransformer.translator
+    if hasattr(translator, '_jit2gc'):
+        gc = gctransformer.gcdata.gc
+        root_iterator = translator._jit2gc['root_iterator']
+        def customtrace(obj, prev):
+            obj = llmemory.cast_adr_to_ptr(obj, SHADOWSTACKREFPTR)
+            if not prev:
+                root_iterator.context = obj.context
+                next = obj.base
+            else:
+                next = prev + sizeofaddr
+            return root_iterator.next(gc, next, obj.top)
+    else:
+        def customtrace(obj, prev):
+            # a simple but not JIT-ready version
+            if not prev:
+                next = llmemory.cast_adr_to_ptr(obj, SHADOWSTACKREFPTR).base
+            else:
+                next = prev + sizeofaddr
+            if next == llmemory.cast_adr_to_ptr(obj, SHADOWSTACKREFPTR).top:
+                next = llmemory.NULL
+            return next
+
+    CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                      llmemory.Address)
+    customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+    lltype.attachRuntimeTypeInfo(SHADOWSTACKREF, customtraceptr=customtraceptr)
+
+    gctransformer._SHADOWSTACKREF = SHADOWSTACKREF
+    return SHADOWSTACKREF
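
For illustration, the save/restore pairing described in the ShadowStackPool
docstrings above can be sketched as a stand-alone toy in plain C (hypothetical
code, not part of this changeset; plain malloc'ed arrays stand in for real
shadowstacks, and the toy_* names are made up):

    #include <stdlib.h>
    #include <assert.h>

    #define ROOT_STACK_DEPTH 163840

    struct toy_pool {
        void **unused_full_stack;   /* a spare, fully allocated stack, or NULL */
        void **root_stack_base;     /* the shadowstack currently installed */
        void **root_stack_top;
    };

    struct toy_shadowstackref {
        void **base, **top;
    };

    /* step 1 of a switch: put the current shadowstack aside */
    static void toy_save_current_state_away(struct toy_pool *p,
                                            struct toy_shadowstackref *r)
    {
        if (p->unused_full_stack == NULL) {
            p->unused_full_stack = malloc(ROOT_STACK_DEPTH * sizeof(void *));
            if (p->unused_full_stack == NULL)
                abort();                    /* the MemoryError case */
        }
        r->base = p->root_stack_base;
        r->top  = p->root_stack_top;
        p->root_stack_top = NULL;           /* detects a missing restore */
    }

    /* step 2 of a switch: install a previously saved shadowstack */
    static void toy_restore_state_from(struct toy_pool *p,
                                       struct toy_shadowstackref *r)
    {
        assert(r->base != NULL);
        p->root_stack_base = r->base;
        p->root_stack_top  = r->top;
        r->base = r->top = NULL;            /* like destroy() above */
    }

    int main(void)
    {
        struct toy_pool pool = { NULL, NULL, NULL };
        struct toy_shadowstackref ref = { NULL, NULL };

        /* pretend some shadowstack is currently installed */
        pool.root_stack_base = malloc(ROOT_STACK_DEPTH * sizeof(void *));
        pool.root_stack_top  = pool.root_stack_base;

        toy_save_current_state_away(&pool, &ref);   /* step 1 */
        toy_restore_state_from(&pool, &ref);        /* step 2 */

        free(pool.root_stack_base);
        free(pool.unused_full_stack);
        return 0;
    }
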
diff --git a/pypy/rpython/memory/gctransform/test/test_framework.py b/pypy/rpython/memory/gctransform/test/test_framework.py
--- a/pypy/rpython/memory/gctransform/test/test_framework.py
+++ b/pypy/rpython/memory/gctransform/test/test_framework.py
@@ -70,23 +70,6 @@
     gg = graphof(t, g)
     assert not CollectAnalyzer(t).analyze_direct_call(gg)    
 
-def test_cancollect_stack_check():
-    from pypy.rlib import rstack
-
-    def with_check():
-        rstack.stack_check()
-
-    t = rtype(with_check, [])
-    with_check_graph = graphof(t, with_check)
-
-    assert not t.config.translation.stackless
-    can_collect = CollectAnalyzer(t).analyze_direct_call(with_check_graph)
-    assert not can_collect
-    
-    t.config.translation.stackless = True
-    can_collect = CollectAnalyzer(t).analyze_direct_call(with_check_graph)
-    assert can_collect
-
 def test_cancollect_external():
     fext1 = rffi.llexternal('fext1', [], lltype.Void, threadsafe=False)
     def g():
diff --git a/pypy/rpython/memory/gctransform/transform.py b/pypy/rpython/memory/gctransform/transform.py
--- a/pypy/rpython/memory/gctransform/transform.py
+++ b/pypy/rpython/memory/gctransform/transform.py
@@ -307,7 +307,6 @@
             if backendopt:
                 self.mixlevelannotator.backend_optimize()
         # Make sure that the database also sees all finalizers now.
-        # XXX we need to think more about the interaction with stackless...
         # It is likely that the finalizers need special support there
         newgcdependencies = self.ll_finalizers_ptrs
         return newgcdependencies
diff --git a/pypy/rpython/memory/gctypelayout.py b/pypy/rpython/memory/gctypelayout.py
--- a/pypy/rpython/memory/gctypelayout.py
+++ b/pypy/rpython/memory/gctypelayout.py
@@ -17,13 +17,21 @@
     _alloc_flavor_ = 'raw'
 
     OFFSETS_TO_GC_PTR = lltype.Array(lltype.Signed)
-    ADDRESS_VOID_FUNC = lltype.FuncType([llmemory.Address], lltype.Void)
-    FINALIZERTYPE = lltype.Ptr(ADDRESS_VOID_FUNC)
+
+    # A function of this type is used either as a finalizer or as a custom
+    # tracer (CT).  As a finalizer, it takes only the first address, ignores
+    # the second, and returns NULL.  As a custom tracer, it enumerates the
+    # addresses in the object that contain GCREFs: it is called with the
+    # object and the previously returned address (or NULL the first time).
+    FINALIZER_OR_CT_FUNC = lltype.FuncType([llmemory.Address,
+                                            llmemory.Address],
+                                           llmemory.Address)
+    FINALIZER_OR_CT = lltype.Ptr(FINALIZER_OR_CT_FUNC)
 
     # structure describing the layout of a typeid
     TYPE_INFO = lltype.Struct("type_info",
         ("infobits",       lltype.Signed),    # combination of the T_xxx consts
-        ("finalizer",      FINALIZERTYPE),
+        ("finalizer_or_customtrace", FINALIZER_OR_CT),
         ("fixedsize",      lltype.Signed),
         ("ofstoptrs",      lltype.Ptr(OFFSETS_TO_GC_PTR)),
         hints={'immutable': True},
@@ -71,7 +79,11 @@
         return (infobits & T_IS_GCARRAY_OF_GCPTR) != 0
 
     def q_finalizer(self, typeid):
-        return self.get(typeid).finalizer
+        typeinfo = self.get(typeid)
+        if typeinfo.infobits & T_HAS_FINALIZER:
+            return typeinfo.finalizer_or_customtrace
+        else:
+            return lltype.nullptr(GCData.FINALIZER_OR_CT_FUNC)
 
     def q_offsets_to_gc_pointers(self, typeid):
         return self.get(typeid).ofstoptrs
@@ -105,6 +117,25 @@
         infobits = self.get(typeid).infobits
         return infobits & T_IS_RPYTHON_INSTANCE != 0
 
+    def q_has_custom_trace(self, typeid):
+        infobits = self.get(typeid).infobits
+        return infobits & T_HAS_CUSTOM_TRACE != 0
+
+    def q_get_custom_trace(self, typeid):
+        ll_assert(self.q_has_custom_trace(typeid),
+                  "T_HAS_CUSTOM_TRACE missing")
+        typeinfo = self.get(typeid)
+        return typeinfo.finalizer_or_customtrace
+
+    def q_fast_path_tracing(self, typeid):
+        # return True if none of the flags T_HAS_GCPTR_IN_VARSIZE,
+        # T_IS_GCARRAY_OF_GCPTR or T_HAS_CUSTOM_TRACE is set
+        T_ANY_SLOW_FLAG = (T_HAS_GCPTR_IN_VARSIZE |
+                           T_IS_GCARRAY_OF_GCPTR |
+                           T_HAS_CUSTOM_TRACE)
+        infobits = self.get(typeid).infobits
+        return infobits & T_ANY_SLOW_FLAG == 0
+
     def set_query_functions(self, gc):
         gc.set_query_functions(
             self.q_is_varsize,
@@ -119,18 +150,23 @@
             self.q_varsize_offsets_to_gcpointers_in_var_part,
             self.q_weakpointer_offset,
             self.q_member_index,
-            self.q_is_rpython_class)
+            self.q_is_rpython_class,
+            self.q_has_custom_trace,
+            self.q_get_custom_trace,
+            self.q_fast_path_tracing)
 
 
 # the lowest 16bits are used to store group member index
-T_MEMBER_INDEX         =  0xffff
-T_IS_VARSIZE           = 0x10000
-T_HAS_GCPTR_IN_VARSIZE = 0x20000
-T_IS_GCARRAY_OF_GCPTR  = 0x40000
-T_IS_WEAKREF           = 0x80000
+T_MEMBER_INDEX         =   0xffff
+T_IS_VARSIZE           = 0x010000
+T_HAS_GCPTR_IN_VARSIZE = 0x020000
+T_IS_GCARRAY_OF_GCPTR  = 0x040000
+T_IS_WEAKREF           = 0x080000
 T_IS_RPYTHON_INSTANCE  = 0x100000    # the type is a subclass of OBJECT
+T_HAS_FINALIZER        = 0x200000
+T_HAS_CUSTOM_TRACE     = 0x400000
 T_KEY_MASK             = intmask(0xFF000000)
-T_KEY_VALUE            = intmask(0x7A000000)    # bug detection only
+T_KEY_VALUE            = intmask(0x5A000000)    # bug detection only
 
 def _check_valid_type_info(p):
     ll_assert(p.infobits & T_KEY_MASK == T_KEY_VALUE, "invalid type_id")
@@ -151,7 +187,18 @@
     offsets = offsets_to_gc_pointers(TYPE)
     infobits = index
     info.ofstoptrs = builder.offsets2table(offsets, TYPE)
-    info.finalizer = builder.make_finalizer_funcptr_for_type(TYPE)
+    #
+    kind_and_fptr = builder.special_funcptr_for_type(TYPE)
+    if kind_and_fptr is not None:
+        kind, fptr = kind_and_fptr
+        info.finalizer_or_customtrace = fptr
+        if kind == "finalizer":
+            infobits |= T_HAS_FINALIZER
+        elif kind == "custom_trace":
+            infobits |= T_HAS_CUSTOM_TRACE
+        else:
+            assert 0, kind
+    #
     if not TYPE._is_varsize():
         info.fixedsize = llarena.round_up_for_allocation(
             llmemory.sizeof(TYPE), builder.GCClass.object_minimal_size)
@@ -216,7 +263,7 @@
         # for debugging, the following list collects all the prebuilt
         # GcStructs and GcArrays
         self.all_prebuilt_gc = []
-        self.finalizer_funcptrs = {}
+        self._special_funcptrs = {}
         self.offsettable_cache = {}
 
     def make_type_info_group(self):
@@ -317,16 +364,29 @@
         self.offsettable_cache = None
         return self.type_info_group
 
-    def finalizer_funcptr_for_type(self, TYPE):
-        if TYPE in self.finalizer_funcptrs:
-            return self.finalizer_funcptrs[TYPE]
-        fptr = self.make_finalizer_funcptr_for_type(TYPE)
-        self.finalizer_funcptrs[TYPE] = fptr
-        return fptr
+    def special_funcptr_for_type(self, TYPE):
+        if TYPE in self._special_funcptrs:
+            return self._special_funcptrs[TYPE]
+        fptr1 = self.make_finalizer_funcptr_for_type(TYPE)
+        fptr2 = self.make_custom_trace_funcptr_for_type(TYPE)
+        assert not (fptr1 and fptr2), (
+            "type %r needs both a finalizer and a custom tracer" % (TYPE,))
+        if fptr1:
+            kind_and_fptr = "finalizer", fptr1
+        elif fptr2:
+            kind_and_fptr = "custom_trace", fptr2
+        else:
+            kind_and_fptr = None
+        self._special_funcptrs[TYPE] = kind_and_fptr
+        return kind_and_fptr
 
     def make_finalizer_funcptr_for_type(self, TYPE):
         # must be overridden for proper finalizer support
-        return lltype.nullptr(GCData.ADDRESS_VOID_FUNC)
+        return None
+
+    def make_custom_trace_funcptr_for_type(self, TYPE):
+        # must be overridden for proper custom tracer support
+        return None
 
     def initialize_gc_query_function(self, gc):
         return GCData(self.type_info_group).set_query_functions(gc)
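
For illustration, the enumeration protocol described in the comments above (the
tracer is first called with prev == NULL, then with whatever address it returned
last, until it returns NULL) can be modelled with a stand-alone toy in plain C;
trace_S and gc_trace_object below are hypothetical names, not PyPy code:

    #include <stdio.h>
    #include <stddef.h>

    typedef void *addr_t;
    /* same shape as FINALIZER_OR_CT_FUNC: (object, previous address) -> address */
    typedef addr_t (*customtrace_fn)(addr_t obj, addr_t prev);

    struct S { void *x; void *y; };  /* stands in for a GcStruct with two GC fields */

    static addr_t trace_S(addr_t obj, addr_t prev)
    {
        struct S *s = (struct S *)obj;
        if (prev == NULL)
            return &s->x;            /* first slot containing a GC pointer */
        if (prev == (addr_t)&s->x)
            return &s->y;            /* second slot */
        return NULL;                 /* no more slots */
    }

    /* how a collector walks one object through its custom tracer */
    static void gc_trace_object(addr_t obj, customtrace_fn trace)
    {
        addr_t p = trace(obj, NULL);
        while (p != NULL) {
            printf("GC pointer slot at offset %ld\n",
                   (long)((char *)p - (char *)obj));
            p = trace(obj, p);
        }
    }

    int main(void)
    {
        struct S s = { NULL, NULL };
        gc_trace_object(&s, trace_S);
        return 0;
    }
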
diff --git a/pypy/rpython/memory/gcwrapper.py b/pypy/rpython/memory/gcwrapper.py
--- a/pypy/rpython/memory/gcwrapper.py
+++ b/pypy/rpython/memory/gcwrapper.py
@@ -196,17 +196,28 @@
             DESTR_ARG = lltype.typeOf(destrptr).TO.ARGS[0]
             destrgraph = destrptr._obj.graph
         else:
-            return lltype.nullptr(gctypelayout.GCData.FINALIZERTYPE.TO)
+            return None
 
         assert not type_contains_pyobjs(TYPE), "not implemented"
-        def ll_finalizer(addr):
+        def ll_finalizer(addr, dummy):
+            assert dummy == llmemory.NULL
             try:
                 v = llmemory.cast_adr_to_ptr(addr, DESTR_ARG)
                 self.llinterp.eval_graph(destrgraph, [v], recursive=True)
             except llinterp.LLException:
                 raise RuntimeError(
                     "a finalizer raised an exception, shouldn't happen")
-        return llhelper(gctypelayout.GCData.FINALIZERTYPE, ll_finalizer)
+            return llmemory.NULL
+        return llhelper(gctypelayout.GCData.FINALIZER_OR_CT, ll_finalizer)
+
+    def make_custom_trace_funcptr_for_type(self, TYPE):
+        from pypy.rpython.memory.gctransform.support import get_rtti, \
+                type_contains_pyobjs
+        rtti = get_rtti(TYPE)
+        if rtti is not None and hasattr(rtti._obj, 'custom_trace_funcptr'):
+            return rtti._obj.custom_trace_funcptr
+        else:
+            return None
 
 
 def collect_constants(graphs):
diff --git a/pypy/rpython/memory/test/test_gc.py b/pypy/rpython/memory/test/test_gc.py
--- a/pypy/rpython/memory/test/test_gc.py
+++ b/pypy/rpython/memory/test/test_gc.py
@@ -237,6 +237,46 @@
         res = self.interpret(f, [5])
         assert 160 <= res <= 165
 
+    def test_custom_trace(self):
+        from pypy.rpython.annlowlevel import llhelper
+        from pypy.rpython.lltypesystem import llmemory
+        from pypy.rpython.lltypesystem.llarena import ArenaError
+        #
+        S = lltype.GcStruct('S', ('x', llmemory.Address),
+                                 ('y', llmemory.Address), rtti=True)
+        T = lltype.GcStruct('T', ('z', lltype.Signed))
+        offset_of_x = llmemory.offsetof(S, 'x')
+        def customtrace(obj, prev):
+            if not prev:
+                return obj + offset_of_x
+            else:
+                return llmemory.NULL
+        CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                          llmemory.Address)
+        customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+        lltype.attachRuntimeTypeInfo(S, customtraceptr=customtraceptr)
+        #
+        for attrname in ['x', 'y']:
+            def setup():
+                s1 = lltype.malloc(S)
+                tx = lltype.malloc(T)
+                tx.z = 42
+                ty = lltype.malloc(T)
+                s1.x = llmemory.cast_ptr_to_adr(tx)
+                s1.y = llmemory.cast_ptr_to_adr(ty)
+                return s1
+            def f():
+                s1 = setup()
+                llop.gc__collect(lltype.Void)
+                return llmemory.cast_adr_to_ptr(getattr(s1, attrname),
+                                                lltype.Ptr(T))
+            if attrname == 'x':
+                res = self.interpret(f, [])
+                assert res.z == 42
+            else:
+                py.test.raises((RuntimeError, ArenaError),
+                               self.interpret, f, [])
+
     def test_weakref(self):
         import weakref, gc
         class A(object):
diff --git a/pypy/rpython/memory/test/test_transformed_gc.py b/pypy/rpython/memory/test/test_transformed_gc.py
--- a/pypy/rpython/memory/test/test_transformed_gc.py
+++ b/pypy/rpython/memory/test/test_transformed_gc.py
@@ -7,7 +7,6 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, llarena, rffi, llgroup
 from pypy.rpython.memory.gctransform import framework
 from pypy.rpython.lltypesystem.lloperation import llop, void
-from pypy.rpython.memory.gc.marksweep import X_CLONE, X_POOL, X_POOL_PTR
 from pypy.rlib.objectmodel import compute_unique_id, we_are_translated
 from pypy.rlib.debug import ll_assert
 from pypy.rlib import rgc
@@ -18,15 +17,13 @@
 WORD = LONG_BIT // 8
 
 
-def rtype(func, inputtypes, specialize=True, gcname='ref', stacklessgc=False,
+def rtype(func, inputtypes, specialize=True, gcname='ref',
           backendopt=False, **extraconfigopts):
     from pypy.translator.translator import TranslationContext
     t = TranslationContext()
     # XXX XXX XXX mess
     t.config.translation.gc = gcname
     t.config.translation.gcremovetypeptr = True
-    if stacklessgc:
-        t.config.translation.gcrootfinder = "stackless"
     t.config.set(**extraconfigopts)
     ann = t.buildannotator(policy=annpolicy.StrictAnnotatorPolicy())
     ann.build_types(func, inputtypes)
@@ -44,7 +41,6 @@
 
 class GCTest(object):
     gcpolicy = None
-    stacklessgc = False
     GC_CAN_MOVE = False
     GC_CAN_MALLOC_NONMOVABLE = True
     taggedpointers = False
@@ -103,7 +99,6 @@
 
         s_args = annmodel.SomePtr(lltype.Ptr(ARGS))
         t = rtype(entrypoint, [s_args], gcname=cls.gcname,
-                  stacklessgc=cls.stacklessgc,
                   taggedpointers=cls.taggedpointers)
 
         for fixup in mixlevelstuff:
@@ -410,6 +405,40 @@
         res = run([5, 42]) #XXX pure lazyness here too
         assert 160 <= res <= 165
 
+    def define_custom_trace(cls):
+        from pypy.rpython.annlowlevel import llhelper
+        from pypy.rpython.lltypesystem import llmemory
+        #
+        S = lltype.GcStruct('S', ('x', llmemory.Address), rtti=True)
+        T = lltype.GcStruct('T', ('z', lltype.Signed))
+        offset_of_x = llmemory.offsetof(S, 'x')
+        def customtrace(obj, prev):
+            if not prev:
+                return obj + offset_of_x
+            else:
+                return llmemory.NULL
+        CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                          llmemory.Address)
+        customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+        lltype.attachRuntimeTypeInfo(S, customtraceptr=customtraceptr)
+        #
+        def setup():
+            s1 = lltype.malloc(S)
+            tx = lltype.malloc(T)
+            tx.z = 4243
+            s1.x = llmemory.cast_ptr_to_adr(tx)
+            return s1
+        def f():
+            s1 = setup()
+            llop.gc__collect(lltype.Void)
+            return llmemory.cast_adr_to_ptr(s1.x, lltype.Ptr(T)).z
+        return f
+
+    def test_custom_trace(self):
+        run = self.runner("custom_trace")
+        res = run([])
+        assert res == 4243
+
     def define_weakref(cls):
         import weakref, gc
         class A(object):
@@ -777,7 +806,6 @@
                 if op.opname == 'do_malloc_fixedsize_clear':
                     op.args = [Constant(type_id, llgroup.HALFWORD),
                                Constant(llmemory.sizeof(P), lltype.Signed),
-                               Constant(True, lltype.Bool),  # can_collect
                                Constant(False, lltype.Bool), # has_finalizer
                                Constant(False, lltype.Bool)] # contains_weakptr
                     break
@@ -814,7 +842,6 @@
                 if op.opname == 'do_malloc_fixedsize_clear':
                     op.args = [Constant(type_id, llgroup.HALFWORD),
                                Constant(llmemory.sizeof(P), lltype.Signed),
-                               Constant(True, lltype.Bool),  # can_collect
                                Constant(False, lltype.Bool), # has_finalizer
                                Constant(False, lltype.Bool)] # contains_weakptr
                     break
@@ -910,234 +937,6 @@
             root_stack_depth = 200
 
 
-    def define_cloning(cls):
-        B = lltype.GcStruct('B', ('x', lltype.Signed))
-        A = lltype.GcStruct('A', ('b', lltype.Ptr(B)),
-                                 ('unused', lltype.Ptr(B)))
-        def make(n):
-            b = lltype.malloc(B)
-            b.x = n
-            a = lltype.malloc(A)
-            a.b = b
-            return a
-        def func():
-            a1 = make(111)
-            # start recording mallocs in a new pool
-            oldpool = llop.gc_x_swap_pool(X_POOL_PTR, lltype.nullptr(X_POOL))
-            # the following a2 goes into the new list
-            a2 = make(222)
-            # now put the old pool back and get the new pool
-            newpool = llop.gc_x_swap_pool(X_POOL_PTR, oldpool)
-            a3 = make(333)
-            # clone a2
-            a2ref = lltype.cast_opaque_ptr(llmemory.GCREF, a2)
-            clonedata = lltype.malloc(X_CLONE)
-            clonedata.gcobjectptr = a2ref
-            clonedata.pool = newpool
-            llop.gc_x_clone(lltype.Void, clonedata)
-            a2copyref = clonedata.gcobjectptr
-            a2copy = lltype.cast_opaque_ptr(lltype.Ptr(A), a2copyref)
-            a2copy.b.x = 444
-            return a1.b.x * 1000000 + a2.b.x * 1000 + a3.b.x
-
-        return func
-
-    def test_cloning(self):
-        run = self.runner("cloning")
-        res = run([])
-        assert res == 111222333
-
-    def define_cloning_varsize(cls):
-        B = lltype.GcStruct('B', ('x', lltype.Signed))
-        A = lltype.GcStruct('A', ('b', lltype.Ptr(B)),
-                                 ('more', lltype.Array(lltype.Ptr(B))))
-        def make(n):
-            b = lltype.malloc(B)
-            b.x = n
-            a = lltype.malloc(A, 2)
-            a.b = b
-            a.more[0] = lltype.malloc(B)
-            a.more[0].x = n*10
-            a.more[1] = lltype.malloc(B)
-            a.more[1].x = n*10+1
-            return a
-        def func():
-            oldpool = llop.gc_x_swap_pool(X_POOL_PTR, lltype.nullptr(X_POOL))
-            a2 = make(22)
-            newpool = llop.gc_x_swap_pool(X_POOL_PTR, oldpool)
-            # clone a2
-            a2ref = lltype.cast_opaque_ptr(llmemory.GCREF, a2)
-            clonedata = lltype.malloc(X_CLONE)
-            clonedata.gcobjectptr = a2ref
-            clonedata.pool = newpool
-            llop.gc_x_clone(lltype.Void, clonedata)
-            a2copyref = clonedata.gcobjectptr
-            a2copy = lltype.cast_opaque_ptr(lltype.Ptr(A), a2copyref)
-            a2copy.b.x = 44
-            a2copy.more[0].x = 440
-            a2copy.more[1].x = 441
-            return a2.b.x * 1000000 + a2.more[0].x * 1000 + a2.more[1].x
-
-        return func
-
-    def test_cloning_varsize(self):
-        run = self.runner("cloning_varsize")
-        res = run([])
-        assert res == 22220221
-
-    def define_cloning_highlevel(cls):
-        class A:
-            pass
-        class B(A):
-            pass
-        def func(n, dummy):
-            if n > 5:
-                x = A()
-            else:
-                x = B()
-                x.bvalue = 123
-            x.next = A()
-            x.next.next = x
-            y, newpool = rgc.gc_clone(x, None)
-            assert y is not x
-            assert y.next is not x
-            assert y is not x.next
-            assert y.next is not x.next
-            assert y is not y.next
-            assert y is y.next.next
-            if isinstance(y, B):
-                assert n <= 5
-                assert y.bvalue == 123
-            else:
-                assert n > 5
-            return 1
-
-        return func
-
-    def test_cloning_highlevel(self):
-        run = self.runner("cloning_highlevel")
-        res = run([3, 0])
-        assert res == 1
-        res = run([7, 0])
-        assert res == 1
-
-    def define_cloning_highlevel_varsize(cls):
-        class A:
-            pass
-        def func(n, dummy):
-            lst = [A() for i in range(n)]
-            for a in lst:
-                a.value = 1
-            lst2, newpool = rgc.gc_clone(lst, None)
-            for i in range(n):
-                a = A()
-                a.value = i
-                lst.append(a)
-                lst[i].value = 4 + i
-                lst2[i].value = 7 + i
-
-            n = 0
-            for a in lst:
-                n = n*10 + a.value
-            for a in lst2:
-                n = n*10 + a.value
-            return n
-
-        return func
-
-    def test_cloning_highlevel_varsize(self):
-        run = self.runner("cloning_highlevel_varsize")
-        res = run([3, 0])
-        assert res == 456012789
-
-    def define_tree_cloning(cls):
-        import os
-        # this makes a tree of calls.  Each leaf stores its path (a linked
-        # list) in 'result'.  Paths are mutated in-place but the leaves don't
-        # see each other's mutations because of x_clone.
-        STUFF = lltype.FixedSizeArray(lltype.Signed, 21)
-        NODE = lltype.GcForwardReference()
-        NODE.become(lltype.GcStruct('node', ('index', lltype.Signed),
-                                            ('counter', lltype.Signed),
-                                            ('next', lltype.Ptr(NODE)),
-                                            ('use_some_space', STUFF)))
-        PATHARRAY = lltype.GcArray(lltype.Ptr(NODE))
-        clonedata = lltype.malloc(X_CLONE)
-
-        def clone(node):
-            # that's for testing if the test is correct...
-            if not node:
-                return node
-            newnode = lltype.malloc(NODE)
-            newnode.index = node.index
-            newnode.counter = node.counter
-            newnode.next = clone(node.next)
-            return newnode
-
-        def do_call(result, path, index, remaining_depth):
-            # clone the while path
-            clonedata.gcobjectptr = lltype.cast_opaque_ptr(llmemory.GCREF,
-                                                           path)
-            clonedata.pool = lltype.nullptr(X_POOL)
-            llop.gc_x_clone(lltype.Void, clonedata)
-            # install the new pool as the current one
-            parentpool = llop.gc_x_swap_pool(X_POOL_PTR, clonedata.pool)
-            path = lltype.cast_opaque_ptr(lltype.Ptr(NODE),
-                                          clonedata.gcobjectptr)
-
-            # The above should have the same effect as:
-            #    path = clone(path)
-
-            # bump all the path node counters by one
-            p = path
-            while p:
-                p.counter += 1
-                p = p.next
-
-            if remaining_depth == 0:
-                llop.debug_print(lltype.Void, "setting", index, "with", path)
-                result[index] = path   # leaf
-            else:
-                node = lltype.malloc(NODE)
-                node.index = index * 2
-                node.counter = 0
-                node.next = path
-                do_call(result, node, index * 2, remaining_depth - 1)
-                node.index += 1    # mutation!
-                do_call(result, node, index * 2 + 1, remaining_depth - 1)
-
-            # restore the parent pool
-            llop.gc_x_swap_pool(X_POOL_PTR, parentpool)
-
-        def check(path, index, level, depth):
-            if level == depth:
-                assert index == 0
-                assert not path
-            else:
-                assert path.index == index
-                assert path.counter == level + 1
-                check(path.next, index >> 1, level + 1, depth)
-
-        def func(depth, dummy):
-            result = lltype.malloc(PATHARRAY, 1 << depth)
-            os.write(2, 'building tree... ')
-            do_call(result, lltype.nullptr(NODE), 0, depth)
-            os.write(2, 'checking tree... ')
-            #from pypy.rpython.lltypesystem.lloperation import llop
-            #llop.debug_view(lltype.Void, result,
-            #                llop.gc_x_size_header(lltype.Signed))
-            for i in range(1 << depth):
-                check(result[i], i, 0, depth)
-            os.write(2, 'ok\n')
-            return 1
-        return func
-
-    def test_tree_cloning(self):
-        run = self.runner("tree_cloning")
-        res = run([3, 0])
-        assert res == 1
-
-
 class TestPrintingGC(GenericGCTests):
     gcname = "statistics"
 
diff --git a/pypy/rpython/test/test_stack.py b/pypy/rpython/test/test_stack.py
deleted file mode 100644
--- a/pypy/rpython/test/test_stack.py
+++ /dev/null
@@ -1,16 +0,0 @@
-
-from pypy.rpython.test.test_llinterp import interpret
-from pypy.rlib.rstack import stack_frames_depth
-
-
-def test_interp_c():
-    def f():
-        return stack_frames_depth()
-
-    def g():
-        return f()
-    res_f = interpret(f, [])
-    res_g = interpret(g, [])
-    assert res_f == 2
-    assert res_g == 3
-
diff --git a/pypy/tool/pytest/test/test_appsupport.py b/pypy/tool/pytest/test/test_appsupport.py
--- a/pypy/tool/pytest/test/test_appsupport.py
+++ b/pypy/tool/pytest/test/test_appsupport.py
@@ -39,7 +39,7 @@
         setpypyconftest(testdir)
         testdir.makepyfile("""
             class AppTestClass:
-                spaceconfig = {"objspace.usemodules._stackless": True}
+                spaceconfig = {"objspace.usemodules._random": True}
                 def setup_class(cls):
                     assert 0
                 def test_applevel(self):
@@ -48,7 +48,7 @@
         result = testdir.runpytest("-A")
         assert result.ret == 0
         if hasattr(sys, 'pypy_translation_info') and \
-           sys.pypy_translation_info.get('objspace.usemodules._stackless'):
+           sys.pypy_translation_info.get('objspace.usemodules._random'):
             result.stdout.fnmatch_lines(["*1 error*"])
         else:
             # setup_class didn't get called, otherwise it would error
@@ -58,9 +58,9 @@
         setpypyconftest(testdir)
         p = testdir.makepyfile("""
             class TestClass:
-                spaceconfig = {"objspace.usemodules._stackless": False}
+                spaceconfig = {"objspace.usemodules._random": False}
                 def setup_class(cls):
-                    assert not cls.space.config.objspace.usemodules._stackless
+                    assert not cls.space.config.objspace.usemodules._random
                 def test_interp(self, space):
                     assert self.space is space
                 def test_interp2(self, space):
diff --git a/pypy/translator/backendopt/inline.py b/pypy/translator/backendopt/inline.py
--- a/pypy/translator/backendopt/inline.py
+++ b/pypy/translator/backendopt/inline.py
@@ -540,7 +540,6 @@
 OP_WEIGHTS = {'same_as': 0,
               'cast_pointer': 0,
               'malloc': 2,
-              'yield_current_frame_to_caller': sys.maxint, # XXX bit extreme
               'instrument_count': 0,
               'debug_assert': -1,
               }
diff --git a/pypy/translator/c/database.py b/pypy/translator/c/database.py
--- a/pypy/translator/c/database.py
+++ b/pypy/translator/c/database.py
@@ -29,13 +29,11 @@
 
     def __init__(self, translator=None, standalone=False,
                  gcpolicyclass=None,
-                 stacklesstransformer=None,
                  thread_enabled=False,
                  sandbox=False):
         self.translator = translator
         self.standalone = standalone
         self.sandbox    = sandbox
-        self.stacklesstransformer = stacklesstransformer
         if gcpolicyclass is None:
             gcpolicyclass = gc.RefcountingGcPolicy
         self.gcpolicy = gcpolicyclass(self, thread_enabled)
@@ -251,8 +249,8 @@
         else:
             show_i = -1
 
-        # The order of database completion is fragile with stackless and
-        # gc transformers.  Here is what occurs:
+        # The order of database completion is fragile with gc transformers.
+        # Here is what occurs:
         #
         # 1. follow dependencies recursively from the entry point: data
         #    structures pointing to other structures or functions, and
@@ -270,24 +268,12 @@
         #    ll_finalize().  New FuncNodes are built for them.  No more
         #    FuncNodes can show up after this step.
         #
-        # 4. stacklesstransform.finish() - freeze the stackless resume point
-        #    table.
+        # 4. gctransformer.finish_tables() - freeze the gc types table.
         #
-        # 5. follow new dependencies (this should be only the new frozen
-        #    table, which contains only numbers and already-seen function
-        #    pointers).
-        #
-        # 6. gctransformer.finish_tables() - freeze the gc types table.
-        #
-        # 7. follow new dependencies (this should be only the gc type table,
+        # 5. follow new dependencies (this should be only the gc type table,
         #    which contains only numbers and pointers to ll_finalizer
         #    functions seen in step 3).
         #
-        # I think that there is no reason left at this point that force
-        # step 4 to be done before step 6, nor to have a follow-new-
-        # dependencies step inbetween.  It is important though to have step 3
-        # before steps 4 and 6.
-        #
         # This is implemented by interleaving the follow-new-dependencies
         # steps with calls to the next 'finish' function from the following
         # list:
@@ -295,10 +281,6 @@
         if self.gctransformer:
             finish_callbacks.append(('GC transformer: finished helpers',
                                      self.gctransformer.finish_helpers))
-        if self.stacklesstransformer:
-            finish_callbacks.append(('Stackless transformer: finished',
-                                     self.stacklesstransformer.finish))
-        if self.gctransformer:
             finish_callbacks.append(('GC transformer: finished tables',
                                      self.gctransformer.get_finish_tables()))
 
diff --git a/pypy/translator/c/funcgen.py b/pypy/translator/c/funcgen.py
--- a/pypy/translator/c/funcgen.py
+++ b/pypy/translator/c/funcgen.py
@@ -46,9 +46,6 @@
         self.gcpolicy = db.gcpolicy
         self.exception_policy = exception_policy
         self.functionname = functionname
-        # apply the stackless transformation
-        if db.stacklesstransformer:
-            db.stacklesstransformer.transform_graph(graph)
         # apply the exception transformation
         if self.db.exctransformer:
             self.db.exctransformer.create_exception_handling(self.graph)
diff --git a/pypy/translator/c/gc.py b/pypy/translator/c/gc.py
--- a/pypy/translator/c/gc.py
+++ b/pypy/translator/c/gc.py
@@ -11,7 +11,6 @@
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 
 class BasicGcPolicy(object):
-    requires_stackless = False
     stores_hash_at_the_end = False
 
     def __init__(self, db, thread_enabled=False):
@@ -320,8 +319,10 @@
             # still important to see it so that it can be followed as soon as
             # the mixlevelannotator resolves it.
             gctransf = self.db.gctransformer
-            fptr = gctransf.finalizer_funcptr_for_type(structdefnode.STRUCT)
-            self.db.get(fptr)
+            TYPE = structdefnode.STRUCT
+            kind_and_fptr = gctransf.special_funcptr_for_type(TYPE)
+            if kind_and_fptr:
+                self.db.get(kind_and_fptr[1])
 
     def array_setup(self, arraydefnode):
         pass
@@ -391,6 +392,9 @@
                fieldname,
                funcgen.expr(c_skipoffset)))
 
+    def OP_GC_ASSUME_YOUNG_POINTERS(self, funcgen, op):
+        raise Exception("the FramewokGCTransformer should handle this")
+
 class AsmGcRootFrameworkGcPolicy(FrameworkGcPolicy):
     transformerclass = asmgcroot.AsmGcRootFrameworkGCTransformer
 
diff --git a/pypy/translator/c/genc.py b/pypy/translator/c/genc.py
--- a/pypy/translator/c/genc.py
+++ b/pypy/translator/c/genc.py
@@ -120,8 +120,6 @@
         self.originalentrypoint = entrypoint
         self.config = config
         self.gcpolicy = gcpolicy    # for tests only, e.g. rpython/memory/
-        if gcpolicy is not None and gcpolicy.requires_stackless:
-            config.translation.stackless = True
         self.eci = self.get_eci()
         self.secondary_entrypoints = secondary_entrypoints
 
@@ -139,21 +137,8 @@
             if not self.standalone:
                 raise NotImplementedError("--gcrootfinder=asmgcc requires standalone")
 
-        if self.config.translation.stackless:
-            if not self.standalone:
-                raise Exception("stackless: only for stand-alone builds")
-            
-            from pypy.translator.stackless.transform import StacklessTransformer
-            stacklesstransformer = StacklessTransformer(
-                translator, self.originalentrypoint,
-                stackless_gc=gcpolicyclass.requires_stackless)
-            self.entrypoint = stacklesstransformer.slp_entry_point
-        else:
-            stacklesstransformer = None
-
         db = LowLevelDatabase(translator, standalone=self.standalone,
                               gcpolicyclass=gcpolicyclass,
-                              stacklesstransformer=stacklesstransformer,
                               thread_enabled=self.config.translation.thread,
                               sandbox=self.config.translation.sandbox)
         self.db = db
diff --git a/pypy/translator/c/src/mem.h b/pypy/translator/c/src/mem.h
--- a/pypy/translator/c/src/mem.h
+++ b/pypy/translator/c/src/mem.h
@@ -39,8 +39,9 @@
 #define pypy_asm_keepalive(v)  asm volatile ("/* keepalive %0 */" : : \
                                              "g" (v))
 
-/* marker for trackgcroot.py */
-#define pypy_asm_stack_bottom()  asm volatile ("/* GC_STACK_BOTTOM */" : : )
+/* marker for trackgcroot.py, and inhibits tail calls */
+#define pypy_asm_stack_bottom()  asm volatile ("/* GC_STACK_BOTTOM */" : : : \
+                                               "memory")
 
 #define OP_GC_ASMGCROOT_STATIC(i, r)   r =      \
                i == 0 ? (void*)&__gcmapstart :         \
diff --git a/pypy/translator/c/src/stack.h b/pypy/translator/c/src/stack.h
--- a/pypy/translator/c/src/stack.h
+++ b/pypy/translator/c/src/stack.h
@@ -15,7 +15,6 @@
 extern long _LLstacktoobig_stack_length;
 extern char _LLstacktoobig_report_error;
 
-void LL_stack_unwind(void);
 char LL_stack_too_big_slowpath(long);    /* returns 0 (ok) or 1 (too big) */
 void LL_stack_set_length_fraction(double);
 
diff --git a/pypy/translator/c/src/stacklet/Makefile b/pypy/translator/c/src/stacklet/Makefile
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/Makefile
@@ -0,0 +1,49 @@
+
+all: stacklet.so
+
+stacklet.so: stacklet.c stacklet.h
+	gcc -fPIC -shared -O2 -o $@ stacklet.c
+
+stacklet_g.so: stacklet.c stacklet.h
+	gcc -fPIC -shared -g -o $@ stacklet.c -DDEBUG_DUMP
+
+clean:
+	rm -fr stacklet.so stacklet_g.so
+	rm -fr run_tests_*_[go]
+
+
+DEBUG = -DDEBUG_DUMP
+
+tests: clean
+	make -j1 run-all-tests
+
+ALL_TESTS = tests-static-g \
+            tests-static-o \
+            tests-dynamic-g \
+            tests-dynamic-o
+
+run-all-tests: $(ALL_TESTS)
+	@echo "*** All test suites passed ***"
+
+tests-static-g: stacklet.c stacklet.h tests.c
+	gcc -Wall -g -o run_tests_static_g stacklet.c tests.c ${DEBUG}
+	./run_tests_static_g
+
+tests-static-o: stacklet.c stacklet.h tests.c
+	gcc -Wall -g -O2 -o run_tests_static_o stacklet.c tests.c ${DEBUG}
+	./run_tests_static_o
+
+tests-dynamic-g: stacklet_g.so tests.c
+	gcc -Wall -g -o run_tests_dynamic_g stacklet_g.so tests.c ${DEBUG}
+	LD_LIBRARY_PATH=. ./run_tests_dynamic_g
+
+tests-dynamic-o: stacklet.so tests.c
+	gcc -Wall -g -O2 -o run_tests_dynamic_o stacklet.so tests.c ${DEBUG}
+	LD_LIBRARY_PATH=. ./run_tests_dynamic_o
+
+tests-repeat: tests
+	python runtests.py ./run_tests_static_g > /dev/null
+	python runtests.py ./run_tests_static_o > /dev/null
+	LD_LIBRARY_PATH=. python runtests.py ./run_tests_dynamic_g > /dev/null
+	LD_LIBRARY_PATH=. python runtests.py ./run_tests_dynamic_o > /dev/null
+	@echo "*** All tests passed repeatedly ***"
diff --git a/pypy/translator/c/src/stacklet/runtests.py b/pypy/translator/c/src/stacklet/runtests.py
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/runtests.py
@@ -0,0 +1,8 @@
+import os, sys
+
+
+
+for i in range(2000):
+    err = os.system("%s %d" % (sys.argv[1], i))
+    if err != 0:
+        raise OSError("return code %r" % (err,))
diff --git a/pypy/translator/c/src/stacklet/slp_platformselect.h b/pypy/translator/c/src/stacklet/slp_platformselect.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/slp_platformselect.h
@@ -0,0 +1,12 @@
+
+#if   defined(_M_IX86)
+#include "switch_x86_msvc.h" /* MS Visual Studio on X86 */
+#elif defined(_M_X64)
+#include "switch_x64_msvc.h" /* MS Visual Studio on X64 */
+#elif defined(__GNUC__) && defined(__amd64__)
+#include "switch_x86_64_gcc.h" /* gcc on amd64 */
+#elif defined(__GNUC__) && defined(__i386__)
+#include "switch_x86_gcc.h" /* gcc on X86 */
+#else
+#error "Unsupported platform!"
+#endif
diff --git a/pypy/translator/c/src/stacklet/stacklet.c b/pypy/translator/c/src/stacklet/stacklet.c
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/stacklet.c
@@ -0,0 +1,340 @@
+/********** A really minimal coroutine package for C **********
+ * By Armin Rigo
+ */
+
+#include "stacklet.h"
+
+#include <stddef.h>
+#include <assert.h>
+#include <string.h>
+
+/************************************************************
+ * platform specific code
+ */
+
+/* The default stack direction is downwards, 0, but platforms
+ * can redefine it to upwards growing, 1.
+ */
+#define STACK_DIRECTION 0   
+
+#include "slp_platformselect.h"
+
+#if STACK_DIRECTION != 0
+#  error "review this whole code, which depends on STACK_DIRECTION==0 so far"
+#endif
+
+/************************************************************/
+
+/* #define DEBUG_DUMP */
+
+#ifdef DEBUG_DUMP
+#include <stdio.h>
+#endif
+
+/************************************************************/
+
+struct stacklet_s {
+    /* The portion of the real stack claimed by this paused stacklet. */
+    char *stack_start;                /* the "near" end of the stack */
+    char *stack_stop;                 /* the "far" end of the stack */
+
+    /* The amount that has been saved away so far, just after this struct.
+     * There is enough allocated space for 'stack_stop - stack_start'
+     * bytes.
+     */
+    ptrdiff_t stack_saved;            /* the amount saved */
+
+    /* Internally, some stacklets are arranged in a list, to handle lazy
+     * saving of stacks: if the stacklet has a partially unsaved stack,
+     * this points to the next stacklet with a partially unsaved stack,
+     * creating a linked list with each stacklet's stack_stop higher
+     * than the previous one.  The last entry in the list is always the
+     * main stack.
+     */
+    struct stacklet_s *stack_prev;
+};
+
+void *(*_stacklet_switchstack)(void*(*)(void*, void*),
+                               void*(*)(void*, void*), void*) = NULL;
+void (*_stacklet_initialstub)(struct stacklet_thread_s *,
+                              stacklet_run_fn, void *) = NULL;
+
+struct stacklet_thread_s {
+    struct stacklet_s *g_stack_chain_head;  /* NULL <=> running main */
+    char *g_current_stack_stop;
+    char *g_current_stack_marker;
+    struct stacklet_s *g_source;
+    struct stacklet_s *g_target;
+};
+
+/***************************************************************/
+
+static void g_save(struct stacklet_s* g, char* stop
+#ifdef DEBUG_DUMP
+                   , int overwrite_stack_for_debug
+#endif
+                   )
+{
+    /* Save more of g's stack into the heap -- at least up to 'stop'
+
+       In the picture below, the C stack is on the left, growing down,
+       and the C heap on the right.  The area marked with xxx is the logical
+       stack of the stacklet 'g'.  It can be half in the C stack (its older
+       part), and half in the heap (its newer part).
+
+       g->stack_stop |________|
+                     |xxxxxxxx|
+                     |xxx __ stop       .........
+                     |xxxxxxxx|    ==>  :       :
+                     |________|         :_______:
+                     |        |         |xxxxxxx|
+                     |        |         |xxxxxxx|
+      g->stack_start |        |         |_______| g+1
+
+     */
+    ptrdiff_t sz1 = g->stack_saved;
+    ptrdiff_t sz2 = stop - g->stack_start;
+    assert(stop <= g->stack_stop);
+
+    if (sz2 > sz1) {
+        char *c = (char *)(g + 1);
+#if STACK_DIRECTION == 0
+        memcpy(c+sz1, g->stack_start+sz1, sz2-sz1);
+#  ifdef DEBUG_DUMP
+        if (overwrite_stack_for_debug)
+          memset(g->stack_start+sz1, 0xdb, sz2-sz1);
+#  endif
+#else
+        xxx;
+#endif
+        g->stack_saved = sz2;
+    }
+}
+
+/* Allocate and store in 'g_source' a new stacklet, which has the C
+ * stack from 'old_stack_pointer' to 'g_current_stack_stop'.  It is
+ * initially completely unsaved, so it is attached to the head of the
+ * chained list of 'stack_prev'.
+ */
+static int g_allocate_source_stacklet(void *old_stack_pointer,
+                                      struct stacklet_thread_s *thrd)
+{
+    struct stacklet_s *stacklet;
+    ptrdiff_t stack_size = (thrd->g_current_stack_stop -
+                            (char *)old_stack_pointer);
+
+    thrd->g_source = malloc(sizeof(struct stacklet_s) + stack_size);
+    if (thrd->g_source == NULL)
+        return -1;
+
+    stacklet = thrd->g_source;
+    stacklet->stack_start = old_stack_pointer;
+    stacklet->stack_stop  = thrd->g_current_stack_stop;
+    stacklet->stack_saved = 0;
+    stacklet->stack_prev  = thrd->g_stack_chain_head;
+    thrd->g_stack_chain_head = stacklet;
+    return 0;
+}
+
+/* Save more of the C stack away, up to 'target_stop'.
+ */
+static void g_clear_stack(struct stacklet_s *g_target,
+                          struct stacklet_thread_s *thrd)
+{
+    struct stacklet_s *current = thrd->g_stack_chain_head;
+    char *target_stop = g_target->stack_stop;
+
+    /* save and unlink stacklets that are completely within
+       the area to clear. */
+    while (current != NULL && current->stack_stop <= target_stop) {
+        struct stacklet_s *prev = current->stack_prev;
+        current->stack_prev = NULL;
+        if (current != g_target) {
+            /* don't bother saving away g_target, because
+               it would be immediately restored */
+            g_save(current, current->stack_stop
+#ifdef DEBUG_DUMP
+                   , 1
+#endif
+                   );
+        }
+        current = prev;
+    }
+
+    /* save a partial stack */
+    if (current != NULL && current->stack_start < target_stop)
+        g_save(current, target_stop
+#ifdef DEBUG_DUMP
+               , 1
+#endif
+               );
+
+    thrd->g_stack_chain_head = current;
+}
+
+/* This saves the current state in a new stacklet that gets stored in
+ * 'g_source', and saves away enough of the stack to allow a jump to
+ * 'g_target'.
+ */
+static void *g_save_state(void *old_stack_pointer, void *rawthrd)
+{
+    struct stacklet_thread_s *thrd = (struct stacklet_thread_s *)rawthrd;
+    if (g_allocate_source_stacklet(old_stack_pointer, thrd) < 0)
+        return NULL;
+    g_clear_stack(thrd->g_target, thrd);
+    return thrd->g_target->stack_start;
+}
+
+/* This saves the current state in a new stacklet that gets stored in
+ * 'g_source', but returns NULL, to not do any restoring yet.
+ */
+static void *g_initial_save_state(void *old_stack_pointer, void *rawthrd)
+{
+    struct stacklet_thread_s *thrd = (struct stacklet_thread_s *)rawthrd;
+    if (g_allocate_source_stacklet(old_stack_pointer, thrd) == 0)
+        g_save(thrd->g_source, thrd->g_current_stack_marker
+#ifdef DEBUG_DUMP
+               , 0
+#endif
+               );
+    return NULL;
+}
+
+/* Save away enough of the stack to allow a jump to 'g_target'.
+ */
+static void *g_destroy_state(void *old_stack_pointer, void *rawthrd)
+{
+    struct stacklet_thread_s *thrd = (struct stacklet_thread_s *)rawthrd;
+    thrd->g_source = EMPTY_STACKLET_HANDLE;
+    g_clear_stack(thrd->g_target, thrd);
+    return thrd->g_target->stack_start;
+}
+
+/* Restore the C stack by copying back from the heap in 'g_target',
+ * and free 'g_target'.
+ */
+static void *g_restore_state(void *new_stack_pointer, void *rawthrd)
+{
+    /* Restore the heap copy back into the C stack */
+    struct stacklet_thread_s *thrd = (struct stacklet_thread_s *)rawthrd;
+    struct stacklet_s *g = thrd->g_target;
+    ptrdiff_t stack_saved = g->stack_saved;
+
+    assert(new_stack_pointer == g->stack_start);
+#if STACK_DIRECTION == 0
+    memcpy(g->stack_start, g+1, stack_saved);
+#else
+    memcpy(g->stack_start - stack_saved, g+1, stack_saved);
+#endif
+    thrd->g_current_stack_stop = g->stack_stop;
+    free(g);
+    return EMPTY_STACKLET_HANDLE;
+}
+
+static void g_initialstub(struct stacklet_thread_s *thrd,
+                          stacklet_run_fn run, void *run_arg)
+{
+    struct stacklet_s *result;
+
+    /* The following call returns twice! */
+    result = (struct stacklet_s *) _stacklet_switchstack(g_initial_save_state,
+                                                         g_restore_state,
+                                                         thrd);
+    if (result == NULL && thrd->g_source != NULL) {
+        /* First time it returns.  Only g_initial_save_state() has run
+           and has created 'g_source'.  Call run(). */
+        thrd->g_current_stack_stop = thrd->g_current_stack_marker;
+        result = run(thrd->g_source, run_arg);
+
+        /* Then switch to 'result'. */
+        thrd->g_target = result;
+        _stacklet_switchstack(g_destroy_state, g_restore_state, thrd);
+
+        assert(!"stacklet: we should not return here");
+        abort();
+    }
+    /* The second time it returns. */
+}
+
+/************************************************************/
+
+stacklet_thread_handle stacklet_newthread(void)
+{
+    struct stacklet_thread_s *thrd;
+
+    if (_stacklet_switchstack == NULL) {
+        /* set up the following global with an indirection, which is needed
+           to prevent any inlining */
+        _stacklet_initialstub = g_initialstub;
+        _stacklet_switchstack = slp_switch;
+    }
+
+    thrd = malloc(sizeof(struct stacklet_thread_s));
+    if (thrd != NULL)
+        memset(thrd, 0, sizeof(struct stacklet_thread_s));
+    return thrd;
+}
+
+void stacklet_deletethread(stacklet_thread_handle thrd)
+{
+    free(thrd);
+}
+
+stacklet_handle stacklet_new(stacklet_thread_handle thrd,
+                             stacklet_run_fn run, void *run_arg)
+{
+    long stackmarker;
+    assert((char *)NULL < (char *)&stackmarker);
+    if (thrd->g_current_stack_stop <= (char *)&stackmarker)
+        thrd->g_current_stack_stop = ((char *)&stackmarker) + 1;
+
+    thrd->g_current_stack_marker = (char *)&stackmarker;
+    _stacklet_initialstub(thrd, run, run_arg);
+    return thrd->g_source;
+}
+
+stacklet_handle stacklet_switch(stacklet_thread_handle thrd,
+                                stacklet_handle target)
+{
+    long stackmarker;
+    if (thrd->g_current_stack_stop <= (char *)&stackmarker)
+        thrd->g_current_stack_stop = ((char *)&stackmarker) + 1;
+
+    thrd->g_target = target;
+    _stacklet_switchstack(g_save_state, g_restore_state, thrd);
+    return thrd->g_source;
+}
+
+void stacklet_destroy(stacklet_thread_handle thrd, stacklet_handle target)
+{
+    /* remove 'target' from the chained list 'g_stack_chain_head', if it is there */
+    struct stacklet_s **pp = &thrd->g_stack_chain_head;
+    for (; *pp != NULL; pp = &(*pp)->stack_prev)
+        if (*pp == target) {
+            *pp = target->stack_prev;
+            break;
+        }
+    free(target);
+}
+
+char **_stacklet_translate_pointer(stacklet_handle context, char **ptr)
+{
+  if (context == NULL)
+    return ptr;
+  char *p = (char *)ptr;
+  long delta = p - context->stack_start;
+  if (((unsigned long)delta) < ((unsigned long)context->stack_saved)) {
+      /* a pointer to a saved away word */
+      char *c = (char *)(context + 1);
+      return (char **)(c + delta);
+  }
+  if (((unsigned long)delta) >=
+      (unsigned long)(context->stack_stop - context->stack_start)) {
+      /* out-of-stack pointer!  it's only ok if we are the main stacklet
+         and we are reading past the end, because the main stacklet's
+         stack stop is not exactly known. */
+      assert(delta >= 0);
+      assert(((long)context->stack_stop) & 1);
+  }
+  return ptr;
+}
diff --git a/pypy/translator/c/src/stacklet/stacklet.h b/pypy/translator/c/src/stacklet/stacklet.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/stacklet.h
@@ -0,0 +1,62 @@
+/********** A really minimal coroutine package for C **********/
+#ifndef _STACKLET_H_
+#define _STACKLET_H_
+
+#include <stdlib.h>
+
+
+/* A "stacklet handle" is an opaque pointer to a suspended stack.
+ * Whenever we suspend the current stack in order to switch elsewhere,
+ * stacklet.c passes to the target a 'stacklet_handle' argument that points
+ * to the original stack now suspended.  The handle must later be passed
+ * back to this API once, in order to resume the stack.  It is only
+ * valid once.
+ */
+typedef struct stacklet_s *stacklet_handle;
+
+#define EMPTY_STACKLET_HANDLE  ((stacklet_handle) -1)
+
+
+/* Multithread support.
+ */
+typedef struct stacklet_thread_s *stacklet_thread_handle;
+
+stacklet_thread_handle stacklet_newthread(void);
+void stacklet_deletethread(stacklet_thread_handle thrd);
+
+
+/* The "run" function of a stacklet.  The first argument is the handle
+ * of the stack from where we come.  When such a function returns, it
+ * must return a (non-empty) stacklet_handle that tells where to go next.
+ */
+typedef stacklet_handle (*stacklet_run_fn)(stacklet_handle, void *);
+
+/* Call 'run(source, run_arg)' in a new stack.  See stacklet_switch()
+ * for the return value.
+ */
+stacklet_handle stacklet_new(stacklet_thread_handle thrd,
+                             stacklet_run_fn run, void *run_arg);
+
+/* Switch to the target handle, resuming its stack.  This returns:
+ *  - if we come back from another call to stacklet_switch(), the source handle
+ *  - if we come back from a run() that finishes, EMPTY_STACKLET_HANDLE
+ *  - if we run out of memory, NULL
+ * Don't call this with an already-used target, with EMPTY_STACKLET_HANDLE,
+ * or with a stack handle from another thread (in multithreaded apps).
+ */
+stacklet_handle stacklet_switch(stacklet_thread_handle thrd,
+                                stacklet_handle target);
+
+/* Delete a stack handle without resuming it at all.
+ * (This works even if the stack handle is of a different thread)
+ */
+void stacklet_destroy(stacklet_thread_handle thrd, stacklet_handle target);
+
+/* stacklet_handle _stacklet_switch_to_copy(stacklet_handle) --- later */
+
+/* Hack: translate a pointer into a stacklet's (possibly saved-away) stack
+ * into where that word is really stored right now.  Only for word-sized data.
+ */
+char **_stacklet_translate_pointer(stacklet_handle context, char **ptr);
+
+#endif /* _STACKLET_H_ */
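
For illustration, a minimal usage sketch of this API (hypothetical example, not
part of this changeset, assuming the header above is available as "stacklet.h"):
run() receives the handle of the now-suspended main stack and returns it, which
switches back; stacklet_new() then returns EMPTY_STACKLET_HANDLE in the main
stack because run() finished.

    #include <stdio.h>
    #include "stacklet.h"

    static stacklet_handle run(stacklet_handle from_main, void *arg)
    {
        printf("running on a new stack, arg=%s\n", (const char *)arg);
        return from_main;          /* switch back to the stack we came from */
    }

    int main(void)
    {
        stacklet_thread_handle thrd = stacklet_newthread();
        stacklet_handle h = stacklet_new(thrd, run, (void *)"hello");
        if (h == EMPTY_STACKLET_HANDLE)
            printf("the new stacklet finished\n");
        stacklet_deletethread(thrd);
        return 0;
    }
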
diff --git a/pypy/translator/c/src/stacklet/switch_x64_msvc.asm b/pypy/translator/c/src/stacklet/switch_x64_msvc.asm
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x64_msvc.asm
@@ -0,0 +1,101 @@
+;
+; stack switching code for MASM on x64
+; Kristjan Valur Jonsson, apr 2011
+;
+
+include macamd64.inc
+
+pop_reg MACRO reg
+	pop reg
+ENDM
+
+load_xmm128 macro Reg, Offset
+	movdqa  Reg, Offset[rsp]
+endm
+
+.code
+
+;arguments save_state, restore_state, extra are passed in rcx, rdx, r8 respectively
+;slp_switch PROC FRAME
+NESTED_ENTRY slp_switch, _TEXT$00
+	; save all registers that the x64 ABI specifies as non-volatile.
+	; This includes some xmm registers.  May not always be necessary,
+	; unless our application is doing 3D, but better safe than sorry.
+	alloc_stack 168; 10 * 16 bytes, plus 8 bytes to make stack 16 byte aligned
+	save_xmm128 xmm15, 144
+	save_xmm128 xmm14, 128
+	save_xmm128 xmm13, 112
+	save_xmm128 xmm12, 96
+	save_xmm128 xmm11, 80
+	save_xmm128 xmm10, 64
+	save_xmm128 xmm9,  48
+	save_xmm128 xmm8,  32
+	save_xmm128 xmm7,  16
+	save_xmm128 xmm6,  0
+	
+	push_reg r15
+	push_reg r14
+	push_reg r13
+	push_reg r12
+	
+	push_reg rbp
+	push_reg rbx
+	push_reg rdi
+	push_reg rsi
+	
+	sub rsp, 20h ;allocate shadow stack space for the arguments (must be multiple of 16)
+	.allocstack 20h
+.endprolog
+
+	;save arguments in nonvolatile registers
+	mov r12, rcx ;save_state
+	mov r13, rdx
+	mov r14, r8
+
+	; load stack base that we are saving minus the callee argument
+	; shadow stack.  We don't want that clobbered
+	lea rcx, [rsp+20h] 
+	mov rdx, r14
+	call r12 ;pass stack pointer, return new stack pointer in rax
+	
+	; a null value means that we don't restore.
+	test rax, rax
+	jz exit
+	
+	;actual stack switch (and re-allocating the shadow stack):
+	lea rsp, [rax-20h]
+	
+	mov rcx, rax ;pass new stack pointer
+	mov rdx, r14
+	call r13
+	;the return value is already in rax
+EXIT:
+	
+	add rsp, 20h
+	pop_reg rsi
+	pop_reg rdi
+	pop_reg rbx
+	pop_reg rbp
+	
+	pop_reg r12
+	pop_reg r13
+	pop_reg r14
+	pop_reg r15
+	
+	load_xmm128 xmm15, 144
+	load_xmm128 xmm14, 128
+	load_xmm128 xmm13, 112
+	load_xmm128 xmm12, 96
+	load_xmm128 xmm11, 80
+	load_xmm128 xmm10, 64
+	load_xmm128 xmm9,  48
+	load_xmm128 xmm8,  32
+	load_xmm128 xmm7,  16
+	load_xmm128 xmm6,  0
+	add rsp, 168
+	ret
+	
+NESTED_END slp_switch, _TEXT$00
+;slp_switch ENDP 
+	
+END
\ No newline at end of file
diff --git a/pypy/translator/c/src/stacklet/switch_x64_msvc.h b/pypy/translator/c/src/stacklet/switch_x64_msvc.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x64_msvc.h
@@ -0,0 +1,7 @@
+/* The actual stack saving function, which just stores the stack;
+ * it is declared in an .asm file.
+ */
+extern void *slp_switch(void *(*save_state)(void*, void*),
+                        void *(*restore_state)(void*, void*),
+                        void *extra);
+
diff --git a/pypy/translator/c/src/stacklet/switch_x86_64_gcc.h b/pypy/translator/c/src/stacklet/switch_x86_64_gcc.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x86_64_gcc.h
@@ -0,0 +1,55 @@
+
+static void *slp_switch(void *(*save_state)(void*, void*),
+                        void *(*restore_state)(void*, void*),
+                        void *extra)
+{
+  void *result, *garbage1, *garbage2;
+  __asm__ volatile (
+     "pushq %%rbp\n"
+     "pushq %%rbx\n"       /* push the registers specified as callee-saved */
+     "pushq %%r12\n"
+     "pushq %%r13\n"
+     "pushq %%r14\n"
+     "pushq %%r15\n"
+
+     "movq %%rax, %%r12\n" /* save 'restore_state' for later */
+     "movq %%rsi, %%r13\n" /* save 'extra' for later         */
+
+                           /* arg 2: extra (already in rsi)      */
+     "movq %%rsp, %%rdi\n" /* arg 1: current (old) stack pointer */
+     "call *%%rcx\n"       /* call save_state()                  */
+
+     "testq %%rax, %%rax\n"    /* skip the rest if the return value is null */
+     "jz 0f\n"
+
+     "movq %%rax, %%rsp\n"     /* change the stack pointer */
+
+     /* From now on, the stack pointer is modified, but the content of the
+        stack is not restored yet.  It contains only garbage here. */
+
+     "movq %%r13, %%rsi\n" /* arg 2: extra                       */
+     "movq %%rax, %%rdi\n" /* arg 1: current (new) stack pointer */
+     "call *%%r12\n"       /* call restore_state()               */
+
+     /* The stack's content is now restored. */
+
+     "0:\n"
+     "popq %%r15\n"
+     "popq %%r14\n"
+     "popq %%r13\n"
+     "popq %%r12\n"
+     "popq %%rbx\n"
+     "popq %%rbp\n"
+
+     : "=a"(result),             /* output variables */
+       "=c"(garbage1),
+       "=S"(garbage2)
+     : "a"(restore_state),       /* input variables  */
+       "c"(save_state),
+       "S"(extra)
+     : "memory", "rdx", "rdi", "r8", "r9", "r10", "r11",
+       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+       "xmm8", "xmm9", "xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"
+     );
+  return result;
+}
diff --git a/pypy/translator/c/src/stacklet/switch_x86_gcc.h b/pypy/translator/c/src/stacklet/switch_x86_gcc.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x86_gcc.h
@@ -0,0 +1,56 @@
+
+static void *slp_switch(void *(*save_state)(void*, void*),
+                        void *(*restore_state)(void*, void*),
+                        void *extra)
+{
+  void *result, *garbage1, *garbage2;
+  __asm__ volatile (
+     "pushl %%ebp\n"
+     "pushl %%ebx\n"       /* push some registers that may contain */
+     "pushl %%esi\n"       /* some value that is meant to be saved */
+     "pushl %%edi\n"
+
+     "movl %%eax, %%esi\n" /* save 'restore_state' for later */
+     "movl %%edx, %%edi\n" /* save 'extra' for later         */
+
+     "movl %%esp, %%eax\n"
+
+     "pushl %%edx\n"       /* arg 2: extra                       */
+     "pushl %%eax\n"       /* arg 1: current (old) stack pointer */
+     "call *%%ecx\n"       /* call save_state()                  */
+
+     "testl %%eax, %%eax\n"/* skip the rest if the return value is null */
+     "jz 0f\n"
+
+     "movl %%eax, %%esp\n"     /* change the stack pointer */
+
+     /* From now on, the stack pointer is modified, but the content of the
+        stack is not restored yet.  It contains only garbage here. */
+
+     "pushl %%edi\n"       /* arg 2: extra                       */
+     "pushl %%eax\n"       /* arg 1: current (new) stack pointer */
+     "call *%%esi\n"       /* call restore_state()               */
+
+     /* The stack's content is now restored. */
+
+     "0:\n"
+     "addl $8, %%esp\n"
+     "popl %%edi\n"
+     "popl %%esi\n"
+     "popl %%ebx\n"
+     "popl %%ebp\n"
+
+     : "=a"(result),             /* output variables */
+       "=c"(garbage1),
+       "=d"(garbage2)
+     : "a"(restore_state),       /* input variables  */
+       "c"(save_state),
+       "d"(extra)
+     : "memory"
+     );
+  /* Note: we should also list all fp/xmm registers, but is there a way
+     to list only the ones used by the current compilation target?
+     For now we will just ignore the issue and hope (reasonably) that
+     this function is never inlined all the way into 3rd-party user code. */
+  return result;
+}
diff --git a/pypy/translator/c/src/stacklet/switch_x86_msvc.asm b/pypy/translator/c/src/stacklet/switch_x86_msvc.asm
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x86_msvc.asm
@@ -0,0 +1,44 @@
+
+.386
+.model flat, c
+
+.code
+
+slp_switch_raw PROC save_state:DWORD, restore_state:DWORD, extra:DWORD
+  
+  ;save registers.  EAX, ECX and EDX are available for function use and thus
+  ;do not have to be stored.
+  push ebx
+  push esi
+  push edi
+  push ebp
+  
+  mov esi, restore_state ; /* save 'restore_state' for later */
+  mov edi, extra ;         /* save 'extra' for later         */
+
+  mov eax, esp
+
+  push edi ;               /* arg 2: extra                       */
+  push eax ;               /* arg 1: current (old) stack pointer */
+  mov  ecx, save_state
+  call ecx ;               /* call save_state()                  */
+
+  test eax, eax;           /* skip the restore if the return value is null */
+  jz exit
+
+  mov esp, eax;            /* change the stack pointer */
+
+  push edi ;               /* arg 2: extra                       */
+  push eax ;               /* arg 1: current (new) stack pointer */
+  call esi ;               /* call restore_state()               */
+
+exit:
+  add esp, 8
+  pop  ebp
+  pop  edi
+  pop  esi
+  pop  ebx
+  ret
+slp_switch_raw ENDP
+
+end
\ No newline at end of file
diff --git a/pypy/translator/c/src/stacklet/switch_x86_msvc.h b/pypy/translator/c/src/stacklet/switch_x86_msvc.h
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/switch_x86_msvc.h
@@ -0,0 +1,26 @@
+/* The actual stack saving function, which just stores the stack;
+ * it is declared in an .asm file.
+ */
+extern void *slp_switch_raw(void *(*save_state)(void*, void*),
+                        void *(*restore_state)(void*, void*),
+                        void *extra);
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/* Store any other runtime information on the local stack */
+#pragma optimize("", off) /* so that autos are stored on the stack */
+#pragma warning(disable:4733) /* disable warning about modifying FS[0] */
+
+static void *slp_switch(void *(*save_state)(void*, void*),
+                        void *(*restore_state)(void*, void*),
+                        void *extra)
+{
+    /* store the structured exception state for this stack */
+    DWORD seh_state = __readfsdword(FIELD_OFFSET(NT_TIB, ExceptionList));
+    void * result = slp_switch_raw(save_state, restore_state, extra);
+    __writefsdword(FIELD_OFFSET(NT_TIB, ExceptionList), seh_state);
+    return result;
+}
+#pragma warning(default:4733) /* re-enable the warning about modifying FS[0] */
+#pragma optimize("", on)
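
All of the switch_x86*/switch_x64* variants above implement the same
low-level contract: slp_switch() saves the callee-saved registers, calls
save_state(old_stack_pointer, extra), and only if that returns a non-NULL
value does it install the returned value as the new stack pointer and call
restore_state(new_stack_pointer, extra).  The hypothetical sketch below only
illustrates that calling contract; the callback names and bodies are invented
here, and the real callbacks (which presumably live in stacklet.c) actually
save and restore the stack contents.

    /* hypothetical demo of the slp_switch() contract, not real stacklet code */
    #include <stdio.h>
    #include "switch_x86_64_gcc.h"   /* pick the variant matching your platform */

    static void *demo_save_state(void *old_stack_pointer, void *extra)
    {
        /* called after the registers were pushed; returning NULL tells
           slp_switch() to skip the restore phase and simply return */
        printf("old stack pointer %p, extra %s\n", old_stack_pointer,
               (char *)extra);
        return NULL;
    }

    static void *demo_restore_state(void *new_stack_pointer, void *extra)
    {
        /* would run after the stack pointer has been switched; it must make
           the new stack contents valid again before returning */
        (void)new_stack_pointer;
        (void)extra;
        return NULL;
    }

    int main(void)
    {
        void *res = slp_switch(demo_save_state, demo_restore_state, (void *)"demo");
        /* save_state() returned NULL above, so restore_state() was never
           called and slp_switch() itself returns NULL here */
        printf("slp_switch returned %p\n", res);
        return 0;
    }
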
diff --git a/pypy/translator/c/src/stacklet/tests.c b/pypy/translator/c/src/stacklet/tests.c
new file mode 100644
--- /dev/null
+++ b/pypy/translator/c/src/stacklet/tests.c
@@ -0,0 +1,671 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <assert.h>
+#include "stacklet.h"
+
+
+static stacklet_thread_handle thrd;
+
+/************************************************************/
+
+stacklet_handle empty_callback(stacklet_handle h, void *arg)
+{
+  assert(arg == (void *)123);
+  return h;
+}
+
+void test_new(void)
+{
+  stacklet_handle h = stacklet_new(thrd, empty_callback, (void *)123);
+  assert(h == EMPTY_STACKLET_HANDLE);
+}
+
+/************************************************************/
+
+static int status;
+
+stacklet_handle switchbackonce_callback(stacklet_handle h, void *arg)
+{
+  assert(arg == (void *)123);
+  assert(status == 0);
+  status = 1;
+  assert(h != EMPTY_STACKLET_HANDLE);
+  h = stacklet_switch(thrd, h);
+  assert(status == 2);
+  assert(h != EMPTY_STACKLET_HANDLE);
+  status = 3;
+  return h;
+}
+
+void test_simple_switch(void)
+{
+  status = 0;
+  stacklet_handle h = stacklet_new(thrd, switchbackonce_callback, (void *)123);
+  assert(h != EMPTY_STACKLET_HANDLE);
+  assert(status == 1);
+  status = 2;
+  h = stacklet_switch(thrd, h);
+  assert(status == 3);
+  assert(h == EMPTY_STACKLET_HANDLE);
+}
+
+/************************************************************/
+
+static stacklet_handle handles[10];
+static int nextstep, comefrom, gointo;
+static const int statusmax = 5000;
+
+int withdepth(int self, float d);
+
+stacklet_handle variousdepths_callback(stacklet_handle h, void *arg)
+{
+  int self, n;
+  assert(nextstep == status);
+  nextstep = -1;
+  self = (ptrdiff_t)arg;
+  assert(self == gointo);
+  assert(0 <= self && self < 10);
+  assert(handles[self] == NULL);
+  assert(0 <= comefrom && comefrom < 10);
+  assert(handles[comefrom] == NULL);
+  assert(h != NULL && h != EMPTY_STACKLET_HANDLE);
+  handles[comefrom] = h;
+  comefrom = -1;
+  gointo = -1;
+
+  while (withdepth(self, rand() % 20) == 0)
+    ;
+
+  assert(handles[self] == NULL);
+
+  do {
+    n = rand() % 10;
+  } while (handles[n] == NULL);
+
+  h = handles[n];
+  assert(h != EMPTY_STACKLET_HANDLE);
+  handles[n] = NULL;
+  comefrom = -42;
+  gointo = n;
+  assert(nextstep == -1);
+  nextstep = ++status;
+  //printf("LEAVING %d to go to %d\n", self, n);
+  return h;
+}
+
+typedef struct foo_s {
+  int self;
+  float d;
+  struct foo_s *next;
+} foo_t;
+
+int withdepth(int self, float d)
+{
+  int res = 0;
+  if (d > 0.0)
+    {
+      foo_t *foo = malloc(sizeof(foo_t));
+      foo_t *foo2 = malloc(sizeof(foo_t));
+      foo->self = self;
+      foo->d = d;
+      foo->next = foo2;
+      foo2->self = self + 100;
+      foo2->d = d;
+      foo2->next = NULL;
+      res = withdepth(self, d - 1.1);
+      assert(foo->self == self);
+      assert(foo->d    == d);
+      assert(foo->next == foo2);
+      assert(foo2->self == self + 100);
+      assert(foo2->d    == d);
+      assert(foo2->next == NULL);
+      free(foo2);
+      free(foo);
+    }
+  else
+    {
+      stacklet_handle h;
+      int n = rand() % 10;
+      if (n == self || (status >= statusmax && handles[n] == NULL))
+        return 1;
+
+      //printf("status == %d, self = %d\n", status, self);
+      assert(handles[self] == NULL);
+      assert(nextstep == -1);
+      nextstep = ++status;
+      comefrom = self;
+      gointo = n;
+      if (handles[n] == NULL)
+        {
+          /* start a new stacklet */
+          //printf("new %d\n", n);
+          h = stacklet_new(thrd, variousdepths_callback, (void *)(ptrdiff_t)n);
+        }
+      else
+        {
+          /* switch to this stacklet */
+          //printf("switch to %d\n", n);
+          h = handles[n];
+          handles[n] = NULL;
+          h = stacklet_switch(thrd, h);
+        }
+      //printf("back in self = %d, coming from %d\n", self, comefrom);
+      assert(nextstep == status);
+      nextstep = -1;
+      assert(gointo == self);
+      assert(comefrom != self);
+      assert(handles[self] == NULL);
+      if (comefrom != -42)
+        {
+          assert(0 <= comefrom && comefrom < 10);
+          assert(handles[comefrom] == NULL);
+          handles[comefrom] = h;
+        }
+      else
+        assert(h == EMPTY_STACKLET_HANDLE);
+      comefrom = -1;
+      gointo = -1;
+    }
+  assert((res & (res-1)) == 0);   /* to prevent a tail-call to withdepth() */
+  return res;
+}
+
+int any_alive(void)
+{
+  int i;
+  for (i=0; i<10; i++)
+    if (handles[i] != NULL)
+      return 1;
+  return 0;
+}
+
+void test_various_depths(void)
+{
+  int i;
+  for (i=0; i<10; i++)
+    handles[i] = NULL;
+
+  nextstep = -1;
+  comefrom = -1;
+  status = 0;
+  while (status < statusmax || any_alive())
+    withdepth(0, rand() % 50);
+}
+
+/************************************************************/
+#if 0
+
+static tealet_t *runner1(tealet_t *cur)
+{
+  abort();
+}
+
+void test_new_pending(void)
+{
+  tealet_t *g1 = tealet_new();
+  tealet_t *g2 = tealet_new();
+  int r1 = tealet_fill(g1, runner1);
+  int r2 = tealet_fill(g2, runner1);
+  assert(r1 == TEALET_OK);
+  assert(r2 == TEALET_OK);
+  assert(g1->suspended == 1);
+  assert(g2->suspended == 1);
+  tealet_delete(g1);
+  tealet_delete(g2);
+}
+
+/************************************************************/
+
+void test_not_switched(void)
+{
+  tealet_t *g1 = tealet_new();
+  tealet_t *g2 = tealet_new();
+  tealet_t *g3 = tealet_switch(g2, g1);
+  assert(!TEALET_ERROR(g3));
+  assert(g3 == g1);
+  tealet_delete(g1);
+  tealet_delete(g2);
+}
+
+/************************************************************/
+
+static tealet_t *g_main;
+
+static void step(int newstatus)
+{
+  assert(status == newstatus - 1);
+  status = newstatus;
+}
+
+static tealet_t *simple_run(tealet_t *t1)
+{
+  assert(t1 != g_main);
+  step(2);
+  tealet_delete(t1);
+  return g_main;
+}
+
+void test_simple(void)
+{
+  tealet_t *t1, *tmain;
+  int res;
+
+  status = 0;
+  g_main = tealet_new();
+  t1 = tealet_new();
+  res = tealet_fill(t1, simple_run);
+  assert(res == TEALET_OK);
+  step(1);
+  tmain = tealet_switch(g_main, t1);
+  step(3);
+  assert(tmain == g_main);
+  tealet_delete(g_main);
+  step(4);
+}
+
+/************************************************************/
+
+static tealet_t *simple_exit(tealet_t *t1)
+{
+  int res;
+  assert(t1 != g_main);
+  step(2);
+  tealet_delete(t1);
+  res = tealet_exit_to(g_main);
+  assert(!"oups");
+}
+
+void test_exit(void)
+{
+  tealet_t *t1, *tmain;
+  int res;
+
+  status = 0;
+  g_main = tealet_new();
+  t1 = tealet_new();
+  res = tealet_fill(t1, simple_exit);
+  assert(res == TEALET_OK);
+  step(1);
+  tmain = tealet_switch(g_main, t1);
+  step(3);
+  assert(tmain == g_main);
+  tealet_delete(g_main);
+  step(4);
+}
+
+/************************************************************/
+
+static tealet_t *g_other;
+
+static tealet_t *three_run_1(tealet_t *t1)
+{
+  assert(t1 != g_main);
+  assert(t1 != g_other);
+  step(2);
+  tealet_delete(t1);
+  return g_other;
+}
+
+static tealet_t *three_run_2(tealet_t *t2)
+{
+  assert(t2 == g_other);
+  step(3);
+  tealet_delete(t2);
+  return g_main;
+}
+
+void test_three_tealets(void)
+{
+  tealet_t *t1, *t2, *tmain;
+  int res;
+
+  status = 0;
+  g_main = tealet_new();
+  t1 = tealet_new();
+  t2 = tealet_new();
+  res = tealet_fill(t1, three_run_1);
+  assert(res == TEALET_OK);
+  res = tealet_fill(t2, three_run_2);
+  assert(res == TEALET_OK);
+  step(1);
+  g_other = t2;
+  tmain = tealet_switch(g_main, t1);
+  step(4);
+  assert(tmain == g_main);
+  tealet_delete(g_main);
+  step(5);
+}
+
+/************************************************************/
+
+static tealet_t *glob_t1;
+static tealet_t *glob_t2;
+
+tealet_t *test_switch_2(tealet_t *t2)
+{
+  assert(t2 != g_main);
+  assert(t2 != glob_t1);
+  glob_t2 = t2;
+
+  step(2);
+  t2 = tealet_switch(glob_t2, glob_t1);
+  assert(t2 == glob_t2);
+
+  step(4);
+  assert(glob_t1->suspended == 1);
+  t2 = tealet_switch(glob_t2, glob_t1);
+  assert(t2 == glob_t2);
+
+  step(6);
+  assert(glob_t1->suspended == 0);
+  t2 = tealet_switch(glob_t2, glob_t1);
+  assert(t2 == glob_t1);
+  printf("ok!\n");
+
+  return g_main;
+}
+
+tealet_t *test_switch_1(tealet_t *t1)
+{
+  tealet_t *t2 = tealet_new();
+  assert(t1 != g_main);
+  tealet_fill(t2, test_switch_2);
+  glob_t1 = t1;
+
+  step(1);
+  t1 = tealet_switch(glob_t1, t2);
+  assert(t1 == glob_t1);
+  assert(t2 == glob_t2);
+
+  step(3);
+  t1 = tealet_switch(glob_t1, t2);
+  assert(t1 == glob_t1);
+  assert(t2 == glob_t2);
+
+  step(5);
+  return t2;
+}
+
+void test_switch(void)
+{
+  int res;
+  tealet_t *t, *t2;
+
+  g_main = tealet_new();
+  status = 0;
+  t = tealet_new();
+  res = tealet_fill(t, test_switch_1);
+  assert(res == TEALET_OK);
+  t2 = tealet_switch(g_main, t);
+  assert(!TEALET_ERROR(t2));
+
+  step(7);
+  tealet_delete(g_main);
+  tealet_delete(glob_t1);
+  tealet_delete(glob_t2);
+}
+
+/************************************************************/
+
+#define ARRAYSIZE  127
+#define MAX_STATUS 50000
+
+static tealet_t *tealetarray[ARRAYSIZE] = {NULL};
+static int got_index;
+
+tealet_t *random_new_tealet(tealet_t*);
+
+static void random_run(tealet_t* cur, int index)
+{
+  int i, prevstatus;
+  tealet_t *t, *tres;
+  assert(tealetarray[index] == cur);
+  do
+    {
+      i = rand() % (ARRAYSIZE + 1);
+      status += 1;
+      if (i == ARRAYSIZE)
+        break;
+      prevstatus = status;
+      got_index = i;
+      if (tealetarray[i] == NULL)
+        {
+          if (status >= MAX_STATUS)
+            break;
+          t = tealet_new();
+          tealet_fill(t, random_new_tealet);
+          t->data = (void*)(ptrdiff_t)i;
+        }
+      else
+        {
+          t = tealetarray[i];
+        }
+      tres = tealet_switch(cur, t);
+      assert(tres == cur);
+
+      assert(status >= prevstatus);
+      assert(tealetarray[index] == cur);
+      assert(got_index == index);
+    }
+  while (status < MAX_STATUS);
+}
+
+tealet_t *random_new_tealet(tealet_t* cur)
+{
+  int i = got_index;
+  assert(i == (ptrdiff_t)(cur->data));
+  assert(i > 0 && i < ARRAYSIZE);
+  assert(tealetarray[i] == NULL);
+  tealetarray[i] = cur;
+  random_run(cur, i);
+  tealetarray[i] = NULL;
+  tealet_delete(cur);
+
+  i = rand() % ARRAYSIZE;
+  if (tealetarray[i] == NULL)
+    {
+      assert(tealetarray[0] != NULL);
+      i = 0;
+    }
+  got_index = i;
+  return tealetarray[i];
+}
+
+void test_random(void)
+{
+  int i;
+  g_main = tealet_new();
+  for( i=0; i<ARRAYSIZE; i++)
+      tealetarray[i] = NULL;
+  tealetarray[0] = g_main;
+  status = 0;
+  while (status < MAX_STATUS)
+    random_run(g_main, 0);
+
+  assert(g_main == tealetarray[0]);
+  for (i=1; i<ARRAYSIZE; i++)
+    while (tealetarray[i] != NULL)
+      random_run(g_main, 0);
+
+  tealet_delete(g_main);
+}
+
+/************************************************************/
+
+tealet_t *test_double_run(tealet_t *current)
+{
+  double d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, *numbers;
+  numbers = (double *)current->data;
+  d0 = numbers[0] + 1 / 1.0;
+  d1 = numbers[1] + 1 / 2.0;
+  d2 = numbers[2] + 1 / 4.0;
+  d3 = numbers[3] + 1 / 8.0;
+  d4 = numbers[4] + 1 / 16.0;
+  d5 = numbers[5] + 1 / 32.0;
+  d6 = numbers[6] + 1 / 64.0;
+  d7 = numbers[7] + 1 / 128.0;
+  d8 = numbers[8] + 1 / 256.0;
+  d9 = numbers[9] + 1 / 512.0;
+  numbers[0] = d0;
+  numbers[1] = d1;
+  numbers[2] = d2;
+  numbers[3] = d3;
+  numbers[4] = d4;
+  numbers[5] = d5;
+  numbers[6] = d6;
+  numbers[7] = d7;
+  numbers[8] = d8;
+  numbers[9] = d9;
+  tealet_delete(current);
+  return g_main;
+}
+
+void test_double(void)
+{
+  int i;
+  double d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, numbers[10];
+  g_main = tealet_new();
+
+  d0 = d1 = d2 = d3 = d4 = d5 = d6 = d7 = d8 = d9 = 0.0;
+  for (i=0; i<10; i++)
+    numbers[i] = 0.0;
+
+  for (i=0; i<99; i++)
+    {
+      tealet_t *t = tealet_new();
+      tealet_t *tres;
+      tealet_fill(t, test_double_run);
+      t->data = numbers;
+      tres = tealet_switch(g_main, t);
+      assert(tres == g_main);
+      d0 += numbers[0];
+      d1 += numbers[1];
+      d2 += numbers[2];
+      d3 += numbers[3];
+      d4 += numbers[4];
+      d5 += numbers[5];
+      d6 += numbers[6];
+      d7 += numbers[7];
+      d8 += numbers[8];
+      d9 += numbers[9];
+    }
+
+  assert(d0 == 4950.0 / 1.0);
+  assert(d1 == 4950.0 / 2.0);
+  assert(d2 == 4950.0 / 4.0);
+  assert(d3 == 4950.0 / 8.0);
+  assert(d4 == 4950.0 / 16.0);
+  assert(d5 == 4950.0 / 32.0);
+  assert(d6 == 4950.0 / 64.0);
+  assert(d7 == 4950.0 / 128.0);
+  assert(d8 == 4950.0 / 256.0);
+  assert(d9 == 4950.0 / 512.0);
+  tealet_delete(g_main);
+}
+
+/************************************************************/
+
+static tealet_t *g_main2, *g_sub, *g_sub2;
+
+tealet_t *test_two_mains_green(tealet_t *current)
+{
+  tealet_t *tres;
+  assert(current == g_sub2);
+
+  step(3); printf("3 G: M1 [S1]  M2 [S2]\n");
+  tres = tealet_switch(g_sub, g_main);
+  assert(tres == g_sub);
+
+  step(6); printf("6 G: M1 [S1]  [M2] S2\n");
+  return g_sub2;
+}
+
+tealet_t *test_two_mains_red(tealet_t *current)
+{
+  tealet_t *tres;
+  assert(current == g_sub);
+
+  step(2); printf("2 R: M1 [S1]  [M2] S2\n");
+  tres = tealet_switch(g_main2, g_sub2);
+  assert(tres == g_main2);
+
+  step(5); printf("5 R: [M1] S1  [M2] S2\n");
+  return g_sub;
+}
+
+void test_two_mains(void)
+{
+  int res;
+  tealet_t *tres;
+
+  status = 0;
+  g_main = tealet_new();
+  g_main2 = tealet_new();
+  g_sub = tealet_new();
+  g_sub2 = tealet_new();
+  res = tealet_fill(g_sub, test_two_mains_red);
+  assert(res == TEALET_OK);
+  res = tealet_fill(g_sub2, test_two_mains_green);
+  assert(res == TEALET_OK);
+
+  step(1); printf("1 W: [M1] S1  [M2] S2\n");
+  tres = tealet_switch(g_main, g_sub);
+  assert(tres == g_main);
+
+  step(4); printf("4 W: [M1] S1  M2 [S2]\n");
+  tres = tealet_switch(g_sub2, g_main2);
+  assert(tres == g_sub2);
+
+  step(7); printf("7 W: M1 [S1]  M2 [S2]\n");
+
+  tealet_delete(g_main);
+  tealet_delete(g_main2);
+  tealet_delete(g_sub);
+  tealet_delete(g_sub2);
+}
+#endif
+/************************************************************/
+
+#define TEST(name)   { name, #name }
+
+typedef struct {
+  void (*runtest)(void);
+  const char *name;
+} test_t;
+
+static test_t test_list[] = {
+  TEST(test_new),
+  TEST(test_simple_switch),
+  TEST(test_various_depths),
+#if 0
+  TEST(test_new_pending),
+  TEST(test_not_switched),
+  TEST(test_simple),
+  TEST(test_exit),
+  TEST(test_three_tealets),
+  TEST(test_two_mains),
+  TEST(test_switch),
+  TEST(test_double),
+  TEST(test_random),
+#endif
+  { NULL, NULL }
+};
+
+
+int main(int argc, char **argv)
+{
+  test_t *tst;
+  if (argc > 1)
+    srand(atoi(argv[1]));
+
+  thrd = stacklet_newthread();
+  for (tst=test_list; tst->runtest; tst++)
+    {
+      printf("+++ Running %s... +++\n", tst->name);
+      tst->runtest();
+    }
+  stacklet_deletethread(thrd);
+  printf("+++ All ok. +++\n");
+  return 0;
+}
diff --git a/pypy/translator/c/test/test_boehm.py b/pypy/translator/c/test/test_boehm.py
--- a/pypy/translator/c/test/test_boehm.py
+++ b/pypy/translator/c/test/test_boehm.py
@@ -17,7 +17,6 @@
 
 class AbstractGCTestClass(object):
     gcpolicy = "boehm"
-    stacklessgc = False
     use_threads = False
    
     # deal with cleanups
@@ -34,8 +33,6 @@
         config = get_pypy_config(translating=True)
         config.translation.gc = self.gcpolicy
         config.translation.thread = self.use_threads
-        if self.stacklessgc:
-            config.translation.gcrootfinder = "stackless"
         config.translation.simplifying = True
         t = TranslationContext(config=config)
         self.t = t
diff --git a/pypy/translator/c/test/test_newgc.py b/pypy/translator/c/test/test_newgc.py
--- a/pypy/translator/c/test/test_newgc.py
+++ b/pypy/translator/c/test/test_newgc.py
@@ -441,6 +441,45 @@
     def test_del_raises(self):
         self.run('del_raises') # does not raise
 
+    def define_custom_trace(cls):
+        from pypy.rpython.annlowlevel import llhelper
+        from pypy.rpython.lltypesystem import llmemory
+        #
+        S = lltype.GcStruct('S', ('x', llmemory.Address), rtti=True)
+        offset_of_x = llmemory.offsetof(S, 'x')
+        def customtrace(obj, prev):
+            if not prev:
+                return obj + offset_of_x
+            else:
+                return llmemory.NULL
+        CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                          llmemory.Address)
+        customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+        lltype.attachRuntimeTypeInfo(S, customtraceptr=customtraceptr)
+        #
+        def setup():
+            s = lltype.nullptr(S)
+            for i in range(10000):
+                t = lltype.malloc(S)
+                t.x = llmemory.cast_ptr_to_adr(s)
+                s = t
+            return s
+        def measure_length(s):
+            res = 0
+            while s:
+                res += 1
+                s = llmemory.cast_adr_to_ptr(s.x, lltype.Ptr(S))
+            return res
+        def f(n):
+            s1 = setup()
+            llop.gc__collect(lltype.Void)
+            return measure_length(s1)
+        return f
+
+    def test_custom_trace(self):
+        res = self.run('custom_trace', 0)
+        assert res == 10000
+
     def define_weakref(cls):
         import weakref
 
diff --git a/pypy/translator/c/test/test_stackless.py b/pypy/translator/c/test/test_stackless.py
deleted file mode 100644
--- a/pypy/translator/c/test/test_stackless.py
+++ /dev/null
@@ -1,280 +0,0 @@
-from pypy.translator.translator import TranslationContext
-from pypy.translator.backendopt.all import backend_optimizations
-from pypy.translator.c.genc import CStandaloneBuilder
-from pypy.translator.c import gc
-from pypy.annotation.listdef import s_list_of_strings
-from pypy.rlib.rstack import stack_unwind, stack_frames_depth
-from pypy.rlib.rstack import yield_current_frame_to_caller, set_stack_depth_limit
-from pypy.config.config import Config
-import os
-
-
-class StacklessTest(object):
-    backendopt = False
-    gcpolicy = "boehm"
-    stacklessgc = False
-
-    def setup_class(cls):
-        import py
-        if cls.gcpolicy in (None, "ref"):
-            import py
-            py.test.skip("stackless + refcounting doesn't work any more for now")
-        elif cls.gcpolicy == "boehm":
-            from pypy.rpython.tool.rffi_platform import configure_boehm
-            from pypy.translator.platform import CompilationError
-            try:
-                configure_boehm()
-            except CompilationError:
-                py.test.skip("Boehm GC not present")
-
-    def wrap_stackless_function(self, fn):
-        def entry_point(argv):
-            os.write(1, str(fn())+"\n")
-            return 0
-
-        from pypy.config.pypyoption import get_pypy_config
-        config = get_pypy_config(translating=True)
-        config.translation.gc = self.gcpolicy
-        config.translation.stackless = True
-        if self.stacklessgc:
-            config.translation.gcrootfinder = "stackless"
-        t = TranslationContext(config=config)
-        self.t = t
-        t.buildannotator().build_types(entry_point, [s_list_of_strings])
-        t.buildrtyper().specialize()
-        if self.backendopt:
-            backend_optimizations(t)
-
-        from pypy.translator.transform import insert_ll_stackcheck
-        insert_ll_stackcheck(t)
-
-        cbuilder = CStandaloneBuilder(t, entry_point, config=config)
-        cbuilder.stackless = True
-        cbuilder.generate_source()
-        cbuilder.compile()
-        res = cbuilder.cmdexec('')
-        return int(res.strip())
-
-# ____________________________________________________________
-
-
-class TestStackless(StacklessTest):
-
-    def test_stack_depth(self):
-        def g1():
-            "just to check Void special cases around the code"
-        def g2(ignored):
-            g1()
-        def f(n):
-            g1()
-            if n > 0:
-                res = f(n-1) + 0 # make sure it is not a tail call
-            else:
-                res = stack_frames_depth()
-            g2(g1)
-            return res
-
-        def fn():
-            count0 = f(0)
-            count10 = f(10)
-            return count10 - count0
-
-        res = self.wrap_stackless_function(fn)
-        assert res == 10
-
-    def test_stack_withptr(self):
-        def f(n):
-            if n > 0:
-                res, dummy = f(n-1)
-            else:
-                res, dummy = stack_frames_depth(), 1
-            return res, dummy
-
-        def fn():
-            count0, _ = f(0)
-            count10, _ = f(10)
-            return count10 - count0
-
-        res = self.wrap_stackless_function(fn)
-        assert res == 10
-
-    def test_stackless_manytimes(self):
-        def f(n):
-            if n > 0:
-                stack_frames_depth()
-                res, dummy = f(n-1)
-            else:
-                res, dummy = stack_frames_depth(), 1
-            return res, dummy
-
-        def fn():
-            count0, _ = f(0)
-            count10, _ = f(100)
-            return count10 - count0
-
-        res = self.wrap_stackless_function(fn)
-        assert res == 100
-
-    def test_stackless_arguments(self):
-        def f(n, d, t):
-            if n > 0:
-                a, b, c = f(n-1, d, t)
-            else:
-                a, b, c = stack_frames_depth(), d, t
-            return a, b, c
-
-        def fn():
-            count0, d, t = f(0, 5.5, (1, 2))
-            count10, d, t = f(10, 5.5, (1, 2))
-            result = (count10 - count0) * 1000000
-            result += t[0]              * 10000
-            result += t[1]              * 100
-            result += int(d*10)
-            return result
-
-        res = self.wrap_stackless_function(fn)
-        assert res == 10010255
-
-
-    def test_stack_unwind(self):
-        def f():
-            stack_unwind()
-            return 42
-
-        res = self.wrap_stackless_function(f)
-        assert res == 42
-
-    def test_auto_stack_unwind(self):
-        import sys
-        def f(n):
-            if n == 1:
-                return 1
-            return (n+f(n-1)) % 1291
-
-        def fn():
-            set_stack_depth_limit(sys.maxint)
-            return f(10**6)
-        res = self.wrap_stackless_function(fn)
-        assert res == 704
-
-    def test_yield_frame(self):
-
-        def g(lst):
-            lst.append(2)
-            frametop_before_5 = yield_current_frame_to_caller()
-            lst.append(4)
-            frametop_before_7 = frametop_before_5.switch()
-            lst.append(6)
-            return frametop_before_7
-
-        def f():
-            lst = [1]
-            frametop_before_4 = g(lst)
-            lst.append(3)
-            frametop_before_6 = frametop_before_4.switch()
-            lst.append(5)
-            frametop_after_return = frametop_before_6.switch()
-            lst.append(7)
-            assert frametop_after_return is None
-            n = 0
-            for i in lst:
-                n = n*10 + i
-            return n
-
-        res = self.wrap_stackless_function(f)
-        assert res == 1234567
-
-    def test_foo(self):
-        def f():
-            c = g()
-            c.switch()
-            return 1
-        def g():
-            d = yield_current_frame_to_caller()
-            return d
-        res = self.wrap_stackless_function(f)
-        assert res == 1
-        
-
-    def test_yield_noswitch_frame(self):
-        # this time we make sure that function 'g' does not
-        # need to switch and even does not need to be stackless
-
-        def g(lst):
-            lst.append(2)
-            frametop_before_5 = yield_current_frame_to_caller()
-            lst.append(4)
-            return frametop_before_5
-
-        def f():
-            lst = [1]
-            frametop_before_4 = g(lst)
-            lst.append(3)
-            frametop_after_return = frametop_before_4.switch()
-            lst.append(5)
-            assert frametop_after_return is None
-            n = 0
-            for i in lst:
-                n = n*10 + i
-            return n
-
-        res = self.wrap_stackless_function(f)
-        assert res == 12345
-
-    # tested with refcounting too for sanity checking
-    def test_yield_frame_mem_pressure(self):
-
-        class A:
-            def __init__(self, value):
-                self.lst = [0] * 10000
-                self.lst[5000] = value
-
-            def inc(self, delta):
-                self.lst[5000] += delta
-                return self.lst[5000]
-
-        def g(lst):
-            a = A(1)
-            lst.append(a.inc(1))
-            frametop_before_5 = yield_current_frame_to_caller()
-            malloc_a_lot()
-            lst.append(a.inc(2))
-            frametop_before_7 = frametop_before_5.switch()
-            malloc_a_lot()
-            lst.append(a.inc(2))
-            return frametop_before_7
-
-        def f():
-            lst = [1]
-            frametop_before_4 = g(lst)
-            lst.append(3)
-            malloc_a_lot()
-            frametop_before_6 = frametop_before_4.switch()
-            lst.append(5)
-            malloc_a_lot()
-            frametop_after_return = frametop_before_6.switch()
-            lst.append(7)
-            assert frametop_after_return is None
-            n = 0
-            for i in lst:
-                n = n*10 + i
-            return n
-
-        res = self.wrap_stackless_function(f)
-        assert res == 1234567
-
-
-# ____________________________________________________________
-
-def malloc_a_lot():
-    i = 0
-    while i < 10:
-        i += 1
-        a = [1] * 10
-        j = 0
-        while j < 20:
-            j += 1
-            a.append(j)
-    from pypy.rpython.lltypesystem.lloperation import llop
-    from pypy.rpython.lltypesystem import lltype
-    llop.gc__collect(lltype.Void)
diff --git a/pypy/translator/c/test/test_tasklets.py b/pypy/translator/c/test/test_tasklets.py
deleted file mode 100644
--- a/pypy/translator/c/test/test_tasklets.py
+++ /dev/null
@@ -1,497 +0,0 @@
-import py
-import os
-
-from pypy.rpython.lltypesystem.llmemory import NULL
-from pypy.rlib.rstack import yield_current_frame_to_caller
-
-# ____________________________________________________________
-# For testing
-
-from pypy.translator.tool import cbuild
-from pypy.translator.c import gc
-from pypy.translator.c.test import test_stackless
-
-# count of loops in tests (set lower to speed up)
-loops = 1
-    
-def debug(s):
-    os.write(2, "%s\n" % s)
-
-class Globals:
-    def __init__(self):
-        pass
-
-globals = Globals()
-globals.count = 0
-
-# ____________________________________________________________
-
-class ThreadLocals(object):
-    pass
-threadlocals = ThreadLocals()
-
-class Resumable(object):
-    def __init__(self):
-        self.alive = False
-        
-    def start(self):

