[pypy-commit] pypy numpy-comparison: merge default

Fri Sep 2 14:46:01 CEST 2011

Author: Ilya Osadchiy <osadchiy.ilya at gmail.com>
Branch: numpy-comparison
Changeset: r47015:ec77dcd63e59
Date: 2011-09-02 11:15 +0300
http://bitbucket.org/pypy/pypy/changeset/ec77dcd63e59/

Log:	merge default

diff too long, truncating to 10000 out of 18343 lines

diff --git a/lib-python/modified-2.7/ctypes/util.py b/lib-python/modified-2.7/ctypes/util.py
--- a/lib-python/modified-2.7/ctypes/util.py
+++ b/lib-python/modified-2.7/ctypes/util.py
@@ -72,8 +72,8 @@
         return name
 
 if os.name == "posix" and sys.platform == "darwin":
-    from ctypes.macholib.dyld import dyld_find as _dyld_find
     def find_library(name):
+        from ctypes.macholib.dyld import dyld_find as _dyld_find
         possible = ['lib%s.dylib' % name,
                     '%s.dylib' % name,
                     '%s.framework/%s' % (name, name)]
diff --git a/lib-python/modified-2.7/gzip.py b/lib-python/modified-2.7/gzip.py
new file mode 100644
--- /dev/null
+++ b/lib-python/modified-2.7/gzip.py
@@ -0,0 +1,514 @@
+"""Functions that read and write gzipped files.
+
+The user of the file doesn't have to worry about the compression,
+but random access is not allowed."""
+
+# based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+import struct, sys, time, os
+import zlib
+import io
+import __builtin__
+
+__all__ = ["GzipFile","open"]
+
+FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
+
+READ, WRITE = 1, 2
+
+def write32u(output, value):
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
+    output.write(struct.pack("<L", value))
+
+def read32(input):
+    return struct.unpack("<I", input.read(4))[0]
+
+def open(filename, mode="rb", compresslevel=9):
+    """Shorthand for GzipFile(filename, mode, compresslevel).
+
+    The filename argument is required; mode defaults to 'rb'
+    and compresslevel defaults to 9.
+
+    """
+    return GzipFile(filename, mode, compresslevel)
+
+class GzipFile(io.BufferedIOBase):
+    """The GzipFile class simulates most of the methods of a file object with
+    the exception of the readinto() and truncate() methods.
+
+    """
+
+    myfileobj = None
+    max_read_chunk = 10 * 1024 * 1024   # 10Mb
+
+    def __init__(self, filename=None, mode=None,
+                 compresslevel=9, fileobj=None, mtime=None):
+        """Constructor for the GzipFile class.
+
+        At least one of fileobj and filename must be given a
+        non-trivial value.
+
+        The new class instance is based on fileobj, which can be a regular
+        file, a StringIO object, or any other object which simulates a file.
+        It defaults to None, in which case filename is opened to provide
+        a file object.
+
+        When fileobj is not None, the filename argument is only used to be
+        included in the gzip file header, which may include the original
+        filename of the uncompressed file.  It defaults to the filename of
+        fileobj, if discernible; otherwise, it defaults to the empty string,
+        and in this case the original filename is not included in the header.
+
+        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
+        depending on whether the file will be read or written.  The default
+        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+        Be aware that only the 'rb', 'ab', and 'wb' values should be used
+        for cross-platform portability.
+
+        The compresslevel argument is an integer from 1 to 9 controlling the
+        level of compression; 1 is fastest and produces the least compression,
+        and 9 is slowest and produces the most compression.  The default is 9.
+
+        The mtime argument is an optional numeric timestamp to be written
+        to the stream when compressing.  All gzip compressed streams
+        are required to contain a timestamp.  If omitted or None, the
+        current time is used.  This module ignores the timestamp when
+        decompressing; however, some programs, such as gunzip, make use
+        of it.  The format of the timestamp is the same as that of the
+        return value of time.time() and of the st_mtime member of the
+        object returned by os.stat().
+
+        """
+
+        # guarantee the file is opened in binary mode on platforms
+        # that care about that sort of thing
+        if mode and 'b' not in mode:
+            mode += 'b'
+        if fileobj is None:
+            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
+        if filename is None:
+            if hasattr(fileobj, 'name'): filename = fileobj.name
+            else: filename = ''
+        if mode is None:
+            if hasattr(fileobj, 'mode'): mode = fileobj.mode
+            else: mode = 'rb'
+
+        if mode[0:1] == 'r':
+            self.mode = READ
+            # Set flag indicating start of a new member
+            self._new_member = True
+            # Buffer data read from gzip file. extrastart is offset in
+            # stream where buffer starts. extrasize is number of
+            # bytes remaining in buffer from current stream position.
+            self.extrabuf = ""
+            self.extrasize = 0
+            self.extrastart = 0
+            self.name = filename
+            # Starts small, scales exponentially
+            self.min_readsize = 100
+
+        elif mode[0:1] == 'w' or mode[0:1] == 'a':
+            self.mode = WRITE
+            self._init_write(filename)
+            self.compress = zlib.compressobj(compresslevel,
+                                             zlib.DEFLATED,
+                                             -zlib.MAX_WBITS,
+                                             zlib.DEF_MEM_LEVEL,
+                                             0)
+        else:
+            raise IOError, "Mode " + mode + " not supported"
+
+        self.fileobj = fileobj
+        self.offset = 0
+        self.mtime = mtime
+
+        if self.mode == WRITE:
+            self._write_gzip_header()
+
+    @property
+    def filename(self):
+        import warnings
+        warnings.warn("use the name attribute", DeprecationWarning, 2)
+        if self.mode == WRITE and self.name[-3:] != ".gz":
+            return self.name + ".gz"
+        return self.name
+
+    def __repr__(self):
+        s = repr(self.fileobj)
+        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
+
+    def _check_closed(self):
+        """Raises a ValueError if the underlying file object has been closed.
+
+        """
+        if self.closed:
+            raise ValueError('I/O operation on closed file.')
+
+    def _init_write(self, filename):
+        self.name = filename
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+        self.writebuf = []
+        self.bufsize = 0
+
+    def _write_gzip_header(self):
+        self.fileobj.write('\037\213')             # magic header
+        self.fileobj.write('\010')                 # compression method
+        fname = os.path.basename(self.name)
+        if fname.endswith(".gz"):
+            fname = fname[:-3]
+        flags = 0
+        if fname:
+            flags = FNAME
+        self.fileobj.write(chr(flags))
+        mtime = self.mtime
+        if mtime is None:
+            mtime = time.time()
+        write32u(self.fileobj, long(mtime))
+        self.fileobj.write('\002')
+        self.fileobj.write('\377')
+        if fname:
+            self.fileobj.write(fname + '\000')
+
+    def _init_read(self):
+        self.crc = zlib.crc32("") & 0xffffffffL
+        self.size = 0
+
+    def _read_gzip_header(self):
+        magic = self.fileobj.read(2)
+        if magic != '\037\213':
+            raise IOError, 'Not a gzipped file'
+        method = ord( self.fileobj.read(1) )
+        if method != 8:
+            raise IOError, 'Unknown compression method'
+        flag = ord( self.fileobj.read(1) )
+        self.mtime = read32(self.fileobj)
+        # extraflag = self.fileobj.read(1)
+        # os = self.fileobj.read(1)
+        self.fileobj.read(2)
+
+        if flag & FEXTRA:
+            # Read & discard the extra field, if present
+            xlen = ord(self.fileobj.read(1))
+            xlen = xlen + 256*ord(self.fileobj.read(1))
+            self.fileobj.read(xlen)
+        if flag & FNAME:
+            # Read and discard a null-terminated string containing the filename
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FCOMMENT:
+            # Read and discard a null-terminated string containing a comment
+            while True:
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
+        if flag & FHCRC:
+            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
+
+    def write(self,data):
+        self._check_closed()
+        if self.mode != WRITE:
+            import errno
+            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
+
+        if self.fileobj is None:
+            raise ValueError, "write() on closed GzipFile object"
+
+        # Convert data type if called by io.BufferedWriter.
+        if isinstance(data, memoryview):
+            data = data.tobytes()
+
+        if len(data) > 0:
+            self.size = self.size + len(data)
+            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+            self.fileobj.write( self.compress.compress(data) )
+            self.offset += len(data)
+
+        return len(data)
+
+    def read(self, size=-1):
+        self._check_closed()
+        if self.mode != READ:
+            import errno
+            raise IOError(errno.EBADF, "read() on write-only GzipFile object")
+
+        if self.extrasize <= 0 and self.fileobj is None:
+            return ''
+
+        readsize = 1024
+        if size < 0:        # get the whole thing
+            try:
+                while True:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                size = self.extrasize
+        elif size == 0:
+            return ""
+        else:               # just get some more of it
+            try:
+                while size > self.extrasize:
+                    self._read(readsize)
+                    readsize = min(self.max_read_chunk, readsize * 2)
+            except EOFError:
+                if size > self.extrasize:
+                    size = self.extrasize
+
+        offset = self.offset - self.extrastart
+        chunk = self.extrabuf[offset: offset + size]
+        self.extrasize = self.extrasize - size
+
+        self.offset += size
+        return chunk
+
+    def _unread(self, buf):
+        self.extrasize = len(buf) + self.extrasize
+        self.offset -= len(buf)
+
+    def _read(self, size=1024):
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"
+
+        if self._new_member:
+            # If the _new_member flag is set, we have to
+            # jump to the next member, if there is one.
+            #
+            # First, check if we're at the end of the file;
+            # if so, it's time to stop; no more members to read.
+            pos = self.fileobj.tell()   # Save current position
+            self.fileobj.seek(0, 2)     # Seek to end of file
+            if pos == self.fileobj.tell():
+                raise EOFError, "Reached EOF"
+            else:
+                self.fileobj.seek( pos ) # Return to original position
+
+            self._init_read()
+            self._read_gzip_header()
+            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+            self._new_member = False
+
+        # Read a chunk of data from the file
+        buf = self.fileobj.read(size)
+
+        # If the EOF has been reached, flush the decompression object
+        # and mark this object as finished.
+
+        if buf == "":
+            uncompress = self.decompress.flush()
+            self._read_eof()
+            self._add_read_data( uncompress )
+            raise EOFError, 'Reached EOF'
+
+        uncompress = self.decompress.decompress(buf)
+        self._add_read_data( uncompress )
+
+        if self.decompress.unused_data != "":
+            # Ending case: we've come to the end of a member in the file,
+            # so seek back to the start of the unused data, finish up
+            # this member, and read a new gzip header.
+            # (The number of bytes to seek back is the length of the unused
+            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
+            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
+
+            # Check the CRC and file size, and set the flag so we read
+            # a new member on the next call
+            self._read_eof()
+            self._new_member = True
+
+    def _add_read_data(self, data):
+        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
+        offset = self.offset - self.extrastart
+        self.extrabuf = self.extrabuf[offset:] + data
+        self.extrasize = self.extrasize + len(data)
+        self.extrastart = self.offset
+        self.size = self.size + len(data)
+
+    def _read_eof(self):
+        # We've read to the end of the file, so we have to rewind in order
+        # to reread the 8 bytes containing the CRC and the file size.
+        # We check that the computed CRC and size of the
+        # uncompressed data matches the stored values.  Note that the size
+        # stored is the true file size mod 2**32.
+        self.fileobj.seek(-8, 1)
+        crc32 = read32(self.fileobj)
+        isize = read32(self.fileobj)  # may exceed 2GB
+        if crc32 != self.crc:
+            raise IOError("CRC check failed %s != %s" % (hex(crc32),
+                                                         hex(self.crc)))
+        elif isize != (self.size & 0xffffffffL):
+            raise IOError, "Incorrect length of data produced"
+
+        # Gzip files can be padded with zeroes and still have archives.
+        # Consume all zero bytes and set the file position to the first
+        # non-zero byte. See http://www.gzip.org/#faq8
+        c = "\x00"
+        while c == "\x00":
+            c = self.fileobj.read(1)
+        if c:
+            self.fileobj.seek(-1, 1)
+
+    @property
+    def closed(self):
+        return self.fileobj is None
+
+    def close(self):
+        if self.fileobj is None:
+            return
+        if self.mode == WRITE:
+            self.fileobj.write(self.compress.flush())
+            write32u(self.fileobj, self.crc)
+            # self.size may exceed 2GB, or even 4GB
+            write32u(self.fileobj, self.size & 0xffffffffL)
+            self.fileobj = None
+        elif self.mode == READ:
+            self.fileobj = None
+        if self.myfileobj:
+            self.myfileobj.close()
+            self.myfileobj = None
+
+    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+        self._check_closed()
+        if self.mode == WRITE:
+            # Ensure the compressor's buffer is flushed
+            self.fileobj.write(self.compress.flush(zlib_mode))
+            self.fileobj.flush()
+
+    def fileno(self):
+        """Invoke the underlying file object's fileno() method.
+
+        This will raise AttributeError if the underlying file object
+        doesn't support fileno().
+        """
+        return self.fileobj.fileno()
+
+    def rewind(self):
+        '''Return the uncompressed stream file position indicator to the
+        beginning of the file'''
+        if self.mode != READ:
+            raise IOError("Can't rewind in write mode")
+        self.fileobj.seek(0)
+        self._new_member = True
+        self.extrabuf = ""
+        self.extrasize = 0
+        self.extrastart = 0
+        self.offset = 0
+
+    def readable(self):
+        return self.mode == READ
+
+    def writable(self):
+        return self.mode == WRITE
+
+    def seekable(self):
+        return True
+
+    def seek(self, offset, whence=0):
+        if whence:
+            if whence == 1:
+                offset = self.offset + offset
+            else:
+                raise ValueError('Seek from end not supported')
+        if self.mode == WRITE:
+            if offset < self.offset:
+                raise IOError('Negative seek in write mode')
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.write(1024 * '\0')
+            self.write((count % 1024) * '\0')
+        elif self.mode == READ:
+            if offset == self.offset:
+                self.read(0) # to make sure that this file is open
+                return self.offset
+            if offset < self.offset:
+                # for negative seek, rewind and do positive seek
+                self.rewind()
+            count = offset - self.offset
+            for i in range(count // 1024):
+                self.read(1024)
+            self.read(count % 1024)
+
+        return self.offset
+
+    def readline(self, size=-1):
+        if size < 0:
+            # Shortcut common case - newline found in buffer.
+            offset = self.offset - self.extrastart
+            i = self.extrabuf.find('\n', offset) + 1
+            if i > 0:
+                self.extrasize -= i - offset
+                self.offset += i - offset
+                return self.extrabuf[offset: i]
+
+            size = sys.maxint
+            readsize = self.min_readsize
+        else:
+            readsize = size
+        bufs = []
+        while size != 0:
+            c = self.read(readsize)
+            i = c.find('\n')
+
+            # We set i=size to break out of the loop under two
+            # conditions: 1) there's no newline, and the chunk is
+            # larger than size, or 2) there is a newline, but the
+            # resulting line would be longer than 'size'.
+            if (size <= i) or (i == -1 and len(c) > size):
+                i = size - 1
+
+            if i >= 0 or c == '':
+                bufs.append(c[:i + 1])    # Add portion of last chunk
+                self._unread(c[i + 1:])   # Push back rest of chunk
+                break
+
+            # Append chunk to list, decrease 'size',
+            bufs.append(c)
+            size = size - len(c)
+            readsize = min(size, readsize * 2)
+        if readsize > self.min_readsize:
+            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
+        return ''.join(bufs) # Return resulting line
+
+
+def _test():
+    # Act like gzip; with -d, act like gunzip.
+    # The input file is not deleted, however, nor are any other gzip
+    # options or features supported.
+    args = sys.argv[1:]
+    decompress = args and args[0] == "-d"
+    if decompress:
+        args = args[1:]
+    if not args:
+        args = ["-"]
+    for arg in args:
+        if decompress:
+            if arg == "-":
+                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
+                g = sys.stdout
+            else:
+                if arg[-3:] != ".gz":
+                    print "filename doesn't end in .gz:", repr(arg)
+                    continue
+                f = open(arg, "rb")
+                g = __builtin__.open(arg[:-3], "wb")
+        else:
+            if arg == "-":
+                f = sys.stdin
+                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
+            else:
+                f = __builtin__.open(arg, "rb")
+                g = open(arg + ".gz", "wb")
+        while True:
+            chunk = f.read(1024)
+            if not chunk:
+                break
+            g.write(chunk)
+        if g is not sys.stdout:
+            g.close()
+        if f is not sys.stdin:
+            f.close()
+
+if __name__ == '__main__':
+    _test()
diff --git a/lib-python/modified-2.7/tarfile.py b/lib-python/modified-2.7/tarfile.py
--- a/lib-python/modified-2.7/tarfile.py
+++ b/lib-python/modified-2.7/tarfile.py
@@ -252,8 +252,8 @@
        the high bit set. So we calculate two checksums, unsigned and
        signed.
     """
-    unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
-    signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
+    unsigned_chksum = 256 + sum(struct.unpack("148B8x356B", buf[:512]))
+    signed_chksum = 256 + sum(struct.unpack("148b8x356b", buf[:512]))
     return unsigned_chksum, signed_chksum
 
 def copyfileobj(src, dst, length=None):
@@ -265,7 +265,6 @@
     if length is None:
         shutil.copyfileobj(src, dst)
         return
-
     BUFSIZE = 16 * 1024
     blocks, remainder = divmod(length, BUFSIZE)
     for b in xrange(blocks):
@@ -802,19 +801,19 @@
         if self.closed:
             raise ValueError("I/O operation on closed file")
 
-        buf = ""
         if self.buffer:
             if size is None:
-                buf = self.buffer
+                buf = self.buffer + self.fileobj.read()
                 self.buffer = ""
             else:
                 buf = self.buffer[:size]
                 self.buffer = self.buffer[size:]
-
-        if size is None:
-            buf += self.fileobj.read()
+                buf += self.fileobj.read(size - len(buf))
         else:
-            buf += self.fileobj.read(size - len(buf))
+            if size is None:
+                buf = self.fileobj.read()
+            else:
+                buf = self.fileobj.read(size)
 
         self.position += len(buf)
         return buf
diff --git a/lib_pypy/_ctypes/basics.py b/lib_pypy/_ctypes/basics.py
--- a/lib_pypy/_ctypes/basics.py
+++ b/lib_pypy/_ctypes/basics.py
@@ -166,7 +166,8 @@
     return tp._alignmentofinstances()
 
 def byref(cdata):
-    from ctypes import pointer
+    # "pointer" is imported at the end of this module to avoid circular
+    # imports
     return pointer(cdata)
 
 def cdata_from_address(self, address):
@@ -226,3 +227,6 @@
     'v' : _ffi.types.sshort,
     }
 
+
+# used by "byref"
+from _ctypes.pointer import pointer
diff --git a/lib_pypy/_ctypes/function.py b/lib_pypy/_ctypes/function.py
--- a/lib_pypy/_ctypes/function.py
+++ b/lib_pypy/_ctypes/function.py
@@ -469,7 +469,8 @@
         newargs = []
         for argtype, arg in zip(argtypes, args):
             param = argtype.from_param(arg)
-            if argtype._type_ == 'P': # special-case for c_void_p
+            _type_ = getattr(argtype, '_type_', None)
+            if _type_ == 'P': # special-case for c_void_p
                 param = param._get_buffer_value()
             elif self._is_primitive(argtype):
                 param = param.value
diff --git a/lib_pypy/_ctypes/structure.py b/lib_pypy/_ctypes/structure.py
--- a/lib_pypy/_ctypes/structure.py
+++ b/lib_pypy/_ctypes/structure.py
@@ -169,6 +169,8 @@
 
     def from_address(self, address):
         instance = StructOrUnion.__new__(self)
+        if isinstance(address, _rawffi.StructureInstance):
+            address = address.buffer
         instance.__dict__['_buffer'] = self._ffistruct.fromaddress(address)
         return instance
 
diff --git a/lib_pypy/_sqlite3.py b/lib_pypy/_sqlite3.py
--- a/lib_pypy/_sqlite3.py
+++ b/lib_pypy/_sqlite3.py
@@ -24,6 +24,7 @@
 from ctypes import c_void_p, c_int, c_double, c_int64, c_char_p, cdll
 from ctypes import POINTER, byref, string_at, CFUNCTYPE, cast
 from ctypes import sizeof, c_ssize_t
+from collections import OrderedDict
 import datetime
 import sys
 import time
@@ -274,6 +275,28 @@
 def unicode_text_factory(x):
     return unicode(x, 'utf-8')
 
+
+class StatementCache(object):
+    def __init__(self, connection, maxcount):
+        self.connection = connection
+        self.maxcount = maxcount
+        self.cache = OrderedDict()
+
+    def get(self, sql, cursor, row_factory):
+        try:
+            stat = self.cache[sql]
+        except KeyError:
+            stat = Statement(self.connection, sql)
+            self.cache[sql] = stat
+            if len(self.cache) > self.maxcount:
+                self.cache.popitem(0)
+        #
+        if stat.in_use:
+            stat = Statement(self.connection, sql)
+        stat.set_cursor_and_factory(cursor, row_factory)
+        return stat
+
+
 class Connection(object):
     def __init__(self, database, timeout=5.0, detect_types=0, isolation_level="",
                  check_same_thread=True, factory=None, cached_statements=100):
@@ -291,6 +314,7 @@
         self.row_factory = None
         self._isolation_level = isolation_level
         self.detect_types = detect_types
+        self.statement_cache = StatementCache(self, cached_statements)
 
         self.cursors = []
 
@@ -399,7 +423,7 @@
         cur = Cursor(self)
         if not isinstance(sql, (str, unicode)):
             raise Warning("SQL is of wrong type. Must be string or unicode.")
-        statement = Statement(cur, sql, self.row_factory)
+        statement = self.statement_cache.get(sql, cur, self.row_factory)
         return statement
 
     def _get_isolation_level(self):
@@ -708,7 +732,7 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
 
         if self.connection._isolation_level is not None:
             if self.statement.kind == "DDL":
@@ -746,7 +770,8 @@
         if type(sql) is unicode:
             sql = sql.encode("utf-8")
         self._check_closed()
-        self.statement = Statement(self, sql, self.row_factory)
+        self.statement = self.connection.statement_cache.get(sql, self, self.row_factory)
+        
         if self.statement.kind == "DML":
             self.connection._begin()
         else:
@@ -871,14 +896,12 @@
     lastrowid = property(_getlastrowid)
 
 class Statement(object):
-    def __init__(self, cur, sql, row_factory):
+    def __init__(self, connection, sql):
         self.statement = None
         if not isinstance(sql, str):
             raise ValueError, "sql must be a string"
-        self.con = cur.connection
-        self.cur = weakref.ref(cur)
+        self.con = connection
         self.sql = sql # DEBUG ONLY
-        self.row_factory = row_factory
         first_word = self._statement_kind = sql.lstrip().split(" ")[0].upper()
         if first_word in ("INSERT", "UPDATE", "DELETE", "REPLACE"):
             self.kind = "DML"
@@ -887,6 +910,11 @@
         else:
             self.kind = "DDL"
         self.exhausted = False
+        self.in_use = False
+        #
+        # set by set_cursor_and_factory
+        self.cur = None
+        self.row_factory = None
 
         self.statement = c_void_p()
         next_char = c_char_p()
@@ -907,6 +935,10 @@
 
         self._build_row_cast_map()
 
+    def set_cursor_and_factory(self, cur, row_factory):
+        self.cur = weakref.ref(cur)
+        self.row_factory = row_factory
+
     def _build_row_cast_map(self):
         self.row_cast_map = []
         for i in xrange(sqlite.sqlite3_column_count(self.statement)):
@@ -976,6 +1008,7 @@
         ret = sqlite.sqlite3_reset(self.statement)
         if ret != SQLITE_OK:
             raise self.con._get_exception(ret)
+        self.mark_dirty()
 
         if params is None:
             if sqlite.sqlite3_bind_parameter_count(self.statement) != 0:
@@ -1068,11 +1101,17 @@
 
     def reset(self):
         self.row_cast_map = None
-        return sqlite.sqlite3_reset(self.statement)
+        ret = sqlite.sqlite3_reset(self.statement)
+        self.in_use = False
+        return ret
 
     def finalize(self):
         sqlite.sqlite3_finalize(self.statement)
         self.statement = None
+        self.in_use = False
+
+    def mark_dirty(self):
+        self.in_use = True
 
     def __del__(self):
         sqlite.sqlite3_finalize(self.statement)
diff --git a/lib_pypy/greenlet.py b/lib_pypy/greenlet.py
--- a/lib_pypy/greenlet.py
+++ b/lib_pypy/greenlet.py
@@ -1,1 +1,138 @@
-from _stackless import greenlet
+import _continuation, sys
+
+
+# ____________________________________________________________
+# Exceptions
+
+class GreenletExit(Exception):
+    """This special exception does not propagate to the parent greenlet; it
+can be used to kill a single greenlet."""
+
+error = _continuation.error
+
+# ____________________________________________________________
+# Helper function
+
+def getcurrent():
+    "Returns the current greenlet (i.e. the one which called this function)."
+    try:
+        return _tls.current
+    except AttributeError:
+        # first call in this thread: current == main
+        _green_create_main()
+        return _tls.current
+
+# ____________________________________________________________
+# The 'greenlet' class
+
+_continulet = _continuation.continulet
+
+class greenlet(_continulet):
+    getcurrent = staticmethod(getcurrent)
+    error = error
+    GreenletExit = GreenletExit
+    __main = False
+    __started = False
+
+    def __new__(cls, *args, **kwds):
+        self = _continulet.__new__(cls)
+        self.parent = getcurrent()
+        return self
+
+    def __init__(self, run=None, parent=None):
+        if run is not None:
+            self.run = run
+        if parent is not None:
+            self.parent = parent
+
+    def switch(self, *args):
+        "Switch execution to this greenlet, optionally passing the values "
+        "given as argument(s).  Returns the value passed when switching back."
+        return self.__switch(_continulet.switch, args)
+
+    def throw(self, typ=GreenletExit, val=None, tb=None):
+        "raise exception in greenlet, return value passed when switching back"
+        return self.__switch(_continulet.throw, typ, val, tb)
+
+    def __switch(target, unbound_method, *args):
+        current = getcurrent()
+        #
+        while not target:
+            if not target.__started:
+                _continulet.__init__(target, _greenlet_start, *args)
+                args = ()
+                target.__started = True
+                break
+            # already done, go to the parent instead
+            # (NB. infinite loop possible, but unlikely, unless you mess
+            # up the 'parent' explicitly.  Good enough, because a Ctrl-C
+            # will show that the program is caught in this loop here.)
+            target = target.parent
+        #
+        try:
+            if current.__main:
+                if target.__main:
+                    # switch from main to main
+                    if unbound_method == _continulet.throw:
+                        raise args[0], args[1], args[2]
+                    (args,) = args
+                else:
+                    # enter from main to target
+                    args = unbound_method(target, *args)
+            else:
+                if target.__main:
+                    # leave to go to target=main
+                    args = unbound_method(current, *args)
+                else:
+                    # switch from non-main to non-main
+                    args = unbound_method(current, *args, to=target)
+        except GreenletExit, e:
+            args = (e,)
+        finally:
+            _tls.current = current
+        #
+        if len(args) == 1:
+            return args[0]
+        else:
+            return args
+
+    def __nonzero__(self):
+        return self.__main or _continulet.is_pending(self)
+
+    @property
+    def dead(self):
+        return self.__started and not self
+
+    @property
+    def gr_frame(self):
+        raise NotImplementedError("attribute 'gr_frame' of greenlet objects")
+
+# ____________________________________________________________
+# Internal stuff
+
+try:
+    from thread import _local
+except ImportError:
+    class _local(object):    # assume no threads
+        pass
+
+_tls = _local()
+
+def _green_create_main():
+    # create the main greenlet for this thread
+    _tls.current = None
+    gmain = greenlet.__new__(greenlet)
+    gmain._greenlet__main = True
+    gmain._greenlet__started = True
+    assert gmain.parent is None
+    _tls.main = gmain
+    _tls.current = gmain
+
+def _greenlet_start(greenlet, args):
+    _tls.current = greenlet
+    try:
+        res = greenlet.run(*args)
+    finally:
+        if greenlet.parent is not _tls.main:
+            _continuation.permute(greenlet, greenlet.parent)
+    return (res,)
diff --git a/lib_pypy/pyrepl/reader.py b/lib_pypy/pyrepl/reader.py
--- a/lib_pypy/pyrepl/reader.py
+++ b/lib_pypy/pyrepl/reader.py
@@ -401,13 +401,19 @@
             return "(arg: %s) "%self.arg
         if "\n" in self.buffer:
             if lineno == 0:
-                return self._ps2
+                res = self.ps2
             elif lineno == self.buffer.count("\n"):
-                return self._ps4
+                res = self.ps4
             else:
-                return self._ps3
+                res = self.ps3
         else:
-            return self._ps1
+            res = self.ps1
+        # Lazily call str() on self.psN, and cache the results using as key
+        # the object on which str() was called.  This ensures that even if the
+        # same object is used e.g. for ps1 and ps2, str() is called only once.
+        if res not in self._pscache:
+            self._pscache[res] = str(res)
+        return self._pscache[res]
 
     def push_input_trans(self, itrans):
         self.input_trans_stack.append(self.input_trans)
@@ -473,8 +479,7 @@
             self.pos = 0
             self.dirty = 1
             self.last_command = None
-            self._ps1, self._ps2, self._ps3, self._ps4 = \
-                           map(str, [self.ps1, self.ps2, self.ps3, self.ps4])
+            self._pscache = {}
         except:
             self.restore()
             raise
diff --git a/pypy/config/makerestdoc.py b/pypy/config/makerestdoc.py
--- a/pypy/config/makerestdoc.py
+++ b/pypy/config/makerestdoc.py
@@ -134,7 +134,7 @@
         for child in self._children:
             subpath = fullpath + "." + child._name
             toctree.append(subpath)
-        content.add(Directive("toctree", *toctree, maxdepth=4))
+        content.add(Directive("toctree", *toctree, **{'maxdepth': 4}))
         content.join(
             ListItem(Strong("name:"), self._name),
             ListItem(Strong("description:"), self.doc))
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -33,7 +33,8 @@
      "struct", "_hashlib", "_md5", "_sha", "_minimal_curses", "cStringIO",
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "_bisect", "binascii", "_multiprocessing", '_warnings',
-     "_collections", "_multibytecodec", "micronumpy", "_ffi"]
+     "_collections", "_multibytecodec", "micronumpy", "_ffi",
+     "_continuation"]
 ))
 
 translation_modules = default_modules.copy()
@@ -99,6 +100,7 @@
     "_ssl"      : ["pypy.module._ssl.interp_ssl"],
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
+    "_continuation": ["pypy.rlib.rstacklet"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/config/test/test_config.py b/pypy/config/test/test_config.py
--- a/pypy/config/test/test_config.py
+++ b/pypy/config/test/test_config.py
@@ -1,5 +1,5 @@
 from pypy.config.config import *
-import py
+import py, sys
 
 def make_description():
     gcoption = ChoiceOption('name', 'GC name', ['ref', 'framework'], 'ref')
@@ -69,13 +69,15 @@
     attrs = dir(config)
     assert '__repr__' in attrs        # from the type
     assert '_cfgimpl_values' in attrs # from self
-    assert 'gc' in attrs              # custom attribute
-    assert 'objspace' in attrs        # custom attribute
+    if sys.version_info >= (2, 6):
+        assert 'gc' in attrs              # custom attribute
+        assert 'objspace' in attrs        # custom attribute
     #
     attrs = dir(config.gc)
-    assert 'name' in attrs
-    assert 'dummy' in attrs
-    assert 'float' in attrs
+    if sys.version_info >= (2, 6):
+        assert 'name' in attrs
+        assert 'dummy' in attrs
+        assert 'float' in attrs
 
 def test_arbitrary_option():
     descr = OptionDescription("top", "", [
diff --git a/pypy/config/translationoption.py b/pypy/config/translationoption.py
--- a/pypy/config/translationoption.py
+++ b/pypy/config/translationoption.py
@@ -28,10 +28,9 @@
 
 translation_optiondescription = OptionDescription(
         "translation", "Translation Options", [
-    BoolOption("stackless", "enable stackless features during compilation",
-               default=False, cmdline="--stackless",
-               requires=[("translation.type_system", "lltype"),
-                         ("translation.gcremovetypeptr", False)]),  # XXX?
+    BoolOption("continuation", "enable single-shot continuations",
+               default=False, cmdline="--continuation",
+               requires=[("translation.type_system", "lltype")]),
     ChoiceOption("type_system", "Type system to use when RTyping",
                  ["lltype", "ootype"], cmdline=None, default="lltype",
                  requires={
@@ -70,7 +69,8 @@
                      "statistics": [("translation.gctransformer", "framework")],
                      "generation": [("translation.gctransformer", "framework")],
                      "hybrid": [("translation.gctransformer", "framework")],
-                     "boehm": [("translation.gctransformer", "boehm")],
+                     "boehm": [("translation.gctransformer", "boehm"),
+                               ("translation.continuation", False)],  # breaks
                      "markcompact": [("translation.gctransformer", "framework")],
                      "minimark": [("translation.gctransformer", "framework")],
                      },
@@ -389,8 +389,6 @@
             config.translation.suggest(withsmallfuncsets=5)
         elif word == 'jit':
             config.translation.suggest(jit=True)
-            if config.translation.stackless:
-                raise NotImplementedError("JIT conflicts with stackless for now")
         elif word == 'removetypeptr':
             config.translation.suggest(gcremovetypeptr=True)
         else:
diff --git a/pypy/doc/_ref.txt b/pypy/doc/_ref.txt
--- a/pypy/doc/_ref.txt
+++ b/pypy/doc/_ref.txt
@@ -1,11 +1,10 @@
 .. _`ctypes_configure/doc/sample.py`: https://bitbucket.org/pypy/pypy/src/default/ctypes_configure/doc/sample.py
 .. _`demo/`: https://bitbucket.org/pypy/pypy/src/default/demo/
-.. _`demo/pickle_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/demo/pickle_coroutine.py
 .. _`lib-python/`: https://bitbucket.org/pypy/pypy/src/default/lib-python/
 .. _`lib-python/2.7/dis.py`: https://bitbucket.org/pypy/pypy/src/default/lib-python/2.7/dis.py
 .. _`lib_pypy/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/
+.. _`lib_pypy/greenlet.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/greenlet.py
 .. _`lib_pypy/pypy_test/`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/pypy_test/
-.. _`lib_pypy/stackless.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/stackless.py
 .. _`lib_pypy/tputil.py`: https://bitbucket.org/pypy/pypy/src/default/lib_pypy/tputil.py
 .. _`pypy/annotation`:
 .. _`pypy/annotation/`: https://bitbucket.org/pypy/pypy/src/default/pypy/annotation/
@@ -55,7 +54,6 @@
 .. _`pypy/module`:
 .. _`pypy/module/`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/
 .. _`pypy/module/__builtin__/__init__.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/__builtin__/__init__.py
-.. _`pypy/module/_stackless/test/test_composable_coroutine.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/_stackless/test/test_composable_coroutine.py
 .. _`pypy/objspace`:
 .. _`pypy/objspace/`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/
 .. _`pypy/objspace/dump.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/objspace/dump.py
@@ -117,6 +115,7 @@
 .. _`pypy/translator/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/
 .. _`pypy/translator/backendopt/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/backendopt/
 .. _`pypy/translator/c/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/
+.. _`pypy/translator/c/src/stacklet/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/c/src/stacklet/
 .. _`pypy/translator/cli/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/cli/
 .. _`pypy/translator/goal/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/goal/
 .. _`pypy/translator/jvm/`: https://bitbucket.org/pypy/pypy/src/default/pypy/translator/jvm/
diff --git a/pypy/doc/architecture.rst b/pypy/doc/architecture.rst
--- a/pypy/doc/architecture.rst
+++ b/pypy/doc/architecture.rst
@@ -153,7 +153,7 @@
 
 * Optionally, `various transformations`_ can then be applied which, for
   example, perform optimizations such as inlining, add capabilities
-  such as stackless_-style concurrency, or insert code for the
+  such as stackless-style concurrency (deprecated), or insert code for the
   `garbage collector`_.
 
 * Then, the graphs are converted to source code for the target platform
@@ -255,7 +255,6 @@
 
 .. _Python: http://docs.python.org/reference/
 .. _Psyco: http://psyco.sourceforge.net
-.. _stackless: stackless.html
 .. _`generate Just-In-Time Compilers`: jit/index.html
 .. _`JIT Generation in PyPy`: jit/index.html
 .. _`implement your own interpreter`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._continuation.txt
copy from pypy/doc/config/objspace.usemodules._stackless.txt
copy to pypy/doc/config/objspace.usemodules._continuation.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._continuation.txt
@@ -1,6 +1,4 @@
-Use the '_stackless' module. 
+Use the '_continuation' module. 
 
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Exposes the `continulet` app-level primitives.
+See also :config:`translation.continuation`.
diff --git a/pypy/doc/config/objspace.usemodules._stackless.txt b/pypy/doc/config/objspace.usemodules._stackless.txt
--- a/pypy/doc/config/objspace.usemodules._stackless.txt
+++ b/pypy/doc/config/objspace.usemodules._stackless.txt
@@ -1,6 +1,1 @@
-Use the '_stackless' module. 
-
-Exposes the `stackless` primitives, and also implies a stackless build. 
-See also :config:`translation.stackless`.
-
-.. _`stackless`: ../stackless.html
+Deprecated.
diff --git a/pypy/doc/config/translation.stackless.txt b/pypy/doc/config/translation.continuation.txt
rename from pypy/doc/config/translation.stackless.txt
rename to pypy/doc/config/translation.continuation.txt
--- a/pypy/doc/config/translation.stackless.txt
+++ b/pypy/doc/config/translation.continuation.txt
@@ -1,5 +1,2 @@
-Run the `stackless transform`_ on each generated graph, which enables the use
-of coroutines at RPython level and the "stackless" module when translating
-PyPy.
-
-.. _`stackless transform`: ../stackless.html
+Enable the use of a stackless-like primitive called "stacklet".
+In PyPy, this is exposed at app-level by the "_continuation" module.
diff --git a/pypy/doc/cpython_differences.rst b/pypy/doc/cpython_differences.rst
--- a/pypy/doc/cpython_differences.rst
+++ b/pypy/doc/cpython_differences.rst
@@ -24,6 +24,7 @@
     _bisect
     _codecs
     _collections
+    `_continuation`_
     `_ffi`_
     _hashlib
     _io
@@ -84,10 +85,6 @@
 
     _winreg
 
-  Extra module with Stackless_ only:
-
-    _stackless
-
   Note that only some of these modules are built-in in a typical
   CPython installation, and the rest is from non built-in extension
   modules.  This means that e.g. ``import parser`` will, on CPython,
@@ -108,11 +105,11 @@
 
 .. the nonstandard modules are listed below...
 .. _`__pypy__`: __pypy__-module.html
+.. _`_continuation`: stackless.html
 .. _`_ffi`: ctypes-implementation.html
 .. _`_rawffi`: ctypes-implementation.html
 .. _`_minimal_curses`: config/objspace.usemodules._minimal_curses.html
 .. _`cpyext`: http://morepypy.blogspot.com/2010/04/using-cpython-extension-modules-with.html
-.. _Stackless: stackless.html
 
 
 Differences related to garbage collection strategies
diff --git a/pypy/doc/faq.rst b/pypy/doc/faq.rst
--- a/pypy/doc/faq.rst
+++ b/pypy/doc/faq.rst
@@ -315,6 +315,28 @@
 
 .. _`Andrew Brown's tutorial`: http://morepypy.blogspot.com/2011/04/tutorial-writing-interpreter-with-pypy.html
 
+---------------------------------------------------------
+Can RPython modules for PyPy be translated independently?
+---------------------------------------------------------
+
+No, you have to rebuild the entire interpreter.  This means two things:
+
+* It is imperative to use test-driven development.  You have to
+  exhaustively test your module in pure Python before even attempting to
+  translate it.  Once you translate it, you should have only a few typing
+  issues left to fix, but otherwise the result should work out of the box.
+
+* Second, and perhaps most important: do you have a really good reason
+  for writing the module in RPython in the first place?  Nowadays you
+  should really look at alternatives, like writing it in pure Python,
+  using ctypes if it needs to call C code.  Other alternatives are being
+  developed too (as of summer 2011), like a Cython binding.
+
+In this context it is not that important to be able to translate
+RPython modules independently of translating the complete interpreter.
+(It could be done given enough effort, but it's a really serious
+undertaking.  Consider it as quite unlikely for now.)
+
 ----------------------------------------------------------
 Why does PyPy draw a Mandelbrot fractal while translating?
 ----------------------------------------------------------
diff --git a/pypy/doc/getting-started-python.rst b/pypy/doc/getting-started-python.rst
--- a/pypy/doc/getting-started-python.rst
+++ b/pypy/doc/getting-started-python.rst
@@ -67,7 +67,6 @@
    * ``libssl-dev`` (for the optional ``_ssl`` module)
    * ``libgc-dev`` (for the Boehm garbage collector: only needed when translating with `--opt=0, 1` or `size`)
    * ``python-sphinx`` (for the optional documentation build.  You need version 1.0.7 or later)
-   * ``python-greenlet`` (for the optional stackless support in interpreted mode/testing)
 
 
 3. Translation is time-consuming -- 45 minutes on a very fast machine --
@@ -120,19 +119,8 @@
 Installation_ below.
 
 The ``translate.py`` script takes a very large number of options controlling
-what to translate and how.  See ``translate.py -h``. Some of the more
-interesting options (but for now incompatible with the JIT) are:
-
-   * ``--stackless``: this produces a pypy-c that includes features
-     inspired by `Stackless Python <http://www.stackless.com>`__.
-
-   * ``--gc=boehm|ref|marknsweep|semispace|generation|hybrid|minimark``:
-     choose between using
-     the `Boehm-Demers-Weiser garbage collector`_, our reference
-     counting implementation or one of own collector implementations
-     (the default depends on the optimization level but is usually
-     ``minimark``).
-
+what to translate and how.  See ``translate.py -h``. The default options
+should be suitable for mostly everybody by now.
 Find a more detailed description of the various options in our `configuration
 sections`_.
 
diff --git a/pypy/doc/how-to-release.rst b/pypy/doc/how-to-release.rst
--- a/pypy/doc/how-to-release.rst
+++ b/pypy/doc/how-to-release.rst
@@ -42,7 +42,6 @@
     JIT: windows, linux, os/x
     no JIT: windows, linux, os/x
     sandbox: linux, os/x
-    stackless: windows, linux, os/x
 
 * write release announcement pypy/doc/release-x.y(.z).txt
   the release announcement should contain a direct link to the download page
diff --git a/pypy/doc/index.rst b/pypy/doc/index.rst
--- a/pypy/doc/index.rst
+++ b/pypy/doc/index.rst
@@ -35,7 +35,7 @@
 
   * `Differences between PyPy and CPython`_
   * `What PyPy can do for your objects`_
-  * `Stackless and coroutines`_
+  * `Continulets and greenlets`_
   * `JIT Generation in PyPy`_ 
   * `Sandboxing Python code`_
 
@@ -292,8 +292,6 @@
 
 `pypy/translator/jvm/`_            the Java backend
 
-`pypy/translator/stackless/`_      the `Stackless Transform`_
-
 `pypy/translator/tool/`_           helper tools for translation, including the Pygame
                                    `graph viewer`_
 
@@ -318,7 +316,7 @@
 .. _`transparent proxies`: objspace-proxies.html#tproxy
 .. _`Differences between PyPy and CPython`: cpython_differences.html
 .. _`What PyPy can do for your objects`: objspace-proxies.html
-.. _`Stackless and coroutines`: stackless.html
+.. _`Continulets and greenlets`: stackless.html
 .. _StdObjSpace: objspace.html#the-standard-object-space 
 .. _`abstract interpretation`: http://en.wikipedia.org/wiki/Abstract_interpretation
 .. _`rpython`: coding-guide.html#rpython 
@@ -337,7 +335,6 @@
 .. _`low-level type system`: rtyper.html#low-level-type
 .. _`object-oriented type system`: rtyper.html#oo-type
 .. _`garbage collector`: garbage_collection.html
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 .. _`main PyPy-translation scripts`: getting-started-python.html#translating-the-pypy-python-interpreter
 .. _`.NET`: http://www.microsoft.com/net/
 .. _Mono: http://www.mono-project.com/
diff --git a/pypy/doc/jit/pyjitpl5.rst b/pypy/doc/jit/pyjitpl5.rst
--- a/pypy/doc/jit/pyjitpl5.rst
+++ b/pypy/doc/jit/pyjitpl5.rst
@@ -103,7 +103,7 @@
 
 The meta-interpreter starts interpreting the JIT bytecode.  Each operation is
 executed and then recorded in a list of operations, called the trace.
-Operations can have a list of boxes that operate on, arguments.  Some operations
+Operations can have a list of boxes they operate on, arguments.  Some operations
 (like GETFIELD and GETARRAYITEM) also have special objects that describe how
 their arguments are laid out in memory.  All possible operations generated by
 tracing are listed in metainterp/resoperation.py.  When a (interpreter-level)
diff --git a/pypy/doc/rlib.rst b/pypy/doc/rlib.rst
--- a/pypy/doc/rlib.rst
+++ b/pypy/doc/rlib.rst
@@ -134,69 +134,6 @@
 a hierarchy of Address classes, in a typical static-OO-programming style.
 
 
-``rstack``
-==========
-
-The `pypy/rlib/rstack.py`_ module allows an RPython program to control its own execution stack.
-This is only useful if the program is translated using stackless. An old
-description of the exposed functions is below.
-
-We introduce an RPython type ``frame_stack_top`` and a built-in function
-``yield_current_frame_to_caller()`` that work as follows (see example below):
-
-* The built-in function ``yield_current_frame_to_caller()`` causes the current
-  function's state to be captured in a new ``frame_stack_top`` object that is
-  returned to the parent.  Only one frame, the current one, is captured this
-  way.  The current frame is suspended and the caller continues to run.  Note
-  that the caller is only resumed once: when
-  ``yield_current_frame_to_caller()`` is called.  See below.
-
-* A ``frame_stack_top`` object can be jumped to by calling its ``switch()``
-  method with no argument.
-
-* ``yield_current_frame_to_caller()`` and ``switch()`` themselves return a new
-  ``frame_stack_top`` object: the freshly captured state of the caller of the
-  source ``switch()`` that was just executed, or None in the case described
-  below.
-
-* the function that called ``yield_current_frame_to_caller()`` also has a
-  normal return statement, like all functions.  This statement must return
-  another ``frame_stack_top`` object.  The latter is *not* returned to the
-  original caller; there is no way to return several times to the caller.
-  Instead, it designates the place to which the execution must jump, as if by
-  a ``switch()``.  The place to which we jump this way will see a None as the
-  source frame stack top.
-
-* every frame stack top must be resumed once and only once.  Not resuming
-  it at all causes a leak.  Resuming it several times causes a crash.
-
-* a function that called ``yield_current_frame_to_caller()`` should not raise.
-  It would have no implicit parent frame to propagate the exception to.  That
-  would be a crashingly bad idea.
-
-The following example would print the numbers from 1 to 7 in order::
-
-    def g():
-        print 2
-        frametop_before_5 = yield_current_frame_to_caller()
-        print 4
-        frametop_before_7 = frametop_before_5.switch()
-        print 6
-        return frametop_before_7
-
-    def f():
-        print 1
-        frametop_before_4 = g()
-        print 3
-        frametop_before_6 = frametop_before_4.switch()
-        print 5
-        frametop_after_return = frametop_before_6.switch()
-        print 7
-        assert frametop_after_return is None
-
-    f()
-
-
 ``streamio``
 ============
 
diff --git a/pypy/doc/stackless.rst b/pypy/doc/stackless.rst
--- a/pypy/doc/stackless.rst
+++ b/pypy/doc/stackless.rst
@@ -8,446 +8,299 @@
 ================
 
 PyPy can expose to its user language features similar to the ones
-present in `Stackless Python`_: **no recursion depth limit**, and the
-ability to write code in a **massively concurrent style**.  It actually
-exposes three different paradigms to choose from:
+present in `Stackless Python`_: the ability to write code in a
+**massively concurrent style**.  (It does not (any more) offer the
+ability to run with no `recursion depth limit`_, but the same effect
+can be achieved indirectly.)
 
-* `Tasklets and channels`_;
+This feature is based on a custom primitive called a continulet_.
+Continulets can be directly used by application code, or it is possible
+to write (entirely at app-level) more user-friendly interfaces.
 
-* Greenlets_;
+Currently PyPy implements greenlets_ on top of continulets.  It would be
+easy to implement tasklets and channels as well, emulating the model
+of `Stackless Python`_.
 
-* Plain coroutines_.
+Continulets are extremely light-weight, which means that PyPy should be
+able to handle programs containing large amounts of them.  However, due
+to an implementation restriction, a PyPy compiled with
+``--gcrootfinder=shadowstack`` consumes at least one page of physical
+memory (4KB) per live continulet, and half a megabyte of virtual memory
+on 32-bit or a complete megabyte on 64-bit.  Moreover, the feature is
+only available (so far) on x86 and x86-64 CPUs; for other CPUs you need
+to add a short page of custom assembler to
+`pypy/translator/c/src/stacklet/`_.
 
-All of them are extremely light-weight, which means that PyPy should be
-able to handle programs containing large amounts of coroutines, tasklets
-and greenlets.
 
+Theory
+======
 
-Requirements
-++++++++++++++++
+The fundamental idea is that, at any point in time, the program happens
+to run one stack of frames (or one per thread, in case of
+multi-threading).  To see the stack, start at the top frame and follow
+the chain of ``f_back`` until you reach the bottom frame.  From the
+point of view of one of these frames, it has a ``f_back`` pointing to
+another frame (unless it is the bottom frame), and it is itself being
+pointed to by another frame (unless it is the top frame).
 
-If you are running py.py on top of CPython, then you need to enable
-the _stackless module by running it as follows::
+The theory behind continulets is to literally take the previous sentence
+as definition of "an O.K. situation".  The trick is that there are
+O.K. situations that are more complex than just one stack: you will
+always have one stack, but you can also have in addition one or more
+detached *cycles* of frames, such that by following the ``f_back`` chain
+you run in a circle.  But note that these cycles are indeed completely
+detached: the top frame (the currently running one) is always the one
+which is not the ``f_back`` of anybody else, and it is always the top of
+a stack that ends with the bottom frame, never a part of these extra
+cycles.
 
-    py.py --withmod-_stackless
+How do you create such cycles?  The fundamental operation to do so is to
+take two frames and *permute* their ``f_back`` --- i.e. exchange them.
+You can permute any two ``f_back`` without breaking the rule of "an O.K.
+situation".  Say for example that ``f`` is some frame halfway down the
+stack, and you permute its ``f_back`` with the ``f_back`` of the top
+frame.  Then you have removed from the normal stack all intermediate
+frames, and turned them into one stand-alone cycle.  By doing the same
+permutation again you restore the original situation.
 
-This is implemented internally using greenlets, so it only works on a
-platform where `greenlets`_ are supported.  A few features do
-not work this way, though, and really require a translated
-``pypy-c``.
+In practice, in PyPy, you cannot change the ``f_back`` of an arbitrary
+frame, but only of frames stored in ``continulets``.
 
-To obtain a translated version of ``pypy-c`` that includes Stackless
-support, run translate.py as follows::
-
-    cd pypy/translator/goal
-    python translate.py --stackless
+Continulets are internally implemented using stacklets.  Stacklets are a
+bit more primitive (they are really one-shot continuations), but that
+idea only works in C, not in Python.  The basic idea of continulets is
+to have at any point in time a complete valid stack; this is important
+e.g. to correctly propagate exceptions (and it seems to give meaningful
+tracebacks too).
 
 
 Application level interface
 =============================
 
-A stackless PyPy contains a module called ``stackless``.  The interface
-exposed by this module have not been refined much, so it should be
-considered in-flux (as of 2007).
 
-So far, PyPy does not provide support for ``stackless`` in a threaded
-environment.  This limitation is not fundamental, as previous experience
-has shown, so supporting this would probably be reasonably easy.
+.. _continulet:
 
-An interesting point is that the same ``stackless`` module can provide
-a number of different concurrency paradigms at the same time.  From a
-theoretical point of view, none of above-mentioned existing three
-paradigms considered on its own is new: two of them are from previous
-Python work, and the third one is a variant of the classical coroutine.
-The new part is that the PyPy implementation manages to provide all of
-them and let the user implement more.  Moreover - and this might be an
-important theoretical contribution of this work - we manage to provide
-these concurrency concepts in a "composable" way.  In other words, it
-is possible to naturally mix in a single application multiple
-concurrency paradigms, and multiple unrelated usages of the same
-paradigm.  This is discussed in the Composability_ section below.
+Continulets
++++++++++++
 
+A translated PyPy contains by default a module called ``_continuation``
+exporting the type ``continulet``.  A ``continulet`` object from this
+module is a container that stores a "one-shot continuation".  It plays
+the role of an extra frame you can insert in the stack, and whose
+``f_back`` can be changed.
 
-Infinite recursion
-++++++++++++++++++
+To make a continulet object, call ``continulet()`` with a callable and
+optional extra arguments.
 
-Any stackless PyPy executable natively supports recursion that is only
-limited by the available memory.  As in normal Python, though, there is
-an initial recursion limit (which is 5000 in all pypy-c's, and 1000 in
-CPython).  It can be changed with ``sys.setrecursionlimit()``.  With a
-stackless PyPy, any value is acceptable - use ``sys.maxint`` for
-unlimited.
+Later, the first time you ``switch()`` to the continulet, the callable
+is invoked with the same continulet object as the extra first argument.
+At that point, the one-shot continuation stored in the continulet points
+to the caller of ``switch()``.  In other words you have a perfectly
+normal-looking stack of frames.  But when ``switch()`` is called again,
+this stored one-shot continuation is exchanged with the current one; it
+means that the caller of ``switch()`` is suspended with its continuation
+stored in the container, and the old continuation from the continulet
+object is resumed.
 
-In some cases, you can write Python code that causes interpreter-level
-infinite recursion -- i.e. infinite recursion without going via
-application-level function calls.  It is possible to limit that too,
-with ``_stackless.set_stack_depth_limit()``, or to unlimit it completely
-by setting it to ``sys.maxint``.
+The most primitive API is actually ``permute()``, which just permutes the
+one-shot continuations stored in two (or more) continulets.
 
+In more detail:
 
-Coroutines
-++++++++++
+* ``continulet(callable, *args, **kwds)``: make a new continulet.
+  Like a generator, this only creates it; the ``callable`` is only
+  actually called the first time it is switched to.  It will be
+  called as follows::
 
-A Coroutine is similar to a very small thread, with no preemptive scheduling.
-Within a family of coroutines, the flow of execution is explicitly
-transferred from one to another by the programmer.  When execution is
-transferred to a coroutine, it begins to execute some Python code.  When
-it transfers execution away from itself it is temporarily suspended, and
-when execution returns to it it resumes its execution from the
-point where it was suspended.  Conceptually, only one coroutine is
-actively running at any given time (but see Composability_ below).
+      callable(cont, *args, **kwds)
 
-The ``stackless.coroutine`` class is instantiated with no argument.
-It provides the following methods and attributes:
+  where ``cont`` is the same continulet object.
 
-* ``stackless.coroutine.getcurrent()``
+  Note that it is actually ``cont.__init__()`` that binds
+  the continulet.  It is also possible to create a not-bound-yet
+  continulet by calling explicitly ``continulet.__new__()``, and
+  only bind it later by calling explicitly ``cont.__init__()``.
 
-    Static method returning the currently running coroutine.  There is a
-    so-called "main" coroutine object that represents the "outer"
-    execution context, where your main program started and where it runs
-    as long as it does not switch to another coroutine.
+* ``cont.switch(value=None, to=None)``: start the continulet if
+  it was not started yet.  Otherwise, store the current continuation
+  in ``cont``, and activate the target continuation, which is the
+  one that was previously stored in ``cont``.  Note that the target
+  continuation was itself previously suspended by another call to
+  ``switch()``; this older ``switch()`` will now appear to return.
+  The ``value`` argument is any object that is carried to the target
+  and returned by the target's ``switch()``.
 
-* ``coro.bind(callable, *args, **kwds)``
+  If ``to`` is given, it must be another continulet object.  In
+  that case, performs a "double switch": it switches as described
+  above to ``cont``, and then immediately switches again to ``to``.
+  This is different from switching directly to ``to``: the current
+  continuation gets stored in ``cont``, the old continuation from
+  ``cont`` gets stored in ``to``, and only then we resume the
+  execution from the old continuation out of ``to``.
 
-    Bind the coroutine so that it will execute ``callable(*args,
-    **kwds)``.  The call is not performed immediately, but only the
-    first time we call the ``coro.switch()`` method.  A coroutine must
-    be bound before it is switched to.  When the coroutine finishes
-    (because the call to the callable returns), the coroutine exits and
-    implicitly switches back to another coroutine (its "parent"); after
-    this point, it is possible to bind it again and switch to it again.
-    (Which coroutine is the parent of which is not documented, as it is
-    likely to change when the interface is refined.)
+* ``cont.throw(type, value=None, tb=None, to=None)``: similar to
+  ``switch()``, except that immediately after the switch is done, raise
+  the given exception in the target.
 
-* ``coro.switch()``
+* ``cont.is_pending()``: return True if the continulet is pending.
+  This is False when it is not initialized (because we called
+  ``__new__`` and not ``__init__``) or when it is finished (because
+  the ``callable()`` returned).  When it is False, the continulet
+  object is empty and cannot be ``switch()``-ed to.
 
-    Suspend the current (caller) coroutine, and resume execution in the
-    target coroutine ``coro``.
+* ``permute(*continulets)``: a global function that permutes the
+  continuations stored in the given continulet arguments.  Mostly
+  theoretical.  In practice, using ``cont.switch()`` is easier and
+  more efficient than using ``permute()``; the latter does not on
+  its own change the currently running frame.
 
-* ``coro.kill()``
 
-    Kill ``coro`` by sending a CoroutineExit exception and switching
-    execution immediately to it. This exception can be caught in the 
-    coroutine itself and can be raised from any call to ``coro.switch()``. 
-    This exception isn't propagated to the parent coroutine.
+Genlets
++++++++
 
-* ``coro.throw(type, value)``
+The ``_continuation`` module also exposes the ``generator`` decorator::
 
-    Insert an exception in ``coro`` an resume switches execution
-    immediately to it. In the coroutine itself, this exception
-    will come from any call to ``coro.switch()`` and can be caught. If the
-    exception isn't caught, it will be propagated to the parent coroutine.
+    @generator
+    def f(cont, a, b):
+        cont.switch(a + b)
+        cont.switch(a + b + 1)
 
-When a coroutine is garbage-collected, it gets the ``.kill()`` method sent to
-it. This happens at the point the next ``.switch`` method is called, so the
-target coroutine of this call will be executed only after the ``.kill`` has
-finished.
+    for i in f(10, 20):
+        print i
 
-Example
-~~~~~~~
+This example prints 30 and 31.  The only advantage over using regular
+generators is that the generator itself is not limited to ``yield``
+statements that must all occur syntactically in the same function.
+Instead, we can pass around ``cont``, e.g. to nested sub-functions, and
+call ``cont.switch(x)`` from there.
 
-Here is a classical producer/consumer example: an algorithm computes a
-sequence of values, while another consumes them.  For our purposes we
-assume that the producer can generate several values at once, and the
-consumer can process up to 3 values in a batch - it can also process
-batches with fewer than 3 values without waiting for the producer (which
-would be messy to express with a classical Python generator). ::
+The ``generator`` decorator can also be applied to methods::
 
-    def producer(lst):
-        while True:
-            ...compute some more values...
-            lst.extend(new_values)
-            coro_consumer.switch()
-
-    def consumer(lst):
-        while True:
-            # First ask the producer for more values if needed
-            while len(lst) == 0:
-                coro_producer.switch()
-            # Process the available values in a batch, but at most 3
-            batch = lst[:3]
-            del lst[:3]
-            ...process batch...
-
-    # Initialize two coroutines with a shared list as argument
-    exchangelst = []
-    coro_producer = coroutine()
-    coro_producer.bind(producer, exchangelst)
-    coro_consumer = coroutine()
-    coro_consumer.bind(consumer, exchangelst)
-
-    # Start running the consumer coroutine
-    coro_consumer.switch()
-
-
-Tasklets and channels
-+++++++++++++++++++++
-
-The ``stackless`` module also provides an interface that is roughly
-compatible with the interface of the ``stackless`` module in `Stackless
-Python`_: it contains ``stackless.tasklet`` and ``stackless.channel``
-classes.  Tasklets are also similar to microthreads, but (like coroutines)
-they don't actually run in parallel with other microthreads; instead,
-they synchronize and exchange data with each other over Channels, and
-these exchanges determine which Tasklet runs next.
-
-For usage reference, see the documentation on the `Stackless Python`_
-website.
-
-Note that Tasklets and Channels are implemented at application-level in
-`lib_pypy/stackless.py`_ on top of coroutines_.  You can refer to this
-module for more details and API documentation.
-
-The stackless.py code tries to resemble the stackless C code as much
-as possible. This makes the code somewhat unpythonic.
-
-Bird's eye view of tasklets and channels
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Tasklets are a bit like threads: they encapsulate a function in such a way that
-they can be suspended/restarted any time. Unlike threads, they won't
-run concurrently, but must be cooperative. When using stackless
-features, it is vitally important that no action is performed that blocks
-everything else.  In particular, blocking input/output should be centralized
-to a single tasklet.
-
-Communication between tasklets is done via channels. 
-There are three ways for a tasklet to give up control:
-
-1. call ``stackless.schedule()``
-2. send something over a channel
-3. receive something from a channel
-
-A (live) tasklet can either be running, waiting to get scheduled, or be
-blocked by a channel.
-
-Scheduling is done in strictly round-robin manner. A blocked tasklet
-is removed from the scheduling queue and will be reinserted when it
-becomes unblocked.
-
-Example
-~~~~~~~
-
-Here is a many-producers many-consumers example, where any consumer can
-process the result of any producer.  For this situation we set up a
-single channel where all producer send, and on which all consumers
-wait::
-
-    def producer(chan):
-        while True:
-            chan.send(...next value...)
-
-    def consumer(chan):
-        while True:
-            x = chan.receive()
-            ...do something with x...
-
-    # Set up the N producer and M consumer tasklets
-    common_channel = stackless.channel()
-    for i in range(N):
-        stackless.tasklet(producer, common_channel)()
-    for i in range(M):
-        stackless.tasklet(consumer, common_channel)()
-
-    # Run it all
-    stackless.run()
-
-Each item sent over the channel is received by one of the waiting
-consumers; which one is not specified.  The producers block until their
-item is consumed: the channel is not a queue, but rather a meeting point
-which causes tasklets to block until both a consumer and a producer are
-ready.  In practice, the reason for having several consumers receiving
-on a single channel is that some of the consumers can be busy in other
-ways part of the time.  For example, each consumer might receive a
-database request, process it, and send the result to a further channel
-before it asks for the next request.  In this situation, further
-requests can still be received by other consumers.
+    class X:
+        @generator
+        def f(self, cont, a, b):
+            ...
 
 
 Greenlets
 +++++++++
 
-A Greenlet is a kind of primitive Tasklet with a lower-level interface
-and with exact control over the execution order.  Greenlets are similar
-to Coroutines, with a slightly different interface: greenlets put more
-emphasis on a tree structure.  The various greenlets of a program form a
-precise tree, which fully determines their order of execution.
+Greenlets are implemented on top of continulets in `lib_pypy/greenlet.py`_.
+See the official `documentation of the greenlets`_.
 
-For usage reference, see the `documentation of the greenlets`_.
-The PyPy interface is identical.  You should use ``greenlet.greenlet``
-instead of ``stackless.greenlet`` directly, because the greenlet library
-can give you the latter when you ask for the former on top of PyPy.
+Note that unlike the CPython greenlets, this version does not suffer
+from GC issues: if the program "forgets" an unfinished greenlet, it will
+always be collected at the next garbage collection.
 
-PyPy's greenlets do not suffer from the cyclic GC limitation that the
-CPython greenlets have: greenlets referencing each other via local
-variables tend to leak on top of CPython (where it is mostly impossible
-to do the right thing).  It works correctly on top of PyPy.
 
+Unimplemented features
+++++++++++++++++++++++
 
-Coroutine Pickling
-++++++++++++++++++
+The following features (present in some past Stackless version of PyPy)
+are for the time being not supported any more:
 
-Coroutines and tasklets can be pickled and unpickled, i.e. serialized to
-a string of bytes for the purpose of storage or transmission.  This
-allows "live" coroutines or tasklets to be made persistent, moved to
-other machines, or cloned in any way.  The standard ``pickle`` module
-works with coroutines and tasklets (at least in a translated ``pypy-c``;
-unpickling live coroutines or tasklets cannot be easily implemented on
-top of CPython).
+* Tasklets and channels (currently ``stackless.py`` seems to import,
+  but you have tasklets on top of coroutines on top of greenlets on
+  top of continulets on top of stacklets, and it's probably not too
+  hard to cut two of these levels by adapting ``stackless.py`` to
+  use continulets directly)
 
-To be able to achieve this result, we have to consider many objects that
-are not normally pickleable in CPython.  Here again, the `Stackless
-Python`_ implementation has paved the way, and we follow the same
-general design decisions: simple internal objects like bound method
-objects and various kinds of iterators are supported; frame objects can
-be fully pickled and unpickled
-(by serializing a reference to the bytecode they are
-running in addition to all the local variables).  References to globals
-and modules are pickled by name, similarly to references to functions
-and classes in the traditional CPython ``pickle``.
+* Coroutines (could be rewritten at app-level)
 
-The "magic" part of this process is the implementation of the unpickling
-of a chain of frames.  The Python interpreter of PyPy uses
-interpreter-level recursion to represent application-level calls.  The
-reason for this is that it tremendously simplifies the implementation of
-the interpreter itself.  Indeed, in Python, almost any operation can
-potentially result in a non-tail-recursive call to another Python
-function.  This makes writing a non-recursive interpreter extremely
-tedious; instead, we rely on lower-level transformations during the
-translation process to control this recursion.  This is the `Stackless
-Transform`_, which is at the heart of PyPy's support for stackless-style
-concurrency.
+* Pickling and unpickling continulets (*)
 
-At any point in time, a chain of Python-level frames corresponds to a
-chain of interpreter-level frames (e.g. C frames in pypy-c), where each
-single Python-level frame corresponds to one or a few interpreter-level
-frames - depending on the length of the interpreter-level call chain
-from one bytecode evaluation loop to the next (recursively invoked) one.
+* Continuing execution of a continulet in a different thread (*)
 
-This means that it is not sufficient to simply create a chain of Python
-frame objects in the heap of a process before we can resume execution of
-these newly built frames.  We must recreate a corresponding chain of
-interpreter-level frames.  To this end, we have inserted a few *named
-resume points* (see 3.2.4, in `D07.1 Massive Parallelism and Translation Aspects`_) in the Python interpreter of PyPy.  This is the
-motivation for implementing the interpreter-level primitives
-``resume_state_create()`` and ``resume_state_invoke()``, the powerful
-interface that allows an RPython program to artificially rebuild a chain
-of calls in a reflective way, completely from scratch, and jump to it.
+* Automatic unlimited stack (must be emulated__ so far)
 
-.. _`D07.1 Massive Parallelism and Translation Aspects`: http://codespeak.net/pypy/extradoc/eu-report/D07.1_Massive_Parallelism_and_Translation_Aspects-2007-02-28.pdf
+* Support for CPUs other than x86 and x86-64
 
-Example
-~~~~~~~
+* The app-level ``f_back`` field of frames crossing continulet boundaries
+  is None for now, unlike what I explain in the theoretical overview
+  above.  It mostly means that in a ``pdb.set_trace()`` you cannot go
+  ``up`` past continulet boundaries.  This could be fixed.
 
-(See `demo/pickle_coroutine.py`_ for the complete source of this demo.)
+.. __: `recursion depth limit`_
 
-Consider a program which contains a part performing a long-running
-computation::
+(*) Pickling, as well as changing threads, could be implemented by using
+a "soft" stack switching mode again.  We would get either "hard" or
+"soft" switches, similarly to Stackless Python 3rd version: you get a
+"hard" switch (like now) when the C stack contains non-trivial C frames
+to save, and a "soft" switch (like previously) when it contains only
+simple calls from Python to Python.  Soft-switched continulets would
+also consume a bit less RAM, and the switch might be a bit faster too
+(unsure about that; what is the Stackless Python experience?).
 
-    def ackermann(x, y):
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
 
-By using pickling, we can save the state of the computation while it is
-running, for the purpose of restoring it later and continuing the
-computation at another time or on a different machine.  However,
-pickling does not produce a whole-program dump: it can only pickle
-individual coroutines.  This means that the computation should be
-started in its own coroutine::
+Recursion depth limit
++++++++++++++++++++++
 
-    # Make a coroutine that will run 'ackermann(3, 8)'
-    coro = coroutine()
-    coro.bind(ackermann, 3, 8)
+You can use continulets to emulate the infinite recursion depth present
+in Stackless Python and in stackless-enabled older versions of PyPy.
 
-    # Now start running the coroutine
-    result = coro.switch()
+The trick is to start a continulet "early", i.e. when the recursion
+depth is very low, and switch to it "later", i.e. when the recursion
+depth is high.  Example::
 
-The coroutine itself must switch back to the main program when it needs
-to be interrupted (we can only pickle suspended coroutines).  Due to
-current limitations this requires an explicit check in the
-``ackermann()`` function::
+    from _continuation import continulet
 
-    def ackermann(x, y):
-        if interrupt_flag:      # test a global flag
-            main.switch()       # and switch back to 'main' if it is set
-        if x == 0:
-            return y + 1
-        if y == 0:
-            return ackermann(x - 1, 1)
-        return ackermann(x - 1, ackermann(x, y - 1))
+    def invoke(_, callable, arg):
+        return callable(arg)
 
-The global ``interrupt_flag`` would be set for example by a timeout, or
-by a signal handler reacting to Ctrl-C, etc.  It causes the coroutine to
-transfer control back to the main program.  The execution comes back
-just after the line ``coro.switch()``, where we can pickle the coroutine
-if necessary::
+    def bootstrap(c):
+        # this loop runs forever, at a very low recursion depth
+        callable, arg = c.switch()
+        while True:
+            # start a new continulet from here, and switch to
+            # it using an "exchange", i.e. a switch with to=.
+            to = continulet(invoke, callable, arg)
+            callable, arg = c.switch(to=to)
 
-    if not coro.is_alive:
-        print "finished; the result is:", result
-    else:
-        # save the state of the suspended coroutine
-        f = open('demo.pickle', 'w')
-        pickle.dump(coro, f)
-        f.close()
+    c = continulet(bootstrap)
+    c.switch()
 
-The process can then stop.  At any later time, or on another machine,
-we can reload the file and restart the coroutine with::
 
-    f = open('demo.pickle', 'r')
-    coro = pickle.load(f)
-    f.close()
-    result = coro.switch()
+    def recursive(n):
+        if n == 0:
+            return ("ok", n)
+        if n % 200 == 0:
+            prev = c.switch((recursive, n - 1))
+        else:
+            prev = recursive(n - 1)
+        return (prev[0], prev[1] + 1)
 
-Limitations
-~~~~~~~~~~~
+    print recursive(999999)     # prints ('ok', 999999)
 
-Coroutine pickling is subject to some limitations.  First of all, it is
-not a whole-program "memory dump".  It means that only the "local" state
-of a coroutine is saved.  The local state is defined to include the
-chain of calls and the local variables, but not for example the value of
-any global variable.
+Note that if you press Ctrl-C while running this example, the traceback
+will be built with *all* recursive() calls so far, even if this is more
+than the number that can possibly fit in the C stack.  These frames are
+"overlapping" each other in the sense of the C stack; more precisely,
+they are copied out of and into the C stack as needed.
 
-As in normal Python, the pickle will not include any function object's
-code, any class definition, etc., but only references to functions and
-classes.  Unlike normal Python, the pickle contains frames.  A pickled
-frame stores a bytecode index, representing the current execution
-position.  This means that the user program cannot be modified *at all*
-between pickling and unpickling!
+(The example above also makes use of the following general "guideline"
+to help newcomers write continulets: in ``bootstrap(c)``, only call
+methods on ``c``, not on another continulet object.  That's why we wrote
+``c.switch(to=to)`` and not ``to.switch()``, which would mess up the
+state.  This is however just a guideline; in general we would recommend
+using other interfaces like genlets and greenlets.)
 
-On the other hand, the pickled data is fairly independent from the
-platform and from the PyPy version.
 
-Pickling/unpickling fails if the coroutine is suspended in a state that
-involves Python frames which were *indirectly* called.  To define this
-more precisely, a Python function can issue a regular function or method
-call to invoke another Python function - this is a *direct* call and can
-be pickled and unpickled.  But there are many ways to invoke a Python
-function indirectly.  For example, most operators can invoke a special
-method ``__xyz__()`` on a class, various built-in functions can call
-back Python functions, signals can invoke signal handlers, and so on.
-These cases are not supported yet.
-
-
-Composability
-+++++++++++++
+Theory of composability
++++++++++++++++++++++++
 
 Although the concept of coroutines is far from new, they have not been
 generally integrated into mainstream languages, or only in limited form
 (like generators in Python and iterators in C#).  We can argue that a
 possible reason for that is that they do not scale well when a program's
 complexity increases: they look attractive in small examples, but the
-models that require explicit switching, by naming the target coroutine,
-do not compose naturally.  This means that a program that uses
-coroutines for two unrelated purposes may run into conflicts caused by
-unexpected interactions.
+models that require explicit switching, for example by naming the target
+coroutine, do not compose naturally.  This means that a program that
+uses coroutines for two unrelated purposes may run into conflicts caused
+by unexpected interactions.
 
 To illustrate the problem, consider the following example (simplified
-code; see the full source in
-`pypy/module/_stackless/test/test_composable_coroutine.py`_).  First, a
-simple usage of coroutine::
+code using a theoretical ``coroutine`` class).  First, a simple usage of
+coroutine::
 
     main_coro = coroutine.getcurrent()    # the main (outer) coroutine
     data = []
@@ -530,74 +383,35 @@
 main coroutine, which confuses the ``generator_iterator.next()`` method
 (it gets resumed, but not as a result of a call to ``Yield()``).
 
-As part of trying to combine multiple different paradigms into a single
-application-level module, we have built a way to solve this problem.
-The idea is to avoid the notion of a single, global "main" coroutine (or
-a single main greenlet, or a single main tasklet).  Instead, each
-conceptually separated user of one of these concurrency interfaces can
-create its own "view" on what the main coroutine/greenlet/tasklet is,
-which other coroutine/greenlet/tasklets there are, and which of these is
-the currently running one.  Each "view" is orthogonal to the others.  In
-particular, each view has one (and exactly one) "current"
-coroutine/greenlet/tasklet at any point in time.  When the user switches
-to a coroutine/greenlet/tasklet, it implicitly means that he wants to
-switch away from the current coroutine/greenlet/tasklet *that belongs to
-the same view as the target*.
+Thus the notion of coroutine is *not composable*.  By contrast, the
+primitive notion of continulets is composable: if you build two
+different interfaces on top of it, or have a program that uses the same
+interface twice in two parts, then assuming that both parts work
+independently, the composition of the two parts still works.
 
-The precise application-level interface has not been fixed yet; so far,
-"views" in the above sense are objects of the type
-``stackless.usercostate``.  The above two examples can be rewritten in
-the following way::
+A full proof of that claim would require careful definitions, but let us
+just claim that this fact is true because of the following observation:
+the API of continulets is such that, when doing a ``switch()``, it
+requires the program to have some continulet to explicitly operate on.
+It shuffles the current continuation with the continuation stored in
+that continulet, but has no effect outside.  So if a part of a program
+has a continulet object, and does not expose it as a global, then the
+rest of the program cannot accidentally influence the continuation
+stored in that continulet object.
 
-    producer_view = stackless.usercostate()   # a local view
-    main_coro = producer_view.getcurrent()    # the main (outer) coroutine
-    ...
-    producer_coro = producer_view.newcoroutine()
-    ...
-
-and::
-
-    generators_view = stackless.usercostate()
-
-    def generator(f):
-        def wrappedfunc(*args, **kwds):
-            g = generators_view.newcoroutine(generator_iterator)
-            ...
-
-            ...generators_view.getcurrent()...
-
-Then the composition ``grab_values()`` works as expected, because the
-two views are independent.  The coroutine captured as ``self.caller`` in
-the ``generator_iterator.next()`` method is the main coroutine of the
-``generators_view``.  It is no longer the same object as the main
-coroutine of the ``producer_view``, so when ``data_producer()`` issues
-the following command::
-
-    main_coro.switch()
-
-the control flow cannot accidentally jump back to
-``generator_iterator.next()``.  In other words, from the point of view
-of ``producer_view``, the function ``grab_next_value()`` always runs in
-its main coroutine ``main_coro`` and the function ``data_producer`` in
-its coroutine ``producer_coro``.  This is the case independently of
-which ``generators_view``-based coroutine is the current one when
-``grab_next_value()`` is called.
-
-Only code that has explicit access to the ``producer_view`` or its
-coroutine objects can perform switches that are relevant for the
-generator code.  If the view object and the coroutine objects that share
-this view are all properly encapsulated inside the generator logic, no
-external code can accidentally temper with the expected control flow any
-longer.
-
-In conclusion: we will probably change the app-level interface of PyPy's
-stackless module in the future to not expose coroutines and greenlets at
-all, but only views.  They are not much more difficult to use, and they
-scale automatically to larger programs.
+In other words, if we regard the continulet object as being essentially
+a modifiable ``f_back``, then it is just a link between the frame of
+``callable()`` and the parent frame --- and it cannot be arbitrarily
+changed by unrelated code, as long as it doesn't explicitly manipulate
+the continulet object.  Typically, both the frame of ``callable()``
+(commonly a local function) and its parent frame (which is the frame
+that switched to it) belong to the same class or module; so from that
+point of view the continulet is a purely local link between two local
+frames.  It doesn't make sense to have a concept that allows this link
+to be manipulated from outside.
 
 
 .. _`Stackless Python`: http://www.stackless.com
 .. _`documentation of the greenlets`: http://packages.python.org/greenlet/
-.. _`Stackless Transform`: translation.html#the-stackless-transform
 
 .. include:: _ref.txt
diff --git a/pypy/doc/translation.rst b/pypy/doc/translation.rst
--- a/pypy/doc/translation.rst
+++ b/pypy/doc/translation.rst
@@ -552,14 +552,15 @@
 
 The stackless transform converts functions into a form that knows how
 to save the execution point and active variables into a heap structure
-and resume execution at that point.  This is used to implement
+and resume execution at that point.  This was used to implement
 coroutines as an RPython-level feature, which in turn are used to
-implement `coroutines, greenlets and tasklets`_ as an application
+implement coroutines, greenlets and tasklets as an application
 level feature for the Standard Interpreter.
 
-Enable the stackless transformation with :config:`translation.stackless`.
+The stackless transformation has been deprecated and is no longer
+available in trunk.  It has been replaced with continulets_.
 
-.. _`coroutines, greenlets and tasklets`: stackless.html
+.. _continulets: stackless.html
 
 .. _`preparing the graphs for source generation`:
 
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -626,9 +626,9 @@
             self.default_compiler = compiler
             return compiler
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         "Create an empty PyFrame suitable for this code object."
-        return self.FrameClass(self, code, w_globals, closure)
+        return self.FrameClass(self, code, w_globals, outer_func)
 
     def allocate_lock(self):
         """Return an interp-level Lock object if threads are enabled,
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -30,7 +30,7 @@
     can_change_code = True
     _immutable_fields_ = ['code?',
                           'w_func_globals?',
-                          'closure?',
+                          'closure?[*]',
                           'defs_w?[*]',
                           'name?']
 
@@ -96,7 +96,7 @@
             assert isinstance(code, PyCode)
             if nargs < 5:
                 new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
                 for i in funccallunrolling:
                     if i < nargs:
                         new_frame.locals_stack_w[i] = args_w[i]
@@ -156,7 +156,7 @@
     def _flat_pycall(self, code, nargs, frame):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
@@ -167,7 +167,7 @@
     def _flat_pycall_defaults(self, code, nargs, frame, defs_to_load):
         # code is a PyCode
         new_frame = self.space.createframe(code, self.w_func_globals,
-                                                   self.closure)
+                                                   self)
         for i in xrange(nargs):
             w_arg = frame.peekvalue(nargs-1-i)
             new_frame.locals_stack_w[i] = w_arg
diff --git a/pypy/interpreter/nestedscope.py b/pypy/interpreter/nestedscope.py
--- a/pypy/interpreter/nestedscope.py
+++ b/pypy/interpreter/nestedscope.py
@@ -8,7 +8,7 @@
 
 class Cell(Wrappable):
     "A simple container for a wrapped value."
-    
+
     def __init__(self, w_value=None):
         self.w_value = w_value
 
@@ -90,32 +90,33 @@
     #     variables coming from a parent function in which i'm nested
     # 'closure' is a list of Cell instances: the received free vars.
 
-    cells = None
-
     @jit.unroll_safe
-    def initialize_frame_scopes(self, closure, code):
-        super_initialize_frame_scopes(self, closure, code)
+    def initialize_frame_scopes(self, outer_func, code):
+        super_initialize_frame_scopes(self, outer_func, code)
         ncellvars = len(code.co_cellvars)
         nfreevars = len(code.co_freevars)
         if not nfreevars:
             if not ncellvars:
+                self.cells = []
                 return            # no self.cells needed - fast path
-            if closure is None:
-                closure = []
-        elif closure is None:
+        elif outer_func is None:
             space = self.space
             raise OperationError(space.w_TypeError,
                                  space.wrap("directly executed code object "
                                             "may not contain free variables"))
-        if len(closure) != nfreevars:
+        if outer_func and outer_func.closure:
+            closure_size = len(outer_func.closure)
+        else:
+            closure_size = 0
+        if closure_size != nfreevars:
             raise ValueError("code object received a closure with "
                                  "an unexpected number of free variables")
         self.cells = [None] * (ncellvars + nfreevars)
         for i in range(ncellvars):
             self.cells[i] = Cell()
         for i in range(nfreevars):
-            self.cells[i + ncellvars] = closure[i]
-    
+            self.cells[i + ncellvars] = outer_func.closure[i]
+
     def _getcells(self):
         return self.cells
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -198,7 +198,7 @@
 
     def funcrun(self, func, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
@@ -211,7 +211,7 @@
 
     def funcrun_obj(self, func, w_obj, args):
         frame = self.space.createframe(self, func.w_func_globals,
-                                  func.closure)
+                                  func)
         sig = self._signature
         # speed hack
         fresh_frame = jit.hint(frame, access_directly=True,
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -51,7 +51,7 @@
     is_being_profiled        = False
     escaped                  = False  # see mark_as_escaped()
 
-    def __init__(self, space, code, w_globals, closure):
+    def __init__(self, space, code, w_globals, outer_func):
         if not we_are_translated():
             assert type(self) in (space.FrameClass, CPythonFrame), (
                 "use space.FrameClass(), not directly PyFrame()")
@@ -70,7 +70,7 @@
             self.builtin = space.builtin.pick_builtin(w_globals)
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
-        self.initialize_frame_scopes(closure, code)
+        self.initialize_frame_scopes(outer_func, code)
         self.f_lineno = code.co_firstlineno
 
     def mark_as_escaped(self):
@@ -117,8 +117,8 @@
             return self.builtin
         else:
             return self.space.builtin
-        
-    def initialize_frame_scopes(self, closure, code): 
+
+    def initialize_frame_scopes(self, outer_func, code):
         # regular functions always have CO_OPTIMIZED and CO_NEWLOCALS.
         # class bodies only have CO_NEWLOCALS.
         # CO_NEWLOCALS: make a locals dict unless optimized is also set
@@ -385,7 +385,11 @@
         
         # do not use the instance's __init__ but the base's, because we set
         # everything like cells from here
-        PyFrame.__init__(self, space, pycode, w_globals, closure)
+        # XXX hack
+        from pypy.interpreter.function import Function
+        outer_func = Function(space, None, closure=closure,
+                             forcename="fake")
+        PyFrame.__init__(self, space, pycode, w_globals, outer_func)
         f_back = space.interp_w(PyFrame, w_f_back, can_be_None=True)
         new_frame.f_backref = jit.non_virtual_ref(f_back)
 
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -704,7 +704,7 @@
 class TestPassThroughArguments_CALL_METHOD(TestPassThroughArguments):
 
     def setup_class(cls):
-        space = gettestobjspace(usemodules=('_stackless',), **{
+        space = gettestobjspace(usemodules=('itertools',), **{
             "objspace.opcodes.CALL_METHOD": True
             })
         cls.space = space
diff --git a/pypy/jit/backend/llgraph/runner.py b/pypy/jit/backend/llgraph/runner.py
--- a/pypy/jit/backend/llgraph/runner.py
+++ b/pypy/jit/backend/llgraph/runner.py
@@ -25,13 +25,14 @@
 class Descr(history.AbstractDescr):
 
     def __init__(self, ofs, typeinfo, extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         self.ofs = ofs
         self.typeinfo = typeinfo
         self.extrainfo = extrainfo
         self.name = name
         self.arg_types = arg_types
         self.count_fields_if_immut = count_fields_if_immut
+        self.ffi_flags = ffi_flags
 
     def get_arg_types(self):
         return self.arg_types
@@ -67,6 +68,9 @@
     def count_fields_if_immutable(self):
         return self.count_fields_if_immut
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
     def __lt__(self, other):
         raise TypeError("cannot use comparison on Descrs")
     def __le__(self, other):
@@ -114,14 +118,14 @@
         return False
 
     def getdescr(self, ofs, typeinfo='?', extrainfo=None, name=None,
-                 arg_types=None, count_fields_if_immut=-1):
+                 arg_types=None, count_fields_if_immut=-1, ffi_flags=0):
         key = (ofs, typeinfo, extrainfo, name, arg_types,
-               count_fields_if_immut)
+               count_fields_if_immut, ffi_flags)
         try:
             return self._descrs[key]
         except KeyError:
             descr = Descr(ofs, typeinfo, extrainfo, name, arg_types,
-                          count_fields_if_immut)
+                          count_fields_if_immut, ffi_flags)
             self._descrs[key] = descr
             return descr
 
@@ -312,7 +316,7 @@
         token = history.getkind(getattr(S, fieldname))
         return self.getdescr(ofs, token[0], name=fieldname)
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         arg_types = []
         for ARG in ARGS:
             token = history.getkind(ARG)
@@ -326,7 +330,7 @@
         return self.getdescr(0, token[0], extrainfo=extrainfo,
                              arg_types=''.join(arg_types))
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport.ffisupport import get_ffi_type_kind
         from pypy.jit.backend.llsupport.ffisupport import UnsupportedKind
         arg_types = []
@@ -339,7 +343,8 @@
         except UnsupportedKind:
             return None
         return self.getdescr(0, reskind, extrainfo=extrainfo,
-                             arg_types=''.join(arg_types))
+                             arg_types=''.join(arg_types),
+                             ffi_flags=ffi_flags)
 
 
     def grab_exc_value(self):
@@ -522,7 +527,7 @@
         return FieldDescr.new(T1, fieldname)
 
     @staticmethod
-    def calldescrof(FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(FUNC, ARGS, RESULT, extrainfo):
         return StaticMethDescr.new(FUNC, ARGS, RESULT, extrainfo)
 
     @staticmethod
diff --git a/pypy/jit/backend/llsupport/descr.py b/pypy/jit/backend/llsupport/descr.py
--- a/pypy/jit/backend/llsupport/descr.py
+++ b/pypy/jit/backend/llsupport/descr.py
@@ -260,10 +260,12 @@
     _clsname = ''
     loop_token = None
     arg_classes = ''     # <-- annotation hack
+    ffi_flags = 0
 
-    def __init__(self, arg_classes, extrainfo=None):
+    def __init__(self, arg_classes, extrainfo=None, ffi_flags=0):
         self.arg_classes = arg_classes    # string of "r" and "i" (ref/int)
         self.extrainfo = extrainfo
+        self.ffi_flags = ffi_flags
 
     def __repr__(self):
         res = '%s(%s)' % (self.__class__.__name__, self.arg_classes)
@@ -284,6 +286,13 @@
     def get_extra_info(self):
         return self.extrainfo
 
+    def get_ffi_flags(self):
+        return self.ffi_flags
+
+    def get_call_conv(self):
+        from pypy.rlib.clibffi import get_call_conv
+        return get_call_conv(self.ffi_flags, True)
+
     def get_arg_types(self):
         return self.arg_classes
 
@@ -391,8 +400,8 @@
     """
     _clsname = 'DynamicIntCallDescr'
 
-    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None):
-        BaseIntCallDescr.__init__(self, arg_classes, extrainfo)
+    def __init__(self, arg_classes, result_size, result_sign, extrainfo=None, ffi_flags=0):
+        BaseIntCallDescr.__init__(self, arg_classes, extrainfo, ffi_flags)
         assert isinstance(result_sign, bool)
         self._result_size = chr(result_size)
         self._result_sign = result_sign
diff --git a/pypy/jit/backend/llsupport/ffisupport.py b/pypy/jit/backend/llsupport/ffisupport.py
--- a/pypy/jit/backend/llsupport/ffisupport.py
+++ b/pypy/jit/backend/llsupport/ffisupport.py
@@ -8,7 +8,7 @@
 class UnsupportedKind(Exception):
     pass
 
-def get_call_descr_dynamic(cpu, ffi_args, ffi_result, extrainfo=None):
+def get_call_descr_dynamic(cpu, ffi_args, ffi_result, extrainfo=None, ffi_flags=0):
     """Get a call descr: the types of result and args are represented by
     rlib.libffi.types.*"""
     try:
@@ -20,18 +20,24 @@
     if reskind == history.INT:
         size = intmask(ffi_result.c_size)
         signed = is_ffi_type_signed(ffi_result)
-        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo)
+        return DynamicIntCallDescr(arg_classes, size, signed, extrainfo,
+                                   ffi_flags=ffi_flags)
     elif reskind == history.REF:
-        return  NonGcPtrCallDescr(arg_classes, extrainfo)
+        return  NonGcPtrCallDescr(arg_classes, extrainfo,
+                                  ffi_flags=ffi_flags)
     elif reskind == history.FLOAT:
-        return FloatCallDescr(arg_classes, extrainfo)
+        return FloatCallDescr(arg_classes, extrainfo,
+                              ffi_flags=ffi_flags)
     elif reskind == history.VOID:
-        return VoidCallDescr(arg_classes, extrainfo)
+        return VoidCallDescr(arg_classes, extrainfo,
+                             ffi_flags=ffi_flags)
     elif reskind == 'L':
-        return LongLongCallDescr(arg_classes, extrainfo)
+        return LongLongCallDescr(arg_classes, extrainfo,
+                                 ffi_flags=ffi_flags)
     elif reskind == 'S':
         SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
-        return SingleFloatCallDescr(arg_classes, extrainfo)
+        return SingleFloatCallDescr(arg_classes, extrainfo,
+                                    ffi_flags=ffi_flags)
     assert False
 
 def get_ffi_type_kind(cpu, ffi_type):
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -366,36 +366,92 @@
 
     def add_jit2gc_hooks(self, jit2gc):
         #
-        def collect_jit_stack_root(callback, gc, addr):
-            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
-                # common case
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                return WORD
-            else:
-                # case of a MARKER followed by an assembler stack frame
-                follow_stack_frame_of_assembler(callback, gc, addr)
-                return 2 * WORD
+        # ---------------
+        # This is used to enumerate the shadowstack in the presence
+        # of the JIT.  It is also used by the stacklet support in
+        # rlib/_stacklet_shadowstack.  That's why it is written as
+        # an iterator that can also be used with a custom_trace.
         #
-        def follow_stack_frame_of_assembler(callback, gc, addr):
-            frame_addr = addr.signed[1]
-            addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
-            force_index = addr.signed[0]
-            if force_index < 0:
-                force_index = ~force_index
-            callshape = self._callshapes[force_index]
-            n = 0
-            while True:
-                offset = rffi.cast(lltype.Signed, callshape[n])
-                if offset == 0:
-                    break
-                addr = llmemory.cast_int_to_adr(frame_addr + offset)
-                if gc.points_to_valid_gc_object(addr):
-                    callback(gc, addr)
-                n += 1
+        class RootIterator:
+            _alloc_flavor_ = "raw"
+
+            def next(iself, gc, next, range_highest):
+                # Return the "next" valid GC object's address.  This usually
+                # means just returning "next", until we reach "range_highest",
+                # except that we are skipping NULLs.  If "next" contains a
+                # MARKER instead, then we go into JIT-frame-lookup mode.
+                #
+                while True:
+                    #
+                    # If we are not iterating right now in a JIT frame
+                    if iself.frame_addr == 0:
+                        #
+                        # Look for the next shadowstack address that
+                        # contains a valid pointer
+                        while next != range_highest:
+                            if next.signed[0] == self.MARKER:
+                                break
+                            if gc.points_to_valid_gc_object(next):
+                                return next
+                            next += llmemory.sizeof(llmemory.Address)
+                        else:
+                            return llmemory.NULL     # done
+                        #
+                        # It's a JIT frame.  Save away 'next' for later, and
+                        # go into JIT-frame-exploring mode.
+                        next += llmemory.sizeof(llmemory.Address)
+                        frame_addr = next.signed[0]
+                        iself.saved_next = next
+                        iself.frame_addr = frame_addr
+                        addr = llmemory.cast_int_to_adr(frame_addr +
+                                                        self.force_index_ofs)
+                        addr = iself.translateptr(iself.context, addr)
+                        force_index = addr.signed[0]
+                        if force_index < 0:
+                            force_index = ~force_index
+                        # NB: the next line reads a still-alive _callshapes,
+                        # because we ensure that just before we called this
+                        # piece of assembler, we put on the (same) stack a
+                        # pointer to a loop_token that keeps the force_index
+                        # alive.
+                        callshape = self._callshapes[force_index]
+                    else:
+                        # Continuing to explore this JIT frame
+                        callshape = iself.callshape
+                    #
+                    # 'callshape' points to the next INT of the callshape.
+                    # If it's zero we are done with the JIT frame.
+                    while rffi.cast(lltype.Signed, callshape[0]) != 0:
+                        #
+                        # Non-zero: it's an offset inside the JIT frame.
+                        # Read it and increment 'callshape'.
+                        offset = rffi.cast(lltype.Signed, callshape[0])
+                        callshape = lltype.direct_ptradd(callshape, 1)
+                        addr = llmemory.cast_int_to_adr(iself.frame_addr +
+                                                        offset)
+                        addr = iself.translateptr(iself.context, addr)
+                        if gc.points_to_valid_gc_object(addr):
+                            #
+                            # The JIT frame contains a valid GC pointer at
+                            # this address (as opposed to NULL).  Save
+                            # 'callshape' for the next call, and return the
+                            # address.
+                            iself.callshape = callshape
+                            return addr
+                    #
+                    # Restore the saved 'next' and loop back to the start.
+                    iself.frame_addr = 0
+                    next = iself.saved_next
+                    next += llmemory.sizeof(llmemory.Address)
+
+        # ---------------
         #
+        root_iterator = RootIterator()
+        root_iterator.frame_addr = 0
+        root_iterator.context = llmemory.NULL
+        root_iterator.translateptr = lambda context, addr: addr
         jit2gc.update({
-            'rootstackhook': collect_jit_stack_root,
+            'root_iterator': root_iterator,
             })
 
     def initialize(self):
@@ -550,7 +606,7 @@
             has_finalizer = bool(tid & (1<<llgroup.HALFSHIFT))
             check_typeid(type_id)
             res = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                                  type_id, size, True,
+                                                  type_id, size,
                                                   has_finalizer, False)
             # In case the operation above failed, we are returning NULL
             # from this function to assembler.  There is also an RPython
@@ -575,7 +631,7 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 type_id, num_elem, self.array_basesize, itemsize,
-                self.array_length_ofs, True)
+                self.array_length_ofs)
         self.malloc_array = malloc_array
         self.GC_MALLOC_ARRAY = lltype.Ptr(lltype.FuncType(
             [lltype.Signed] * 3, llmemory.GCREF))
@@ -591,12 +647,12 @@
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 str_type_id, length, str_basesize, str_itemsize,
-                str_ofs_length, True)
+                str_ofs_length)
         def malloc_unicode(length):
             return llop1.do_malloc_varsize_clear(
                 llmemory.GCREF,
                 unicode_type_id, length, unicode_basesize,unicode_itemsize,
-                unicode_ofs_length, True)
+                unicode_ofs_length)
         self.malloc_str = malloc_str
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
@@ -622,7 +678,7 @@
             # also use it to allocate varsized objects.  The tid
             # and possibly the length are both set afterward.
             gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
-                                        0, size, True, False, False)
+                                        0, size, False, False)
             return rffi.cast(lltype.Signed, gcref)
         self.malloc_slowpath = malloc_slowpath
         self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed], lltype.Signed)
diff --git a/pypy/jit/backend/llsupport/llmodel.py b/pypy/jit/backend/llsupport/llmodel.py
--- a/pypy/jit/backend/llsupport/llmodel.py
+++ b/pypy/jit/backend/llsupport/llmodel.py
@@ -254,13 +254,13 @@
         return ofs, size, sign
     unpack_arraydescr_size._always_inline_ = True
 
-    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo=None):
+    def calldescrof(self, FUNC, ARGS, RESULT, extrainfo):
         return get_call_descr(self.gc_ll_descr, ARGS, RESULT, extrainfo)
 
-    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo=None):
+    def calldescrof_dynamic(self, ffi_args, ffi_result, extrainfo, ffi_flags):
         from pypy.jit.backend.llsupport import ffisupport
         return ffisupport.get_call_descr_dynamic(self, ffi_args, ffi_result,
-                                                 extrainfo)
+                                                 extrainfo, ffi_flags)
 
     def get_overflow_error(self):
         ovf_vtable = self.cast_adr_to_int(self._ovf_error_vtable)
diff --git a/pypy/jit/backend/llsupport/test/test_ffisupport.py b/pypy/jit/backend/llsupport/test/test_ffisupport.py
--- a/pypy/jit/backend/llsupport/test/test_ffisupport.py
+++ b/pypy/jit/backend/llsupport/test/test_ffisupport.py
@@ -13,17 +13,19 @@
 
 def test_call_descr_dynamic():
     args = [types.sint, types.pointer]
-    descr = get_call_descr_dynamic(FakeCPU(), args, types.sint)
+    descr = get_call_descr_dynamic(FakeCPU(), args, types.sint, ffi_flags=42)
     assert isinstance(descr, DynamicIntCallDescr)
     assert descr.arg_classes == 'ii'
+    assert descr.get_ffi_flags() == 42
 
     args = [types.sint, types.double, types.pointer]
     descr = get_call_descr_dynamic(FakeCPU(), args, types.void)
     assert descr is None    # missing floats
     descr = get_call_descr_dynamic(FakeCPU(supports_floats=True),
-                                   args, types.void)
+                                   args, types.void, ffi_flags=43)
     assert isinstance(descr, VoidCallDescr)
     assert descr.arg_classes == 'ifi'
+    assert descr.get_ffi_flags() == 43
 
     descr = get_call_descr_dynamic(FakeCPU(), [], types.sint8)
     assert isinstance(descr, DynamicIntCallDescr)
@@ -39,14 +41,16 @@
         descr = get_call_descr_dynamic(FakeCPU(), [], types.slonglong)
         assert descr is None   # missing longlongs
         descr = get_call_descr_dynamic(FakeCPU(supports_longlong=True),
-                                       [], types.slonglong)
+                                       [], types.slonglong, ffi_flags=43)
         assert isinstance(descr, LongLongCallDescr)
+        assert descr.get_ffi_flags() == 43
     else:
         assert types.slonglong is types.slong
 
     descr = get_call_descr_dynamic(FakeCPU(), [], types.float)
     assert descr is None   # missing singlefloats
     descr = get_call_descr_dynamic(FakeCPU(supports_singlefloats=True),
-                                   [], types.float)
+                                   [], types.float, ffi_flags=44)
     SingleFloatCallDescr = getCallDescrClass(rffi.FLOAT)
     assert isinstance(descr, SingleFloatCallDescr)
+    assert descr.get_ffi_flags() == 44
diff --git a/pypy/jit/backend/llsupport/test/test_gc.py b/pypy/jit/backend/llsupport/test/test_gc.py
--- a/pypy/jit/backend/llsupport/test/test_gc.py
+++ b/pypy/jit/backend/llsupport/test/test_gc.py
@@ -246,9 +246,8 @@
     def __init__(self):
         self.record = []
 
-    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size, can_collect,
+    def do_malloc_fixedsize_clear(self, RESTYPE, type_id, size,
                                   has_finalizer, contains_weakptr):
-        assert can_collect
         assert not contains_weakptr
         p = llmemory.raw_malloc(size)
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
@@ -258,8 +257,7 @@
         return p
 
     def do_malloc_varsize_clear(self, RESTYPE, type_id, length, size,
-                                itemsize, offset_to_length, can_collect):
-        assert can_collect
+                                itemsize, offset_to_length):
         p = llmemory.raw_malloc(size + itemsize * length)
         (p + offset_to_length).signed[0] = length
         p = llmemory.cast_adr_to_ptr(p, RESTYPE)
diff --git a/pypy/jit/backend/test/calling_convention_test.py b/pypy/jit/backend/test/calling_convention_test.py
--- a/pypy/jit/backend/test/calling_convention_test.py
+++ b/pypy/jit/backend/test/calling_convention_test.py
@@ -8,6 +8,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -96,7 +97,8 @@
             FUNC = self.FuncType(funcargs, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             ops = '[%s]\n' % arguments
@@ -148,7 +150,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -190,7 +193,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
@@ -268,7 +272,8 @@
                 else:
                     ARGS.append(lltype.Signed)
             FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+                lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+                EffectInfo.MOST_GENERAL)
             ops = '''
             [%s]
             f99 = call_assembler(%s, descr=called_looptoken)
@@ -337,7 +342,8 @@
             FUNC = self.FuncType(args, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
 
             res = self.execute_operation(rop.CALL,
diff --git a/pypy/jit/backend/test/runner_test.py b/pypy/jit/backend/test/runner_test.py
--- a/pypy/jit/backend/test/runner_test.py
+++ b/pypy/jit/backend/test/runner_test.py
@@ -9,6 +9,7 @@
                                          ConstObj, BoxFloat, ConstFloat)
 from pypy.jit.metainterp.resoperation import ResOperation, rop
 from pypy.jit.metainterp.typesystem import deref
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem import lltype, llmemory, rstr, rffi, rclass
 from pypy.rpython.ootypesystem import ootype
@@ -445,7 +446,8 @@
             return chr(ord(c) + 1)
         FPTR = self.Ptr(self.FuncType([lltype.Char], lltype.Char))
         func_ptr = llhelper(FPTR, func)
-        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char)
+        calldescr = cpu.calldescrof(deref(FPTR), (lltype.Char,), lltype.Char,
+                                    EffectInfo.MOST_GENERAL)
         x = cpu.bh_call_i(self.get_funcbox(cpu, func_ptr).value,
                           calldescr, [ord('A')], None, None)
         assert x == ord('B')
@@ -458,14 +460,15 @@
                                           lltype.Float))
             func_ptr = llhelper(FPTR, func)
             FTP = deref(FPTR)
-            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT)
+            calldescr = cpu.calldescrof(FTP, FTP.ARGS, FTP.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             x = cpu.bh_call_f(self.get_funcbox(cpu, func_ptr).value,
                               calldescr,
                               [42], None, [longlong.getfloatstorage(3.5)])
             assert longlong.getrealfloat(x) == 3.5 - 42
 
     def test_call(self):
-        from pypy.rlib.libffi import types
+        from pypy.rlib.libffi import types, FUNCFLAG_CDECL
 
         def func_int(a, b):
             return a + b
@@ -486,13 +489,16 @@
             FUNC = deref(FPTR)
             funcbox = self.get_funcbox(cpu, func_ptr)
             # first, try it with the "normal" calldescr
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=calldescr)
             assert res.value == 2 * num
             # then, try it with the dynamic calldescr
-            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type)
+            dyn_calldescr = cpu.calldescrof_dynamic([ffi_type, ffi_type], ffi_type,
+                                                    EffectInfo.MOST_GENERAL,
+                                                    ffi_flags=FUNCFLAG_CDECL)
             res = self.execute_operation(rop.CALL,
                                          [funcbox, BoxInt(num), BoxInt(num)],
                                          'int', descr=dyn_calldescr)
@@ -507,7 +513,8 @@
             FUNC = self.FuncType([F] * 7 + [I] * 2 + [F] * 3, F)
             FPTR = self.Ptr(FUNC)
             func_ptr = llhelper(FPTR, func)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = ([boxfloat(.1) for i in range(7)] +
                     [BoxInt(1), BoxInt(2), boxfloat(.2), boxfloat(.3),
@@ -529,7 +536,8 @@
 
         FUNC = self.FuncType([lltype.Signed]*16, lltype.Signed)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         args = range(16)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
@@ -552,7 +560,8 @@
             FPTR = self.Ptr(self.FuncType([TP] * nb_args, TP))
             func_ptr = llhelper(FPTR, func_ints)
             FUNC = deref(FPTR)
-            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                        EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(cpu, func_ptr)
             args = [280-24*i for i in range(nb_args)]
             res = self.execute_operation(rop.CALL,
@@ -566,7 +575,8 @@
 
         FUNC = self.FuncType([lltype.Float, lltype.Float], lltype.Float)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         func_ptr = llhelper(FPTR, func)
         funcbox = self.get_funcbox(self.cpu, func_ptr)
         res = self.execute_operation(rop.CALL, [funcbox, constfloat(1.5),
@@ -1589,7 +1599,8 @@
         '''
         FPTR = lltype.Ptr(lltype.FuncType([lltype.Signed], lltype.Void))
         fptr = llhelper(FPTR, func)
-        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+        calldescr = self.cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                         EffectInfo.MOST_GENERAL)
 
         xtp = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
         xtp.subclassrange_min = 1
@@ -1807,7 +1818,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Void)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1850,7 +1862,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Signed)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1895,7 +1908,8 @@
         FUNC = self.FuncType([lltype.Signed, lltype.Signed], lltype.Float)
         func_ptr = llhelper(lltype.Ptr(FUNC), maybe_force)
         funcbox = self.get_funcbox(self.cpu, func_ptr).constbox()
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         cpu = self.cpu
         i0 = BoxInt()
         i1 = BoxInt()
@@ -1931,7 +1945,7 @@
         assert values == [1, 10]
 
     def test_call_to_c_function(self):
-        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.libffi import CDLL, types, ArgChain, FUNCFLAG_CDECL
         from pypy.rpython.lltypesystem.ll2ctypes import libc_name
         libc = CDLL(libc_name)
         c_tolower = libc.getpointer('tolower', [types.uchar], types.sint)
@@ -1941,7 +1955,9 @@
         cpu = self.cpu
         func_adr = llmemory.cast_ptr_to_adr(c_tolower.funcsym)
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
-        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint)
+        calldescr = cpu.calldescrof_dynamic([types.uchar], types.sint,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_CDECL)
         i1 = BoxInt()
         i2 = BoxInt()
         tok = BoxInt()
@@ -1997,7 +2013,9 @@
         funcbox = ConstInt(heaptracker.adr2int(func_adr))
         calldescr = cpu.calldescrof_dynamic([types.pointer, types_size_t,
                                              types_size_t, types.pointer],
-                                            types.void)
+                                            types.void,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=clibffi.FUNCFLAG_CDECL)
         i0 = BoxInt()
         i1 = BoxInt()
         i2 = BoxInt()
@@ -2023,6 +2041,62 @@
         assert len(glob.lst) > 0
         lltype.free(raw, flavor='raw')
 
+    def test_call_to_winapi_function(self):
+        from pypy.rlib.clibffi import _WIN32, FUNCFLAG_STDCALL
+        if not _WIN32:
+            py.test.skip("Windows test only")
+        from pypy.rlib.libffi import CDLL, types, ArgChain
+        from pypy.rlib.rwin32 import DWORD
+        libc = CDLL('KERNEL32')
+        c_GetCurrentDir = libc.getpointer('GetCurrentDirectoryA',
+                                          [types.ulong, types.pointer],
+                                          types.ulong)
+
+        cwd = os.getcwd()
+        buflen = len(cwd) + 10
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        argchain = ArgChain().arg(rffi.cast(DWORD, buflen)).arg(buffer)
+        res = c_GetCurrentDir.call(argchain, DWORD)
+        assert rffi.cast(lltype.Signed, res) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
+        cpu = self.cpu
+        func_adr = llmemory.cast_ptr_to_adr(c_GetCurrentDir.funcsym)
+        funcbox = ConstInt(heaptracker.adr2int(func_adr))
+        calldescr = cpu.calldescrof_dynamic([types.ulong, types.pointer],
+                                            types.ulong,
+                                            EffectInfo.MOST_GENERAL,
+                                            ffi_flags=FUNCFLAG_STDCALL)
+        i1 = BoxInt()
+        i2 = BoxInt()
+        faildescr = BasicFailDescr(1)
+        # If the stdcall convention is ignored, then ESP is wrong after each
+        # call: 8 bytes too high.  If we repeat the call often enough, we crash.
+        ops = []
+        for i in range(50):
+            i3 = BoxInt()
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox, i1, i2], i3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ]
+            ops[-1].setfailargs([])
+        ops += [
+            ResOperation(rop.FINISH, [i3], None, descr=BasicFailDescr(0))
+        ]
+        looptoken = LoopToken()
+        self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+        buffer = lltype.malloc(rffi.CCHARP.TO, buflen, flavor='raw')
+        self.cpu.set_future_value_int(0, buflen)
+        self.cpu.set_future_value_int(1, rffi.cast(lltype.Signed, buffer))
+        fail = self.cpu.execute_token(looptoken)
+        assert fail.identifier == 0
+        assert self.cpu.get_latest_value_int(0) == len(cwd)
+        assert rffi.charp2strn(buffer, buflen) == cwd
+        lltype.free(buffer, flavor='raw')
+
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()
@@ -2292,7 +2366,8 @@
         ARGS = [lltype.Signed] * 10
         RES = lltype.Signed
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         for i in range(10):
             self.cpu.set_future_value_int(i, i+1)
         res = self.cpu.execute_token(looptoken)
@@ -2332,7 +2407,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2422,7 +2498,8 @@
         ARGS = [lltype.Float, lltype.Float]
         RES = lltype.Float
         FakeJitDriverSD.portal_calldescr = self.cpu.calldescrof(
-            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES)
+            lltype.Ptr(lltype.FuncType(ARGS, RES)), ARGS, RES,
+            EffectInfo.MOST_GENERAL)
         
         ops = '''
         [f0, f1]
@@ -2634,7 +2711,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
                                    calldescr, [value], None, None)
             assert x == expected, (
@@ -2667,7 +2745,8 @@
             #
             FUNC = self.FuncType([lltype.Signed], RESTYPE)
             FPTR = self.Ptr(FUNC)
-            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+            calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                             EffectInfo.MOST_GENERAL)
             funcbox = self.get_funcbox(self.cpu, f)
             res = self.execute_operation(rop.CALL, [funcbox, BoxInt(value)],
                                          'int', descr=calldescr)
@@ -2701,7 +2780,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         x = self.cpu.bh_call_f(self.get_funcbox(self.cpu, f).value,
                                calldescr, None, None, [value])
         assert x == expected
@@ -2728,7 +2808,8 @@
         #
         FUNC = self.FuncType([lltype.SignedLongLong], lltype.SignedLongLong)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         res = self.execute_operation(rop.CALL, [funcbox, BoxFloat(value)],
                                      'float', descr=calldescr)
@@ -2756,7 +2837,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
         x = self.cpu.bh_call_i(self.get_funcbox(self.cpu, f).value,
@@ -2785,7 +2867,8 @@
         #
         FUNC = self.FuncType([lltype.SingleFloat], lltype.SingleFloat)
         FPTR = self.Ptr(FUNC)
-        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        calldescr = self.cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                         EffectInfo.MOST_GENERAL)
         funcbox = self.get_funcbox(self.cpu, f)
         ivalue = longlong.singlefloat2int(value)
         iexpected = longlong.singlefloat2int(expected)
diff --git a/pypy/jit/backend/test/test_ll_random.py b/pypy/jit/backend/test/test_ll_random.py
--- a/pypy/jit/backend/test/test_ll_random.py
+++ b/pypy/jit/backend/test/test_ll_random.py
@@ -6,6 +6,7 @@
 from pypy.jit.metainterp.history import BoxPtr, BoxInt
 from pypy.jit.metainterp.history import BasicFailDescr
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rlib.rarithmetic import intmask
 from pypy.rpython.llinterp import LLException
@@ -468,6 +469,10 @@
         exec code in d
         return subset, d['f'], vtableptr
 
+    def getcalldescr(self, builder, TP):
+        ef = EffectInfo.MOST_GENERAL
+        return builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT, ef)
+
 # 1. non raising call and guard_no_exception
 class CallOperation(BaseCallOperation):
     def produce_into(self, builder, r):
@@ -481,7 +486,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], None,
                           descr=BasicFailDescr())
@@ -501,7 +506,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         _, vtableptr = builder.get_random_structure_type_and_vtable(r)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(vtableptr), builder.cpu)
@@ -523,7 +528,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         exc_box = ConstAddr(llmemory.cast_ptr_to_adr(exc), builder.cpu)
         op = ResOperation(rop.GUARD_EXCEPTION, [exc_box], BoxPtr(),
@@ -540,7 +545,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         op = ResOperation(rop.GUARD_NO_EXCEPTION, [], BoxPtr(),
                           descr=BasicFailDescr())
@@ -559,7 +564,7 @@
         ptr = llhelper(lltype.Ptr(TP), f)
         c_addr = ConstAddr(llmemory.cast_ptr_to_adr(ptr), builder.cpu)
         args = [c_addr] + subset
-        descr = builder.cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        descr = self.getcalldescr(builder, TP)
         self.put(builder, args, descr)
         while True:
             _, vtableptr = builder.get_random_structure_type_and_vtable(r)
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -34,6 +34,7 @@
 from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                              have_debug_prints)
 from pypy.rlib import rgc
+from pypy.rlib.clibffi import FFI_DEFAULT_ABI
 from pypy.jit.backend.x86.jump import remap_frame_layout
 from pypy.jit.metainterp.history import ConstInt, BoxInt
 from pypy.jit.codewriter.effectinfo import EffectInfo
@@ -1120,7 +1121,7 @@
         return genop_cmp_guard_float
 
     def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
-                   argtypes=None):
+                   argtypes=None, callconv=FFI_DEFAULT_ABI):
         if IS_X86_64:
             return self._emit_call_64(force_index, x, arglocs, start, argtypes)
 
@@ -1149,6 +1150,16 @@
         # x is a location
         self.mc.CALL(x)
         self.mark_gc_roots(force_index)
+        #
+        if callconv != FFI_DEFAULT_ABI:
+            self._fix_stdcall(callconv, p)
+
+    def _fix_stdcall(self, callconv, p):
+        from pypy.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        # It's a bit crude, but we simply cancel out the fact that the called
+        # function already added 'p' to ESP, by subtracting 'p' from it again.
+        self.mc.SUB_ri(esp.value, p)
 
     def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
         src_locs = []
@@ -2127,7 +2138,8 @@
             tmp = eax
 
         self._emit_call(force_index, x, arglocs, 3, tmp=tmp,
-                        argtypes=op.getdescr().get_arg_types())
+                        argtypes=op.getdescr().get_arg_types(),
+                        callconv=op.getdescr().get_call_conv())
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -843,8 +843,8 @@
 
     def consider_call(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             if IS_X86_32:
                 # support for some of the llong operations,
                 # which only exist on x86-32
diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -527,6 +527,7 @@
 
     NOP = insn('\x90')
     RET = insn('\xC3')
+    RET16_i = insn('\xC2', immediate(1, 'h'))
 
     PUSH_r = insn(rex_nw, register(1), '\x50')
     PUSH_b = insn(rex_nw, '\xFF', orbyte(6<<3), stack_bp(1))
diff --git a/pypy/jit/backend/x86/test/test_gc_integration.py b/pypy/jit/backend/x86/test/test_gc_integration.py
--- a/pypy/jit/backend/x86/test/test_gc_integration.py
+++ b/pypy/jit/backend/x86/test/test_gc_integration.py
@@ -7,6 +7,7 @@
      BoxPtr, ConstPtr, TreeLoop
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.codewriter import heaptracker
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.llsupport.descr import GcCache
 from pypy.jit.backend.llsupport.gc import GcLLDescription
 from pypy.jit.backend.detect_cpu import getcpuclass
@@ -76,7 +77,8 @@
         for box in boxes:
             regalloc.rm.try_allocate_reg(box)
         TP = lltype.FuncType([], lltype.Signed)
-        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT)
+        calldescr = cpu.calldescrof(TP, TP.ARGS, TP.RESULT,
+                                    EffectInfo.MOST_GENERAL)
         regalloc.rm._check_invariants()
         box = boxes[0]
         regalloc.position = 0
diff --git a/pypy/jit/backend/x86/test/test_regalloc.py b/pypy/jit/backend/x86/test/test_regalloc.py
--- a/pypy/jit/backend/x86/test/test_regalloc.py
+++ b/pypy/jit/backend/x86/test/test_regalloc.py
@@ -16,6 +16,7 @@
 from pypy.rpython.annlowlevel import llhelper
 from pypy.rpython.lltypesystem import rclass, rstr
 from pypy.jit.codewriter import longlong
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.backend.x86.rx86 import *
 
 def test_is_comparison_or_ovf_op():
@@ -92,7 +93,8 @@
     zd_addr = cpu.cast_int_to_adr(zero_division_tp)
     zero_division_error = llmemory.cast_adr_to_ptr(zd_addr,
                                             lltype.Ptr(rclass.OBJECT_VTABLE))
-    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT)
+    raising_calldescr = cpu.calldescrof(FPTR.TO, FPTR.TO.ARGS, FPTR.TO.RESULT,
+                                        EffectInfo.MOST_GENERAL)
 
     fdescr1 = BasicFailDescr(1)
     fdescr2 = BasicFailDescr(2)
@@ -115,9 +117,12 @@
     f2ptr = llhelper(F2PTR, f2)
     f10ptr = llhelper(F10PTR, f10)
 
-    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT)
-    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT)
-    f10_calldescr = cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT)
+    f1_calldescr = cpu.calldescrof(F1PTR.TO, F1PTR.TO.ARGS, F1PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f2_calldescr = cpu.calldescrof(F2PTR.TO, F2PTR.TO.ARGS, F2PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
+    f10_calldescr= cpu.calldescrof(F10PTR.TO, F10PTR.TO.ARGS, F10PTR.TO.RESULT,
+                                   EffectInfo.MOST_GENERAL)
 
     namespace = locals().copy()
     type_system = 'lltype'
diff --git a/pypy/jit/backend/x86/test/test_runner.py b/pypy/jit/backend/x86/test/test_runner.py
--- a/pypy/jit/backend/x86/test/test_runner.py
+++ b/pypy/jit/backend/x86/test/test_runner.py
@@ -433,6 +433,88 @@
                 ops_offset[operations[2]] <=
                 ops_offset[None])
 
+    def test_calling_convention(self, monkeypatch):
+        if WORD != 4:
+            py.test.skip("32-bit only test")
+        from pypy.jit.backend.x86.regloc import eax, edx
+        from pypy.jit.backend.x86 import codebuf
+        from pypy.jit.codewriter.effectinfo import EffectInfo
+        from pypy.rlib.libffi import types, clibffi
+        had_stdcall = hasattr(clibffi, 'FFI_STDCALL')
+        if not had_stdcall:    # not running on Windows, but we can still test
+            monkeypatch.setattr(clibffi, 'FFI_STDCALL', 12345, raising=False)
+        #
+        for ffi in [clibffi.FFI_DEFAULT_ABI, clibffi.FFI_STDCALL]:
+            cpu = self.cpu
+            mc = codebuf.MachineCodeBlockWrapper()
+            mc.MOV_rs(eax.value, 4)      # argument 1
+            mc.MOV_rs(edx.value, 40)     # argument 10
+            mc.SUB_rr(eax.value, edx.value)     # return arg1 - arg10
+            if ffi == clibffi.FFI_DEFAULT_ABI:
+                mc.RET()
+            else:
+                mc.RET16_i(40)
+            rawstart = mc.materialize(cpu.asmmemmgr, [])
+            #
+            calldescr = cpu.calldescrof_dynamic([types.slong] * 10,
+                                                types.slong,
+                                                EffectInfo.MOST_GENERAL,
+                                                ffi_flags=-1)
+            calldescr.get_call_conv = lambda: ffi      # <==== hack
+            funcbox = ConstInt(rawstart)
+            i1 = BoxInt()
+            i2 = BoxInt()
+            i3 = BoxInt()
+            i4 = BoxInt()
+            i5 = BoxInt()
+            i6 = BoxInt()
+            c = ConstInt(-1)
+            faildescr = BasicFailDescr(1)
+            # we must call it repeatedly: if the stack pointer gets increased
+            # by 40 bytes by the STDCALL call, and if we don't expect it,
+            # then we are going to get our stack emptied unexpectedly by
+            # several repeated calls
+            ops = [
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i3, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i4, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i5, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.CALL_RELEASE_GIL,
+                         [funcbox, i1, c, c, c, c, c, c, c, c, i2],
+                         i6, descr=calldescr),
+            ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+
+            ResOperation(rop.FINISH, [i3, i4, i5, i6], None,
+                         descr=BasicFailDescr(0))
+            ]
+            ops[1].setfailargs([])
+            ops[3].setfailargs([])
+            ops[5].setfailargs([])
+            ops[7].setfailargs([])
+            looptoken = LoopToken()
+            self.cpu.compile_loop([i1, i2], ops, looptoken)
+
+            self.cpu.set_future_value_int(0, 123450)
+            self.cpu.set_future_value_int(1, 123408)
+            fail = self.cpu.execute_token(looptoken)
+            assert fail.identifier == 0
+            assert self.cpu.get_latest_value_int(0) == 42
+            assert self.cpu.get_latest_value_int(1) == 42
+            assert self.cpu.get_latest_value_int(2) == 42
+            assert self.cpu.get_latest_value_int(3) == 42
+
+
 class TestDebuggingAssembler(object):
     def setup_method(self, meth):
         self.cpu = CPU(rtyper=None, stats=FakeStats())
diff --git a/pypy/jit/codewriter/call.py b/pypy/jit/codewriter/call.py
--- a/pypy/jit/codewriter/call.py
+++ b/pypy/jit/codewriter/call.py
@@ -6,7 +6,7 @@
 from pypy.jit.codewriter import support
 from pypy.jit.codewriter.jitcode import JitCode
 from pypy.jit.codewriter.effectinfo import (VirtualizableAnalyzer,
-    QuasiImmutAnalyzer, CanReleaseGILAnalyzer, effectinfo_from_writeanalyze,
+    QuasiImmutAnalyzer, RandomEffectsAnalyzer, effectinfo_from_writeanalyze,
     EffectInfo, CallInfoCollection)
 from pypy.translator.simplify import get_funcobj, get_functype
 from pypy.rpython.lltypesystem import lltype, llmemory
@@ -31,7 +31,7 @@
             self.readwrite_analyzer = ReadWriteAnalyzer(translator)
             self.virtualizable_analyzer = VirtualizableAnalyzer(translator)
             self.quasiimmut_analyzer = QuasiImmutAnalyzer(translator)
-            self.canreleasegil_analyzer = CanReleaseGILAnalyzer(translator)
+            self.randomeffects_analyzer = RandomEffectsAnalyzer(translator)
         #
         for index, jd in enumerate(jitdrivers_sd):
             jd.index = index
@@ -187,7 +187,7 @@
             fnaddr = llmemory.cast_ptr_to_adr(fnptr)
         NON_VOID_ARGS = [ARG for ARG in FUNC.ARGS if ARG is not lltype.Void]
         calldescr = self.cpu.calldescrof(FUNC, tuple(NON_VOID_ARGS),
-                                         FUNC.RESULT)
+                                         FUNC.RESULT, EffectInfo.MOST_GENERAL)
         return (fnaddr, calldescr)
 
     def getcalldescr(self, op, oopspecindex=EffectInfo.OS_NONE,
@@ -219,9 +219,11 @@
                 assert not NON_VOID_ARGS, ("arguments not supported for "
                                            "loop-invariant function!")
         # build the extraeffect
-        can_release_gil = self.canreleasegil_analyzer.analyze(op)
-        # can_release_gil implies can_invalidate
-        can_invalidate = can_release_gil or self.quasiimmut_analyzer.analyze(op)
+        random_effects = self.randomeffects_analyzer.analyze(op)
+        if random_effects:
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+        # random_effects implies can_invalidate
+        can_invalidate = random_effects or self.quasiimmut_analyzer.analyze(op)
         if extraeffect is None:
             if self.virtualizable_analyzer.analyze(op):
                 extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
@@ -239,12 +241,10 @@
         #
         effectinfo = effectinfo_from_writeanalyze(
             self.readwrite_analyzer.analyze(op), self.cpu, extraeffect,
-            oopspecindex, can_invalidate, can_release_gil)
+            oopspecindex, can_invalidate)
         #
-        if oopspecindex != EffectInfo.OS_NONE:
-            assert effectinfo is not None
+        assert effectinfo is not None
         if elidable or loopinvariant:
-            assert effectinfo is not None
             assert extraeffect != EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
             # XXX this should also say assert not can_invalidate, but
             #     it can't because our analyzer is not good enough for now
@@ -264,8 +264,7 @@
 
     def calldescr_canraise(self, calldescr):
         effectinfo = calldescr.get_extra_info()
-        return (effectinfo is None or
-                effectinfo.extraeffect > EffectInfo.EF_CANNOT_RAISE)
+        return effectinfo.check_can_raise()
 
     def jitdriver_sd_from_portal_graph(self, graph):
         for jd in self.jitdrivers_sd:
diff --git a/pypy/jit/codewriter/effectinfo.py b/pypy/jit/codewriter/effectinfo.py
--- a/pypy/jit/codewriter/effectinfo.py
+++ b/pypy/jit/codewriter/effectinfo.py
@@ -15,6 +15,7 @@
     EF_ELIDABLE_CAN_RAISE              = 3 #elidable function (but can raise)
     EF_CAN_RAISE                       = 4 #normal function (can raise)
     EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE = 5 #can raise and force virtualizables
+    EF_RANDOM_EFFECTS                  = 6 #can do whatever
 
     # the 'oopspecindex' field is one of the following values:
     OS_NONE                     = 0    # normal case, no oopspec
@@ -80,17 +81,26 @@
                 write_descrs_fields, write_descrs_arrays,
                 extraeffect=EF_CAN_RAISE,
                 oopspecindex=OS_NONE,
-                can_invalidate=False, can_release_gil=False):
-        key = (frozenset(readonly_descrs_fields),
-               frozenset(readonly_descrs_arrays),
-               frozenset(write_descrs_fields),
-               frozenset(write_descrs_arrays),
+                can_invalidate=False):
+        key = (frozenset_or_none(readonly_descrs_fields),
+               frozenset_or_none(readonly_descrs_arrays),
+               frozenset_or_none(write_descrs_fields),
+               frozenset_or_none(write_descrs_arrays),
                extraeffect,
                oopspecindex,
-               can_invalidate,
-               can_release_gil)
+               can_invalidate)
         if key in cls._cache:
             return cls._cache[key]
+        if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+            assert readonly_descrs_fields is None
+            assert readonly_descrs_arrays is None
+            assert write_descrs_fields is None
+            assert write_descrs_arrays is None
+        else:
+            assert readonly_descrs_fields is not None
+            assert readonly_descrs_arrays is not None
+            assert write_descrs_fields is not None
+            assert write_descrs_arrays is not None
         result = object.__new__(cls)
         result.readonly_descrs_fields = readonly_descrs_fields
         result.readonly_descrs_arrays = readonly_descrs_arrays
@@ -104,11 +114,13 @@
             result.write_descrs_arrays = write_descrs_arrays
         result.extraeffect = extraeffect
         result.can_invalidate = can_invalidate
-        result.can_release_gil = can_release_gil
         result.oopspecindex = oopspecindex
         cls._cache[key] = result
         return result
 
+    def check_can_raise(self):
+        return self.extraeffect > self.EF_CANNOT_RAISE
+
     def check_can_invalidate(self):
         return self.can_invalidate
 
@@ -116,56 +128,71 @@
         return self.extraeffect >= self.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
 
     def has_random_effects(self):
-        return self.oopspecindex == self.OS_LIBFFI_CALL or self.can_release_gil
+        return self.extraeffect >= self.EF_RANDOM_EFFECTS
+
+
+def frozenset_or_none(x):
+    if x is None:
+        return None
+    return frozenset(x)
+
+EffectInfo.MOST_GENERAL = EffectInfo(None, None, None, None,
+                                     EffectInfo.EF_RANDOM_EFFECTS,
+                                     can_invalidate=True)
+
 
 def effectinfo_from_writeanalyze(effects, cpu,
                                  extraeffect=EffectInfo.EF_CAN_RAISE,
                                  oopspecindex=EffectInfo.OS_NONE,
-                                 can_invalidate=False,
-                                 can_release_gil=False):
+                                 can_invalidate=False):
     from pypy.translator.backendopt.writeanalyze import top_set
-    if effects is top_set:
-        return None
-    readonly_descrs_fields = []
-    readonly_descrs_arrays = []
-    write_descrs_fields = []
-    write_descrs_arrays = []
+    if effects is top_set or extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+        readonly_descrs_fields = None
+        readonly_descrs_arrays = None
+        write_descrs_fields = None
+        write_descrs_arrays = None
+        extraeffect = EffectInfo.EF_RANDOM_EFFECTS
+    else:
+        readonly_descrs_fields = []
+        readonly_descrs_arrays = []
+        write_descrs_fields = []
+        write_descrs_arrays = []
 
-    def add_struct(descrs_fields, (_, T, fieldname)):
-        T = deref(T)
-        if consider_struct(T, fieldname):
-            descr = cpu.fielddescrof(T, fieldname)
-            descrs_fields.append(descr)
+        def add_struct(descrs_fields, (_, T, fieldname)):
+            T = deref(T)
+            if consider_struct(T, fieldname):
+                descr = cpu.fielddescrof(T, fieldname)
+                descrs_fields.append(descr)
 
-    def add_array(descrs_arrays, (_, T)):
-        ARRAY = deref(T)
-        if consider_array(ARRAY):
-            descr = cpu.arraydescrof(ARRAY)
-            descrs_arrays.append(descr)
+        def add_array(descrs_arrays, (_, T)):
+            ARRAY = deref(T)
+            if consider_array(ARRAY):
+                descr = cpu.arraydescrof(ARRAY)
+                descrs_arrays.append(descr)
 
-    for tup in effects:
-        if tup[0] == "struct":
-            add_struct(write_descrs_fields, tup)
-        elif tup[0] == "readstruct":
-            tupw = ("struct",) + tup[1:]
-            if tupw not in effects:
-                add_struct(readonly_descrs_fields, tup)
-        elif tup[0] == "array":
-            add_array(write_descrs_arrays, tup)
-        elif tup[0] == "readarray":
-            tupw = ("array",) + tup[1:]
-            if tupw not in effects:
-                add_array(readonly_descrs_arrays, tup)
-        else:
-            assert 0
+        for tup in effects:
+            if tup[0] == "struct":
+                add_struct(write_descrs_fields, tup)
+            elif tup[0] == "readstruct":
+                tupw = ("struct",) + tup[1:]
+                if tupw not in effects:
+                    add_struct(readonly_descrs_fields, tup)
+            elif tup[0] == "array":
+                add_array(write_descrs_arrays, tup)
+            elif tup[0] == "readarray":
+                tupw = ("array",) + tup[1:]
+                if tupw not in effects:
+                    add_array(readonly_descrs_arrays, tup)
+            else:
+                assert 0
+    #
     return EffectInfo(readonly_descrs_fields,
                       readonly_descrs_arrays,
                       write_descrs_fields,
                       write_descrs_arrays,
                       extraeffect,
                       oopspecindex,
-                      can_invalidate,
-                      can_release_gil)
+                      can_invalidate)
 
 def consider_struct(TYPE, fieldname):
     if fieldType(TYPE, fieldname) is lltype.Void:
@@ -201,12 +228,13 @@
     def analyze_simple_operation(self, op, graphinfo):
         return op.opname == 'jit_force_quasi_immutable'
 
-class CanReleaseGILAnalyzer(BoolGraphAnalyzer):
+class RandomEffectsAnalyzer(BoolGraphAnalyzer):
     def analyze_direct_call(self, graph, seen=None):
-        releases_gil = False
         if hasattr(graph, "func") and hasattr(graph.func, "_ptr"):
-            releases_gil = graph.func._ptr._obj.releases_gil
-        return releases_gil or super(CanReleaseGILAnalyzer, self).analyze_direct_call(graph, seen)
+            if graph.func._ptr._obj.random_effects_on_gcobjs:
+                return True
+        return super(RandomEffectsAnalyzer, self).analyze_direct_call(graph,
+                                                                      seen)
 
     def analyze_simple_operation(self, op, graphinfo):
         return False
diff --git a/pypy/jit/codewriter/jtransform.py b/pypy/jit/codewriter/jtransform.py
--- a/pypy/jit/codewriter/jtransform.py
+++ b/pypy/jit/codewriter/jtransform.py
@@ -1,4 +1,5 @@
 import py
+
 from pypy.jit.codewriter import support, heaptracker, longlong
 from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.codewriter.flatten import ListOfKind, IndirectCallTargets
@@ -22,6 +23,11 @@
     t = Transformer(cpu, callcontrol, portal_jd)
     t.transform(graph)
 
+def integer_bounds(size, unsigned):
+    if unsigned:
+        return 0, 1 << (8 * size)
+    else:
+        return -(1 << (8 * size - 1)), 1 << (8 * size - 1)
 
 class Transformer(object):
     vable_array_vars = None
@@ -780,81 +786,127 @@
             raise NotImplementedError("cast_ptr_to_int")
 
     def rewrite_op_force_cast(self, op):
-        assert not self._is_gc(op.args[0])
-        fromll = longlong.is_longlong(op.args[0].concretetype)
-        toll   = longlong.is_longlong(op.result.concretetype)
-        if fromll and toll:
+        v_arg = op.args[0]
+        v_result = op.result
+        assert not self._is_gc(v_arg)
+
+        if v_arg.concretetype == v_result.concretetype:
             return
-        if fromll:
-            args = op.args
-            opname = 'truncate_longlong_to_int'
-            RESULT = lltype.Signed
-            v = varoftype(RESULT)
-            op1 = SpaceOperation(opname, args, v)
-            op2 = self.rewrite_operation(op1)
-            oplist = self.force_cast_without_longlong(op2.result, op.result)
+
+        float_arg = v_arg.concretetype in [lltype.Float, lltype.SingleFloat]
+        float_res = v_result.concretetype in [lltype.Float, lltype.SingleFloat]
+        if not float_arg and not float_res:
+            # some int -> some int cast
+            return self._int_to_int_cast(v_arg, v_result)
+        elif float_arg and float_res:
+            # some float -> some float cast
+            return self._float_to_float_cast(v_arg, v_result)
+        elif not float_arg and float_res:
+            # some int -> some float
+            ops = []
+            v1 = varoftype(lltype.Signed)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
             if oplist:
-                return [op2] + oplist
-            #
-            # force a renaming to put the correct result in place, even though
-            # it might be slightly mistyped (e.g. Signed versus Unsigned)
-            assert op2.result is v
-            op2.result = op.result
-            return op2
-        elif toll:
-            size, unsigned = rffi.size_and_sign(op.args[0].concretetype)
-            if unsigned:
+                ops.extend(oplist)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Float)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_int_to_float', [v1], v2)
+            )
+            ops.append(op)
+            op2 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if op2:
+                ops.append(op2)
+            else:
+                op.result = v_result
+            return ops
+        elif float_arg and not float_res:
+            # some float -> some int
+            ops = []
+            v1 = varoftype(lltype.Float)
+            op1 = self.rewrite_operation(
+                SpaceOperation('force_cast', [v_arg], v1)
+            )
+            if op1:
+                ops.append(op1)
+            else:
+                v1 = v_arg
+            v2 = varoftype(lltype.Signed)
+            op = self.rewrite_operation(
+                SpaceOperation('cast_float_to_int', [v1], v2)
+            )
+            ops.append(op)
+            oplist = self.rewrite_operation(
+                SpaceOperation('force_cast', [v2], v_result)
+            )
+            if oplist:
+                ops.extend(oplist)
+            else:
+                op.result = v_result
+            return ops
+        else:
+            assert False
+
+    def _int_to_int_cast(self, v_arg, v_result):
+        longlong_arg = longlong.is_longlong(v_arg.concretetype)
+        longlong_res = longlong.is_longlong(v_result.concretetype)
+        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
+        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
+
+        if longlong_arg and longlong_res:
+            return
+        elif longlong_arg:
+            v = varoftype(lltype.Signed)
+            op1 = self.rewrite_operation(
+                SpaceOperation('truncate_longlong_to_int', [v_arg], v)
+            )
+            op2 = SpaceOperation('force_cast', [v], v_result)
+            oplist = self.rewrite_operation(op2)
+            if not oplist:
+                op1.result = v_result
+                oplist = []
+            return [op1] + oplist
+        elif longlong_res:
+            if unsigned1:
                 INTERMEDIATE = lltype.Unsigned
             else:
                 INTERMEDIATE = lltype.Signed
             v = varoftype(INTERMEDIATE)
-            oplist = self.force_cast_without_longlong(op.args[0], v)
+            op1 = SpaceOperation('force_cast', [v_arg], v)
+            oplist = self.rewrite_operation(op1)
             if not oplist:
-                v = op.args[0]
+                v = v_arg
                 oplist = []
-            if unsigned:
+            if unsigned1:
                 opname = 'cast_uint_to_longlong'
             else:
                 opname = 'cast_int_to_longlong'
-            op1 = SpaceOperation(opname, [v], op.result)
-            op2 = self.rewrite_operation(op1)
+            op2 = self.rewrite_operation(
+                SpaceOperation(opname, [v], v_result)
+            )
             return oplist + [op2]
-        else:
-            return self.force_cast_without_longlong(op.args[0], op.result)
 
-    def force_cast_without_longlong(self, v_arg, v_result):
-        if v_result.concretetype == v_arg.concretetype:
+        # We've now, ostensibly, dealt with the longlongs, everything should be
+        # a Signed or smaller
+        assert size1 <= rffi.sizeof(lltype.Signed)
+        assert size2 <= rffi.sizeof(lltype.Signed)
+
+        # the target type is LONG or ULONG
+        if size2 == rffi.sizeof(lltype.Signed):
             return
-        if v_arg.concretetype == rffi.FLOAT:
-            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
-                v_arg.concretetype, v_result.concretetype)
-            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
-                                  v_result)
-        if v_result.concretetype == rffi.FLOAT:
-            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
-                v_arg.concretetype, v_result.concretetype)
-            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
-                                  v_result)
-        return self.force_cast_without_singlefloat(v_arg, v_result)
 
-    def force_cast_without_singlefloat(self, v_arg, v_result):
-        size2, unsigned2 = rffi.size_and_sign(v_result.concretetype)
-        assert size2 <= rffi.sizeof(lltype.Signed)
-        if size2 == rffi.sizeof(lltype.Signed):
-            return     # the target type is LONG or ULONG
-        size1, unsigned1 = rffi.size_and_sign(v_arg.concretetype)
-        assert size1 <= rffi.sizeof(lltype.Signed)
-        #
-        def bounds(size, unsigned):
-            if unsigned:
-                return 0, 1<<(8*size)
-            else:
-                return -(1<<(8*size-1)), 1<<(8*size-1)
-        min1, max1 = bounds(size1, unsigned1)
-        min2, max2 = bounds(size2, unsigned2)
+        min1, max1 = integer_bounds(size1, unsigned1)
+        min2, max2 = integer_bounds(size2, unsigned2)
+
+        # the target type includes the source range
         if min2 <= min1 <= max1 <= max2:
-            return     # the target type includes the source range
-        #
+            return
+
         result = []
         if min2:
             c_min2 = Constant(min2, lltype.Signed)
@@ -862,15 +914,28 @@
             result.append(SpaceOperation('int_sub', [v_arg, c_min2], v2))
         else:
             v2 = v_arg
-        c_mask = Constant(int((1<<(8*size2))-1), lltype.Signed)
-        v3 = varoftype(lltype.Signed)
+        c_mask = Constant(int((1 << (8 * size2)) - 1), lltype.Signed)
+        if min2:
+            v3 = varoftype(lltype.Signed)
+        else:
+            v3 = v_result
         result.append(SpaceOperation('int_and', [v2, c_mask], v3))
         if min2:
             result.append(SpaceOperation('int_add', [v3, c_min2], v_result))
-        else:
-            result[-1].result = v_result
         return result
 
+    def _float_to_float_cast(self, v_arg, v_result):
+        if v_arg.concretetype == lltype.SingleFloat:
+            assert v_result.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_singlefloat_to_float', [v_arg],
+                                  v_result)
+        if v_result.concretetype == lltype.SingleFloat:
+            assert v_arg.concretetype == lltype.Float, "cast %s -> %s" % (
+                v_arg.concretetype, v_result.concretetype)
+            return SpaceOperation('cast_float_to_singlefloat', [v_arg],
+                                  v_result)
+
     def rewrite_op_direct_ptradd(self, op):
         # xxx otherwise, not implemented:
         assert op.args[0].concretetype == rffi.CCHARP
@@ -1417,7 +1482,7 @@
             extraeffect = EffectInfo.EF_CANNOT_RAISE
         elif oopspec_name.startswith('libffi_call_'):
             oopspecindex = EffectInfo.OS_LIBFFI_CALL
-            extraeffect = EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE
+            extraeffect = EffectInfo.EF_RANDOM_EFFECTS
         else:
             assert False, 'unsupported oopspec: %s' % oopspec_name
         return self._handle_oopspec_call(op, args, oopspecindex, extraeffect)
diff --git a/pypy/jit/codewriter/support.py b/pypy/jit/codewriter/support.py
--- a/pypy/jit/codewriter/support.py
+++ b/pypy/jit/codewriter/support.py
@@ -91,9 +91,12 @@
     reds_v = op.args[2+numgreens:]
     assert len(reds_v) == numreds
     #
-    def _sort(args_v):
+    def _sort(args_v, is_green):
         from pypy.jit.metainterp.history import getkind
         lst = [v for v in args_v if v.concretetype is not lltype.Void]
+        if is_green:
+            assert len(lst) == len(args_v), (
+                "not supported so far: 'greens' variables contain Void")
         _kind2count = {'int': 1, 'ref': 2, 'float': 3}
         lst2 = sorted(lst, key=lambda v: _kind2count[getkind(v.concretetype)])
         # a crash here means that you have to reorder the variable named in
@@ -102,7 +105,7 @@
         assert lst == lst2
         return lst
     #
-    return (_sort(greens_v), _sort(reds_v))
+    return (_sort(greens_v, True), _sort(reds_v, False))
 
 def maybe_on_top_of_llinterp(rtyper, fnptr):
     # Run a generated graph on top of the llinterp for testing.
diff --git a/pypy/jit/codewriter/test/test_call.py b/pypy/jit/codewriter/test/test_call.py
--- a/pypy/jit/codewriter/test/test_call.py
+++ b/pypy/jit/codewriter/test/test_call.py
@@ -191,4 +191,4 @@
     [block, _] = list(f_graph.iterblocks())
     [op] = block.operations
     call_descr = cc.getcalldescr(op)
-    assert call_descr.extrainfo.can_release_gil
\ No newline at end of file
+    assert call_descr.extrainfo.has_random_effects()
diff --git a/pypy/jit/codewriter/test/test_codewriter.py b/pypy/jit/codewriter/test/test_codewriter.py
--- a/pypy/jit/codewriter/test/test_codewriter.py
+++ b/pypy/jit/codewriter/test/test_codewriter.py
@@ -5,7 +5,7 @@
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
 
 class FakeCallDescr(AbstractDescr):
-    def __init__(self, FUNC, ARGS, RESULT, effectinfo=None):
+    def __init__(self, FUNC, ARGS, RESULT, effectinfo):
         self.FUNC = FUNC
         self.ARGS = ARGS
         self.RESULT = RESULT
diff --git a/pypy/jit/codewriter/test/test_flatten.py b/pypy/jit/codewriter/test/test_flatten.py
--- a/pypy/jit/codewriter/test/test_flatten.py
+++ b/pypy/jit/codewriter/test/test_flatten.py
@@ -50,7 +50,7 @@
     def __init__(self, rtyper):
         rtyper._builtin_func_for_spec_cache = FakeDict()
         self.rtyper = rtyper
-    def calldescrof(self, FUNC, ARGS, RESULT):
+    def calldescrof(self, FUNC, ARGS, RESULT, effectinfo):
         return FakeDescr()
     def fielddescrof(self, STRUCT, name):
         return FakeDescr()
@@ -324,7 +324,7 @@
     def test_exc_exitswitch(self):
         def g(i):
             pass
-        
+
         def f(i):
             try:
                 g(i)
@@ -854,13 +854,51 @@
             int_return %i0
         """, transform=True)
 
-    def test_force_cast_float(self):
+    def test_force_cast_floats(self):
         from pypy.rpython.lltypesystem import rffi
+        # Casts to lltype.Float
         def f(n):
             return rffi.cast(lltype.Float, n)
         self.encoding_test(f, [12.456], """
             float_return %f0
         """, transform=True)
+        self.encoding_test(f, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            float_return %f0
+        """, transform=True)
+
+        # Casts to lltype.SingleFloat
+        def g(n):
+            return rffi.cast(lltype.SingleFloat, n)
+        self.encoding_test(g, [12.456], """
+            cast_float_to_singlefloat %f0 -> %i0
+            int_return %i0
+        """, transform=True)
+        self.encoding_test(g, [rffi.cast(rffi.SIGNEDCHAR, 42)], """
+            cast_int_to_float %i0 -> %f0
+            cast_float_to_singlefloat %f0 -> %i1
+            int_return %i1
+        """, transform=True)
+
+        # Casts from floats
+        def f(n):
+            return rffi.cast(rffi.SIGNEDCHAR, n)
+        self.encoding_test(f, [12.456], """
+            cast_float_to_int %f0 -> %i0
+            int_sub %i0, $-128 -> %i1
+            int_and %i1, $255 -> %i2
+            int_add %i2, $-128 -> %i3
+            int_return %i3
+        """, transform=True)
+        self.encoding_test(f, [rffi.cast(lltype.SingleFloat, 12.456)], """
+            cast_singlefloat_to_float %i0 -> %f0
+            cast_float_to_int %f0 -> %i1
+            int_sub %i1, $-128 -> %i2
+            int_and %i2, $255 -> %i3
+            int_add %i3, $-128 -> %i4
+            int_return %i4
+        """, transform=True)
+
 
     def test_direct_ptradd(self):
         from pypy.rpython.lltypesystem import rffi
diff --git a/pypy/jit/metainterp/optimizeopt/fficall.py b/pypy/jit/metainterp/optimizeopt/fficall.py
--- a/pypy/jit/metainterp/optimizeopt/fficall.py
+++ b/pypy/jit/metainterp/optimizeopt/fficall.py
@@ -18,25 +18,27 @@
     def __init__(self, funcval, cpu, prepare_op):
         self.funcval = funcval
         self.opargs = []
-        argtypes, restype = self._get_signature(funcval)
-        self.descr = cpu.calldescrof_dynamic(argtypes, restype)
+        argtypes, restype, flags = self._get_signature(funcval)
+        self.descr = cpu.calldescrof_dynamic(argtypes, restype,
+                                             EffectInfo.MOST_GENERAL,
+                                             ffi_flags=flags)
         # ^^^ may be None if unsupported
         self.prepare_op = prepare_op
         self.delayed_ops = []
 
     def _get_signature(self, funcval):
         """
-        given the funcval, return a tuple (argtypes, restype), where the
-        actuall types are libffi.types.*
+        given the funcval, return a tuple (argtypes, restype, flags), where
+        the actual types are libffi.types.*
 
         The implementation is tricky because we have three possible cases:
 
         - translated: the easiest case, we can just cast back the pointer to
-          the original Func instance and read .argtypes and .restype
+          the original Func instance and read .argtypes, .restype and .flags
 
         - completely untranslated: this is what we get from test_optimizeopt
           tests. funcval contains a FakeLLObject whose _fake_class is Func,
-          and we can just get .argtypes and .restype
+          and we can just get .argtypes, .restype and .flags
 
         - partially translated: this happens when running metainterp tests:
           funcval contains the low-level equivalent of a Func, and thus we
@@ -48,10 +50,10 @@
         llfunc = funcval.box.getref_base()
         if we_are_translated():
             func = cast_base_ptr_to_instance(Func, llfunc)
-            return func.argtypes, func.restype
+            return func.argtypes, func.restype, func.flags
         elif getattr(llfunc, '_fake_class', None) is Func:
             # untranslated
-            return llfunc.argtypes, llfunc.restype
+            return llfunc.argtypes, llfunc.restype, llfunc.flags
         else:
             # partially translated
             # llfunc contains an opaque pointer to something like the following:
@@ -62,7 +64,7 @@
             # because we don't have the exact TYPE to cast to.  Instead, we
             # just fish it manually :-(
             f = llfunc._obj.container
-            return f.inst_argtypes, f.inst_restype
+            return f.inst_argtypes, f.inst_restype, f.inst_flags
 
 
 class OptFfiCall(Optimization):
@@ -195,9 +197,7 @@
 
     def _get_oopspec(self, op):
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            return effectinfo.oopspecindex
-        return EffectInfo.OS_NONE
+        return effectinfo.oopspecindex
 
     def _get_funcval(self, op):
         return self.getvalue(op.getarg(1))
diff --git a/pypy/jit/metainterp/optimizeopt/heap.py b/pypy/jit/metainterp/optimizeopt/heap.py
--- a/pypy/jit/metainterp/optimizeopt/heap.py
+++ b/pypy/jit/metainterp/optimizeopt/heap.py
@@ -235,31 +235,33 @@
             opnum == rop.CALL_RELEASE_GIL or
             opnum == rop.CALL_ASSEMBLER):
             if opnum == rop.CALL_ASSEMBLER:
-                effectinfo = None
+                self._seen_guard_not_invalidated = False
             else:
                 effectinfo = op.getdescr().get_extra_info()
-            if effectinfo is None or effectinfo.check_can_invalidate():
-                self._seen_guard_not_invalidated = False
-            if effectinfo is not None and not effectinfo.has_random_effects():
-                # XXX we can get the wrong complexity here, if the lists
-                # XXX stored on effectinfo are large
-                for fielddescr in effectinfo.readonly_descrs_fields:
-                    self.force_lazy_setfield(fielddescr)
-                for arraydescr in effectinfo.readonly_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr)
-                for fielddescr in effectinfo.write_descrs_fields:
-                    self.force_lazy_setfield(fielddescr, can_cache=False)
-                for arraydescr in effectinfo.write_descrs_arrays:
-                    self.force_lazy_setarrayitem(arraydescr, can_cache=False)
-                if effectinfo.check_forces_virtual_or_virtualizable():
-                    vrefinfo = self.optimizer.metainterp_sd.virtualref_info
-                    self.force_lazy_setfield(vrefinfo.descr_forced)
-                    # ^^^ we only need to force this field; the other fields
-                    # of virtualref_info and virtualizable_info are not gcptrs.
-                return
+                if effectinfo.check_can_invalidate():
+                    self._seen_guard_not_invalidated = False
+                if not effectinfo.has_random_effects():
+                    self.force_from_effectinfo(effectinfo)
+                    return
         self.force_all_lazy_setfields_and_arrayitems()
         self.clean_caches()
 
+    def force_from_effectinfo(self, effectinfo):
+        # XXX we can get the wrong complexity here, if the lists
+        # XXX stored on effectinfo are large
+        for fielddescr in effectinfo.readonly_descrs_fields:
+            self.force_lazy_setfield(fielddescr)
+        for arraydescr in effectinfo.readonly_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr)
+        for fielddescr in effectinfo.write_descrs_fields:
+            self.force_lazy_setfield(fielddescr, can_cache=False)
+        for arraydescr in effectinfo.write_descrs_arrays:
+            self.force_lazy_setarrayitem(arraydescr, can_cache=False)
+        if effectinfo.check_forces_virtual_or_virtualizable():
+            vrefinfo = self.optimizer.metainterp_sd.virtualref_info
+            self.force_lazy_setfield(vrefinfo.descr_forced)
+            # ^^^ we only need to force this field; the other fields
+            # of virtualref_info and virtualizable_info are not gcptrs.
 
     def turned_constant(self, value):
         assert value.is_constant()
diff --git a/pypy/jit/metainterp/optimizeopt/rewrite.py b/pypy/jit/metainterp/optimizeopt/rewrite.py
--- a/pypy/jit/metainterp/optimizeopt/rewrite.py
+++ b/pypy/jit/metainterp/optimizeopt/rewrite.py
@@ -433,11 +433,10 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
-            if oopspecindex == EffectInfo.OS_ARRAYCOPY:
-                if self._optimize_CALL_ARRAYCOPY(op):
-                    return
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex == EffectInfo.OS_ARRAYCOPY:
+            if self._optimize_CALL_ARRAYCOPY(op):
+                return
         self.emit_operation(op)
 
     def _optimize_CALL_ARRAYCOPY(self, op):
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_optimizefficall.py
@@ -14,12 +14,15 @@
     can check that the signature of a call is really what you want.
     """
 
-    def __init__(self, arg_types, typeinfo):
+    def __init__(self, arg_types, typeinfo, flags):
         self.arg_types = arg_types
         self.typeinfo = typeinfo   # return type
+        self.flags = flags
 
     def __eq__(self, other):
-        return self.arg_types == other.arg_types and self.typeinfo == other.typeinfo
+        return (self.arg_types == other.arg_types and
+                self.typeinfo == other.typeinfo and
+                self.flags == other.get_ffi_flags())
 
 class FakeLLObject(object):
 
@@ -41,24 +44,31 @@
         vable_token_descr = LLtypeMixin.valuedescr
         valuedescr = LLtypeMixin.valuedescr
 
-        int_float__int = MyCallDescr('if', 'i')
+        int_float__int_42 = MyCallDescr('if', 'i', 42)
+        int_float__int_43 = MyCallDescr('if', 'i', 43)
         funcptr = FakeLLObject()
         func = FakeLLObject(_fake_class=Func,
                             argtypes=[types.sint, types.double],
-                            restype=types.sint)
+                            restype=types.sint,
+                            flags=42)
         func2 = FakeLLObject(_fake_class=Func,
                              argtypes=[types.sint, types.double],
-                             restype=types.sint)
+                             restype=types.sint,
+                             flags=43)
         #
         def calldescr(cpu, FUNC, oopspecindex, extraeffect=None):
-            einfo = EffectInfo([], [], [], [], oopspecindex=oopspecindex,
+            if extraeffect == EffectInfo.EF_RANDOM_EFFECTS:
+                f = None   # means "can force all" really
+            else:
+                f = []
+            einfo = EffectInfo(f, f, f, f, oopspecindex=oopspecindex,
                                extraeffect=extraeffect)
             return cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, einfo)
         #
         libffi_prepare =  calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PREPARE)
         libffi_push_arg = calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_PUSH_ARG)
         libffi_call =     calldescr(cpu, FUNC, EffectInfo.OS_LIBFFI_CALL,
-                                 EffectInfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE)
+                                    EffectInfo.EF_RANDOM_EFFECTS)
     
     namespace = namespace.__dict__
 
@@ -79,7 +89,7 @@
         """
         expected = """
         [i0, f1]
-        i3 = call_release_gil(12345, i0, f1, descr=int_float__int)
+        i3 = call_release_gil(12345, i0, f1, descr=int_float__int_42)
         guard_not_forced() []
         guard_no_exception() []
         jump(i3, f1)
@@ -119,7 +129,7 @@
         [i0, f1, p2]
         i4 = force_token()
         setfield_gc(p2, i4, descr=vable_token_descr)
-        i3 = call_release_gil(12345, i0, f1, descr=int_float__int)
+        i3 = call_release_gil(12345, i0, f1, descr=int_float__int_42)
         guard_not_forced() [p2]
         guard_no_exception() [p2]
         jump(i3, f1, p2)
@@ -216,7 +226,7 @@
         call(0, ConstPtr(func),                        descr=libffi_prepare)
         #
         # this "nested" call is nicely optimized
-        i4 = call_release_gil(67890, i0, f1, descr=int_float__int)
+        i4 = call_release_gil(67890, i0, f1, descr=int_float__int_43)
         guard_not_forced() []
         guard_no_exception() []
         #
@@ -261,7 +271,7 @@
         expected = """
         [i0, f1, p2]
         setfield_gc(p2, i0, descr=valuedescr)
-        i3 = call_release_gil(12345, i0, f1, descr=int_float__int)
+        i3 = call_release_gil(12345, i0, f1, descr=int_float__int_42)
         guard_not_forced() []
         guard_no_exception() []
         jump(i3, f1, p2)
diff --git a/pypy/jit/metainterp/optimizeopt/test/test_util.py b/pypy/jit/metainterp/optimizeopt/test/test_util.py
--- a/pypy/jit/metainterp/optimizeopt/test/test_util.py
+++ b/pypy/jit/metainterp/optimizeopt/test/test_util.py
@@ -167,7 +167,8 @@
     onedescr = cpu.fielddescrof(U, 'one')
 
     FUNC = lltype.FuncType([lltype.Signed], lltype.Signed)
-    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+    plaincalldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
+                                     EffectInfo.MOST_GENERAL)
     nonwritedescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
                                     EffectInfo([], [], [], []))
     writeadescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT,
diff --git a/pypy/jit/metainterp/optimizeopt/vstring.py b/pypy/jit/metainterp/optimizeopt/vstring.py
--- a/pypy/jit/metainterp/optimizeopt/vstring.py
+++ b/pypy/jit/metainterp/optimizeopt/vstring.py
@@ -455,8 +455,8 @@
         # specifically the given oopspec call.  For non-oopspec calls,
         # oopspecindex is just zero.
         effectinfo = op.getdescr().get_extra_info()
-        if effectinfo is not None:
-            oopspecindex = effectinfo.oopspecindex
+        oopspecindex = effectinfo.oopspecindex
+        if oopspecindex != EffectInfo.OS_NONE:
             for value, meth in opt_call_oopspec_ops:
                 if oopspecindex == value:      # a match with the OS_STR_xxx
                     if meth(self, op, mode_string):
diff --git a/pypy/jit/metainterp/pyjitpl.py b/pypy/jit/metainterp/pyjitpl.py
--- a/pypy/jit/metainterp/pyjitpl.py
+++ b/pypy/jit/metainterp/pyjitpl.py
@@ -1257,10 +1257,8 @@
         assert i == len(allboxes)
         #
         effectinfo = descr.get_extra_info()
-        if (effectinfo is None or
-                effectinfo.extraeffect ==
-                             effectinfo.EF_FORCES_VIRTUAL_OR_VIRTUALIZABLE or
-                assembler_call):
+        if (assembler_call or
+                effectinfo.check_forces_virtual_or_virtualizable()):
             # residual calls require attention to keep virtualizables in-sync
             self.metainterp.clear_exception()
             self.metainterp.vable_and_vrefs_before_residual_call()
@@ -1693,12 +1691,11 @@
             return
         if opnum == rop.CALL:
             effectinfo = descr.get_extra_info()
-            if effectinfo is not None:
-                ef = effectinfo.extraeffect
-                if ef == effectinfo.EF_LOOPINVARIANT or \
-                   ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
-                   ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
-                    return
+            ef = effectinfo.extraeffect
+            if ef == effectinfo.EF_LOOPINVARIANT or \
+               ef == effectinfo.EF_ELIDABLE_CANNOT_RAISE or \
+               ef == effectinfo.EF_ELIDABLE_CAN_RAISE:
+                return
         if self.heap_cache:
             self.heap_cache.clear()
         if self.heap_array_cache:
diff --git a/pypy/jit/metainterp/test/test_compile.py b/pypy/jit/metainterp/test/test_compile.py
--- a/pypy/jit/metainterp/test/test_compile.py
+++ b/pypy/jit/metainterp/test/test_compile.py
@@ -190,7 +190,7 @@
     class FakeJitDriverSD:
         portal_runner_ptr = llhelper(lltype.Ptr(FUNC), ll_portal_runner)
         portal_runner_adr = llmemory.cast_ptr_to_adr(portal_runner_ptr)
-        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT)
+        portal_calldescr = cpu.calldescrof(FUNC, FUNC.ARGS, FUNC.RESULT, None)
         portal_finishtoken = compile.DoneWithThisFrameDescrInt()
         num_red_args = 2
         result_type = INT
diff --git a/pypy/jit/metainterp/test/test_string.py b/pypy/jit/metainterp/test/test_string.py
--- a/pypy/jit/metainterp/test/test_string.py
+++ b/pypy/jit/metainterp/test/test_string.py
@@ -1,5 +1,6 @@
 import py
 from pypy.rlib.jit import JitDriver, dont_look_inside, we_are_jitted
+from pypy.rlib.debug import debug_print
 from pypy.jit.codewriter.policy import StopAtXPolicy
 from pypy.rpython.ootypesystem import ootype
 from pypy.jit.metainterp.test.support import LLJitMixin, OOJitMixin
@@ -521,7 +522,8 @@
         jitdriver = JitDriver(greens = ['g'], reds = ['m'])
         @dont_look_inside
         def escape(x):
-            print str(x)
+            # a plain "print" would call os.write() and release the gil
+            debug_print(str(x))
         def f(g, m):
             g = str(g)
             while m >= 0:
diff --git a/pypy/jit/metainterp/test/test_virtualstate.py b/pypy/jit/metainterp/test/test_virtualstate.py
--- a/pypy/jit/metainterp/test/test_virtualstate.py
+++ b/pypy/jit/metainterp/test/test_virtualstate.py
@@ -1,3 +1,4 @@
+from __future__ import with_statement
 import py
 from pypy.jit.metainterp.optimize import InvalidLoop
 from pypy.jit.metainterp.optimizeopt.virtualstate import VirtualStateInfo, VStructStateInfo, \
diff --git a/pypy/jit/metainterp/test/test_warmspot.py b/pypy/jit/metainterp/test/test_warmspot.py
--- a/pypy/jit/metainterp/test/test_warmspot.py
+++ b/pypy/jit/metainterp/test/test_warmspot.py
@@ -252,6 +252,41 @@
         self.check_loops({'int_sub': 1, 'int_gt': 1, 'guard_true': 1,
                           'jump': 1})
 
+    def test_void_red_variable(self):
+        mydriver = JitDriver(greens=[], reds=['a', 'm'])
+        def f1(m):
+            a = None
+            while m > 0:
+                mydriver.jit_merge_point(a=a, m=m)
+                m = m - 1
+                if m == 10:
+                    pass   # other case
+        self.meta_interp(f1, [18])
+
+    def test_bug_constant_rawptrs(self):
+        py.test.skip("crashes because a is a constant")
+        from pypy.rpython.lltypesystem import lltype, rffi
+        mydriver = JitDriver(greens=['a'], reds=['m'])
+        def f1(m):
+            a = lltype.nullptr(rffi.VOIDP.TO)
+            while m > 0:
+                mydriver.jit_merge_point(a=a, m=m)
+                m = m - 1
+        self.meta_interp(f1, [18])
+
+    def test_bug_rawptrs(self):
+        from pypy.rpython.lltypesystem import lltype, rffi
+        mydriver = JitDriver(greens=['a'], reds=['m'])
+        def f1(m):
+            a = lltype.malloc(rffi.VOIDP.TO, 5, flavor='raw')
+            while m > 0:
+                mydriver.jit_merge_point(a=a, m=m)
+                m = m - 1
+                if m == 10:
+                    pass
+            lltype.free(a, flavor='raw')
+        self.meta_interp(f1, [18])
+
 
 class TestLLWarmspot(WarmspotTests, LLJitMixin):
     CPUClass = runner.LLtypeCPU
diff --git a/pypy/jit/metainterp/warmspot.py b/pypy/jit/metainterp/warmspot.py
--- a/pypy/jit/metainterp/warmspot.py
+++ b/pypy/jit/metainterp/warmspot.py
@@ -21,6 +21,7 @@
 from pypy.jit.metainterp.jitdriver import JitDriverStaticData
 from pypy.jit.codewriter import support, codewriter, longlong
 from pypy.jit.codewriter.policy import JitPolicy
+from pypy.jit.codewriter.effectinfo import EffectInfo
 from pypy.jit.metainterp.optimizeopt import ALL_OPTS_NAMES
 
 # ____________________________________________________________
@@ -244,7 +245,8 @@
         graph.startblock = support.split_before_jit_merge_point(*jmpp)
         graph.startblock.isstartblock = True
         # a crash in the following checkgraph() means that you forgot
-        # to list some variable in greens=[] or reds=[] in JitDriver.
+        # to list some variable in greens=[] or reds=[] in JitDriver,
+        # or that a jit_merge_point() takes a constant as an argument.
         checkgraph(graph)
         for v in graph.getargs():
             assert isinstance(v, Variable)
@@ -654,11 +656,13 @@
         portalfunc_ARGS = []
         nums = {}
         for i, ARG in enumerate(PORTALFUNC.ARGS):
+            kind = history.getkind(ARG)
+            assert kind != 'void'
             if i < len(jd.jitdriver.greens):
                 color = 'green'
             else:
                 color = 'red'
-            attrname = '%s_%s' % (color, history.getkind(ARG))
+            attrname = '%s_%s' % (color, kind)
             count = nums.get(attrname, 0)
             nums[attrname] = count + 1
             portalfunc_ARGS.append((ARG, attrname, count))
@@ -746,7 +750,8 @@
         jd.portal_calldescr = self.cpu.calldescrof(
             jd._PTR_PORTAL_FUNCTYPE.TO,
             jd._PTR_PORTAL_FUNCTYPE.TO.ARGS,
-            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT)
+            jd._PTR_PORTAL_FUNCTYPE.TO.RESULT,
+            EffectInfo.MOST_GENERAL)
 
         vinfo = jd.virtualizable_info
 
diff --git a/pypy/jit/metainterp/warmstate.py b/pypy/jit/metainterp/warmstate.py
--- a/pypy/jit/metainterp/warmstate.py
+++ b/pypy/jit/metainterp/warmstate.py
@@ -124,7 +124,7 @@
     # Hash of lltype or ootype object.
     # Only supports strings, unicodes and regular instances,
     # as well as primitives that can meaningfully be cast to Signed.
-    if isinstance(TYPE, lltype.Ptr):
+    if isinstance(TYPE, lltype.Ptr) and TYPE.TO._gckind == 'gc':
         if TYPE.TO is rstr.STR or TYPE.TO is rstr.UNICODE:
             return rstr.LLHelpers.ll_strhash(x)    # assumed not null
         else:
@@ -140,7 +140,7 @@
         else:
             return 0
     else:
-        return lltype.cast_primitive(lltype.Signed, x)
+        return rffi.cast(lltype.Signed, x)
 
 @specialize.ll_and_arg(3)
 def set_future_value(cpu, j, value, typecode):
diff --git a/pypy/module/__builtin__/__init__.py b/pypy/module/__builtin__/__init__.py
--- a/pypy/module/__builtin__/__init__.py
+++ b/pypy/module/__builtin__/__init__.py
@@ -19,6 +19,7 @@
         'sorted'        : 'app_functional.sorted',
         'any'           : 'app_functional.any',
         'all'           : 'app_functional.all',
+        'sum'           : 'app_functional.sum',
         'vars'          : 'app_inspect.vars',
         'dir'           : 'app_inspect.dir',
 
@@ -85,7 +86,6 @@
         'enumerate'     : 'functional.W_Enumerate',
         'min'           : 'functional.min',
         'max'           : 'functional.max',
-        'sum'           : 'functional.sum',
         'map'           : 'functional.map',
         'zip'           : 'functional.zip',
         'reduce'        : 'functional.reduce',
@@ -118,7 +118,7 @@
                 return module.Module(space, None, w_builtin)
            builtin = space.interpclass_w(w_builtin)
            if isinstance(builtin, module.Module):
-               return builtin   
+               return builtin
        # no builtin! make a default one.  Given them None, at least.
        builtin = module.Module(space, None)
        space.setitem(builtin.w_dict, space.wrap('None'), space.w_None)
diff --git a/pypy/module/__builtin__/app_functional.py b/pypy/module/__builtin__/app_functional.py
--- a/pypy/module/__builtin__/app_functional.py
+++ b/pypy/module/__builtin__/app_functional.py
@@ -34,3 +34,18 @@
         if not x:
             return False
     return True
+
+def sum(sequence, start=0):
+    """sum(sequence[, start]) -> value
+
+Returns the sum of a sequence of numbers (NOT strings) plus the value
+of parameter 'start' (which defaults to 0).  When the sequence is
+empty, returns start."""
+    if isinstance(start, basestring):
+        raise TypeError("sum() can't sum strings")
+    last = start
+    for x in sequence:
+        # Very intentionally *not* +=, that would have different semantics if
+        # start was a mutable type, such as a list
+        last = last + x
+    return last
\ No newline at end of file
diff --git a/pypy/module/__builtin__/functional.py b/pypy/module/__builtin__/functional.py
--- a/pypy/module/__builtin__/functional.py
+++ b/pypy/module/__builtin__/functional.py
@@ -292,7 +292,7 @@
                 raise
             break
         new_frame = space.createframe(code, w_func.w_func_globals,
-                                      w_func.closure)
+                                      w_func)
         new_frame.locals_stack_w[0] = w_item
         w_res = new_frame.run()
         result_w.append(w_res)
@@ -325,27 +325,6 @@
         result_w.append(w_res)
     return result_w
 
-def sum(space, w_sequence, w_start=0):
-    """sum(sequence[, start]) -> value
-
-Returns the sum of a sequence of numbers (NOT strings) plus the value
-of parameter 'start' (which defaults to 0).  When the sequence is
-empty, returns start."""
-    if space.is_true(space.isinstance(w_start, space.w_basestring)):
-        msg = "sum() can't sum strings"
-        raise OperationError(space.w_TypeError, space.wrap(msg))
-    w_iter = space.iter(w_sequence)
-    w_last = w_start
-    while True:
-        try:
-            w_next = space.next(w_iter)
-        except OperationError, e:
-            if not e.match(space, space.w_StopIteration):
-                raise
-            break
-        w_last = space.add(w_last, w_next)
-    return w_last
-
 @unwrap_spec(sequences_w="args_w")
 def zip(space, sequences_w):
     """Return a list of tuples, where the nth tuple contains every nth item of
diff --git a/pypy/module/__builtin__/test/test_classobj.py b/pypy/module/__builtin__/test/test_classobj.py
--- a/pypy/module/__builtin__/test/test_classobj.py
+++ b/pypy/module/__builtin__/test/test_classobj.py
@@ -981,6 +981,86 @@
         assert a.x == 2
         raises(TypeError, descr.__delete__, a)
 
+    def test_partial_ordering(self):
+        # A defines only __lt__, which returns its own 'self'.  'a1 > a2'
+        # is expected to be answered through the reflected a2.__lt__(a1),
+        # hence the result a2.
+        class A:
+            def __lt__(self, other):
+                return self
+        a1 = A()
+        a2 = A()
+        assert (a1 < a2) is a1
+        assert (a1 > a2) is a2
+
+    def test_eq_order(self):
+        # this gives the ordering of equality-related functions on top of
+        # CPython **for old-style classes**.
+        # Every comparison method returns '<class of self>:<defining class>.<op>'
+        # so each assert shows which operand's method was picked.
+        class A:
+            def __eq__(self, other): return self.__class__.__name__+':A.eq'
+            def __ne__(self, other): return self.__class__.__name__+':A.ne'
+            def __lt__(self, other): return self.__class__.__name__+':A.lt'
+            def __le__(self, other): return self.__class__.__name__+':A.le'
+            def __gt__(self, other): return self.__class__.__name__+':A.gt'
+            def __ge__(self, other): return self.__class__.__name__+':A.ge'
+        class B:
+            def __eq__(self, other): return self.__class__.__name__+':B.eq'
+            def __ne__(self, other): return self.__class__.__name__+':B.ne'
+            def __lt__(self, other): return self.__class__.__name__+':B.lt'
+            def __le__(self, other): return self.__class__.__name__+':B.le'
+            def __gt__(self, other): return self.__class__.__name__+':B.gt'
+            def __ge__(self, other): return self.__class__.__name__+':B.ge'
+        #
+        # unrelated classes: the left operand's method is used
+        assert (A() == B()) == 'A:A.eq'
+        assert (A() != B()) == 'A:A.ne'
+        assert (A() <  B()) == 'A:A.lt'
+        assert (A() <= B()) == 'A:A.le'
+        assert (A() >  B()) == 'A:A.gt'
+        assert (A() >= B()) == 'A:A.ge'
+        #
+        assert (B() == A()) == 'B:B.eq'
+        assert (B() != A()) == 'B:B.ne'
+        assert (B() <  A()) == 'B:B.lt'
+        assert (B() <= A()) == 'B:B.le'
+        assert (B() >  A()) == 'B:B.gt'
+        assert (B() >= A()) == 'B:B.ge'
+        #
+        # C subclasses A and overrides every method: the left operand's
+        # method is still the one that is used
+        class C(A):
+            def __eq__(self, other): return self.__class__.__name__+':C.eq'
+            def __ne__(self, other): return self.__class__.__name__+':C.ne'
+            def __lt__(self, other): return self.__class__.__name__+':C.lt'
+            def __le__(self, other): return self.__class__.__name__+':C.le'
+            def __gt__(self, other): return self.__class__.__name__+':C.gt'
+            def __ge__(self, other): return self.__class__.__name__+':C.ge'
+        #
+        assert (A() == C()) == 'A:A.eq'
+        assert (A() != C()) == 'A:A.ne'
+        assert (A() <  C()) == 'A:A.lt'
+        assert (A() <= C()) == 'A:A.le'
+        assert (A() >  C()) == 'A:A.gt'
+        assert (A() >= C()) == 'A:A.ge'
+        #
+        assert (C() == A()) == 'C:C.eq'
+        assert (C() != A()) == 'C:C.ne'
+        assert (C() <  A()) == 'C:C.lt'
+        assert (C() <= A()) == 'C:C.le'
+        assert (C() >  A()) == 'C:C.gt'
+        assert (C() >= A()) == 'C:C.ge'
+        #
+        # D subclasses A without overriding anything: it inherits A's
+        # methods, and again the left operand decides
+        class D(A):
+            pass
+        #
+        assert (A() == D()) == 'A:A.eq'
+        assert (A() != D()) == 'A:A.ne'
+        assert (A() <  D()) == 'A:A.lt'
+        assert (A() <= D()) == 'A:A.le'
+        assert (A() >  D()) == 'A:A.gt'
+        assert (A() >= D()) == 'A:A.ge'
+        #
+        assert (D() == A()) == 'D:A.eq'
+        assert (D() != A()) == 'D:A.ne'
+        assert (D() <  A()) == 'D:A.lt'
+        assert (D() <= A()) == 'D:A.le'
+        assert (D() >  A()) == 'D:A.gt'
+        assert (D() >= A()) == 'D:A.ge'
+
 
 class AppTestOldStyleClassStrDict(object):
     def setup_class(cls):
diff --git a/pypy/module/__pypy__/interp_builders.py b/pypy/module/__pypy__/interp_builders.py
--- a/pypy/module/__pypy__/interp_builders.py
+++ b/pypy/module/__pypy__/interp_builders.py
@@ -7,7 +7,7 @@
 
 class W_UnicodeBuilder(Wrappable):
     def __init__(self, space, size):
-        if size == -1:
+        if size < 0:
             self.builder = UnicodeBuilder()
         else:
             self.builder = UnicodeBuilder(size)
@@ -47,4 +47,4 @@
     append_slice = interp2app(W_UnicodeBuilder.descr_append_slice),
     build = interp2app(W_UnicodeBuilder.descr_build),
 )
-W_UnicodeBuilder.typedef.acceptable_as_base_class = False
\ No newline at end of file
+W_UnicodeBuilder.typedef.acceptable_as_base_class = False
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -687,11 +687,15 @@
 # support for the "string escape" codec
 # This is a bytes-to bytes transformation
 
-@unwrap_spec(errors='str_or_None')
-def escape_encode(space, w_string, errors='strict'):
-    w_repr = space.repr(w_string)
-    w_result = space.getslice(w_repr, space.wrap(1), space.wrap(-1))
-    return space.newtuple([w_result, space.len(w_string)])
+ at unwrap_spec(data=str, errors='str_or_None')
+def escape_encode(space, data, errors='strict'):
+    from pypy.objspace.std.stringobject import string_escape_encode
+    result = string_escape_encode(data, quote="'")
+    start = 1
+    end = len(result) - 1
+    assert end >= 0
+    w_result = space.wrap(result[start:end])
+    return space.newtuple([w_result, space.wrap(len(data))])
 
 @unwrap_spec(data=str, errors='str_or_None')
 def escape_decode(space, data, errors='strict'):
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -102,7 +102,6 @@
     
     def test_indexerror(self):
         test =   "\\"     # trailing backslash
-             
         raises (ValueError, test.decode,'string-escape')
 
     def test_charmap_decode(self):
@@ -292,6 +291,10 @@
         assert '\\0f'.decode('string_escape') == chr(0) + 'f'
         assert '\\08'.decode('string_escape') == chr(0) + '8'
 
+    def test_escape_encode(self):
+        # 'string_escape' leaves a double quote alone but backslash-escapes
+        # a single quote.
+        assert '"'.encode('string_escape') == '"'
+        assert "'".encode('string_escape') == "\\'"
+
     def test_decode_utf8_different_case(self):
         constant = u"a"
         assert constant.encode("utf-8") == constant.encode("UTF-8")
diff --git a/pypy/module/_continuation/__init__.py b/pypy/module/_continuation/__init__.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/__init__.py
@@ -0,0 +1,40 @@
+from pypy.interpreter.mixedmodule import MixedModule
+
+
+class Module(MixedModule):
+    """This module exposes 'one-shot continuation containers'.
+
+A 'continulet' object from this module is a container that stores a
+one-shot continuation.  It is similar in purpose to the 'f_back'
+attribute of frames, which points to where execution should continue
+after this frame finishes.  The difference is that it will be changed
+(often repeatedly) before the frame actually returns.
+
+To make a continulet object, call 'continulet' with a callable and
+optional extra arguments.  Later, the first time you switch() to the
+continulet, the callable is invoked with the same continulet object as
+the extra first argument.
+
+At this point, the one-shot continuation stored in the continulet points
+to the caller of switch().  When switch() is called again, this one-shot
+continuation is exchanged with the current one; it means that the caller
+of switch() is suspended, its continuation stored in the container, and
+the old continuation from the continulet object is resumed.
+
+Continulets are internally implemented using stacklets.  Stacklets
+are a bit more primitive (they are really one-shot continuations), but
+that idea only works in C, not in Python, notably because of exceptions.
+
+The most primitive API is actually 'permute()', which just permutes the
+one-shot continuation stored in two (or more) continulets.
+"""
+
+    # Names taken from app_continuation.py.
+    appleveldefs = {
+        'error': 'app_continuation.error',
+        'generator': 'app_continuation.generator',
+    }
+
+    # Names taken from interp_continuation.py.
+    interpleveldefs = {
+        'continulet': 'interp_continuation.W_Continulet',
+        'permute': 'interp_continuation.permute',
+    }
diff --git a/pypy/module/_continuation/app_continuation.py b/pypy/module/_continuation/app_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/app_continuation.py
@@ -0,0 +1,35 @@
+
+# Exported as '_continuation.error' (see 'appleveldefs' in __init__.py).
+class error(Exception):
+    "Usage error of the _continuation module."
+
+
+import _continuation
+
+
+class generator(object):
+
+    def __init__(self, callable):
+        self.__func__ = callable
+
+    def __get__(self, obj, type=None):
+        return generator(self.__func__.__get__(obj, type))
+
+    def __call__(self, *args, **kwds):
+        return genlet(self.__func__, *args, **kwds)
+
+
+class genlet(_continuation.continulet):
+
+    def __iter__(self):
+        return self
+
+    def next(self, value=None):
+        res = self.switch(value)
+        if self.is_pending():
+            return res
+        else:
+            if res is not None:
+                raise TypeError("_continuation.generator must return None")
+            raise StopIteration
+
+    send = next
diff --git a/pypy/module/_continuation/interp_continuation.py b/pypy/module/_continuation/interp_continuation.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/interp_continuation.py
@@ -0,0 +1,245 @@
+from pypy.rlib.rstacklet import StackletThread
+from pypy.rlib import jit
+from pypy.interpreter.error import OperationError
+from pypy.interpreter.executioncontext import ExecutionContext
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.typedef import TypeDef
+from pypy.interpreter.gateway import interp2app
+
+
+class W_Continulet(Wrappable):
+    # Interp-level object behind the app-level 'continulet' type.
+    # 'sthread' is None (class-level default) until descr_init() succeeds.
+    sthread = None
+
+    def __init__(self, space):
+        self.space = space
+        # states:
+        #  - not init'ed: self.sthread == None
+        #  - normal:      self.sthread != None, not is_empty_handle(self.h)
+        #  - finished:    self.sthread != None, is_empty_handle(self.h)
+
+    def check_sthread(self):
+        # Return the current execution context.  If its stacklet_thread is
+        # not the one this continulet was created with, reset start_state
+        # and raise the module's 'error'.
+        ec = self.space.getexecutioncontext()
+        if ec.stacklet_thread is not self.sthread:
+            start_state.clear()
+            raise geterror(self.space, "inter-thread support is missing")
+        return ec
+
+    def descr_init(self, w_callable, __args__):
+        # App-level __init__: stash the callable and its arguments in the
+        # global start_state, then create the stacklet, which starts in
+        # new_stacklet_callback().  Raises 'error' when called a second time.
+        if self.sthread is not None:
+            raise geterror(self.space, "continulet already __init__ialized")
+        start_state.origin = self
+        start_state.w_callable = w_callable
+        start_state.args = __args__
+        self.sthread = build_sthread(self.space)
+        try:
+            self.h = self.sthread.new(new_stacklet_callback)
+            if self.sthread.is_empty_handle(self.h):    # early return
+                raise MemoryError
+        except MemoryError:
+            # go back to the "not init'ed" state before reporting the error
+            self.sthread = None
+            start_state.clear()
+            raise getmemoryerror(self.space)
+
+    def switch(self, w_to):
+        # Common implementation of switch() and throw().  The value or the
+        # exception to send must already be stored in start_state.  With
+        # w_to being None this is a simple switch to self; otherwise it is
+        # a "double switch" whose final destination is 'to'.  Returns (or
+        # raises) whatever get_result() finds once we are switched back.
+        to = self.space.interp_w(W_Continulet, w_to, can_be_None=True)
+        if to is not None:
+            if self is to:    # double-switch to myself: no-op
+                return get_result()
+            if to.sthread is None:
+                start_state.clear()
+                raise geterror(self.space, "continulet not initialized yet")
+        if self.sthread is None:
+            start_state.clear()
+            raise geterror(self.space, "continulet not initialized yet")
+        ec = self.check_sthread()
+        # remembered here and put back into the execution context after
+        # the switch
+        saved_topframeref = ec.topframeref
+        #
+        start_state.origin = self
+        if to is None:
+            # simple switch: going to self.h
+            start_state.destination = self
+        else:
+            # double switch: the final destination is to.h
+            start_state.destination = to
+        #
+        h = start_state.destination.h
+        sthread = self.sthread
+        if sthread.is_empty_handle(h):
+            start_state.clear()
+            raise geterror(self.space, "continulet already finished")
+        #
+        try:
+            do_switch(sthread, h)
+        except MemoryError:
+            start_state.clear()
+            raise getmemoryerror(self.space)
+        #
+        ec = sthread.ec
+        ec.topframeref = saved_topframeref
+        return get_result()
+
+    def descr_switch(self, w_value=None, w_to=None):
+        # App-level switch(): record the value to send, then switch.
+        start_state.w_value = w_value
+        return self.switch(w_to)
+
+    def descr_throw(self, w_type, w_val=None, w_tb=None, w_to=None):
+        # App-level throw(): build and normalize an OperationError from the
+        # arguments, store it in start_state.propagate_exception, then
+        # switch.
+        from pypy.interpreter.pytraceback import check_traceback
+        space = self.space
+        #
+        msg = "throw() third argument must be a traceback object"
+        if space.is_w(w_tb, space.w_None):
+            tb = None
+        else:
+            tb = check_traceback(space, w_tb, msg)
+        #
+        operr = OperationError(w_type, w_val, tb)
+        operr.normalize_exception(space)
+        start_state.w_value = None
+        start_state.propagate_exception = operr
+        return self.switch(w_to)
+
+    def descr_is_pending(self):
+        # Wrapped bool: True only in the "normal" state listed in
+        # __init__(), i.e. initialized and with a non-empty handle.
+        valid = (self.sthread is not None
+                 and not self.sthread.is_empty_handle(self.h))
+        return self.space.newbool(valid)
+
+
+def W_Continulet___new__(space, w_subtype, __args__):
+    # App-level __new__: allocate an instance of w_subtype and run only the
+    # interp-level __init__(space).  '__args__' is not used here.
+    r = space.allocate_instance(W_Continulet, w_subtype)
+    r.__init__(space)
+    return space.wrap(r)
+
+
+# App-level view of W_Continulet: the type is called 'continulet' and maps
+# each app-level method name to the interp-level function implementing it.
+W_Continulet.typedef = TypeDef(
+    'continulet',
+    __module__ = '_continuation',
+    __new__     = interp2app(W_Continulet___new__),
+    __init__    = interp2app(W_Continulet.descr_init),
+    switch      = interp2app(W_Continulet.descr_switch),
+    throw       = interp2app(W_Continulet.descr_throw),
+    is_pending  = interp2app(W_Continulet.descr_is_pending),
+    )
+
+
+# ____________________________________________________________
+
+
+class State:
+    # Per-space cache (obtained with space.fromcache(State)) holding the
+    # app-level 'error' class and one prebuilt MemoryError.
+    def __init__(self, space):
+        self.space = space 
+        w_module = space.getbuiltinmodule('_continuation')
+        self.w_error = space.getattr(w_module, space.wrap('error'))
+        # note: despite the 'w_' prefix this is an OperationError instance
+        self.w_memoryerror = OperationError(space.w_MemoryError, space.w_None)
+
+def geterror(space, message):
+    cs = space.fromcache(State)
+    return OperationError(cs.w_error, space.wrap(message))
+
+def getmemoryerror(space):
+    cs = space.fromcache(State)
+    return cs.w_memoryerror
+
+# ____________________________________________________________
+
+
+class SThread(StackletThread):
+    # A StackletThread that also remembers the space and the execution
+    # context it was created for (see build_sthread()).
+
+    def __init__(self, space, ec):
+        StackletThread.__init__(self, space.config)
+        self.space = space
+        self.ec = ec
+
+# Class-level default: an execution context has no SThread until
+# build_sthread() creates one.
+ExecutionContext.stacklet_thread = None
+
+# ____________________________________________________________
+
+
+class StartState:   # xxx a single global to pass around the function to start
+    # Also used to pass the origin/destination of a switch and the value
+    # or exception being sent (see do_switch() and get_result()).
+    def clear(self):
+        # Reset every field to None.
+        self.origin = None
+        self.destination = None
+        self.w_callable = None
+        self.args = None
+        self.w_value = None
+        self.propagate_exception = None
+# the single module-level instance, starting with all fields cleared
+start_state = StartState()
+start_state.clear()
+
+
+def new_stacklet_callback(h, arg):
+    # Callback given to sthread.new() by W_Continulet.descr_init().  It
+    # picks up the continulet, callable and arguments from start_state,
+    # immediately switches back to 'h', and only runs the callable once it
+    # is switched to again.  The outcome (result or exception) is left in
+    # start_state for get_result().
+    self       = start_state.origin
+    w_callable = start_state.w_callable
+    args       = start_state.args
+    start_state.clear()
+    try:
+        do_switch(self.sthread, h)
+    except MemoryError:
+        return h       # oups!  do an early return in this case
+    #
+    space = self.space
+    try:
+        ec = self.sthread.ec
+        ec.topframeref = jit.vref_None
+
+        if start_state.propagate_exception is not None:
+            raise start_state.propagate_exception   # just propagate it further
+        if start_state.w_value is not space.w_None:
+            raise OperationError(space.w_TypeError, space.wrap(
+                "can't send non-None value to a just-started continulet"))
+
+        args = args.prepend(self.space.wrap(self))
+        w_result = space.call_args(w_callable, args)
+    except Exception, e:
+        # any exception, including the two raised above, is stored for
+        # get_result() instead of propagating out of this callback
+        start_state.propagate_exception = e
+    else:
+        start_state.w_value = w_result
+    # set up origin/destination for the do_switch() that resumes when this
+    # callback returns
+    start_state.origin = self
+    start_state.destination = self
+    return self.h
+
+
+def do_switch(sthread, h):
+    # Switch to the stacklet 'h'.  When sthread.switch() returns, read the
+    # origin/destination pair from start_state (and reset both fields),
+    # then update the handles: the destination takes over the origin's
+    # previous handle and the origin receives the handle returned by
+    # sthread.switch().
+    h = sthread.switch(h)
+    origin = start_state.origin
+    self = start_state.destination
+    start_state.origin = None
+    start_state.destination = None
+    self.h, origin.h = origin.h, h
+
+def get_result():
+    # Consume the outcome stored in start_state: raise the pending
+    # exception if there is one, otherwise return the pending value.  The
+    # field that was read is reset to None.
+    if start_state.propagate_exception:
+        e = start_state.propagate_exception
+        start_state.propagate_exception = None
+        raise e
+    w_value = start_state.w_value
+    start_state.w_value = None
+    return w_value
+
+def build_sthread(space):
+    # Return the SThread of the current execution context, creating it and
+    # storing it on the execution context on first use.
+    ec = space.getexecutioncontext()
+    sthread = ec.stacklet_thread
+    if not sthread:
+        sthread = ec.stacklet_thread = SThread(space, ec)
+    return sthread
+
+# ____________________________________________________________
+
+def permute(space, args_w):
+    # Rotate the handles stored in the given continulets.  Every argument
+    # must be a continulet that is initialized, belongs to the current
+    # execution context's SThread and is not finished; otherwise the
+    # module's 'error' is raised.  With fewer than two continulets nothing
+    # is changed.
+    sthread = build_sthread(space)
+    #
+    contlist = []
+    for w_cont in args_w:
+        cont = space.interp_w(W_Continulet, w_cont)
+        if cont.sthread is not sthread:
+            if cont.sthread is None:
+                raise geterror(space, "got a non-initialized continulet")
+            else:
+                raise geterror(space, "inter-thread support is missing")
+        elif sthread.is_empty_handle(cont.h):
+            raise geterror(space, "got an already-finished continulet")
+        contlist.append(cont)
+    #
+    if len(contlist) > 1:
+        # each continulet gets the handle previously held by the one before
+        # it in the list; the first one gets the last one's handle
+        other = contlist[-1].h
+        for cont in contlist:
+            other, cont.h = cont.h, other
diff --git a/pypy/module/_continuation/test/__init__.py b/pypy/module/_continuation/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_continuation/test/support.py b/pypy/module/_continuation/test/support.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/support.py
@@ -0,0 +1,12 @@
+import py
+from pypy.conftest import gettestobjspace
+from pypy.rpython.tool.rffi_platform import CompilationError
+
+
+class BaseAppTest:
+    # Common base class for the app-level tests of '_continuation'.
+    def setup_class(cls):
+        # Skip the whole class if importing rstacklet raises
+        # CompilationError; otherwise build an object space with the
+        # '_continuation' module enabled.
+        try:
+            import pypy.rlib.rstacklet
+        except CompilationError, e:
+            py.test.skip("cannot import rstacklet: %s" % e)
+        cls.space = gettestobjspace(usemodules=['_continuation'])
diff --git a/pypy/module/_continuation/test/test_generator.py b/pypy/module/_continuation/test/test_generator.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_generator.py
@@ -0,0 +1,70 @@
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestGenerator(BaseAppTest):
+    # App-level tests of the '_continuation.generator' decorator.
+
+    def test_simple(self):
+        # Values passed to gen.switch() come out of g.next() in order,
+        # including the one switched from the nested helper f2(); once f
+        # returns, next() raises StopIteration.
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        res = g.next()
+        assert res == 24
+        res = g.next()
+        assert res == 13
+        raises(StopIteration, g.next)
+
+    def test_iterator(self):
+        # The object returned by the decorated function is its own
+        # iterator and can be consumed with list().
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            f2(gen, n+2)
+            gen.switch(n+3)
+        #
+        def f2(gen, m):
+            gen.switch(m*2)
+        #
+        res = list(f(10))
+        assert res == [11, 24, 13]
+        g = f(20)
+        assert iter(g) is g
+
+    def test_bound_method(self):
+        # @generator used on a method: 'self' comes first, followed by the
+        # generator object and then the call arguments.
+        from _continuation import generator
+        #
+        class A(object):
+            def __init__(self, m):
+                self.m = m
+            #
+            @generator
+            def f(self, gen, n):
+                gen.switch(n - self.m)
+        #
+        a = A(10)
+        res = list(a.f(25))
+        assert res == [15]
+
+    def test_must_return_None(self):
+        # Returning anything other than None from the decorated function
+        # makes the final next() raise TypeError.
+        from _continuation import generator
+        #
+        @generator
+        def f(gen, n):
+            gen.switch(n+1)
+            return "foo"
+        #
+        g = f(10)
+        res = g.next()
+        assert res == 11
+        raises(TypeError, g.next)
diff --git a/pypy/module/_continuation/test/test_stacklet.py b/pypy/module/_continuation/test/test_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_stacklet.py
@@ -0,0 +1,635 @@
+import os
+from pypy.module._continuation.test.support import BaseAppTest
+
+
+class AppTestStacklet(BaseAppTest):
+    def setup_class(cls):
+        BaseAppTest.setup_class.im_func(cls)
+        cls.w_translated = cls.space.wrap(
+            os.path.join(os.path.dirname(__file__),
+                         'test_translated.py'))
+
+    def test_new_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            pass
+        #
+        c = continulet(empty_callback)
+        assert type(c) is continulet
+
+    def test_call_empty(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(1)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        assert seen == [1]
+
+    def test_no_double_init(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            pass
+        #
+        c = continulet(empty_callback)
+        raises(error, c.__init__, empty_callback)
+
+    def test_no_init_after_started(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            raises(error, c1.__init__, empty_callback)
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+
+    def test_no_init_after_finished(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c1):
+            return 42
+        #
+        c = continulet(empty_callback)
+        res = c.switch()
+        assert res == 42
+        raises(error, c.__init__, empty_callback)
+
+    def test_propagate_exception(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1):
+            assert c1 is c
+            seen.append(42)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(empty_callback)
+        raises(ValueError, c.switch)
+        assert seen == [42]
+
+    def test_callback_with_arguments(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c1, *args, **kwds):
+            seen.append(c1)
+            seen.append(args)
+            seen.append(kwds)
+            return 42
+        #
+        seen = []
+        c = continulet(empty_callback, 42, 43, foo=44, bar=45)
+        res = c.switch()
+        assert res == 42
+        assert seen == [c, (42, 43), {'foo': 44, 'bar': 45}]
+
+    def test_switch(self):
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            seen.append(1)
+            res = c.switch('a')
+            assert res == 'b'
+            seen.append(3)
+            return 'c'
+        #
+        seen = []
+        c = continulet(switchbackonce_callback)
+        seen.append(0)
+        res = c.switch()
+        assert res == 'a'
+        seen.append(2)
+        res = c.switch('b')
+        assert res == 'c'
+        assert seen == [0, 1, 2, 3]
+
+    def test_initial_switch_must_give_None(self):
+        from _continuation import continulet
+        #
+        def empty_callback(c):
+            return 'ok'
+        #
+        c = continulet(empty_callback)
+        res = c.switch(None)
+        assert res == 'ok'
+        #
+        c = continulet(empty_callback)
+        raises(TypeError, c.switch, 'foo')  # "can't send non-None value"
+
+    def test_continuation_error(self):
+        from _continuation import continulet, error
+        #
+        def empty_callback(c):
+            return 42
+        #
+        c = continulet(empty_callback)
+        c.switch()
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet already finished"
+
+    def test_not_initialized_yet(self):
+        from _continuation import continulet, error
+        c = continulet.__new__(continulet)
+        e = raises(error, c.switch)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_go_depth2(self):
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(3)
+            return 4
+        #
+        def depth1(c):
+            seen.append(1)
+            c2 = continulet(depth2)
+            seen.append(2)
+            res = c2.switch()
+            seen.append(res)
+            return 5
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [0, 1, 2, 3, 4, 5]
+
+    def test_exception_depth2(self):
+        # An exception raised by the callback of a nested continulet
+        # propagates out of the switch() that started it, where the outer
+        # continulet catches it and then returns normally.
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(2)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            try:
+                continulet(depth2).switch()
+            except ValueError:
+                seen.append(3)
+            return 4
+        #
+        seen = []
+        c = continulet(depth1)
+        res = c.switch()
+        seen.append(res)
+        assert seen == [1, 2, 3, 4]
+
+    def test_exception_with_switch(self):
+        # The callback switches back to the caller once and raises after
+        # being resumed; the exception comes out of the second c.switch().
+        from _continuation import continulet
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            raise ValueError
+        #
+        seen = []
+        c = continulet(depth1)
+        seen.append(0)
+        c.switch()
+        seen.append(2)
+        raises(ValueError, c.switch)
+        assert seen == [0, 1, 2, 3]
+
+    def test_is_pending(self):
+        # is_pending() is false before __init__, true from __init__ until the
+        # callback returns (also when queried from inside the callback), and
+        # false again once the callback has returned.
+        from _continuation import continulet
+        #
+        def switchbackonce_callback(c):
+            assert c.is_pending()
+            res = c.switch('a')
+            assert res == 'b'
+            assert c.is_pending()
+            return 'c'
+        #
+        c = continulet.__new__(continulet)
+        assert not c.is_pending()
+        c.__init__(switchbackonce_callback)
+        assert c.is_pending()
+        res = c.switch()
+        assert res == 'a'
+        assert c.is_pending()
+        res = c.switch('b')
+        assert res == 'c'
+        assert not c.is_pending()
+
+    def test_switch_alternate(self):
+        # Two independent continulets are resumed alternately from the main
+        # code.  Lower-case values travel through c_lower and upper-case
+        # values through c_upper; the asserts check they never get mixed up.
+        from _continuation import continulet
+        #
+        def func_lower(c):
+            res = c.switch('a')
+            assert res == 'b'
+            res = c.switch('c')
+            assert res == 'd'
+            return 'e'
+        #
+        def func_upper(c):
+            res = c.switch('A')
+            assert res == 'B'
+            res = c.switch('C')
+            assert res == 'D'
+            return 'E'
+        #
+        c_lower = continulet(func_lower)
+        c_upper = continulet(func_upper)
+        res = c_lower.switch()
+        assert res == 'a'
+        res = c_upper.switch()
+        assert res == 'A'
+        res = c_lower.switch('b')
+        assert res == 'c'
+        res = c_upper.switch('B')
+        assert res == 'C'
+        res = c_lower.switch('d')
+        assert res == 'e'
+        res = c_upper.switch('D')
+        assert res == 'E'
+
+    def test_exception_with_switch_depth2(self):
+        # Same idea as test_exception_with_switch, with a second continulet
+        # created and driven from inside the first one.  Checks the execution
+        # order through `seen`, and that is_pending() turns false once a
+        # callback has exited by raising.
+        from _continuation import continulet
+        #
+        def depth2(c):
+            seen.append(4)
+            c.switch()
+            seen.append(6)
+            raise ValueError
+        #
+        def depth1(c):
+            seen.append(1)
+            c.switch()
+            seen.append(3)
+            c2 = continulet(depth2)
+            c2.switch()
+            seen.append(5)
+            raises(ValueError, c2.switch)
+            assert not c2.is_pending()
+            seen.append(7)
+            # depth1 itself is still running at this point
+            assert c.is_pending()
+            raise KeyError
+        #
+        seen = []
+        c = continulet(depth1)
+        c.switch()
+        seen.append(2)
+        raises(KeyError, c.switch)
+        assert not c.is_pending()
+        assert seen == [1, 2, 3, 4, 5, 6, 7]
+
+    def test_random_switching(self):
+        # Extra positional arguments given to continulet() are passed on to
+        # the callback (s1 receives 123, s2 receives c1).  The rest of the
+        # test checks the values exchanged through a mixed sequence of
+        # switch() calls on both continulets, some issued from a nested
+        # helper call (t1).
+        from _continuation import continulet
+        #
+        def t1(c1):
+            return c1.switch()
+        def s1(c1, n):
+            assert n == 123
+            c2 = t1(c1)
+            return c1.switch('a') + 1
+        #
+        def s2(c2, c1):
+            res = c1.switch(c2)
+            assert res == 'a'
+            return c2.switch('b') + 2
+        #
+        def f():
+            c1 = continulet(s1, 123)
+            c2 = continulet(s2, c1)
+            c1.switch()
+            res = c2.switch()
+            assert res == 'b'
+            res = c1.switch(1000)
+            assert res == 1001
+            return c2.switch(2000)
+        #
+        res = f()
+        assert res == 2002
+
+    def test_f_back_is_None_for_now(self):
+        # Frame chain as seen from inside a continulet: the innermost frame
+        # is g, its f_back (and sys._getframe(1)) is f, f's f_back is None,
+        # and asking for sys._getframe(2) raises ValueError.
+        import sys
+        from _continuation import continulet
+        #
+        def g(c):
+            c.switch(sys._getframe(0))
+            c.switch(sys._getframe(0).f_back)
+            c.switch(sys._getframe(1))
+            c.switch(sys._getframe(1).f_back)
+            c.switch(sys._getframe(2))
+        def f(c):
+            g(c)
+        #
+        c = continulet(f)
+        f1 = c.switch()
+        assert f1.f_code.co_name == 'g'
+        f2 = c.switch()
+        assert f2.f_code.co_name == 'f'
+        f3 = c.switch()
+        assert f3.f_code.co_name == 'f'
+        f4 = c.switch()
+        assert f4 is None
+        raises(ValueError, c.switch)    # "call stack is not deep enough"
+
+    def test_traceback_is_complete(self):
+        # The traceback of an exception raised inside a continulet contains
+        # the frame that called switch() ('do') followed by the continulet's
+        # own frames ('f', then 'g'), and nothing after that.
+        import sys
+        from _continuation import continulet
+        #
+        def g():
+            raise KeyError
+        def f(c):
+            g()
+        #
+        def do(c):
+            c.switch()
+        #
+        c = continulet(f)
+        try:
+            do(c)
+        except KeyError:
+            tb = sys.exc_info()[2]
+        else:
+            raise AssertionError("should have raised!")
+        #
+        # tb itself is the entry for this test function; start at tb_next
+        assert tb.tb_next.tb_frame.f_code.co_name == 'do'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'f'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'g'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_switch2_simple(self):
+        # switch(value, to=other) hands the value directly to another
+        # continulet: f1 sends 'b' to c2 and f2 answers 'c' to c1.  f1 then
+        # returns 42 to the main code; f2 is never resumed again.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            assert res == 'b'
+            res = c2.switch('c', to=c1)
+            # bare name that is not defined in this test: executing this
+            # line would make the test fail
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('a')
+        assert res == 42
+
+    def test_switch2_pingpong(self):
+        # Ten round trips between two continulets through
+        # switch(value, to=...): f1 sends i to c2 and expects 100 + i back.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('started 1')
+            assert res == 'go'
+            for i in range(10):
+                res = c1.switch(i, to=c2)
+                assert res == 100 + i
+            return 42
+        def f2(c2):
+            res = c2.switch('started 2')
+            for i in range(10):
+                assert res == i
+                res = c2.switch(100 + i, to=c1)
+            # bare name that is not defined in this test: executing this
+            # line would make the test fail
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 'started 1'
+        res = c2.switch()
+        assert res == 'started 2'
+        res = c1.switch('go')
+        assert res == 42
+
+    def test_switch2_more_complex(self):
+        # f1 switches to c2 before c2 has been started (to=c2, no value).
+        # When f2 later returns 42, that value arrives at the c1.switch()
+        # call in the main code; c2 is then finished while c1 can still be
+        # resumed and returns 41.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch(to=c2)
+            assert res == 'a'
+            res = c1.switch('b', to=c2)
+            assert res == 'c'
+            return 41
+        def f2(c2):
+            res = c2.switch('a', to=c1)
+            assert res == 'b'
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c1.switch()
+        assert res == 42
+        assert not c2.is_pending()    # finished by returning 42
+        res = c1.switch('c')
+        assert res == 41
+
+    def test_switch2_no_op(self):
+        # Calling switch(value, to=c1) from inside c1's own callback gives
+        # the value straight back to the caller of switch().
+        from _continuation import continulet
+        #
+        def f1(c1):
+            res = c1.switch('a', to=c1)
+            assert res == 'a'
+            return 42
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == 42
+
+    def test_switch2_immediately_away(self):
+        # c1.switch('a', to=c2) while c1 has not been started: 'a' is
+        # delivered to f2 (suspended in its switch('z')), and the call in
+        # the main code returns 'm', the return value of f1.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            print 'in f1'
+            return 'm'
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            print 'got there!'
+            assert res == 'a'
+            return None
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        assert c1.is_pending()
+        assert c2.is_pending()
+        print 'calling!'
+        res = c1.switch('a', to=c2)
+        print 'back'
+        assert res == 'm'
+
+    def test_switch2_immediately_away_corner_case(self):
+        # Variant of test_switch2_immediately_away in which f2 returns a
+        # non-None value while c1 has not started yet: the c1.switch(to=c2)
+        # call in the main code raises TypeError.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            this_is_never_seen
+        #
+        def f2(c2):
+            res = c2.switch('z')
+            assert res is None
+            return 'b'    # this goes back into the caller, which is f1,
+                          # but f1 didn't start yet, so a non-None value
+                          # has nowhere to go to...
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == 'z'
+        raises(TypeError, c1.switch, to=c2)  # "can't send non-None value"
+
+    def test_switch2_not_initialized_yet(self):
+        # switch(to=c2) raises error "continulet not initialized yet" when
+        # the target c2 was built with __new__ only.
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        #
+        c1 = continulet(f1)
+        c2 = continulet.__new__(continulet)
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet not initialized yet"
+
+    def test_switch2_already_finished(self):
+        # switch(to=c2) raises error "continulet already finished" when the
+        # callback of the target c2 has already returned.
+        from _continuation import continulet, error
+        #
+        def f1(c1):
+            not_reachable
+        def empty_callback(c):
+            return 42
+        #
+        c1 = continulet(f1)
+        c2 = continulet(empty_callback)
+        c2.switch()
+        e = raises(error, c1.switch, to=c2)
+        assert str(e.value) == "continulet already finished"
+
+    def test_throw(self):
+        # throw() raises an exception inside the suspended continulet, at
+        # its pending switch() call.  Exercises throw(type), throw(instance),
+        # throw(type, instance) and throw(type, instance, tb), then a
+        # re-throw from inside the continulet back to the main code, and
+        # finally checks the chain of frames in the resulting traceback.
+        import sys
+        from _continuation import continulet
+        #
+        def f1(c1):
+            try:
+                c1.switch()
+            except KeyError:
+                res = "got keyerror"
+            try:
+                c1.switch(res)
+            except IndexError, e:
+                pass
+            try:
+                c1.switch(e)
+            except IndexError, e2:
+                pass
+            try:
+                c1.switch(e2)
+            except IndexError:
+                # send the exception, with its traceback, back to the caller
+                c1.throw(*sys.exc_info())
+            should_never_reach_here
+        #
+        c1 = continulet(f1)
+        c1.switch()
+        res = c1.throw(KeyError)
+        assert res == "got keyerror"
+        class FooError(IndexError):
+            pass
+        foo = FooError()
+        res = c1.throw(foo)
+        assert res is foo
+        res = c1.throw(IndexError, foo)
+        assert res is foo
+        #
+        def main():
+            def do_raise():
+                raise foo
+            try:
+                do_raise()
+            except IndexError:
+                tb = sys.exc_info()[2]
+            try:
+                c1.throw(IndexError, foo, tb)
+            except IndexError:
+                tb = sys.exc_info()[2]
+            return tb
+        #
+        tb = main()
+        # expected chain: main -> f1 -> main -> do_raise
+        assert tb.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_frame.f_code.co_name == 'f1'
+        assert tb.tb_next.tb_next.tb_frame.f_code.co_name == 'main'
+        assert tb.tb_next.tb_next.tb_next.tb_frame.f_code.co_name == 'do_raise'
+        assert tb.tb_next.tb_next.tb_next.tb_next is None
+
+    def test_throw_to_starting(self):
+        # throw() on a continulet that has not been started yet raises the
+        # given exception in the caller.  The body of f1 is a bare undefined
+        # name, so the test would not see IndexError if f1 were executed.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        #
+        c1 = continulet(f1)
+        raises(IndexError, c1.throw, IndexError)
+
+    def test_throw2_simple(self):
+        # c1.throw(IndexError, to=c2) with c1 not started: f2 gets the
+        # IndexError at its pending switch() and raises ValueError, which
+        # reaches the caller.  Afterwards neither continulet is pending.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            not_reached
+        def f2(c2):
+            try:
+                c2.switch("ready")
+            except IndexError:
+                raise ValueError
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        res = c2.switch()
+        assert res == "ready"
+        assert c1.is_pending()
+        assert c2.is_pending()
+        raises(ValueError, c1.throw, IndexError, to=c2)
+        assert not c1.is_pending()
+        assert not c2.is_pending()
+
+    def test_throw2_no_op(self):
+        # Calling throw(ValueError, to=c1) from inside c1's own callback
+        # raises ValueError right at the call; the callback then goes on
+        # and returns normally.
+        from _continuation import continulet
+        #
+        def f1(c1):
+            raises(ValueError, c1.throw, ValueError, to=c1)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        res = c1.switch()
+        assert res == "ok"
+
+    def test_permute(self):
+        # permute(c1, c2) is called from inside f2.  After it, the "ok"
+        # returned by f2 is what f1 receives from its pending switch(), and
+        # the c2.switch() call in the main code gets "done" from f1.
+        from _continuation import continulet, permute
+        #
+        def f1(c1):
+            res = c1.switch()
+            assert res == "ok"
+            return "done"
+        #
+        def f2(c2):
+            permute(c1, c2)
+            return "ok"
+        #
+        c1 = continulet(f1)
+        c2 = continulet(f2)
+        c1.switch()
+        res = c2.switch()
+        assert res == "done"
+
+    def test_various_depths(self):
+        # Executes the test_translated script (path in self.translated) in
+        # a fresh namespace, switches it to fast mode and calls its
+        # test_various_depths().  skip() is called first, unconditionally.
+        # NOTE(review): the test_translated.py added by this same changeset
+        # defines set_fast_mode() but no test_various_depths(); the last
+        # line looks stale -- confirm before re-enabling this test.
+        skip("may fail on top of CPython")
+        # run it from test_translated, but not while being actually translated
+        d = {}
+        execfile(self.translated, d)
+        d['set_fast_mode']()
+        d['test_various_depths']()
diff --git a/pypy/module/_continuation/test/test_translated.py b/pypy/module/_continuation/test/test_translated.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/test_translated.py
@@ -0,0 +1,132 @@
+import py
+try:
+    import _continuation
+except ImportError:
+    py.test.skip("to run on top of a translated pypy-c")
+
+import sys, random
+
+# ____________________________________________________________
+
+STATUS_MAX = 50000
+CONTINULETS = 50
+
+def set_fast_mode():
+    # Lower the two module-level limits (STATUS_MAX from 50000 to 100,
+    # CONTINULETS from 50 to 5) so that Runner.run_test() finishes sooner.
+    global STATUS_MAX, CONTINULETS
+    STATUS_MAX = 100
+    CONTINULETS = 5
+
+# ____________________________________________________________
+
+class Done(Exception):
+    # Raised by Runner.run_continulet() once STATUS_MAX iterations have been
+    # counted; caught in Runner.run_test(), which then checks the traceback.
+    pass
+
+
+class Runner(object):
+    # Stress test: creates a set of continulets, then keeps switching to
+    # randomly chosen ones until STATUS_MAX iterations have been counted.
+    # self.conts mirrors, on the Python side, which continulet is expected
+    # to be resumed by each switch; the asserts compare it with what happens.
+
+    def __init__(self):
+        self.foobar = 12345         # sentinel re-checked after every switch
+        self.conts = {}     # {continulet: parent-or-None}
+        self.contlist = []
+
+    def run_test(self):
+        # Main loop: switch from the main code (src=None) until a continulet
+        # raises Done, then validate the traceback of that exception.
+        self.start_continulets()
+        self.n = 0
+        try:
+            while True:
+                self.do_switch(src=None)
+                assert self.target is None
+        except Done:
+            self.check_traceback(sys.exc_info()[2])
+
+    def do_switch(self, src):
+        # Switch to a random continulet c.  Before switching, record in
+        # self.target the entry stored for c (who is expected to run next)
+        # and store `src` as c's new entry.  When control comes back here,
+        # self.target must designate `src`.
+        assert src not in self.conts.values()
+        c = random.choice(self.contlist)
+        self.target = self.conts[c]
+        self.conts[c] = src
+        c.switch()
+        assert self.target is src
+
+    def run_continulet(self, c, i):
+        # Callback of every continulet; `i` is its index in self.contlist.
+        # Loops forever doing random switches, and raises Done once the
+        # shared counter self.n reaches STATUS_MAX.
+        while True:
+            assert self.target is c
+            assert self.contlist[i] is c
+            self.do_switch(c)
+            assert self.foobar == 12345
+            self.n += 1
+            if self.n >= STATUS_MAX:
+                raise Done
+
+    def start_continulets(self, i=0):
+        # Creates continulets for i, i+1, ..., CONTINULETS (inclusive), each
+        # one from a deeper level of recursion than the previous one.
+        c = _continuation.continulet(self.run_continulet, i)
+        self.contlist.append(c)
+        if i < CONTINULETS:
+            self.start_continulets(i + 1)
+            # ^^^ start each continulet with a different base stack
+        self.conts[c] = c   # initially (i.e. not started) they are all loops
+
+    def check_traceback(self, tb):
+        # Compare the continulets found in the traceback of the Done
+        # exception with the chain recorded in self.conts.
+        found = []
+        tb = tb.tb_next     # skip the first entry of the traceback
+        while tb:
+            if tb.tb_frame.f_code.co_name != 'do_switch':
+                assert tb.tb_frame.f_code.co_name == 'run_continulet', (
+                    "got %r" % (tb.tb_frame.f_code.co_name,))
+                found.append(tb.tb_frame.f_locals['c'])
+            tb = tb.tb_next
+        found.reverse()
+        #
+        # follow self.conts from the current target until a None entry
+        expected = []
+        c = self.target
+        while c is not None:
+            expected.append(c)
+            c = self.conts[c]
+        #
+        assert found == expected, "%r == %r" % (found, expected)
+
+# ____________________________________________________________
+
+class AppTestWrapper:
+    # py.test entry points for this module; setup_class() skips the whole
+    # class unless the runappdirect option is set.
+    def setup_class(cls):
+        "Run test_various_depths() when we are run with 'pypy py.test -A'."
+        # NOTE(review): this module defines no test_various_depths(); the
+        # docstring above presumably refers to the tests yielded below.
+        from pypy.conftest import option
+        if not option.runappdirect:
+            py.test.skip("meant only for -A run")
+
+    def test_single_threaded(self):
+        # yields 20 independent runs, each on a fresh Runner
+        for i in range(20):
+            yield Runner().run_test,
+
+    def test_multi_threaded(self):
+        # yields 5 runs of multithreaded_test()
+        for i in range(5):
+            yield multithreaded_test,
+
+class ThreadTest(object):
+    # One Runner().run_test() call meant to be executed via run() in its
+    # own thread.  The lock is acquired at construction and released when
+    # run() ends, so acquiring it again waits for the end of run().
+    def __init__(self, lock):
+        self.lock = lock
+        self.ok = False     # set to True only if run_test() did not raise
+        lock.acquire()
+    def run(self):
+        try:
+            Runner().run_test()
+            self.ok = True
+        finally:
+            # release even on failure, so that the waiter is not blocked
+            self.lock.release()
+
+def multithreaded_test():
+    # Runs 5 ThreadTest objects in 5 new threads, waits for all of them by
+    # acquiring each of their locks, then checks that every run succeeded.
+    # Skipped if the 'thread' module cannot be imported.
+    try:
+        import thread
+    except ImportError:
+        py.test.skip("no threads")
+    ts = [ThreadTest(thread.allocate_lock()) for i in range(5)]
+    for t in ts:
+        thread.start_new_thread(t.run, ())
+    for t in ts:
+        t.lock.acquire()    # blocks until t.run() has released the lock
+    for t in ts:
+        assert t.ok
+
+# ____________________________________________________________
+
+if __name__ == '__main__':
+    Runner().run_test()
diff --git a/pypy/module/bz2/interp_bz2.py b/pypy/module/bz2/interp_bz2.py
--- a/pypy/module/bz2/interp_bz2.py
+++ b/pypy/module/bz2/interp_bz2.py
@@ -351,6 +351,7 @@
         self.decompressor = W_BZ2Decompressor(space)
         self.readlength = r_longlong(0)
         self.buffer = ""
+        self.pos = 0
         self.finished = False
         if buffering < 1024:
             buffering = 1024   # minimum amount of compressed data read at once
@@ -385,6 +386,7 @@
             self.stream.seek(0, 0)
             self.decompressor = W_BZ2Decompressor(self.space)
             self.readlength = r_longlong(0)
+            self.pos = 0
             self.buffer = ""
             self.finished = False
         else:
@@ -410,15 +412,19 @@
                                  self.space.wrap("compressed file ended before the logical end-of-the-stream was detected"))
         result = self.space.str_w(w_result)
         self.readlength += len(result)
-        result = self.buffer + result
+        if len(self.buffer) != self.pos:
+            pos = self.pos
+            assert pos >= 0
+            result = self.buffer[pos:] + result
         self.buffer = ''
+        self.pos = 0
         return result
 
     def read(self, n):
         # XXX not nice
         if n <= 0:
             return ''
-        while not self.buffer:
+        while self.pos == len(self.buffer):
             if self.finished:
                 return ""
             moredata = self.stream.read(max(self.buffering, n))
@@ -433,17 +439,25 @@
                     return ""
                 raise
             self.buffer = self.space.str_w(w_read)
-        if len(self.buffer) >= n:
-            result = self.buffer[:n]
-            self.buffer = self.buffer[n:]
+            self.pos = 0
+        if len(self.buffer) - self.pos >= n:
+            pos = self.pos
+            assert pos >= 0
+            result = self.buffer[pos:pos + n]
+            self.pos += n
         else:
-            result = self.buffer
+            pos = self.pos
+            assert pos >= 0
+            result = self.buffer[pos:]
+            self.pos = 0
             self.buffer = ""
         self.readlength += len(result)
         return result
 
     def peek(self):
-        return self.buffer
+        pos = self.pos
+        assert pos >= 0
+        return self.buffer[pos:]
 
     def try_to_find_file_descriptor(self):
         return self.stream.try_to_find_file_descriptor()
diff --git a/pypy/module/bz2/test/test_bz2_file.py b/pypy/module/bz2/test/test_bz2_file.py
--- a/pypy/module/bz2/test/test_bz2_file.py
+++ b/pypy/module/bz2/test/test_bz2_file.py
@@ -274,14 +274,14 @@
             pass
         del bz2f   # delete from this frame, which is captured in the traceback
 
-    def test_read_chunk10(self):
+    def test_read_chunk9(self):
         from bz2 import BZ2File
         self.create_temp_file()
         
         bz2f = BZ2File(self.temppath)
         text_read = ""
         while True:
-            data = bz2f.read(10)
+            data = bz2f.read(9) # 9 doesn't divide evenly into data length
             if not data:
                 break
             text_read = "%s%s" % (text_read, data)
diff --git a/pypy/module/cpyext/frameobject.py b/pypy/module/cpyext/frameobject.py
--- a/pypy/module/cpyext/frameobject.py
+++ b/pypy/module/cpyext/frameobject.py
@@ -57,7 +57,7 @@
     code = space.interp_w(PyCode, w_code)
     w_globals = from_ref(space, py_frame.c_f_globals)
 
-    frame = space.FrameClass(space, code, w_globals, closure=None)
+    frame = space.FrameClass(space, code, w_globals, outer_func=None)
     frame.f_lineno = py_frame.c_f_lineno
     w_obj = space.wrap(frame)
     track_reference(space, py_obj, w_obj)
diff --git a/pypy/module/micronumpy/__init__.py b/pypy/module/micronumpy/__init__.py
--- a/pypy/module/micronumpy/__init__.py
+++ b/pypy/module/micronumpy/__init__.py
@@ -1,43 +1,46 @@
-
 from pypy.interpreter.mixedmodule import MixedModule
 
+
 class Module(MixedModule):
-
     applevel_name = 'numpy'
 
     interpleveldefs = {
         'array': 'interp_numarray.SingleDimArray',
         'dtype': 'interp_dtype.W_Dtype',
+        'ufunc': 'interp_ufuncs.W_Ufunc',
 
         'zeros': 'interp_numarray.zeros',
         'empty': 'interp_numarray.zeros',
         'ones': 'interp_numarray.ones',
         'fromstring': 'interp_support.fromstring',
+    }
 
-        # ufuncs
-        'abs': 'interp_ufuncs.absolute',
-        'absolute': 'interp_ufuncs.absolute',
-        'add': 'interp_ufuncs.add',
-        'copysign': 'interp_ufuncs.copysign',
-        'divide': 'interp_ufuncs.divide',
-        'exp': 'interp_ufuncs.exp',
-        'fabs': 'interp_ufuncs.fabs',
-        'floor': 'interp_ufuncs.floor',
-        'maximum': 'interp_ufuncs.maximum',
-        'minimum': 'interp_ufuncs.minimum',
-    'multiply': 'interp_ufuncs.multiply',
-    'negative': 'interp_ufuncs.negative',
-    'reciprocal': 'interp_ufuncs.reciprocal',
-    'sign': 'interp_ufuncs.sign',
-    'subtract': 'interp_ufuncs.subtract',
-    'sin': 'interp_ufuncs.sin',
-    'cos': 'interp_ufuncs.cos',
-    'tan': 'interp_ufuncs.tan',
-    'arcsin': 'interp_ufuncs.arcsin',
-    'arccos': 'interp_ufuncs.arccos',
-    'arctan': 'interp_ufuncs.arctan',
-    'equal': 'interp_ufuncs.equal',
-    }
+    # ufuncs
+    for exposed, impl in [
+        ("abs", "absolute"),
+        ("absolute", "absolute"),
+        ("add", "add"),
+        ("arccos", "arccos"),
+        ("arcsin", "arcsin"),
+        ("arctan", "arctan"),
+        ("copysign", "copysign"),
+        ("cos", "cos"),
+        ("divide", "divide"),
+        ("exp", "exp"),
+        ("fabs", "fabs"),
+        ("floor", "floor"),
+        ("maximum", "maximum"),
+        ("minimum", "minimum"),
+        ("multiply", "multiply"),
+        ("negative", "negative"),
+        ("reciprocal", "reciprocal"),
+        ("sign", "sign"),
+        ("sin", "sin"),
+        ("subtract", "subtract"),
+        ("tan", "tan"),
+        ("equal", "equal")
+    ]:
+        interpleveldefs[exposed] = "interp_ufuncs.get(space).%s" % impl
 
     appleveldefs = {
         'average': 'app_numpy.average',
diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -20,6 +20,7 @@
 
 class FakeSpace(object):
     w_ValueError = None
+    w_TypeError = None
 
     def __init__(self):
         """NOT_RPYTHON"""
diff --git a/pypy/module/micronumpy/interp_dtype.py b/pypy/module/micronumpy/interp_dtype.py
--- a/pypy/module/micronumpy/interp_dtype.py
+++ b/pypy/module/micronumpy/interp_dtype.py
@@ -53,7 +53,9 @@
 
 VOID_TP = lltype.Ptr(lltype.Array(lltype.Void, hints={'nolength': True, "uncast_on_llgraph": True}))
 
-def create_low_level_dtype(num, kind, name, aliases, applevel_types, T, valtype):
+def create_low_level_dtype(num, kind, name, aliases, applevel_types, T, valtype,
+    expected_size=None):
+
     class Box(BaseBox):
         def __init__(self, val):
             self.val = val
@@ -113,6 +115,8 @@
     W_LowLevelDtype.aliases = aliases
     W_LowLevelDtype.applevel_types = applevel_types
     W_LowLevelDtype.num_bytes = rffi.sizeof(T)
+    if expected_size is not None:
+        assert W_LowLevelDtype.num_bytes == expected_size
     return W_LowLevelDtype
 
 
@@ -263,6 +267,9 @@
 class IntegerArithmeticDtype(ArithmaticTypeMixin):
     _mixin_ = True
 
+    def unwrap(self, space, w_item):
+        return self.adapt_val(space.int_w(space.int(w_item)))
+
     def for_computation(self, v):
         return widen(v)
 
@@ -290,7 +297,7 @@
     T = lltype.Bool,
     valtype = bool,
 )
-class W_BoolDtype(W_BoolDtype):
+class W_BoolDtype(IntegerArithmeticDtype, W_BoolDtype):
     def unwrap(self, space, w_item):
         return self.adapt_val(space.is_true(w_item))
 
@@ -301,20 +308,27 @@
     def for_computation(self, v):
         return int(v)
 
-    @binop
-    def add(self, v1, v2):
-        return bool(v1 + v2)
-
 W_Int8Dtype = create_low_level_dtype(
     num = 1, kind = SIGNEDLTR, name = "int8",
     aliases = ["int8"],
     applevel_types = [],
     T = rffi.SIGNEDCHAR,
     valtype = rffi.SIGNEDCHAR._type,
+    expected_size = 1,
 )
 class W_Int8Dtype(IntegerArithmeticDtype, W_Int8Dtype):
-    def unwrap(self, space, w_item):
-        return self.adapt_val(space.int_w(space.int(w_item)))
+    pass
+
+W_Int16Dtype = create_low_level_dtype(
+    num = 3, kind = SIGNEDLTR, name = "int16",
+    aliases = ["int16"],
+    applevel_types = [],
+    T = rffi.SHORT,
+    valtype = rffi.SHORT._type,
+    expected_size = 2,
+)
+class W_Int16Dtype(IntegerArithmeticDtype, W_Int16Dtype):
+    pass
 
 W_Int32Dtype = create_low_level_dtype(
     num = 5, kind = SIGNEDLTR, name = "int32",
@@ -322,10 +336,10 @@
     applevel_types = [],
     T = rffi.INT,
     valtype = rffi.INT._type,
+    expected_size = 4,
 )
 class W_Int32Dtype(IntegerArithmeticDtype, W_Int32Dtype):
-    def unwrap(self, space, w_item):
-        return self.adapt_val(space.int_w(space.int(w_item)))
+    pass
 
 W_Int64Dtype = create_low_level_dtype(
     num = 9, kind = SIGNEDLTR, name = "int64",
@@ -333,10 +347,10 @@
     applevel_types = ["long"],
     T = rffi.LONGLONG,
     valtype = rffi.LONGLONG._type,
+    expected_size = 8,
 )
 class W_Int64Dtype(IntegerArithmeticDtype, W_Int64Dtype):
-    def unwrap(self, space, w_item):
-        return self.adapt_val(space.int_w(space.int(w_item)))
+    pass
 
 W_Float64Dtype = create_low_level_dtype(
     num = 12, kind = FLOATINGLTR, name = "float64",
@@ -344,6 +358,7 @@
     applevel_types = ["float"],
     T = lltype.Float,
     valtype = float,
+    expected_size = 8,
 )
 class W_Float64Dtype(FloatArithmeticDtype, W_Float64Dtype):
     def unwrap(self, space, w_item):
@@ -354,7 +369,7 @@
 
 ALL_DTYPES = [
     W_BoolDtype,
-    W_Int8Dtype, W_Int32Dtype, W_Int64Dtype,
+    W_Int8Dtype, W_Int16Dtype, W_Int32Dtype, W_Int64Dtype,
     W_Float64Dtype
 ]
 
@@ -384,3 +399,4 @@
     kind = interp_attrproperty("kind", cls=W_Dtype),
     shape = GetSetProperty(W_Dtype.descr_get_shape),
 )
+W_Dtype.typedef.acceptable_as_base_class = False
diff --git a/pypy/module/micronumpy/interp_numarray.py b/pypy/module/micronumpy/interp_numarray.py
--- a/pypy/module/micronumpy/interp_numarray.py
+++ b/pypy/module/micronumpy/interp_numarray.py
@@ -53,26 +53,26 @@
             i += 1
         return arr
 
-    def _unaryop_impl(w_ufunc):
+    def _unaryop_impl(ufunc_name):
         def impl(self, space):
-            return w_ufunc(space, self)
-        return func_with_new_name(impl, "unaryop_%s_impl" % w_ufunc.__name__)
+            return getattr(interp_ufuncs.get(space), ufunc_name).call(space, [self])
+        return func_with_new_name(impl, "unaryop_%s_impl" % ufunc_name)
 
-    descr_pos = _unaryop_impl(interp_ufuncs.positive)
-    descr_neg = _unaryop_impl(interp_ufuncs.negative)
-    descr_abs = _unaryop_impl(interp_ufuncs.absolute)
+    descr_pos = _unaryop_impl("positive")
+    descr_neg = _unaryop_impl("negative")
+    descr_abs = _unaryop_impl("absolute")
 
-    def _binop_impl(w_ufunc):
+    def _binop_impl(ufunc_name):
         def impl(self, space, w_other):
-            return w_ufunc(space, self, w_other)
-        return func_with_new_name(impl, "binop_%s_impl" % w_ufunc.__name__)
+            return getattr(interp_ufuncs.get(space), ufunc_name).call(space, [self, w_other])
+        return func_with_new_name(impl, "binop_%s_impl" % ufunc_name)
 
-    descr_add = _binop_impl(interp_ufuncs.add)
-    descr_sub = _binop_impl(interp_ufuncs.subtract)
-    descr_mul = _binop_impl(interp_ufuncs.multiply)
-    descr_div = _binop_impl(interp_ufuncs.divide)
-    descr_pow = _binop_impl(interp_ufuncs.power)
-    descr_mod = _binop_impl(interp_ufuncs.mod)
+    descr_add = _binop_impl("add")
+    descr_sub = _binop_impl("subtract")
+    descr_mul = _binop_impl("multiply")
+    descr_div = _binop_impl("divide")
+    descr_pow = _binop_impl("power")
+    descr_mod = _binop_impl("mod")
 
     descr_eq = _binop_impl(interp_ufuncs.equal)
     descr_ne = _binop_impl(interp_ufuncs.not_equal)
@@ -81,69 +81,31 @@
     descr_gt = _binop_impl(interp_ufuncs.greater)
     descr_ge = _binop_impl(interp_ufuncs.greater_equal)
 
-    def _binop_right_impl(w_ufunc):
+    def _binop_right_impl(ufunc_name):
         def impl(self, space, w_other):
             w_other = scalar_w(space,
                 interp_ufuncs.find_dtype_for_scalar(space, w_other, self.find_dtype()),
                 w_other
             )
-            return w_ufunc(space, w_other, self)
-        return func_with_new_name(impl, "binop_right_%s_impl" % w_ufunc.__name__)
+            return getattr(interp_ufuncs.get(space), ufunc_name).call(space, [w_other, self])
+        return func_with_new_name(impl, "binop_right_%s_impl" % ufunc_name)
 
-    descr_radd = _binop_right_impl(interp_ufuncs.add)
-    descr_rsub = _binop_right_impl(interp_ufuncs.subtract)
-    descr_rmul = _binop_right_impl(interp_ufuncs.multiply)
-    descr_rdiv = _binop_right_impl(interp_ufuncs.divide)
-    descr_rpow = _binop_right_impl(interp_ufuncs.power)
-    descr_rmod = _binop_right_impl(interp_ufuncs.mod)
+    descr_radd = _binop_right_impl("add")
+    descr_rsub = _binop_right_impl("subtract")
+    descr_rmul = _binop_right_impl("multiply")
+    descr_rdiv = _binop_right_impl("divide")
+    descr_rpow = _binop_right_impl("power")
+    descr_rmod = _binop_right_impl("mod")
 
-    def _reduce_sum_prod_impl(op_name, init):
-        reduce_driver = jit.JitDriver(greens=['signature'],
-                         reds = ['i', 'size', 'self', 'result', 'res_dtype'])
+    def _reduce_ufunc_impl(ufunc_name):
+        def impl(self, space):
+            return getattr(interp_ufuncs.get(space), ufunc_name).descr_reduce(space, self)
+        return func_with_new_name(impl, "reduce_%s_impl" % ufunc_name)
 
-        def loop(self, res_dtype, result, size):
-            i = 0
-            while i < size:
-                reduce_driver.jit_merge_point(signature=self.signature,
-                                              self=self, res_dtype=res_dtype,
-                                              size=size, i=i, result=result)
-                result = getattr(res_dtype, op_name)(
-                    result,
-                    self.eval(i).convert_to(res_dtype)
-                )
-                i += 1
-            return result
-
-        def impl(self, space):
-            dtype = interp_ufuncs.find_unaryop_result_dtype(
-                space, self.find_dtype(), promote_to_largest=True
-            )
-            result = dtype.adapt_val(init)
-            return loop(self, dtype, result, self.find_size()).wrap(space)
-        return func_with_new_name(impl, "reduce_%s_impl" % op_name)
-
-    def _reduce_max_min_impl(op_name):
-        reduce_driver = jit.JitDriver(greens=['signature'],
-                         reds = ['i', 'size', 'self', 'result', 'dtype'])
-        def loop(self, result, size):
-            i = 1
-            dtype = self.find_dtype()
-            while i < size:
-                reduce_driver.jit_merge_point(signature=self.signature,
-                                              self=self, dtype=dtype,
-                                              size=size, i=i, result=result)
-                result = getattr(dtype, op_name)(result, self.eval(i))
-                i += 1
-            return result
-
-        def impl(self, space):
-            size = self.find_size()
-            if size == 0:
-                raise OperationError(space.w_ValueError,
-                    space.wrap("Can't call %s on zero-size arrays" \
-                            % op_name))
-            return loop(self, self.eval(0), size).wrap(space)
-        return func_with_new_name(impl, "reduce_%s_impl" % op_name)
+    descr_sum = _reduce_ufunc_impl("add")
+    descr_prod = _reduce_ufunc_impl("multiply")
+    descr_max = _reduce_ufunc_impl("maximum")
+    descr_min = _reduce_ufunc_impl("minimum")
 
     def _reduce_argmax_argmin_impl(op_name):
         reduce_driver = jit.JitDriver(greens=['signature'],
@@ -199,10 +161,6 @@
     def descr_any(self, space):
         return space.wrap(self._any())
 
-    descr_sum = _reduce_sum_prod_impl("add", 0)
-    descr_prod = _reduce_sum_prod_impl("mul", 1)
-    descr_max = _reduce_max_min_impl("max")
-    descr_min = _reduce_max_min_impl("min")
     descr_argmax = _reduce_argmax_argmin_impl("max")
     descr_argmin = _reduce_argmax_argmin_impl("min")
 
@@ -255,7 +213,7 @@
         res = "array([" + ", ".join(concrete._getnums(False)) + "]"
         dtype = concrete.find_dtype()
         if (dtype is not space.fromcache(interp_dtype.W_Float64Dtype) and
-            dtype is not space.fromcache(interp_dtype.W_Int64Dtype)):
+            dtype is not space.fromcache(interp_dtype.W_Int64Dtype)) or not self.find_size():
             res += ", dtype=" + dtype.name
         res += ")"
         return space.wrap(res)
@@ -266,7 +224,15 @@
         return space.wrap("[" + " ".join(concrete._getnums(True)) + "]")
 
     def descr_getitem(self, space, w_idx):
-        # TODO: indexing by tuples
+        # TODO: indexing by arrays and lists
+        if space.isinstance_w(w_idx, space.w_tuple):
+            length = space.len_w(w_idx)
+            if length == 0:
+                return space.wrap(self)
+            if length > 1: # only one dimension for now.
+                raise OperationError(space.w_IndexError,
+                                     space.wrap("invalid index"))
+            w_idx = space.getitem(w_idx, space.wrap(0))
         start, stop, step, slice_length = space.decode_index4(w_idx, self.find_size())
         if step == 0:
             # Single index
@@ -280,8 +246,19 @@
             return space.wrap(res)
 
     def descr_setitem(self, space, w_idx, w_value):
-        # TODO: indexing by tuples and lists
+        # TODO: indexing by arrays and lists
         self.invalidated()
+        if space.isinstance_w(w_idx, space.w_tuple):
+            length = space.len_w(w_idx)
+            if length > 1: # only one dimension for now.
+                raise OperationError(space.w_IndexError,
+                                     space.wrap("invalid index"))
+            if length == 0:
+                w_idx = space.newslice(space.wrap(0),
+                                      space.wrap(self.find_size()),
+                                      space.wrap(1))
+            else:
+                w_idx = space.getitem(w_idx, space.wrap(0))
         start, stop, step, slice_length = space.decode_index4(w_idx,
                                                               self.find_size())
         if step == 0:
@@ -487,7 +464,8 @@
         return self.parent.setitem_w(space, self.calc_index(item), w_value)
 
     def setitem(self, item, value):
-        return self.parent.setitem(self.calc_index(item), value)
+        # This is currently not possible to be called from anywhere.
+        raise NotImplementedError
 
     def descr_len(self, space):
         return space.wrap(self.find_size())
diff --git a/pypy/module/micronumpy/interp_ufuncs.py b/pypy/module/micronumpy/interp_ufuncs.py
--- a/pypy/module/micronumpy/interp_ufuncs.py
+++ b/pypy/module/micronumpy/interp_ufuncs.py
@@ -1,65 +1,171 @@
+from pypy.interpreter.baseobjspace import Wrappable
+from pypy.interpreter.error import OperationError, operationerrfmt
+from pypy.interpreter.gateway import interp2app
+from pypy.interpreter.typedef import TypeDef, GetSetProperty, interp_attrproperty
 from pypy.module.micronumpy import interp_dtype, signature
+from pypy.rlib import jit
 from pypy.tool.sourcetools import func_with_new_name
 
 
-def ufunc(func=None, promote_to_float=False):
-    if func is None:
-        return lambda func: ufunc(func, promote_to_float)
-    call_sig = signature.Call1(func)
-    def impl(space, w_obj):
+reduce_driver = jit.JitDriver(
+    greens = ["signature"],
+    reds = ["i", "size", "self", "dtype", "value", "obj"]
+)
+
+class W_Ufunc(Wrappable):
+    _attrs_ = ["name", "promote_to_float", "promote_bools", "bool_result", "identity"]
+
+    def __init__(self, name, promote_to_float, promote_bools, bool_result, identity):
+        self.name = name
+        self.promote_to_float = promote_to_float
+        self.promote_bools = promote_bools
+        self.bool_result = bool_result
+        self.identity = identity
+
+    def descr_repr(self, space):
+        return space.wrap("<ufunc '%s'>" % self.name)
+
+    def descr_get_identity(self, space):
+        if self.identity is None:
+            return space.w_None
+        return self.identity.wrap(space)
+
+    def descr_call(self, space, __args__):
+        try:
+            args_w = __args__.fixedunpack(self.argcount)
+        except ValueError, e:
+            raise OperationError(space.w_TypeError, space.wrap(str(e)))
+        return self.call(space, args_w)
+
+    def descr_reduce(self, space, w_obj):
+        from pypy.module.micronumpy.interp_numarray import convert_to_array, Scalar
+
+        if self.argcount != 2:
+            raise OperationError(space.w_ValueError, space.wrap("reduce only "
+                "supported for binary functions"))
+
+        assert isinstance(self, W_Ufunc2)
+        obj = convert_to_array(space, w_obj)
+        if isinstance(obj, Scalar):
+            raise OperationError(space.w_TypeError, space.wrap("cannot reduce "
+                "on a scalar"))
+
+        size = obj.find_size()
+        dtype = find_unaryop_result_dtype(
+            space, obj.find_dtype(),
+            promote_to_largest=True
+        )
+        start = 0
+        if self.identity is None:
+            if size == 0:
+                raise operationerrfmt(space.w_ValueError, "zero-size array to "
+                    "%s.reduce without identity", self.name)
+            value = obj.eval(0).convert_to(dtype)
+            start += 1
+        else:
+            value = self.identity.convert_to(dtype)
+        new_sig = signature.Signature.find_sig([
+            self.reduce_signature, obj.signature
+        ])
+        return self.reduce(new_sig, start, value, obj, dtype, size).wrap(space)
+
+    def reduce(self, signature, start, value, obj, dtype, size):
+        i = start
+        while i < size:
+            reduce_driver.jit_merge_point(signature=signature, self=self,
+                                          value=value, obj=obj, i=i,
+                                          dtype=dtype, size=size)
+            value = self.func(dtype, value, obj.eval(i).convert_to(dtype))
+            i += 1
+        return value
+
+class W_Ufunc1(W_Ufunc):
+    argcount = 1
+
+    def __init__(self, func, name, promote_to_float=False, promote_bools=False,
+        identity=None):
+
+        W_Ufunc.__init__(self, name, promote_to_float, promote_bools, False, identity)
+        self.func = func
+        self.signature = signature.Call1(func)
+
+    def call(self, space, args_w):
         from pypy.module.micronumpy.interp_numarray import (Call1,
             convert_to_array, Scalar)
 
+        [w_obj] = args_w
         w_obj = convert_to_array(space, w_obj)
         res_dtype = find_unaryop_result_dtype(space,
             w_obj.find_dtype(),
-            promote_to_float=promote_to_float,
+            promote_to_float=self.promote_to_float,
+            promote_bools=self.promote_bools,
         )
         if isinstance(w_obj, Scalar):
-            return func(res_dtype, w_obj.value.convert_to(res_dtype)).wrap(space)
+            return self.func(res_dtype, w_obj.value.convert_to(res_dtype)).wrap(space)
 
-        new_sig = signature.Signature.find_sig([call_sig, w_obj.signature])
+        new_sig = signature.Signature.find_sig([self.signature, w_obj.signature])
         w_res = Call1(new_sig, res_dtype, w_obj)
         w_obj.add_invalidates(w_res)
         return w_res
-    return func_with_new_name(impl, "%s_dispatcher" % func.__name__)
 
-def ufunc2(func=None, promote_to_float=False, bool_result=False):
-    if func is None:
-        return lambda func: ufunc2(func, promote_to_float, bool_result)
+class W_Ufunc2(W_Ufunc):
+    argcount = 2
 
-    call_sig = signature.Call2(func)
-    def impl(space, w_lhs, w_rhs):
+    def __init__(self, func, name, promote_to_float=False, promote_bools=False,
+        bool_result=False, identity=None):
+
+        W_Ufunc.__init__(self, name, promote_to_float, promote_bools, bool_result, identity)
+        self.func = func
+        self.signature = signature.Call2(func)
+        self.reduce_signature = signature.BaseSignature()
+
+    def call(self, space, args_w):
         from pypy.module.micronumpy.interp_numarray import (Call2,
             convert_to_array, Scalar)
 
+        [w_lhs, w_rhs] = args_w
         w_lhs = convert_to_array(space, w_lhs)
         w_rhs = convert_to_array(space, w_rhs)
         calc_dtype = find_binop_result_dtype(space,
             w_lhs.find_dtype(), w_rhs.find_dtype(),
-            promote_to_float=promote_to_float,
+            promote_to_float=self.promote_to_float,
+            promote_bools=self.promote_bools,
         )
         # Some operations return bool regardless of input type
-        if bool_result:
+        if self.bool_result:
             res_dtype = space.fromcache(interp_dtype.W_BoolDtype)
         else:
             res_dtype = calc_dtype
         if isinstance(w_lhs, Scalar) and isinstance(w_rhs, Scalar):
             lhs = w_lhs.value.convert_to(calc_dtype)
             rhs = w_rhs.value.convert_to(calc_dtype)
-            interm_res = func(calc_dtype, lhs, rhs)
+            interm_res = self.func(calc_dtype, lhs, rhs)
             return interm_res.convert_to(res_dtype).wrap(space)
+            return self.func(res_dtype, w_lhs.value, w_rhs.value).wrap(space)
 
         new_sig = signature.Signature.find_sig([
-            call_sig, w_lhs.signature, w_rhs.signature
+            self.signature, w_lhs.signature, w_rhs.signature
         ])
-        w_res = Call2(new_sig, res_dtype, calc_dtype, w_lhs, w_rhs)
+        w_res = Call2(new_sig, res_dtype, res_dtype, calc_dtype, w_lhs, w_rhs)
         w_lhs.add_invalidates(w_res)
         w_rhs.add_invalidates(w_res)
         return w_res
-    return func_with_new_name(impl, "%s_dispatcher" % func.__name__)
 
-def find_binop_result_dtype(space, dt1, dt2, promote_bools=False, promote_to_float=False):
+
+W_Ufunc.typedef = TypeDef("ufunc",
+    __module__ = "numpy",
+
+    __call__ = interp2app(W_Ufunc.descr_call),
+    __repr__ = interp2app(W_Ufunc.descr_repr),
+
+    identity = GetSetProperty(W_Ufunc.descr_get_identity),
+    nin = interp_attrproperty("argcount", cls=W_Ufunc),
+
+    reduce = interp2app(W_Ufunc.descr_reduce),
+)
+
+def find_binop_result_dtype(space, dt1, dt2, promote_to_float=False,
+    promote_bools=False):
     # dt1.num should be <= dt2.num
     if dt1.num > dt2.num:
         dt1, dt2 = dt2, dt1
@@ -79,7 +185,9 @@
     assert False
 
 def find_unaryop_result_dtype(space, dt, promote_to_float=False,
-    promote_to_largest=False):
+    promote_bools=False, promote_to_largest=False):
+    if promote_bools and (dt.kind == interp_dtype.BOOLLTR):
+        return space.fromcache(interp_dtype.W_Int8Dtype)
     if promote_to_float:
         for bytes, dtype in interp_dtype.dtypes_by_num_bytes:
             if dtype.kind == interp_dtype.FLOATINGLTR and dtype.num_bytes >= dt.num_bytes:
@@ -109,60 +217,72 @@
     return space.fromcache(interp_dtype.W_Float64Dtype)
 
 
-def ufunc_dtype_caller(ufunc_name, op_name, argcount, **kwargs):
+def ufunc_dtype_caller(ufunc_name, op_name, argcount):
     if argcount == 1:
-        @ufunc(**kwargs)
         def impl(res_dtype, value):
             return getattr(res_dtype, op_name)(value)
     elif argcount == 2:
-        @ufunc2(**kwargs)
         def impl(res_dtype, lvalue, rvalue):
             return getattr(res_dtype, op_name)(lvalue, rvalue)
     return func_with_new_name(impl, ufunc_name)
 
-for ufunc_def in [
-    ("add", "add", 2),
-    ("subtract", "sub", 2),
-    ("multiply", "mul", 2),
-    ("divide", "div", 2),
-    ("mod", "mod", 2),
-    ("power", "pow", 2),
+class UfuncState(object):
+    def __init__(self, space):
+        "NOT_RPYTHON"
+        for ufunc_def in [
+            ("add", "add", 2, {"identity": 0}),
+            ("subtract", "sub", 2),
+            ("multiply", "mul", 2, {"identity": 1}),
+            ("divide", "div", 2, {"promote_bools": True}),
+            ("mod", "mod", 2, {"promote_bools": True}),
+            ("power", "pow", 2, {"promote_bools": True}),
 
-    ("maximum", "max", 2),
-    ("minimum", "min", 2),
+            ("maximum", "max", 2),
+            ("minimum", "min", 2),
 
-    ("equal", "eq", 2, {"bool_result": True}),
-    ("not_equal", "ne", 2, {"bool_result": True}),
-    ("less", "lt", 2, {"bool_result": True}),
-    ("less_equal", "le", 2, {"bool_result": True}),
-    ("greater", "gt", 2, {"bool_result": True}),
-    ("greater_equal", "ge", 2, {"bool_result": True}),
+            ("equal", "eq", 2, {"bool_result": True}),
+            ("not_equal", "ne", 2, {"bool_result": True}),
+            ("less", "lt", 2, {"bool_result": True}),
+            ("less_equal", "le", 2, {"bool_result": True}),
+            ("greater", "gt", 2, {"bool_result": True}),
+            ("greater_equal", "ge", 2, {"bool_result": True}),
 
-    ("copysign", "copysign", 2, {"promote_to_float": True}),
+            ("copysign", "copysign", 2, {"promote_to_float": True}),
 
-    ("positive", "pos", 1),
-    ("negative", "neg", 1),
-    ("absolute", "abs", 1),
-    ("sign", "sign", 1),
-    ("reciprocal", "reciprocal", 1),
+            ("positive", "pos", 1),
+            ("negative", "neg", 1),
+            ("absolute", "abs", 1),
+            ("sign", "sign", 1, {"promote_bools": True}),
+            ("reciprocal", "reciprocal", 1),
 
-    ("fabs", "fabs", 1, {"promote_to_float": True}),
-    ("floor", "floor", 1, {"promote_to_float": True}),
-    ("exp", "exp", 1, {"promote_to_float": True}),
+            ("fabs", "fabs", 1, {"promote_to_float": True}),
+            ("floor", "floor", 1, {"promote_to_float": True}),
+            ("exp", "exp", 1, {"promote_to_float": True}),
 
-    ("sin", "sin", 1, {"promote_to_float": True}),
-    ("cos", "cos", 1, {"promote_to_float": True}),
-    ("tan", "tan", 1, {"promote_to_float": True}),
-    ("arcsin", "arcsin", 1, {"promote_to_float": True}),
-    ("arccos", "arccos", 1, {"promote_to_float": True}),
-    ("arctan", "arctan", 1, {"promote_to_float": True}),
-]:
-    ufunc_name = ufunc_def[0]
-    op_name = ufunc_def[1]
-    argcount = ufunc_def[2]
-    try:
-        extra_kwargs = ufunc_def[3]
-    except IndexError:
-        extra_kwargs = {}
+            ("sin", "sin", 1, {"promote_to_float": True}),
+            ("cos", "cos", 1, {"promote_to_float": True}),
+            ("tan", "tan", 1, {"promote_to_float": True}),
+            ("arcsin", "arcsin", 1, {"promote_to_float": True}),
+            ("arccos", "arccos", 1, {"promote_to_float": True}),
+            ("arctan", "arctan", 1, {"promote_to_float": True}),
+        ]:
+            self.add_ufunc(space, *ufunc_def)
 
-    globals()[ufunc_name] = ufunc_dtype_caller(ufunc_name, op_name, argcount, **extra_kwargs)
+    def add_ufunc(self, space, ufunc_name, op_name, argcount, extra_kwargs=None):
+        if extra_kwargs is None:
+            extra_kwargs = {}
+
+        identity = extra_kwargs.get("identity")
+        if identity is not None:
+            identity = space.fromcache(interp_dtype.W_Int64Dtype).adapt_val(identity)
+        extra_kwargs["identity"] = identity
+
+        func = ufunc_dtype_caller(ufunc_name, op_name, argcount)
+        if argcount == 1:
+            ufunc = W_Ufunc1(func, ufunc_name, **extra_kwargs)
+        elif argcount == 2:
+            ufunc = W_Ufunc2(func, ufunc_name, **extra_kwargs)
+        setattr(self, ufunc_name, ufunc)
+
+def get(space):
+    return space.fromcache(UfuncState)
diff --git a/pypy/module/micronumpy/test/test_dtypes.py b/pypy/module/micronumpy/test/test_dtypes.py
--- a/pypy/module/micronumpy/test/test_dtypes.py
+++ b/pypy/module/micronumpy/test/test_dtypes.py
@@ -82,14 +82,30 @@
             assert a[1] == 1
 
     def test_add_int8(self):
-        from numpy import array
+        from numpy import array, dtype
 
         a = array(range(5), dtype="int8")
         b = a + a
+        assert b.dtype is dtype("int8")
+        for i in range(5):
+            assert b[i] == i * 2
+
+    def test_add_int16(self):
+        from numpy import array, dtype
+
+        a = array(range(5), dtype="int16")
+        b = a + a
+        assert b.dtype is dtype("int16")
         for i in range(5):
             assert b[i] == i * 2
 
     def test_shape(self):
         from numpy import dtype
 
-        assert dtype(long).shape == ()
\ No newline at end of file
+        assert dtype(long).shape == ()
+
+    def test_cant_subclass(self):
+        from numpy import dtype
+
+        # You can't subclass dtype
+        raises(TypeError, type, "Foo", (dtype,), {})
diff --git a/pypy/module/micronumpy/test/test_numarray.py b/pypy/module/micronumpy/test/test_numarray.py
--- a/pypy/module/micronumpy/test/test_numarray.py
+++ b/pypy/module/micronumpy/test/test_numarray.py
@@ -52,10 +52,14 @@
         from numpy import array, zeros
         a = array(range(5), float)
         assert repr(a) == "array([0.0, 1.0, 2.0, 3.0, 4.0])"
+        a = array([], float)
+        assert repr(a) == "array([], dtype=float64)"
         a = zeros(1001)
         assert repr(a) == "array([0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0])"
         a = array(range(5), long)
         assert repr(a) == "array([0, 1, 2, 3, 4])"
+        a = array([], long)
+        assert repr(a) == "array([], dtype=int64)"
         a = array([True, False, True, False], "?")
         assert repr(a) == "array([True, False, True, False], dtype=bool)"
 
@@ -84,6 +88,9 @@
         a = array(range(5), dtype="int8")
         assert str(a) == "[0 1 2 3 4]"
 
+        a = array(range(5), dtype="int16")
+        assert str(a) == "[0 1 2 3 4]"
+
     def test_str_slice(self):
         from numpy import array, zeros
         a = array(range(5), float)
@@ -102,6 +109,16 @@
         assert a[-1] == 8
         raises(IndexError, "a[-6]")
 
+    def test_getitem_tuple(self):
+        from numpy import array
+        a = array(range(5))
+        raises(IndexError, "a[(1,2)]")
+        for i in xrange(5):
+            assert a[(i,)] == i
+        b = a[()]
+        for i in xrange(5):
+            assert a[i] == b[i]
+
     def test_setitem(self):
         from numpy import array
         a = array(range(5))
@@ -110,6 +127,17 @@
         raises(IndexError, "a[5] = 0.0")
         raises(IndexError, "a[-6] = 3.0")
 
+    def test_setitem_tuple(self):
+        from numpy import array
+        a = array(range(5))
+        raises(IndexError, "a[(1,2)] = [0,1]")
+        for i in xrange(5):
+            a[(i,)] = i+1
+            assert a[i] == i+1
+        a[()] = range(5)
+        for i in xrange(5):
+            assert a[i] == i
+
     def test_setslice_array(self):
         from numpy import array
         a = array(range(5))
@@ -236,12 +264,19 @@
             assert b[i] == i - 5
 
     def test_mul(self):
-        from numpy import array
+        from numpy import array, dtype
         a = array(range(5))
         b = a * a
         for i in range(5):
             assert b[i] == i * i
 
+        a = array(range(5), dtype=bool)
+        b = a * a
+        assert b.dtype is dtype(bool)
+        assert b[0] is False
+        for i in range(1, 5):
+            assert b[i] is True
+
     def test_mul_constant(self):
         from numpy import array
         a = array(range(5))
@@ -250,12 +285,18 @@
             assert b[i] == i * 5
 
     def test_div(self):
-        from numpy import array
+        from numpy import array, dtype
         a = array(range(1, 6))
         b = a / a
         for i in range(5):
             assert b[i] == 1
 
+        a = array(range(1, 6), dtype=bool)
+        b = a / a
+        assert b.dtype is dtype("int8")
+        for i in range(5):
+            assert b[i] == 1
+
     def test_div_other(self):
         from numpy import array
         a = array(range(5))
@@ -301,6 +342,12 @@
         for i in range(5):
             assert b[i] == 0
 
+        a = array(range(1, 6), float)
+        b = (a + 1) % a
+        assert b[0] == 0
+        for i in range(1, 5):
+            assert b[i] == 1
+
     def test_mod_other(self):
         from numpy import array
         a = array(range(5))
diff --git a/pypy/module/micronumpy/test/test_ufuncs.py b/pypy/module/micronumpy/test/test_ufuncs.py
--- a/pypy/module/micronumpy/test/test_ufuncs.py
+++ b/pypy/module/micronumpy/test/test_ufuncs.py
@@ -3,6 +3,32 @@
 
 
 class AppTestUfuncs(BaseNumpyAppTest):
+    def test_ufunc_instance(self):
+        from numpy import add, ufunc
+
+        assert isinstance(add, ufunc)
+        assert repr(add) == "<ufunc 'add'>"
+        assert repr(ufunc) == "<type 'numpy.ufunc'>"
+
+    def test_ufunc_attrs(self):
+        from numpy import add, multiply, sin
+
+        assert add.identity == 0
+        assert multiply.identity == 1
+        assert sin.identity is None
+
+        assert add.nin == 2
+        assert multiply.nin == 2
+        assert sin.nin == 1
+
+    def test_wrong_arguments(self):
+        from numpy import add, sin
+
+        raises(TypeError, add, 1)
+        raises(TypeError, add, 1, 2, 3)
+        raises(TypeError, sin, 1, 2)
+        raises(TypeError, sin)
+
     def test_single_item(self):
         from numpy import negative, sign, minimum
 
@@ -112,7 +138,7 @@
 
         x = maximum(2, 3)
         assert x == 3
-        assert type(x) is int
+        assert isinstance(x, (int, long))
 
     def test_multiply(self):
         from numpy import array, multiply
@@ -124,7 +150,7 @@
             assert c[i] == a[i] * b[i]
 
     def test_sign(self):
-        from numpy import array, sign
+        from numpy import array, sign, dtype
 
         reference = [-1.0, 0.0, 0.0, 1.0]
         a = array([-5.0, -0.0, 0.0, 6.0])
@@ -137,6 +163,11 @@
         for i in range(10):
             assert a[i] == ref[i]
 
+        a = sign(array([True, False], dtype=bool))
+        assert a.dtype == dtype("int8")
+        assert a[0] == 1
+        assert a[1] == 0
+
     def test_reciporocal(self):
         from numpy import array, reciprocal
 
@@ -275,3 +306,17 @@
         assert equal(3.0, 3.5) is False
         assert equal(3.0, 3) is True
         assert equal(3.0, 4) is False
+
+    def test_reduce_errors(self):
+        from numpy import sin, add
+
+        raises(ValueError, sin.reduce, [1, 2, 3])
+        raises(TypeError, add.reduce, 1)
+
+    def test_reduce(self):
+        from numpy import add, maximum
+
+        assert add.reduce([1, 2, 3]) == 6
+        assert maximum.reduce([1]) == 1
+        assert maximum.reduce([1, 2, 3]) == 3
+        raises(ValueError, maximum.reduce, [])
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -19,7 +19,7 @@
     def test_add(self):
         def f(i):
             ar = SingleDimArray(i, dtype=self.float64_dtype)
-            v = interp_ufuncs.add(self.space, ar, ar)
+            v = interp_ufuncs.get(self.space).add.call(self.space, [ar, ar])
             return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
@@ -31,9 +31,10 @@
     def test_floatadd(self):
         def f(i):
             ar = SingleDimArray(i, dtype=self.float64_dtype)
-            v = interp_ufuncs.add(self.space,
-                ar,
-                scalar_w(self.space, self.float64_dtype, self.space.wrap(4.5))
+            v = interp_ufuncs.get(self.space).add.call(self.space, [
+                    ar,
+                    scalar_w(self.space, self.float64_dtype, self.space.wrap(4.5))
+                ],
             )
             assert isinstance(v, BaseArray)
             return v.get_concrete().eval(3).val
@@ -89,14 +90,21 @@
     def test_max(self):
         space = self.space
         float64_dtype = self.float64_dtype
+        int64_dtype = self.int64_dtype
 
         def f(i):
-            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
+            if NonConstant(False):
+                dtype = int64_dtype
+            else:
+                dtype = float64_dtype
+            ar = SingleDimArray(i, dtype=dtype)
             j = 0
             while j < i:
                 ar.get_concrete().setitem(j, float64_dtype.box(float(j)))
                 j += 1
-            return ar.descr_add(space, ar).descr_max(space).floatval
+            v = ar.descr_add(space, ar).descr_max(space)
+            assert isinstance(v, FloatObject)
+            return v.floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -108,14 +116,21 @@
     def test_min(self):
         space = self.space
         float64_dtype = self.float64_dtype
+        int64_dtype = self.int64_dtype
 
         def f(i):
-            ar = SingleDimArray(i, dtype=NonConstant(float64_dtype))
+            if NonConstant(False):
+                dtype = int64_dtype
+            else:
+                dtype = float64_dtype
+            ar = SingleDimArray(i, dtype=dtype)
             j = 0
             while j < i:
                 ar.get_concrete().setitem(j, float64_dtype.box(float(j)))
                 j += 1
-            return ar.descr_add(space, ar).descr_min(space).floatval
+            v = ar.descr_add(space, ar).descr_min(space)
+            assert isinstance(v, FloatObject)
+            return v.floatval
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
         self.check_loops({"getarrayitem_raw": 2, "float_add": 1,
@@ -180,9 +195,9 @@
 
         def f(i):
             ar = SingleDimArray(i, dtype=self.float64_dtype)
-            v1 = interp_ufuncs.add(space, ar, scalar_w(space, self.float64_dtype, space.wrap(4.5)))
+            v1 = interp_ufuncs.get(self.space).add.call(space, [ar, scalar_w(space, self.float64_dtype, space.wrap(4.5))])
             assert isinstance(v1, BaseArray)
-            v2 = interp_ufuncs.multiply(space, v1, scalar_w(space, self.float64_dtype, space.wrap(4.5)))
+            v2 = interp_ufuncs.get(self.space).multiply.call(space, [v1, scalar_w(space, self.float64_dtype, space.wrap(4.5))])
             v1.force_if_needed()
             assert isinstance(v2, BaseArray)
             return v2.get_concrete().eval(3).val
@@ -200,8 +215,8 @@
         space = self.space
         def f(i):
             ar = SingleDimArray(i, dtype=self.float64_dtype)
-            v1 = interp_ufuncs.add(space, ar, ar)
-            v2 = interp_ufuncs.negative(space, v1)
+            v1 = interp_ufuncs.get(self.space).add.call(space, [ar, ar])
+            v2 = interp_ufuncs.get(self.space).negative.call(space, [v1])
             return v2.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
@@ -216,13 +231,13 @@
         def f(i):
             ar = SingleDimArray(i, dtype=self.float64_dtype)
 
-            v1 = interp_ufuncs.add(space, ar, ar)
-            v2 = interp_ufuncs.negative(space, v1)
+            v1 = interp_ufuncs.get(self.space).add.call(space, [ar, ar])
+            v2 = interp_ufuncs.get(self.space).negative.call(space, [v1])
             v2.get_concrete()
 
             for i in xrange(5):
-                v1 = interp_ufuncs.multiply(space, ar, ar)
-                v2 = interp_ufuncs.negative(space, v1)
+                v1 = interp_ufuncs.get(self.space).multiply.call(space, [ar, ar])
+                v2 = interp_ufuncs.get(self.space).negative.call(space, [v1])
                 v2.get_concrete()
 
         self.meta_interp(f, [5], listops=True, backendopt=True)
@@ -237,7 +252,7 @@
                 SingleDimSlice.signature, ar.signature
             ])
             s = SingleDimSlice(0, step*i, step, i, ar, new_sig)
-            v = interp_ufuncs.add(self.space, s, s)
+            v = interp_ufuncs.get(self.space).add.call(self.space, [s, s])
             return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
@@ -259,7 +274,7 @@
                 SingleDimSlice.signature, s1.signature
             ])
             s2 = SingleDimSlice(0, step2*i, step2, i, ar, new_sig)
-            v = interp_ufuncs.add(self.space, s1, s2)
+            v = interp_ufuncs.get(self.space).add.call(self.space, [s1, s2])
             return v.get_concrete().eval(3).val
 
         result = self.meta_interp(f, [5], listops=True, backendopt=True)
diff --git a/pypy/module/posix/__init__.py b/pypy/module/posix/__init__.py
--- a/pypy/module/posix/__init__.py
+++ b/pypy/module/posix/__init__.py
@@ -161,6 +161,8 @@
         interpleveldefs['mknod'] = 'interp_posix.mknod'
     if hasattr(os, 'nice'):
         interpleveldefs['nice'] = 'interp_posix.nice'
+    if hasattr(os, 'getlogin'):
+        interpleveldefs['getlogin'] = 'interp_posix.getlogin'
 
     for name in ['setsid', 'getuid', 'geteuid', 'getgid', 'getegid', 'setuid',
                  'seteuid', 'setgid', 'setegid', 'getgroups', 'getpgrp', 
diff --git a/pypy/module/posix/interp_posix.py b/pypy/module/posix/interp_posix.py
--- a/pypy/module/posix/interp_posix.py
+++ b/pypy/module/posix/interp_posix.py
@@ -464,6 +464,15 @@
                              space.wrap("strerror() argument out of range"))
     return space.wrap(text)
 
+def getlogin(space):
+    """Return the currently logged in user."""
+    try:
+        cur = os.getlogin()
+    except OSError, e:
+        raise wrap_oserror(space, e)
+    else:
+        return space.wrap(cur)
+
 # ____________________________________________________________
 
 def getstatfields(space):
diff --git a/pypy/module/posix/test/test_posix2.py b/pypy/module/posix/test/test_posix2.py
--- a/pypy/module/posix/test/test_posix2.py
+++ b/pypy/module/posix/test/test_posix2.py
@@ -805,6 +805,16 @@
                 data = f.read()
                 assert data == "who cares?"
 
+    try:
+        os.getlogin()
+    except (AttributeError, OSError):
+        pass
+    else:
+        def test_getlogin(self):
+            assert isinstance(self.posix.getlogin(), str)
+            # How else could we test that getlogin is properly
+            # working?
+
     def test_tmpfile(self):
         os = self.posix
         f = os.tmpfile()
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -9,6 +9,7 @@
 
 from pypy.rpython.tool import rffi_platform
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.translator.platform import platform
 
 import sys
 import py
@@ -19,7 +20,9 @@
     libname = 'expat'
 eci = ExternalCompilationInfo(
     libraries=[libname],
+    library_dirs=platform.preprocess_library_dirs([]),
     includes=['expat.h'],
+    include_dirs=platform.preprocess_include_dirs([]),
     )
 
 eci = rffi_platform.configure_external_library(
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -21,9 +21,11 @@
 
 PyFrame._virtualizable2_ = ['last_instr', 'pycode',
                             'valuestackdepth', 'locals_stack_w[*]',
+                            'cells[*]',
                             'last_exception',
                             'lastblock',
                             'is_being_profiled',
+                            'w_globals',
                             ]
 
 JUMP_ABSOLUTE = opmap['JUMP_ABSOLUTE']
diff --git a/pypy/module/pypyjit/test_pypy_c/test_call.py b/pypy/module/pypyjit/test_pypy_c/test_call.py
--- a/pypy/module/pypyjit/test_pypy_c/test_call.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_call.py
@@ -67,24 +67,14 @@
         assert log.opnames(ops) == ["guard_value",
                                     "getfield_gc", "guard_value",
                                     "getfield_gc", "guard_value",
-                                    "getfield_gc", "guard_nonnull_class"]
-        # LOAD_GLOBAL of OFFSET but in different function partially folded
-        # away
-        # XXX could be improved
+                                    "guard_not_invalidated"]
         ops = entry_bridge.ops_by_id('add', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["guard_value", "getfield_gc", "guard_value"]
+        assert log.opnames(ops) == ["guard_not_invalidated"]
         #
-        # two LOAD_GLOBAL of f, the second is folded away
         ops = entry_bridge.ops_by_id('call', opcode='LOAD_GLOBAL')
-        assert log.opnames(ops) == ["getfield_gc", "guard_nonnull_class"]
+        assert log.opnames(ops) == []
         #
         assert entry_bridge.match_by_id('call', """
-            p29 = getfield_gc(ConstPtr(ptr28), descr=<GcPtrFieldDescr pypy.objspace.std.celldict.ModuleCell.inst_w_value .*>)
-            guard_nonnull_class(p29, ConstClass(Function), descr=...)
-            p33 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_code .*>)
-            guard_value(p33, ConstPtr(ptr34), descr=...)
-            p35 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_w_func_globals .*>)
-            p36 = getfield_gc(p29, descr=<GcPtrFieldDescr pypy.interpreter.function.Function.inst_closure .*>)
             p38 = call(ConstClass(getexecutioncontext), descr=<GcPtrCallDescr>)
             p39 = getfield_gc(p38, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref .*>)
             i40 = force_token()
@@ -100,19 +90,16 @@
         # -----------------------------
         loop, = log.loops_by_id('call')
         assert loop.match("""
-            i12 = int_lt(i5, i6)
-            guard_true(i12, descr=...)
+            guard_not_invalidated(descr=...)
+            i9 = int_lt(i5, i6)
+            guard_true(i9, descr=...)
+            i10 = force_token()
+            i12 = int_add(i5, 1)
             i13 = force_token()
-            i15 = int_add(i5, 1)
-            i16 = int_add_ovf(i15, i7)
-            guard_no_overflow(descr=...)
-            i18 = force_token()
-            i20 = int_add_ovf(i16, 1)
-            guard_no_overflow(descr=...)
-            i21 = int_add_ovf(i20, i7)
+            i15 = int_add_ovf(i12, 1)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i21, i6, i7, p8, p9, p10, p11, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i15, i6, p7, p8, descr=<Loop0>)
         """)
 
     def test_method_call(self):
@@ -187,7 +174,7 @@
             guard_no_overflow(descr=...)
             i18 = force_token()
             --TICK--
-            jump(p0, p1, p2, p3, p4, i8, p7, i17, p8, i9, i17, p10, p11, p12, descr=<Loop0>)
+            jump(..., descr=<Loop0>)
         """)
 
     def test_default_and_kw(self):
@@ -409,3 +396,70 @@
             --TICK--
             jump(..., descr=<Loop0>)
         """)
+
+    def test_global_closure_has_constant_cells(self):
+        log = self.run("""
+            def make_adder(n):
+                def add(x):
+                    return x + n
+                return add
+            add5 = make_adder(5)
+            def main():
+                i = 0
+                while i < 5000:
+                    i = add5(i) # ID: call
+            """, [])
+        loop, = log.loops_by_id('call', is_entry_bridge=True)
+        assert loop.match("""
+            guard_value(i6, 1, descr=...)
+            guard_nonnull_class(p8, ConstClass(W_IntObject), descr=...)
+            guard_value(i4, 0, descr=...)
+            guard_value(p3, ConstPtr(ptr14), descr=...)
+            i15 = getfield_gc_pure(p8, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i17 = int_lt(i15, 5000)
+            guard_true(i17, descr=...)
+            p18 = getfield_gc(p0, descr=<GcPtrFieldDescr pypy.interpreter.eval.Frame.inst_w_globals 8>)
+            guard_value(p18, ConstPtr(ptr19), descr=...)
+            p20 = getfield_gc(p18, descr=<GcPtrFieldDescr pypy.objspace.std.dictmultiobject.W_DictMultiObject.inst_strategy 12>)
+            guard_value(p20, ConstPtr(ptr21), descr=...)
+            guard_not_invalidated(descr=...)
+            # most importantly, there is no getarrayitem_gc here
+            p23 = call(ConstClass(getexecutioncontext), descr=<GcPtrCallDescr>)
+            p24 = getfield_gc(p23, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_topframeref 36>)
+            i25 = force_token()
+            p26 = getfield_gc(p23, descr=<GcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_w_tracefunc 44>)
+            guard_isnull(p26, descr=...)
+            i27 = getfield_gc(p23, descr=<NonGcPtrFieldDescr pypy.interpreter.executioncontext.ExecutionContext.inst_profilefunc 24>)
+            i28 = int_is_zero(i27)
+            guard_true(i28, descr=...)
+            p30 = getfield_gc(ConstPtr(ptr29), descr=<GcPtrFieldDescr pypy.interpreter.nestedscope.Cell.inst_w_value 8>)
+            guard_nonnull_class(p30, ConstClass(W_IntObject), descr=...)
+            i32 = getfield_gc_pure(p30, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i33 = int_add_ovf(i15, i32)
+            guard_no_overflow(descr=...)
+            --TICK--
+            jump(p0, p1, p2, p5, i33, i32, p23, p30, p24, descr=<Loop0>)
+        """)
+
+    def test_local_closure_is_virtual(self):
+        log = self.run("""
+            def main():
+                i = 0
+                while i < 5000:
+                    def add():
+                        return i + 1
+                    i = add() # ID: call
+            """, [])
+        loop, = log.loops_by_id('call')
+        assert loop.match("""
+            i8 = getfield_gc_pure(p6, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            i10 = int_lt(i8, 5000)
+            guard_true(i10, descr=...)
+            i11 = force_token()
+            i13 = int_add(i8, 1)
+            --TICK--
+            p22 = new_with_vtable(ConstClass(W_IntObject))
+            setfield_gc(p22, i13, descr=<SignedFieldDescr pypy.objspace.std.intobject.W_IntObject.inst_intval 8>)
+            setfield_gc(p4, p22, descr=<GcPtrFieldDescr pypy.interpreter.nestedscope.Cell.inst_w_value 8>)
+            jump(p0, p1, p2, p3, p4, p7, p22, p7, descr=<Loop0>)
+        """)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_globals.py b/pypy/module/pypyjit/test_pypy_c/test_globals.py
--- a/pypy/module/pypyjit/test_pypy_c/test_globals.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_globals.py
@@ -20,11 +20,9 @@
             guard_value(p10, ConstPtr(ptr11), descr=...)
             p12 = getfield_gc(p10, descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p12, ConstPtr(ptr13), descr=...)
-            p15 = getfield_gc(ConstPtr(ptr14), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
-            guard_isnull(p15, descr=...)
             guard_not_invalidated(descr=...)
             p19 = getfield_gc(ConstPtr(p17), descr=<GcPtrFieldDescr .*W_DictMultiObject.inst_strategy .*>)
             guard_value(p19, ConstPtr(ptr20), descr=...)
             p22 = getfield_gc(ConstPtr(ptr21), descr=<GcPtrFieldDescr .*ModuleCell.inst_w_value .*>)
             guard_nonnull(p22, descr=...)
-        """)
\ No newline at end of file
+        """)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_instance.py b/pypy/module/pypyjit/test_pypy_c/test_instance.py
--- a/pypy/module/pypyjit/test_pypy_c/test_instance.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_instance.py
@@ -52,7 +52,7 @@
             i10 = int_add_ovf(i5, i7)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, i10, i6, p7, i7, p8, descr=<Loop0>)
+            jump(p0, p1, p2, p3, p4, i10, i6, i7, p8, descr=<Loop0>)
         """)
 
     def test_getattr_with_dynamic_attribute(self):
@@ -151,6 +151,7 @@
         assert loop.match_by_id('loadattr',
         '''
         guard_not_invalidated(descr=...)
+        i16 = arraylen_gc(p10, descr=<GcPtrArrayDescr>)
         i19 = call(ConstClass(ll_dict_lookup), _, _, _, descr=...)
         guard_no_exception(descr=...)
         i21 = int_and(i19, _)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_math.py b/pypy/module/pypyjit/test_pypy_c/test_math.py
--- a/pypy/module/pypyjit/test_pypy_c/test_math.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_math.py
@@ -47,6 +47,7 @@
         assert loop.match("""
             i2 = int_lt(i0, i1)
             guard_true(i2, descr=...)
+            guard_not_invalidated(descr=...)
             f1 = cast_int_to_float(i0)
             i3 = float_eq(f1, inf)
             i4 = float_eq(f1, -inf)
@@ -60,4 +61,33 @@
             i7 = int_add(i0, f1)
             --TICK--
             jump(..., descr=)
+        """)
+
+    def test_fmod(self):
+        def main(n):
+            import math
+
+            s = 0
+            while n > 0:
+                s += math.fmod(n, 2.0)
+                n -= 1
+            return s
+        log = self.run(main, [500])
+        assert log.result == main(500)
+        loop, = log.loops_by_filename(self.filepath)
+        assert loop.match("""
+            i1 = int_gt(i0, 0)
+            guard_true(i1, descr=...)
+            guard_not_invalidated(descr=...)
+            f1 = cast_int_to_float(i0)
+            i2 = float_eq(f1, inf)
+            i3 = float_eq(f1, -inf)
+            i4 = int_or(i2, i3)
+            i5 = int_is_true(i4)
+            guard_false(i5, descr=...)
+            f2 = call(ConstClass(fmod), f1, 2.0, descr=<FloatCallDescr>)
+            f3 = float_add(f0, f2)
+            i6 = int_sub(i0, 1)
+            --TICK--
+            jump(..., descr=)
         """)
\ No newline at end of file
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -234,3 +234,18 @@
             return total
         #
         self.run_and_check(main, [])
+
+
+    def test_global(self):
+        log = self.run("""
+        i = 0
+        globalinc = 1
+        def main(n):
+            global i
+            while i < n:
+                l = globalinc # ID: globalread
+                i += l
+        """, [1000])
+
+        loop, = log.loops_by_id("globalread", is_entry_bridge=True)
+        assert len(loop.ops_by_id("globalread")) == 0
diff --git a/pypy/module/pypyjit/test_pypy_c/test_string.py b/pypy/module/pypyjit/test_pypy_c/test_string.py
--- a/pypy/module/pypyjit/test_pypy_c/test_string.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_string.py
@@ -34,9 +34,9 @@
             i25 = unicodegetitem(p13, i19)
             p27 = newstr(1)
             strsetitem(p27, 0, i23)
-            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=<GcPtrCallDescr>)
+            p30 = call(ConstClass(ll_str2unicode__rpy_stringPtr), p27, descr=...)
             guard_no_exception(descr=...)
-            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=<SignedCallDescr>)
+            i32 = call(ConstClass(_ll_2_str_eq_checknull_char__rpy_unicodePtr_UniChar), p30, i25, descr=...)
             guard_true(i32, descr=...)
             i34 = int_add(i6, 1)
             --TICK--
@@ -105,5 +105,5 @@
             i58 = int_add_ovf(i6, i57)
             guard_no_overflow(descr=...)
             --TICK--
-            jump(p0, p1, p2, p3, p4, p5, i58, i7, i8, p9, p10, descr=<Loop4>)
+            jump(p0, p1, p2, p3, p4, p5, i58, i7, descr=<Loop4>)
         """)
diff --git a/pypy/module/sys/version.py b/pypy/module/sys/version.py
--- a/pypy/module/sys/version.py
+++ b/pypy/module/sys/version.py
@@ -14,7 +14,7 @@
 
 if platform.name == 'msvc':
     COMPILER_INFO = 'MSC v.%d 32 bit' % (platform.version * 10 + 600)
-elif platform.cc.startswith('gcc'):
+elif platform.cc is not None and platform.cc.startswith('gcc'):
     out = platform.execute(platform.cc, '--version').out
     match = re.search(' (\d+\.\d+(\.\d+)*)', out)
     if match:
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c b/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
--- a/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
+++ b/pypy/module/test_lib_pypy/ctypes_tests/_ctypes_test.c
@@ -481,6 +481,16 @@
 	int a, b, c, d, e, f, g, h;
 } S8I;
 
+
+
+typedef int (*CALLBACK_RECT)(RECT rect);
+
+EXPORT(int) call_callback_with_rect(CALLBACK_RECT cb, RECT rect)
+{
+    return cb(rect);
+}
+
+
 EXPORT(S8I) ret_8i_func(S8I inp)
 {
 	inp.a *= 2;
diff --git a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
--- a/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
+++ b/pypy/module/test_lib_pypy/ctypes_tests/test_callbacks.py
@@ -150,7 +150,6 @@
 class TestMoreCallbacks(BaseCTypesTestChecker):
 
     def test_callback_with_struct_argument(self):
-        py.test.skip("callbacks with struct arguments not implemented yet")
         class RECT(Structure):
             _fields_ = [("left", c_int), ("top", c_int),
                         ("right", c_int), ("bottom", c_int)]
@@ -167,6 +166,28 @@
 
         assert res == 1111
 
+    def test_callback_from_c_with_struct_argument(self):
+        import conftest
+        _ctypes_test = str(conftest.sofile)
+        dll = CDLL(_ctypes_test)
+
+        class RECT(Structure):
+            _fields_ = [("left", c_long), ("top", c_long),
+                        ("right", c_long), ("bottom", c_long)]
+
+        proto = CFUNCTYPE(c_int, RECT)
+        def callback(point):
+            return point.left+point.top+point.right+point.bottom
+
+        cbp = proto(callback)
+        rect = RECT(1000,100,10,1)
+
+        call_callback_with_rect = dll.call_callback_with_rect
+        call_callback_with_rect.restype = c_int
+        call_callback_with_rect.argtypes = [proto, RECT]
+        res = call_callback_with_rect(cbp, rect)
+        assert res == 1111
+
     def test_callback_unsupported_return_struct(self):
         class RECT(Structure):
             _fields_ = [("left", c_int), ("top", c_int),
diff --git a/pypy/module/test_lib_pypy/test_greenlet.py b/pypy/module/test_lib_pypy/test_greenlet.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/test_lib_pypy/test_greenlet.py
@@ -0,0 +1,233 @@
+from pypy.conftest import gettestobjspace
+
+
+class AppTestGreenlet:
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_continuation'])
+
+    def test_simple(self):
+        from greenlet import greenlet
+        lst = []
+        def f():
+            lst.append(1)
+            greenlet.getcurrent().parent.switch()
+            lst.append(3)
+        g = greenlet(f)
+        lst.append(0)
+        g.switch()
+        lst.append(2)
+        g.switch()
+        lst.append(4)
+        assert lst == range(5)
+
+    def test_parent(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        assert gmain.parent is None
+        g = greenlet(lambda: None)
+        assert g.parent is gmain
+
+    def test_pass_around(self):
+        from greenlet import greenlet
+        seen = []
+        def f(x, y):
+            seen.append((x, y))
+            seen.append(greenlet.getcurrent().parent.switch())
+            seen.append(greenlet.getcurrent().parent.switch(42))
+            return 44, 'z'
+        g = greenlet(f)
+        seen.append(g.switch(40, 'x'))
+        seen.append(g.switch(41, 'y'))
+        seen.append(g.switch(43))
+        #
+        def f2():
+            return 45
+        g = greenlet(f2)
+        seen.append(g.switch())
+        #
+        def f3():
+            pass
+        g = greenlet(f3)
+        seen.append(g.switch())
+        #
+        assert seen == [(40, 'x'), (), (41, 'y'), 42, 43, (44, 'z'), 45, None]
+
+    def test_exception_simple(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            raise ValueError
+        #
+        g1 = greenlet(fmain)
+        raises(ValueError, g1.switch)
+
+    def test_dead(self):
+        from greenlet import greenlet
+        #
+        def fmain():
+            assert g1 and not g1.dead
+        #
+        g1 = greenlet(fmain)
+        assert not g1 and not g1.dead
+        g1.switch()
+        assert not g1 and g1.dead
+        #
+        gmain = greenlet.getcurrent()
+        assert gmain and not gmain.dead
+
+    def test_GreenletExit(self):
+        from greenlet import greenlet, GreenletExit
+        #
+        def fmain(*args):
+            raise GreenletExit(*args)
+        #
+        g1 = greenlet(fmain)
+        res = g1.switch('foo', 'bar')
+        assert isinstance(res, GreenletExit) and res.args == ('foo', 'bar')
+
+    def test_throw_1(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g = greenlet(f)
+        g.switch()
+        res = g.throw(ValueError)
+        assert res == "ok"
+
+    def test_throw_2(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f():
+            gmain.throw(ValueError)
+        #
+        g = greenlet(f)
+        raises(ValueError, g.switch)
+
+    def test_throw_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        raises(ValueError, gmain.throw, ValueError)
+
+    def test_throw_4(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            g2.throw(ValueError)
+        #
+        def f2():
+            try:
+                gmain.switch()
+            except ValueError:
+                return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.switch()
+        res = g1.switch()
+        assert res == "ok"
+
+    def test_nondefault_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_change_parent(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "from 2"
+            return "from 1"
+        #
+        def f2():
+            return "from 2"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "from 1"
+
+    def test_raises_through_parent_chain(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            raises(IndexError, g2.switch)
+            raise ValueError
+        #
+        def f2():
+            raise IndexError
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        raises(ValueError, g1.switch)
+
+    def test_switch_to_dead_1(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_2(self):
+        from greenlet import greenlet
+        #
+        def f1():
+            g2 = greenlet(f2)
+            return g2.switch()
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        res = g1.switch()
+        assert res == "ok"
+        res = g1.switch("goes to gmain instead")
+        assert res == "goes to gmain instead"
+
+    def test_switch_to_dead_3(self):
+        from greenlet import greenlet
+        gmain = greenlet.getcurrent()
+        #
+        def f1():
+            res = g2.switch()
+            assert res == "ok"
+            res = gmain.switch("next step")
+            assert res == "goes to f1 instead"
+            return "all ok"
+        #
+        def f2():
+            return "ok"
+        #
+        g1 = greenlet(f1)
+        g2 = greenlet(f2)
+        g2.parent = g1
+        res = g1.switch()
+        assert res == "next step"
+        res = g2.switch("goes to f1 instead")
+        assert res == "all ok"
diff --git a/pypy/module/thread/os_thread.py b/pypy/module/thread/os_thread.py
--- a/pypy/module/thread/os_thread.py
+++ b/pypy/module/thread/os_thread.py
@@ -15,11 +15,6 @@
 # * The start-up data (the app-level callable and arguments) is
 #   stored in the global bootstrapper object.
 #
-# * The GC is notified that a new thread is about to start; in the
-#   framework GC with shadow stacks, this allocates a fresh new shadow
-#   stack (but doesn't use it yet).  See gc_thread_prepare().  This
-#   has no effect in asmgcc.
-#
 # * The new thread is launched at RPython level using an rffi call
 #   to the C function RPyThreadStart() defined in
 #   translator/c/src/thread*.h.  This RPython thread will invoke the
@@ -33,8 +28,8 @@
 #   operation is called (this is all done by gil.after_external_call(),
 #   called from the rffi-generated wrapper).  The gc_thread_run()
 #   operation will automatically notice that the current thread id was
-#   not seen before, and start using the freshly prepared shadow stack.
-#   Again, this has no effect in asmgcc.
+#   not seen before, and (in shadowstack) it will allocate and use a
+#   fresh new stack.  Again, this has no effect in asmgcc.
 #
 # * Only then does bootstrap() really run.  The first thing it does
 #   is grab the start-up information (app-level callable and args)
@@ -180,7 +175,7 @@
     bootstrapper.acquire(space, w_callable, args)
     try:
         try:
-            thread.gc_thread_prepare()
+            thread.gc_thread_prepare()     # (this has no effect any more)
             ident = thread.start_new_thread(bootstrapper.bootstrap, ())
         except Exception, e:
             bootstrapper.release()     # normally called by the new thread
diff --git a/pypy/objspace/descroperation.py b/pypy/objspace/descroperation.py
--- a/pypy/objspace/descroperation.py
+++ b/pypy/objspace/descroperation.py
@@ -724,13 +724,22 @@
         w_left_src, w_left_impl = space.lookup_in_type_where(w_typ1, left)
         w_first = w_obj1
         w_second = w_obj2
-
-        if _same_class_w(space, w_obj1, w_obj2, w_typ1, w_typ2):
+        #
+        if left == right and _same_class_w(space, w_obj1, w_obj2,
+                                           w_typ1, w_typ2):
+            # for __eq__ and __ne__, if the objects have the same
+            # (old-style or new-style) class, then don't try the
+            # opposite method, which is the same one.
             w_right_impl = None
         else:
-            w_right_src, w_right_impl = space.lookup_in_type_where(w_typ2, right)
-            # XXX see binop_impl
-            if space.is_true(space.issubtype(w_typ2, w_typ1)):
+            # in all other cases, try the opposite method.
+            w_right_src, w_right_impl = space.lookup_in_type_where(w_typ2,right)
+            if space.is_w(w_typ1, w_typ2):
+                # if the type is the same, *or* if both are old-style classes,
+                # then don't reverse: try left first, right next.
+                pass
+            elif space.is_true(space.issubtype(w_typ2, w_typ1)):
+                # for new-style classes, if typ2 is a subclass of typ1.
                 w_obj1, w_obj2 = w_obj2, w_obj1
                 w_left_impl, w_right_impl = w_right_impl, w_left_impl
 
diff --git a/pypy/objspace/flow/flowcontext.py b/pypy/objspace/flow/flowcontext.py
--- a/pypy/objspace/flow/flowcontext.py
+++ b/pypy/objspace/flow/flowcontext.py
@@ -184,7 +184,7 @@
 
 class FlowExecutionContext(ExecutionContext):
 
-    def __init__(self, space, code, globals, constargs={}, closure=None,
+    def __init__(self, space, code, globals, constargs={}, outer_func=None,
                  name=None):
         ExecutionContext.__init__(self, space)
         self.code = code
@@ -193,11 +193,11 @@
 
         self.crnt_offset = -1
         self.crnt_frame = None
-        if closure is None:
+        if outer_func and outer_func.closure:
+            self.closure = [nestedscope.Cell(Constant(value))
+                            for value in outer_func.closure]
+        else:
             self.closure = None
-        else:
-            self.closure = [nestedscope.Cell(Constant(value))
-                            for value in closure]
         frame = self.create_frame()
         formalargcount = code.getformalargcount()
         arg_list = [Variable() for i in range(formalargcount)]
@@ -216,7 +216,7 @@
         # while ignoring any operation like the creation of the locals dict
         self.recorder = []
         frame = FlowSpaceFrame(self.space, self.code,
-                               self.w_globals, self.closure)
+                               self.w_globals, self)
         frame.last_instr = 0
         return frame
 
diff --git a/pypy/objspace/flow/objspace.py b/pypy/objspace/flow/objspace.py
--- a/pypy/objspace/flow/objspace.py
+++ b/pypy/objspace/flow/objspace.py
@@ -252,9 +252,9 @@
             raise TypeError("%r is a generator" % (func,))
         code = PyCode._from_code(self, code)
         if func.func_closure is None:
-            closure = None
+            cl = None
         else:
-            closure = [extract_cell_content(c) for c in func.func_closure]
+            cl = [extract_cell_content(c) for c in func.func_closure]
         # CallableFactory.pycall may add class_ to functions that are methods
         name = func.func_name
         class_ = getattr(func, 'class_', None)
@@ -262,8 +262,10 @@
             name = '%s.%s' % (class_.__name__, name)
         for c in "<>&!":
             name = name.replace(c, '_')
+        class outerfunc: # hack
+            closure = cl
         ec = flowcontext.FlowExecutionContext(self, code, func.func_globals,
-                                              constargs, closure, name)
+                                              constargs, outerfunc, name)
         graph = ec.graph
         graph.func = func
         # attach a signature and defaults to the graph
diff --git a/pypy/objspace/std/celldict.py b/pypy/objspace/std/celldict.py
--- a/pypy/objspace/std/celldict.py
+++ b/pypy/objspace/std/celldict.py
@@ -1,50 +1,57 @@
-""" A very simple cell dict implementation. The dictionary maps keys to cell.
-This ensures that the function (dict, key) -> cell is pure. By itself, this
-optimization is not helping at all, but in conjunction with the JIT it can
-speed up global lookups a lot."""
+""" A very simple cell dict implementation using a version tag. The dictionary
+maps keys to objects. If a specific key is changed a lot, a level of
+indirection is introduced to make the version tag change less often.
+"""
 
+from pypy.interpreter.baseobjspace import W_Root
 from pypy.objspace.std.dictmultiobject import IteratorImplementation
 from pypy.objspace.std.dictmultiobject import DictStrategy, _never_equal_to_string
 from pypy.objspace.std.dictmultiobject import ObjectDictStrategy
 from pypy.rlib import jit, rerased
 
-class ModuleCell(object):
+class VersionTag(object):
+    pass
+
+class ModuleCell(W_Root):
     def __init__(self, w_value=None):
         self.w_value = w_value
 
-    def invalidate(self):
-        w_value = self.w_value
-        self.w_value = None
-        return w_value
-
     def __repr__(self):
         return "<ModuleCell: %s>" % (self.w_value, )
 
+def unwrap_cell(w_value):
+    if isinstance(w_value, ModuleCell):
+        return w_value.w_value
+    return w_value
+
 class ModuleDictStrategy(DictStrategy):
 
     erase, unerase = rerased.new_erasing_pair("modulecell")
     erase = staticmethod(erase)
     unerase = staticmethod(unerase)
 
+    _immutable_fields_ = ["version?"]
+
     def __init__(self, space):
         self.space = space
+        self.version = VersionTag()
 
     def get_empty_storage(self):
        return self.erase({})
 
-    def getcell(self, w_dict, key, makenew):
-        if makenew or jit.we_are_jitted():
-            # when we are jitting, we always go through the pure function
-            # below, to ensure that we have no residual dict lookup
-            w_dict = jit.promote(w_dict)
-            self = jit.promote(self)
-            return self._getcell_makenew(w_dict, key)
+    def mutated(self):
+       self.version = VersionTag()
+
+    def getdictvalue_no_unwrapping(self, w_dict, key):
+        # NB: it's important to promote self here, so that self.version is a
+        # no-op due to the quasi-immutable field
+        self = jit.promote(self)
+        return self._getdictvalue_no_unwrapping_pure(self.version, w_dict, key)
+
+    @jit.elidable_promote('0,1,2')
+    def _getdictvalue_no_unwrapping_pure(self, version, w_dict, key):
         return self.unerase(w_dict.dstorage).get(key, None)
 
-    @jit.elidable
-    def _getcell_makenew(self, w_dict, key):
-        return self.unerase(w_dict.dstorage).setdefault(key, ModuleCell())
-
     def setitem(self, w_dict, w_key, w_value):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
@@ -54,15 +61,24 @@
             w_dict.setitem(w_key, w_value)
 
     def setitem_str(self, w_dict, key, w_value):
-        self.getcell(w_dict, key, True).w_value = w_value
+        cell = self.getdictvalue_no_unwrapping(w_dict, key)
+        if isinstance(cell, ModuleCell):
+            cell.w_value = w_value
+            return
+        if cell is not None:
+            w_value = ModuleCell(w_value)
+        self.mutated()
+        self.unerase(w_dict.dstorage)[key] = w_value
 
     def setdefault(self, w_dict, w_key, w_default):
         space = self.space
         if space.is_w(space.type(w_key), space.w_str):
-            cell = self.getcell(w_dict, space.str_w(w_key), True)
-            if cell.w_value is None:
-                cell.w_value = w_default
-            return cell.w_value
+            key = space.str_w(w_key)
+            w_result = self.getitem_str(w_dict, key)
+            if w_result is not None:
+                return w_result
+            self.setitem_str(w_dict, key, w_default)
+            return w_default
         else:
             self.switch_to_object_strategy(w_dict)
             return w_dict.setdefault(w_key, w_default)
@@ -72,14 +88,13 @@
         w_key_type = space.type(w_key)
         if space.is_w(w_key_type, space.w_str):
             key = space.str_w(w_key)
-            cell = self.getcell(w_dict, key, False)
-            if cell is None or cell.w_value is None:
-                raise KeyError
-            # note that we don't remove the cell from self.content, to make
-            # sure that a key that was found at any point in the dict, still
-            # maps to the same cell later (even if this cell no longer
-            # represents a key)
-            cell.invalidate()
+            dict_w = self.unerase(w_dict.dstorage)
+            try:
+                del dict_w[key]
+            except KeyError:
+                raise
+            else:
+                self.mutated()
         elif _never_equal_to_string(space, w_key_type):
             raise KeyError
         else:
@@ -87,12 +102,7 @@
             w_dict.delitem(w_key)
 
     def length(self, w_dict):
-        # inefficient, but do we care?
-        res = 0
-        for cell in self.unerase(w_dict.dstorage).itervalues():
-            if cell.w_value is not None:
-                res += 1
-        return res
+        return len(self.unerase(w_dict.dstorage))
 
     def getitem(self, w_dict, w_key):
         space = self.space
@@ -107,11 +117,8 @@
             return w_dict.getitem(w_key)
 
     def getitem_str(self, w_dict, key):
-        res = self.getcell(w_dict, key, False)
-        if res is None:
-            return None
-        # note that even if the res.w_value is None, the next line is fine
-        return res.w_value
+        w_res = self.getdictvalue_no_unwrapping(w_dict, key)
+        return unwrap_cell(w_res)
 
     def iter(self, w_dict):
         return ModuleDictIteratorImplementation(self.space, self, w_dict)
@@ -119,44 +126,34 @@
     def keys(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.wrap(key) for key, cell in iterator()
-                    if cell.w_value is not None]
+        return [space.wrap(key) for key, cell in iterator()]
 
     def values(self, w_dict):
         iterator = self.unerase(w_dict.dstorage).itervalues
-        return [cell.w_value for cell in iterator()
-                    if cell.w_value is not None]
+        return [unwrap_cell(cell) for cell in iterator()]
 
     def items(self, w_dict):
         space = self.space
         iterator = self.unerase(w_dict.dstorage).iteritems
-        return [space.newtuple([space.wrap(key), cell.w_value])
-                    for (key, cell) in iterator()
-                        if cell.w_value is not None]
+        return [space.newtuple([space.wrap(key), unwrap_cell(cell)])
+                    for key, cell in iterator()]
 
     def clear(self, w_dict):
-        iterator = self.unerase(w_dict.dstorage).iteritems
-        for k, cell in iterator():
-            cell.invalidate()
+        self.unerase(w_dict.dstorage).clear()
+        self.mutated()
 
     def popitem(self, w_dict):
-        # This is O(n) if called repeatadly, you probably shouldn't be on a
-        # Module's dict though
-        for k, cell in self.unerase(w_dict.dstorage).iteritems():
-            if cell.w_value is not None:
-                w_value = cell.w_value
-                cell.invalidate()
-                return self.space.wrap(k), w_value
-        else:
-            raise KeyError
+        d = self.unerase(w_dict.dstorage)
+        key, w_value = d.popitem()
+        self.mutated()
+        return self.space.wrap(key), unwrap_cell(w_value)
 
     def switch_to_object_strategy(self, w_dict):
         d = self.unerase(w_dict.dstorage)
         strategy = self.space.fromcache(ObjectDictStrategy)
         d_new = strategy.unerase(strategy.get_empty_storage())
         for key, cell in d.iteritems():
-            if cell.w_value is not None:
-                d_new[self.space.wrap(key)] = cell.w_value
+            d_new[self.space.wrap(key)] = unwrap_cell(cell)
         w_dict.strategy = strategy
         w_dict.dstorage = strategy.erase(d_new)
 
@@ -168,7 +165,6 @@
 
     def next_entry(self):
         for key, cell in self.iterator:
-            if cell.w_value is not None:
-                return (self.space.wrap(key), cell.w_value)
+            return (self.space.wrap(key), unwrap_cell(cell))
         else:
             return None, None
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -38,7 +38,9 @@
         if space.config.objspace.std.withcelldict and module:
             from pypy.objspace.std.celldict import ModuleDictStrategy
             assert w_type is None
-            strategy = space.fromcache(ModuleDictStrategy)
+            # every module needs its own strategy, because the strategy stores
+            # the version tag
+            strategy = ModuleDictStrategy(space)
 
         elif instance or strdict or module:
             assert w_type is None
diff --git a/pypy/objspace/std/fake.py b/pypy/objspace/std/fake.py
--- a/pypy/objspace/std/fake.py
+++ b/pypy/objspace/std/fake.py
@@ -142,7 +142,7 @@
 
     def funcrun(self, func, args):
         frame = func.space.createframe(self, func.w_func_globals,
-                                        func.closure)
+                                       func)
         sig = self.signature()
         scope_w = args.parse_obj(None, func.name, sig, func.defs_w)
         frame.setfastscope(scope_w)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -129,12 +129,12 @@
         ec._py_repr = None
         return ec
 
-    def createframe(self, code, w_globals, closure=None):
+    def createframe(self, code, w_globals, outer_func=None):
         from pypy.objspace.std.fake import CPythonFakeCode, CPythonFakeFrame
         if not we_are_translated() and isinstance(code, CPythonFakeCode):
             return CPythonFakeFrame(self, code, w_globals)
         else:
-            return ObjSpace.createframe(self, code, w_globals, closure)
+            return ObjSpace.createframe(self, code, w_globals, outer_func)
 
     def gettypefor(self, cls):
         return self.gettypeobject(cls.typedef)
diff --git a/pypy/objspace/std/stringobject.py b/pypy/objspace/std/stringobject.py
--- a/pypy/objspace/std/stringobject.py
+++ b/pypy/objspace/std/stringobject.py
@@ -913,12 +913,16 @@
 def repr__String(space, w_str):
     s = w_str._value
 
-    buf = StringBuilder(len(s) + 2)
-
     quote = "'"
     if quote in s and '"' not in s:
         quote = '"'
 
+    return space.wrap(string_escape_encode(s, quote))
+
+def string_escape_encode(s, quote):
+
+    buf = StringBuilder(len(s) + 2)
+
     buf.append(quote)
     startslice = 0
 
@@ -959,7 +963,7 @@
 
     buf.append(quote)
 
-    return space.wrap(buf.build())
+    return buf.build()
 
 
 DEFAULT_NOOP_TABLE = ''.join([chr(i) for i in range(256)])
diff --git a/pypy/objspace/std/test/test_celldict.py b/pypy/objspace/std/test/test_celldict.py
--- a/pypy/objspace/std/test/test_celldict.py
+++ b/pypy/objspace/std/test/test_celldict.py
@@ -2,42 +2,110 @@
 from pypy.conftest import gettestobjspace, option
 from pypy.objspace.std.dictmultiobject import W_DictMultiObject
 from pypy.objspace.std.celldict import ModuleCell, ModuleDictStrategy
-from pypy.objspace.std.test.test_dictmultiobject import FakeSpace
+from pypy.objspace.std.test.test_dictmultiobject import FakeSpace, \
+        BaseTestRDictImplementation, BaseTestDevolvedDictImplementation
 from pypy.interpreter import gateway
 
 space = FakeSpace()
 
 class TestCellDict(object):
-    def test_basic_property(self):
+    def test_basic_property_cells(self):
         strategy = ModuleDictStrategy(space)
         storage = strategy.get_empty_storage()
         d = W_DictMultiObject(space, strategy, storage)
 
-        # replace getcell with getcell from strategy
-        def f(key, makenew):
-            return strategy.getcell(d, key, makenew)
-        d.getcell = f
+        v1 = strategy.version
+        d.setitem("a", 1)
+        v2 = strategy.version
+        assert v1 is not v2
+        assert d.getitem("a") == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") == 1
 
-        d.setitem("a", 1)
-        assert d.getcell("a", False) is d.getcell("a", False)
-        acell = d.getcell("a", False)
-        d.setitem("b", 2)
-        assert d.getcell("b", False) is d.getcell("b", False)
-        assert d.getcell("c", True) is d.getcell("c", True)
+        d.setitem("a", 2)
+        v3 = strategy.version
+        assert v2 is not v3
+        assert d.getitem("a") == 2
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 2
 
-        assert d.getitem("a") == 1
-        assert d.getitem("b") == 2
+        d.setitem("a", 3)
+        v4 = strategy.version
+        assert v3 is v4
+        assert d.getitem("a") == 3
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a").w_value == 3
 
         d.delitem("a")
-        py.test.raises(KeyError, d.delitem, "a")
+        v5 = strategy.version
+        assert v5 is not v4
         assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 1
+        assert d.strategy.getdictvalue_no_unwrapping(d, "a") is None
 
-        d.clear()
-        assert d.getitem("a") is None
-        assert d.getcell("a", False) is acell
-        assert d.length() == 0
+class AppTestModuleDict(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
+        cls.w_runappdirect = cls.space.wrap(option.runappdirect)
+
+    def w_impl_used(self, obj):
+        if self.runappdirect:
+            skip("__repr__ doesn't work on appdirect")
+        import __pypy__
+        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
+
+    def test_check_module_uses_module_dict(self):
+        m = type(__builtins__)("abc")
+        self.impl_used(m.__dict__)
+
+    def test_key_not_there(self):
+        d = type(__builtins__)("abc").__dict__
+        raises(KeyError, "d['def']")
+
+    def test_fallback_evil_key(self):
+        class F(object):
+            def __hash__(self):
+                return hash("s")
+            def __eq__(self, other):
+                return other == "s"
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        assert d["s"] == 12
+        assert d[F()] == d["s"]
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault("s", 12)
+        assert x == 12
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        x = d.setdefault(F(), 12)
+        assert x == 12
+
+        d = type(__builtins__)("abc").__dict__
+        d["s"] = 12
+        del d[F()]
+
+        assert "s" not in d
+        assert F() not in d
+
+
+class TestModuleDictImplementation(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
+
+class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
+    StrategyClass = ModuleDictStrategy
+
+    string = "int"
+    string2 = "isinstance"
+
 
 class AppTestCellDict(object):
     OPTIONS = {"objspace.std.withcelldict": True}
@@ -67,4 +135,4 @@
         d["a"] = 3
         del d["a"]
         d[object()] = 5
-        assert d.values() == [5]
\ No newline at end of file
+        assert d.values() == [5]
diff --git a/pypy/objspace/std/test/test_dictmultiobject.py b/pypy/objspace/std/test/test_dictmultiobject.py
--- a/pypy/objspace/std/test/test_dictmultiobject.py
+++ b/pypy/objspace/std/test/test_dictmultiobject.py
@@ -5,7 +5,6 @@
      W_DictMultiObject, setitem__DictMulti_ANY_ANY, getitem__DictMulti_ANY, \
      StringDictStrategy, ObjectDictStrategy
 
-from pypy.objspace.std.celldict import ModuleDictStrategy
 from pypy.conftest import gettestobjspace
 from pypy.conftest import option
 
@@ -731,52 +730,6 @@
                 set([('a', 1), ('b', 2), ('d', 4), ('e', 5)]))
 
 
-class AppTestModuleDict(object):
-    def setup_class(cls):
-        cls.space = gettestobjspace(**{"objspace.std.withcelldict": True})
-        if option.runappdirect:
-            py.test.skip("__repr__ doesn't work on appdirect")
-
-    def w_impl_used(self, obj):
-        import __pypy__
-        assert "ModuleDictStrategy" in __pypy__.internal_repr(obj)
-
-    def test_check_module_uses_module_dict(self):
-        m = type(__builtins__)("abc")
-        self.impl_used(m.__dict__)
-
-    def test_key_not_there(self):
-        d = type(__builtins__)("abc").__dict__
-        raises(KeyError, "d['def']")
-
-    def test_fallback_evil_key(self):
-        class F(object):
-            def __hash__(self):
-                return hash("s")
-            def __eq__(self, other):
-                return other == "s"
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        assert d["s"] == 12
-        assert d[F()] == d["s"]
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault("s", 12)
-        assert x == 12
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        x = d.setdefault(F(), 12)
-        assert x == 12
-
-        d = type(__builtins__)("abc").__dict__
-        d["s"] = 12
-        del d[F()]
-
-        assert "s" not in d
-        assert F() not in d
-
 class AppTestStrategies(object):
     def setup_class(cls):
         if option.runappdirect:
@@ -1071,16 +1024,6 @@
 ##     ImplementionClass = MeasuringDictImplementation
 ##     DevolvedClass = MeasuringDictImplementation
 
-class TestModuleDictImplementation(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestModuleDictImplementationWithBuiltinNames(BaseTestRDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
-
 class BaseTestDevolvedDictImplementation(BaseTestRDictImplementation):
     def fill_impl(self):
         BaseTestRDictImplementation.fill_impl(self)
@@ -1092,15 +1035,6 @@
 class TestDevolvedStrDictImplementation(BaseTestDevolvedDictImplementation):
     StrategyClass = StringDictStrategy
 
-class TestDevolvedModuleDictImplementation(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-class TestDevolvedModuleDictImplementationWithBuiltinNames(BaseTestDevolvedDictImplementation):
-    StrategyClass = ModuleDictStrategy
-
-    string = "int"
-    string2 = "isinstance"
-
 
 def test_module_uses_strdict():
     fakespace = FakeSpace()
diff --git a/pypy/objspace/std/test/test_rangeobject.py b/pypy/objspace/std/test/test_rangeobject.py
--- a/pypy/objspace/std/test/test_rangeobject.py
+++ b/pypy/objspace/std/test/test_rangeobject.py
@@ -89,6 +89,9 @@
         assert not self.not_forced(r)
         r.sort()
         assert r == range(1, 100) + [999]
+        r = range(10)
+        r.sort(key=lambda x: -x)
+        assert r == range(9, -1, -1)
 
     def test_pop(self):
         r = range(10)
diff --git a/pypy/objspace/test/test_descroperation.py b/pypy/objspace/test/test_descroperation.py
--- a/pypy/objspace/test/test_descroperation.py
+++ b/pypy/objspace/test/test_descroperation.py
@@ -377,7 +377,26 @@
 
         setattr(P, "__weakref__", 0)
 
+    def test_subclass_addition(self):
+        # the __radd__ is never called (compare with the next test)
+        l = []
+        class A(object):
+            def __add__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return 123
+            def __radd__(self, other):
+                # should never be called!
+                return 456
+        class B(A):
+            pass
+        res1 = A() + B()
+        res2 = B() + A()
+        assert res1 == res2 == 123
+        assert l == [A, B, B, A]
+
     def test_subclass_comparison(self):
+        # the __eq__ *is* called with reversed arguments
         l = []
         class A(object):
             def __eq__(self, other):
@@ -395,7 +414,27 @@
 
         A() == B()
         A() < B()
-        assert l == [B, A, A, B]
+        B() < A()
+        assert l == [B, A, A, B, B, A]
+
+    def test_subclass_comparison_more(self):
+        # similarly, __gt__(b,a) is called instead of __lt__(a,b)
+        l = []
+        class A(object):
+            def __lt__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return '<'
+            def __gt__(self, other):
+                l.append(self.__class__)
+                l.append(other.__class__)
+                return '>'
+        class B(A):
+            pass
+        res1 = A() < B()
+        res2 = B() < A()
+        assert res1 == '>' and res2 == '<'
+        assert l == [B, A, B, A]
 
     def test_rich_comparison(self):
         # Old-style
@@ -434,6 +473,84 @@
         assert not(C(1) == D(2))
         assert not(D(1) == C(2))
 
+    def test_partial_ordering(self):
+        class A(object):
+            def __lt__(self, other):
+                return self
+        a1 = A()
+        a2 = A()
+        assert (a1 < a2) is a1
+        assert (a1 > a2) is a2
+
+    def test_eq_order(self):
+        class A(object):
+            def __eq__(self, other): return self.__class__.__name__+':A.eq'
+            def __ne__(self, other): return self.__class__.__name__+':A.ne'
+            def __lt__(self, other): return self.__class__.__name__+':A.lt'
+            def __le__(self, other): return self.__class__.__name__+':A.le'
+            def __gt__(self, other): return self.__class__.__name__+':A.gt'
+            def __ge__(self, other): return self.__class__.__name__+':A.ge'
+        class B(object):
+            def __eq__(self, other): return self.__class__.__name__+':B.eq'
+            def __ne__(self, other): return self.__class__.__name__+':B.ne'
+            def __lt__(self, other): return self.__class__.__name__+':B.lt'
+            def __le__(self, other): return self.__class__.__name__+':B.le'
+            def __gt__(self, other): return self.__class__.__name__+':B.gt'
+            def __ge__(self, other): return self.__class__.__name__+':B.ge'
+        #
+        assert (A() == B()) == 'A:A.eq'
+        assert (A() != B()) == 'A:A.ne'
+        assert (A() <  B()) == 'A:A.lt'
+        assert (A() <= B()) == 'A:A.le'
+        assert (A() >  B()) == 'A:A.gt'
+        assert (A() >= B()) == 'A:A.ge'
+        #
+        assert (B() == A()) == 'B:B.eq'
+        assert (B() != A()) == 'B:B.ne'
+        assert (B() <  A()) == 'B:B.lt'
+        assert (B() <= A()) == 'B:B.le'
+        assert (B() >  A()) == 'B:B.gt'
+        assert (B() >= A()) == 'B:B.ge'
+        #
+        class C(A):
+            def __eq__(self, other): return self.__class__.__name__+':C.eq'
+            def __ne__(self, other): return self.__class__.__name__+':C.ne'
+            def __lt__(self, other): return self.__class__.__name__+':C.lt'
+            def __le__(self, other): return self.__class__.__name__+':C.le'
+            def __gt__(self, other): return self.__class__.__name__+':C.gt'
+            def __ge__(self, other): return self.__class__.__name__+':C.ge'
+        #
+        assert (A() == C()) == 'C:C.eq'
+        assert (A() != C()) == 'C:C.ne'
+        assert (A() <  C()) == 'C:C.gt'
+        assert (A() <= C()) == 'C:C.ge'
+        assert (A() >  C()) == 'C:C.lt'
+        assert (A() >= C()) == 'C:C.le'
+        #
+        assert (C() == A()) == 'C:C.eq'
+        assert (C() != A()) == 'C:C.ne'
+        assert (C() <  A()) == 'C:C.lt'
+        assert (C() <= A()) == 'C:C.le'
+        assert (C() >  A()) == 'C:C.gt'
+        assert (C() >= A()) == 'C:C.ge'
+        #
+        class D(A):
+            pass
+        #
+        assert (A() == D()) == 'D:A.eq'
+        assert (A() != D()) == 'D:A.ne'
+        assert (A() <  D()) == 'D:A.gt'
+        assert (A() <= D()) == 'D:A.ge'
+        assert (A() >  D()) == 'D:A.lt'
+        assert (A() >= D()) == 'D:A.le'
+        #
+        assert (D() == A()) == 'D:A.eq'
+        assert (D() != A()) == 'D:A.ne'
+        assert (D() <  A()) == 'D:A.lt'
+        assert (D() <= A()) == 'D:A.le'
+        assert (D() >  A()) == 'D:A.gt'
+        assert (D() >= A()) == 'D:A.ge'
+
     def test_addition(self):
         # Old-style
         class A:
diff --git a/pypy/pytest-A-stackless.cfg b/pypy/pytest-A-stackless.cfg
deleted file mode 100644
--- a/pypy/pytest-A-stackless.cfg
+++ /dev/null
@@ -1,10 +0,0 @@
-# run for some directories a file at a time
-
-def collect_one_testdir(testdirs, reldir, tests):
-    if (reldir.startswith('module/_stackless/') or
-        reldir.startswith('lib')):
-        testdirs.extend(tests)
-    else:     
-        testdirs.append(reldir)
-
-    
diff --git a/pypy/rlib/_rffi_stacklet.py b/pypy/rlib/_rffi_stacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_rffi_stacklet.py
@@ -0,0 +1,49 @@
+import py
+from pypy.tool.autopath import pypydir
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.rpython.tool import rffi_platform
+
+
+cdir = py.path.local(pypydir) / 'translator' / 'c'
+
+
+eci = ExternalCompilationInfo(
+    include_dirs = [cdir],
+    includes = ['src/stacklet/stacklet.h'],
+    separate_module_sources = ['#include "src/stacklet/stacklet.c"\n'],
+)
+rffi_platform.verify_eci(eci.convert_sources_to_files())
+
+def llexternal(name, args, result, **kwds):
+    return rffi.llexternal(name, args, result, compilation_info=eci,
+                           _nowrapper=True, **kwds)
+
+# ----- types -----
+
+handle = rffi.COpaquePtr(typedef='stacklet_handle', compilation_info=eci)
+thread_handle = rffi.COpaquePtr(typedef='stacklet_thread_handle',
+                                compilation_info=eci)
+run_fn = lltype.Ptr(lltype.FuncType([handle, llmemory.Address], handle))
+
+# ----- constants -----
+
+null_handle = lltype.nullptr(handle.TO)
+
+def is_empty_handle(h):
+    return rffi.cast(lltype.Signed, h) == -1
+
+# ----- functions -----
+
+newthread = llexternal('stacklet_newthread', [], thread_handle)
+deletethread = llexternal('stacklet_deletethread',[thread_handle], lltype.Void)
+
+new = llexternal('stacklet_new', [thread_handle, run_fn, llmemory.Address],
+                 handle, random_effects_on_gcobjs=True)
+switch = llexternal('stacklet_switch', [thread_handle, handle], handle,
+                    random_effects_on_gcobjs=True)
+destroy = llexternal('stacklet_destroy', [thread_handle, handle], lltype.Void)
+
+_translate_pointer = llexternal("_stacklet_translate_pointer",
+                                [llmemory.Address, llmemory.Address],
+                                llmemory.Address)
diff --git a/pypy/rlib/_rsocket_rffi.py b/pypy/rlib/_rsocket_rffi.py
--- a/pypy/rlib/_rsocket_rffi.py
+++ b/pypy/rlib/_rsocket_rffi.py
@@ -489,10 +489,10 @@
 getnameinfo = external('getnameinfo', [sockaddr_ptr, socklen_t, CCHARP,
                        size_t, CCHARP, size_t, rffi.INT], rffi.INT)
 
-htonl = external('htonl', [rffi.UINT], rffi.UINT)
-htons = external('htons', [rffi.USHORT], rffi.USHORT)
-ntohl = external('ntohl', [rffi.UINT], rffi.UINT)
-ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT)
+htonl = external('htonl', [rffi.UINT], rffi.UINT, threadsafe=False)
+htons = external('htons', [rffi.USHORT], rffi.USHORT, threadsafe=False)
+ntohl = external('ntohl', [rffi.UINT], rffi.UINT, threadsafe=False)
+ntohs = external('ntohs', [rffi.USHORT], rffi.USHORT, threadsafe=False)
 
 if _POSIX:
     inet_aton = external('inet_aton', [CCHARP, lltype.Ptr(in_addr)],
diff --git a/pypy/rlib/_stacklet_asmgcc.py b/pypy/rlib/_stacklet_asmgcc.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_asmgcc.py
@@ -0,0 +1,277 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.lltypesystem import lltype, llmemory, rffi
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.rpython.annlowlevel import llhelper
+
+
+_asmstackrootwalker = None    # BIG HACK: monkey-patched by asmgcroot.py
+_stackletrootwalker = None
+
+def get_stackletrootwalker():
+    # lazily called, to make the following imports lazy
+    global _stackletrootwalker
+    if _stackletrootwalker is not None:
+        return _stackletrootwalker
+
+    from pypy.rpython.memory.gctransform.asmgcroot import (
+        WALKFRAME, CALLEE_SAVED_REGS, INDEX_OF_EBP, sizeofaddr)
+
+    assert _asmstackrootwalker is not None, "should have been monkey-patched"
+    basewalker = _asmstackrootwalker
+
+    class StackletRootWalker(object):
+        _alloc_flavor_ = "raw"
+
+        enumerating = False
+
+        def setup(self, obj):
+            # initialization: read the SUSPSTACK object
+            p = llmemory.cast_adr_to_ptr(obj, lltype.Ptr(SUSPSTACK))
+            if not p.handle:
+                return False
+            self.context = llmemory.cast_ptr_to_adr(p.handle)
+            anchor = p.anchor
+            del p
+            self.curframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.otherframe = lltype.malloc(WALKFRAME, flavor='raw')
+            self.fill_initial_frame(self.curframe, anchor)
+            return True
+
+        def fill_initial_frame(self, curframe, initialframedata):
+            # Copy&paste :-(
+            initialframedata += 2*sizeofaddr
+            reg = 0
+            while reg < CALLEE_SAVED_REGS:
+                curframe.regs_stored_at[reg] = initialframedata+reg*sizeofaddr
+                reg += 1
+            retaddraddr = initialframedata + CALLEE_SAVED_REGS * sizeofaddr
+            retaddraddr = self.translateptr(retaddraddr)
+            curframe.frame_address = retaddraddr.address[0]
+
+        def teardown(self):
+            lltype.free(self.curframe, flavor='raw')
+            lltype.free(self.otherframe, flavor='raw')
+            self.context = llmemory.NULL
+            return llmemory.NULL
+
+        def next(self, obj, prev):
+            #
+            # Pointers to the stack can be "translated" or not:
+            #
+            #   * Non-translated pointers point to where the data would be
+            #     if the stack was installed and running.
+            #
+            #   * Translated pointers correspond to where the data
+            #     is now really in memory.
+            #
+            # Note that 'curframe' contains non-translated pointers, and
+            # of course the stack itself is full of non-translated pointers.
+            #
+            while True:
+                if not self.enumerating:
+                    if not prev:
+                        if not self.setup(obj):      # one-time initialization
+                            return llmemory.NULL
+                        prev = obj   # random value, but non-NULL
+                    callee = self.curframe
+                    retaddraddr = self.translateptr(callee.frame_address)
+                    retaddr = retaddraddr.address[0]
+                    basewalker.locate_caller_based_on_retaddr(retaddr)
+                    self.enumerating = True
+                #
+                # not really a loop, but kept this way for similarity
+                # with asmgcroot:
+                callee = self.curframe
+                ebp_in_caller = callee.regs_stored_at[INDEX_OF_EBP]
+                ebp_in_caller = self.translateptr(ebp_in_caller)
+                ebp_in_caller = ebp_in_caller.address[0]
+                while True:
+                    location = basewalker._shape_decompressor.next()
+                    if location == 0:
+                        break
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    # yield the translated addr of the next GCREF in the stack
+                    return self.translateptr(addr)
+                #
+                self.enumerating = False
+                caller = self.otherframe
+                reg = CALLEE_SAVED_REGS - 1
+                while reg >= 0:
+                    location = basewalker._shape_decompressor.next()
+                    addr = basewalker.getlocation(callee, ebp_in_caller,
+                                                  location)
+                    caller.regs_stored_at[reg] = addr   # non-translated
+                    reg -= 1
+
+                location = basewalker._shape_decompressor.next()
+                caller.frame_address = basewalker.getlocation(callee,
+                                                              ebp_in_caller,
+                                                              location)
+                # ^^^ non-translated
+                if caller.frame_address == llmemory.NULL:
+                    return self.teardown()    # completely done with this stack
+                #
+                self.otherframe = callee
+                self.curframe = caller
+                # loop back
+
+        def translateptr(self, addr):
+            return _c._translate_pointer(self.context, addr)
+
+    _stackletrootwalker = StackletRootWalker()
+    return _stackletrootwalker
+get_stackletrootwalker._annspecialcase_ = 'specialize:memo'
+
+
+def customtrace(obj, prev):
+    stackletrootwalker = get_stackletrootwalker()
+    return stackletrootwalker.next(obj, prev)
+
+
+SUSPSTACK = lltype.GcStruct('SuspStack',
+                            ('handle', _c.handle),
+                            ('anchor', llmemory.Address),
+                            rtti=True)
+NULL_SUSPSTACK = lltype.nullptr(SUSPSTACK)
+CUSTOMTRACEFUNC = lltype.FuncType([llmemory.Address, llmemory.Address],
+                                  llmemory.Address)
+customtraceptr = llhelper(lltype.Ptr(CUSTOMTRACEFUNC), customtrace)
+lltype.attachRuntimeTypeInfo(SUSPSTACK, customtraceptr=customtraceptr)
+
+ASM_FRAMEDATA_HEAD_PTR = lltype.Ptr(lltype.ForwardReference())
+ASM_FRAMEDATA_HEAD_PTR.TO.become(lltype.Struct('ASM_FRAMEDATA_HEAD',
+        ('prev', ASM_FRAMEDATA_HEAD_PTR),
+        ('next', ASM_FRAMEDATA_HEAD_PTR)
+    ))
+alternateanchor = lltype.malloc(ASM_FRAMEDATA_HEAD_PTR.TO,
+                                immortal=True)
+alternateanchor.prev = alternateanchor
+alternateanchor.next = alternateanchor
+
+FUNCNOARG_P = lltype.Ptr(lltype.FuncType([], _c.handle))
+pypy_asm_stackwalk2 = rffi.llexternal('pypy_asm_stackwalk',
+                                      [FUNCNOARG_P,
+                                       ASM_FRAMEDATA_HEAD_PTR],
+                                      _c.handle, sandboxsafe=True,
+                                      _nowrapper=True)
+
+
+def _new_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and create a new
+    # stacklet with stacklet_new().  If this call fails, then we
+    # are just returning NULL.
+    _stack_just_closed()
+    return _c.new(gcrootfinder.thrd, llhelper(_c.run_fn, _new_runfn),
+                  llmemory.NULL)
+
+def _stack_just_closed():
+    # Immediately unlink the new stackanchor from the doubly-linked
+    # chained list.  When returning from pypy_asm_stackwalk2, the
+    # assembler code will try to unlink it again, which should be
+    # a no-op given that the doubly-linked list is empty.
+    stackanchor = llmemory.cast_ptr_to_adr(alternateanchor.next)
+    gcrootfinder.suspstack.anchor = stackanchor
+    alternateanchor.prev = alternateanchor
+    alternateanchor.next = alternateanchor
+
+def _new_runfn(h, _):
+    # Here, we are in a fresh new stacklet.
+    llop.gc_stack_bottom(lltype.Void)   # marker for trackgcroot.py
+    #
+    # There is a fresh suspstack object waiting on the gcrootfinder,
+    # so populate it with data that represents the parent suspended
+    # stacklet and detach the suspstack object from gcrootfinder.
+    suspstack = gcrootfinder.attach_handle_on_suspstack(h)
+    #
+    # Call the main function provided by the (RPython) user.
+    suspstack = gcrootfinder.runfn(suspstack, gcrootfinder.arg)
+    #
+    # Here, suspstack points to the target stacklet that we want to
+    # jump to next.  Read the 'handle' and forget about the
+    # suspstack object.
+    return _consume_suspstack(suspstack)
+
+def _consume_suspstack(suspstack):
+    h = suspstack.handle
+    ll_assert(bool(h), "_consume_suspstack: null handle")
+    suspstack.handle = _c.null_handle
+    return h
+
+def _switch_callback():
+    # Here, we just closed the stack.  Get the stack anchor, store
+    # it in the gcrootfinder.suspstack.anchor, and switch to this
+    # suspstack with stacklet_switch().  If this call fails, then we
+    # are just returning NULL.
+    oldanchor = gcrootfinder.suspstack.anchor
+    _stack_just_closed()
+    h = _consume_suspstack(gcrootfinder.suspstack)
+    #
+    # gcrootfinder.suspstack.anchor is left with the anchor of the
+    # previous place (i.e. before the call to switch()).
+    h2 = _c.switch(gcrootfinder.thrd, h)
+    #
+    if not h2:    # MemoryError: restore
+        gcrootfinder.suspstack.anchor = oldanchor
+        gcrootfinder.suspstack.handle = h
+    return h2
+
+
+class StackletGcRootFinder(object):
+    suspstack = NULL_SUSPSTACK
+
+    def new(self, thrd, callback, arg):
+        self.thrd = thrd._thrd
+        self.runfn = callback
+        self.arg = arg
+        # make a fresh new clean SUSPSTACK
+        newsuspstack = lltype.malloc(SUSPSTACK)
+        newsuspstack.handle = _c.null_handle
+        self.suspstack = newsuspstack
+        # Invoke '_new_callback' by closing the stack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _new_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def switch(self, thrd, suspstack):
+        self.thrd = thrd._thrd
+        self.suspstack = suspstack
+        h = pypy_asm_stackwalk2(llhelper(FUNCNOARG_P, _switch_callback),
+                                alternateanchor)
+        return self.get_result_suspstack(h)
+
+    def attach_handle_on_suspstack(self, handle):
+        s = self.suspstack
+        self.suspstack = NULL_SUSPSTACK
+        ll_assert(bool(s.anchor), "s.anchor should not be null")
+        s.handle = handle
+        llop.gc_assume_young_pointers(lltype.Void, llmemory.cast_ptr_to_adr(s))
+        return s
+
+    def get_result_suspstack(self, h):
+        #
+        # Return from a new() or a switch(): 'h' is a handle, possibly
+        # an empty one, that says from where we switched to.
+        if not h:
+            raise MemoryError
+        elif _c.is_empty_handle(h):
+            return NULL_SUSPSTACK
+        else:
+            # This is a return that gave us a real handle.  Store it.
+            return self.attach_handle_on_suspstack(h)
+
+    def destroy(self, thrd, suspstack):
+        h = suspstack.handle
+        suspstack.handle = _c.null_handle
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(self, suspstack):
+        return not suspstack
+
+    def get_null_handle(self):
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
diff --git a/pypy/rlib/_stacklet_n_a.py b/pypy/rlib/_stacklet_n_a.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_n_a.py
@@ -0,0 +1,31 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.annlowlevel import llhelper
+from pypy.tool.staticmethods import StaticMethods
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        h = _c.new(thrd._thrd, llhelper(_c.run_fn, callback), arg)
+        if not h:
+            raise MemoryError
+        return h
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(thrd, h):
+        h = _c.switch(thrd._thrd, h)
+        if not h:
+            raise MemoryError
+        return h
+
+    def destroy(thrd, h):
+        _c.destroy(thrd._thrd, h)
+
+    is_empty_handle = _c.is_empty_handle
+
+    def get_null_handle():
+        return _c.null_handle
+
+
+gcrootfinder = StackletGcRootFinder    # class object
diff --git a/pypy/rlib/_stacklet_shadowstack.py b/pypy/rlib/_stacklet_shadowstack.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/_stacklet_shadowstack.py
@@ -0,0 +1,110 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rlib.debug import ll_assert
+from pypy.rpython.annlowlevel import llhelper
+from pypy.rpython.lltypesystem import lltype, llmemory
+from pypy.rpython.lltypesystem.lloperation import llop
+from pypy.tool.staticmethods import StaticMethods
+
+
+NULL_SUSPSTACK = lltype.nullptr(llmemory.GCREF.TO)
+
+
+def _new_callback(h, arg):
+    # We still have the old shadowstack active at this point; save it
+    # away, and start a fresh new one
+    oldsuspstack = gcrootfinder.oldsuspstack
+    h = llmemory.cast_ptr_to_adr(h)
+    llop.gc_save_current_state_away(lltype.Void,
+                                    oldsuspstack, h)
+    llop.gc_start_fresh_new_state(lltype.Void)
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    #
+    newsuspstack = gcrootfinder.callback(oldsuspstack, arg)
+    #
+    # Finishing this stacklet.
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = newsuspstack
+    h = llop.gc_shadowstackref_context(llmemory.Address, newsuspstack)
+    return llmemory.cast_adr_to_ptr(h, _c.handle)
+
+def prepare_old_suspstack():
+    if not gcrootfinder.oldsuspstack:   # else reuse the one still there
+        _allocate_old_suspstack()
+
+def _allocate_old_suspstack():
+    suspstack = llop.gc_shadowstackref_new(llmemory.GCREF)
+    gcrootfinder.oldsuspstack = suspstack
+_allocate_old_suspstack._dont_inline_ = True
+
+def get_result_suspstack(h):
+    # Now we are in the target, after the switch() or the new().
+    # Note that this whole module was carefully written in such a way as
+    # not to invoke pushing/popping things off the shadowstack at
+    # unexpected moments...
+    oldsuspstack = gcrootfinder.oldsuspstack
+    newsuspstack = gcrootfinder.newsuspstack
+    gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+    gcrootfinder.newsuspstack = NULL_SUSPSTACK
+    if not h:
+        raise MemoryError
+    # We still have the old shadowstack active at this point; save it
+    # away, and restore the new one
+    if oldsuspstack:
+        ll_assert(not _c.is_empty_handle(h),"unexpected empty stacklet handle")
+        h = llmemory.cast_ptr_to_adr(h)
+        llop.gc_save_current_state_away(lltype.Void, oldsuspstack, h)
+    else:
+        ll_assert(_c.is_empty_handle(h),"unexpected non-empty stacklet handle")
+        llop.gc_forget_current_state(lltype.Void)
+    #
+    llop.gc_restore_state_from(lltype.Void, newsuspstack)
+    #
+    # From this point on, 'newsuspstack' is consumed and done, its
+    # shadow stack installed as the current one.  It should not be
+    # used any more.  For performance, we avoid it being deallocated
+    # by letting it be reused on the next switch.
+    gcrootfinder.oldsuspstack = newsuspstack
+    # Return.
+    return oldsuspstack
+
+
+class StackletGcRootFinder:
+    __metaclass__ = StaticMethods
+
+    def new(thrd, callback, arg):
+        gcrootfinder.callback = callback
+        thread_handle = thrd._thrd
+        prepare_old_suspstack()
+        h = _c.new(thread_handle, llhelper(_c.run_fn, _new_callback), arg)
+        return get_result_suspstack(h)
+    new._dont_inline_ = True
+
+    def switch(thrd, suspstack):
+        # suspstack has a handle to target, i.e. where to switch to
+        ll_assert(suspstack != gcrootfinder.oldsuspstack,
+                  "stacklet: invalid use")
+        gcrootfinder.newsuspstack = suspstack
+        thread_handle = thrd._thrd
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        prepare_old_suspstack()
+        h = _c.switch(thread_handle, h)
+        return get_result_suspstack(h)
+    switch._dont_inline_ = True
+
+    def destroy(thrd, suspstack):
+        h = llop.gc_shadowstackref_context(llmemory.Address, suspstack)
+        h = llmemory.cast_adr_to_ptr(h, _c.handle)
+        llop.gc_shadowstackref_destroy(lltype.Void, suspstack)
+        _c.destroy(thrd._thrd, h)
+
+    def is_empty_handle(suspstack):
+        return not suspstack
+
+    def get_null_handle():
+        return NULL_SUSPSTACK
+
+
+gcrootfinder = StackletGcRootFinder()
+gcrootfinder.oldsuspstack = NULL_SUSPSTACK
+gcrootfinder.newsuspstack = NULL_SUSPSTACK
diff --git a/pypy/rlib/clibffi.py b/pypy/rlib/clibffi.py
--- a/pypy/rlib/clibffi.py
+++ b/pypy/rlib/clibffi.py
@@ -286,10 +286,10 @@
 
 FFI_OK = cConfig.FFI_OK
 FFI_BAD_TYPEDEF = cConfig.FFI_BAD_TYPEDEF
-FFI_DEFAULT_ABI = rffi.cast(rffi.USHORT, cConfig.FFI_DEFAULT_ABI)
+FFI_DEFAULT_ABI = cConfig.FFI_DEFAULT_ABI
 if _WIN32:
-    FFI_STDCALL = rffi.cast(rffi.USHORT, cConfig.FFI_STDCALL)
-FFI_TYPE_STRUCT = rffi.cast(rffi.USHORT, cConfig.FFI_TYPE_STRUCT)
+    FFI_STDCALL = cConfig.FFI_STDCALL
+FFI_TYPE_STRUCT = cConfig.FFI_TYPE_STRUCT
 FFI_CIFP = rffi.COpaquePtr('ffi_cif', compilation_info=eci)
 
 FFI_CLOSUREP = lltype.Ptr(cConfig.ffi_closure)
@@ -319,7 +319,7 @@
        which the 'ffistruct' member is a regular FFI_TYPE.
     """
     tpe = lltype.malloc(FFI_STRUCT_P.TO, len(field_types)+1, flavor='raw')
-    tpe.ffistruct.c_type = FFI_TYPE_STRUCT
+    tpe.ffistruct.c_type = rffi.cast(rffi.USHORT, FFI_TYPE_STRUCT)
     tpe.ffistruct.c_size = rffi.cast(rffi.SIZE_T, size)
     tpe.ffistruct.c_alignment = rffi.cast(rffi.USHORT, aligment)
     tpe.ffistruct.c_elements = rffi.cast(FFI_TYPE_PP,
@@ -402,12 +402,20 @@
 
 closureHeap = ClosureHeap()
 
-FUNCFLAG_STDCALL   = 0
-FUNCFLAG_CDECL     = 1  # for WINAPI calls
+FUNCFLAG_STDCALL   = 0    # on Windows: for WINAPI calls
+FUNCFLAG_CDECL     = 1    # on Windows: for __cdecl calls
 FUNCFLAG_PYTHONAPI = 4
 FUNCFLAG_USE_ERRNO = 8
 FUNCFLAG_USE_LASTERROR = 16
 
+def get_call_conv(flags, from_jit):
+    if _WIN32 and (flags & FUNCFLAG_CDECL == 0):
+        return FFI_STDCALL
+    else:
+        return FFI_DEFAULT_ABI
+get_call_conv._annspecialcase_ = 'specialize:arg(1)'     # hack :-/
+
+
 class AbstractFuncPtr(object):
     ll_cif = lltype.nullptr(FFI_CIFP.TO)
     ll_argtypes = lltype.nullptr(FFI_TYPE_PP.TO)
@@ -427,21 +435,17 @@
         self.ll_cif = lltype.malloc(FFI_CIFP.TO, flavor='raw',
                                     track_allocation=False) # freed by the __del__
 
-        if _WIN32 and (flags & FUNCFLAG_CDECL == 0):
-            cc = FFI_STDCALL
-        else:
-            cc = FFI_DEFAULT_ABI
-
         if _MSVC:
             # This little trick works correctly with MSVC.
             # It returns small structures in registers
-            if r_uint(restype.c_type) == FFI_TYPE_STRUCT:
+            if intmask(restype.c_type) == FFI_TYPE_STRUCT:
                 if restype.c_size <= 4:
                     restype = ffi_type_sint32
                 elif restype.c_size <= 8:
                     restype = ffi_type_sint64
 
-        res = c_ffi_prep_cif(self.ll_cif, cc,
+        res = c_ffi_prep_cif(self.ll_cif,
+                             rffi.cast(rffi.USHORT, get_call_conv(flags,False)),
                              rffi.cast(rffi.UINT, argnum), restype,
                              self.ll_argtypes)
         if not res == FFI_OK:
diff --git a/pypy/rlib/debug.py b/pypy/rlib/debug.py
--- a/pypy/rlib/debug.py
+++ b/pypy/rlib/debug.py
@@ -26,6 +26,7 @@
         llop.debug_print_traceback(lltype.Void)
     llop.debug_fatalerror(lltype.Void, msg)
 fatalerror._dont_inline_ = True
+fatalerror._annspecialcase_ = 'specialize:arg(1)'
 
 
 class DebugLog(list):
diff --git a/pypy/rlib/libffi.py b/pypy/rlib/libffi.py
--- a/pypy/rlib/libffi.py
+++ b/pypy/rlib/libffi.py
@@ -75,7 +75,7 @@
     @staticmethod
     @jit.elidable
     def is_struct(ffi_type):
-        return intmask(ffi_type.c_type) == intmask(FFI_TYPE_STRUCT)
+        return intmask(ffi_type.c_type) == FFI_TYPE_STRUCT
 
 types._import()
 
diff --git a/pypy/rlib/parsing/makepackrat.py b/pypy/rlib/parsing/makepackrat.py
--- a/pypy/rlib/parsing/makepackrat.py
+++ b/pypy/rlib/parsing/makepackrat.py
@@ -251,9 +251,11 @@
         return "ErrorInformation(%s, %s)" % (self.pos, self.expected)
 
     def get_line_column(self, source):
-        uptoerror = source[:self.pos]
+        pos = self.pos
+        assert pos >= 0
+        uptoerror = source[:pos]
         lineno = uptoerror.count("\n")
-        columnno = self.pos - uptoerror.rfind("\n")
+        columnno = pos - uptoerror.rfind("\n")
         return lineno, columnno
 
     def nice_error_message(self, filename='<filename>', source=""):
diff --git a/pypy/rlib/parsing/tree.py b/pypy/rlib/parsing/tree.py
--- a/pypy/rlib/parsing/tree.py
+++ b/pypy/rlib/parsing/tree.py
@@ -6,9 +6,16 @@
         content = ["digraph G{"]
         content.extend(self.dot())
         content.append("}")
-        p = py.test.ensuretemp("automaton").join("temp.dot")
+        try:
+            p = py.test.ensuretemp("automaton").join("temp.dot")
+            remove = False
+        except AttributeError: # pytest lacks ensuretemp, make a normal one
+            p = py.path.local.mkdtemp().join('automaton.dot')
+            remove = True
         p.write("\n".join(content))
         graphclient.display_dot_file(str(p))
+        if remove:
+            p.dirpath().remove()
 
 class Symbol(Node):
 
diff --git a/pypy/rlib/rcoroutine.py b/pypy/rlib/rcoroutine.py
--- a/pypy/rlib/rcoroutine.py
+++ b/pypy/rlib/rcoroutine.py
@@ -29,6 +29,11 @@
 The type of a switch is determined by the target's costate.
 """
 
+import py; py.test.skip("fixme: rewrite using rlib.rstacklet")
+# XXX ^^^ the reason it is not done is that pypy.rlib.rcoroutine
+# plus pypy/module/_stackless look like faaaaaar too much code
+# to me :-(
+
 from pypy.rlib.rstack import yield_current_frame_to_caller
 from pypy.rlib.objectmodel import we_are_translated
 
diff --git a/pypy/rlib/rgc.py b/pypy/rlib/rgc.py
--- a/pypy/rlib/rgc.py
+++ b/pypy/rlib/rgc.py
@@ -15,132 +15,8 @@
     pass
 
 # ____________________________________________________________
-# Framework GC features
-
-class GcPool(object):
-    pass
-
-def gc_swap_pool(newpool):
-    """Set newpool as the current pool (create one if newpool is None).
-    All malloc'ed objects are put into the current pool;this is a
-    way to separate objects depending on when they were allocated.
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-def gc_clone(gcobject, pool):
-    """Recursively clone the gcobject and everything it points to,
-    directly or indirectly -- but stops at objects that are not
-    in the specified pool.  Pool can be None to mean the current one.
-    A new pool is built to contain the copies.  Return (newobject, newpool).
-    """
-    raise NotImplementedError("only works in stacklessgc translated versions")
-
-# ____________________________________________________________
 # Annotation and specialization
 
-class GcPoolEntry(ExtRegistryEntry):
-    "Link GcPool to its Repr."
-    _type_ = GcPool
-
-    def get_repr(self, rtyper, s_pool):
-        config = rtyper.getconfig()
-        # if the gc policy doesn't support allocation pools, lltype
-        # pools as Void.
-        if config.translation.gc != 'marksweep':
-            from pypy.annotation.model import s_None
-            return rtyper.getrepr(s_None)
-        else:
-            from pypy.rpython.rmodel import SimplePointerRepr
-            from pypy.rpython.memory.gc.marksweep import X_POOL_PTR
-            return SimplePointerRepr(X_POOL_PTR)
-
-
-class SwapPoolFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_swap_pool()."
-    _about_ = gc_swap_pool
-
-    def compute_result_annotation(self, s_newpool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeExternalObject(GcPool)
-
-    def specialize_call(self, hop):
-        from pypy.annotation import model as annmodel
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-
-        opname = 'gc_x_swap_pool'
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # when the gc policy doesn't support pools, just return
-            # the argument (which is lltyped as Void anyway)
-            opname = 'same_as'
-            
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        vlist = hop.inputargs(r_pool_ptr)
-        return hop.genop(opname, vlist, resulttype = r_pool_ptr)
-
-def _raise():
-    raise RuntimeError
-
-class CloneFnEntry(ExtRegistryEntry):
-    "Annotation and specialization of gc_clone()."
-    _about_ = gc_clone
-
-    def compute_result_annotation(self, s_gcobject, s_pool):
-        from pypy.annotation import model as annmodel
-        return annmodel.SomeTuple([s_gcobject,
-                                   annmodel.SomeExternalObject(GcPool)])
-
-    def specialize_call(self, hop):
-        from pypy.rpython.error import TyperError
-        from pypy.rpython.lltypesystem import rtuple
-        from pypy.annotation import model as annmodel
-        from pypy.rpython.memory.gc.marksweep import X_CLONE, X_CLONE_PTR
-
-        config = hop.rtyper.getconfig()
-        if config.translation.gc != 'marksweep':
-            # if the gc policy does not support allocation pools,
-            # gc_clone always raises RuntimeError
-            hop.exception_is_here()
-            hop.gendirectcall(_raise)
-            s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-            r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-            r_tuple = hop.r_result
-            v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-            return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
-        r_gcobject = hop.args_r[0]
-        if (not isinstance(r_gcobject.lowleveltype, lltype.Ptr) or
-            r_gcobject.lowleveltype.TO._gckind != 'gc'):
-            raise TyperError("gc_clone() can only clone a dynamically "
-                             "allocated object;\ngot %r" % (r_gcobject,))
-        s_pool_ptr = annmodel.SomeExternalObject(GcPool)
-        r_pool_ptr = hop.rtyper.getrepr(s_pool_ptr)
-        r_tuple = hop.r_result
-
-        c_CLONE       = hop.inputconst(lltype.Void, X_CLONE)
-        c_flags       = hop.inputconst(lltype.Void, {'flavor': 'gc'})
-        c_gcobjectptr = hop.inputconst(lltype.Void, "gcobjectptr")
-        c_pool        = hop.inputconst(lltype.Void, "pool")
-
-        v_gcobject, v_pool = hop.inputargs(hop.args_r[0], r_pool_ptr)
-        v_gcobjectptr = hop.genop('cast_opaque_ptr', [v_gcobject],
-                                  resulttype = llmemory.GCREF)
-        v_clonedata = hop.genop('malloc', [c_CLONE, c_flags],
-                                resulttype = X_CLONE_PTR)
-        hop.genop('setfield', [v_clonedata, c_gcobjectptr, v_gcobjectptr])
-        hop.genop('setfield', [v_clonedata, c_pool, v_pool])
-        hop.exception_is_here()
-        hop.genop('gc_x_clone', [v_clonedata])
-        v_gcobjectptr = hop.genop('getfield', [v_clonedata, c_gcobjectptr],
-                                  resulttype = llmemory.GCREF)
-        v_pool        = hop.genop('getfield', [v_clonedata, c_pool],
-                                  resulttype = r_pool_ptr)
-        v_gcobject = hop.genop('cast_opaque_ptr', [v_gcobjectptr],
-                               resulttype = r_tuple.items_r[0])
-        return rtuple.newtuple(hop.llops, r_tuple, [v_gcobject, v_pool])
-
 # Support for collection.
 
 class CollectEntry(ExtRegistryEntry):
diff --git a/pypy/rlib/rstack.py b/pypy/rlib/rstack.py
--- a/pypy/rlib/rstack.py
+++ b/pypy/rlib/rstack.py
@@ -14,25 +14,6 @@
 from pypy.rpython.controllerentry import Controller, SomeControlledInstance
 from pypy.translator.tool.cbuild import ExternalCompilationInfo
 
-def stack_unwind():
-    if we_are_translated():
-        return llop.stack_unwind(lltype.Void)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_capture():
-    if we_are_translated():
-        ptr = llop.stack_capture(OPAQUE_STATE_HEADER_PTR)
-        return frame_stack_top_controller.box(ptr)
-    raise RuntimeError("cannot unwind stack in non-translated versions")
-
-
-def stack_frames_depth():
-    if we_are_translated():
-        return llop.stack_frames_depth(lltype.Signed)
-    else:
-        return len(inspect.stack())
-
 # ____________________________________________________________
 
 compilation_info = ExternalCompilationInfo(includes=['src/stack.h'])
@@ -88,78 +69,6 @@
 @rgc.no_collect
 def stack_check_slowpath(current):
     if ord(_stack_too_big_slowpath(current)):
-        # Now we are sure that the stack is really too big.  Note that the
-        # stack_unwind implementation is different depending on if stackless
-        # is enabled. If it is it unwinds the stack, otherwise it simply
-        # raises a RuntimeError.
-        stack_unwind()
+        from pypy.rlib.rstackovf import _StackOverflow
+        raise _StackOverflow
 stack_check_slowpath._dont_inline_ = True
-
-# ____________________________________________________________
-
-def yield_current_frame_to_caller():
-    raise NotImplementedError("only works in translated versions")
-
-
-class frame_stack_top(object):
-    def switch(self):
-        raise NotImplementedError("only works in translated versions")
-
-
-class BoundSwitchOfFrameStackTop(object): pass
-class BoundSwitchOfFrameStackTopController(Controller):
-    knowntype = BoundSwitchOfFrameStackTop
-    def call(self, real_object):
-        from pypy.rpython.lltypesystem.lloperation import llop
-        ptr = llop.stack_switch(OPAQUE_STATE_HEADER_PTR, real_object)
-        return frame_stack_top_controller.box(ptr)
-
-
-class FrameStackTopController(Controller):
-    knowntype = frame_stack_top
-    can_be_None = True
-
-    def is_true(self, real_object):
-        return bool(real_object)
-
-    def get_switch(self, real_object):
-        return bound_switch_of_frame_stack_top_controller.box(real_object)
-
-    def convert(self, obj):
-        assert obj is None
-        return lltype.nullptr(OPAQUE_STATE_HEADER_PTR.TO)
-
-frame_stack_top_controller = FrameStackTopController()
-bound_switch_of_frame_stack_top_controller = BoundSwitchOfFrameStackTopController()
-OPAQUE_STATE_HEADER = lltype.GcOpaqueType("OPAQUE_STATE_HEADER", hints={"render_structure": True})
-OPAQUE_STATE_HEADER_PTR = lltype.Ptr(OPAQUE_STATE_HEADER)
-
-
-
-class FrameStackTopReturningFnEntry(ExtRegistryEntry):
-    def compute_result_annotation(self):
-        from pypy.annotation import model as annmodel
-        return SomeControlledInstance(annmodel.lltype_to_annotation(OPAQUE_STATE_HEADER_PTR), frame_stack_top_controller)
-
-
-class YieldCurrentFrameToCallerFnEntry(FrameStackTopReturningFnEntry):
-    _about_ = yield_current_frame_to_caller
-
-    def specialize_call(self, hop):
-        var = hop.genop("yield_current_frame_to_caller", [], hop.r_result.lowleveltype)
-        return var
-
-
-# ____________________________________________________________
-
-def get_stack_depth_limit():
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.get_stack_depth_limit(lltype.Signed)
-    raise RuntimeError("no stack depth limit in non-translated versions")
-
-def set_stack_depth_limit(limit):
-    if we_are_translated():
-        from pypy.rpython.lltypesystem.lloperation import llop
-        return llop.set_stack_depth_limit(lltype.Void, limit)
-    raise RuntimeError("no stack depth limit in non-translated versions")
diff --git a/pypy/rlib/rstacklet.py b/pypy/rlib/rstacklet.py
new file mode 100644
--- /dev/null
+++ b/pypy/rlib/rstacklet.py
@@ -0,0 +1,58 @@
+from pypy.rlib import _rffi_stacklet as _c
+from pypy.rpython.lltypesystem import lltype, llmemory
+
+
+class StackletThread(object):
+
+    def __init__(self, config):
+        self._gcrootfinder = _getgcrootfinder(config)
+        self._thrd = _c.newthread()
+        if not self._thrd:
+            raise MemoryError
+        self._thrd_deleter = StackletThreadDeleter(self._thrd)
+
+    def new(self, callback, arg=llmemory.NULL):
+        return self._gcrootfinder.new(self, callback, arg)
+    new._annspecialcase_ = 'specialize:arg(1)'
+
+    def switch(self, stacklet):
+        return self._gcrootfinder.switch(self, stacklet)
+
+    def destroy(self, stacklet):
+        self._gcrootfinder.destroy(self, stacklet)
+
+    def is_empty_handle(self, stacklet):
+        # note that "being an empty handle" and being equal to
+        # "get_null_handle()" may be the same, or not; don't rely on it
+        return self._gcrootfinder.is_empty_handle(stacklet)
+
+    def get_null_handle(self):
+        return self._gcrootfinder.get_null_handle()
+
+
+class StackletThreadDeleter(object):
+    # Quick hack: the __del__ lives on this separate object, so that
+    # if the main StackletThread ends up in circular references,
+    # on PyPy deletethread() is only called once all of those
+    # reference cycles are gone.
+    def __init__(self, thrd):
+        self._thrd = thrd
+    def __del__(self):
+        thrd = self._thrd
+        if thrd:
+            self._thrd = lltype.nullptr(_c.thread_handle.TO)
+            _c.deletethread(thrd)
+
+# ____________________________________________________________
+
+def _getgcrootfinder(config):
+    if (config is None or
+        config.translation.gc in ('ref', 'boehm', 'none')):   # for tests
+        gcrootfinder = 'n/a'
+    else:
+        gcrootfinder = config.translation.gcrootfinder
+    gcrootfinder = gcrootfinder.replace('/', '_')
+    module = __import__('pypy.rlib._stacklet_%s' % gcrootfinder,
+                        None, None, ['__doc__'])
+    return module.gcrootfinder
+_getgcrootfinder._annspecialcase_ = 'specialize:memo'
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -1403,7 +1403,7 @@
                                     s, pos, pos + unicode_bytes)
             result.append(res)
             continue
-        result.append(unichr(t))
+        result.append(UNICHR(t))
         pos += unicode_bytes
     return result.build(), pos
 
diff --git a/pypy/rlib/streamio.py b/pypy/rlib/streamio.py
--- a/pypy/rlib/streamio.py
+++ b/pypy/rlib/streamio.py
@@ -496,29 +496,24 @@
         if bufsize == -1:     # Get default from the class
             bufsize = self.bufsize
         self.bufsize = bufsize  # buffer size (hint only)
-        self.lines = []         # ready-made lines (sans "\n")
-        self.buf = ""           # raw data (may contain "\n")
-        # Invariant: readahead == "\n".join(self.lines + [self.buf])
-        # self.lines contains no "\n"
-        # self.buf may contain "\n"
+        self.buf = ""           # raw data
+        self.pos = 0
 
     def flush_buffers(self):
-        if self.lines or self.buf:
+        if self.buf:
             try:
                 self.do_seek(self.tell(), 0)
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
                 self.buf = ""
+                self.pos = 0
 
     def tell(self):
-        bytes = self.do_tell()  # This may fail
-        offset = len(self.buf)
-        for line in self.lines:
-            offset += len(line) + 1
-        assert bytes >= offset #, (locals(), self.__dict__)
-        return bytes - offset
+        tellpos = self.do_tell()  # This may fail
+        offset = len(self.buf) - self.pos
+        assert tellpos >= offset #, (locals(), self.__dict__)
+        return tellpos - offset
 
     def seek(self, offset, whence):
         # This may fail on the do_seek() or do_tell() call.
@@ -526,32 +521,25 @@
         # Nor on a seek to the very end.
         if whence == 0:
             self.do_seek(offset, 0)
-            self.lines = []
             self.buf = ""
+            self.pos = 0
             return
         if whence == 1:
+            currentsize = len(self.buf) - self.pos
             if offset < 0:
-                self.do_seek(self.tell() + offset, 0)
-                self.lines = []
-                self.buf = ""
+                if self.pos + offset >= 0:
+                    self.pos += offset
+                else:
+                    self.do_seek(self.tell() + offset, 0)
+                    self.pos = 0
+                    self.buf = ""
                 return
-            while self.lines:
-                line = self.lines[-1]
-                if offset <= len(line):
-                    intoffset = intmask(offset)
-                    assert intoffset >= 0
-                    self.lines[-1] = line[intoffset:]
-                    return
-                offset -= len(self.lines[-1]) - 1
-                self.lines.pop()
-            assert not self.lines
-            if offset <= len(self.buf):
-                intoffset = intmask(offset)
-                assert intoffset >= 0
-                self.buf = self.buf[intoffset:]
+            elif offset <= currentsize:
+                self.pos += offset
                 return
-            offset -= len(self.buf)
             self.buf = ""
+            self.pos = 0
+            offset -= currentsize
             try:
                 self.do_seek(offset, 1)
             except MyNotImplementedError:
@@ -564,18 +552,18 @@
             except MyNotImplementedError:
                 pass
             else:
-                self.lines = []
+                self.pos = 0
                 self.buf = ""
                 return
             # Skip relative to EOF by reading and saving only just as
             # much as needed
             intoffset = offset2int(offset)
-            self.lines.reverse()
-            data = "\n".join(self.lines + [self.buf])
-            total = len(data)
-            buffers = [data]
-            self.lines = []
+            pos = self.pos
+            assert pos >= 0
+            buffers = [self.buf[pos:]]
+            total = len(buffers[0])
             self.buf = ""
+            self.pos = 0
             while 1:
                 data = self.do_read(self.bufsize)
                 if not data:
@@ -589,157 +577,101 @@
             if cutoff < 0:
                 raise StreamError("cannot seek back")
             if buffers:
+                assert cutoff >= 0
                 buffers[0] = buffers[0][cutoff:]
             self.buf = "".join(buffers)
-            self.lines = []
             return
+
         raise StreamError("whence should be 0, 1 or 2")
 
     def readall(self):
-        self.lines.reverse()
-        self.lines.append(self.buf)
-        more = ["\n".join(self.lines)]
-        self.lines = []
+        pos = self.pos
+        assert pos >= 0
+        chunks = [self.buf[pos:]]
         self.buf = ""
+        self.pos = 0
         bufsize = self.bufsize
         while 1:
             data = self.do_read(bufsize)
             if not data:
                 break
-            more.append(data)
+            chunks.append(data)
             bufsize = min(bufsize*2, self.bigsize)
-        return "".join(more)
+        return "".join(chunks)
 
-    def read(self, n):
+    def read(self, n=-1):
         assert isinstance(n, int)
-        assert n >= 0
-        if self.lines:
-            # See if this can be satisfied from self.lines[0]
-            line = self.lines[-1]
-            if len(line) >= n:
-                self.lines[-1] = line[n:]
-                return line[:n]
-
-            # See if this can be satisfied *without exhausting* self.lines
-            k = 0
-            i = 0
-            lgt = len(self.lines)
-            for linenum in range(lgt-1,-1,-1):
-                line = self.lines[linenum]
-                k += len(line)
-                if k >= n:
-                    lines = self.lines[linenum + 1:]
-                    data = self.lines[linenum]
-                    cutoff = len(data) - (k-n)
-                    assert cutoff >= 0
-                    lines.reverse()
-                    lines.append(data[:cutoff])
-                    del self.lines[linenum:]
-                    self.lines.append(data[cutoff:])
-                    return "\n".join(lines)
-                k += 1
-
-            # See if this can be satisfied from self.lines plus self.buf
-            if k + len(self.buf) >= n:
-                lines = self.lines
-                lines.reverse()
-                self.lines = []
-                cutoff = n - k
-                assert cutoff >= 0
-                lines.append(self.buf[:cutoff])
-                self.buf = self.buf[cutoff:]
-                return "\n".join(lines)
-
+        if n < 0:
+            return self.readall()
+        currentsize = len(self.buf) - self.pos
+        start = self.pos
+        assert start >= 0
+        if n <= currentsize:
+            stop = start + n
+            assert stop >= 0
+            result = self.buf[start:stop]
+            self.pos += n
+            return result
         else:
-            # See if this can be satisfied from self.buf
-            data = self.buf
-            k = len(data)
-            if k >= n:
-                cutoff = len(data) - (k-n)
-                assert cutoff >= 0
-                assert len(data) >= cutoff
-                self.buf = data[cutoff:]
-                return data[:cutoff]
-
-        lines = self.lines
-        lines.reverse()
-        self.lines = []
-        lines.append(self.buf)
-        self.buf = ""
-        data = "\n".join(lines)
-        more = [data]
-        k = len(data)
-        while k < n:
-            data = self.do_read(max(self.bufsize, n-k))
-            k += len(data)
-            more.append(data)
-            if not data:
-                break
-        cutoff = len(data) - (k-n)
-        assert cutoff >= 0
-        if len(data) <= cutoff:
-            self.buf = ""
-        else:
-            self.buf = data[cutoff:]
-            more[-1] = data[:cutoff]
-        return "".join(more)
-
-    # read_next_bunch is generally this, version below is slightly faster
-    #def _read_next_bunch(self):
-    #    self.lines = self.buf.split("\n")
-    #    self.buf = self.lines.pop()
-    #    self.lines.reverse()
-
-    def _read_next_bunch(self):
-        numlines = self.buf.count("\n")
-        self.lines = [None] * numlines
-        last = -1
-        num = numlines - 1
-        while True:
-            start = last + 1
-            assert start >= 0
-            next = self.buf.find("\n", start)
-            if next == -1:
-                if last != -1:
-                    self.buf = self.buf[start:]
-                break
-            assert next >= 0
-            self.lines[num] = self.buf[start:next]
-            last = next
-            num -= 1
+            chunks = [self.buf[start:]]
+            while 1:
+                self.buf = self.do_read(self.bufsize)
+                if not self.buf:
+                    self.pos = 0
+                    break
+                currentsize += len(self.buf)
+                if currentsize >= n:
+                    self.pos = len(self.buf) - (currentsize - n)
+                    stop = self.pos
+                    assert stop >= 0
+                    chunks.append(self.buf[:stop])
+                    break
+                chunks.append(self.buf)
+            return ''.join(chunks)
 
     def readline(self):
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        # This block is needed because read() can leave self.buf
-        # containing newlines
-        self._read_next_bunch()
-        if self.lines:
-            return self.lines.pop() + "\n"
-
-        if self.buf:
-            buf = [self.buf]
-        else:
-            buf = []
+        pos = self.pos
+        assert pos >= 0
+        i = self.buf.find("\n", pos)
+        start = self.pos
+        assert start >= 0
+        if i >= 0: # new line found
+            i += 1
+            result = self.buf[start:i]
+            self.pos = i
+            return result
+        temp = self.buf[start:]
+        # read one buffer and most of the time a new line will be found
+        self.buf = self.do_read(self.bufsize)
+        i = self.buf.find("\n")
+        if i >= 0: # new line found
+            i += 1
+            result = temp + self.buf[:i]
+            self.pos = i
+            return result
+        if not self.buf:
+            self.pos = 0
+            return temp
+        # need to keep getting data until we find a new line
+        chunks = [temp, self.buf]
         while 1:
             self.buf = self.do_read(self.bufsize)
-            self._read_next_bunch()
-            if self.lines:
-                buf.append(self.lines.pop())
-                buf.append("\n")
+            if not self.buf:
+                self.pos = 0
                 break
-            if not self.buf:
+            i = self.buf.find("\n")
+            if i >= 0:
+                i += 1
+                chunks.append(self.buf[:i])
+                self.pos = i
                 break
-            buf.append(self.buf)
-
-        return "".join(buf)
+            chunks.append(self.buf)
+        return "".join(chunks)
 
     def peek(self):
-        if self.lines:
-            return self.lines[-1] + "\n"
-        else:
-            return self.buf
+        pos = self.pos
+        assert pos >= 0
+        return self.buf[pos:]