[pypy-svn] r79602 - in pypy/branch/jit-free-asm/pypy: jit/backend jit/backend/llsupport jit/backend/llsupport/test jit/backend/test jit/backend/x86 jit/backend/x86/test rpython/lltypesystem rpython/memory/gctransform

arigo at codespeak.net
Sun Nov 28 11:49:52 CET 2010


Author: arigo
Date: Sun Nov 28 11:49:48 2010
New Revision: 79602

Added:
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/asmmemmgr.py
      - copied, changed from r79522, pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/codebuf.py
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_asmmemmgr.py
      - copied, changed from r79522, pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_codebuf.py
Removed:
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/codebuf.py
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_codebuf.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/codebuf.py
Modified:
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/gc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/llmodel.py
   pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_gc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/model.py
   pypy/branch/jit-free-asm/pypy/jit/backend/test/runner_test.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/assembler.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/regalloc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/regloc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/runner.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/rx86.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/support.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_assembler.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_gc_integration.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc2.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regloc.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_runner.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_rx86.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zll_random.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zmath.py
   pypy/branch/jit-free-asm/pypy/jit/backend/x86/valgrind.py
   pypy/branch/jit-free-asm/pypy/rpython/lltypesystem/llmemory.py
   pypy/branch/jit-free-asm/pypy/rpython/memory/gctransform/asmgcroot.py
Log:
This is a full implementation of freeing all backend-generated
memory as well.  There are many minor changes here and there, plus
the addition of the main new class, BlockBuilderMixin, used in the
x86 backend as a base class of MachineCodeBlockWrapper.  The main
change in the x86 backend is that machine code is no longer written
directly to mmapped memory (with all the messiness of detecting
when we are about to overflow); instead, it is written into a
temporary buffer that resizes as needed, and only at the end is it
copied into mmapped memory.
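
For illustration only (a sketch, not part of the commit; it assumes
a 'cpu' whose 'asmmemmgr' attribute is set up as in llmodel.py
below), the new scheme looks roughly like this:

    mc = MachineCodeBlockWrapper()   # a BlockBuilderMixin subclass
    mc.writechar('\x90')             # bytes go into a resizable buffer
    # ... emit the rest of the machine code ...
    allblocks = []                   # recorded as asmmemmgr_blocks
    rawstart = mc.materialize(cpu.asmmemmgr, allblocks)
    # only now is the buffer copied into executable memory; the
    # (start, stop) pair appended to 'allblocks' lets the cpu later
    # release the block with asmmemmgr.free()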


Copied: pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/asmmemmgr.py (from r79522, pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/codebuf.py)
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/codebuf.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/asmmemmgr.py	Sun Nov 28 11:49:48 2010
@@ -1,5 +1,5 @@
 import sys
-from pypy.rlib.rarithmetic import intmask, r_uint
+from pypy.rlib.rarithmetic import intmask, r_uint, LONG_BIT
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib import rmmap
 from pypy.rpython.lltypesystem import lltype, llmemory, rffi
@@ -9,18 +9,19 @@
     LARGE_ALLOC_SIZE = 1024 * 1024   # 1MB
     MIN_FRAGMENT = 64
     NUM_INDICES = 32     # good for all sizes between 64 bytes and ~490 KB
+    _allocated = None
 
     def __init__(self, large_alloc_size = LARGE_ALLOC_SIZE,
                        min_fragment     = MIN_FRAGMENT,
                        num_indices      = NUM_INDICES):
         self.total_memory_allocated = r_uint(0)
+        self.total_mallocs = r_uint(0)
         self.large_alloc_size = large_alloc_size
         self.min_fragment = min_fragment
         self.num_indices = num_indices
         self.free_blocks = {}      # map {start: stop}
         self.free_blocks_end = {}  # map {stop: start}
         self.blocks_by_size = [[] for i in range(self.num_indices)]
-        self._allocated = []
 
     def malloc(self, minsize, maxsize):
         """Allocate executable memory, between minsize and maxsize bytes,
@@ -31,12 +32,15 @@
         (start, stop) = result
         smaller_stop = start + maxsize
         if smaller_stop + self.min_fragment <= stop:
-            result = (start, smaller_stop)
             self._add_free_block(smaller_stop, stop)
+            stop = smaller_stop
+            result = (start, stop)
+        self.total_mallocs += stop - start
         return result   # pair (start, stop)
 
     def free(self, start, stop):
         """Free a block (start, stop) returned by a previous malloc()."""
+        self.total_mallocs -= (stop - start)
         self._add_free_block(start, stop)
 
     def _allocate_large_block(self, minsize):
@@ -48,6 +52,8 @@
         size = (size // self.large_alloc_size) * self.large_alloc_size
         data = rmmap.alloc(size)
         if not we_are_translated():
+            if self._allocated is None:
+                self._allocated = []
             self._allocated.append((data, size))
             if sys.maxint > 2147483647:
                 # Hack to make sure that mcs are not within 32-bits of one
@@ -128,5 +134,117 @@
 
     def _delete(self):
         "NOT_RPYTHON"
-        for data, size in self._allocated:
-            rmmap.free(data, size)
+        if self._allocated:
+            for data, size in self._allocated:
+                rmmap.free(data, size)
+        self._allocated = None
+
+
+class BlockBuilderMixin(object):
+    _mixin_ = True
+    # A base class to generate assembler.  It is equivalent to just a list
+    # of chars, but it is potentially more efficient for that usage.
+    # It works by allocating the assembler SUBBLOCK_SIZE bytes at a time.
+    # Ideally, this number should be a power of two that fits the GC's most
+    # compact allocation scheme (which is so far 35 * WORD for minimark.py).
+    WORD = LONG_BIT // 8
+    SUBBLOCK_SIZE = 32 * WORD
+    SUBBLOCK_PTR = lltype.Ptr(lltype.GcForwardReference())
+    SUBBLOCK = lltype.GcStruct('SUBBLOCK',
+                   ('prev', SUBBLOCK_PTR),
+                   ('data', lltype.FixedSizeArray(lltype.Char, SUBBLOCK_SIZE)))
+    SUBBLOCK_PTR.TO.become(SUBBLOCK)
+
+    gcroot_markers = None
+    gcroot_markers_total_size = 0
+
+    def __init__(self, translated=None):
+        if translated is None:
+            translated = we_are_translated()
+        if translated:
+            self.init_block_builder()
+        else:
+            self._become_a_plain_block_builder()
+
+    def init_block_builder(self):
+        self._cursubblock = lltype.nullptr(self.SUBBLOCK)
+        self._baserelpos = -self.SUBBLOCK_SIZE
+        self._make_new_subblock()
+
+    def _make_new_subblock(self):
+        nextsubblock = lltype.malloc(self.SUBBLOCK)
+        nextsubblock.prev = self._cursubblock
+        self._cursubblock = nextsubblock
+        self._cursubindex = 0
+        self._baserelpos += self.SUBBLOCK_SIZE
+    _make_new_subblock._dont_inline_ = True
+
+    def writechar(self, char):
+        index = self._cursubindex
+        if index == self.SUBBLOCK_SIZE:
+            self._make_new_subblock()
+            index = 0
+        self._cursubblock.data[index] = char
+        self._cursubindex = index + 1
+
+    def overwrite(self, index, char):
+        assert 0 <= index < self.get_relative_pos()
+        block = self._cursubblock
+        index -= self._baserelpos
+        while index < 0:
+            block = block.prev
+            index += self.SUBBLOCK_SIZE
+        block.data[index] = char
+
+    def get_relative_pos(self):
+        return self._baserelpos + self._cursubindex
+
+    def copy_to_raw_memory(self, addr):
+        # indirection for _become_a_plain_block_builder() and for subclasses
+        self._copy_to_raw_memory(addr)
+
+    def _copy_to_raw_memory(self, addr):
+        block = self._cursubblock
+        blocksize = self._cursubindex
+        targetindex = self._baserelpos
+        while targetindex >= 0:
+            dst = rffi.cast(rffi.CCHARP, addr + targetindex)
+            for j in range(blocksize):
+                dst[j] = block.data[j]
+            block = block.prev
+            blocksize = self.SUBBLOCK_SIZE
+            targetindex -= self.SUBBLOCK_SIZE
+        assert not block
+
+    def materialize(self, asmmemmgr, allblocks, gcrootmap=None):
+        size = self.get_relative_pos()
+        malloced = asmmemmgr.malloc(size, size)
+        allblocks.append(malloced)
+        rawstart = malloced[0]
+        self.copy_to_raw_memory(rawstart)
+        if self.gcroot_markers is not None:
+            assert gcrootmap is not None
+            gcrootmap.add_raw_gcroot_markers(asmmemmgr,
+                                             allblocks,
+                                             self.gcroot_markers,
+                                             self.gcroot_markers_total_size,
+                                             rawstart)
+        return rawstart
+
+    def _become_a_plain_block_builder(self):
+        # hack purely for speed of tests
+        self._data = []
+        self.writechar = self._data.append
+        self.overwrite = self._data.__setitem__
+        self.get_relative_pos = self._data.__len__
+        def plain_copy_to_raw_memory(addr):
+            dst = rffi.cast(rffi.CCHARP, addr)
+            for i, c in enumerate(self._data):
+                dst[i] = c
+        self._copy_to_raw_memory = plain_copy_to_raw_memory
+
+    def insert_gcroot_marker(self, mark):
+        if self.gcroot_markers is None:
+            self.gcroot_markers = []
+        self.gcroot_markers.append((self.get_relative_pos(), mark))
+        self.gcroot_markers_total_size += len(mark)
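
As a quick illustration of the accounting added above (a sketch,
not committed code): 'total_mallocs' tracks the net number of bytes
handed out, which is what the new test_free_loop_and_bridges in
runner_test.py relies on:

    mgr = AsmMemoryManager()
    start, stop = mgr.malloc(64, 128)   # want 64 to 128 usable bytes
    assert mgr.total_mallocs == stop - start
    mgr.free(start, stop)               # back to the free lists
    assert mgr.total_mallocs == 0       # net allocation is zero again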

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/gc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/gc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/gc.py	Sun Nov 28 11:49:48 2010
@@ -15,6 +15,7 @@
 from pypy.jit.backend.llsupport.descr import GcCache, get_field_descr
 from pypy.jit.backend.llsupport.descr import GcPtrFieldDescr
 from pypy.jit.backend.llsupport.descr import get_call_descr
+from pypy.rpython.memory.gctransform import asmgcroot
 
 # ____________________________________________________________
 
@@ -35,6 +36,8 @@
         return False
     def has_write_barrier_class(self):
         return None
+    def freeing_block(self, start, stop):
+        pass
 
 # ____________________________________________________________
 
@@ -218,50 +221,149 @@
     LOC_EBP_PLUS  = 2
     LOC_EBP_MINUS = 3
 
-    GCMAP_ARRAY = rffi.CArray(llmemory.Address)
+    GCMAP_ARRAY = rffi.CArray(lltype.Signed)
     CALLSHAPE_ARRAY = rffi.CArray(rffi.UCHAR)
 
     def __init__(self):
+        # '_gcmap' is an array of length '_gcmap_maxlength' of addresses.
+        # '_gcmap_curlength' tells how full the array really is.
+        # The addresses are actually grouped in pairs:
+        #     (addr-after-the-CALL-in-assembler, addr-of-the-call-shape).
+        # '_gcmap_deadentries' counts pairs marked dead (2nd item is NULL).
+        # '_gcmap_sorted' is True only if we know the array is sorted.
         self._gcmap = lltype.nullptr(self.GCMAP_ARRAY)
         self._gcmap_curlength = 0
         self._gcmap_maxlength = 0
+        self._gcmap_deadentries = 0
+        self._gcmap_sorted = True
 
     def initialize(self):
         # hack hack hack.  Remove these lines and see MissingRTypeAttribute
         # when the rtyper tries to annotate these methods only when GC-ing...
         self.gcmapstart()
         self.gcmapend()
+        self.gcmarksorted()
 
     def gcmapstart(self):
-        return llmemory.cast_ptr_to_adr(self._gcmap)
+        return rffi.cast(llmemory.Address, self._gcmap)
 
     def gcmapend(self):
         addr = self.gcmapstart()
         if self._gcmap_curlength:
-            addr += llmemory.sizeof(llmemory.Address)*self._gcmap_curlength
+            addr += rffi.sizeof(lltype.Signed) * self._gcmap_curlength
         return addr
 
-    def put(self, retaddr, callshapeaddr):
+    def gcmarksorted(self):
+        # Called by the GC when it is about to sort [gcmapstart():gcmapend()].
+        # Returns the previous sortedness flag -- i.e. returns True if it
+        # is already sorted, False if sorting is needed.
+        sorted = self._gcmap_sorted
+        self._gcmap_sorted = True
+        return sorted
+
+    @rgc.no_collect
+    def _put(self, retaddr, callshapeaddr):
         """'retaddr' is the address just after the CALL.
-        'callshapeaddr' is the address returned by encode_callshape()."""
+        'callshapeaddr' is the address of the raw 'shape' marker.
+        Both addresses are actually integers here."""
         index = self._gcmap_curlength
         if index + 2 > self._gcmap_maxlength:
-            self._enlarge_gcmap()
+            index = self._enlarge_gcmap()
         self._gcmap[index] = retaddr
         self._gcmap[index+1] = callshapeaddr
         self._gcmap_curlength = index + 2
+        self._gcmap_sorted = False
 
+    @rgc.no_collect
     def _enlarge_gcmap(self):
-        newlength = 250 + self._gcmap_maxlength * 2
-        newgcmap = lltype.malloc(self.GCMAP_ARRAY, newlength, flavor='raw',
-                                 track_allocation=False)   # YYY leak
         oldgcmap = self._gcmap
-        for i in range(self._gcmap_curlength):
-            newgcmap[i] = oldgcmap[i]
-        self._gcmap = newgcmap
-        self._gcmap_maxlength = newlength
-        if oldgcmap:
-            lltype.free(oldgcmap, flavor='raw', track_allocation=False)
+        if self._gcmap_deadentries * 3 * 2 > self._gcmap_maxlength:
+            # More than 1/3rd of the entries are dead.  Don't actually
+            # enlarge the gcmap table, but just clean up the dead entries.
+            newgcmap = oldgcmap
+        else:
+            # Normal path: enlarge the array.
+            newlength = 250 + (self._gcmap_maxlength // 3) * 4
+            newgcmap = lltype.malloc(self.GCMAP_ARRAY, newlength, flavor='raw',
+                                     track_allocation=False)
+            self._gcmap_maxlength = newlength
+        #
+        j = 0
+        i = 0
+        end = self._gcmap_curlength
+        while i < end:
+            if oldgcmap[i + 1]:
+                newgcmap[j] = oldgcmap[i]
+                newgcmap[j + 1] = oldgcmap[i + 1]
+                j += 2
+            i += 2
+        self._gcmap_curlength = j
+        self._gcmap_deadentries = 0
+        if oldgcmap != newgcmap:
+            self._gcmap = newgcmap
+            if oldgcmap:
+                lltype.free(oldgcmap, flavor='raw', track_allocation=False)
+        return j
+
+    def add_raw_gcroot_markers(self, asmmemmgr, allblocks,
+                               markers, total_size, rawstart):
+        """The interface is a bit custom, but this routine writes the
+        shapes of gcroots (for the GC to use) into raw memory."""
+        # xxx so far, we never try to share them.  But right now
+        # the amount of potential sharing would not be too large.
+        dst = 1
+        stop = 0
+        for relpos, shape in markers:
+            #
+            if dst + len(shape) > stop:
+                # No more space in the previous raw block,
+                # allocate a raw block of memory big enough to fit
+                # as many of the remaining 'shapes' as possible
+                start, stop = asmmemmgr.malloc(len(shape), total_size)
+                # add the raw block to 'compiled_loop_token.asmmemmgr_blocks'
+                allblocks.append((start, stop))
+                dst = start
+            #
+            # add the entry 'pos_after_call -> dst' to the table
+            self._put(rawstart + relpos, dst)
+            # Copy 'shape' into the raw memory, reversing the order
+            # of the bytes.  Similar to compress_callshape() in
+            # trackgcroot.py.
+            total_size -= len(shape)
+            src = len(shape) - 1
+            while src >= 0:
+                rffi.cast(rffi.CCHARP, dst)[0] = shape[src]
+                dst += 1
+                src -= 1
+
+    @rgc.no_collect
+    def freeing_block(self, start, stop):
+        # if [start:stop] is a raw block of assembler, then look up the
+        # corresponding gcroot markers, and mark them as freed now in
+        # self._gcmap by setting the 2nd address of every entry to NULL.
+        gcmapstart = self.gcmapstart()
+        gcmapend   = self.gcmapend()
+        if gcmapstart == gcmapend:
+            return
+        if not self.gcmarksorted():
+            asmgcroot.sort_gcmap(gcmapstart, gcmapend)
+        # A note about gcmarksorted(): the deletion we do here keeps the
+        # array sorted.  This avoids needing too many sort_gcmap()s.
+        # Indeed, freeing_block() is typically called many times in a row,
+        # so it will call sort_gcmap() at most the first time.
+        startaddr = rffi.cast(llmemory.Address, start)
+        stopaddr  = rffi.cast(llmemory.Address, stop)
+        item = asmgcroot.binary_search(gcmapstart, gcmapend, startaddr)
+        # 'item' points to one of the entries.  Because the whole array
+        # is sorted, we know that it points either to the first entry we
+        # want to kill, or to the previous entry.
+        while item.address[0] < stopaddr:
+            if item.address[0] >= startaddr:
+                item.address[1] = llmemory.NULL
+                self._gcmap_deadentries += 1
+            if item == gcmapend:
+                break
+            item += asmgcroot.arrayitemsize
 
     def get_basic_shape(self, is_64_bit=False):
         # XXX: Should this code even really know about stack frame layout of
@@ -304,18 +406,6 @@
         assert reg_index > 0
         shape.append(chr(self.LOC_REG | (reg_index << 2)))
 
-    def compress_callshape(self, shape):
-        # Similar to compress_callshape() in trackgcroot.py.
-        # XXX so far, we always allocate a new small array (we could regroup
-        # them inside bigger arrays) and we never try to share them.
-        length = len(shape)
-        compressed = lltype.malloc(self.CALLSHAPE_ARRAY, length,
-                                   flavor='raw',
-                                   track_allocation=False)   # YYY leak
-        for i in range(length):
-            compressed[length-1-i] = rffi.cast(rffi.UCHAR, shape[i])
-        return llmemory.cast_ptr_to_adr(compressed)
-
 
 class WriteBarrierDescr(AbstractDescr):
     def __init__(self, gc_ll_descr):
@@ -379,6 +469,7 @@
             'layoutbuilder': self.layoutbuilder,
             'gcmapstart': lambda: gcrootmap.gcmapstart(),
             'gcmapend': lambda: gcrootmap.gcmapend(),
+            'gcmarksorted': lambda: gcrootmap.gcmarksorted(),
             }
         self.GCClass = self.layoutbuilder.GCClass
         self.moving_gc = self.GCClass.moving_gc
@@ -641,6 +732,9 @@
     def has_write_barrier_class(self):
         return WriteBarrierDescr
 
+    def freeing_block(self, start, stop):
+        self.gcrootmap.freeing_block(start, stop)
+
 # ____________________________________________________________
 
 def get_ll_description(gcdescr, translator=None, rtyper=None):
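
A minimal pure-Python model of the gcmap bookkeeping above may help
(illustrative only; the real table is a raw array of Signed, and the
compaction happens inside _enlarge_gcmap() when more than 1/3rd of
the entries are dead):

    gcmap = []   # flattened pairs: [retaddr0, shapeaddr0, retaddr1, ...]
    dead = [0]   # number of pairs whose shape address was NULLed out

    def put(retaddr, shapeaddr):
        gcmap.extend([retaddr, shapeaddr])

    def freeing_block(start, stop):
        # kill every entry whose return address is in [start, stop)
        for i in range(0, len(gcmap), 2):
            if start <= gcmap[i] < stop and gcmap[i + 1]:
                gcmap[i + 1] = 0        # like storing llmemory.NULL
                dead[0] += 1

    def compact():
        # keep only the live pairs, as _enlarge_gcmap() does in-place
        live = []
        for i in range(0, len(gcmap), 2):
            if gcmap[i + 1]:
                live.extend([gcmap[i], gcmap[i + 1]])
        gcmap[:] = live
        dead[0] = 0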

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/llmodel.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/llmodel.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/llmodel.py	Sun Nov 28 11:49:48 2010
@@ -18,6 +18,7 @@
 from pypy.jit.backend.llsupport.descr import BaseIntCallDescr, GcPtrCallDescr
 from pypy.jit.backend.llsupport.descr import FloatCallDescr, VoidCallDescr
 from pypy.jit.backend.llsupport.ffisupport import get_call_descr_dynamic
+from pypy.jit.backend.llsupport.asmmemmgr import AsmMemoryManager
 from pypy.rpython.annlowlevel import cast_instance_to_base_ptr
 
 
@@ -52,6 +53,7 @@
         else:
             self._setup_exception_handling_untranslated()
         self.saved_exc_value = lltype.nullptr(llmemory.GCREF.TO)
+        self.asmmemmgr = AsmMemoryManager()
         self.setup()
         if translate_support_code:
             self._setup_on_leave_jitted_translated()
@@ -177,6 +179,15 @@
         self.saved_exc_value = lltype.nullptr(llmemory.GCREF.TO)
         return exc
 
+    def free_loop_and_bridges(self, compiled_loop_token):
+        AbstractCPU.free_loop_and_bridges(self, compiled_loop_token)
+        blocks = compiled_loop_token.asmmemmgr_blocks
+        if blocks is not None:
+            compiled_loop_token.asmmemmgr_blocks = None
+            for rawstart, rawstop in blocks:
+                self.gc_ll_descr.freeing_block(rawstart, rawstop)
+                self.asmmemmgr.free(rawstart, rawstop)
+
     # ------------------- helpers and descriptions --------------------
 
     @staticmethod

Copied: pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_asmmemmgr.py (from r79522, pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_codebuf.py)
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_codebuf.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_asmmemmgr.py	Sun Nov 28 11:49:48 2010
@@ -1,5 +1,7 @@
 import random
-from pypy.jit.backend.llsupport.codebuf import AsmMemoryManager
+from pypy.jit.backend.llsupport.asmmemmgr import AsmMemoryManager
+from pypy.jit.backend.llsupport.asmmemmgr import BlockBuilderMixin
+from pypy.rpython.lltypesystem import lltype, rffi
 
 
 def test_get_index():
@@ -147,3 +149,69 @@
                     # than 131072.  Be reasonable and allow up to 147456.
                     assert new_total <= 147456
                     prev_total = new_total
+
+    def test_insert_gcroot_marker(self):
+        class FakeGcRootMap:
+            def add_raw_gcroot_markers(self, asmmemmgr, allblocks, markers,
+                                       total_size, rawstart):
+                self.asmmemmgr = asmmemmgr
+                self.allblocks = allblocks
+                self.markers = markers
+                self.total_size = total_size
+                self.rawstart = rawstart
+        #
+        mc = BlockBuilderMixin()
+        mc.writechar('X')
+        mc.writechar('x')
+        mc.insert_gcroot_marker(['a', 'b', 'c', 'd'])
+        mc.writechar('Y')
+        mc.writechar('y')
+        mc.insert_gcroot_marker(['e', 'f', 'g'])
+        mc.writechar('Z')
+        mc.writechar('z')
+        #
+        gcrootmap = FakeGcRootMap()
+        allblocks = []
+        rawstart = mc.materialize(self.memmgr, allblocks, gcrootmap)
+        p = rffi.cast(rffi.CArrayPtr(lltype.Char), rawstart)
+        assert p[0] == 'X'
+        assert p[1] == 'x'
+        assert p[2] == 'Y'
+        assert p[3] == 'y'
+        assert p[4] == 'Z'
+        assert p[5] == 'z'
+        assert allblocks == [(rawstart, rawstart + 6)]
+        assert gcrootmap.markers == [(2, ['a', 'b', 'c', 'd']),
+                                     (4, ['e', 'f', 'g'])]
+        assert gcrootmap.total_size == 4 + 3
+        assert gcrootmap.rawstart == rawstart
+
+
+def test_blockbuildermixin(translated=True):
+    mc = BlockBuilderMixin(translated)
+    for i in range(mc.SUBBLOCK_SIZE * 2 + 3):
+        assert mc.get_relative_pos() == i
+        mc.writechar(chr(i % 255))
+    if translated:
+        assert mc._cursubindex == 3
+        assert mc._cursubblock
+        assert mc._cursubblock.prev
+        assert mc._cursubblock.prev.prev
+        assert not mc._cursubblock.prev.prev.prev
+    #
+    for i in range(0, mc.SUBBLOCK_SIZE * 2 + 3, 2):
+        mc.overwrite(i, chr((i + 63) % 255))
+    #
+    p = lltype.malloc(rffi.CCHARP.TO, mc.SUBBLOCK_SIZE * 2 + 3, flavor='raw')
+    addr = rffi.cast(lltype.Signed, p)
+    mc.copy_to_raw_memory(addr)
+    #
+    for i in range(mc.SUBBLOCK_SIZE * 2 + 3):
+        if i & 1:
+            assert p[i] == chr(i % 255)
+        else:
+            assert p[i] == chr((i + 63) % 255)
+    lltype.free(p, flavor='raw')
+
+def test_blockbuildermixin2():
+    test_blockbuildermixin(translated=False)

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_gc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_gc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/llsupport/test/test_gc.py	Sun Nov 28 11:49:48 2010
@@ -9,6 +9,7 @@
 from pypy.jit.tool.oparser import parse
 from pypy.rpython.lltypesystem.rclass import OBJECT, OBJECT_VTABLE
 from pypy.jit.metainterp.test.test_optimizeopt import equaloplists
+from pypy.rpython.memory.gctransform import asmgcroot
 
 def test_boehm():
     gc_ll_descr = GcLLDescr_boehm(None, None, None)
@@ -62,58 +63,214 @@
     for i in range(len(allocs)):
         assert addrs[i].address[0] == llmemory.cast_ptr_to_adr(allocs[i])
 
-def test_GcRootMap_asmgcc():
-    def frame_pos(n):
-        return -4*(4+n)
-    gcrootmap = GcRootMap_asmgcc()
-    num1 = frame_pos(-5)
-    num1a = num1|2
-    num2 = frame_pos(55)
-    num2a = ((-num2|3) >> 7) | 128
-    num2b = (-num2|3) & 127
-    shape = gcrootmap.get_basic_shape()
-    gcrootmap.add_ebp_offset(shape, num1)
-    gcrootmap.add_ebp_offset(shape, num2)
-    assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a])
-    gcrootmap.add_callee_save_reg(shape, 1)
-    assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
-                              4])
-    gcrootmap.add_callee_save_reg(shape, 2)
-    assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
-                              4, 8])
-    gcrootmap.add_callee_save_reg(shape, 3)
-    assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
-                              4, 8, 12])
-    gcrootmap.add_callee_save_reg(shape, 4)
-    assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
-                              4, 8, 12, 16])
-    #
-    shapeaddr = gcrootmap.compress_callshape(shape)
-    PCALLSHAPE = lltype.Ptr(GcRootMap_asmgcc.CALLSHAPE_ARRAY)
-    p = llmemory.cast_adr_to_ptr(shapeaddr, PCALLSHAPE)
-    for i, expected in enumerate([16, 12, 8, 4,
-                                  num2a, num2b, num1a, 0, 2, 15, 11, 7, 6]):
-        assert p[i] == expected
-    #
-    retaddr = rffi.cast(llmemory.Address, 1234567890)
-    gcrootmap.put(retaddr, shapeaddr)
-    assert gcrootmap._gcmap[0] == retaddr
-    assert gcrootmap._gcmap[1] == shapeaddr
-    assert gcrootmap.gcmapstart().address[0] == retaddr
-    #
-    # the same as before, but enough times to trigger a few resizes
-    expected_shapeaddr = {}
-    for i in range(1, 700):
+class TestGcRootMapAsmGcc:
+
+    def test_make_shapes(self):
+        def frame_pos(n):
+            return -4*(4+n)
+        gcrootmap = GcRootMap_asmgcc()
+        num1 = frame_pos(-5)
+        num1a = num1|2
+        num2 = frame_pos(55)
+        num2a = ((-num2|3) >> 7) | 128
+        num2b = (-num2|3) & 127
         shape = gcrootmap.get_basic_shape()
-        gcrootmap.add_ebp_offset(shape, frame_pos(i))
-        shapeaddr = gcrootmap.compress_callshape(shape)
-        expected_shapeaddr[i] = shapeaddr
-        retaddr = rffi.cast(llmemory.Address, 123456789 + i)
-        gcrootmap.put(retaddr, shapeaddr)
-    for i in range(1, 700):
-        expected_retaddr = rffi.cast(llmemory.Address, 123456789 + i)
-        assert gcrootmap._gcmap[i*2+0] == expected_retaddr
-        assert gcrootmap._gcmap[i*2+1] == expected_shapeaddr[i]
+        gcrootmap.add_ebp_offset(shape, num1)
+        gcrootmap.add_ebp_offset(shape, num2)
+        assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a])
+        gcrootmap.add_callee_save_reg(shape, 1)
+        assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
+                                  4])
+        gcrootmap.add_callee_save_reg(shape, 2)
+        assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
+                                  4, 8])
+        gcrootmap.add_callee_save_reg(shape, 3)
+        assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
+                                  4, 8, 12])
+        gcrootmap.add_callee_save_reg(shape, 4)
+        assert shape == map(chr, [6, 7, 11, 15, 2, 0, num1a, num2b, num2a,
+                                  4, 8, 12, 16])
+
+    def test_put_basic(self):
+        gcrootmap = GcRootMap_asmgcc()
+        retaddr = 1234567890
+        shapeaddr = 51627384
+        gcrootmap._put(retaddr, shapeaddr)
+        assert gcrootmap._gcmap[0] == retaddr
+        assert gcrootmap._gcmap[1] == shapeaddr
+        p = rffi.cast(rffi.LONGP, gcrootmap.gcmapstart())
+        assert p[0] == retaddr
+        assert (gcrootmap.gcmapend() ==
+                gcrootmap.gcmapstart() + rffi.sizeof(lltype.Signed) * 2)
+
+    def test_put_resize(self):
+        # the same as before, but enough times to trigger a few resizes
+        gcrootmap = GcRootMap_asmgcc()
+        for i in range(700):
+            shapeaddr = i * 100 + 1
+            retaddr = 123456789 + i
+            gcrootmap._put(retaddr, shapeaddr)
+        for i in range(700):
+            assert gcrootmap._gcmap[i*2+0] == 123456789 + i
+            assert gcrootmap._gcmap[i*2+1] == i * 100 + 1
+
+    def test_remove_nulls(self):
+        expected = []
+        def check():
+            assert gcrootmap._gcmap_curlength == len(expected) * 2
+            for i, (a, b) in enumerate(expected):
+                assert gcrootmap._gcmap[i*2] == a
+                assert gcrootmap._gcmap[i*2+1] == b
+        #
+        gcrootmap = GcRootMap_asmgcc()
+        for i in range(700):
+            shapeaddr = i * 100       # 0 if i == 0
+            retaddr = 123456789 + i
+            gcrootmap._put(retaddr, shapeaddr)
+            if shapeaddr != 0:
+                expected.append((retaddr, shapeaddr))
+        # at the first resize, the 0 should be removed
+        check()
+        for repeat in range(10):
+            # now clear up half the entries
+            assert len(expected) == 699
+            for i in range(0, len(expected), 2):
+                gcrootmap._gcmap[i*2+1] = 0
+                gcrootmap._gcmap_deadentries += 1
+            expected = expected[1::2]
+            assert gcrootmap._gcmap_deadentries*6 > gcrootmap._gcmap_maxlength
+            # check that we can again insert 350 entries without a resize
+            oldgcmap = gcrootmap._gcmap
+            for i in range(0, 699, 2):
+                gcrootmap._put(515151 + i + repeat, 626262 + i)
+                expected.append((515151 + i + repeat, 626262 + i))
+            assert gcrootmap._gcmap == oldgcmap
+            check()
+
+    def test_add_raw_gcroot_markers_maxalloc(self):
+        class FakeAsmMemMgr:
+            def malloc(self, minsize, maxsize):
+                assert minsize == 4
+                assert maxsize == 7
+                return (prawstart, prawstart + 8)
+        put = []
+        def fakeput(a, b):
+            put.append((a, b))
+        gcrootmap = GcRootMap_asmgcc()
+        gcrootmap._put = fakeput
+        memmgr = FakeAsmMemMgr()
+        allblocks = []
+        p = lltype.malloc(rffi.CArray(lltype.Char), 7, immortal=True)
+        prawstart = rffi.cast(lltype.Signed, p)
+        gcrootmap.add_raw_gcroot_markers(memmgr, allblocks,
+                                         [(2, ['a', 'b', 'c', 'd']),
+                                          (4, ['e', 'f', 'g'])],
+                                         4 + 3, 1200000)
+        assert allblocks == [(prawstart, prawstart + 8)]
+        assert ''.join([p[i] for i in range(7)]) == 'dcbagfe'
+        assert put == [(1200002, prawstart),
+                       (1200004, prawstart + 4)]
+
+    def test_add_raw_gcroot_markers_minalloc(self):
+        class FakeAsmMemMgr:
+            callnum = 0
+            def malloc(self, minsize, maxsize):
+                self.callnum += 1
+                if self.callnum == 1:
+                    assert minsize == 4
+                    assert maxsize == 7
+                    return (prawstart, prawstart + 6)
+                elif self.callnum == 2:
+                    assert minsize == 3
+                    assert maxsize == 3
+                    return (qrawstart, qrawstart + 5)
+                else:
+                    raise AssertionError
+        put = []
+        def fakeput(a, b):
+            put.append((a, b))
+        gcrootmap = GcRootMap_asmgcc()
+        gcrootmap._put = fakeput
+        memmgr = FakeAsmMemMgr()
+        allblocks = []
+        p = lltype.malloc(rffi.CArray(lltype.Char), 6, immortal=True)
+        prawstart = rffi.cast(lltype.Signed, p)
+        q = lltype.malloc(rffi.CArray(lltype.Char), 5, immortal=True)
+        qrawstart = rffi.cast(lltype.Signed, q)
+        gcrootmap.add_raw_gcroot_markers(memmgr, allblocks,
+                                         [(2, ['a', 'b', 'c', 'd']),
+                                          (4, ['e', 'f', 'g'])],
+                                         4 + 3, 1200000)
+        assert allblocks == [(prawstart, prawstart + 6),
+                             (qrawstart, qrawstart + 5)]
+        assert ''.join([p[i] for i in range(4)]) == 'dcba'
+        assert ''.join([q[i] for i in range(3)]) == 'gfe'
+        assert put == [(1200002, prawstart),
+                       (1200004, qrawstart)]
+
+    def test_freeing_block(self):
+        from pypy.jit.backend.llsupport import gc
+        class Asmgcroot:
+            arrayitemsize = 2 * llmemory.sizeof(llmemory.Address)
+            sort_count = 0
+            def sort_gcmap(self, gcmapstart, gcmapend):
+                self.sort_count += 1
+            def binary_search(self, gcmapstart, gcmapend, startaddr):
+                i = 0
+                while (i < gcrootmap._gcmap_curlength//2 and
+                       gcrootmap._gcmap[i*2] < startaddr):
+                    i += 1
+                if i > 0:
+                    i -= 1
+                assert 0 <= i < gcrootmap._gcmap_curlength//2
+                p = rffi.cast(rffi.CArrayPtr(llmemory.Address), gcmapstart)
+                p = rffi.ptradd(p, 2*i)
+                return llmemory.cast_ptr_to_adr(p)
+        saved = gc.asmgcroot
+        try:
+            gc.asmgcroot = Asmgcroot()
+            #
+            gcrootmap = GcRootMap_asmgcc()
+            gcrootmap._gcmap = lltype.malloc(gcrootmap.GCMAP_ARRAY,
+                                             1400, flavor='raw',
+                                             immortal=True)
+            for i in range(700):
+                gcrootmap._gcmap[i*2] = 1200000 + i
+                gcrootmap._gcmap[i*2+1] = i * 100 + 1
+            assert gcrootmap._gcmap_deadentries == 0
+            assert gc.asmgcroot.sort_count == 0
+            gcrootmap._gcmap_maxlength = 700
+            gcrootmap._gcmap_curlength = 700
+            gcrootmap._gcmap_sorted = False
+            #
+            gcrootmap.freeing_block(1200000 - 100, 1200000)
+            assert gcrootmap._gcmap_deadentries == 0
+            assert gc.asmgcroot.sort_count == 1
+            #
+            gcrootmap.freeing_block(1200000 + 100, 1200000 + 200)
+            assert gcrootmap._gcmap_deadentries == 100
+            assert gc.asmgcroot.sort_count == 1
+            for i in range(700):
+                if 100 <= i < 200:
+                    expected = 0
+                else:
+                    expected = i * 100 + 1
+                assert gcrootmap._gcmap[i*2] == 1200000 + i
+                assert gcrootmap._gcmap[i*2+1] == expected
+            #
+            gcrootmap.freeing_block(1200000 + 650, 1200000 + 750)
+            assert gcrootmap._gcmap_deadentries == 150
+            assert gc.asmgcroot.sort_count == 1
+            for i in range(700):
+                if 100 <= i < 200 or 650 <= i:
+                    expected = 0
+                else:
+                    expected = i * 100 + 1
+                assert gcrootmap._gcmap[i*2] == 1200000 + i
+                assert gcrootmap._gcmap[i*2+1] == expected
+        #
+        finally:
+            gc.asmgcroot = saved
 
 
 class FakeLLOp(object):

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/model.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/model.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/model.py	Sun Nov 28 11:49:48 2010
@@ -139,10 +139,14 @@
         # resume descrs are the largest consumers of memory (about 3x
         # more than the assembler, in the case of the x86 backend).
         lst = self.fail_descr_list
-        for n in compiled_loop_token.faildescr_indices:
+        # We expect 'compiled_loop_token' to be itself garbage-collected soon,
+        # but better safe than sorry: be ready to handle several calls to
+        # free_loop_and_bridges() for the same compiled_loop_token.
+        faildescr_indices = compiled_loop_token.faildescr_indices
+        compiled_loop_token.faildescr_indices = []
+        for n in faildescr_indices:
             lst[n] = None
-        self.fail_descr_free_list.extend(compiled_loop_token.faildescr_indices)
-        # We expect 'compiled_loop_token' to be itself garbage-collected soon.
+        self.fail_descr_free_list.extend(faildescr_indices)
 
     @staticmethod
     def sizeof(S):
@@ -271,6 +275,9 @@
 
 
 class CompiledLoopToken(object):
+    asmmemmgr_blocks = None
+    asmmemmgr_gcroots = 0
+
     def __init__(self, cpu, number):
         cpu.total_compiled_loops += 1
         self.cpu = cpu
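
(Side note, illustrative only: the snapshot-and-clear of
'faildescr_indices' above is what makes repeated frees harmless.)

    clt = looptoken.compiled_loop_token
    cpu.free_loop_and_bridges(clt)   # releases descrs, clears the list
    cpu.free_loop_and_bridges(clt)   # safe: iterates an empty list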

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/test/runner_test.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/test/runner_test.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/test/runner_test.py	Sun Nov 28 11:49:48 2010
@@ -211,6 +211,7 @@
 
         assert self.cpu.total_compiled_loops == 1
         assert self.cpu.total_compiled_bridges == 1
+        return looptoken
 
     def test_compile_bridge_with_holes(self):
         i0 = BoxInt()
@@ -2229,6 +2230,17 @@
             assert res.value == expected, (
                 "%r: got %r, expected %r" % (RESTYPE, res.value, expected))
 
+    def test_free_loop_and_bridges(self):
+        if hasattr(self.cpu, 'setup_once'):
+            self.cpu.setup_once()
+        mem0 = self.cpu.asmmemmgr.total_mallocs
+        looptoken = self.test_compile_bridge()
+        mem1 = self.cpu.asmmemmgr.total_mallocs
+        self.cpu.free_loop_and_bridges(looptoken.compiled_loop_token)
+        mem2 = self.cpu.asmmemmgr.total_mallocs
+        assert mem2 < mem1
+        assert mem2 == mem0
+
 
 class OOtypeBackendTest(BaseBackendTest):
 

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/assembler.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/assembler.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/assembler.py	Sun Nov 28 11:49:48 2010
@@ -6,7 +6,6 @@
 from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
 from pypy.rpython.lltypesystem.lloperation import llop
 from pypy.rpython.annlowlevel import llhelper
-from pypy.tool.uid import fixid
 from pypy.jit.backend.model import CompiledLoopToken
 from pypy.jit.backend.x86.regalloc import (RegAlloc, X86RegisterManager,
                                            X86XMMRegisterManager, get_ebp_ofs,
@@ -31,6 +30,7 @@
 from pypy.jit.backend.x86 import rx86, regloc, codebuf
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.backend.x86.support import values_array
+from pypy.jit.backend.x86 import support
 from pypy.rlib.debug import debug_print, debug_start, debug_stop
 from pypy.rlib import rgc
 from pypy.jit.backend.x86.jump import remap_frame_layout
@@ -43,122 +43,17 @@
 def align_stack_words(words):
     return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
 
-class MachineCodeBlockWrapper(object):
-    MC_DEFAULT_SIZE = 1024*1024
-
-    def __init__(self, assembler, bigsize, profile_agent=None):
-        self.assembler = assembler
-        self.old_mcs = [] # keepalive
-        self.bigsize = bigsize
-        self._mc = self._instantiate_mc()
-        self.function_name = None
-        self.profile_agent = profile_agent
-        self.reset_reserved_bytes()
-
-    def _instantiate_mc(self): # hook for testing
-        return codebuf.MachineCodeBlock(self.bigsize)
-
-    def ensure_bytes_available(self, num_bytes):
-        if self.bytes_free() <= (self._reserved_bytes + num_bytes):
-            self.make_new_mc()
-
-    def reserve_bytes(self, num_bytes):
-        self.ensure_bytes_available(num_bytes)
-        self._reserved_bytes += num_bytes
-
-    def reset_reserved_bytes(self):
-        # XXX er.... pretty random number, just to be sure
-        #     not to write half-instruction
-        self._reserved_bytes = 64
-
-    def get_relative_pos(self):
-        return self._mc.get_relative_pos()
-
-    def overwrite(self, pos, listofchars):
-        return self._mc.overwrite(pos, listofchars)
-
-    def bytes_free(self):
-        return self._mc._size - self._mc.get_relative_pos()
-
-    def start_function(self, name):
-        self.function_name = name
-        self.start_pos = self._mc.get_relative_pos()
-
-    def end_function(self, done=True):
-        assert self.function_name is not None
-        size = self._mc.get_relative_pos() - self.start_pos
-        address = self.tell() - size
-        if self.profile_agent is not None:
-            self.profile_agent.native_code_written(self.function_name,
-                                                   address, size)
-        if done:
-            self.function_name = None
-
-    def make_new_mc(self):
-        new_mc = self._instantiate_mc()
-        debug_print('[new machine code block at', new_mc.tell(), ']')
-
-        if IS_X86_64:
-            # The scratch register is sometimes used as a temporary
-            # register, but the JMP below might clobber it. Rather than risk
-            # subtle bugs, we preserve the scratch register across the jump.
-            self._mc.PUSH_r(X86_64_SCRATCH_REG.value)
-            
-        self._mc.JMP(imm(new_mc.tell()))
-
-        if IS_X86_64:
-            # Restore scratch reg
-            new_mc.POP_r(X86_64_SCRATCH_REG.value)
-
-        if self.function_name is not None:
-            self.end_function(done=False)
-            self.start_pos = new_mc.get_relative_pos()
-
-        self.assembler.write_pending_failure_recoveries()
-
-        self._mc.done()
-        self.old_mcs.append(self._mc)
-        self._mc = new_mc
-    make_new_mc._dont_inline_ = True
-
-    def tell(self):
-        return self._mc.tell()
-
-    def done(self):
-        self._mc.done()
-
-def _new_method(name):
-    def method(self, *args):
-        if self.bytes_free() < self._reserved_bytes:
-            self.make_new_mc()
-        getattr(self._mc, name)(*args)    
-    method.func_name = name
-    return method
-
-for _name in rx86.all_instructions + regloc.all_extra_instructions:
-    setattr(MachineCodeBlockWrapper, _name, _new_method(_name))
-
-for name in dir(codebuf.MachineCodeBlock):
-    if name.upper() == name or name == "writechr":
-        setattr(MachineCodeBlockWrapper, name, _new_method(name))
 
 class GuardToken(object):
-    def __init__(self, faildescr, failargs, fail_locs, exc, desc_bytes):
+    def __init__(self, faildescr, failargs, fail_locs, exc):
         self.faildescr = faildescr
         self.failargs = failargs
         self.fail_locs = fail_locs
         self.exc = exc
-        self.desc_bytes = desc_bytes
-
-    def recovery_stub_size(self):
-        # XXX: 32 is pulled out of the air
-        return 32 + len(self.desc_bytes)
 
 DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed))
 
 class Assembler386(object):
-    mc = None
-    mc_size = MachineCodeBlockWrapper.MC_DEFAULT_SIZE
     _float_constants = None
     _regalloc = None
     _output_loop_log = None
@@ -177,18 +72,17 @@
         self.fail_boxes_float = values_array(lltype.Float, failargs_limit)
         self.fail_ebp = 0
         self.loop_run_counters = []
-        # if we have 10000 loops, we have some other problems I guess
         self.float_const_neg_addr = 0
         self.float_const_abs_addr = 0
         self.malloc_fixedsize_slowpath1 = 0
         self.malloc_fixedsize_slowpath2 = 0
-        self.pending_guard_tokens = None
         self.memcpy_addr = 0
         self.setup_failure_recovery()
         self._debug = False
         self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
         self.fail_boxes_count = 0
         self._current_depths_cache = (0, 0)
+        self.teardown()
 
     def leave_jitted_hook(self):
         ptrs = self.fail_boxes_ptr.ar
@@ -198,54 +92,57 @@
     def set_debug(self, v):
         self._debug = v
 
+    def setup_once(self):
+        # the address of the function called by 'new'
+        gc_ll_descr = self.cpu.gc_ll_descr
+        gc_ll_descr.initialize()
+        ll_new = gc_ll_descr.get_funcptr_for_new()
+        self.malloc_func_addr = rffi.cast(lltype.Signed, ll_new)
+        if gc_ll_descr.get_funcptr_for_newarray is not None:
+            ll_new_array = gc_ll_descr.get_funcptr_for_newarray()
+            self.malloc_array_func_addr = rffi.cast(lltype.Signed,
+                                                    ll_new_array)
+        if gc_ll_descr.get_funcptr_for_newstr is not None:
+            ll_new_str = gc_ll_descr.get_funcptr_for_newstr()
+            self.malloc_str_func_addr = rffi.cast(lltype.Signed,
+                                                  ll_new_str)
+        if gc_ll_descr.get_funcptr_for_newunicode is not None:
+            ll_new_unicode = gc_ll_descr.get_funcptr_for_newunicode()
+            self.malloc_unicode_func_addr = rffi.cast(lltype.Signed,
+                                                      ll_new_unicode)
+        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
+        self._build_failure_recovery(False)
+        self._build_failure_recovery(True)
+        if self.cpu.supports_floats:
+            self._build_failure_recovery(False, withfloats=True)
+            self._build_failure_recovery(True, withfloats=True)
+            support.ensure_sse2_floats()
+            self._build_float_constants()
+        if hasattr(gc_ll_descr, 'get_malloc_fixedsize_slowpath_addr'):
+            self._build_malloc_fixedsize_slowpath()
+        s = os.environ.get('PYPYLOG')
+        if s:
+            if s.find(':') != -1:
+                s = s.split(':')[-1]
+            self.set_debug(True)
+
     def setup(self):
-        if self.mc is None:
-            # the address of the function called by 'new'
-            gc_ll_descr = self.cpu.gc_ll_descr
-            gc_ll_descr.initialize()
-            ll_new = gc_ll_descr.get_funcptr_for_new()
-            self.malloc_func_addr = rffi.cast(lltype.Signed, ll_new)
-            if gc_ll_descr.get_funcptr_for_newarray is not None:
-                ll_new_array = gc_ll_descr.get_funcptr_for_newarray()
-                self.malloc_array_func_addr = rffi.cast(lltype.Signed,
-                                                        ll_new_array)
-            if gc_ll_descr.get_funcptr_for_newstr is not None:
-                ll_new_str = gc_ll_descr.get_funcptr_for_newstr()
-                self.malloc_str_func_addr = rffi.cast(lltype.Signed,
-                                                      ll_new_str)
-            if gc_ll_descr.get_funcptr_for_newunicode is not None:
-                ll_new_unicode = gc_ll_descr.get_funcptr_for_newunicode()
-                self.malloc_unicode_func_addr = rffi.cast(lltype.Signed,
-                                                          ll_new_unicode)
-            self.memcpy_addr = self.cpu.cast_ptr_to_int(codebuf.memcpy_fn)
-            self.mc = MachineCodeBlockWrapper(self, self.mc_size, self.cpu.profile_agent)
-            self._build_failure_recovery(False)
-            self._build_failure_recovery(True)
-            if self.cpu.supports_floats:
-                self._build_failure_recovery(False, withfloats=True)
-                self._build_failure_recovery(True, withfloats=True)
-                codebuf.ensure_sse2_floats()
-                self._build_float_constants()
-            if hasattr(gc_ll_descr, 'get_malloc_fixedsize_slowpath_addr'):
-                self._build_malloc_fixedsize_slowpath()
-            s = os.environ.get('PYPYLOG')
-            if s:
-                if s.find(':') != -1:
-                    s = s.split(':')[-1]
-                self.set_debug(True)
-            # Intialize here instead of __init__ to prevent
-            # pending_guard_tokens from being considered a prebuilt object,
-            # which sometimes causes memory leaks since the prebuilt list is
-            # still considered a GC root after we re-assign
-            # pending_guard_tokens in write_pending_failure_recoveries
-            self.pending_guard_tokens = []
+        assert self.memcpy_addr != 0, "setup_once() not called?"
+        self.pending_guard_tokens = []
+        self.mc = codebuf.MachineCodeBlockWrapper()
+
+    def teardown(self):
+        self.pending_guard_tokens = None
+        self.mc = None
+        self.looppos = -1
+        self.currently_compiling_loop = None
 
     def finish_once(self):
         if self._debug:
             debug_start('jit-backend-counts')
             for i in range(len(self.loop_run_counters)):
-                name, struct = self.loop_run_counters[i]
-                debug_print(str(name) + ':' + str(struct.i))
+                struct = self.loop_run_counters[i]
+                debug_print(str(i) + ':' + str(struct.i))
             debug_stop('jit-backend-counts')
 
     def _build_float_constants(self):
@@ -270,47 +167,51 @@
 
     def _build_malloc_fixedsize_slowpath(self):
         # ---------- first helper for the slow path of malloc ----------
-        self.malloc_fixedsize_slowpath1 = self.mc.tell()
+        mc = codebuf.MachineCodeBlockWrapper()
         if self.cpu.supports_floats:          # save the XMM registers in
             for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
-                self.mc.MOVSD_sx((WORD*2)+8*i, i)
-        self.mc.SUB_rr(edx.value, eax.value)       # compute the size we want
+                mc.MOVSD_sx((WORD*2)+8*i, i)
+        mc.SUB_rr(edx.value, eax.value)       # compute the size we want
         if IS_X86_32:
-            self.mc.MOV_sr(WORD, edx.value)        # save it as the new argument
+            mc.MOV_sr(WORD, edx.value)        # save it as the new argument
         elif IS_X86_64:
             # rdi can be clobbered: its content was forced to the stack
             # by _fastpath_malloc(), like all other save_around_call_regs.
-            self.mc.MOV_rr(edi.value, edx.value)
+            mc.MOV_rr(edi.value, edx.value)
 
         addr = self.cpu.gc_ll_descr.get_malloc_fixedsize_slowpath_addr()
-        self.mc.JMP(imm(addr))                    # tail call to the real malloc
+        mc.JMP(imm(addr))                    # tail call to the real malloc
+        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        self.malloc_fixedsize_slowpath1 = rawstart
         # ---------- second helper for the slow path of malloc ----------
-        self.malloc_fixedsize_slowpath2 = self.mc.tell()
+        mc = codebuf.MachineCodeBlockWrapper()
         if self.cpu.supports_floats:          # restore the XMM registers
             for i in range(self.cpu.NUM_REGS):# from where they were saved
-                self.mc.MOVSD_xs(i, (WORD*2)+8*i)
+                mc.MOVSD_xs(i, (WORD*2)+8*i)
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
-        self.mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
-        self.mc.RET()
-        self.mc.done()
+        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
+        mc.RET()
+        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        self.malloc_fixedsize_slowpath2 = rawstart
 
     def assemble_loop(self, inputargs, operations, looptoken, log):
-        """adds the following attributes to looptoken:
+        '''adds the following attributes to looptoken:
                _x86_loop_code       (an integer giving an address)
                _x86_bootstrap_code  (an integer giving an address)
-               _x86_direct_bootstrap_code
+               _x86_direct_bootstrap_code  ( "    "     "    "   )
                _x86_frame_depth
                _x86_param_depth
                _x86_arglocs
                _x86_debug_checksum
-        """
-        looptoken.compiled_loop_token = CompiledLoopToken(self.cpu,
-                                                          looptoken.number)
+        '''
+        clt = CompiledLoopToken(self.cpu, looptoken.number)
+        looptoken.compiled_loop_token = clt
         if not we_are_translated():
             # Arguments should be unique
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup()
+        self.currently_compiling_loop = looptoken
         funcname = self._find_debug_merge_point(operations)
         if log:
             self._register_counter()
@@ -320,43 +221,61 @@
         arglocs = regalloc.prepare_loop(inputargs, operations, looptoken)
         looptoken._x86_arglocs = arglocs
 
-        # profile support
-        name = "Loop # %s: %s" % (looptoken.number, funcname)
-        self.mc.start_function(name)
-        looptoken._x86_bootstrap_code = self.mc.tell()
-        adr_stackadjust = self._assemble_bootstrap_code(inputargs, arglocs)
-        curadr = self.mc.tell()
-        looptoken._x86_loop_code = curadr
+        bootstrappos = self.mc.get_relative_pos()
+        stackadjustpos = self._assemble_bootstrap_code(inputargs, arglocs)
+        self.looppos = self.mc.get_relative_pos()
         looptoken._x86_frame_depth = -1     # temporarily
         looptoken._x86_param_depth = -1     # temporarily        
         frame_depth, param_depth = self._assemble(regalloc, operations)
-        self._patch_stackadjust(adr_stackadjust, frame_depth+param_depth)
         looptoken._x86_frame_depth = frame_depth
         looptoken._x86_param_depth = param_depth
 
-        looptoken._x86_direct_bootstrap_code = self.mc.tell()
-        self._assemble_bootstrap_direct_call(arglocs, curadr,
+        directbootstrappos = self.mc.get_relative_pos()
+        self._assemble_bootstrap_direct_call(arglocs, self.looppos,
                                              frame_depth+param_depth)
-        #
-        debug_print("Loop #%d has address %x to %x" % (looptoken.number,
-                                                       looptoken._x86_loop_code,
-                                                       self.mc.tell()))
-        self.mc.end_function()
         self.write_pending_failure_recoveries()
-        
-    def assemble_bridge(self, faildescr, inputargs, operations, log):
+        fullsize = self.mc.get_relative_pos()
+        #
+        rawstart = self.materialize(looptoken)
+        debug_print("Loop #%d (%s) has address %x to %x" % (
+            looptoken.number, funcname,
+            rawstart + self.looppos,
+            rawstart + directbootstrappos))
+        self._patch_stackadjust(rawstart + stackadjustpos,
+                                frame_depth + param_depth)
+        self.patch_pending_failure_recoveries(rawstart)
+        #
+        looptoken._x86_bootstrap_code = rawstart + bootstrappos
+        looptoken._x86_loop_code = rawstart + self.looppos
+        looptoken._x86_direct_bootstrap_code = rawstart + directbootstrappos
+        self.teardown()
+        # oprofile support
+        if self.cpu.profile_agent is not None:
+            name = "Loop # %s: %s" % (looptoken.number, funcname)
+            self.cpu.profile_agent.native_code_written(name,
+                                                       rawstart, fullsize)
+
+    def assemble_bridge(self, faildescr, inputargs, operations,
+                        original_loop_token, log):
         if not we_are_translated():
             # Arguments should be unique
             assert len(set(inputargs)) == len(inputargs)
 
+        descr_number = self.cpu.get_fail_descr_number(faildescr)
+        try:
+            failure_recovery = self._find_failure_recovery_bytecode(faildescr)
+        except ValueError:
+            debug_print("Bridge out of guard", descr_number,
+                        "was already compiled!")
+            return
+
         self.setup()
         funcname = self._find_debug_merge_point(operations)
         if log:
             self._register_counter()
             operations = self._inject_debugging_code(faildescr, operations)
 
-        arglocs = self.rebuild_faillocs_from_descr(
-            faildescr._x86_failure_recovery_bytecode)
+        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
         if not we_are_translated():
             assert ([loc.assembler() for loc in arglocs] ==
                     [loc.assembler() for loc in faildescr._x86_debug_faillocs])
@@ -365,37 +284,58 @@
         regalloc.prepare_bridge(fail_depths, inputargs, arglocs,
                                 operations)
 
-        # oprofile support
-        descr_number = self.cpu.get_fail_descr_number(faildescr)
-        name = "Bridge # %s: %s" % (descr_number, funcname)
-        self.mc.start_function(name)
-
-        adr_bridge = self.mc.tell()
-        adr_stackadjust = self._patchable_stackadjust()
+        stackadjustpos = self._patchable_stackadjust()
         frame_depth, param_depth = self._assemble(regalloc, operations)
-        self._patch_stackadjust(adr_stackadjust, frame_depth+param_depth)
+        codeendpos = self.mc.get_relative_pos()
+        self.write_pending_failure_recoveries()
+        fullsize = self.mc.get_relative_pos()
+        #
+        rawstart = self.materialize(original_loop_token)
+
+        debug_print("Bridge out of guard %d (%s) has address %x to %x" %
+                    (descr_number, funcname, rawstart, rawstart + codeendpos))
+        self._patch_stackadjust(rawstart + stackadjustpos,
+                                frame_depth + param_depth)
+        self.patch_pending_failure_recoveries(rawstart)
         if not we_are_translated():
             # for the benefit of tests
             faildescr._x86_bridge_frame_depth = frame_depth
             faildescr._x86_bridge_param_depth = param_depth
         # patch the jump from original guard
-        self.patch_jump_for_descr(faildescr, adr_bridge)
-        debug_print("Bridge out of guard %d has address %x to %x" %
-                    (descr_number, adr_bridge, self.mc.tell()))
-        self.mc.end_function()
-        self.write_pending_failure_recoveries()
+        self.patch_jump_for_descr(faildescr, rawstart)
+        self.teardown()
+        # oprofile support
+        if self.cpu.profile_agent is not None:
+            name = "Bridge # %s: %s" % (descr_number, funcname)
+            self.cpu.profile_agent.native_code_written(name,
+                                                       rawstart, fullsize)
 
     def write_pending_failure_recoveries(self):
+        # for each pending guard, generate the code of the recovery stub
+        # at the end of self.mc.
         for tok in self.pending_guard_tokens:
-            # Okay to write to _mc because we've already made sure that
-            # there's enough space by "reserving" bytes.
-            addr = self.generate_quick_failure(self.mc._mc, tok.faildescr, tok.failargs, tok.fail_locs, tok.exc, tok.desc_bytes)
-            tok.faildescr._x86_adr_recovery_stub = addr
-            self.patch_jump_for_descr(tok.faildescr, addr)
+            tok.pos_recovery_stub = self.generate_quick_failure(tok)
 
-        self.pending_guard_tokens = []
-        self.mc.reset_reserved_bytes()
-        self.mc.done()
+    def patch_pending_failure_recoveries(self, rawstart):
+        # after we wrote the assembler to raw memory, set up
+        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
+        # the 4-byte target field in the JMP/Jcond instruction, and patch
+        # the field in question to point (initially) to the recovery stub
+        for tok in self.pending_guard_tokens:
+            addr = rawstart + tok.pos_jump_offset
+            tok.faildescr._x86_adr_jump_offset = addr
+            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
+            assert rx86.fits_in_32bits(relative_target)
+            p = rffi.cast(rffi.INTP, addr)
+            p[0] = rffi.cast(rffi.INT, relative_target)
+
+    def materialize(self, looptoken):
+        clt = looptoken.compiled_loop_token
+        if clt.asmmemmgr_blocks is None:
+            clt.asmmemmgr_blocks = []
+        return self.mc.materialize(self.cpu.asmmemmgr,
+                                   clt.asmmemmgr_blocks,
+                                   self.cpu.gc_ll_descr.gcrootmap)
 
     def _find_debug_merge_point(self, operations):
 
@@ -410,30 +350,50 @@
 
     def _register_counter(self):
         if self._debug:
-            # YYY leak -- just put it in self.mc instead
+            # YYY very minor leak -- we need the counters to stay alive
+            # forever, just because we want to report them at the end
+            # of the process
             struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                                    track_allocation=False)
             struct.i = 0
-            self.loop_run_counters.append((len(self.loop_run_counters), struct))
-        
+            self.loop_run_counters.append(struct)
+
+    def _find_failure_recovery_bytecode(self, faildescr):
+        adr_jump_offset = faildescr._x86_adr_jump_offset
+        if adr_jump_offset == 0:
+            raise ValueError
+        # follow the JMP/Jcond
+        p = rffi.cast(rffi.INTP, adr_jump_offset)
+        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
+        # skip the CALL
+        if WORD == 4:
+            adr_target += 5     # CALL imm
+        else:
+            adr_target += 13    # MOV r11, imm; CALL *r11
+        return adr_target
+
     def patch_jump_for_descr(self, faildescr, adr_new_target):
         adr_jump_offset = faildescr._x86_adr_jump_offset
-        adr_recovery_stub = faildescr._x86_adr_recovery_stub
+        assert adr_jump_offset != 0
         offset = adr_new_target - (adr_jump_offset + 4)
         # If the new target fits within a rel32 of the jump, just patch
         # that. Otherwise, leave the original rel32 to the recovery stub in
         # place, but clobber the recovery stub with a jump to the real
         # target.
+        mc = codebuf.MachineCodeBlockWrapper()
         if rx86.fits_in_32bits(offset):
-            mc = codebuf.InMemoryCodeBuilder(adr_jump_offset, adr_jump_offset + 4)
             mc.writeimm32(offset)
+            mc.copy_to_raw_memory(adr_jump_offset)
         else:
-            # "mov r11, addr; jmp r11" is 13 bytes
-            mc = codebuf.InMemoryCodeBuilder(adr_recovery_stub, adr_recovery_stub + 13)
+            # "mov r11, addr; jmp r11" is 13 bytes, which fits in there
+            # because we always write "mov r11, addr; call *r11" in the
+            # first place.
             mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
             mc.JMP_r(X86_64_SCRATCH_REG.value)
-
-        mc.done()
+            p = rffi.cast(rffi.INTP, adr_jump_offset)
+            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
+            mc.copy_to_raw_memory(adr_target)
+        faildescr._x86_adr_jump_offset = 0    # means "patched"
 
     @specialize.argtype(1)
     def _inject_debugging_code(self, looptoken, operations):
@@ -444,7 +404,7 @@
                 s += op.getopnum()
             looptoken._x86_debug_checksum = s
             c_adr = ConstInt(rffi.cast(lltype.Signed,
-                                     self.loop_run_counters[-1][1]))
+                                       self.loop_run_counters[-1]))
             box = BoxInt()
             box2 = BoxInt()
             ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
@@ -453,19 +413,11 @@
                    ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                                 None, descr=self.debug_counter_descr)]
             operations = ops + operations
-            # # we need one register free (a bit of a hack, but whatever)
-            # self.mc.PUSH(eax)
-            # adr = rffi.cast(lltype.Signed, self.loop_run_counters[-1][1])
-            # self.mc.MOV(eax, heap(adr))
-            # self.mc.ADD(eax, imm1)
-            # self.mc.MOV(heap(adr), eax)
-            # self.mc.POP(eax)
         return operations
 
     def _assemble(self, regalloc, operations):
         self._regalloc = regalloc
         regalloc.walk_operations(operations)        
-        self.mc.done()
         if we_are_translated() or self.cpu.dont_keepalive_stuff:
             self._regalloc = None   # else keep it around for debugging
         frame_depth = regalloc.fm.frame_depth
@@ -481,30 +433,31 @@
     def _patchable_stackadjust(self):
         # stack adjustment LEA
         self.mc.LEA32_rb(esp.value, 0)
-        return self.mc.tell() - 4
+        return self.mc.get_relative_pos() - 4
 
-    def _patch_stackadjust(self, adr_lea, reserved_depth):
+    def _patch_stackadjust(self, adr_lea, allocated_depth):
         # patch stack adjustment LEA
-        mc = codebuf.InMemoryCodeBuilder(adr_lea, adr_lea + 4)
-        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words].
+        mc = codebuf.MachineCodeBlockWrapper()
+        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
+        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
+        mc.copy_to_raw_memory(adr_lea)
+
+    def _get_offset_of_ebp_from_esp(self, allocated_depth):
         # Given that [EBP] is where we saved EBP, i.e. in the last word
         # of our fixed frame, then the 'words' value is:
-        words = (self.cpu.FRAME_FIXED_SIZE - 1) + reserved_depth
-        # align, e.g. for Mac OS X        
+        words = (self.cpu.FRAME_FIXED_SIZE - 1) + allocated_depth
+        # align, e.g. for Mac OS X
         aligned_words = align_stack_words(words+2)-2 # 2 = EIP+EBP
-        mc.writeimm32(-WORD * aligned_words)
-        mc.done()
+        return -WORD * aligned_words
 
     def _call_header(self):
+        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
+        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
         self.mc.PUSH_r(ebp.value)
         self.mc.MOV_rr(ebp.value, esp.value)
         for regloc in self.cpu.CALLEE_SAVE_REGISTERS:
             self.mc.PUSH_r(regloc.value)
 
-        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
-        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
-        return self._patchable_stackadjust()
-
     def _call_footer(self):
         self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
 
@@ -514,17 +467,16 @@
         self.mc.POP_r(ebp.value)
         self.mc.RET()
 
-    def _assemble_bootstrap_direct_call(self, arglocs, jmpadr, stackdepth):
+    def _assemble_bootstrap_direct_call(self, arglocs, jmppos, stackdepth):
         if IS_X86_64:
-            return self._assemble_bootstrap_direct_call_64(arglocs, jmpadr, stackdepth)
+            return self._assemble_bootstrap_direct_call_64(arglocs, jmppos, stackdepth)
         # XXX pushing ebx esi and edi is a bit pointless, since we store
         #     all registers anyway, for the case of guard_not_forced
         # XXX this can be improved greatly. Right now it'll behave like
         #     a normal call
         nonfloatlocs, floatlocs = arglocs
-        # XXX not to repeat the logic, a bit around
-        adr_stackadjust = self._call_header()
-        self._patch_stackadjust(adr_stackadjust, stackdepth)
+        self._call_header()
+        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
         for i in range(len(nonfloatlocs)):
             loc = nonfloatlocs[i]
             if isinstance(loc, RegLoc):
@@ -546,9 +498,11 @@
                 self.mc.MOVSD_xb(xmmtmp.value, (1 + i) * 2 * WORD)
                 assert isinstance(loc, StackLoc)
                 self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-        self.mc.JMP_l(jmpadr)
+        endpos = self.mc.get_relative_pos() + 5
+        self.mc.JMP_l(jmppos - endpos)
+        assert endpos == self.mc.get_relative_pos()
 
-    def _assemble_bootstrap_direct_call_64(self, arglocs, jmpadr, stackdepth):
+    def _assemble_bootstrap_direct_call_64(self, arglocs, jmppos, stackdepth):
         # XXX: Very similar to _emit_call_64
 
         src_locs = []
@@ -562,8 +516,8 @@
         unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
 
         nonfloatlocs, floatlocs = arglocs
-        adr_stackadjust = self._call_header()
-        self._patch_stackadjust(adr_stackadjust, stackdepth)
+        self._call_header()
+        self.mc.LEA_rb(esp.value, self._get_offset_of_ebp_from_esp(stackdepth))
 
         # The lists are padded with Nones
         assert len(nonfloatlocs) == len(floatlocs)
@@ -599,8 +553,9 @@
                 # clobber the scratch register
                 self.mc.MOV(loc, X86_64_SCRATCH_REG)
 
-        finaljmp = self.mc.tell()
-        self.mc.JMP(imm(jmpadr))
+        endpos = self.mc.get_relative_pos() + 5
+        self.mc.JMP_l(jmppos - endpos)
+        assert endpos == self.mc.get_relative_pos()
 
     def redirect_call_assembler(self, oldlooptoken, newlooptoken):
         # some minimal sanity checking
@@ -613,16 +568,17 @@
         # Ideally we should rather patch all existing CALLs, but well.
         oldadr = oldlooptoken._x86_direct_bootstrap_code
         target = newlooptoken._x86_direct_bootstrap_code
-        mc = codebuf.InMemoryCodeBuilder(oldadr, oldadr + 16)
+        mc = codebuf.MachineCodeBlockWrapper()
         mc.JMP(imm(target))
-        mc.done()
+        mc.copy_to_raw_memory(oldadr)
 
     def _assemble_bootstrap_code(self, inputargs, arglocs):
         nonfloatlocs, floatlocs = arglocs
-        adr_stackadjust = self._call_header()
+        self._call_header()
+        stackadjustpos = self._patchable_stackadjust()
         tmp = X86RegisterManager.all_regs[0]
         xmmtmp = X86XMMRegisterManager.all_regs[0]
-        self.mc._mc.begin_reuse_scratch_register()
+        self.mc.begin_reuse_scratch_register()
         for i in range(len(nonfloatlocs)):
             loc = nonfloatlocs[i]
             if loc is None:
@@ -654,8 +610,8 @@
                 self.mc.MOVSD(xmmtmp, heap(adr))
                 assert isinstance(loc, StackLoc)
                 self.mc.MOVSD_bx(loc.value, xmmtmp.value)
-        self.mc._mc.end_reuse_scratch_register()
-        return adr_stackadjust
+        self.mc.end_reuse_scratch_register()
+        return stackadjustpos
 
     def dump(self, text):
         if not self.verbose:
@@ -663,7 +619,8 @@
         _prev = Box._extended_display
         try:
             Box._extended_display = False
-            print >> sys.stderr, ' 0x%x  %s' % (fixid(self.mc.tell()), text)
+            pos = self.mc.get_relative_pos()
+            print >> sys.stderr, ' 0x%x  %s' % (pos, text)
         finally:
             Box._extended_display = _prev
 
@@ -723,7 +680,7 @@
                                          arglocs, resloc)
         if not we_are_translated():
             # must be added by the genop_guard_list[]()
-            assert hasattr(faildescr, '_x86_adr_jump_offset')
+            assert guard_token is self.pending_guard_tokens[-1]
 
     def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc,
                                current_depths):
@@ -796,9 +753,6 @@
                                   result_loc):
             guard_opnum = guard_op.getopnum()
             self.mc.UCOMISD(arglocs[0], arglocs[1])
-            # 16 is enough space for the rel8 jumps below and the rel32
-            # jump in implement_guard
-            self.mc.ensure_bytes_available(16 + guard_token.recovery_stub_size())
             if guard_opnum == rop.GUARD_FALSE:
                 if need_jp:
                     self.mc.J_il8(rx86.Conditions['P'], 6)
@@ -966,9 +920,6 @@
     def genop_guard_float_ne(self, op, guard_op, guard_token, arglocs, result_loc):
         guard_opnum = guard_op.getopnum()
         self.mc.UCOMISD(arglocs[0], arglocs[1])
-        # 16 is enough space for the rel8 jumps below and the rel32
-        # jump in implement_guard
-        self.mc.ensure_bytes_available(16 + guard_token.recovery_stub_size())
         if guard_opnum == rop.GUARD_TRUE:
             self.mc.J_il8(rx86.Conditions['P'], 6)
             self.implement_guard(guard_token, 'E')
@@ -1292,13 +1243,11 @@
                 self.mc.CMP32_mi((locs[0].value, 0), expected_typeid)
 
     def genop_guard_guard_class(self, ign_1, guard_op, guard_token, locs, ign_2):
-        self.mc.ensure_bytes_available(256)
         self._cmp_guard_class(locs)
         self.implement_guard(guard_token, 'NE')
 
     def genop_guard_guard_nonnull_class(self, ign_1, guard_op,
                                         guard_token, locs, ign_2):
-        self.mc.ensure_bytes_available(256)
         self.mc.CMP(locs[0], imm1)
         # Patched below
         self.mc.J_il8(rx86.Conditions['B'], 0)
@@ -1307,7 +1256,7 @@
         # patch the JB above
         offset = self.mc.get_relative_pos() - jb_location
         assert 0 < offset <= 127
-        self.mc.overwrite(jb_location-1, [chr(offset)])
+        self.mc.overwrite(jb_location-1, chr(offset))
         #
         self.implement_guard(guard_token, 'NE')
 
@@ -1316,42 +1265,33 @@
         exc = (guard_opnum == rop.GUARD_EXCEPTION or
                guard_opnum == rop.GUARD_NO_EXCEPTION or
                guard_opnum == rop.GUARD_NOT_FORCED)
-        desc_bytes = self.failure_recovery_description(failargs, fail_locs)
-        return GuardToken(faildescr, failargs, fail_locs, exc, desc_bytes)
+        return GuardToken(faildescr, failargs, fail_locs, exc)
 
-    def generate_quick_failure(self, mc, faildescr, failargs, fail_locs, exc, desc_bytes):
+    def generate_quick_failure(self, guardtok):
         """Generate the initial code for handling a failure.  We try to
-        keep it as compact as possible.  The idea is that this code is
-        executed at most once (and very often, zero times); when
-        executed, it generates a more complete piece of code which can
-        really handle recovery from this particular failure.
+        keep it as compact as possible.
         """
-        fail_index = self.cpu.get_fail_descr_number(faildescr)
-        addr = mc.tell()
+        fail_index = self.cpu.get_fail_descr_number(guardtok.faildescr)
+        mc = self.mc
+        startpos = mc.get_relative_pos()
         withfloats = False
-        for box in failargs:
+        for box in guardtok.failargs:
             if box is not None and box.type == FLOAT:
                 withfloats = True
                 break
+        exc = guardtok.exc
         mc.CALL(imm(self.failure_recovery_code[exc + 2 * withfloats]))
         # write tight data that describes the failure recovery
-        faildescr._x86_failure_recovery_bytecode = mc.tell()
-        for byte in desc_bytes:
-            mc.writechr(ord(byte))
+        self.write_failure_recovery_description(mc, guardtok.failargs,
+                                                guardtok.fail_locs)
         # write the fail_index too
         mc.writeimm32(fail_index)
         # for testing the decoding, write a final byte 0xCC
         if not we_are_translated():
-            mc.writechr(0xCC)
-            faildescr._x86_debug_faillocs = [loc for loc in fail_locs
-                                                 if loc is not None]
-
-        # Make sure the recovery stub is at least 16 bytes long (for the
-        # case where we overwrite the recovery stub with a 64-bit absolute
-        # jump)
-        while mc.tell() - addr < 16:
-            mc.writechr(0x00)
-        return addr
+            mc.writechar('\xCC')
+            faillocs = [loc for loc in guardtok.fail_locs if loc is not None]
+            guardtok.faildescr._x86_debug_faillocs = faillocs
+        return startpos
 
     DESCR_REF       = 0x00
     DESCR_INT       = 0x01
@@ -1362,8 +1302,7 @@
     CODE_STOP       = 0 | DESCR_SPECIAL
     CODE_HOLE       = 4 | DESCR_SPECIAL
 
-    def failure_recovery_description(self, failargs, locs):
-        desc_bytes = []
+    def write_failure_recovery_description(self, mc, failargs, locs):
         for i in range(len(failargs)):
             arg = failargs[i]
             if arg is not None:
@@ -1383,19 +1322,14 @@
                     n = loc.value
                 n = kind + 4*n
                 while n > 0x7F:
-                    desc_bytes.append(chr((n & 0x7F) | 0x80))
+                    mc.writechar(chr((n & 0x7F) | 0x80))
                     n >>= 7
             else:
                 n = self.CODE_HOLE
-            desc_bytes.append(chr(n))
-        desc_bytes.append(chr(self.CODE_STOP))
+            mc.writechar(chr(n))
+        mc.writechar(chr(self.CODE_STOP))
         # assert that the fail_boxes lists are big enough
         assert len(failargs) <= self.fail_boxes_int.SIZE
-        return desc_bytes
-
-    def write_failure_recovery_description(self, mc, failargs, locs):
-        for byte in self.failure_recovery_description(failargs, locs):
-            mc.writechr(ord(byte))
 
     def rebuild_faillocs_from_descr(self, bytecode):
         from pypy.jit.backend.x86.regalloc import X86FrameManager
@@ -1533,10 +1467,8 @@
                                          self.failure_recovery_func)
         failure_recovery_func = rffi.cast(lltype.Signed,
                                           failure_recovery_func)
-        mc = self.mc._mc
-        # Assume that we are called at the beginning, when there is no risk
-        # that 'mc' runs out of space.  Checked by asserts in mc.write().
-        recovery_addr = mc.tell()
+        mc = codebuf.MachineCodeBlockWrapper()
+        self.mc = mc
 
         # Push all general purpose registers
         for gpr in range(self.cpu.NUM_REGS-1, -1, -1):
@@ -1587,11 +1519,12 @@
         # above.
 
         self._call_footer()
-        self.mc.done()
-        self.failure_recovery_code[exc + 2 * withfloats] = recovery_addr
+        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        self.failure_recovery_code[exc + 2 * withfloats] = rawstart
+        self.mc = None
 
     def generate_failure(self, fail_index, locs, exc, locs_are_ref):
-        self.mc._mc.begin_reuse_scratch_register()
+        self.mc.begin_reuse_scratch_register()
         for i in range(len(locs)):
             loc = locs[i]
             if isinstance(loc, RegLoc):
@@ -1618,7 +1551,7 @@
                         adr = self.fail_boxes_int.get_addr_for_num(i)
                     self.mc.MOV(eax, loc)
                     self.mc.MOV(heap(adr), eax)
-        self.mc._mc.end_reuse_scratch_register()
+        self.mc.end_reuse_scratch_register()
 
         # we call a provided function that will
         # - call our on_leave_jitted_hook which will mark
@@ -1634,17 +1567,13 @@
         self._call_footer()
 
     def implement_guard(self, guard_token, condition=None):
-        self.mc.reserve_bytes(guard_token.recovery_stub_size())
-        self.pending_guard_tokens.append(guard_token)
-        # These jumps are patched later, the mc.tell() are just
-        # dummy values.  Also, use self.mc._mc to avoid triggering a
-        # "buffer full" exactly here.
-        mc = self.mc._mc
+        # These jumps are patched later.
         if condition:
-            mc.J_il(rx86.Conditions[condition], mc.tell())
+            self.mc.J_il(rx86.Conditions[condition], 0)
         else:
-            mc.JMP_l(mc.tell())
-        guard_token.faildescr._x86_adr_jump_offset = mc.tell() - 4
+            self.mc.JMP_l(0)
+        guard_token.pos_jump_offset = self.mc.get_relative_pos() - 4
+        self.pending_guard_tokens.append(guard_token)
 
     def genop_call(self, op, arglocs, resloc):
         sizeloc = arglocs[0]
@@ -1699,7 +1628,6 @@
         # Write a call to the direct_bootstrap_code of the target assembler
         self._emit_call(imm(descr._x86_direct_bootstrap_code), arglocs, 2,
                         tmp=eax)
-        self.mc.ensure_bytes_available(256)
         if op.result is None:
             assert result_loc is None
             value = self.cpu.done_with_this_frame_void_v
@@ -1735,7 +1663,7 @@
         # Path B: fast path.  Must load the return value, and reset the token
         offset = jmp_location - je_location
         assert 0 < offset <= 127
-        self.mc.overwrite(je_location - 1, [chr(offset)])
+        self.mc.overwrite(je_location - 1, chr(offset))
         #
         # Reset the vable token --- XXX really too much special logic here:-(
         if jd.index_of_virtualizable >= 0:
@@ -1770,7 +1698,7 @@
         # Here we join Path A and Path B again
         offset = self.mc.get_relative_pos() - jmp_location
         assert 0 <= offset <= 127
-        self.mc.overwrite(jmp_location - 1, [chr(offset)])
+        self.mc.overwrite(jmp_location - 1, chr(offset))
         self.mc.CMP_bi(FORCE_INDEX_OFS, 0)
         self.implement_guard(guard_token, 'L')
 
@@ -1785,9 +1713,6 @@
             cls = self.cpu.gc_ll_descr.has_write_barrier_class()
             assert cls is not None and isinstance(descr, cls)
         loc_base = arglocs[0]
-        # ensure that enough bytes are available to write the whole
-        # following piece of code atomically (for the JZ)
-        self.mc.ensure_bytes_available(256)
         self.mc.TEST8_mi((loc_base.value, descr.jit_wb_if_flag_byteofs),
                 descr.jit_wb_if_flag_singlebyte)
         self.mc.J_il8(rx86.Conditions['Z'], 0) # patched later
@@ -1830,7 +1755,7 @@
         # patch the JZ above
         offset = self.mc.get_relative_pos() - jz_location
         assert 0 < offset <= 127
-        self.mc.overwrite(jz_location-1, [chr(offset)])
+        self.mc.overwrite(jz_location-1, chr(offset))
 
     def genop_force_token(self, op, arglocs, resloc):
         # RegAlloc.consider_force_token ensures this:
@@ -1853,18 +1778,21 @@
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         if gcrootmap:
             mark = self._regalloc.get_mark_gc_roots(gcrootmap)
-            gcrootmap.put(rffi.cast(llmemory.Address, self.mc.tell()), mark)
+            self.mc.insert_gcroot_marker(mark)
 
     def target_arglocs(self, loop_token):
         return loop_token._x86_arglocs
 
     def closing_jump(self, loop_token):
-        self.mc.JMP(imm(loop_token._x86_loop_code))
+        if loop_token is self.currently_compiling_loop:
+            curpos = self.mc.get_relative_pos() + 5
+            self.mc.JMP_l(self.looppos - curpos)
+        else:
+            self.mc.JMP(imm(loop_token._x86_loop_code))
 
     def malloc_cond_fixedsize(self, nursery_free_adr, nursery_top_adr,
                               size, tid):
         size = max(size, self.cpu.gc_ll_descr.minimal_size_in_nursery)
-        self.mc.ensure_bytes_available(256)
         self.mc.MOV(eax, heap(nursery_free_adr))
         self.mc.LEA_rm(edx.value, (eax.value, size))
         self.mc.CMP(edx, heap(nursery_top_adr))
@@ -1892,7 +1820,7 @@
 
         offset = self.mc.get_relative_pos() - jmp_adr
         assert 0 < offset <= 127
-        self.mc.overwrite(jmp_adr-1, [chr(offset)])
+        self.mc.overwrite(jmp_adr-1, chr(offset))
         # on 64-bits, 'tid' is a value that fits in 31 bits
         self.mc.MOV_mi((eax.value, 0), tid)
         self.mc.MOV(heap(nursery_free_adr), edx)
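
The pattern above recurs throughout assembler.py: every position is
recorded with mc.get_relative_pos() while the code lives in the
resizable temporary buffer, and absolute addresses only exist after
materialize() has copied the buffer into executable memory.  A minimal
runnable sketch of that two-phase flow (hypothetical names; the real
builder is codebuf.MachineCodeBlockWrapper):

    class BufferedCodeWriter(object):
        # Sketch only: a plain list stands in for the growable buffer,
        # and a preallocated list stands in for raw executable memory.
        def __init__(self):
            self.content = []
        def writechar(self, char):
            self.content.append(char)        # never overflows
        def get_relative_pos(self):
            return len(self.content)         # offset from buffer start
        def materialize(self, raw_memory, rawstart):
            # copy the finished code to its final, absolute location
            for i in range(len(self.content)):
                raw_memory[rawstart + i] = self.content[i]
            return rawstart

    mc = BufferedCodeWriter()
    looppos = mc.get_relative_pos()          # relative, like self.looppos
    for c in "\x90\x90\xC3":                 # NOP; NOP; RET
        mc.writechar(c)
    raw = [None] * 64                        # stand-in for mmaped memory
    rawstart = mc.materialize(raw, 16)
    loop_code = rawstart + looppos           # absolute, as in the patch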

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/regalloc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/regalloc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/regalloc.py	Sun Nov 28 11:49:48 2010
@@ -1108,7 +1108,7 @@
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
                 assert reg in self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX
                 gcrootmap.add_callee_save_reg(shape, self.rm.REGLOC_TO_GCROOTMAP_REG_INDEX[reg])
-        return gcrootmap.compress_callshape(shape)
+        return shape
 
     def consider_force_token(self, op):
         loc = self.rm.force_allocate_reg(op.result)

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/regloc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/regloc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/regloc.py	Sun Nov 28 11:49:48 2010
@@ -334,9 +334,9 @@
                 if code == possible_code:
                     val = getattr(loc, "value_" + possible_code)()
                     if possible_code == 'i':
-                        offset = intmask(val - (self.tell() + 5))
-                        if rx86.fits_in_32bits(offset):
+                        if self.WORD == 4:
                             _rx86_getattr(self, name + "_l")(val)
+                            self.add_pending_relocation()
                         else:
                             assert self.WORD == 8
                             self._load_scratch(val)
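
On 32-bit, the absolute immediate is now written as-is and its position
recorded via add_pending_relocation(); the 4-byte field can only become
a real rel32 once copy_to_raw_memory() knows where the code ends up
(test_relocation below exercises exactly this).  A runnable sketch of
such a fixup pass, with a hypothetical resolve step over a bytearray:

    import struct

    def resolve_relocations(buf, relocations, rawstart):
        # 'relocations' lists positions just past each 4-byte field
        # still holding an absolute target; rewrite each field as a
        # rel32 measured from the end of its instruction.
        for pos in relocations:
            absolute = struct.unpack('<i', bytes(buf[pos - 4:pos]))[0]
            buf[pos - 4:pos] = struct.pack('<i', absolute - (rawstart + pos))

    buf = bytearray('\xE8' + struct.pack('<i', 0x01020304))   # CALL abs
    resolve_relocations(buf, [5], 0x1000)
    assert buf[1:] == bytearray(struct.pack('<i', 0x01020304 - 0x1005))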

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/runner.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/runner.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/runner.py	Sun Nov 28 11:49:48 2010
@@ -44,6 +44,7 @@
 
     def setup_once(self):
         self.profile_agent.startup()
+        self.assembler.setup_once()
 
     def finish_once(self):
         self.assembler.finish_once()
@@ -58,7 +59,7 @@
         clt = original_loop_token.compiled_loop_token
         clt.compiling_a_bridge()
         self.assembler.assemble_bridge(faildescr, inputargs, operations,
-                                       log=log)
+                                       original_loop_token, log=log)
 
     def set_future_value_int(self, index, intvalue):
         self.assembler.fail_boxes_int.setitem(index, intvalue)
@@ -127,8 +128,8 @@
         assert fail_index >= 0, "already forced!"
         faildescr = self.get_fail_descr_from_number(fail_index)
         rffi.cast(TP, addr_of_force_index)[0] = -1
-        bytecode = rffi.cast(rffi.UCHARP,
-                             faildescr._x86_failure_recovery_bytecode)
+        frb = self.assembler._find_failure_recovery_bytecode(faildescr)
+        bytecode = rffi.cast(rffi.UCHARP, frb)
         # start of "no gc operation!" block
         fail_index_2 = self.assembler.grab_frame_values(
             bytecode,
@@ -168,14 +169,12 @@
 CPU = CPU386
 
 # silence warnings
-history.LoopToken._x86_param_depth = 0
-history.LoopToken._x86_arglocs = (None, None)
-history.LoopToken._x86_frame_depth = 0
-history.LoopToken._x86_bootstrap_code = 0
-history.LoopToken._x86_direct_bootstrap_code = 0
-history.LoopToken._x86_loop_code = 0
-history.LoopToken._x86_debug_checksum = 0
-compile.AbstractFailDescr._x86_current_depths = (0, 0)
-compile.AbstractFailDescr._x86_failure_recovery_bytecode = 0
-compile.AbstractFailDescr._x86_adr_jump_offset = 0
-compile.AbstractFailDescr._x86_adr_recovery_stub = 0
+##history.LoopToken._x86_param_depth = 0
+##history.LoopToken._x86_arglocs = (None, None)
+##history.LoopToken._x86_frame_depth = 0
+##history.LoopToken._x86_bootstrap_code = 0
+##history.LoopToken._x86_direct_bootstrap_code = 0
+##history.LoopToken._x86_loop_code = 0
+##history.LoopToken._x86_debug_checksum = 0
+##compile.AbstractFailDescr._x86_current_depths = (0, 0)
+##compile.AbstractFailDescr._x86_adr_jump_offset = 0

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/rx86.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/rx86.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/rx86.py	Sun Nov 28 11:49:48 2010
@@ -137,10 +137,9 @@
 # ____________________________________________________________
 # Emit an immediate displacement (relative to the cur insn)
 
-def encode_relative(mc, target, _, orbyte):
+def encode_relative(mc, relative_target, _, orbyte):
     assert orbyte == 0
-    offset = intmask(target - (mc.tell() + 4))
-    mc.writeimm32(offset)
+    mc.writeimm32(relative_target)
     return 0
 
 def relative(argnum):
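
With encode_relative() no longer calling mc.tell(), the caller computes
the rel32 itself from two positions in the same temporary buffer, as in
_assemble_bootstrap_direct_call() above.  A runnable model of that
arithmetic (FakeMC here is hypothetical, standing in for the builder):

    import struct

    class FakeMC(object):
        def __init__(self):
            self.buf = ''
        def get_relative_pos(self):
            return len(self.buf)
        def JMP_l(self, relative_target):    # '\xE9' + rel32
            self.buf += '\xE9' + struct.pack('<i', relative_target)

    mc = FakeMC()
    jmppos = mc.get_relative_pos()       # say the loop body starts here
    endpos = mc.get_relative_pos() + 5   # position just after the JMP
    mc.JMP_l(jmppos - endpos)            # jump back over itself: rel32 = -5
    assert mc.buf[1:] == struct.pack('<i', -5)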

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/support.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/support.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/support.py	Sun Nov 28 11:49:48 2010
@@ -1,4 +1,7 @@
+import sys
 from pypy.rpython.lltypesystem import lltype, rffi, llmemory
+from pypy.translator.tool.cbuild import ExternalCompilationInfo
+
 
 def values_array(TP, size):
     ATP = lltype.GcArray(TP)
@@ -23,3 +26,22 @@
             return True
 
     return ValuesArray()
+
+# ____________________________________________________________
+
+memcpy_fn = rffi.llexternal('memcpy', [llmemory.Address, llmemory.Address,
+                                       rffi.SIZE_T], lltype.Void,
+                            sandboxsafe=True, _nowrapper=True)
+
+# ____________________________________________________________
+
+if sys.platform == 'win32':
+    ensure_sse2_floats = lambda : None
+else:
+    _sse2_eci = ExternalCompilationInfo(
+        compile_extra = ['-msse2', '-mfpmath=sse'],
+        separate_module_sources = ['void PYPY_NO_OP(void) {}'],
+        )
+    ensure_sse2_floats = rffi.llexternal('PYPY_NO_OP', [], lltype.Void,
+                                         compilation_info=_sse2_eci,
+                                         sandboxsafe=True)
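
memcpy_fn is presumably what the new copy_to_raw_memory() step uses to
move the finished code out of the temporary buffer (it is declared
_nowrapper and sandboxsafe, i.e. callable with no wrapper at all).  A
sketch of such a copy under that assumption:

    from pypy.rpython.lltypesystem import rffi

    def copy_block(src_addr, dst_addr, size):
        # both addresses are llmemory.Address values; 'size' is a byte
        # count -- hypothetical helper, not part of this patch
        memcpy_fn(dst_addr, src_addr, rffi.cast(rffi.SIZE_T, size))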

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_assembler.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_assembler.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_assembler.py	Sun Nov 28 11:49:48 2010
@@ -1,5 +1,5 @@
 from pypy.jit.backend.x86.regloc import *
-from pypy.jit.backend.x86.assembler import Assembler386, MachineCodeBlockWrapper
+from pypy.jit.backend.x86.assembler import Assembler386
 from pypy.jit.backend.x86.regalloc import X86FrameManager, get_ebp_ofs
 from pypy.jit.metainterp.history import BoxInt, BoxPtr, BoxFloat, INT, REF, FLOAT
 from pypy.rlib.rarithmetic import intmask
@@ -19,28 +19,10 @@
         return 42
 
 class FakeMC:
-    def __init__(self, base_address=0):
+    def __init__(self):
         self.content = []
-        self._size = 100
-        self.base_address = base_address
-    def writechr(self, n):
-        self.content.append(n)
-    def tell(self):
-        return self.base_address + len(self.content)
-    def get_relative_pos(self):
-        return len(self.content)
-    def JMP(self, *args):
-        self.content.append(("JMP", args))
-    def done(self):
-        pass
-    def PUSH_r(self, reg):
-        pass
-    def POP_r(self, reg):
-        pass
-
-class FakeAssembler:
-    def write_pending_failure_recoveries(self):
-        pass
+    def writechar(self, char):
+        self.content.append(ord(char))
 
 def test_write_failure_recovery_description():
     assembler = Assembler386(FakeCPU())
@@ -255,41 +237,3 @@
         assert assembler.fail_boxes_int.getitem(i) == expected_ints[i]
         assert assembler.fail_boxes_ptr.getitem(i) == expected_ptrs[i]
         assert assembler.fail_boxes_float.getitem(i) == expected_floats[i]
-
-class FakeProfileAgent(object):
-    def __init__(self):
-        self.functions = []
-
-    def native_code_written(self, name, address, size):
-        self.functions.append((name, address, size))
-
-class FakeMCWrapper(MachineCodeBlockWrapper):
-    count = 0
-    def _instantiate_mc(self):
-        self.count += 1
-        return FakeMC(200 * (self.count - 1))
-
-def test_mc_wrapper_profile_agent():
-    agent = FakeProfileAgent()
-    assembler = FakeAssembler()
-    mc = FakeMCWrapper(assembler, 100, agent)
-    mc.start_function("abc")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.end_function()
-    assert agent.functions == [("abc", 0, 4)]
-    mc.writechr("x")
-    mc.start_function("cde")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.writechr("x")
-    mc.end_function()
-    assert agent.functions == [("abc", 0, 4), ("cde", 5, 4)]
-    mc.start_function("xyz")
-    for i in range(50):
-        mc.writechr("x")
-    mc.end_function()
-    assert agent.functions == [("abc", 0, 4), ("cde", 5, 4), ("xyz", 9, 29), ("xyz", 200, 22)]

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_gc_integration.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_gc_integration.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_gc_integration.py	Sun Nov 28 11:49:48 2010
@@ -33,9 +33,6 @@
     def add_callee_save_reg(self, shape, reg_index):
         index_to_name = { 1: 'ebx', 2: 'esi', 3: 'edi' }
         shape.append(index_to_name[reg_index])
-    def compress_callshape(self, shape):
-        assert shape[0] == 'shape'
-        return ['compressed'] + shape[1:]
 
 class MockGcDescr(GcCache):
     def get_funcptr_for_new(self):
@@ -58,6 +55,7 @@
 
     def test_mark_gc_roots(self):
         cpu = CPU(None, None)
+        cpu.setup_once()
         regalloc = RegAlloc(MockAssembler(cpu, MockGcDescr(False)))
         boxes = [BoxPtr() for i in range(len(X86RegisterManager.all_regs))]
         longevity = {}
@@ -81,7 +79,7 @@
         assert len(regalloc.assembler.movs) == 3
         #
         mark = regalloc.get_mark_gc_roots(cpu.gc_ll_descr.gcrootmap)
-        assert mark[0] == 'compressed'
+        assert mark[0] == 'shape'
         base = -WORD * FRAME_FIXED_SIZE
         expected = ['ebx', 'esi', 'edi', base, base-WORD, base-WORD*2]
         assert dict.fromkeys(mark[1:]) == dict.fromkeys(expected)
@@ -90,6 +88,7 @@
     
     cpu = CPU(None, None)
     cpu.gc_ll_descr = MockGcDescr(False)
+    cpu.setup_once()
     
     S = lltype.GcForwardReference()
     S.become(lltype.GcStruct('S', ('field', lltype.Ptr(S)),
@@ -214,6 +213,7 @@
         cpu = CPU(None, None)
         cpu.vtable_offset = WORD
         cpu.gc_ll_descr = GCDescrFastpathMalloc()
+        cpu.setup_once()
 
         NODE = lltype.Struct('node', ('tid', lltype.Signed),
                                      ('value', lltype.Signed))

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc.py	Sun Nov 28 11:49:48 2010
@@ -41,7 +41,10 @@
         self.movs = []
         self.performs = []
         self.lea = []
-        self.cpu = cpu or CPU(None, None)
+        if cpu is None:
+            cpu = CPU(None, None)
+            cpu.setup_once()
+        self.cpu = cpu
         if gc_ll_descr is None:
             gc_ll_descr = MockGcDescr(False)
         self.cpu.gc_ll_descr = gc_ll_descr
@@ -76,6 +79,7 @@
 
 class BaseTestRegalloc(object):
     cpu = CPU(None, None)
+    cpu.setup_once()
 
     def raising_func(i):
         if i:

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc2.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc2.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regalloc2.py	Sun Nov 28 11:49:48 2010
@@ -19,6 +19,7 @@
         ResOperation(rop.FINISH, [v4, v3], None, descr=BasicFailDescr()),
         ]
     cpu = CPU(None, None)
+    cpu.setup_once()
     looptoken = LoopToken()
     cpu.compile_loop(inputargs, operations, looptoken)
     cpu.set_future_value_int(0, 9)
@@ -41,6 +42,7 @@
         ResOperation(rop.FINISH, [v4, v3, tmp5], None, descr=BasicFailDescr()),
             ]
     cpu = CPU(None, None)
+    cpu.setup_once()
     looptoken = LoopToken()
     cpu.compile_loop(inputargs, operations, looptoken)
     cpu.set_future_value_int(0, -10)
@@ -137,6 +139,7 @@
         ResOperation(rop.FINISH, [v40, v36, v37, v31, v16, v34, v35, v23, v22, v29, v14, v39, v30, v38], None, descr=BasicFailDescr()),
             ]
     cpu = CPU(None, None)
+    cpu.setup_once()
     looptoken = LoopToken()
     cpu.compile_loop(inputargs, operations, looptoken)
     cpu.set_future_value_int(0, -13)
@@ -251,6 +254,7 @@
         ResOperation(rop.FINISH, [v40, v10, v36, v26, v13, v30, v21, v33, v18, v25, v31, v32, v28, v29, v35, v38, v20, v39, v34, v23, v37], None, descr=BasicFailDescr()),
             ]
     cpu = CPU(None, None)
+    cpu.setup_once()
     looptoken = LoopToken()
     cpu.compile_loop(inputargs, operations, looptoken)
     cpu.set_future_value_int(0, 17)

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regloc.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regloc.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_regloc.py	Sun Nov 28 11:49:48 2010
@@ -1,4 +1,4 @@
-import struct
+import struct, sys
 from pypy.jit.backend.x86.regloc import *
 from pypy.jit.backend.x86.test.test_rx86 import CodeBuilder32, CodeBuilder64, assert_encodes_as
 from pypy.jit.backend.x86.assembler import heap
@@ -37,26 +37,36 @@
     assert_encodes_as(cb64, "CMP16", (ecx, ImmedLoc(12345)), '\x66\x81\xF9\x39\x30')
     assert_encodes_as(cb64, "CMP16", (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)), '\x66\x41\x81\x7D\x00\x39\x30')
 
-def test_jmp_wraparound():
-    if not IS_X86_32:
-        py.test.skip()
-
-    pos_addr = intmask(0x7FFFFF00)
-    neg_addr = intmask(0x800000BB)
-
-    # JMP to "negative" address from "positive" address
-    s = cb32()
-    s.base_address = pos_addr
-    s.JMP(ImmedLoc(neg_addr))
-    expected_ofs = neg_addr - (pos_addr+5)
-    assert s.getvalue() == '\xE9' + struct.pack("<i", expected_ofs)
-
-    # JMP to a "positive" address from a "negative" address
-    s = cb32()
-    s.base_address = neg_addr
-    s.JMP(ImmedLoc(pos_addr))
-    expected_ofs = pos_addr - (neg_addr+5)
-    assert s.getvalue() == '\xE9' + struct.pack("<i", expected_ofs)
+def test_relocation():
+    from pypy.rpython.lltypesystem import lltype, rffi
+    from pypy.jit.backend.x86 import codebuf
+    for target in [0x01020304, 0x0102030405060708]:
+        if target > sys.maxint:
+            continue
+        mc = codebuf.MachineCodeBlockWrapper()
+        mc.CALL(ImmedLoc(target))
+        length = mc.get_relative_pos()
+        buf = lltype.malloc(rffi.CCHARP.TO, length, flavor='raw')
+        rawstart = rffi.cast(lltype.Signed, buf)
+        if IS_X86_32:
+            assert length == 5
+            assert mc.relocations == [5]
+            expected = "\xE8" + struct.pack('<i', target - (rawstart + 5))
+        elif IS_X86_64:
+            assert mc.relocations == []
+            if target <= 0x7fffffff:
+                assert length == 10
+                expected = (
+                    "\x49\xC7\xC3\x04\x03\x02\x01"  # MOV %r11, target
+                    "\x41\xFF\xD3")                 # CALL *%r11
+            else:
+                assert length == 13
+                expected = (
+                    "\x49\xBB\x08\x07\x06\x05\x04\x03\x02\x01" # MOV %r11, targ
+                    "\x41\xFF\xD3")                 # CALL *%r11
+        mc.copy_to_raw_memory(rawstart)
+        assert ''.join([buf[i] for i in range(length)]) == expected
+        lltype.free(buf, flavor='raw')
 
 
 class Test64Bits:

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_runner.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_runner.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_runner.py	Sun Nov 28 11:49:48 2010
@@ -33,6 +33,7 @@
     
     def setup_method(self, meth):
         self.cpu = CPU(rtyper=None, stats=FakeStats())
+        self.cpu.setup_once()
 
     def test_execute_ptr_operation(self):
         cpu = self.cpu
@@ -329,7 +330,11 @@
                         assert result != expected
 
     def test_compile_bridge_check_profile_info(self):
-        from pypy.jit.backend.x86.test.test_assembler import FakeProfileAgent
+        class FakeProfileAgent(object):
+            def __init__(self):
+                self.functions = []
+            def native_code_written(self, name, address, size):
+                self.functions.append((name, address, size))
         self.cpu.profile_agent = agent = FakeProfileAgent()
 
         i0 = BoxInt()
@@ -398,102 +403,10 @@
         assert res.value == 4.0
 
 
-class TestX86OverflowMC(TestX86):
-
-    def setup_method(self, meth):
-        self.cpu = CPU(rtyper=None, stats=FakeStats())
-        self.cpu.assembler.mc_size = 1024
-
-    def test_overflow_mc(self):
-        ops = []
-        base_v = BoxInt()
-        v = base_v
-        for i in range(1024):
-            next_v = BoxInt()
-            ops.append(ResOperation(rop.INT_ADD, [v, ConstInt(1)], next_v))
-            v = next_v
-        ops.append(ResOperation(rop.FINISH, [v], None,
-                                descr=BasicFailDescr()))
-        looptoken = LoopToken()
-        self.cpu.assembler.setup()
-        old_mc_mc = self.cpu.assembler.mc._mc
-        self.cpu.compile_loop([base_v], ops, looptoken)
-        assert self.cpu.assembler.mc._mc != old_mc_mc   # overflowed
-        self.cpu.set_future_value_int(0, base_v.value)
-        self.cpu.execute_token(looptoken)
-        assert self.cpu.get_latest_value_int(0) == 1024
-
-    def test_overflow_guard_float_cmp(self):
-        # The float comparisons on x86 tend to use small relative jumps,
-        # which may run into trouble if they fall on the edge of a
-        # MachineCodeBlock change.
-        a = BoxFloat(1.0)
-        b = BoxFloat(2.0)
-        failed = BoxInt(41)
-        finished = BoxInt(42)
-
-        # We select guards that will always succeed, so that execution will
-        # continue through the entire set of comparisions
-        ops_to_test = (
-            (rop.FLOAT_LT, [a, b], rop.GUARD_TRUE),
-            (rop.FLOAT_LT, [b, a], rop.GUARD_FALSE),
-
-            (rop.FLOAT_LE, [a, a], rop.GUARD_TRUE),
-            (rop.FLOAT_LE, [a, b], rop.GUARD_TRUE),
-            (rop.FLOAT_LE, [b, a], rop.GUARD_FALSE),
-
-            (rop.FLOAT_EQ, [a, a], rop.GUARD_TRUE),
-            (rop.FLOAT_EQ, [a, b], rop.GUARD_FALSE),
-
-            (rop.FLOAT_NE, [a, b], rop.GUARD_TRUE),
-            (rop.FLOAT_NE, [a, a], rop.GUARD_FALSE),
-
-            (rop.FLOAT_GT, [b, a], rop.GUARD_TRUE),
-            (rop.FLOAT_GT, [a, b], rop.GUARD_FALSE),
-
-            (rop.FLOAT_GE, [a, a], rop.GUARD_TRUE),
-            (rop.FLOAT_GE, [b, a], rop.GUARD_TRUE),
-            (rop.FLOAT_GE, [a, b], rop.GUARD_FALSE),
-        )
-
-        for float_op, args, guard_op in ops_to_test:
-            ops = []
-
-            for i in range(200):
-                cmp_result = BoxInt()
-                ops.append(ResOperation(float_op, args, cmp_result))
-                ops.append(ResOperation(guard_op, [cmp_result], None, descr=BasicFailDescr()))
-                ops[-1].setfailargs([failed])
-
-            ops.append(ResOperation(rop.FINISH, [finished], None, descr=BasicFailDescr()))
-
-            looptoken = LoopToken()
-            self.cpu.compile_loop([a, b, failed, finished], ops, looptoken)
-            self.cpu.set_future_value_float(0, a.value)
-            self.cpu.set_future_value_float(1, b.value)
-            self.cpu.set_future_value_int(2, failed.value)
-            self.cpu.set_future_value_int(3, finished.value)
-            self.cpu.execute_token(looptoken)
-
-            # Really just a sanity check. We're actually interested in
-            # whether the test segfaults.
-            assert self.cpu.get_latest_value_int(0) == finished.value
-
-    def test_overflow_guard_exception(self):
-        for i in range(50):
-            self.test_exceptions()
-
-
 class TestDebuggingAssembler(object):
     def setup_method(self, meth):
-        self.pypylog = os.environ.get('PYPYLOG', None)
-        self.logfile = str(udir.join('x86_runner.log'))
-        os.environ['PYPYLOG'] = "mumble:" + self.logfile
         self.cpu = CPU(rtyper=None, stats=FakeStats())
-
-    def teardown_method(self, meth):
-        if self.pypylog is not None:
-            os.environ['PYPYLOG'] = self.pypylog
+        self.cpu.setup_once()
 
     def test_debugger_on(self):
         from pypy.tool.logparser import parse_log_file, extract_category
@@ -515,8 +428,7 @@
             self.cpu.set_future_value_int(0, 0)
             self.cpu.execute_token(ops.token)
             # check debugging info
-            name, struct = self.cpu.assembler.loop_run_counters[0]
-            assert name == 0       # 'xyz'
+            struct = self.cpu.assembler.loop_run_counters[0]
             assert struct.i == 10
             self.cpu.finish_once()
         finally:

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_rx86.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_rx86.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_rx86.py	Sun Nov 28 11:49:48 2010
@@ -5,7 +5,6 @@
 class CodeBuilderMixin(object):
     def __init__(self):
         self.buffer = []
-        self.base_address = 0x76543210
 
     def writechar(self, c):
         assert isinstance(c, str) and len(c) == 1
@@ -14,9 +13,6 @@
     def getvalue(self):
         return ''.join(self.buffer)
 
-    def tell(self):
-        return self.base_address + len(self.buffer)
-
 def assert_encodes_as(code_builder_cls, insn_name, args, expected_encoding):
     s = code_builder_cls()
     getattr(s, insn_name)(*args)
@@ -104,21 +100,18 @@
 
 def test_call_l(s=None):
     s = s or CodeBuilder32()
-    s.CALL_l(0x01234567)
-    ofs = 0x01234567 - (0x76543210+5)
-    assert s.getvalue() == '\xE8' + struct.pack("<i", ofs)
+    s.CALL_l(0x01234567)   # relative offset
+    assert s.getvalue() == '\xE8' + struct.pack("<i", 0x01234567)
 
 def test_jmp_l():
     s = CodeBuilder32()
-    s.JMP_l(0x01234567)
-    ofs = 0x01234567 - (0x76543210+5)
-    assert s.getvalue() == '\xE9' + struct.pack("<i", ofs)
+    s.JMP_l(0x01234567)   # relative offset
+    assert s.getvalue() == '\xE9' + struct.pack("<i", 0x01234567)
 
 def test_j_il():
     s = CodeBuilder32()
-    s.J_il(5, 0x01234567)
-    ofs = 0x01234567 - (0x76543210+6)
-    assert s.getvalue() == '\x0F\x85' + struct.pack("<i", ofs)
+    s.J_il(5, 0x01234567)   # relative offset
+    assert s.getvalue() == '\x0F\x85' + struct.pack("<i", 0x01234567)
 
 def test_set_ir():
     s = CodeBuilder32()

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zll_random.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zll_random.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zll_random.py	Sun Nov 28 11:49:48 2010
@@ -6,9 +6,7 @@
 
 def test_stress():
     cpu = CPU(None, None)
+    cpu.setup_once()
     r = Random()
     for i in range(1000):
         check_random_function(cpu, LLtypeOperationBuilder, r, i, 1000)
-
-
-

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zmath.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zmath.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/test/test_zmath.py	Sun Nov 28 11:49:48 2010
@@ -4,7 +4,7 @@
 import py, math
 from pypy.module.math.test import test_direct
 from pypy.translator.c.test.test_genc import compile
-from pypy.jit.backend.x86.codebuf import ensure_sse2_floats
+from pypy.jit.backend.x86.support import ensure_sse2_floats
 
 
 def get_test_case((fnname, args, expected)):

Modified: pypy/branch/jit-free-asm/pypy/jit/backend/x86/valgrind.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/jit/backend/x86/valgrind.py	(original)
+++ pypy/branch/jit-free-asm/pypy/jit/backend/x86/valgrind.py	Sun Nov 28 11:49:48 2010
@@ -27,4 +27,4 @@
 
 def discard_translations(data, size):
     if we_are_translated() and VALGRIND_DISCARD_TRANSLATIONS is not None:
-        VALGRIND_DISCARD_TRANSLATIONS(llmemory.cast_ptr_to_adr(data), size)
+        VALGRIND_DISCARD_TRANSLATIONS(llmemory.cast_int_to_adr(data), size)

Modified: pypy/branch/jit-free-asm/pypy/rpython/lltypesystem/llmemory.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/rpython/lltypesystem/llmemory.py	(original)
+++ pypy/branch/jit-free-asm/pypy/rpython/lltypesystem/llmemory.py	Sun Nov 28 11:49:48 2010
@@ -93,8 +93,10 @@
                 return endmarker._as_ptr()
             else:
                 return parent.getitem(index)._as_ptr()
-        elif (isinstance(A, lltype.FixedSizeArray) and
-              array_item_type_match(A.OF, self.TYPE)):
+        elif ((isinstance(A, lltype.FixedSizeArray)
+               or (isinstance(A, lltype.Array) and A._hints.get('nolength',
+                                                                False)))
+              and array_item_type_match(A.OF, self.TYPE)):
             # for array of primitives or pointers
             return lltype.direct_ptradd(firstitemptr, self.repeat)
         else:
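
This llmemory change teaches item-offset arithmetic about raw arrays
declared with the 'nolength' hint, which is how the assembler's code
blocks are laid out.  A small illustration using the standard lltype
API (sketch; only the address arithmetic matters here):

    from pypy.rpython.lltypesystem import lltype, llmemory

    BLOCK = lltype.Array(lltype.Char, hints={'nolength': True})
    p = lltype.malloc(BLOCK, 16, flavor='raw')
    # interior address of item 3; forcing this back to a pointer used
    # to work only for FixedSizeArray, and now also works for Arrays
    # with the 'nolength' hint
    addr = llmemory.cast_ptr_to_adr(p) + llmemory.itemoffsetof(BLOCK, 3)
    lltype.free(p, flavor='raw')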

Modified: pypy/branch/jit-free-asm/pypy/rpython/memory/gctransform/asmgcroot.py
==============================================================================
--- pypy/branch/jit-free-asm/pypy/rpython/memory/gctransform/asmgcroot.py	(original)
+++ pypy/branch/jit-free-asm/pypy/rpython/memory/gctransform/asmgcroot.py	Sun Nov 28 11:49:48 2010
@@ -139,12 +139,13 @@
         self._shape_decompressor = ShapeDecompressor()
         if hasattr(gctransformer.translator, '_jit2gc'):
             jit2gc = gctransformer.translator._jit2gc
-            self._extra_gcmapstart = jit2gc['gcmapstart']
-            self._extra_gcmapend   = jit2gc['gcmapend']
+            self._extra_gcmapstart  = jit2gc['gcmapstart']
+            self._extra_gcmapend    = jit2gc['gcmapend']
+            self._extra_mark_sorted = jit2gc['gcmarksorted']
         else:
-            returns_null = lambda: llmemory.NULL
-            self._extra_gcmapstart = returns_null
-            self._extra_gcmapend   = returns_null
+            self._extra_gcmapstart  = lambda: llmemory.NULL
+            self._extra_gcmapend    = lambda: llmemory.NULL
+            self._extra_mark_sorted = lambda: True
 
     def need_thread_support(self, gctransformer, getfn):
         # Threads supported "out of the box" by the rest of the code.
@@ -295,14 +296,16 @@
             # we have a non-empty JIT-produced table to look in
             item = search_in_gcmap2(gcmapstart2, gcmapend2, retaddr)
             if item:
-                self._shape_decompressor.setaddr(item.address[1])
+                self._shape_decompressor.setaddr(item)
                 return
             # maybe the JIT-produced table is not sorted?
-            sort_gcmap(gcmapstart2, gcmapend2)
-            item = search_in_gcmap2(gcmapstart2, gcmapend2, retaddr)
-            if item:
-                self._shape_decompressor.setaddr(item.address[1])
-                return
+            was_already_sorted = self._extra_mark_sorted()
+            if not was_already_sorted:
+                sort_gcmap(gcmapstart2, gcmapend2)
+                item = search_in_gcmap2(gcmapstart2, gcmapend2, retaddr)
+                if item:
+                    self._shape_decompressor.setaddr(item)
+                    return
         # the item may not have been found because the main array was
         # not sorted.  Sort it and try again.
         win32_follow_gcmap_jmp(gcmapstart, gcmapend)
@@ -357,7 +360,8 @@
     The interval from the start address (included) to the end address
     (excluded) is assumed to be a sorted array of pairs (addr1, addr2).
     This searches for the item with a given addr1 and returns its
-    address.
+    address.  If not found exactly, it tries to return the address
+    of the item left of addr1 (i.e. such that result.address[0] < addr1).
     """
     count = (end - start) // arrayitemsize
     while count > 1:
@@ -386,7 +390,7 @@
     # (item.signed[1] is an address in this case, not a signed at all!)
     item = binary_search(gcmapstart, gcmapend, retaddr)
     if item.address[0] == retaddr:
-        return item     # found
+        return item.address[1]     # found
     else:
         return llmemory.NULL    # failed
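
After this change search_in_gcmap2() returns the shape address
(item.address[1]) directly instead of the pair's address, and the
binary search is documented to land on the item left of addr1 when
there is no exact hit.  A pure-Python model of that contract (sketch):

    def search_pairs(pairs, retaddr):
        # 'pairs' is sorted by its first element, like the gcmap arrays
        candidate = None
        for addr1, addr2 in pairs:
            if addr1 <= retaddr:
                candidate = (addr1, addr2)   # item left of (or at) retaddr
            else:
                break
        if candidate is not None and candidate[0] == retaddr:
            return candidate[1]              # found: the shape address
        return None                          # failed, like llmemory.NULL

    assert search_pairs([(10, 111), (20, 222)], 20) == 222
    assert search_pairs([(10, 111), (20, 222)], 15) is None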
 


