[pypy-commit] pypy ppc-updated-backend: PPC Backend #1: merged "default" and the unfinished

Fri Aug 28 09:13:03 CEST 2015

Author: Armin Rigo <arigo at tunes.org>
Branch: ppc-updated-backend
Changeset: r79264:2b7e225663d7
Date: 2015-08-22 05:41 -0700
http://bitbucket.org/pypy/pypy/changeset/2b7e225663d7/

Log:	PPC Backend #1: merged "default" and the unfinished "ppc-updated-
	backend" branch, and then fixed stuff until we reach a state where
	it doesn't completely segfault whenever we try to run any test. At
	this point, a bit more than half the tests of test_runner.py pass.

diff --git a/rpython/jit/backend/ppc/TODO b/rpython/jit/backend/ppc/TODO
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/TODO
@@ -0,0 +1,4 @@
+
+prepare_guard_int_lt & friends: re-enable in walk_operations()
+
+guard_xyz: fail if the target of the branch is too far away (+32k?)
diff --git a/rpython/jit/backend/ppc/arch.py b/rpython/jit/backend/ppc/arch.py
--- a/rpython/jit/backend/ppc/arch.py
+++ b/rpython/jit/backend/ppc/arch.py
@@ -72,10 +72,3 @@
 
 STD_FRAME_SIZE_IN_BYTES = GPR_SAVE_AREA_OFFSET + len(REGISTERS_SAVED) * WORD
 assert STD_FRAME_SIZE_IN_BYTES % 16 == 0
-
-
-# The JITFRAME_FIXED_SIZE is measured in words, and should be the
-# number of registers that need to be saved into the jitframe when
-# failing a guard, for example.  (Note: it is about the jitframe,
-# not the frame.)
-JITFRAME_FIXED_SIZE = len(r.MANAGED_REGS) + len(r.MANAGED_FP_REGS)
diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -923,15 +923,15 @@
     def flush_icache(x, y): pass
 
 class PPCGuardToken(GuardToken):
-    # We may have to find a suitable default value for fcond
-    def __init__(self, cpu, gcmap, descr, failargs, faillocs, offset,
+    def __init__(self, cpu, gcmap, descr, failargs, faillocs,
                  exc, frame_depth, is_guard_not_invalidated=False,
-                 is_guard_not_forced=False, fcond=c.EQ):
+                 is_guard_not_forced=False, fcond=c.UH):
+        assert fcond != c.UH
         GuardToken.__init__(self, cpu, gcmap, descr, failargs, faillocs, exc,
                             frame_depth, is_guard_not_invalidated,
                             is_guard_not_forced)
         self.fcond = fcond
-        self.offset = offset
+        #self.offset = offset
 
 class OverwritingBuilder(PPCAssembler):
     def __init__(self, cb, start, num_insts):
@@ -990,14 +990,6 @@
         else:
             self.ldx(rD.value, 0, rD.value)
 
-    def store_reg(self, source_reg, addr):
-        with scratch_reg(self):
-            self.load_imm(r.SCRATCH, addr)
-            if IS_PPC_32:
-                self.stwx(source_reg.value, 0, r.SCRATCH.value)
-            else:
-                self.stdx(source_reg.value, 0, r.SCRATCH.value)
-
     def b_offset(self, target):
         curpos = self.currpos()
         offset = target - curpos
@@ -1160,7 +1152,7 @@
         self.writechar(chr(word & 0xFF))
 
     def currpos(self):
-        return self.get_rel_pos()
+        return self.get_relative_pos()
 
     def flush_cache(self, addr):
         startaddr = rffi.cast(lltype.Signed, addr)
diff --git a/rpython/jit/backend/ppc/condition.py b/rpython/jit/backend/ppc/condition.py
--- a/rpython/jit/backend/ppc/condition.py
+++ b/rpython/jit/backend/ppc/condition.py
@@ -9,6 +9,7 @@
 LT = (0, SET)
 EQ = (2, SET)
 GE = (0, UNSET)
+UH = (-1, -1)    # invalid
 
 # values below are random ...
 
diff --git a/rpython/jit/backend/ppc/jump.py b/rpython/jit/backend/ppc/jump.py
--- a/rpython/jit/backend/ppc/jump.py
+++ b/rpython/jit/backend/ppc/jump.py
@@ -1,7 +1,5 @@
 # ../x86/jump.py
 # XXX combine with ../x86/jump.py and move to llsupport
-import sys
-from rpython.tool.pairtype import extendabletype
 
 def remap_frame_layout(assembler, src_locations, dst_locations, tmpreg):
     pending_dests = len(dst_locations)
@@ -77,9 +75,9 @@
 def remap_frame_layout_mixed(assembler,
                              src_locations1, dst_locations1, tmpreg1,
                              src_locations2, dst_locations2, tmpreg2):
-    # find and push the xmm stack locations from src_locations2 that
+    # find and push the fp stack locations from src_locations2 that
     # are going to be overwritten by dst_locations1
-    from pypy.jit.backend.ppc.arch import WORD
+    from rpython.jit.backend.ppc.arch import WORD
     extrapushes = []
     dst_keys = {}
     for loc in dst_locations1:
@@ -104,10 +102,10 @@
     # remap the integer and pointer registers and stack locations
     remap_frame_layout(assembler, src_locations1, dst_locations1, tmpreg1)
     #
-    # remap the vfp registers and stack locations
+    # remap the fp registers and stack locations
     remap_frame_layout(assembler, src_locations2, dst_locations2, tmpreg2)
     #
-    # finally, pop the extra xmm stack locations
+    # finally, pop the extra fp stack locations
     while len(extrapushes) > 0:
         loc = extrapushes.pop()
         assembler.regalloc_pop(loc)
diff --git a/rpython/jit/backend/ppc/locations.py b/rpython/jit/backend/ppc/locations.py
--- a/rpython/jit/backend/ppc/locations.py
+++ b/rpython/jit/backend/ppc/locations.py
@@ -1,9 +1,7 @@
 from rpython.jit.metainterp.history import INT, FLOAT
 import sys
 
-# TODO: solve the circular import: runner -> arch -> register -> locations ->
-# arch
-# XXX import from arch.py, currently we have a circular import
+# cannot import from arch.py, currently we have a circular import
 if sys.maxint == (2**31 - 1):
     WORD = 4
     FWORD = 8
@@ -12,8 +10,6 @@
     FWORD = 8
 DWORD = 2 * WORD
 
-# JITFRAME_FIXED_SIZE is also duplicated because of the circular import
-JITFRAME_FIXED_SIZE = 27 + 31 + 1 + 4 + 1
 
 class AssemblerLocation(object):
     _immutable_ = True
@@ -148,11 +144,8 @@
 def imm(val):
     return ImmLocation(val)
 
-def get_spp_offset(pos):
-    if pos < 0:
-        return -pos * WORD
-    else:
-        return -(pos + 1) * WORD
-
 def get_fp_offset(base_ofs, position):
-    return base_ofs + position
+    from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
+    # Argument is a frame position (0, 1, 2...).
+    # Returns the n'th word beyond the fixed frame size.
+    return base_ofs + WORD * (position + JITFRAME_FIXED_SIZE)
diff --git a/rpython/jit/backend/ppc/opassembler.py b/rpython/jit/backend/ppc/opassembler.py
--- a/rpython/jit/backend/ppc/opassembler.py
+++ b/rpython/jit/backend/ppc/opassembler.py
@@ -211,24 +211,24 @@
     def emit_cast_float_to_int(self, op, arglocs, regalloc):
         l0, temp_loc, res = arglocs
         self.mc.fctidz(temp_loc.value, l0.value)
-        self.mc.stfd(temp_loc.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
-        self.mc.ld(res.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
+        self.mc.stfd(temp_loc.value, r.SP.value, -16)
+        self.mc.ld(res.value, r.SP.value, -16)
 
     def emit_cast_int_to_float(self, op, arglocs, regalloc):
         l0, temp_loc, res = arglocs
-        self.mc.std(l0.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
-        self.mc.lfd(temp_loc.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
+        self.mc.std(l0.value, r.SP.value, -16)
+        self.mc.lfd(temp_loc.value, r.SP.value, -16)
         self.mc.fcfid(res.value, temp_loc.value)
 
     def emit_convert_float_bytes_to_longlong(self, op, arglocs, regalloc):
         l0, res = arglocs
-        self.mc.stfd(l0.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
-        self.mc.ld(res.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
+        self.mc.stfd(l0.value, r.SP.value, -16)
+        self.mc.ld(res.value, r.SP.value, -16)
 
     def emit_convert_longlong_bytes_to_float(self, op, arglocs, regalloc):
         l0, res = arglocs
-        self.mc.std(l0.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
-        self.mc.lfd(res.value, r.SPP.value, FORCE_INDEX_OFS + WORD)
+        self.mc.std(l0.value, r.SP.value, -16)
+        self.mc.lfd(res.value, r.SP.value, -16)
 
 class GuardOpAssembler(object):
 
@@ -237,22 +237,22 @@
     def _emit_guard(self, op, arglocs, fcond, save_exc=False,
                     is_guard_not_invalidated=False,
                     is_guard_not_forced=False):
-        pos = self.mc.currpos()
-        self.mc.nop()     # has to be patched later on
         token = self.build_guard_token(op, arglocs[0].value, arglocs[1:],
                                        fcond, save_exc, is_guard_not_invalidated,
                                        is_guard_not_forced)
-        self.pending_guards.append(token)
+        token.pos_jump_offset = self.mc.currpos()
+        self.mc.nop()     # has to be patched later on
+        self.pending_guard_tokens.append(token)
 
     def build_guard_token(self, op, frame_depth, arglocs, fcond, save_exc,
                           is_guard_not_invalidated=False,
                           is_guard_not_forced=False):
         descr = op.getdescr()
-        offset = self.mc.currpos()
-        gcmap = allocate_gcmap(self, frame_depth, JITFRAME_FIXED_SIZE)
+        gcmap = allocate_gcmap(self, frame_depth, r.JITFRAME_FIXED_SIZE)
         token = PPCGuardToken(self.cpu, gcmap, descr, op.getfailargs(),
                               arglocs, save_exc, frame_depth,
-                              is_guard_not_invalidated, is_guard_not_forced)
+                              is_guard_not_invalidated, is_guard_not_forced,
+                              fcond)
         return token
 
     def emit_guard_true(self, op, arglocs, regalloc):
@@ -356,7 +356,10 @@
         base_ofs = self.cpu.get_baseofs_of_frame_field()
         if len(arglocs) == 2:
             [return_val, fail_descr_loc] = arglocs
-            self.mc.std(return_val.value, r.SPP.value, base_ofs)
+            if op.getarg(0).type == FLOAT:
+                self.mc.stfd(return_val.value, r.SPP.value, base_ofs)
+            else:
+                self.mc.std(return_val.value, r.SPP.value, base_ofs)
         else:
             [fail_descr_loc] = arglocs
 
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -7,12 +7,12 @@
                                           LR_BC_OFFSET, REGISTERS_SAVED,
                                           GPR_SAVE_AREA_OFFSET,
                                           THREADLOCAL_ADDR_OFFSET,
-                                          STD_FRAME_SIZE_IN_BYTES,
-                                          JITFRAME_FIXED_SIZE)
+                                          STD_FRAME_SIZE_IN_BYTES)
 from rpython.jit.backend.ppc.helper.assembler import Saved_Volatiles
 from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg
 import rpython.jit.backend.ppc.register as r
 import rpython.jit.backend.ppc.condition as c
+from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
 from rpython.jit.metainterp.history import AbstractFailDescr
 from rpython.jit.metainterp.history import ConstInt, BoxInt
 from rpython.jit.backend.llsupport import jitframe
@@ -30,7 +30,7 @@
 from rpython.rtyper.annlowlevel import llhelper
 from rpython.rlib.objectmodel import we_are_translated, specialize
 from rpython.rtyper.lltypesystem.lloperation import llop
-from rpython.jit.backend.ppc.locations import StackLocation, get_spp_offset, imm
+from rpython.jit.backend.ppc.locations import StackLocation, get_fp_offset, imm
 from rpython.rlib.jit import AsmInfo
 from rpython.rlib.objectmodel import compute_unique_id
 from rpython.rlib.rarithmetic import r_uint
@@ -166,50 +166,29 @@
         mc.addi(r.r15.value, r.r15.value, -2 * WORD)  # SUB r15, r15, 2*WORD
         mc.store(r.r15.value, r.r14.value, 0) # STR r15, [rootstacktop]
 
+    def new_stack_loc(self, i, tp):
+        base_ofs = self.cpu.get_baseofs_of_frame_field()
+        return StackLocation(i, get_fp_offset(base_ofs, i), tp)
+
     def setup_failure_recovery(self):
-
-        @rgc.no_collect
-        def failure_recovery_func(mem_loc, spilling_pointer,
-                                  managed_registers_pointer):
-            """
-                mem_loc is a pointer to the beginning of the encoding.
-
-                spilling_pointer is the address of the spilling area.
-            """
-            regs = rffi.cast(rffi.LONGP, managed_registers_pointer)
-            fpregs = rffi.ptradd(regs, len(r.MANAGED_REGS))
-            fpregs = rffi.cast(rffi.LONGP, fpregs)
-            return self.decode_registers_and_descr(mem_loc, 
-                                                   spilling_pointer,
-                                                   regs, fpregs)
-
-        self.failure_recovery_func = failure_recovery_func
-        self.failure_recovery_code = [0, 0, 0]
-
-    recovery_func_sign = lltype.Ptr(lltype.FuncType([lltype.Signed] * 3,
-            lltype.Signed))
+        self.failure_recovery_code = [0, 0, 0, 0]
 
     # TODO: see with we really need the ignored_regs argument
     def _push_all_regs_to_jitframe(self, mc, ignored_regs, withfloats,
                                    callee_only=False):
         base_ofs = self.cpu.get_baseofs_of_frame_field()
         if callee_only:
-            # Only push registers used to pass arguments to the callee
-            regs = r.VOLATILES
+            regs = XXX
         else:
-            regs = r.ALL_REGS
+            regs = r.MANAGED_REGS
         # For now, just push all regs to the jitframe
-        for i, reg in enumerate(regs):
-            # XXX should we progress to higher addresses?
-            mc.store_reg(reg, base_ofs - (i * WORD))
-
+        for reg in regs:
+            v = r.ALL_REG_INDEXES[reg]
+            mc.std(reg.value, r.SPP.value, base_ofs + v * WORD)
         if withfloats:
-            if callee_only:
-                regs = r.VOLATILES_FLOAT
-            else:
-                regs = r.ALL_FLOAT_REGS
-            for i, reg in enumerate(regs):
-                pass # TODO find or create the proper store indexed for fpr's
+            for reg in r.MANAGED_FP_REGS:
+                v = r.ALL_REG_INDEXES[reg]
+                mc.stfd(reg.value, r.SPP.value, base_ofs + v * WORD)
 
     def _pop_all_regs_from_jitframe(self, mc, ignored_regs, withfloats,
                                     callee_only=False):
@@ -230,135 +209,36 @@
             for i, reg in enumerate(regs):
                 pass # TODO find or create the proper load indexed for fpr's
 
-    @rgc.no_collect
-    def decode_registers_and_descr(self, mem_loc, spp, registers, fp_registers):
-        """Decode locations encoded in memory at mem_loc and write the values
-        to the failboxes.  Values for spilled vars and registers are stored on
-        stack at frame_loc """
-        assert spp & 1 == 0
-        self.fail_force_index = spp + FORCE_INDEX_OFS
-        bytecode = rffi.cast(rffi.UCHARP, mem_loc)
-        num = 0
-        value = 0
-        fvalue = 0
-        code_inputarg = False
-        while True:
-            code = rffi.cast(lltype.Signed, bytecode[0])
-            bytecode = rffi.ptradd(bytecode, 1)
-            if code >= self.CODE_FROMSTACK:
-                if code > 0x7F:
-                    shift = 7
-                    code &= 0x7F
-                    while True:
-                        nextcode = rffi.cast(lltype.Signed, bytecode[0])
-                        bytecode = rffi.ptradd(bytecode, 1)
-                        code |= (nextcode & 0x7F) << shift
-                        shift += 7
-                        if nextcode <= 0x7F:
-                            break
-                # load the value from the stack
-                kind = code & 3
-                code = int((code - self.CODE_FROMSTACK) >> 2)
-                if code_inputarg:
-                    code = ~code
-                    code_inputarg = False
-                if kind == self.DESCR_FLOAT:
-                    start = spp + get_spp_offset(int(code))
-                    fvalue = rffi.cast(rffi.LONGP, start)[0]
-                else:
-                    start = spp + get_spp_offset(int(code))
-                    value = rffi.cast(rffi.LONGP, start)[0]
-            else:
-                # 'code' identifies a register: load its value
-                kind = code & 3
-                if kind == self.DESCR_SPECIAL:
-                    if code == self.CODE_HOLE:
-                        num += 1
-                        continue
-                    if code == self.CODE_INPUTARG:
-                        code_inputarg = True
-                        continue
-                    assert code == self.CODE_STOP
-                    break
-                code >>= 2
-                if kind == self.DESCR_FLOAT:
-                    reg_index = r.get_managed_fpreg_index(code)
-                    fvalue = fp_registers[reg_index]
-                else:
-                    reg_index = r.get_managed_reg_index(code)
-                    value = registers[reg_index]
-            # store the loaded value into fail_boxes_<type>
-            if kind == self.DESCR_FLOAT:
-                tgt = self.fail_boxes_float.get_addr_for_num(num)
-                rffi.cast(rffi.LONGP, tgt)[0] = fvalue
-            else:
-                if kind == self.DESCR_INT:
-                    tgt = self.fail_boxes_int.get_addr_for_num(num)
-                elif kind == self.DESCR_REF:
-                    assert (value & 3) == 0, "misaligned pointer"
-                    tgt = self.fail_boxes_ptr.get_addr_for_num(num)
-                else:
-                    assert 0, "bogus kind"
-                rffi.cast(rffi.LONGP, tgt)[0] = value
-            num += 1
-        self.fail_boxes_count = num
-        fail_index = rffi.cast(rffi.INTP, bytecode)[0]
-        fail_index = rffi.cast(lltype.Signed, fail_index)
-        return fail_index
+    def _build_failure_recovery(self, exc, withfloats=False):
+        mc = PPCBuilder()
+        self.mc = mc
 
-    def decode_inputargs(self, code):
-        descr_to_box_type = [REF, INT, FLOAT]
-        bytecode = rffi.cast(rffi.UCHARP, code)
-        arglocs = []
-        code_inputarg = False
-        while 1:
-            # decode the next instruction from the bytecode
-            code = rffi.cast(lltype.Signed, bytecode[0])
-            bytecode = rffi.ptradd(bytecode, 1)
-            if code >= self.CODE_FROMSTACK:
-                # 'code' identifies a stack location
-                if code > 0x7F:
-                    shift = 7
-                    code &= 0x7F
-                    while True:
-                        nextcode = rffi.cast(lltype.Signed, bytecode[0])
-                        bytecode = rffi.ptradd(bytecode, 1)
-                        code |= (nextcode & 0x7F) << shift
-                        shift += 7
-                        if nextcode <= 0x7F:
-                            break
-                kind = code & 3
-                code = (code - self.CODE_FROMSTACK) >> 2
-                if code_inputarg:
-                    code = ~code
-                    code_inputarg = False
-                loc = PPCFrameManager.frame_pos(code, descr_to_box_type[kind])
-            elif code == self.CODE_STOP:
-                break
-            elif code == self.CODE_HOLE:
-                continue
-            elif code == self.CODE_INPUTARG:
-                code_inputarg = True
-                continue
-            else:
-                # 'code' identifies a register
-                kind = code & 3
-                code >>= 2
-                if kind == self.DESCR_FLOAT:
-                    assert (r.ALL_FLOAT_REGS[code] is 
-                            r.MANAGED_FP_REGS[r.get_managed_fpreg_index(code)])
-                    loc = r.ALL_FLOAT_REGS[code]
-                else:
-                    #loc = r.all_regs[code]
-                    assert (r.ALL_REGS[code] is 
-                            r.MANAGED_REGS[r.get_managed_reg_index(code)])
-                    loc = r.ALL_REGS[code]
-            arglocs.append(loc)
-        return arglocs[:]
+        # fill in the jf_descr and jf_gcmap fields of the frame according
+        # to which failure we are resuming from.  These are set before
+        # this function is called (see generate_quick_failure()).
+        ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
+        ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+        mc.store(r.r0.value, r.SPP.value, ofs)
+        mc.store(r.r2.value, r.SPP.value, ofs2)
 
-    # TODO
-    def _build_failure_recovery(self, exc, withfloats=False):
-        pass
+        self._push_all_regs_to_jitframe(mc, [], withfloats)
+
+        if exc:
+            # We might have an exception pending.  Load it into r2...
+            mc.write32(0)
+            #mc.MOV(ebx, heap(self.cpu.pos_exc_value()))
+            #mc.MOV(heap(self.cpu.pos_exception()), imm0)
+            #mc.MOV(heap(self.cpu.pos_exc_value()), imm0)
+            ## ...and save ebx into 'jf_guard_exc'
+            #offset = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
+            #mc.MOV_br(offset, ebx.value)
+
+        # now we return from the complete frame, which starts from
+        # _call_header_with_stack_check().  The _call_footer below does it.
+        self._call_footer()
+        rawstart = mc.materialize(self.cpu, [])
+        self.failure_recovery_code[exc + 2 * withfloats] = rawstart
+        self.mc = None
 
     # TODO
     def build_frame_realloc_slowpath(self):
@@ -836,7 +716,7 @@
         #
         self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE,
                                 rawstart)
-        looptoken._ll_loop_code = looppos + rawstart
+        looptoken._ppc_loop_code = looppos + rawstart
         debug_start("jit-backend-addr")
         debug_print("Loop %d (%s) has address 0x%x to 0x%x (bootstrap 0x%x)" % (
             looptoken.number, loopname,
@@ -881,56 +761,48 @@
             frame_depth = max(frame_depth, target_frame_depth)
         return frame_depth
 
-    def assemble_bridge(self, faildescr, inputargs, operations, looptoken, log):
+    @rgc.no_release_gil
+    def assemble_bridge(self, faildescr, inputargs, operations,
+                        original_loop_token, log, logger):
         if not we_are_translated():
+            # Arguments should be unique
             assert len(set(inputargs)) == len(inputargs)
 
-        self.setup(looptoken)
+        self.setup(original_loop_token)
         descr_number = compute_unique_id(faildescr)
         if log:
             operations = self._inject_debugging_code(faildescr, operations,
                                                      'b', descr_number)
-        assert isinstance(faildescr, AbstractFailDescr)
+
         arglocs = self.rebuild_faillocs_from_descr(faildescr, inputargs)
-
         regalloc = Regalloc(assembler=self)
+        startpos = self.mc.get_relative_pos()
         operations = regalloc.prepare_bridge(inputargs, arglocs,
                                              operations,
                                              self.current_clt.allgcrefs,
                                              self.current_clt.frame_info)
-
-        startpos = self.mc.currpos()
-        spilling_area, param_depth = self._assemble(operations, regalloc)
-        codeendpos = self.mc.currpos()
-
+        #self._check_frame_depth(self.mc, regalloc.get_gcmap())
+        frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations)
+        codeendpos = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries()
-
-        rawstart = self.materialize_loop(looptoken, False)
-        self.process_pending_guards(rawstart)
-        self.patch_trace(faildescr, looptoken, rawstart, regalloc)
+        fullsize = self.mc.get_relative_pos()
+        #
+        rawstart = self.materialize_loop(original_loop_token)
+        self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE,
+                                rawstart)
+        debug_bridge(descr_number, rawstart, codeendpos)
+        self.patch_pending_failure_recoveries(rawstart)
+        # patch the jump from original guard
+        self.patch_jump_for_descr(faildescr, rawstart)
+        ops_offset = self.mc.ops_offset
+        frame_depth = max(self.current_clt.frame_info.jfi_frame_depth,
+                          frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
+        if logger:
+            logger.log_bridge(inputargs, operations, "rewritten",
+                              ops_offset=ops_offset)
         self.fixup_target_tokens(rawstart)
-        self.current_clt.frame_depth = max(self.current_clt.frame_depth,
-                spilling_area)
-        self.current_clt.param_depth = max(self.current_clt.param_depth, param_depth)
-
-        if not we_are_translated():
-            # for the benefit of tests
-            faildescr._ppc_bridge_frame_depth = self.current_clt.frame_depth
-            faildescr._ppc_bridge_param_depth = self.current_clt.param_depth
-            if log:
-                self.mc._dump_trace(rawstart, 'bridge_%d.asm' %
-                self.cpu.total_compiled_bridges)
-
-        self._patch_sp_offset(sp_patch_location, rawstart)
-
-        ops_offset = self.mc.ops_offset
+        self.update_frame_depth(frame_depth)
         self.teardown()
-
-        debug_start("jit-backend-addr")
-        debug_print("bridge out of Guard %d has address %x to %x" %
-                    (descr_number, rawstart, rawstart + codeendpos))
-        debug_stop("jit-backend-addr")
-
         return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)
 
     def _patch_sp_offset(self, sp_patch_location, rawstart):
@@ -1053,21 +925,20 @@
         #print "=== Loop start is at %s ===" % hex(r_uint(start))
         return start
 
-    def push_gcmap(self, mc, gcmap, push=False, store=False):
+    def load_gcmap(self, mc, gcmap):
+        # load the current gcmap into register r2
         ptr = rffi.cast(lltype.Signed, gcmap)
-        if push:
-            with scratch_reg(mc):
-                mc.load_imm(r.SCRATCH, ptr)
-                mc.stdu(r.SCRATCH.value, r.SP.value, -WORD)
-        elif store:
-            assert False, "Not implemented"
+        mc.load_imm(r.r2, ptr)
 
     def generate_quick_failure(self, guardtok):
         startpos = self.mc.currpos()
         fail_descr, target = self.store_info_on_descr(startpos, guardtok)
-        self.regalloc_push(imm(fail_descr))
-        self.push_gcmap(self.mc, gcmap=guardtok.gcmap, push=True)
-        self.mc.call(target)
+        assert target != 0
+        self.load_gcmap(self.mc, gcmap=guardtok.gcmap)   # -> r2
+        self.mc.load_imm(r.r0, target)
+        self.mc.mtctr(r.r0.value)
+        self.mc.load_imm(r.r0, fail_descr)
+        self.mc.bctr()
         return startpos
 
     def write_pending_failure_recoveries(self):
@@ -1079,32 +950,37 @@
     def patch_pending_failure_recoveries(self, rawstart):
         clt = self.current_clt
         for tok in self.pending_guard_tokens:
-            xxxxxxxxx
-
-    def process_pending_guards(self, block_start):
-        clt = self.current_clt
-        for tok in self.pending_guards:
-            descr = tok.faildescr
-            assert isinstance(descr, AbstractFailDescr)
-            descr._ppc_block_start = block_start
-
+            addr = rawstart + tok.pos_jump_offset
+            #
+            # XXX see patch_jump_for_descr()
+            #tok.faildescr.adr_jump_offset = addr
+            tok.faildescr.adr_recovery_stub = rawstart + tok.pos_recovery_stub
+            #
+            relative_target = tok.pos_recovery_stub - tok.pos_jump_offset
+            #
             if not tok.is_guard_not_invalidated:
                 mc = PPCBuilder()
-                offset = tok.pos_recovery_stub - tok.offset
-                mc.b_cond_offset(offset, tok.fcond)
-                mc.copy_to_raw_memory(block_start + tok.offset)
+                mc.b_cond_offset(relative_target, tok.fcond)
+                mc.copy_to_raw_memory(addr)
             else:
-                clt.invalidate_positions.append((block_start + tok.offset,
-                        descr._ppc_guard_pos - tok.offset))
+                # GUARD_NOT_INVALIDATED, record an entry in
+                # clt.invalidate_positions of the form:
+                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
+                #      relative-target-to-use)
+                relpos = tok.pos_jump_offset
+                clt.invalidate_positions.append((rawstart + relpos,
+                                                 relative_target))
 
-    def patch_trace(self, faildescr, looptoken, bridge_addr, regalloc):
-        # The first instruction (word) is not overwritten, because it is the
-        # one that actually checks the condition
+    def patch_jump_for_descr(self, faildescr, adr_new_target):
+        # 'faildescr.adr_jump_offset' is the address of an instruction that is a
+        # conditional jump.  We must patch this conditional jump to go
+        # to 'adr_new_target'.  If the target is too far away, we can't
+        # patch it inplace, and instead we patch the quick failure code
+        # (which should be at least 5 instructions, so enough).
+        # --- XXX for now we always use the second solution ---
         mc = PPCBuilder()
-        patch_addr = faildescr._ppc_block_start + faildescr._ppc_guard_pos
-        mc.b_abs(bridge_addr)
-        mc.copy_to_raw_memory(patch_addr)
-        faildescr._failure_recovery_code_ofs = 0
+        mc.b_abs(adr_new_target)
+        mc.copy_to_raw_memory(faildescr.adr_recovery_stub)
 
     def get_asmmemmgr_blocks(self, looptoken):
         clt = looptoken.compiled_loop_token
diff --git a/rpython/jit/backend/ppc/regalloc.py b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -5,7 +5,7 @@
 from rpython.jit.codewriter import longlong
 from rpython.jit.backend.ppc.jump import (remap_frame_layout,
                                           remap_frame_layout_mixed)
-from rpython.jit.backend.ppc.locations import imm, get_fp_offset, get_spp_offset
+from rpython.jit.backend.ppc.locations import imm, get_fp_offset
 from rpython.jit.backend.ppc.helper.regalloc import (_check_imm_arg,
                                                      prepare_cmp_op,
                                                      prepare_unary_int_op,
@@ -192,7 +192,7 @@
 
     @staticmethod
     def get_loc_index(loc):
-        assert loc.is_stack()
+        assert isinstance(loc, locations.StackLocation)
         return loc.position
 
 class Regalloc(BaseRegalloc):
@@ -227,28 +227,41 @@
         # note: we need to make a copy of inputargs because possibly_free_vars
         # is also used on op args, which is a non-resizable list
         self.possibly_free_vars(list(inputargs))
+        self.min_bytes_before_label = 4    # for redirect_call_assembler()
         return operations
 
-    def prepare_bridge(self, inputargs, arglocs, ops):
-        self._prepare(inputargs, ops)
+    def prepare_bridge(self, inputargs, arglocs, operations, allgcrefs,
+                       frame_info):
+        operations = self._prepare(inputargs, operations, allgcrefs)
         self._update_bindings(arglocs, inputargs)
+        self.min_bytes_before_label = 0
+        return operations
+
+    def ensure_next_label_is_at_least_at_position(self, at_least_position):
+        self.min_bytes_before_label = max(self.min_bytes_before_label,
+                                          at_least_position)
 
     def _update_bindings(self, locs, inputargs):
+        # XXX this should probably go to llsupport/regalloc.py
         used = {}
         i = 0
         for loc in locs:
+            if loc is None: # xxx bit kludgy
+                loc = r.SPP
             arg = inputargs[i]
             i += 1
             if loc.is_reg():
-                self.rm.reg_bindings[arg] = loc
+                if loc is r.SPP:
+                    self.rm.bindings_to_frame_reg[arg] = None
+                else:
+                    self.rm.reg_bindings[arg] = loc
+                    used[loc] = None
             elif loc.is_fp_reg():
                 self.fprm.reg_bindings[arg] = loc
+                used[loc] = None
             else:
                 assert loc.is_stack()
-                self.frame_manager.set_binding(arg, loc)
-            used[loc] = None
-
-        # XXX combine with x86 code and move to llsupport
+                self.fm.bind(arg, loc)
         self.rm.free_regs = []
         for reg in self.rm.all_regs:
             if reg not in used:
@@ -257,9 +270,10 @@
         for reg in self.fprm.all_regs:
             if reg not in used:
                 self.fprm.free_regs.append(reg)
-        # note: we need to make a copy of inputargs because possibly_free_vars
-        # is also used on op args, which is a non-resizable list
         self.possibly_free_vars(list(inputargs))
+        self.fm.finish_binding()
+        self.rm._check_invariants()
+        self.fprm._check_invariants()
 
     def get_final_frame_depth(self):
         return self.fm.get_frame_depth()
@@ -317,7 +331,12 @@
                 i += 1
                 self.possibly_free_vars_for_op(op)
                 continue
-            if self.can_merge_with_next_guard(op, i, operations):
+            if self.can_merge_with_next_guard(op, i, operations) and (
+                # XXX FIX: merging is temporarily limited to these calls
+                op.getopnum() in (rop.CALL_RELEASE_GIL, rop.CALL_ASSEMBLER,
+                                  rop.CALL_MAY_FORCE)
+                # XXX FIX: re-enable prepare_guard_int_lt & co (see ppc/TODO)
+                ):
                 arglocs = oplist_with_guard[op.getopnum()](self, op,
                                                            operations[i + 1])
                 assert arglocs is not None
@@ -339,11 +358,18 @@
             i += 1
         assert not self.rm.reg_bindings
         assert not self.fprm.reg_bindings
-        #self.flush_loop()
+        self.flush_loop()
         self.assembler.mc.mark_op(None) # end of the loop
         for arg in inputargs:
             self.possibly_free_var(arg)
 
+    def flush_loop(self):
+        # Emit nops until get_relative_pos() reaches min_bytes_before_label
+        # (rare; e.g. a guard_not_invalidated immediately before a label)
+        mc = self.assembler.mc
+        while self.min_bytes_before_label > mc.get_relative_pos():
+            mc.nop()
+
     def loc(self, var):
         if var.type == FLOAT:
             return self.fprm.loc(var)
@@ -360,6 +386,10 @@
         else:
             self.rm.force_spill_var(var)
 
+    def _consider_force_spill(self, op):
+        # This operation is used only for testing
+        self.force_spill_var(op.getarg(0))
+
     def before_call(self, force_store=[], save_all_regs=False):
         self.rm.before_call(force_store, save_all_regs)
         self.fprm.before_call(force_store, save_all_regs)
@@ -561,12 +591,13 @@
     def _prepare_guard(self, op, args=None):
         if args is None:
             args = []
-        args.append(imm(len(self.frame_manager.used)))
+        args.append(imm(self.fm.get_frame_depth()))
         for arg in op.getfailargs():
             if arg:
                 args.append(self.loc(arg))
             else:
                 args.append(None)
+        self.possibly_free_vars(op.getfailargs())
         return args
     
     def prepare_guard_true(self, op):
@@ -695,7 +726,7 @@
             if isinstance(box, Box):
                 loc = arglocs[i]
                 if loc is not None and loc.is_stack():
-                    self.frame_manager.hint_frame_locations[box] = loc
+                    self.fm.hint_frame_pos[box] = self.fm.get_loc_index(loc)
 
     def prepare_jump(self, op):
         descr = op.getdescr()
@@ -1067,7 +1098,6 @@
         return [res_loc]
 
     def prepare_label(self, op):
-        # XXX big refactoring needed?
         descr = op.getdescr()
         assert isinstance(descr, TargetToken)
         inputargs = op.getarglist()
@@ -1082,15 +1112,26 @@
             assert isinstance(arg, Box)
             if self.last_real_usage.get(arg, -1) <= position:
                 self.force_spill_var(arg)
-
+        #
+        # we need to make sure that no variable is stored in spp (=r31)
+        for arg in inputargs:
+            if self.loc(arg) is r.SPP:
+                loc2 = self.fm.loc(arg)
+                self.assembler.mc.store(r.SPP, loc2)
+        self.rm.bindings_to_frame_reg.clear()
         #
         for i in range(len(inputargs)):
             arg = inputargs[i]
             assert isinstance(arg, Box)
             loc = self.loc(arg)
+            assert loc is not r.SPP
             arglocs[i] = loc
             if loc.is_reg():
-                self.frame_manager.mark_as_free(arg)
+                self.fm.mark_as_free(arg)
+        #
+        # if we are too close to the start of the loop, the label's target may
+        # get overridden by redirect_call_assembler().  (rare case)
+        self.flush_loop()
         #
         descr._ppc_arglocs = arglocs
         descr._ppc_loop_code = self.assembler.mc.currpos()
diff --git a/rpython/jit/backend/ppc/register.py b/rpython/jit/backend/ppc/register.py
--- a/rpython/jit/backend/ppc/register.py
+++ b/rpython/jit/backend/ppc/register.py
@@ -37,6 +37,16 @@
 
 MANAGED_FP_REGS = VOLATILES_FLOAT[1:] #+ NONVOLATILES_FLOAT
 
+
+# ALL_REG_INDEXES numbers the managed registers (general, then float).
+# JITFRAME_FIXED_SIZE is measured in words: it is the number of these
+# registers, which are saved into the jitframe e.g. when failing a guard.
+ALL_REG_INDEXES = {}
+for _r in MANAGED_REGS + MANAGED_FP_REGS:
+    ALL_REG_INDEXES[_r] = len(ALL_REG_INDEXES)
+JITFRAME_FIXED_SIZE = len(ALL_REG_INDEXES)
+
+
 PARAM_REGS = [r3, r4, r5, r6, r7, r8, r9, r10]
 PARAM_FPREGS = [f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13]
 
diff --git a/rpython/jit/backend/ppc/runner.py b/rpython/jit/backend/ppc/runner.py
--- a/rpython/jit/backend/ppc/runner.py
+++ b/rpython/jit/backend/ppc/runner.py
@@ -18,8 +18,13 @@
     IS_64_BIT = True
     BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
 
+    from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
     frame_reg = r.SP
-    all_reg_indexes = range(len(r.ALL_REGS))
+    all_reg_indexes = [-1] * 32
+    for _i, _r in enumerate(r.MANAGED_REGS):
+        all_reg_indexes[_r.value] = _i
+    gen_regs = r.MANAGED_REGS
+    float_regs = r.MANAGED_FP_REGS
 
     def __init__(self, rtyper, stats, opts=None, translate_support_code=False,
                  gcdescr=None):
@@ -31,7 +36,7 @@
         AbstractLLCPU.__init__(self, rtyper, stats, opts,
                                translate_support_code, gcdescr)
 
-        # floats are not supported yet
+        # floats are supported.  singlefloats are not supported yet
         self.supports_floats = True
 
     def setup(self):
@@ -44,11 +49,11 @@
         self.assembler.finish_once()
 
     def compile_bridge(self, faildescr, inputargs, operations,
-                      original_loop_token, log=False):
+                       original_loop_token, log=True, logger=None):
         clt = original_loop_token.compiled_loop_token
         clt.compiling_a_bridge()
         return self.assembler.assemble_bridge(faildescr, inputargs, operations,
-                                       original_loop_token, log=log)
+                                              original_loop_token, log, logger)
 
     @staticmethod
     def cast_ptr_to_int(x):
diff --git a/rpython/jit/backend/ppc/test/test_runner.py b/rpython/jit/backend/ppc/test/test_runner.py
--- a/rpython/jit/backend/ppc/test/test_runner.py
+++ b/rpython/jit/backend/ppc/test/test_runner.py
@@ -76,10 +76,11 @@
         ARGS = [lltype.Signed] * numargs
         RES = lltype.Signed
         args = [i+1 for i in range(numargs)]
-        res = self.cpu.execute_token(looptoken, *args)
-        assert res is faildescr
+        deadframe = self.cpu.execute_token(looptoken, *args)
+        fail = self.cpu.get_latest_descr(deadframe)
+        assert fail is faildescr
         for i in range(numargs):
-            assert self.cpu.get_latest_value_int(i) == i + 1
+            assert self.cpu.get_int_value(deadframe, i) == i + 1
 
         bridgeops = [arglist]
         bridgeops.append("guard_value(i1, -5) %s" % arglist)
@@ -88,12 +89,14 @@
         faildescr2 = bridge.operations[-1].getdescr()
 
         self.cpu.compile_bridge(faildescr, bridge.inputargs, bridge.operations, looptoken)
-        res2 = self.cpu.execute_token(looptoken, *args)
-        assert res2 is faildescr2
+        deadframe = self.cpu.execute_token(looptoken, *args)
+        fail = self.cpu.get_latest_descr(deadframe)
+        assert fail is faildescr2
         for i in range(numargs):
-            assert self.cpu.get_latest_value_int(i) == i + 1
+            assert self.cpu.get_int_value(deadframe, i) == i + 1
 
     def test_unicodesetitem_really_needs_temploc(self):
+        py.test.skip("XXX")
         u_box = self.alloc_unicode(u"abcdsdasdsaddefg")
         
         i0 = BoxInt()
@@ -128,6 +131,7 @@
             assert self.cpu.get_latest_value_int(i) == args[i]
 
     def test_debugger_on(self):
+        py.test.skip("XXX")
         from pypy.rlib import debug
 
         targettoken, preambletoken = TargetToken(), TargetToken()
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -230,6 +230,8 @@
         self.cpu.compile_loop(inputargs, operations, looptoken)
         if hasattr(looptoken, '_x86_ops_offset'):
             del looptoken._x86_ops_offset # else it's kept alive
+        if hasattr(looptoken, '_ppc_ops_offset'):
+            del looptoken._ppc_ops_offset # else it's kept alive
         del i0, i1, i2
         del inputargs
         del operations