[pypy-commit] pypy vecopt-merge: an all-new stitch bridge that considers register mapping; works for accumulation values as well

plan_rich noreply at buildbot.pypy.org
Sun Aug 23 17:24:05 CEST 2015


Author: Richard Plangger <rich at pasra.at>
Branch: vecopt-merge
Changeset: r79159:0a4078644343
Date: 2015-08-23 17:24 +0200
http://bitbucket.org/pypy/pypy/changeset/0a4078644343/

Log:	an all-new stitch bridge that considers register mapping; works for
	accumulation values as well

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -590,9 +590,57 @@
                                                        rawstart, fullsize)
         return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos, rawstart)
 
-    def stitch_bridge(self, faildescr, target):
-        assert target.rawstart != 0
-        self.patch_jump_for_descr(faildescr, target.rawstart)
+    def stitch_bridge(self, faildescr, version):
+        """ Stitching means that one can enter a bridge with a complete different register
+            allocation. This needs remapping which is done here for both normal registers
+            and accumulation registers.
+            Why? Because this only generates a very small junk of memory, instead of
+            duplicating the loop assembler!
+        """
+        asminfo, bridge_faildescr, compiled_version, looptoken = version._compiled
+        assert asminfo.rawstart != 0
+        self.mc = codebuf.MachineCodeBlockWrapper()
+        allblocks = self.get_asmmemmgr_blocks(looptoken)
+        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
+                                                   allblocks)
+        frame_info = self.datablockwrapper.malloc_aligned(
+            jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
+
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
+        # if an accumulation value is saved at the guard, we need to update it here!
+        guard_locs = self.rebuild_faillocs_from_descr(faildescr, version.inputargs)
+        bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr, compiled_version.inputargs)
+        guard_accum_info = faildescr.rd_accum_list
+        # O(n^2), but usually you only have at most 1 fail argument
+        while guard_accum_info:
+            bridge_accum_info = bridge_faildescr.rd_accum_list
+            while bridge_accum_info:
+                if bridge_accum_info.scalar_position == guard_accum_info.scalar_position:
+                    # the mapping might be wrong!
+                    if bridge_accum_info.vector_loc is not guard_accum_info.vector_loc:
+                        self.mov(guard_accum_info.vector_loc, bridge_accum_info.vector_loc)
+                bridge_accum_info = bridge_accum_info.prev
+            guard_accum_info = guard_accum_info.prev
+
+        # register mapping is most likely NOT valid, thus remap it in this
+        # short piece of assembler
+        assert len(guard_locs) == len(bridge_locs)
+        for i,gloc in enumerate(guard_locs):
+            bloc = bridge_locs[i]
+            bstack = bloc.location_code() == 'b'
+            gstack = gloc.location_code() == 'b'
+            if bstack and gstack:
+                pass
+            elif gloc is not bloc:
+                self.mov(gloc, bloc)
+        self.mc.JMP_l(0)
+        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
+        offset = self.mc.get_relative_pos() - 4
+        rawstart = self.materialize_loop(looptoken)
+        # update the exit target
+        self._patch_jump_for_descr(rawstart + offset, asminfo.rawstart)
+        # update the guard to jump right to this custom piece of assembler
+        self.patch_jump_for_descr(faildescr, rawstart)
 
     def write_pending_failure_recoveries(self, regalloc):
         # for each pending guard, generate the code of the recovery stub
@@ -732,6 +780,10 @@
 
     def patch_jump_for_descr(self, faildescr, adr_new_target):
         adr_jump_offset = faildescr.adr_jump_offset
+        self._patch_jump_for_descr(adr_jump_offset, adr_new_target)
+        faildescr.adr_jump_offset = 0    # means "patched"
+
+    def _patch_jump_for_descr(self, adr_jump_offset, adr_new_target):
         assert adr_jump_offset != 0
         offset = adr_new_target - (adr_jump_offset + 4)
         # If the new target fits within a rel32 of the jump, just patch
@@ -752,7 +804,6 @@
             p = rffi.cast(rffi.INTP, adr_jump_offset)
             adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
             mc.copy_to_raw_memory(adr_target)
-        faildescr.adr_jump_offset = 0    # means "patched"
 
     def fixup_target_tokens(self, rawstart):
         for targettoken in self.target_tokens_currently_compiling:
diff --git a/rpython/jit/metainterp/compile.py b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -209,9 +209,8 @@
                                    version.operations, jitcell_token)
             record_loop_or_bridge(metainterp_sd, vl)
             assert asminfo is not None
-            version._compiled = asminfo
+            version._compiled = (asminfo, faildescr, faildescr.version, jitcell_token)
             faildescr.version = None
-        # stitch the rest of the traces
         for lv in loop.versions:
             if not lv.compiled():
                 # the version was never compiled, do not bother
@@ -221,7 +220,7 @@
                 assert isinstance(faildescr, CompileLoopVersionDescr)
                 version = faildescr.version
                 if version and version.compiled():
-                    cpu.stitch_bridge(faildescr, version._compiled)
+                    cpu.stitch_bridge(faildescr, version)
                 faildescr.version = None
     loop.versions = None
 
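To make the intent of the new stitch_bridge() easier to follow, here is a minimal, pure-Python sketch of the remapping idea. This is not the RPython backend code above (Loc, emit_stitch, mov and jmp are hypothetical stand-ins, and the accumulation-register matching is omitted): the guard leaves each value in guard_locs[i], the already-compiled bridge expects it in bridge_locs[i], and the stitched trampoline emits only the moves needed to reconcile the two before jumping into the bridge.

# NOTE: illustrative sketch only, not the RPython backend code from the diff.
from collections import namedtuple

# 'kind' is 'r' for a register and 'b' for a jitframe/stack slot, mirroring
# the location_code() values checked in the loop above.
Loc = namedtuple('Loc', ['kind', 'index'])

def emit_stitch(guard_locs, bridge_locs, mov, jmp, bridge_target):
    # Move every value from the location the guard left it in (guard_locs[i])
    # to the location the compiled bridge expects (bridge_locs[i]), then jump.
    assert len(guard_locs) == len(bridge_locs)
    for gloc, bloc in zip(guard_locs, bridge_locs):
        if gloc.kind == 'b' and bloc.kind == 'b':
            # both already live in the (shared) jitframe: nothing to do
            continue
        if gloc != bloc:
            mov(gloc, bloc)          # src -> dst, like self.mov(gloc, bloc)
    jmp(bridge_target)               # finally enter the bridge itself

if __name__ == '__main__':
    moves = []
    emit_stitch(guard_locs=[Loc('r', 1), Loc('b', 8), Loc('r', 3)],
                bridge_locs=[Loc('r', 2), Loc('b', 8), Loc('r', 3)],
                mov=lambda src, dst: moves.append(('mov', src, dst)),
                jmp=lambda target: moves.append(('jmp', hex(target))),
                bridge_target=0x400000)
    # only the first value needs a move; the shared stack slot and the
    # already-matching register are left untouched
    print(moves)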

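Similarly, a tiny sketch of the rel32 retargeting that the refactored _patch_jump_for_descr() performs; write_i32 is a hypothetical callback standing in for the machine-code writer, and the real backend additionally falls back to a 64-bit trampoline when the displacement does not fit, as in the else branch above.

# Illustrative sketch only; not the backend's MachineCodeBlockWrapper code.
def patch_rel32_jump(adr_jump_offset, adr_new_target, write_i32):
    # Retarget an existing jump whose 4-byte rel32 displacement starts at
    # adr_jump_offset so that it lands on adr_new_target.
    assert adr_jump_offset != 0              # 0 means "already patched"
    # the displacement is relative to the end of the 4-byte immediate
    offset = adr_new_target - (adr_jump_offset + 4)
    if -2**31 <= offset < 2**31:             # fits in a signed rel32?
        write_i32(adr_jump_offset, offset)   # patch the jump in place
        return True
    return False                             # caller must use a trampoline

if __name__ == '__main__':
    mem = {}
    ok = patch_rel32_jump(0x1000, 0x2000,
                          lambda addr, val: mem.__setitem__(addr, val))
    print(ok, hex(mem[0x1000]))              # True 0xffc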
