[pypy-commit] pypy default: hg merge branch-prediction

arigo pypy.commits at gmail.com
Fri Apr 7 10:35:28 EDT 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r91018:16bac2ad5e92
Date: 2017-04-07 16:33 +0200
http://bitbucket.org/pypy/pypy/changeset/16bac2ad5e92/

Log:	hg merge branch-prediction

	Help the branch predictor on x86 CPUs. Our JIT-generated machine
	code is a single long basic block, and it now contains only
	forward conditional jumps that are rarely taken. This is better
	than before, when some of those forward conditional jumps skipped
	over inline slow-path code and were therefore usually taken. The
	blocks of slow-path code are now emitted off-line, after the main
	block.
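
A schematic of the effect, condensed from the malloc_cond() changes in
assembler.py below (the names are the ones used in this changeset; this
is an illustration, not verbatim code):

    # before: the slow path was emitted inline, and the forward jump
    # over it was usually taken
    self.mc.CMP(edx, heap(nursery_top_adr))
    jna_location = self.mc.emit_forward_jump('NA')      # usually taken
    self.push_gcmap(self.mc, gcmap, store=True)
    self.mc.CALL(imm(follow_jump(self.malloc_slowpath)))
    self.mc.patch_forward_jump(jna_location)
    self.mc.MOV(heap(nursery_free_adr), edx)

    # after: only a rarely-taken forward jump stays inline; the body is
    # emitted later by flush_pending_slowpaths(), after the main block
    self.mc.CMP(edx, heap(nursery_top_adr))
    sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A'])
    sp.gcmap = gcmap
    self.mc.MOV(heap(nursery_free_adr), edx)
    sp.set_continue_addr(self.mc)
    self.pending_slowpaths.append(sp)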

diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -76,6 +76,7 @@
         BaseAssembler.setup(self, looptoken)
         assert self.memcpy_addr != 0, "setup_once() not called?"
         self.current_clt = looptoken.compiled_loop_token
+        self.pending_slowpaths = []
         self.pending_guard_tokens = []
         if WORD == 8:
             self.pending_memoryerror_trampoline_from = []
@@ -95,6 +96,7 @@
             self.pending_memoryerror_trampoline_from = None
         self.mc = None
         self.current_clt = None
+        self.frame_depth_to_patch = None
 
     def _build_float_constants(self):
         # 0x80000000000000008000000000000000
@@ -181,6 +183,7 @@
         """ This builds a general call slowpath, for whatever call happens to
         come.
         """
+        self.pending_slowpaths = []
         mc = codebuf.MachineCodeBlockWrapper()
         # copy registers to the frame, with the exception of the
         # 'cond_call_register_arguments' and eax, because these have already
@@ -211,6 +214,7 @@
         self.pop_gcmap(mc)   # cancel the push_gcmap(store=True) in the caller
         self._pop_all_regs_from_frame(mc, [eax], supports_floats, callee_only)
         mc.RET()
+        self.flush_pending_slowpaths(mc)
         return mc.materialize(self.cpu, [])
 
     def _build_malloc_slowpath(self, kind):
@@ -227,6 +231,7 @@
         This function must preserve all registers apart from ecx and edx.
         """
         assert kind in ['fixed', 'str', 'unicode', 'var']
+        self.pending_slowpaths = []
         mc = codebuf.MachineCodeBlockWrapper()
         self._push_all_regs_to_frame(mc, [ecx, edx], self.cpu.supports_floats)
         # the caller already did push_gcmap(store=True)
@@ -276,13 +281,13 @@
         self.set_extra_stack_depth(mc, 0)
         #
         mc.TEST_rr(eax.value, eax.value)
+        # common case: not taken
         mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
         jz_location = mc.get_relative_pos(break_basic_block=False)
         mc.MOV_rr(ecx.value, eax.value)
         #
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
         self._pop_all_regs_from_frame(mc, [ecx, edx], self.cpu.supports_floats)
-        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
         self.pop_gcmap(mc)   # push_gcmap(store=True) done by the caller
         mc.RET()
         #
@@ -298,6 +303,7 @@
         mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
         mc.ADD_ri(esp.value, WORD)
         mc.JMP(imm(self.propagate_exception_path))
+        self.flush_pending_slowpaths(mc)
         #
         rawstart = mc.materialize(self.cpu, [])
         return rawstart
@@ -756,7 +762,17 @@
         gcreftracers.append(tracer)    # keepalive
         self.teardown_gcrefs_list()
 
+    def flush_pending_slowpaths(self, mc):
+        # for each pending slowpath, generate it now.  Note that this
+        # may occasionally add an extra guard_token in
+        # pending_guard_tokens, so it must be done before the
+        # following loop in write_pending_failure_recoveries().
+        for sp in self.pending_slowpaths:
+            sp.generate(self, mc)
+        self.pending_slowpaths = None
+
     def write_pending_failure_recoveries(self, regalloc):
+        self.flush_pending_slowpaths(self.mc)
         # for each pending guard, generate the code of the recovery stub
         # at the end of self.mc.
         for tok in self.pending_guard_tokens:
@@ -822,6 +838,14 @@
         for ofs in self.frame_depth_to_patch:
             self._patch_frame_depth(ofs + rawstart, framedepth)
 
+    class IncreaseStackSlowPath(codebuf.SlowPath):
+        def generate_body(self, assembler, mc):
+            mc.MOV_si(WORD, 0xffffff)     # force writing 32 bit
+            ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
+            assembler.frame_depth_to_patch.append(ofs2)
+            assembler.push_gcmap(mc, self.gcmap, store=True)
+            mc.CALL(imm(assembler._frame_realloc_slowpath))
+
     def _check_frame_depth(self, mc, gcmap):
         """ check if the frame is of enough depth to follow this bridge.
         Otherwise reallocate the frame in a helper.
@@ -832,15 +856,11 @@
         ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
         mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
         stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
-        jg_location = mc.emit_forward_jump('GE')
-        mc.MOV_si(WORD, 0xffffff)     # force writing 32 bit
-        ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
-        self.push_gcmap(mc, gcmap, store=True)
-        mc.CALL(imm(self._frame_realloc_slowpath))
-        # patch the JG above
-        mc.patch_forward_jump(jg_location)
         self.frame_depth_to_patch.append(stack_check_cmp_ofs)
-        self.frame_depth_to_patch.append(ofs2)
+        sp = self.IncreaseStackSlowPath(mc, rx86.Conditions['L'])
+        sp.gcmap = gcmap
+        sp.set_continue_addr(mc)
+        self.pending_slowpaths.append(sp)
 
     def _check_frame_depth_debug(self, mc):
         """ double check the depth size. It prints the error (and potentially
@@ -986,6 +1006,10 @@
         if gcrootmap and gcrootmap.is_shadow_stack:
             self._call_header_shadowstack(gcrootmap)
 
+    class StackCheckSlowPath(codebuf.SlowPath):
+        def generate_body(self, assembler, mc):
+            mc.CALL(imm(assembler.stack_check_slowpath))
+
     def _call_header_with_stack_check(self):
         self._call_header()
         if self.stack_check_slowpath == 0:
@@ -995,11 +1019,9 @@
             self.mc.MOV(eax, heap(endaddr))             # MOV eax, [start]
             self.mc.SUB(eax, esp)                       # SUB eax, current
             self.mc.CMP(eax, heap(lengthaddr))          # CMP eax, [length]
-            jb_location = self.mc.emit_forward_jump('BE')#JBE .skip
-            self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
-            # patch the JB above                        # .skip:
-            self.mc.patch_forward_jump(jb_location)
-            #
+            sp = self.StackCheckSlowPath(self.mc, rx86.Conditions['A'])
+            sp.set_continue_addr(self.mc)
+            self.pending_slowpaths.append(sp)
 
     def _call_footer(self):
         # the return value is the jitframe
@@ -1155,9 +1177,9 @@
                                                     faillocs, frame_depth)
         genop_guard_list[guard_opnum](self, guard_op, guard_token,
                                       arglocs, resloc)
-        if not we_are_translated():
-            # must be added by the genop_guard_list[]()
-            assert guard_token is self.pending_guard_tokens[-1]
+        # the call above usually added guard_token as the last element
+        # of self.pending_guard_tokens, but not always (see
+        # genop_guard_guard_no_exception)
 
     def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
         self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))
@@ -1536,9 +1558,11 @@
         reg = arglocs[0]
         self.mc.TEST(reg, reg)
         if WORD == 4:
+            # common case: not taken
             self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path)
             self.mc.add_pending_relocation()
         elif WORD == 8:
+            # common case: not taken
             self.mc.J_il(rx86.Conditions['Z'], 0)
             pos = self.mc.get_relative_pos(break_basic_block=False)
             self.pending_memoryerror_trampoline_from.append(pos)
@@ -1695,22 +1719,21 @@
     genop_guard_guard_isnull = genop_guard_guard_false
 
     def genop_guard_guard_no_exception(self, guard_op, guard_token, locs, ign):
+        # If the previous operation was a COND_CALL, don't emit
+        # anything now.  Instead, we'll emit the GUARD_NO_EXCEPTION at
+        # the end of the slowpath in CondCallSlowPath.
+        if self._find_nearby_operation(-1).getopnum() in (
+                rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R):
+            sp = self.pending_slowpaths[-1]
+            assert isinstance(sp, self.CondCallSlowPath)
+            sp.guard_token_no_exception = guard_token
+        else:
+            self.generate_guard_no_exception(guard_token)
+
+    def generate_guard_no_exception(self, guard_token):
         self.mc.CMP(heap(self.cpu.pos_exception()), imm0)
         self.guard_success_cc = rx86.Conditions['Z']
         self.implement_guard(guard_token)
-        # If the previous operation was a COND_CALL, overwrite its conditional
-        # jump to jump over this GUARD_NO_EXCEPTION as well, if we can
-        if self._find_nearby_operation(-1).getopnum() in (
-                rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R):
-            j_location = self.previous_cond_call_jcond
-            try:
-                self.mc.patch_forward_jump(j_location)
-            except codebuf.ShortJumpTooFar:
-                pass    # ignore this case
-            else:
-                # succeeded: forget the saved value of the scratch
-                # register here
-                self.mc.forget_scratch_register()
 
     def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
                                           locs, ign):
@@ -2073,6 +2096,7 @@
     def implement_guard(self, guard_token):
         # These jumps are patched later.
         assert self.guard_success_cc >= 0
+        # common case: not taken
         self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
         self.guard_success_cc = rx86.cond_none
         pos = self.mc.get_relative_pos(break_basic_block=False)
@@ -2240,6 +2264,84 @@
 
     # ------------------- END CALL ASSEMBLER -----------------------
 
+    class WriteBarrierSlowPath(codebuf.SlowPath):
+        def generate_body(self, assembler, mc):
+            mc.force_frame_size(DEFAULT_FRAME_BYTES)
+            # for cond_call_gc_wb_array, also add another fast path:
+            # if GCFLAG_CARDS_SET, then we can just set one bit and be done
+            card_marking = (self.loc_index is not None)
+            if card_marking:
+                # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact
+                # can be checked via the sign flag of the previous TEST8
+                js_location = mc.emit_forward_jump('S')   # patched later
+            else:
+                js_location = 0
+
+            # Write only a CALL to the helper prepared in advance, passing it as
+            # argument the address of the structure we are writing into
+            # (the first argument to COND_CALL_GC_WB).
+            helper_num = self.helper_num
+            is_frame = (helper_num == 4)
+            descr = self.descr
+            loc_base = self.loc_base
+            #
+            if not is_frame:
+                mc.PUSH(loc_base)
+            mc.CALL(imm(assembler.wb_slowpath[helper_num]))
+            if not is_frame:
+                mc.stack_frame_size_delta(-WORD)
+
+            if card_marking:
+                # The helper ends again with a check of the flag in the object.
+                # So here, we can simply write again a 'JNS', which will be
+                # taken if GCFLAG_CARDS_SET is still not set.
+                jns_location = mc.emit_forward_jump('NS')   # patched later
+                #
+                # patch the JS above
+                mc.patch_forward_jump(js_location)
+                #
+                # case GCFLAG_CARDS_SET: emit a few instructions to do
+                # directly the card flag setting
+                loc_index = self.loc_index
+                if isinstance(loc_index, RegLoc):
+                    if IS_X86_64 and isinstance(loc_base, RegLoc):
+                        # copy loc_index into r11
+                        tmp1 = X86_64_SCRATCH_REG
+                        mc.forget_scratch_register()
+                        mc.MOV_rr(tmp1.value, loc_index.value)
+                        final_pop = False
+                    else:
+                        # must save the register loc_index before it is mutated
+                        mc.PUSH_r(loc_index.value)
+                        tmp1 = loc_index
+                        final_pop = True
+                    # SHR tmp, card_page_shift
+                    mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
+                    # XOR tmp, -8
+                    mc.XOR_ri(tmp1.value, -8)
+                    # BTS [loc_base], tmp
+                    if final_pop:
+                        # r11 is not specially used, fall back to regloc.py
+                        mc.BTS(addr_add_const(loc_base, 0), tmp1)
+                    else:
+                        # tmp1 is r11!  but in this case, loc_base is a
+                        # register so we can invoke directly rx86.py
+                        mc.BTS_mr((loc_base.value, 0), tmp1.value)
+                    # done
+                    if final_pop:
+                        mc.POP_r(loc_index.value)
+                    #
+                elif isinstance(loc_index, ImmedLoc):
+                    byte_index = loc_index.value >> descr.jit_wb_card_page_shift
+                    byte_ofs = ~(byte_index >> 3)
+                    byte_val = 1 << (byte_index & 7)
+                    mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
+                else:
+                    raise AssertionError("index is neither RegLoc nor ImmedLoc")
+                #
+                # patch the JNS above
+                mc.patch_forward_jump(jns_location)
+
     def _write_barrier_fastpath(self, mc, descr, arglocs, array=False,
                                 is_frame=False):
         # Write code equivalent to write_barrier() in the GC: it checks
@@ -2251,6 +2353,7 @@
             assert cls is not None and isinstance(descr, cls)
         #
         card_marking = False
+        loc_index = None
         mask = descr.jit_wb_if_flag_singlebyte
         if array and descr.jit_wb_cards_set != 0:
             # assumptions the rest of the function depends on:
@@ -2258,6 +2361,7 @@
                     descr.jit_wb_if_flag_byteofs)
             assert descr.jit_wb_cards_set_singlebyte == -0x80
             card_marking = True
+            loc_index = arglocs[1]
             mask = descr.jit_wb_if_flag_singlebyte | -0x80
         #
         loc_base = arglocs[0]
@@ -2266,21 +2370,7 @@
             loc = raw_stack(descr.jit_wb_if_flag_byteofs)
         else:
             loc = addr_add_const(loc_base, descr.jit_wb_if_flag_byteofs)
-        mc.TEST8(loc, imm(mask))
-        jz_location = mc.emit_forward_jump('Z')   # patched later
-
-        # for cond_call_gc_wb_array, also add another fast path:
-        # if GCFLAG_CARDS_SET, then we can just set one bit and be done
-        if card_marking:
-            # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can
-            # been checked by the sign flags of the previous TEST8
-            js_location = mc.emit_forward_jump('S')   # patched later
-        else:
-            js_location = 0
-
-        # Write only a CALL to the helper prepared in advance, passing it as
-        # argument the address of the structure we are writing into
-        # (the first argument to COND_CALL_GC_WB).
+        #
         helper_num = card_marking
         if is_frame:
             helper_num = 4
@@ -2293,65 +2383,14 @@
                                     bool(self._regalloc.xrm.reg_bindings))
             assert self.wb_slowpath[helper_num] != 0
         #
-        if not is_frame:
-            mc.PUSH(loc_base)
-        mc.CALL(imm(self.wb_slowpath[helper_num]))
-        if not is_frame:
-            mc.stack_frame_size_delta(-WORD)
-
-        if card_marking:
-            # The helper ends again with a check of the flag in the object.
-            # So here, we can simply write again a 'JNS', which will be
-            # taken if GCFLAG_CARDS_SET is still not set.
-            jns_location = mc.emit_forward_jump('NS')   # patched later
-            #
-            # patch the JS above
-            mc.patch_forward_jump(js_location)
-            #
-            # case GCFLAG_CARDS_SET: emit a few instructions to do
-            # directly the card flag setting
-            loc_index = arglocs[1]
-            if isinstance(loc_index, RegLoc):
-                if IS_X86_64 and isinstance(loc_base, RegLoc):
-                    # copy loc_index into r11
-                    tmp1 = X86_64_SCRATCH_REG
-                    mc.forget_scratch_register()
-                    mc.MOV_rr(tmp1.value, loc_index.value)
-                    final_pop = False
-                else:
-                    # must save the register loc_index before it is mutated
-                    mc.PUSH_r(loc_index.value)
-                    tmp1 = loc_index
-                    final_pop = True
-                # SHR tmp, card_page_shift
-                mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
-                # XOR tmp, -8
-                mc.XOR_ri(tmp1.value, -8)
-                # BTS [loc_base], tmp
-                if final_pop:
-                    # r11 is not specially used, fall back to regloc.py
-                    mc.BTS(addr_add_const(loc_base, 0), tmp1)
-                else:
-                    # tmp1 is r11!  but in this case, loc_base is a
-                    # register so we can invoke directly rx86.py
-                    mc.BTS_mr((loc_base.value, 0), tmp1.value)
-                # done
-                if final_pop:
-                    mc.POP_r(loc_index.value)
-                #
-            elif isinstance(loc_index, ImmedLoc):
-                byte_index = loc_index.value >> descr.jit_wb_card_page_shift
-                byte_ofs = ~(byte_index >> 3)
-                byte_val = 1 << (byte_index & 7)
-                mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
-            else:
-                raise AssertionError("index is neither RegLoc nor ImmedLoc")
-            #
-            # patch the JNS above
-            mc.patch_forward_jump(jns_location)
-
-        # patch the JZ above
-        mc.patch_forward_jump(jz_location)
+        mc.TEST8(loc, imm(mask))
+        sp = self.WriteBarrierSlowPath(mc, rx86.Conditions['NZ'])
+        sp.loc_base = loc_base
+        sp.loc_index = loc_index
+        sp.helper_num = helper_num
+        sp.descr = descr
+        sp.set_continue_addr(mc)
+        self.pending_slowpaths.append(sp)
 
     def genop_discard_cond_call_gc_wb(self, op, arglocs):
         self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs)
@@ -2382,37 +2421,66 @@
     def label(self):
         self._check_frame_depth_debug(self.mc)
 
+    class CondCallSlowPath(codebuf.SlowPath):
+        guard_token_no_exception = None
+
+        def generate_body(self, assembler, mc):
+            assembler.push_gcmap(mc, self.gcmap, store=True)
+            #
+            # first save away the 4 registers from
+            # 'cond_call_register_arguments' plus the register 'eax'
+            base_ofs = assembler.cpu.get_baseofs_of_frame_field()
+            should_be_saved = self.should_be_saved
+            restore_eax = False
+            for gpr in cond_call_register_arguments + [eax]:
+                if gpr not in should_be_saved or gpr is self.resloc:
+                    continue
+                v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
+                mc.MOV_br(v * WORD + base_ofs, gpr.value)
+                if gpr is eax:
+                    restore_eax = True
+            #
+            # load the 0-to-4 arguments into these registers
+            from rpython.jit.backend.x86.jump import remap_frame_layout
+            arglocs = self.arglocs
+            remap_frame_layout(assembler, arglocs,
+                               cond_call_register_arguments[:len(arglocs)],
+                               X86_64_SCRATCH_REG if IS_X86_64 else None)
+            #
+            # load the constant address of the function to call into eax
+            mc.MOV(eax, self.imm_func)
+            #
+            # figure out which variant of cond_call_slowpath to call,
+            # and call it
+            cond_call_adr = assembler.cond_call_slowpath[self.variant_num]
+            mc.CALL(imm(follow_jump(cond_call_adr)))
+            # if this is a COND_CALL_VALUE, we need to move the result in place
+            resloc = self.resloc
+            if resloc is not None and resloc is not eax:
+                mc.MOV(resloc, eax)
+            # restoring the registers saved above, and doing pop_gcmap(), is
+            # left to the cond_call_slowpath helper.  We must only restore eax,
+            # if needed.
+            if restore_eax:
+                v = gpr_reg_mgr_cls.all_reg_indexes[eax.value]
+                mc.MOV_rb(eax.value, v * WORD + base_ofs)
+            #
+            # if needed, emit now the guard_no_exception
+            if self.guard_token_no_exception is not None:
+                assembler.generate_guard_no_exception(
+                             self.guard_token_no_exception)
+
     def cond_call(self, gcmap, imm_func, arglocs, resloc=None):
         assert self.guard_success_cc >= 0
-        j_location = self.mc.emit_forward_jump_cond(
-            rx86.invert_condition(self.guard_success_cc))
+        sp = self.CondCallSlowPath(self.mc, self.guard_success_cc)
+        sp.set_continue_addr(self.mc)
         self.guard_success_cc = rx86.cond_none
+        sp.gcmap = gcmap
+        sp.imm_func = imm_func
+        sp.arglocs = arglocs
+        sp.resloc = resloc
+        sp.should_be_saved = self._regalloc.rm.reg_bindings.values()
         #
-        self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        # first save away the 4 registers from 'cond_call_register_arguments'
-        # plus the register 'eax'
-        base_ofs = self.cpu.get_baseofs_of_frame_field()
-        should_be_saved = self._regalloc.rm.reg_bindings.values()
-        restore_eax = False
-        for gpr in cond_call_register_arguments + [eax]:
-            if gpr not in should_be_saved or gpr is resloc:
-                continue
-            v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
-            self.mc.MOV_br(v * WORD + base_ofs, gpr.value)
-            if gpr is eax:
-                restore_eax = True
-        #
-        # load the 0-to-4 arguments into these registers
-        from rpython.jit.backend.x86.jump import remap_frame_layout
-        remap_frame_layout(self, arglocs,
-                           cond_call_register_arguments[:len(arglocs)],
-                           X86_64_SCRATCH_REG if IS_X86_64 else None)
-        #
-        # load the constant address of the function to call into eax
-        self.mc.MOV(eax, imm_func)
-        #
-        # figure out which variant of cond_call_slowpath to call, and call it
         callee_only = False
         floats = False
         if self._regalloc is not None:
@@ -2423,34 +2491,25 @@
                 callee_only = True
             if self._regalloc.xrm.reg_bindings:
                 floats = True
-        cond_call_adr = self.cond_call_slowpath[floats * 2 + callee_only]
-        self.mc.CALL(imm(follow_jump(cond_call_adr)))
-        # if this is a COND_CALL_VALUE, we need to move the result in place
-        if resloc is not None and resloc is not eax:
-            self.mc.MOV(resloc, eax)
-        # restoring the registers saved above, and doing pop_gcmap(), is left
-        # to the cond_call_slowpath helper.  We must only restore eax, if
-        # needed.
-        if restore_eax:
-            v = gpr_reg_mgr_cls.all_reg_indexes[eax.value]
-            self.mc.MOV_rb(eax.value, v * WORD + base_ofs)
+        sp.variant_num = floats * 2 + callee_only
         #
-        self.mc.patch_forward_jump(j_location)
-        # might be overridden again to skip over the following
-        # guard_no_exception too
-        self.previous_cond_call_jcond = j_location
+        self.pending_slowpaths.append(sp)
+
+    class MallocCondSlowPath(codebuf.SlowPath):
+        def generate_body(self, assembler, mc):
+            assembler.push_gcmap(mc, self.gcmap, store=True)
+            mc.CALL(imm(follow_jump(assembler.malloc_slowpath)))
 
     def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
         assert size & (WORD-1) == 0     # must be correctly aligned
         self.mc.MOV(ecx, heap(nursery_free_adr))
         self.mc.LEA_rm(edx.value, (ecx.value, size))
         self.mc.CMP(edx, heap(nursery_top_adr))
-        jna_location = self.mc.emit_forward_jump('NA')   # patched later
-        # save the gcmap
-        self.push_gcmap(self.mc, gcmap, store=True)
-        self.mc.CALL(imm(follow_jump(self.malloc_slowpath)))
-        self.mc.patch_forward_jump(jna_location)
+        sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A'])
+        sp.gcmap = gcmap
         self.mc.MOV(heap(nursery_free_adr), edx)
+        sp.set_continue_addr(self.mc)
+        self.pending_slowpaths.append(sp)
 
     def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
                                   sizeloc, gcmap):
@@ -2463,12 +2522,31 @@
         else:
             self.mc.LEA_ra(edx.value, (ecx.value, sizeloc.value, 0, 0))
         self.mc.CMP(edx, heap(nursery_top_adr))
-        jna_location = self.mc.emit_forward_jump('NA')   # patched later
-        # save the gcmap
-        self.push_gcmap(self.mc, gcmap, store=True)
-        self.mc.CALL(imm(follow_jump(self.malloc_slowpath)))
-        self.mc.patch_forward_jump(jna_location)
+        sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A'])
+        sp.gcmap = gcmap
         self.mc.MOV(heap(nursery_free_adr), edx)
+        sp.set_continue_addr(self.mc)
+        self.pending_slowpaths.append(sp)
+
+    class MallocCondVarsizeSlowPath(codebuf.SlowPath):
+        def generate_body(self, assembler, mc):
+            # save the gcmap
+            assembler.push_gcmap(mc, self.gcmap, store=True)
+            kind = self.kind
+            if kind == rewrite.FLAG_ARRAY:
+                mc.MOV_si(WORD, self.itemsize)
+                mc.MOV_ri(ecx.value, self.arraydescr.tid)
+                addr = assembler.malloc_slowpath_varsize
+            else:
+                if kind == rewrite.FLAG_STR:
+                    addr = assembler.malloc_slowpath_str
+                else:
+                    assert kind == rewrite.FLAG_UNICODE
+                    addr = assembler.malloc_slowpath_unicode
+            lengthloc = self.lengthloc
+            assert lengthloc is not ecx and lengthloc is not edx
+            mc.MOV(edx, lengthloc)
+            mc.CALL(imm(follow_jump(addr)))
 
     def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
                             lengthloc, itemsize, maxlength, gcmap,
@@ -2509,34 +2587,24 @@
         # now edx contains the total size in bytes, rounded up to a multiple
         # of WORD, plus nursery_free_adr
         self.mc.CMP(edx, heap(nursery_top_adr))
-        jna_location = self.mc.emit_forward_jump('NA')   # patched later
-        #
         self.mc.patch_forward_jump(ja_location)
-        # save the gcmap
-        self.push_gcmap(self.mc, gcmap, store=True)
-        if kind == rewrite.FLAG_ARRAY:
-            self.mc.MOV_si(WORD, itemsize)
-            self.mc.MOV(edx, lengthloc)
-            self.mc.MOV_ri(ecx.value, arraydescr.tid)
-            addr = self.malloc_slowpath_varsize
-        else:
-            if kind == rewrite.FLAG_STR:
-                addr = self.malloc_slowpath_str
-            else:
-                assert kind == rewrite.FLAG_UNICODE
-                addr = self.malloc_slowpath_unicode
-            self.mc.MOV(edx, lengthloc)
-        self.mc.CALL(imm(follow_jump(addr)))
-        jmp_location = self.mc.emit_forward_jump_uncond()  # jump to later
-        #
-        self.mc.patch_forward_jump(jna_location)
-        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
-        # write down the tid, but not if it's the result of the CALL
+        # Note: we call the slow path in condition 'A', which may be
+        # true either because the CMP just above really got that
+        # condition, or because we jumped here from ja_location before.
+        # In both cases, the jumps are forward-going and the expected
+        # common case is "not taken".
+        sp = self.MallocCondVarsizeSlowPath(self.mc, rx86.Conditions['A'])
+        sp.gcmap = gcmap
+        sp.kind = kind
+        sp.itemsize = itemsize
+        sp.lengthloc = lengthloc
+        sp.arraydescr = arraydescr
+        # some more code that is executed only if we *don't* call the
+        # slow path: write down the tid, and save edx into nursery_free_adr
         self.mc.MOV(mem(ecx, 0), imm(arraydescr.tid))
-        # while we're at it, this line is not needed if we've done the CALL
         self.mc.MOV(heap(nursery_free_adr), edx)
-        #
-        self.mc.patch_forward_jump(jmp_location)
+        sp.set_continue_addr(self.mc)
+        self.pending_slowpaths.append(sp)
 
     def store_force_descr(self, op, fail_locs, frame_depth):
         guard_token = self.implement_guard_recovery(op.opnum,
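
The ordering note in flush_pending_slowpaths() above amounts to the
following call sequence (a condensed sketch, not verbatim code from
this diff):

    def write_pending_failure_recoveries(self, regalloc):
        # must run first: a CondCallSlowPath may emit a deferred
        # GUARD_NO_EXCEPTION here and append one more guard token
        self.flush_pending_slowpaths(self.mc)
        # only then iterate, so that the extra token is included
        for tok in self.pending_guard_tokens:
            ...   # generate the recovery stub of each guard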
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -11,6 +11,7 @@
     r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
     RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
 from rpython.jit.backend.x86.jump import remap_frame_layout
+from rpython.jit.backend.x86 import codebuf
 from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder
 from rpython.jit.backend.llsupport import llerrno
 from rpython.rtyper.lltypesystem import llmemory, rffi
@@ -293,6 +294,41 @@
             tlofsreg = self.get_tlofs_reg()    # => esi (possibly reused)
             mc.MOV32_mr((tlofsreg.value, lasterror), eax.value)
 
+    class ReacqGilSlowPath(codebuf.SlowPath):
+        early_jump_addr = 0
+
+        def generate_body(self, assembler, mc):
+            if self.early_jump_addr != 0:
+                # This slow-path has two entry points, with two
+                # conditional jumps.  We can jump to the regular start
+                # of this slow-path with the 2nd conditional jump.  Or,
+                # we can jump past the "MOV(heap(fastgil), ecx)"
+                # instruction from the 1st conditional jump.
+                # This instruction reverts the rpy_fastgil acquired
+                # previously, so that the general 'reacqgil_addr'
+                # function can acquire it again.  It must only be done
+                # if we actually succeeded in acquiring rpy_fastgil.
+                from rpython.jit.backend.x86.assembler import heap
+                mc.MOV(heap(self.fastgil), ecx)
+                offset = mc.get_relative_pos() - self.early_jump_addr
+                mc.overwrite32(self.early_jump_addr-4, offset)
+                # scratch register forgotten here, by get_relative_pos()
+
+            # call the reacqgil() function
+            cb = self.callbuilder
+            if not cb.result_value_saved_early:
+                cb.save_result_value(save_edx=False)
+            if assembler._is_asmgcc():
+                if IS_X86_32:
+                    css_value = edx
+                    old_value = ecx
+                    mc.MOV_sr(4, old_value.value)
+                    mc.MOV_sr(0, css_value.value)
+                # on X86_64, they are already in the right registers
+            mc.CALL(imm(follow_jump(assembler.reacqgil_addr)))
+            if not cb.result_value_saved_early:
+                cb.restore_result_value(save_edx=False)
+
     def move_real_result_and_call_reacqgil_addr(self, fastgil):
         from rpython.jit.backend.x86 import rx86
         #
@@ -314,8 +350,8 @@
                     if not self.result_value_saved_early:
                         mc.MOV_sr(12, edx.value)
                         restore_edx = True
-                css_value = edx
-                old_value = ecx
+                css_value = edx    # note: duplicated in ReacqGilSlowPath
+                old_value = ecx    #
             elif IS_X86_64:
                 css_value = edi
                 old_value = esi
@@ -341,35 +377,25 @@
             # thread.  So here we check if the shadowstack pointer
             # is still the same as before we released the GIL (saved
             # in 'ebx'), and if not, we fall back to 'reacqgil_addr'.
-            jne_location = mc.emit_forward_jump('NE')
+            mc.J_il(rx86.Conditions['NE'], 0xfffff)     # patched later
+            early_jump_addr = mc.get_relative_pos(break_basic_block=False)
+            # ^^^ this jump will go to almost the same place as the
+            # ReacqGilSlowPath() computes, but one instruction farther,
+            # i.e. just after the "MOV(heap(fastgil), ecx)".
+
             # here, ecx (=old_value) is zero (so rpy_fastgil was in 'released'
             # state before the XCHG, but the XCHG acquired it by writing 1)
             rst = gcrootmap.get_root_stack_top_addr()
             mc = self.mc
             mc.CMP(ebx, heap(rst))
-            je_location = mc.emit_forward_jump('E')
-            # revert the rpy_fastgil acquired above, so that the
-            # general 'reacqgil_addr' below can acquire it again...
-            mc.MOV(heap(fastgil), ecx)
-            # patch the JNE above
-            mc.patch_forward_jump(jne_location)
+            sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+            sp.early_jump_addr = early_jump_addr
+            sp.fastgil = fastgil
         else:
-            je_location = mc.emit_forward_jump('E')
-        #
-        # Yes, we need to call the reacqgil() function
-        if not self.result_value_saved_early:
-            self.save_result_value(save_edx=False)
-        if self.asm._is_asmgcc():
-            if IS_X86_32:
-                mc.MOV_sr(4, old_value.value)
-                mc.MOV_sr(0, css_value.value)
-            # on X86_64, they are already in the right registers
-        mc.CALL(imm(follow_jump(self.asm.reacqgil_addr)))
-        if not self.result_value_saved_early:
-            self.restore_result_value(save_edx=False)
-        #
-        # patch the JE above
-        mc.patch_forward_jump(je_location)
+            sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+        sp.callbuilder = self
+        sp.set_continue_addr(mc)
+        self.asm.pending_slowpaths.append(sp)
         #
         if restore_edx:
             mc.MOV_rs(edx.value, 12)   # restore this
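
The two entry points described in ReacqGilSlowPath, schematically (a
sketch of the shadowstack case; the labels are illustrative only):

    #   XCHG ecx, [rpy_fastgil]      ; try to re-acquire the GIL
    #   JNE  slowpath_after_mov      ; acquire failed: don't revert
    #   CMP  ebx, [root_stack_top]   ; did the shadowstack move?
    #   JNE  slowpath_start          ; yes: revert, then call helper
    #   ... fast path continues ...
    #
    # emitted off-line by generate_body():
    # slowpath_start:
    #   MOV  [rpy_fastgil], ecx      ; ecx == 0: release rpy_fastgil
    # slowpath_after_mov:
    #   CALL reacqgil_addr
    #   JMP  back to the fast path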
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -81,3 +81,30 @@
         if break_basic_block:
             self.forget_scratch_register()
         return BlockBuilderMixin.get_relative_pos(self)
+
+
+class SlowPath(object):
+    def __init__(self, mc, condition):
+        mc.J_il(condition, 0xfffff)     # patched later
+        self.cond_jump_addr = mc.get_relative_pos(break_basic_block=False)
+        self.saved_scratch_value_1 = mc.get_scratch_register_known_value()
+
+    def set_continue_addr(self, mc):
+        self.continue_addr = mc.get_relative_pos(break_basic_block=False)
+        self.saved_scratch_value_2 = mc.get_scratch_register_known_value()
+
+    def generate(self, assembler, mc):
+        # no alignment here, prefer compactness for these slow-paths.
+        # patch the original jump to go here
+        offset = mc.get_relative_pos() - self.cond_jump_addr
+        mc.overwrite32(self.cond_jump_addr-4, offset)
+        # restore the knowledge of the scratch register value
+        # (this does not emit any code)
+        mc.restore_scratch_register_known_value(self.saved_scratch_value_1)
+        # generate the body of the slow-path
+        self.generate_body(assembler, mc)
+        # reload (if needed) the (possibly different) scratch register value
+        mc.load_scratch_if_known(self.saved_scratch_value_2)
+        # jump back
+        curpos = mc.get_relative_pos() + 5
+        mc.JMP_l(self.continue_addr - curpos)
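
A runnable toy model of the SlowPath protocol above (plain Python, with
a list of pseudo-instructions standing in for machine code; illustration
only, not part of the changeset):

    class ToySlowPath(object):
        def __init__(self, buf, condition):
            buf.append(['Jcond', condition, None])   # patched later
            self.cond_jump_index = len(buf) - 1

        def set_continue_addr(self, buf):
            self.continue_addr = len(buf)

        def generate(self, buf):
            # patch the forward jump to point at the off-line body
            buf[self.cond_jump_index][2] = len(buf)
            self.generate_body(buf)
            buf.append(['JMP', None, self.continue_addr])  # jump back

    class ToyMallocSlowPath(ToySlowPath):
        def generate_body(self, buf):
            buf.append(['CALL', 'malloc_slowpath', None])

    buf, pending = [], []
    buf.append(['CMP', 'edx', 'nursery_top'])
    sp = ToyMallocSlowPath(buf, 'A')             # rarely-taken jump
    buf.append(['MOV', 'nursery_free', 'edx'])   # fast path, inline
    sp.set_continue_addr(buf)
    pending.append(sp)
    buf.append(['...', 'rest of the main block'])
    for sp in pending:                           # flush_pending_slowpaths()
        sp.generate(buf)
    assert buf[1] == ['Jcond', 'A', 4]           # forward, past the block
    assert buf[-1] == ['JMP', None, 3]           # back into the fast path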
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -373,7 +373,7 @@
 class LocationCodeBuilder(object):
     _mixin_ = True
 
-    _scratch_register_value = 0    # 0 means 'unknown'
+    _scratch_register_value = -1    # -1 means 'unknown'
 
     def _binaryop(name):
 
@@ -552,7 +552,7 @@
         # If we are within a "reuse_scratch_register" block, we remember the
         # last value we loaded to the scratch register and encode the address
         # as an offset from that if we can
-        if self._scratch_register_value != 0:
+        if self._scratch_register_value != -1:
             offset = r_uint(addr) - r_uint(self._scratch_register_value)
             offset = intmask(offset)
             if rx86.fits_in_32bits(offset):
@@ -593,7 +593,7 @@
         return (reg, scalereg, scale, ofs)
 
     def _load_scratch(self, value):
-        if self._scratch_register_value != 0:
+        if self._scratch_register_value != -1:
             if self._scratch_register_value == value:
                 #print '_load_scratch(%x) [REUSED]' % (value,)
                 return
@@ -619,7 +619,7 @@
         self.MOV_ri(X86_64_SCRATCH_REG.value, value)
 
     def forget_scratch_register(self):
-        self._scratch_register_value = 0
+        self._scratch_register_value = -1
 
     def get_scratch_register_known_value(self):
         return self._scratch_register_value
@@ -627,6 +627,11 @@
     def restore_scratch_register_known_value(self, saved_value):
         self._scratch_register_value = saved_value
 
+    def load_scratch_if_known(self, saved_value):
+        if saved_value != -1:
+            assert IS_X86_64
+            self._load_scratch(saved_value)
+
     def trap(self):
         self.INT3()
 
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -211,9 +211,9 @@
     def test_64bit_address_4(self):
         base_addr = intmask(0xFEDCBA9876543210)
         cb = LocationCodeBuilder64()
-        assert cb._scratch_register_value == 0
+        assert cb._scratch_register_value == -1
         cb.MOV(ecx, AddressLoc(edx, esi, 2, base_addr))
-        assert cb._scratch_register_value == 0
+        assert cb._scratch_register_value == -1
         # this case is a CMP_ra
         #
         expected_instructions = (
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
+++ b/rpython/jit/backend/x86/test/test_runner.py
@@ -39,7 +39,7 @@
                                  'nop; '    # for the label
                                  'add; test; je; jmp;')   # plus some padding
         bridge_loop_instructions = (
-            'cmp; jge; mov;( movabs;| lea;)? mov; (mov|movabs|lea); call; mov(abs)?; jmp;')
+            'cmp; jl; mov(abs)?; jmp;')
 
     def get_cpu(self):
         cpu = CPU(rtyper=None, stats=FakeStats())
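
The updated expectation matches the new bridge prologue generated by
_check_frame_depth() (a sketch of the two instruction streams; operands
are illustrative):

    # before: 'cmp; jge; ... call; ... jmp;'
    #   CMP  [frame_len], #depth
    #   JGE  skip                     ; usually taken
    #   ...  CALL _frame_realloc_slowpath   ; inline slow path
    # skip:
    #   ...
    # after: 'cmp; jl; mov(abs)?; jmp;'
    #   CMP  [frame_len], #depth
    #   JL   offline_slowpath         ; rarely taken
    #   ...                           ; straight-line fast path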

