[pypy-commit] pypy default: hg merge branch-prediction
arigo
pypy.commits at gmail.com
Fri Apr 7 10:35:28 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r91018:16bac2ad5e92
Date: 2017-04-07 16:33 +0200
http://bitbucket.org/pypy/pypy/changeset/16bac2ad5e92/
Log: hg merge branch-prediction
Help the branch predictor on x86 CPUs. Our JIT-generated machine
code is a single long basic block, and now it contains only forward
conditional jumps that are rarely taken. This is better than
before, where a subset of the forward conditional jumps jumped
over inline slow-path code and so were expected to be usually
taken. Now these blocks of slow-path code are moved off-line,
after the main block.
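The pattern is easiest to see as a toy Python sketch (illustrative names
only, not the actual RPython API): the main block emits a rarely-taken
forward conditional jump with a placeholder target and queues a slow-path
object; at the very end, each queued slow path is generated after the main
block, the placeholder is patched, and the body ends with a jump back to
its continuation point. Roughly:

    class ToySlowPath(object):
        # stands in for codebuf.SlowPath; 'code' is a list of instructions
        def __init__(self, code, condition):
            code.append('J%s slowpath?' % condition)   # placeholder target
            self.jump_pos = len(code) - 1

        def set_continue_addr(self, code):
            self.continue_pos = len(code)   # where the fast path resumes

        def generate(self, code):
            # patch the original jump to point at the slow-path body
            code[self.jump_pos] = code[self.jump_pos].replace(
                'slowpath?', 'slowpath@%d' % len(code))
            code.append('CALL malloc_slowpath')        # the body
            code.append('JMP continue@%d' % self.continue_pos)

    code = []
    pending_slowpaths = []
    code.append('CMP edx, [nursery_top]')
    sp = ToySlowPath(code, 'A')        # 'above': taken only on overflow
    code.append('MOV [nursery_free], edx')
    sp.set_continue_addr(code)
    pending_slowpaths.append(sp)
    # ... the rest of the main block; then, at the very end:
    for sp in pending_slowpaths:
        sp.generate(code)
    print('\n'.join(code))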
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -76,6 +76,7 @@
BaseAssembler.setup(self, looptoken)
assert self.memcpy_addr != 0, "setup_once() not called?"
self.current_clt = looptoken.compiled_loop_token
+ self.pending_slowpaths = []
self.pending_guard_tokens = []
if WORD == 8:
self.pending_memoryerror_trampoline_from = []
@@ -95,6 +96,7 @@
self.pending_memoryerror_trampoline_from = None
self.mc = None
self.current_clt = None
+ self.frame_depth_to_patch = None
def _build_float_constants(self):
# 0x80000000000000008000000000000000
@@ -181,6 +183,7 @@
""" This builds a general call slowpath, for whatever call happens to
come.
"""
+ self.pending_slowpaths = []
mc = codebuf.MachineCodeBlockWrapper()
# copy registers to the frame, with the exception of the
# 'cond_call_register_arguments' and eax, because these have already
@@ -211,6 +214,7 @@
self.pop_gcmap(mc) # cancel the push_gcmap(store=True) in the caller
self._pop_all_regs_from_frame(mc, [eax], supports_floats, callee_only)
mc.RET()
+ self.flush_pending_slowpaths(mc)
return mc.materialize(self.cpu, [])
def _build_malloc_slowpath(self, kind):
@@ -227,6 +231,7 @@
This function must preserve all registers apart from ecx and edx.
"""
assert kind in ['fixed', 'str', 'unicode', 'var']
+ self.pending_slowpaths = []
mc = codebuf.MachineCodeBlockWrapper()
self._push_all_regs_to_frame(mc, [ecx, edx], self.cpu.supports_floats)
# the caller already did push_gcmap(store=True)
@@ -276,13 +281,13 @@
self.set_extra_stack_depth(mc, 0)
#
mc.TEST_rr(eax.value, eax.value)
+ # common case: not taken
mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
jz_location = mc.get_relative_pos(break_basic_block=False)
mc.MOV_rr(ecx.value, eax.value)
#
nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
self._pop_all_regs_from_frame(mc, [ecx, edx], self.cpu.supports_floats)
- mc.MOV(edx, heap(nursery_free_adr)) # load this in EDX
self.pop_gcmap(mc) # push_gcmap(store=True) done by the caller
mc.RET()
#
@@ -298,6 +303,7 @@
mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
mc.ADD_ri(esp.value, WORD)
mc.JMP(imm(self.propagate_exception_path))
+ self.flush_pending_slowpaths(mc)
#
rawstart = mc.materialize(self.cpu, [])
return rawstart
@@ -756,7 +762,17 @@
gcreftracers.append(tracer) # keepalive
self.teardown_gcrefs_list()
+ def flush_pending_slowpaths(self, mc):
+ # for each pending slowpath, generate it now. Note that this
+ # may occasionally add an extra guard_token in
+ # pending_guard_tokens, so it must be done before the
+ # following loop in write_pending_failure_recoveries().
+ for sp in self.pending_slowpaths:
+ sp.generate(self, mc)
+ self.pending_slowpaths = None
+
def write_pending_failure_recoveries(self, regalloc):
+ self.flush_pending_slowpaths(self.mc)
# for each pending guard, generate the code of the recovery stub
# at the end of self.mc.
for tok in self.pending_guard_tokens:
@@ -822,6 +838,14 @@
for ofs in self.frame_depth_to_patch:
self._patch_frame_depth(ofs + rawstart, framedepth)
+ class IncreaseStackSlowPath(codebuf.SlowPath):
+ def generate_body(self, assembler, mc):
+ mc.MOV_si(WORD, 0xffffff) # force writing 32 bit
+ ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
+ assembler.frame_depth_to_patch.append(ofs2)
+ assembler.push_gcmap(mc, self.gcmap, store=True)
+ mc.CALL(imm(assembler._frame_realloc_slowpath))
+
def _check_frame_depth(self, mc, gcmap):
""" check if the frame is of enough depth to follow this bridge.
Otherwise reallocate the frame in a helper.
@@ -832,15 +856,11 @@
ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
mc.CMP_bi(ofs, 0xffffff) # force writing 32 bit
stack_check_cmp_ofs = mc.get_relative_pos(break_basic_block=False) - 4
- jg_location = mc.emit_forward_jump('GE')
- mc.MOV_si(WORD, 0xffffff) # force writing 32 bit
- ofs2 = mc.get_relative_pos(break_basic_block=False) - 4
- self.push_gcmap(mc, gcmap, store=True)
- mc.CALL(imm(self._frame_realloc_slowpath))
- # patch the JG above
- mc.patch_forward_jump(jg_location)
self.frame_depth_to_patch.append(stack_check_cmp_ofs)
- self.frame_depth_to_patch.append(ofs2)
+ sp = self.IncreaseStackSlowPath(mc, rx86.Conditions['L'])
+ sp.gcmap = gcmap
+ sp.set_continue_addr(mc)
+ self.pending_slowpaths.append(sp)
def _check_frame_depth_debug(self, mc):
""" double check the depth size. It prints the error (and potentially
@@ -986,6 +1006,10 @@
if gcrootmap and gcrootmap.is_shadow_stack:
self._call_header_shadowstack(gcrootmap)
+ class StackCheckSlowPath(codebuf.SlowPath):
+ def generate_body(self, assembler, mc):
+ mc.CALL(imm(assembler.stack_check_slowpath))
+
def _call_header_with_stack_check(self):
self._call_header()
if self.stack_check_slowpath == 0:
@@ -995,11 +1019,9 @@
self.mc.MOV(eax, heap(endaddr)) # MOV eax, [start]
self.mc.SUB(eax, esp) # SUB eax, current
self.mc.CMP(eax, heap(lengthaddr)) # CMP eax, [length]
- jb_location = self.mc.emit_forward_jump('BE')#JBE .skip
- self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
- # patch the JB above # .skip:
- self.mc.patch_forward_jump(jb_location)
- #
+ sp = self.StackCheckSlowPath(self.mc, rx86.Conditions['A'])
+ sp.set_continue_addr(self.mc)
+ self.pending_slowpaths.append(sp)
def _call_footer(self):
# the return value is the jitframe
@@ -1155,9 +1177,9 @@
faillocs, frame_depth)
genop_guard_list[guard_opnum](self, guard_op, guard_token,
arglocs, resloc)
- if not we_are_translated():
- # must be added by the genop_guard_list[]()
- assert guard_token is self.pending_guard_tokens[-1]
+ # this must usually have added guard_token as last element
+ # of self.pending_guard_tokens, but not always (see
+ # genop_guard_guard_no_exception)
def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))
@@ -1536,9 +1558,11 @@
reg = arglocs[0]
self.mc.TEST(reg, reg)
if WORD == 4:
+ # common case: not taken
self.mc.J_il(rx86.Conditions['Z'], self.propagate_exception_path)
self.mc.add_pending_relocation()
elif WORD == 8:
+ # common case: not taken
self.mc.J_il(rx86.Conditions['Z'], 0)
pos = self.mc.get_relative_pos(break_basic_block=False)
self.pending_memoryerror_trampoline_from.append(pos)
@@ -1695,22 +1719,21 @@
genop_guard_guard_isnull = genop_guard_guard_false
def genop_guard_guard_no_exception(self, guard_op, guard_token, locs, ign):
+ # If the previous operation was a COND_CALL, don't emit
+ # anything now. Instead, we'll emit the GUARD_NO_EXCEPTION at
+ # the end of the slowpath in CondCallSlowPath.
+ if self._find_nearby_operation(-1).getopnum() in (
+ rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R):
+ sp = self.pending_slowpaths[-1]
+ assert isinstance(sp, self.CondCallSlowPath)
+ sp.guard_token_no_exception = guard_token
+ else:
+ self.generate_guard_no_exception(guard_token)
+
+ def generate_guard_no_exception(self, guard_token):
self.mc.CMP(heap(self.cpu.pos_exception()), imm0)
self.guard_success_cc = rx86.Conditions['Z']
self.implement_guard(guard_token)
- # If the previous operation was a COND_CALL, overwrite its conditional
- # jump to jump over this GUARD_NO_EXCEPTION as well, if we can
- if self._find_nearby_operation(-1).getopnum() in (
- rop.COND_CALL, rop.COND_CALL_VALUE_I, rop.COND_CALL_VALUE_R):
- j_location = self.previous_cond_call_jcond
- try:
- self.mc.patch_forward_jump(j_location)
- except codebuf.ShortJumpTooFar:
- pass # ignore this case
- else:
- # succeeded: forget the saved value of the scratch
- # register here
- self.mc.forget_scratch_register()
def genop_guard_guard_not_invalidated(self, guard_op, guard_token,
locs, ign):
@@ -2073,6 +2096,7 @@
def implement_guard(self, guard_token):
# These jumps are patched later.
assert self.guard_success_cc >= 0
+ # common case: not taken
self.mc.J_il(rx86.invert_condition(self.guard_success_cc), 0)
self.guard_success_cc = rx86.cond_none
pos = self.mc.get_relative_pos(break_basic_block=False)
@@ -2240,6 +2264,84 @@
# ------------------- END CALL ASSEMBLER -----------------------
+ class WriteBarrierSlowPath(codebuf.SlowPath):
+ def generate_body(self, assembler, mc):
+ mc.force_frame_size(DEFAULT_FRAME_BYTES)
+ # for cond_call_gc_wb_array, also add another fast path:
+ # if GCFLAG_CARDS_SET, then we can just set one bit and be done
+ card_marking = (self.loc_index is not None)
+ if card_marking:
+ # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can
+ # be checked via the sign flag of the previous TEST8
+ js_location = mc.emit_forward_jump('S') # patched later
+ else:
+ js_location = 0
+
+ # Write only a CALL to the helper prepared in advance, passing it as
+ # argument the address of the structure we are writing into
+ # (the first argument to COND_CALL_GC_WB).
+ helper_num = self.helper_num
+ is_frame = (helper_num == 4)
+ descr = self.descr
+ loc_base = self.loc_base
+ #
+ if not is_frame:
+ mc.PUSH(loc_base)
+ mc.CALL(imm(assembler.wb_slowpath[helper_num]))
+ if not is_frame:
+ mc.stack_frame_size_delta(-WORD)
+
+ if card_marking:
+ # The helper ends again with a check of the flag in the object.
+ # So here, we can simply write again a 'JNS', which will be
+ # taken if GCFLAG_CARDS_SET is still not set.
+ jns_location = mc.emit_forward_jump('NS') # patched later
+ #
+ # patch the JS above
+ mc.patch_forward_jump(js_location)
+ #
+ # case GCFLAG_CARDS_SET: emit a few instructions to do
+ # directly the card flag setting
+ loc_index = self.loc_index
+ if isinstance(loc_index, RegLoc):
+ if IS_X86_64 and isinstance(loc_base, RegLoc):
+ # copy loc_index into r11
+ tmp1 = X86_64_SCRATCH_REG
+ mc.forget_scratch_register()
+ mc.MOV_rr(tmp1.value, loc_index.value)
+ final_pop = False
+ else:
+ # must save the register loc_index before it is mutated
+ mc.PUSH_r(loc_index.value)
+ tmp1 = loc_index
+ final_pop = True
+ # SHR tmp, card_page_shift
+ mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
+ # XOR tmp, -8
+ mc.XOR_ri(tmp1.value, -8)
+ # BTS [loc_base], tmp
+ if final_pop:
+ # r11 is not specially used, fall back to regloc.py
+ mc.BTS(addr_add_const(loc_base, 0), tmp1)
+ else:
+ # tmp1 is r11! but in this case, loc_base is a
+ # register so we can invoke directly rx86.py
+ mc.BTS_mr((loc_base.value, 0), tmp1.value)
+ # done
+ if final_pop:
+ mc.POP_r(loc_index.value)
+ #
+ elif isinstance(loc_index, ImmedLoc):
+ byte_index = loc_index.value >> descr.jit_wb_card_page_shift
+ byte_ofs = ~(byte_index >> 3)
+ byte_val = 1 << (byte_index & 7)
+ mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
+ else:
+ raise AssertionError("index is neither RegLoc nor ImmedLoc")
+ #
+ # patch the JNS above
+ mc.patch_forward_jump(jns_location)
+
def _write_barrier_fastpath(self, mc, descr, arglocs, array=False,
is_frame=False):
# Write code equivalent to write_barrier() in the GC: it checks
@@ -2251,6 +2353,7 @@
assert cls is not None and isinstance(descr, cls)
#
card_marking = False
+ loc_index = None
mask = descr.jit_wb_if_flag_singlebyte
if array and descr.jit_wb_cards_set != 0:
# assumptions the rest of the function depends on:
@@ -2258,6 +2361,7 @@
descr.jit_wb_if_flag_byteofs)
assert descr.jit_wb_cards_set_singlebyte == -0x80
card_marking = True
+ loc_index = arglocs[1]
mask = descr.jit_wb_if_flag_singlebyte | -0x80
#
loc_base = arglocs[0]
@@ -2266,21 +2370,7 @@
loc = raw_stack(descr.jit_wb_if_flag_byteofs)
else:
loc = addr_add_const(loc_base, descr.jit_wb_if_flag_byteofs)
- mc.TEST8(loc, imm(mask))
- jz_location = mc.emit_forward_jump('Z') # patched later
-
- # for cond_call_gc_wb_array, also add another fast path:
- # if GCFLAG_CARDS_SET, then we can just set one bit and be done
- if card_marking:
- # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can
- # been checked by the sign flags of the previous TEST8
- js_location = mc.emit_forward_jump('S') # patched later
- else:
- js_location = 0
-
- # Write only a CALL to the helper prepared in advance, passing it as
- # argument the address of the structure we are writing into
- # (the first argument to COND_CALL_GC_WB).
+ #
helper_num = card_marking
if is_frame:
helper_num = 4
@@ -2293,65 +2383,14 @@
bool(self._regalloc.xrm.reg_bindings))
assert self.wb_slowpath[helper_num] != 0
#
- if not is_frame:
- mc.PUSH(loc_base)
- mc.CALL(imm(self.wb_slowpath[helper_num]))
- if not is_frame:
- mc.stack_frame_size_delta(-WORD)
-
- if card_marking:
- # The helper ends again with a check of the flag in the object.
- # So here, we can simply write again a 'JNS', which will be
- # taken if GCFLAG_CARDS_SET is still not set.
- jns_location = mc.emit_forward_jump('NS') # patched later
- #
- # patch the JS above
- mc.patch_forward_jump(js_location)
- #
- # case GCFLAG_CARDS_SET: emit a few instructions to do
- # directly the card flag setting
- loc_index = arglocs[1]
- if isinstance(loc_index, RegLoc):
- if IS_X86_64 and isinstance(loc_base, RegLoc):
- # copy loc_index into r11
- tmp1 = X86_64_SCRATCH_REG
- mc.forget_scratch_register()
- mc.MOV_rr(tmp1.value, loc_index.value)
- final_pop = False
- else:
- # must save the register loc_index before it is mutated
- mc.PUSH_r(loc_index.value)
- tmp1 = loc_index
- final_pop = True
- # SHR tmp, card_page_shift
- mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
- # XOR tmp, -8
- mc.XOR_ri(tmp1.value, -8)
- # BTS [loc_base], tmp
- if final_pop:
- # r11 is not specially used, fall back to regloc.py
- mc.BTS(addr_add_const(loc_base, 0), tmp1)
- else:
- # tmp1 is r11! but in this case, loc_base is a
- # register so we can invoke directly rx86.py
- mc.BTS_mr((loc_base.value, 0), tmp1.value)
- # done
- if final_pop:
- mc.POP_r(loc_index.value)
- #
- elif isinstance(loc_index, ImmedLoc):
- byte_index = loc_index.value >> descr.jit_wb_card_page_shift
- byte_ofs = ~(byte_index >> 3)
- byte_val = 1 << (byte_index & 7)
- mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
- else:
- raise AssertionError("index is neither RegLoc nor ImmedLoc")
- #
- # patch the JNS above
- mc.patch_forward_jump(jns_location)
-
- # patch the JZ above
- mc.patch_forward_jump(jz_location)
+ mc.TEST8(loc, imm(mask))
+ sp = self.WriteBarrierSlowPath(mc, rx86.Conditions['NZ'])
+ sp.loc_base = loc_base
+ sp.loc_index = loc_index
+ sp.helper_num = helper_num
+ sp.descr = descr
+ sp.set_continue_addr(mc)
+ self.pending_slowpaths.append(sp)
def genop_discard_cond_call_gc_wb(self, op, arglocs):
self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs)
@@ -2382,37 +2421,66 @@
def label(self):
self._check_frame_depth_debug(self.mc)
+ class CondCallSlowPath(codebuf.SlowPath):
+ guard_token_no_exception = None
+
+ def generate_body(self, assembler, mc):
+ assembler.push_gcmap(mc, self.gcmap, store=True)
+ #
+ # first save away the 4 registers from
+ # 'cond_call_register_arguments' plus the register 'eax'
+ base_ofs = assembler.cpu.get_baseofs_of_frame_field()
+ should_be_saved = self.should_be_saved
+ restore_eax = False
+ for gpr in cond_call_register_arguments + [eax]:
+ if gpr not in should_be_saved or gpr is self.resloc:
+ continue
+ v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
+ mc.MOV_br(v * WORD + base_ofs, gpr.value)
+ if gpr is eax:
+ restore_eax = True
+ #
+ # load the 0-to-4 arguments into these registers
+ from rpython.jit.backend.x86.jump import remap_frame_layout
+ arglocs = self.arglocs
+ remap_frame_layout(assembler, arglocs,
+ cond_call_register_arguments[:len(arglocs)],
+ X86_64_SCRATCH_REG if IS_X86_64 else None)
+ #
+ # load the constant address of the function to call into eax
+ mc.MOV(eax, self.imm_func)
+ #
+ # figure out which variant of cond_call_slowpath to call,
+ # and call it
+ cond_call_adr = assembler.cond_call_slowpath[self.variant_num]
+ mc.CALL(imm(follow_jump(cond_call_adr)))
+ # if this is a COND_CALL_VALUE, we need to move the result in place
+ resloc = self.resloc
+ if resloc is not None and resloc is not eax:
+ mc.MOV(resloc, eax)
+ # restoring the registers saved above, and doing pop_gcmap(), is
+ # left to the cond_call_slowpath helper. We must only restore eax,
+ # if needed.
+ if restore_eax:
+ v = gpr_reg_mgr_cls.all_reg_indexes[eax.value]
+ mc.MOV_rb(eax.value, v * WORD + base_ofs)
+ #
+ # if needed, emit now the guard_no_exception
+ if self.guard_token_no_exception is not None:
+ assembler.generate_guard_no_exception(
+ self.guard_token_no_exception)
+
def cond_call(self, gcmap, imm_func, arglocs, resloc=None):
assert self.guard_success_cc >= 0
- j_location = self.mc.emit_forward_jump_cond(
- rx86.invert_condition(self.guard_success_cc))
+ sp = self.CondCallSlowPath(self.mc, self.guard_success_cc)
+ sp.set_continue_addr(self.mc)
self.guard_success_cc = rx86.cond_none
+ sp.gcmap = gcmap
+ sp.imm_func = imm_func
+ sp.arglocs = arglocs
+ sp.resloc = resloc
+ sp.should_be_saved = self._regalloc.rm.reg_bindings.values()
#
- self.push_gcmap(self.mc, gcmap, store=True)
- #
- # first save away the 4 registers from 'cond_call_register_arguments'
- # plus the register 'eax'
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- should_be_saved = self._regalloc.rm.reg_bindings.values()
- restore_eax = False
- for gpr in cond_call_register_arguments + [eax]:
- if gpr not in should_be_saved or gpr is resloc:
- continue
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- self.mc.MOV_br(v * WORD + base_ofs, gpr.value)
- if gpr is eax:
- restore_eax = True
- #
- # load the 0-to-4 arguments into these registers
- from rpython.jit.backend.x86.jump import remap_frame_layout
- remap_frame_layout(self, arglocs,
- cond_call_register_arguments[:len(arglocs)],
- X86_64_SCRATCH_REG if IS_X86_64 else None)
- #
- # load the constant address of the function to call into eax
- self.mc.MOV(eax, imm_func)
- #
- # figure out which variant of cond_call_slowpath to call, and call it
callee_only = False
floats = False
if self._regalloc is not None:
@@ -2423,34 +2491,25 @@
callee_only = True
if self._regalloc.xrm.reg_bindings:
floats = True
- cond_call_adr = self.cond_call_slowpath[floats * 2 + callee_only]
- self.mc.CALL(imm(follow_jump(cond_call_adr)))
- # if this is a COND_CALL_VALUE, we need to move the result in place
- if resloc is not None and resloc is not eax:
- self.mc.MOV(resloc, eax)
- # restoring the registers saved above, and doing pop_gcmap(), is left
- # to the cond_call_slowpath helper. We must only restore eax, if
- # needed.
- if restore_eax:
- v = gpr_reg_mgr_cls.all_reg_indexes[eax.value]
- self.mc.MOV_rb(eax.value, v * WORD + base_ofs)
+ sp.variant_num = floats * 2 + callee_only
#
- self.mc.patch_forward_jump(j_location)
- # might be overridden again to skip over the following
- # guard_no_exception too
- self.previous_cond_call_jcond = j_location
+ self.pending_slowpaths.append(sp)
+
+ class MallocCondSlowPath(codebuf.SlowPath):
+ def generate_body(self, assembler, mc):
+ assembler.push_gcmap(mc, self.gcmap, store=True)
+ mc.CALL(imm(follow_jump(assembler.malloc_slowpath)))
def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
assert size & (WORD-1) == 0 # must be correctly aligned
self.mc.MOV(ecx, heap(nursery_free_adr))
self.mc.LEA_rm(edx.value, (ecx.value, size))
self.mc.CMP(edx, heap(nursery_top_adr))
- jna_location = self.mc.emit_forward_jump('NA') # patched later
- # save the gcmap
- self.push_gcmap(self.mc, gcmap, store=True)
- self.mc.CALL(imm(follow_jump(self.malloc_slowpath)))
- self.mc.patch_forward_jump(jna_location)
+ sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A'])
+ sp.gcmap = gcmap
self.mc.MOV(heap(nursery_free_adr), edx)
+ sp.set_continue_addr(self.mc)
+ self.pending_slowpaths.append(sp)
def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
sizeloc, gcmap):
@@ -2463,12 +2522,31 @@
else:
self.mc.LEA_ra(edx.value, (ecx.value, sizeloc.value, 0, 0))
self.mc.CMP(edx, heap(nursery_top_adr))
- jna_location = self.mc.emit_forward_jump('NA') # patched later
- # save the gcmap
- self.push_gcmap(self.mc, gcmap, store=True)
- self.mc.CALL(imm(follow_jump(self.malloc_slowpath)))
- self.mc.patch_forward_jump(jna_location)
+ sp = self.MallocCondSlowPath(self.mc, rx86.Conditions['A'])
+ sp.gcmap = gcmap
self.mc.MOV(heap(nursery_free_adr), edx)
+ sp.set_continue_addr(self.mc)
+ self.pending_slowpaths.append(sp)
+
+ class MallocCondVarsizeSlowPath(codebuf.SlowPath):
+ def generate_body(self, assembler, mc):
+ # save the gcmap
+ assembler.push_gcmap(mc, self.gcmap, store=True)
+ kind = self.kind
+ if kind == rewrite.FLAG_ARRAY:
+ mc.MOV_si(WORD, self.itemsize)
+ mc.MOV_ri(ecx.value, self.arraydescr.tid)
+ addr = assembler.malloc_slowpath_varsize
+ else:
+ if kind == rewrite.FLAG_STR:
+ addr = assembler.malloc_slowpath_str
+ else:
+ assert kind == rewrite.FLAG_UNICODE
+ addr = assembler.malloc_slowpath_unicode
+ lengthloc = self.lengthloc
+ assert lengthloc is not ecx and lengthloc is not edx
+ mc.MOV(edx, lengthloc)
+ mc.CALL(imm(follow_jump(addr)))
def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
lengthloc, itemsize, maxlength, gcmap,
@@ -2509,34 +2587,24 @@
# now edx contains the total size in bytes, rounded up to a multiple
# of WORD, plus nursery_free_adr
self.mc.CMP(edx, heap(nursery_top_adr))
- jna_location = self.mc.emit_forward_jump('NA') # patched later
- #
self.mc.patch_forward_jump(ja_location)
- # save the gcmap
- self.push_gcmap(self.mc, gcmap, store=True)
- if kind == rewrite.FLAG_ARRAY:
- self.mc.MOV_si(WORD, itemsize)
- self.mc.MOV(edx, lengthloc)
- self.mc.MOV_ri(ecx.value, arraydescr.tid)
- addr = self.malloc_slowpath_varsize
- else:
- if kind == rewrite.FLAG_STR:
- addr = self.malloc_slowpath_str
- else:
- assert kind == rewrite.FLAG_UNICODE
- addr = self.malloc_slowpath_unicode
- self.mc.MOV(edx, lengthloc)
- self.mc.CALL(imm(follow_jump(addr)))
- jmp_location = self.mc.emit_forward_jump_uncond() # jump to later
- #
- self.mc.patch_forward_jump(jna_location)
- self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
- # write down the tid, but not if it's the result of the CALL
+ # Note: we call the slow path in condition 'A', which may be
+ # true either because the CMP just above really got that
+ # condition, or because we jumped here from ja_location before.
+ # In both cases, the jumps are forward-going and the expected
+ # common case is "not taken".
+ sp = self.MallocCondVarsizeSlowPath(self.mc, rx86.Conditions['A'])
+ sp.gcmap = gcmap
+ sp.kind = kind
+ sp.itemsize = itemsize
+ sp.lengthloc = lengthloc
+ sp.arraydescr = arraydescr
+ # some more code that runs only if we *don't* call the slow
+ # path: write down the tid, and save edx into nursery_free_adr
self.mc.MOV(mem(ecx, 0), imm(arraydescr.tid))
- # while we're at it, this line is not needed if we've done the CALL
self.mc.MOV(heap(nursery_free_adr), edx)
- #
- self.mc.patch_forward_jump(jmp_location)
+ sp.set_continue_addr(self.mc)
+ self.pending_slowpaths.append(sp)
def store_force_descr(self, op, fail_locs, frame_depth):
guard_token = self.implement_guard_recovery(op.opnum,
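As a sanity check, the card-marking arithmetic that WriteBarrierSlowPath
carries over unchanged from the old inline code can be exercised in plain
Python (the shift below is only an example value; the real one comes from
the GC's write-barrier descriptor):

    card_page_shift = 7    # example; really descr.jit_wb_card_page_shift

    def card_bit(index):
        # cards live in a bitmap at negative offsets before the object
        byte_index = index >> card_page_shift
        byte_ofs = ~(byte_index >> 3)      # card byte 0 is at offset -1
        byte_val = 1 << (byte_index & 7)   # bit within that byte
        return byte_ofs, byte_val

    assert card_bit(0) == (-1, 1)                    # bit 0 of byte -1
    assert card_bit(1 << card_page_shift) == (-1, 2) # second card: bit 1
    assert card_bit(8 << card_page_shift) == (-2, 1) # ninth card: byte -2

    # the RegLoc path reaches the same bit with SHR + 'XOR tmp, -8' + BTS,
    # because bit number (byte_index ^ -8) decomposes to the same place:
    for b in range(64):
        n = b ^ -8
        assert n >> 3 == ~(b >> 3) and 1 << (n & 7) == 1 << (b & 7)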
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -11,6 +11,7 @@
r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
from rpython.jit.backend.x86.jump import remap_frame_layout
+from rpython.jit.backend.x86 import codebuf
from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder
from rpython.jit.backend.llsupport import llerrno
from rpython.rtyper.lltypesystem import llmemory, rffi
@@ -293,6 +294,41 @@
tlofsreg = self.get_tlofs_reg() # => esi (possibly reused)
mc.MOV32_mr((tlofsreg.value, lasterror), eax.value)
+ class ReacqGilSlowPath(codebuf.SlowPath):
+ early_jump_addr = 0
+
+ def generate_body(self, assembler, mc):
+ if self.early_jump_addr != 0:
+ # This slow-path has two entry points, with two
+ # conditional jumps. We can jump to the regular start
+ # of this slow-path with the 2nd conditional jump. Or,
+ # we can jump past the "MOV(heap(fastgil), ecx)"
+ # instruction from the 1st conditional jump.
+ # This instruction reverts the rpy_fastgil acquired
+ # previously, so that the general 'reacqgil_addr'
+ # function can acquire it again. It must only be done
+ # if we actually succeeded in acquiring rpy_fastgil.
+ from rpython.jit.backend.x86.assembler import heap
+ mc.MOV(heap(self.fastgil), ecx)
+ offset = mc.get_relative_pos() - self.early_jump_addr
+ mc.overwrite32(self.early_jump_addr-4, offset)
+ # scratch register forgotten here, by get_relative_pos()
+
+ # call the reacqgil() function
+ cb = self.callbuilder
+ if not cb.result_value_saved_early:
+ cb.save_result_value(save_edx=False)
+ if assembler._is_asmgcc():
+ if IS_X86_32:
+ css_value = edx
+ old_value = ecx
+ mc.MOV_sr(4, old_value.value)
+ mc.MOV_sr(0, css_value.value)
+ # on X86_64, they are already in the right registers
+ mc.CALL(imm(follow_jump(assembler.reacqgil_addr)))
+ if not cb.result_value_saved_early:
+ cb.restore_result_value(save_edx=False)
+
def move_real_result_and_call_reacqgil_addr(self, fastgil):
from rpython.jit.backend.x86 import rx86
#
@@ -314,8 +350,8 @@
if not self.result_value_saved_early:
mc.MOV_sr(12, edx.value)
restore_edx = True
- css_value = edx
- old_value = ecx
+ css_value = edx # note: duplicated in ReacqGilSlowPath
+ old_value = ecx #
elif IS_X86_64:
css_value = edi
old_value = esi
@@ -341,35 +377,25 @@
# thread. So here we check if the shadowstack pointer
# is still the same as before we released the GIL (saved
# in 'ebx'), and if not, we fall back to 'reacqgil_addr'.
- jne_location = mc.emit_forward_jump('NE')
+ mc.J_il(rx86.Conditions['NE'], 0xfffff) # patched later
+ early_jump_addr = mc.get_relative_pos(break_basic_block=False)
+ # ^^^ this jump will go to almost the same place as the
+ # ReacqGilSlowPath() computes, but one instruction farther,
+ # i.e. just after the "MOV(heap(fastgil), ecx)".
+
# here, ecx (=old_value) is zero (so rpy_fastgil was in 'released'
# state before the XCHG, but the XCHG acquired it by writing 1)
rst = gcrootmap.get_root_stack_top_addr()
mc = self.mc
mc.CMP(ebx, heap(rst))
- je_location = mc.emit_forward_jump('E')
- # revert the rpy_fastgil acquired above, so that the
- # general 'reacqgil_addr' below can acquire it again...
- mc.MOV(heap(fastgil), ecx)
- # patch the JNE above
- mc.patch_forward_jump(jne_location)
+ sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+ sp.early_jump_addr = early_jump_addr
+ sp.fastgil = fastgil
else:
- je_location = mc.emit_forward_jump('E')
- #
- # Yes, we need to call the reacqgil() function
- if not self.result_value_saved_early:
- self.save_result_value(save_edx=False)
- if self.asm._is_asmgcc():
- if IS_X86_32:
- mc.MOV_sr(4, old_value.value)
- mc.MOV_sr(0, css_value.value)
- # on X86_64, they are already in the right registers
- mc.CALL(imm(follow_jump(self.asm.reacqgil_addr)))
- if not self.result_value_saved_early:
- self.restore_result_value(save_edx=False)
- #
- # patch the JE above
- mc.patch_forward_jump(je_location)
+ sp = self.ReacqGilSlowPath(mc, rx86.Conditions['NE'])
+ sp.callbuilder = self
+ sp.set_continue_addr(mc)
+ self.asm.pending_slowpaths.append(sp)
#
if restore_edx:
mc.MOV_rs(edx.value, 12) # restore this
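Expressed as ordinary Python, the control flow around ReacqGilSlowPath's
two entry points is roughly the following (a sketch for the shadowstack
case; the helper functions are stand-ins, not real APIs):

    def call_reacqgil():       # stands in for 'CALL reacqgil_addr'
        pass

    def release_fastgil():     # stands in for 'MOV(heap(fastgil), ecx)'
        pass

    def after_call_gil_check(got_fastgil, shadowstack_unchanged):
        # mirrors the two conditional jumps into ReacqGilSlowPath
        if not got_fastgil:                # 1st jump: the XCHG did not
            call_reacqgil()                # acquire rpy_fastgil; enter
                                           # past the MOV, nothing to revert
        elif not shadowstack_unchanged:    # 2nd jump: acquired, but another
            release_fastgil()              # thread reused our shadowstack:
            call_reacqgil()                # revert, then reacquire slowly
        # else: fast path falls through with the GIL held, no call at all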
diff --git a/rpython/jit/backend/x86/codebuf.py b/rpython/jit/backend/x86/codebuf.py
--- a/rpython/jit/backend/x86/codebuf.py
+++ b/rpython/jit/backend/x86/codebuf.py
@@ -81,3 +81,30 @@
if break_basic_block:
self.forget_scratch_register()
return BlockBuilderMixin.get_relative_pos(self)
+
+
+class SlowPath(object):
+ def __init__(self, mc, condition):
+ mc.J_il(condition, 0xfffff) # patched later
+ self.cond_jump_addr = mc.get_relative_pos(break_basic_block=False)
+ self.saved_scratch_value_1 = mc.get_scratch_register_known_value()
+
+ def set_continue_addr(self, mc):
+ self.continue_addr = mc.get_relative_pos(break_basic_block=False)
+ self.saved_scratch_value_2 = mc.get_scratch_register_known_value()
+
+ def generate(self, assembler, mc):
+ # no alignment here, prefer compactness for these slow-paths.
+ # patch the original jump to go here
+ offset = mc.get_relative_pos() - self.cond_jump_addr
+ mc.overwrite32(self.cond_jump_addr-4, offset)
+ # restore the knowledge of the scratch register value
+ # (this does not emit any code)
+ mc.restore_scratch_register_known_value(self.saved_scratch_value_1)
+ # generate the body of the slow-path
+ self.generate_body(assembler, mc)
+ # reload (if needed) the (possibly different) scratch register value
+ mc.load_scratch_if_known(self.saved_scratch_value_2)
+ # jump back
+ curpos = mc.get_relative_pos() + 5
+ mc.JMP_l(self.continue_addr - curpos)
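The rel32 bookkeeping in SlowPath can be modeled in a few lines of plain
Python (toy buffer; x86 relative jumps encode the distance from the end of
the jump instruction, whose last 4 bytes are the 32-bit immediate, hence
the 'cond_jump_addr - 4' and the '+ 5'):

    import struct

    class ToyMC(object):
        def __init__(self):
            self.buf = bytearray()
        def get_relative_pos(self):
            return len(self.buf)
        def overwrite32(self, pos, value):
            self.buf[pos-4:pos] = struct.pack('<i', value)

    mc = ToyMC()
    # like mc.J_il(cond, 0xfffff): opcode plus a placeholder immediate
    mc.buf += b'\x0f\x85' + struct.pack('<i', 0xfffff)    # JNE rel32
    cond_jump_addr = mc.get_relative_pos()  # position just past the Jcc
    continue_addr = cond_jump_addr          # fast path resumes right here
    mc.buf += b'\x90' * 10                  # rest of the main block (NOPs)

    # flush_pending_slowpaths(): the body starts at the current end
    offset = mc.get_relative_pos() - cond_jump_addr
    mc.overwrite32(cond_jump_addr - 4, offset)  # Jcc now targets the body
    mc.buf += b'\x90' * 3                       # stands in for the body
    curpos = mc.get_relative_pos() + 5          # +5: size of JMP rel32
    mc.buf += b'\xe9' + struct.pack('<i', continue_addr - curpos)  # back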
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -373,7 +373,7 @@
class LocationCodeBuilder(object):
_mixin_ = True
- _scratch_register_value = 0 # 0 means 'unknown'
+ _scratch_register_value = -1 # -1 means 'unknown'
def _binaryop(name):
@@ -552,7 +552,7 @@
# If we are within a "reuse_scratch_register" block, we remember the
# last value we loaded to the scratch register and encode the address
# as an offset from that if we can
- if self._scratch_register_value != 0:
+ if self._scratch_register_value != -1:
offset = r_uint(addr) - r_uint(self._scratch_register_value)
offset = intmask(offset)
if rx86.fits_in_32bits(offset):
@@ -593,7 +593,7 @@
return (reg, scalereg, scale, ofs)
def _load_scratch(self, value):
- if self._scratch_register_value != 0:
+ if self._scratch_register_value != -1:
if self._scratch_register_value == value:
#print '_load_scratch(%x) [REUSED]' % (value,)
return
@@ -619,7 +619,7 @@
self.MOV_ri(X86_64_SCRATCH_REG.value, value)
def forget_scratch_register(self):
- self._scratch_register_value = 0
+ self._scratch_register_value = -1
def get_scratch_register_known_value(self):
return self._scratch_register_value
@@ -627,6 +627,11 @@
def restore_scratch_register_known_value(self, saved_value):
self._scratch_register_value = saved_value
+ def load_scratch_if_known(self, saved_value):
+ if saved_value != -1:
+ assert IS_X86_64
+ self._load_scratch(saved_value)
+
def trap(self):
self.INT3()
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -211,9 +211,9 @@
def test_64bit_address_4(self):
base_addr = intmask(0xFEDCBA9876543210)
cb = LocationCodeBuilder64()
- assert cb._scratch_register_value == 0
+ assert cb._scratch_register_value == -1
cb.MOV(ecx, AddressLoc(edx, esi, 2, base_addr))
- assert cb._scratch_register_value == 0
+ assert cb._scratch_register_value == -1
# this case is a CMP_ra
#
expected_instructions = (
diff --git a/rpython/jit/backend/x86/test/test_runner.py b/rpython/jit/backend/x86/test/test_runner.py
--- a/rpython/jit/backend/x86/test/test_runner.py
+++ b/rpython/jit/backend/x86/test/test_runner.py
@@ -39,7 +39,7 @@
'nop; ' # for the label
'add; test; je; jmp;') # plus some padding
bridge_loop_instructions = (
- 'cmp; jge; mov;( movabs;| lea;)? mov; (mov|movabs|lea); call; mov(abs)?; jmp;')
+ 'cmp; jl; mov(abs)?; jmp;')
def get_cpu(self):
cpu = CPU(rtyper=None, stats=FakeStats())