[pypy-commit] pypy default: In-progress: streamline the write barrier called by the x86
arigo
noreply at buildbot.pypy.org
Sun Jun 3 12:18:58 CEST 2012
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r55276:247ff481a07e
Date: 2012-06-03 01:37 +0200
http://bitbucket.org/pypy/pypy/changeset/247ff481a07e/
Log: In-progress: streamline the write barrier called by the x86 jit
backend. The goal is to find out whether a simpler write barrier would
be better after all. An additional motivation is that there is a very
rare, potentially buggy corner case in which xmm registers are not
saved around the write barrier.
diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -577,7 +577,6 @@
def __init__(self, gc_ll_descr):
self.llop1 = gc_ll_descr.llop1
self.WB_FUNCPTR = gc_ll_descr.WB_FUNCPTR
- self.WB_ARRAY_FUNCPTR = gc_ll_descr.WB_ARRAY_FUNCPTR
self.fielddescr_tid = gc_ll_descr.fielddescr_tid
#
GCClass = gc_ll_descr.GCClass
@@ -592,6 +591,11 @@
self.jit_wb_card_page_shift = GCClass.JIT_WB_CARD_PAGE_SHIFT
self.jit_wb_cards_set_byteofs, self.jit_wb_cards_set_singlebyte = (
self.extract_flag_byte(self.jit_wb_cards_set))
+ #
+ # the x86 backend uses the following "accidental" facts to
+ # avoid one instruction:
+ assert self.jit_wb_cards_set_byteofs == self.jit_wb_if_flag_byteofs
+ assert self.jit_wb_cards_set_singlebyte == -0x80
else:
self.jit_wb_cards_set = 0
@@ -615,7 +619,7 @@
# returns a function with arguments [array, index, newvalue]
llop1 = self.llop1
funcptr = llop1.get_write_barrier_from_array_failing_case(
- self.WB_ARRAY_FUNCPTR)
+ self.WB_FUNCPTR)
funcaddr = llmemory.cast_ptr_to_adr(funcptr)
return cpu.cast_adr_to_int(funcaddr) # this may return 0
@@ -699,9 +703,7 @@
def _setup_write_barrier(self):
self.WB_FUNCPTR = lltype.Ptr(lltype.FuncType(
- [llmemory.Address, llmemory.Address], lltype.Void))
- self.WB_ARRAY_FUNCPTR = lltype.Ptr(lltype.FuncType(
- [llmemory.Address, lltype.Signed, llmemory.Address], lltype.Void))
+ [llmemory.Address], lltype.Void))
self.write_barrier_descr = WriteBarrierDescr(self)
def _make_functions(self, really_not_translated):
@@ -859,8 +861,7 @@
# the GC, and call it immediately
llop1 = self.llop1
funcptr = llop1.get_write_barrier_failing_case(self.WB_FUNCPTR)
- funcptr(llmemory.cast_ptr_to_adr(gcref_struct),
- llmemory.cast_ptr_to_adr(gcref_newptr))
+ funcptr(llmemory.cast_ptr_to_adr(gcref_struct))
def can_use_nursery_malloc(self, size):
return size < self.max_size_of_young_obj
diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -83,6 +83,7 @@
self.float_const_abs_addr = 0
self.malloc_slowpath1 = 0
self.malloc_slowpath2 = 0
+ self.wb_slowpath = [0, 0, 0, 0]
self.memcpy_addr = 0
self.setup_failure_recovery()
self._debug = False
@@ -109,9 +110,13 @@
self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
self._build_failure_recovery(False)
self._build_failure_recovery(True)
+ self._build_wb_slowpath(False)
+ self._build_wb_slowpath(True)
if self.cpu.supports_floats:
self._build_failure_recovery(False, withfloats=True)
self._build_failure_recovery(True, withfloats=True)
+ self._build_wb_slowpath(False, withfloats=True)
+ self._build_wb_slowpath(True, withfloats=True)
support.ensure_sse2_floats()
self._build_float_constants()
self._build_propagate_exception_path()
@@ -344,6 +349,82 @@
rawstart = mc.materialize(self.cpu.asmmemmgr, [])
self.stack_check_slowpath = rawstart
+ def _build_wb_slowpath(self, withcards, withfloats=False):
+ descr = self.cpu.gc_ll_descr.write_barrier_descr
+ if descr is None:
+ return
+ if not withcards:
+ func = descr.get_write_barrier_fn(self.cpu)
+ else:
+ if descr.jit_wb_cards_set == 0:
+ return
+ func = descr.get_write_barrier_from_array_fn(self.cpu)
+ if func == 0:
+ return
+ #
+ # This builds a helper function called from the slow path of
+ # write barriers. It must save all registers, and optionally
+ # all XMM registers. It takes a single argument just pushed
+ # on the stack even on X86_64. It must restore stack alignment
+ # accordingly.
+ mc = codebuf.MachineCodeBlockWrapper()
+ #
+ frame_size = (1 + # my argument, considered part of my frame
+ 1 + # my return address
+ len(self._regalloc.rm.save_around_call_regs))
+ if withfloats:
+ frame_size += 16 # X86_32: 16 words for 8 registers;
+ # X86_64: just 16 registers
+ if IS_X86_32:
+ frame_size += 1 # argument to pass to the call
+ #
+ # align to a multiple of 16 bytes
+ frame_size = (frame_size + (CALL_ALIGN-1)) & ~(CALL_ALIGN-1)
+ #
+ correct_esp_by = (frame_size - 2) * WORD
+ mc.SUB_ri(esp.value, correct_esp_by)
+ #
+ ofs = correct_esp_by
+ for reg in self._regalloc.rm.save_around_call_regs:
+ ofs -= WORD
+ mc.MOV_sr(ofs, reg.value)
+ if withfloats:
+ for reg in self._regalloc.xmm.save_around_call_regs:
+ ofs -= 8
+ mc.MOVSD_sx(ofs, reg.value)
+ #
+ if IS_X86_32:
+ mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
+ mc.MOV_sr(0, eax.value)
+ elif IS_X86_64:
+ mc.MOV_rs(edi.value, (frame_size - 1) * WORD)
+ mc.CALL(imm(func))
+ #
+ if withcards:
+ # A final TEST8 before the RET, for the caller. Careful to
+ # not follow this instruction with another one that changes
+ # the status of the CPU flags!
+ mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
+ mc.TEST8(addr_add_const(eax, descr.jit_wb_if_flag_byteofs),
+ imm(-0x80))
+ #
+ ofs = correct_esp_by
+ for reg in self._regalloc.rm.save_around_call_regs:
+ ofs -= WORD
+ mc.MOV_rs(reg.value, ofs)
+ if withfloats:
+ for reg in self._regalloc.xmm.save_around_call_regs:
+ ofs -= 8
+ mc.MOVSD_xs(reg.value, ofs)
+ #
+ # ADD esp, correct_esp_by --- but cannot use ADD, because
+ # of its effects on the CPU flags
+ mc.LEA_rs(esp.value, correct_esp_by)
+ mc.RET(WORD)
+ #
+ rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+ self.wb_slowpath[withcards + 2 * withfloats] = rawstart
+
@staticmethod
@rgc.no_collect
def _release_gil_asmgcc(css):
@@ -2324,102 +2405,76 @@
def genop_discard_cond_call_gc_wb(self, op, arglocs):
# Write code equivalent to write_barrier() in the GC: it checks
- # a flag in the object at arglocs[0], and if set, it calls the
- # function remember_young_pointer() from the GC. The arguments
- # to the call are in arglocs[:N]. The rest, arglocs[N:], contains
- # registers that need to be saved and restored across the call.
- # N is either 2 (regular write barrier) or 3 (array write barrier).
+ # a flag in the object at arglocs[0], and if set, it calls a
+ # helper piece of assembler. The latter saves registers as needed
+ # and calls the function jit_remember_young_pointer() from the GC.
descr = op.getdescr()
if we_are_translated():
cls = self.cpu.gc_ll_descr.has_write_barrier_class()
assert cls is not None and isinstance(descr, cls)
#
opnum = op.getopnum()
- if opnum == rop.COND_CALL_GC_WB:
- N = 2
- func = descr.get_write_barrier_fn(self.cpu)
- card_marking = False
- elif opnum == rop.COND_CALL_GC_WB_ARRAY:
- N = 3
- func = descr.get_write_barrier_from_array_fn(self.cpu)
- assert func != 0
- card_marking = descr.jit_wb_cards_set != 0
- else:
- raise AssertionError(opnum)
+ card_marking = False
+ mask = descr.jit_wb_if_flag_singlebyte
+ if opnum == rop.COND_CALL_GC_WB_ARRAY and descr.jit_wb_cards_set != 0:
+ # assumptions the rest of the function depends on:
+ assert (descr.jit_wb_cards_set_byteofs ==
+ descr.jit_wb_if_flag_byteofs)
+ assert descr.jit_wb_cards_set_singlebyte == -0x80
+ card_marking = True
+ mask = descr.jit_wb_if_flag_singlebyte | -0x80
#
loc_base = arglocs[0]
self.mc.TEST8(addr_add_const(loc_base, descr.jit_wb_if_flag_byteofs),
- imm(descr.jit_wb_if_flag_singlebyte))
+ imm(mask))
self.mc.J_il8(rx86.Conditions['Z'], 0) # patched later
jz_location = self.mc.get_relative_pos()
# for cond_call_gc_wb_array, also add another fast path:
# if GCFLAG_CARDS_SET, then we can just set one bit and be done
if card_marking:
- self.mc.TEST8(addr_add_const(loc_base,
- descr.jit_wb_cards_set_byteofs),
- imm(descr.jit_wb_cards_set_singlebyte))
- self.mc.J_il8(rx86.Conditions['NZ'], 0) # patched later
- jnz_location = self.mc.get_relative_pos()
+ # GCFLAG_CARDS_SET is in this byte at 0x80, so this fact can
+ # be checked by the status flags of the previous TEST8
+ self.mc.J_il8(rx86.Conditions['S'], 0) # patched later
+ js_location = self.mc.get_relative_pos()
else:
- jnz_location = 0
+ js_location = 0
- # the following is supposed to be the slow path, so whenever possible
- # we choose the most compact encoding over the most efficient one.
- if IS_X86_32:
- limit = -1 # push all arglocs on the stack
- elif IS_X86_64:
- limit = N - 1 # push only arglocs[N:] on the stack
- for i in range(len(arglocs)-1, limit, -1):
- loc = arglocs[i]
- if isinstance(loc, RegLoc):
- self.mc.PUSH_r(loc.value)
- else:
- assert not IS_X86_64 # there should only be regs in arglocs[N:]
- self.mc.PUSH_i32(loc.getint())
- if IS_X86_64:
- # We clobber these registers to pass the arguments, but that's
- # okay, because consider_cond_call_gc_wb makes sure that any
- # caller-save registers with values in them are present in
- # arglocs[N:] too, so they are saved on the stack above and
- # restored below.
- if N == 2:
- callargs = [edi, esi]
- else:
- callargs = [edi, esi, edx]
- remap_frame_layout(self, arglocs[:N], callargs,
- X86_64_SCRATCH_REG)
- #
- # misaligned stack in the call, but it's ok because the write barrier
- # is not going to call anything more. Also, this assumes that the
- # write barrier does not touch the xmm registers. (Slightly delicate
- # assumption, given that the write barrier can end up calling the
- # platform's malloc() from AddressStack.append(). XXX may need to
- # be done properly)
- self.mc.CALL(imm(func))
- if IS_X86_32:
- self.mc.ADD_ri(esp.value, N*WORD)
- for i in range(N, len(arglocs)):
- loc = arglocs[i]
- assert isinstance(loc, RegLoc)
- self.mc.POP_r(loc.value)
+ # Write only a CALL to the helper prepared in advance, passing it as
+ # argument the address of the structure we are writing into
+ # (the first argument to COND_CALL_GC_WB).
+ self.mc.PUSH(loc_base) # push loc_base, either a reg or an immed
+ helper_num = card_marking
+ if self._regalloc.xrm.reg_bindings:
+ helper_num += 2
+ self.mc.CALL(imm(self.wb_slowpath[helper_num]))
- # if GCFLAG_CARDS_SET, then we can do the whole thing that would
- # be done in the CALL above with just four instructions, so here
- # is an inline copy of them
if card_marking:
- self.mc.JMP_l8(0) # jump to the exit, patched later
- jmp_location = self.mc.get_relative_pos()
- # patch the JNZ above
- offset = self.mc.get_relative_pos() - jnz_location
+ # The helper ends again with a check of the flag in the object.
+ # So here, we can simply write again a 'JNS', which will be
+ # taken if GCFLAG_CARDS_SET is still not set.
+ self.mc.J_il8(rx86.Conditions['NS'], 0) # patched later
+ jns_location = self.mc.get_relative_pos()
+ #
+ # patch the JS above
+ offset = self.mc.get_relative_pos() - js_location
assert 0 < offset <= 127
- self.mc.overwrite(jnz_location-1, chr(offset))
+ self.mc.overwrite(js_location-1, chr(offset))
#
+ # case GCFLAG_CARDS_SET: emit a few instructions to do
+ # directly the card flag setting
loc_index = arglocs[1]
if isinstance(loc_index, RegLoc):
- # choose a scratch register
- tmp1 = loc_index
- self.mc.PUSH_r(tmp1.value)
+ if IS_X86_64 and isinstance(loc_base, RegLoc):
+ # copy loc_index into r11
+ tmp1 = X86_64_SCRATCH_REG
+ self.mc.MOV_rr(tmp1.value, loc_index.value)
+ final_pop = False
+ else:
+ # must save the register loc_index before it is mutated
+ self.mc.PUSH_r(loc_index.value)
+ tmp1 = loc_index
+ final_pop = True
# SHR tmp, card_page_shift
self.mc.SHR_ri(tmp1.value, descr.jit_wb_card_page_shift)
# XOR tmp, -8
@@ -2427,7 +2482,9 @@
# BTS [loc_base], tmp
self.mc.BTS(addr_add_const(loc_base, 0), tmp1)
# done
- self.mc.POP_r(tmp1.value)
+ if final_pop:
+ self.mc.POP_r(loc_index.value)
+ #
elif isinstance(loc_index, ImmedLoc):
byte_index = loc_index.value >> descr.jit_wb_card_page_shift
byte_ofs = ~(byte_index >> 3)
@@ -2435,11 +2492,12 @@
self.mc.OR8(addr_add_const(loc_base, byte_ofs), imm(byte_val))
else:
raise AssertionError("index is neither RegLoc nor ImmedLoc")
- # patch the JMP above
- offset = self.mc.get_relative_pos() - jmp_location
+ #
+ # patch the JNS above
+ offset = self.mc.get_relative_pos() - jns_location
assert 0 < offset <= 127
- self.mc.overwrite(jmp_location-1, chr(offset))
- #
+ self.mc.overwrite(jns_location-1, chr(offset))
+
# patch the JZ above
offset = self.mc.get_relative_pos() - jz_location
assert 0 < offset <= 127
diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -980,16 +980,6 @@
# or setarrayitem_gc. It avoids loading it twice from the memory.
arglocs = [self.rm.make_sure_var_in_reg(op.getarg(i), args)
for i in range(N)]
- # add eax, ecx and edx as extra "arguments" to ensure they are
- # saved and restored. Fish in self.rm to know which of these
- # registers really need to be saved (a bit of a hack). Moreover,
- # we don't save and restore any SSE register because the called
- # function, a GC write barrier, is known not to touch them.
- # See remember_young_pointer() in rpython/memory/gc/generation.py.
- for v, reg in self.rm.reg_bindings.items():
- if (reg in self.rm.save_around_call_regs
- and self.rm.stays_alive(v)):
- arglocs.append(reg)
self.PerformDiscard(op, arglocs)
self.rm.possibly_free_vars_for_op(op)
diff --git a/pypy/rpython/memory/gc/minimark.py b/pypy/rpython/memory/gc/minimark.py
--- a/pypy/rpython/memory/gc/minimark.py
+++ b/pypy/rpython/memory/gc/minimark.py
@@ -111,10 +111,13 @@
# The following flag is set on externally raw_malloc'ed arrays of pointers.
# They are allocated with some extra space in front of them for a bitfield,
# one bit per 'card_page_indices' indices.
-GCFLAG_HAS_CARDS = first_gcflag << 5
-GCFLAG_CARDS_SET = first_gcflag << 6 # <- at least one card bit is set
+GCFLAG_HAS_CARDS = first_gcflag << 6
+GCFLAG_CARDS_SET = first_gcflag << 7 # <- at least one card bit is set
+# note that GCFLAG_CARDS_SET is the most significant bit of a byte:
+# this is required for the JIT (x86)
-TID_MASK = (first_gcflag << 7) - 1
+#GCFLAG_UNUSED = first_gcflag << 5 # this flag is free
+TID_MASK = (first_gcflag << 8) - 1
FORWARDSTUB = lltype.GcStruct('forwarding_stub',
@@ -994,12 +997,9 @@
def _init_writebarrier_logic(self):
DEBUG = self.DEBUG
# The purpose of attaching remember_young_pointer to the instance
- # instead of keeping it as a regular method is to help the JIT call it.
- # Additionally, it makes the code in write_barrier() marginally smaller
+ # instead of keeping it as a regular method is to
+ # make the code in write_barrier() marginally smaller
# (which is important because it is inlined *everywhere*).
- # For x86, there is also an extra requirement: when the JIT calls
- # remember_young_pointer(), it assumes that it will not touch the SSE
- # registers, so it does not save and restore them (that's a *hack*!).
def remember_young_pointer(addr_struct, newvalue):
# 'addr_struct' is the address of the object in which we write.
# 'newvalue' is the address that we are going to write in there.
@@ -1033,6 +1033,17 @@
remember_young_pointer._dont_inline_ = True
self.remember_young_pointer = remember_young_pointer
#
+ def jit_remember_young_pointer(addr_struct):
+ # minimal version of the above, with just one argument,
+ # called by the JIT when GCFLAG_TRACK_YOUNG_PTRS is set
+ self.old_objects_pointing_to_young.append(addr_struct)
+ objhdr = self.header(addr_struct)
+ objhdr.tid &= ~GCFLAG_TRACK_YOUNG_PTRS
+ if objhdr.tid & GCFLAG_NO_HEAP_PTRS:
+ objhdr.tid &= ~GCFLAG_NO_HEAP_PTRS
+ self.prebuilt_root_objects.append(addr_struct)
+ self.jit_remember_young_pointer = jit_remember_young_pointer
+ #
if self.card_page_indices > 0:
self._init_writebarrier_with_card_marker()
@@ -1087,60 +1098,21 @@
self.remember_young_pointer_from_array2 = (
remember_young_pointer_from_array2)
- # xxx trying it out for the JIT: a 3-arguments version of the above
- def remember_young_pointer_from_array3(addr_array, index, newvalue):
+ def jit_remember_young_pointer_from_array(addr_array):
+ # minimal version of the above, with just one argument,
+ # called by the JIT when GCFLAG_TRACK_YOUNG_PTRS is set
+ # but GCFLAG_CARDS_SET is cleared. This tries to set
+ # GCFLAG_CARDS_SET if possible; otherwise, it falls back
+ # to jit_remember_young_pointer().
objhdr = self.header(addr_array)
- #
- # a single check for the common case of neither GCFLAG_HAS_CARDS
- # nor GCFLAG_NO_HEAP_PTRS
- if objhdr.tid & (GCFLAG_HAS_CARDS | GCFLAG_NO_HEAP_PTRS) == 0:
- # common case: fast path, jump to the end of the function
- pass
- elif objhdr.tid & GCFLAG_HAS_CARDS == 0:
- # no cards, but GCFLAG_NO_HEAP_PTRS is set.
- objhdr.tid &= ~GCFLAG_NO_HEAP_PTRS
- self.prebuilt_root_objects.append(addr_array)
- # jump to the end of the function
+ if objhdr.tid & GCFLAG_HAS_CARDS:
+ self.old_objects_with_cards_set.append(addr_array)
+ objhdr.tid |= GCFLAG_CARDS_SET
else:
- # case with cards.
- #
- # If the newly written address does not actually point to a
- # young object, leave now.
- if not self.appears_to_be_young(newvalue):
- return
- #
- # 'addr_array' is a raw_malloc'ed array with card markers
- # in front. Compute the index of the bit to set:
- bitindex = index >> self.card_page_shift
- byteindex = bitindex >> 3
- bitmask = 1 << (bitindex & 7)
- #
- # If the bit is already set, leave now.
- addr_byte = self.get_card(addr_array, byteindex)
- byte = ord(addr_byte.char[0])
- if byte & bitmask:
- return
- addr_byte.char[0] = chr(byte | bitmask)
- #
- if objhdr.tid & GCFLAG_CARDS_SET == 0:
- self.old_objects_with_cards_set.append(addr_array)
- objhdr.tid |= GCFLAG_CARDS_SET
- return
- #
- # Logic for the no-cards case, put here to minimize the number
- # of checks done at the start of the function
- if DEBUG: # note: PYPY_GC_DEBUG=1 does not enable this
- ll_assert(self.debug_is_old_object(addr_array),
- "young array with no card but GCFLAG_TRACK_YOUNG_PTRS")
- #
- if self.appears_to_be_young(newvalue):
- self.old_objects_pointing_to_young.append(addr_array)
- objhdr.tid &= ~GCFLAG_TRACK_YOUNG_PTRS
+ self.jit_remember_young_pointer(addr_array)
- remember_young_pointer_from_array3._dont_inline_ = True
- assert self.card_page_indices > 0
- self.remember_young_pointer_from_array3 = (
- remember_young_pointer_from_array3)
+ self.jit_remember_young_pointer_from_array = (
+ jit_remember_young_pointer_from_array)
def get_card(self, obj, byteindex):
size_gc_header = self.gcheaderbuilder.size_gc_header
diff --git a/pypy/rpython/memory/gctransform/framework.py b/pypy/rpython/memory/gctransform/framework.py
--- a/pypy/rpython/memory/gctransform/framework.py
+++ b/pypy/rpython/memory/gctransform/framework.py
@@ -455,13 +455,12 @@
annmodel.SomeAddress()],
annmodel.s_None,
inline=True)
- func = getattr(gcdata.gc, 'remember_young_pointer', None)
+ func = getattr(gcdata.gc, 'jit_remember_young_pointer', None)
if func is not None:
# func should not be a bound method, but a real function
assert isinstance(func, types.FunctionType)
self.write_barrier_failing_case_ptr = getfn(func,
- [annmodel.SomeAddress(),
- annmodel.SomeAddress()],
+ [annmodel.SomeAddress()],
annmodel.s_None)
func = getattr(GCClass, 'write_barrier_from_array', None)
if func is not None:
@@ -472,16 +471,15 @@
annmodel.SomeInteger()],
annmodel.s_None,
inline=True)
- func = getattr(gcdata.gc, 'remember_young_pointer_from_array3',
+ func = getattr(gcdata.gc,
+ 'jit_remember_young_pointer_from_array',
None)
if func is not None:
# func should not be a bound method, but a real function
assert isinstance(func, types.FunctionType)
self.write_barrier_from_array_failing_case_ptr = \
getfn(func,
- [annmodel.SomeAddress(),
- annmodel.SomeInteger(),
- annmodel.SomeAddress()],
+ [annmodel.SomeAddress()],
annmodel.s_None)
self.statistics_ptr = getfn(GCClass.statistics.im_func,
[s_gc, annmodel.SomeInteger()],
More information about the pypy-commit
mailing list