[pypy-commit] pypy arm64: start fighting with write barriers

fijal pypy.commits at gmail.com
Mon Jun 17 09:12:30 EDT 2019


Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: arm64
Changeset: r96811:09bce457dc4b
Date: 2019-06-17 13:11 +0000
http://bitbucket.org/pypy/pypy/changeset/09bce457dc4b/

Log:	start fighting with write barriers

diff --git a/rpython/jit/backend/aarch64/assembler.py b/rpython/jit/backend/aarch64/assembler.py
--- a/rpython/jit/backend/aarch64/assembler.py
+++ b/rpython/jit/backend/aarch64/assembler.py
@@ -30,6 +30,7 @@
     def __init__(self, cpu, translate_support_code=False):
         ResOpAssembler.__init__(self, cpu, translate_support_code)
         self.failure_recovery_code = [0, 0, 0, 0]
+        self.wb_slowpath = [0, 0, 0, 0, 0]
 
     def assemble_loop(self, jd_id, unique_id, logger, loopname, inputargs,
                       operations, looptoken, log):
@@ -318,7 +319,76 @@
         self.mc.B(self.propagate_exception_path)
 
     def _build_wb_slowpath(self, withcards, withfloats=False, for_frame=False):
-        pass # XXX
+        descr = self.cpu.gc_ll_descr.write_barrier_descr
+        if descr is None:
+            return
+        if not withcards:
+            func = descr.get_write_barrier_fn(self.cpu)
+        else:
+            if descr.jit_wb_cards_set == 0:
+                return
+            func = descr.get_write_barrier_from_array_fn(self.cpu)
+            if func == 0:
+                return
+        #
+        # This builds a helper function called from the slow path of
+        # write barriers.  It must save all registers, and optionally
+        # all vfp registers.  It takes a single argument which is in x0.
+        # It must maintain the required stack alignment.
+        mc = InstrBuilder()
+        #
+        exc0 = exc1 = None
+        mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.STR_ri(r.ip0.value, r.sp.value, WORD)
+        mc.STR_ri(r.lr.value, r.sp.value, 0)
+        if not for_frame:
+            self._push_all_regs_to_jitframe(mc, [], withfloats, callee_only=True)
+        else:
+            # NOTE: don't save registers on the jitframe here!  It might
+            # override already-saved values that will be restored
+            # later...
+            #
+            # we're possibly called from the slowpath of malloc
+            # save the caller saved registers
+            # assuming we do not collect here
+            exc0, exc1 = r.r4, r.r5
+            XXX
+            mc.PUSH([gpr.value for gpr in r.caller_resp] + [exc0.value, exc1.value])
+            mc.VPUSH([vfpr.value for vfpr in r.caller_vfp_resp])
+
+            self._store_and_reset_exception(mc, exc0, exc1)
+        mc.BL(func)
+        #
+        if not for_frame:
+            self._pop_all_regs_from_jitframe(mc, [], withfloats, callee_only=True)
+        else:
+            XXX
+            self._restore_exception(mc, exc0, exc1)
+            mc.VPOP([vfpr.value for vfpr in r.caller_vfp_resp])
+            assert exc0 is not None
+            assert exc1 is not None
+            mc.POP([gpr.value for gpr in r.caller_resp] +
+                            [exc0.value, exc1.value])
+        #
+        if withcards:
+            # A final TEST8 before the RET, for the caller.  Careful to
+            # not follow this instruction with another one that changes
+            # the status of the CPU flags!
+            YYY
+            mc.LDRB_ri(r.ip.value, r.r0.value,
+                                    imm=descr.jit_wb_if_flag_byteofs)
+            mc.TST_ri(r.ip.value, imm=0x80)
+        #
+        mc.LDR_ri(r.ip0.value, r.sp.value, WORD)
+        mc.LDR_ri(r.ip1.value, r.sp.value, 0)
+        mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+        mc.RET_r(r.ip1.value)
+        #
+        rawstart = mc.materialize(self.cpu, [])
+        if for_frame:
+            self.wb_slowpath[4] = rawstart
+        else:
+            self.wb_slowpath[withcards + 2 * withfloats] = rawstart
 
     def build_frame_realloc_slowpath(self):
         # this code should do the following steps
diff --git a/rpython/jit/backend/aarch64/codebuilder.py b/rpython/jit/backend/aarch64/codebuilder.py
--- a/rpython/jit/backend/aarch64/codebuilder.py
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -256,6 +256,11 @@
         self.write32((base << 21) | (0b11111 << 16) | (cond << 12) | (1 << 10) |
                      (0b11111 << 5) | rd)
 
+    def TST_rr_shift(self, rn, rm, shift):
+        assert 0 <= shift <= 64
+        base = 0b11101010000
+        self.write32((base << 21) | (rm << 16) | (shift << 10) | (rn << 5) | 0b11111)
+
     def NOP(self):
         self.write32(0b11010101000000110010000000011111)
 
diff --git a/rpython/jit/backend/aarch64/opassembler.py b/rpython/jit/backend/aarch64/opassembler.py
--- a/rpython/jit/backend/aarch64/opassembler.py
+++ b/rpython/jit/backend/aarch64/opassembler.py
@@ -1,10 +1,12 @@
 
+from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.metainterp.history import (AbstractFailDescr, ConstInt,
                                             INT, FLOAT, REF)
 from rpython.jit.backend.aarch64 import registers as r
+from rpython.jit.backend.aarch64.codebuilder import OverwritingBuilder
 from rpython.jit.backend.aarch64.callbuilder import Aarch64CallBuilder
 from rpython.jit.backend.arm import conditions as c
-from rpython.jit.backend.aarch64.arch import JITFRAME_FIXED_SIZE
+from rpython.jit.backend.aarch64.arch import JITFRAME_FIXED_SIZE, WORD
 from rpython.jit.backend.aarch64.locations import imm
 from rpython.jit.backend.llsupport.assembler import GuardToken, BaseAssembler
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
@@ -493,6 +495,128 @@
     def emit_op_restore_exception(self, op, arglocs):
         self._restore_exception(self.mc, arglocs[1], arglocs[0])
 
+    def emit_op_cond_call_gc_wb(self, op, arglocs):
+        self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs)
+
+    def _write_barrier_fastpath(self, mc, descr, arglocs, array=False, is_frame=False):
+        # Write code equivalent to write_barrier() in the GC: it checks
+        # a flag in the object at arglocs[0], and if set, it calls a
+        # helper piece of assembler.  The latter saves registers as needed
+        # and calls the function remember_young_pointer() from the GC.
+        if we_are_translated():
+            cls = self.cpu.gc_ll_descr.has_write_barrier_class()
+            assert cls is not None and isinstance(descr, cls)
+        #
+        card_marking = False
+        mask = descr.jit_wb_if_flag_singlebyte
+        if array and descr.jit_wb_cards_set != 0:
+            # assumptions the rest of the function depends on:
+            assert (descr.jit_wb_cards_set_byteofs ==
+                    descr.jit_wb_if_flag_byteofs)
+            assert descr.jit_wb_cards_set_singlebyte == -0x80
+            card_marking = True
+            mask = descr.jit_wb_if_flag_singlebyte | -0x80
+        #
+        loc_base = arglocs[0]
+        if is_frame:
+            assert loc_base is r.fp
+        mc.LDRB_ri(r.ip0.value, loc_base.value, descr.jit_wb_if_flag_byteofs)
+        mask &= 0xFF
+        mc.MOVZ_r_u16(r.ip1.value, mask, 0)
+        mc.TST_rr_shift(r.ip0.value, r.ip1.value, 0)
+        jz_location = mc.currpos()
+        mc.BRK()
+
+        # for cond_call_gc_wb_array, also add another fast path:
+        # if GCFLAG_CARDS_SET, then we can just set one bit and be done
+        if card_marking:
+            XXX
+            # GCFLAG_CARDS_SET is in this byte at 0x80
+            mc.TST_ri(r.ip.value, imm=0x80)
+
+            js_location = mc.currpos()
+            mc.BKPT()
+        else:
+            js_location = 0
+
+        # Write only a CALL to the helper prepared in advance, passing it as
+        # argument the address of the structure we are writing into
+        # (the first argument to COND_CALL_GC_WB).
+        helper_num = card_marking
+        if is_frame:
+            helper_num = 4
+        elif self._regalloc is not None and self._regalloc.vfprm.reg_bindings:
+            helper_num += 2
+        if self.wb_slowpath[helper_num] == 0:    # tests only
+            assert not we_are_translated()
+            self.cpu.gc_ll_descr.write_barrier_descr = descr
+            self._build_wb_slowpath(card_marking,
+                                    bool(self._regalloc.vfprm.reg_bindings))
+            assert self.wb_slowpath[helper_num] != 0
+        #
+        if loc_base is not r.x0:
+            # push two registers to keep stack aligned
+            mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+            mc.STR_ri(r.x0.value, r.sp.value, WORD)
+            mc.STR_ri(loc_base.value, r.sp.value, 0)
+            mc.MOV_rr(r.x0.value, loc_base.value)
+            if is_frame:
+                assert loc_base is r.fp
+        mc.BL(self.wb_slowpath[helper_num])
+        if loc_base is not r.x0:
+            mc.LDR_ri(r.x0.value, r.sp.value, WORD)
+            mc.LDR_ri(loc_base.value, r.sp.value, 0)
+            mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+
+        if card_marking:
+            # The helper ends again with a check of the flag in the object.  So
+            # here, we can simply write again a conditional jump, which will be
+            # taken if GCFLAG_CARDS_SET is still not set.
+            jns_location = mc.currpos()
+            mc.BKPT()
+            #
+            # patch the JS above
+            offset = mc.currpos()
+            pmc = OverwritingBuilder(mc, js_location, WORD)
+            pmc.B_offs(offset, c.NE)  # We want to jump if the z flag isn't set
+            #
+            # case GCFLAG_CARDS_SET: emit a few instructions to do
+            # directly the card flag setting
+            loc_index = arglocs[1]
+            assert loc_index.is_core_reg()
+            # must save the register loc_index before it is mutated
+            mc.PUSH([loc_index.value])
+            tmp1 = loc_index
+            tmp2 = arglocs[-1]  # the last item is a preallocated tmp
+            # lr = byteofs
+            s = 3 + descr.jit_wb_card_page_shift
+            mc.MVN_rr(r.lr.value, loc_index.value,
+                                       imm=s, shifttype=shift.LSR)
+
+            # tmp1 = byte_index
+            mc.MOV_ri(r.ip.value, imm=7)
+            mc.AND_rr(tmp1.value, r.ip.value, loc_index.value,
+            imm=descr.jit_wb_card_page_shift, shifttype=shift.LSR)
+
+            # set the bit
+            mc.MOV_ri(tmp2.value, imm=1)
+            mc.LDRB_rr(r.ip.value, loc_base.value, r.lr.value)
+            mc.ORR_rr_sr(r.ip.value, r.ip.value, tmp2.value,
+                                          tmp1.value, shifttype=shift.LSL)
+            mc.STRB_rr(r.ip.value, loc_base.value, r.lr.value)
+            # done
+            mc.POP([loc_index.value])
+            #
+            #
+            # patch the JNS above
+            offset = mc.currpos()
+            pmc = OverwritingBuilder(mc, jns_location, WORD)
+            pmc.B_offs(offset, c.EQ)  # We want to jump if the z flag is set
+
+        offset = mc.currpos() - jz_location
+        pmc = OverwritingBuilder(mc, jz_location, WORD)
+        pmc.B_ofs_cond(offset, c.EQ)
+
     # ----------------------------- call ------------------------------
 
     def _genop_call(self, op, arglocs):
diff --git a/rpython/jit/backend/aarch64/regalloc.py b/rpython/jit/backend/aarch64/regalloc.py
--- a/rpython/jit/backend/aarch64/regalloc.py
+++ b/rpython/jit/backend/aarch64/regalloc.py
@@ -800,6 +800,15 @@
                                  src_locations2, dst_locations2, vfptmploc)
         return []
 
+    def prepare_op_cond_call_gc_wb(self, op):
+        # we force all arguments into registers because they will be needed
+        # anyway by the following gc_store; this avoids loading them twice from memory.
+        N = op.numargs()
+        args = op.getarglist()
+        arglocs = [self.make_sure_var_in_reg(op.getarg(i), args)
+                                                              for i in range(N)]
+        return arglocs
+
     def force_allocate_reg(self, var, forbidden_vars=[], selected_reg=None):
         if var.type == FLOAT:
             return self.vfprm.force_allocate_reg(var, forbidden_vars,


More information about the pypy-commit mailing list