[pypy-commit] pypy arm64: enough scaffolding to actually run the first loop test. It does not pass yet, but it does run!

Wed Mar 6 05:33:01 EST 2019

Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: arm64
Changeset: r96214:173ed1a7572d
Date: 2019-03-06 10:32 +0000
http://bitbucket.org/pypy/pypy/changeset/173ed1a7572d/

Log:	enough scaffolding to actually run the first loop test. It does not
	pass yet, but it does run!

diff --git a/rpython/jit/backend/aarch64/assembler.py b/rpython/jit/backend/aarch64/assembler.py
--- a/rpython/jit/backend/aarch64/assembler.py
+++ b/rpython/jit/backend/aarch64/assembler.py
@@ -9,6 +9,7 @@
 #    CoreRegisterManager, check_imm_arg, VFPRegisterManager,
 #from rpython.jit.backend.arm import callbuilder
 from rpython.jit.backend.aarch64 import registers as r
+from rpython.jit.backend.arm import conditions as c
 from rpython.jit.backend.llsupport import jitframe
 from rpython.jit.backend.llsupport.assembler import BaseAssembler
 from rpython.jit.backend.llsupport.regalloc import get_scale, valid_addressing_size
@@ -113,6 +114,7 @@
 
     def setup(self, looptoken):
         BaseAssembler.setup(self, looptoken)
+        self.failure_recovery_code = [0, 0, 0, 0]
         assert self.memcpy_addr != 0, 'setup_once() not called?'
         if we_are_translated():
             self.debug = False
@@ -135,7 +137,48 @@
         self.pending_guards = None
 
     def _build_failure_recovery(self, exc, withfloats=False):
-        pass # XXX
+        return # XXX
+        mc = InstrBuilder()
+        self._push_all_regs_to_jitframe(mc, [], withfloats)
+
+        if exc:
+            XXX
+            # We might have an exception pending.  Load it into r4
+            # (this is a register saved across calls)
+            mc.gen_load_int(r.r5.value, self.cpu.pos_exc_value())
+            mc.LDR_ri(r.r4.value, r.r5.value)
+            # clear the exc flags
+            mc.gen_load_int(r.r6.value, 0)
+            mc.STR_ri(r.r6.value, r.r5.value) # pos_exc_value is still in r5
+            mc.gen_load_int(r.r5.value, self.cpu.pos_exception())
+            mc.STR_ri(r.r6.value, r.r5.value)
+            # save r4 into 'jf_guard_exc'
+            offset = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
+            assert check_imm_arg(abs(offset))
+            mc.STR_ri(r.r4.value, r.fp.value, imm=offset)
+        # now we return from the complete frame, which starts from
+        # _call_header_with_stack_check().  The LEA in _call_footer below
+        # throws away most of the frame, including all the PUSHes that we
+        # did just above.
+        ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
+        assert check_imm_arg(abs(ofs))
+        ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
+        assert check_imm_arg(abs(ofs2))
+        base_ofs = self.cpu.get_baseofs_of_frame_field()
+        # store the gcmap
+        mc.POP([r.ip.value])
+        mc.STR_ri(r.ip.value, r.fp.value, imm=ofs2)
+        # store the descr
+        mc.POP([r.ip.value])
+        mc.STR_ri(r.ip.value, r.fp.value, imm=ofs)
+
+        # set return value
+        assert check_imm_arg(base_ofs)
+        mc.MOV_rr(r.r0.value, r.fp.value)
+        #
+        self.gen_func_epilog(mc)
+        rawstart = mc.materialize(self.cpu, [])
+        self.failure_recovery_code[exc + 2 * withfloats] = rawstart
 
     def _build_wb_slowpath(self, withcards, withfloats=False, for_frame=False):
         pass # XXX
@@ -159,8 +202,25 @@
         baseofs = self.cpu.get_baseofs_of_frame_field()
         self.current_clt.frame_info.update_frame_depth(baseofs, frame_depth)
 
+    def generate_quick_failure(self, guardtok):
+        startpos = self.mc.currpos()
+        faildescrindex, target = self.store_info_on_descr(startpos, guardtok)
+        self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+        self.load_from_gc_table(r.ip0.value, faildescrindex)
+        self.store_reg(self.mc, r.ip0, r.fp, WORD)
+        self.push_gcmap(self.mc, gcmap=guardtok.gcmap, ofs=0)
+        self.mc.BL(target)
+        return startpos
+
+    def push_gcmap(self, mc, gcmap, ofs):
+        ptr = rffi.cast(lltype.Signed, gcmap)
+        mc.gen_load_int(r.ip0.value, ptr)
+        self.store_reg(mc, r.ip0, r.fp, ofs)
+
     def write_pending_failure_recoveries(self):
-        pass # XXX
+        for tok in self.pending_guards:
+            #generate the exit stub and the encoded representation
+            tok.pos_recovery_stub = self.generate_quick_failure(tok)
 
     def reserve_gcref_table(self, allgcrefs):
         gcref_table_size = len(allgcrefs) * WORD
@@ -200,8 +260,25 @@
         #    self.codemap.get_final_bytecode(res, size))
         return res
 
-    def process_pending_guards(self, rawstart):
-        pass
+    def process_pending_guards(self, block_start):
+        clt = self.current_clt
+        for tok in self.pending_guards:
+            descr = tok.faildescr
+            assert isinstance(descr, AbstractFailDescr)
+            failure_recovery_pos = block_start + tok.pos_recovery_stub
+            descr.adr_jump_offset = failure_recovery_pos
+            relative_offset = tok.pos_recovery_stub - tok.offset
+            guard_pos = block_start + tok.offset
+            if not tok.guard_not_invalidated():
+                # patch the guard jump to the stub
+                # overwrite the generated BRK with a conditional branch
+                # (B_ofs_cond) to the position of the stub
+                mc = InstrBuilder()
+                mc.B_ofs_cond(relative_offset, c.get_opposite_of(tok.fcond))
+                mc.copy_to_raw_memory(guard_pos)
+            else:
+                XX
+                clt.invalidate_positions.append((guard_pos, relative_offset))
 
     def fixup_target_tokens(self, rawstart):
         for targettoken in self.target_tokens_currently_compiling:
@@ -277,10 +354,11 @@
             elif not we_are_translated() and op.getopnum() == rop.FORCE_SPILL:
                 regalloc.prepare_force_spill(op)
             elif i < len(operations) - 1 and regalloc.next_op_can_accept_cc(operations, i):
-                arglocs = guard_operations[operations[i + 1].getopnum()](
-                    regalloc, operations[i + 1], op)
+                guard_op = operations[i + 1]
+                guard_num = guard_op.getopnum()
+                arglocs, fcond = guard_operations[guard_num](regalloc, guard_op, op)
                 if arglocs is not None:
-                    xxx
+                    asm_guard_operations[guard_num](self, guard_op, fcond, arglocs)
                 regalloc.next_instruction() # advance one more
             else:
                 arglocs = regalloc_operations[opnum](regalloc, op)
@@ -302,8 +380,7 @@
         opnum = op.getopnum()
         arglocs = comp_operations[opnum](self._regalloc, op, True)
         assert arglocs is not None
-        asm_comp_operations[opnum](self, op, arglocs)
-        return arglocs
+        return asm_comp_operations[opnum](self, op, arglocs)
 
     # regalloc support
     def load(self, loc, value):
@@ -353,6 +430,14 @@
         # if save_helper:
         #     self.mc.POP([helper.value], cond=cond)
 
+    def _mov_reg_to_loc(self, prev_loc, loc):
+        if loc.is_core_reg():  # register -> register
+            self.mc.MOV_rr(loc.value, prev_loc.value)
+        elif loc.is_stack():  # spill: STR_ri(value_reg, base_reg, offset), base is fp
+            self.mc.STR_ri(prev_loc.value, r.fp.value, loc.value)
+        else:
+            XXX  # other destination kinds are not implemented yet
+
     def regalloc_mov(self, prev_loc, loc):
         """Moves a value from a previous location to some other location"""
         if prev_loc.is_imm():
@@ -420,6 +505,18 @@
         #    mc.gen_load_int(r.ip1, ofs)
         #    mc.STR_rr(source.value, base.value, r.ip1)
 
+    def check_frame_before_jump(self, target_token):
+        if target_token in self.target_tokens_currently_compiling:
+            return
+        if target_token._arm_clt is self.current_clt:
+            return
+        # We can have a frame coming from god knows where that's
+        # passed to a jump to another loop. Make sure it has the
+        # correct depth
+        expected_size = target_token._arm_clt.frame_info.jfi_frame_depth
+        self._check_frame_depth(self.mc, self._regalloc.get_gcmap(),
+                                expected_size=expected_size)
+
 
 def not_implemented(msg):
     msg = '[ARM/asm] %s\n' % msg
@@ -436,7 +533,12 @@
     print "[ARM/asm] %s not implemented" % op.getopname()
     raise NotImplementedError(op)
 
+def notimplemented_guard_op(self, op, fcond, arglocs):
+    print "[ARM/asm] %s not implemented" % op.getopname()
+    raise NotImplementedError(op)
+
 asm_operations = [notimplemented_op] * (rop._LAST + 1)
+asm_guard_operations = [notimplemented_guard_op] * (rop._LAST + 1)
 asm_comp_operations = [notimplemented_comp_op] * (rop._LAST + 1)
 asm_extra_operations = {}
 
@@ -449,6 +551,10 @@
         opname = name[len('emit_op_'):]
         num = getattr(rop, opname.upper())
         asm_operations[num] = value
+    elif name.startswith('emit_guard_op_'):
+        opname = name[len('emit_guard_op_'):]
+        num = getattr(rop, opname.upper())
+        asm_guard_operations[num] = value
     elif name.startswith('emit_comp_op_'):
         opname = name[len('emit_comp_op_'):]
         num = getattr(rop, opname.upper())
diff --git a/rpython/jit/backend/aarch64/codebuilder.py b/rpython/jit/backend/aarch64/codebuilder.py
--- a/rpython/jit/backend/aarch64/codebuilder.py
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -7,6 +7,7 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.tool.udir import udir
 
+PC_OFFSET = 8
 
 class AbstractAarch64Builder(object):
     def write32(self, word):
@@ -70,6 +71,11 @@
         self.write32((base << 22) | (constant << 10) |
                      (rn << 5) | rd)
 
+    def SUB_ri(self, rd, rn, constant):
+        base = 0b1101000100
+        assert 0 <= constant < 4096  # must fit the 12 bits between bit 10 and bit 22
+        self.write32((base << 22) | (constant << 10) | (rn << 5) | rd)
+
     def LDP_rri(self, reg1, reg2, rn, offset):
         base = 0b1010100101
         assert -512 <= offset < 512
@@ -108,6 +114,34 @@
         base = 0b11101011000
         self.write32((base << 21) | (rm << 16) | (rn << 5) | 0b11111)
 
+    def B_ofs(self, ofs):
+        """Emit B to `ofs`, a position in the same coordinates as currpos()."""
+        base = 0b000101
+        assert ofs & 0x3 == 0
+        # AArch64 branches are relative to the branch itself: no ARM32-style +8
+        target_ofs = ofs - self.currpos()
+        assert -(1 << 27) <= target_ofs < (1 << 27)  # signed imm26, in words
+        # mask so a backward (negative) offset is encoded as two's complement
+        self.write32((base << 26) | ((target_ofs >> 2) & 0x3FFFFFF))
+
+    def B_ofs_cond(self, ofs, cond):
+        """Emit B.<cond> with byte offset `ofs` from this instruction."""
+        base = 0b01010100
+        assert ofs & 0x3 == 0
+        assert -1 << 10 < ofs < 1 << 10
+        # imm19 is a signed word offset; mask so negatives are two's complement
+        imm = (ofs >> 2) & 0x7FFFF
+        self.write32((base << 24) | (imm << 5) | cond)
+
+    def BL(self, target):
+        target = rffi.cast(lltype.Signed, target)  # convert to a Signed integer for gen_load_int
+        self.gen_load_int(r.ip0.value, target)  # ip0 holds the target value for the branch below
+        self.BR(r.ip0.value)  # NOTE(review): emits BR, not a branch-with-link, despite the name -- confirm intended
+
+    def BR(self, reg):
+        base = 0b1101011000011111000000  # fixed opcode bits, placed above bit 10
+        self.write32((base << 10) | (reg << 5))  # register number goes in at bit 5
+
     def BRK(self):
         self.write32(0b11010100001 << 21)
 
@@ -116,9 +150,12 @@
         register"""
         # XXX optimize!
         self.MOVZ_r_u16(r, value & 0xFFFF, 0)
-        self.MOVK_r_u16(r, (value >> 16) & 0xFFFF, 16)
-        self.MOVK_r_u16(r, (value >> 32) & 0xFFFF, 32)
-        self.MOVK_r_u16(r, (value >> 48) & 0xFFFF, 48)
+        value = value >> 16
+        shift = 16
+        while value:
+            self.MOVK_r_u16(r, (value >> 16) & 0xFFFF, shift)
+            shift += 16
+            value >>= 16
 
 
 class InstrBuilder(BlockBuilderMixin, AbstractAarch64Builder):
diff --git a/rpython/jit/backend/aarch64/locations.py b/rpython/jit/backend/aarch64/locations.py
--- a/rpython/jit/backend/aarch64/locations.py
+++ b/rpython/jit/backend/aarch64/locations.py
@@ -47,7 +47,6 @@
         return True
 
     def as_key(self):       # 0 <= as_key <= 30, 31 being zero register
-        xxx
         return self.value
 
 class VFPRegisterLocation(RegisterLocation):
@@ -64,7 +63,6 @@
         return True
 
     def as_key(self):            # 40 <= as_key <= 71
-        xxx
         return self.value + 40
 
     def is_float(self):
@@ -110,7 +108,6 @@
         return True
 
     def as_key(self):                # an aligned word + 10000
-        XXX
         return self.position + 10000
 
     def is_float(self):
@@ -127,7 +124,10 @@
         return "xzr"
 
     def as_key(self):
-        return 31
+        raise ValueError("should never make it to jump")
+
+def imm(i):
+    return ImmLocation(i)
 
 def get_fp_offset(base_ofs, position):
     return base_ofs + WORD * (position + JITFRAME_FIXED_SIZE)
diff --git a/rpython/jit/backend/aarch64/opassembler.py b/rpython/jit/backend/aarch64/opassembler.py
--- a/rpython/jit/backend/aarch64/opassembler.py
+++ b/rpython/jit/backend/aarch64/opassembler.py
@@ -2,8 +2,12 @@
 from rpython.jit.metainterp.history import (AbstractFailDescr, ConstInt,
                                             INT, FLOAT, REF)
 from rpython.jit.backend.aarch64 import registers as r
-from rpython.jit.backend.arm import conditions as c # yes, arm, not aarch64
+from rpython.jit.backend.arm import conditions as c
+from rpython.jit.backend.aarch64.arch import JITFRAME_FIXED_SIZE
 from rpython.jit.backend.llsupport.assembler import GuardToken, BaseAssembler
+from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
+from rpython.jit.metainterp.history import TargetToken
+
 
 class ResOpAssembler(BaseAssembler):
     def emit_op_int_add(self, op, arglocs):
@@ -34,7 +38,13 @@
         else:
             self.mc.CMP_rr(l0.value, l1.value)
 
-    emit_comp_op_int_le = emit_int_comp_op
+    def emit_comp_op_int_lt(self, op, arglocs):
+        self.emit_int_comp_op(op, arglocs)
+        return c.LT
+
+    def emit_comp_op_int_le(self, op, arglocs):
+        self.emit_int_comp_op(op, arglocs)
+        return c.LE
 
     def emit_op_increment_debug_counter(self, op, arglocs):
         return # XXXX
@@ -43,9 +53,52 @@
         self.mc.ADD_ri(value_loc.value, value_loc.value, 1)
         self.mc.STR_ri(value_loc.value, base_loc.value, 0)
 
+    def build_guard_token(self, op, frame_depth, arglocs, offset, fcond):
+        descr = op.getdescr()
+        assert isinstance(descr, AbstractFailDescr)
+
+        gcmap = allocate_gcmap(self, frame_depth, JITFRAME_FIXED_SIZE)
+        faildescrindex = self.get_gcref_from_faildescr(descr)
+        token = GuardToken(self.cpu, gcmap, descr,
+                                    failargs=op.getfailargs(),
+                                    fail_locs=arglocs,
+                                    guard_opnum=op.getopnum(),
+                                    frame_depth=frame_depth,
+                                    faildescrindex=faildescrindex)
+        token.fcond = fcond
+        return token
+
+    def _emit_guard(self, op, fcond, arglocs, is_guard_not_invalidated=False):
+        pos = self.mc.currpos()
+        token = self.build_guard_token(op, arglocs[0].value, arglocs[1:], pos,
+                                       fcond)
+        token.offset = pos
+        self.pending_guards.append(token)
+        assert token.guard_not_invalidated() == is_guard_not_invalidated
+        # For all guards that are not GUARD_NOT_INVALIDATED we emit a
+        # breakpoint to ensure the location is patched correctly. In the case
+        # of GUARD_NOT_INVALIDATED we use just a NOP, because it is only
+        # eventually patched at a later point.
+        if is_guard_not_invalidated:
+            self.mc.NOP()
+        else:
+            self.mc.BRK()
+
+    def emit_guard_op_guard_true(self, guard_op, fcond, arglocs):
+        self._emit_guard(guard_op, fcond, arglocs)
+
     def emit_op_label(self, op, arglocs):
         pass
 
+    def emit_op_jump(self, op, arglocs):
+        target_token = op.getdescr()
+        assert isinstance(target_token, TargetToken)
+        target = target_token._ll_loop_code
+        if target_token in self.target_tokens_currently_compiling:
+            self.mc.B_ofs(target)
+        else:
+            self.mc.B(target)
+
     def emit_op_finish(self, op, arglocs):
         base_ofs = self.cpu.get_baseofs_of_frame_field()
         if len(arglocs) > 0:
diff --git a/rpython/jit/backend/aarch64/regalloc.py b/rpython/jit/backend/aarch64/regalloc.py
--- a/rpython/jit/backend/aarch64/regalloc.py
+++ b/rpython/jit/backend/aarch64/regalloc.py
@@ -12,6 +12,8 @@
         get_scale
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.jit.backend.aarch64 import registers as r
+from rpython.jit.backend.arm.jump import remap_frame_layout_mixed
+from rpython.jit.backend.aarch64.locations import imm
 
 
 class TempInt(TempVar):
@@ -363,11 +365,54 @@
         return locs
 
     def prepare_guard_op_guard_true(self, op, prevop):
-        arglocs = self.assembler.dispatch_comparison(prevop)
-        xxx
+        fcond = self.assembler.dispatch_comparison(prevop)
+        # result is in CC
+
+        arglocs = [None] * (len(op.getfailargs()) + 1)
+        arglocs[0] = imm(self.frame_manager.get_frame_depth())
+        failargs = op.getfailargs()
+        for i in range(len(failargs)):
+            if failargs[i]:
+                arglocs[i + 1] = self.loc(failargs[i])
+        return arglocs, fcond
 
     prepare_op_nursery_ptr_increment = prepare_op_int_add
 
+    def prepare_op_jump(self, op):
+        assert self.jump_target_descr is None
+        descr = op.getdescr()
+        assert isinstance(descr, TargetToken)
+        self.jump_target_descr = descr
+        arglocs = descr._arm_arglocs
+
+        # get temporary locs
+        tmploc = r.ip0
+        vfptmploc = None # XXX r.vfp_ip
+
+        # Part about non-floats
+        src_locations1 = []
+        dst_locations1 = []
+        # Part about floats
+        src_locations2 = []
+        dst_locations2 = []
+
+        # Build the four lists
+        for i in range(op.numargs()):
+            box = op.getarg(i)
+            src_loc = self.loc(box)
+            dst_loc = arglocs[i]
+            if box.type != FLOAT:
+                src_locations1.append(src_loc)
+                dst_locations1.append(dst_loc)
+            else:
+                src_locations2.append(src_loc)
+                dst_locations2.append(dst_loc)
+        self.assembler.check_frame_before_jump(self.jump_target_descr)
+        remap_frame_layout_mixed(self.assembler,
+                                 src_locations1, dst_locations1, tmploc,
+                                 src_locations2, dst_locations2, vfptmploc)
+        return []
+
     def force_allocate_reg(self, var, forbidden_vars=[], selected_reg=None):
         if var.type == FLOAT:
             return self.vfprm.force_allocate_reg(var, forbidden_vars,
diff --git a/rpython/jit/backend/aarch64/runner.py b/rpython/jit/backend/aarch64/runner.py
--- a/rpython/jit/backend/aarch64/runner.py
+++ b/rpython/jit/backend/aarch64/runner.py
@@ -1,11 +1,14 @@
 
 from rpython.rtyper.lltypesystem import llmemory, lltype
 from rpython.jit.backend.aarch64.assembler import AssemblerARM64
+from rpython.jit.backend.aarch64 import registers as r
 from rpython.jit.backend.llsupport.llmodel import AbstractLLCPU
 
 class CPU_ARM64(AbstractLLCPU):
     """ARM 64"""
     backend_name = "aarch64"
+    frame_reg = r.fp
+    all_reg_indexes = range(len(r.all_regs))
 
     IS_64_BIT = True
 
diff --git a/rpython/jit/backend/aarch64/test/test_instr_builder.py b/rpython/jit/backend/aarch64/test/test_instr_builder.py
--- a/rpython/jit/backend/aarch64/test/test_instr_builder.py
+++ b/rpython/jit/backend/aarch64/test/test_instr_builder.py
@@ -11,6 +11,9 @@
     def writechar(self, char):
         self.buffer.append(char)
 
+    def currpos(self):
+        return 0
+
     def hexdump(self):
         return ''.join(self.buffer)
 
@@ -125,6 +128,15 @@
         assert cb.hexdump() == assemble("ADD %r, %r, %r" % (rd, rn, rm))
 
     @settings(max_examples=20)
+    @given(rd=st.sampled_from(r.registers),
+           rn=st.sampled_from(r.registers),
+           ofs=st.integers(min_value=0, max_value=4095))
+    def test_SUB_ri(self, rd, rn, ofs):
+        cb = CodeBuilder()
+        cb.SUB_ri(rd.value, rn.value, ofs)
+        assert cb.hexdump() == assemble("SUB %r, %r, %d" % (rd, rn, ofs))
+
+    @settings(max_examples=20)
     @given(rn=st.sampled_from(r.registers),
            rm=st.sampled_from(r.registers))
     def test_CMP_rr(self, rn, rm):
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -185,6 +185,11 @@
         """, namespace={'targettoken': targettoken,
                         'fdescr': BasicFailDescr(2)})
         self.cpu.compile_loop(loop.inputargs, loop.operations, looptoken)
+        print "ONE"
+        deadframe = self.cpu.execute_token(looptoken, 10)
+        print "TWO"
+        fail = self.cpu.get_latest_descr(deadframe)
+        assert fail.identifier == 2
         deadframe = self.cpu.execute_token(looptoken, 2)
         fail = self.cpu.get_latest_descr(deadframe)
         assert fail.identifier == 2