[pypy-commit] pypy arm64: zero_array, increment debug counter and exception handling

fijal pypy.commits at gmail.com
Tue Jun 25 07:13:38 EDT 2019


Author: Maciej Fijalkowski <fijall at gmail.com>
Branch: arm64
Changeset: r96857:c89b27574833
Date: 2019-06-25 11:12 +0000
http://bitbucket.org/pypy/pypy/changeset/c89b27574833/

Log:	zero_array, increment debug counter and exception handling

diff --git a/rpython/jit/backend/aarch64/assembler.py b/rpython/jit/backend/aarch64/assembler.py
--- a/rpython/jit/backend/aarch64/assembler.py
+++ b/rpython/jit/backend/aarch64/assembler.py
@@ -405,6 +405,7 @@
         mc.LDP_rri(r.x0.value, r.x1.value, r.sp.value, 0)
         
         mc.STR_ri(r.lr.value, r.sp.value, 0)
+        mc.STR_ri(r.x19.value, r.sp.value, WORD)
 
         # store the current gcmap(r0) in the jitframe
         gcmap_ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
@@ -414,8 +415,7 @@
         mc.MOV_rr(r.x0.value, r.fp.value)
 
         # store a possibly present exception
-        # we use a callee saved reg here as a tmp for the exc.
-        self._store_and_reset_exception(mc, None, r.ip1, on_frame=True)
+        self._store_and_reset_exception(mc, None, r.x19, on_frame=True)
 
         # call realloc_frame, it takes two arguments
         # arg0: the old jitframe
@@ -427,7 +427,7 @@
         mc.MOV_rr(r.fp.value, r.x0.value)
 
         # restore a possibly present exception
-        self._restore_exception(mc, None, r.ip1)
+        self._restore_exception(mc, None, r.x19)
 
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         if gcrootmap and gcrootmap.is_shadow_stack:
@@ -445,6 +445,7 @@
 
         # return
         mc.LDR_ri(r.lr.value, r.sp.value, 0)
+        mc.LDR_ri(r.x19.value, r.sp.value, WORD)
         mc.ADD_ri(r.sp.value, r.sp.value, 2*WORD)
         mc.RET_r(r.lr.value)
         self._frame_realloc_slowpath = mc.materialize(self.cpu, [])        
diff --git a/rpython/jit/backend/aarch64/codebuilder.py b/rpython/jit/backend/aarch64/codebuilder.py
--- a/rpython/jit/backend/aarch64/codebuilder.py
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -60,6 +60,15 @@
         base = 0b11100100
         self.write32((scale << 30) | (base << 22) | (imm >> scale << 10) | (rn << 5) | rt)
 
+    def STRB_ri(self, rt, rn, imm):
+        self.STR_size_ri(0, rt, rn, imm)
+
+    def STRH_ri(self, rt, rn, imm):
+        self.STR_size_ri(1, rt, rn, imm)
+
+    def STRW_ri(self, rt, rn, imm):
+        self.STR_size_ri(2, rt, rn, imm)
+
     def MOV_rr(self, rd, rn):
         self.ORR_rr(rd, r.xzr.value, rn)
 
diff --git a/rpython/jit/backend/aarch64/opassembler.py b/rpython/jit/backend/aarch64/opassembler.py
--- a/rpython/jit/backend/aarch64/opassembler.py
+++ b/rpython/jit/backend/aarch64/opassembler.py
@@ -249,7 +249,6 @@
             raise AssertionError("bad number of bytes")
 
     def emit_op_increment_debug_counter(self, op, arglocs):
-        return # XXXX
         base_loc, value_loc = arglocs
         self.mc.LDR_ri(value_loc.value, base_loc.value, 0)
         self.mc.ADD_ri(value_loc.value, value_loc.value, 1)
@@ -635,6 +634,111 @@
         self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs,
                                      array=True)
 
+    # adapted from ../x86/regalloc.py:1388
+    def emit_op_zero_array(self, op, arglocs):
+        from rpython.jit.backend.llsupport.descr import unpack_arraydescr
+        assert len(arglocs) == 0
+        size_box = op.getarg(2)
+        if isinstance(size_box, ConstInt) and size_box.getint() == 0:
+            return
+        itemsize, baseofs, _ = unpack_arraydescr(op.getdescr())
+        args = op.getarglist()
+        #
+        # ZERO_ARRAY(base_loc, start, size, 1, 1)
+        # 'start' and 'size' are both expressed in bytes,
+        # and the two scaling arguments should always be ConstInt(1) on ARM.
+        assert args[3].getint() == 1
+        assert args[4].getint() == 1
+        #
+        base_loc = self._regalloc.rm.make_sure_var_in_reg(args[0], args)
+        startbyte_box = args[1]
+        if isinstance(startbyte_box, ConstInt):
+            startbyte_loc = None
+            startbyte = startbyte_box.getint()
+            assert startbyte >= 0
+        else:
+            startbyte_loc = self._regalloc.rm.make_sure_var_in_reg(startbyte_box,
+                                                                   args)
+            startbyte = -1
+
+        # base_loc and startbyte_loc are in two regs here (or startbyte_loc
+        # is an immediate).  Compute the dstaddr_loc, which is the raw
+        # address that we will pass as first argument to memset().
+        # It can be in the same register as either one, but not in
+        # args[2], because we're still needing the latter.
+        dstaddr_loc = r.ip1
+        if startbyte >= 0:    # a constant
+            ofs = baseofs + startbyte
+            reg = base_loc.value
+        else:
+            self.mc.ADD_rr(dstaddr_loc.value,
+                           base_loc.value, startbyte_loc.value)
+            ofs = baseofs
+            reg = dstaddr_loc.value
+        if check_imm_arg(ofs):
+            self.mc.ADD_ri(dstaddr_loc.value, reg, ofs)
+        else:
+            self.mc.gen_load_int(r.ip0.value, ofs)
+            self.mc.ADD_rr(dstaddr_loc.value, reg, r.ip0.value)
+
+        # We use STRB, STRH, STRW or STR based on whether we know the array
+        # item size is a multiple of 1, 2 or 4.
+        if   itemsize & 1: itemsize = 1
+        elif itemsize & 2: itemsize = 2
+        elif itemsize & 4: itemsize = 4
+        else:              itemsize = 8
+        limit = itemsize
+        next_group = -1
+        if itemsize < 8 and startbyte >= 0:
+            # we optimize STRB/STRH into STR, but this needs care:
+            # it only works if the start byte is a constant, otherwise
+            # we'd be doing unaligned accesses.
+            next_group = (-startbyte) & 7
+            limit = 8
+
+        if (isinstance(size_box, ConstInt) and
+                size_box.getint() <= 14 * limit):     # same limit as GCC
+            # Inline a series of STR operations, starting at 'dstaddr_loc'.
+            #
+            self.mc.gen_load_int(r.ip0.value, 0)
+            i = 0
+            adjustment = 0
+            needs_adjustment = itemsize < 8 and (startbyte % 8)
+            total_size = size_box.getint()
+            while i < total_size:
+                sz = itemsize
+                if i == next_group:
+                    next_group += 8
+                    if next_group <= total_size:
+                        sz = 8
+                if sz == 8:
+                    if needs_adjustment:
+                        self.mc.ADD_ri(dstaddr_loc.value, dstaddr_loc.value, i)
+                        adjustment = -i
+                        needs_adjustment = False
+                    self.mc.STR_ri(r.ip0.value, dstaddr_loc.value, i + adjustment)                    
+                elif sz == 4:
+                    self.mc.STRW_ri(r.ip0.value, dstaddr_loc.value, i + adjustment)
+                elif sz == 2:
+                    self.mc.STRH_ri(r.ip0.value, dstaddr_loc.value, i + adjustment)
+                else:
+                    self.mc.STRB_ri(r.ip0.value, dstaddr_loc.value, i + adjustment)
+                i += sz
+
+        else:
+            if isinstance(size_box, ConstInt):
+                size_loc = self.imm(size_box.getint())
+            else:
+                # load size_loc in a register different than dstaddr_loc
+                size_loc = self._regalloc.rm.make_sure_var_in_reg(size_box,
+                                                            [])
+            #
+            # call memset()
+            self._regalloc.before_call()
+            self.simple_call_no_collect(self.imm(self.memset_addr),
+                                        [dstaddr_loc, self.imm(0), size_loc])
+            self._regalloc.rm.possibly_free_var(size_box)
+
     def _emit_op_cond_call(self, op, arglocs, fcond):
         if len(arglocs) == 2:
             res_loc = arglocs[1]     # cond_call_value
@@ -859,6 +963,10 @@
                                      result_size)
         cb.emit()
 
+    def simple_call_no_collect(self, fnloc, arglocs):
+        cb = Aarch64CallBuilder(self, fnloc, arglocs)
+        cb.emit_no_collect()
+
     def emit_guard_op_guard_not_forced(self, op, guard_op, fcond, arglocs):
         # arglocs is call locs + guard_locs, split them
         if rop.is_call_assembler(op.getopnum()):
diff --git a/rpython/jit/backend/aarch64/regalloc.py b/rpython/jit/backend/aarch64/regalloc.py
--- a/rpython/jit/backend/aarch64/regalloc.py
+++ b/rpython/jit/backend/aarch64/regalloc.py
@@ -313,6 +313,7 @@
     prepare_op_jit_debug = void
     prepare_op_enter_portal_frame = void
     prepare_op_leave_portal_frame = void
+    prepare_op_zero_array = void # dealt with in opassembler.py
 
     def prepare_int_ri(self, op, res_in_cc):
         boxes = op.getarglist()
@@ -648,6 +649,10 @@
         resloc = self.after_call(op)
         return resloc
 
+    def before_call(self, save_all_regs=False):
+        self.rm.before_call(save_all_regs=save_all_regs)
+        self.vfprm.before_call(save_all_regs=save_all_regs)
+
     def after_call(self, v):
         if v.type == 'v':
             return


More information about the pypy-commit mailing list