[pypy-commit] pypy s390x-backend: rewrote many calls to use one fewer stack frame

plan_rich pypy.commits at gmail.com
Mon Jan 25 07:33:03 EST 2016


Author: Richard Plangger <planrichi at gmail.com>
Branch: s390x-backend
Changeset: r81928:6a1b2984c003
Date: 2016-01-25 13:31 +0100
http://bitbucket.org/pypy/pypy/changeset/6a1b2984c003/

Log:	rewrote many calls to use one fewer stack frame

diff --git a/rpython/jit/backend/zarch/arch.py b/rpython/jit/backend/zarch/arch.py
--- a/rpython/jit/backend/zarch/arch.py
+++ b/rpython/jit/backend/zarch/arch.py
@@ -34,7 +34,8 @@
 # in reverse order to SP
 
 STD_FRAME_SIZE_IN_BYTES = 160
-THREADLOCAL_ADDR_OFFSET = 16 # at position of r2, but r2 is never saved!!
+THREADLOCAL_ON_ENTER_JIT = 8
+THREADLOCAL_ADDR_OFFSET = STD_FRAME_SIZE_IN_BYTES + THREADLOCAL_ON_ENTER_JIT
 
 assert STD_FRAME_SIZE_IN_BYTES % 2 == 0
 
diff --git a/rpython/jit/backend/zarch/assembler.py b/rpython/jit/backend/zarch/assembler.py
--- a/rpython/jit/backend/zarch/assembler.py
+++ b/rpython/jit/backend/zarch/assembler.py
@@ -16,7 +16,8 @@
 from rpython.jit.backend.zarch.arch import (WORD,
         STD_FRAME_SIZE_IN_BYTES, THREADLOCAL_ADDR_OFFSET,
         RECOVERY_GCMAP_POOL_OFFSET, RECOVERY_TARGET_POOL_OFFSET,
-        JUMPABS_TARGET_ADDR__POOL_OFFSET, JUMPABS_POOL_ADDR_POOL_OFFSET)
+        JUMPABS_TARGET_ADDR__POOL_OFFSET, JUMPABS_POOL_ADDR_POOL_OFFSET,
+        THREADLOCAL_ON_ENTER_JIT)
 from rpython.jit.backend.zarch.opassembler import OpAssembler
 from rpython.jit.backend.zarch.regalloc import Regalloc
 from rpython.jit.codewriter.effectinfo import EffectInfo
@@ -382,7 +383,7 @@
         """
         # signature of these cond_call_slowpath functions:
         #   * on entry, r12 contains the function to call
-        #   * r3, r4, r5, r6 contain arguments for the call
+        #   * r2, r3, r4, r5 contain arguments for the call
         #   * r0 is the gcmap
         #   * the old value of these regs must already be stored in the jitframe
         #   * on exit, all registers are restored from the jitframe
@@ -391,6 +392,8 @@
         self.mc = mc
         ofs2 = self.cpu.get_ofs_of_frame_field('jf_gcmap')
         mc.STG(r.SCRATCH2, l.addr(ofs2,r.SPP))
+        mc.STMG(r.r14,r.r15,l.addr(14*WORD, r.SP))
+        mc.push_std_frame()
 
         # copy registers to the frame, with the exception of r3 to r6 and r12,
         # because these have already been saved by the caller.  Note that
@@ -406,21 +409,21 @@
                        reg is not r.r4 and
                        reg is not r.r5 and
                        reg is not r.r12]
-        self._push_core_regs_to_jitframe(mc, regs + [r.r14])
+        self._push_core_regs_to_jitframe(mc, regs)
         if supports_floats:
             self._push_fp_regs_to_jitframe(mc)
 
         # allocate a stack frame!
-        mc.push_std_frame()
         mc.raw_call(r.r12)
-        mc.pop_std_frame()
 
         # Finish
         self._reload_frame_if_necessary(mc)
 
-        self._pop_core_regs_from_jitframe(mc, saved_regs + [r.r14])
+        self._pop_core_regs_from_jitframe(mc, saved_regs)
         if supports_floats:
             self._pop_fp_regs_from_jitframe(mc)
+        size = STD_FRAME_SIZE_IN_BYTES
+        mc.LMG(r.r14, r.r15, l.addr(size+14*WORD, r.SP))
         mc.BCR(c.ANY, r.RETURN)
         self.mc = None
         return mc.materialize(self.cpu, [])
@@ -446,8 +449,11 @@
         mc.STG(r.SCRATCH, l.addr(ofs2, r.SPP))
         saved_regs = [reg for reg in r.MANAGED_REGS
                           if reg is not r.RES and reg is not r.RSZ]
-        self._push_core_regs_to_jitframe(mc, saved_regs + [r.r14])
+        self._push_core_regs_to_jitframe(mc, saved_regs)
         self._push_fp_regs_to_jitframe(mc)
+        # alloc a frame for the callee
+        mc.STMG(r.r14, r.r15, l.addr(14*WORD, r.SP))
+        mc.push_std_frame()
         #
         if kind == 'fixed':
             addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
@@ -478,10 +484,8 @@
 
         # Do the call
         addr = rffi.cast(lltype.Signed, addr)
-        mc.push_std_frame()
         mc.load_imm(mc.RAW_CALL_REG, addr)
         mc.raw_call()
-        mc.pop_std_frame()
 
         self._reload_frame_if_necessary(mc)
 
@@ -490,7 +494,7 @@
         # emit_call_malloc_gc()).
         self.propagate_memoryerror_if_r2_is_null()
 
-        self._pop_core_regs_from_jitframe(mc, saved_regs + [r.r14])
+        self._pop_core_regs_from_jitframe(mc, saved_regs)
         self._pop_fp_regs_from_jitframe(mc)
 
         nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
@@ -501,6 +505,8 @@
         # r.RSZ is loaded from [SCRATCH], to make the caller's store a no-op here
         mc.load(r.RSZ, r.r1, 0)
         #
+        size = STD_FRAME_SIZE_IN_BYTES
+        mc.LMG(r.r14, r.r15, l.addr(size+14*WORD, r.SP))
         mc.BCR(c.ANY, r.r14)
         self.mc = None
         return mc.materialize(self.cpu, [])
@@ -517,7 +523,7 @@
         mc = InstrBuilder()
         #
         # store the link backwards
-        self.mc.STMG(r.r14, r.r15, l.addr(14*WORD, r.SP))
+        mc.STMG(r.r14, r.r15, l.addr(14*WORD, r.SP))
         mc.push_std_frame()
 
         mc.LGR(r.r2, r.SP)
@@ -532,7 +538,7 @@
         mc.cmp_op(r.SCRATCH, l.imm(0), imm=True)
         #
         size = STD_FRAME_SIZE_IN_BYTES
-        self.mc.LMG(r.r14, r.r15, l.addr(size+14*WORD, r.SP)) # restore the link
+        mc.LMG(r.r14, r.r15, l.addr(size+14*WORD, r.SP)) # restore the link
         # So we return to our caller, conditionally if "EQ"
         mc.BCR(c.EQ, r.r14)
         mc.trap() # debug if this is EVER executed!
@@ -590,11 +596,11 @@
         # LGHI r0, ... (4  bytes)
         #       sum -> (14 bytes)
         mc.write('\x00'*14)
-        self.mc.push_std_frame()
+        mc.push_std_frame()
         mc.load_imm(r.RETURN, self._frame_realloc_slowpath)
         self.load_gcmap(mc, r.r1, gcmap)
         mc.raw_call()
-        self.mc.pop_std_frame()
+        mc.pop_std_frame()
 
         self.frame_depth_to_patch.append((patch_pos, mc.currpos()))
 
@@ -1006,8 +1012,8 @@
         # save the back chain
         self.mc.STG(r.SP, l.addr(0, r.SP))
 
-        # save r3, the second argument, to THREADLOCAL_ADDR_OFFSET
-        self.mc.STG(r.r3, l.addr(THREADLOCAL_ADDR_OFFSET, r.SP))
+        # save r3, the second argument, to the thread local position
+        self.mc.STG(r.r3, l.addr(THREADLOCAL_ON_ENTER_JIT, r.SP))
 
         # push a standard frame for any call
         self.mc.push_std_frame()
@@ -1418,9 +1424,7 @@
             raise AssertionError(kind)
         #
         # call!
-        mc.push_std_frame()
         mc.branch_absolute(addr)
-        mc.pop_std_frame()
 
         jmp_location = mc.currpos()
         mc.reserve_cond_jump(short=True)      # jump forward, patched later
diff --git a/rpython/jit/backend/zarch/callbuilder.py b/rpython/jit/backend/zarch/callbuilder.py
--- a/rpython/jit/backend/zarch/callbuilder.py
+++ b/rpython/jit/backend/zarch/callbuilder.py
@@ -62,6 +62,7 @@
         # called function will in turn call further functions (which must be passed the
         # address of the new frame). This stack grows downwards from high addresses
         # """
+        self.subtracted_to_sp = 0
 
         gpr_regs = 0
         fpr_regs = 0
@@ -83,18 +84,18 @@
                     stack_params.append(i)
 
         self.subtracted_to_sp += len(stack_params) * WORD
-        base = -len(stack_params) * WORD
+        base = len(stack_params) * WORD
         if self.is_call_release_gil:
             self.subtracted_to_sp += 8*WORD
-            base -= 8*WORD
-        # one additional owrd for remap frame layout
+            base += 8*WORD
+        # one additional word for remap frame layout
         # regalloc_push will overwrite -8(r.SP) and destroy
         # a parameter if we would not reserve that space
-        base -= WORD
-        self.subtracted_to_sp += WORD
+        # base += WORD
+        # TODO self.subtracted_to_sp += WORD
         for idx,i in enumerate(stack_params):
             loc = arglocs[i]
-            offset = base + 8 * idx
+            offset = STD_FRAME_SIZE_IN_BYTES - base + 8 * idx
             if loc.type == FLOAT:
                 if loc.is_fp_reg():
                     src = loc
@@ -148,15 +149,23 @@
     def emit_raw_call(self):
         # always allocate a stack frame for the new function
         # save the SP back chain
-        self.mc.STG(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
+        #self.mc.STG(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
         # move the frame pointer
         if self.subtracted_to_sp != 0:
             self.mc.LAY(r.SP, l.addr(-self.subtracted_to_sp, r.SP))
         self.mc.raw_call()
+
+
+    def restore_stack_pointer(self):
+        # it must at LEAST be 160 bytes
+        if self.subtracted_to_sp != 0:
+            self.mc.LAY(r.SP, l.addr(self.subtracted_to_sp, r.SP))
+
+    def load_result(self):
+        assert (self.resloc is None or
+                self.resloc is r.GPR_RETURN or
+                self.resloc is r.FPR_RETURN)
         #
-        self.ensure_correct_signzero_extension()
-
-    def ensure_correct_signzero_extension(self):
         if self.restype == 'i' and self.ressize != WORD:
             # we must be sure! libffi (s390x impl) will not return
             # a sane 64 bit zero/sign extended value. fix for this
@@ -177,25 +186,14 @@
                 else:
                     assert 0, "cannot zero extend size %d" % self.ressize
 
-
-    def restore_stack_pointer(self):
-        # it must at LEAST be 160 bytes
-        if self.subtracted_to_sp != 0:
-            self.mc.LAY(r.SP, l.addr(self.subtracted_to_sp, r.SP))
-
-    def load_result(self):
-        assert (self.resloc is None or
-                self.resloc is r.GPR_RETURN or
-                self.resloc is r.FPR_RETURN)
-
-
     def call_releasegil_addr_and_move_real_arguments(self, fastgil):
         assert self.is_call_release_gil
         RSHADOWOLD = self.RSHADOWOLD
         RSHADOWPTR = self.RSHADOWPTR
         RFASTGILPTR = self.RFASTGILPTR
         #
-        self.mc.STMG(r.r8, r.r13, l.addr(-7*WORD, r.SP))
+        pos = STD_FRAME_SIZE_IN_BYTES - 7*WORD
+        self.mc.STMG(r.r8, r.r13, l.addr(pos, r.SP))
         # 6 registers, 1 for a floating point return value!
         # registered by prepare_arguments!
         #
@@ -268,26 +266,27 @@
         PARAM_SAVE_AREA_OFFSET = 0
         if reg is not None:
             # save 1 word below the stack pointer
+            pos = STD_FRAME_SIZE_IN_BYTES
             if reg.is_core_reg():
-                self.mc.STG(reg, l.addr(-1*WORD, r.SP))
+                self.mc.STG(reg, l.addr(pos-1*WORD, r.SP))
             elif reg.is_fp_reg():
-                self.mc.STD(reg, l.addr(-1*WORD, r.SP))
-        self.mc.push_std_frame(8*WORD)
+                self.mc.STD(reg, l.addr(pos-1*WORD, r.SP))
         self.mc.load_imm(self.mc.RAW_CALL_REG, self.asm.reacqgil_addr)
         self.mc.raw_call()
-        self.mc.pop_std_frame(8*WORD)
         if reg is not None:
+            pos = STD_FRAME_SIZE_IN_BYTES
             if reg.is_core_reg():
-                self.mc.LG(reg, l.addr(-1*WORD, r.SP))
+                self.mc.LG(reg, l.addr(pos-1*WORD, r.SP))
             elif reg.is_fp_reg():
-                self.mc.LD(reg, l.addr(-1*WORD, r.SP))
+                self.mc.LD(reg, l.addr(pos-1*WORD, r.SP))
 
         # replace b1_location with BEQ(here)
         pmc = OverwritingBuilder(self.mc, b1_location, 1)
         pmc.BRCL(c.EQ, l.imm(self.mc.currpos() - b1_location))
         pmc.overwrite()
 
-        self.mc.LMG(r.r8, r.r13, l.addr(-7*WORD, r.SP))
+        pos = STD_FRAME_SIZE_IN_BYTES - 7*WORD
+        self.mc.LMG(r.r8, r.r13, l.addr(pos, r.SP))
 
     def write_real_errno(self, save_err):
         if save_err & rffi.RFFI_READSAVED_ERRNO:
diff --git a/rpython/jit/backend/zarch/codebuilder.py b/rpython/jit/backend/zarch/codebuilder.py
--- a/rpython/jit/backend/zarch/codebuilder.py
+++ b/rpython/jit/backend/zarch/codebuilder.py
@@ -198,7 +198,7 @@
         function pointer, which means on big-endian that it is actually
         the address of a three-words descriptor.
         """
-        self.BASR(r.RETURN, call_reg)
+        self.BASR(r.r14, call_reg)
 
     def reserve_cond_jump(self, short=False):
         self.trap()        # conditional jump, patched later
diff --git a/rpython/jit/backend/zarch/opassembler.py b/rpython/jit/backend/zarch/opassembler.py
--- a/rpython/jit/backend/zarch/opassembler.py
+++ b/rpython/jit/backend/zarch/opassembler.py
@@ -530,11 +530,7 @@
             mc.LGR(r.r0, loc_base)    # unusual argument location
 
         mc.load_imm(r.r14, self.wb_slowpath[helper_num])
-        # alloc a stack frame
-        mc.push_std_frame()
         mc.BASR(r.r14, r.r14)
-        # destory the frame
-        mc.pop_std_frame()
 
         if card_marking_mask:
             # The helper ends again with a check of the flag in the object.


More information about the pypy-commit mailing list