[pypy-svn] pypy jit-shadowstack: Work in progress. A bit hard to test individual changes :-(

arigo commits-noreply at bitbucket.org
Thu Mar 31 11:43:25 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: jit-shadowstack
Changeset: r43036:3f4a55febb56
Date: 2011-03-31 11:40 +0200
http://bitbucket.org/pypy/pypy/changeset/3f4a55febb56/

Log:	Work in progress. A bit hard to test individual changes :-(

diff --git a/pypy/jit/backend/x86/rx86.py b/pypy/jit/backend/x86/rx86.py
--- a/pypy/jit/backend/x86/rx86.py
+++ b/pypy/jit/backend/x86/rx86.py
@@ -349,6 +349,8 @@
     INSN_rb = insn(rex_w, chr(base+3), register(1,8), stack_bp(2))
     INSN_rm = insn(rex_w, chr(base+3), register(1,8), mem_reg_plus_const(2))
     INSN_rj = insn(rex_w, chr(base+3), register(1,8), '\x05', immediate(2))
+    INSN_ji8 = insn(rex_w, '\x83', orbyte(base), '\x05', immediate(1),
+                    immediate(2,'b'))
     INSN_bi8 = insn(rex_w, '\x83', orbyte(base), stack_bp(1), immediate(2,'b'))
     INSN_bi32= insn(rex_w, '\x81', orbyte(base), stack_bp(1), immediate(2))
 
@@ -366,7 +368,8 @@
             INSN_bi32(mc, offset, immed)
     INSN_bi._always_inline_ = True      # try to constant-fold single_byte()
 
-    return INSN_ri, INSN_rr, INSN_rb, INSN_bi, INSN_br, INSN_rm, INSN_rj
+    return (INSN_ri, INSN_rr, INSN_rb, INSN_bi, INSN_br, INSN_rm, INSN_rj,
+            INSN_ji8)
 
 def select_8_or_32_bit_immed(insn_8, insn_32):
     def INSN(*args):
@@ -444,13 +447,13 @@
 
     # ------------------------------ Arithmetic ------------------------------
 
-    ADD_ri, ADD_rr, ADD_rb, _, _, ADD_rm, ADD_rj = common_modes(0)
-    OR_ri,  OR_rr,  OR_rb,  _, _, OR_rm,  OR_rj  = common_modes(1)
-    AND_ri, AND_rr, AND_rb, _, _, AND_rm, AND_rj = common_modes(4)
-    SUB_ri, SUB_rr, SUB_rb, _, _, SUB_rm, SUB_rj = common_modes(5)
-    SBB_ri, SBB_rr, SBB_rb, _, _, SBB_rm, SBB_rj = common_modes(3)
-    XOR_ri, XOR_rr, XOR_rb, _, _, XOR_rm, XOR_rj = common_modes(6)
-    CMP_ri, CMP_rr, CMP_rb, CMP_bi, CMP_br, CMP_rm, CMP_rj = common_modes(7)
+    ADD_ri, ADD_rr, ADD_rb, _, _, ADD_rm, ADD_rj, _ = common_modes(0)
+    OR_ri,  OR_rr,  OR_rb,  _, _, OR_rm,  OR_rj,  _ = common_modes(1)
+    AND_ri, AND_rr, AND_rb, _, _, AND_rm, AND_rj, _ = common_modes(4)
+    SUB_ri, SUB_rr, SUB_rb, _, _, SUB_rm, SUB_rj, SUB_ji8 = common_modes(5)
+    SBB_ri, SBB_rr, SBB_rb, _, _, SBB_rm, SBB_rj, _ = common_modes(3)
+    XOR_ri, XOR_rr, XOR_rb, _, _, XOR_rm, XOR_rj, _ = common_modes(6)
+    CMP_ri, CMP_rr, CMP_rb, CMP_bi, CMP_br, CMP_rm, CMP_rj, _ = common_modes(7)
 
     CMP_mi8 = insn(rex_w, '\x83', orbyte(7<<3), mem_reg_plus_const(1), immediate(2, 'b'))
     CMP_mi32 = insn(rex_w, '\x81', orbyte(7<<3), mem_reg_plus_const(1), immediate(2))

diff --git a/pypy/jit/backend/llsupport/gc.py b/pypy/jit/backend/llsupport/gc.py
--- a/pypy/jit/backend/llsupport/gc.py
+++ b/pypy/jit/backend/llsupport/gc.py
@@ -1,3 +1,4 @@
+import os
 from pypy.rlib import rgc
 from pypy.rlib.objectmodel import we_are_translated
 from pypy.rlib.debug import fatalerror
@@ -15,7 +16,6 @@
 from pypy.jit.backend.llsupport.descr import GcCache, get_field_descr
 from pypy.jit.backend.llsupport.descr import GcPtrFieldDescr
 from pypy.jit.backend.llsupport.descr import get_call_descr
-from pypy.rpython.memory.gctransform import asmgcroot
 
 # ____________________________________________________________
 
@@ -212,10 +212,12 @@
         return addr_ref
 
 
-class GcRootMap_asmgcc:
+class GcRootMap_asmgcc(object):
     """Handles locating the stack roots in the assembler.
     This is the class supporting --gcrootfinder=asmgcc.
     """
+    is_shadow_stack = False
+
     LOC_REG       = 0
     LOC_ESP_PLUS  = 1
     LOC_EBP_PLUS  = 2
@@ -224,7 +226,7 @@
     GCMAP_ARRAY = rffi.CArray(lltype.Signed)
     CALLSHAPE_ARRAY_PTR = rffi.CArrayPtr(rffi.UCHAR)
 
-    def __init__(self):
+    def __init__(self, gcdescr=None):
         # '_gcmap' is an array of length '_gcmap_maxlength' of addresses.
         # '_gcmap_curlength' tells how full the array really is.
         # The addresses are actually grouped in pairs:
@@ -237,6 +239,13 @@
         self._gcmap_deadentries = 0
         self._gcmap_sorted = True
 
+    def add_jit2gc_hooks(self, jit2gc):
+        jit2gc.update({
+            'gcmapstart': lambda: self.gcmapstart(),
+            'gcmapend': lambda: self.gcmapend(),
+            'gcmarksorted': lambda: self.gcmarksorted(),
+            })
+
     def initialize(self):
         # hack hack hack.  Remove these lines and see MissingRTypeAttribute
         # when the rtyper tries to annotate these methods only when GC-ing...
@@ -309,6 +318,7 @@
 
     @rgc.no_collect
     def freeing_block(self, start, stop):
+        from pypy.rpython.memory.gctransform import asmgcroot
         # if [start:stop] is a raw block of assembler, then look up the
         # corresponding gcroot markers, and mark them as freed now in
         # self._gcmap by setting the 2nd address of every entry to NULL.
@@ -365,7 +375,7 @@
             number >>= 7
         shape.append(chr(number | flag))
 
-    def add_ebp_offset(self, shape, offset):
+    def add_frame_offset(self, shape, offset):
         assert (offset & 3) == 0
         if offset >= 0:
             num = self.LOC_EBP_PLUS | offset
@@ -388,6 +398,125 @@
         return rawaddr
 
 
+class GcRootMap_shadowstack(object):
+    """Handles locating the stack roots in the assembler.
+    This is the class supporting --gcrootfinder=shadowstack.
+    """
+    is_shadow_stack = True
+    MARKER = 8
+
+    # The "shadowstack" is a portable way in which the GC finds the
+    # roots that live in the stack.  Normally it is just a list of
+    # pointers to GC objects.  The pointers may be moved around by a GC
+    # collection.  But with the JIT, an entry can also be MARKER, in
+    # which case the next entry points to an assembler stack frame.
+    # During a residual CALL from the assembler (which may indirectly
+    # call the GC), we use the force_index stored in the assembler
+    # stack frame to identify the call: we can go from the force_index
+    # to a list of where the GC pointers are in the frame (this is the
+    # purpose of the present class).
+    #
+    # Note that across CALL_MAY_FORCE or CALL_ASSEMBLER, we can also go
+    # from the force_index to a ResumeGuardForcedDescr instance, which
+    # is used if the virtualizable or the virtualrefs need to be forced
+    # (see pypy.jit.backend.model).  The force_index number in the stack
+    # frame is initially set to a non-negative value x, but it is
+    # occasionally turned into (~x) in case of forcing.
+
+    INTARRAYPTR = rffi.CArrayPtr(rffi.INT)
+    CALLSHAPES_ARRAY = rffi.CArray(INTARRAYPTR)
+
+    def __init__(self, gcdescr):
+        self._callshapes = lltype.nullptr(self.CALLSHAPES_ARRAY)
+        self._callshapes_maxlength = 0
+        self.force_index_ofs = gcdescr.force_index_ofs
+
+    def add_jit2gc_hooks(self, jit2gc):
+        #
+        def collect_jit_stack_root(callback, gc, addr):
+            if addr.signed[0] != GcRootMap_shadowstack.MARKER:
+                # common case
+                if gc.points_to_valid_gc_object(addr):
+                    callback(gc, addr)
+                return WORD
+            else:
+                # case of a MARKER followed by an assembler stack frame
+                self.follow_stack_frame_of_assembler(callback, gc, addr)
+                return 2 * WORD
+        #
+        jit2gc.update({
+            'rootstackhook': collect_jit_stack_root,
+            })
+
+    def initialize(self):
+        pass
+
+    def follow_stack_frame_of_assembler(self, callback, gc, addr):
+        frame_addr = addr.signed[1]
+        addr = llmemory.cast_int_to_adr(frame_addr + self.force_index_ofs)
+        force_index = addr.signed[0]
+        if force_index < 0:
+            force_index = ~force_index
+        callshape = self._callshapes[force_index]
+        n = 0
+        while True:
+            offset = rffi.cast(lltype.Signed, callshape[n])
+            if offset == 0:
+                break
+            addr = llmemory.cast_int_to_adr(frame_addr + offset)
+            callback(gc, addr)
+            n += 1
+
+    def get_basic_shape(self, is_64_bit=False):
+        return []
+
+    def add_frame_offset(self, shape, offset):
+        assert offset != 0
+        shape.append(offset)
+
+    def add_callee_save_reg(self, shape, register):
+        msg = "GC pointer in %s was not spilled" % register
+        os.write(2, '[llsupport/gc] %s\n' % msg)
+        raise AssertionError(msg)
+
+    def compress_callshape(self, shape, datablockwrapper):
+        length = len(shape)
+        SZINT = rffi.sizeof(rffi.INT)
+        rawaddr = datablockwrapper.malloc_aligned((length + 1) * SZINT, SZINT)
+        p = rffi.cast(self.INTARRAYPTR, rawaddr)
+        for i in range(length):
+            p[i] = rffi.cast(rffi.INT, shape[i])
+        p[length] = rffi.cast(rffi.INT, 0)
+        return p
+
+    def write_callshape(self, p, force_index):
+        if force_index >= self._callshapes_maxlength:
+            self._enlarge_callshape_list(force_index + 1)
+        self._callshapes[force_index] = p
+
+    def _enlarge_callshape_list(self, minsize):
+        newlength = 250 + (self._callshapes_maxlength // 3) * 4
+        if newlength < minsize:
+            newlength = minsize
+        newarray = lltype.malloc(self.CALLSHAPES_ARRAY, newlength,
+                                 flavor='raw', track_allocation=False)
+        if self._callshapes:
+            i = self._callshapes_maxlength - 1
+            while i >= 0:
+                newarray[i] = self._callshapes[i]
+                i -= 1
+            lltype.free(self._callshapes, flavor='raw')
+        self._callshapes = newarray
+        self._callshapes_maxlength = newlength
+
+    def freeing_block(self, start, stop):
+        pass     # nothing needed here
+
+    def get_root_stack_top_addr(self):
+        rst_addr = llop.gc_adr_of_root_stack_top(llmemory.Address)
+        return rffi.cast(lltype.Signed, rst_addr)
+
+
 class WriteBarrierDescr(AbstractDescr):
     def __init__(self, gc_ll_descr):
         self.llop1 = gc_ll_descr.llop1
@@ -437,7 +566,7 @@
         except KeyError:
             raise NotImplementedError("--gcrootfinder=%s not implemented"
                                       " with the JIT" % (name,))
-        gcrootmap = cls()
+        gcrootmap = cls(gcdescr)
         self.gcrootmap = gcrootmap
         self.gcrefs = GcRefList()
         self.single_gcref_descr = GcPtrFieldDescr('', 0)
@@ -446,12 +575,9 @@
         # where it can be fished and reused by the FrameworkGCTransformer
         self.layoutbuilder = framework.TransformerLayoutBuilder(translator)
         self.layoutbuilder.delay_encoding()
-        self.translator._jit2gc = {
-            'layoutbuilder': self.layoutbuilder,
-            'gcmapstart': lambda: gcrootmap.gcmapstart(),
-            'gcmapend': lambda: gcrootmap.gcmapend(),
-            'gcmarksorted': lambda: gcrootmap.gcmarksorted(),
-            }
+        self.translator._jit2gc = {'layoutbuilder': self.layoutbuilder}
+        gcrootmap.add_jit2gc_hooks(self.translator._jit2gc)
+
         self.GCClass = self.layoutbuilder.GCClass
         self.moving_gc = self.GCClass.moving_gc
         self.HDRPTR = lltype.Ptr(self.GCClass.HDR)

diff --git a/pypy/rpython/memory/gctransform/framework.py b/pypy/rpython/memory/gctransform/framework.py
--- a/pypy/rpython/memory/gctransform/framework.py
+++ b/pypy/rpython/memory/gctransform/framework.py
@@ -506,6 +506,10 @@
         s_gc = self.translator.annotator.bookkeeper.valueoftype(GCClass)
         r_gc = self.translator.rtyper.getrepr(s_gc)
         self.c_const_gc = rmodel.inputconst(r_gc, self.gcdata.gc)
+        s_gc_data = self.translator.annotator.bookkeeper.valueoftype(
+            gctypelayout.GCData)
+        r_gc_data = self.translator.rtyper.getrepr(s_gc_data)
+        self.c_const_gcdata = rmodel.inputconst(r_gc_data, self.gcdata)
         self.malloc_zero_filled = GCClass.malloc_zero_filled
 
         HDR = self.HDR = self.gcdata.gc.gcheaderbuilder.HDR
@@ -792,6 +796,15 @@
                              resulttype=llmemory.Address)
         hop.genop('adr_add', [v_gc_adr, c_ofs], resultvar=op.result)
 
+    def gct_gc_adr_of_root_stack_top(self, hop):
+        op = hop.spaceop
+        ofs = llmemory.offsetof(self.c_const_gcdata.concretetype.TO,
+                                'inst_root_stack_top')
+        c_ofs = rmodel.inputconst(lltype.Signed, ofs)
+        v_gcdata_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gcdata],
+                                 resulttype=llmemory.Address)
+        hop.genop('adr_add', [v_gcdata_adr, c_ofs], resultvar=op.result)
+
     def gct_gc_x_swap_pool(self, hop):
         op = hop.spaceop
         [v_malloced] = op.args
@@ -1336,10 +1349,9 @@
         self.rootstackhook = gctransformer.root_stack_jit_hook
         if self.rootstackhook is None:
             def collect_stack_root(callback, gc, addr):
-                if we_are_translated():
-                    ll_assert(addr.address[0].signed[0] != 0,
-                              "unexpected null object header")
-                callback(gc, addr)
+                if gc.points_to_valid_gc_object(addr):
+                    callback(gc, addr)
+                return sizeofaddr
             self.rootstackhook = collect_stack_root
 
     def push_stack(self, addr):
@@ -1367,9 +1379,7 @@
         addr = gcdata.root_stack_base
         end = gcdata.root_stack_top
         while addr != end:
-            if gc.points_to_valid_gc_object(addr):
-                rootstackhook(collect_stack_root, gc, addr)
-            addr += sizeofaddr
+            addr += rootstackhook(collect_stack_root, gc, addr)
         if self.collect_stacks_from_other_threads is not None:
             self.collect_stacks_from_other_threads(collect_stack_root)
 
@@ -1480,9 +1490,7 @@
                 end = stacktop - sizeofaddr
                 addr = end.address[0]
                 while addr != end:
-                    if gc.points_to_valid_gc_object(addr):
-                        rootstackhook(callback, gc, addr)
-                    addr += sizeofaddr
+                    addr += rootstackhook(callback, gc, addr)
 
         def collect_more_stacks(callback):
             ll_assert(get_aid() == gcdata.active_thread,

diff --git a/pypy/jit/backend/x86/regalloc.py b/pypy/jit/backend/x86/regalloc.py
--- a/pypy/jit/backend/x86/regalloc.py
+++ b/pypy/jit/backend/x86/regalloc.py
@@ -837,6 +837,7 @@
         self.rm.possibly_free_vars_for_op(op)
 
     def _fastpath_malloc(self, op, descr):
+        XXX
         assert isinstance(descr, BaseSizeDescr)
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
         self.rm.force_allocate_reg(op.result, selected_reg=eax)
@@ -859,7 +860,8 @@
 
     def consider_new(self, op):
         gc_ll_descr = self.assembler.cpu.gc_ll_descr
-        if gc_ll_descr.can_inline_malloc(op.getdescr()):
+        os.write(2, "fixme: consider_new\n")
+        if 0 and gc_ll_descr.can_inline_malloc(op.getdescr()):  # XXX
             self._fastpath_malloc(op, op.getdescr())
         else:
             args = gc_ll_descr.args_for_new(op.getdescr())
@@ -869,7 +871,8 @@
     def consider_new_with_vtable(self, op):
         classint = op.getarg(0).getint()
         descrsize = heaptracker.vtable2descr(self.assembler.cpu, classint)
-        if self.assembler.cpu.gc_ll_descr.can_inline_malloc(descrsize):
+        os.write(2, "fixme: consider_new_with_vtable\n")
+        if 0 and self.assembler.cpu.gc_ll_descr.can_inline_malloc(descrsize): # XXX
             self._fastpath_malloc(op, descrsize)
             self.assembler.set_vtable(eax, imm(classint))
             # result of fastpath malloc is in eax
@@ -1132,7 +1135,7 @@
         # call memcpy()
         self.rm.before_call()
         self.xrm.before_call()
-        self.assembler._emit_call(imm(self.assembler.memcpy_addr),
+        self.assembler._emit_call(-1, imm(self.assembler.memcpy_addr),
                                   [dstaddr_loc, srcaddr_loc, length_loc])
         self.rm.possibly_free_var(length_box)
         self.rm.possibly_free_var(dstaddr_box)
@@ -1205,7 +1208,7 @@
         for v, val in self.fm.frame_bindings.items():
             if (isinstance(v, BoxPtr) and self.rm.stays_alive(v)):
                 assert isinstance(val, StackLoc)
-                gcrootmap.add_ebp_offset(shape, get_ebp_ofs(val.position))
+                gcrootmap.add_frame_offset(shape, get_ebp_ofs(val.position))
         for v, reg in self.rm.reg_bindings.items():
             if reg is eax:
                 continue      # ok to ignore this one

diff --git a/pypy/jit/backend/model.py b/pypy/jit/backend/model.py
--- a/pypy/jit/backend/model.py
+++ b/pypy/jit/backend/model.py
@@ -23,18 +23,22 @@
         self.fail_descr_list = []
         self.fail_descr_free_list = []
 
+    def reserve_some_free_fail_descr_number(self):
+        lst = self.fail_descr_list
+        if len(self.fail_descr_free_list) > 0:
+            n = self.fail_descr_free_list.pop()
+            assert lst[n] is None
+        else:
+            n = len(lst)
+            lst.append(None)
+        return n
+
     def get_fail_descr_number(self, descr):
         assert isinstance(descr, history.AbstractFailDescr)
         n = descr.index
         if n < 0:
-            lst = self.fail_descr_list
-            if len(self.fail_descr_free_list) > 0:
-                n = self.fail_descr_free_list.pop()
-                assert lst[n] is None
-                lst[n] = descr
-            else:
-                n = len(lst)
-                lst.append(descr)
+            n = self.reserve_some_free_fail_descr_number()
+            self.fail_descr_list[n] = descr
             descr.index = n
         return n
 
@@ -294,6 +298,13 @@
     def record_faildescr_index(self, n):
         self.faildescr_indices.append(n)
 
+    def reserve_and_record_some_faildescr_index(self):
+        # like record_faildescr_index(), but invent and return a new,
+        # unused faildescr index
+        n = self.cpu.reserve_some_free_fail_descr_number()
+        self.record_faildescr_index(n)
+        return n
+
     def compiling_a_bridge(self):
         self.cpu.total_compiled_bridges += 1
         self.bridges_count += 1

diff --git a/pypy/jit/backend/x86/runner.py b/pypy/jit/backend/x86/runner.py
--- a/pypy/jit/backend/x86/runner.py
+++ b/pypy/jit/backend/x86/runner.py
@@ -19,6 +19,8 @@
 
     def __init__(self, rtyper, stats, opts=None, translate_support_code=False,
                  gcdescr=None):
+        if gcdescr is not None:
+            gcdescr.force_index_ofs = FORCE_INDEX_OFS
         AbstractLLCPU.__init__(self, rtyper, stats, opts,
                                translate_support_code, gcdescr)
 
@@ -127,7 +129,7 @@
         fail_index = rffi.cast(TP, addr_of_force_index)[0]
         assert fail_index >= 0, "already forced!"
         faildescr = self.get_fail_descr_from_number(fail_index)
-        rffi.cast(TP, addr_of_force_index)[0] = -1
+        rffi.cast(TP, addr_of_force_index)[0] = ~fail_index
         frb = self.assembler._find_failure_recovery_bytecode(faildescr)
         bytecode = rffi.cast(rffi.UCHARP, frb)
         # start of "no gc operation!" block

diff --git a/pypy/rpython/lltypesystem/lloperation.py b/pypy/rpython/lltypesystem/lloperation.py
--- a/pypy/rpython/lltypesystem/lloperation.py
+++ b/pypy/rpython/lltypesystem/lloperation.py
@@ -487,7 +487,9 @@
     # ^^^ returns an address of nursery free pointer, for later modifications
     'gc_adr_of_nursery_top' : LLOp(),
     # ^^^ returns an address of pointer, since it can change at runtime
-    
+    'gc_adr_of_root_stack_top': LLOp(),
+    # ^^^ returns the address of gcdata.root_stack_top (for shadowstack only)
+
     # experimental operations in support of thread cloning, only
     # implemented by the Mark&Sweep GC
     'gc_x_swap_pool':       LLOp(canraise=(MemoryError,), canunwindgc=True),

diff --git a/pypy/jit/backend/x86/assembler.py b/pypy/jit/backend/x86/assembler.py
--- a/pypy/jit/backend/x86/assembler.py
+++ b/pypy/jit/backend/x86/assembler.py
@@ -171,25 +171,42 @@
         self.float_const_abs_addr = float_constants + 16
 
     def _build_malloc_fixedsize_slowpath(self):
+        # With asmgcc, we need two helpers, so that we can write two CALL
+        # instructions in assembler, with a mark_gc_roots in between.
+        # With shadowstack, this is not needed, so we produce a single helper.
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        #
         # ---------- first helper for the slow path of malloc ----------
         mc = codebuf.MachineCodeBlockWrapper()
         if self.cpu.supports_floats:          # save the XMM registers in
             for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
                 mc.MOVSD_sx((WORD*2)+8*i, i)
         mc.SUB_rr(edx.value, eax.value)       # compute the size we want
-        if IS_X86_32:
-            mc.MOV_sr(WORD, edx.value)        # save it as the new argument
-        elif IS_X86_64:
-            # rdi can be clobbered: its content was forced to the stack
-            # by _fastpath_malloc(), like all other save_around_call_regs.
-            mc.MOV_rr(edi.value, edx.value)
-
         addr = self.cpu.gc_ll_descr.get_malloc_fixedsize_slowpath_addr()
-        mc.JMP(imm(addr))                    # tail call to the real malloc
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
-        self.malloc_fixedsize_slowpath1 = rawstart
-        # ---------- second helper for the slow path of malloc ----------
-        mc = codebuf.MachineCodeBlockWrapper()
+        #
+        if gcrootmap.is_shadow_stack:
+            # ---- shadowstack ----
+            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
+            if IS_X86_32:
+                mc.MOV_sr(0, edx.value)          # push argument
+            elif IS_X86_64:
+                mc.MOV_rr(edi.value, edx.value)
+            mc.CALL(imm(addr))
+            mc.ADD_ri(esp.value, 16 - WORD)
+        else:
+            # ---- asmgcc ----
+            if IS_X86_32:
+                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
+            elif IS_X86_64:
+                # rdi can be clobbered: its content was forced to the stack
+                # by _fastpath_malloc(), like all other save_around_call_regs.
+                mc.MOV_rr(edi.value, edx.value)
+            mc.JMP(imm(addr))                    # tail call to the real malloc
+            rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+            self.malloc_fixedsize_slowpath1 = rawstart
+            # ---------- second helper for the slow path of malloc ----------
+            mc = codebuf.MachineCodeBlockWrapper()
+        #
         if self.cpu.supports_floats:          # restore the XMM registers
             for i in range(self.cpu.NUM_REGS):# from where they were saved
                 mc.MOVSD_xs(i, (WORD*2)+8*i)
@@ -550,6 +567,10 @@
         for regloc in self.cpu.CALLEE_SAVE_REGISTERS:
             self.mc.PUSH_r(regloc.value)
 
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            self._call_header_shadowstack(gcrootmap)
+
     def _call_header_with_stack_check(self):
         if self.stack_check_slowpath == 0:
             pass                # no stack check (e.g. not translated)
@@ -571,12 +592,32 @@
     def _call_footer(self):
         self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
 
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            self._call_footer_shadowstack(gcrootmap)
+
         for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
             self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
 
         self.mc.POP_r(ebp.value)
         self.mc.RET()
 
+    def _call_header_shadowstack(self, gcrootmap):
+        # we need to put two words into the shadowstack: the MARKER
+        # and the address of the frame (ebp, actually)
+        rst = gcrootmap.get_root_stack_top_addr()
+        assert rx86.fits_in_32bits(rst)
+        self.mc.MOV_rj(eax.value, rst)                # MOV eax, [rootstacktop]
+        self.mc.LEA_rm(edx.value, (eax.value, 2*WORD))  # LEA edx, [eax+2*WORD]
+        self.mc.MOV_mi((eax.value, 0), gcrootmap.MARKER)    # MOV [eax], MARKER
+        self.mc.MOV_mr((eax.value, WORD), ebp.value)      # MOV [eax+WORD], ebp
+        self.mc.MOV_jr(rst, edx.value)                # MOV [rootstacktop], edx
+
+    def _call_footer_shadowstack(self, gcrootmap):
+        rst = gcrootmap.get_root_stack_top_addr()
+        assert rx86.fits_in_32bits(rst)
+        self.mc.SUB_ji8(rst, 2*WORD)       # SUB [rootstacktop], 2*WORD
+
     def _assemble_bootstrap_direct_call(self, arglocs, jmppos, stackdepth):
         if IS_X86_64:
             return self._assemble_bootstrap_direct_call_64(arglocs, jmppos, stackdepth)
@@ -896,7 +937,7 @@
                     self.implement_guard(guard_token, checkfalsecond)
         return genop_cmp_guard_float
 
-    def _emit_call(self, x, arglocs, start=0, tmp=eax):
+    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax):
         if IS_X86_64:
             return self._emit_call_64(x, arglocs, start)
 
@@ -924,9 +965,9 @@
         self._regalloc.reserve_param(p//WORD)
         # x is a location
         self.mc.CALL(x)
-        self.mark_gc_roots()
+        self.mark_gc_roots(force_index)
 
-    def _emit_call_64(self, x, arglocs, start=0):
+    def _emit_call_64(self, force_index, x, arglocs, start=0):
         src_locs = []
         dst_locs = []
         xmm_src_locs = []
@@ -984,12 +1025,27 @@
 
         self._regalloc.reserve_param(len(pass_on_stack))
         self.mc.CALL(x)
-        self.mark_gc_roots()
+        self.mark_gc_roots(force_index)
 
     def call(self, addr, args, res):
-        self._emit_call(imm(addr), args)
+        force_index = self.write_new_force_index()
+        self._emit_call(force_index, imm(addr), args)
         assert res is eax
 
+    def write_new_force_index(self):
+        # for shadowstack only: get a new, unused force_index number and
+        # write it to FORCE_INDEX_OFS.  Used to record the call shape
+        # (i.e. where the GC pointers are in the stack) around a CALL
+        # instruction that doesn't already have a force_index.
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap and gcrootmap.is_shadow_stack:
+            clt = self.currently_compiling_loop.compiled_loop_token
+            force_index = clt.reserve_and_record_some_faildescr_index()
+            self.mc.MOV_bi(FORCE_INDEX_OFS, force_index)
+            return force_index
+        else:
+            return 0
+
     genop_int_neg = _unaryop("NEG")
     genop_int_invert = _unaryop("NOT")
     genop_int_add = _binaryop("ADD", True)
@@ -1796,8 +1852,9 @@
             tmp = ecx
         else:
             tmp = eax
-        
-        self._emit_call(x, arglocs, 3, tmp=tmp)
+
+        force_index = self.write_new_force_index()
+        self._emit_call(force_index, x, arglocs, 3, tmp=tmp)
 
         if IS_X86_32 and isinstance(resloc, StackLoc) and resloc.width == 8:
             # a float or a long long return
@@ -1842,8 +1899,8 @@
         assert len(arglocs) - 2 == len(descr._x86_arglocs[0])
         #
         # Write a call to the direct_bootstrap_code of the target assembler
-        self._emit_call(imm(descr._x86_direct_bootstrap_code), arglocs, 2,
-                        tmp=eax)
+        self._emit_call(fail_index, imm(descr._x86_direct_bootstrap_code),
+                        arglocs, 2, tmp=eax)
         if op.result is None:
             assert result_loc is None
             value = self.cpu.done_with_this_frame_void_v
@@ -1868,7 +1925,7 @@
         jd = descr.outermost_jitdriver_sd
         assert jd is not None
         asm_helper_adr = self.cpu.cast_adr_to_int(jd.assembler_helper_adr)
-        self._emit_call(imm(asm_helper_adr), [eax, arglocs[1]], 0,
+        self._emit_call(fail_index, imm(asm_helper_adr), [eax, arglocs[1]], 0,
                         tmp=ecx)
         if IS_X86_32 and isinstance(result_loc, StackLoc) and result_loc.type == FLOAT:
             self.mc.FSTP_b(result_loc.value)
@@ -1990,11 +2047,16 @@
         not_implemented("not implemented operation (guard): %s" %
                         op.getopname())
 
-    def mark_gc_roots(self):
+    def mark_gc_roots(self, force_index):
+        if force_index < 0:
+            return     # not needed
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         if gcrootmap:
             mark = self._regalloc.get_mark_gc_roots(gcrootmap)
-            self.mc.insert_gcroot_marker(mark)
+            if gcrootmap.is_shadow_stack:
+                gcrootmap.write_callshape(mark, force_index)
+            else:
+                self.mc.insert_gcroot_marker(mark)
 
     def target_arglocs(self, loop_token):
         return loop_token._x86_arglocs
@@ -2025,11 +2087,16 @@
         # result in EAX; slowpath_addr2 additionally returns in EDX a
         # copy of heap(nursery_free_adr), so that the final MOV below is
         # a no-op.
-        slowpath_addr1 = self.malloc_fixedsize_slowpath1
+
         # reserve room for the argument to the real malloc and the
         # 8 saved XMM regs
         self._regalloc.reserve_param(1+16)
-        self.mc.CALL(imm(slowpath_addr1))
+
+        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+        if not gcrootmap.is_shadow_stack:
+            # there are two helpers to call only with asmgcc
+            slowpath_addr1 = self.malloc_fixedsize_slowpath1
+            self.mc.CALL(imm(slowpath_addr1))
         self.mark_gc_roots()
         slowpath_addr2 = self.malloc_fixedsize_slowpath2
         self.mc.CALL(imm(slowpath_addr2))
@@ -2038,6 +2105,7 @@
         assert 0 < offset <= 127
         self.mc.overwrite(jmp_adr-1, chr(offset))
         # on 64-bits, 'tid' is a value that fits in 31 bits
+        assert rx86.fits_in_32bits(tid)
         self.mc.MOV_mi((eax.value, 0), tid)
         self.mc.MOV(heap(nursery_free_adr), edx)
         

diff --git a/pypy/jit/backend/x86/test/test_zrpy_gc.py b/pypy/jit/backend/x86/test/test_zrpy_gc.py
--- a/pypy/jit/backend/x86/test/test_zrpy_gc.py
+++ b/pypy/jit/backend/x86/test/test_zrpy_gc.py
@@ -127,7 +127,7 @@
 
 # ______________________________________________________________________
 
-class TestCompileFramework(object):
+class CompileFrameworkTests(object):
     # Test suite using (so far) the minimark GC.
     def setup_class(cls):
         funcs = []
@@ -178,7 +178,7 @@
         try:
             GcLLDescr_framework.DEBUG = True
             cls.cbuilder = compile(get_entry(allfuncs), DEFL_GC,
-                                   gcrootfinder="asmgcc", jit=True)
+                                   gcrootfinder=cls.gcrootfinder, jit=True)
         finally:
             GcLLDescr_framework.DEBUG = OLD_DEBUG
 
@@ -576,3 +576,10 @@
 
     def test_compile_framework_minimal_size_in_nursery(self):
         self.run('compile_framework_minimal_size_in_nursery')
+
+
+class TestShadowStack(CompileFrameworkTests):
+    gcrootfinder = "shadowstack"
+
+class TestAsmGcc(CompileFrameworkTests):
+    gcrootfinder = "asmgcc"


More information about the Pypy-commit mailing list