[pypy-svn] r68481 - in pypy/trunk/pypy: jit/backend/llsupport jit/backend/x86 jit/backend/x86/test rpython rpython/lltypesystem rpython/memory/gc rpython/memory/gctransform rpython/memory/test

fijal at codespeak.net fijal at codespeak.net
Thu Oct 15 11:31:51 CEST 2009


Author: fijal
Date: Thu Oct 15 11:31:51 2009
New Revision: 68481

Modified:
   pypy/trunk/pypy/jit/backend/llsupport/gc.py
   pypy/trunk/pypy/jit/backend/x86/assembler.py
   pypy/trunk/pypy/jit/backend/x86/regalloc.py
   pypy/trunk/pypy/jit/backend/x86/test/test_gc_integration.py
   pypy/trunk/pypy/rpython/llinterp.py
   pypy/trunk/pypy/rpython/lltypesystem/lloperation.py
   pypy/trunk/pypy/rpython/memory/gc/generation.py
   pypy/trunk/pypy/rpython/memory/gctransform/framework.py
   pypy/trunk/pypy/rpython/memory/test/test_transformed_gc.py
Log:
(fijal, arigo)
Merge the inline-fastpath-malloc branch.
This branch inlines the fast path of malloc_fixedsize_clear from the generation
GC directly into the assembler emitted by the JIT backend, speeding up
allocation in the nursery for objects that don't have finalizers.


Modified: pypy/trunk/pypy/jit/backend/llsupport/gc.py
==============================================================================
--- pypy/trunk/pypy/jit/backend/llsupport/gc.py	(original)
+++ pypy/trunk/pypy/jit/backend/llsupport/gc.py	Thu Oct 15 11:31:51 2009
@@ -12,6 +12,7 @@
 from pypy.jit.backend.llsupport.descr import GcCache, get_field_descr
 from pypy.jit.backend.llsupport.descr import GcPtrFieldDescr
 from pypy.jit.backend.llsupport.descr import get_call_descr
+from pypy.rlib.rarithmetic import r_ulonglong, r_uint
 
 # ____________________________________________________________
 
@@ -27,6 +28,8 @@
         pass
     def rewrite_assembler(self, cpu, operations):
         pass
+    def can_inline_malloc(self, descr):
+        return False
 
 # ____________________________________________________________
 
@@ -341,6 +344,8 @@
                                                lltype.Void)
         (self.array_basesize, _, self.array_length_ofs) = \
              symbolic.get_array_token(lltype.GcArray(lltype.Signed), True)
+        min_ns = self.GCClass.TRANSLATION_PARAMS['min_nursery_size']
+        self.max_size_of_young_obj = self.GCClass.get_young_fixedsize(min_ns)
 
         # make a malloc function, with three arguments
         def malloc_basic(size, tid):
@@ -391,6 +396,28 @@
         self.malloc_unicode = malloc_unicode
         self.GC_MALLOC_STR_UNICODE = lltype.Ptr(lltype.FuncType(
             [lltype.Signed], llmemory.GCREF))
+        def malloc_fixedsize_slowpath(size):
+            gcref = llop1.do_malloc_fixedsize_clear(llmemory.GCREF,
+                                        0, size, True, False, False)
+            res = rffi.cast(lltype.Signed, gcref)
+            nurs_free = llop1.gc_adr_of_nursery_free(llmemory.Address).signed[0]
+            return r_ulonglong(nurs_free) << 32 | r_ulonglong(r_uint(res))
+        self.malloc_fixedsize_slowpath = malloc_fixedsize_slowpath
+        self.MALLOC_FIXEDSIZE_SLOWPATH = lltype.FuncType([lltype.Signed],
+                                                 lltype.UnsignedLongLong)
+
+    def get_nursery_free_addr(self):
+        nurs_addr = llop.gc_adr_of_nursery_free(llmemory.Address)
+        return rffi.cast(lltype.Signed, nurs_addr)
+
+    def get_nursery_top_addr(self):
+        nurs_top_addr = llop.gc_adr_of_nursery_top(llmemory.Address)
+        return rffi.cast(lltype.Signed, nurs_top_addr)
+
+    def get_malloc_fixedsize_slowpath_addr(self):
+        fptr = llhelper(lltype.Ptr(self.MALLOC_FIXEDSIZE_SLOWPATH),
+                        self.malloc_fixedsize_slowpath)
+        return rffi.cast(lltype.Signed, fptr)
 
     def initialize(self):
         self.gcrefs.initialize()
@@ -519,6 +546,15 @@
         newops.append(ResOperation(rop.COND_CALL_GC_WB, args, None,
                                    descr=self.calldescr_jit_wb))
 
+    def can_inline_malloc(self, descr):
+        assert isinstance(descr, BaseSizeDescr)
+        if descr.size < self.max_size_of_young_obj:
+            has_finalizer = bool(descr.tid & (1<<16))
+            if has_finalizer:
+                return False
+            return True
+        return False
+
 # ____________________________________________________________
 
 def get_ll_description(gcdescr, translator=None):

Modified: pypy/trunk/pypy/jit/backend/x86/assembler.py
==============================================================================
--- pypy/trunk/pypy/jit/backend/x86/assembler.py	(original)
+++ pypy/trunk/pypy/jit/backend/x86/assembler.py	Thu Oct 15 11:31:51 2009
@@ -523,7 +523,10 @@
         arglocs = arglocs[:-1]
         self.call(self.malloc_func_addr, arglocs, eax)
         # xxx ignore NULL returns for now
-        self.mc.MOV(mem(eax, self.cpu.vtable_offset), loc_vtable)
+        self.set_vtable(eax, loc_vtable)
+
+    def set_vtable(self, loc, loc_vtable):
+        self.mc.MOV(mem(loc, self.cpu.vtable_offset), loc_vtable)
 
     # XXX genop_new is abused for all varsized mallocs with Boehm, for now
     # (instead of genop_new_array, genop_newstr, genop_newunicode)
@@ -902,8 +905,30 @@
 
     def closing_jump(self, loop_token):
         self.mc.JMP(rel32(loop_token._x86_loop_code))
-        
 
+    def malloc_cond_fixedsize(self, nursery_free_adr, nursery_top_adr,
+                              size, tid, slowpath_addr):
+        # don't use self.mc
+        mc = self.mc._mc
+        mc.MOV(eax, heap(nursery_free_adr))
+        mc.LEA(edx, addr_add(eax, imm(size)))
+        mc.CMP(edx, heap(nursery_top_adr))
+        mc.write('\x76\x00') # JNA after the block
+        jmp_adr = mc.get_relative_pos()
+        mc.PUSH(imm(size))
+        mc.CALL(rel32(slowpath_addr))
+        self.mark_gc_roots()
+        # note that slowpath_addr returns a "long long", or more precisely
+        # two results, which end up in eax and edx.
+        # eax should contain the result of allocation, edx new value
+        # of nursery_free_adr
+        mc.ADD(esp, imm(4))
+        offset = mc.get_relative_pos() - jmp_adr
+        assert 0 < offset <= 127
+        mc.overwrite(jmp_adr-1, chr(offset))
+        mc.MOV(addr_add(eax, imm(0)), imm(tid))
+        mc.MOV(heap(nursery_free_adr), edx)
+        
 genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST
 genop_list = [Assembler386.not_implemented_op] * rop._LAST
 genop_guard_list = [Assembler386.not_implemented_op_guard] * rop._LAST

Modified: pypy/trunk/pypy/jit/backend/x86/regalloc.py
==============================================================================
--- pypy/trunk/pypy/jit/backend/x86/regalloc.py	(original)
+++ pypy/trunk/pypy/jit/backend/x86/regalloc.py	Thu Oct 15 11:31:51 2009
@@ -14,7 +14,7 @@
 from pypy.jit.backend.x86.jump import remap_stack_layout
 from pypy.jit.metainterp.resoperation import rop
 from pypy.jit.backend.llsupport.descr import BaseFieldDescr, BaseArrayDescr
-from pypy.jit.backend.llsupport.descr import BaseCallDescr
+from pypy.jit.backend.llsupport.descr import BaseCallDescr, BaseSizeDescr
 from pypy.jit.backend.llsupport.regalloc import StackManager, RegisterManager,\
      TempBox
 
@@ -623,18 +623,52 @@
         self.PerformDiscard(op, arglocs)
         self.rm.possibly_free_vars(op.args)
 
+    def _fastpath_malloc(self, op, descr):
+        assert isinstance(descr, BaseSizeDescr)
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        tmp0 = TempBox()
+        self.rm.force_allocate_reg(op.result, selected_reg=eax)
+        self.rm.force_allocate_reg(tmp0, selected_reg=edx)
+        for v, reg in self.rm.reg_bindings.items():
+            if reg is ecx:
+                to_sync = v
+                break
+        else:
+            to_sync = None
+        if to_sync is not None:
+            self.rm._sync_var(to_sync)
+            del self.rm.reg_bindings[to_sync]
+            self.rm.free_regs.append(ecx)
+        # we need to do it here, so edx is not in reg_bindings
+        self.rm.possibly_free_var(tmp0)
+        self.assembler.malloc_cond_fixedsize(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            descr.size, descr.tid,
+            gc_ll_descr.get_malloc_fixedsize_slowpath_addr(),
+            )
+
     def consider_new(self, op, ignored):
-        args = self.assembler.cpu.gc_ll_descr.args_for_new(op.descr)
-        arglocs = [imm(x) for x in args]
-        return self._call(op, arglocs)
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        if gc_ll_descr.can_inline_malloc(op.descr):
+            self._fastpath_malloc(op, op.descr)
+        else:
+            args = gc_ll_descr.args_for_new(op.descr)
+            arglocs = [imm(x) for x in args]
+            return self._call(op, arglocs)
 
     def consider_new_with_vtable(self, op, ignored):
         classint = op.args[0].getint()
         descrsize = self.assembler.cpu.class_sizes[classint]
-        args = self.assembler.cpu.gc_ll_descr.args_for_new(descrsize)
-        arglocs = [imm(x) for x in args]
-        arglocs.append(self.loc(op.args[0]))
-        return self._call(op, arglocs)
+        if self.assembler.cpu.gc_ll_descr.can_inline_malloc(descrsize):
+            self._fastpath_malloc(op, descrsize)
+            self.assembler.set_vtable(eax, imm(classint))
+            # result of fastpath malloc is in eax
+        else:
+            args = self.assembler.cpu.gc_ll_descr.args_for_new(descrsize)
+            arglocs = [imm(x) for x in args]
+            arglocs.append(self.loc(op.args[0]))
+            return self._call(op, arglocs)
 
     def consider_newstr(self, op, ignored):
         gc_ll_descr = self.assembler.cpu.gc_ll_descr

Modified: pypy/trunk/pypy/jit/backend/x86/test/test_gc_integration.py
==============================================================================
--- pypy/trunk/pypy/jit/backend/x86/test/test_gc_integration.py	(original)
+++ pypy/trunk/pypy/jit/backend/x86/test/test_gc_integration.py	Thu Oct 15 11:31:51 2009
@@ -7,6 +7,7 @@
      BoxPtr, ConstPtr, TreeLoop
 from pypy.jit.metainterp.resoperation import rop, ResOperation
 from pypy.jit.backend.llsupport.descr import GcCache
+from pypy.jit.backend.llsupport.gc import GcLLDescription
 from pypy.jit.backend.x86.runner import CPU
 from pypy.jit.backend.x86.regalloc import RegAlloc, WORD
 from pypy.jit.metainterp.test.oparser import parse
@@ -20,6 +21,7 @@
 from pypy.jit.backend.x86.test.test_regalloc import BaseTestRegalloc
 from pypy.jit.backend.x86.regalloc import X86RegisterManager, X86StackManager,\
      X86XMMRegisterManager
+from pypy.rpython.annlowlevel import llhelper
 
 class MockGcRootMap(object):
     def get_basic_shape(self):
@@ -159,3 +161,128 @@
         jump(i0, i1, 1, 17, i4, ConstPtr(ptr0), i6, i7, i24)
         '''
         self.interpret(ops, [0, 0, 0, 0, 0, 0, 0, 0, 0], run=False)
+
+class GCDescrFastpathMalloc(GcLLDescription):
+    gcrootmap = None
+    
+    def __init__(self):
+        GcCache.__init__(self, False)
+        # create a nursery
+        NTP = rffi.CArray(lltype.Signed)
+        self.nursery = lltype.malloc(NTP, 16, flavor='raw')
+        self.addrs = lltype.malloc(rffi.CArray(lltype.Signed), 2,
+                                   flavor='raw')
+        self.addrs[0] = rffi.cast(lltype.Signed, self.nursery)
+        self.addrs[1] = self.addrs[0] + 64
+        # 64 bytes
+        def malloc_slowpath(size):
+            from pypy.rlib.rarithmetic import r_ulonglong
+            assert size == 8
+            nadr = rffi.cast(lltype.Signed, self.nursery)
+            self.addrs[0] = 99999    # should be overridden by the caller
+            return ((r_ulonglong(nadr + size) << 32) |     # this part in edx
+                     r_ulonglong(nadr))                    # this part in eax
+        self.malloc_slowpath = malloc_slowpath
+        self.MALLOC_SLOWPATH = lltype.FuncType([lltype.Signed],
+                                               lltype.UnsignedLongLong)
+        self._counter = 123
+
+    def can_inline_malloc(self, descr):
+        return True
+
+    def get_funcptr_for_new(self):
+        return 42
+#        return llhelper(lltype.Ptr(self.NEW_TP), self.new)
+
+    def init_size_descr(self, S, descr):
+        descr.tid = self._counter
+        self._counter += 1
+
+    def get_nursery_free_addr(self):
+        return rffi.cast(lltype.Signed, self.addrs)
+
+    def get_nursery_top_addr(self):
+        return rffi.cast(lltype.Signed, self.addrs) + 4
+
+    def get_malloc_fixedsize_slowpath_addr(self):
+        fptr = llhelper(lltype.Ptr(self.MALLOC_SLOWPATH), self.malloc_slowpath)
+        return rffi.cast(lltype.Signed, fptr)
+
+    get_funcptr_for_newarray = None
+    get_funcptr_for_newstr = None
+    get_funcptr_for_newunicode = None
+
+class TestMallocFastpath(BaseTestRegalloc):
+
+    def setup_method(self, method):
+        cpu = CPU(None, None)
+        cpu.vtable_offset = 4
+        cpu.gc_ll_descr = GCDescrFastpathMalloc()
+
+        NODE = lltype.Struct('node', ('tid', lltype.Signed),
+                                     ('value', lltype.Signed))
+        nodedescr = cpu.sizeof(NODE)     # xxx hack: NODE is not a GcStruct
+        valuedescr = cpu.fielddescrof(NODE, 'value')
+
+        self.cpu = cpu
+        self.nodedescr = nodedescr
+        vtable = lltype.malloc(rclass.OBJECT_VTABLE, immortal=True)
+        vtable_int = cpu.cast_adr_to_int(llmemory.cast_ptr_to_adr(vtable))
+        NODE2 = lltype.Struct('node2', ('tid', lltype.Signed),
+                                  ('vtable', lltype.Ptr(rclass.OBJECT_VTABLE)))
+        descrsize = cpu.sizeof(NODE2)
+        cpu.set_class_sizes({vtable_int: descrsize})
+        self.descrsize = descrsize
+        self.vtable_int = vtable_int
+
+        self.namespace = locals().copy()
+        
+    def test_malloc_fastpath(self):
+        ops = '''
+        [i0]
+        p0 = new(descr=nodedescr)
+        setfield_gc(p0, i0, descr=valuedescr)
+        finish(p0)
+        '''
+        self.interpret(ops, [42])
+        # check the nursery
+        gc_ll_descr = self.cpu.gc_ll_descr
+        assert gc_ll_descr.nursery[0] == self.nodedescr.tid
+        assert gc_ll_descr.nursery[1] == 42
+        nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
+        assert gc_ll_descr.addrs[0] == nurs_adr + 8
+
+    def test_malloc_slowpath(self):
+        ops = '''
+        []
+        p0 = new(descr=nodedescr)
+        p1 = new(descr=nodedescr)
+        p2 = new(descr=nodedescr)
+        p3 = new(descr=nodedescr)
+        p4 = new(descr=nodedescr)
+        p5 = new(descr=nodedescr)
+        p6 = new(descr=nodedescr)
+        p7 = new(descr=nodedescr)
+        p8 = new(descr=nodedescr)
+        finish(p0, p1, p2, p3, p4, p5, p6, p7, p8)
+        '''
+        self.interpret(ops, [])
+        # this should call slow path once
+        gc_ll_descr = self.cpu.gc_ll_descr
+        nadr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
+        assert gc_ll_descr.addrs[0] == nadr + 8
+
+    def test_new_with_vtable(self):
+        ops = '''
+        [i0, i1]
+        p0 = new_with_vtable(ConstClass(vtable))
+        guard_class(p0, ConstClass(vtable)) [i0]
+        finish(i1)
+        '''
+        self.interpret(ops, [0, 1])
+        assert self.getint(0) == 1
+        gc_ll_descr = self.cpu.gc_ll_descr
+        assert gc_ll_descr.nursery[0] == self.descrsize.tid
+        assert gc_ll_descr.nursery[1] == self.vtable_int
+        nurs_adr = rffi.cast(lltype.Signed, gc_ll_descr.nursery)
+        assert gc_ll_descr.addrs[0] == nurs_adr + 8

Modified: pypy/trunk/pypy/rpython/llinterp.py
==============================================================================
--- pypy/trunk/pypy/rpython/llinterp.py	(original)
+++ pypy/trunk/pypy/rpython/llinterp.py	Thu Oct 15 11:31:51 2009
@@ -848,6 +848,12 @@
     def op_gc_restore_exception(self, exc):
         raise NotImplementedError("gc_restore_exception")
 
+    def op_gc_adr_of_nursery_top(self):
+        raise NotImplementedError
+
+    def op_gc_adr_of_nursery_free(self):
+        raise NotImplementedError
+
     def op_gc_call_rtti_destructor(self, rtti, addr):
         if hasattr(rtti._obj, 'destructor_funcptr'):
             d = rtti._obj.destructor_funcptr

Modified: pypy/trunk/pypy/rpython/lltypesystem/lloperation.py
==============================================================================
--- pypy/trunk/pypy/rpython/lltypesystem/lloperation.py	(original)
+++ pypy/trunk/pypy/rpython/lltypesystem/lloperation.py	Thu Oct 15 11:31:51 2009
@@ -446,6 +446,14 @@
     'gc_thread_run'       : LLOp(),
     'gc_thread_die'       : LLOp(),
     'gc_assume_young_pointers': LLOp(),
+
+    # ------- JIT & GC interaction, only for some GCs ----------
+    
+    'gc_adr_of_nursery_free' : LLOp(),
+    # ^^^ returns an address of nursery free pointer, for later modifications
+    'gc_adr_of_nursery_top' : LLOp(),
+    # ^^^ returns an address of pointer, since it can change at runtime
+    
     # experimental operations in support of thread cloning, only
     # implemented by the Mark&Sweep GC
     'gc_x_swap_pool':       LLOp(canraise=(MemoryError,), canunwindgc=True),

Modified: pypy/trunk/pypy/rpython/memory/gc/generation.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gc/generation.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gc/generation.py	Thu Oct 15 11:31:51 2009
@@ -122,10 +122,12 @@
         # a new nursery (e.g. if it invokes finalizers).
         self.semispace_collect()
 
-    def get_young_fixedsize(self, nursery_size):
+    @staticmethod
+    def get_young_fixedsize(nursery_size):
         return nursery_size // 2 - 1
 
-    def get_young_var_basesize(self, nursery_size):
+    @staticmethod
+    def get_young_var_basesize(nursery_size):
         return nursery_size // 4 - 1
 
     def is_in_nursery(self, addr):

Modified: pypy/trunk/pypy/rpython/memory/gctransform/framework.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/gctransform/framework.py	(original)
+++ pypy/trunk/pypy/rpython/memory/gctransform/framework.py	Thu Oct 15 11:31:51 2009
@@ -599,6 +599,28 @@
         hop.genop("direct_call", [self.assume_young_pointers_ptr,
                                   self.c_const_gc, v_addr])
 
+    def gct_gc_adr_of_nursery_free(self, hop):
+        if getattr(self.gcdata.gc, 'nursery_free', None) is None:
+            raise NotImplementedError("gc_adr_of_nursery_free only for generational gcs")
+        op = hop.spaceop
+        ofs = llmemory.offsetof(self.c_const_gc.concretetype.TO,
+                                'inst_nursery_free')
+        c_ofs = rmodel.inputconst(lltype.Signed, ofs)
+        v_gc_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gc],
+                             resulttype=llmemory.Address)
+        hop.genop('adr_add', [v_gc_adr, c_ofs], resultvar=op.result)
+
+    def gct_gc_adr_of_nursery_top(self, hop):
+        if getattr(self.gcdata.gc, 'nursery_top', None) is None:
+            raise NotImplementedError("gc_adr_of_nursery_top only for generational gcs")
+        op = hop.spaceop
+        ofs = llmemory.offsetof(self.c_const_gc.concretetype.TO,
+                                'inst_nursery_top')
+        c_ofs = rmodel.inputconst(lltype.Signed, ofs)
+        v_gc_adr = hop.genop('cast_ptr_to_adr', [self.c_const_gc],
+                             resulttype=llmemory.Address)
+        hop.genop('adr_add', [v_gc_adr, c_ofs], resultvar=op.result)
+
     def _can_realloc(self):
         return self.gcdata.gc.can_realloc
 

Modified: pypy/trunk/pypy/rpython/memory/test/test_transformed_gc.py
==============================================================================
--- pypy/trunk/pypy/rpython/memory/test/test_transformed_gc.py	(original)
+++ pypy/trunk/pypy/rpython/memory/test/test_transformed_gc.py	Thu Oct 15 11:31:51 2009
@@ -1043,6 +1043,27 @@
         #  * the GcArray pointer from gc.wr_to_objects_with_id
         #  * the GcArray pointer from gc.object_id_dict.
 
+    def test_adr_of_nursery(self):
+        class A(object):
+            pass
+        
+        def f():
+            # we need at least 1 obj to allocate a nursery
+            a = A()
+            nf_a = llop.gc_adr_of_nursery_free(llmemory.Address)
+            nt_a = llop.gc_adr_of_nursery_top(llmemory.Address)
+            nf0 = nf_a.address[0]
+            nt0 = nt_a.address[0]
+            a0 = A()
+            a1 = A()
+            nf1 = nf_a.address[0]
+            nt1 = nt_a.address[0]
+            assert nf1 > nf0
+            assert nt1 > nf1
+            assert nt1 == nt0
+        run = self.runner(f, nbargs=0)
+        res = run([])        
+
 class TestGenerationalNoFullCollectGC(GCTest):
     # test that nursery is doing its job and that no full collection
     # is needed when most allocated objects die quickly



More information about the Pypy-commit mailing list