[pypy-commit] pypy release-2.0.x: Manual merge of emit-call-x86: fix for multithreaded programs, particularly those that run more threads than cores

arigo noreply at buildbot.pypy.org
Mon May 20 13:38:03 CEST 2013

Author: Armin Rigo <arigo at tunes.org>
Branch: release-2.0.x
Changeset: r64365:f7e4f43c9487
Date: 2013-05-20 13:30 +0200

Log:	Manual merge of emit-call-x86: fix for multithreaded programs,
	particularly those that run more threads than cores. The issue was
	that a call_release_gil instruction compiles to code that still
	accesses ebp/rsp after it released the GIL.

diff --git a/pypy/module/pypyjit/test_pypy_c/bug1.py b/pypy/module/pypyjit/test_pypy_c/bug1.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/pypyjit/test_pypy_c/bug1.py
@@ -0,0 +1,57 @@
+import cffi, thread, time, sys
+ffi = cffi.FFI()
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4);
+lib = ffi.verify("""
+    long foobar(long a, long b, long c, long d, long e, long f,
+                long a2, long b2, long c2, long d2, long e2, long f2,
+                long a3, long b3, long c3, long d3, long e3, long f3,
+                long a4, long b4, long c4, long d4, long e4, long f4)
+    {
+        return a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+               (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+               (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+               (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7);
+    }
+def runme():
+    for j in range(10):
+        for i in range(10000):
+            args = [i-k for k in range(24)]
+            x = lib.foobar(*args)
+            (a,b,c,d,e,f,a2,b2,c2,d2,e2,f2,
+             a3,b3,c3,d3,e3,f3,a4,b4,c4,d4,e4,f4) = args
+            assert x == (
+                a * 1 + b * 2 + c * 3 + d * 4 + e * 5 + f * 6 +
+                (a2 * 1 + b2 * 2 + c2 * 3 + d2 * 4 + e2 * 5 + f2 * 6) * (-3) +
+                (a3 * 1 + b3 * 2 + c3 * 3 + d3 * 4 + e3 * 5 + f3 * 6) * (-5) +
+                (a4 * 1 + b4 * 2 + c4 * 3 + d4 * 4 + e4 * 5 + f4 * 6) * (-7))
+done = []
+def submain():
+    try:
+        runme()
+        err = None
+    except:
+        err = sys.exc_info()
+    done.append(err)
+for i in range(2):
+    thread.start_new_thread(submain, ())
+while len(done) < 2:
+    time.sleep(0.1)
+for err in done:
+    if err is not None:
+        raise err[0], err[1], err[2]
diff --git a/pypy/module/pypyjit/test_pypy_c/test_bug.py b/pypy/module/pypyjit/test_pypy_c/test_bug.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/pypyjit/test_pypy_c/test_bug.py
@@ -0,0 +1,14 @@
+import os, sys, py, subprocess
+localdir = os.path.dirname(os.path.abspath(__file__))
+def test_bug1():
+    if not sys.platform.startswith('linux'):
+        py.test.skip("linux-only test")
+    cmdline = ['taskset', '-c', '0',
+               sys.executable, os.path.join(localdir, 'bug1.py')]
+    popen = subprocess.Popen(cmdline)
+    err = popen.wait()
+    assert err == 0
diff --git a/rpython/jit/backend/llgraph/test/test_llgraph.py b/rpython/jit/backend/llgraph/test/test_llgraph.py
--- a/rpython/jit/backend/llgraph/test/test_llgraph.py
+++ b/rpython/jit/backend/llgraph/test/test_llgraph.py
@@ -15,6 +15,9 @@
     def test_memoryerror(self):
         py.test.skip("does not make much sense on the llgraph backend")
+    def test_call_release_gil_variable_function_and_arguments(self):
+        py.test.skip("the arguments seem not correctly casted")
 def test_cast_adr_to_int_and_back():
     X = lltype.Struct('X', ('foo', lltype.Signed))
diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py
--- a/rpython/jit/backend/llsupport/assembler.py
+++ b/rpython/jit/backend/llsupport/assembler.py
@@ -108,8 +108,7 @@
             self.malloc_slowpath_unicode = None
-        if gc_ll_descr.gcrootmap:
-            self._build_release_gil(gc_ll_descr.gcrootmap)
+        self._build_release_gil(gc_ll_descr.gcrootmap)
         if not self._debug:
             # if self._debug is already set it means that someone called
             # set_debug by hand before initializing the assembler. Leave it
@@ -348,12 +347,19 @@
         if after:
+    @staticmethod
+    def _no_op():
+        pass
     _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
     _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
     def _build_release_gil(self, gcrootmap):
-        if gcrootmap.is_shadow_stack:
+        if gcrootmap is None:
+            releasegil_func = llhelper(self._NOARG_FUNC, self._no_op)
+            reacqgil_func = llhelper(self._NOARG_FUNC, self._no_op)
+        elif gcrootmap.is_shadow_stack:
             releasegil_func = llhelper(self._NOARG_FUNC,
             reacqgil_func = llhelper(self._NOARG_FUNC,
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -2532,6 +2532,219 @@
         assert rffi.charp2strn(buffer, buflen) == cwd
         lltype.free(buffer, flavor='raw')
+    def test_call_release_gil_return_types(self):
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+        cpu = self.cpu
+        for ffitype, result, TP in [
+            (types.ulong,  r_uint(sys.maxint + 10), lltype.Unsigned),
+            (types.slong,  -4321, lltype.Signed),
+            (types.uint8,  200, rffi.UCHAR),
+            (types.sint8,  -42, rffi.SIGNEDCHAR),
+            (types.uint16, 50000, rffi.USHORT),
+            (types.sint16, -20000, rffi.SHORT),
+            (types.uint32, r_uint(3000000000), rffi.UINT),
+            (types.sint32, -2000000000, rffi.INT),
+            (types.uint64, r_ulonglong(9999999999999999999),
+                                                   lltype.UnsignedLongLong),
+            (types.sint64, r_longlong(-999999999999999999),
+                                                   lltype.SignedLongLong),
+            (types.double, 12.3475226, rffi.DOUBLE),
+            (types.float,  r_singlefloat(-592.75), rffi.FLOAT),
+            ]:
+            if sys.maxint < 2**32 and TP in (lltype.SignedLongLong,
+                                             lltype.UnsignedLongLong):
+                if not cpu.supports_longlong:
+                    continue
+            if TP == rffi.DOUBLE:
+                if not cpu.supports_floats:
+                    continue
+            if TP == rffi.FLOAT:
+                if not cpu.supports_singlefloats:
+                    continue
+            #
+            result = rffi.cast(TP, result)
+            #
+            def pseudo_c_function():
+                return result
+            #
+            FPTR = self.Ptr(self.FuncType([], TP))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests([], ffitype)
+            faildescr = BasicFailDescr(1)
+            kind = types.getkind(ffitype)
+            if kind in 'uis':
+                b3 = BoxInt()
+            elif kind in 'fUI':
+                b3 = BoxFloat()
+            else:
+                assert 0, kind
+            #
+            ops = [
+                ResOperation(rop.CALL_RELEASE_GIL, [funcbox], b3,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [b3], None, descr=BasicFinalDescr(0))
+                ]
+            ops[1].setfailargs([])
+            looptoken = JitCellToken()
+            self.cpu.compile_loop([], ops, looptoken)
+            deadframe = self.cpu.execute_token(looptoken)
+            fail = self.cpu.get_latest_descr(deadframe)
+            assert fail.identifier == 0
+            if isinstance(b3, BoxInt):
+                r = self.cpu.get_int_value(deadframe, 0)
+                if isinstance(result, r_singlefloat):
+                    assert -sys.maxint-1 <= r <= 0xFFFFFFFF
+                    r, = struct.unpack("f", struct.pack("I", r & 0xFFFFFFFF))
+                    result = float(result)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+            elif isinstance(b3, BoxFloat):
+                r = self.cpu.get_float_value(deadframe, 0)
+                if isinstance(result, float):
+                    r = longlong.getrealfloat(r)
+                else:
+                    r = rffi.cast(TP, r)
+                assert r == result
+    def test_call_release_gil_variable_function_and_arguments(self):
+        from rpython.rlib.libffi import types
+        from rpython.rlib.rarithmetic import r_uint, r_longlong, r_ulonglong
+        from rpython.rlib.rarithmetic import r_singlefloat
+        cpu = self.cpu
+        rnd = random.Random(525)
+        ALL_TYPES = [
+            (types.ulong,  lltype.Unsigned),
+            (types.slong,  lltype.Signed),
+            (types.uint8,  rffi.UCHAR),
+            (types.sint8,  rffi.SIGNEDCHAR),
+            (types.uint16, rffi.USHORT),
+            (types.sint16, rffi.SHORT),
+            (types.uint32, rffi.UINT),
+            (types.sint32, rffi.INT),
+            ]
+        if sys.maxint < 2**32 and cpu.supports_longlong:
+            ALL_TYPES += [
+                (types.uint64, lltype.UnsignedLongLong),
+                (types.sint64, lltype.SignedLongLong),
+                ] * 2
+        if cpu.supports_floats:
+            ALL_TYPES += [
+                (types.double, rffi.DOUBLE),
+                ] * 4
+        if cpu.supports_singlefloats:
+            ALL_TYPES += [
+                (types.float,  rffi.FLOAT),
+                ] * 4
+        for k in range(100):
+            POSSIBLE_TYPES = [rnd.choice(ALL_TYPES)
+                              for i in range(random.randrange(2, 5))]
+            load_factor = rnd.random()
+            keepalive_factor = rnd.random()
+            #
+            def pseudo_c_function(*args):
+                seen.append(list(args))
+            #
+            ffitypes = []
+            ARGTYPES = []
+            for i in range(rnd.randrange(4, 20)):
+                ffitype, TP = rnd.choice(POSSIBLE_TYPES)
+                ffitypes.append(ffitype)
+                ARGTYPES.append(TP)
+            #
+            FPTR = self.Ptr(self.FuncType(ARGTYPES, lltype.Void))
+            func_ptr = llhelper(FPTR, pseudo_c_function)
+            funcbox = self.get_funcbox(cpu, func_ptr)
+            calldescr = cpu._calldescr_dynamic_for_tests(ffitypes, types.void)
+            faildescr = BasicFailDescr(1)
+            #
+            argboxes = [BoxInt()]   # for the function to call
+            codes = ['X']
+            for ffitype in ffitypes:
+                kind = types.getkind(ffitype)
+                codes.append(kind)
+                if kind in 'uis':
+                    b1 = BoxInt()
+                elif kind in 'fUI':
+                    b1 = BoxFloat()
+                else:
+                    assert 0, kind
+                argboxes.append(b1)
+            codes = ''.join(codes)     # useful for pdb
+            print
+            print codes
+            #
+            argvalues = [funcbox.getint()]
+            for TP in ARGTYPES:
+                r = (rnd.random() - 0.5) * 999999999999.9
+                r = rffi.cast(TP, r)
+                argvalues.append(r)
+            #
+            argvalues_normal = argvalues[:1]
+            for ffitype, r in zip(ffitypes, argvalues[1:]):
+                kind = types.getkind(ffitype)
+                if kind in 'ui':
+                    r = rffi.cast(lltype.Signed, r)
+                elif kind in 's':
+                    r, = struct.unpack("i", struct.pack("f", float(r)))
+                elif kind in 'f':
+                    r = longlong.getfloatstorage(r)
+                elif kind in 'UI':   # 32-bit only
+                    r = rffi.cast(lltype.SignedLongLong, r)
+                else:
+                    assert 0
+                argvalues_normal.append(r)
+            #
+            ops = []
+            loadcodes = []
+            insideboxes = []
+            for b1 in argboxes:
+                load = rnd.random() < load_factor
+                loadcodes.append(' ^'[load])
+                if load:
+                    b2 = b1.clonebox()
+                    ops.insert(rnd.randrange(0, len(ops)+1),
+                               ResOperation(rop.SAME_AS, [b1], b2))
+                    b1 = b2
+                insideboxes.append(b1)
+            loadcodes = ''.join(loadcodes)
+            print loadcodes
+            ops += [
+                ResOperation(rop.CALL_RELEASE_GIL, insideboxes, None,
+                             descr=calldescr),
+                ResOperation(rop.GUARD_NOT_FORCED, [], None, descr=faildescr),
+                ResOperation(rop.FINISH, [], None, descr=BasicFinalDescr(0))
+                ]
+            ops[-2].setfailargs([])
+            # keep alive a random subset of the insideboxes
+            for b1 in insideboxes:
+                if rnd.random() < keepalive_factor:
+                    ops.insert(-1, ResOperation(rop.SAME_AS, [b1],
+                                                b1.clonebox()))
+            looptoken = JitCellToken()
+            self.cpu.compile_loop(argboxes, ops, looptoken)
+            #
+            seen = []
+            deadframe = self.cpu.execute_token(looptoken, *argvalues_normal)
+            fail = self.cpu.get_latest_descr(deadframe)
+            assert fail.identifier == 0
+            expected = argvalues[1:]
+            [got] = seen
+            different_values = ['%r != %r' % (a, b)
+                                    for a, b in zip(got, expected)
+                                        if a != b]
+            assert got == expected, ', '.join(different_values)
     def test_guard_not_invalidated(self):
         cpu = self.cpu
         i0 = BoxInt()
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -40,4 +40,4 @@
     PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
-assert PASS_ON_MY_FRAME >= 11       # asmgcc needs at least JIT_USE_WORDS + 2
+assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -6,7 +6,7 @@
                                                 DEBUG_COUNTER, debug_bridge)
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
-from rpython.jit.metainterp.history import Const, Box
+from rpython.jit.metainterp.history import Const, Box, VOID
 from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
@@ -25,28 +25,17 @@
     RegLoc, FrameLoc, ConstFloatLoc, ImmedLoc, AddressLoc, imm,
     imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
 from rpython.rlib.objectmodel import we_are_translated
-from rpython.jit.backend.x86 import rx86, codebuf
+from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
 from rpython.jit.metainterp.resoperation import rop
 from rpython.jit.backend.x86 import support
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.rlib import rgc
-from rpython.rlib.clibffi import FFI_DEFAULT_ABI
-from rpython.jit.backend.x86.jump import remap_frame_layout
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.codewriter import longlong
 from rpython.rlib.rarithmetic import intmask, r_uint
 from rpython.rlib.objectmodel import compute_unique_id
-# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
-# better safe than sorry
-def align_stack_words(words):
-    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
 class Assembler386(BaseAssembler):
     _regalloc = None
     _output_loop_log = None
@@ -131,10 +120,10 @@
             mc.MOV_rs(esi.value, WORD*2)
             # push first arg
             mc.MOV_rr(edi.value, ebp.value)
-            align = align_stack_words(1)
+            align = callbuilder.align_stack_words(1)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
-            align = align_stack_words(3)
+            align = callbuilder.align_stack_words(3)
             mc.MOV_rs(eax.value, WORD * 2)
             mc.SUB_ri(esp.value, (align - 1) * WORD)
             mc.MOV_sr(WORD, eax.value)
@@ -1014,175 +1003,24 @@
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
         return bool(gcrootmap) and not gcrootmap.is_shadow_stack
-    def _emit_call(self, x, arglocs, start=0, tmp=eax,
-                   argtypes=None, callconv=FFI_DEFAULT_ABI,
-                   # whether to worry about a CALL that can collect; this
-                   # is always true except in call_release_gil
-                   can_collect=True,
-                   # max number of arguments we can pass on esp; if more,
-                   # we need to decrease esp temporarily
-                   stack_max=PASS_ON_MY_FRAME):
-        #
-        if IS_X86_64:
-            return self._emit_call_64(x, arglocs, start, argtypes,
-                                      can_collect, stack_max)
-        stack_depth = 0
-        n = len(arglocs)
-        for i in range(start, n):
-            loc = arglocs[i]
-            stack_depth += loc.get_width() // WORD
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            self.mc.SUB_ri(esp.value, align * WORD)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
+    def simple_call(self, fnloc, arglocs, result_loc=eax):
+        if result_loc is xmm0:
+            result_type = FLOAT
+            result_size = 8
+        elif result_loc is None:
+            result_type = VOID
+            result_size = 0
-            align = 0
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if isinstance(loc, RegLoc):
-                if loc.is_xmm:
-                    self.mc.MOVSD_sx(p, loc.value)
-                else:
-                    self.mc.MOV_sr(p, loc.value)
-            p += loc.get_width()
-        p = 0
-        for i in range(start, n):
-            loc = arglocs[i]
-            if not isinstance(loc, RegLoc):
-                if loc.get_width() == 8:
-                    self.mc.MOVSD(xmm0, loc)
-                    self.mc.MOVSD_sx(p, xmm0.value)
-                else:
-                    self.mc.MOV(tmp, loc)
-                    self.mc.MOV_sr(p, tmp.value)
-            p += loc.get_width()
-        # x is a location
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if callconv != FFI_DEFAULT_ABI:
-            self._fix_stdcall(callconv, p - align * WORD)
-        elif align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+            result_type = INT
+            result_size = WORD
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs,
+                                     result_loc, result_type,
+                                     result_size)
+        cb.emit()
-    def _fix_stdcall(self, callconv, p):
-        from rpython.rlib.clibffi import FFI_STDCALL
-        assert callconv == FFI_STDCALL
-        # it's a bit stupid, but we're just going to cancel the fact that
-        # the called function just added 'p' to ESP, by subtracting it again.
-        self.mc.SUB_ri(esp.value, p)
-    def _emit_call_64(self, x, arglocs, start, argtypes,
-                      can_collect, stack_max):
-        src_locs = []
-        dst_locs = []
-        xmm_src_locs = []
-        xmm_dst_locs = []
-        singlefloats = None
-        # In reverse order for use with pop()
-        unused_gpr = [r9, r8, ecx, edx, esi, edi]
-        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
-        on_stack = 0
-        # count the stack depth
-        floats = 0
-        for i in range(start, len(arglocs)):
-            arg = arglocs[i]
-            if arg.is_float() or argtypes and argtypes[i - start] == 'S':
-                floats += 1
-        all_args = len(arglocs) - start
-        stack_depth = (max(all_args - floats - len(unused_gpr), 0) +
-                       max(floats - len(unused_xmm), 0))
-        align = 0
-        if stack_depth > stack_max:
-            align = align_stack_words(stack_depth - stack_max)
-            if can_collect:
-                self.set_extra_stack_depth(self.mc, align * WORD)
-            self.mc.SUB_ri(esp.value, align * WORD)
-        for i in range(start, len(arglocs)):
-            loc = arglocs[i]
-            if loc.is_float():
-                xmm_src_locs.append(loc)
-                if len(unused_xmm) > 0:
-                    xmm_dst_locs.append(unused_xmm.pop())
-                else:
-                    xmm_dst_locs.append(RawEspLoc(on_stack * WORD, FLOAT))
-                    on_stack += 1
-            elif argtypes is not None and argtypes[i-start] == 'S':
-                # Singlefloat argument
-                if singlefloats is None:
-                    singlefloats = []
-                if len(unused_xmm) > 0:
-                    singlefloats.append((loc, unused_xmm.pop()))
-                else:
-                    singlefloats.append((loc, RawEspLoc(on_stack * WORD, INT)))
-                    on_stack += 1
-            else:
-                src_locs.append(loc)
-                if len(unused_gpr) > 0:
-                    dst_locs.append(unused_gpr.pop())
-                else:
-                    dst_locs.append(RawEspLoc(on_stack * WORD, INT))
-                    on_stack += 1
-        # Handle register arguments: first remap the xmm arguments
-        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
-                           X86_64_XMM_SCRATCH_REG)
-        # Load the singlefloat arguments from main regs or stack to xmm regs
-        if singlefloats is not None:
-            for src, dst in singlefloats:
-                if isinstance(dst, RawEspLoc):
-                    # XXX too much special logic
-                    if isinstance(src, RawEbpLoc):
-                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
-                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
-                    else:
-                        self.mc.MOV32(dst, src)
-                    continue
-                if isinstance(src, ImmedLoc):
-                    self.mc.MOV(X86_64_SCRATCH_REG, src)
-                    src = X86_64_SCRATCH_REG
-                self.mc.MOVD(dst, src)
-        # Finally remap the arguments in the main regs
-        # If x is a register and is in dst_locs, then oups, it needs to
-        # be moved away:
-        if x in dst_locs:
-            src_locs.append(x)
-            dst_locs.append(r10)
-            x = r10
-        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
-        if can_collect:
-            # we push *now* the gcmap, describing the status of GC registers
-            # after the rearrangements done just above, ignoring the return
-            # value eax, if necessary
-            noregs = self.cpu.gc_ll_descr.is_shadow_stack()
-            gcmap = self._regalloc.get_gcmap([eax], noregs=noregs)
-            self.push_gcmap(self.mc, gcmap, store=True)
-        #
-        self.mc.CALL(x)
-        if align:
-            self.mc.ADD_ri(esp.value, align * WORD)
-        #
-        if can_collect:
-            self._reload_frame_if_necessary(self.mc)
-            if align:
-                self.set_extra_stack_depth(self.mc, 0)
-            self.pop_gcmap(self.mc)
+    def simple_call_no_collect(self, fnloc, arglocs):
+        cb = callbuilder.CallBuilder(self, fnloc, arglocs)
+        cb.emit_no_collect()
     def _reload_frame_if_necessary(self, mc, align_stack=False):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
@@ -1198,10 +1036,6 @@
             self._write_barrier_fastpath(mc, wbdescr, [ebp], array=False,
                                          is_frame=True, align_stack=align_stack)
-    def call(self, addr, args, res):
-        self._emit_call(imm(addr), args)
-        assert res is eax
     genop_int_neg = _unaryop("NEG")
     genop_int_invert = _unaryop("NOT")
     genop_int_add = _binaryop_or_lea("ADD", True)
@@ -1446,7 +1280,7 @@
     # ----------
     def genop_call_malloc_gc(self, op, arglocs, result_loc):
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
     def propagate_memoryerror_if_eax_is_null(self):
@@ -1993,75 +1827,29 @@
     def genop_call(self, op, arglocs, resloc):
-        return self._genop_call(op, arglocs, resloc)
+        self._genop_call(op, arglocs, resloc)
     def _genop_call(self, op, arglocs, resloc, is_call_release_gil=False):
         from rpython.jit.backend.llsupport.descr import CallDescr
-        sizeloc = arglocs[0]
-        assert isinstance(sizeloc, ImmedLoc)
-        size = sizeloc.value
-        signloc = arglocs[1]
-        x = arglocs[2]     # the function address
-        if x is eax:
-            tmp = ecx
-        else:
-            tmp = eax
+        cb = callbuilder.CallBuilder(self, arglocs[2], arglocs[3:], resloc)
         descr = op.getdescr()
         assert isinstance(descr, CallDescr)
+        cb.callconv = descr.get_call_conv()
+        cb.argtypes = descr.get_arg_types()
+        cb.restype  = descr.get_result_type()
+        sizeloc = arglocs[0]
+        assert isinstance(sizeloc, ImmedLoc)
+        cb.ressize = sizeloc.value
+        signloc = arglocs[1]
+        assert isinstance(signloc, ImmedLoc)
+        cb.ressign = signloc.value
-        stack_max = PASS_ON_MY_FRAME
         if is_call_release_gil:
-            if self._is_asmgcc():
-                from rpython.memory.gctransform import asmgcroot
-                stack_max -= asmgcroot.JIT_USE_WORDS
-            can_collect = False
+            cb.emit_call_release_gil()
-            can_collect = True
-        self._emit_call(x, arglocs, 3, tmp=tmp,
-                        argtypes=descr.get_arg_types(),
-                        callconv=descr.get_call_conv(),
-                        can_collect=can_collect,
-                        stack_max=stack_max)
-        if IS_X86_32 and isinstance(resloc, FrameLoc) and resloc.type == FLOAT:
-            # a float or a long long return
-            if descr.get_result_type() == 'L':
-                self.mc.MOV_br(resloc.value, eax.value)      # long long
-                self.mc.MOV_br(resloc.value + 4, edx.value)
-                # XXX should ideally not move the result on the stack,
-                #     but it's a mess to load eax/edx into a xmm register
-                #     and this way is simpler also because the result loc
-                #     can just be always a stack location
-            else:
-                self.mc.FSTPL_b(resloc.value)   # float return
-        elif descr.get_result_type() == 'S':
-            # singlefloat return
-            assert resloc is eax
-            if IS_X86_32:
-                # must convert ST(0) to a 32-bit singlefloat and load it into EAX
-                # mess mess mess
-                self.mc.SUB_ri(esp.value, 4)
-                self.mc.FSTPS_s(0)
-                self.mc.POP_r(eax.value)
-            elif IS_X86_64:
-                # must copy from the lower 32 bits of XMM0 into eax
-                self.mc.MOVD_rx(eax.value, xmm0.value)
-        elif size == WORD:
-            assert resloc is eax or resloc is xmm0    # a full word
-        elif size == 0:
-            pass    # void return
-        else:
-            # use the code in load_from_mem to do the zero- or sign-extension
-            assert resloc is eax
-            if size == 1:
-                srcloc = eax.lowest8bits()
-            else:
-                srcloc = eax
-            self.load_from_mem(eax, srcloc, sizeloc, signloc)
+            cb.emit()
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
@@ -2077,64 +1865,15 @@
     def genop_guard_call_may_force(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
     def genop_guard_call_release_gil(self, op, guard_op, guard_token,
                                      arglocs, result_loc):
-        # first, close the stack in the sense of the asmgcc GC root tracker
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap:
-            # we put the gcmap now into the frame before releasing the GIL,
-            # and pop it below after reacquiring the GIL.  The assumption
-            # is that this gcmap describes correctly the situation at any
-            # point in-between: all values containing GC pointers should
-            # be safely saved out of registers by now, and will not be
-            # manipulated by any of the following CALLs.
-            gcmap = self._regalloc.get_gcmap(noregs=True)
-            self.push_gcmap(self.mc, gcmap, store=True)
-            self.call_release_gil(gcrootmap, arglocs)
-        # do the call
         self._genop_call(op, arglocs, result_loc, is_call_release_gil=True)
-        # then reopen the stack
-        if gcrootmap:
-            self.call_reacquire_gil(gcrootmap, result_loc)
-            self.pop_gcmap(self.mc)     # remove the gcmap saved above
-        # finally, the guard_not_forced
-    def call_release_gil(self, gcrootmap, save_registers):
-        if gcrootmap.is_shadow_stack:
-            args = []
-        else:
-            from rpython.memory.gctransform import asmgcroot
-            # build a 'css' structure on the stack: 2 words for the linkage,
-            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
-            # total size of JIT_USE_WORDS.  This structure is found at
-            # [ESP+css].
-            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
-            assert css >= 2
-            # Save ebp
-            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
-            # Save the "return address": we pretend that it's css
-            if IS_X86_32:
-                reg = eax
-            elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
-            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
-            # Set up jf_extra_stack_depth to pretend that the return address
-            # was at css, and so our stack frame is supposedly shorter by
-            # (css+WORD) bytes
-            self.set_extra_stack_depth(self.mc, -css-WORD)
-            # Call the closestack() function (also releasing the GIL)
-            args = [reg]
-        #
-        self._emit_call(imm(self.releasegil_addr), args, can_collect=False)
     def call_reacquire_gil(self, gcrootmap, save_loc):
         # save the previous result (eax/xmm0) into the stack temporarily.
         # XXX like with call_release_gil(), we assume that we don't need
@@ -2186,11 +1925,11 @@
         self.call_assembler(op, guard_op, argloc, vloc, result_loc, eax)
-    def _call_assembler_emit_call(self, addr, argloc, tmploc):
-        self._emit_call(addr, [argloc], 0, tmp=tmploc)
+    def _call_assembler_emit_call(self, addr, argloc, _):
+        self.simple_call(addr, [argloc])
-    def _call_assembler_emit_helper_call(self, addr, arglocs, _):
-         self._emit_call(addr, arglocs, 0, tmp=self._second_tmp_reg)
+    def _call_assembler_emit_helper_call(self, addr, arglocs, result_loc):
+        self.simple_call(addr, arglocs, result_loc)
     def _call_assembler_check_descr(self, value, tmploc):
         ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -0,0 +1,577 @@
+from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.metainterp.history import INT, FLOAT
+from rpython.jit.backend.x86.arch import (WORD, IS_X86_64, IS_X86_32,
+                                          PASS_ON_MY_FRAME)
+from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
+    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
+    r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
+    RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
+from rpython.jit.backend.x86.jump import remap_frame_layout
+# darwin requires the stack to be 16 bytes aligned on calls.
+# Same for gcc 4.5.0, better safe than sorry
+def align_stack_words(words):
+    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
+class AbstractCallBuilder(object):
+    # max number of words we have room in esp; if we need more for
+    # arguments, we need to decrease esp temporarily
+    stack_max = PASS_ON_MY_FRAME
+    # this can be set to guide more complex calls: gives the detailed
+    # type of the arguments
+    argtypes = ""
+    ressign = False
+    # this is the calling convention (can be FFI_STDCALL on Windows)
+    callconv = FFI_DEFAULT_ABI
+    # is it for the main CALL of a call_release_gil?
+    is_call_release_gil = False
+    # set by save_result_value()
+    tmpresloc = None
+    def __init__(self, assembler, fnloc, arglocs,
+                 resloc=eax, restype=INT, ressize=WORD):
+        # Avoid tons of issues with a non-immediate fnloc by sticking it
+        # as an extra argument if needed
+        self.fnloc_is_immediate = isinstance(fnloc, ImmedLoc)
+        if self.fnloc_is_immediate:
+            self.fnloc = fnloc
+            self.arglocs = arglocs
+        else:
+            self.arglocs = arglocs + [fnloc]
+        self.asm = assembler
+        self.mc = assembler.mc
+        self.resloc = resloc
+        self.restype = restype
+        self.ressize = ressize
+        self.current_esp = 0     # 0 or (usually) negative, counted in bytes
+    def emit_no_collect(self):
+        """Emit a call that cannot collect."""
+        self.prepare_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.load_result()
+    def emit(self):
+        """Emit a regular call; not for CALL_RELEASE_GIL."""
+        self.prepare_arguments()
+        self.push_gcmap()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.pop_gcmap()
+        self.load_result()
+    def emit_call_release_gil(self):
+        """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
+        and reacqgil_addr."""
+        self.select_call_release_gil_mode()
+        self.prepare_arguments()
+        self.push_gcmap_for_call_release_gil()
+        self.call_releasegil_addr_and_move_real_arguments()
+        self.emit_raw_call()
+        self.restore_esp()
+        self.move_real_result_and_call_reacqgil_addr()
+        self.pop_gcmap()
+        self.load_result()
+    def select_call_release_gil_mode(self):
+        """Overridden in CallBuilder64"""
+        self.is_call_release_gil = True
+        if self.asm._is_asmgcc():
+            from rpython.memory.gctransform import asmgcroot
+            self.stack_max = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS
+            assert self.stack_max >= 3
+    def emit_raw_call(self):
+        self.mc.CALL(self.fnloc)
+        if self.callconv != FFI_DEFAULT_ABI:
+            self.current_esp += self._fix_stdcall(self.callconv)
+    def subtract_esp_aligned(self, count):
+        if count > 0:
+            align = align_stack_words(count)
+            self.current_esp -= align * WORD
+            self.mc.SUB_ri(esp.value, align * WORD)
+    def restore_esp(self, target_esp=0):
+        if self.current_esp != target_esp:
+            self.mc.ADD_ri(esp.value, target_esp - self.current_esp)
+            self.current_esp = target_esp
+    def load_result(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        if self.ressize == 0:
+            return      # void result
+        # use the code in load_from_mem to do the zero- or sign-extension
+        srcloc = self.tmpresloc
+        if srcloc is None:
+            if self.restype == FLOAT:
+                srcloc = xmm0
+            else:
+                srcloc = eax
+        if self.ressize >= WORD and self.resloc is srcloc:
+            return      # no need for any MOV
+        if self.ressize == 1 and isinstance(srcloc, RegLoc):
+            srcloc = srcloc.lowest8bits()
+        self.asm.load_from_mem(self.resloc, srcloc,
+                               imm(self.ressize), imm(self.ressign))
+    def push_gcmap(self):
+        # we push *now* the gcmap, describing the status of GC registers
+        # after the rearrangements done just before, ignoring the return
+        # value eax, if necessary
+        assert not self.is_call_release_gil
+        self.change_extra_stack_depth = (self.current_esp != 0)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, -self.current_esp)
+        noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+        gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+    def push_gcmap_for_call_release_gil(self):
+        assert self.is_call_release_gil
+        # we put the gcmap now into the frame before releasing the GIL,
+        # and pop it after reacquiring the GIL.  The assumption
+        # is that this gcmap describes correctly the situation at any
+        # point in-between: all values containing GC pointers should
+        # be safely saved out of registers by now, and will not be
+        # manipulated by any of the following CALLs.
+        gcmap = self.asm._regalloc.get_gcmap(noregs=True)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+    def pop_gcmap(self):
+        self.asm._reload_frame_if_necessary(self.mc)
+        if self.change_extra_stack_depth:
+            self.asm.set_extra_stack_depth(self.mc, 0)
+        self.asm.pop_gcmap(self.mc)
+    def call_releasegil_addr_and_move_real_arguments(self):
+        initial_esp = self.current_esp
+        self.save_register_arguments()
+        #
+        if not self.asm._is_asmgcc():
+            # the helper takes no argument
+            self.change_extra_stack_depth = False
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            # build a 'css' structure on the stack: 2 words for the linkage,
+            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
+            # total size of JIT_USE_WORDS.  This structure is found at
+            # [ESP+css].
+            css = -self.current_esp + (
+                WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS))
+            assert css >= 2 * WORD
+            # Save ebp
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
+            # Save the "return address": we pretend that it's css
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
+            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
+            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
+            # Set up jf_extra_stack_depth to pretend that the return address
+            # was at css, and so our stack frame is supposedly shorter by
+            # (PASS_ON_MY_FRAME-JIT_USE_WORDS+1) words
+            delta = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS + 1
+            self.change_extra_stack_depth = True
+            self.asm.set_extra_stack_depth(self.mc, -delta * WORD)
+            # Call the closestack() function (also releasing the GIL)
+            # with 'reg' as argument
+            if IS_X86_32:
+                self.subtract_esp_aligned(1)
+                self.mc.MOV_sr(0, reg.value)
+            #else:
+            #   on x86_64, reg is edi so that it is already correct
+        #
+        self.mc.CALL(imm(self.asm.releasegil_addr))
+        #
+        if not we_are_translated():        # for testing: we should not access
+            self.mc.ADD(ebp, imm(1))       # ebp any more
+        #
+        self.restore_register_arguments()
+        self.restore_esp(initial_esp)
+    def save_register_arguments(self):
+        """Overridden in CallBuilder64"""
+    def restore_register_arguments(self):
+        """Overridden in CallBuilder64"""
+    def move_real_result_and_call_reacqgil_addr(self):
+        # save the result we just got (in eax/eax+edx/st(0)/xmm0)
+        self.save_result_value()
+        # call the reopenstack() function (also reacquiring the GIL)
+        if not self.asm._is_asmgcc():
+            css = 0     # the helper takes no argument
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)
+            if IS_X86_32:
+                self.mc.MOV_sr(0, reg.value)
+        #
+        self.mc.CALL(imm(self.asm.reacqgil_addr))
+        #
+        if not we_are_translated():        # for testing: now we can access
+            self.mc.SUB(ebp, imm(1))       # ebp again
+        #
+        # Now that we reacquired the GIL, we can reload a possibly modified ebp
+        if self.asm._is_asmgcc():
+            # special-case: reload ebp from the css
+            from rpython.memory.gctransform import asmgcroot
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_rs(ebp.value, index_of_ebp)  # MOV EBP, [css.ebp]
+        #else:
+        #   for shadowstack, done for us by _reload_frame_if_necessary()
+    def save_result_value(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        raise NotImplementedError
+class CallBuilder32(AbstractCallBuilder):
+    def prepare_arguments(self):
+        arglocs = self.arglocs
+        stack_depth = 0
+        n = len(arglocs)
+        for i in range(n):
+            loc = arglocs[i]
+            stack_depth += loc.get_width() // WORD
+        self.subtract_esp_aligned(stack_depth - self.stack_max)
+        #
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if isinstance(loc, RegLoc):
+                if loc.is_xmm:
+                    self.mc.MOVSD_sx(p, loc.value)
+                else:
+                    self.mc.MOV_sr(p, loc.value)
+            p += loc.get_width()
+        p = 0
+        for i in range(n):
+            loc = arglocs[i]
+            if not isinstance(loc, RegLoc):
+                if loc.get_width() == 8:
+                    self.mc.MOVSD(xmm0, loc)
+                    self.mc.MOVSD_sx(p, xmm0.value)
+                elif isinstance(loc, ImmedLoc):
+                    self.mc.MOV_si(p, loc.value)
+                else:
+                    self.mc.MOV(eax, loc)
+                    self.mc.MOV_sr(p, eax.value)
+            p += loc.get_width()
+        self.total_stack_used_by_arguments = p
+        #
+        if not self.fnloc_is_immediate:    # the last "argument" pushed above
+            self.fnloc = RawEspLoc(p - WORD, INT)
+    def _fix_stdcall(self, callconv):
+        from rpython.rlib.clibffi import FFI_STDCALL
+        assert callconv == FFI_STDCALL
+        return self.total_stack_used_by_arguments
+    def load_result(self):
+        resloc = self.resloc
+        if resloc is not None and resloc.is_float():
+            # a float or a long long return
+            if self.tmpresloc is None:
+                if self.restype == 'L':     # long long
+                    # move eax/edx -> xmm0
+                    self.mc.MOVD_xr(resloc.value^1, edx.value)
+                    self.mc.MOVD_xr(resloc.value,   eax.value)
+                    self.mc.PUNPCKLDQ_xx(resloc.value, resloc.value^1)
+                else:
+                    # float: we have to go via the stack
+                    self.mc.FSTPL_s(0)
+                    self.mc.MOVSD_xs(resloc.value, 0)
+            else:
+                self.mc.MOVSD(resloc, self.tmpresloc)
+            #
+        elif self.restype == 'S':
+            # singlefloat return: must convert ST(0) to a 32-bit singlefloat
+            # and load it into self.resloc.  mess mess mess
+            if self.tmpresloc is None:
+                self.mc.FSTPS_s(0)
+                self.mc.MOV_rs(resloc.value, 0)
+            else:
+                self.mc.MOV(resloc, self.tmpresloc)
+        else:
+            AbstractCallBuilder.load_result(self)
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP+4].  We use "+4"
+        # in order to leave the word at [ESP+0] free, in case it's needed
+        if self.ressize == 0:      # void return
+            return
+        if self.resloc.is_float():
+            # a float or a long long return
+            self.tmpresloc = RawEspLoc(4, FLOAT)
+            if self.restype == 'L':
+                self.mc.MOV_sr(4, eax.value)      # long long
+                self.mc.MOV_sr(8, edx.value)
+            else:
+                self.mc.FSTPL_s(4)                # float return
+        else:
+            self.tmpresloc = RawEspLoc(4, INT)
+            if self.restype == 'S':
+                self.mc.FSTPS_s(4)
+            else:
+                assert self.restype == INT
+                assert self.ressize <= WORD
+                self.mc.MOV_sr(4, eax.value)
+class CallBuilder64(AbstractCallBuilder):
+    ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
+    ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
+    DONT_MOVE_GPR = []
+    _ALL_CALLEE_SAVE_GPR = [ebx, r12, r13, r14, r15]
+    next_arg_gpr = 0
+    next_arg_xmm = 0
+    def _unused_gpr(self, hint):
+        i = self.next_arg_gpr
+        self.next_arg_gpr = i + 1
+        try:
+            res = self.ARGUMENTS_GPR[i]
+        except IndexError:
+            return None
+        if hint in self.DONT_MOVE_GPR:
+            self.ARGUMENTS_GPR[i] = hint
+            res = hint
+        return res
+    def _unused_xmm(self):
+        i = self.next_arg_xmm
+        self.next_arg_xmm = i + 1
+        try:
+            return self.ARGUMENTS_XMM[i]
+        except IndexError:
+            return None
+    def _permute_to_prefer_unused_registers(self, lst):
+        # permute 'lst' so that it starts with registers that are not
+        # in 'self.already_used', and ends with registers that are.
+        N = len(lst)
+        i = 0
+        while i < N:
+            reg = lst[i]
+            if reg in self.already_used:
+                # move this reg to the end, and decrement N
+                N -= 1
+                assert N >= i
+                lst[N], lst[i] = lst[i], lst[N]
+            else:
+                i += 1
+    def select_call_release_gil_mode(self):
+        AbstractCallBuilder.select_call_release_gil_mode(self)
+        # We have to copy the arguments around a bit more in this mode,
+        # but on the other hand we don't need prepare_arguments() moving
+        # them in precisely the final registers.  Here we look around for
+        # unused registers that may be more likely usable.
+        from rpython.jit.backend.x86.regalloc import X86_64_RegisterManager
+        from rpython.jit.backend.x86.regalloc import X86_64_XMMRegisterManager
+        self.already_used = {}
+        for loc in self.arglocs:
+            self.already_used[loc] = None
+        #
+        lst = X86_64_RegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        # <optimization>
+        extra = []
+        for reg in self.asm._regalloc.rm.free_regs:
+            if (reg not in self.already_used and
+                    reg in self._ALL_CALLEE_SAVE_GPR):
+                extra.append(reg)
+        self.free_callee_save_gprs = extra
+        lst = extra + lst
+        # </optimization>
+        self.ARGUMENTS_GPR = lst[:len(self.ARGUMENTS_GPR)]
+        self.DONT_MOVE_GPR = self._ALL_CALLEE_SAVE_GPR
+        #
+        lst = X86_64_XMMRegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        self.ARGUMENTS_XMM = lst[:len(self.ARGUMENTS_XMM)]
+    def prepare_arguments(self):
+        src_locs = []
+        dst_locs = []
+        xmm_src_locs = []
+        xmm_dst_locs = []
+        singlefloats = None
+        arglocs = self.arglocs
+        argtypes = self.argtypes
+        on_stack = 0
+        for i in range(len(arglocs)):
+            loc = arglocs[i]
+            if loc.is_float():
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, FLOAT)
+                    on_stack += 1
+                xmm_src_locs.append(loc)
+                xmm_dst_locs.append(tgt)
+            elif i < len(argtypes) and argtypes[i] == 'S':
+                # Singlefloat argument
+                if singlefloats is None:
+                    singlefloats = []
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                singlefloats.append((loc, tgt))
+            else:
+                tgt = self._unused_gpr(hint=loc)
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
+                src_locs.append(loc)
+                dst_locs.append(tgt)
+        if not self.fnloc_is_immediate:
+            self.fnloc = dst_locs[-1]     # the last "argument" prepared above
+        if not we_are_translated():  # assert that we got the right stack depth
+            floats = 0
+            for i in range(len(arglocs)):
+                arg = arglocs[i]
+                if arg.is_float() or (i < len(argtypes) and argtypes[i]=='S'):
+                    floats += 1
+            all_args = len(arglocs)
+            stack_depth = (max(all_args - floats - len(self.ARGUMENTS_GPR), 0)
+                           + max(floats - len(self.ARGUMENTS_XMM), 0))
+            assert stack_depth == on_stack
+        self.subtract_esp_aligned(on_stack - self.stack_max)
+        # Handle register arguments: first remap the xmm arguments
+        remap_frame_layout(self.asm, xmm_src_locs, xmm_dst_locs,
+                           X86_64_XMM_SCRATCH_REG)
+        # Load the singlefloat arguments from main regs or stack to xmm regs
+        if singlefloats is not None:
+            for src, dst in singlefloats:
+                if isinstance(dst, RawEspLoc):
+                    # XXX too much special logic
+                    if isinstance(src, RawEbpLoc):
+                        self.mc.MOV32(X86_64_SCRATCH_REG, src)
+                        self.mc.MOV32(dst, X86_64_SCRATCH_REG)
+                    else:
+                        self.mc.MOV32(dst, src)
+                    continue
+                if isinstance(src, ImmedLoc):
+                    self.mc.MOV(X86_64_SCRATCH_REG, src)
+                    src = X86_64_SCRATCH_REG
+                self.mc.MOVD(dst, src)
+        # Finally remap the arguments in the main regs
+        remap_frame_layout(self.asm, src_locs, dst_locs, X86_64_SCRATCH_REG)
+    def _fix_stdcall(self, callconv):
+        assert 0     # should not occur on 64-bit
+    def load_result(self):
+        if self.restype == 'S' and self.tmpresloc is None:
+            # singlefloat return: use MOVD to load the target register
+            # from the lower 32 bits of XMM0
+            self.mc.MOVD(self.resloc, xmm0)
+        else:
+            AbstractCallBuilder.load_result(self)
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP].
+        if self.ressize == 0:      # void return
+            return
+        #
+        if self.restype == FLOAT:    # and not 'S'
+            self.mc.MOVSD_sx(0, xmm0.value)
+            self.tmpresloc = RawEspLoc(0, FLOAT)
+            return
+        #
+        if len(self.free_callee_save_gprs) == 0:
+            self.tmpresloc = RawEspLoc(0, INT)
+        else:
+            self.tmpresloc = self.free_callee_save_gprs[0]
+        #
+        if self.restype == 'S':
+            # singlefloat return: use MOVD to store the lower 32 bits
+            # of XMM0 into the tmpresloc (register or [ESP])
+            self.mc.MOVD(self.tmpresloc, xmm0)
+        else:
+            assert self.restype == INT
+            self.mc.MOV(self.tmpresloc, eax)
+    def save_register_arguments(self):
+        # Save the argument registers, which are given by self.ARGUMENTS_xxx.
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        n_saved_regs = n_gpr + n_xmm
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] in self._ALL_CALLEE_SAVE_GPR:
+                n_saved_regs -= 1     # don't need to save it
+        self.subtract_esp_aligned(n_saved_regs)
+        #
+        n = 0
+        for i in range(n_gpr):
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_sr(n * WORD, self.ARGUMENTS_GPR[i].value)
+                n += 1
+        for i in range(n_xmm):
+            self.mc.MOVSD_sx(n * WORD, self.ARGUMENTS_XMM[i].value)
+            n += 1
+        assert n == n_saved_regs
+        self.n_saved_regs = n_saved_regs
+    def restore_register_arguments(self):
+        # Restore the saved values into the *real* registers used for calls
+        # --- which are not self.ARGUMENTS_xxx!
+        n_gpr = min(self.next_arg_gpr, len(self.ARGUMENTS_GPR))
+        n_xmm = min(self.next_arg_xmm, len(self.ARGUMENTS_XMM))
+        #
+        n = 0
+        for i in range(n_gpr):
+            tgtvalue = CallBuilder64.ARGUMENTS_GPR[i].value
+            if self.ARGUMENTS_GPR[i] not in self._ALL_CALLEE_SAVE_GPR:
+                self.mc.MOV_rs(tgtvalue, n * WORD)
+                n += 1
+            else:
+                self.mc.MOV_rr(tgtvalue, self.ARGUMENTS_GPR[i].value)
+        for i in range(n_xmm):
+            self.mc.MOVSD_xs(CallBuilder64.ARGUMENTS_XMM[i].value, n * WORD)
+            n += 1
+        assert n == self.n_saved_regs
+        #
+        if isinstance(self.fnloc, RegLoc):    # fix this register
+            self.fnloc = CallBuilder64.ARGUMENTS_GPR[n_gpr - 1]
+if IS_X86_32:
+    CallBuilder = CallBuilder32
+if IS_X86_64:
+    CallBuilder = CallBuilder64
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -79,26 +79,14 @@
         rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = y
         return ConstFloatLoc(adr)
-    def after_call(self, v):
-        # the result is stored in st0, but we don't have this around,
-        # so genop_call will move it to some frame location immediately
-        # after the call
-        return self.frame_manager.loc(v)
+    def call_result_location(self, v):
+        return xmm0
 class X86_64_XMMRegisterManager(X86XMMRegisterManager):
     # xmm15 reserved for scratch use
     all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14]
     save_around_call_regs = all_regs
-    def call_result_location(self, v):
-        return xmm0
-    def after_call(self, v):
-        # We use RegisterManager's implementation, since X86XMMRegisterManager
-        # places the result on the stack, which we don't need to do when the
-        # calling convention places the result in xmm0
-        return RegisterManager.after_call(self, v)
 class X86FrameManager(FrameManager):
     def __init__(self, base_ofs):
@@ -799,14 +787,6 @@
         self._consider_call(op, guard_op)
     def consider_call_release_gil(self, op, guard_op):
-        # We spill the arguments to the stack, because we need to do 3 calls:
-        # call_release_gil(), the_real_c_function(), and call_reacquire_gil().
-        # The arguments are used on the second call only.  XXX we assume
-        # that the XMM arguments won't be modified by call_release_gil().
-        for i in range(op.numargs()):
-            loc = self.loc(op.getarg(i))
-            if loc in self.rm.save_around_call_regs:
-                self.rm.force_spill_var(op.getarg(i))
         assert guard_op is not None
         self._consider_call(op, guard_op)
@@ -1151,9 +1131,8 @@
         # call memcpy()
-        self.assembler._emit_call(imm(self.assembler.memcpy_addr),
-                                  [dstaddr_loc, srcaddr_loc, length_loc],
-                                  can_collect=False)
+        self.assembler.simple_call_no_collect(imm(self.assembler.memcpy_addr),
+                                        [dstaddr_loc, srcaddr_loc, length_loc])
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -553,6 +553,7 @@
     CALL_l = insn('\xE8', relative(1))
     CALL_r = insn(rex_nw, '\xFF', register(1), chr(0xC0 | (2<<3)))
     CALL_b = insn('\xFF', orbyte(2<<3), stack_bp(1))
+    CALL_s = insn('\xFF', orbyte(2<<3), stack_sp(1))
     # XXX: Only here for testing purposes..."as" happens to encode the
     # registers in the opposite order that we would otherwise do in a
@@ -583,6 +584,7 @@
     # x87 instructions
     FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPL_s = insn('\xDD', orbyte(3<<3), stack_sp(1)) # rffi.DOUBLE ('as' wants L??)
     FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
     # ------------------------------ Random mess -----------------------

More information about the pypy-commit mailing list