[pypy-commit] pypy emit-call-x86: in-progress

Author: Armin Rigo <arigo at tunes.org>
Branch: emit-call-x86
Changeset: r64328:fb8653a29037
Date: 2013-05-19 17:04 +0200
http://bitbucket.org/pypy/pypy/changeset/fb8653a29037/

Log:	in-progress
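
This changeset moves the GIL release/reacquire machinery for
CALL_RELEASE_GIL out of the x86 assembler and into the CallBuilder
classes: argument shuffling, the gcmap push/pop, the calls to the
releasegil/reacqgil helpers and the saving of the raw result are now
all emitted by one emit_call_release_gil() sequence.  As a conceptual
sketch only (plain Python, not part of the commit; the helper names
stand in for releasegil_addr/reacqgil_addr), the semantics being
implemented in machine code are:

    def call_with_gil_released(release_gil, reacquire_gil, func, *args):
        # every GC pointer must already be spilled to the jitframe and
        # described by a gcmap before the GIL is dropped; see
        # push_gcmap_for_call_release_gil() below
        release_gil()
        res = func(*args)        # the external call runs without the GIL
        reacquire_gil()          # the raw result is saved around this call
        return res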

diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py
--- a/rpython/jit/backend/llsupport/assembler.py
+++ b/rpython/jit/backend/llsupport/assembler.py
@@ -108,8 +108,7 @@
             self.malloc_slowpath_unicode = None
 
         self._build_stack_check_slowpath()
-        if gc_ll_descr.gcrootmap:
-            self._build_release_gil(gc_ll_descr.gcrootmap)
+        self._build_release_gil(gc_ll_descr.gcrootmap)
         if not self._debug:
             # if self._debug is already set it means that someone called
             # set_debug by hand before initializing the assembler. Leave it
@@ -348,12 +347,19 @@
         if after:
             after()
 
+    @staticmethod
+    def _no_op():
+        pass
+
     _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
     _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                   lltype.Void))
 
     def _build_release_gil(self, gcrootmap):
-        if gcrootmap.is_shadow_stack:
+        if gcrootmap is None:
+            releasegil_func = llhelper(self._NOARG_FUNC, self._no_op)
+            reacqgil_func = llhelper(self._NOARG_FUNC, self._no_op)
+        elif gcrootmap.is_shadow_stack:
             releasegil_func = llhelper(self._NOARG_FUNC,
                                        self._release_gil_shadowstack)
             reacqgil_func = llhelper(self._NOARG_FUNC,
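
The first hunks make _build_release_gil() unconditional: even without a
gcrootmap the assembler now receives valid releasegil/reacqgil helpers,
they simply do nothing.  A minimal plain-Python analogue of the pattern
(the function names here are illustrative, not from the commit):

    def _no_op():
        pass

    def pick_gil_helpers(gcrootmap, shadowstack_helpers, asmgcc_helpers):
        # always return a (release, reacquire) pair, so that call sites
        # never need to branch on whether a gcrootmap exists at all
        if gcrootmap is None:
            return _no_op, _no_op
        if gcrootmap.is_shadow_stack:
            return shadowstack_helpers
        return asmgcc_helpers
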
diff --git a/rpython/jit/backend/x86/arch.py b/rpython/jit/backend/x86/arch.py
--- a/rpython/jit/backend/x86/arch.py
+++ b/rpython/jit/backend/x86/arch.py
@@ -40,4 +40,4 @@
     PASS_ON_MY_FRAME = 12
     JITFRAME_FIXED_SIZE = 28 # 13 GPR + 15 XMM
 
-assert PASS_ON_MY_FRAME >= 11       # asmgcc needs at least JIT_USE_WORDS + 2
+assert PASS_ON_MY_FRAME >= 12       # asmgcc needs at least JIT_USE_WORDS + 3
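
arch.py raises the minimum PASS_ON_MY_FRAME from 11 to 12 because the
asmgcc release-gil path now needs one more word of headroom below the
'css' structure built in callbuilder.py.  A rough check of the new
budget, assuming JIT_USE_WORDS is the 9 words implied by the two
comments ('JIT_USE_WORDS + 2' at 11 versus '+ 3' at 12):

    PASS_ON_MY_FRAME = 12
    JIT_USE_WORDS = 9      # assumption inferred from the comments above
    stack_max = PASS_ON_MY_FRAME - JIT_USE_WORDS
    assert stack_max >= 3  # required by select_call_release_gil_mode() below
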
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -1265,7 +1265,7 @@
     # ----------
 
     def genop_call_malloc_gc(self, op, arglocs, result_loc):
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self.propagate_memoryerror_if_eax_is_null()
 
     def propagate_memoryerror_if_eax_is_null(self):
@@ -1812,9 +1812,9 @@
         self.pending_guard_tokens.append(guard_token)
 
     def genop_call(self, op, arglocs, resloc):
-        return self._genop_call(op, arglocs, resloc)
+        self._genop_call(op, arglocs, resloc)
 
-    def _genop_call(self, op, arglocs, resloc):
+    def _genop_call(self, op, arglocs, resloc, is_call_release_gil=False):
         from rpython.jit.backend.llsupport.descr import CallDescr
 
         cb = callbuilder.CallBuilder(self, arglocs[2], arglocs[3:], resloc)
@@ -1831,7 +1831,10 @@
         assert isinstance(signloc, ImmedLoc)
         cb.ressign = signloc.value
 
-        cb.emit()
+        if is_call_release_gil:
+            cb.emit_call_release_gil()
+        else:
+            cb.emit()
 
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
@@ -1847,64 +1850,15 @@
     def genop_guard_call_may_force(self, op, guard_op, guard_token,
                                    arglocs, result_loc):
         self._store_force_index(guard_op)
-        self.genop_call(op, arglocs, result_loc)
+        self._genop_call(op, arglocs, result_loc)
         self._emit_guard_not_forced(guard_token)
 
     def genop_guard_call_release_gil(self, op, guard_op, guard_token,
                                      arglocs, result_loc):
         self._store_force_index(guard_op)
-        # first, close the stack in the sense of the asmgcc GC root tracker
-        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
-        if gcrootmap:
-            # we put the gcmap now into the frame before releasing the GIL,
-            # and pop it below after reacquiring the GIL.  The assumption
-            # is that this gcmap describes correctly the situation at any
-            # point in-between: all values containing GC pointers should
-            # be safely saved out of registers by now, and will not be
-            # manipulated by any of the following CALLs.
-            gcmap = self._regalloc.get_gcmap(noregs=True)
-            self.push_gcmap(self.mc, gcmap, store=True)
-            self.call_release_gil(gcrootmap, arglocs)
-        # do the call
         self._genop_call(op, arglocs, result_loc, is_call_release_gil=True)
-        # then reopen the stack
-        if gcrootmap:
-            self.call_reacquire_gil(gcrootmap, result_loc)
-            self.pop_gcmap(self.mc)     # remove the gcmap saved above
-        # finally, the guard_not_forced
         self._emit_guard_not_forced(guard_token)
 
-    def call_release_gil(self, gcrootmap, save_registers):
-        if gcrootmap.is_shadow_stack:
-            args = []
-        else:
-            from rpython.memory.gctransform import asmgcroot
-            # build a 'css' structure on the stack: 2 words for the linkage,
-            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
-            # total size of JIT_USE_WORDS.  This structure is found at
-            # [ESP+css].
-            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
-            assert css >= 2
-            # Save ebp
-            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
-            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
-            # Save the "return address": we pretend that it's css
-            if IS_X86_32:
-                reg = eax
-            elif IS_X86_64:
-                reg = edi
-            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
-            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
-            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
-            # Set up jf_extra_stack_depth to pretend that the return address
-            # was at css, and so our stack frame is supposedly shorter by
-            # (css+WORD) bytes
-            self.set_extra_stack_depth(self.mc, -css-WORD)
-            # Call the closestack() function (also releasing the GIL)
-            args = [reg]
-        #
-        self._emit_call(imm(self.releasegil_addr), args, can_collect=False)
-
     def call_reacquire_gil(self, gcrootmap, save_loc):
         # save the previous result (eax/xmm0) into the stack temporarily.
         # XXX like with call_release_gil(), we assume that we don't need
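
With the css bookkeeping gone from assembler.py, the two guarded call
opcodes differ only in the flag they pass to the shared _genop_call();
everything that call_release_gil() used to do by hand now lives in the
CallBuilder methods below.  A condensed view of the resulting control
flow (a sketch merging the two hunks above, not code from the commit):

    def genop_guard_call(self, op, guard_op, guard_token, arglocs,
                         result_loc, release_gil=False):
        self._store_force_index(guard_op)
        self._genop_call(op, arglocs, result_loc,
                         is_call_release_gil=release_gil)
        self._emit_guard_not_forced(guard_token)
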
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -6,7 +6,7 @@
 from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
     xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
     r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
-    RegLoc, RawEspLoc, FrameLoc)
+    RegLoc, RawEspLoc, imm)
 from rpython.jit.backend.x86.jump import remap_frame_layout
 
 
@@ -38,6 +38,9 @@
     # is it for the main CALL of a call_release_gil?
     is_call_release_gil = False
 
+    # set by save_result_value()
+    tmpresloc = None
+
 
     def __init__(self, assembler, fnloc, arglocs, resloc=eax):
         self.asm = assembler
@@ -52,39 +55,67 @@
         self.prepare_arguments()
         self.push_gcmap()
         self.emit_raw_call()
+        self.restore_esp()
         self.pop_gcmap()
         self.load_result()
+
+    def emit_call_release_gil(self):
+        """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
+        and reacqgil_addr."""
+        self.select_call_release_gil_mode()
+        self.prepare_arguments()
+        self.push_gcmap_for_call_release_gil()
+        self.call_releasegil_addr_and_move_real_arguments()
+        self.emit_raw_call()
         self.restore_esp()
+        self.move_real_result_and_call_reacqgil_addr()
+        self.pop_gcmap()
+        self.load_result()
+
+    def select_call_release_gil_mode(self):
+        """Overridden in CallBuilder64"""
+        self.is_call_release_gil = True
+        if self.asm._is_asmgcc():
+            from rpython.memory.gctransform import asmgcroot
+            self.stack_max = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS
+            assert self.stack_max >= 3
 
     def emit_raw_call(self):
         self.mc.CALL(self.fnloc)
         if self.callconv != FFI_DEFAULT_ABI:
             self.current_esp += self._fix_stdcall(self.callconv)
 
-    def restore_esp(self):
-        if self.current_esp != 0:
-            self.mc.SUB_ri(esp.value, self.current_esp)
-            self.current_esp = 0
+    def subtract_esp_aligned(self, count):
+        align = align_stack_words(count)
+        self.current_esp -= align * WORD
+        self.mc.SUB_ri(esp.value, align * WORD)
+
+    def restore_esp(self, target_esp=0):
+        if self.current_esp != target_esp:
+            self.mc.SUB_ri(esp.value, self.current_esp - target_esp)
+            self.current_esp = target_esp
 
     def load_result(self):
         """Overridden in CallBuilder32 and CallBuilder64"""
         if self.ressize == 0:
             return      # void result
         # use the code in load_from_mem to do the zero- or sign-extension
-        if self.restype == FLOAT:
-            srcloc = xmm0
-        elif self.ressize == 1:
-            srcloc = eax.lowest8bits()
-        else:
-            srcloc = eax
+        srcloc = self.tmpresloc
+        if srcloc is None:
+            if self.restype == FLOAT:
+                srcloc = xmm0
+            elif self.ressize == 1:
+                srcloc = eax.lowest8bits()
+            else:
+                srcloc = eax
         if self.ressize >= WORD and self.resloc is srcloc:
-            return      # no need for any move
+            return      # no need for any MOV
         self.asm.load_from_mem(self.resloc, srcloc,
                                imm(self.ressize), imm(self.ressign))
 
     def push_gcmap(self):
         # we push *now* the gcmap, describing the status of GC registers
-        # after the rearrangements done just above, ignoring the return
+        # after the rearrangements done just before, ignoring the return
         # value eax, if necessary
         assert not self.is_call_release_gil
         self.change_extra_stack_depth = (self.current_esp != 0)
@@ -94,13 +125,112 @@
         gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
         self.asm.push_gcmap(self.mc, gcmap, store=True)
 
+    def push_gcmap_for_call_release_gil(self):
+        assert self.is_call_release_gil
+        # we put the gcmap now into the frame before releasing the GIL,
+        # and pop it after reacquiring the GIL.  The assumption
+        # is that this gcmap describes correctly the situation at any
+        # point in-between: all values containing GC pointers should
+        # be safely saved out of registers by now, and will not be
+        # manipulated by any of the following CALLs.
+        gcmap = self.asm._regalloc.get_gcmap(noregs=True)
+        self.asm.push_gcmap(self.mc, gcmap, store=True)
+
     def pop_gcmap(self):
-        assert not self.is_call_release_gil
         self.asm._reload_frame_if_necessary(self.mc)
         if self.change_extra_stack_depth:
             self.asm.set_extra_stack_depth(self.mc, 0)
         self.asm.pop_gcmap(self.mc)
 
+    def call_releasegil_addr_and_move_real_arguments(self):
+        if IS_X86_32 and self.asm._is_asmgcc():
+            needs_extra_esp = 1      # only for asmgcc on x86_32
+        else:
+            needs_extra_esp = 0
+        initial_esp = self.current_esp
+        self.save_register_arguments(needs_extra_esp)
+        #
+        if not self.asm._is_asmgcc():
+            # the helper takes no argument
+            self.change_extra_stack_depth = False
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            # build a 'css' structure on the stack: 2 words for the linkage,
+            # and 5/7 words as described for asmgcroot.ASM_FRAMEDATA, for a
+            # total size of JIT_USE_WORDS.  This structure is found at
+            # [ESP+css].
+            css = WORD * (self.current_esp +
+                          PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
+            assert css >= 2
+            # Save ebp
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_sr(index_of_ebp, ebp.value)  # MOV [css.ebp], EBP
+            # Save the "return address": we pretend that it's css
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)           # LEA reg, [css]
+            frame_ptr = css + WORD * (2+asmgcroot.FRAME_PTR)
+            self.mc.MOV_sr(frame_ptr, reg.value)     # MOV [css.frame], reg
+            # Set up jf_extra_stack_depth to pretend that the return address
+            # was at css, and so our stack frame is supposedly shorter by
+            # (PASS_ON_MY_FRAME-JIT_USE_WORDS+1) words
+            delta = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS + 1
+            self.change_extra_stack_depth = True
+            self.set_extra_stack_depth(self.mc, -delta * WORD)
+            # Call the closestack() function (also releasing the GIL)
+            # with 'reg' as argument
+            if IS_X86_32:
+                self.mc.MOV_sr(0, reg.value)
+            #else:
+            #   on x86_64, reg is edi so that it is already correct
+        #
+        self.mc.CALL(imm(self.asm.releasegil_addr))
+        #
+        self.restore_register_arguments()
+        self.restore_esp(initial_esp)
+
+    def save_register_arguments(self, needs_extra_esp):
+        """Overridden in CallBuilder64"""
+        if needs_extra_esp:
+            self.subtract_esp_aligned(needs_extra_esp)
+
+    def restore_register_arguments(self):
+        """Overridden in CallBuilder64"""
+
+    def move_real_result_and_call_reacqgil_addr(self):
+        # save the result we just got (in eax/eax+edx/st(0)/xmm0)
+        self.save_result_value()
+        # call the reopenstack() function (also reacquiring the GIL)
+        if not self.asm._is_asmgcc():
+            css = 0     # the helper takes no argument
+        else:
+            from rpython.memory.gctransform import asmgcroot
+            css = WORD * (PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS)
+            if IS_X86_32:
+                reg = eax
+            elif IS_X86_64:
+                reg = edi
+            self.mc.LEA_rs(reg.value, css)
+            if IS_X86_32:
+                self.mc.MOV_sr(0, reg.value)
+        #
+        self.mc.CALL(imm(self.asm.reacqgil_addr))
+        #
+        # Now that we have reacquired the GIL, we can reload a possibly modified ebp
+        if self.asm._is_asmgcc():
+            # special-case: reload ebp from the css
+            from rpython.memory.gctransform import asmgcroot
+            index_of_ebp = css + WORD * (2+asmgcroot.INDEX_OF_EBP)
+            self.mc.MOV_rs(ebp.value, index_of_ebp)  # MOV EBP, [css.ebp]
+        #else:
+        #   for shadowstack, done for us by _reload_frame_if_necessary()
+
+    def save_result_value(self):
+        """Overridden in CallBuilder32 and CallBuilder64"""
+        raise NotImplementedError
+
 
 class CallBuilder32(AbstractCallBuilder):
 
@@ -112,9 +242,7 @@
             loc = arglocs[i]
             stack_depth += loc.get_width() // WORD
         if stack_depth > self.stack_max:
-            align = align_stack_words(stack_depth - self.stack_max)
-            self.current_esp -= align * WORD
-            self.mc.SUB_ri(esp.value, align * WORD)
+            self.subtract_esp_aligned(stack_depth - self.stack_max)
         #
         p = 0
         for i in range(n):
@@ -149,33 +277,109 @@
         return self.total_stack_used_by_arguments
 
     def load_result(self):
+        if self.ressize == 0:
+            return      # void result
         resloc = self.resloc
-        if isinstance(resloc, FrameLoc) and resloc.type == FLOAT:
+        if resloc.is_float():
             # a float or a long long return
-            if self.restype == 'L':
-                self.mc.MOV_br(resloc.value, eax.value)      # long long
-                self.mc.MOV_br(resloc.value + 4, edx.value)
-                # XXX should ideally not move the result on the stack,
-                #     but it's a mess to load eax/edx into a xmm register
-                #     and this way is simpler also because the result loc
-                #     can just be always a stack location
+            if self.tmpresloc is None:
+                if self.restype == 'L':     # long long
+                    # move eax/edx -> xmm0
+                    self.mc.MOVD_xr(resloc.value^1, edx.value)
+                    self.mc.MOVD_xr(resloc.value,   eax.value)
+                    self.mc.PUNPCKLDQ_xx(resloc.value, resloc.value^1)
+                else:
+                    # float: we have to go via the stack
+                    self.mc.FSTPL_s(0)
+                    self.mc.MOVSD_xs(resloc.value, 0)
             else:
-                self.mc.FSTPL_b(resloc.value)   # float return
+                self.mc.MOVSD(resloc, self.tmpresloc)
+            #
         elif self.restype == 'S':
             # singlefloat return: must convert ST(0) to a 32-bit singlefloat
             # and load it into self.resloc.  mess mess mess
-            self.mc.SUB_ri(esp.value, 4)
-            self.mc.FSTPS_s(0)
-            self.mc.POP(self.resloc)
+            if self.tmpresloc is None:
+                self.mc.FSTPS_s(0)
+                self.mc.MOV_rs(resloc.value, 0)
+            else:
+                self.mc.MOV(resloc, self.tmpresloc)
         else:
             AbstractCallBuilder.load_result(self)
 
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP+4].  We use "+4"
+        # in order to leave the word at [ESP+0] free, in case it's needed
+        if self.ressize == 0:      # void return
+            return
+        if self.resloc.is_float():
+            # a float or a long long return
+            self.tmpresloc = RawEspLoc(4, FLOAT)
+            if self.restype == 'L':
+                self.mc.MOV_sr(4, eax.value)      # long long
+                self.mc.MOV_sr(8, edx.value)
+            else:
+                self.mc.FSTPL_s(4)                # float return
+        else:
+            self.tmpresloc = RawEspLoc(4, INT)
+            if self.restype == 'S':
+                self.mc.FSTPS_s(4)
+            else:
+                assert self.restype == INT
+                assert self.ressize <= WORD
+                self.mc.MOV_sr(4, eax.value)
+
 
 class CallBuilder64(AbstractCallBuilder):
 
-    # In reverse order for use with pop()
-    unused_gpr = [r9, r8, ecx, edx, esi, edi]
-    unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
+    ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
+    ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
+
+    next_arg_gpr = 0
+    next_arg_xmm = 0
+
+    def _unused_gpr(self):
+        i = self.next_arg_gpr
+        self.next_arg_gpr = i + 1
+        try:
+            return self.ARGUMENTS_GPR[i]
+        except IndexError:
+            return None
+
+    def _unused_xmm(self):
+        i = self.next_arg_xmm
+        self.next_arg_xmm = i + 1
+        try:
+            return self.ARGUMENTS_XMM[i]
+        except IndexError:
+            return None
+
+    def _permute_to_prefer_unused_registers(self, lst):
+        N = len(lst)
+        for i in range(N - 1):
+            reg = lst[i]
+            if reg in self.already_used:
+                for j in range(i, N - 1):        # move reg to the end
+                    lst[j] = lst[j + 1]
+                lst[N - 1] = reg
+
+    def select_call_release_gil_mode(self):
+        AbstractCallBuilder.select_call_release_gil_mode(self)
+        # We have to copy the arguments around a bit more in this mode,
+        # but on the other hand we don't need prepare_arguments() moving
+        # them in precisely the final registers.  Here we look around for
+        # unused registers that may be more likely usable.
+        from rpython.jit.backend.x86.regalloc import X86_64_RegisterManager
+        self.already_used = {}
+        for loc in self.arglocs:
+            self.already_used[loc] = None
+        #
+        lst = X86_64_RegisterManager.save_around_call_regs[:]
+        self._permute_to_prefer_unused_registers(lst)
+        self.ARGUMENTS_GPR = lst[:len(self.ARGUMENTS_GPR)]
+        #
+        lst = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
+        self._permute_to_prefer_unused_registers(lst)
+        self.ARGUMENTS_XMM = lst
 
     def prepare_arguments(self):
         src_locs = []
@@ -186,35 +390,33 @@
 
         arglocs = self.arglocs
         argtypes = self.argtypes
-        unused_gpr = self.unused_gpr[:]
-        unused_xmm = self.unused_xmm[:]
 
         on_stack = 0
         for i in range(len(arglocs)):
             loc = arglocs[i]
             if loc.is_float():
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, FLOAT)
+                    on_stack += 1
                 xmm_src_locs.append(loc)
-                if len(unused_xmm) > 0:
-                    xmm_dst_locs.append(unused_xmm.pop())
-                else:
-                    xmm_dst_locs.append(RawEspLoc(on_stack * WORD, FLOAT))
-                    on_stack += 1
+                xmm_dst_locs.append(tgt)
             elif argtypes is not None and argtypes[i] == 'S':
                 # Singlefloat argument
                 if singlefloats is None:
                     singlefloats = []
-                if len(unused_xmm) > 0:
-                    singlefloats.append((loc, unused_xmm.pop()))
-                else:
-                    singlefloats.append((loc, RawEspLoc(on_stack * WORD, INT)))
+                tgt = self._unused_xmm()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
                     on_stack += 1
+                singlefloats.append((loc, tgt))
             else:
+                tgt = self._unused_gpr()
+                if tgt is None:
+                    tgt = RawEspLoc(on_stack * WORD, INT)
+                    on_stack += 1
                 src_locs.append(loc)
-                if len(unused_gpr) > 0:
-                    dst_locs.append(unused_gpr.pop())
-                else:
-                    dst_locs.append(RawEspLoc(on_stack * WORD, INT))
-                    on_stack += 1
+                dst_locs.append(tgt)
 
         if not we_are_translated():  # assert that we got the right stack depth
             floats = 0
@@ -223,15 +425,12 @@
                 if arg.is_float() or argtypes and argtypes[i] == 'S':
                     floats += 1
             all_args = len(arglocs)
-            stack_depth = (max(all_args - floats - 6, 0) +
-                           max(floats - 8, 0))
+            stack_depth = (max(all_args - floats - len(self.ARGUMENTS_GPR), 0)
+                           + max(floats - len(self.ARGUMENTS_XMM), 0))
             assert stack_depth == on_stack
 
-        align = 0
         if on_stack > self.stack_max:
-            align = align_stack_words(on_stack - self.stack_max)
-            self.current_esp -= align * WORD
-            self.mc.SUB_ri(esp.value, align * WORD)
+            self.subtract_esp_aligned(on_stack - self.stack_max)
 
         # Handle register arguments: first remap the xmm arguments
         remap_frame_layout(self.asm, xmm_src_locs, xmm_dst_locs,
@@ -265,15 +464,38 @@
         assert 0     # should not occur on 64-bit
 
     def load_result(self):
-        if self.restype == 'S':
+        if self.restype == 'S' and self.tmpresloc is None:
             # singlefloat return: use MOVD to load the target register
-            # with the lower 32 bits of XMM0
-            resloc = self.resloc
-            assert isinstance(resloc, RegLoc)
-            self.mc.MOVD_rx(resloc.value, xmm0.value)
+            # from the lower 32 bits of XMM0
+            self.mc.MOVD(self.resloc, xmm0)
         else:
             AbstractCallBuilder.load_result(self)
 
+    def save_result_value(self):
+        # Temporarily save the result value into [ESP].
+        if self.ressize == 0:      # void return
+            return
+        #
+        if self.restype == 'S':
+            # singlefloat return: use MOVD to store the lower 32 bits
+            # of XMM0 into [ESP]
+            self.mc.MOVD_sx(0, xmm0.value)
+            type = INT
+        elif self.restype == FLOAT:
+            self.mc.MOVSD_sx(0, xmm0.value)
+            type = FLOAT
+        else:
+            assert self.restype == INT
+            self.mc.MOV_sr(0, eax.value)
+            type = INT
+        self.tmpresloc = RawEspLoc(0, type)
+
+    def save_register_arguments(self, needs_extra_esp):
+        xxx
+
+    def restore_register_arguments(self):
+        xxx
+
 
 if IS_X86_32:
     CallBuilder = CallBuilder32
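
CallBuilder64.save_register_arguments() and restore_register_arguments()
are still 'xxx' stubs in this in-progress changeset.  On x86-64 the
argument registers handed out by _unused_gpr()/_unused_xmm() are
caller-saved, so their contents must survive the CALL to releasegil_addr
emitted between prepare_arguments() and the real call.  One possible
shape, sketched here as an assumption rather than the committed design
(it reuses subtract_esp_aligned() and the MOV_sr/MOV_rs/MOVSD_sx/MOVSD_xs
encodings already used above; 'saved_arg_regs' is a hypothetical
attribute):

    def save_register_arguments(self, needs_extra_esp):
        # spill the argument registers used so far into freshly reserved
        # stack words, plus any extra words requested by the caller
        gprs = self.ARGUMENTS_GPR[:self.next_arg_gpr]
        xmms = self.ARGUMENTS_XMM[:self.next_arg_xmm]
        self.subtract_esp_aligned(len(gprs) + len(xmms) + needs_extra_esp)
        for i in range(len(gprs)):
            self.mc.MOV_sr(i * WORD, gprs[i].value)
        for j in range(len(xmms)):
            self.mc.MOVSD_sx((len(gprs) + j) * WORD, xmms[j].value)
        self.saved_arg_regs = (gprs, xmms)

    def restore_register_arguments(self):
        # reload them at the same offsets, before restore_esp() runs
        gprs, xmms = self.saved_arg_regs
        for i in range(len(gprs)):
            self.mc.MOV_rs(gprs[i].value, i * WORD)
        for j in range(len(xmms)):
            self.mc.MOVSD_xs(xmms[j].value, (len(gprs) + j) * WORD)
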
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -79,26 +79,14 @@
         rffi.cast(rffi.CArrayPtr(longlong.FLOATSTORAGE), adr)[1] = y
         return ConstFloatLoc(adr)
 
-    def after_call(self, v):
-        # the result is stored in st0, but we don't have this around,
-        # so genop_call will move it to some frame location immediately
-        # after the call
-        return self.frame_manager.loc(v)
+    def call_result_location(self, v):
+        return xmm0
 
 class X86_64_XMMRegisterManager(X86XMMRegisterManager):
     # xmm15 reserved for scratch use
     all_regs = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14]
     save_around_call_regs = all_regs
 
-    def call_result_location(self, v):
-        return xmm0
-
-    def after_call(self, v):
-        # We use RegisterManager's implementation, since X86XMMRegisterManager
-        # places the result on the stack, which we don't need to do when the
-        # calling convention places the result in xmm0
-        return RegisterManager.after_call(self, v)
-
 class X86FrameManager(FrameManager):
     def __init__(self, base_ofs):
         FrameManager.__init__(self)
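
In regalloc.py the x86-32 XMM manager no longer needs a special
after_call(): CallBuilder32.load_result() above now moves the x87 st(0)
return value into an XMM register by itself, so the allocator can treat
xmm0 as the call result location on 32-bit exactly as on 64-bit.  The
fragment it emits for a float result when no temporary was saved
(offsets are relative to the current ESP):

    self.mc.FSTPL_s(0)                 # pop st(0) into the scratch qword
    self.mc.MOVSD_xs(resloc.value, 0)  # reload it into the result XMM reg
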
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -583,6 +583,7 @@
 
     # x87 instructions
     FSTPL_b = insn('\xDD', orbyte(3<<3), stack_bp(1)) # rffi.DOUBLE ('as' wants L??)
+    FSTPL_s = insn('\xDD', orbyte(3<<3), stack_sp(1)) # rffi.DOUBLE ('as' wants L??)
     FSTPS_s = insn('\xD9', orbyte(3<<3), stack_sp(1)) # lltype.SingleFloat
 
     # ------------------------------ Random mess -----------------------
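
The only encoder change required is the new FSTPL_s in rx86.py: an FSTP
of st(0) to a 64-bit double at an ESP-relative address, where the
existing FSTPL_b only covered EBP-relative frame slots.  In assembler
syntax, FSTPL_s(4) corresponds to fstpl 4(%esp), which is exactly what
CallBuilder32.save_result_value() above emits for a float return:

    self.mc.FSTPL_s(4)                 # float return: st(0) -> qword [ESP+4]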

