[pypy-commit] pypy default: merge emit-call-arm
bivab
noreply at buildbot.pypy.org
Mon May 27 14:52:32 CEST 2013
Author: David Schneider <david.schneider at picle.org>
Branch:
Changeset: r64580:9e31743395b4
Date: 2013-05-27 07:43 -0500
http://bitbucket.org/pypy/pypy/changeset/9e31743395b4/
Log: merge emit-call-arm
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -19,7 +19,7 @@
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.jit.backend.model import CompiledLoopToken
from rpython.jit.codewriter.effectinfo import EffectInfo
-from rpython.jit.metainterp.history import AbstractFailDescr, FLOAT
+from rpython.jit.metainterp.history import AbstractFailDescr, FLOAT, INT, VOID
from rpython.jit.metainterp.resoperation import rop
from rpython.rlib.debug import debug_print, debug_start, debug_stop
from rpython.rlib.jit import AsmInfo
@@ -27,6 +27,7 @@
from rpython.rlib.rarithmetic import r_uint
from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.jit.backend.arm import callbuilder
class AssemblerARM(ResOpAssembler):
@@ -934,23 +935,6 @@
asm_math_operations[oopspecindex](self, op, arglocs, regalloc, fcond)
return fcond
- def _ensure_result_bit_extension(self, resloc, size, signed):
- if size == 4:
- return
- if size == 1:
- if not signed: # unsigned char
- self.mc.AND_ri(resloc.value, resloc.value, 0xFF)
- else:
- self.mc.LSL_ri(resloc.value, resloc.value, 24)
- self.mc.ASR_ri(resloc.value, resloc.value, 24)
- elif size == 2:
- if not signed:
- self.mc.LSL_ri(resloc.value, resloc.value, 16)
- self.mc.LSR_ri(resloc.value, resloc.value, 16)
- else:
- self.mc.LSL_ri(resloc.value, resloc.value, 16)
- self.mc.ASR_ri(resloc.value, resloc.value, 16)
-
def patch_trace(self, faildescr, looptoken, bridge_addr, regalloc):
b = InstrBuilder(self.cpu.cpuinfo.arch_version)
patch_addr = faildescr._arm_failure_recovery_block
@@ -1012,20 +996,32 @@
mc.gen_load_int(helper.value, ofs, cond=cond)
mc.STR_rr(source.value, base.value, helper.value, cond=cond)
+ def get_tmp_reg(self, forbidden_regs=None):
+ if forbidden_regs is None:
+ return r.ip, False
+ for x in [r.ip, r.lr]:
+ if x not in forbidden_regs:
+ return x, False
+ # pick some reg that we need to save
+ for x in r.all_regs:
+ if x not in forbidden_regs:
+ return x, True
+ assert 0
+
def _mov_imm_to_loc(self, prev_loc, loc, cond=c.AL):
- if not loc.is_reg() and not (loc.is_stack() and loc.type != FLOAT):
+ if loc.type == FLOAT:
raise AssertionError("invalid target for move from imm value")
if loc.is_reg():
new_loc = loc
- elif loc.is_stack():
- self.mc.PUSH([r.lr.value], cond=cond)
+ elif loc.is_stack() or loc.is_raw_sp():
new_loc = r.lr
else:
raise AssertionError("invalid target for move from imm value")
self.mc.gen_load_int(new_loc.value, prev_loc.value, cond=cond)
if loc.is_stack():
self.regalloc_mov(new_loc, loc)
- self.mc.POP([r.lr.value], cond=cond)
+ elif loc.is_raw_sp():
+ self.store_reg(self.mc, new_loc, r.sp, loc.value, cond=cond, helper=r.ip)
def _mov_reg_to_loc(self, prev_loc, loc, cond=c.AL):
if loc.is_imm():
@@ -1034,60 +1030,77 @@
self.mc.MOV_rr(loc.value, prev_loc.value, cond=cond)
elif loc.is_stack() and loc.type != FLOAT:
# spill a core register
- if prev_loc is r.ip:
- temp = r.lr
- else:
- temp = r.ip
+ temp, save = self.get_tmp_reg([prev_loc, loc])
offset = loc.value
is_imm = check_imm_arg(offset, size=0xFFF)
- if not is_imm:
+ if not is_imm and save:
self.mc.PUSH([temp.value], cond=cond)
self.store_reg(self.mc, prev_loc, r.fp, offset, helper=temp, cond=cond)
- if not is_imm:
+ if not is_imm and save:
self.mc.POP([temp.value], cond=cond)
+ elif loc.is_raw_sp() and loc.type != FLOAT:
+ temp, save = self.get_tmp_reg([prev_loc])
+ assert not save
+ self.store_reg(self.mc, prev_loc, r.sp, loc.value, cond=cond, helper=temp)
else:
assert 0, 'unsupported case'
def _mov_stack_to_loc(self, prev_loc, loc, cond=c.AL):
- # disabled for now, has side effects in combination with remap_frame_layout when called from a jump
- helper = None # self._regalloc.get_free_reg()
+ helper = None
+ offset = prev_loc.value
+ tmp = None
if loc.is_reg():
assert prev_loc.type != FLOAT, 'trying to load from an \
incompatible location into a core register'
- assert loc is not r.lr, 'lr is not supported as a target \
- when moving from the stack'
# unspill a core register
- offset = prev_loc.value
is_imm = check_imm_arg(offset, size=0xFFF)
- helper = r.lr if helper is None else helper
- save_helper = not is_imm and helper is r.lr
+ helper, save = self.get_tmp_reg([loc])
+ save_helper = not is_imm and save
elif loc.is_vfp_reg():
assert prev_loc.type == FLOAT, 'trying to load from an \
incompatible location into a float register'
# load spilled value into vfp reg
- offset = prev_loc.value
is_imm = check_imm_arg(offset)
- helper = r.ip if helper is None else helper
- save_helper = not is_imm and helper is r.ip
+ helper, save = self.get_tmp_reg()
+ save_helper = not is_imm and save
+ elif loc.is_raw_sp():
+ assert (loc.type == prev_loc.type == FLOAT
+ or (loc.type != FLOAT and prev_loc.type != FLOAT))
+ tmp = loc
+ if loc.is_float():
+ loc = r.vfp_ip
+ else:
+ loc, save_helper = self.get_tmp_reg()
+ assert not save_helper
+ helper, save_helper = self.get_tmp_reg([loc])
+ assert not save_helper
else:
assert 0, 'unsupported case'
+
if save_helper:
self.mc.PUSH([helper.value], cond=cond)
self.load_reg(self.mc, loc, r.fp, offset, cond=cond, helper=helper)
if save_helper:
self.mc.POP([helper.value], cond=cond)
+ if tmp and tmp.is_raw_sp():
+ self.store_reg(self.mc, loc, r.sp, tmp.value, cond=cond, helper=helper)
+
def _mov_imm_float_to_loc(self, prev_loc, loc, cond=c.AL):
if loc.is_vfp_reg():
- self.mc.PUSH([r.ip.value], cond=cond)
- self.mc.gen_load_int(r.ip.value, prev_loc.getint(), cond=cond)
- self.load_reg(self.mc, loc, r.ip, 0, cond=cond)
- self.mc.POP([r.ip.value], cond=cond)
- elif loc.is_stack():
- self.regalloc_push(r.vfp_ip)
+ helper, save_helper = self.get_tmp_reg([loc])
+ if save_helper:
+ self.mc.PUSH([helper.value], cond=cond)
+ self.mc.gen_load_int(helper.value, prev_loc.getint(), cond=cond)
+ self.load_reg(self.mc, loc, helper, 0, cond=cond)
+ if save_helper:
+ self.mc.POP([helper.value], cond=cond)
+ elif loc.is_stack() and loc.type == FLOAT:
self.regalloc_mov(prev_loc, r.vfp_ip, cond)
self.regalloc_mov(r.vfp_ip, loc, cond)
- self.regalloc_pop(r.vfp_ip)
+ elif loc.is_raw_sp() and loc.type == FLOAT:
+ self.regalloc_mov(prev_loc, r.vfp_ip, cond)
+ self.regalloc_mov(r.vfp_ip, loc, cond)
else:
assert 0, 'unsupported case'
@@ -1100,11 +1113,11 @@
# spill vfp register
offset = loc.value
is_imm = check_imm_arg(offset)
- if not is_imm:
- self.mc.PUSH([r.ip.value], cond=cond)
- self.store_reg(self.mc, prev_loc, r.fp, offset, cond=cond)
- if not is_imm:
- self.mc.POP([r.ip.value], cond=cond)
+ self.store_reg(self.mc, prev_loc, r.fp, offset, cond=cond, helper=r.ip)
+ elif loc.is_raw_sp():
+ assert loc.type == FLOAT, 'trying to store to an \
+ incompatible location from a float register'
+ self.store_reg(self.mc, prev_loc, r.sp, loc.value, cond=cond)
else:
assert 0, 'unsupported case'
@@ -1120,6 +1133,8 @@
self._mov_imm_float_to_loc(prev_loc, loc, cond)
elif prev_loc.is_vfp_reg():
self._mov_vfp_reg_to_loc(prev_loc, loc, cond)
+ elif prev_loc.is_raw_sp():
+ assert 0, 'raw sp locs are not supported as source loc'
else:
assert 0, 'unsupported case'
mov_loc_loc = regalloc_mov
@@ -1131,23 +1146,29 @@
if vfp_loc.is_vfp_reg():
self.mc.VMOV_rc(reg1.value, reg2.value, vfp_loc.value, cond=cond)
elif vfp_loc.is_imm_float():
- self.mc.PUSH([r.ip.value], cond=cond)
- self.mc.gen_load_int(r.ip.value, vfp_loc.getint(), cond=cond)
+ helper, save_helper = self.get_tmp_reg([reg1, reg2])
+ if save_helper:
+ self.mc.PUSH([helper.value], cond=cond)
+ self.mc.gen_load_int(helper.value, vfp_loc.getint(), cond=cond)
# we need to load one word to loc and one to loc+1 which are
# two 32-bit core registers
- self.mc.LDR_ri(reg1.value, r.ip.value, cond=cond)
- self.mc.LDR_ri(reg2.value, r.ip.value, imm=WORD, cond=cond)
- self.mc.POP([r.ip.value], cond=cond)
+ self.mc.LDR_ri(reg1.value, helper.value, cond=cond)
+ self.mc.LDR_ri(reg2.value, helper.value, imm=WORD, cond=cond)
+ if save_helper:
+ self.mc.POP([helper.value], cond=cond)
elif vfp_loc.is_stack() and vfp_loc.type == FLOAT:
# load spilled vfp value into two core registers
offset = vfp_loc.value
if not check_imm_arg(offset, size=0xFFF):
- self.mc.PUSH([r.ip.value], cond=cond)
- self.mc.gen_load_int(r.ip.value, offset, cond=cond)
- self.mc.LDR_rr(reg1.value, r.fp.value, r.ip.value, cond=cond)
- self.mc.ADD_ri(r.ip.value, r.ip.value, imm=WORD, cond=cond)
- self.mc.LDR_rr(reg2.value, r.fp.value, r.ip.value, cond=cond)
- self.mc.POP([r.ip.value], cond=cond)
+ helper, save_helper = self.get_tmp_reg([reg1, reg2])
+ if save_helper:
+ self.mc.PUSH([helper.value], cond=cond)
+ self.mc.gen_load_int(helper.value, offset, cond=cond)
+ self.mc.LDR_rr(reg1.value, r.fp.value, helper.value, cond=cond)
+ self.mc.ADD_ri(helper.value, helper.value, imm=WORD, cond=cond)
+ self.mc.LDR_rr(reg2.value, r.fp.value, helper.value, cond=cond)
+ if save_helper:
+ self.mc.POP([helper.value], cond=cond)
else:
self.mc.LDR_ri(reg1.value, r.fp.value, imm=offset, cond=cond)
self.mc.LDR_ri(reg2.value, r.fp.value,
@@ -1165,12 +1186,15 @@
# move from two core registers to a float stack location
offset = vfp_loc.value
if not check_imm_arg(offset + WORD, size=0xFFF):
- self.mc.PUSH([r.ip.value], cond=cond)
- self.mc.gen_load_int(r.ip.value, offset, cond=cond)
- self.mc.STR_rr(reg1.value, r.fp.value, r.ip.value, cond=cond)
- self.mc.ADD_ri(r.ip.value, r.ip.value, imm=WORD, cond=cond)
- self.mc.STR_rr(reg2.value, r.fp.value, r.ip.value, cond=cond)
- self.mc.POP([r.ip.value], cond=cond)
+ helper, save_helper = self.get_tmp_reg([reg1, reg2])
+ if save_helper:
+ self.mc.PUSH([helper.value], cond=cond)
+ self.mc.gen_load_int(helper.value, offset, cond=cond)
+ self.mc.STR_rr(reg1.value, r.fp.value, helper.value, cond=cond)
+ self.mc.ADD_ri(helper.value, helper.value, imm=WORD, cond=cond)
+ self.mc.STR_rr(reg2.value, r.fp.value, helper.value, cond=cond)
+ if save_helper:
+ self.mc.POP([helper.value], cond=cond)
else:
self.mc.STR_ri(reg1.value, r.fp.value, imm=offset, cond=cond)
self.mc.STR_ri(reg2.value, r.fp.value,
@@ -1417,6 +1441,26 @@
#
return shiftsize
+ def simple_call(self, fnloc, arglocs, result_loc=r.r0):
+ if result_loc is None:
+ result_type = VOID
+ result_size = 0
+ elif result_loc.is_vfp_reg():
+ result_type = FLOAT
+ result_size = DOUBLE_WORD
+ else:
+ result_type = INT
+ result_size = WORD
+ cb = callbuilder.get_callbuilder(self.cpu, self, fnloc, arglocs,
+ result_loc, result_type,
+ result_size)
+ cb.emit()
+
+ def simple_call_no_collect(self, fnloc, arglocs):
+ cb = callbuilder.get_callbuilder(self.cpu, self, fnloc, arglocs)
+ cb.emit_no_collect()
+
+
def not_implemented(msg):
os.write(2, '[ARM/asm] %s\n' % msg)
raise NotImplementedError(msg)
diff --git a/rpython/jit/backend/arm/callbuilder.py b/rpython/jit/backend/arm/callbuilder.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/arm/callbuilder.py
@@ -0,0 +1,304 @@
+from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.jit.metainterp.history import INT, FLOAT, REF
+from rpython.jit.backend.arm.arch import WORD
+from rpython.jit.backend.arm import registers as r
+from rpython.jit.backend.arm import conditions as c
+from rpython.jit.backend.arm.locations import RawSPStackLocation
+from rpython.jit.backend.arm.jump import remap_frame_layout
+from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder
+from rpython.jit.backend.arm.helper.assembler import count_reg_args
+from rpython.jit.backend.arm.helper.assembler import saved_registers
+from rpython.jit.backend.arm.helper.regalloc import check_imm_arg
+
+
+class ARMCallbuilder(AbstractCallBuilder):
+ def __init__(self, assembler, fnloc, arglocs,
+ resloc=r.r0, restype=INT, ressize=WORD, ressigned=True):
+ AbstractCallBuilder.__init__(self, assembler, fnloc, arglocs,
+ resloc, restype, ressize)
+ self.current_sp = 0
+
+ def push_gcmap(self):
+ assert not self.is_call_release_gil
+ # we push *now* the gcmap, describing the status of GC registers
+ # after the rearrangements done just above, ignoring the return
+ # value eax, if necessary
+ noregs = self.asm.cpu.gc_ll_descr.is_shadow_stack()
+ gcmap = self.asm._regalloc.get_gcmap([r.r0], noregs=noregs)
+ self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+ def pop_gcmap(self):
+ self.asm._reload_frame_if_necessary(self.mc)
+ self.asm.pop_gcmap(self.mc)
+
+ def emit_raw_call(self):
+ #the actual call
+ if self.fnloc.is_imm():
+ self.mc.BL(self.fnloc.value)
+ return
+ if self.fnloc.is_stack():
+ self.asm.mov_loc_loc(self.fnloc, r.ip)
+ self.fnloc = r.ip
+ assert self.fnloc.is_reg()
+ self.mc.BLX(self.fnloc.value)
+
+ def restore_stack_pointer(self):
+ # readjust the sp in case we passed some args on the stack
+ assert self.current_sp % 8 == 0 # sanity check
+ if self.current_sp != 0:
+ self._adjust_sp(self.current_sp)
+ self.current_sp = 0
+
+ def _push_stack_args(self, stack_args, on_stack):
+ assert on_stack % 8 == 0
+ self._adjust_sp(-on_stack)
+ self.current_sp = on_stack
+ ofs = 0
+ for i, arg in enumerate(stack_args):
+ if arg is not None:
+ sp_loc = RawSPStackLocation(ofs, arg.type)
+ self.asm.regalloc_mov(arg, sp_loc)
+ ofs += sp_loc.width
+ else: # alignment word
+ ofs += WORD
+
+ def _adjust_sp(self, n):
+ # adjust the current stack pointer by n bytes
+ if n > 0:
+ if check_imm_arg(n):
+ self.mc.ADD_ri(r.sp.value, r.sp.value, n)
+ else:
+ self.mc.gen_load_int(r.ip.value, n)
+ self.mc.ADD_rr(r.sp.value, r.sp.value, r.ip.value)
+ else:
+ n = abs(n)
+ if check_imm_arg(n):
+ self.mc.SUB_ri(r.sp.value, r.sp.value, n)
+ else:
+ self.mc.gen_load_int(r.ip.value, n)
+ self.mc.SUB_rr(r.sp.value, r.sp.value, r.ip.value)
+
+ def select_call_release_gil_mode(self):
+ AbstractCallBuilder.select_call_release_gil_mode(self)
+
+ def call_releasegil_addr_and_move_real_arguments(self):
+ assert not self.asm._is_asmgcc()
+ from rpython.jit.backend.arm.regalloc import CoreRegisterManager
+ with saved_registers(self.mc,
+ CoreRegisterManager.save_around_call_regs):
+ self.mc.BL(self.asm.releasegil_addr)
+
+ if not we_are_translated(): # for testing: we should not access
+ self.mc.ADD_ri(r.fp.value, r.fp.value, 1) # fp any more
+
+ def move_real_result_and_call_reacqgil_addr(self):
+ # save the result we just got
+ assert not self.asm._is_asmgcc()
+ gpr_to_save, vfp_to_save = self.get_result_locs()
+ with saved_registers(self.mc, gpr_to_save, vfp_to_save):
+ self.mc.BL(self.asm.reacqgil_addr)
+
+ if not we_are_translated(): # for testing: now we can access
+ self.mc.SUB_ri(r.fp.value, r.fp.value, 1) # fp again
+
+ # for shadowstack, done for us by _reload_frame_if_necessary()
+
+ def get_result_locs(self):
+ raise NotImplementedError
+
+ def _ensure_result_bit_extension(self, resloc, size, signed):
+ if size == 4:
+ return
+ if size == 1:
+ if not signed: # unsigned char
+ self.mc.AND_ri(resloc.value, resloc.value, 0xFF)
+ else:
+ self.mc.LSL_ri(resloc.value, resloc.value, 24)
+ self.mc.ASR_ri(resloc.value, resloc.value, 24)
+ elif size == 2:
+ if not signed:
+ self.mc.LSL_ri(resloc.value, resloc.value, 16)
+ self.mc.LSR_ri(resloc.value, resloc.value, 16)
+ else:
+ self.mc.LSL_ri(resloc.value, resloc.value, 16)
+ self.mc.ASR_ri(resloc.value, resloc.value, 16)
+
+
+
+class SoftFloatCallBuilder(ARMCallbuilder):
+
+ def get_result_locs(self):
+ if self.resloc is None:
+ return [], []
+ if self.resloc.is_vfp_reg():
+ return [r.r0, r.r1], []
+ assert self.resloc.is_reg()
+ return [r.r0], []
+
+ def load_result(self):
+ # ensure the result is wellformed and stored in the correct location
+ resloc = self.resloc
+ if resloc is None:
+ return
+ if resloc.is_vfp_reg():
+ # move result to the allocated register
+ self.asm.mov_to_vfp_loc(r.r0, r.r1, resloc)
+ elif resloc.is_reg():
+ # move result to the allocated register
+ if resloc is not r.r0:
+ self.asm.mov_loc_loc(r.r0, resloc)
+ self._ensure_result_bit_extension(resloc,
+ self.ressize, self.ressign)
+
+
+ def _collect_and_push_stack_args(self, arglocs):
+ n_args = len(arglocs)
+ reg_args = count_reg_args(arglocs)
+ # all arguments past the 4th go on the stack
+ # first we need to prepare the list so it stays aligned
+ stack_args = []
+ count = 0
+ on_stack = 0
+ if n_args > reg_args:
+ for i in range(reg_args, n_args):
+ arg = arglocs[i]
+ if arg.type != FLOAT:
+ count += 1
+ on_stack += 1
+ else:
+ on_stack += 2
+ if count % 2 != 0:
+ stack_args.append(None)
+ count = 0
+ on_stack += 1
+ stack_args.append(arg)
+ if count % 2 != 0:
+ on_stack += 1
+ stack_args.append(None)
+ if on_stack > 0:
+ self._push_stack_args(stack_args, on_stack*WORD)
+
+ def prepare_arguments(self):
+ arglocs = self.arglocs
+ reg_args = count_reg_args(arglocs)
+ self._collect_and_push_stack_args(arglocs)
+ # collect variables that need to go in registers and the registers they
+ # will be stored in
+ num = 0
+ count = 0
+ non_float_locs = []
+ non_float_regs = []
+ float_locs = []
+ for i in range(reg_args):
+ arg = arglocs[i]
+ if arg.type == FLOAT and count % 2 != 0:
+ num += 1
+ count = 0
+ reg = r.caller_resp[num]
+
+ if arg.type == FLOAT:
+ float_locs.append((arg, reg))
+ else:
+ non_float_locs.append(arg)
+ non_float_regs.append(reg)
+
+ if arg.type == FLOAT:
+ num += 2
+ else:
+ num += 1
+ count += 1
+ # Check that the address of the function we want to call is not
+ # currently stored in one of the registers used to pass the arguments
+ # or on the stack, which we can not access later
+ # If this happens to be the case we remap the register to r4 and use r4
+ # to call the function
+ if self.fnloc in r.argument_regs or self.fnloc.is_stack():
+ non_float_locs.append(self.fnloc)
+ non_float_regs.append(r.r4)
+ self.fnloc = r.r4
+ # remap values stored in core registers
+ remap_frame_layout(self.asm, non_float_locs, non_float_regs, r.ip)
+
+ for loc, reg in float_locs:
+ self.asm.mov_from_vfp_loc(loc, reg, r.all_regs[reg.value + 1])
+
+class HardFloatCallBuilder(ARMCallbuilder):
+
+ def prepare_arguments(self):
+ non_float_locs = []
+ non_float_regs = []
+ float_locs = []
+ float_regs = []
+ stack_args = []
+
+ arglocs = self.arglocs
+ argtypes = self.argtypes
+
+ count = 0 # stack alignment counter
+ on_stack = 0
+ for arg in arglocs:
+ if arg.type != FLOAT:
+ if len(non_float_regs) < len(r.argument_regs):
+ reg = r.argument_regs[len(non_float_regs)]
+ non_float_locs.append(arg)
+ non_float_regs.append(reg)
+ else: # non-float argument that needs to go on the stack
+ count += 1
+ on_stack += 1
+ stack_args.append(arg)
+ else:
+ if len(float_regs) < len(r.vfp_argument_regs):
+ reg = r.vfp_argument_regs[len(float_regs)]
+ float_locs.append(arg)
+ float_regs.append(reg)
+ else: # float argument that needs to go on the stack
+ if count % 2 != 0:
+ stack_args.append(None)
+ count = 0
+ on_stack += 1
+ stack_args.append(arg)
+ on_stack += 2
+ # align the stack
+ if count % 2 != 0:
+ stack_args.append(None)
+ on_stack += 1
+ self._push_stack_args(stack_args, on_stack*WORD)
+ # Check that the address of the function we want to call is not
+ # currently stored in one of the registers used to pass the arguments
+ # or on the stack, which we can not access later
+ # If this happens to be the case we remap the register to r4 and use r4
+ # to call the function
+ if self.fnloc in non_float_regs or self.fnloc.is_stack():
+ non_float_locs.append(self.fnloc)
+ non_float_regs.append(r.r4)
+ self.fnloc = r.r4
+ # remap values stored in core registers
+ remap_frame_layout(self.asm, non_float_locs, non_float_regs, r.ip)
+ # remap values stored in vfp registers
+ remap_frame_layout(self.asm, float_locs, float_regs, r.vfp_ip)
+
+ def load_result(self):
+ resloc = self.resloc
+ # ensure the result is wellformed and stored in the correct location
+ if resloc is not None and resloc.is_reg():
+ self._ensure_result_bit_extension(resloc,
+ self.ressize, self.ressign)
+
+ def get_result_locs(self):
+ if self.resloc is None:
+ return [], []
+ if self.resloc.is_vfp_reg():
+ return [], [r.d0]
+ assert self.resloc.is_reg()
+ return [r.r0], []
+
+
+def get_callbuilder(cpu, assembler, fnloc, arglocs,
+ resloc=r.r0, restype=INT, ressize=WORD, ressigned=True):
+ if cpu.cpuinfo.hf_abi:
+ return HardFloatCallBuilder(assembler, fnloc, arglocs, resloc,
+ restype, ressize, ressigned)
+ else:
+ return SoftFloatCallBuilder(assembler, fnloc, arglocs, resloc,
+ restype, ressize, ressigned)
diff --git a/rpython/jit/backend/arm/locations.py b/rpython/jit/backend/arm/locations.py
--- a/rpython/jit/backend/arm/locations.py
+++ b/rpython/jit/backend/arm/locations.py
@@ -12,6 +12,9 @@
def is_stack(self):
return False
+ def is_raw_sp(self):
+ return False
+
def is_reg(self):
return False
@@ -145,7 +148,27 @@
return self.position + 10000
def is_float(self):
- return type == FLOAT
+ return self.type == FLOAT
+
+class RawSPStackLocation(AssemblerLocation):
+ _immutable_ = True
+
+ def __init__(self, sp_offset, type=INT):
+ if type == FLOAT:
+ self.width = DOUBLE_WORD
+ else:
+ self.width = WORD
+ self.value = sp_offset
+ self.type = type
+
+ def __repr__(self):
+ return 'SP(%s)+%d' % (self.type, self.value,)
+
+ def is_raw_sp(self):
+ return True
+
+ def is_float(self):
+ return self.type == FLOAT
def imm(i):
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -13,8 +13,7 @@
gen_emit_float_cmp_op,
gen_emit_float_cmp_op_guard,
gen_emit_unary_float_op,
- saved_registers,
- count_reg_args)
+ saved_registers)
from rpython.jit.backend.arm.helper.regalloc import check_imm_arg
from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
from rpython.jit.backend.arm.jump import remap_frame_layout
@@ -31,8 +30,7 @@
from rpython.rlib.objectmodel import we_are_translated
from rpython.rtyper.lltypesystem import rstr, rffi, lltype
from rpython.rtyper.annlowlevel import cast_instance_to_gcref
-
-NO_FORCE_INDEX = -1
+from rpython.jit.backend.arm import callbuilder
class ArmGuardToken(GuardToken):
@@ -339,217 +337,36 @@
return fcond
def emit_op_call(self, op, arglocs, regalloc, fcond):
- resloc = arglocs[0]
- adr = arglocs[1]
- arglist = arglocs[2:]
+ return self._emit_call(op, arglocs, fcond=fcond)
+
+ def _emit_call(self, op, arglocs, is_call_release_gil=False, fcond=c.AL):
+ # args = [resloc, size, sign, args...]
+ from rpython.jit.backend.llsupport.descr import CallDescr
+
+ cb = callbuilder.get_callbuilder(self.cpu, self, arglocs[3], arglocs[4:], arglocs[0])
+
descr = op.getdescr()
- size = descr.get_result_size()
- signed = descr.is_result_signed()
- cond = self._emit_call(adr, arglist,
- fcond, resloc, (size, signed))
- return cond
+ assert isinstance(descr, CallDescr)
+ cb.callconv = descr.get_call_conv()
+ cb.argtypes = descr.get_arg_types()
+ cb.restype = descr.get_result_type()
+ sizeloc = arglocs[1]
+ assert sizeloc.is_imm()
+ cb.ressize = sizeloc.value
+ signloc = arglocs[2]
+ assert signloc.is_imm()
+ cb.ressign = signloc.value
- def _emit_call(self, adr, arglocs, fcond=c.AL, resloc=None,
- result_info=(-1, -1),
- # whether to worry about a CALL that can collect; this
- # is always true except in call_release_gil
- can_collect=True):
- if self.cpu.cpuinfo.hf_abi:
- stack_args, adr = self._setup_call_hf(adr, arglocs, fcond,
- resloc, result_info)
+ if is_call_release_gil:
+ cb.emit_call_release_gil()
else:
- stack_args, adr = self._setup_call_sf(adr, arglocs, fcond,
- resloc, result_info)
-
- if can_collect:
- # we push *now* the gcmap, describing the status of GC registers
- # after the rearrangements done just above, ignoring the return
- # value eax, if necessary
- noregs = self.cpu.gc_ll_descr.is_shadow_stack()
- gcmap = self._regalloc.get_gcmap([r.r0], noregs=noregs)
- self.push_gcmap(self.mc, gcmap, store=True)
- #the actual call
- if adr.is_imm():
- self.mc.BL(adr.value)
- elif adr.is_stack():
- self.mov_loc_loc(adr, r.ip)
- adr = r.ip
- else:
- assert adr.is_reg()
- if adr.is_reg():
- self.mc.BLX(adr.value)
- self._restore_sp(stack_args, fcond)
-
- # ensure the result is wellformed and stored in the correct location
- if resloc is not None:
- if resloc.is_vfp_reg() and not self.cpu.cpuinfo.hf_abi:
- # move result to the allocated register
- self.mov_to_vfp_loc(r.r0, r.r1, resloc)
- elif resloc.is_reg() and result_info != (-1, -1):
- self._ensure_result_bit_extension(resloc, result_info[0],
- result_info[1])
- if can_collect:
- self._reload_frame_if_necessary(self.mc)
- self.pop_gcmap(self.mc)
+ cb.emit()
return fcond
- def _restore_sp(self, stack_args, fcond):
- # readjust the sp in case we passed some args on the stack
- if len(stack_args) > 0:
- n = 0
- for arg in stack_args:
- if arg is None or arg.type != FLOAT:
- n += WORD
- else:
- n += DOUBLE_WORD
- self._adjust_sp(-n, fcond=fcond)
- assert n % 8 == 0 # sanity check
-
- def _adjust_sp(self, n, cb=None, fcond=c.AL, base_reg=r.sp):
- if cb is None:
- cb = self.mc
- if n < 0:
- n = -n
- rev = True
- else:
- rev = False
- if n <= 0xFF and fcond == c.AL:
- if rev:
- cb.ADD_ri(r.sp.value, base_reg.value, n)
- else:
- cb.SUB_ri(r.sp.value, base_reg.value, n)
- else:
- cb.gen_load_int(r.ip.value, n, cond=fcond)
- if rev:
- cb.ADD_rr(r.sp.value, base_reg.value, r.ip.value, cond=fcond)
- else:
- cb.SUB_rr(r.sp.value, base_reg.value, r.ip.value, cond=fcond)
-
-
- def _collect_stack_args_sf(self, arglocs):
- n_args = len(arglocs)
- reg_args = count_reg_args(arglocs)
- # all arguments past the 4th go on the stack
- # first we need to prepare the list so it stays aligned
- stack_args = []
- count = 0
- if n_args > reg_args:
- for i in range(reg_args, n_args):
- arg = arglocs[i]
- if arg.type != FLOAT:
- count += 1
- else:
- if count % 2 != 0:
- stack_args.append(None)
- count = 0
- stack_args.append(arg)
- if count % 2 != 0:
- stack_args.append(None)
- return stack_args
-
- def _push_stack_args(self, stack_args):
- #then we push every thing on the stack
- for i in range(len(stack_args) - 1, -1, -1):
- arg = stack_args[i]
- if arg is None:
- self.mc.PUSH([r.ip.value])
- else:
- self.regalloc_push(arg)
-
- def _setup_call_sf(self, adr, arglocs, fcond=c.AL,
- resloc=None, result_info=(-1, -1)):
- reg_args = count_reg_args(arglocs)
- stack_args = self._collect_stack_args_sf(arglocs)
- self._push_stack_args(stack_args)
- # collect variables that need to go in registers and the registers they
- # will be stored in
- num = 0
- count = 0
- non_float_locs = []
- non_float_regs = []
- float_locs = []
- for i in range(reg_args):
- arg = arglocs[i]
- if arg.type == FLOAT and count % 2 != 0:
- num += 1
- count = 0
- reg = r.caller_resp[num]
-
- if arg.type == FLOAT:
- float_locs.append((arg, reg))
- else:
- non_float_locs.append(arg)
- non_float_regs.append(reg)
-
- if arg.type == FLOAT:
- num += 2
- else:
- num += 1
- count += 1
- # Check that the address of the function we want to call is not
- # currently stored in one of the registers used to pass the arguments.
- # If this happens to be the case we remap the register to r4 and use r4
- # to call the function
- if adr in non_float_regs:
- non_float_locs.append(adr)
- non_float_regs.append(r.r4)
- adr = r.r4
- # remap values stored in core registers
- remap_frame_layout(self, non_float_locs, non_float_regs, r.ip)
-
- for loc, reg in float_locs:
- self.mov_from_vfp_loc(loc, reg, r.all_regs[reg.value + 1])
- return stack_args, adr
-
- def _setup_call_hf(self, adr, arglocs, fcond=c.AL,
- resloc=None, result_info=(-1, -1)):
- non_float_locs = []
- non_float_regs = []
- float_locs = []
- float_regs = []
- stack_args = []
- count = 0 # stack alignment counter
- for arg in arglocs:
- if arg.type != FLOAT:
- if len(non_float_regs) < len(r.argument_regs):
- reg = r.argument_regs[len(non_float_regs)]
- non_float_locs.append(arg)
- non_float_regs.append(reg)
- else: # non-float argument that needs to go on the stack
- count += 1
- stack_args.append(arg)
- else:
- if len(float_regs) < len(r.vfp_argument_regs):
- reg = r.vfp_argument_regs[len(float_regs)]
- float_locs.append(arg)
- float_regs.append(reg)
- else: # float argument that needs to go on the stack
- if count % 2 != 0:
- stack_args.append(None)
- count = 0
- stack_args.append(arg)
- # align the stack
- if count % 2 != 0:
- stack_args.append(None)
- self._push_stack_args(stack_args)
- # Check that the address of the function we want to call is not
- # currently stored in one of the registers used to pass the arguments.
- # If this happens to be the case we remap the register to r4 and use r4
- # to call the function
- if adr in non_float_regs:
- non_float_locs.append(adr)
- non_float_regs.append(r.r4)
- adr = r.r4
- # remap values stored in core registers
- remap_frame_layout(self, non_float_locs, non_float_regs, r.ip)
- # remap values stored in vfp registers
- remap_frame_layout(self, float_locs, float_regs, r.vfp_ip)
-
- return stack_args, adr
-
def emit_op_same_as(self, op, arglocs, regalloc, fcond):
argloc, resloc = arglocs
- self.mov_loc_loc(argloc, resloc)
+ if argloc is not resloc:
+ self.mov_loc_loc(argloc, resloc)
return fcond
emit_op_cast_ptr_to_int = emit_op_same_as
@@ -1037,9 +854,8 @@
length_loc = bytes_loc
# call memcpy()
regalloc.before_call()
- self._emit_call(imm(self.memcpy_addr),
- [dstaddr_loc, srcaddr_loc, length_loc],
- can_collect=False)
+ self.simple_call_no_collect(imm(self.memcpy_addr),
+ [dstaddr_loc, srcaddr_loc, length_loc])
regalloc.rm.possibly_free_var(length_box)
regalloc.rm.possibly_free_var(dstaddr_box)
regalloc.rm.possibly_free_var(srcaddr_box)
@@ -1127,14 +943,14 @@
vloc = imm(0)
self.call_assembler(op, guard_op, argloc, vloc, result_loc, tmploc)
self._emit_guard_may_force(guard_op,
- regalloc._prepare_guard(guard_op), guard_op.numargs())
+ regalloc._prepare_guard(guard_op))
return fcond
def _call_assembler_emit_call(self, addr, argloc, resloc):
- self._emit_call(addr, [argloc], resloc=resloc)
+ self.simple_call(addr, [argloc], result_loc=resloc)
def _call_assembler_emit_helper_call(self, addr, arglocs, resloc):
- self._emit_call(addr, arglocs, resloc=resloc)
+ self.simple_call(addr, arglocs, result_loc=resloc)
def _call_assembler_check_descr(self, value, tmploc):
ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
@@ -1213,20 +1029,14 @@
fcond):
self._store_force_index(guard_op)
numargs = op.numargs()
- callargs = arglocs[2:numargs + 1] # extract the arguments to the call
- adr = arglocs[1]
- resloc = arglocs[0]
+ callargs = arglocs[:numargs + 3] # extract the arguments to the call
+ guardargs = arglocs[len(callargs):]
#
- descr = op.getdescr()
- size = descr.get_result_size()
- signed = descr.is_result_signed()
- #
- self._emit_call(adr, callargs, fcond,
- resloc, (size, signed))
- self._emit_guard_may_force(guard_op, arglocs[1 + numargs:], numargs)
+ self._emit_call(op, callargs, fcond=fcond)
+ self._emit_guard_may_force(guard_op, guardargs)
return fcond
- def _emit_guard_may_force(self, guard_op, arglocs, numargs):
+ def _emit_guard_may_force(self, guard_op, arglocs):
ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
self.mc.LDR_ri(r.ip.value, r.fp.value, imm=ofs)
self.mc.CMP_ri(r.ip.value, 0)
@@ -1235,68 +1045,14 @@
def emit_guard_call_release_gil(self, op, guard_op, arglocs, regalloc,
fcond):
-
+ numargs = op.numargs()
+ callargs = arglocs[:numargs + 3] # extract the arguments to the call
+ guardargs = arglocs[len(callargs):] # extrat the arguments for the guard
self._store_force_index(guard_op)
- # first, close the stack in the sense of the asmgcc GC root tracker
- gcrootmap = self.cpu.gc_ll_descr.gcrootmap
- numargs = op.numargs()
- callargs = arglocs[2:numargs + 1] # extract the arguments to the call
- adr = arglocs[1]
- resloc = arglocs[0]
-
- if gcrootmap:
- # we put the gcmap now into the frame before releasing the GIL,
- # and pop it below after reacquiring the GIL. The assumption
- # is that this gcmap describes correctly the situation at any
- # point in-between: all values containing GC pointers should
- # be safely saved out of registers by now, and will not be
- # manipulated by any of the following CALLs.
- gcmap = self._regalloc.get_gcmap(noregs=True)
- self.push_gcmap(self.mc, gcmap, store=True)
- self.call_release_gil(gcrootmap, arglocs, regalloc, fcond)
- # do the call
- descr = op.getdescr()
- size = descr.get_result_size()
- signed = descr.is_result_signed()
- #
- self._emit_call(adr, callargs, fcond,
- resloc, (size, signed),
- can_collect=False)
- # then reopen the stack
- if gcrootmap:
- self.call_reacquire_gil(gcrootmap, resloc, regalloc, fcond)
- self.pop_gcmap(self.mc) # remove the gcmap saved above
-
- self._emit_guard_may_force(guard_op, arglocs[numargs+1:], numargs)
+ self._emit_call(op, callargs, is_call_release_gil=True)
+ self._emit_guard_may_force(guard_op, guardargs)
return fcond
- def call_release_gil(self, gcrootmap, save_registers, regalloc, fcond):
- # Save caller saved registers and do the call
- # NOTE: We assume that the floating point registers won't be modified.
- assert gcrootmap.is_shadow_stack
- with saved_registers(self.mc, regalloc.rm.save_around_call_regs):
- self._emit_call(imm(self.releasegil_addr), [],
- fcond, can_collect=False)
-
- def call_reacquire_gil(self, gcrootmap, save_loc, regalloc, fcond):
- # save the previous result into the stack temporarily, in case it is in
- # a caller saved register.
- # NOTE: like with call_release_gil(), we assume that we don't need to
- # save vfp regs in this case. Besides the result location
- regs_to_save = []
- vfp_regs_to_save = []
- if save_loc and save_loc in regalloc.rm.save_around_call_regs:
- regs_to_save.append(save_loc)
- regs_to_save.append(r.ip) # for alingment
- elif save_loc and save_loc in regalloc.vfprm.save_around_call_regs:
- vfp_regs_to_save.append(save_loc)
- assert gcrootmap.is_shadow_stack
- # call the reopenstack() function (also reacquiring the GIL)
- with saved_registers(self.mc, regs_to_save, vfp_regs_to_save):
- self._emit_call(imm(self.reacqgil_addr), [], fcond,
- can_collect=False)
- self._reload_frame_if_necessary(self.mc)
-
def _store_force_index(self, guard_op):
faildescr = guard_op.getdescr()
ofs = self.cpu.get_ofs_of_frame_field('jf_force_descr')
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -34,6 +34,7 @@
from rpython.jit.backend.llsupport.descr import unpack_fielddescr
from rpython.jit.backend.llsupport.descr import unpack_interiorfielddescr
from rpython.rlib.rarithmetic import r_uint
+from rpython.jit.backend.llsupport.descr import CallDescr
# xxx hack: set a default value for TargetToken._ll_loop_code. If 0, we know
@@ -555,9 +556,27 @@
return self._prepare_call(op)
def _prepare_call(self, op, force_store=[], save_all_regs=False):
- args = [None] * (op.numargs() + 1)
+ args = [None] * (op.numargs() + 3)
+ calldescr = op.getdescr()
+ assert isinstance(calldescr, CallDescr)
+ assert len(calldescr.arg_classes) == op.numargs() - 1
+
for i in range(op.numargs()):
- args[i + 1] = self.loc(op.getarg(i))
+ args[i + 3] = self.loc(op.getarg(i))
+
+ size = calldescr.get_result_size()
+ sign = calldescr.is_result_signed()
+ if sign:
+ sign_loc = imm(1)
+ else:
+ sign_loc = imm(0)
+ args[1] = imm(size)
+ args[2] = sign_loc
+
+ args[0] = self._call(op, args, force_store, save_all_regs)
+ return args
+
+ def _call(self, op, arglocs, force_store=[], save_all_regs=False):
# spill variables that need to be saved around calls
self.vfprm.before_call(save_all_regs=save_all_regs)
if not save_all_regs:
@@ -565,11 +584,11 @@
if gcrootmap and gcrootmap.is_shadow_stack:
save_all_regs = 2
self.rm.before_call(save_all_regs=save_all_regs)
+ self.before_call_called = True
+ resloc = None
if op.result:
resloc = self.after_call(op.result)
- args[0] = resloc
- self.before_call_called = True
- return args
+ return resloc
def prepare_op_call_malloc_gc(self, op, fcond):
return self._prepare_call(op)
@@ -1153,9 +1172,9 @@
def prepare_guard_call_assembler(self, op, guard_op, fcond):
locs = self.locs_for_call_assembler(op, guard_op)
tmploc = self.get_scratch_reg(INT, selected_reg=r.r0)
- call_locs = self._prepare_call(op, save_all_regs=True)
+ resloc = self._call(op, locs + [tmploc], save_all_regs=True)
self.possibly_free_vars(guard_op.getfailargs())
- return locs + [call_locs[0], tmploc]
+ return locs + [resloc, tmploc]
def _prepare_args_for_new_op(self, new_args):
gc_ll_descr = self.cpu.gc_ll_descr
diff --git a/rpython/jit/backend/arm/test/test_regalloc_mov.py b/rpython/jit/backend/arm/test/test_regalloc_mov.py
--- a/rpython/jit/backend/arm/test/test_regalloc_mov.py
+++ b/rpython/jit/backend/arm/test/test_regalloc_mov.py
@@ -1,9 +1,10 @@
from rpython.rlib.objectmodel import instantiate
from rpython.jit.backend.arm.assembler import AssemblerARM
-from rpython.jit.backend.arm.locations import imm, ConstFloatLoc,\
- RegisterLocation, StackLocation, \
- VFPRegisterLocation, get_fp_offset
-from rpython.jit.backend.arm.registers import lr, ip, fp, vfp_ip
+from rpython.jit.backend.arm.locations import imm, ConstFloatLoc
+from rpython.jit.backend.arm.locations import RegisterLocation, StackLocation
+from rpython.jit.backend.arm.locations import VFPRegisterLocation, get_fp_offset
+from rpython.jit.backend.arm.locations import RawSPStackLocation
+from rpython.jit.backend.arm.registers import lr, ip, fp, vfp_ip, sp
from rpython.jit.backend.arm.conditions import AL
from rpython.jit.backend.arm.arch import WORD
from rpython.jit.metainterp.history import FLOAT
@@ -54,6 +55,12 @@
addr = int(value) # whatever
return ConstFloatLoc(addr)
+def raw_stack(i):
+ return RawSPStackLocation(i)
+
+def raw_stack_float(i):
+ return RawSPStackLocation(i, type=FLOAT)
+
class MockBuilder(object):
def __init__(self):
@@ -79,13 +86,13 @@
result = self.builder.instrs
assert result == expected
-
-class TestRegallocMov(BaseMovTest):
-
def mov(self, a, b, expected=None):
self.asm.regalloc_mov(a, b)
self.validate(expected)
+
+class TestRegallocMov(BaseMovTest):
+
def test_mov_imm_to_reg(self):
val = imm(123)
reg = r(7)
@@ -102,45 +109,37 @@
val = imm(100)
s = stack(7)
expected = [
- mi('PUSH', [lr.value], cond=AL),
mi('gen_load_int', lr.value, 100, cond=AL),
mi('STR_ri', lr.value, fp.value, imm=s.value, cond=AL),
- mi('POP', [lr.value], cond=AL)]
+ ]
self.mov(val, s, expected)
def test_mov_big_imm_to_stacklock(self):
val = imm(65536)
s = stack(7)
expected = [
- mi('PUSH', [lr.value], cond=AL),
mi('gen_load_int', lr.value, 65536, cond=AL),
mi('STR_ri', lr.value, fp.value, imm=s.value, cond=AL),
- mi('POP', [lr.value], cond=AL)]
-
+ ]
self.mov(val, s, expected)
def test_mov_imm_to_big_stacklock(self):
val = imm(100)
s = stack(8191)
- expected = [mi('PUSH', [lr.value], cond=AL),
- mi('gen_load_int', lr.value, 100, cond=AL),
- mi('PUSH', [ip.value], cond=AL),
+ expected = [ mi('gen_load_int', lr.value, 100, cond=AL),
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('STR_rr', lr.value, fp.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL),
- mi('POP', [lr.value], cond=AL)]
+ ]
self.mov(val, s, expected)
def test_mov_big_imm_to_big_stacklock(self):
val = imm(65536)
s = stack(8191)
- expected = [mi('PUSH', [lr.value], cond=AL),
+ expected = [
mi('gen_load_int', lr.value, 65536, cond=AL),
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('STR_rr', lr.value, fp.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL),
- mi('POP', [lr.value], cond=AL)]
+ ]
self.mov(val, s, expected)
def test_mov_reg_to_reg(self):
@@ -158,10 +157,10 @@
def test_mov_reg_to_big_stackloc(self):
s = stack(8191)
r6 = r(6)
- expected = [mi('PUSH', [ip.value], cond=AL),
+ expected = [
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('STR_rr', r6.value, fp.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(r6, s, expected)
def test_mov_stack_to_reg(self):
@@ -174,10 +173,8 @@
s = stack(8191)
r6 = r(6)
expected = [
- mi('PUSH', [lr.value], cond=AL),
- mi('gen_load_int', lr.value, 32940, cond=AL),
- mi('LDR_rr', r6.value, fp.value, lr.value, cond=AL),
- mi('POP', [lr.value], cond=AL),
+ mi('gen_load_int', ip.value, 32940, cond=AL),
+ mi('LDR_rr', r6.value, fp.value, ip.value, cond=AL),
]
self.mov(s, r6, expected)
@@ -185,10 +182,9 @@
f = imm_float(3.5)
reg = vfp(5)
expected = [
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, f.value, cond=AL),
mi('VLDR', 5, ip.value, imm=0, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(f, reg, expected)
def test_mov_vfp_reg_to_vfp_reg(self):
@@ -206,11 +202,11 @@
def test_mov_vfp_reg_to_large_stackloc(self):
reg = vfp(7)
s = stack_float(800)
- expected = [mi('PUSH', [ip.value], cond=AL),
+ expected = [
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('ADD_rr', ip.value, fp.value, ip.value, cond=AL),
mi('VSTR', reg.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(reg, s, expected)
def test_mov_stack_to_vfp_reg(self):
@@ -222,11 +218,11 @@
def test_mov_big_stackloc_to_vfp_reg(self):
reg = vfp(7)
s = stack_float(800)
- expected = [mi('PUSH', [ip.value], cond=AL),
+ expected = [
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('ADD_rr', ip.value, fp.value, ip.value, cond=AL),
mi('VSTR', reg.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(reg, s, expected)
def test_unsopported_cases(self):
@@ -265,8 +261,6 @@
py.test.raises(AssertionError,
'self.asm.regalloc_mov(stack(1), vfp(2))')
py.test.raises(AssertionError,
- 'self.asm.regalloc_mov(stack(1), lr)')
- py.test.raises(AssertionError,
'self.asm.regalloc_mov(stack_float(1), imm(2))')
py.test.raises(AssertionError,
'self.asm.regalloc_mov(stack_float(1), imm_float(2))')
@@ -312,12 +306,11 @@
r1 = r(1)
r2 = r(2)
e = [
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('LDR_rr', r1.value, fp.value, ip.value, cond=AL),
mi('ADD_ri', ip.value, ip.value, imm=WORD, cond=AL),
mi('LDR_rr', r2.value, fp.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(s, r1, r2, e)
def test_from_imm_float(self):
@@ -325,11 +318,10 @@
r1 = r(1)
r2 = r(2)
e = [
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, i.value, cond=AL),
mi('LDR_ri', r1.value, ip.value, cond=AL),
mi('LDR_ri', r2.value, ip.value, imm=4, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(i, r1, r2, e)
def test_unsupported(self):
@@ -369,12 +361,11 @@
r1 = r(1)
r2 = r(2)
e = [
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('STR_rr', r1.value, fp.value, ip.value, cond=AL),
mi('ADD_ri', ip.value, ip.value, imm=4, cond=AL),
mi('STR_rr', r2.value, fp.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.mov(r1, r2, s, e)
def unsupported(self):
@@ -408,10 +399,9 @@
def test_push_imm_float(self):
f = imm_float(7)
- e = [mi('PUSH', [ip.value], cond=AL),
+ e = [
mi('gen_load_int', ip.value, 7, cond=AL),
mi('VLDR', vfp_ip.value, ip.value, imm=0, cond=AL),
- mi('POP', [ip.value], cond=AL),
mi('VPUSH', [vfp_ip.value], cond=AL)
]
self.push(f, e)
@@ -426,10 +416,8 @@
def test_push_big_stack(self):
s = stack(1025)
e = [
- mi('PUSH', [lr.value], cond=AL),
mi('gen_load_int', lr.value, s.value, cond=AL),
mi('LDR_rr', ip.value, fp.value, lr.value, cond=AL),
- mi('POP', [lr.value], cond=AL),
mi('PUSH', [ip.value], cond=AL)
]
self.push(s, e)
@@ -450,11 +438,9 @@
def test_push_large_stackfloat(self):
sf = stack_float(100)
e = [
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, sf.value, cond=AL),
mi('ADD_rr', ip.value, fp.value, ip.value, cond=AL),
mi('VLDR', vfp_ip.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL),
mi('VPUSH', [vfp_ip.value], cond=AL),
]
self.push(sf, e)
@@ -486,10 +472,8 @@
s = stack(1200)
e = [
mi('POP', [ip.value], cond=AL),
- mi('PUSH', [lr.value], cond=AL),
mi('gen_load_int', lr.value, s.value, cond=AL),
mi('STR_rr', ip.value, fp.value, lr.value, cond=AL),
- mi('POP', [lr.value], cond=AL)
]
self.pop(s, e)
@@ -505,13 +489,88 @@
s = stack_float(1200)
e = [
mi('VPOP', [vfp_ip.value], cond=AL),
- mi('PUSH', [ip.value], cond=AL),
mi('gen_load_int', ip.value, s.value, cond=AL),
mi('ADD_rr', ip.value, fp.value, ip.value, cond=AL),
mi('VSTR', vfp_ip.value, ip.value, cond=AL),
- mi('POP', [ip.value], cond=AL)]
+ ]
self.pop(s, e)
def test_unsupported(self):
py.test.raises(AssertionError, 'self.asm.regalloc_pop(imm(1))')
py.test.raises(AssertionError, 'self.asm.regalloc_pop(imm_float(1))')
+
+class TestRawStackLocs(BaseMovTest):
+ def test_unsupported(self):
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), imm(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), imm_float(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), r(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), vfp(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), stack(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(raw_stack(0), stack_float(1))')
+
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(imm_float(1), raw_stack(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(imm(1), raw_stack_float(1))')
+
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(vfp(1), raw_stack(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(r(1), raw_stack_float(1))')
+
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(stack_float(1), raw_stack(1))')
+ py.test.raises(AssertionError, 'self.asm.regalloc_mov(stack(1), raw_stack_float(1))')
+
+ def test_from_imm(self):
+ s = raw_stack(1024)
+ i = imm(999)
+ e = [
+ mi('gen_load_int', lr.value, i.value, cond=AL),
+ mi('gen_load_int', ip.value, s.value, cond=AL),
+ mi('STR_rr', lr.value, sp.value, ip.value, cond=AL),
+ ]
+ self.mov(i, s, e)
+
+ def test_from_vfp_imm(self):
+ s = raw_stack_float(1024)
+ i = imm_float(999)
+ e = [
+ mi('gen_load_int', ip.value, i.value, cond=AL),
+ mi('VLDR', vfp_ip.value, ip.value, cond=AL, imm=0),
+ mi('gen_load_int', ip.value, s.value, cond=AL),
+ mi('ADD_rr', ip.value, sp.value, ip.value, cond=AL),
+ mi('VSTR', vfp_ip.value, ip.value, cond=AL),
+ ]
+ self.mov(i, s, e)
+
+ def test_from_reg(self):
+ s = raw_stack(1024)
+ reg = r(10)
+ e = [mi('gen_load_int', ip.value, s.value, cond=AL),
+ mi('STR_rr', reg.value, sp.value, ip.value, cond=AL),
+ ]
+ self.mov(reg, s, e)
+
+ def test_from_vfp_reg(self):
+ s = raw_stack_float(1024)
+ reg = vfp(10)
+ e = [mi('gen_load_int', ip.value, s.value, cond=AL),
+ mi('ADD_rr', ip.value, sp.value, ip.value, cond=AL),
+ mi('VSTR', reg.value, ip.value, cond=AL),
+ ]
+ self.mov(reg, s, e)
+
+ def test_from_stack(self):
+ s = raw_stack(1024)
+ reg = stack(10)
+ e = [mi('LDR_ri', ip.value, fp.value, imm=216, cond=AL),
+ mi('gen_load_int', lr.value, s.value, cond=AL),
+ mi('STR_rr', ip.value, sp.value, lr.value, cond=AL),
+ ]
+ self.mov(reg, s, e)
+
+ def test_from_vfp_stack(self):
+ s = raw_stack_float(1024)
+ reg = stack_float(10)
+ e = [mi('VLDR', vfp_ip.value, fp.value, imm=220, cond=AL),
+ mi('gen_load_int', ip.value, s.value, cond=AL),
+ mi('ADD_rr', ip.value, sp.value, ip.value, cond=AL),
+ mi('VSTR', vfp_ip.value, ip.value, cond=AL),
+ ]
+ self.mov(reg, s, e)
diff --git a/rpython/jit/backend/llsupport/assembler.py b/rpython/jit/backend/llsupport/assembler.py
--- a/rpython/jit/backend/llsupport/assembler.py
+++ b/rpython/jit/backend/llsupport/assembler.py
@@ -372,6 +372,9 @@
self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
+ def _is_asmgcc(self):
+ gcrootmap = self.cpu.gc_ll_descr.gcrootmap
+ return bool(gcrootmap) and not gcrootmap.is_shadow_stack
def debug_bridge(descr_number, rawstart, codeendpos):
diff --git a/rpython/jit/backend/llsupport/callbuilder.py b/rpython/jit/backend/llsupport/callbuilder.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/llsupport/callbuilder.py
@@ -0,0 +1,92 @@
+from rpython.rlib.clibffi import FFI_DEFAULT_ABI
+
+class AbstractCallBuilder(object):
+
+ # this is the calling convention (can be FFI_STDCALL on Windows)
+ callconv = FFI_DEFAULT_ABI
+
+ # is it for the main CALL of a call_release_gil?
+ is_call_release_gil = False
+
+ # this can be set to guide more complex calls: gives the detailed
+ # type of the arguments
+ argtypes = ""
+ ressign = False
+
+
+ def __init__(self, assembler, fnloc, arglocs, resloc, restype, ressize):
+ self.fnloc = fnloc
+ self.arglocs = arglocs
+ self.asm = assembler
+ self.mc = assembler.mc
+ self.resloc = resloc
+ self.restype = restype
+ self.ressize = ressize
+
+ def emit_no_collect(self):
+ """Emit a call that cannot collect."""
+ self.prepare_arguments()
+ self.emit_raw_call()
+ self.restore_stack_pointer()
+ self.load_result()
+
+ def emit(self):
+ """Emit a regular call; not for CALL_RELEASE_GIL."""
+ self.prepare_arguments()
+ self.push_gcmap()
+ self.emit_raw_call()
+ self.restore_stack_pointer()
+ self.pop_gcmap()
+ self.load_result()
+
+ def emit_call_release_gil(self):
+ """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
+ and reacqgil_addr."""
+ self.select_call_release_gil_mode()
+ self.prepare_arguments()
+ self.push_gcmap_for_call_release_gil()
+ self.call_releasegil_addr_and_move_real_arguments()
+ self.emit_raw_call()
+ self.restore_stack_pointer()
+ self.move_real_result_and_call_reacqgil_addr()
+ self.pop_gcmap()
+ self.load_result()
+
+ def call_releasegil_addr_and_move_real_arguments(self):
+ raise NotImplementedError
+
+ def move_real_result_and_call_reacqgil_addr(self):
+ raise NotImplementedError
+
+ def select_call_release_gil_mode(self):
+ """Overridden in CallBuilder64"""
+ self.is_call_release_gil = True
+
+ def prepare_arguments(self):
+ raise NotImplementedError
+
+ def push_gcmap(self):
+ raise NotImplementedError
+
+ def push_gcmap_for_call_release_gil(self):
+ assert self.is_call_release_gil
+ # we put the gcmap now into the frame before releasing the GIL,
+ # and pop it after reacquiring the GIL. The assumption
+ # is that this gcmap describes correctly the situation at any
+ # point in-between: all values containing GC pointers should
+ # be safely saved out of registers by now, and will not be
+ # manipulated by any of the following CALLs.
+ gcmap = self.asm._regalloc.get_gcmap(noregs=True)
+ self.asm.push_gcmap(self.mc, gcmap, store=True)
+
+ def pop_gcmap(self):
+ raise NotImplementedError
+
+ def emit_raw_call(self):
+ raise NotImplementedError
+
+ def restore_stack_pointer(self):
+ raise NotImplementedError
+
+ def load_result(self):
+ raise NotImplementedError
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -999,10 +999,6 @@
self.implement_guard(guard_token, checkfalsecond)
return genop_cmp_guard_float
- def _is_asmgcc(self):
- gcrootmap = self.cpu.gc_ll_descr.gcrootmap
- return bool(gcrootmap) and not gcrootmap.is_shadow_stack
-
def simple_call(self, fnloc, arglocs, result_loc=eax):
if result_loc is xmm0:
result_type = FLOAT
diff --git a/rpython/jit/backend/x86/callbuilder.py b/rpython/jit/backend/x86/callbuilder.py
--- a/rpython/jit/backend/x86/callbuilder.py
+++ b/rpython/jit/backend/x86/callbuilder.py
@@ -8,6 +8,7 @@
r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
RegLoc, RawEspLoc, RawEbpLoc, imm, ImmedLoc)
from rpython.jit.backend.x86.jump import remap_frame_layout
+from rpython.jit.backend.llsupport.callbuilder import AbstractCallBuilder
# darwin requires the stack to be 16 bytes aligned on calls.
@@ -18,77 +19,30 @@
return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
-
-class AbstractCallBuilder(object):
+class CallBuilderX86(AbstractCallBuilder):
# max number of words we have room in esp; if we need more for
# arguments, we need to decrease esp temporarily
stack_max = PASS_ON_MY_FRAME
- # this can be set to guide more complex calls: gives the detailed
- # type of the arguments
- argtypes = ""
- ressign = False
-
- # this is the calling convention (can be FFI_STDCALL on Windows)
- callconv = FFI_DEFAULT_ABI
-
- # is it for the main CALL of a call_release_gil?
- is_call_release_gil = False
-
# set by save_result_value()
tmpresloc = None
-
def __init__(self, assembler, fnloc, arglocs,
resloc=eax, restype=INT, ressize=WORD):
+ AbstractCallBuilder.__init__(self, assembler, fnloc, arglocs,
+ resloc, restype, ressize)
# Avoid tons of issues with a non-immediate fnloc by sticking it
# as an extra argument if needed
self.fnloc_is_immediate = isinstance(fnloc, ImmedLoc)
- if self.fnloc_is_immediate:
- self.fnloc = fnloc
- self.arglocs = arglocs
- else:
+ if not self.fnloc_is_immediate:
+ self.fnloc = None
self.arglocs = arglocs + [fnloc]
- self.asm = assembler
- self.mc = assembler.mc
- self.resloc = resloc
- self.restype = restype
- self.ressize = ressize
self.current_esp = 0 # 0 or (usually) negative, counted in bytes
- def emit_no_collect(self):
- """Emit a call that cannot collect."""
- self.prepare_arguments()
- self.emit_raw_call()
- self.restore_esp()
- self.load_result()
-
- def emit(self):
- """Emit a regular call; not for CALL_RELEASE_GIL."""
- self.prepare_arguments()
- self.push_gcmap()
- self.emit_raw_call()
- self.restore_esp()
- self.pop_gcmap()
- self.load_result()
-
- def emit_call_release_gil(self):
- """Emit a CALL_RELEASE_GIL, including calls to releasegil_addr
- and reacqgil_addr."""
- self.select_call_release_gil_mode()
- self.prepare_arguments()
- self.push_gcmap_for_call_release_gil()
- self.call_releasegil_addr_and_move_real_arguments()
- self.emit_raw_call()
- self.restore_esp()
- self.move_real_result_and_call_reacqgil_addr()
- self.pop_gcmap()
- self.load_result()
-
def select_call_release_gil_mode(self):
"""Overridden in CallBuilder64"""
- self.is_call_release_gil = True
+ AbstractCallBuilder.select_call_release_gil_mode(self)
if self.asm._is_asmgcc():
from rpython.memory.gctransform import asmgcroot
self.stack_max = PASS_ON_MY_FRAME - asmgcroot.JIT_USE_WORDS
@@ -105,7 +59,7 @@
self.current_esp -= align * WORD
self.mc.SUB_ri(esp.value, align * WORD)
- def restore_esp(self, target_esp=0):
+ def restore_stack_pointer(self, target_esp=0):
if self.current_esp != target_esp:
self.mc.ADD_ri(esp.value, target_esp - self.current_esp)
self.current_esp = target_esp
@@ -140,17 +94,6 @@
gcmap = self.asm._regalloc.get_gcmap([eax], noregs=noregs)
self.asm.push_gcmap(self.mc, gcmap, store=True)
- def push_gcmap_for_call_release_gil(self):
- assert self.is_call_release_gil
- # we put the gcmap now into the frame before releasing the GIL,
- # and pop it after reacquiring the GIL. The assumption
- # is that this gcmap describes correctly the situation at any
- # point in-between: all values containing GC pointers should
- # be safely saved out of registers by now, and will not be
- # manipulated by any of the following CALLs.
- gcmap = self.asm._regalloc.get_gcmap(noregs=True)
- self.asm.push_gcmap(self.mc, gcmap, store=True)
-
def pop_gcmap(self):
self.asm._reload_frame_if_necessary(self.mc)
if self.change_extra_stack_depth:
@@ -204,7 +147,7 @@
self.mc.ADD(ebp, imm(1)) # ebp any more
#
self.restore_register_arguments()
- self.restore_esp(initial_esp)
+ self.restore_stack_pointer(initial_esp)
def save_register_arguments(self):
"""Overridden in CallBuilder64"""
@@ -248,7 +191,7 @@
raise NotImplementedError
-class CallBuilder32(AbstractCallBuilder):
+class CallBuilder32(CallBuilderX86):
def prepare_arguments(self):
arglocs = self.arglocs
@@ -318,7 +261,7 @@
else:
self.mc.MOV(resloc, self.tmpresloc)
else:
- AbstractCallBuilder.load_result(self)
+ CallBuilderX86.load_result(self)
def save_result_value(self):
# Temporarily save the result value into [ESP+4]. We use "+4"
@@ -343,7 +286,7 @@
self.mc.MOV_sr(4, eax.value)
-class CallBuilder64(AbstractCallBuilder):
+class CallBuilder64(CallBuilderX86):
ARGUMENTS_GPR = [edi, esi, edx, ecx, r8, r9]
ARGUMENTS_XMM = [xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7]
@@ -389,7 +332,7 @@
i += 1
def select_call_release_gil_mode(self):
- AbstractCallBuilder.select_call_release_gil_mode(self)
+ CallBuilderX86.select_call_release_gil_mode(self)
# We have to copy the arguments around a bit more in this mode,
# but on the other hand we don't need prepare_arguments() moving
# them in precisely the final registers. Here we look around for
@@ -502,7 +445,7 @@
# from the lower 32 bits of XMM0
self.mc.MOVD(self.resloc, xmm0)
else:
- AbstractCallBuilder.load_result(self)
+ CallBuilderX86.load_result(self)
def save_result_value(self):
# Temporarily save the result value into [ESP].
More information about the pypy-commit
mailing list