[pypy-commit] pypy more-rposix: hg merge default

amauryfa noreply at buildbot.pypy.org
Sun May 3 18:38:03 CEST 2015


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: more-rposix
Changeset: r77014:8a0a9728af1b
Date: 2015-05-03 18:37 +0200
http://bitbucket.org/pypy/pypy/changeset/8a0a9728af1b/

Log:	hg merge default

diff too long, truncating to 2000 out of 4936 lines

diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -420,3 +420,10 @@
 the terms of the GPL license version 2 or any later version.  Thus the
 gdbm module, provided in the file lib_pypy/gdbm.py, is redistributed
 under the terms of the GPL license as well.
+
+License for 'pypy/module/_vmprof/src'
+--------------------------------------
+
+The code is based on gperftools. You may see a copy of the License for it at
+
+    https://code.google.com/p/gperftools/source/browse/COPYING
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -38,6 +38,10 @@
     "_csv", "cppyy", "_pypyjson"
 ])
 
+if sys.platform.startswith('linux') and sys.maxint > 2147483647:
+  if 0:     # XXX disabled until we fix the absurd .so mess
+    working_modules.add('_vmprof')
+
 translation_modules = default_modules.copy()
 translation_modules.update([
     "fcntl", "time", "select", "signal", "_rawffi", "zlib", "struct", "_md5",
@@ -99,6 +103,7 @@
     "_hashlib"  : ["pypy.module._ssl.interp_ssl"],
     "_minimal_curses": ["pypy.module._minimal_curses.fficurses"],
     "_continuation": ["rpython.rlib.rstacklet"],
+    "_vmprof" : ["pypy.module._vmprof.interp_vmprof"],
     }
 
 def get_module_validator(modname):
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -67,3 +67,10 @@
 
 .. branch: object-dtype2
 Extend numpy dtypes to allow using objects with associated garbage collection hook
+
+.. branch: vmprof2
+Add backend support for vmprof - a lightweight statistical profiler -
+to linux64, see client at https://vmprof.readthedocs.org
+
+.. branch: jit_hint_docs
+Add more detail to @jit.elidable and @jit.promote in rpython/rlib/jit.py
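As a rough usage sketch (not part of this changeset), the app-level API added
by the vmprof2 branch boils down to _vmprof.enable(fileno, period) and
_vmprof.disable(); the output file name and the workload function below are
placeholders:

    # minimal sketch, assuming a build where the _vmprof module is enabled
    # (it is still disabled by default in this changeset)
    import _vmprof

    f = open('profile.log', 'w+b')            # placeholder output file
    _vmprof.enable(f.fileno(), 0.01)          # sample ~100 times per second
    run_workload()                            # hypothetical code to profile
    _vmprof.disable()
    f.close()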
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -11,7 +11,7 @@
     INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX
 
 from pypy.interpreter.executioncontext import (ExecutionContext, ActionFlag,
-    UserDelAction)
+    UserDelAction, CodeUniqueIds)
 from pypy.interpreter.error import OperationError, new_exception_class, oefmt
 from pypy.interpreter.argument import Arguments
 from pypy.interpreter.miscutils import ThreadLocals, make_weak_value_dictionary
@@ -388,6 +388,7 @@
         self.actionflag = ActionFlag()    # changed by the signal module
         self.check_signal_action = None   # changed by the signal module
         self.user_del_action = UserDelAction(self)
+        self.code_unique_ids = CodeUniqueIds()
         self._code_of_sys_exc_info = None
 
         # can be overridden to a subclass
@@ -666,6 +667,16 @@
             assert ec is not None
             return ec
 
+    def register_code_callback(self, callback):
+        cui = self.code_unique_ids
+        cui.code_callback = callback
+
+    def register_code_object(self, pycode):
+        cui = self.code_unique_ids
+        if cui.code_callback is None:
+            return
+        cui.code_callback(self, pycode)
+
     def _freeze_(self):
         return True
 
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -579,3 +579,11 @@
         # there is no list of length n: if n is large, then the GC
         # will run several times while walking the list, but it will
         # see lower and lower memory usage, with no lower bound of n.
+
+class CodeUniqueIds(object):
+    def __init__(self):
+        if sys.maxint == 2147483647:
+            self.code_unique_id = 0 # XXX this is wrong, it won't work on 32bit
+        else:
+            self.code_unique_id = 0x7000000000000000
+        self.code_callback = None
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -14,9 +14,10 @@
     CO_OPTIMIZED, CO_NEWLOCALS, CO_VARARGS, CO_VARKEYWORDS, CO_NESTED,
     CO_GENERATOR, CO_KILL_DOCSTRING, CO_YIELD_INSIDE_TRY)
 from pypy.tool.stdlib_opcode import opcodedesc, HAVE_ARGUMENT
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rarithmetic import intmask, r_longlong
 from rpython.rlib.objectmodel import compute_hash
 from rpython.rlib import jit
+from rpython.rlib.debug import debug_start, debug_stop, debug_print
 
 
 class BytecodeCorruption(Exception):
@@ -54,8 +55,9 @@
     "CPython-style code objects."
     _immutable_ = True
     _immutable_fields_ = ["co_consts_w[*]", "co_names_w[*]", "co_varnames[*]",
-                          "co_freevars[*]", "co_cellvars[*]", "_args_as_cellvars[*]"]
-
+                          "co_freevars[*]", "co_cellvars[*]",
+                          "_args_as_cellvars[*]"]
+    
     def __init__(self, space,  argcount, nlocals, stacksize, flags,
                      code, consts, names, varnames, filename,
                      name, firstlineno, lnotab, freevars, cellvars,
@@ -83,6 +85,7 @@
         self.magic = magic
         self._signature = cpython_code_signature(self)
         self._initialize()
+        space.register_code_object(self)
 
     def _initialize(self):
         if self.co_cellvars:
@@ -124,6 +127,15 @@
             from pypy.objspace.std.mapdict import init_mapdict_cache
             init_mapdict_cache(self)
 
+        cui = self.space.code_unique_ids
+        self._unique_id = cui.code_unique_id
+        cui.code_unique_id += 4  # leave the two low bits free so that
+        # stuff can be marked with them
+
+    def _get_full_name(self):
+        return "py:%s:%d:%s" % (self.co_name, self.co_firstlineno,
+                                self.co_filename)
+
     def _cleanup_(self):
         if (self.magic == cpython_magic and
             '__pypy__' not in sys.builtin_module_names):
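As an illustration (not part of this changeset) of the id-allocation scheme
above: stepping the counter by 4 for each code object keeps the two low bits
of every id zero, so they stay available as tag bits later on.

    # sketch of the 64-bit allocation scheme used by CodeUniqueIds
    class CodeUniqueIds(object):
        def __init__(self):
            self.code_unique_id = 0x7000000000000000
            self.code_callback = None

    cui = CodeUniqueIds()
    ids = []
    for _ in range(3):
        ids.append(cui.code_unique_id)
        cui.code_unique_id += 4       # step by 4: low two bits stay zero
    assert all(i & 0x3 == 0 for i in ids)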
diff --git a/pypy/interpreter/pyframe.py b/pypy/interpreter/pyframe.py
--- a/pypy/interpreter/pyframe.py
+++ b/pypy/interpreter/pyframe.py
@@ -49,14 +49,35 @@
     last_instr               = -1
     last_exception           = None
     f_backref                = jit.vref_None
+    # For tracing
     w_f_trace                = None
-    # For tracing
     instr_lb                 = 0
     instr_ub                 = 0
     instr_prev_plus_one      = 0
+    # end of tracing
+    
     is_being_profiled        = False
     escaped                  = False  # see mark_as_escaped()
 
+    w_globals = None
+    w_locals = None # dict containing locals, if forced or necessary
+    pycode = None # code object executed by that frame
+    locals_stack_w = None # the list of all locals and valuestack
+    valuestackdepth = 0 # number of items on valuestack
+    lastblock = None
+    # default to False
+    f_lineno = 0 # current lineno
+    cells = None # cells
+
+    # other fields:
+    
+    # builtin - builtin cache, only if honor__builtins__ is True,
+
+    # there is also self.space which is removed by the annotator
+
+    # additionally, the JIT uses the vable_token field, which represents
+    # the frame's current virtualizable state as seen by the JIT
+
     def __init__(self, space, code, w_globals, outer_func):
         if not we_are_translated():
             assert type(self) == space.FrameClass, (
@@ -65,11 +86,9 @@
         assert isinstance(code, pycode.PyCode)
         self.space = space
         self.w_globals = w_globals
-        self.w_locals = None
         self.pycode = code
         self.locals_stack_w = [None] * (code.co_nlocals + code.co_stacksize)
         self.valuestackdepth = code.co_nlocals
-        self.lastblock = None
         make_sure_not_resized(self.locals_stack_w)
         check_nonneg(self.valuestackdepth)
         #
diff --git a/pypy/module/_vmprof/__init__.py b/pypy/module/_vmprof/__init__.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/__init__.py
@@ -0,0 +1,18 @@
+from pypy.interpreter.mixedmodule import MixedModule
+
+class Module(MixedModule):
+    """
+    Write me :)
+    """
+    appleveldefs = {
+    }
+
+    interpleveldefs = {
+        'enable': 'interp_vmprof.enable',
+        'disable': 'interp_vmprof.disable',
+    }
+
+    def setup_after_space_initialization(self):
+        # force the __extend__ hacks to occur early
+        from pypy.module._vmprof.interp_vmprof import VMProf
+        self.vmprof = VMProf()
diff --git a/pypy/module/_vmprof/interp_vmprof.py b/pypy/module/_vmprof/interp_vmprof.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/interp_vmprof.py
@@ -0,0 +1,240 @@
+import py, os, sys
+from rpython.rtyper.lltypesystem import lltype, rffi, llmemory
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from rpython.rtyper.annlowlevel import cast_instance_to_gcref, cast_base_ptr_to_instance
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rlib import jit, rposix, rgc
+from rpython.rlib.rarithmetic import ovfcheck_float_to_int
+from rpython.rtyper.tool import rffi_platform as platform
+from rpython.rlib.rstring import StringBuilder
+from pypy.interpreter.baseobjspace import W_Root
+from pypy.interpreter.error import oefmt, wrap_oserror, OperationError
+from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.pyframe import PyFrame
+from pypy.interpreter.pycode import PyCode
+
+ROOT = py.path.local(__file__).join('..')
+SRC = ROOT.join('src')
+
+# by default, we statically link vmprof.c into pypy; however, if you set
+# DYNAMIC_VMPROF to True, it will be dynamically linked to the libvmprof.so
+# which is expected to be inside pypy/module/_vmprof/src: this is very useful
+# during development. Note that you have to manually build libvmprof by
+# running make inside the src dir
+DYNAMIC_VMPROF = False
+
+eci_kwds = dict(
+    include_dirs = [SRC],
+    includes = ['vmprof.h', 'trampoline.h'],
+    separate_module_files = [SRC.join('trampoline.asmgcc.s')],
+    link_files = ['-Wl,-Bstatic', '-lunwind', '-llzma','-Wl,-Bdynamic'],
+    
+    post_include_bits=["""
+        void pypy_vmprof_init(void);
+    """],
+    
+    separate_module_sources=["""
+        void pypy_vmprof_init(void) {
+            vmprof_set_mainloop(pypy_execute_frame_trampoline, 0,
+                                NULL);
+        }
+    """],
+    )
+
+
+if DYNAMIC_VMPROF:
+    eci_kwds['libraries'] += ['vmprof']
+    eci_kwds['link_extra'] = ['-Wl,-rpath,%s' % SRC, '-L%s' % SRC]
+else:
+    eci_kwds['separate_module_files'] += [SRC.join('vmprof.c')]
+
+eci = ExternalCompilationInfo(**eci_kwds)
+
+check_eci = eci.merge(ExternalCompilationInfo(separate_module_files=[
+    SRC.join('fake_pypy_api.c')]))
+
+platform.verify_eci(check_eci)
+
+pypy_execute_frame_trampoline = rffi.llexternal(
+    "pypy_execute_frame_trampoline",
+    [llmemory.GCREF, llmemory.GCREF, llmemory.GCREF, lltype.Signed],
+    llmemory.GCREF,
+    compilation_info=eci,
+    _nowrapper=True, sandboxsafe=True,
+    random_effects_on_gcobjs=True)
+
+pypy_vmprof_init = rffi.llexternal("pypy_vmprof_init", [], lltype.Void,
+                                   compilation_info=eci)
+vmprof_enable = rffi.llexternal("vmprof_enable",
+                                [rffi.INT, rffi.LONG, rffi.INT,
+                                 rffi.CCHARP, rffi.INT],
+                                rffi.INT, compilation_info=eci,
+                                save_err=rffi.RFFI_SAVE_ERRNO)
+vmprof_disable = rffi.llexternal("vmprof_disable", [], rffi.INT,
+                                 compilation_info=eci,
+                                save_err=rffi.RFFI_SAVE_ERRNO)
+
+vmprof_register_virtual_function = rffi.llexternal(
+    "vmprof_register_virtual_function",
+    [rffi.CCHARP, rffi.VOIDP, rffi.VOIDP], lltype.Void,
+    compilation_info=eci, _nowrapper=True)
+
+original_execute_frame = PyFrame.execute_frame.im_func
+original_execute_frame.c_name = 'pypy_pyframe_execute_frame'
+original_execute_frame._dont_inline_ = True
+
+class __extend__(PyFrame):
+    def execute_frame(frame, w_inputvalue=None, operr=None):
+        # go through the asm trampoline ONLY if we are translated but not being JITted.
+        #
+        # If we are not translated, we obviously don't want to go through the
+        # trampoline because there is no C function it can call.
+        #
+        # If we are being JITted, we want to skip the trampoline, else the JIT
+        # cannot see through it
+        if we_are_translated() and not jit.we_are_jitted():
+            # if we are translated, call the trampoline
+            gc_frame = cast_instance_to_gcref(frame)
+            gc_inputvalue = cast_instance_to_gcref(w_inputvalue)
+            gc_operr = cast_instance_to_gcref(operr)
+            unique_id = frame.pycode._unique_id
+            gc_result = pypy_execute_frame_trampoline(gc_frame, gc_inputvalue,
+                                                      gc_operr, unique_id)
+            return cast_base_ptr_to_instance(W_Root, gc_result)
+        else:
+            return original_execute_frame(frame, w_inputvalue, operr)
+
+
+
+def write_long_to_string_builder(l, b):
+    if sys.maxint == 2147483647:
+        b.append(chr(l & 0xff))
+        b.append(chr((l >> 8) & 0xff))
+        b.append(chr((l >> 16) & 0xff))
+        b.append(chr((l >> 24) & 0xff))
+    else:
+        b.append(chr(l & 0xff))
+        b.append(chr((l >> 8) & 0xff))
+        b.append(chr((l >> 16) & 0xff))
+        b.append(chr((l >> 24) & 0xff))
+        b.append(chr((l >> 32) & 0xff))
+        b.append(chr((l >> 40) & 0xff))
+        b.append(chr((l >> 48) & 0xff))
+        b.append(chr((l >> 56) & 0xff))
+
+def try_cast_to_pycode(gcref):
+    return rgc.try_cast_gcref_to_instance(PyCode, gcref)
+
+MAX_CODES = 1000
+
+class VMProf(object):
+    def __init__(self):
+        self.is_enabled = False
+        self.ever_enabled = False
+        self.fileno = -1
+        self.current_codes = []
+
+    def enable(self, space, fileno, period_usec):
+        if self.is_enabled:
+            raise oefmt(space.w_ValueError, "_vmprof already enabled")
+        self.fileno = fileno
+        self.is_enabled = True
+        self.write_header(fileno, period_usec)
+        if not self.ever_enabled:
+            if we_are_translated():
+                pypy_vmprof_init()
+            self.ever_enabled = True
+        self.gather_all_code_objs(space)
+        space.register_code_callback(vmprof_register_code)
+        if we_are_translated():
+            # does not work untranslated
+            res = vmprof_enable(fileno, period_usec, 0,
+                                lltype.nullptr(rffi.CCHARP.TO), 0)
+        else:
+            res = 0
+        if res == -1:
+            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
+                                              "_vmprof.enable"))
+
+    def gather_all_code_objs(self, space):
+        all_code_objs = rgc.do_get_objects(try_cast_to_pycode)
+        for code in all_code_objs:
+            self.register_code(space, code)
+
+    def write_header(self, fileno, period_usec):
+        assert period_usec > 0
+        b = StringBuilder()
+        write_long_to_string_builder(0, b)
+        write_long_to_string_builder(3, b)
+        write_long_to_string_builder(0, b)
+        write_long_to_string_builder(period_usec, b)
+        write_long_to_string_builder(0, b)
+        b.append('\x04') # interp name
+        b.append(chr(len('pypy')))
+        b.append('pypy')
+        os.write(fileno, b.build())
+
+    def register_code(self, space, code):
+        if self.fileno == -1:
+            raise OperationError(space.w_RuntimeError,
+                                 space.wrap("vmprof not running"))
+        self.current_codes.append(code)
+        if len(self.current_codes) >= MAX_CODES:
+            self._flush_codes(space)
+
+    def _flush_codes(self, space):
+        b = StringBuilder()
+        for code in self.current_codes:
+            name = code._get_full_name()
+            b.append('\x02')
+            write_long_to_string_builder(code._unique_id, b)
+            write_long_to_string_builder(len(name), b)
+            b.append(name)
+        os.write(self.fileno, b.build())
+        self.current_codes = []
+
+    def disable(self, space):
+        if not self.is_enabled:
+            raise oefmt(space.w_ValueError, "_vmprof not enabled")
+        self.is_enabled = False
+        space.register_code_callback(None)
+        self._flush_codes(space)
+        self.fileno = -1
+        if we_are_translated():
+           # does not work untranslated
+            res = vmprof_disable()
+        else:
+            res = 0
+        if res == -1:
+            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
+                                              "_vmprof.disable"))
+
+def vmprof_register_code(space, code):
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    mod_vmprof.vmprof.register_code(space, code)
+
+@unwrap_spec(fileno=int, period=float)
+def enable(space, fileno, period=0.01):   # default 100 Hz
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    #
+    try:
+        period_usec = ovfcheck_float_to_int(period * 1000000.0 + 0.5)
+        if period_usec <= 0 or period_usec >= 1e6:
+            # we don't want seconds here at all
+            raise ValueError
+    except (ValueError, OverflowError):
+        raise OperationError(space.w_ValueError,
+                             space.wrap("'period' too large or non positive"))
+    #
+    mod_vmprof.vmprof.enable(space, fileno, period_usec)
+
+def disable(space):
+    from pypy.module._vmprof import Module
+    mod_vmprof = space.getbuiltinmodule('_vmprof')
+    assert isinstance(mod_vmprof, Module)
+    mod_vmprof.vmprof.disable(space)
+
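For reference, a hedged sketch (not part of this changeset) of decoding the
header that VMProf.write_header() emits above: five machine words
(0, 3, 0, period_usec, 0) followed by the '\x04' marker, a length byte and
the interpreter name.  It assumes a little-endian machine, matching the byte
order used by write_long_to_string_builder().

    import struct

    WORD = struct.calcsize('l')

    def read_header(data):
        words = struct.unpack_from('5l', data, 0)   # 0, 3, 0, period_usec, 0
        pos = 5 * WORD
        assert data[pos] == '\x04'                  # interp-name marker
        namelen = ord(data[pos + 1])
        name = data[pos + 2:pos + 2 + namelen]      # 'pypy'
        return words[3], name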
diff --git a/pypy/module/_vmprof/src/config.h b/pypy/module/_vmprof/src/config.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/config.h
@@ -0,0 +1,2 @@
+#define HAVE_SYS_UCONTEXT_H
+#define PC_FROM_UCONTEXT uc_mcontext.gregs[REG_RIP]
diff --git a/pypy/module/_vmprof/src/fake_pypy_api.c b/pypy/module/_vmprof/src/fake_pypy_api.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/fake_pypy_api.c
@@ -0,0 +1,21 @@
+
+long pypy_jit_stack_depth_at_loc(long x)
+{
+	return 0;
+}
+
+void *pypy_find_codemap_at_addr(long x)
+{
+	return (void *)0;
+}
+
+long pypy_yield_codemap_at_addr(void *x, long y, long *a)
+{
+	return 0;
+}
+
+void pypy_pyframe_execute_frame(void)
+{
+}
+
+volatile int pypy_codemap_currently_invalid = 0;
diff --git a/pypy/module/_vmprof/src/get_custom_offset.c b/pypy/module/_vmprof/src/get_custom_offset.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/get_custom_offset.c
@@ -0,0 +1,66 @@
+
+extern volatile int pypy_codemap_currently_invalid;
+
+void *pypy_find_codemap_at_addr(long addr, long *start_addr);
+long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
+                                long *current_pos_addr);
+long pypy_jit_stack_depth_at_loc(long loc);
+
+
+void vmprof_set_tramp_range(void* start, void* end)
+{
+}
+
+int custom_sanity_check()
+{
+    return !pypy_codemap_currently_invalid;
+}
+
+static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) {
+    intptr_t ip_l = (intptr_t)ip;
+    return pypy_jit_stack_depth_at_loc(ip_l);
+}
+
+static long vmprof_write_header_for_jit_addr(void **result, long n,
+                                             void *ip, int max_depth)
+{
+    void *codemap;
+    long current_pos = 0;
+    intptr_t id;
+    long start_addr = 0;
+    intptr_t addr = (intptr_t)ip;
+    int start, k;
+    void *tmp;
+
+    codemap = pypy_find_codemap_at_addr(addr, &start_addr);
+    if (codemap == NULL)
+        // not JIT code at all
+        return n;
+
+    // modify the last entry to point to start address and not the random one
+    // in the middle
+    result[n - 1] = (void*)start_addr;
+    result[n] = (void*)2;
+    n++;
+    start = n;
+    while (n < max_depth) {
+        id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
+        if (id == -1)
+            // finish
+            break;
+        if (id == 0)
+            continue; // not main codemap
+        result[n++] = (void *)id;
+    }
+    k = 0;
+    while (k < (n - start) / 2) {
+        tmp = result[start + k];
+        result[start + k] = result[n - k - 1];
+        result[n - k - 1] = tmp;
+        k++;
+    }
+    if (n < max_depth) {
+        result[n++] = (void*)3;
+    }
+    return n;
+}
diff --git a/pypy/module/_vmprof/src/getpc.h b/pypy/module/_vmprof/src/getpc.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/getpc.h
@@ -0,0 +1,187 @@
+// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
+// Copyright (c) 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// ---
+// Author: Craig Silverstein
+//
+// This is an internal header file used by profiler.cc.  It defines
+// the single (inline) function GetPC.  GetPC is used in a signal
+// handler to figure out the instruction that was being executed when
+// the signal-handler was triggered.
+//
+// To get this, we use the ucontext_t argument to the signal-handler
+// callback, which holds the full context of what was going on when
+// the signal triggered.  How to get from a ucontext_t to a Program
+// Counter is OS-dependent.
+
+#ifndef BASE_GETPC_H_
+#define BASE_GETPC_H_
+
+#include "config.h"
+
+// On many linux systems, we may need _GNU_SOURCE to get access to
+// the defined constants that define the register we want to see (eg
+// REG_EIP).  Note this #define must come first!
+#define _GNU_SOURCE 1
+// If #define _GNU_SOURCE causes problems, this might work instead.
+// It will cause problems for FreeBSD though!, because it turns off
+// the needed __BSD_VISIBLE.
+//#define _XOPEN_SOURCE 500
+
+#include <string.h>         // for memcmp
+#if defined(HAVE_SYS_UCONTEXT_H)
+#include <sys/ucontext.h>
+#elif defined(HAVE_UCONTEXT_H)
+#include <ucontext.h>       // for ucontext_t (and also mcontext_t)
+#elif defined(HAVE_CYGWIN_SIGNAL_H)
+#include <cygwin/signal.h>
+typedef ucontext ucontext_t;
+#endif
+
+
+// Take the example where function Foo() calls function Bar().  For
+// many architectures, Bar() is responsible for setting up and tearing
+// down its own stack frame.  In that case, it's possible for the
+// interrupt to happen when execution is in Bar(), but the stack frame
+// is not properly set up (either before it's done being set up, or
+// after it's been torn down but before Bar() returns).  In those
+// cases, the stack trace cannot see the caller function anymore.
+//
+// GetPC can try to identify this situation, on architectures where it
+// might occur, and unwind the current function call in that case to
+// avoid false edges in the profile graph (that is, edges that appear
+// to show a call skipping over a function).  To do this, we hard-code
+// in the asm instructions we might see when setting up or tearing
+// down a stack frame.
+//
+// This is difficult to get right: the instructions depend on the
+// processor, the compiler ABI, and even the optimization level.  This
+// is a best effort patch -- if we fail to detect such a situation, or
+// mess up the PC, nothing happens; the returned PC is not used for
+// any further processing.
+struct CallUnrollInfo {
+  // Offset from (e)ip register where this instruction sequence
+  // should be matched. Interpreted as bytes. Offset 0 is the next
+  // instruction to execute. Be extra careful with negative offsets in
+  // architectures of variable instruction length (like x86) - it is
+  // not that easy as taking an offset to step one instruction back!
+  int pc_offset;
+  // The actual instruction bytes. Feel free to make it larger if you
+  // need a longer sequence.
+  unsigned char ins[16];
+  // How many bytes to match from ins array?
+  int ins_size;
+  // The offset from the stack pointer (e)sp where to look for the
+  // call return address. Interpreted as bytes.
+  int return_sp_offset;
+};
+
+
+// The dereferences needed to get the PC from a struct ucontext were
+// determined at configure time, and stored in the macro
+// PC_FROM_UCONTEXT in config.h.  The only thing we need to do here,
+// then, is to do the magic call-unrolling for systems that support it.
+
+// -- Special case 1: linux x86, for which we have CallUnrollInfo
+#if defined(__linux) && defined(__i386) && defined(__GNUC__)
+static const CallUnrollInfo callunrollinfo[] = {
+  // Entry to a function:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0x55, 0x89, 0xe5}, 3,
+    0
+  },
+  // Entry to a function, second instruction:  push %ebp;  mov  %esp,%ebp
+  // Top-of-stack contains the old frame, caller IP is +4.
+  { -1,
+    {0x55, 0x89, 0xe5}, 3,
+    4
+  },
+  // Return from a function: RET.
+  // Top-of-stack contains the caller IP.
+  { 0,
+    {0xc3}, 1,
+    0
+  }
+};
+
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  // See comment above struct CallUnrollInfo.  Only try instruction
+  // flow matching if both eip and esp looks reasonable.
+  const int eip = signal_ucontext->uc_mcontext.gregs[REG_EIP];
+  const int esp = signal_ucontext->uc_mcontext.gregs[REG_ESP];
+  if ((eip & 0xffff0000) != 0 && (~eip & 0xffff0000) != 0 &&
+      (esp & 0xffff0000) != 0) {
+    char* eip_char = reinterpret_cast<char*>(eip);
+    for (int i = 0; i < sizeof(callunrollinfo)/sizeof(*callunrollinfo); ++i) {
+      if (!memcmp(eip_char + callunrollinfo[i].pc_offset,
+                  callunrollinfo[i].ins, callunrollinfo[i].ins_size)) {
+        // We have a match.
+        void **retaddr = (void**)(esp + callunrollinfo[i].return_sp_offset);
+        return *retaddr;
+      }
+    }
+  }
+  return (void*)eip;
+}
+
+// Special case #2: Windows, which has to do something totally different.
+#elif defined(_WIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__) || defined(__MINGW32__)
+// If this is ever implemented, probably the way to do it is to have
+// profiler.cc use a high-precision timer via timeSetEvent:
+//    http://msdn2.microsoft.com/en-us/library/ms712713.aspx
+// We'd use it in mode TIME_CALLBACK_FUNCTION/TIME_PERIODIC.
+// The callback function would be something like prof_handler, but
+// alas the arguments are different: no ucontext_t!  I don't know
+// how we'd get the PC (using StackWalk64?)
+//    http://msdn2.microsoft.com/en-us/library/ms680650.aspx
+
+#include "base/logging.h"   // for RAW_LOG
+#ifndef HAVE_CYGWIN_SIGNAL_H
+typedef int ucontext_t;
+#endif
+
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  RAW_LOG(ERROR, "GetPC is not yet implemented on Windows\n");
+  return NULL;
+}
+
+// Normal cases.  If this doesn't compile, it's probably because
+// PC_FROM_UCONTEXT is the empty string.  You need to figure out
+// the right value for your system, and add it to the list in
+// configure.ac (or set it manually in your config.h).
+#else
+inline void* GetPC(ucontext_t *signal_ucontext) {
+  return (void*)signal_ucontext->PC_FROM_UCONTEXT;   // defined in config.h
+}
+
+#endif
+
+#endif  // BASE_GETPC_H_
diff --git a/pypy/module/_vmprof/src/trampoline.asmgcc.s b/pypy/module/_vmprof/src/trampoline.asmgcc.s
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/trampoline.asmgcc.s
@@ -0,0 +1,16 @@
+// NOTE: you need to use TABs, not spaces!
+        
+	.text
+	.p2align 4,,-1
+	.globl	pypy_execute_frame_trampoline
+	.type	pypy_execute_frame_trampoline, @function
+pypy_execute_frame_trampoline:
+	.cfi_startproc
+	pushq	%rcx
+	.cfi_def_cfa_offset 16
+	call pypy_pyframe_execute_frame@PLT
+	popq	%rcx
+	.cfi_def_cfa_offset 8
+	ret
+	.cfi_endproc
+	.size	pypy_execute_frame_trampoline, .-pypy_execute_frame_trampoline
diff --git a/pypy/module/_vmprof/src/trampoline.h b/pypy/module/_vmprof/src/trampoline.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/trampoline.h
@@ -0,0 +1,1 @@
+void* pypy_execute_frame_trampoline(void*, void*, void*, long);
diff --git a/pypy/module/_vmprof/src/vmprof.c b/pypy/module/_vmprof/src/vmprof.c
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/vmprof.c
@@ -0,0 +1,398 @@
+/* VMPROF
+ *
+ * statistical sampling profiler specifically designed to profile programs
+ * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
+ * etc.
+ *
+ * The logic to dump the C stack traces is partly stolen from the code in gperftools.
+ * The file "getpc.h" has been entirely copied from gperftools.
+ *
+ * Tested only on gcc, linux, x86_64.
+ *
+ * Copyright (C) 2014-2015
+ *   Antonio Cuni - anto.cuni at gmail.com
+ *   Maciej Fijalkowski - fijall at gmail.com
+ *
+ */
+
+
+#include "getpc.h"      // should be first to get the _GNU_SOURCE dfn
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <pthread.h>
+
+#define UNW_LOCAL_ONLY
+#include <libunwind.h>
+
+#include "vmprof.h"
+
+#define _unused(x) ((void)x)
+
+#define MAX_FUNC_NAME 128
+#define MAX_STACK_DEPTH 1024
+#define BUFFER_SIZE 8192
+
+
+static int profile_file = 0;
+static char profile_write_buffer[BUFFER_SIZE];
+static int profile_buffer_position = 0;
+void* vmprof_mainloop_func;
+static ptrdiff_t mainloop_sp_offset;
+static vmprof_get_virtual_ip_t mainloop_get_virtual_ip;
+static long last_period_usec = 0;
+static int atfork_hook_installed = 0;
+
+
+/* *************************************************************
+ * functions to write a profile file compatible with gperftools
+ * *************************************************************
+ */
+
+#define MARKER_STACKTRACE '\x01'
+#define MARKER_VIRTUAL_IP '\x02'
+#define MARKER_TRAILER '\x03'
+
+static void prof_word(long x) {
+	((long*)(profile_write_buffer + profile_buffer_position))[0] = x;
+	profile_buffer_position += sizeof(long);
+}
+
+static void prof_header(long period_usec) {
+    // XXX never used here?
+    prof_word(0);
+    prof_word(3);
+    prof_word(0);
+    prof_word(period_usec);
+    prof_word(0);
+    write(profile_file, profile_write_buffer, profile_buffer_position);
+    profile_buffer_position = 0;
+}
+
+static void prof_write_stacktrace(void** stack, int depth, int count) {
+    int i;
+	char marker = MARKER_STACKTRACE;
+
+	profile_write_buffer[profile_buffer_position++] = MARKER_STACKTRACE;
+    prof_word(count);
+    prof_word(depth);
+    for(i=0; i<depth; i++)
+        prof_word((long)stack[i]);
+    write(profile_file, profile_write_buffer, profile_buffer_position);
+    profile_buffer_position = 0;
+}
+
+
+/* ******************************************************
+ * libunwind workaround to process JIT frames correctly
+ * ******************************************************
+ */
+
+#include "get_custom_offset.c"
+
+typedef struct {
+    void* _unused1;
+    void* _unused2;
+    void* sp;
+    void* ip;
+    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
+} vmprof_hacked_unw_cursor_t;
+
+static int vmprof_unw_step(unw_cursor_t *cp, int first_run) {
+	void* ip;
+    void* sp;
+    ptrdiff_t sp_offset;
+    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
+    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
+	if (!first_run)
+		// make sure we're pointing to the CALL and not to the first
+		// instruction after. If the callee adjusts the stack for us
+		// it's not safe to be at the instruction after
+		ip -= 1;
+    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
+
+    if (sp_offset == -1) {
+        // it means that the ip is NOT in JITted code, so we can use the
+        // standard unw_step
+        return unw_step(cp);
+    }
+    else {
+        // this is a horrible hack to manually walk the stack frame, by
+        // setting the IP and SP in the cursor
+        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
+        void* bp = (void*)sp + sp_offset;
+        cp2->sp = bp;
+		bp -= sizeof(void*);
+        cp2->ip = ((void**)bp)[0];
+        // the ret is on the top of the stack minus WORD
+        return 1;
+    }
+}
+
+
+/* *************************************************************
+ * functions to dump the stack trace
+ * *************************************************************
+ */
+
+// The original code here has a comment, "stolen from pprof",
+// about a "__thread int recursive".  But general __thread
+// variables are not really supposed to be accessed from a
+// signal handler.  Moreover, we are using SIGPROF, which
+// should not be recursively called on the same thread.
+//static __thread int recursive;
+
+int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext) {
+    void *ip;
+    int n = 0;
+    unw_cursor_t cursor;
+    unw_context_t uc = *ucontext;
+    //if (recursive) {
+    //    return 0;
+    //}
+    if (!custom_sanity_check()) {
+        return 0;
+    }
+    //++recursive;
+
+    int ret = unw_init_local(&cursor, &uc);
+    assert(ret >= 0);
+    _unused(ret);
+	int first_run = 1;
+
+    while (n < max_depth) {
+        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
+            break;
+        }
+
+        unw_proc_info_t pip;
+        unw_get_proc_info(&cursor, &pip);
+
+        /* char funcname[4096]; */
+        /* unw_word_t offset; */
+        /* unw_get_proc_name(&cursor, funcname, 4096, &offset); */
+        /* printf("%s+%#lx <%p>\n", funcname, offset, ip); */
+
+        /* if n==0, it means that the signal handler interrupted us while we
+           were in the trampoline, so we are not executing (yet) the real main
+           loop function; just skip it */
+        if (vmprof_mainloop_func && 
+            (void*)pip.start_ip == (void*)vmprof_mainloop_func &&
+            n > 0) {
+          // found main loop stack frame
+          void* sp;
+          unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
+          void *arg_addr = (char*)sp + mainloop_sp_offset;
+          void **arg_ptr = (void**)arg_addr;
+          // fprintf(stderr, "stacktrace mainloop: rsp %p   &f2 %p   offset %ld\n", 
+          //         sp, arg_addr, mainloop_sp_offset);
+		  if (mainloop_get_virtual_ip) {
+			  ip = mainloop_get_virtual_ip(*arg_ptr);
+		  } else {
+			  ip = *arg_ptr;
+		  }
+        }
+
+        result[n++] = ip;
+		n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
+        if (vmprof_unw_step(&cursor, first_run) <= 0) {
+            break;
+        }
+		first_run = 0;
+    }
+    //--recursive;
+    return n;
+}
+
+
+static int __attribute__((noinline)) frame_forcer(int rv) {
+    return rv;
+}
+
+static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext) {
+    void* stack[MAX_STACK_DEPTH];
+    int saved_errno = errno;
+    stack[0] = GetPC((ucontext_t*)ucontext);
+    int depth = frame_forcer(get_stack_trace(stack+1, MAX_STACK_DEPTH-1, ucontext));
+    depth++;  // To account for pc value in stack[0];
+    prof_write_stacktrace(stack, depth, 1);
+    errno = saved_errno;
+}
+
+/* *************************************************************
+ * functions to enable/disable the profiler
+ * *************************************************************
+ */
+
+static int open_profile(int fd, long period_usec, int write_header, char *s,
+						int slen) {
+	if ((fd = dup(fd)) == -1) {
+		return -1;
+	}
+	profile_buffer_position = 0;
+    profile_file = fd;
+	if (write_header)
+		prof_header(period_usec);
+	if (s)
+		write(profile_file, s, slen);
+	return 0;
+}
+
+static int close_profile(void) {
+	// XXX all of this can happily fail
+    FILE* src;
+    char buf[BUFSIZ];
+    size_t size;
+	int marker = MARKER_TRAILER;
+	write(profile_file, &marker, 1);
+
+    // copy /proc/PID/maps to the end of the profile file
+    sprintf(buf, "/proc/%d/maps", getpid());
+    src = fopen(buf, "r");    
+    while ((size = fread(buf, 1, BUFSIZ, src))) {
+        write(profile_file, buf, size);
+    }
+    fclose(src);
+    close(profile_file);
+	return 0;
+}
+
+
+static int install_sigprof_handler(void) {
+    struct sigaction sa;
+    memset(&sa, 0, sizeof(sa));
+    sa.sa_sigaction = sigprof_handler;
+    sa.sa_flags = SA_RESTART | SA_SIGINFO;
+    if (sigemptyset(&sa.sa_mask) == -1 ||
+		sigaction(SIGPROF, &sa, NULL) == -1) {
+		return -1;
+	}
+	return 0;
+}
+
+static int remove_sigprof_handler(void) {
+    sighandler_t res = signal(SIGPROF, SIG_DFL);
+	if (res == SIG_ERR) {
+		return -1;
+	}
+	return 0;
+};
+
+static int install_sigprof_timer(long period_usec) {
+    static struct itimerval timer;
+    last_period_usec = period_usec;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = period_usec;
+    timer.it_value = timer.it_interval;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
+		return -1;
+    }
+	return 0;
+}
+
+static int remove_sigprof_timer(void) {
+    static struct itimerval timer;
+    last_period_usec = 0;
+    timer.it_interval.tv_sec = 0;
+    timer.it_interval.tv_usec = 0;
+    timer.it_value.tv_sec = 0;
+    timer.it_value.tv_usec = 0;
+    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
+		return -1;
+    }
+	return 0;
+}
+
+static void atfork_disable_timer(void) {
+    remove_sigprof_timer();
+}
+
+static void atfork_enable_timer(void) {
+    install_sigprof_timer(last_period_usec);
+}
+
+static int install_pthread_atfork_hooks(void) {
+    /* this is needed to prevent the problems described there:
+         - http://code.google.com/p/gperftools/issues/detail?id=278
+         - http://lists.debian.org/debian-glibc/2010/03/msg00161.html
+
+        TL;DR: if the RSS of the process is large enough, the clone() syscall
+        will be interrupted by the SIGPROF before it can complete, then
+        retried, interrupted again and so on, in an endless loop.  The
+        solution is to disable the timer around the fork, and re-enable it
+        only inside the parent.
+    */
+    if (atfork_hook_installed)
+        return 0;
+    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL);
+    if (ret != 0)
+        return -1;
+    atfork_hook_installed = 1;
+    return 0;
+}
+
+/* *************************************************************
+ * public API
+ * *************************************************************
+ */
+
+void vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
+                         vmprof_get_virtual_ip_t get_virtual_ip) {
+    mainloop_sp_offset = sp_offset;
+    mainloop_get_virtual_ip = get_virtual_ip;
+    vmprof_mainloop_func = func;
+}
+
+int vmprof_enable(int fd, long period_usec, int write_header, char *s,
+				  int slen)
+{
+    assert(period_usec > 0);
+    if (open_profile(fd, period_usec, write_header, s, slen) == -1) {
+		return -1;
+	}
+    if (install_sigprof_handler() == -1) {
+		return -1;
+	}
+    if (install_sigprof_timer(period_usec) == -1) {
+		return -1;
+	}
+    if (install_pthread_atfork_hooks() == -1) {
+        return -1;
+    }
+	return 0;
+}
+
+int vmprof_disable(void) {
+    if (remove_sigprof_timer() == -1) {
+		return -1;
+	}
+    if (remove_sigprof_handler() == -1) {
+		return -1;
+	}
+    if (close_profile() == -1) {
+		return -1;
+	}
+	return 0;
+}
+
+void vmprof_register_virtual_function(const char* name, void* start, void* end) {
+	// XXX unused by pypy
+    // for now *end is simply ignored
+	char buf[1024];
+	int lgt = strlen(name) + 2 * sizeof(long) + 1;
+
+	if (lgt > 1024) {
+		lgt = 1024;
+	}
+	buf[0] = MARKER_VIRTUAL_IP;
+	((void **)(((void*)buf) + 1))[0] = start;
+	((long *)(((void*)buf) + 1 + sizeof(long)))[0] = lgt - 2 * sizeof(long) - 1;
+	strncpy(buf + 2 * sizeof(long) + 1, name, 1024 - 2 * sizeof(long) - 1);
+	write(profile_file, buf, lgt);
+}
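A hedged sketch (not part of this changeset) of reading back one stacktrace
record as written by prof_write_stacktrace() above: a MARKER_STACKTRACE byte,
a count word, a depth word, then depth address words.  Again this assumes a
little-endian machine so that native struct 'l' matches the raw longs.

    import struct

    WORD = struct.calcsize('l')

    def read_stacktrace(data, pos):
        assert data[pos] == '\x01'                  # MARKER_STACKTRACE
        pos += 1
        count, depth = struct.unpack_from('2l', data, pos)
        pos += 2 * WORD
        addrs = struct.unpack_from('%dl' % depth, data, pos)
        return count, list(addrs), pos + depth * WORD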
diff --git a/pypy/module/_vmprof/src/vmprof.h b/pypy/module/_vmprof/src/vmprof.h
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/src/vmprof.h
@@ -0,0 +1,22 @@
+#ifndef VMPROF_VMPROF_H_
+#define VMPROF_VMPROF_H_
+
+#include <stddef.h>
+
+typedef void* (*vmprof_get_virtual_ip_t)(void*);
+
+extern void* vmprof_mainloop_func;
+void vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
+                         vmprof_get_virtual_ip_t get_virtual_ip);
+
+void vmprof_register_virtual_function(const char* name, void* start, void* end);
+
+
+int vmprof_enable(int fd, long period_usec, int write_header, char* vips,
+				  int vips_len);
+int vmprof_disable(void);
+
+// XXX: this should be part of _vmprof (the CPython extension), not vmprof (the library)
+void vmprof_set_tramp_range(void* start, void* end);
+
+#endif
diff --git a/pypy/module/_vmprof/test/__init__.py b/pypy/module/_vmprof/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_vmprof/test/test__vmprof.py b/pypy/module/_vmprof/test/test__vmprof.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/test/test__vmprof.py
@@ -0,0 +1,72 @@
+
+import tempfile
+from pypy.tool.pytest.objspace import gettestobjspace
+
+class AppTestVMProf(object):
+    def setup_class(cls):
+        cls.space = gettestobjspace(usemodules=['_vmprof', 'struct'])
+        cls.tmpfile = tempfile.NamedTemporaryFile()
+        cls.w_tmpfileno = cls.space.wrap(cls.tmpfile.fileno())
+        cls.w_tmpfilename = cls.space.wrap(cls.tmpfile.name)
+        cls.tmpfile2 = tempfile.NamedTemporaryFile()
+        cls.w_tmpfileno2 = cls.space.wrap(cls.tmpfile2.fileno())
+        cls.w_tmpfilename2 = cls.space.wrap(cls.tmpfile2.name)
+
+    def test_import_vmprof(self):
+        import struct, sys
+
+        WORD = struct.calcsize('l')
+        
+        def count(s):
+            i = 0
+            count = 0
+            i += 5 * WORD # header
+            assert s[i] == '\x04'
+            i += 1 # marker
+            assert s[i] == '\x04'
+            i += 1 # length
+            i += len('pypy')
+            while i < len(s):
+                if s[i] == '\x03':
+                    break
+                if s[i] == '\x01':
+                    xxx
+                assert s[i] == '\x02'
+                i += 1
+                _, size = struct.unpack("ll", s[i:i + 2 * WORD])
+                count += 1
+                i += 2 * WORD + size
+            return count
+        
+        import _vmprof
+        _vmprof.enable(self.tmpfileno)
+        _vmprof.disable()
+        s = open(self.tmpfilename).read()
+        no_of_codes = count(s)
+        assert no_of_codes > 10
+        d = {}
+
+        exec """def foo():
+            pass
+        """ in d
+
+        _vmprof.enable(self.tmpfileno2)
+
+        exec """def foo2():
+            pass
+        """ in d
+
+        _vmprof.disable()
+        s = open(self.tmpfilename2).read()
+        no_of_codes2 = count(s)
+        assert "py:foo:" in s
+        assert "py:foo2:" in s
+        assert no_of_codes2 >= no_of_codes + 2 # some extra codes from tests
+
+    def test_enable_ovf(self):
+        import _vmprof
+        raises(ValueError, _vmprof.enable, 999, 0)
+        raises(ValueError, _vmprof.enable, 999, -2.5)
+        raises(ValueError, _vmprof.enable, 999, 1e300)
+        raises(ValueError, _vmprof.enable, 999, 1e300 * 1e300)
+        raises(ValueError, _vmprof.enable, 999, (1e300*1e300) / (1e300*1e300))
diff --git a/pypy/module/_vmprof/test/test_direct.py b/pypy/module/_vmprof/test/test_direct.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_vmprof/test/test_direct.py
@@ -0,0 +1,71 @@
+
+import py
+try:
+    import cffi
+except ImportError:
+    py.test.skip('cffi required')
+
+srcdir = py.path.local(__file__).join("..", "..", "src")
+
+ffi = cffi.FFI()
+ffi.cdef("""
+long vmprof_write_header_for_jit_addr(void **, long, void*, int);
+void *pypy_find_codemap_at_addr(long addr, long *start_addr);
+long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
+                                long *current_pos_addr);
+long buffer[];
+""")
+
+lib = ffi.verify("""
+volatile int pypy_codemap_currently_invalid = 0;
+
+long buffer[] = {0, 0, 0, 0, 0};
+
+
+
+void *pypy_find_codemap_at_addr(long addr, long *start_addr)
+{
+    return (void*)buffer;
+}
+
+long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
+                                long *current_pos_addr)
+{
+    long c = *current_pos_addr;
+    if (c >= 5)
+        return -1;
+    *current_pos_addr = c + 1;
+    return *((long*)codemap_raw + c);
+}
+
+
+""" + open(str(srcdir.join("get_custom_offset.c"))).read())
+
+class TestDirect(object):
+    def test_infrastructure(self):
+        cont = ffi.new("long[1]", [0])
+        buf = lib.pypy_find_codemap_at_addr(0, cont)
+        assert buf
+        cont[0] = 0
+        next_addr = lib.pypy_yield_codemap_at_addr(buf, 0, cont)
+        assert cont[0] == 1
+        assert not next_addr
+        lib.buffer[0] = 13
+        cont[0] = 0
+        next_addr = lib.pypy_yield_codemap_at_addr(buf, 0, cont)
+        assert int(ffi.cast("long", next_addr)) == 13
+
+    def test_write_header_for_jit_addr(self):
+        lib.buffer[0] = 4
+        lib.buffer[1] = 8
+        lib.buffer[2] = 12
+        lib.buffer[3] = 16
+        lib.buffer[4] = 0
+        buf = ffi.new("long[10]", [0] * 10)
+        result = ffi.cast("void**", buf)
+        res = lib.vmprof_write_header_for_jit_addr(result, 0, ffi.NULL, 100)
+        assert res == 6
+        assert buf[0] == 2
+        assert buf[1] == 16
+        assert buf[2] == 12
+        assert buf[3] == 8
diff --git a/pypy/module/gc/referents.py b/pypy/module/gc/referents.py
--- a/pypy/module/gc/referents.py
+++ b/pypy/module/gc/referents.py
@@ -44,30 +44,6 @@
     return OperationError(space.w_NotImplementedError,
                           space.wrap("operation not implemented by this GC"))
 
-# ____________________________________________________________
-
-def clear_gcflag_extra(fromlist):
-    pending = fromlist[:]
-    while pending:
-        gcref = pending.pop()
-        if rgc.get_gcflag_extra(gcref):
-            rgc.toggle_gcflag_extra(gcref)
-            pending.extend(rgc.get_rpy_referents(gcref))
-
-def do_get_objects():
-    roots = [gcref for gcref in rgc.get_rpy_roots() if gcref]
-    pending = roots[:]
-    result_w = []
-    while pending:
-        gcref = pending.pop()
-        if not rgc.get_gcflag_extra(gcref):
-            rgc.toggle_gcflag_extra(gcref)
-            w_obj = try_cast_gcref_to_w_root(gcref)
-            if w_obj is not None:
-                result_w.append(w_obj)
-            pending.extend(rgc.get_rpy_referents(gcref))
-    clear_gcflag_extra(roots)
-    return result_w
 
 # ____________________________________________________________
 
@@ -116,8 +92,8 @@
                 break
     # done.  Clear flags carefully
     rgc.toggle_gcflag_extra(gcarg)
-    clear_gcflag_extra(roots)
-    clear_gcflag_extra([gcarg])
+    rgc.clear_gcflag_extra(roots)
+    rgc.clear_gcflag_extra([gcarg])
     return result_w
 
 # ____________________________________________________________
@@ -189,8 +165,7 @@
     """Return a list of all app-level objects."""
     if not rgc.has_gcflag_extra():
         raise missing_operation(space)
-    result_w = do_get_objects()
-    rgc.assert_no_more_gcflags()
+    result_w = rgc.do_get_objects(try_cast_gcref_to_w_root)
     return space.newlist(result_w)
 
 def get_referents(space, args_w):
diff --git a/pypy/module/micronumpy/concrete.py b/pypy/module/micronumpy/concrete.py
--- a/pypy/module/micronumpy/concrete.py
+++ b/pypy/module/micronumpy/concrete.py
@@ -328,11 +328,8 @@
         return ArrayBuffer(self, readonly)
 
     def astype(self, space, dtype):
-        # we want to create a new array, but must respect the strides
-        # in self. So find a factor of the itemtype.elsize, and use this
-        factor = float(dtype.elsize) / self.dtype.elsize
-        strides = [int(factor*s) for s in self.get_strides()]
-        backstrides = [int(factor*s) for s in self.get_backstrides()]
+        strides, backstrides = calc_strides(self.get_shape(), dtype,
+                                                    self.order)
         impl = ConcreteArray(self.get_shape(), dtype, self.order,
                              strides, backstrides)
         loop.setslice(space, impl.get_shape(), impl, self)
diff --git a/pypy/module/micronumpy/test/test_ndarray.py b/pypy/module/micronumpy/test/test_ndarray.py
--- a/pypy/module/micronumpy/test/test_ndarray.py
+++ b/pypy/module/micronumpy/test/test_ndarray.py
@@ -2183,8 +2183,7 @@
         assert b.dtype == 'bool'
 
         a = arange(6, dtype='f4').reshape(2,3)
-        b = a.T.astype('i4')
-        assert (a.T.strides == b.strides)
+        b = a.astype('i4')
 
         a = array('x').astype('S3').dtype
         assert a.itemsize == 3
diff --git a/pypy/module/micronumpy/test/test_object_arrays.py b/pypy/module/micronumpy/test/test_object_arrays.py
--- a/pypy/module/micronumpy/test/test_object_arrays.py
+++ b/pypy/module/micronumpy/test/test_object_arrays.py
@@ -1,7 +1,12 @@
 from pypy.module.micronumpy.test.test_base import BaseNumpyAppTest
+from pypy.conftest import option
 
 
 class AppTestObjectDtypes(BaseNumpyAppTest):
+    def setup_class(cls):
+        BaseNumpyAppTest.setup_class.im_func(cls)
+        cls.w_runappdirect = cls.space.wrap(option.runappdirect)
+
     def test_scalar_from_object(self):
         from numpy import array
         import sys
@@ -109,6 +114,8 @@
 
     def test_array_interface(self):
         import numpy as np
+        if self.runappdirect:
+            skip('requires numpy.core, test with numpy test suite instead')
         import sys
         class DummyArray(object):
             def __init__(self, interface, base=None):
diff --git a/pypy/module/pypyjit/interp_jit.py b/pypy/module/pypyjit/interp_jit.py
--- a/pypy/module/pypyjit/interp_jit.py
+++ b/pypy/module/pypyjit/interp_jit.py
@@ -35,6 +35,9 @@
     name = opcode_method_names[ord(bytecode.co_code[next_instr])]
     return '%s #%d %s' % (bytecode.get_repr(), next_instr, name)
 
+def get_unique_id(next_instr, is_being_profiled, bytecode):
+    return bytecode._unique_id
+
 
 def should_unroll_one_iteration(next_instr, is_being_profiled, bytecode):
     return (bytecode.co_flags & CO_GENERATOR) != 0
@@ -45,6 +48,7 @@
     virtualizables = ['frame']
 
 pypyjitdriver = PyPyJitDriver(get_printable_location = get_printable_location,
+                              get_unique_id = get_unique_id,
                               should_unroll_one_iteration =
                               should_unroll_one_iteration,
                               name='pypyjit')
diff --git a/pypy/module/pypyjit/test_pypy_c/model.py b/pypy/module/pypyjit/test_pypy_c/model.py
--- a/pypy/module/pypyjit/test_pypy_c/model.py
+++ b/pypy/module/pypyjit/test_pypy_c/model.py
@@ -134,7 +134,8 @@
 
     def _ops_for_chunk(self, chunk, include_guard_not_invalidated):
         for op in chunk.operations:
-            if op.name != 'debug_merge_point' and \
+            if op.name not in ('debug_merge_point', 'enter_portal_frame',
+                               'leave_portal_frame') and \
                 (op.name != 'guard_not_invalidated' or include_guard_not_invalidated):
                 yield op
 
diff --git a/pypy/module/pypyjit/test_pypy_c/test_misc.py b/pypy/module/pypyjit/test_pypy_c/test_misc.py
--- a/pypy/module/pypyjit/test_pypy_c/test_misc.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_misc.py
@@ -65,9 +65,7 @@
         assert loop.match("""
             i7 = int_gt(i4, 1)
             guard_true(i7, descr=...)
-            p9 = call(ConstClass(fromint), i4, descr=...)
-            guard_no_exception(descr=...)
-            p11 = call(ConstClass(rbigint.mul), p5, p9, descr=...)
+            p11 = call(ConstClass(rbigint.int_mul), p5, i4, descr=...)
             guard_no_exception(descr=...)
             i13 = int_sub(i4, 1)
             --TICK--
diff --git a/pypy/objspace/std/complexobject.py b/pypy/objspace/std/complexobject.py
--- a/pypy/objspace/std/complexobject.py
+++ b/pypy/objspace/std/complexobject.py
@@ -270,7 +270,7 @@
         imag = space.float_w(space.getattr(self, space.wrap("imag")))
         real_b = rbigint.fromrarith_int(float2longlong(real))
         imag_b = rbigint.fromrarith_int(r_ulonglong(float2longlong(imag)))
-        val = real_b.lshift(64).or_(imag_b).lshift(3).or_(rbigint.fromint(tag))
+        val = real_b.lshift(64).or_(imag_b).lshift(3).int_or_(tag)
         return space.newlong_from_rbigint(val)
 
     def int(self, space):
diff --git a/pypy/objspace/std/floatobject.py b/pypy/objspace/std/floatobject.py
--- a/pypy/objspace/std/floatobject.py
+++ b/pypy/objspace/std/floatobject.py
@@ -185,7 +185,7 @@
         from pypy.objspace.std.util import IDTAG_FLOAT as tag
         val = float2longlong(space.float_w(self))
         b = rbigint.fromrarith_int(val)
-        b = b.lshift(3).or_(rbigint.fromint(tag))
+        b = b.lshift(3).int_or_(tag)
         return space.newlong_from_rbigint(b)
 
     def __repr__(self):
diff --git a/pypy/objspace/std/intobject.py b/pypy/objspace/std/intobject.py
--- a/pypy/objspace/std/intobject.py
+++ b/pypy/objspace/std/intobject.py
@@ -46,7 +46,7 @@
         if self.user_overridden_class:
             return None
         b = space.bigint_w(self)
-        b = b.lshift(3).or_(rbigint.fromint(IDTAG_INT))
+        b = b.lshift(3).int_or_(IDTAG_INT)
         return space.newlong_from_rbigint(b)
 
     def int(self, space):
diff --git a/pypy/objspace/std/longobject.py b/pypy/objspace/std/longobject.py
--- a/pypy/objspace/std/longobject.py
+++ b/pypy/objspace/std/longobject.py
@@ -45,7 +45,7 @@
         if self.user_overridden_class:
             return None
         b = space.bigint_w(self)
-        b = b.lshift(3).or_(rbigint.fromint(IDTAG_LONG))
+        b = b.lshift(3).int_or_(IDTAG_LONG)
         return space.newlong_from_rbigint(b)
 
     def unwrap(self, space):
@@ -350,8 +350,13 @@
 
     def _make_descr_cmp(opname):
         op = getattr(rbigint, opname)
-        @delegate_other
+        intop = getattr(rbigint, "int_" + opname)
+
         def descr_impl(self, space, w_other):
+            if isinstance(w_other, W_AbstractIntObject):
+                return space.newbool(intop(self.num, w_other.int_w(space)))
+            elif not isinstance(w_other, W_AbstractLongObject):
+                return space.w_NotImplemented
             return space.newbool(op(self.num, w_other.asbigint()))
         return func_with_new_name(descr_impl, "descr_" + opname)
 
@@ -362,7 +367,7 @@
     descr_gt = _make_descr_cmp('gt')
     descr_ge = _make_descr_cmp('ge')
 
-    def _make_generic_descr_binop(opname):
+    def _make_generic_descr_binop_noncommutative(opname):
         methname = opname + '_' if opname in ('and', 'or') else opname
         descr_rname = 'descr_r' + opname
         op = getattr(rbigint, methname)
@@ -372,33 +377,65 @@
         def descr_binop(self, space, w_other):
             return W_LongObject(op(self.num, w_other.asbigint()))
 
-        if opname in COMMUTATIVE_OPS:
-            @func_renamer(descr_rname)
-            def descr_rbinop(self, space, w_other):
-                return descr_binop(self, space, w_other)
-        else:
-            @func_renamer(descr_rname)
-            @delegate_other
-            def descr_rbinop(self, space, w_other):
-                return W_LongObject(op(w_other.asbigint(), self.num))
+        @func_renamer(descr_rname)
+        @delegate_other
+        def descr_rbinop(self, space, w_other):
+            return W_LongObject(op(w_other.asbigint(), self.num))
 
         return descr_binop, descr_rbinop
 
+    def _make_generic_descr_binop(opname):
+        if opname not in COMMUTATIVE_OPS:
+            raise Exception("Not supported")
+            
+        methname = opname + '_' if opname in ('and', 'or') else opname
+        descr_rname = 'descr_r' + opname
+        op = getattr(rbigint, methname)
+        intop = getattr(rbigint, "int_" + methname)
+        
+        @func_renamer('descr_' + opname)
+        def descr_binop(self, space, w_other):
+            if isinstance(w_other, W_AbstractIntObject):
+                return W_LongObject(intop(self.num, w_other.int_w(space)))
+            elif not isinstance(w_other, W_AbstractLongObject):
+                return space.w_NotImplemented
+
+            return W_LongObject(op(self.num, w_other.asbigint()))
+
+        @func_renamer(descr_rname)
+        def descr_rbinop(self, space, w_other):
+            if isinstance(w_other, W_AbstractIntObject):
+                return W_LongObject(intop(self.num, w_other.int_w(space)))
+            elif not isinstance(w_other, W_AbstractLongObject):
+                return space.w_NotImplemented
+
+            return W_LongObject(op(w_other.asbigint(), self.num))
+
+        return descr_binop, descr_rbinop
+        
     descr_add, descr_radd = _make_generic_descr_binop('add')
-    descr_sub, descr_rsub = _make_generic_descr_binop('sub')
+    descr_sub, descr_rsub = _make_generic_descr_binop_noncommutative('sub')
     descr_mul, descr_rmul = _make_generic_descr_binop('mul')
     descr_and, descr_rand = _make_generic_descr_binop('and')
     descr_or, descr_ror = _make_generic_descr_binop('or')
     descr_xor, descr_rxor = _make_generic_descr_binop('xor')
 
-    def _make_descr_binop(func):
+    def _make_descr_binop(func, int_func=None):
         opname = func.__name__[1:]
 
-        @delegate_other
-        @func_renamer('descr_' + opname)
-        def descr_binop(self, space, w_other):
-            return func(self, space, w_other)
-
+        if int_func:
+            @func_renamer('descr_' + opname)
+            def descr_binop(self, space, w_other):
+                if isinstance(w_other, W_AbstractIntObject):
+                    return int_func(self, space, w_other.int_w(space))
+                elif not isinstance(w_other, W_AbstractLongObject):
+                    return space.w_NotImplemented
+                return func(self, space, w_other)
+        else:
+            @delegate_other
+            @func_renamer('descr_' + opname)
+            def descr_binop(self, space, w_other):
+                return func(self, space, w_other)
         @delegate_other
         @func_renamer('descr_r' + opname)
         def descr_rbinop(self, space, w_other):
@@ -417,7 +454,13 @@
         except OverflowError:   # b too big
             raise oefmt(space.w_OverflowError, "shift count too large")
         return W_LongObject(self.num.lshift(shift))
-    descr_lshift, descr_rlshift = _make_descr_binop(_lshift)
+        
+    def _int_lshift(self, space, w_other):
+        if w_other < 0:
+            raise oefmt(space.w_ValueError, "negative shift count")
+        return W_LongObject(self.num.lshift(w_other))
+        
+    descr_lshift, descr_rlshift = _make_descr_binop(_lshift, _int_lshift)
 
     def _rshift(self, space, w_other):
         if w_other.asbigint().sign < 0:
@@ -427,8 +470,22 @@
         except OverflowError:   # b too big # XXX maybe just return 0L instead?
             raise oefmt(space.w_OverflowError, "shift count too large")
         return newlong(space, self.num.rshift(shift))
-    descr_rshift, descr_rrshift = _make_descr_binop(_rshift)
+        
+    def _int_rshift(self, space, w_other):
+        if w_other < 0:
+            raise oefmt(space.w_ValueError, "negative shift count")
 
+        return newlong(space, self.num.rshift(w_other))
+    descr_rshift, descr_rrshift = _make_descr_binop(_rshift, _int_rshift)
+
+    def _floordiv(self, space, w_other):
+        try:
+            z = self.num.floordiv(w_other.asbigint())
+        except ZeroDivisionError:
+            raise oefmt(space.w_ZeroDivisionError,
+                        "long division or modulo by zero")
+        return newlong(space, z)
+        
     def _floordiv(self, space, w_other):
         try:
             z = self.num.floordiv(w_other.asbigint())
@@ -448,7 +505,15 @@
             raise oefmt(space.w_ZeroDivisionError,
                         "long division or modulo by zero")
         return newlong(space, z)
-    descr_mod, descr_rmod = _make_descr_binop(_mod)
+        
+    def _int_mod(self, space, w_other):
+        try:
+            z = self.num.int_mod(w_other)
+        except ZeroDivisionError:
+            raise oefmt(space.w_ZeroDivisionError,
+                        "long division or modulo by zero")
+        return newlong(space, z)
+    descr_mod, descr_rmod = _make_descr_binop(_mod, _int_mod)
 
     def _divmod(self, space, w_other):
         try:
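
One detail of the long/int fast paths above is worth spelling out: when
neither fast path applies, the descriptor returns space.w_NotImplemented,
which tells the interpreter to retry the operation through the other operand's
reflected method. That is also why commutative and non-commutative operators
are now built by separate helpers: only the commutative ones can reuse the
same int fast path for the reflected case, while subtraction must keep the
operand order. A tiny pure-Python sketch of the same protocol (the Metres
class is a hypothetical example, unrelated to the object space):

    class Metres(object):
        def __init__(self, v):
            self.v = v
        def __add__(self, other):
            if isinstance(other, Metres):
                return Metres(self.v + other.v)
            return NotImplemented      # let the other operand try __radd__
        def __radd__(self, other):
            if isinstance(other, (int, float)):
                return Metres(self.v + other)
            return NotImplemented

    print (Metres(2) + Metres(3)).v    # 5
    print (1 + Metres(3)).v            # 4, dispatched via Metres.__radd__
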
diff --git a/rpython/bin/rpython-vmprof b/rpython/bin/rpython-vmprof
new file mode 100755
--- /dev/null
+++ b/rpython/bin/rpython-vmprof
@@ -0,0 +1,28 @@
+#!/usr/bin/env pypy
+
+"""RPython translation usage:
+
+rpython <translation options> target <targetoptions>
+
+run with --help for more information
+"""
+
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(
+                       os.path.dirname(os.path.realpath(__file__)))))
+from rpython.translator.goal.translate import main
+
+# no implicit targets
+if len(sys.argv) == 1:
+    print __doc__
+    sys.exit(1)
+
+import _vmprof, subprocess
+x = subprocess.Popen('gzip > vmprof.log.gz', shell=True, stdin=subprocess.PIPE)
+_vmprof.enable(x.stdin.fileno(), 0.001)
+try:
+    main()
+finally:
+    _vmprof.disable()
+    x.stdin.close()
+    x.wait()
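
The launcher above is also a reusable recipe: enable vmprof around any
long-running call and stream the raw samples through gzip. A hedged sketch of
the same pattern (run_workload is a hypothetical placeholder, and _vmprof must
come from an interpreter built with this branch):

    import subprocess
    import _vmprof

    gz = subprocess.Popen('gzip > profile.log.gz', shell=True,
                          stdin=subprocess.PIPE)
    _vmprof.enable(gz.stdin.fileno(), 0.001)   # same sampling period as above
    try:
        run_workload()   # hypothetical stand-in for the code being profiled
    finally:
        _vmprof.disable()
        gz.stdin.close()
        gz.wait()
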
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -57,6 +57,7 @@
         BaseAssembler.setup_once(self)
 
     def setup(self, looptoken):
+        BaseAssembler.setup(self, looptoken)
         assert self.memcpy_addr != 0, 'setup_once() not called?'
         if we_are_translated():
             self.debug = False
@@ -71,7 +72,6 @@
         self.mc.datablockwrapper = self.datablockwrapper
         self.target_tokens_currently_compiling = {}
         self.frame_depth_to_patch = []
-        self._finish_gcmap = lltype.nullptr(jitframe.GCMAP)
 
     def teardown(self):
         self.current_clt = None
@@ -102,7 +102,7 @@
         self.store_reg(mc, r.r0, r.fp, ofs)
         mc.MOV_rr(r.r0.value, r.fp.value)
         self.gen_func_epilog(mc)
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        rawstart = mc.materialize(self.cpu, [])
         self.propagate_exception_path = rawstart
 
     def _store_and_reset_exception(self, mc, excvalloc=None, exctploc=None,
@@ -198,7 +198,7 @@
         mc.ADD_ri(r.sp.value, r.sp.value, (len(r.argument_regs) + 2) * WORD)
         mc.B(self.propagate_exception_path)
         #
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        rawstart = mc.materialize(self.cpu, [])
         self.stack_check_slowpath = rawstart
 
     def _build_wb_slowpath(self, withcards, withfloats=False, for_frame=False):
@@ -255,7 +255,7 @@
         #
         mc.POP([r.ip.value, r.pc.value])
         #
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        rawstart = mc.materialize(self.cpu, [])
         if for_frame:
             self.wb_slowpath[4] = rawstart
         else:
@@ -276,7 +276,7 @@
                                       callee_only)
         # return
         mc.POP([r.ip.value, r.pc.value])
-        return mc.materialize(self.cpu.asmmemmgr, [])
+        return mc.materialize(self.cpu, [])
 
     def _build_malloc_slowpath(self, kind):
         """ While arriving on slowpath, we have a gcpattern on stack 0.
@@ -352,7 +352,7 @@
         mc.POP([r.ip.value, r.pc.value])
 
         #
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        rawstart = mc.materialize(self.cpu, [])
         return rawstart
 
     def _reload_frame_if_necessary(self, mc):
@@ -473,7 +473,7 @@
         mc.MOV_rr(r.r0.value, r.fp.value)
         #
         self.gen_func_epilog(mc)
-        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        rawstart = mc.materialize(self.cpu, [])
         self.failure_recovery_code[exc + 2 * withfloats] = rawstart
 
     def generate_quick_failure(self, guardtok):
@@ -575,8 +575,8 @@
             self.mc.BL(self.stack_check_slowpath, c=c.HI)      # call if ip > lr
 
     # cpu interface
-    def assemble_loop(self, logger, loopname, inputargs, operations, looptoken,
-                      log):
+    def assemble_loop(self, jd_id, unique_id, logger, loopname, inputargs,
+                      operations, looptoken, log):
         clt = CompiledLoopToken(self.cpu, looptoken.number)
         looptoken.compiled_loop_token = clt
         clt._debug_nbargs = len(inputargs)
@@ -586,6 +586,9 @@
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup(looptoken)
+        #self.codemap_builder.enter_portal_frame(jd_id, unique_id,
+        #                                self.mc.get_relative_pos())
+
 
         frame_info = self.datablockwrapper.malloc_aligned(
             jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
@@ -659,6 +662,7 @@
             assert len(set(inputargs)) == len(inputargs)
 
         self.setup(original_loop_token)
+        #self.codemap.inherit_code_from_position(faildescr.adr_jump_offset)
         descr_number = compute_unique_id(faildescr)
         if log:
             operations = self._inject_debugging_code(faildescr, operations,
@@ -850,7 +854,7 @@
         # restore registers
         self._pop_all_regs_from_jitframe(mc, [], self.cpu.supports_floats)
         mc.POP([r.ip.value, r.pc.value])  # return
-        self._frame_realloc_slowpath = mc.materialize(self.cpu.asmmemmgr, [])
+        self._frame_realloc_slowpath = mc.materialize(self.cpu, [])
 
     def _load_shadowstack_top(self, mc, reg, gcrootmap):
         rst = gcrootmap.get_root_stack_top_addr()
@@ -879,8 +883,12 @@
         self.datablockwrapper.done()      # finish using cpu.asmmemmgr
         self.datablockwrapper = None
         allblocks = self.get_asmmemmgr_blocks(looptoken)
-        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
+        size = self.mc.get_relative_pos() 
+        res = self.mc.materialize(self.cpu, allblocks,
                                    self.cpu.gc_ll_descr.gcrootmap)
+        #self.cpu.codemap.register_codemap(
+        #    self.codemap.get_final_bytecode(res, size))
+        return res
 
     def update_frame_depth(self, frame_depth):
         baseofs = self.cpu.get_baseofs_of_frame_field()
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -459,6 +459,8 @@
         return fcond
     emit_op_jit_debug = emit_op_debug_merge_point
     emit_op_keepalive = emit_op_debug_merge_point
+    emit_op_enter_portal_frame = emit_op_debug_merge_point
+    emit_op_leave_portal_frame = emit_op_debug_merge_point
 
     def emit_op_cond_call_gc_wb(self, op, arglocs, regalloc, fcond):
         self._write_barrier_fastpath(self.mc, op.getdescr(), arglocs, fcond)
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -373,6 +373,12 @@
         return gcmap
 
     # ------------------------------------------------------------
+    def perform_enter_portal_frame(self, op):
+        self.assembler.enter_portal_frame(op)
+
+    def perform_leave_portal_frame(self, op):
+        self.assembler.leave_portal_frame(op)
+
     def perform_extra(self, op, args, fcond):
         return self.assembler.regalloc_emit_extra(op, args, fcond, self)
 
@@ -1149,6 +1155,8 @@
     prepare_op_debug_merge_point = void
     prepare_op_jit_debug = void
     prepare_op_keepalive = void
+    prepare_op_enter_portal_frame = void
+    prepare_op_leave_portal_frame = void
 
     def prepare_op_cond_call_gc_wb(self, op, fcond):
         assert op.result is None
diff --git a/rpython/jit/backend/arm/runner.py b/rpython/jit/backend/arm/runner.py
--- a/rpython/jit/backend/arm/runner.py
+++ b/rpython/jit/backend/arm/runner.py
@@ -50,16 +50,12 @@
     def setup_once(self):
         self.cpuinfo.arch_version = detect_arch_version()
         self.cpuinfo.hf_abi = detect_hardfloat()
+        #self.codemap.setup()
         self.assembler.setup_once()
 
     def finish_once(self):
         self.assembler.finish_once()
 
-    def compile_loop(self, inputargs, operations, looptoken,
-                     log=True, name='', logger=None):
-        return self.assembler.assemble_loop(logger, name, inputargs, operations,
-                                            looptoken, log=log)
-
     def compile_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log=True, logger=None):
         clt = original_loop_token.compiled_loop_token
diff --git a/rpython/jit/backend/arm/test/support.py b/rpython/jit/backend/arm/test/support.py
--- a/rpython/jit/backend/arm/test/support.py
+++ b/rpython/jit/backend/arm/test/support.py
@@ -24,7 +24,7 @@
 
 def run_asm(asm):
     BOOTSTRAP_TP = lltype.FuncType([], lltype.Signed)
-    addr = asm.mc.materialize(asm.cpu.asmmemmgr, [], None)
+    addr = asm.mc.materialize(asm.cpu, [], None)
     assert addr % 8 == 0
     func = rffi.cast(lltype.Ptr(BOOTSTRAP_TP), addr)
     asm.mc._dump_trace(addr, 'test.asm')
diff --git a/rpython/jit/backend/arm/test/test_calling_convention.py b/rpython/jit/backend/arm/test/test_calling_convention.py
--- a/rpython/jit/backend/arm/test/test_calling_convention.py
+++ b/rpython/jit/backend/arm/test/test_calling_convention.py
@@ -29,7 +29,7 @@
         mc = InstrBuilder()
         mc.MOV_rr(r.r0.value, r.sp.value)
         mc.MOV_rr(r.pc.value, r.lr.value)
-        return mc.materialize(self.cpu.asmmemmgr, [])
+        return mc.materialize(self.cpu, [])
 
     def get_alignment_requirements(self):
         return 8
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -245,8 +245,8 @@
         self.stats = stats or MiniStats()
         self.vinfo_for_tests = kwds.get('vinfo_for_tests', None)
 
-    def compile_loop(self, inputargs, operations, looptoken, log=True,
-                     name='', logger=None):
+    def compile_loop(self, inputargs, operations, looptoken, jd_id=0,
+                     unique_id=0, log=True, name='', logger=None):

