[pypy-commit] pypy value-profiling: merge

Tue Aug 11 15:27:07 CEST 2015

Author: Carl Friedrich Bolz <cfbolz at gmx.de>
Branch: value-profiling
Changeset: r78896:c6fd1f04a9e0
Date: 2015-08-11 15:27 +0200
http://bitbucket.org/pypy/pypy/changeset/c6fd1f04a9e0/

Log:	merge

diff too long, truncating to 2000 out of 5220 lines

diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -55,3 +55,11 @@
 .. branch: nditer-revisited
 
 Implement nditer 'buffered' flag and fix some edge cases
+
+.. branch: ufunc-reduce
+
+Allow multiple axes in ufunc.reduce()
+
+.. branch: fix-tinylang-goals
+
+Update tinylang goals to match current rpython
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -11,7 +11,7 @@
     INT_MIN, INT_MAX, UINT_MAX, USHRT_MAX
 
 from pypy.interpreter.executioncontext import (ExecutionContext, ActionFlag,
-    UserDelAction, CodeUniqueIds)
+    UserDelAction)
 from pypy.interpreter.error import OperationError, new_exception_class, oefmt
 from pypy.interpreter.argument import Arguments
 from pypy.interpreter.miscutils import ThreadLocals, make_weak_value_dictionary
@@ -391,7 +391,6 @@
         self.actionflag = ActionFlag()    # changed by the signal module
         self.check_signal_action = None   # changed by the signal module
         self.user_del_action = UserDelAction(self)
-        self.code_unique_ids = CodeUniqueIds()
         self._code_of_sys_exc_info = None
 
         # can be overridden to a subclass
@@ -670,16 +669,6 @@
             assert ec is not None
             return ec
 
-    def register_code_callback(self, callback):
-        cui = self.code_unique_ids
-        cui.code_callback = callback
-
-    def register_code_object(self, pycode):
-        cui = self.code_unique_ids
-        if cui.code_callback is None:
-            return
-        cui.code_callback(self, pycode)
-
     def _freeze_(self):
         return True
 
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -590,11 +590,3 @@
         # there is no list of length n: if n is large, then the GC
         # will run several times while walking the list, but it will
         # see lower and lower memory usage, with no lower bound of n.
-
-class CodeUniqueIds(object):
-    def __init__(self):
-        if sys.maxint == 2147483647:
-            self.code_unique_id = 0 # XXX this is wrong, it won't work on 32bit
-        else:
-            self.code_unique_id = 0x7000000000000000
-        self.code_callback = None
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -94,7 +94,7 @@
         self.magic = magic
         self._signature = cpython_code_signature(self)
         self._initialize()
-        space.register_code_object(self)
+        self._init_ready()
         self.vprofs = [ValueProf() for i in range(self.co_nlocals)]
 
     def _initialize(self):
@@ -137,14 +137,8 @@
             from pypy.objspace.std.mapdict import init_mapdict_cache
             init_mapdict_cache(self)
 
-        cui = self.space.code_unique_ids
-        self._unique_id = cui.code_unique_id
-        cui.code_unique_id += 4  # so we have two bits that we can mark stuff
-        # with
-
-    def _get_full_name(self):
-        return "py:%s:%d:%s" % (self.co_name, self.co_firstlineno,
-                                self.co_filename)
+    def _init_ready(self):
+        "This is a hook for the vmprof module, which overrides this method."
 
     def _cleanup_(self):
         if (self.magic == cpython_magic and
diff --git a/pypy/module/__pypy__/__init__.py b/pypy/module/__pypy__/__init__.py
--- a/pypy/module/__pypy__/__init__.py
+++ b/pypy/module/__pypy__/__init__.py
@@ -62,6 +62,7 @@
     }
 
     interpleveldefs = {
+        'attach_gdb'                : 'interp_magic.attach_gdb',
         'internal_repr'             : 'interp_magic.internal_repr',
         'bytebuffer'                : 'bytebuffer.bytebuffer',
         'identity_dict'             : 'interp_identitydict.W_IdentityDict',
@@ -100,8 +101,6 @@
 
     def setup_after_space_initialization(self):
         """NOT_RPYTHON"""
-        if not self.space.config.translating:
-            self.extra_interpdef('interp_pdb', 'interp_magic.interp_pdb')
         if self.space.config.objspace.std.withmethodcachecounter:
             self.extra_interpdef('method_cache_counter',
                                  'interp_magic.method_cache_counter')
diff --git a/pypy/module/__pypy__/interp_magic.py b/pypy/module/__pypy__/interp_magic.py
--- a/pypy/module/__pypy__/interp_magic.py
+++ b/pypy/module/__pypy__/interp_magic.py
@@ -15,12 +15,10 @@
     return space.wrap('%r' % (w_object,))
 
 
-def interp_pdb(space):
-    """Run an interp-level pdb.
-    This is not available in translated versions of PyPy."""
-    assert not we_are_translated()
-    import pdb
-    pdb.set_trace()
+def attach_gdb(space):
+    """Run an interp-level gdb (or pdb when untranslated)"""
+    from rpython.rlib.debug import attach_gdb
+    attach_gdb()
 
 
 @unwrap_spec(name=str)
diff --git a/pypy/module/_file/interp_file.py b/pypy/module/_file/interp_file.py
--- a/pypy/module/_file/interp_file.py
+++ b/pypy/module/_file/interp_file.py
@@ -209,7 +209,7 @@
                     # EAGAIN after already some data was received, return it.
                     # Note that we can get EAGAIN while there is buffered data
                     # waiting; read that too.
-                    if is_wouldblock_error(e):
+                    if is_wouldblock_error(e.errno):
                         m = stream.count_buffered_bytes()
                         if m > 0:
                             result.append(stream.read(min(n, m)))
@@ -321,6 +321,10 @@
         self.getstream()    # check if the file is still open
         return os.isatty(self.fd)
 
+    def direct_readinto(self, w_rwbuffer):
+        from pypy.module._file.readinto import direct_readinto
+        return direct_readinto(self, w_rwbuffer)
+
     # ____________________________________________________________
     #
     # The 'file_' methods are the one exposed to app-level.
@@ -413,6 +417,9 @@
 Notice that when in non-blocking mode, less data than what was requested
 may be returned, even if no size parameter was given.""")
 
+    _decl(locals(), "readinto",
+        """readinto(buf) -> length.  Read into the given read-write buffer.""")
+
     _decl(locals(), "readline",
         """readline([size]) -> next line from the file, as a string.
 
@@ -508,16 +515,6 @@
         for w_line in lines:
             self.file_write(w_line)
 
-    def file_readinto(self, w_rwbuffer):
-        """readinto() -> Undocumented.  Don't use this; it may go away."""
-        # XXX not the most efficient solution as it doesn't avoid the copying
-        space = self.space
-        rwbuffer = space.writebuf_w(w_rwbuffer)
-        w_data = self.file_read(rwbuffer.getlength())
-        data = space.str_w(w_data)
-        rwbuffer.setslice(0, data)
-        return space.wrap(len(data))
-
 
 # ____________________________________________________________
 
@@ -603,7 +600,6 @@
                               cls=W_File,
                               doc="Support for 'print'."),
     __repr__ = interp2app(W_File.file__repr__),
-    readinto = interp2app(W_File.file_readinto),
     writelines = interp2app(W_File.file_writelines),
     __exit__ = interp2app(W_File.file__exit__),
     __weakref__ = make_weakref_descr(W_File),
@@ -632,10 +628,10 @@
 MAYBE_EAGAIN      = getattr(errno, 'EAGAIN',      None)
 MAYBE_EWOULDBLOCK = getattr(errno, 'EWOULDBLOCK', None)
 
-def is_wouldblock_error(e):
-    if MAYBE_EAGAIN is not None and e.errno == MAYBE_EAGAIN:
+def is_wouldblock_error(errno):
+    if MAYBE_EAGAIN is not None and errno == MAYBE_EAGAIN:
         return True
-    if MAYBE_EWOULDBLOCK is not None and e.errno == MAYBE_EWOULDBLOCK:
+    if MAYBE_EWOULDBLOCK is not None and errno == MAYBE_EWOULDBLOCK:
         return True
     return False
 
diff --git a/pypy/module/_file/readinto.py b/pypy/module/_file/readinto.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_file/readinto.py
@@ -0,0 +1,81 @@
+import sys, errno
+from rpython.rlib import rposix
+from rpython.rlib.objectmodel import keepalive_until_here
+from rpython.rtyper.lltypesystem import lltype, rffi
+from pypy.module._file.interp_file import is_wouldblock_error, signal_checker
+
+_WIN32 = sys.platform.startswith('win')
+UNDERSCORE_ON_WIN32 = '_' if _WIN32 else ''
+
+os_read = rffi.llexternal(UNDERSCORE_ON_WIN32 + 'read',
+                          [rffi.INT, rffi.CCHARP, rffi.SIZE_T],
+                          rffi.SIZE_T, save_err=rffi.RFFI_SAVE_ERRNO)
+
+
+def direct_readinto(self, w_rwbuffer):
+    rwbuffer = self.space.writebuf_w(w_rwbuffer)
+    stream = self.getstream()
+    size = rwbuffer.getlength()
+    target_address = lltype.nullptr(rffi.CCHARP.TO)
+    fd = -1
+    target_pos = 0
+
+    if size > 64:
+        try:
+            target_address = rwbuffer.get_raw_address()
+        except ValueError:
+            pass
+        else:
+            fd = stream.try_to_find_file_descriptor()
+
+    if fd < 0 or not target_address:
+        # fall-back
+        MAX_PART = 1024 * 1024    # 1 MB
+        while size > MAX_PART:
+            data = self.direct_read(MAX_PART)
+            rwbuffer.setslice(target_pos, data)
+            target_pos += len(data)
+            size -= len(data)
+            if len(data) != MAX_PART:
+                break
+        else:
+            data = self.direct_read(size)
+            rwbuffer.setslice(target_pos, data)
+            target_pos += len(data)
+
+    else:
+        # optimized case: reading more than 64 bytes into a rwbuffer
+        # with a valid raw address
+        self.check_readable()
+
+        # first "read" the part that is already sitting in buffers, if any
+        initial_size = min(size, stream.count_buffered_bytes())
+        if initial_size > 0:
+            data = stream.read(initial_size)
+            rwbuffer.setslice(target_pos, data)
+            target_pos += len(data)
+            size -= len(data)
+
+        # then call os_read() to get the rest
+        if size > 0:
+            stream.flush()
+            while True:
+                got = os_read(fd, rffi.ptradd(target_address, target_pos), size)
+                if got > 0:
+                    target_pos += got
+                    size -= got
+                    if size <= 0:
+                        break
+                elif got == 0:
+                    break
+                else:
+                    err = rposix.get_saved_errno()
+                    if err == errno.EINTR:
+                        signal_checker(self.space)()
+                        continue
+                    if is_wouldblock_error(err) and target_pos > 0:
+                        break
+                    raise OSError(err, "read error")
+            keepalive_until_here(rwbuffer)
+
+    return self.space.wrap(target_pos)
diff --git a/pypy/module/_vmprof/__init__.py b/pypy/module/_vmprof/__init__.py
--- a/pypy/module/_vmprof/__init__.py
+++ b/pypy/module/_vmprof/__init__.py
@@ -2,7 +2,7 @@
 
 class Module(MixedModule):
     """
-    Write me :)
+    VMProf for PyPy: a statistical profiler
     """
     appleveldefs = {
     }
@@ -10,9 +10,13 @@
     interpleveldefs = {
         'enable': 'interp_vmprof.enable',
         'disable': 'interp_vmprof.disable',
+        'VMProfError': 'space.fromcache(interp_vmprof.Cache).w_VMProfError',
     }
 
-    def setup_after_space_initialization(self):
-        # force the __extend__ hacks to occur early
-        from pypy.module._vmprof.interp_vmprof import VMProf
-        self.vmprof = VMProf()
+
+# Force the __extend__ hacks and method replacements to occur
+# early.  Without this, for example, 'PyCode._init_ready' was
+# already found by the annotator to be the original empty
+# method, and the annotator doesn't notice that interp_vmprof.py
+# (loaded later) replaces this method.
+import pypy.module._vmprof.interp_vmprof
diff --git a/pypy/module/_vmprof/interp_vmprof.py b/pypy/module/_vmprof/interp_vmprof.py
--- a/pypy/module/_vmprof/interp_vmprof.py
+++ b/pypy/module/_vmprof/interp_vmprof.py
@@ -1,252 +1,74 @@
-import py, os, sys
-from rpython.rtyper.lltypesystem import lltype, rffi, llmemory
-from rpython.translator.tool.cbuild import ExternalCompilationInfo
-from rpython.rtyper.annlowlevel import cast_instance_to_gcref, cast_base_ptr_to_instance
-from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib import jit, rposix, rgc
-from rpython.rlib.rarithmetic import ovfcheck_float_to_int
-from rpython.rtyper.tool import rffi_platform as platform
-from rpython.rlib.rstring import StringBuilder
-from pypy.interpreter.baseobjspace import W_Root
-from pypy.interpreter.error import oefmt, wrap_oserror, OperationError
+from pypy.interpreter.error import OperationError
 from pypy.interpreter.gateway import unwrap_spec
 from pypy.interpreter.pyframe import PyFrame
 from pypy.interpreter.pycode import PyCode
+from pypy.interpreter.baseobjspace import W_Root
+from rpython.rlib import rvmprof
 
-ROOT = py.path.local(__file__).join('..')
-SRC = ROOT.join('src')
+# ____________________________________________________________
 
-# by default, we statically link vmprof.c into pypy; however, if you set
-# DYNAMIC_VMPROF to True, it will be dynamically linked to the libvmprof.so
-# which is expected to be inside pypy/module/_vmprof/src: this is very useful
-# during development. Note that you have to manually build libvmprof by
-# running make inside the src dir
-DYNAMIC_VMPROF = False
 
-if sys.platform.startswith('linux'):
-    libs = ['dl']
-else:
-    libs = []
+_get_code = lambda frame, w_inputvalue, operr: frame.pycode
+_decorator = rvmprof.vmprof_execute_code("pypy", _get_code, W_Root)
+my_execute_frame = _decorator(PyFrame.execute_frame)
 
-eci_kwds = dict(
-    include_dirs = [SRC],
-    includes = ['vmprof.h', 'trampoline.h'],
-    separate_module_files = [SRC.join('trampoline.vmprof.s')],
-    libraries = libs,
-    
-    post_include_bits=["""
-        int pypy_vmprof_init(void);
-    """],
-    
-    separate_module_sources=["""
-        int pypy_vmprof_init(void) {
-            return vmprof_set_mainloop(pypy_execute_frame_trampoline, 0,
-                                NULL);
-        }
-    """],
-    )
-
-
-if DYNAMIC_VMPROF:
-    eci_kwds['libraries'] += ['vmprof']
-    eci_kwds['link_extra'] = ['-Wl,-rpath,%s' % SRC, '-L%s' % SRC]
-else:
-    eci_kwds['separate_module_files'] += [SRC.join('vmprof.c')]
-
-eci = ExternalCompilationInfo(**eci_kwds)
-
-check_eci = eci.merge(ExternalCompilationInfo(separate_module_files=[
-    SRC.join('fake_pypy_api.c')]))
-
-platform.verify_eci(check_eci)
-
-pypy_execute_frame_trampoline = rffi.llexternal(
-    "pypy_execute_frame_trampoline",
-    [llmemory.GCREF, llmemory.GCREF, llmemory.GCREF, lltype.Signed],
-    llmemory.GCREF,
-    compilation_info=eci,
-    _nowrapper=True, sandboxsafe=True,
-    random_effects_on_gcobjs=True)
-
-pypy_vmprof_init = rffi.llexternal("pypy_vmprof_init", [], rffi.INT,
-                                   compilation_info=eci)
-vmprof_enable = rffi.llexternal("vmprof_enable",
-                                [rffi.INT, rffi.LONG, rffi.INT,
-                                 rffi.CCHARP, rffi.INT],
-                                rffi.INT, compilation_info=eci,
-                                save_err=rffi.RFFI_SAVE_ERRNO)
-vmprof_disable = rffi.llexternal("vmprof_disable", [], rffi.INT,
-                                 compilation_info=eci,
-                                save_err=rffi.RFFI_SAVE_ERRNO)
-vmprof_get_error = rffi.llexternal("vmprof_get_error", [], rffi.CCHARP,
-                                   compilation_info=eci,
-                                   save_err=rffi.RFFI_SAVE_ERRNO)
-
-vmprof_register_virtual_function = rffi.llexternal(
-    "vmprof_register_virtual_function",
-    [rffi.CCHARP, rffi.VOIDP, rffi.VOIDP], lltype.Void,
-    compilation_info=eci, _nowrapper=True)
-
-original_execute_frame = PyFrame.execute_frame.im_func
-original_execute_frame.c_name = 'pypy_pyframe_execute_frame'
-original_execute_frame._dont_inline_ = True
 
 class __extend__(PyFrame):
-    def execute_frame(frame, w_inputvalue=None, operr=None):
-        # go through the asm trampoline ONLY if we are translated but not being JITted.
-        #
-        # If we are not translated, we obviously don't want to go through the
-        # trampoline because there is no C function it can call.
-        #
-        # If we are being JITted, we want to skip the trampoline, else the JIT
-        # cannot see throug it
-        if we_are_translated() and not jit.we_are_jitted():
-            # if we are translated, call the trampoline
-            gc_frame = cast_instance_to_gcref(frame)
-            gc_inputvalue = cast_instance_to_gcref(w_inputvalue)
-            gc_operr = cast_instance_to_gcref(operr)
-            unique_id = frame.pycode._unique_id
-            gc_result = pypy_execute_frame_trampoline(gc_frame, gc_inputvalue,
-                                                      gc_operr, unique_id)
-            return cast_base_ptr_to_instance(W_Root, gc_result)
-        else:
-            return original_execute_frame(frame, w_inputvalue, operr)
+    def execute_frame(self, w_inputvalue=None, operr=None):
+        # indirection for the optional arguments
+        return my_execute_frame(self, w_inputvalue, operr)
 
 
+def _safe(s):
+    if len(s) > 110:
+        s = s[:107] + '...'
+    return s.replace(':', ';')
 
-def write_long_to_string_builder(l, b):
-    if sys.maxint == 2147483647:
-        b.append(chr(l & 0xff))
-        b.append(chr((l >> 8) & 0xff))
-        b.append(chr((l >> 16) & 0xff))
-        b.append(chr((l >> 24) & 0xff))
-    else:
-        b.append(chr(l & 0xff))
-        b.append(chr((l >> 8) & 0xff))
-        b.append(chr((l >> 16) & 0xff))
-        b.append(chr((l >> 24) & 0xff))
-        b.append(chr((l >> 32) & 0xff))
-        b.append(chr((l >> 40) & 0xff))
-        b.append(chr((l >> 48) & 0xff))
-        b.append(chr((l >> 56) & 0xff))
+def _get_full_name(pycode):
+    # careful, must not have extraneous ':' or be longer than 255 chars
+    return "py:%s:%d:%s" % (_safe(pycode.co_name), pycode.co_firstlineno,
+                            _safe(pycode.co_filename))
 
-def try_cast_to_pycode(gcref):
-    return rgc.try_cast_gcref_to_instance(PyCode, gcref)
+rvmprof.register_code_object_class(PyCode, _get_full_name)
 
-MAX_CODES = 1000
 
-class VMProf(object):
-    def __init__(self):
-        self.is_enabled = False
-        self.ever_enabled = False
-        self.fileno = -1
-        self.current_codes = []
+def _init_ready(pycode):
+    rvmprof.register_code(pycode, _get_full_name)
 
-    def enable(self, space, fileno, period_usec):
-        if self.is_enabled:
-            raise oefmt(space.w_ValueError, "_vmprof already enabled")
-        self.fileno = fileno
-        self.is_enabled = True
-        self.write_header(fileno, period_usec)
-        if not self.ever_enabled:
-            if we_are_translated():
-                res = pypy_vmprof_init()
-                if res:
-                    raise OperationError(
-                        space.w_IOError,
-                        space.wrap(rffi.charp2str(vmprof_get_error())))
-            self.ever_enabled = True
-        self.gather_all_code_objs(space)
-        space.register_code_callback(vmprof_register_code)
-        if we_are_translated():
-            # does not work untranslated
-            res = vmprof_enable(fileno, period_usec, 0,
-                                lltype.nullptr(rffi.CCHARP.TO), 0)
-        else:
-            res = 0
-        if res == -1:
-            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
-                                              "_vmprof.enable"))
+PyCode._init_ready = _init_ready
 
-    def gather_all_code_objs(self, space):
-        all_code_objs = rgc.do_get_objects(try_cast_to_pycode)
-        for code in all_code_objs:
-            self.register_code(space, code)
 
-    def write_header(self, fileno, period_usec):
-        assert period_usec > 0
-        b = StringBuilder()
-        write_long_to_string_builder(0, b)
-        write_long_to_string_builder(3, b)
-        write_long_to_string_builder(0, b)
-        write_long_to_string_builder(period_usec, b)
-        write_long_to_string_builder(0, b)
-        b.append('\x04') # interp name
-        b.append(chr(len('pypy')))
-        b.append('pypy')
-        os.write(fileno, b.build())
+# ____________________________________________________________
 
-    def register_code(self, space, code):
-        if self.fileno == -1:
-            raise OperationError(space.w_RuntimeError,
-                                 space.wrap("vmprof not running"))
-        self.current_codes.append(code)
-        if len(self.current_codes) >= MAX_CODES:
-            self._flush_codes(space)
 
-    def _flush_codes(self, space):
-        b = StringBuilder()
-        for code in self.current_codes:
-            name = code._get_full_name()
-            b.append('\x02')
-            write_long_to_string_builder(code._unique_id, b)
-            write_long_to_string_builder(len(name), b)
-            b.append(name)
-        os.write(self.fileno, b.build())
-        self.current_codes = []
+class Cache:
+    def __init__(self, space):
+        self.w_VMProfError = space.new_exception_class("_vmprof.VMProfError")
 
-    def disable(self, space):
-        if not self.is_enabled:
-            raise oefmt(space.w_ValueError, "_vmprof not enabled")
-        self.is_enabled = False
-        space.register_code_callback(None)
-        self._flush_codes(space)
-        self.fileno = -1
-        if we_are_translated():
-           # does not work untranslated
-            res = vmprof_disable()
-        else:
-            res = 0
-        if res == -1:
-            raise wrap_oserror(space, OSError(rposix.get_saved_errno(),
-                                              "_vmprof.disable"))
+def VMProfError(space, e):
+    w_VMProfError = space.fromcache(Cache).w_VMProfError
+    return OperationError(w_VMProfError, space.wrap(e.msg))
 
-def vmprof_register_code(space, code):
-    from pypy.module._vmprof import Module
-    mod_vmprof = space.getbuiltinmodule('_vmprof')
-    assert isinstance(mod_vmprof, Module)
-    mod_vmprof.vmprof.register_code(space, code)
 
 @unwrap_spec(fileno=int, period=float)
-def enable(space, fileno, period=0.01):   # default 100 Hz
-    from pypy.module._vmprof import Module
-    mod_vmprof = space.getbuiltinmodule('_vmprof')
-    assert isinstance(mod_vmprof, Module)
-    #
+def enable(space, fileno, period):
+    """Enable vmprof.  Writes go to the given 'fileno', a file descriptor
+    opened for writing.  *The file descriptor must remain open at least
+    until disable() is called.*
+
+    'period' is a float representing the sampling interval, in seconds.
+    Must be smaller than 1.0
+    """
     try:
-        period_usec = ovfcheck_float_to_int(period * 1000000.0 + 0.5)
-        if period_usec <= 0 or period_usec >= 1e6:
-            # we don't want seconds here at all
-            raise ValueError
-    except (ValueError, OverflowError):
-        raise OperationError(space.w_ValueError,
-                             space.wrap("'period' too large or non positive"))
-    #
-    mod_vmprof.vmprof.enable(space, fileno, period_usec)
+        rvmprof.enable(fileno, period)
+    except rvmprof.VMProfError, e:
+        raise VMProfError(space, e)
 
 def disable(space):
-    from pypy.module._vmprof import Module
-    mod_vmprof = space.getbuiltinmodule('_vmprof')
-    assert isinstance(mod_vmprof, Module)
-    mod_vmprof.vmprof.disable(space)
-
+    """Disable vmprof.  Remember to close the file descriptor afterwards
+    if necessary.
+    """
+    try:
+        rvmprof.disable()
+    except rvmprof.VMProfError, e:
+        raise VMProfError(space, e)
diff --git a/pypy/module/_vmprof/src/config.h b/pypy/module/_vmprof/src/config.h
deleted file mode 100644
--- a/pypy/module/_vmprof/src/config.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#define HAVE_SYS_UCONTEXT_H
-#if defined(__FreeBSD__) || defined(__APPLE__)
-#define PC_FROM_UCONTEXT uc_mcontext.mc_rip
-#else
-#define PC_FROM_UCONTEXT uc_mcontext.gregs[REG_RIP]
-#endif
diff --git a/pypy/module/_vmprof/src/fake_pypy_api.c b/pypy/module/_vmprof/src/fake_pypy_api.c
deleted file mode 100644
--- a/pypy/module/_vmprof/src/fake_pypy_api.c
+++ /dev/null
@@ -1,4 +0,0 @@
-
-void pypy_pyframe_execute_frame(void)
-{
-}
diff --git a/pypy/module/_vmprof/src/get_custom_offset.c b/pypy/module/_vmprof/src/get_custom_offset.c
deleted file mode 100644
--- a/pypy/module/_vmprof/src/get_custom_offset.c
+++ /dev/null
@@ -1,80 +0,0 @@
-
-#ifdef PYPY_JIT_CODEMAP
-
-extern volatile int pypy_codemap_currently_invalid;
-
-void *pypy_find_codemap_at_addr(long addr, long *start_addr);
-long pypy_yield_codemap_at_addr(void *codemap_raw, long addr,
-                                long *current_pos_addr);
-long pypy_jit_stack_depth_at_loc(long loc);
-
-#endif
-
-
-void vmprof_set_tramp_range(void* start, void* end)
-{
-}
-
-int custom_sanity_check()
-{
-#ifdef PYPY_JIT_CODEMAP
-    return !pypy_codemap_currently_invalid;
-#else
-    return 1;
-#endif
-}
-
-static ptrdiff_t vmprof_unw_get_custom_offset(void* ip, void *cp) {
-#ifdef PYPY_JIT_CODEMAP
-    intptr_t ip_l = (intptr_t)ip;
-    return pypy_jit_stack_depth_at_loc(ip_l);
-#else
-    return 0;
-#endif
-}
-
-static long vmprof_write_header_for_jit_addr(void **result, long n,
-                                             void *ip, int max_depth)
-{
-#ifdef PYPY_JIT_CODEMAP
-    void *codemap;
-    long current_pos = 0;
-    intptr_t id;
-    long start_addr = 0;
-    intptr_t addr = (intptr_t)ip;
-    int start, k;
-    void *tmp;
-
-    codemap = pypy_find_codemap_at_addr(addr, &start_addr);
-    if (codemap == NULL)
-        // not a jit code at all
-        return n;
-
-    // modify the last entry to point to start address and not the random one
-    // in the middle
-    result[n - 1] = (void*)start_addr;
-    result[n] = (void*)2;
-    n++;
-    start = n;
-    while (n < max_depth) {
-        id = pypy_yield_codemap_at_addr(codemap, addr, &current_pos);
-        if (id == -1)
-            // finish
-            break;
-        if (id == 0)
-            continue; // not main codemap
-        result[n++] = (void *)id;
-    }
-    k = 0;
-    while (k < (n - start) / 2) {
-        tmp = result[start + k];
-        result[start + k] = result[n - k - 1];
-        result[n - k - 1] = tmp;
-        k++;
-    }
-    if (n < max_depth) {
-        result[n++] = (void*)3;
-    }
-#endif
-    return n;
-}
diff --git a/pypy/module/_vmprof/src/trampoline.h b/pypy/module/_vmprof/src/trampoline.h
deleted file mode 100644
--- a/pypy/module/_vmprof/src/trampoline.h
+++ /dev/null
@@ -1,1 +0,0 @@
-void* pypy_execute_frame_trampoline(void*, void*, void*, long);
diff --git a/pypy/module/_vmprof/src/trampoline.vmprof.s b/pypy/module/_vmprof/src/trampoline.vmprof.s
deleted file mode 100644
--- a/pypy/module/_vmprof/src/trampoline.vmprof.s
+++ /dev/null
@@ -1,15 +0,0 @@
-// NOTE: you need to use TABs, not spaces!
-        
-	.text
-	.globl	pypy_execute_frame_trampoline
-	.type	pypy_execute_frame_trampoline, @function
-pypy_execute_frame_trampoline:
-	.cfi_startproc
-	pushq	%rcx
-	.cfi_def_cfa_offset 16
-	call pypy_pyframe_execute_frame@PLT
-	popq	%rcx
-	.cfi_def_cfa_offset 8
-	ret
-	.cfi_endproc
-	.size	pypy_execute_frame_trampoline, .-pypy_execute_frame_trampoline
diff --git a/pypy/module/_vmprof/src/vmprof.c b/pypy/module/_vmprof/src/vmprof.c
deleted file mode 100644
--- a/pypy/module/_vmprof/src/vmprof.c
+++ /dev/null
@@ -1,463 +0,0 @@
-/* VMPROF
- *
- * statistical sampling profiler specifically designed to profile programs
- * which run on a Virtual Machine and/or bytecode interpreter, such as Python,
- * etc.
- *
- * The logic to dump the C stack traces is partly stolen from the code in gperftools.
- * The file "getpc.h" has been entirely copied from gperftools.
- *
- * Tested only on gcc, linux, x86_64.
- *
- * Copyright (C) 2014-2015
- *   Antonio Cuni - anto.cuni at gmail.com
- *   Maciej Fijalkowski - fijall at gmail.com
- *
- */
-
-
-#include "getpc.h"      // should be first to get the _GNU_SOURCE dfn
-#include <signal.h>
-#include <stdio.h>
-#include <string.h>
-#include <stddef.h>
-#include <assert.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <errno.h>
-#include <pthread.h>
-#include <dlfcn.h>
-
-//#define UNW_LOCAL_ONLY
-//#include <libunwind.h>
-
-#include "vmprof.h"
-#if defined(__FreeBSD__) || defined(__APPLE__)
-#define sighandler_t sig_t
-#endif
-
-#define _unused(x) ((void)x)
-
-#define MAX_FUNC_NAME 128
-#define MAX_STACK_DEPTH 1024
-#define BUFFER_SIZE 8192
-
-
-static int profile_file = 0;
-static char profile_write_buffer[BUFFER_SIZE];
-static int profile_buffer_position = 0;
-void* vmprof_mainloop_func;
-char* vmprof_error = NULL;
-static ptrdiff_t mainloop_sp_offset;
-static vmprof_get_virtual_ip_t mainloop_get_virtual_ip;
-static long last_period_usec = 0;
-static int atfork_hook_installed = 0;
-
-
-/* *************************************************************
- * functions to write a profile file compatible with gperftools
- * *************************************************************
- */
-
-#define MARKER_STACKTRACE '\x01'
-#define MARKER_VIRTUAL_IP '\x02'
-#define MARKER_TRAILER '\x03'
-
-int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*) = NULL;
-int (*unw_step)(unw_cursor_t*) = NULL;
-int (*unw_init_local)(unw_cursor_t *, unw_context_t *) = NULL;
-int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *) = NULL;
-
-static void prof_word(long x) {
-	((long*)(profile_write_buffer + profile_buffer_position))[0] = x;
-	profile_buffer_position += sizeof(long);
-}
-
-static void prof_header(long period_usec) {
-    // XXX never used here?
-    prof_word(0);
-    prof_word(3);
-    prof_word(0);
-    prof_word(period_usec);
-    prof_word(0);
-    write(profile_file, profile_write_buffer, profile_buffer_position);
-    profile_buffer_position = 0;
-}
-
-static void prof_write_stacktrace(void** stack, int depth, int count) {
-    int i;
-	char marker = MARKER_STACKTRACE;
-
-	profile_write_buffer[profile_buffer_position++] = MARKER_STACKTRACE;
-    prof_word(count);
-    prof_word(depth);
-    for(i=0; i<depth; i++)
-        prof_word((long)stack[i]);
-    write(profile_file, profile_write_buffer, profile_buffer_position);
-    profile_buffer_position = 0;
-}
-
-
-/* ******************************************************
- * libunwind workaround for process JIT frames correctly
- * ******************************************************
- */
-
-#include "get_custom_offset.c"
-
-typedef struct {
-    void* _unused1;
-    void* _unused2;
-    void* sp;
-    void* ip;
-    void* _unused3[sizeof(unw_cursor_t)/sizeof(void*) - 4];
-} vmprof_hacked_unw_cursor_t;
-
-static int vmprof_unw_step(unw_cursor_t *cp, int first_run) {
-	void* ip;
-    void* sp;
-    ptrdiff_t sp_offset;
-    unw_get_reg (cp, UNW_REG_IP, (unw_word_t*)&ip);
-    unw_get_reg (cp, UNW_REG_SP, (unw_word_t*)&sp);
-	if (!first_run)
-		// make sure we're pointing to the CALL and not to the first
-		// instruction after. If the callee adjusts the stack for us
-		// it's not safe to be at the instruction after
-		ip -= 1;
-    sp_offset = vmprof_unw_get_custom_offset(ip, cp);
-
-    if (sp_offset == -1) {
-        // it means that the ip is NOT in JITted code, so we can use the
-        // stardard unw_step
-        return unw_step(cp);
-    }
-    else {
-        // this is a horrible hack to manually walk the stack frame, by
-        // setting the IP and SP in the cursor
-        vmprof_hacked_unw_cursor_t *cp2 = (vmprof_hacked_unw_cursor_t*)cp;
-        void* bp = (void*)sp + sp_offset;
-        cp2->sp = bp;
-		bp -= sizeof(void*);
-        cp2->ip = ((void**)bp)[0];
-        // the ret is on the top of the stack minus WORD
-        return 1;
-    }
-}
-
-
-/* *************************************************************
- * functions to dump the stack trace
- * *************************************************************
- */
-
-// The original code here has a comment, "stolen from pprof",
-// about a "__thread int recursive".  But general __thread
-// variables are not really supposed to be accessed from a
-// signal handler.  Moreover, we are using SIGPROF, which
-// should not be recursively called on the same thread.
-//static __thread int recursive;
-
-int get_stack_trace(void** result, int max_depth, ucontext_t *ucontext) {
-    void *ip;
-    int n = 0;
-    unw_cursor_t cursor;
-    unw_context_t uc = *ucontext;
-    //if (recursive) {
-    //    return 0;
-    //}
-    if (!custom_sanity_check()) {
-        return 0;
-    }
-    //++recursive;
-
-    int ret = unw_init_local(&cursor, &uc);
-    assert(ret >= 0);
-    _unused(ret);
-	int first_run = 1;
-
-    while (n < max_depth) {
-        if (unw_get_reg(&cursor, UNW_REG_IP, (unw_word_t *) &ip) < 0) {
-            break;
-        }
-
-        unw_proc_info_t pip;
-        unw_get_proc_info(&cursor, &pip);
-
-        /* char funcname[4096]; */
-        /* unw_word_t offset; */
-        /* unw_get_proc_name(&cursor, funcname, 4096, &offset); */
-        /* printf("%s+%#lx <%p>\n", funcname, offset, ip); */
-
-        /* if n==0, it means that the signal handler interrupted us while we
-           were in the trampoline, so we are not executing (yet) the real main
-           loop function; just skip it */
-        if (vmprof_mainloop_func && 
-            (void*)pip.start_ip == (void*)vmprof_mainloop_func &&
-            n > 0) {
-          // found main loop stack frame
-          void* sp;
-          unw_get_reg(&cursor, UNW_REG_SP, (unw_word_t *) &sp);
-          void *arg_addr = (char*)sp + mainloop_sp_offset;
-          void **arg_ptr = (void**)arg_addr;
-          // fprintf(stderr, "stacktrace mainloop: rsp %p   &f2 %p   offset %ld\n", 
-          //         sp, arg_addr, mainloop_sp_offset);
-		  if (mainloop_get_virtual_ip) {
-			  ip = mainloop_get_virtual_ip(*arg_ptr);
-		  } else {
-			  ip = *arg_ptr;
-		  }
-        }
-
-        result[n++] = ip;
-		n = vmprof_write_header_for_jit_addr(result, n, ip, max_depth);
-        if (vmprof_unw_step(&cursor, first_run) <= 0) {
-            break;
-        }
-		first_run = 0;
-    }
-    //--recursive;
-    return n;
-}
-
-
-static int __attribute__((noinline)) frame_forcer(int rv) {
-    return rv;
-}
-
-static void sigprof_handler(int sig_nr, siginfo_t* info, void *ucontext) {
-    void* stack[MAX_STACK_DEPTH];
-    int saved_errno = errno;
-    stack[0] = GetPC((ucontext_t*)ucontext);
-    int depth = frame_forcer(get_stack_trace(stack+1, MAX_STACK_DEPTH-1, ucontext));
-    depth++;  // To account for pc value in stack[0];
-    prof_write_stacktrace(stack, depth, 1);
-    errno = saved_errno;
-}
-
-/* *************************************************************
- * functions to enable/disable the profiler
- * *************************************************************
- */
-
-static int open_profile(int fd, long period_usec, int write_header, char *s,
-						int slen) {
-	if ((fd = dup(fd)) == -1) {
-		return -1;
-	}
-	profile_buffer_position = 0;
-    profile_file = fd;
-	if (write_header)
-		prof_header(period_usec);
-	if (s)
-		write(profile_file, s, slen);
-	return 0;
-}
-
-static int close_profile(void) {
-	// XXX all of this can happily fail
-    FILE* src;
-    char buf[BUFSIZ];
-    size_t size;
-	int marker = MARKER_TRAILER;
-	write(profile_file, &marker, 1);
-
-#ifdef __linux__
-    // copy /proc/PID/maps to the end of the profile file
-    sprintf(buf, "/proc/%d/maps", getpid());
-    src = fopen(buf, "r");
-    if (!src) {
-        vmprof_error = "error opening proc maps";
-        return -1;
-    }
-    while ((size = fread(buf, 1, BUFSIZ, src))) {
-        write(profile_file, buf, size);
-    }
-    fclose(src);
-#else
-    // freebsd and mac
-    sprintf(buf, "procstat -v %d", getpid());
-    src = popen(buf, "r");
-    if (!src) {
-        vmprof_error = "error calling procstat";
-        return -1;
-    }
-    while ((size = fread(buf, 1, BUFSIZ, src))) {
-        write(profile_file, buf, size);
-    }
-    pclose(src);
-#endif
-    close(profile_file);
-	return 0;
-}
-
-
-static int install_sigprof_handler(void) {
-    struct sigaction sa;
-    memset(&sa, 0, sizeof(sa));
-    sa.sa_sigaction = sigprof_handler;
-    sa.sa_flags = SA_RESTART | SA_SIGINFO;
-    if (sigemptyset(&sa.sa_mask) == -1 ||
-		sigaction(SIGPROF, &sa, NULL) == -1) {
-		return -1;
-	}
-	return 0;
-}
-
-static int remove_sigprof_handler(void) {
-    sighandler_t res = signal(SIGPROF, SIG_DFL);
-	if (res == SIG_ERR) {
-		return -1;
-	}
-	return 0;
-};
-
-static int install_sigprof_timer(long period_usec) {
-    static struct itimerval timer;
-    last_period_usec = period_usec;
-    timer.it_interval.tv_sec = 0;
-    timer.it_interval.tv_usec = period_usec;
-    timer.it_value = timer.it_interval;
-    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
-		return -1;
-    }
-	return 0;
-}
-
-static int remove_sigprof_timer(void) {
-    static struct itimerval timer;
-    timer.it_interval.tv_sec = 0;
-    timer.it_interval.tv_usec = 0;
-    timer.it_value.tv_sec = 0;
-    timer.it_value.tv_usec = 0;
-    if (setitimer(ITIMER_PROF, &timer, NULL) != 0) {
-		return -1;
-    }
-	return 0;
-}
-
-static void atfork_disable_timer(void) {
-    if (last_period_usec) {
-        remove_sigprof_timer();
-    }
-}
-
-static void atfork_enable_timer(void) {
-    if (last_period_usec) {
-        install_sigprof_timer(last_period_usec);
-    }
-}
-
-static int install_pthread_atfork_hooks(void) {
-    /* this is needed to prevent the problems described there:
-         - http://code.google.com/p/gperftools/issues/detail?id=278
-         - http://lists.debian.org/debian-glibc/2010/03/msg00161.html
-
-        TL;DR: if the RSS of the process is large enough, the clone() syscall
-        will be interrupted by the SIGPROF before it can complete, then
-        retried, interrupted again and so on, in an endless loop.  The
-        solution is to disable the timer around the fork, and re-enable it
-        only inside the parent.
-    */
-    if (atfork_hook_installed)
-        return 0;
-    int ret = pthread_atfork(atfork_disable_timer, atfork_enable_timer, NULL);
-    if (ret != 0)
-        return -1;
-    atfork_hook_installed = 1;
-    return 0;
-}
-
-/* *************************************************************
- * public API
- * *************************************************************
- */
-
-int vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
-                         vmprof_get_virtual_ip_t get_virtual_ip) {
-    void *libhandle;
-
-    mainloop_sp_offset = sp_offset;
-    mainloop_get_virtual_ip = get_virtual_ip;
-    vmprof_mainloop_func = func;
-    if (!unw_get_reg) {
-        if (!(libhandle = dlopen("libunwind.so", RTLD_LAZY | RTLD_LOCAL))) {
-            vmprof_error = dlerror();
-            return -1;
-        }
-        if (!(unw_get_reg = dlsym(libhandle, "_ULx86_64_get_reg"))) {
-            vmprof_error = dlerror();
-            return -1;
-        }
-        if (!(unw_get_proc_info = dlsym(libhandle, "_ULx86_64_get_proc_info"))){
-            vmprof_error = dlerror();
-            return -1;
-        }
-        if (!(unw_init_local = dlsym(libhandle, "_ULx86_64_init_local"))) {
-            vmprof_error = dlerror();
-            return -1;
-        }
-        if (!(unw_step = dlsym(libhandle, "_ULx86_64_step"))) {
-            vmprof_error = dlerror();
-            return -1;
-        }
-    }
-    return 0;
-}
-
-char* vmprof_get_error()
-{
-    char* res;
-    res = vmprof_error;
-    vmprof_error = NULL;
-    return res;
-}
-
-int vmprof_enable(int fd, long period_usec, int write_header, char *s,
-				  int slen)
-{
-    assert(period_usec > 0);
-    if (open_profile(fd, period_usec, write_header, s, slen) == -1) {
-		return -1;
-	}
-    if (install_sigprof_handler() == -1) {
-		return -1;
-	}
-    if (install_sigprof_timer(period_usec) == -1) {
-		return -1;
-	}
-    if (install_pthread_atfork_hooks() == -1) {
-        return -1;
-    }
-	return 0;
-}
-
-int vmprof_disable(void) {
-    if (remove_sigprof_timer() == -1) {
-		return -1;
-	}
-    last_period_usec = 0;
-    if (remove_sigprof_handler() == -1) {
-		return -1;
-	}
-    if (close_profile() == -1) {
-		return -1;
-	}
-	return 0;
-}
-
-void vmprof_register_virtual_function(const char* name, void* start, void* end) {
-	// XXX unused by pypy
-    // for now *end is simply ignored
-	char buf[1024];
-	int lgt = strlen(name) + 2 * sizeof(long) + 1;
-
-	if (lgt > 1024) {
-		lgt = 1024;
-	}
-	buf[0] = MARKER_VIRTUAL_IP;
-	((void **)(((void*)buf) + 1))[0] = start;
-	((long *)(((void*)buf) + 1 + sizeof(long)))[0] = lgt - 2 * sizeof(long) - 1;
-	strncpy(buf + 2 * sizeof(long) + 1, name, 1024 - 2 * sizeof(long) - 1);
-	write(profile_file, buf, lgt);
-}
diff --git a/pypy/module/_vmprof/src/vmprof.h b/pypy/module/_vmprof/src/vmprof.h
deleted file mode 100644
--- a/pypy/module/_vmprof/src/vmprof.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef VMPROF_VMPROF_H_
-#define VMPROF_VMPROF_H_
-
-#include <stddef.h>
-#include <stdint.h>
-#include <ucontext.h>
-
-// copied from libunwind.h
-
-typedef enum
-  {
-    UNW_X86_64_RAX,
-    UNW_X86_64_RDX,
-    UNW_X86_64_RCX,
-    UNW_X86_64_RBX,
-    UNW_X86_64_RSI,
-    UNW_X86_64_RDI,
-    UNW_X86_64_RBP,
-    UNW_X86_64_RSP,
-    UNW_X86_64_R8,
-    UNW_X86_64_R9,
-    UNW_X86_64_R10,
-    UNW_X86_64_R11,
-    UNW_X86_64_R12,
-    UNW_X86_64_R13,
-    UNW_X86_64_R14,
-    UNW_X86_64_R15,
-    UNW_X86_64_RIP,
-#ifdef CONFIG_MSABI_SUPPORT
-    UNW_X86_64_XMM0,
-    UNW_X86_64_XMM1,
-    UNW_X86_64_XMM2,
-    UNW_X86_64_XMM3,
-    UNW_X86_64_XMM4,
-    UNW_X86_64_XMM5,
-    UNW_X86_64_XMM6,
-    UNW_X86_64_XMM7,
-    UNW_X86_64_XMM8,
-    UNW_X86_64_XMM9,
-    UNW_X86_64_XMM10,
-    UNW_X86_64_XMM11,
-    UNW_X86_64_XMM12,
-    UNW_X86_64_XMM13,
-    UNW_X86_64_XMM14,
-    UNW_X86_64_XMM15,
-    UNW_TDEP_LAST_REG = UNW_X86_64_XMM15,
-#else
-    UNW_TDEP_LAST_REG = UNW_X86_64_RIP,
-#endif
-
-    /* XXX Add other regs here */
-
-    /* frame info (read-only) */
-    UNW_X86_64_CFA,
-
-    UNW_TDEP_IP = UNW_X86_64_RIP,
-    UNW_TDEP_SP = UNW_X86_64_RSP,
-    UNW_TDEP_BP = UNW_X86_64_RBP,
-    UNW_TDEP_EH = UNW_X86_64_RAX
-  }
-x86_64_regnum_t;
-
-typedef uint64_t unw_word_t;
-
-#define UNW_TDEP_CURSOR_LEN 127
-
-typedef struct unw_cursor
-  {
-    unw_word_t opaque[UNW_TDEP_CURSOR_LEN];
-  }
-unw_cursor_t;
-
-#define UNW_REG_IP UNW_X86_64_RIP
-#define UNW_REG_SP UNW_X86_64_RSP
-
-typedef ucontext_t unw_context_t;
-
-typedef struct unw_proc_info
-  {
-    unw_word_t start_ip;	/* first IP covered by this procedure */
-    unw_word_t end_ip;		/* first IP NOT covered by this procedure */
-    unw_word_t lsda;		/* address of lang.-spec. data area (if any) */
-    unw_word_t handler;		/* optional personality routine */
-    unw_word_t gp;		/* global-pointer value for this procedure */
-    unw_word_t flags;		/* misc. flags */
-
-    int format;			/* unwind-info format (arch-specific) */
-    int unwind_info_size;	/* size of the information (if applicable) */
-    void *unwind_info;		/* unwind-info (arch-specific) */
-  }
-unw_proc_info_t;
-
-// functions copied from libunwind using dlopen
-
-extern int (*unw_get_reg)(unw_cursor_t*, int, unw_word_t*);
-extern int (*unw_step)(unw_cursor_t*);
-extern int (*unw_init_local)(unw_cursor_t *, unw_context_t *);
-extern int (*unw_get_proc_info)(unw_cursor_t *, unw_proc_info_t *);
-
-// end of copy
-
-extern char* vmprof_error;
-
-typedef void* (*vmprof_get_virtual_ip_t)(void*);
-char* vmprof_get_error();
-
-extern void* vmprof_mainloop_func;
-int vmprof_set_mainloop(void* func, ptrdiff_t sp_offset, 
-                         vmprof_get_virtual_ip_t get_virtual_ip);
-
-void vmprof_register_virtual_function(const char* name, void* start, void* end);
-
-
-int vmprof_enable(int fd, long period_usec, int write_header, char* vips,
-				  int vips_len);
-int vmprof_disable(void);
-
-// XXX: this should be part of _vmprof (the CPython extension), not vmprof (the library)
-void vmprof_set_tramp_range(void* start, void* end);
-
-#endif
diff --git a/pypy/module/_vmprof/test/test__vmprof.py b/pypy/module/_vmprof/test/test__vmprof.py
--- a/pypy/module/_vmprof/test/test__vmprof.py
+++ b/pypy/module/_vmprof/test/test__vmprof.py
@@ -1,14 +1,14 @@
 
-import tempfile
+from rpython.tool.udir import udir
 from pypy.tool.pytest.objspace import gettestobjspace
 
 class AppTestVMProf(object):
     def setup_class(cls):
         cls.space = gettestobjspace(usemodules=['_vmprof', 'struct'])
-        cls.tmpfile = tempfile.NamedTemporaryFile()
+        cls.tmpfile = udir.join('test__vmprof.1').open('wb')
         cls.w_tmpfileno = cls.space.wrap(cls.tmpfile.fileno())
         cls.w_tmpfilename = cls.space.wrap(cls.tmpfile.name)
-        cls.tmpfile2 = tempfile.NamedTemporaryFile()
+        cls.tmpfile2 = udir.join('test__vmprof.2').open('wb')
         cls.w_tmpfileno2 = cls.space.wrap(cls.tmpfile2.fileno())
         cls.w_tmpfilename2 = cls.space.wrap(cls.tmpfile2.name)
 
@@ -29,19 +29,23 @@
             while i < len(s):
                 if s[i] == '\x03':
                     break
-                if s[i] == '\x01':
-                    xxx
-                assert s[i] == '\x02'
-                i += 1
-                _, size = struct.unpack("ll", s[i:i + 2 * WORD])
-                count += 1
-                i += 2 * WORD + size
+                elif s[i] == '\x01':
+                    i += 1
+                    _, size = struct.unpack("ll", s[i:i + 2 * WORD])
+                    i += 2 * WORD + size * struct.calcsize("P")
+                elif s[i] == '\x02':
+                    i += 1
+                    _, size = struct.unpack("ll", s[i:i + 2 * WORD])
+                    count += 1
+                    i += 2 * WORD + size
+                else:
+                    raise AssertionError(ord(s[i]))
             return count
         
         import _vmprof
-        _vmprof.enable(self.tmpfileno)
+        _vmprof.enable(self.tmpfileno, 0.01)
         _vmprof.disable()
-        s = open(self.tmpfilename).read()
+        s = open(self.tmpfilename, 'rb').read()
         no_of_codes = count(s)
         assert no_of_codes > 10
         d = {}
@@ -50,14 +54,14 @@
             pass
         """ in d
 
-        _vmprof.enable(self.tmpfileno2)
+        _vmprof.enable(self.tmpfileno2, 0.01)
 
         exec """def foo2():
             pass
         """ in d
 
         _vmprof.disable()
-        s = open(self.tmpfilename2).read()
+        s = open(self.tmpfilename2, 'rb').read()
         no_of_codes2 = count(s)
         assert "py:foo:" in s
         assert "py:foo2:" in s
@@ -65,8 +69,9 @@
 
     def test_enable_ovf(self):
         import _vmprof
-        raises(ValueError, _vmprof.enable, 999, 0)
-        raises(ValueError, _vmprof.enable, 999, -2.5)
-        raises(ValueError, _vmprof.enable, 999, 1e300)
-        raises(ValueError, _vmprof.enable, 999, 1e300 * 1e300)
-        raises(ValueError, _vmprof.enable, 999, (1e300*1e300) / (1e300*1e300))
+        raises(_vmprof.VMProfError, _vmprof.enable, 999, 0)
+        raises(_vmprof.VMProfError, _vmprof.enable, 999, -2.5)
+        raises(_vmprof.VMProfError, _vmprof.enable, 999, 1e300)
+        raises(_vmprof.VMProfError, _vmprof.enable, 999, 1e300 * 1e300)
+        NaN = (1e300*1e300) / (1e300*1e300)
+        raises(_vmprof.VMProfError, _vmprof.enable, 999, NaN)
diff --git a/pypy/module/_vmprof/test/test_direct.py b/pypy/module/_vmprof/test/test_direct.py
--- a/pypy/module/_vmprof/test/test_direct.py
+++ b/pypy/module/_vmprof/test/test_direct.py
@@ -5,7 +5,8 @@
 except ImportError:
     py.test.skip('cffi required')
 
-srcdir = py.path.local(__file__).join("..", "..", "src")
+from rpython.rlib import rvmprof
+srcdir = py.path.local(rvmprof.__file__).join("..", "src")
 
 ffi = cffi.FFI()
 ffi.cdef("""
@@ -17,6 +18,8 @@
 """)
 
 lib = ffi.verify("""
+#define PYPY_JIT_CODEMAP
+
 volatile int pypy_codemap_currently_invalid = 0;
 
 long buffer[] = {0, 0, 0, 0, 0};
@@ -39,7 +42,7 @@
 }
 
 
-""" + open(str(srcdir.join("get_custom_offset.c"))).read())
+""" + open(str(srcdir.join("rvmprof_get_custom_offset.h"))).read())
 
 class TestDirect(object):
     def test_infrastructure(self):
diff --git a/pypy/module/cpyext/TODO b/pypy/module/cpyext/TODO
deleted file mode 100644
--- a/pypy/module/cpyext/TODO
+++ /dev/null
@@ -1,26 +0,0 @@
- - Complete the PyTypeObject initialization code. (see XXX in the code)
-  - Implement further method callers.
-  - Copy the slots from the base.
-  - Those tasks are necessary to be able to call slots from C code correctly.
-  - Additionally, implement further slot wrappers. This is necessary to call
-    slots of PTOs defined in C.
- - Complete the Members support.
-
- - Use a WeakKeyDictionary to count how often a PyObject is allocated for
-   a given wrapped object and use this to assess whether optimizations are
-   useful
-
- - replace @cpython_api(external=False) by another explicit name: all
-   it does is a lltype function pointer, no C code involved.
-
- - Fix GIL handling (e.g. after releasing the GIL, GC operations might occur in savethreads).
-
- - refactor management of py_objects_r2w and py_objects_w2r, this can
-   probably be expressed in terms of _PyObject_GC_TRACK macros.
-
- - PyWeakref_GetObject() returns a borrowed reference, but this turns the
-   WeakRef into a strong reference!
-
- - sort out pypy's buffer protocol. PyPy's buffer right now don't support
-   raw memory (except array which supports it in a hackish way), which
-   should be fixed in order to make it nicely work with cpyext.
diff --git a/pypy/module/micronumpy/casting.py b/pypy/module/micronumpy/casting.py
--- a/pypy/module/micronumpy/casting.py
+++ b/pypy/module/micronumpy/casting.py
@@ -145,23 +145,32 @@
     # equivalent to PyArray_CanCastTypeTo
     if origin == target:
         return True
-    if origin.is_record() or target.is_record():
-        return can_cast_record(space, origin, target, casting)
+    if casting == 'unsafe':
+        return True
+    elif casting == 'no':
+        return origin.eq(space, target)
+    if origin.num == target.num:
+        if origin.is_record():
+            return (target.is_record() and
+                    can_cast_record(space, origin, target, casting))
+        else:
+            if casting == 'equiv':
+                return origin.elsize == target.elsize
+            elif casting == 'safe':
+                return origin.elsize <= target.elsize
+            else:
+                return True
 
-    if casting == 'no':
-        return origin.eq(space, target)
-    elif casting == 'equiv':
-        return origin.num == target.num and origin.elsize == target.elsize
-    elif casting == 'unsafe':
-        return True
     elif casting == 'same_kind':
         if can_cast_to(origin, target):
             return True
         if origin.kind in kind_ordering and target.kind in kind_ordering:
             return kind_ordering[origin.kind] <= kind_ordering[target.kind]
         return False
-    else:  # 'safe'
+    elif casting == 'safe':
         return can_cast_to(origin, target)
+    else:  # 'equiv'
+        return origin.num == target.num and origin.elsize == target.elsize
 
 def can_cast_record(space, origin, target, casting):
     if origin is target:
diff --git a/pypy/module/micronumpy/descriptor.py b/pypy/module/micronumpy/descriptor.py
--- a/pypy/module/micronumpy/descriptor.py
+++ b/pypy/module/micronumpy/descriptor.py
@@ -101,6 +101,9 @@
 
     @specialize.argtype(1)
     def box(self, value):
+        if self.is_record():
+            raise oefmt(self.itemtype.space.w_NotImplementedError,
+                "cannot box a value into a 'record' dtype, this is a bug please report it")
         return self.itemtype.box(value)
 
     @specialize.argtype(1, 2)
@@ -1028,6 +1031,11 @@
     elif space.isinstance_w(w_dtype, space.w_tuple):
         w_dtype0 = space.getitem(w_dtype, space.wrap(0))
         w_dtype1 = space.getitem(w_dtype, space.wrap(1))
+        if space.isinstance_w(w_dtype0, space.w_type) and \
+           space.isinstance_w(w_dtype1, space.w_list):
+            #obscure api - (subclass, spec). Ignore the subclass
+            return make_new_dtype(space, w_subtype, w_dtype1, alignment, 
+                        copy=copy, w_shape=w_shape, w_metadata=w_metadata)
         subdtype = make_new_dtype(space, w_subtype, w_dtype0, alignment, copy)
         assert isinstance(subdtype, W_Dtype)
         if subdtype.elsize == 0:
diff --git a/pypy/module/micronumpy/iterators.py b/pypy/module/micronumpy/iterators.py
--- a/pypy/module/micronumpy/iterators.py
+++ b/pypy/module/micronumpy/iterators.py
@@ -204,17 +204,16 @@
         self.array.setitem(state.offset, elem)
 
 
-def AxisIter(array, shape, axis, cumulative):
+def AxisIter(array, shape, axis):
     strides = array.get_strides()
     backstrides = array.get_backstrides()
-    if not cumulative:
-        if len(shape) == len(strides):
-            # keepdims = True
-            strides = strides[:axis] + [0] + strides[axis + 1:]
-            backstrides = backstrides[:axis] + [0] + backstrides[axis + 1:]
-        else:
-            strides = strides[:axis] + [0] + strides[axis:]
-            backstrides = backstrides[:axis] + [0] + backstrides[axis:]
+    if len(shape) == len(strides):
+        # keepdims = True
+        strides = strides[:axis] + [0] + strides[axis + 1:]
+        backstrides = backstrides[:axis] + [0] + backstrides[axis + 1:]
+    else:
+        strides = strides[:axis] + [0] + strides[axis:]
+        backstrides = backstrides[:axis] + [0] + backstrides[axis:]
     return ArrayIter(array, support.product(shape), shape, strides, backstrides)
 
 
diff --git a/pypy/module/micronumpy/loop.py b/pypy/module/micronumpy/loop.py
--- a/pypy/module/micronumpy/loop.py
+++ b/pypy/module/micronumpy/loop.py
@@ -9,7 +9,7 @@
 from pypy.module.micronumpy import support, constants as NPY
 from pypy.module.micronumpy.base import W_NDimArray, convert_to_array
 from pypy.module.micronumpy.iterators import PureShapeIter, AxisIter, \
-    AllButAxisIter
+    AllButAxisIter, ArrayIter
 from pypy.interpreter.argument import Arguments
 
 
@@ -190,23 +190,64 @@
         source_state = source_iter.next(source_state)
     return target
 
-reduce_driver = jit.JitDriver(name='numpy_reduce',
-                              greens = ['shapelen', 'func', 'done_func',
-                                        'calc_dtype'],
-                              reds = 'auto')
 
-def compute_reduce(space, obj, calc_dtype, func, done_func, identity):
-    obj_iter, obj_state = obj.create_iter()
+def split_iter(arr, axis_flags):
+    """Prepare 2 iterators for nested iteration over `arr`.
+
+    Arguments:
+        arr: instance of BaseConcreteArray
+        axis_flags: list of bools, one for each dimension of `arr`.The inner
+        iterator operates over the dimensions for which the flag is True
+    """
+    shape = arr.get_shape()
+    strides = arr.get_strides()
+    backstrides = arr.get_backstrides()
+    shapelen = len(shape)
+    assert len(axis_flags) == shapelen
+    inner_shape = [-1] * shapelen
+    inner_strides = [-1] * shapelen
+    inner_backstrides = [-1] * shapelen
+    outer_shape = [-1] * shapelen
+    outer_strides = [-1] * shapelen
+    outer_backstrides = [-1] * shapelen
+    for i in range(len(shape)):
+        if axis_flags[i]:
+            inner_shape[i] = shape[i]
+            inner_strides[i] = strides[i]
+            inner_backstrides[i] = backstrides[i]
+            outer_shape[i] = 1
+            outer_strides[i] = 0
+            outer_backstrides[i] = 0
+        else:
+            outer_shape[i] = shape[i]
+            outer_strides[i] = strides[i]
+            outer_backstrides[i] = backstrides[i]
+            inner_shape[i] = 1
+            inner_strides[i] = 0
+            inner_backstrides[i] = 0
+    inner_iter = ArrayIter(arr, support.product(inner_shape),
+                           inner_shape, inner_strides, inner_backstrides)
+    outer_iter = ArrayIter(arr, support.product(outer_shape),
+                           outer_shape, outer_strides, outer_backstrides)
+    return inner_iter, outer_iter
+
+
+reduce_flat_driver = jit.JitDriver(
+    name='numpy_reduce_flat',
+    greens = ['shapelen', 'func', 'done_func', 'calc_dtype'], reds = 'auto')
+
+def reduce_flat(space, func, w_arr, calc_dtype, done_func, identity):
+    obj_iter, obj_state = w_arr.create_iter()
     if identity is None:
         cur_value = obj_iter.getitem(obj_state).convert_to(space, calc_dtype)
         obj_state = obj_iter.next(obj_state)
     else:
         cur_value = identity.convert_to(space, calc_dtype)
-    shapelen = len(obj.get_shape())
+    shapelen = len(w_arr.get_shape())
     while not obj_iter.done(obj_state):
-        reduce_driver.jit_merge_point(shapelen=shapelen, func=func,
-                                      done_func=done_func,
-                                      calc_dtype=calc_dtype)
+        reduce_flat_driver.jit_merge_point(
+            shapelen=shapelen, func=func,
+            done_func=done_func, calc_dtype=calc_dtype)
         rval = obj_iter.getitem(obj_state).convert_to(space, calc_dtype)
         if done_func is not None and done_func(calc_dtype, rval):
             return rval
@@ -214,33 +255,105 @@
         obj_state = obj_iter.next(obj_state)
     return cur_value
 
-reduce_cum_driver = jit.JitDriver(
-    name='numpy_reduce_cum_driver',
+
+reduce_driver = jit.JitDriver(
+    name='numpy_reduce',
+    greens=['shapelen', 'func', 'dtype'], reds='auto')
+
+def reduce(space, func, w_arr, axis_flags, dtype, out, identity):
+    out_iter, out_state = out.create_iter()
+    out_iter.track_index = False
+    shape = w_arr.get_shape()
+    shapelen = len(shape)
+    inner_iter, outer_iter = split_iter(w_arr.implementation, axis_flags)
+    assert outer_iter.size == out_iter.size
+
+    if identity is not None:
+        identity = identity.convert_to(space, dtype)
+    outer_state = outer_iter.reset()
+    while not outer_iter.done(outer_state):
+        inner_state = inner_iter.reset()
+        inner_state.offset = outer_state.offset
+        if identity is not None:
+            w_val = identity
+        else:
+            w_val = inner_iter.getitem(inner_state).convert_to(space, dtype)
+            inner_state = inner_iter.next(inner_state)
+        while not inner_iter.done(inner_state):
+            reduce_driver.jit_merge_point(
+                shapelen=shapelen, func=func, dtype=dtype)
+            w_item = inner_iter.getitem(inner_state).convert_to(space, dtype)
+            w_val = func(dtype, w_item, w_val)
+            inner_state = inner_iter.next(inner_state)
+        out_iter.setitem(out_state, w_val)
+        out_state = out_iter.next(out_state)
+        outer_state = outer_iter.next(outer_state)
+    return out
+
+accumulate_flat_driver = jit.JitDriver(
+    name='numpy_accumulate_flat',
     greens=['shapelen', 'func', 'dtype', 'out_dtype'],
     reds='auto')
 
-def compute_reduce_cumulative(space, obj, out, calc_dtype, func, identity):
-    obj_iter, obj_state = obj.create_iter()
-    out_iter, out_state = out.create_iter()
+def accumulate_flat(space, func, w_arr, calc_dtype, w_out, identity):
+    arr_iter, arr_state = w_arr.create_iter()
+    out_iter, out_state = w_out.create_iter()
     out_iter.track_index = False
     if identity is None:
-        cur_value = obj_iter.getitem(obj_state).convert_to(space, calc_dtype)
+        cur_value = arr_iter.getitem(arr_state).convert_to(space, calc_dtype)
         out_iter.setitem(out_state, cur_value)
         out_state = out_iter.next(out_state)
-        obj_state = obj_iter.next(obj_state)
+        arr_state = arr_iter.next(arr_state)
     else:
         cur_value = identity.convert_to(space, calc_dtype)
-    shapelen = len(obj.get_shape())
-    out_dtype = out.get_dtype()
-    while not obj_iter.done(obj_state):
-        reduce_cum_driver.jit_merge_point(
-            shapelen=shapelen, func=func,
-            dtype=calc_dtype, out_dtype=out_dtype)
-        rval = obj_iter.getitem(obj_state).convert_to(space, calc_dtype)
-        cur_value = func(calc_dtype, cur_value, rval)
+    shapelen = len(w_arr.get_shape())
+    out_dtype = w_out.get_dtype()
+    while not arr_iter.done(arr_state):
+        accumulate_flat_driver.jit_merge_point(
+            shapelen=shapelen, func=func, dtype=calc_dtype,
+            out_dtype=out_dtype)
+        w_item = arr_iter.getitem(arr_state).convert_to(space, calc_dtype)
+        cur_value = func(calc_dtype, cur_value, w_item)
         out_iter.setitem(out_state, out_dtype.coerce(space, cur_value))
         out_state = out_iter.next(out_state)
-        obj_state = obj_iter.next(obj_state)
+        arr_state = arr_iter.next(arr_state)
+
+accumulate_driver = jit.JitDriver(
+    name='numpy_accumulate',
+    greens=['shapelen', 'func', 'calc_dtype'], reds='auto')
+
+
+def accumulate(space, func, w_arr, axis, calc_dtype, w_out, identity):
+    out_iter, out_state = w_out.create_iter()
+    arr_shape = w_arr.get_shape()
+    temp_shape = arr_shape[:axis] + arr_shape[axis + 1:]
+    temp = W_NDimArray.from_shape(space, temp_shape, calc_dtype, w_instance=w_arr)
+    temp_iter = AxisIter(temp.implementation, w_arr.get_shape(), axis)
+    temp_state = temp_iter.reset()
+    arr_iter, arr_state = w_arr.create_iter()
+    arr_iter.track_index = False
+    if identity is not None:
+        identity = identity.convert_to(space, calc_dtype)
+    shapelen = len(arr_shape)
+    while not out_iter.done(out_state):
+        accumulate_driver.jit_merge_point(shapelen=shapelen, func=func,
+                                          calc_dtype=calc_dtype)
+        w_item = arr_iter.getitem(arr_state).convert_to(space, calc_dtype)
+        arr_state = arr_iter.next(arr_state)
+
+        out_indices = out_iter.indices(out_state)
+        if out_indices[axis] == 0:
+            if identity is not None:
+                w_item = func(calc_dtype, identity, w_item)
+        else:
+            cur_value = temp_iter.getitem(temp_state)
+            w_item = func(calc_dtype, cur_value, w_item)
+
+        out_iter.setitem(out_state, w_item)
+        out_state = out_iter.next(out_state)
+        temp_iter.setitem(temp_state, w_item)
+        temp_state = temp_iter.next(temp_state)
+    return w_out
 
 def fill(arr, box):
     arr_iter, arr_state = arr.create_iter()
@@ -298,64 +411,56 @@
             state = x_state
     return out
 
-axis_reduce_driver = jit.JitDriver(name='numpy_axis_reduce',
-                                   greens=['shapelen', 'func', 'dtype'],
-                                   reds='auto')
-
-def do_axis_reduce(space, shape, func, arr, dtype, axis, out, identity, cumulative,
-                   temp):
-    out_iter = AxisIter(out.implementation, arr.get_shape(), axis, cumulative)
-    out_state = out_iter.reset()
-    if cumulative:
-        temp_iter = AxisIter(temp.implementation, arr.get_shape(), axis, False)
-        temp_state = temp_iter.reset()
-    else:
-        temp_iter = out_iter  # hack
-        temp_state = out_state
-    arr_iter, arr_state = arr.create_iter()
-    arr_iter.track_index = False
-    if identity is not None:
-        identity = identity.convert_to(space, dtype)
-    shapelen = len(shape)
-    while not out_iter.done(out_state):
-        axis_reduce_driver.jit_merge_point(shapelen=shapelen, func=func,
-                                           dtype=dtype)
-        w_val = arr_iter.getitem(arr_state).convert_to(space, dtype)
-        arr_state = arr_iter.next(arr_state)
-
-        out_indices = out_iter.indices(out_state)
-        if out_indices[axis] == 0:
-            if identity is not None:
-                w_val = func(dtype, identity, w_val)
-        else:
-            cur = temp_iter.getitem(temp_state)
-            w_val = func(dtype, cur, w_val)
-
-        out_iter.setitem(out_state, w_val)
-        out_state = out_iter.next(out_state)
-        if cumulative:
-            temp_iter.setitem(temp_state, w_val)
-            temp_state = temp_iter.next(temp_state)
-        else:
-            temp_state = out_state
-    return out
-
 
 def _new_argmin_argmax(op_name):
     arg_driver = jit.JitDriver(name='numpy_' + op_name,
                                greens = ['shapelen', 'dtype'],
                                reds = 'auto')
+    arg_flat_driver = jit.JitDriver(name='numpy_flat_' + op_name,
+                                    greens = ['shapelen', 'dtype'],
+                                    reds = 'auto')
 
-    def argmin_argmax(arr):
+    def argmin_argmax(space, w_arr, w_out, axis):
+        from pypy.module.micronumpy.descriptor import get_dtype_cache
+        dtype = w_arr.get_dtype()
+        shapelen = len(w_arr.get_shape())
+        axis_flags = [False] * shapelen
+        axis_flags[axis] = True
+        inner_iter, outer_iter = split_iter(w_arr.implementation, axis_flags)
+        outer_state = outer_iter.reset()
+        out_iter, out_state = w_out.create_iter()
+        while not outer_iter.done(outer_state):
+            inner_state = inner_iter.reset()
+            inner_state.offset = outer_state.offset
+            cur_best = inner_iter.getitem(inner_state)
+            inner_state = inner_iter.next(inner_state)
+            result = 0
+            idx = 1
+            while not inner_iter.done(inner_state):
+                arg_driver.jit_merge_point(shapelen=shapelen, dtype=dtype)
+                w_val = inner_iter.getitem(inner_state)
+                new_best = getattr(dtype.itemtype, op_name)(cur_best, w_val)
+                if dtype.itemtype.ne(new_best, cur_best):
+                    result = idx
+                    cur_best = new_best
+                inner_state = inner_iter.next(inner_state)
+                idx += 1
+            result = get_dtype_cache(space).w_longdtype.box(result)
+            out_iter.setitem(out_state, result)
+            out_state = out_iter.next(out_state)
+            outer_state = outer_iter.next(outer_state)
+        return w_out
+
+    def argmin_argmax_flat(w_arr):
         result = 0
         idx = 1
-        dtype = arr.get_dtype()
-        iter, state = arr.create_iter()
+        dtype = w_arr.get_dtype()
+        iter, state = w_arr.create_iter()
         cur_best = iter.getitem(state)
         state = iter.next(state)
-        shapelen = len(arr.get_shape())
+        shapelen = len(w_arr.get_shape())
         while not iter.done(state):
-            arg_driver.jit_merge_point(shapelen=shapelen, dtype=dtype)
+            arg_flat_driver.jit_merge_point(shapelen=shapelen, dtype=dtype)
             w_val = iter.getitem(state)
             new_best = getattr(dtype.itemtype, op_name)(cur_best, w_val)
             if dtype.itemtype.ne(new_best, cur_best):
@@ -364,9 +469,10 @@
             state = iter.next(state)
             idx += 1
         return result
-    return argmin_argmax
-argmin = _new_argmin_argmax('min')
-argmax = _new_argmin_argmax('max')
+
+    return argmin_argmax, argmin_argmax_flat
+argmin, argmin_flat = _new_argmin_argmax('min')
+argmax, argmax_flat = _new_argmin_argmax('max')
 
 dot_driver = jit.JitDriver(name = 'numpy_dot',
                            greens = ['dtype'],
diff --git a/pypy/module/micronumpy/ndarray.py b/pypy/module/micronumpy/ndarray.py
--- a/pypy/module/micronumpy/ndarray.py
+++ b/pypy/module/micronumpy/ndarray.py
@@ -23,6 +23,8 @@
     get_shape_from_iterable, shape_agreement, shape_agreement_multiple,
     is_c_contiguous, is_f_contiguous, calc_strides, new_view)
 from pypy.module.micronumpy.casting import can_cast_array
+from pypy.module.micronumpy.descriptor import get_dtype_cache
+
 
 
 def _match_dot_shapes(space, left, right):
@@ -484,7 +486,7 @@
         return self.implementation.swapaxes(space, self, axis1, axis2)
 
     def descr_nonzero(self, space):
-        index_type = descriptor.get_dtype_cache(space).w_int64dtype
+        index_type = get_dtype_cache(space).w_int64dtype
         return self.implementation.nonzero(space, index_type)
 
     def descr_tolist(self, space):
@@ -544,8 +546,10 @@
     def descr_set_flatiter(self, space, w_obj):
         iter, state = self.create_iter()
         dtype = self.get_dtype()
-        arr = convert_to_array(space, w_obj)
-        loop.flatiter_setitem(space, dtype, arr, iter, state, 1, iter.size)
+        w_arr = convert_to_array(space, w_obj)
+        if dtype.is_record():
+            return self.implementation.setslice(space, w_arr)
+        loop.flatiter_setitem(space, dtype, w_arr, iter, state, 1, iter.size)
 
     def descr_get_flatiter(self, space):
         from .flatiter import W_FlatIterator
@@ -810,7 +814,7 @@
             if self.get_dtype().is_bool():
                 # numpy promotes bool.round() to float16. Go figure.
                 w_out = W_NDimArray.from_shape(space, self.get_shape(),
-                    descriptor.get_dtype_cache(space).w_float16dtype)
+                    get_dtype_cache(space).w_float16dtype)
             else:
                 w_out = None
         elif not isinstance(w_out, W_NDimArray):
@@ -818,7 +822,7 @@
                 "return arrays must be of ArrayType"))
         out = descriptor.dtype_agreement(space, [self], self.get_shape(), w_out)
         if out.get_dtype().is_bool() and self.get_dtype().is_bool():
-            calc_dtype = descriptor.get_dtype_cache(space).w_longdtype
+            calc_dtype = get_dtype_cache(space).w_longdtype
         else:
             calc_dtype = out.get_dtype()
 
@@ -837,7 +841,7 @@
             raise oefmt(space.w_ValueError, "a must be a 1-d array")
         v = convert_to_array(space, w_v)
         ret = W_NDimArray.from_shape(
-            space, v.get_shape(), descriptor.get_dtype_cache(space).w_longdtype)
+            space, v.get_shape(), get_dtype_cache(space).w_longdtype)
         if side == NPY.SEARCHLEFT:
             binsearch = loop.binsearch_left
         else:
@@ -1145,35 +1149,46 @@
 
     # ----------------------- reduce -------------------------------
 
-    def _reduce_ufunc_impl(ufunc_name, cumulative=False, bool_result=False):
+    def _reduce_ufunc_impl(ufunc_name, name, bool_result=False):
         @unwrap_spec(keepdims=bool)
         def impl(self, space, w_axis=None, w_dtype=None, w_out=None, keepdims=False):
             out = out_converter(space, w_out)
             if bool_result:
-                w_dtype = descriptor.get_dtype_cache(space).w_booldtype
+                w_dtype = get_dtype_cache(space).w_booldtype
             return getattr(ufuncs.get(space), ufunc_name).reduce(