[pypy-commit] pypy online-transforms: hg merge default

Thu Nov 6 17:43:44 CET 2014

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: online-transforms
Changeset: r74355:b4bdf3e9a6b6
Date: 2014-11-06 16:41 +0000
http://bitbucket.org/pypy/pypy/changeset/b4bdf3e9a6b6/

Log:	hg merge default

diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -37,4 +37,4 @@
 to use virtualenv with the resulting pypy-c as the interpreter; you can
 find more details about various installation schemes here:
 
-http://doc.pypy.org/en/latest/getting-started.html#installing-pypy
+    http://doc.pypy.org/en/latest/install.html
diff --git a/lib_pypy/grp.py b/lib_pypy/grp.py
--- a/lib_pypy/grp.py
+++ b/lib_pypy/grp.py
@@ -66,11 +66,12 @@
 
 @builtinify
 def getgrnam(name):
-    if not isinstance(name, str):
+    if not isinstance(name, basestring):
         raise TypeError("expected string")
+    name = str(name)
     res = libc.getgrnam(name)
     if not res:
-        raise KeyError(name)
+        raise KeyError("'getgrnam(): name not found: %s'" % name)
     return _group_from_gstruct(res)
 
 @builtinify
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -86,9 +86,10 @@
     # itself needs the interp-level struct module
     # because 'P' is missing from the app-level one
     "_rawffi": [("objspace.usemodules.struct", True)],
-    "cpyext": [("translation.secondaryentrypoints", "cpyext,main"),
-               ("translation.shared", sys.platform == "win32")],
+    "cpyext": [("translation.secondaryentrypoints", "cpyext,main")],
 }
+if sys.platform == "win32":
+    module_suggests["cpyext"].append(("translation.shared", True))
 
 module_import_dependencies = {
     # no _rawffi if importing rpython.rlib.clibffi raises ImportError
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -39,3 +39,7 @@
 .. branch: kill-multimethod
 
 Kill multimethod machinery, all multimethods were removed earlier.
+
+.. branch: nditer-external_loop
+
+Implement `external_loop` argument to numpy's nditer
diff --git a/pypy/interpreter/executioncontext.py b/pypy/interpreter/executioncontext.py
--- a/pypy/interpreter/executioncontext.py
+++ b/pypy/interpreter/executioncontext.py
@@ -32,6 +32,17 @@
         self.compiler = space.createcompiler()
         self.profilefunc = None
         self.w_profilefuncarg = None
+        self.thread_disappeared = False   # might be set to True after os.fork()
+
+    @staticmethod
+    def _mark_thread_disappeared(space):
+        # Called in the child process after os.fork() by interp_posix.py.
+        # Marks all ExecutionContexts except the current one
+        # with 'thread_disappeared = True'.
+        me = space.getexecutioncontext()
+        for ec in space.threadlocals.getallvalues().values():
+            if ec is not me:
+                ec.thread_disappeared = True
 
     def gettopframe(self):
         return self.topframeref()
diff --git a/pypy/module/_file/interp_stream.py b/pypy/module/_file/interp_stream.py
--- a/pypy/module/_file/interp_stream.py
+++ b/pypy/module/_file/interp_stream.py
@@ -34,8 +34,12 @@
         # this function runs with the GIL acquired so there is no race
         # condition in the creation of the lock
         me = self.space.getexecutioncontext()   # used as thread ident
-        if self.slockowner is me:
-            return False    # already acquired by the current thread
+        if self.slockowner is not None:
+            if self.slockowner is me:
+                return False    # already acquired by the current thread
+            if self.slockowner.thread_disappeared:
+                self.slockowner = None
+                self.slock = None
         try:
             if self.slock is None:
                 self.slock = self.space.allocate_lock()
diff --git a/pypy/module/micronumpy/concrete.py b/pypy/module/micronumpy/concrete.py
--- a/pypy/module/micronumpy/concrete.py
+++ b/pypy/module/micronumpy/concrete.py
@@ -449,7 +449,7 @@
                 strides.reverse()
                 backstrides.reverse()
                 new_shape.reverse()
-            return SliceArray(self.start, strides, backstrides, new_shape,
+            return self.__class__(self.start, strides, backstrides, new_shape,
                               self, orig_array)
         new_strides = calc_new_strides(new_shape, self.get_shape(),
                                        self.get_strides(),
@@ -460,10 +460,16 @@
         new_backstrides = [0] * len(new_shape)
         for nd in range(len(new_shape)):
             new_backstrides[nd] = (new_shape[nd] - 1) * new_strides[nd]
-        return SliceArray(self.start, new_strides, new_backstrides, new_shape,
+        return self.__class__(self.start, new_strides, new_backstrides, new_shape,
                           self, orig_array)
 
 
+class NonWritableSliceArray(SliceArray):
+    def descr_setitem(self, space, orig_array, w_index, w_value):
+        raise OperationError(space.w_ValueError, space.wrap(
+            "assignment destination is read-only"))
+
+
 class VoidBoxStorage(BaseConcreteArray):
     def __init__(self, size, dtype):
         self.storage = alloc_raw_storage(size)
diff --git a/pypy/module/micronumpy/iterators.py b/pypy/module/micronumpy/iterators.py
--- a/pypy/module/micronumpy/iterators.py
+++ b/pypy/module/micronumpy/iterators.py
@@ -8,8 +8,8 @@
 At which byte in x.data does the item x[3,4] begin?
 if x.strides==[1,5]:
     pData = x.pData + (x.start + 3*1 + 4*5)*sizeof(x.pData[0])
-    pData = x.pData + (x.start + 24) * sizeof(x.pData[0])
-so the offset of the element is 24 elements after the first
+    pData = x.pData + (x.start + 23) * sizeof(x.pData[0])
+so the offset of the element is 23 elements after the first
 
 What is the next element in x after coordinates [3,4]?
 if x.order =='C':
@@ -33,7 +33,7 @@
   which is x.strides[1] * (x.shape[1] - 1) + x.strides[0]
 so if we precalculate the overflow backstride as
 [x.strides[i] * (x.shape[i] - 1) for i in range(len(x.shape))]
-we can go faster.
+we can do only addition while iterating
 All the calculations happen in next()
 """
 from rpython.rlib import jit
@@ -41,6 +41,16 @@
 from pypy.module.micronumpy.base import W_NDimArray
 from pypy.module.micronumpy.flagsobj import _update_contiguous_flags
 
+class OpFlag(object):
+    def __init__(self):
+        self.rw = ''
+        self.broadcast = True
+        self.force_contig = False
+        self.force_align = False
+        self.native_byte_order = False
+        self.tmp_copy = ''
+        self.allocate = False
+
 
 class PureShapeIter(object):
     def __init__(self, shape, idx_w):
@@ -89,11 +99,13 @@
 class ArrayIter(object):
     _immutable_fields_ = ['contiguous', 'array', 'size', 'ndim_m1', 'shape_m1[*]',
                           'strides[*]', 'backstrides[*]', 'factors[*]',
-                          'track_index']
+                          'slice_shape', 'slice_stride', 'slice_backstride',
+                          'track_index', 'operand_type', 'slice_operand_type']
 
     track_index = True
 
-    def __init__(self, array, size, shape, strides, backstrides):
+    def __init__(self, array, size, shape, strides, backstrides, op_flags=OpFlag()):
+        from pypy.module.micronumpy import concrete
         assert len(shape) == len(strides) == len(backstrides)
         _update_contiguous_flags(array)
         self.contiguous = (array.flags & NPY.ARRAY_C_CONTIGUOUS and
@@ -105,6 +117,12 @@
         self.shape_m1 = [s - 1 for s in shape]
         self.strides = strides
         self.backstrides = backstrides
+        self.slice_shape = 1
+        self.slice_stride = -1
+        if strides:
+            self.slice_stride = strides[-1]
+        self.slice_backstride = 1
+        self.slice_operand_type = concrete.SliceArray
 
         ndim = len(shape)
         factors = [0] * ndim
@@ -114,6 +132,10 @@
             else:
                 factors[ndim-i-1] = factors[ndim-i] * shape[ndim-i]
         self.factors = factors
+        if op_flags.rw == 'r':
+            self.operand_type = concrete.ConcreteNonWritableArrayWithBase
+        else:
+            self.operand_type = concrete.ConcreteArrayWithBase
 
     @jit.unroll_safe
     def reset(self, state=None):
@@ -193,6 +215,12 @@
         assert state.iterator is self
         self.array.setitem(state.offset, elem)
 
+    def getoperand(self, st, base):
+        impl = self.operand_type
+        res = impl([], self.array.dtype, self.array.order, [], [],
+                   self.array.storage, base)
+        res.start = st.offset
+        return res
 
 def AxisIter(array, shape, axis, cumulative):
     strides = array.get_strides()
@@ -216,3 +244,42 @@
         size /= shape[axis]
     shape[axis] = backstrides[axis] = 0
     return ArrayIter(array, size, shape, array.strides, backstrides)
+
+class SliceIter(ArrayIter):
+    '''
+    used with external loops; getoperand returns a SliceArray
+    view into the original array (getitem and setitem cannot be called)
+    '''
+    _immutable_fields_ = ['base', 'slice_shape[*]', 'slice_stride[*]', 'slice_backstride[*]']
+
+    def __init__(self, array, size, shape, strides, backstrides, slice_shape,
+                 slice_stride, slice_backstride, op_flags, base):
+        from pypy.module.micronumpy import concrete
+        ArrayIter.__init__(self, array, size, shape, strides, backstrides, op_flags)
+        self.slice_shape = slice_shape
+        self.slice_stride = slice_stride
+        self.slice_backstride = slice_backstride
+        self.base = base
+        if op_flags.rw == 'r':
+            self.slice_operand_type = concrete.NonWritableSliceArray
+        else:
+            self.slice_operand_type = concrete.SliceArray
+
+    def getitem(self, state):
+        # XXX cannot be called - must return a boxed value
+        assert False
+
+    def getitem_bool(self, state):
+        # XXX cannot be called - must return a boxed value
+        assert False
+
+    def setitem(self, state, elem):
+        # XXX cannot be called - must return a boxed value
+        assert False
+
+    def getoperand(self, state, base):
+        assert state.iterator is self
+        impl = self.slice_operand_type
+        arr = impl(state.offset, [self.slice_stride], [self.slice_backstride],
+                   [self.slice_shape], self.array, self.base)
+        return arr
diff --git a/pypy/module/micronumpy/ndarray.py b/pypy/module/micronumpy/ndarray.py
--- a/pypy/module/micronumpy/ndarray.py
+++ b/pypy/module/micronumpy/ndarray.py
@@ -83,8 +83,12 @@
         raise OperationError(space.w_AttributeError, space.wrap(
             "Cannot delete array dtype"))
 
+    def ndims(self):
+        return len(self.get_shape())
+    ndims._always_inline_ = True
+
     def descr_get_ndim(self, space):
-        return space.wrap(len(self.get_shape()))
+        return space.wrap(self.ndims())
 
     def descr_get_itemsize(self, space):
         return space.wrap(self.get_dtype().elsize)
@@ -103,14 +107,14 @@
         return space.wrap(loop.tostring(space, self))
 
     def getitem_filter(self, space, arr):
-        if len(arr.get_shape()) > 1 and arr.get_shape() != self.get_shape():
+        if arr.ndims() > 1 and arr.get_shape() != self.get_shape():
             raise OperationError(space.w_ValueError, space.wrap(
                 "boolean index array should have 1 dimension"))
         if arr.get_size() > self.get_size():
             raise OperationError(space.w_ValueError, space.wrap(
                 "index out of range for array"))
         size = loop.count_all_true(arr)
-        if len(arr.get_shape()) == 1:
+        if arr.ndims() == 1:
             res_shape = [size] + self.get_shape()[1:]
         else:
             res_shape = [size]
@@ -119,7 +123,7 @@
         return loop.getitem_filter(w_res, self, arr)
 
     def setitem_filter(self, space, idx, val):
-        if len(idx.get_shape()) > 1 and idx.get_shape() != self.get_shape():
+        if idx.ndims() > 1 and idx.get_shape() != self.get_shape():
             raise OperationError(space.w_ValueError, space.wrap(
                 "boolean index array should have 1 dimension"))
         if idx.get_size() > self.get_size():
@@ -210,7 +214,7 @@
         if space.is_w(w_idx, space.w_Ellipsis):
             return self
         elif isinstance(w_idx, W_NDimArray) and w_idx.get_dtype().is_bool() \
-                and len(w_idx.get_shape()) > 0:
+                and w_idx.ndims() > 0:
             return self.getitem_filter(space, w_idx)
         try:
             return self.implementation.descr_getitem(space, self, w_idx)
@@ -228,7 +232,7 @@
             self.implementation.setslice(space, convert_to_array(space, w_value))
             return
         elif isinstance(w_idx, W_NDimArray) and w_idx.get_dtype().is_bool() \
-                and len(w_idx.get_shape()) > 0:
+                and w_idx.ndims() > 0:
             self.setitem_filter(space, w_idx, convert_to_array(space, w_value))
             return
         try:
@@ -289,7 +293,7 @@
             shape=shape, backward_broadcast=backward_broadcast)
 
     def is_scalar(self):
-        return len(self.get_shape()) == 0
+        return self.ndims() == 0
 
     def set_scalar_value(self, w_val):
         return self.implementation.setitem(self.implementation.start, w_val)
@@ -408,7 +412,7 @@
         """
         if axis1 == axis2:
             return self
-        n = len(self.get_shape())
+        n = self.ndims()
         if n <= 1:
             return self
         if axis1 < 0:
@@ -426,7 +430,7 @@
         return self.implementation.nonzero(space, index_type)
 
     def descr_tolist(self, space):
-        if len(self.get_shape()) == 0:
+        if self.ndims() == 0:
             return self.get_scalar_value().item(space)
         l_w = []
         for i in range(self.get_shape()[0]):
@@ -514,7 +518,7 @@
         if len(args_w) == 0:
             raise OperationError(space.w_ValueError, space.wrap(
                 "itemset must have at least one argument"))
-        if len(args_w) != len(self.get_shape()) + 1:
+        if len(args_w) != self.ndims() + 1:
             raise OperationError(space.w_ValueError, space.wrap(
                 "incorrect number of indices for array"))
         self.descr_setitem(space, space.newtuple(args_w[:-1]), args_w[-1])
@@ -647,14 +651,14 @@
 
     @unwrap_spec(offset=int, axis1=int, axis2=int)
     def descr_diagonal(self, space, offset=0, axis1=0, axis2=1):
-        if len(self.get_shape()) < 2:
+        if self.ndims() < 2:
             raise OperationError(space.w_ValueError, space.wrap(
                 "need at least 2 dimensions for diagonal"))
-        if (axis1 < 0 or axis2 < 0 or axis1 >= len(self.get_shape()) or
-                axis2 >= len(self.get_shape())):
+        if (axis1 < 0 or axis2 < 0 or axis1 >= self.ndims() or
+                axis2 >= self.ndims()):
             raise oefmt(space.w_ValueError,
                         "axis1(=%d) and axis2(=%d) must be withing range "
-                        "(ndim=%d)", axis1, axis2, len(self.get_shape()))
+                        "(ndim=%d)", axis1, axis2, self.ndims())
         if axis1 == axis2:
             raise OperationError(space.w_ValueError, space.wrap(
                 "axis1 and axis2 cannot be the same"))
@@ -733,7 +737,7 @@
             raise OperationError(space.w_NotImplementedError, space.wrap(
                 'sorter not supported in searchsort'))
         side = searchside_converter(space, w_side)
-        if len(self.get_shape()) != 1:
+        if self.ndims() != 1:
             raise oefmt(space.w_ValueError, "a must be a 1-d array")
         v = convert_to_array(space, w_v)
         ret = W_NDimArray.from_shape(
@@ -972,7 +976,7 @@
         if other.is_scalar():
             #Note: w_out is not modified, this is numpy compliant.
             return self.descr_mul(space, other)
-        elif len(self.get_shape()) < 2 and len(other.get_shape()) < 2:
+        elif self.ndims() < 2 and other.ndims() < 2:
             w_res = self.descr_mul(space, other)
             assert isinstance(w_res, W_NDimArray)
             return w_res.descr_sum(space, space.wrap(-1), out)
@@ -989,7 +993,7 @@
                 matches = False
             elif not out.implementation.order == "C":
                 matches = False
-            elif len(out.get_shape()) != len(out_shape):
+            elif out.ndims() != len(out_shape):
                 matches = False
             else:
                 for i in range(len(out_shape)):
diff --git a/pypy/module/micronumpy/nditer.py b/pypy/module/micronumpy/nditer.py
--- a/pypy/module/micronumpy/nditer.py
+++ b/pypy/module/micronumpy/nditer.py
@@ -5,7 +5,7 @@
 from pypy.module.micronumpy import ufuncs, support, concrete
 from pypy.module.micronumpy.base import W_NDimArray, convert_to_array
 from pypy.module.micronumpy.descriptor import decode_w_dtype
-from pypy.module.micronumpy.iterators import ArrayIter
+from pypy.module.micronumpy.iterators import ArrayIter, SliceIter, OpFlag
 from pypy.module.micronumpy.strides import (calculate_broadcast_strides,
                                             shape_agreement, shape_agreement_multiple)
 
@@ -35,17 +35,6 @@
     return ret
 
 
-class OpFlag(object):
-    def __init__(self):
-        self.rw = ''
-        self.broadcast = True
-        self.force_contig = False
-        self.force_align = False
-        self.native_byte_order = False
-        self.tmp_copy = ''
-        self.allocate = False
-
-
 def parse_op_flag(space, lst):
     op_flag = OpFlag()
     for w_item in lst:
@@ -71,17 +60,17 @@
         elif item == 'allocate':
             op_flag.allocate = True
         elif item == 'no_subtype':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                '"no_subtype" op_flag not implemented yet'))
+            raise oefmt(space.w_NotImplementedError,
+                '"no_subtype" op_flag not implemented yet')
         elif item == 'arraymask':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                '"arraymask" op_flag not implemented yet'))
+            raise oefmt(space.w_NotImplementedError,
+                '"arraymask" op_flag not implemented yet')
         elif item == 'writemask':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                '"writemask" op_flag not implemented yet'))
+            raise oefmt(space.w_NotImplementedError,
+                '"writemask" op_flag not implemented yet')
         else:
-            raise OperationError(space.w_ValueError, space.wrap(
-                'op_flags must be a tuple or array of per-op flag-tuples'))
+            raise oefmt(space.w_ValueError,
+                'op_flags must be a tuple or array of per-op flag-tuples')
     if op_flag.rw == '':
         raise oefmt(space.w_ValueError,
                     "None of the iterator flags READWRITE, READONLY, or "
@@ -94,8 +83,8 @@
         return
     elif not space.isinstance_w(w_flags, space.w_tuple) and not \
             space.isinstance_w(w_flags, space.w_list):
-        raise OperationError(space.w_ValueError, space.wrap(
-            'Iter global flags must be a list or tuple of strings'))
+        raise oefmt(space.w_ValueError,
+            'Iter global flags must be a list or tuple of strings')
     lst = space.listview(w_flags)
     for w_item in lst:
         if not space.isinstance_w(w_item, space.w_str) and not \
@@ -106,12 +95,10 @@
                         typename)
         item = space.str_w(w_item)
         if item == 'external_loop':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                'nditer external_loop not implemented yet'))
             nditer.external_loop = True
         elif item == 'buffered':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                'nditer buffered not implemented yet'))
+            raise oefmt(space.w_NotImplementedError,
+                'nditer buffered not implemented yet')
             # For numpy compatability
             nditer.buffered = True
         elif item == 'c_index':
@@ -131,8 +118,8 @@
         elif item == 'refs_ok':
             nditer.refs_ok = True
         elif item == 'reduce_ok':
-            raise OperationError(space.w_NotImplementedError, space.wrap(
-                'nditer reduce_ok not implemented yet'))
+            raise oefmt(space.w_NotImplementedError,
+                'nditer reduce_ok not implemented yet')
             nditer.reduce_ok = True
         elif item == 'zerosize_ok':
             nditer.zerosize_ok = True
@@ -141,9 +128,9 @@
                         'Unexpected iterator global flag "%s"',
                         item)
     if nditer.tracked_index and nditer.external_loop:
-        raise OperationError(space.w_ValueError, space.wrap(
+        raise oefmt(space.w_ValueError,
             'Iterator flag EXTERNAL_LOOP cannot be used if an index or '
-            'multi-index is being tracked'))
+            'multi-index is being tracked')
 
 
 def is_backward(imp, order):
@@ -155,11 +142,11 @@
         raise NotImplementedError('not implemented yet')
 
 
-def get_iter(space, order, arr, shape, dtype):
+def get_iter(space, order, arr, shape, dtype, op_flags):
     imp = arr.implementation
     backward = is_backward(imp, order)
     if arr.is_scalar():
-        return ArrayIter(imp, 1, [], [], [])
+        return ArrayIter(imp, 1, [], [], [], op_flags=op_flags)
     if (imp.strides[0] < imp.strides[-1] and not backward) or \
        (imp.strides[0] > imp.strides[-1] and backward):
         # flip the strides. Is this always true for multidimension?
@@ -174,8 +161,103 @@
         backstrides = imp.backstrides
     r = calculate_broadcast_strides(strides, backstrides, imp.shape,
                                     shape, backward)
-    return ArrayIter(imp, imp.get_size(), shape, r[0], r[1])
+    return ArrayIter(imp, imp.get_size(), shape, r[0], r[1], op_flags=op_flags)
 
+def calculate_ndim(op_in, oa_ndim):
+    if oa_ndim >=0:
+        return oa_ndim
+    else:
+        ndim = 0
+        for op in op_in:
+            if op is None:
+                continue
+            assert isinstance(op, W_NDimArray)
+            ndim = max(ndim, op.ndims())
+    return ndim
+
+def coalesce_axes(it, space):
+    # Copy logic from npyiter_coalesce_axes, used in ufunc iterators
+    # and in nditers created with the 'external_loop' flag
+    can_coalesce = True
+    if it.order == 'F':
+        fastest = 0
+    else:
+        fastest = -1
+    for idim in range(it.ndim - 1):
+        for op_it, _ in it.iters:
+            if op_it is None:
+                continue
+            assert isinstance(op_it, ArrayIter)
+            indx = len(op_it.strides)
+            if it.order == 'F':
+                indx = len(op_it.array.strides) - indx
+                assert indx >=0
+                astrides = op_it.array.strides[indx:]
+            else:
+                astrides = op_it.array.strides[:indx]
+            # does op_it iterate over the array "naturally"?
+            if astrides != op_it.strides:
+                can_coalesce = False
+                break
+        if can_coalesce:
+            for i in range(len(it.iters)):
+                old_iter = it.iters[i][0]
+                shape = [s+1 for s in old_iter.shape_m1]
+                strides = old_iter.strides
+                backstrides = old_iter.backstrides
+                if it.order == 'F':
+                    new_shape = shape[1:]
+                    new_strides = strides[1:]
+                    new_backstrides = backstrides[1:]
+                    _stride = min(strides[0], old_iter.slice_stride)
+                else:
+                    new_shape = shape[:-1]
+                    new_strides = strides[:-1]
+                    new_backstrides = backstrides[:-1]
+                    _stride = old_iter.slice_stride
+                # We always want the "fastest" iterator in external loops
+                _shape = shape[fastest] * old_iter.slice_shape
+                _backstride = (_shape - 1) * _stride
+                new_iter = SliceIter(old_iter.array, old_iter.size / shape[fastest],
+                            new_shape, new_strides, new_backstrides,
+                            _shape, _stride, _backstride,
+                            it.op_flags[i], it)
+                it.iters[i] = (new_iter, new_iter.reset())
+            if len(it.shape) > 1:
+                if it.order == 'F':
+                    it.shape = it.shape[1:]
+                else:
+                    it.shape = it.shape[:-1]
+            else:
+                it.shape = [1]
+        else:
+            break
+    # Always coalesce at least one
+    for i in range(len(it.iters)):
+        old_iter = it.iters[i][0]
+        shape = [s+1 for s in old_iter.shape_m1]
+        strides = old_iter.strides
+        backstrides = old_iter.backstrides
+        new_shape = shape[:-1]
+        new_strides = strides[:-1]
+        new_backstrides = backstrides[:-1]
+        _shape = shape[-1] * old_iter.slice_shape
+        # use the operand's iterator's rightmost stride,
+        # even if it is not the fastest (for 'F' or swapped axis)
+        _stride = old_iter.slice_stride
+        _backstride = (_shape - 1) * _stride
+        new_iter = SliceIter(old_iter.array, old_iter.size / shape[-1],
+                    new_shape, new_strides, new_backstrides,
+                    _shape, _stride, _backstride,
+                    it.op_flags[i], it)
+        it.iters[i] = (new_iter, new_iter.reset())
+    if len(it.shape) > 1:
+        if it.order == 'F':
+            it.shape = it.shape[1:]
+        else:
+            it.shape = it.shape[:-1]
+    else:
+        it.shape = [1]
 
 class IndexIterator(object):
     def __init__(self, shape, backward=False):
@@ -205,6 +287,7 @@
 
 
 class W_NDIter(W_Root):
+    _immutable_fields_ = ['ndim', ]
     def __init__(self, space, w_seq, w_flags, w_op_flags, w_op_dtypes, w_casting,
                  w_op_axes, w_itershape, w_buffersize, order):
         self.order = order
@@ -236,28 +319,29 @@
         self.op_flags = parse_op_arg(space, 'op_flags', w_op_flags,
                                      len(self.seq), parse_op_flag)
         # handle w_op_axes
+        oa_ndim = -1
         if not space.is_none(w_op_axes):
-            self.set_op_axes(space, w_op_axes)
+            oa_ndim = self.set_op_axes(space, w_op_axes)
+        self.ndim = calculate_ndim(self.seq, oa_ndim)
 
         # handle w_op_dtypes part 1: creating self.dtypes list from input
         if not space.is_none(w_op_dtypes):
             w_seq_as_list = space.listview(w_op_dtypes)
             self.dtypes = [decode_w_dtype(space, w_elem) for w_elem in w_seq_as_list]
             if len(self.dtypes) != len(self.seq):
-                raise OperationError(space.w_ValueError, space.wrap(
-                    "op_dtypes must be a tuple/list matching the number of ops"))
+                raise oefmt(space.w_ValueError,
+                    "op_dtypes must be a tuple/list matching the number of ops")
         else:
             self.dtypes = []
 
         # handle None or writable operands, calculate my shape
-        self.iters = []
         outargs = [i for i in range(len(self.seq))
                    if self.seq[i] is None or self.op_flags[i].rw == 'w']
         if len(outargs) > 0:
             out_shape = shape_agreement_multiple(space, [self.seq[i] for i in outargs])
         else:
             out_shape = None
-        self.shape = iter_shape = shape_agreement_multiple(space, self.seq,
+        self.shape = shape_agreement_multiple(space, self.seq,
                                                            shape=out_shape)
         if len(outargs) > 0:
             # Make None operands writeonly and flagged for allocation
@@ -276,11 +360,11 @@
             for i in outargs:
                 if self.seq[i] is None:
                     # XXX can we postpone allocation to later?
-                    self.seq[i] = W_NDimArray.from_shape(space, iter_shape, out_dtype)
+                    self.seq[i] = W_NDimArray.from_shape(space, self.shape, out_dtype)
                 else:
                     if not self.op_flags[i].broadcast:
                         # Raises if ooutput cannot be broadcast
-                        shape_agreement(space, iter_shape, self.seq[i], False)
+                        shape_agreement(space, self.shape, self.seq[i], False)
 
         if self.tracked_index != "":
             if self.order == "K":
@@ -289,7 +373,7 @@
                 backward = False
             else:
                 backward = self.order != self.tracked_index
-            self.index_iter = IndexIterator(iter_shape, backward=backward)
+            self.index_iter = IndexIterator(self.shape, backward=backward)
 
         # handle w_op_dtypes part 2: copy where needed if possible
         if len(self.dtypes) > 0:
@@ -311,49 +395,49 @@
             self.dtypes = [s.get_dtype() for s in self.seq]
 
         # create an iterator for each operand
+        self.iters = []
         for i in range(len(self.seq)):
-            it = get_iter(space, self.order, self.seq[i], iter_shape, self.dtypes[i])
+            it = get_iter(space, self.order, self.seq[i], self.shape,
+                          self.dtypes[i], self.op_flags[i])
             it.contiguous = False
             self.iters.append((it, it.reset()))
 
+        if self.external_loop:
+            coalesce_axes(self, space)
+
     def set_op_axes(self, space, w_op_axes):
         if space.len_w(w_op_axes) != len(self.seq):
             raise oefmt(space.w_ValueError,
                         "op_axes must be a tuple/list matching the number of ops")
         op_axes = space.listview(w_op_axes)
-        l = -1
+        oa_ndim = -1
         for w_axis in op_axes:
             if not space.is_none(w_axis):
                 axis_len = space.len_w(w_axis)
-                if l == -1:
-                    l = axis_len
-                elif axis_len != l:
+                if oa_ndim == -1:
+                    oa_ndim = axis_len
+                elif axis_len != oa_ndim:
                     raise oefmt(space.w_ValueError,
                                 "Each entry of op_axes must have the same size")
                 self.op_axes.append([space.int_w(x) if not space.is_none(x) else -1
                                      for x in space.listview(w_axis)])
-        if l == -1:
+        if oa_ndim == -1:
             raise oefmt(space.w_ValueError,
                         "If op_axes is provided, at least one list of axes "
                         "must be contained within it")
-        raise Exception('xxx TODO')
+        raise oefmt(space.w_NotImplementedError, "op_axis not finished yet")
         # Check that values make sense:
         # - in bounds for each operand
         # ValueError: Iterator input op_axes[0][3] (==3) is not a valid axis of op[0], which has 2 dimensions
         # - no repeat axis
         # ValueError: The 'op_axes' provided to the iterator constructor for operand 1 contained duplicate value 0
+        return oa_ndim
 
     def descr_iter(self, space):
         return space.wrap(self)
 
-    def getitem(self, it, st, op_flags):
-        if op_flags.rw == 'r':
-            impl = concrete.ConcreteNonWritableArrayWithBase
-        else:
-            impl = concrete.ConcreteArrayWithBase
-        res = impl([], it.array.dtype, it.array.order, [], [],
-                   it.array.storage, self)
-        res.start = st.offset
+    def getitem(self, it, st):
+        res = it.getoperand(st, self)
         return W_NDimArray(res)
 
     def descr_getitem(self, space, w_idx):
@@ -363,7 +447,7 @@
         except IndexError:
             raise oefmt(space.w_IndexError,
                         "Iterator operand index %d is out of bounds", idx)
-        return self.getitem(it, st, self.op_flags[idx])
+        return self.getitem(it, st)
 
     def descr_setitem(self, space, w_idx, w_value):
         raise oefmt(space.w_NotImplementedError, "not implemented yet")
@@ -385,7 +469,7 @@
             else:
                 self.first_next = False
         for i, (it, st) in enumerate(self.iters):
-            res.append(self.getitem(it, st, self.op_flags[i]))
+            res.append(self.getitem(it, st))
             self.iters[i] = (it, it.next(st))
         if len(res) < 2:
             return res[0]
@@ -477,7 +561,7 @@
         raise oefmt(space.w_NotImplementedError, "not implemented yet")
 
     def descr_get_ndim(self, space):
-        raise oefmt(space.w_NotImplementedError, "not implemented yet")
+        return space.wrap(self.ndim)
 
     def descr_get_nop(self, space):
         raise oefmt(space.w_NotImplementedError, "not implemented yet")
diff --git a/pypy/module/micronumpy/test/test_nditer.py b/pypy/module/micronumpy/test/test_nditer.py
--- a/pypy/module/micronumpy/test/test_nditer.py
+++ b/pypy/module/micronumpy/test/test_nditer.py
@@ -63,9 +63,6 @@
         from numpy import arange, nditer, array
         a = arange(24).reshape(2, 3, 4)
         import sys
-        if '__pypy__' in sys.builtin_module_names:
-            raises(NotImplementedError, nditer, a, flags=['external_loop'])
-            skip('nditer external_loop not implmented')
         r = []
         n = 0
         for x in nditer(a, flags=['external_loop']):
@@ -79,7 +76,9 @@
             r.append(x)
             n += 1
         assert n == 12
-        assert (array(r) == [[ 0, 12], [ 4, 16], [ 8, 20], [ 1, 13], [ 5, 17], [ 9, 21], [ 2, 14], [ 6, 18], [10, 22], [ 3, 15], [ 7, 19], [11, 23]]).all()
+        assert (array(r) == [[ 0, 12], [ 4, 16], [ 8, 20], [ 1, 13], [ 5, 17], [ 9, 21],
+                             [ 2, 14], [ 6, 18], [10, 22], [ 3, 15], [ 7, 19], [11, 23],
+                            ]).all()
         e = raises(ValueError, 'r[0][0] = 0')
         assert str(e.value) == 'assignment destination is read-only'
         r = []
@@ -222,9 +221,6 @@
     def test_outarg(self):
         from numpy import nditer, zeros, arange
         import sys
-        if '__pypy__' in sys.builtin_module_names:
-            raises(NotImplementedError, nditer, [1, 2], flags=['external_loop'])
-            skip('nditer external_loop not implmented')
 
         def square1(a):
             it = nditer([a, None])
@@ -233,6 +229,9 @@
             return it.operands[1]
         assert (square1([1, 2, 3]) == [1, 4, 9]).all()
 
+        if '__pypy__' in sys.builtin_module_names:
+            raises(NotImplementedError, nditer, [1, 2], flags=['buffered'])
+            skip('nditer buffered not implmented')
         def square2(a, out=None):
             it = nditer([a, out], flags=['external_loop', 'buffered'],
                         op_flags=[['readonly'],
@@ -252,10 +251,11 @@
         from numpy import nditer, arange
         a = arange(3)
         import sys
+        b = arange(8).reshape(2,4)
         if '__pypy__' in sys.builtin_module_names:
-            raises(NotImplementedError, nditer, a, flags=['external_loop'])
-            skip('nditer external_loop not implmented')
-        b = arange(8).reshape(2,4)
+            raises(NotImplementedError, nditer, [a, b, None], flags=['external_loop'],
+                   op_axes=[[0, -1, -1], [-1, 0, 1], None])
+            skip('nditer op_axes not implemented yet')
         it = nditer([a, b, None], flags=['external_loop'],
                     op_axes=[[0, -1, -1], [-1, 0, 1], None])
         for x, y, z in it:
diff --git a/pypy/module/posix/interp_posix.py b/pypy/module/posix/interp_posix.py
--- a/pypy/module/posix/interp_posix.py
+++ b/pypy/module/posix/interp_posix.py
@@ -10,6 +10,7 @@
 
 from pypy.interpreter.gateway import unwrap_spec
 from pypy.interpreter.error import OperationError, wrap_oserror, wrap_oserror2
+from pypy.interpreter.executioncontext import ExecutionContext
 from pypy.module.sys.interp_encoding import getfilesystemencoding
 
 
@@ -721,6 +722,8 @@
     "NOT_RPYTHON"
     get_fork_hooks(where).append(hook)
 
+add_fork_hook('child', ExecutionContext._mark_thread_disappeared)
+
 @specialize.arg(0)
 def run_fork_hooks(where, space):
     for hook in get_fork_hooks(where):
diff --git a/pypy/module/test_lib_pypy/test_grp_extra.py b/pypy/module/test_lib_pypy/test_grp_extra.py
--- a/pypy/module/test_lib_pypy/test_grp_extra.py
+++ b/pypy/module/test_lib_pypy/test_grp_extra.py
@@ -9,7 +9,8 @@
                                     "No grp module on this platform")
 
     def test_basic(self):
-        raises(KeyError, self.grp.getgrnam, "dEkLofcG")
+        e = raises(KeyError, self.grp.getgrnam, "dEkLofcG")
+        assert e.value.args[0] == "'getgrnam(): name not found: dEkLofcG'"
         for name in ["root", "wheel"]:
             try:
                 g = self.grp.getgrnam(name)
@@ -19,6 +20,8 @@
             assert 'root' in g.gr_mem or g.gr_mem == []
             assert g.gr_name == name
             assert isinstance(g.gr_passwd, str)    # usually just 'x', don't hope :-)
+            g2 = self.grp.getgrnam(unicode(name))
+            assert g2 == g
             break
         else:
             raise
diff --git a/pypy/module/test_lib_pypy/test_posix_extra.py b/pypy/module/test_lib_pypy/test_posix_extra.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/test_lib_pypy/test_posix_extra.py
@@ -0,0 +1,40 @@
+import py
+import sys, os, subprocess
+
+
+CODE = """
+import sys, os, thread, time
+
+fd1, fd2 = os.pipe()
+f1 = os.fdopen(fd1, 'r', 0)
+f2 = os.fdopen(fd2, 'w', 0)
+
+def f():
+    print "thread started"
+    x = f1.read(1)
+    assert x == "X"
+    print "thread exit"
+
+thread.start_new_thread(f, ())
+time.sleep(0.5)
+if os.fork() == 0:   # in the child
+    time.sleep(0.5)
+    x = f1.read(1)
+    assert x == "Y"
+    print "ok!"
+    sys.exit()
+
+f2.write("X")   # in the parent
+f2.write("Y")   # in the parent
+time.sleep(1.0)
+"""
+
+
+def test_thread_fork_file_lock():
+    if not hasattr(os, 'fork'):
+        py.test.skip("requires 'fork'")
+    output = subprocess.check_output([sys.executable, '-u', '-c', CODE])
+    assert output.splitlines() == [
+        'thread started',
+        'thread exit',
+        'ok!']
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -279,7 +279,8 @@
             # careful in this comparison: if self.value and other.value
             # are both NaN, stored as regular floats (i.e. on 64-bit),
             # then just using "==" would say False: two NaNs are always
-            # different from each other.
+            # different from each other.  Conversely, "0.0 == -0.0" but
+            # they are not the same constant.
             return (longlong.extract_bits(self.value) ==
                     longlong.extract_bits(other.value))
         return False
diff --git a/rpython/jit/metainterp/pyjitpl.py b/rpython/jit/metainterp/pyjitpl.py
--- a/rpython/jit/metainterp/pyjitpl.py
+++ b/rpython/jit/metainterp/pyjitpl.py
@@ -33,6 +33,14 @@
 
 # ____________________________________________________________
 
+FASTPATHS_SAME_BOXES = {
+    "ne": "history.CONST_FALSE",
+    "eq": "history.CONST_TRUE",
+    "lt": "history.CONST_FALSE",
+    "le": "history.CONST_TRUE",
+    "gt": "history.CONST_FALSE",
+    "ge": "history.CONST_TRUE",
+}
 
 class MIFrame(object):
     debug = False
@@ -188,8 +196,6 @@
     # ------------------------------
 
     for _opimpl in ['int_add', 'int_sub', 'int_mul', 'int_floordiv', 'int_mod',
-                    'int_lt', 'int_le', 'int_eq',
-                    'int_ne', 'int_gt', 'int_ge',
                     'int_and', 'int_or', 'int_xor',
                     'int_rshift', 'int_lshift', 'uint_rshift',
                     'uint_lt', 'uint_le', 'uint_gt', 'uint_ge',
@@ -197,7 +203,6 @@
                     'float_add', 'float_sub', 'float_mul', 'float_truediv',
                     'float_lt', 'float_le', 'float_eq',
                     'float_ne', 'float_gt', 'float_ge',
-                    'ptr_eq', 'ptr_ne', 'instance_ptr_eq', 'instance_ptr_ne',
                     ]:
         exec py.code.Source('''
             @arguments("box", "box")
@@ -205,6 +210,18 @@
                 return self.execute(rop.%s, b1, b2)
         ''' % (_opimpl, _opimpl.upper())).compile()
 
+    for _opimpl in ['int_eq', 'int_ne', 'int_lt', 'int_le', 'int_gt', 'int_ge',
+                    'ptr_eq', 'ptr_ne',
+                    'instance_ptr_eq', 'instance_ptr_ne']:
+        exec py.code.Source('''
+            @arguments("box", "box")
+            def opimpl_%s(self, b1, b2):
+                if b1 is b2: # crude fast check
+                    return %s
+                return self.execute(rop.%s, b1, b2)
+        ''' % (_opimpl, FASTPATHS_SAME_BOXES[_opimpl.split("_")[-1]], _opimpl.upper())
+        ).compile()
+
     for _opimpl in ['int_add_ovf', 'int_sub_ovf', 'int_mul_ovf']:
         exec py.code.Source('''
             @arguments("box", "box")
@@ -340,10 +357,13 @@
         exec py.code.Source('''
             @arguments("box", "box", "label")
             def opimpl_goto_if_not_%s(self, b1, b2, target):
-                condbox = self.execute(rop.%s, b1, b2)
+                if b1 is b2:
+                    condbox = %s
+                else:
+                    condbox = self.execute(rop.%s, b1, b2)
                 self.opimpl_goto_if_not(condbox, target)
-        ''' % (_opimpl, _opimpl.upper())).compile()
-
+        ''' % (_opimpl, FASTPATHS_SAME_BOXES[_opimpl.split("_")[-1]], _opimpl.upper())
+        ).compile()
 
     def _establish_nullity(self, box, orgpc):
         value = box.nonnull()
diff --git a/rpython/jit/metainterp/test/test_ajit.py b/rpython/jit/metainterp/test/test_ajit.py
--- a/rpython/jit/metainterp/test/test_ajit.py
+++ b/rpython/jit/metainterp/test/test_ajit.py
@@ -4119,3 +4119,64 @@
         assert res == 42
         res = self.interp_operations(f, [-42])
         assert res == 0
+
+    def test_cmp_fastpaths(self):
+        class Z: pass
+        def make_int(cmp):
+            def f(x):
+                if cmp == 'eq':
+                    return x == x and x == x
+                if cmp == 'ne':
+                    return x != x or x != x
+                if cmp == 'lt':
+                    return x < x or x != x
+                if cmp == 'le':
+                    return x <= x and x <= x
+                if cmp == 'gt':
+                    return x > x or x > x
+                if cmp == 'ge':
+                    return x >= x and x >= x
+                assert 0
+            return f
+
+        def make_str(cmp):
+            def f(x):
+                x = str(x)
+                if cmp == 'eq':
+                    return x is x or x is x
+                if cmp == 'ne':
+                    return x is not x and x is not x
+                assert 0
+            return f
+
+        def make_object(cmp):
+            def f(x):
+                y = Z()
+                y.x = x
+                x = y
+                if cmp == 'eq':
+                    return x is x
+                if cmp == 'ne':
+                    return x is not x
+                assert 0
+            return f
+
+        for cmp in 'eq ne lt le gt ge'.split():
+            f = make_int(cmp)
+            res = self.interp_operations(f, [42])
+            assert res == f(42)
+            opname = "int_%s" % cmp
+            self.check_operations_history(**{opname: 0})
+
+        for cmp in 'eq ne'.split():
+            f = make_str(cmp)
+            res = self.interp_operations(f, [42])
+            assert res == f(42)
+            opname = "ptr_%s" % cmp
+            self.check_operations_history(**{opname: 0})
+
+            f = make_object(cmp)
+            res = self.interp_operations(f, [42])
+            assert res == f(42)
+            opname = "instance_ptr_%s" % cmp
+            self.check_operations_history(**{opname: 0})
diff --git a/rpython/rlib/objectmodel.py b/rpython/rlib/objectmodel.py
--- a/rpython/rlib/objectmodel.py
+++ b/rpython/rlib/objectmodel.py
@@ -201,6 +201,11 @@
         return result
     return decorator
 
+def always_inline(func):
+    """ mark the function as to-be-inlined by the RPython optimizations (not
+    the JIT!), no matter its size."""
+    func._always_inline_ = True
+    return func
 
 
 # ____________________________________________________________
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -137,22 +137,25 @@
     def flatten_marks(self):
         # for testing
         if self.match_marks_flat is None:
-            self.match_marks_flat = [self.match_start, self.match_end]
-            mark = self.match_marks
-            if mark is not None:
-                self.match_lastindex = mark.gid
-            else:
-                self.match_lastindex = -1
-            while mark is not None:
-                index = mark.gid + 2
-                while index >= len(self.match_marks_flat):
-                    self.match_marks_flat.append(-1)
-                if self.match_marks_flat[index] == -1:
-                    self.match_marks_flat[index] = mark.position
-                mark = mark.prev
-            self.match_marks = None    # clear
+            self._compute_flattened_marks()
         return self.match_marks_flat
 
+    def _compute_flattened_marks(self):
+        self.match_marks_flat = [self.match_start, self.match_end]
+        mark = self.match_marks
+        if mark is not None:
+            self.match_lastindex = mark.gid
+        else:
+            self.match_lastindex = -1
+        while mark is not None:
+            index = mark.gid + 2
+            while index >= len(self.match_marks_flat):
+                self.match_marks_flat.append(-1)
+            if self.match_marks_flat[index] == -1:
+                self.match_marks_flat[index] = mark.position
+            mark = mark.prev
+        self.match_marks = None    # clear
+
     def span(self, groupnum=0):
         # compatibility
         fmarks = self.flatten_marks()
diff --git a/rpython/rlib/test/test_objectmodel.py b/rpython/rlib/test/test_objectmodel.py
--- a/rpython/rlib/test/test_objectmodel.py
+++ b/rpython/rlib/test/test_objectmodel.py
@@ -438,6 +438,11 @@
     assert exc.value.message == "f argument 'b' must be of type <type 'str'>"
     py.test.raises(TypeError, "f('hello', 'world', 3)")
 
+def test_always_inline():
+    @always_inline
+    def f(a, b, c):
+        return a, b, c
+    assert f._always_inline_ == True
 
 def test_enforceargs_defaults():
     @enforceargs(int, int)
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -825,7 +825,7 @@
         allows for the process to be performed without an extra copy.
         Make sure to call keep_buffer_alive_until_here on the returned values.
         """
-        new_buf = lltype.malloc(STRTYPE, count)
+        new_buf = mallocfn(count)
         pinned = 0
         if rgc.can_move(new_buf):
             if rgc.pin(new_buf):
@@ -857,7 +857,7 @@
             if llop.shrink_array(lltype.Bool, gc_buf, needed_size):
                 pass     # now 'gc_buf' is smaller
             else:
-                gc_buf = lltype.malloc(STRTYPE, needed_size)
+                gc_buf = mallocfn(needed_size)
                 case_num = 2
         if case_num == 2:
             copy_raw_to_string(raw_buf, gc_buf, 0, needed_size)
diff --git a/rpython/rtyper/module/ll_os.py b/rpython/rtyper/module/ll_os.py
--- a/rpython/rtyper/module/ll_os.py
+++ b/rpython/rtyper/module/ll_os.py
@@ -22,8 +22,6 @@
 from rpython.rtyper.tool import rffi_platform as platform
 from rpython.rlib import rposix
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
-from rpython.rtyper.lltypesystem.llmemory import itemoffsetof, offsetof
-from rpython.rtyper.lltypesystem.rstr import STR
 from rpython.rlib.objectmodel import specialize
 from rpython.translator import cdir
 
@@ -251,12 +249,9 @@
 
     @registering_if(os, 'execv')
     def register_os_execv(self):
-        eci = self.gcc_profiling_bug_workaround(
-            'RPY_EXPORTED_FOR_TESTS int _noprof_execv(char *path, char *argv[])',
-            'return execv(path, argv);')
-        os_execv = self.llexternal('_noprof_execv',
-                                   [rffi.CCHARP, rffi.CCHARPP],
-                                   rffi.INT, compilation_info = eci)
+        os_execv = self.llexternal(
+            'execv',
+            [rffi.CCHARP, rffi.CCHARPP], rffi.INT)
 
         def execv_llimpl(path, args):
             l_args = rffi.ll_liststr2charpp(args)
@@ -270,12 +265,9 @@
 
     @registering_if(os, 'execve')
     def register_os_execve(self):
-        eci = self.gcc_profiling_bug_workaround(
-            'RPY_EXPORTED_FOR_TESTS int _noprof_execve(char *filename, char *argv[], char *envp[])',
-            'return execve(filename, argv, envp);')
         os_execve = self.llexternal(
-            '_noprof_execve', [rffi.CCHARP, rffi.CCHARPP, rffi.CCHARPP],
-            rffi.INT, compilation_info = eci)
+            'execve',
+            [rffi.CCHARP, rffi.CCHARPP, rffi.CCHARPP], rffi.INT)
 
         def execve_llimpl(path, args, env):
             # XXX Check path, args, env for \0 and raise TypeErrors as
@@ -1001,8 +993,6 @@
                                   [rffi.INT, rffi.VOIDP, rffi.SIZE_T],
                                   rffi.SIZE_T)
 
-        offset = offsetof(STR, 'chars') + itemoffsetof(STR.chars, 0)
-
         def os_read_llimpl(fd, count):
             if count < 0:
                 raise OSError(errno.EINVAL, None)
@@ -1727,10 +1717,7 @@
     @registering_if(os, 'fork')
     def register_os_fork(self):
         from rpython.rlib import debug, rthread
-        eci = self.gcc_profiling_bug_workaround('RPY_EXPORTED_FOR_TESTS pid_t _noprof_fork(void)',
-                                                'return fork();')
-        os_fork = self.llexternal('_noprof_fork', [], rffi.PID_T,
-                                  compilation_info = eci,
+        os_fork = self.llexternal('fork', [], rffi.PID_T,
                                   _nowrapper = True)
 
         def fork_llimpl():
@@ -1931,21 +1918,6 @@
         return extdef([int], str, "ll_os.ttyname",
                       llimpl=ttyname_llimpl)
 
-    # ____________________________________________________________
-    # XXX horrible workaround for a bug of profiling in gcc on
-    # OS X with functions containing a direct call to some system calls
-    # like fork(), execv(), execve()
-    def gcc_profiling_bug_workaround(self, decl, body):
-        body = ('/*--no-profiling-for-this-file!--*/\n'
-                '#include "src/precommondefs.h"\n'
-                '%s {\n'
-                '\t%s\n'
-                '}\n' % (decl, body,))
-        return ExternalCompilationInfo(
-            include_dirs=[cdir],
-            post_include_bits = [decl + ';'],
-            separate_module_sources = [body])
-
 # ____________________________________________________________
 # Support for os.environ
 
diff --git a/rpython/translator/c/genc.py b/rpython/translator/c/genc.py
--- a/rpython/translator/c/genc.py
+++ b/rpython/translator/c/genc.py
@@ -42,26 +42,7 @@
         self.compiler = compiler
 
     def first(self):
-        platform = self.compiler.platform
-        if platform.name.startswith('darwin'):
-            # XXX incredible hack for darwin
-            STR = '/*--no-profiling-for-this-file!--*/'
-            no_prof = []
-            prof = []
-            for cfile in self.compiler.cfiles:
-                if STR in cfile.read():
-                    no_prof.append(cfile)
-                else:
-                    prof.append(cfile)
-            p_eci = self.compiler.eci.merge(
-                ExternalCompilationInfo(compile_extra=['-fprofile-generate'],
-                                        link_extra=['-fprofile-generate']))
-            ofiles = platform._compile_o_files(prof, p_eci)
-            _, eci = self.compiler.eci.get_module_files()
-            ofiles += platform._compile_o_files(no_prof, eci)
-            return platform._finish_linking(ofiles, p_eci, None, True)
-        else:
-            return self.build('-fprofile-generate')
+        return self.build('-fprofile-generate')
 
     def probe(self, exe, args):
         # 'args' is a single string typically containing spaces
diff --git a/rpython/translator/platform/distutils_platform.py b/rpython/translator/platform/distutils_platform.py
--- a/rpython/translator/platform/distutils_platform.py
+++ b/rpython/translator/platform/distutils_platform.py
@@ -127,14 +127,6 @@
         for cfile in self.cfilenames:
             cfile = py.path.local(cfile)
             compile_extra = self.compile_extra[:]
-            # -frandom-seed is only to try to be as reproducable as possible
-            if 0 and self.fix_gcc_random_seed:
-                compile_extra.append('-frandom-seed=%s' % (cfile.basename,))
-                # XXX horrible workaround for a bug of profiling in gcc on
-                # OS X with functions containing a direct call to fork()
-                if '/*--no-profiling-for-this-file!--*/' in cfile.read():
-                    compile_extra = [arg for arg in compile_extra
-                                     if not arg.startswith('-fprofile-')]
 
             old = cfile.dirpath().chdir()
             try: