[pypy-commit] pypy object-dtype2: merge default into branch

mattip noreply at buildbot.pypy.org
Thu Mar 26 20:28:46 CET 2015


Author: mattip <matti.picus at gmail.com>
Branch: object-dtype2
Changeset: r76580:6c9f1ab16470
Date: 2015-03-26 18:08 +0200
http://bitbucket.org/pypy/pypy/changeset/6c9f1ab16470/

Log:	merge default into branch

diff too long, truncating to 2000 out of 4415 lines

diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -11,3 +11,12 @@
 32f35069a16d819b58c1b6efb17c44e3e53397b2 release-2.2=3.1
 32f35069a16d819b58c1b6efb17c44e3e53397b2 release-2.3.1
 10f1b29a2bd21f837090286174a9ca030b8680b2 release-2.5.0
+8e24dac0b8e2db30d46d59f2c4daa3d4aaab7861 release-2.5.1
+8e24dac0b8e2db30d46d59f2c4daa3d4aaab7861 release-2.5.1
+0000000000000000000000000000000000000000 release-2.5.1
+0000000000000000000000000000000000000000 release-2.5.1
+e3d046c43451403f5969580fc1c41d5df6c4082a release-2.5.1
+e3d046c43451403f5969580fc1c41d5df6c4082a release-2.5.1
+0000000000000000000000000000000000000000 release-2.5.1
+0000000000000000000000000000000000000000 release-2.5.1
+9c4588d731b7fe0b08669bd732c2b676cb0a8233 release-2.5.1
diff --git a/lib_pypy/_tkinter/app.py b/lib_pypy/_tkinter/app.py
--- a/lib_pypy/_tkinter/app.py
+++ b/lib_pypy/_tkinter/app.py
@@ -96,7 +96,7 @@
 
         if not self.threaded:
             # TCL is not thread-safe, calls needs to be serialized.
-            self._tcl_lock = threading.Lock()
+            self._tcl_lock = threading.RLock()
         else:
             self._tcl_lock = _DummyLock()
 
diff --git a/lib_pypy/cffi.egg-info b/lib_pypy/cffi.egg-info
--- a/lib_pypy/cffi.egg-info
+++ b/lib_pypy/cffi.egg-info
@@ -1,6 +1,6 @@
 Metadata-Version: 1.0
 Name: cffi
-Version: 0.9.0
+Version: 0.9.2
 Summary: Foreign Function Interface for Python calling C code.
 Home-page: http://cffi.readthedocs.org
 Author: Armin Rigo, Maciej Fijalkowski
diff --git a/lib_pypy/cffi/__init__.py b/lib_pypy/cffi/__init__.py
--- a/lib_pypy/cffi/__init__.py
+++ b/lib_pypy/cffi/__init__.py
@@ -4,8 +4,8 @@
 from .api import FFI, CDefError, FFIError
 from .ffiplatform import VerificationError, VerificationMissing
 
-__version__ = "0.9.0"
-__version_info__ = (0, 9, 0)
+__version__ = "0.9.2"
+__version_info__ = (0, 9, 2)
 
 # The verifier module file names are based on the CRC32 of a string that
 # contains the following version number.  It may be older than __version__
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -58,7 +58,7 @@
 
 # General information about the project.
 project = u'PyPy'
-copyright = u'2014, The PyPy Project'
+copyright = u'2015, The PyPy Project'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -67,7 +67,7 @@
 # The short X.Y version.
 version = '2.5'
 # The full version, including alpha/beta/rc tags.
-release = '2.5.0'
+release = '2.5.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/pypy/doc/index-of-release-notes.rst b/pypy/doc/index-of-release-notes.rst
--- a/pypy/doc/index-of-release-notes.rst
+++ b/pypy/doc/index-of-release-notes.rst
@@ -6,6 +6,7 @@
 
 .. toctree::
 
+   release-2.5.1.rst
    release-2.5.0.rst
    release-2.4.0.rst
    release-2.3.1.rst
diff --git a/pypy/doc/index-of-whatsnew.rst b/pypy/doc/index-of-whatsnew.rst
--- a/pypy/doc/index-of-whatsnew.rst
+++ b/pypy/doc/index-of-whatsnew.rst
@@ -7,6 +7,7 @@
 .. toctree::
 
    whatsnew-head.rst
+   whatsnew-2.5.1.rst
    whatsnew-2.5.0.rst
    whatsnew-2.4.0.rst
    whatsnew-2.3.1.rst
diff --git a/pypy/doc/release-2.5.1.rst b/pypy/doc/release-2.5.1.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/release-2.5.1.rst
@@ -0,0 +1,115 @@
+================================
+PyPy 2.5.1 - Pineapple Bromeliad
+================================
+
+We're pleased to announce PyPy 2.5.1, Pineapple `Bromeliad`_ following on the heels of 2.5.0
+
+You can download the PyPy 2.5.1 release here:
+
+    http://pypy.org/download.html
+
+We would like to thank our donors for the continued support of the PyPy
+project, and for those who donate to our three sub-projects, as well as our
+volunteers and contributors.  
+We've shown quite a bit of progress, but we're slowly running out of funds.
+Please consider donating more, or even better convince your employer to donate,
+so we can finish those projects! The three sub-projects are:
+
+* `Py3k`_ (supporting Python 3.x): We have released a Python 3.2.5 compatible version
+   we call PyPy3 2.4.0, and are working toward a Python 3.3 compatible version
+
+* `STM`_ (software transactional memory): We have released a first working version,
+  and continue to try out new promising paths of achieving a fast multithreaded Python
+
+* `NumPy`_ which requires installation of our fork of upstream numpy,
+  available `on bitbucket`_
+
+.. _`Bromeliad`: http://xkcd.com/1498
+.. _`Py3k`: http://pypy.org/py3donate.html
+.. _`STM`: http://pypy.org/tmdonate2.html
+.. _`NumPy`: http://pypy.org/numpydonate.html
+.. _`on bitbucket`: https://www.bitbucket.org/pypy/numpy
+
+We would also like to encourage new people to join the project. PyPy has many
+layers and we need help with all of them: `PyPy`_ and `Rpython`_ documentation
+improvements, tweaking popular `modules`_ to run on pypy, or general `help`_ with making
+Rpython's JIT even better.
+
+.. _`PyPy`: http://doc.pypy.org 
+.. _`Rpython`: http://rpython.readthedocs.org
+.. _`modules`: http://doc.pypy.org/en/latest/project-ideas.html#make-more-python-modules-pypy-friendly
+.. _`help`: http://doc.pypy.org/en/latest/project-ideas.html
+
+What is PyPy?
+=============
+
+PyPy is a very compliant Python interpreter, almost a drop-in replacement for
+CPython 2.7. It's fast (`pypy and cpython 2.7.x`_ performance comparison)
+due to its integrated tracing JIT compiler.
+
+This release supports **x86** machines on most common operating systems
+(Linux 32/64, Mac OS X 64, Windows, and OpenBSD),
+as well as newer **ARM** hardware (ARMv6 or ARMv7, with VFPv3) running Linux.
+
+While we support 32 bit python on Windows, work on the native Windows 64
+bit python is still stalling, we would welcome a volunteer
+to `handle that`_.
+
+.. _`pypy and cpython 2.7.x`: http://speed.pypy.org
+.. _`handle that`: http://doc.pypy.org/en/latest/windows.html#what-is-missing-for-a-full-64-bit-translation
+
+Highlights 
+==========
+
+* The past months have seen pypy mature and grow, as rpython becomes the goto
+  solution for writing fast dynamic language interpreters. Our separation of
+  Rpython and the python interpreter PyPy is now much clearer in the
+  `PyPy documentation`_  and we now have separate `RPython documentation`_.
+  Tell us what still isn't clear, or even better help us improve the documentation.
+
+* We merged version 2.7.9 of python's stdlib. From the python release notice:
+
+  * The entirety of Python 3.4's `ssl module`_ has been backported. 
+    See `PEP 466`_ for justification.
+
+  * HTTPS certificate validation using the system's certificate store is now
+    enabled by default. See `PEP 476`_ for details.
+
+  * SSLv3 has been disabled by default in httplib and its reverse dependencies
+    due to the `POODLE attack`_.
+
+  * The `ensurepip module`_ has been backported, which provides the pip
+    package manager in every Python 2.7 installation. See `PEP 477`_.
+
+* The garbage collector now ignores parts of the stack which did not change
+  since the last collection, another performance boost
+
+* errno and LastError are saved around cffi calls so things like pdb will not
+  overwrite it
+
+* We continue to asymptotically approach a score of 7 times faster than cpython
+  on our benchmark suite, we now rank 6.98 on latest runs
+
+* Issues reported with our previous release were resolved_ after reports from users on
+  our issue tracker at https://bitbucket.org/pypy/pypy/issues or on IRC at
+  #pypy.
+
+.. _`PyPy documentation`: http://doc.pypy.org
+.. _`RPython documentation`: http://rpython.readthedocs.org
+.. _`ssl module`: https://docs.python.org/3/library/ssl.html
+.. _`PEP 466`: https://www.python.org/dev/peps/pep-0466
+.. _`PEP 476`: https://www.python.org/dev/peps/pep-0476
+.. _`PEP 477`: https://www.python.org/dev/peps/pep-0477
+.. _`POODLE attack`: https://www.imperialviolet.org/2014/10/14/poodle.html
+.. _`ensurepip module`: https://docs.python.org/2/library/ensurepip.html
+.. _resolved: http://doc.pypy.org/en/latest/whatsnew-2.5.1.html
+
+Please try it out and let us know what you think. We welcome
+success stories, `experiments`_,  or `benchmarks`_, we know you are using PyPy, please tell us about it!
+
+Cheers
+
+The PyPy Team
+
+.. _`experiments`: http://morepypy.blogspot.com/2015/02/experiments-in-pyrlang-with-rpython.html
+.. _`benchmarks`: https://mithrandi.net/blog/2015/03/axiom-benchmark-results-on-pypy-2-5-0
diff --git a/pypy/doc/stm.rst b/pypy/doc/stm.rst
--- a/pypy/doc/stm.rst
+++ b/pypy/doc/stm.rst
@@ -25,8 +25,8 @@
 .. _`2nd call for donation`: http://pypy.org/tmdonate2.html
 
 
-Introduction
-============
+What pypy-stm is for
+====================
 
 ``pypy-stm`` is a variant of the regular PyPy interpreter.  (This
 version supports Python 2.7; see below for `Python 3`_.)  With caveats_
@@ -45,15 +45,36 @@
   it as a drop-in replacement and multithreaded programs will run on
   multiple cores.
 
-* ``pypy-stm`` does not impose any special API to the user, but it
-  provides a new pure Python module called `transactional_memory`_ with
-  features to inspect the state or debug conflicts_ that prevent
-  parallelization.  This module can also be imported on top of a non-STM
-  PyPy or CPython.
+* ``pypy-stm`` provides (but does not impose) a special API to the
+  user in the pure Python module ``transaction``.  This module is based
+  on the lower-level module ``pypystm``, but also provides some
+  compatibility with non-STM PyPy's or CPython's.
 
 * Building on top of the way the GIL is removed, we will talk
-  about `Atomic sections, Transactions, etc.: a better way to write
-  parallel programs`_.
+  about `How to write multithreaded programs: the 10'000-feet view`_
+  and `transaction.TransactionQueue`_.
+
+
+...and what pypy-stm is not for
+-------------------------------
+
+``pypy-stm`` gives a Python without the GIL.  This means that it is
+useful in situations where the GIL is the problem in the first place.
+(This includes cases where the program can easily be modified to run
+in multiple threads; often, we don't consider doing that precisely
+because of the GIL.)
+
+However, there are plenty of cases where the GIL is not the problem.
+Do not hope ``pypy-stm`` to be helpful in these cases!  This includes
+all programs that use multiple threads but don't actually spend a lot
+of time running Python code.  For example, it may be spending all its
+time waiting for I/O to occur, or performing some long computation on
+a huge matrix.  These are cases where the CPU is either idle, or in
+some C/Fortran library anyway; in both cases, the interpreter (either
+CPython or the regular PyPy) should release the GIL around the
+external calls.  The threads will thus not end up fighting for the
+GIL.
+
 
 
 Getting Started
@@ -63,9 +84,10 @@
 
 Development is done in the branch `stmgc-c7`_.  If you are only
 interested in trying it out, you can download a Ubuntu binary here__
-(``pypy-stm-2.3*.tar.bz2``, Ubuntu 12.04-14.04).  The current version
+(``pypy-stm-2.*.tar.bz2``, for Ubuntu 12.04-14.04).  The current version
 supports four "segments", which means that it will run up to four
-threads in parallel.
+threads in parallel.  (Development recently switched to `stmgc-c8`_,
+but that is not ready for trying out yet.)
 
 To build a version from sources, you first need to compile a custom
 version of clang(!); we recommend downloading `llvm and clang like
@@ -78,6 +100,7 @@
    rpython/bin/rpython -Ojit --stm pypy/goal/targetpypystandalone.py
 
 .. _`stmgc-c7`: https://bitbucket.org/pypy/pypy/src/stmgc-c7/
+.. _`stmgc-c8`: https://bitbucket.org/pypy/pypy/src/stmgc-c8/
 .. __: https://bitbucket.org/pypy/pypy/downloads/
 .. __: http://clang.llvm.org/get_started.html
 .. __: https://bitbucket.org/pypy/stmgc/src/default/c7/llvmfix/
@@ -85,54 +108,72 @@
 
 .. _caveats:
 
-Current status
---------------
+Current status (stmgc-c7)
+-------------------------
 
-* So far, small examples work fine, but there are still a few bugs.
-  We're busy fixing them as we find them; feel free to `report bugs`_.
+* **NEW:** It seems to work fine, without crashing any more.  Please `report
+  any crash`_ you find (or other bugs).
 
 * It runs with an overhead as low as 20% on examples like "richards".
   There are also other examples with higher overheads --currently up to
   2x for "translate.py"-- which we are still trying to understand.
   One suspect is our partial GC implementation, see below.
 
+* **NEW:** the ``PYPYSTM`` environment variable and the
+  ``pypy/stm/print_stm_log.py`` script let you know exactly which
+  "conflicts" occurred.  This is described in the section
+  `transaction.TransactionQueue`_ below.
+
+* **NEW:** special transaction-friendly APIs (like ``stmdict``),
+  described in the section `transaction.TransactionQueue`_ below.  The
+  old API changed again, mostly moving to different modules.  Sorry
+  about that.  I feel it's a better idea to change the API early
+  instead of being stuck with a bad one later...
+
 * Currently limited to 1.5 GB of RAM (this is just a parameter in
-  `core.h`__).  Memory overflows are not correctly handled; they cause
-  segfaults.
+  `core.h`__ -- theoretically.  In practice, increase it too much and
+  clang crashes again).  Memory overflows are not correctly handled;
+  they cause segfaults.
 
-* The JIT warm-up time improved recently but is still bad.  In order to
-  produce machine code, the JIT needs to enter a special single-threaded
-  mode for now.  This means that you will get bad performance results if
-  your program doesn't run for several seconds, where *several* can mean
-  *many.*  When trying benchmarks, be sure to check that you have
-  reached the warmed state, i.e. the performance is not improving any
-  more.  This should be clear from the fact that as long as it's
-  producing more machine code, ``pypy-stm`` will run on a single core.
+* **NEW:** The JIT warm-up time improved again, but is still
+  relatively large.  In order to produce machine code, the JIT needs
+  to enter "inevitable" mode.  This means that you will get bad
+  performance results if your program doesn't run for several seconds,
+  where *several* can mean *many.* When trying benchmarks, be sure to
+  check that you have reached the warmed state, i.e. the performance
+  is not improving any more.
 
 * The GC is new; although clearly inspired by PyPy's regular GC, it
   misses a number of optimizations for now.  Programs allocating large
   numbers of small objects that don't immediately die (surely a common
-  situation) suffer from these missing optimizations.
+  situation) suffer from these missing optimizations.  (The bleeding
+  edge ``stmgc-c8`` is better at that.)
 
-* The GC has no support for destructors: the ``__del__`` method is never
-  called (including on file objects, which won't be closed for you).
-  This is of course temporary.  Also, weakrefs might appear to work a
-  bit strangely for now (staying alive even though ``gc.collect()``, or
-  even dying but then un-dying for a short time before dying again).
+* Weakrefs might appear to work a bit strangely for now, sometimes
+  staying alive through ``gc.collect()``, or even dying but then
+  un-dying for a short time before dying again.  A similar problem can
+  show up occasionally elsewhere with accesses to some external
+  resources, where the (apparent) serialized order doesn't match the
+  underlying (multithreading) order.  These are bugs (partially fixed
+  already in ``stmgc-c8``).  Also, debugging helpers like
+  ``weakref.getweakrefcount()`` might give wrong answers.
 
 * The STM system is based on very efficient read/write barriers, which
   are mostly done (their placement could be improved a bit in
-  JIT-generated machine code).  But the overall bookkeeping logic could
-  see more improvements (see `Low-level statistics`_ below).
+  JIT-generated machine code).
 
 * Forking the process is slow because the complete memory needs to be
   copied manually.  A warning is printed to this effect.
 
 * Very long-running processes (on the order of days) will eventually
   crash on an assertion error because of a non-implemented overflow of
-  an internal 29-bit number.
+  an internal 28-bit counter.
 
-.. _`report bugs`: https://bugs.pypy.org/
+* The recursion detection code was not reimplemented.  Infinite
+  recursion just segfaults for now.
+
+
+.. _`report any crash`: https://bitbucket.org/pypy/pypy/issues?status=new&status=open
 .. __: https://bitbucket.org/pypy/pypy/raw/stmgc-c7/rpython/translator/stm/src_stm/stm/core.h
 
 
@@ -155,10 +196,41 @@
 interpreter and other ones might have slightly different needs.
 
 
-
 User Guide
 ==========
 
+How to write multithreaded programs: the 10'000-feet view
+---------------------------------------------------------
+
+PyPy-STM offers two ways to write multithreaded programs:
+
+* the traditional way, using the ``thread`` or ``threading`` modules,
+  described first__.
+
+* using ``TransactionQueue``, described next__, as a way to hide the
+  low-level notion of threads.
+
+.. __: `Drop-in replacement`_
+.. __: `transaction.TransactionQueue`_
+
+The issues with low-level threads are well known (particularly in other
+languages that don't have GIL-based interpreters): memory corruption,
+deadlocks, livelocks, and so on.  There are alternative approaches to
+dealing directly with threads, like OpenMP_.  These approaches
+typically enforce some structure on your code.  ``TransactionQueue``
+is in part similar: your program needs to have "some chances" of
+parallelization before you can apply it.  But I believe that the scope
+of applicability is much larger with ``TransactionQueue`` than with
+other approaches.  It usually works without forcing a complete
+reorganization of your existing code, and it works on any Python
+program which has got *latent* and *imperfect* parallelism.  Ideally,
+it only requires that the end programmer identifies where this
+parallelism is likely to be found, and communicates it to the system
+using a simple API.
+
+.. _OpenMP: http://en.wikipedia.org/wiki/OpenMP
+
+
 Drop-in replacement
 -------------------
 
@@ -175,29 +247,168 @@
 
 This works by internally considering the points where a standard PyPy or
 CPython would release the GIL, and replacing them with the boundaries of
-"transaction".  Like their database equivalent, multiple transactions
+"transactions".  Like their database equivalent, multiple transactions
 can execute in parallel, but will commit in some serial order.  They
 appear to behave as if they were completely run in this serialization
 order.
 
 
+transaction.TransactionQueue
+----------------------------
+
+In CPU-hungry programs, we can often easily identify outermost loops
+over some data structure, or other repetitive algorithm, where each
+"block" consists of processing a non-trivial amount of data, and where
+the blocks "have a good chance" to be independent from each other.  We
+don't need to prove that they are actually independent: it is enough
+if they are *often independent* --- or, more precisely, if we *think
+they should be* often independent.
+
+One typical example would look like this, where the function ``func()``
+typically invokes a large amount of code::
+
+    for key, value in bigdict.items():
+        func(key, value)
+
+Then you simply replace the loop with::
+
+    from transaction import TransactionQueue
+
+    tr = TransactionQueue()
+    for key, value in bigdict.items():
+        tr.add(func, key, value)
+    tr.run()
+
+This code's behavior is equivalent.  Internally, the
+``TransactionQueue`` object will start N threads and try to run the
+``func(key, value)`` calls on all threads in parallel.  But note the
+difference with a regular thread-pooling library, as found in many
+lower-level languages than Python: the function calls are not randomly
+interleaved with each other just because they run in parallel.  The
+behavior did not change because we are using ``TransactionQueue``.
+All the calls still *appear* to execute in some serial order.
+
+A typical usage of ``TransactionQueue`` goes like that: at first,
+the performance does not increase.
+In fact, it is likely to be worse.  Typically, this is
+indicated by the total CPU usage, which remains low (closer to 1 than
+N cores).  First note that it is expected that the CPU usage should
+not go much higher than 1 in the JIT warm-up phase: you must run a
+program for several seconds, or for larger programs at least one
+minute, to give the JIT a chance to warm up enough.  But if CPU usage
+remains low even afterwards, then the ``PYPYSTM`` environment variable
+can be used to track what is going on.
+
+Run your program with ``PYPYSTM=logfile`` to produce a log file called
+``logfile``.  Afterwards, use the ``pypy/stm/print_stm_log.py``
+utility to inspect the content of this log file.  It produces output
+like this (sorted by amount of time lost, largest first)::
+
+    10.5s lost in aborts, 1.25s paused (12412x STM_CONTENTION_WRITE_WRITE)
+    File "foo.py", line 10, in f
+      someobj.stuff = 5
+    File "bar.py", line 20, in g
+      someobj.other = 10
+
+This means that 10.5 seconds were lost running transactions that were
+aborted (which caused another 1.25 seconds of lost time by pausing),
+because of the reason shown in the two independent single-entry
+tracebacks: one thread ran the line ``someobj.stuff = 5``, whereas
+another thread concurrently ran the line ``someobj.other = 10`` on the
+same object.  These two writes are done to the same object.  This
+causes a conflict, which aborts one of the two transactions.  In the
+example above this occurred 12412 times.
+
+The two other conflict sources are ``STM_CONTENTION_INEVITABLE``,
+which means that two transactions both tried to do an external
+operation, like printing or reading from a socket or accessing an
+external array of raw data; and ``STM_CONTENTION_WRITE_READ``, which
+means that one transaction wrote to an object but the other one merely
+read it, not wrote to it (in that case only the writing transaction is
+reported; the location for the reads is not recorded because doing so
+is not possible without a very large performance impact).
+
+Common causes of conflicts:
+
+* First of all, any I/O or raw manipulation of memory turns the
+  transaction inevitable ("must not abort").  There can be only one
+  inevitable transaction running at any time.  A common case is if
+  each transaction starts with sending data to a log file.  You should
+  refactor this case so that it occurs either near the end of the
+  transaction (which can then mostly run in non-inevitable mode), or
+  delegate it to a separate transaction or even a separate thread.
+
+* Writing to a list or a dictionary conflicts with any read from the
+  same list or dictionary, even one done with a different key.  For
+  dictionaries and sets, you can try the types ``transaction.stmdict``
+  and ``transaction.stmset``, which behave mostly like ``dict`` and
+  ``set`` but allow concurrent access to different keys.  (What is
+  missing from them so far is lazy iteration: for example,
+  ``stmdict.iterkeys()`` is implemented as ``iter(stmdict.keys())``;
+  and, unlike PyPy's dictionaries and sets, the STM versions are not
+  ordered.)  There are also experimental ``stmiddict`` and
+  ``stmidset`` classes using the identity of the key.
+
+* ``time.time()`` and ``time.clock()`` turn the transaction inevitable
+  in order to guarantee that a call that appears to be later will really
+  return a higher number.  If getting slightly unordered results is
+  fine, use ``transaction.time()`` or ``transaction.clock()``.  The
+  latter operations guarantee to return increasing results only if you
+  can "prove" that two calls occurred in a specific order (for example
+  because they are both called by the same thread).  In cases where no
+  such proof is possible, you might get randomly interleaved values.
+  (If you have two independent transactions, they normally behave as if
+  one of them was fully executed before the other; but using
+  ``transaction.time()`` you might see the "hidden truth" that they are
+  actually interleaved.)
+
+* ``transaction.threadlocalproperty`` can be used at class-level::
+
+      class Foo(object):     # must be a new-style class!
+          x = transaction.threadlocalproperty()
+          y = transaction.threadlocalproperty(dict)
+
+  This declares that instances of ``Foo`` have two attributes ``x``
+  and ``y`` that are thread-local: reading or writing them from
+  concurrently-running transactions will return independent results.
+  (Any other attributes of ``Foo`` instances will be globally visible
+  from all threads, as usual.)  The optional argument to
+  ``threadlocalproperty()`` is the default value factory: in case no
+  value was assigned in the current thread yet, the factory is called
+  and its result becomes the value in that thread (like
+  ``collections.defaultdict``).  If no default value factory is
+  specified, uninitialized reads raise ``AttributeError``.  Note that
+  with ``TransactionQueue`` you get a pool of a fixed number of
+  threads, each running the transactions one after the other; such
+  thread-local properties will have the value last stored in them in
+  the same thread, which may come from a random previous transaction.
+  This means that ``threadlocalproperty`` is useful mainly to avoid
+  conflicts from cache-like data structures.
+
+Note that Python is a complicated language; there are a number of less
+common cases that may cause conflict (of any kind) where we might not
+expect it a priori.  In many of these cases it could be fixed; please
+report any case that you don't understand.
+
+
 Atomic sections
 ---------------
 
-PyPy supports *atomic sections,* which are blocks of code which you want
-to execute without "releasing the GIL".  *This is experimental and may
-be removed in the future.*  In STM terms, this means blocks of code that
-are executed while guaranteeing that the transaction is not interrupted
-in the middle.
+The ``TransactionQueue`` class described above is based on *atomic
+sections,* which are blocks of code which you want to execute without
+"releasing the GIL".  In STM terms, this means blocks of code that are
+executed while guaranteeing that the transaction is not interrupted in
+the middle.  *This is experimental and may be removed in the future*
+if `Software lock elision`_ is ever implemented.
 
-Here is a usage example::
+Here is a direct usage example::
 
-    with __pypy__.thread.atomic:
+    with transaction.atomic:
         assert len(lst1) == 10
         x = lst1.pop(0)
         lst1.append(x)
 
-In this (bad) example, we are sure that the item popped off one end of
+In this example, we are sure that the item popped off one end of
 the list is appended again at the other end atomically.  It means that
 another thread can run ``len(lst1)`` or ``x in lst1`` without any
 particular synchronization, and always see the same results,
@@ -221,25 +432,27 @@
 it likely that such a piece of code will eventually block all other
 threads anyway.
 
-Note that if you want to experiment with ``atomic``, you may have to add
-manually a transaction break just before the atomic block.  This is
+Note that if you want to experiment with ``atomic``, you may have to
+manually add a transaction break just before the atomic block.  This is
 because the boundaries of the block are not guaranteed to be the
 boundaries of the transaction: the latter is at least as big as the
-block, but maybe bigger.  Therefore, if you run a big atomic block, it
+block, but may be bigger.  Therefore, if you run a big atomic block, it
 is a good idea to break the transaction just before.  This can be done
-e.g. by the hack of calling ``time.sleep(0)``.  (This may be fixed at
+by calling ``transaction.hint_commit_soon()``.  (This may be fixed at
 some point.)
 
-There are also issues with the interaction of locks and atomic blocks.
-This can be seen if you write to files (which have locks), including
-with a ``print`` to standard output.  If one thread tries to acquire a
-lock while running in an atomic block, and another thread has got the
-same lock, then the former may fail with a ``thread.error``.  The reason
-is that "waiting" for some condition to become true --while running in
-an atomic block-- does not really make sense.  For now you can work
-around it by making sure that, say, all your prints are either in an
-``atomic`` block or none of them are.  (This kind of issue is
-theoretically hard to solve.)
+There are also issues with the interaction of regular locks and atomic
+blocks.  This can be seen if you write to files (which have locks),
+including with a ``print`` to standard output.  If one thread tries to
+acquire a lock while running in an atomic block, and another thread
+has got the same lock at that point, then the former may fail with a
+``thread.error``.  (Don't rely on it; it may also deadlock.)
+The reason is that "waiting" for some condition to
+become true --while running in an atomic block-- does not really make
+sense.  For now you can work around it by making sure that, say, all
+your prints are either in an ``atomic`` block or none of them are.
+(This kind of issue is theoretically hard to solve and may be the
+reason for atomic block support to eventually be removed.)
 
 
 Locks
@@ -293,106 +506,38 @@
 .. _`software lock elision`: https://www.repository.cam.ac.uk/handle/1810/239410
 
 
-Atomic sections, Transactions, etc.: a better way to write parallel programs
-----------------------------------------------------------------------------
+Miscellaneous functions
+-----------------------
 
-(This section is based on locks as we plan to implement them, but also
-works with the existing atomic sections.)
-
-In the cases where elision works, the block of code can run in parallel
-with other blocks of code *even if they are protected by the same lock.*
-You still get the illusion that the blocks are run sequentially.  This
-works even for multiple threads that run each a series of such blocks
-and nothing else, protected by one single global lock.  This is
-basically the Python application-level equivalent of what was done with
-the interpreter in ``pypy-stm``: while you think you are writing
-thread-unfriendly code because of this global lock, actually the
-underlying system is able to make it run on multiple cores anyway.
-
-This capability can be hidden in a library or in the framework you use;
-the end user's code does not need to be explicitly aware of using
-threads.  For a simple example of this, there is `transaction.py`_ in
-``lib_pypy``.  The idea is that you write, or already have, some program
-where the function ``f(key, value)`` runs on every item of some big
-dictionary, say::
-
-    for key, value in bigdict.items():
-        f(key, value)
-
-Then you simply replace the loop with::
-
-    for key, value in bigdict.items():
-        transaction.add(f, key, value)
-    transaction.run()
-
-This code runs the various calls to ``f(key, value)`` using a thread
-pool, but every single call is executed under the protection of a unique
-lock.  The end result is that the behavior is exactly equivalent --- in
-fact it makes little sense to do it in this way on a non-STM PyPy or on
-CPython.  But on ``pypy-stm``, the various locked calls to ``f(key,
-value)`` can tentatively be executed in parallel, even if the observable
-result is as if they were executed in some serial order.
-
-This approach hides the notion of threads from the end programmer,
-including all the hard multithreading-related issues.  This is not the
-first alternative approach to explicit threads; for example, OpenMP_ is
-one.  However, it is one of the first ones which does not require the
-code to be organized in a particular fashion.  Instead, it works on any
-Python program which has got latent, imperfect parallelism.  Ideally, it
-only requires that the end programmer identifies where this parallelism
-is likely to be found, and communicates it to the system, using for
-example the ``transaction.add()`` scheme.
-
-.. _`transaction.py`: https://bitbucket.org/pypy/pypy/raw/stmgc-c7/lib_pypy/transaction.py
-.. _OpenMP: http://en.wikipedia.org/wiki/OpenMP
-
-
-.. _`transactional_memory`:
-
-API of transactional_memory
----------------------------
-
-The new pure Python module ``transactional_memory`` runs on both CPython
-and PyPy, both with and without STM.  It contains:
-
-* ``getsegmentlimit()``: return the number of "segments" in
+* ``transaction.getsegmentlimit()``: return the number of "segments" in
   this pypy-stm.  This is the limit above which more threads will not be
   able to execute on more cores.  (Right now it is limited to 4 due to
   inter-segment overhead, but should be increased in the future.  It
   should also be settable, and the default value should depend on the
   number of actual CPUs.)  If STM is not available, this returns 1.
 
-* ``print_abort_info(minimum_time=0.0)``: debugging help.  Each thread
-  remembers the longest abort or pause it did because of cross-thread
-  contention_.  This function prints it to ``stderr`` if the time lost
-  is greater than ``minimum_time`` seconds.  The record is then
-  cleared, to make it ready for new events.  This function returns
-  ``True`` if it printed a report, and ``False`` otherwise.
+* ``__pypy__.thread.signals_enabled``: a context manager that runs its
+  block of code with signals enabled.  By default, signals are only
+  enabled in the main thread; a non-main thread will not receive
+  signals (this is like CPython).  Enabling signals in non-main
+  threads is useful for libraries where threads are hidden and the end
+  user is not expecting his code to run elsewhere than in the main
+  thread.
 
+* ``pypystm.exclusive_atomic``: a context manager similar to
+  ``transaction.atomic`` but which complains if it is nested.
 
-API of __pypy__.thread
-----------------------
+* ``transaction.is_atomic()``: return True if called from an atomic
+  context.
 
-The ``__pypy__.thread`` submodule is a built-in module of PyPy that
-contains a few internal built-in functions used by the
-``transactional_memory`` module, plus the following:
+* ``pypystm.count()``: return a different positive integer every time
+  it is called.  This works without generating conflicts.  The
+  returned integers are only roughly in increasing order; this should
+  not be relied upon.
 
-* ``__pypy__.thread.atomic``: a context manager to run a block in
-  fully atomic mode, without "releasing the GIL".  (May be eventually
-  removed?)
 
-* ``__pypy__.thread.signals_enabled``: a context manager that runs its
-  block with signals enabled.  By default, signals are only enabled in
-  the main thread; a non-main thread will not receive signals (this is
-  like CPython).  Enabling signals in non-main threads is useful for
-  libraries where threads are hidden and the end user is not expecting
-  his code to run elsewhere than in the main thread.
-
-
-.. _contention:
-
-Conflicts
----------
+More details about conflicts
+----------------------------
 
 Based on Software Transactional Memory, the ``pypy-stm`` solution is
 prone to "conflicts".  To repeat the basic idea, threads execute their code
@@ -408,25 +553,26 @@
 the transaction).  If this occurs too often, parallelization fails.
 
 How much actual parallelization a multithreaded program can see is a bit
-subtle.  Basically, a program not using ``__pypy__.thread.atomic`` or
+subtle.  Basically, a program not using ``transaction.atomic`` or
 eliding locks, or doing so for very short amounts of time, will
 parallelize almost freely (as long as it's not some artificial example
 where, say, all threads try to increase the same global counter and do
 nothing else).
 
-However, using if the program requires longer transactions, it comes
+However, if the program requires longer transactions, it comes
 with less obvious rules.  The exact details may vary from version to
 version, too, until they are a bit more stabilized.  Here is an
 overview.
 
 Parallelization works as long as two principles are respected.  The
-first one is that the transactions must not *conflict* with each other.
-The most obvious sources of conflicts are threads that all increment a
-global shared counter, or that all store the result of their
-computations into the same list --- or, more subtly, that all ``pop()``
-the work to do from the same list, because that is also a mutation of
-the list.  (It is expected that some STM-aware library will eventually
-be designed to help with conflict problems, like a STM-aware queue.)
+first one is that the transactions must not *conflict* with each
+other.  The most obvious sources of conflicts are threads that all
+increment a global shared counter, or that all store the result of
+their computations into the same list --- or, more subtly, that all
+``pop()`` the work to do from the same list, because that is also a
+mutation of the list.  (You can work around it with
+``transaction.stmdict``, but for that specific example, some STM-aware
+queue should eventually be designed.)
 
 A conflict occurs as follows: when a transaction commits (i.e. finishes
 successfully) it may cause other transactions that are still in progress
@@ -442,22 +588,23 @@
 Another issue is that of avoiding long-running so-called "inevitable"
 transactions ("inevitable" is taken in the sense of "which cannot be
 avoided", i.e. transactions which cannot abort any more).  Transactions
-like that should only occur if you use ``__pypy__.thread.atomic``,
-generally become of I/O in atomic blocks.  They work, but the
+like that should only occur if you use ``atomic``,
+generally because of I/O in atomic blocks.  They work, but the
 transaction is turned inevitable before the I/O is performed.  For all
 the remaining execution time of the atomic block, they will impede
 parallel work.  The best is to organize the code so that such operations
-are done completely outside ``__pypy__.thread.atomic``.
+are done completely outside ``atomic``.
 
-(This is related to the fact that blocking I/O operations are
+(This is not unrelated to the fact that blocking I/O operations are
 discouraged with Twisted, and if you really need them, you should do
 them on their own separate thread.)
 
-In case of lock elision, we don't get long-running inevitable
-transactions, but a different problem can occur: doing I/O cancels lock
-elision, and the lock turns into a real lock, preventing other threads
-from committing if they also need this lock.  (More about it when lock
-elision is implemented and tested.)
+In case lock elision eventually replaces atomic sections, we wouldn't
+get long-running inevitable transactions, but the same problem occurs
+in a different way: doing I/O cancels lock elision, and the lock turns
+into a real lock.  This prevents other threads from committing if they
+also need this lock.  (More about it when lock elision is implemented
+and tested.)
 
 
 
@@ -467,56 +614,30 @@
 XXX this section mostly empty for now
 
 
-Low-level statistics
---------------------
+Technical reports
+-----------------
 
-When a non-main thread finishes, you get low-level statistics printed to
-stderr, looking like that::
+STMGC-C7 is described in detail in a `technical report`__.
 
-      thread 0x7f73377fe600:
-          outside transaction          42182    0.506 s
-          run current                  85466    0.000 s
-          run committed                34262    3.178 s
-          run aborted write write       6982    0.083 s
-          run aborted write read         550    0.005 s
-          run aborted inevitable         388    0.010 s
-          run aborted other                0    0.000 s
-          wait free segment                0    0.000 s
-          wait write read                 78    0.027 s
-          wait inevitable                887    0.490 s
-          wait other                       0    0.000 s
-          sync commit soon                 1    0.000 s
-          bookkeeping                  51418    0.606 s
-          minor gc                    162970    1.135 s
-          major gc                         1    0.019 s
-          sync pause                   59173    1.738 s
-          longest recordered marker          0.000826 s
-          "File "x.py", line 5, in f"
+A separate `position paper`__ gives an overview of our position about
+STM in general.
 
-On each line, the first number is a counter, and the second number gives
-the associated time --- the amount of real time that the thread was in
-this state.  The sum of all the times should be equal to the total time
-between the thread's start and the thread's end.  The most important
-points are "run committed", which gives the amount of useful work, and
-"outside transaction", which should give the time spent e.g. in library
-calls (right now it seems to be larger than that; to investigate).  The
-various "run aborted" and "wait" entries are time lost due to
-conflicts_.  Everything else is overhead of various forms.  (Short-,
-medium- and long-term future work involves reducing this overhead :-)
-
-The last two lines are special; they are an internal marker read by
-``transactional_memory.print_abort_info()``.
+.. __: http://bitbucket.org/pypy/extradoc/src/extradoc/talk/dls2014/paper/paper.pdf
+.. __: http://bitbucket.org/pypy/extradoc/src/extradoc/talk/icooolps2014/
 
 
 Reference to implementation details
 -----------------------------------
 
-The core of the implementation is in a separate C library called stmgc_,
-in the c7_ subdirectory.  Please see the `README.txt`_ for more
-information.  In particular, the notion of segment is discussed there.
+The core of the implementation is in a separate C library called
+stmgc_, in the c7_ subdirectory (current version of pypy-stm) and in
+the c8_ subdirectory (bleeding edge version).  Please see the
+`README.txt`_ for more information.  In particular, the notion of
+segment is discussed there.
 
 .. _stmgc: https://bitbucket.org/pypy/stmgc/src/default/
 .. _c7: https://bitbucket.org/pypy/stmgc/src/default/c7/
+.. _c8: https://bitbucket.org/pypy/stmgc/src/default/c8/
 .. _`README.txt`: https://bitbucket.org/pypy/stmgc/raw/default/c7/README.txt
 
 PyPy itself adds on top of it the automatic placement of read__ and write__
diff --git a/pypy/doc/whatsnew-2.5.0.rst b/pypy/doc/whatsnew-2.5.0.rst
--- a/pypy/doc/whatsnew-2.5.0.rst
+++ b/pypy/doc/whatsnew-2.5.0.rst
@@ -1,6 +1,6 @@
-=======================
-What's new in PyPy 2.5
-=======================
+========================
+What's new in PyPy 2.5.0
+========================
 
 .. this is a revision shortly after release-2.4.x
 .. startrev: 7026746cbb1b
diff --git a/pypy/doc/whatsnew-2.5.1.rst b/pypy/doc/whatsnew-2.5.1.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/whatsnew-2.5.1.rst
@@ -0,0 +1,47 @@
+========================
+What's new in PyPy 2.5.1
+========================
+
+.. this is a revision shortly after release-2.5.0
+.. startrev: 397b96217b85
+
+
+Non-blocking file reads sometimes raised EAGAIN even though they
+had buffered data waiting, fixed in b1c4fcb04a42
+
+Fix a bug in cpyext in multithreded programs acquiring/releasing the GIL
+
+.. branch: vmprof
+
+.. branch: stackroot-speedup-2
+
+Avoid tracing all stack roots during repeated minor collections,
+by ignoring the part of the stack that didn't change
+
+.. branch: stdlib-2.7.9
+
+Update stdlib to version 2.7.9
+
+.. branch: fix-kqueue-error2
+
+Fix exception being raised by kqueue.control (CPython compatibility)
+
+.. branch: gitignore
+
+.. branch: framestate2
+
+Refactor rpython.flowspace.framestate.FrameState.
+
+.. branch: alt_errno
+
+Add an alternative location to save LastError, errno around ctypes,
+cffi external calls so things like pdb will not overwrite it
+
+.. branch: nonquadratic-heapcache
+
+Speed up the warmup times of the JIT by removing a quadratic algorithm in the
+heapcache.
+
+.. branch: online-transforms-2
+
+Simplify flow graphs on the fly during annotation phase.
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -2,35 +2,6 @@
 What's new in PyPy 2.5+
 =======================
 
-.. this is a revision shortly after release-2.5.x
+.. this is a revision shortly after release-2.5.1
 .. startrev: 397b96217b85
 
-
-Non-blocking file reads sometimes raised EAGAIN even though they
-had buffered data waiting, fixed in b1c4fcb04a42
-
-
-.. branch: vmprof
-
-.. branch: stackroot-speedup-2
-Avoid tracing all stack roots during repeated minor collections,
-by ignoring the part of the stack that didn't change
-
-.. branch: stdlib-2.7.9
-Update stdlib to version 2.7.9
-
-.. branch: fix-kqueue-error2
-Fix exception being raised by kqueue.control (CPython compatibility)
-
-.. branch: gitignore
-
-.. branch: framestate2
-Refactor rpython.flowspace.framestate.FrameState.
-
-.. branch: alt_errno
-Add an alternative location to save LastError, errno around ctypes,
-cffi external calls so things like pdb will not overwrite it
-
-.. branch: nonquadratic-heapcache
-Speed up the warmup times of the JIT by removing a quadratic algorithm in the
-heapcache.
diff --git a/pypy/goal/getnightly.py b/pypy/goal/getnightly.py
--- a/pypy/goal/getnightly.py
+++ b/pypy/goal/getnightly.py
@@ -7,7 +7,7 @@
 if sys.platform.startswith('linux'):
     arch = 'linux'
     cmd = 'wget "%s"'
-    tar = "tar -x -v --wildcards --strip-components=2 -f %s '*/bin/pypy'"
+    tar = "tar -x -v --wildcards --strip-components=2 -f %s '*/bin/pypy' '*/bin/libpypy-c.so'"
     if os.uname()[-1].startswith('arm'):
         arch += '-armhf-raspbian'
 elif sys.platform.startswith('darwin'):
diff --git a/pypy/interpreter/astcompiler/assemble.py b/pypy/interpreter/astcompiler/assemble.py
--- a/pypy/interpreter/astcompiler/assemble.py
+++ b/pypy/interpreter/astcompiler/assemble.py
@@ -648,7 +648,7 @@
 
 
 def _compute_UNPACK_SEQUENCE(arg):
-    return arg + 1
+    return arg - 1
 
 def _compute_DUP_TOPX(arg):
     return arg
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -759,6 +759,19 @@
         """
         self.simple_test(source, 'l', [1, 2])
 
+    def test_unpack_wrong_stackeffect(self):
+        source = """if 1:
+        l = [1, 2]
+        a, b = l
+        a, b = l
+        a, b = l
+        a, b = l
+        a, b = l
+        a, b = l
+        """
+        code = compile_with_astcompiler(source, 'exec', self.space)
+        assert code.co_stacksize == 2
+
     def test_lambda(self):
         yield self.st, "y = lambda x: x", "y(4)", 4
 
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -4,7 +4,7 @@
 The bytecode interpreter itself is implemented by the PyFrame class.
 """
 
-import dis, imp, struct, types, new, sys
+import dis, imp, struct, types, new, sys, os
 
 from pypy.interpreter import eval
 from pypy.interpreter.signature import Signature
@@ -128,6 +128,17 @@
         if (self.magic == cpython_magic and
             '__pypy__' not in sys.builtin_module_names):
             raise Exception("CPython host codes should not be rendered")
+        # When translating PyPy, freeze the file name
+        #     <builtin>/lastdirname/basename.py
+        # instead of freezing the complete translation-time path.
+        filename = self.co_filename.lstrip('<').rstrip('>')
+        if filename.lower().endswith('.pyc'):
+            filename = filename[:-1]
+        basename = os.path.basename(filename)
+        lastdirname = os.path.basename(os.path.dirname(filename))
+        if lastdirname:
+            basename = '%s/%s' % (lastdirname, basename)
+        self.co_filename = '<builtin>/%s' % (basename,)
 
     co_names = property(lambda self: [self.space.unwrap(w_name) for w_name in self.co_names_w]) # for trace
 
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -1619,6 +1619,13 @@
     def prepare_exec(f, prog, globals, locals, compile_flags, builtin, codetype):
         """Manipulate parameters to exec statement to (codeobject, dict, dict).
         """
+        if (globals is None and locals is None and
+            isinstance(prog, tuple) and
+            (len(prog) == 2 or len(prog) == 3)):
+            globals = prog[1]
+            if len(prog) == 3:
+                locals = prog[2]
+            prog = prog[0]
         if globals is None:
             globals = f.f_globals
             if locals is None:
diff --git a/pypy/interpreter/test/test_exec.py b/pypy/interpreter/test/test_exec.py
--- a/pypy/interpreter/test/test_exec.py
+++ b/pypy/interpreter/test/test_exec.py
@@ -262,3 +262,11 @@
         """]
         for c in code:
             compile(c, "<code>", "exec")
+
+    def test_exec_tuple(self):
+        # note: this is VERY different than testing exec("a = 42", d), because
+        # this specific case is handled specially by the AST compiler
+        d = {}
+        x = ("a = 42", d)
+        exec x
+        assert d['a'] == 42
diff --git a/pypy/module/_cffi_backend/__init__.py b/pypy/module/_cffi_backend/__init__.py
--- a/pypy/module/_cffi_backend/__init__.py
+++ b/pypy/module/_cffi_backend/__init__.py
@@ -2,13 +2,15 @@
 from pypy.interpreter.mixedmodule import MixedModule
 from rpython.rlib import rdynload
 
+VERSION = "0.9.2"
+
 
 class Module(MixedModule):
 
     appleveldefs = {
         }
     interpleveldefs = {
-        '__version__': 'space.wrap("0.9.0")',
+        '__version__': 'space.wrap("%s")' % VERSION,
 
         'load_library': 'libraryobj.load_library',
 
diff --git a/pypy/module/_cffi_backend/test/_backend_test_c.py b/pypy/module/_cffi_backend/test/_backend_test_c.py
--- a/pypy/module/_cffi_backend/test/_backend_test_c.py
+++ b/pypy/module/_cffi_backend/test/_backend_test_c.py
@@ -3247,6 +3247,88 @@
     cast(p, c)[1] += 500
     assert list(a) == [10000, 20500, 30000]
 
+def test_from_buffer_not_str_unicode_bytearray():
+    BChar = new_primitive_type("char")
+    BCharP = new_pointer_type(BChar)
+    BCharA = new_array_type(BCharP, None)
+    py.test.raises(TypeError, from_buffer, BCharA, b"foo")
+    py.test.raises(TypeError, from_buffer, BCharA, u"foo")
+    py.test.raises(TypeError, from_buffer, BCharA, bytearray(b"foo"))
+    try:
+        from __builtin__ import buffer
+    except ImportError:
+        pass
+    else:
+        py.test.raises(TypeError, from_buffer, BCharA, buffer(b"foo"))
+        py.test.raises(TypeError, from_buffer, BCharA, buffer(u"foo"))
+        py.test.raises(TypeError, from_buffer, BCharA,
+                       buffer(bytearray(b"foo")))
+    try:
+        from __builtin__ import memoryview
+    except ImportError:
+        pass
+    else:
+        py.test.raises(TypeError, from_buffer, BCharA, memoryview(b"foo"))
+        py.test.raises(TypeError, from_buffer, BCharA,
+                       memoryview(bytearray(b"foo")))
+
+def test_from_buffer_more_cases():
+    try:
+        from _cffi_backend import _testbuff
+    except ImportError:
+        py.test.skip("not for pypy")
+    BChar = new_primitive_type("char")
+    BCharP = new_pointer_type(BChar)
+    BCharA = new_array_type(BCharP, None)
+    #
+    def check1(bufobj, expected):
+        c = from_buffer(BCharA, bufobj)
+        assert typeof(c) is BCharA
+        if sys.version_info >= (3,):
+            expected = [bytes(c, "ascii") for c in expected]
+        assert list(c) == list(expected)
+    #
+    def check(methods, expected, expected_for_memoryview=None):
+        if sys.version_info >= (3,):
+            if methods <= 7:
+                return
+            if expected_for_memoryview is not None:
+                expected = expected_for_memoryview
+        class X(object):
+            pass
+        _testbuff(X, methods)
+        bufobj = X()
+        check1(bufobj, expected)
+        try:
+            from __builtin__ import buffer
+            bufobjb = buffer(bufobj)
+        except (TypeError, ImportError):
+            pass
+        else:
+            check1(bufobjb, expected)
+        try:
+            bufobjm = memoryview(bufobj)
+        except (TypeError, NameError):
+            pass
+        else:
+            check1(bufobjm, expected_for_memoryview or expected)
+    #
+    check(1, "RDB")
+    check(2, "WRB")
+    check(4, "CHB")
+    check(8, "GTB")
+    check(16, "ROB")
+    #
+    check(1 | 2,  "RDB")
+    check(1 | 4,  "RDB")
+    check(2 | 4,  "CHB")
+    check(1 | 8,  "RDB", "GTB")
+    check(1 | 16, "RDB", "ROB")
+    check(2 | 8,  "WRB", "GTB")
+    check(2 | 16, "WRB", "ROB")
+    check(4 | 8,  "CHB", "GTB")
+    check(4 | 16, "CHB", "ROB")
+
 def test_version():
     # this test is here mostly for PyPy
-    assert __version__ == "0.9.0"
+    assert __version__ == "0.9.2"
diff --git a/pypy/module/_cffi_backend/test/test_file.py b/pypy/module/_cffi_backend/test/test_file.py
--- a/pypy/module/_cffi_backend/test/test_file.py
+++ b/pypy/module/_cffi_backend/test/test_file.py
@@ -15,3 +15,15 @@
             "Update test/_backend_test_c.py by copying it from "
             "https://bitbucket.org/cffi/cffi/raw/default/c/test_c.py "
             "and killing the import lines at the start")
+
+def test_egginfo_version():
+    from pypy.module._cffi_backend import VERSION
+    line = "Version: %s\n" % VERSION
+    eggfile = py.path.local(__file__).join('..', '..', '..', '..', '..',
+                                           'lib_pypy', 'cffi.egg-info')
+    assert line in eggfile.readlines()
+
+def test_app_version():
+    from pypy.module import _cffi_backend
+    from lib_pypy import cffi
+    assert _cffi_backend.VERSION == cffi.__version__
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -470,7 +470,7 @@
         allow_surrogates=True)
     return space.newtuple([space.wrap(result), space.wrap(consumed)])
 
- at unwrap_spec(data=str, errors='str_or_None', byteorder=int,
+ at unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
              w_final=WrappedDefault(False))
 def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
     if errors is None:
@@ -491,7 +491,7 @@
     return space.newtuple([space.wrap(res), space.wrap(consumed),
                            space.wrap(byteorder)])
 
- at unwrap_spec(data=str, errors='str_or_None', byteorder=int,
+ at unwrap_spec(data='bufferstr', errors='str_or_None', byteorder=int,
              w_final=WrappedDefault(False))
 def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
     final = space.is_true(w_final)
@@ -589,7 +589,7 @@
             "character mapping must return integer, None or str")
 
 
- at unwrap_spec(string=str, errors='str_or_None')
+ at unwrap_spec(string='bufferstr', errors='str_or_None')
 def charmap_decode(space, string, errors="strict", w_mapping=None):
     if errors is None:
         errors = 'strict'
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -727,3 +727,23 @@
         _codecs.register_error("test.test_codecs_not_a_string", f)
         raises(TypeError, u'\u1234'.encode, 'ascii',
                'test.test_codecs_not_a_string')
+
+    def test_decode_bytearray(self):
+        import _codecs
+        b = bytearray()
+        assert _codecs.ascii_decode(b) == (u'', 0)
+        assert _codecs.latin_1_decode(b) == (u'', 0)
+        assert _codecs.utf_7_decode(b) == (u'', 0)
+        assert _codecs.utf_8_decode(b) == (u'', 0)
+        assert _codecs.utf_16_be_decode(b) == (u'', 0)
+        assert _codecs.utf_16_decode(b) == (u'', 0)
+        assert _codecs.utf_16_le_decode(b) == (u'', 0)
+        assert _codecs.utf_16_ex_decode(b) == (u'', 0, 0)
+        assert _codecs.utf_32_decode(b) == (u'', 0)
+        assert _codecs.utf_32_be_decode(b) == (u'', 0)
+        assert _codecs.utf_32_le_decode(b) == (u'', 0)
+        assert _codecs.utf_32_ex_decode(b) == (u'', 0, 0)
+        assert _codecs.charmap_decode(b) == (u'', 0)
+        assert _codecs.unicode_escape_decode(b) == (u'', 0)
+        assert _codecs.raw_unicode_escape_decode(b) == (u'', 0)
+        assert _codecs.unicode_internal_decode(b) == (u'', 0)
diff --git a/pypy/module/_csv/test/__init__.py b/pypy/module/_csv/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_io/test/__init__.py b/pypy/module/_io/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_multiprocessing/test/__init__.py b/pypy/module/_multiprocessing/test/__init__.py
new file mode 100644
diff --git a/pypy/module/_random/interp_random.py b/pypy/module/_random/interp_random.py
--- a/pypy/module/_random/interp_random.py
+++ b/pypy/module/_random/interp_random.py
@@ -4,7 +4,7 @@
 from pypy.interpreter.typedef import TypeDef
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.baseobjspace import W_Root
-from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rarithmetic import r_uint, intmask, widen
 from rpython.rlib import rbigint, rrandom, rstring
 
 
@@ -54,8 +54,8 @@
     def getstate(self, space):
         state = [None] * (rrandom.N + 1)
         for i in range(rrandom.N):
-            state[i] = space.newint(intmask(self._rnd.state[i]))
-        state[rrandom.N] = space.newint(self._rnd.index)
+            state[i] = space.wrap(widen(self._rnd.state[i]))
+        state[rrandom.N] = space.newlong(self._rnd.index)
         return space.newtuple(state)
 
     def setstate(self, space, w_state):
diff --git a/pypy/module/_random/test/test_random.py b/pypy/module/_random/test/test_random.py
--- a/pypy/module/_random/test/test_random.py
+++ b/pypy/module/_random/test/test_random.py
@@ -41,6 +41,17 @@
         # does not crash
         rnd1.setstate((-1, ) * 624 + (0, ))
 
+    def test_state_repr(self):
+        # since app-level jumpahead salts with repr(state),
+        # it is important the repr is consistent with cpython
+        import _random
+        rnd = _random.Random()
+        rnd.seed(1234)
+        state = rnd.getstate()
+        s = repr(state)
+        assert len(s) == 7956
+        assert s.count('L') == 625
+
     def test_seed(self):
         import _random, sys
         rnd = _random.Random()
@@ -102,3 +113,10 @@
                 self.x = x
         r = R(x=15)
         assert r.x == 15
+
+    def test_exact_result(self):
+        # this passes on CPython 2.7.9 on Linux 32 and Linux 64
+        import _random
+        rnd = _random.Random(-3**31)
+        x = rnd.random()
+        assert x == 0.8181851342382107
diff --git a/pypy/module/_socket/interp_socket.py b/pypy/module/_socket/interp_socket.py
--- a/pypy/module/_socket/interp_socket.py
+++ b/pypy/module/_socket/interp_socket.py
@@ -30,7 +30,7 @@
                                space.wrap(addr.get_protocol()),
                                space.wrap(addr.get_pkttype()),
                                space.wrap(addr.get_hatype()),
-                               space.wrap(addr.get_addr())])
+                               space.wrap(addr.get_haddr())])
     elif rsocket.HAS_AF_UNIX and isinstance(addr, rsocket.UNIXAddress):
         return space.wrap(addr.get_path())
     elif rsocket.HAS_AF_NETLINK and isinstance(addr, rsocket.NETLINKAddress):
@@ -79,7 +79,7 @@
         raise NotImplementedError
 
 # XXX Hack to seperate rpython and pypy
-def addr_from_object(family, space, w_address):
+def addr_from_object(family, fd, space, w_address):
     if family == rsocket.AF_INET:
         w_host, w_port = space.unpackiterable(w_address, 2)
         host = space.str_w(w_host)
@@ -89,8 +89,9 @@
     if family == rsocket.AF_INET6:
         pieces_w = space.unpackiterable(w_address)
         if not (2 <= len(pieces_w) <= 4):
-            raise TypeError("AF_INET6 address must be a tuple of length 2 "
-                               "to 4, not %d" % len(pieces_w))
+            raise oefmt(space.w_TypeError,
+                        "AF_INET6 address must be a tuple of length 2 "
+                        "to 4, not %d", len(pieces_w))
         host = space.str_w(pieces_w[0])
         port = space.int_w(pieces_w[1])
         port = make_ushort_port(space, port)
@@ -105,6 +106,28 @@
     if rsocket.HAS_AF_NETLINK and family == rsocket.AF_NETLINK:
         w_pid, w_groups = space.unpackiterable(w_address, 2)
         return rsocket.NETLINKAddress(space.uint_w(w_pid), space.uint_w(w_groups))
+    if rsocket.HAS_AF_PACKET and family == rsocket.AF_PACKET:
+        pieces_w = space.unpackiterable(w_address)
+        if not (2 <= len(pieces_w) <= 5):
+            raise oefmt(space.w_TypeError,
+                        "AF_PACKET address must be a tuple of length 2 "
+                        "to 5, not %d", len(pieces_w))
+        ifname = space.str_w(pieces_w[0])
+        ifindex = rsocket.PacketAddress.get_ifindex_from_ifname(fd, ifname)
+        protocol = space.int_w(pieces_w[1])
+        if len(pieces_w) > 2: pkttype = space.int_w(pieces_w[2])
+        else:                 pkttype = 0
+        if len(pieces_w) > 3: hatype = space.int_w(pieces_w[3])
+        else:                 hatype = 0
+        if len(pieces_w) > 4: haddr = space.str_w(pieces_w[4])
+        else:                 haddr = ""
+        if len(haddr) > 8:
+            raise OperationError(space.w_ValueError, space.wrap(
+                "Hardware address must be 8 bytes or less"))
+        if protocol < 0 or protocol > 0xfffff:
+            raise OperationError(space.w_OverflowError, space.wrap(
+                "protoNumber must be 0-65535."))
+        return rsocket.PacketAddress(ifindex, protocol, pkttype, hatype, haddr)
     raise RSocketError("unknown address family")
 
 # XXX Hack to seperate rpython and pypy
@@ -172,7 +195,8 @@
     # convert an app-level object into an Address
     # based on the current socket's family
     def addr_from_object(self, space, w_address):
-        return addr_from_object(self.sock.family, space, w_address)
+        fd = intmask(self.sock.fd)
+        return addr_from_object(self.sock.family, fd, space, w_address)
 
     def bind_w(self, space, w_addr):
         """bind(address)
diff --git a/pypy/module/_socket/test/test_sock_app.py b/pypy/module/_socket/test/test_sock_app.py
--- a/pypy/module/_socket/test/test_sock_app.py
+++ b/pypy/module/_socket/test/test_sock_app.py
@@ -1,4 +1,4 @@
-import sys
+import sys, os
 import py
 from pypy.tool.pytest.objspace import gettestobjspace
 from rpython.tool.udir import udir
@@ -615,6 +615,28 @@
             os.chdir(oldcwd)
 
 
+class AppTestPacket:
+    def setup_class(cls):
+        if not hasattr(os, 'getuid') or os.getuid() != 0:
+            py.test.skip("AF_PACKET needs to be root for testing")
+        w_ok = space.appexec([], "(): import _socket; " +
+                                 "return hasattr(_socket, 'AF_PACKET')")
+        if not space.is_true(w_ok):
+            py.test.skip("no AF_PACKET on this platform")
+        cls.space = space
+
+    def test_convert_between_tuple_and_sockaddr_ll(self):
+        import _socket
+        s = _socket.socket(_socket.AF_PACKET, _socket.SOCK_RAW)
+        assert s.getsockname() == ('', 0, 0, 0, '')
+        s.bind(('lo', 123))
+        a, b, c, d, e = s.getsockname()
+        assert (a, b, c) == ('lo', 123, 0)
+        assert isinstance(d, int)
+        assert isinstance(e, str)
+        assert 0 <= len(e) <= 8
+
+
 class AppTestSocketTCP:
     HOST = 'localhost'
 
diff --git a/pypy/module/_ssl/__init__.py b/pypy/module/_ssl/__init__.py
--- a/pypy/module/_ssl/__init__.py
+++ b/pypy/module/_ssl/__init__.py
@@ -51,6 +51,11 @@
 
         super(Module, cls).buildloaders()
 
+    def setup_after_space_initialization(self):
+        """NOT_RPYTHON"""
+        from pypy.module._ssl.interp_ssl import PWINFO_STORAGE
+        PWINFO_STORAGE.clear()
+
     def startup(self, space):
         from rpython.rlib.ropenssl import init_ssl
         init_ssl()
diff --git a/pypy/module/_ssl/test/__init__.py b/pypy/module/_ssl/test/__init__.py
new file mode 100644
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -700,7 +700,8 @@
 
         @rgc.must_be_light_finalizer
         def __del__(self):
-            lltype.free(self.buffer, flavor='raw')
+            if self.buffer:
+                lltype.free(self.buffer, flavor='raw')
 
         def setlen(self, size, zero=False, overallocate=True):
             if size > 0:
diff --git a/pypy/module/cpyext/api.py b/pypy/module/cpyext/api.py
--- a/pypy/module/cpyext/api.py
+++ b/pypy/module/cpyext/api.py
@@ -192,7 +192,7 @@
 
 class ApiFunction:
     def __init__(self, argtypes, restype, callable, error=_NOT_SPECIFIED,
-                 c_name=None):
+                 c_name=None, gil=None):
         self.argtypes = argtypes
         self.restype = restype
         self.functype = lltype.Ptr(lltype.FuncType(argtypes, restype))
@@ -208,6 +208,7 @@
         assert argnames[0] == 'space'
         self.argnames = argnames[1:]
         assert len(self.argnames) == len(self.argtypes)
+        self.gil = gil
 
     def _freeze_(self):
         return True
@@ -223,14 +224,15 @@
     def get_wrapper(self, space):
         wrapper = getattr(self, '_wrapper', None)
         if wrapper is None:
-            wrapper = make_wrapper(space, self.callable)
+            wrapper = make_wrapper(space, self.callable, self.gil)
             self._wrapper = wrapper
             wrapper.relax_sig_check = True
             if self.c_name is not None:
                 wrapper.c_name = cpyext_namespace.uniquename(self.c_name)
         return wrapper
 
-def cpython_api(argtypes, restype, error=_NOT_SPECIFIED, external=True):
+def cpython_api(argtypes, restype, error=_NOT_SPECIFIED, external=True,
+                gil=None):
     """
     Declares a function to be exported.
     - `argtypes`, `restype` are lltypes and describe the function signature.
@@ -240,6 +242,8 @@
       SytemError.
     - set `external` to False to get a C function pointer, but not exported by
       the API headers.
+    - set `gil` to "acquire", "release" or "around" to acquire the GIL,
+      release the GIL, or both
     """
     if isinstance(restype, lltype.Typedef):
         real_restype = restype.OF
@@ -262,7 +266,8 @@
             c_name = None
         else:
             c_name = func_name
-        api_function = ApiFunction(argtypes, restype, func, error, c_name=c_name)
+        api_function = ApiFunction(argtypes, restype, func, error,
+                                   c_name=c_name, gil=gil)
         func.api_func = api_function
 
         if external:
@@ -594,12 +599,15 @@
 pypy_debug_catch_fatal_exception = rffi.llexternal('pypy_debug_catch_fatal_exception', [], lltype.Void)
 
 # Make the wrapper for the cases (1) and (2)
-def make_wrapper(space, callable):
+def make_wrapper(space, callable, gil=None):
     "NOT_RPYTHON"
     names = callable.api_func.argnames
     argtypes_enum_ui = unrolling_iterable(enumerate(zip(callable.api_func.argtypes,
         [name.startswith("w_") for name in names])))
     fatal_value = callable.api_func.restype._defl()
+    gil_acquire = (gil == "acquire" or gil == "around")
+    gil_release = (gil == "release" or gil == "around")
+    assert gil is None or gil_acquire or gil_release
 
     @specialize.ll()
     def wrapper(*args):
@@ -607,6 +615,10 @@
         from pypy.module.cpyext.pyobject import Reference
         # we hope that malloc removal removes the newtuple() that is
         # inserted exactly here by the varargs specializer
+        if gil_acquire:
+            after = rffi.aroundstate.after
+            if after:
+                after()
         rffi.stackcounter.stacks_counter += 1
         llop.gc_stack_bottom(lltype.Void)   # marker for trackgcroot.py
         retval = fatal_value
@@ -678,6 +690,10 @@
                 print str(e)
                 pypy_debug_catch_fatal_exception()
         rffi.stackcounter.stacks_counter -= 1
+        if gil_release:
+            before = rffi.aroundstate.before
+            if before:
+                before()
         return retval
     callable._always_inline_ = 'try'
     wrapper.__name__ = "wrapper for %r" % (callable, )
diff --git a/pypy/module/cpyext/pystate.py b/pypy/module/cpyext/pystate.py
--- a/pypy/module/cpyext/pystate.py
+++ b/pypy/module/cpyext/pystate.py
@@ -19,7 +19,7 @@
 class NoThreads(Exception):
     pass
 
- at cpython_api([], PyThreadState, error=CANNOT_FAIL)
+ at cpython_api([], PyThreadState, error=CANNOT_FAIL, gil="release")
 def PyEval_SaveThread(space):
     """Release the global interpreter lock (if it has been created and thread
     support is enabled) and reset the thread state to NULL, returning the
@@ -29,19 +29,15 @@
     state = space.fromcache(InterpreterState)
     tstate = state.swap_thread_state(
         space, lltype.nullptr(PyThreadState.TO))
-    if rffi.aroundstate.before:
-        rffi.aroundstate.before()
     return tstate
 
- at cpython_api([PyThreadState], lltype.Void)
+ at cpython_api([PyThreadState], lltype.Void, gil="acquire")
 def PyEval_RestoreThread(space, tstate):
     """Acquire the global interpreter lock (if it has been created and thread
     support is enabled) and set the thread state to tstate, which must not be
     NULL.  If the lock has been created, the current thread must not have
     acquired it, otherwise deadlock ensues.  (This function is available even
     when thread support is disabled at compile time.)"""
-    if rffi.aroundstate.after:
-        rffi.aroundstate.after()
     state = space.fromcache(InterpreterState)
     state.swap_thread_state(space, tstate)
 
@@ -182,17 +178,14 @@
     state = space.fromcache(InterpreterState)
     return state.swap_thread_state(space, tstate)
 
- at cpython_api([PyThreadState], lltype.Void)
+ at cpython_api([PyThreadState], lltype.Void, gil="acquire")
 def PyEval_AcquireThread(space, tstate):
     """Acquire the global interpreter lock and set the current thread state to
     tstate, which should not be NULL.  The lock must have been created earlier.
     If this thread already has the lock, deadlock ensues.  This function is not
     available when thread support is disabled at compile time."""
-    if rffi.aroundstate.after:
-        # After external call is before entering Python
-        rffi.aroundstate.after()
 
- at cpython_api([PyThreadState], lltype.Void)
+ at cpython_api([PyThreadState], lltype.Void, gil="release")
 def PyEval_ReleaseThread(space, tstate):
     """Reset the current thread state to NULL and release the global interpreter
     lock.  The lock must have been created earlier and must be held by the current
@@ -200,28 +193,20 @@
     that it represents the current thread state --- if it isn't, a fatal error is
     reported. This function is not available when thread support is disabled at
     compile time."""
-    if rffi.aroundstate.before:
-        # Before external call is after running Python
-        rffi.aroundstate.before()
 
 PyGILState_STATE = rffi.INT
 
- at cpython_api([], PyGILState_STATE, error=CANNOT_FAIL)
+ at cpython_api([], PyGILState_STATE, error=CANNOT_FAIL, gil="acquire")
 def PyGILState_Ensure(space):
     # XXX XXX XXX THIS IS A VERY MINIMAL IMPLEMENTATION THAT WILL HAPPILY
     # DEADLOCK IF CALLED TWICE ON THE SAME THREAD, OR CRASH IF CALLED IN A
     # NEW THREAD.  We should very carefully follow what CPython does instead.
-    if rffi.aroundstate.after:
-        # After external call is before entering Python
-        rffi.aroundstate.after()
     return rffi.cast(PyGILState_STATE, 0)
 
- at cpython_api([PyGILState_STATE], lltype.Void)
+ at cpython_api([PyGILState_STATE], lltype.Void, gil="release")
 def PyGILState_Release(space, state):
     # XXX XXX XXX We should very carefully follow what CPython does instead.
-    if rffi.aroundstate.before:
-        # Before external call is after running Python
-        rffi.aroundstate.before()
+    pass
 
 @cpython_api([], PyInterpreterState, error=CANNOT_FAIL)
 def PyInterpreterState_Head(space):
@@ -236,7 +221,8 @@
     """
     return lltype.nullptr(PyInterpreterState.TO)
 
- at cpython_api([PyInterpreterState], PyThreadState, error=CANNOT_FAIL)
+ at cpython_api([PyInterpreterState], PyThreadState, error=CANNOT_FAIL,
+             gil="around")
 def PyThreadState_New(space, interp):
     """Create a new thread state object belonging to the given interpreter
     object.  The global interpreter lock need not be held, but may be held if
@@ -245,12 +231,8 @@
         raise NoThreads
     # PyThreadState_Get will allocate a new execution context,
     # we need to protect gc and other globals with the GIL.
-    rffi.aroundstate.after()
-    try:
-        rthread.gc_thread_start()
-        return PyThreadState_Get(space)
-    finally:
-        rffi.aroundstate.before()
+    rthread.gc_thread_start()
+    return PyThreadState_Get(space)
 
 @cpython_api([PyThreadState], lltype.Void)
 def PyThreadState_Clear(space, tstate):
diff --git a/pypy/module/cpyext/test/test_translate.py b/pypy/module/cpyext/test/test_translate.py
--- a/pypy/module/cpyext/test/test_translate.py
+++ b/pypy/module/cpyext/test/test_translate.py
@@ -11,7 +11,7 @@
     FT = lltype.FuncType([], lltype.Signed)
     FTPTR = lltype.Ptr(FT)
 
-    def make_wrapper(space, func):
+    def make_wrapper(space, func, gil=None):
         def wrapper():
             return func(space)
         return wrapper
diff --git a/pypy/module/imp/importing.py b/pypy/module/imp/importing.py
--- a/pypy/module/imp/importing.py
+++ b/pypy/module/imp/importing.py
@@ -621,7 +621,10 @@
                 try:
                     load_module(space, w_modulename, find_info, reuse=True)
                 finally:
-                    find_info.stream.close()
+                    try:
+                        find_info.stream.close()
+                    except StreamErrors:
+                        pass
                 # fetch the module again, in case of "substitution"
                 w_mod = check_sys_modules(space, w_modulename)
                 return w_mod
@@ -663,7 +666,10 @@
             if find_info:
                 stream = find_info.stream
                 if stream:
-                    stream.close()
+                    try:
+                        stream.close()
+                    except StreamErrors:
+                        pass
 
     if tentative:
         return None
@@ -881,7 +887,10 @@
         try:
             code_w = read_compiled_module(space, cpathname, stream.readall())
         finally:
-            stream.close()
+            try:
+                stream.close()
+            except StreamErrors:
+                pass
         space.setattr(w_mod, w('__file__'), w(cpathname))
     else:
         code_w = parse_source_module(space, pathname, source)
@@ -966,7 +975,10 @@
         return stream
     except StreamErrors:
         if stream:
-            stream.close()
+            try:
+                stream.close()
+            except StreamErrors:
+                pass
         return None    # XXX! must not eat all exceptions, e.g.
                        # Out of file descriptors.
 
diff --git a/pypy/module/itertools/test/__init__.py b/pypy/module/itertools/test/__init__.py
new file mode 100644
diff --git a/pypy/module/micronumpy/ndarray.py b/pypy/module/micronumpy/ndarray.py
--- a/pypy/module/micronumpy/ndarray.py
+++ b/pypy/module/micronumpy/ndarray.py
@@ -1462,6 +1462,7 @@
     imag = GetSetProperty(W_NDimArray.descr_get_imag,
                           W_NDimArray.descr_set_imag),
     conj = interp2app(W_NDimArray.descr_conj),
+    conjugate = interp2app(W_NDimArray.descr_conj),
 
     argsort  = interp2app(W_NDimArray.descr_argsort),
     sort  = interp2app(W_NDimArray.descr_sort),
diff --git a/pypy/module/micronumpy/test/test_complex.py b/pypy/module/micronumpy/test/test_complex.py
--- a/pypy/module/micronumpy/test/test_complex.py
+++ b/pypy/module/micronumpy/test/test_complex.py
@@ -382,6 +382,7 @@
         assert np.conjugate(1+2j) == 1-2j
 
         eye2 = np.array([[1, 0], [0, 1]])
+        assert (eye2.conjugate() == eye2).all()
         x = eye2 + 1j * eye2
         for a, b in zip(np.conjugate(x), np.array([[ 1.-1.j,  0.-0.j], [ 0.-0.j,  1.-1.j]])):
             assert a[0] == b[0]
diff --git a/pypy/module/pwd/test/__init__.py b/pypy/module/pwd/test/__init__.py
new file mode 100644
diff --git a/pypy/module/pyexpat/__init__.py b/pypy/module/pyexpat/__init__.py
--- a/pypy/module/pyexpat/__init__.py
+++ b/pypy/module/pyexpat/__init__.py
@@ -39,8 +39,6 @@
         'error':         'space.fromcache(interp_pyexpat.Cache).w_error',
 
         '__version__':   'space.wrap("85819")',
-        'EXPAT_VERSION': 'interp_pyexpat.get_expat_version(space)',
-        'version_info':  'interp_pyexpat.get_expat_version_info(space)',
         }
 
     submodules = {
@@ -53,3 +51,9 @@
                  'XML_PARAM_ENTITY_PARSING_ALWAYS']:
         interpleveldefs[name] = 'space.wrap(interp_pyexpat.%s)' % (name,)
 
+    def startup(self, space):
+        from pypy.module.pyexpat import interp_pyexpat
+        w_ver = interp_pyexpat.get_expat_version(space)
+        space.setattr(self, space.wrap("EXPAT_VERSION"), w_ver)
+        w_ver = interp_pyexpat.get_expat_version_info(space)
+        space.setattr(self, space.wrap("version_info"), w_ver)
diff --git a/pypy/module/pypyjit/test_pypy_c/test_00_model.py b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
--- a/pypy/module/pypyjit/test_pypy_c/test_00_model.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_00_model.py
@@ -62,7 +62,7 @@
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
         stdout, stderr = pipe.communicate()
-        if getattr(pipe, 'returncode', 0) < 0:
+        if pipe.wait() < 0:
             raise IOError("subprocess was killed by signal %d" % (
                 pipe.returncode,))
         if stderr.startswith('SKIP:'):
diff --git a/pypy/module/select/test/__init__.py b/pypy/module/select/test/__init__.py
new file mode 100644
diff --git a/pypy/module/struct/test/__init__.py b/pypy/module/struct/test/__init__.py
new file mode 100644
diff --git a/pypy/module/test_lib_pypy/cffi_tests/test_zdistutils.py b/pypy/module/test_lib_pypy/cffi_tests/test_zdistutils.py
--- a/pypy/module/test_lib_pypy/cffi_tests/test_zdistutils.py
+++ b/pypy/module/test_lib_pypy/cffi_tests/test_zdistutils.py
@@ -165,7 +165,8 @@
         assert lib.sin(12.3) == math.sin(12.3)
         v = ffi.verifier
         ext = v.get_extension()
-        assert 'distutils.extension.Extension' in str(ext.__class__)
+        assert 'distutils.extension.Extension' in str(ext.__class__) or \
+               'setuptools.extension.Extension' in str(ext.__class__)
         assert ext.sources == [maybe_relative_path(v.sourcefilename)]
         assert ext.name == v.get_module_name()
         assert ext.define_macros == [('TEST_EXTENSION_OBJECT', '1')]
@@ -194,7 +195,8 @@
         assert lib.test1eoes(7.0) == 42.0
         v = ffi.verifier
         ext = v.get_extension()
-        assert 'distutils.extension.Extension' in str(ext.__class__)
+        assert 'distutils.extension.Extension' in str(ext.__class__) or \
+               'setuptools.extension.Extension' in str(ext.__class__)
         assert ext.sources == [maybe_relative_path(v.sourcefilename),
                                extra_source]
         assert ext.name == v.get_module_name()
diff --git a/pypy/module/test_lib_pypy/cffi_tests/test_zintegration.py b/pypy/module/test_lib_pypy/cffi_tests/test_zintegration.py
--- a/pypy/module/test_lib_pypy/cffi_tests/test_zintegration.py
+++ b/pypy/module/test_lib_pypy/cffi_tests/test_zintegration.py
@@ -4,6 +4,9 @@
 import subprocess
 from pypy.module.test_lib_pypy.cffi_tests.udir import udir
 
+if sys.platform == 'win32':
+    py.test.skip('snippets do not run on win32')
+
 def create_venv(name):
     tmpdir = udir.join(name)
     try:
@@ -13,6 +16,23 @@
     except OSError as e:
         py.test.skip("Cannot execute virtualenv: %s" % (e,))
 
+    try:
+        deepcopy = os.symlink
+    except:
+        import shutil, errno
+        def deepcopy(src, dst):
+            try:
+                shutil.copytree(src, dst)
+            except OSError as e:
+                if e.errno in (errno.ENOTDIR, errno.EINVAL):
+                    shutil.copy(src, dst)
+                else:
+                    print('got errno')
+                    print(e.errno)
+                    print('not')
+                    print(errno.ENOTDIR)
+                    raise
+
     site_packages = None
     for dirpath, dirnames, filenames in os.walk(str(tmpdir)):
         if os.path.basename(dirpath) == 'site-packages':
@@ -32,7 +52,7 @@
                 modules += ('ply',)   # needed for older versions of pycparser
         for module in modules:
             target = imp.find_module(module)[1]
-            os.symlink(target, os.path.join(site_packages,
+            deepcopy(target, os.path.join(site_packages,
                                             os.path.basename(target)))
     return tmpdir
 
@@ -51,7 +71,11 @@
     python_f.write(py.code.Source(python_snippet))
     try:
         os.chdir(str(SNIPPET_DIR.join(dirname)))
-        vp = str(venv_dir.join('bin/python'))
+        if os.name == 'nt':
+            bindir = 'Scripts'
+        else:
+            bindir = 'bin'
+        vp = str(venv_dir.join(bindir).join('python'))
         subprocess.check_call((vp, 'setup.py', 'clean'))
         subprocess.check_call((vp, 'setup.py', 'install'))
         subprocess.check_call((vp, str(python_f)))
diff --git a/pypy/module/zipimport/test/test_zipimport_deflated.py b/pypy/module/zipimport/test/test_zipimport_deflated.py
--- a/pypy/module/zipimport/test/test_zipimport_deflated.py
+++ b/pypy/module/zipimport/test/test_zipimport_deflated.py
@@ -14,7 +14,7 @@
     def setup_class(cls):
         try:
             import rpython.rlib.rzlib
-        except ImportError:
+        except CompilationError:
             py.test.skip("zlib not available, cannot test compressed zipfiles")
         cls.make_class()
         cls.w_BAD_ZIP = cls.space.wrap(BAD_ZIP)
diff --git a/pypy/objspace/fake/checkmodule.py b/pypy/objspace/fake/checkmodule.py
--- a/pypy/objspace/fake/checkmodule.py
+++ b/pypy/objspace/fake/checkmodule.py
@@ -10,6 +10,7 @@
         mod = __import__('pypy.module.%s' % modname, None, None, ['__doc__'])
         # force computation and record what we wrap
         module = mod.Module(space, W_Root())
+        module.setup_after_space_initialization()
         module.startup(space)
         for name in module.loaders:
             seeobj_w.append(module._load_lazily(space, name))
diff --git a/pypy/objspace/std/test/test_typeobject.py b/pypy/objspace/std/test/test_typeobject.py
--- a/pypy/objspace/std/test/test_typeobject.py
+++ b/pypy/objspace/std/test/test_typeobject.py
@@ -1165,3 +1165,17 @@
                 return x + 1
         a = A()
         assert a.f(1) == 2
+
+    def test_eq_returns_notimplemented(self):
+        assert type.__eq__(int, 42) is NotImplemented
+        assert type.__ne__(dict, 42) is NotImplemented
+        assert type.__eq__(int, int) is True
+        assert type.__eq__(int, dict) is False
+
+    def test_cmp_on_types(self):
+        class X(type):
+            def __cmp__(self, other):
+                return -1
+        class Y:
+            __metaclass__ = X
+        assert (Y < Y) is True
diff --git a/pypy/objspace/std/typeobject.py b/pypy/objspace/std/typeobject.py
--- a/pypy/objspace/std/typeobject.py
+++ b/pypy/objspace/std/typeobject.py
@@ -645,9 +645,13 @@
                     "type object '%N' has no attribute %R", self, w_name)
 
     def descr_eq(self, space, w_other):
+        if not isinstance(w_other, W_TypeObject):
+            return space.w_NotImplemented
         return space.is_(self, w_other)
 
     def descr_ne(self, space, w_other):
+        if not isinstance(w_other, W_TypeObject):
+            return space.w_NotImplemented
         return space.newbool(not space.is_w(self, w_other))
 


More information about the pypy-commit mailing list