[pypy-commit] pypy reflex-support: merge default into branch

Wed Apr 24 03:23:45 CEST 2013

Author: Wim Lavrijsen <WLavrijsen at lbl.gov>
Branch: reflex-support
Changeset: r63579:637ebd06c4f9
Date: 2013-04-23 18:23 -0700
http://bitbucket.org/pypy/pypy/changeset/637ebd06c4f9/

Log:	merge default into branch

diff too long, truncating to 2000 out of 2179 lines

diff --git a/Makefile b/Makefile
new file mode 100644
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+
+all: pypy-c
+
+pypy-c:
+	@echo "Building PyPy with JIT, it'll take about 40 minutes and 4G of RAM"
+	@sleep 3
+	rpython/bin/rpython -Ojit pypy/goal/targetpypystandalone.py
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -22,3 +22,16 @@
 and send us feedback!
 
     the pypy-dev team <pypy-dev at python.org>
+
+Building
+========
+
+build with::
+
+  rpython/bin/rpython -Ojit pypy/goal/targetpypystandalone.py
+
+This ends up with a ``pypy-c`` binary in the main pypy directory. We suggest
+using virtualenv with the resulting pypy-c as the interpreter; you can
+find more details about various installation schemes here:
+
+http://doc.pypy.org/en/latest/getting-started.html#installing-pypy
diff --git a/lib-python/2.7/json/decoder.py b/lib-python/2.7/json/decoder.py
--- a/lib-python/2.7/json/decoder.py
+++ b/lib-python/2.7/json/decoder.py
@@ -162,7 +162,7 @@
         if nextchar == '}':
             if object_pairs_hook is not None:
                 result = object_pairs_hook(pairs)
-                return result, end
+                return result, end + 1
             pairs = {}
             if object_hook is not None:
                 pairs = object_hook(pairs)
diff --git a/lib-python/2.7/json/tests/test_decode.py b/lib-python/2.7/json/tests/test_decode.py
--- a/lib-python/2.7/json/tests/test_decode.py
+++ b/lib-python/2.7/json/tests/test_decode.py
@@ -44,6 +44,7 @@
                                     object_pairs_hook=OrderedDict,
                                     object_hook=lambda x: None),
                          OrderedDict(p))
+        self.assertEqual(self.loads("{}", object_pairs_hook=list), [])
 
 
 class TestPyDecode(TestDecode, PyTest): pass
diff --git a/lib-python/2.7/test/test_descr.py b/lib-python/2.7/test/test_descr.py
--- a/lib-python/2.7/test/test_descr.py
+++ b/lib-python/2.7/test/test_descr.py
@@ -3592,6 +3592,9 @@
         list.__init__(a, sequence=[0, 1, 2])
         self.assertEqual(a, [0, 1, 2])
 
+    @unittest.skipIf(test_support.check_impl_detail(pypy=True) and
+                     sys.platform == 'win32',
+                     "XXX: https://bugs.pypy.org/issue1461")
     def test_recursive_call(self):
         # Testing recursive __call__() by setting to instance of class...
         class A(object):
diff --git a/lib-python/2.7/test/test_fileio.py b/lib-python/2.7/test/test_fileio.py
--- a/lib-python/2.7/test/test_fileio.py
+++ b/lib-python/2.7/test/test_fileio.py
@@ -318,7 +318,6 @@
         self.assertRaises(ValueError, _FileIO, -10)
         self.assertRaises(OSError, _FileIO, make_bad_fd())
         if sys.platform == 'win32':
-            raise unittest.SkipTest('Set _invalid_parameter_handler for low level io')
             import msvcrt
             self.assertRaises(IOError, msvcrt.get_osfhandle, make_bad_fd())
 
diff --git a/lib_pypy/msvcrt.py b/lib_pypy/msvcrt.py
--- a/lib_pypy/msvcrt.py
+++ b/lib_pypy/msvcrt.py
@@ -8,25 +8,37 @@
 # PAC: 2010/08 added MS locking for Whoosh
 
 import ctypes
+import errno
 from ctypes_support import standard_c_lib as _c
 from ctypes_support import get_errno
-import errno
 
 try:
     open_osfhandle = _c._open_osfhandle
 except AttributeError: # we are not on windows
     raise ImportError
 
-try: from __pypy__ import builtinify
-except ImportError: builtinify = lambda f: f
+try: from __pypy__ import builtinify, validate_fd
+except ImportError: builtinify = validate_fd = lambda f: f
 
 
 open_osfhandle.argtypes = [ctypes.c_int, ctypes.c_int]
 open_osfhandle.restype = ctypes.c_int
 
-get_osfhandle = _c._get_osfhandle
-get_osfhandle.argtypes = [ctypes.c_int]
-get_osfhandle.restype = ctypes.c_int
+_get_osfhandle = _c._get_osfhandle
+_get_osfhandle.argtypes = [ctypes.c_int]
+_get_osfhandle.restype = ctypes.c_int
+
+ at builtinify
+def get_osfhandle(fd):
+    """"get_osfhandle(fd) -> file handle
+
+    Return the file handle for the file descriptor fd. Raises IOError if
+    fd is not recognized."""
+    try:
+        validate_fd(fd)
+    except OSError as e:
+        raise IOError(*e.args)
+    return _get_osfhandle(fd)
 
 setmode = _c._setmode
 setmode.argtypes = [ctypes.c_int, ctypes.c_int]
diff --git a/pypy/config/pypyoption.py b/pypy/config/pypyoption.py
--- a/pypy/config/pypyoption.py
+++ b/pypy/config/pypyoption.py
@@ -35,7 +35,7 @@
      "thread", "itertools", "pyexpat", "_ssl", "cpyext", "array",
      "binascii", "_multiprocessing", '_warnings',
      "_collections", "_multibytecodec", "micronumpy", "_ffi",
-     "_continuation", "_cffi_backend", "_csv"]
+     "_continuation", "_cffi_backend", "_csv", "cppyy"]
 ))
 
 translation_modules = default_modules.copy()
@@ -64,6 +64,8 @@
     del working_modules["termios"]
     del working_modules["_minimal_curses"]
 
+    del working_modules["cppyy"]  # not tested on win32
+
     # The _locale module is needed by site.py on Windows
     default_modules["_locale"] = None
 
@@ -75,7 +77,7 @@
     del working_modules["_minimal_curses"]
     del working_modules["termios"]
     del working_modules["_multiprocessing"]   # depends on rctime
-
+    del working_modules["cppyy"]  # depends on ctypes
 
 
 module_dependencies = {
diff --git a/pypy/doc/config/translation.gcrootfinder.txt b/pypy/doc/config/translation.gcrootfinder.txt
--- a/pypy/doc/config/translation.gcrootfinder.txt
+++ b/pypy/doc/config/translation.gcrootfinder.txt
@@ -9,7 +9,9 @@
 - ``--gcrootfinder=asmgcc``: use assembler hackery to find the
   roots directly from the normal stack.  This is a bit faster,
   but platform specific.  It works so far with GCC or MSVC,
-  on i386 and x86-64.
+  on i386 and x86-64.  It is tested only on Linux (where it is
+  the default) so other platforms (as well as MSVC) may need
+  various fixes before they can be used.
 
 You may have to force the use of the shadowstack root finder if
 you are running into troubles or if you insist on translating
diff --git a/pypy/doc/cppyy.rst b/pypy/doc/cppyy.rst
--- a/pypy/doc/cppyy.rst
+++ b/pypy/doc/cppyy.rst
@@ -2,94 +2,128 @@
 cppyy: C++ bindings for PyPy
 ============================
 
-The cppyy module provides C++ bindings for PyPy by using the reflection
-information extracted from C++ header files by means of the
-`Reflex package`_.
-For this to work, you have to both install Reflex and build PyPy from source,
-as the cppyy module is not enabled by default.
-Note that the development version of cppyy lives in the reflex-support
-branch.
-As indicated by this being a branch, support for Reflex is still
-experimental.
-However, it is functional enough to put it in the hands of those who want
-to give it a try.
-In the medium term, cppyy will move away from Reflex and instead use
-`cling`_ as its backend, which is based on `llvm`_.
-Although that will change the logistics on the generation of reflection
-information, it will not change the python-side interface.
+The cppyy module creates, at run-time, Python-side classes and functions for
+C++, by querying a C++ reflection system.
+The default system used is `Reflex`_, which extracts the needed information
+from C++ header files.
+Another current backend is based on `CINT`_, and yet another, more important
+one for the medium- to long-term will be based on `cling`_.
+The latter sits on top of `llvm`_'s `clang`_, and will therefore allow the use
+of C++11.
+The work on the cling backend has so far been done only for CPython, but
+bringing it to PyPy is a lot less work than developing it in the first place.
 
-.. _`Reflex package`: http://root.cern.ch/drupal/content/reflex
+.. _`Reflex`: http://root.cern.ch/drupal/content/reflex
+.. _`CINT`: http://root.cern.ch/drupal/content/cint
 .. _`cling`: http://root.cern.ch/drupal/content/cling
 .. _`llvm`: http://llvm.org/
+.. _`clang`: http://clang.llvm.org/
+
+This document describes the version of cppyy that lives in the main branch of
+PyPy.
+The development of cppyy happens in the "reflex-support" branch.
 
 
 Motivation
 ==========
 
-The cppyy module offers two unique features, which result in great
-performance as well as better functionality and cross-language integration
-than would otherwise be possible.
-First, cppyy is written in RPython and therefore open to optimizations by the
-JIT up until the actual point of call into C++.
-This means that there are no conversions necessary between a garbage collected
-and a reference counted environment, as is needed for the use of existing
-extension modules written or generated for CPython.
-It also means that if variables are already unboxed by the JIT, they can be
-passed through directly to C++.
-Second, Reflex (and cling far more so) adds dynamic features to C++, thus
-greatly reducing impedance mismatches between the two languages.
-In fact, Reflex is dynamic enough that you could write the runtime bindings
+To provide bindings to another language in CPython, you program to a
+generic C-API that exposes many of the interpreter features.
+With PyPy, however, there is no such generic C-API, because several of the
+interpreter features (e.g. the memory model) are pluggable and therefore
+subject to change.
+Furthermore, a generic API does not allow any assumptions about the calls
+into another language, forcing the JIT to behave conservatively around these
+calls and with the objects that cross language boundaries.
+In contrast, cppyy does not expose an API, but expects one to be implemented
+by a backend.
+It makes strong assumptions about the semantics of the API that it uses and
+that in turn allows the JIT to make equally strong assumptions.
+This is possible, because the expected API is only for providing C++ language
+bindings, and does not provide generic programmability.
+
+The cppyy module further offers two features, which result in improved
+performance as well as better functionality and cross-language integration.
+First, cppyy itself is written in RPython and therefore open to optimizations
+by the JIT up until the actual point of call into C++.
+This means, for example, that if variables are already unboxed by the JIT, they
+can be passed through directly to C++.
+Second, a backend such as Reflex (and cling far more so) adds dynamic features
+to C++, thus greatly reducing impedance mismatches between the two languages.
+For example, Reflex is dynamic enough to allow writing runtime bindings
 generation in python (as opposed to RPython) and this is used to create very
 natural "pythonizations" of the bound code.
+As another example, cling allows automatic instantiations of templates.
+
+See this description of the `cppyy architecture`_ for further details.
+
+.. _`cppyy architecture`: http://morepypy.blogspot.com/2012/06/architecture-of-cppyy.html
 
 
 Installation
 ============
 
-For now, the easiest way of getting the latest version of Reflex, is by
-installing the ROOT package.
-Besides getting the latest version of Reflex, another advantage is that with
-the full ROOT package, you can also use your Reflex-bound code on `CPython`_.
-`Download`_ a binary or install from `source`_.
-Some Linux and Mac systems may have ROOT provided in the list of scientific
-software of their packager.
-If, however, you prefer a standalone version of Reflex, the best is to get
-this `recent snapshot`_, and install like so::
+There are two ways of using cppyy, and the choice depends on how pypy-c was
+built: the backend can be builtin, or dynamically loadable.
+The former has the disadvantage of requiring pypy-c to be linked with external
+C++ libraries (e.g. libReflex.so), but has the advantage of being faster in
+some cases.
+That advantage will disappear over time, however, with improvements in the
+JIT.
+Therefore, this document assumes that the dynamically loadable backend is
+chosen (it is, by default).
+See the `backend documentation`_.
 
-    $ tar jxf reflex-2012-05-02.tar.bz2
-    $ cd reflex-2012-05-02
-    $ build/autogen
+.. _`backend documentation`: cppyy_backend.html
+
+A standalone version of Reflex that also provides the dynamically loadable
+backend is available for `download`_.
+That version, as well as any other distribution of Reflex (e.g. the one that
+comes with `ROOT`_, which may be part of your Linux distribution as part of
+the selection of scientific software) will also work for a build with the
+builtin backend.
+
+.. _`download`: http://cern.ch/wlav/reflex-2013-04-23.tar.bz2
+.. _`ROOT`: http://root.cern.ch/
+
+Besides Reflex, you probably need a version of `gccxml`_ installed, which is
+most easily provided by the packager of your system.
+If you read up on gccxml, you will probably notice that it is no longer being
+developed and hence will not provide C++11 support.
+That's why the medium term plan is to move to cling.
+Note that gccxml is only needed to generate reflection libraries.
+It is not needed to use them.
+
+.. _`gccxml`: http://www.gccxml.org
+
+To install the standalone version of Reflex, after download::
+
+    $ tar jxf reflex-2013-04-23.tar.bz2
+    $ cd reflex-2013-04-23
+    $ ./build/autogen
     $ ./configure <usual set of options such as --prefix>
     $ make && make install
 
-Also, make sure you have a version of `gccxml`_ installed, which is most
-easily provided by the packager of your system.
-If you read up on gccxml, you'll probably notice that it is no longer being
-developed and hence will not provide C++11 support.
-That's why the medium term plan is to move to `cling`_.
+The usual rules apply: <prefix>/bin needs to be added to the ``PATH`` and
+<prefix>/lib to the ``LD_LIBRARY_PATH`` environment variable.
+For convenience, this document will assume that there is a ``REFLEXHOME``
+variable that points to <prefix>.
+If you downloaded or built the whole of ROOT, ``REFLEXHOME`` should be equal
+to ``ROOTSYS``.
 
-.. _`Download`: http://root.cern.ch/drupal/content/downloading-root
-.. _`source`: http://root.cern.ch/drupal/content/installing-root-source
-.. _`recent snapshot`: http://cern.ch/wlav/reflex-2012-05-02.tar.bz2
-.. _`gccxml`: http://www.gccxml.org
+The following is optional, and is only to show how pypy-c can be built
+`from source`_, for example to get at the main development branch of cppyy.
+The `backend documentation`_ has more details on the backend-specific
+prerequisites.
 
-Next, get the `PyPy sources`_, optionally select the reflex-support branch,
-and build it.
-For the build to succeed, the ``$ROOTSYS`` environment variable must point to
-the location of your ROOT (or standalone Reflex) installation, or the
-``root-config`` utility must be accessible through ``PATH`` (e.g. by adding
-``$ROOTSYS/bin`` to ``PATH``).
-In case of the former, include files are expected under ``$ROOTSYS/include``
-and libraries under ``$ROOTSYS/lib``.
 Then run the translation to build ``pypy-c``::
 
     $ hg clone https://bitbucket.org/pypy/pypy
     $ cd pypy
     $ hg up reflex-support         # optional
-    $ cd pypy/goal
     
     # This example shows python, but using pypy-c is faster and uses less memory
-    $ python ../../rpython/bin/rpython.py -O jit --gcrootfinder=shadowstack targetpypystandalone.py --withmod-cppyy
+    $ python rpython/translator/goal/translate.py --opt=jit pypy/goal/targetpypystandalone --withmod-cppyy
 
 This will build a ``pypy-c`` that includes the cppyy module, and through that,
 Reflex support.
@@ -98,12 +132,12 @@
 If not, you may want `to obtain a binary distribution`_ to speed up the
 translation step.
 
-.. _`PyPy sources`: https://bitbucket.org/pypy/pypy/overview
+.. _`from source`: https://bitbucket.org/pypy/pypy/overview
 .. _`to obtain a binary distribution`: http://doc.pypy.org/en/latest/getting-started.html#download-a-pre-built-pypy
 
 
-Basic example
-=============
+Basic bindings example
+======================
 
 Now test with a trivial example whether all packages are properly installed
 and functional.
@@ -127,7 +161,7 @@
 code::
 
     $ genreflex MyClass.h
-    $ g++ -fPIC -rdynamic -O2 -shared -I$ROOTSYS/include MyClass_rflx.cpp -o libMyClassDict.so -L$ROOTSYS/lib -lReflex
+    $ g++ -fPIC -rdynamic -O2 -shared -I$REFLEXHOME/include MyClass_rflx.cpp -o libMyClassDict.so -L$REFLEXHOME/lib -lReflex
 
 Now you're ready to use the bindings.
 Since the bindings are designed to look pythonistic, it should be
@@ -176,7 +210,7 @@
 For example::
 
     $ genreflex MyClass.h --rootmap=libMyClassDict.rootmap --rootmap-lib=libMyClassDict.so
-    $ g++ -fPIC -rdynamic -O2 -shared -I$ROOTSYS/include MyClass_rflx.cpp -o libMyClassDict.so -L$ROOTSYS/lib -lReflex
+    $ g++ -fPIC -rdynamic -O2 -shared -I$REFLEXHOME/include MyClass_rflx.cpp -o libMyClassDict.so -L$REFLEXHOME/lib -lReflex
 
 where the first option (``--rootmap``) specifies the output file name, and the
 second option (``--rootmap-lib``) the name of the reflection library where
@@ -277,7 +311,7 @@
 Now the reflection info can be generated and compiled::
 
     $ genreflex MyAdvanced.h --selection=MyAdvanced.xml
-    $ g++ -fPIC -rdynamic -O2 -shared -I$ROOTSYS/include MyAdvanced_rflx.cpp -o libAdvExDict.so -L$ROOTSYS/lib -lReflex
+    $ g++ -fPIC -rdynamic -O2 -shared -I$REFLEXHOME/include MyAdvanced_rflx.cpp -o libAdvExDict.so -L$REFLEXHOME/lib -lReflex
 
 and subsequently be used from PyPy::
 
@@ -336,7 +370,7 @@
 bound using::
 
     $ genreflex example.h --deep --rootmap=libexampleDict.rootmap --rootmap-lib=libexampleDict.so
-    $ g++ -fPIC -rdynamic -O2 -shared -I$ROOTSYS/include example_rflx.cpp -o libexampleDict.so -L$ROOTSYS/lib -lReflex
+    $ g++ -fPIC -rdynamic -O2 -shared -I$REFLEXHOME/include example_rflx.cpp -o libexampleDict.so -L$REFLEXHOME/lib -lReflex
 
 .. _`example code`: cppyy_example.html
 
@@ -595,6 +629,16 @@
   All template classes must already exist in the loaded reflection info, they
   do not work (yet) with the class loader.
 
+  For compatibility with other bindings generators, use of square brackets
+  instead of parentheses to instantiate templates is supported as well.
+
+* **templated functions**: Automatically participate in overloading and are
+  used in the same way as other global functions.
+
+* **templated methods**: For now, require an explicit selection of the
+  template parameters.
+  This will be changed to allow them to participate in overloads as expected.
+
 * **typedefs**: Are simple python references to the actual classes to which
   they refer.
 
@@ -692,7 +736,7 @@
 Run the normal ``genreflex`` and compilation steps::
 
     $ genreflex MyTemplate.h --selection=MyTemplate.xml
-    $ g++ -fPIC -rdynamic -O2 -shared -I$ROOTSYS/include MyTemplate_rflx.cpp -o libTemplateDict.so -L$ROOTSYS/lib -lReflex
+    $ g++ -fPIC -rdynamic -O2 -shared -I$REFLEXHOME/include MyTemplate_rflx.cpp -o libTemplateDict.so -L$REFLEXHOME/lib -lReflex
 
 Note: this is a dirty corner that clearly could do with some automation,
 even if the macro already helps.
@@ -727,18 +771,18 @@
 The fast lane
 =============
 
-The following is an experimental feature of cppyy, and that makes it doubly
-experimental, so caveat emptor.
+The following is an experimental feature of cppyy.
+It mostly works, but there are some known issues (e.g. with return-by-value).
+Soon it should be the default mode, however.
+
 With a slight modification of Reflex, it can provide function pointers for
 C++ methods, and hence allow PyPy to call those pointers directly, rather than
 calling C++ through a Reflex stub.
-This results in a rather significant speed-up.
-Mind you, the normal stub path is not exactly slow, so for now only use this
-out of curiosity or if you really need it.
 
-To install this patch of Reflex, locate the file genreflex-methptrgetter.patch
-in pypy/module/cppyy and apply it to the genreflex python scripts found in
-``$ROOTSYS/lib``::
+The standalone version of Reflex `provided`_ has been patched, but if you get
+Reflex from another source (most likely with a ROOT distribution), locate the
+file `genreflex-methptrgetter.patch`_ in pypy/module/cppyy and apply it to
+the genreflex python scripts found in ``$ROOTSYS/lib``::
 
     $ cd $ROOTSYS/lib
     $ patch -p2 < genreflex-methptrgetter.patch
@@ -749,8 +793,10 @@
 ``-Wno-pmf-conversions`` option to ``g++`` when compiling.
 The rest works the same way: the fast path will be used transparently (which
 also means that you can't actually find out whether it is in use, other than
-by running a micro-benchmark).
+by running a micro-benchmark or a JIT test).
 
+.. _`provided`: http://cern.ch/wlav/reflex-2013-04-23.tar.bz2
+.. _`genreflex-methptrgetter.patch`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/cppyy/genreflex-methptrgetter.patch
 
 CPython
 =======
diff --git a/pypy/doc/cppyy_backend.rst b/pypy/doc/cppyy_backend.rst
new file mode 100644
--- /dev/null
+++ b/pypy/doc/cppyy_backend.rst
@@ -0,0 +1,53 @@
+==================
+Backends for cppyy
+==================
+
+The cppyy module needs a backend to provide the C++ reflection information on
+which the Python bindings are built.
+The backend is called through a C-API, which can be found in the PyPy sources
+in: `pypy/module/cppyy/include/capi.h`_.
+There are two kinds of API calls: querying about reflection information, which
+are used during the creation of Python-side constructs, and making the actual
+calls into C++.
+The objects passed around are all opaque: cppyy does not make any assumptions
+about them, other than that the opaque handles can be copied.
+Their definition, however, appears in two places: in the C code (in capi.h),
+and on the RPython side (in `capi_types.py`_), so if they are changed, they
+need to be changed on both sides.
+
+.. _`pypy/module/cppyy/include/capi.h`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/cppyy/include/capi.h
+.. _`capi_types.py`: https://bitbucket.org/pypy/pypy/src/default/pypy/module/cppyy/capi/capi_types.py
+
+There are two places where selections in the RPython code affect the choice
+(and use) of the backend.
+The first is in `pypy/module/cppyy/capi/__init__.py`_::
+
+    # choose C-API access method:
+    from pypy.module.cppyy.capi.loadable_capi import *
+    #from pypy.module.cppyy.capi.builtin_capi import *
+
+The default is the loadable C-API.
+Comment it out and uncomment the builtin C-API line to use the builtin version.
+
+.. _`pypy/module/cppyy/capi/__init__.py`:  https://bitbucket.org/pypy/pypy/src/default/pypy/module/cppyy/capi/__init__.py
+
+Next, if the builtin C-API is chosen, the specific backend needs to be set as
+well (default is Reflex).
+This second choice is in `pypy/module/cppyy/capi/builtin_capi.py`_::
+
+    import reflex_capi as backend
+    #import cint_capi as backend
+
+After those choices have been made, build pypy-c as usual.
+
+.. _`pypy/module/cppyy/capi/builtin_capi.py`:  https://bitbucket.org/pypy/pypy/src/default/pypy/module/cppyy/capi/builtin_capi.py
+
+When building pypy-c from source, keep the following in mind.
+If the loadable_capi is chosen, no further prerequisites are needed.
+However, for the build of the builtin_capi to succeed, the ``ROOTSYS``
+environment variable must point to the location of your ROOT (or standalone
+Reflex in the case of the Reflex backend) installation, or the ``root-config``
+utility must be accessible through ``$PATH`` (e.g. by adding ``$ROOTSYS/bin``
+to ``PATH``).
+In case of the former, include files are expected under ``$ROOTSYS/include``
+and libraries under ``$ROOTSYS/lib``.
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -57,10 +57,12 @@
 Inline the fast path of newarray in the assembler.
 Disabled on ARM until we fix issues.
 
+.. branch: reflex-support
+Allow dynamic loading of a (Reflex) backend that implements the C-API needed
+to provide reflection information
 
 .. branches we don't care about
 .. branch: autoreds
-.. branch: reflex-support
 .. branch: kill-faking
 .. branch: improved_ebnfparse_error
 .. branch: task-decorator
diff --git a/pypy/module/_minimal_curses/__init__.py b/pypy/module/_minimal_curses/__init__.py
--- a/pypy/module/_minimal_curses/__init__.py
+++ b/pypy/module/_minimal_curses/__init__.py
@@ -1,6 +1,6 @@
 try:
     import _curses
-except ImportError:
+except Exception:   # probably ImportError or cffi's VerificationError
     try:
         # when running on top of pypy before it had _curses, settle for minimal
         # we prefer _curses so any constants added make it into _minimal_curses
diff --git a/pypy/module/cppyy/capi/__init__.py b/pypy/module/cppyy/capi/__init__.py
--- a/pypy/module/cppyy/capi/__init__.py
+++ b/pypy/module/cppyy/capi/__init__.py
@@ -9,8 +9,8 @@
 # the selection of the desired backend (default is Reflex).
 
 # choose C-API access method:
-#from pypy.module.cppyy.capi.loadable_capi import *
-from pypy.module.cppyy.capi.builtin_capi import *
+from pypy.module.cppyy.capi.loadable_capi import *
+#from pypy.module.cppyy.capi.builtin_capi import *
 
 from pypy.module.cppyy.capi.capi_types import C_OBJECT,\
     C_NULL_TYPE, C_NULL_OBJECT
diff --git a/pypy/module/cppyy/capi/loadable_capi.py b/pypy/module/cppyy/capi/loadable_capi.py
--- a/pypy/module/cppyy/capi/loadable_capi.py
+++ b/pypy/module/cppyy/capi/loadable_capi.py
@@ -11,7 +11,7 @@
    C_METHOD, C_INDEX, C_INDEX_ARRAY, WLAVC_INDEX, C_METHPTRGETTER_PTR
 
 
-reflection_library = 'rflxlib.so'
+reflection_library = 'libcppyy_backend.so'
 
 def identify():
     return 'loadable_capi'
@@ -231,7 +231,7 @@
     except Exception:
         if objectmodel.we_are_translated():
             raise OperationError(space.w_ImportError,
-                                 space.wrap("missing reflection library rflxlib.so"))
+                                 space.wrap("missing reflection library %s" % reflection_library))
         return False
     return True
 
diff --git a/pypy/module/cppyy/test/Makefile b/pypy/module/cppyy/test/Makefile
--- a/pypy/module/cppyy/test/Makefile
+++ b/pypy/module/cppyy/test/Makefile
@@ -3,7 +3,11 @@
 std_streamsDict.so iotypesDict.so
 all : $(dicts)
 
-ROOTSYS := ${ROOTSYS}
+ifneq (${REFLEXHOME},)
+  ROOTSYS := ${REFLEXHOME}
+else
+  ROOTSYS := ${ROOTSYS}
+endif
 
 ifeq ($(ROOTSYS),)
   genreflex=genreflex
diff --git a/pypy/module/posix/app_posix.py b/pypy/module/posix/app_posix.py
--- a/pypy/module/posix/app_posix.py
+++ b/pypy/module/posix/app_posix.py
@@ -20,7 +20,7 @@
 class stat_result:
     __metaclass__ = structseqtype
 
-    name = "posix.stat_result"
+    name = osname + ".stat_result"
 
     st_mode  = structseqfield(0, "protection bits")
     st_ino   = structseqfield(1, "inode")
diff --git a/pypy/module/posix/test/test_posix2.py b/pypy/module/posix/test/test_posix2.py
--- a/pypy/module/posix/test/test_posix2.py
+++ b/pypy/module/posix/test/test_posix2.py
@@ -159,6 +159,7 @@
         assert st.st_atime == 41
         assert st.st_mtime == 42.1
         assert st.st_ctime == 43
+        assert repr(st).startswith(self.posix.__name__ + '.stat_result')
 
     def test_stat_lstat(self):
         import stat
diff --git a/pypy/module/thread/threadlocals.py b/pypy/module/thread/threadlocals.py
--- a/pypy/module/thread/threadlocals.py
+++ b/pypy/module/thread/threadlocals.py
@@ -52,7 +52,7 @@
 
     def signals_enabled(self):
         ec = self.getvalue()
-        return ec._signals_enabled
+        return ec is not None and ec._signals_enabled
 
     def enable_signals(self, space):
         ec = self.getvalue()
@@ -72,10 +72,12 @@
     def leave_thread(self, space):
         "Notification that the current thread is about to stop."
         from pypy.module.thread.os_local import thread_is_stopping
-        try:
-            thread_is_stopping(self.getvalue())
-        finally:
-            self.setvalue(None)
+        ec = self.getvalue()
+        if ec is not None:
+            try:
+                thread_is_stopping(ec)
+            finally:
+                self.setvalue(None)
 
     def reinit_threads(self, space):
         "Called in the child process after a fork()"
diff --git a/pypy/pytest-A.py b/pypy/pytest-A.py
new file mode 100644
--- /dev/null
+++ b/pypy/pytest-A.py
@@ -0,0 +1,31 @@
+# custom test collection for the app-level testrunner
+import platform
+
+DIRS_SPLIT = {
+    'arm': ['interpreter/astcompiler/test',
+            'interpreter/pyparser/test',
+            'interpreter/test',
+            'interpreter/test2',
+            'objspace/std/test',
+    ],
+}
+
+
+def get_arch():
+    arch = platform.machine().lower()
+    if arch.startswith('arm'):
+        return 'arm'
+    if arch.startswith('x86'):
+        return 'x86'
+    return arch
+
+
+def collect_one_testdir(testdirs, reldir, tests):
+    arch = get_arch()
+    dirsplit = DIRS_SPLIT.get(arch, [])
+    for dir in dirsplit:
+        if reldir.startswith(dir):
+            testdirs.extend(tests)
+            break
+    else:
+        testdirs.append(reldir)
diff --git a/pypy/tool/pypyjit.py b/pypy/tool/pypyjit.py
--- a/pypy/tool/pypyjit.py
+++ b/pypy/tool/pypyjit.py
@@ -41,7 +41,6 @@
 config.objspace.usemodules._lsprof = False
 #
 config.objspace.usemodules._ffi = True
-#config.objspace.usemodules.cppyy = True
 config.objspace.usemodules.micronumpy = False
 #
 set_pypy_opt_level(config, level='jit')
diff --git a/rpython/annotator/model.py b/rpython/annotator/model.py
--- a/rpython/annotator/model.py
+++ b/rpython/annotator/model.py
@@ -527,6 +527,7 @@
 s_Int = SomeInteger()
 s_ImpossibleValue = SomeImpossibleValue()
 s_Str0 = SomeString(no_nul=True)
+s_Unicode0 = SomeUnicodeString(no_nul=True)
 
 
 # ____________________________________________________________
diff --git a/rpython/jit/backend/arm/assembler.py b/rpython/jit/backend/arm/assembler.py
--- a/rpython/jit/backend/arm/assembler.py
+++ b/rpython/jit/backend/arm/assembler.py
@@ -3,6 +3,7 @@
 import os
 
 from rpython.jit.backend.arm import conditions as c, registers as r
+from rpython.jit.backend.arm import shift
 from rpython.jit.backend.arm.arch import (WORD, DOUBLE_WORD, FUNC_ALIGN,
     JITFRAME_FIXED_SIZE)
 from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
@@ -12,8 +13,9 @@
     CoreRegisterManager, check_imm_arg, VFPRegisterManager,
     operations as regalloc_operations,
     operations_with_guard as regalloc_operations_with_guard)
-from rpython.jit.backend.llsupport import jitframe
+from rpython.jit.backend.llsupport import jitframe, rewrite
 from rpython.jit.backend.llsupport.assembler import DEBUG_COUNTER, debug_bridge, BaseAssembler
+from rpython.jit.backend.llsupport.regalloc import get_scale, valid_addressing_size
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.jit.backend.model import CompiledLoopToken
 from rpython.jit.codewriter.effectinfo import EffectInfo
@@ -85,7 +87,7 @@
         ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
         # make sure ofs fits into a register
         assert check_imm_arg(ofs)
-        mc.LDR_ri(r.r0.value, r.fp.value, imm=ofs)
+        self.store_reg(mc, r.r0, r.fp, ofs)
         propagate_exception_descr = rffi.cast(lltype.Signed,
                   cast_instance_to_gcref(self.cpu.propagate_exception_descr))
         # put propagate_exception_descr into frame
@@ -93,7 +95,7 @@
         # make sure ofs fits into a register
         assert check_imm_arg(ofs)
         mc.gen_load_int(r.r0.value, propagate_exception_descr)
-        mc.STR_ri(r.r0.value, r.fp.value, imm=ofs)
+        self.store_reg(mc, r.r0, r.fp, ofs)
         mc.MOV_rr(r.r0.value, r.fp.value)
         self.gen_func_epilog(mc)
         rawstart = mc.materialize(self.cpu.asmmemmgr, [])
@@ -115,12 +117,18 @@
             # store exc_value in JITFRAME
             ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
             assert check_imm_arg(ofs)
-            self.store_reg(mc, r.ip, r.fp, ofs)
+            #
+            self.load_reg(mc, r.ip, r.ip, helper=tmpreg)
+            #
+            self.store_reg(mc, r.ip, r.fp, ofs, helper=tmpreg)
         if exctploc is not None:
             # store pos_exception in exctploc
             assert exctploc.is_reg()
             mc.gen_load_int(r.ip.value, self.cpu.pos_exception())
-            self.load_reg(mc, exctploc, r.ip)
+            self.load_reg(mc, exctploc, r.ip, helper=tmpreg)
+
+        if on_frame or exctploc is not None:
+            mc.gen_load_int(r.ip.value, self.cpu.pos_exc_value())
 
         # reset exception
         mc.gen_load_int(tmpreg.value, 0)
@@ -250,27 +258,59 @@
             self.wb_slowpath[withcards + 2 * withfloats] = rawstart
 
     def _build_malloc_slowpath(self, kind):
-        if kind != 'fixed':
-            return 0
+        """ While arriving on slowpath, we have a gcpattern on stack 0.
+        The arguments are passed in r0 and r1, as follows:
+
+        kind == 'fixed': nursery_head in r0 and the size in r1 - r0.
+
+        kind == 'str/unicode': length of the string to allocate in r1.
+
+        kind == 'var': length to allocate in r1, tid in r0,
+                       and itemsize on the stack.
+
+        This function must preserve all registers apart from r0 and r1.
+        """
+        assert kind in ['fixed', 'str', 'unicode', 'var']
         mc = InstrBuilder(self.cpu.cpuinfo.arch_version)
+        #
         self._push_all_regs_to_jitframe(mc, [r.r0, r.r1], self.cpu.supports_floats)
+        #
+        if kind == 'fixed':
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        elif kind == 'str':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
+        elif kind == 'unicode':
+            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
+        else:
+            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
+        if kind == 'fixed':
+            # stack layout: [gcmap]
+            # At this point we know that the values we need to compute the size
+            # are stored in r0 and r1.
+            mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
+
+            if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
+                mc.MOV_rr(r.r1.value, r.fp.value)
+        elif kind == 'str' or kind == 'unicode':
+            # stack layout: [gcmap]
+            mc.MOV_rr(r.r0.value, r.r1.value)
+        else:  # var
+            # stack layout: [gcmap][itemsize]...
+            # tid is in r0
+            # length is in r1
+            mc.MOV_rr(r.r2.value, r.r1.value)
+            mc.MOV_rr(r.r1.value, r.r0.value)
+            mc.POP([r.r0.value])  # load itemsize
+        # store the gc pattern
+        mc.POP([r.r4.value])
         ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
-        # store the gc pattern
-        mc.POP([r.r2.value])
-        self.store_reg(mc, r.r2, r.fp, ofs)
+        self.store_reg(mc, r.r4, r.fp, ofs)
+        #
         # We need to push two registers here because we are going to make a
         # call an therefore the stack needs to be 8-byte aligned
         mc.PUSH([r.ip.value, r.lr.value])
-        # At this point we know that the values we need to compute the size
-        # are stored in r0 and r1.
-        mc.SUB_rr(r.r0.value, r.r1.value, r.r0.value) # compute the size we want
-
-        if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
-            mc.MOV_rr(r.r1.value, r.fp.value)
-
-        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
+        #
         mc.BL(addr)
-
         #
         # If the slowpath malloc failed, we raise a MemoryError that
         # always interrupts the current loop, as a "good enough"
@@ -289,7 +329,10 @@
         self.store_reg(mc, r.ip, r.fp, ofs)
         # return
         mc.POP([r.ip.value, r.pc.value])
-        return mc.materialize(self.cpu.asmmemmgr, [])
+
+        #
+        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
+        return rawstart
 
     def _reload_frame_if_necessary(self, mc):
         gcrootmap = self.cpu.gc_ll_descr.gcrootmap
@@ -1168,21 +1211,17 @@
         else:
             raise AssertionError('Trying to pop to an invalid location')
 
-    def malloc_cond(self, nursery_free_adr, nursery_top_adr, sizeloc, gcmap):
-        if sizeloc.is_imm():     # must be correctly aligned
-            assert sizeloc.value & (WORD-1) == 0
+    def malloc_cond(self, nursery_free_adr, nursery_top_adr, size, gcmap):
+        assert size & (WORD-1) == 0
 
         self.mc.gen_load_int(r.r0.value, nursery_free_adr)
         self.mc.LDR_ri(r.r0.value, r.r0.value)
 
-        if sizeloc.is_imm():
-            if check_imm_arg(sizeloc.value):
-                self.mc.ADD_ri(r.r1.value, r.r0.value, sizeloc.value)
-            else:
-                self.mc.gen_load_int(r.r1.value, sizeloc.value)
-                self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
+        if check_imm_arg(size):
+            self.mc.ADD_ri(r.r1.value, r.r0.value, size)
         else:
-            self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+            self.mc.gen_load_int(r.r1.value, size)
+            self.mc.ADD_rr(r.r1.value, r.r0.value, r.r1.value)
 
         self.mc.gen_load_int(r.ip.value, nursery_top_adr)
         self.mc.LDR_ri(r.ip.value, r.ip.value)
@@ -1203,6 +1242,128 @@
         self.mc.gen_load_int(r.ip.value, nursery_free_adr)
         self.mc.STR_ri(r.r1.value, r.ip.value)
 
+    def malloc_cond_varsize_frame(self, nursery_free_adr, nursery_top_adr,
+                                  sizeloc, gcmap):
+        if sizeloc is r.r0:  # r0 is overwritten below, so keep the size in r1
+            self.mc.MOV_rr(r.r1.value, r.r0.value)
+            sizeloc = r.r1
+        self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.r0.value, r.r0.value)
+        # r0 = word read from nursery_free_adr; now r1 = r0 + size
+        self.mc.ADD_rr(r.r1.value, r.r0.value, sizeloc.value)
+        # ip = word read from nursery_top_adr
+        self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip.value, r.ip.value)
+        # if r1 is above ip (condition HI): push the gcmap, call the slowpath
+        self.mc.CMP_rr(r.r1.value, r.ip.value)
+        #
+        self.push_gcmap(self.mc, gcmap, push=True, cond=c.HI)
+
+        self.mc.BL(self.malloc_slowpath, c=c.HI)
+        # store r1 at nursery_free_adr (emitted unconditionally)
+        self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+        self.mc.STR_ri(r.r1.value, r.ip.value)
+
+    def malloc_cond_varsize(self, kind, nursery_free_adr, nursery_top_adr,
+                            lengthloc, itemsize, maxlength, gcmap,
+                            arraydescr):
+        from rpython.jit.backend.llsupport.descr import ArrayDescr
+        assert isinstance(arraydescr, ArrayDescr)
+        # Emit an inline nursery allocation; otherwise call a malloc slowpath.
+        # lengthloc is the length of the array, which we must not modify!
+        assert lengthloc is not r.r0 and lengthloc is not r.r1
+        if lengthloc.is_reg():
+            varsizeloc = lengthloc
+        else:
+            assert lengthloc.is_stack()
+            self.regalloc_mov(lengthloc, r.r1)
+            varsizeloc = r.r1
+        # compare the length with maxlength; if greater, branch to (large)
+        if check_imm_arg(maxlength):
+            self.mc.CMP_ri(varsizeloc.value, maxlength)
+        else:
+            self.mc.gen_load_int(r.ip.value, maxlength)
+            self.mc.CMP_rr(varsizeloc.value, r.ip.value)
+        jmp_adr0 = self.mc.currpos()  # jump to (large)
+        self.mc.BKPT()  # placeholder, overwritten below by the real branch
+        # r0 = word read from nursery_free_adr
+        self.mc.gen_load_int(r.r0.value, nursery_free_adr)
+        self.mc.LDR_ri(r.r0.value, r.r0.value)
+
+        # choose the register and shift giving the size of the variable part
+        if valid_addressing_size(itemsize):
+            shiftsize = get_scale(itemsize)
+        else:
+            shiftsize = self._mul_const_scaled(self.mc, r.lr, varsizeloc,
+                                                itemsize)
+            varsizeloc = r.lr
+        # now varsizeloc is a register != r0.  The size of
+        # the variable part of the array is (varsizeloc << shiftsize)
+        assert arraydescr.basesize >= self.gc_minimal_size_in_nursery
+        constsize = arraydescr.basesize + self.gc_size_of_header
+        force_realignment = (itemsize % WORD) != 0
+        if force_realignment:
+            constsize += WORD - 1
+        self.mc.gen_load_int(r.ip.value, constsize)
+        # constsize + (varsizeloc << shiftsize)
+        self.mc.ADD_rr(r.r1.value, r.ip.value, varsizeloc.value,
+                                imm=shiftsize, shifttype=shift.LSL)
+        self.mc.ADD_rr(r.r1.value, r.r1.value, r.r0.value)
+        if force_realignment:
+            self.mc.MVN_ri(r.ip.value, imm=(WORD - 1))
+            self.mc.AND_rr(r.r1.value, r.r1.value, r.ip.value)
+        # now r1 contains the total size in bytes, rounded up to a multiple
+        # of WORD, plus nursery_free_adr
+        #
+        self.mc.gen_load_int(r.ip.value, nursery_top_adr)
+        self.mc.LDR_ri(r.ip.value, r.ip.value)
+
+        self.mc.CMP_rr(r.r1.value, r.ip.value)
+        jmp_adr1 = self.mc.currpos()  # jump to (after-call)
+        self.mc.BKPT()  # placeholder, overwritten below by the real branch
+        #
+        # (large)
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_adr0, WORD)
+        pmc.B_offs(currpos, c.GT)
+        #
+        # save the gcmap
+        self.push_gcmap(self.mc, gcmap, push=True)
+        # set up the arguments expected by the slowpath chosen for 'kind'
+        if kind == rewrite.FLAG_ARRAY:
+            self.mc.gen_load_int(r.r0.value, arraydescr.tid)
+            self.regalloc_mov(lengthloc, r.r1)
+            self.regalloc_push(imm(itemsize))
+            addr = self.malloc_slowpath_varsize
+        else:
+            if kind == rewrite.FLAG_STR:
+                addr = self.malloc_slowpath_str
+            else:
+                assert kind == rewrite.FLAG_UNICODE
+                addr = self.malloc_slowpath_unicode
+            self.regalloc_mov(lengthloc, r.r1)
+        self.mc.BL(addr)
+        #
+        jmp_location = self.mc.currpos()  # jump to (done)
+        self.mc.BKPT()  # placeholder, overwritten below by the real branch
+        # (after-call)
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_adr1, WORD)
+        pmc.B_offs(currpos, c.LS)
+        #
+        # write down the tid, but not if it's the result of the CALL
+        self.mc.gen_load_int(r.ip.value, arraydescr.tid)
+        self.mc.STR_ri(r.ip.value, r.r0.value)
+
+        # while we're at it, this line is not needed if we've done the CALL
+        self.mc.gen_load_int(r.ip.value, nursery_free_adr)
+        self.mc.STR_ri(r.r1.value, r.ip.value)
+        # (done)
+        # skip instructions after call
+        currpos = self.mc.currpos()
+        pmc = OverwritingBuilder(self.mc, jmp_location, WORD)
+        pmc.B_offs(currpos)
+
     def push_gcmap(self, mc, gcmap, push=False, store=False, cond=c.AL):
         ptr = rffi.cast(lltype.Signed, gcmap)
         if push:
@@ -1220,6 +1381,32 @@
         mc.gen_load_int(r.ip.value, 0)
         self.store_reg(mc, r.ip, r.fp, ofs)
 
+    def _mul_const_scaled(self, mc, targetreg, sourcereg, itemsize):
+        """Emit into 'mc' the code that computes roughly
+               targetreg = sourcereg * itemsize
+           Returns the shift (0..3) the caller must still apply to targetreg.
+        """
+        if (itemsize & 7) == 0:
+            shiftsize = 3
+        elif (itemsize & 3) == 0:
+            shiftsize = 2
+        elif (itemsize & 1) == 0:
+            shiftsize = 1
+        else:
+            shiftsize = 0
+        itemsize >>= shiftsize
+        # always emit into the 'mc' argument (callers may pass any builder)
+        if valid_addressing_size(itemsize - 1):
+            # itemsize == 2**k + 1: targetreg = sourcereg + (sourcereg << k)
+            mc.ADD_rr(targetreg.value, sourcereg.value, sourcereg.value,
+                      imm=get_scale(itemsize - 1), shifttype=shift.LSL)
+        elif valid_addressing_size(itemsize):
+            mc.LSL_ri(targetreg.value, sourcereg.value, get_scale(itemsize))
+        else:
+            mc.gen_load_int(targetreg.value, itemsize)
+            mc.MUL(targetreg.value, sourcereg.value, targetreg.value)
+        #
+        return shiftsize
 
 def not_implemented(msg):
     os.write(2, '[ARM/asm] %s\n' % msg)
diff --git a/rpython/jit/backend/arm/detect.py b/rpython/jit/backend/arm/detect.py
--- a/rpython/jit/backend/arm/detect.py
+++ b/rpython/jit/backend/arm/detect.py
@@ -2,6 +2,7 @@
 
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
 from rpython.rtyper.tool import rffi_platform
+from rpython.rlib.clibffi import FFI_DEFAULT_ABI, FFI_SYSV, FFI_VFP
 from rpython.translator.platform import CompilationError
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 
diff --git a/rpython/jit/backend/arm/opassembler.py b/rpython/jit/backend/arm/opassembler.py
--- a/rpython/jit/backend/arm/opassembler.py
+++ b/rpython/jit/backend/arm/opassembler.py
@@ -18,7 +18,7 @@
 from rpython.jit.backend.arm.helper.regalloc import check_imm_arg
 from rpython.jit.backend.arm.codebuilder import InstrBuilder, OverwritingBuilder
 from rpython.jit.backend.arm.jump import remap_frame_layout
-from rpython.jit.backend.arm.regalloc import TempInt, TempPtr
+from rpython.jit.backend.arm.regalloc import TempBox
 from rpython.jit.backend.arm.locations import imm
 from rpython.jit.backend.llsupport import symbolic
 from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
@@ -191,7 +191,7 @@
         return fcond
 
     def _emit_guard(self, op, arglocs, fcond, save_exc,
-                                    is_guard_not_invalidated=False, 
+                                    is_guard_not_invalidated=False,
                                     is_guard_not_forced=False):
         assert isinstance(save_exc, bool)
         assert isinstance(fcond, int)
@@ -297,7 +297,7 @@
         return self._emit_guard(op, locs, fcond, save_exc=False,
                                             is_guard_not_invalidated=True)
 
-    def emit_op_label(self, op, arglocs, regalloc, fcond): 
+    def emit_op_label(self, op, arglocs, regalloc, fcond):
         self._check_frame_depth_debug(self.mc)
         return fcond
 
@@ -1003,57 +1003,33 @@
     def _emit_copystrcontent(self, op, regalloc, fcond, is_unicode):
         # compute the source address
         args = op.getarglist()
-        base_loc = regalloc.make_sure_var_in_reg(args[0], args)
-        ofs_loc = regalloc.make_sure_var_in_reg(args[2], args)
+        base_loc = regalloc.rm.make_sure_var_in_reg(args[0], args)
+        ofs_loc = regalloc.rm.make_sure_var_in_reg(args[2], args)
         assert args[0] is not args[1]    # forbidden case of aliasing
-        regalloc.possibly_free_var(args[0])
-        regalloc.free_temp_vars()
-        if args[3] is not args[2] is not args[4]:  # MESS MESS MESS: don't free
-            regalloc.possibly_free_var(args[2])  # it if ==args[3] or args[4]
-            regalloc.free_temp_vars()
-        srcaddr_box = TempPtr()
+        srcaddr_box = TempBox()
         forbidden_vars = [args[1], args[3], args[4], srcaddr_box]
-        srcaddr_loc = regalloc.force_allocate_reg(srcaddr_box,
-                                                        selected_reg=r.r1)
+        srcaddr_loc = regalloc.rm.force_allocate_reg(srcaddr_box, forbidden_vars)
         self._gen_address_inside_string(base_loc, ofs_loc, srcaddr_loc,
                                         is_unicode=is_unicode)
-
         # compute the destination address
-        forbidden_vars = [args[4], args[3], srcaddr_box]
-        dstaddr_box = TempPtr()
-        dstaddr_loc = regalloc.force_allocate_reg(dstaddr_box,
-                                                        selected_reg=r.r0)
-        forbidden_vars.append(dstaddr_box)
-        base_loc = regalloc.make_sure_var_in_reg(args[1], forbidden_vars)
-        ofs_loc = regalloc.make_sure_var_in_reg(args[3], forbidden_vars)
-        assert base_loc.is_reg()
-        assert ofs_loc.is_reg()
-        regalloc.possibly_free_var(args[1])
-        if args[3] is not args[4]:     # more of the MESS described above
-            regalloc.possibly_free_var(args[3])
-        regalloc.free_temp_vars()
+        base_loc = regalloc.rm.make_sure_var_in_reg(args[1], forbidden_vars)
+        ofs_loc = regalloc.rm.make_sure_var_in_reg(args[3], forbidden_vars)
+        forbidden_vars = [args[4], srcaddr_box]
+        dstaddr_box = TempBox()
+        dstaddr_loc = regalloc.rm.force_allocate_reg(dstaddr_box, forbidden_vars)
         self._gen_address_inside_string(base_loc, ofs_loc, dstaddr_loc,
                                         is_unicode=is_unicode)
-
         # compute the length in bytes
-        forbidden_vars = [srcaddr_box, dstaddr_box]
-        # XXX basically duplicates regalloc.ensure_value_is_boxed, but we
-        # need the box here
-        if isinstance(args[4], Box):
-            length_box = args[4]
-            length_loc = regalloc.make_sure_var_in_reg(args[4],
-                                                        forbidden_vars)
-        else:
-            length_box = TempInt()
-            length_loc = regalloc.force_allocate_reg(length_box,
-                                        forbidden_vars, selected_reg=r.r2)
-            immloc = regalloc.convert_to_imm(args[4])
-            self.load(length_loc, immloc)
+        length_box = args[4]
+        length_loc = regalloc.loc(length_box)
         if is_unicode:
-            bytes_box = TempPtr()
-            bytes_loc = regalloc.force_allocate_reg(bytes_box,
-                                        forbidden_vars, selected_reg=r.r2)
+            forbidden_vars = [srcaddr_box, dstaddr_box]
+            bytes_box = TempBox()
+            bytes_loc = regalloc.rm.force_allocate_reg(bytes_box, forbidden_vars)
             scale = self._get_unicode_item_scale()
+            if not length_loc.is_reg():
+                self.regalloc_mov(length_loc, bytes_loc)
+                length_loc = bytes_loc
             assert length_loc.is_reg()
             self.mc.MOV_ri(r.ip.value, 1 << scale)
             self.mc.MUL(bytes_loc.value, r.ip.value, length_loc.value)
@@ -1062,12 +1038,11 @@
         # call memcpy()
         regalloc.before_call()
         self._emit_call(imm(self.memcpy_addr),
-                            [dstaddr_loc, srcaddr_loc, length_loc],
-                            can_collect=False)
-
-        regalloc.possibly_free_var(length_box)
-        regalloc.possibly_free_var(dstaddr_box)
-        regalloc.possibly_free_var(srcaddr_box)
+                                  [dstaddr_loc, srcaddr_loc, length_loc],
+                                  can_collect=False)
+        regalloc.rm.possibly_free_var(length_box)
+        regalloc.rm.possibly_free_var(dstaddr_box)
+        regalloc.rm.possibly_free_var(srcaddr_box)
 
     def _gen_address_inside_string(self, baseloc, ofsloc, resloc, is_unicode):
         if is_unicode:
@@ -1079,21 +1054,21 @@
                                               self.cpu.translate_support_code)
             assert itemsize == 1
             scale = 0
-        self._gen_address(ofsloc, ofs_items, scale, resloc, baseloc)
+        self._gen_address(resloc, baseloc, ofsloc, scale, ofs_items)
 
-    def _gen_address(self, sizereg, baseofs, scale, result, baseloc=None):
-        assert sizereg.is_reg()
+    # result = base_loc + (scaled_loc << scale) + static_offset
+    def _gen_address(self, result, base_loc, scaled_loc, scale=0, static_offset=0):
+        assert scaled_loc.is_reg()
+        assert base_loc.is_reg()
+        assert check_imm_arg(scale)
+        assert check_imm_arg(static_offset)
         if scale > 0:
+            self.mc.LSL_ri(r.ip.value, scaled_loc.value, scale)
             scaled_loc = r.ip
-            self.mc.LSL_ri(r.ip.value, sizereg.value, scale)
         else:
-            scaled_loc = sizereg
-        if baseloc is not None:
-            assert baseloc.is_reg()
-            self.mc.ADD_rr(result.value, baseloc.value, scaled_loc.value)
-            self.mc.ADD_ri(result.value, result.value, baseofs)
-        else:
-            self.mc.ADD_ri(result.value, scaled_loc.value, baseofs)
+            scaled_loc = scaled_loc
+        self.mc.ADD_rr(result.value, base_loc.value, scaled_loc.value)
+        self.mc.ADD_ri(result.value, result.value, static_offset)
 
     def _get_unicode_item_scale(self):
         _, itemsize, _ = symbolic.get_array_token(rstr.UNICODE,
@@ -1320,6 +1295,7 @@
         with saved_registers(self.mc, regs_to_save, vfp_regs_to_save):
             self._emit_call(imm(self.reacqgil_addr), [], fcond,
                     can_collect=False)
+        self._reload_frame_if_necessary(self.mc)
 
     def _store_force_index(self, guard_op):
         faildescr = guard_op.getdescr()
@@ -1334,23 +1310,6 @@
         self._alignment_check()
         return fcond
 
-    def emit_op_call_malloc_nursery(self, op, arglocs, regalloc, fcond):
-        # registers r0 and r1 are allocated for this call
-        assert len(arglocs) == 1
-        sizeloc = arglocs[0]
-        gc_ll_descr = self.cpu.gc_ll_descr
-        gcmap = regalloc.get_gcmap([r.r0, r.r1])
-        self.malloc_cond(
-            gc_ll_descr.get_nursery_free_addr(),
-            gc_ll_descr.get_nursery_top_addr(),
-            sizeloc,
-            gcmap
-            )
-        self._alignment_check()
-        return fcond
-    emit_op_call_malloc_nursery_varsize_frame = emit_op_call_malloc_nursery
-
-
     def _alignment_check(self):
         if not self.debug:
             return
@@ -1436,7 +1395,7 @@
         self.mc.VCVT_f64_f32(r.vfp_ip.value, arg.value)
         self.mc.VMOV_rc(res.value, r.ip.value, r.vfp_ip.value)
         return fcond
-    
+
     def emit_op_cast_singlefloat_to_float(self, op, arglocs, regalloc, fcond):
         arg, res = arglocs
         assert res.is_vfp_reg()
diff --git a/rpython/jit/backend/arm/regalloc.py b/rpython/jit/backend/arm/regalloc.py
--- a/rpython/jit/backend/arm/regalloc.py
+++ b/rpython/jit/backend/arm/regalloc.py
@@ -2,7 +2,8 @@
 from rpython.rlib import rgc
 from rpython.rlib.debug import debug_print, debug_start, debug_stop
 from rpython.jit.backend.llsupport.regalloc import FrameManager, \
-        RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc
+        RegisterManager, TempBox, compute_vars_longevity, BaseRegalloc, \
+        get_scale
 from rpython.jit.backend.arm import registers as r
 from rpython.jit.backend.arm import locations
 from rpython.jit.backend.arm.locations import imm, get_fp_offset
@@ -1011,20 +1012,72 @@
         self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
         t = TempInt()
         self.rm.force_allocate_reg(t, selected_reg=r.r1)
+
+        sizeloc = size_box.getint()
+        gc_ll_descr = self.cpu.gc_ll_descr
+        gcmap = self.get_gcmap([r.r0, r.r1])
         self.possibly_free_var(t)
-        return [imm(size)]
+        self.assembler.malloc_cond(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            sizeloc,
+            gcmap
+            )
+        self.assembler._alignment_check()
 
     def prepare_op_call_malloc_nursery_varsize_frame(self, op, fcond):
         size_box = op.getarg(0)
-        assert isinstance(size_box, BoxInt)
-
+        assert isinstance(size_box, BoxInt) # we cannot have a const here!
+        # sizeloc must be in a register, but we can free it now
+        # (we take care explicitly of conflicts with r0 or r1)
+        sizeloc = self.rm.make_sure_var_in_reg(size_box)
+        self.rm.possibly_free_var(size_box)
+        #
         self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+        #
         t = TempInt()
         self.rm.force_allocate_reg(t, selected_reg=r.r1)
-        argloc = self.make_sure_var_in_reg(size_box,
-                                            forbidden_vars=[op.result, t])
+        #
+        gcmap = self.get_gcmap([r.r0, r.r1])
         self.possibly_free_var(t)
-        return [argloc]
+        #
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        self.assembler.malloc_cond_varsize_frame(
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            sizeloc,
+            gcmap
+            )
+        self.assembler._alignment_check()
+
+    def prepare_op_call_malloc_nursery_varsize(self, op, fcond):
+        gc_ll_descr = self.assembler.cpu.gc_ll_descr
+        if not hasattr(gc_ll_descr, 'max_size_of_young_obj'):
+            raise Exception("unreachable code")
+            # for boehm, this function should never be called
+        arraydescr = op.getdescr()
+        length_box = op.getarg(2)
+        assert isinstance(length_box, BoxInt) # we cannot have a const here!
+        # the result will be in r0
+        self.rm.force_allocate_reg(op.result, selected_reg=r.r0)
+        # we need r1 as a temporary
+        tmp_box = TempBox()
+        self.rm.force_allocate_reg(tmp_box, selected_reg=r.r1)
+        gcmap = self.get_gcmap([r.r0, r.r1]) # allocate the gcmap *before*
+        self.rm.possibly_free_var(tmp_box)
+        # length_box always survives: it's typically also present in the
+        # next operation that will copy it inside the new array.  It's
+        # fine to load it from the stack too, as long as it's != r0, r1.
+        lengthloc = self.rm.loc(length_box)
+        self.rm.possibly_free_var(length_box)
+        # operands: arg 0 is the 'kind' flag, arg 1 the itemsize, arg 2 the length
+        itemsize = op.getarg(1).getint()
+        maxlength = (gc_ll_descr.max_size_of_young_obj - WORD * 2) / itemsize
+        self.assembler.malloc_cond_varsize(
+            op.getarg(0).getint(),
+            gc_ll_descr.get_nursery_free_addr(),
+            gc_ll_descr.get_nursery_top_addr(),
+            lengthloc, itemsize, maxlength, gcmap, arraydescr)
 
     prepare_op_debug_merge_point = void
     prepare_op_jit_debug = void
@@ -1211,13 +1264,6 @@
 operations_with_guard = [notimplemented_with_guard] * (rop._LAST + 1)
 
 
-def get_scale(size):
-    scale = 0
-    while (1 << scale) < size:
-        scale += 1
-    assert (1 << scale) == size
-    return scale
-
 for key, value in rop.__dict__.items():
     key = key.lower()
     if key.startswith('_'):
diff --git a/rpython/jit/backend/arm/runner.py b/rpython/jit/backend/arm/runner.py
--- a/rpython/jit/backend/arm/runner.py
+++ b/rpython/jit/backend/arm/runner.py
@@ -23,6 +23,7 @@
     supports_longlong = False # XXX requires an implementation of
                               # read_timestamp that works in user mode
     supports_singlefloats = not detect_hardfloat()
+    can_inline_varsize_malloc = True
 
     from rpython.jit.backend.arm.arch import JITFRAME_FIXED_SIZE
     all_reg_indexes = range(len(all_regs))
diff --git a/rpython/jit/backend/x86/test/test_exception.py b/rpython/jit/backend/arm/test/test_exception.py
copy from rpython/jit/backend/x86/test/test_exception.py
copy to rpython/jit/backend/arm/test/test_exception.py
--- a/rpython/jit/backend/x86/test/test_exception.py
+++ b/rpython/jit/backend/arm/test/test_exception.py
@@ -1,9 +1,9 @@
 
 import py
-from rpython.jit.backend.x86.test.test_basic import Jit386Mixin
+from rpython.jit.backend.arm.test.support import JitARMMixin
 from rpython.jit.metainterp.test.test_exception import ExceptionTests
 
-class TestExceptions(Jit386Mixin, ExceptionTests):
+class TestExceptions(JitARMMixin, ExceptionTests):
     # for the individual tests see
     # ====> ../../../metainterp/test/test_exception.py
 
diff --git a/rpython/jit/backend/detect_cpu.py b/rpython/jit/backend/detect_cpu.py
--- a/rpython/jit/backend/detect_cpu.py
+++ b/rpython/jit/backend/detect_cpu.py
@@ -2,12 +2,40 @@
 Processor auto-detection
 """
 import sys, os
+from rpython.rtyper.tool.rffi_platform import getdefined
+from rpython.translator.platform import is_host_build
 
 
 class ProcessorAutodetectError(Exception):
     pass
 
+
+def detect_main_model_and_size_from_platform():
+    """Return '<model>_<size>' (e.g. 'x86_64') chosen from compiler macros."""
+    # based on http://sourceforge.net/p/predef/wiki/Architectures/
+    macros_by_target = {
+            ('x86', '64'): [
+                '__amd64__', '__amd64', '__x86_64__', '__x86_64',  # AMD64
+                '__ia64__', '_IA64', '__IA64__'                    # Intel Itanium (IA-64)
+                ],
+            ('arm', '32'): ['__arm__', '__thumb__'],
+            ('x86', '32'): ['i386', '__i386', '__i386__', '__i686__',],
+            ('ppc', '64'): ['__powerpc64__'],
+    }
+    for target, macros in macros_by_target.iteritems():
+        for name in macros:
+            if getdefined(name, ''):
+                return '_'.join(target)
+    raise ProcessorAutodetectError, "Cannot detect processor using compiler macros"
+
+
+def detect_main_model_from_platform():
+    return detect_main_model_and_size_from_platform().split('_')[0]  # 'x86_64' -> 'x86'
+
+
 def autodetect_main_model():
+    if not is_host_build():
+        return detect_main_model_from_platform()
     mach = None
     try:
         import platform
@@ -40,6 +68,8 @@
         return mach
 
 def autodetect_main_model_and_size():
+    if not is_host_build():
+        return detect_main_model_and_size_from_platform()
     model = autodetect_main_model()
     if sys.maxint == 2**31-1:
         model += '_32'
diff --git a/rpython/jit/backend/llsupport/regalloc.py b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -740,6 +740,16 @@
     op = Fake(None)
     return op.is_comparison() or op.is_ovf()
 
+def valid_addressing_size(size):
+    return size in (1, 2, 4, 8)
+
+def get_scale(size):
+    """Return log2(size) for a valid addressing size (1, 2, 4 or 8)."""
+    assert valid_addressing_size(size)
+    if size >= 4:
+        return (size >> 2) + 1  # 4, 8 => 2, 3
+    return size - 1             # 1, 2 => 0, 1
+
 
 def not_implemented(msg):
     os.write(2, '[llsupport/regalloc] %s\n' % msg)
diff --git a/rpython/jit/backend/test/runner_test.py b/rpython/jit/backend/test/runner_test.py
--- a/rpython/jit/backend/test/runner_test.py
+++ b/rpython/jit/backend/test/runner_test.py
@@ -3943,7 +3943,8 @@
         a = lltype.malloc(A, 2, flavor='raw')
         a[0] = rffi.cast(rffi.SHORT, 666)
         a[1] = rffi.cast(rffi.SHORT, 777)
-        a_int = rffi.cast(lltype.Signed, a)
+        addr = llmemory.cast_ptr_to_adr(a)
+        a_int = heaptracker.adr2int(addr)
         print 'a_int:', a_int
         self.execute_operation(rop.SETARRAYITEM_RAW,
                                [ConstInt(a_int), ConstInt(0), ConstInt(-7654)],
diff --git a/rpython/jit/backend/test/test_detect_cpu.py b/rpython/jit/backend/test/test_detect_cpu.py
--- a/rpython/jit/backend/test/test_detect_cpu.py
+++ b/rpython/jit/backend/test/test_detect_cpu.py
@@ -26,3 +26,8 @@
     else:
         from rpython.jit.backend.model import AbstractCPU
         assert issubclass(cpu, AbstractCPU)
+
+
+def test_detect_main_model_and_size_from_platform():
+    expected = autodetect_main_model_and_size()
+    assert detect_main_model_and_size_from_platform() == expected
diff --git a/rpython/jit/backend/tool/viewcode.py b/rpython/jit/backend/tool/viewcode.py
--- a/rpython/jit/backend/tool/viewcode.py
+++ b/rpython/jit/backend/tool/viewcode.py
@@ -53,8 +53,7 @@
         'x86_32': 'i386',
         'x86_64': 'x86-64',
         'i386': 'i386',
-        'armv6_32': 'arm',
-        'armv7_32': 'arm',
+        'arm_32': 'arm',
     }
     cmd = find_objdump()
     objdump = ('%(command)s -M %(backend)s -b binary -m %(machine)s '
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -13,8 +13,9 @@
 from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
 from rpython.rlib.jit import AsmInfo
 from rpython.jit.backend.model import CompiledLoopToken
-from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
-    gpr_reg_mgr_cls, xmm_reg_mgr_cls, _valid_addressing_size)
+from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs,
+    gpr_reg_mgr_cls, xmm_reg_mgr_cls)
+from rpython.jit.backend.llsupport.regalloc import (get_scale, valid_addressing_size)
 from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, WORD, IS_X86_64,
                                        JITFRAME_FIXED_SIZE, IS_X86_32,
                                        PASS_ON_MY_FRAME)
@@ -1523,7 +1524,7 @@
         base_loc, ofs_loc, size_loc, ofs, sign_loc = arglocs
         assert isinstance(ofs, ImmedLoc)
         assert isinstance(size_loc, ImmedLoc)
-        scale = _get_scale(size_loc.value)
+        scale = get_scale(size_loc.value)
         src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
         self.load_from_mem(resloc, src_addr, size_loc, sign_loc)
 
@@ -1552,12 +1553,12 @@
             shift = 0
         itemsize >>= shift
         #
-        if _valid_addressing_size(itemsize - 1):
+        if valid_addressing_size(itemsize - 1):
             mc.LEA_ra(targetreg, (sourcereg, sourcereg,
-                                  _get_scale(itemsize - 1), 0))
-        elif _valid_addressing_size(itemsize):
+                                  get_scale(itemsize - 1), 0))
+        elif valid_addressing_size(itemsize):
             mc.LEA_ra(targetreg, (rx86.NO_BASE_REGISTER, sourcereg,
-                                  _get_scale(itemsize), 0))
+                                  get_scale(itemsize), 0))
         else:
             mc.IMUL_rri(targetreg, sourcereg, itemsize)
         #
@@ -1570,9 +1571,9 @@
         if isinstance(index_loc, ImmedLoc):
             temp_loc = imm(index_loc.value * itemsize)
             shift = 0
-        elif _valid_addressing_size(itemsize):
+        elif valid_addressing_size(itemsize):
             temp_loc = index_loc
-            shift = _get_scale(itemsize)
+            shift = get_scale(itemsize)
         else:
             assert isinstance(index_loc, RegLoc)
             assert isinstance(temp_loc, RegLoc)
@@ -1610,7 +1611,7 @@
         base_loc, ofs_loc, value_loc, size_loc, baseofs = arglocs
         assert isinstance(baseofs, ImmedLoc)
         assert isinstance(size_loc, ImmedLoc)
-        scale = _get_scale(size_loc.value)
+        scale = get_scale(size_loc.value)
         dest_addr = AddressLoc(base_loc, ofs_loc, scale, baseofs.value)
         self.save_into_mem(dest_addr, value_loc, size_loc)
 
@@ -2443,8 +2444,8 @@
         jmp_adr0 = self.mc.get_relative_pos()
 
         self.mc.MOV(eax, heap(nursery_free_adr))
-        if _valid_addressing_size(itemsize):
-            shift = _get_scale(itemsize)
+        if valid_addressing_size(itemsize):
+            shift = get_scale(itemsize)
         else:
             shift = self._imul_const_scaled(self.mc, edi.value,
                                             varsizeloc.value, itemsize)
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1380,16 +1380,6 @@
     # i.e. the n'th word beyond the fixed frame size.
     return base_ofs + WORD * (position + JITFRAME_FIXED_SIZE)
 
-def _valid_addressing_size(size):
-    return size == 1 or size == 2 or size == 4 or size == 8
-
-def _get_scale(size):
-    assert _valid_addressing_size(size)
-    if size < 4:
-        return size - 1         # 1, 2 => 0, 1
-    else:
-        return (size >> 2) + 1  # 4, 8 => 2, 3
-
 def not_implemented(msg):
     os.write(2, '[x86/regalloc] %s\n' % msg)
     raise NotImplementedError(msg)
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -682,10 +682,12 @@
 
 def define_modrm_modes(insnname_template, before_modrm, after_modrm=[], regtype='GPR'):
     def add_insn(code, *modrm):
-        args = before_modrm + list(modrm) + after_modrm
+        args = before_modrm + list(modrm)
         methname = insnname_template.replace('*', code)
-        if methname.endswith('_rr') or methname.endswith('_xx'):
+        if (methname.endswith('_rr') or methname.endswith('_xx')
+                or methname.endswith('_ri')):
             args.append('\xC0')
+        args += after_modrm
 
         if regtype == 'XMM':
             insn_func = xmminsn(*args)
diff --git a/rpython/jit/backend/x86/test/test_regloc.py b/rpython/jit/backend/x86/test/test_regloc.py
--- a/rpython/jit/backend/x86/test/test_regloc.py
+++ b/rpython/jit/backend/x86/test/test_regloc.py
@@ -1,4 +1,5 @@
 import struct, sys
+from rpython.jit.backend.x86.rx86 import R
 from rpython.jit.backend.x86.regloc import *
 from rpython.jit.backend.x86.test.test_rx86 import CodeBuilder32, CodeBuilder64, assert_encodes_as
 from rpython.jit.backend.x86.assembler import heap
@@ -15,36 +16,49 @@
 cb32 = LocationCodeBuilder32
 cb64 = LocationCodeBuilder64
 
+def test_mov_8():
+    assert_encodes_as(cb32, "MOV8_ri", (R.cl, 25), '\xB1\x19')
+
 def test_mov_16():
+    # only 'MOV16_*r' and 'MOV16_*i' are supported
     # 32-bit
     assert_encodes_as(cb32, "MOV16", (ecx, ebx), '\x66\x89\xD9')
-    assert_encodes_as(cb32, "MOV16", (ecx, ImmedLoc(12345)), '\x66\xB9\x39\x30')
-
+    assert_encodes_as(cb32, "MOV16",
+                      (AddressLoc(ecx, ImmedLoc(16), 0, 0), ebx),
+                      '\x66\x89\x59\x10')
     # 64-bit
     assert_encodes_as(cb64, "MOV16", (r8, ebx), '\x66\x41\x89\xD8')  # 11 011 000
     assert_encodes_as(cb64, "MOV16", (ebx, r8), '\x66\x44\x89\xC3')  # 11 000 011
-    assert_encodes_as(cb64, "MOV16", (ecx, ebx), '\x66\x40\x89\xD9')
-    assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(12345)), '\x66\xB9\x39\x30')
+    assert_encodes_as(cb64, "MOV16", (ecx, ebx), '\x66\x89\xD9')
     # for the next case we don't pick the most efficient encoding, but well
-    expected = '\x66\x40\xC7\xC1\xC7\xCF'  # could be '\x66\xB9\xC7\xCF'
+    expected = '\x66\xC7\xC1\x39\x30'      # could be '\x66\xB9\x39\x30'
+    assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(12345)), expected)
+    # for the next case we don't pick the most efficient encoding, but well
+    expected = '\x66\xC7\xC1\xC7\xCF'      # could be '\x66\xB9\xC7\xCF'
     assert_encodes_as(cb64, "MOV16", (ecx, ImmedLoc(-12345)), expected)
-    assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(12345)), '\x66\x41\xB9\x39\x30')
+    # for the next case we don't pick the most efficient encoding, but well
+    expected = '\x66\x41\xC7\xC1\x39\x30'  # could be '\x66\x41\xB9\x39\x30'
+    assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(12345)), expected)
     # for the next case we don't pick the most efficient encoding, but well
     expected = '\x66\x41\xC7\xC1\xC7\xCF'  # could be '\x66\x41\xB9\xC7\xCF'
     assert_encodes_as(cb64, "MOV16", (r9, ImmedLoc(-12345)), expected)
-    assert_encodes_as(cb64, "MOV16", (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)), '\x66\x41\xC7\x45\x00\x39\x30')
+    assert_encodes_as(cb64, "MOV16",
+                      (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)),
+                      '\x66\x41\xC7\x45\x00\x39\x30')
 
 def test_cmp_16():
+    # only 'CMP16_mi' is supported
     # 32-bit
-    assert_encodes_as(cb32, "CMP16", (ecx, ebx), '\x66\x39\xD9')
-    assert_encodes_as(cb32, "CMP16", (ecx, ImmedLoc(12345)), '\x66\x81\xF9\x39\x30')
-
+    assert_encodes_as(cb32, "CMP16",
+                      (AddressLoc(ecx, ImmedLoc(0), 0, 0), ImmedLoc(21324)),
+                      '\x66\x81\x39\x4c\x53')
+    assert_encodes_as(cb32, "CMP16",
+                      (AddressLoc(esi, ImmedLoc(2), 0, 0), ImmedLoc(-12345)),
+                      '\x66\x81\x7e\x02\xc7\xcf')
     # 64-bit
-    assert_encodes_as(cb64, "CMP16", (r8, ebx), '\x66\x41\x39\xD8')  # 11 011 000
-    assert_encodes_as(cb64, "CMP16", (ebx, r8), '\x66\x44\x39\xC3')  # 11 000 011
-    assert_encodes_as(cb64, "CMP16", (ecx, ebx), '\x66\x40\x39\xD9')
-    assert_encodes_as(cb64, "CMP16", (ecx, ImmedLoc(12345)), '\x66\x40\x81\xF9\x39\x30')
-    assert_encodes_as(cb64, "CMP16", (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)), '\x66\x41\x81\x7D\x00\x39\x30')
+    assert_encodes_as(cb64, "CMP16",
+                      (AddressLoc(r13, ImmedLoc(0), 0, 0), ImmedLoc(12345)),
+                      '\x66\x41\x81\x7D\x00\x39\x30')
 
 def test_relocation():
     from rpython.rtyper.lltypesystem import lltype, rffi
diff --git a/rpython/rlib/rwin32.py b/rpython/rlib/rwin32.py
--- a/rpython/rlib/rwin32.py
+++ b/rpython/rlib/rwin32.py
@@ -5,6 +5,7 @@
 import os
 import errno
 
+from rpython.rtyper.module.ll_os_environ import make_env_impls
 from rpython.rtyper.tool import rffi_platform
 from rpython.tool.udir import udir
 from rpython.translator.tool.cbuild import ExternalCompilationInfo
@@ -390,3 +391,5 @@
                 raise lastWindowsError('os_kill failed to terminate process')
         finally:
             CloseHandle(handle)
+
+    _wenviron_items, _wgetenv, _wputenv = make_env_impls(win32=True)
diff --git a/rpython/rlib/test/test_rwin32.py b/rpython/rlib/test/test_rwin32.py
--- a/rpython/rlib/test/test_rwin32.py
+++ b/rpython/rlib/test/test_rwin32.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 import os, py
 if os.name != 'nt':
     py.test.skip('tests for win32 only')
@@ -47,3 +48,13 @@
     rwin32.CloseHandle(handle)
     assert proc.wait() == signal.SIGTERM
  
+@py.test.mark.dont_track_allocations('putenv intentionally keeps strings alive')
+def test_wenviron():
+    name, value = u'PYPY_TEST_日本', u'foobar日本'
+    rwin32._wputenv(name, value)
+    assert rwin32._wgetenv(name) == value
+    env = dict(rwin32._wenviron_items())
+    assert env[name] == value
+    for key, value in env.iteritems():
+        assert type(key) is unicode
+        assert type(value) is unicode
diff --git a/rpython/rtyper/module/ll_os.py b/rpython/rtyper/module/ll_os.py
--- a/rpython/rtyper/module/ll_os.py
+++ b/rpython/rtyper/module/ll_os.py
@@ -7,14 +7,15 @@
 
 import os, sys, errno
 import py
-from rpython.rtyper.module.support import OOSupport
+from rpython.rtyper.module.support import (
+    _WIN32, OOSupport, StringTraits, UnicodeTraits, underscore_on_windows)
 from rpython.tool.sourcetools import func_renamer
 from rpython.rlib.rarithmetic import r_longlong
 from rpython.rtyper.extfunc import (
     BaseLazyRegistering, register_external)
 from rpython.rtyper.extfunc import registering, registering_if, extdef
 from rpython.annotator.model import (
-    SomeInteger, SomeString, SomeTuple, SomeFloat, SomeUnicodeString)
+    SomeInteger, SomeString, SomeTuple, SomeFloat, s_Str0, s_Unicode0)
 from rpython.annotator.model import s_ImpossibleValue, s_None, s_Bool
 from rpython.rtyper.lltypesystem import rffi
 from rpython.rtyper.lltypesystem import lltype
@@ -25,8 +26,8 @@
 from rpython.rtyper.lltypesystem.rstr import STR
 from rpython.rlib.objectmodel import specialize
 
-str0 = SomeString(no_nul=True)
-unicode0 = SomeUnicodeString(no_nul=True)
+str0 = s_Str0
+unicode0 = s_Unicode0
 
 def monkeypatch_rposix(posixfunc, unicodefunc, signature):
     func_name = posixfunc.__name__
@@ -66,42 +67,6 @@
     # Monkeypatch the function in rpython.rlib.rposix
     setattr(rposix, func_name, new_func)
 
-class StringTraits:
-    str = str
-    str0 = str0
-    CHAR = rffi.CHAR
-    CCHARP = rffi.CCHARP
-    charp2str = staticmethod(rffi.charp2str)
-    str2charp = staticmethod(rffi.str2charp)
-    free_charp = staticmethod(rffi.free_charp)
-    scoped_alloc_buffer = staticmethod(rffi.scoped_alloc_buffer)
-
-    @staticmethod
-    def posix_function_name(name):
-        return underscore_on_windows + name
-
-    @staticmethod
-    def ll_os_name(name):
-        return 'll_os.ll_os_' + name
-
-class UnicodeTraits:
-    str = unicode
-    str0 = unicode0
-    CHAR = rffi.WCHAR_T
-    CCHARP = rffi.CWCHARP
-    charp2str = staticmethod(rffi.wcharp2unicode)
-    str2charp = staticmethod(rffi.unicode2wcharp)
-    free_charp = staticmethod(rffi.free_wcharp)
-    scoped_alloc_buffer = staticmethod(rffi.scoped_alloc_unicodebuffer)
-
-    @staticmethod
-    def posix_function_name(name):
-        return underscore_on_windows + 'w' + name
-
-    @staticmethod
-    def ll_os_name(name):
-        return 'll_os.ll_os_w' + name
-
 def registering_str_unicode(posixfunc, condition=True):
     if not condition or posixfunc is None:
         return registering(None, condition=False)
@@ -129,16 +94,6 @@
 
 posix = __import__(os.name)
 
-if sys.platform.startswith('win'):
-    _WIN32 = True
-else:
-    _WIN32 = False
-
-if _WIN32:
-    underscore_on_windows = '_'
-else:
-    underscore_on_windows = ''
-
 includes = []
 if not _WIN32:
     # XXX many of these includes are not portable at all
diff --git a/rpython/rtyper/module/ll_os_environ.py b/rpython/rtyper/module/ll_os_environ.py
--- a/rpython/rtyper/module/ll_os_environ.py
+++ b/rpython/rtyper/module/ll_os_environ.py
@@ -4,10 +4,10 @@
 from rpython.rtyper.controllerentry import Controller
 from rpython.rtyper.extfunc import register_external
 from rpython.rtyper.lltypesystem import rffi, lltype
-from rpython.rtyper.module import ll_os
-from rpython.rlib import rposix
+from rpython.rtyper.module.support import _WIN32, StringTraits, UnicodeTraits
+from rpython.translator.tool.cbuild import ExternalCompilationInfo
 
-str0 = ll_os.str0
+str0 = annmodel.s_Str0
 
 # ____________________________________________________________
 #
@@ -59,85 +59,8 @@
         return r_getenv
 
 # ____________________________________________________________
-#
-# Lower-level interface: dummy placeholders and external registations
-
-def r_getenv(name):
-    just_a_placeholder     # should return None if name not found
-
-os_getenv = rffi.llexternal('getenv', [rffi.CCHARP], rffi.CCHARP,
-                            threadsafe=False)
-
-def getenv_llimpl(name):
-    with rffi.scoped_str2charp(name) as l_name:
-        l_result = os_getenv(l_name)
-        return rffi.charp2str(l_result) if l_result else None
-
-register_external(r_getenv, [str0],
-                  annmodel.SomeString(can_be_None=True, no_nul=True),
-                  export_name='ll_os.ll_os_getenv',
-                  llimpl=getenv_llimpl)
-
-# ____________________________________________________________
-
-def r_putenv(name, value):
-    just_a_placeholder
-
-class EnvKeepalive:
-    pass
-envkeepalive = EnvKeepalive()
-envkeepalive.byname = {}
-
-os_putenv = rffi.llexternal('putenv', [rffi.CCHARP], rffi.INT)
-
-def putenv_llimpl(name, value):
-    l_string = rffi.str2charp('%s=%s' % (name, value))
-    error = rffi.cast(lltype.Signed, os_putenv(l_string))
-    if error:
-        rffi.free_charp(l_string)
-        raise OSError(rposix.get_errno(), "os_putenv failed")
-    # keep 'l_string' alive - we know that the C library needs it
-    # until the next call to putenv() with the same 'name'.
-    l_oldstring = envkeepalive.byname.get(name, lltype.nullptr(rffi.CCHARP.TO))
-    envkeepalive.byname[name] = l_string
-    if l_oldstring:
-        rffi.free_charp(l_oldstring)
-
-register_external(r_putenv, [str0, str0], annmodel.s_None,
-                  export_name='ll_os.ll_os_putenv',
-                  llimpl=putenv_llimpl)
-
-# ____________________________________________________________
-
-def r_unsetenv(name):
-    # default implementation for platforms without a real unsetenv()
-    r_putenv(name, '')
-
-if hasattr(__import__(os.name), 'unsetenv'):
-    os_unsetenv = rffi.llexternal('unsetenv', [rffi.CCHARP], rffi.INT)
-
-    def unsetenv_llimpl(name):
-        with rffi.scoped_str2charp(name) as l_name:
-            error = rffi.cast(lltype.Signed, os_unsetenv(l_name))
-        if error:
-            raise OSError(rposix.get_errno(), "os_unsetenv failed")
-        try:
-            l_oldstring = envkeepalive.byname[name]
-        except KeyError:
-            pass
-        else:
-            del envkeepalive.byname[name]
-            rffi.free_charp(l_oldstring)
-
-    register_external(r_unsetenv, [str0], annmodel.s_None,
-                      export_name='ll_os.ll_os_unsetenv',
-                      llimpl=unsetenv_llimpl)
-
-# ____________________________________________________________
 # Access to the 'environ' external variable
 
-from rpython.translator.tool.cbuild import ExternalCompilationInfo
-
 if sys.platform.startswith('darwin'):
     CCHARPPP = rffi.CArrayPtr(rffi.CCHARPP)
     _os_NSGetEnviron = rffi.llexternal(
@@ -146,16 +69,21 @@
         )
     def os_get_environ():
         return _os_NSGetEnviron()[0]
-elif sys.platform.startswith('win'):
+elif _WIN32:
+    eci = ExternalCompilationInfo(includes=['stdlib.h'])
+    CWCHARPP = lltype.Ptr(lltype.Array(rffi.CWCHARP, hints={'nolength': True}))
+
     os_get_environ, _os_set_environ = rffi.CExternVariable(
-        rffi.CCHARPP,
-        '_environ',
-        ExternalCompilationInfo(includes=['stdlib.h']))
+        rffi.CCHARPP, '_environ', eci)
+    get__wenviron, _set__wenviron = rffi.CExternVariable(
+        CWCHARPP, '_wenviron', eci, c_type='wchar_t **')
 else:
     os_get_environ, _os_set_environ = rffi.CExternVariable(
         rffi.CCHARPP, 'environ', ExternalCompilationInfo())
 
 # ____________________________________________________________
+#
+# Lower-level interface: dummy placeholders and external registations
 
 def r_envkeys():
     just_a_placeholder
@@ -181,18 +109,109 @@
 def r_envitems():
     just_a_placeholder
 
-def envitems_llimpl():
-    environ = os_get_environ()
-    result = []
-    i = 0
-    while environ[i]:
-        name_value = rffi.charp2str(environ[i])
-        p = name_value.find('=')
-        if p >= 0:
-            result.append((name_value[:p], name_value[p+1:]))
-        i += 1
-    return result
+def r_getenv(name):
+    just_a_placeholder     # should return None if name not found
+
+def r_putenv(name, value):
+    just_a_placeholder
+
+os_getenv = rffi.llexternal('getenv', [rffi.CCHARP], rffi.CCHARP,
+                            threadsafe=False)
+os_putenv = rffi.llexternal('putenv', [rffi.CCHARP], rffi.INT)
+if _WIN32:
+    _wgetenv = rffi.llexternal('_wgetenv', [rffi.CWCHARP], rffi.CWCHARP,
+                               compilation_info=eci, threadsafe=False)
+    _wputenv = rffi.llexternal('_wputenv', [rffi.CWCHARP], rffi.INT,
+                               compilation_info=eci)
+
+class EnvKeepalive:
+    pass
+envkeepalive = EnvKeepalive()
+envkeepalive.byname = {}
+envkeepalive.bywname = {}
+
+def make_env_impls(win32=False):
+    if not win32:
+        traits = StringTraits()