[pypy-commit] pypy py3.5: merge unicode-utf8-py3 into py3.5
mattip
pypy.commits at gmail.com
Wed Feb 13 17:07:27 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: py3.5
Changeset: r96005:0bbb64dc7f98
Date: 2019-02-13 23:13 +0200
http://bitbucket.org/pypy/pypy/changeset/0bbb64dc7f98/
Log: merge unicode-utf8-py3 into py3.5
diff too long, truncating to 2000 out of 25065 lines
diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -61,3 +61,9 @@
9112c8071614108b1042bfef0713915107004d62 release-pypy2.7-v7.0.0
1f86f25937b6ae6c8b25236c35228fac587678bf release-pypy3.5-v7.0.0
dab365a465140aa79a5f3ba4db784c4af4d5c195 release-pypy3.6-v7.0.0
+9112c8071614108b1042bfef0713915107004d62 release-pypy2.7-v7.0.0
+c8805ee6d7846ca2722b106eeaa2f128c699aba3 release-pypy2.7-v7.0.0
+1f86f25937b6ae6c8b25236c35228fac587678bf release-pypy3.5-v7.0.0
+928a4f70d3de7d17449456946154c5da6e600162 release-pypy3.5-v7.0.0
+dab365a465140aa79a5f3ba4db784c4af4d5c195 release-pypy3.6-v7.0.0
+fb40f7a5524c77b80e6c468e087d621610137261 release-pypy3.6-v7.0.0
diff --git a/TODO b/TODO
new file mode 100644
--- /dev/null
+++ b/TODO
@@ -0,0 +1,20 @@
+* find a better way to run "find" without creating the index storage, if one
+ is not already readily available (understand cost now, improve after merge)
+* improve performance of splitlines (CF)
+* think about cost of utf8 list strategy (CF)
+* revisit why runicode import str_decode_utf_8_impl needed instead of runicode
+ import str_decode_utf_8
+* revisit remaining places in win32 where we do utf8.decode('utf-8'), they should work
+ directly with utf8 (can be converted via runicode.str_decode_utf_8 as well)
+ - rutf8.utf8_encode_mbcs
+ - unicodehelper.fsencode
+ - _winreg.interp_winreg
+* remove 'assert not isinstance(*, unicode)'
+* add a flag that prevents support for unicode in rpython and enable it in PyPy (CF, Armin)
+* convert all realunicode_w to unicode_w after we flush out all old uses of
+ unicode_w
+* review all uses of W_Unicode.text_w, right now it is exactly W_Unicode.utf8_w.
+ It should only return valid utf8 (see 0be26dc39a59 which broke translation on
+ win32 and failed tests on linux64). Then we can use it in places like
+ _socket.interp_func.getaddrinfo instead of space.encode_unicode_object(w_port,
+ 'utf-8', 'strict')
diff --git a/pypy/TODO b/pypy/TODO
--- a/pypy/TODO
+++ b/pypy/TODO
@@ -1,6 +1,3 @@
-...
-
-
antocuni's older TODO:
* run coverage against the parser/astbuilder/astcompiler: it's probably full of
@@ -11,3 +8,5 @@
* re-enable BUILD_LIST_FROM_ARG: see the comment in astcompiler/codegen.py in
ast.ListComp.build_container
+
+* review use of std_decode_utf8, we probably do not want to be using it
diff --git a/pypy/doc/release-v7.0.0.rst b/pypy/doc/release-v7.0.0.rst
--- a/pypy/doc/release-v7.0.0.rst
+++ b/pypy/doc/release-v7.0.0.rst
@@ -39,7 +39,7 @@
The utf8 branch that changes internal representation of unicode to utf8 did not
make it into the release, so there is still more goodness coming.
-You can download the v6.0 releases here:
+You can download the v7.0 releases here:
http://pypy.org/download.html
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -5,6 +5,11 @@
.. this is a revision shortly after release-pypy-7.0.0
.. startrev: 481c69f7d81f
+.. branch: zlib-copying-third-time-a-charm
+
+Make sure zlib decompressobjs have their streams deallocated immediately
+on flush.
+
.. branch: zlib-copying-redux
Fix calling copy on already-flushed compressobjs.
@@ -15,7 +20,11 @@
as they do on CPython.
-.. math-improvements
+.. branch: math-improvements
Improve performance of long operations where one of the operands fits into
-an int.
\ No newline at end of file
+an int.
+
+.. branch: regalloc-playgrounds
+
+Improve register allocation in the JIT.
diff --git a/pypy/doc/whatsnew-pypy2-5.10.0.rst b/pypy/doc/whatsnew-pypy2-5.10.0.rst
--- a/pypy/doc/whatsnew-pypy2-5.10.0.rst
+++ b/pypy/doc/whatsnew-pypy2-5.10.0.rst
@@ -1,42 +1,42 @@
-==========================
-What's new in PyPy2.7 5.10
-==========================
-
-.. this is a revision shortly after release-pypy2.7-v5.9.0
-.. startrev:d56dadcef996
-
-
-.. branch: cppyy-packaging
-
-Cleanup and improve cppyy packaging
-
-.. branch: docs-osx-brew-openssl
-
-.. branch: keep-debug-symbols
-
-Add a smartstrip tool, which can optionally keep the debug symbols in a
-separate file, instead of just stripping them away. Use it in packaging
-
-.. branch: bsd-patches
-
-Fix failures on FreeBSD, contributed by David Naylor as patches on the issue
-tracker (issues 2694, 2695, 2696, 2697)
-
-.. branch: run-extra-tests
-
-Run extra_tests/ in buildbot
-
-.. branch: vmprof-0.4.10
-
-Upgrade the _vmprof backend to vmprof 0.4.10
-
-.. branch: fix-vmprof-stacklet-switch
-.. branch: fix-vmprof-stacklet-switch-2
-
-Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...)
-
-.. branch: win32-vcvars
-
-.. branch: rdict-fast-hash
-
-Make it possible to declare that the hash function of an r_dict is fast in RPython.
+==========================
+What's new in PyPy2.7 5.10
+==========================
+
+.. this is a revision shortly after release-pypy2.7-v5.9.0
+.. startrev:d56dadcef996
+
+
+.. branch: cppyy-packaging
+
+Cleanup and improve cppyy packaging
+
+.. branch: docs-osx-brew-openssl
+
+.. branch: keep-debug-symbols
+
+Add a smartstrip tool, which can optionally keep the debug symbols in a
+separate file, instead of just stripping them away. Use it in packaging
+
+.. branch: bsd-patches
+
+Fix failures on FreeBSD, contributed by David Naylor as patches on the issue
+tracker (issues 2694, 2695, 2696, 2697)
+
+.. branch: run-extra-tests
+
+Run extra_tests/ in buildbot
+
+.. branch: vmprof-0.4.10
+
+Upgrade the _vmprof backend to vmprof 0.4.10
+
+.. branch: fix-vmprof-stacklet-switch
+.. branch: fix-vmprof-stacklet-switch-2
+
+Fix a vmprof+continulets (i.e. greenlets, eventlet, gevent, ...)
+
+.. branch: win32-vcvars
+
+.. branch: rdict-fast-hash
+
+Make it possible to declare that the hash function of an r_dict is fast in RPython.
diff --git a/pypy/doc/whatsnew-pypy2-6.0.0.rst b/pypy/doc/whatsnew-pypy2-6.0.0.rst
--- a/pypy/doc/whatsnew-pypy2-6.0.0.rst
+++ b/pypy/doc/whatsnew-pypy2-6.0.0.rst
@@ -1,132 +1,128 @@
-===========================
-What's new in PyPy2.7 5.10+
-===========================
-
-.. this is a revision shortly after release-pypy2.7-v5.10.0
-.. startrev: 6b024edd9d12
-
-.. branch: cpyext-avoid-roundtrip
-
-Big refactoring of some cpyext code, which avoids a lot of nonsense when
-calling C from Python and vice-versa: the result is a big speedup in
-function/method calls, up to 6 times faster.
-
-.. branch: cpyext-datetime2
-
-Support ``tzinfo`` field on C-API datetime objects, fixes latest pandas HEAD
-
-
-.. branch: mapdict-size-limit
-
-Fix a corner case of mapdict: When an instance is used like a dict (using
-``setattr`` and ``getattr``, or ``.__dict__``) and a lot of attributes are
-added, then the performance using mapdict is linear in the number of
-attributes. This is now fixed (by switching to a regular dict after 80
-attributes).
-
-
-.. branch: cpyext-faster-arg-passing
-
-When using cpyext, improve the speed of passing certain objects from PyPy to C
-code, most notably None, True, False, types, all instances of C-defined types.
-Before, a dict lookup was needed every time such an object crossed over, now it
-is just a field read.
-
-
-.. branch: 2634_datetime_timedelta_performance
-
-Improve datetime + timedelta performance.
-
-.. branch: memory-accounting
-
-Improve way to describe memory
-
-.. branch: msvc14
-
-Allow compilaiton with Visual Studio 2017 compiler suite on windows
-
-.. branch: winapi
-
-Update _winapi and internal _winbase_cffi (via _winbase_build) for python 3
-
-.. branch: refactor-slots
-
-Refactor cpyext slots.
-
-
-.. branch: call-loopinvariant-into-bridges
-
-Speed up branchy code that does a lot of function inlining by saving one call
-to read the TLS in most bridges.
-
-.. branch: rpython-sprint
-
-Refactor in rpython signatures
-
-.. branch: cpyext-tls-operror2
-
-Store error state thread-locally in executioncontext, fixes issue #2764
-
-.. branch: cpyext-fast-typecheck
-
-Optimize `Py*_Check` for `Bool`, `Float`, `Set`. Also refactor and simplify
-`W_PyCWrapperObject` which is used to call slots from the C-API, greatly
-improving microbenchmarks in https://github.com/antocuni/cpyext-benchmarks
-
-
-.. branch: fix-sre-problems
-
-Fix two (unrelated) JIT bugs manifesting in the re module:
-
-- green fields are broken and were thus disabled, plus their usage removed from
- the _sre implementation
-
-- in rare "trace is too long" situations, the JIT could break behaviour
- arbitrarily.
-
-.. branch: jit-hooks-can-be-disabled
-
-Be more efficient about JIT hooks. Make it possible for the frontend to declare
-that jit hooks are currently not enabled at all. in that case, the list of ops
-does not have to be created in the case of the on_abort hook (which is
-expensive).
-
-
-.. branch: pyparser-improvements
-
-Improve speed of Python parser, improve ParseError messages slightly.
-
-.. branch: ioctl-arg-size
-
-Work around possible bugs in upstream ioctl users, like CPython allocate at
-least 1024 bytes for the arg in calls to ``ioctl(fd, request, arg)``. Fixes
-issue #2776
-
-.. branch: cpyext-subclass-setattr
-
-Fix for python-level classes that inherit from C-API types, previously the
-`w_obj` was not necessarily preserved throughout the lifetime of the `pyobj`
-which led to cases where instance attributes were lost. Fixes issue #2793
-
-
-.. branch: pyparser-improvements-2
-
-Improve line offsets that are reported by SyntaxError. Improve error messages
-for a few situations, including mismatched parenthesis.
-
-.. branch: issue2752
-
-Fix a rare GC bug that was introduced more than one year ago, but was
-not diagnosed before issue #2752.
-
-.. branch: gc-hooks
-
-Introduce GC hooks, as documented in doc/gc_info.rst
-
-.. branch: gc-hook-better-timestamp
-
-Improve GC hooks
-
-.. branch: cppyy-packaging
-
-Update backend to 0.6.0 and support exceptions through wrappers
+===========================
+What's new in PyPy2.7 5.10+
+===========================
+
+.. this is a revision shortly after release-pypy2.7-v5.10.0
+.. startrev: 6b024edd9d12
+
+.. branch: cpyext-avoid-roundtrip
+
+Big refactoring of some cpyext code, which avoids a lot of nonsense when
+calling C from Python and vice-versa: the result is a big speedup in
+function/method calls, up to 6 times faster.
+
+.. branch: cpyext-datetime2
+
+Support ``tzinfo`` field on C-API datetime objects, fixes latest pandas HEAD
+
+
+.. branch: mapdict-size-limit
+
+Fix a corner case of mapdict: When an instance is used like a dict (using
+``setattr`` and ``getattr``, or ``.__dict__``) and a lot of attributes are
+added, then the performance using mapdict is linear in the number of
+attributes. This is now fixed (by switching to a regular dict after 80
+attributes).
+
+
+.. branch: cpyext-faster-arg-passing
+
+When using cpyext, improve the speed of passing certain objects from PyPy to C
+code, most notably None, True, False, types, all instances of C-defined types.
+Before, a dict lookup was needed every time such an object crossed over, now it
+is just a field read.
+
+
+.. branch: 2634_datetime_timedelta_performance
+
+Improve datetime + timedelta performance.
+
+.. branch: memory-accounting
+
+Improve way to describe memory
+
+.. branch: msvc14
+
+Allow compilation with Visual Studio 2017 compiler suite on windows
+
+.. branch: refactor-slots
+
+Refactor cpyext slots.
+
+
+.. branch: call-loopinvariant-into-bridges
+
+Speed up branchy code that does a lot of function inlining by saving one call
+to read the TLS in most bridges.
+
+.. branch: rpython-sprint
+
+Refactor in rpython signatures
+
+.. branch: cpyext-tls-operror2
+
+Store error state thread-locally in executioncontext, fixes issue #2764
+
+.. branch: cpyext-fast-typecheck
+
+Optimize `Py*_Check` for `Bool`, `Float`, `Set`. Also refactor and simplify
+`W_PyCWrapperObject` which is used to call slots from the C-API, greatly
+improving microbenchmarks in https://github.com/antocuni/cpyext-benchmarks
+
+
+.. branch: fix-sre-problems
+
+Fix two (unrelated) JIT bugs manifesting in the re module:
+
+- green fields are broken and were thus disabled, plus their usage removed from
+ the _sre implementation
+
+- in rare "trace is too long" situations, the JIT could break behaviour
+ arbitrarily.
+
+.. branch: jit-hooks-can-be-disabled
+
+Be more efficient about JIT hooks. Make it possible for the frontend to declare
+that jit hooks are currently not enabled at all. in that case, the list of ops
+does not have to be created in the case of the on_abort hook (which is
+expensive).
+
+
+.. branch: pyparser-improvements
+
+Improve speed of Python parser, improve ParseError messages slightly.
+
+.. branch: ioctl-arg-size
+
+Work around possible bugs in upstream ioctl users, like CPython allocate at
+least 1024 bytes for the arg in calls to ``ioctl(fd, request, arg)``. Fixes
+issue #2776
+
+.. branch: cpyext-subclass-setattr
+
+Fix for python-level classes that inherit from C-API types, previously the
+`w_obj` was not necessarily preserved throughout the lifetime of the `pyobj`
+which led to cases where instance attributes were lost. Fixes issue #2793
+
+
+.. branch: pyparser-improvements-2
+
+Improve line offsets that are reported by SyntaxError. Improve error messages
+for a few situations, including mismatched parenthesis.
+
+.. branch: issue2752
+
+Fix a rare GC bug that was introduced more than one year ago, but was
+not diagnosed before issue #2752.
+
+.. branch: gc-hooks
+
+Introduce GC hooks, as documented in doc/gc_info.rst
+
+.. branch: gc-hook-better-timestamp
+
+Improve GC hooks
+
+.. branch: cppyy-packaging
+
+Update backend to 0.6.0 and support exceptions through wrappers
diff --git a/pypy/doc/whatsnew-pypy2-7.0.0.rst b/pypy/doc/whatsnew-pypy2-7.0.0.rst
--- a/pypy/doc/whatsnew-pypy2-7.0.0.rst
+++ b/pypy/doc/whatsnew-pypy2-7.0.0.rst
@@ -1,69 +1,69 @@
-==========================
-What's new in PyPy2.7 6.0+
-==========================
-
-.. this is a revision shortly after release-pypy-6.0.0
-.. startrev: e50e11af23f1
-
-.. branch: cppyy-packaging
-
-Main items: vastly better template resolution and improved performance. In
-detail: upgrade to backend 1.4, improved handling of templated methods and
-functions (in particular automatic deduction of types), improved pythonization
-interface, range of compatibility fixes for Python3, free functions now take
-fast libffi path when possible, moves for strings (incl. from Python str),
-easier/faster handling of std::vector by numpy, improved and faster object
-identity preservation
-
-.. branch: socket_default_timeout_blockingness
-
-Make sure 'blocking-ness' of socket is set along with default timeout
-
-.. branch: crypt_h
-
-Include crypt.h for crypt() on Linux
-
-.. branch: gc-more-logging
-
-Log additional gc-minor and gc-collect-step info in the PYPYLOG
-
-.. branch: reverse-debugger
-
-The reverse-debugger branch has been merged. For more information, see
-https://bitbucket.org/pypy/revdb
-
-
-.. branch: pyparser-improvements-3
-
-Small refactorings in the Python parser.
-
-.. branch: fix-readme-typo
-
-.. branch: avoid_shell_injection_in_shutil
-
-Backport CPython fix for possible shell injection issue in `distutils.spawn`,
-https://bugs.python.org/issue34540
-
-.. branch: cffi_dlopen_unicode
-
-Enable use of unicode file names in `dlopen`
-
-.. branch: rlock-in-rpython
-
-Backport CPython fix for `thread.RLock`
-
-
-.. branch: expose-gc-time
-
-Make GC hooks measure time in seconds (as opposed to an opaque unit).
-
-.. branch: cleanup-test_lib_pypy
-
-Update most test_lib_pypy/ tests and move them to extra_tests/.
-
-.. branch: gc-disable
-
-Make it possible to manually manage the GC by using a combination of
-gc.disable() and gc.collect_step(). Make sure to write a proper release
-announcement in which we explain that existing programs could leak memory if
-they run for too much time between a gc.disable()/gc.enable()
+==========================
+What's new in PyPy2.7 6.0+
+==========================
+
+.. this is a revision shortly after release-pypy-6.0.0
+.. startrev: e50e11af23f1
+
+.. branch: cppyy-packaging
+
+Main items: vastly better template resolution and improved performance. In
+detail: upgrade to backend 1.4, improved handling of templated methods and
+functions (in particular automatic deduction of types), improved pythonization
+interface, range of compatibility fixes for Python3, free functions now take
+fast libffi path when possible, moves for strings (incl. from Python str),
+easier/faster handling of std::vector by numpy, improved and faster object
+identity preservation
+
+.. branch: socket_default_timeout_blockingness
+
+Make sure 'blocking-ness' of socket is set along with default timeout
+
+.. branch: crypt_h
+
+Include crypt.h for crypt() on Linux
+
+.. branch: gc-more-logging
+
+Log additional gc-minor and gc-collect-step info in the PYPYLOG
+
+.. branch: reverse-debugger
+
+The reverse-debugger branch has been merged. For more information, see
+https://bitbucket.org/pypy/revdb
+
+
+.. branch: pyparser-improvements-3
+
+Small refactorings in the Python parser.
+
+.. branch: fix-readme-typo
+
+.. branch: avoid_shell_injection_in_shutil
+
+Backport CPython fix for possible shell injection issue in `distutils.spawn`,
+https://bugs.python.org/issue34540
+
+.. branch: cffi_dlopen_unicode
+
+Enable use of unicode file names in `dlopen`
+
+.. branch: rlock-in-rpython
+
+Backport CPython fix for `thread.RLock`
+
+
+.. branch: expose-gc-time
+
+Make GC hooks measure time in seconds (as opposed to an opaque unit).
+
+.. branch: cleanup-test_lib_pypy
+
+Update most test_lib_pypy/ tests and move them to extra_tests/.
+
+.. branch: gc-disable
+
+Make it possible to manually manage the GC by using a combination of
+gc.disable() and gc.collect_step(). Make sure to write a proper release
+announcement in which we explain that existing programs could leak memory if
+they run for too much time between a gc.disable()/gc.enable()
diff --git a/pypy/doc/whatsnew-pypy3-5.10.0.rst b/pypy/doc/whatsnew-pypy3-5.10.0.rst
--- a/pypy/doc/whatsnew-pypy3-5.10.0.rst
+++ b/pypy/doc/whatsnew-pypy3-5.10.0.rst
@@ -1,21 +1,7 @@
-=========================
-What's new in PyPy3 5.9+
-=========================
-
-.. this is the revision after release-pypy3.5-5.9
-.. startrev: be41e3ac0a29
-
-.. branch: sched_yield
-Add sched_yield posix attribute
-
-.. branch: py3.5-appexec
-Raise if space.is_true(space.appexec()) used in app level tests, fix tests
-that did this
-
-.. branch: py3.5-mac-embedding
-Download and patch dependencies when building cffi-based stdlib modules
-
-.. branch: os_lockf
-
-.. branch: py3.5-xattr
-Add posix.*attr() functions
+========================
+What's new in PyPy3 7.0+
+========================
+
+.. this is the revision after release-pypy3.5-v7.0
+.. startrev: 9d2fa7c63b7c
+
diff --git a/pypy/doc/whatsnew-pypy3-6.0.0.rst b/pypy/doc/whatsnew-pypy3-6.0.0.rst
--- a/pypy/doc/whatsnew-pypy3-6.0.0.rst
+++ b/pypy/doc/whatsnew-pypy3-6.0.0.rst
@@ -1,28 +1,7 @@
-=========================
-What's new in PyPy3 5.10+
-=========================
+========================
+What's new in PyPy3 7.0+
+========================
-.. this is the revision after release-pypy3.5-v5.10
-.. startrev: 34c63fba0bba
+.. this is the revision after release-pypy3.5-v7.0
+.. startrev: 9d2fa7c63b7c
-.. branch: hroncok/fix-typeerror-str-does-not-support-the-b-1514414905375
-
-Fix for bytestrings in console repl
-
-.. branch: py3-winreg
-
-Update winreg module to use unicode, wide-strings
-
-.. branch: cpyext-py3-instancemethod-attributes
-
-Add missing ``__doc__``, ``__module__``, ``__name__`` attributes to
-``instancemethod``
-
-.. branch: winapi
-
-Update support for _winapi cffi module for python3
-
-.. branch: py3.5-refactor-slots
-
-Refactor cpyext slots.
-
diff --git a/pypy/doc/whatsnew-pypy3-7.0.0.rst b/pypy/doc/whatsnew-pypy3-7.0.0.rst
--- a/pypy/doc/whatsnew-pypy3-7.0.0.rst
+++ b/pypy/doc/whatsnew-pypy3-7.0.0.rst
@@ -5,15 +5,10 @@
.. this is the revision after release-pypy3.5-v6.0
.. startrev: 580e3e26cd32
-.. branch: hroncok/fix-multiprocessing-regression-on-newer--1524656522151
+.. branch: unicode-utf8
-Fix multiprocessing regression on newer glibcs
+Use utf-8 internally to represent unicode strings
-.. branch: py3.5-user-site-impl
+.. branch: unicode-utf8-py3
-Use implementation-specific site directories in sysconfig like in Python2
-
-.. branch: py3.5-reverse-debugger
-
-The reverse-debugger branch has been merged. For more information, see
-https://bitbucket.org/pypy/revdb
+Use utf-8 internally to represent unicode strings
diff --git a/pypy/goal/targetpypystandalone.py b/pypy/goal/targetpypystandalone.py
--- a/pypy/goal/targetpypystandalone.py
+++ b/pypy/goal/targetpypystandalone.py
@@ -83,7 +83,7 @@
## con.interact()
except OperationError as e:
debug("OperationError:")
- debug(" operror-type: " + e.w_type.getname(space).encode('utf-8'))
+ debug(" operror-type: " + e.w_type.getname(space))
debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space))))
return 1
finally:
@@ -91,7 +91,7 @@
space.finish()
except OperationError as e:
debug("OperationError:")
- debug(" operror-type: " + e.w_type.getname(space).encode('utf-8'))
+ debug(" operror-type: " + e.w_type.getname(space))
debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space))))
return 1
return exitcode
@@ -148,7 +148,7 @@
except OperationError as e:
if verbose:
debug("OperationError:")
- debug(" operror-type: " + e.w_type.getname(space).encode('utf-8'))
+ debug(" operror-type: " + e.w_type.getname(space))
debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space))))
return rffi.cast(rffi.INT, -1)
finally:
@@ -202,7 +202,7 @@
""")
except OperationError as e:
debug("OperationError:")
- debug(" operror-type: " + e.w_type.getname(space).encode('utf-8'))
+ debug(" operror-type: " + e.w_type.getname(space))
debug(" operror-value: " + space.text_w(space.str(e.get_w_value(space))))
return -1
return 0
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -596,6 +596,10 @@
except IndexError:
name = '?'
else:
+ w_enc = space.newtext(space.sys.defaultencoding)
+ w_err = space.newtext("replace")
+ w_name = space.call_method(w_name, "encode", w_enc,
+ w_err)
name = space.text_w(w_name)
break
self.kwd_name = name
diff --git a/pypy/interpreter/astcompiler/astbuilder.py b/pypy/interpreter/astcompiler/astbuilder.py
--- a/pypy/interpreter/astcompiler/astbuilder.py
+++ b/pypy/interpreter/astcompiler/astbuilder.py
@@ -58,6 +58,7 @@
self.space = space
self.compile_info = compile_info
self.root_node = n
+ # used in f-strings
self.recursive_parser = recursive_parser
def build_ast(self):
diff --git a/pypy/interpreter/astcompiler/fstring.py b/pypy/interpreter/astcompiler/fstring.py
--- a/pypy/interpreter/astcompiler/fstring.py
+++ b/pypy/interpreter/astcompiler/fstring.py
@@ -3,6 +3,7 @@
from pypy.interpreter import error
from pypy.interpreter import unicodehelper
from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import codepoints_in_utf8
def add_constant_string(astbuilder, joined_pieces, w_string, atom_node):
@@ -21,10 +22,8 @@
joined_pieces.append(node(w_string, atom_node.get_lineno(),
atom_node.get_column()))
-def f_constant_string(astbuilder, joined_pieces, u, atom_node):
- space = astbuilder.space
- add_constant_string(astbuilder, joined_pieces, space.newunicode(u),
- atom_node)
+def f_constant_string(astbuilder, joined_pieces, w_u, atom_node):
+ add_constant_string(astbuilder, joined_pieces, w_u, atom_node)
def f_string_compile(astbuilder, source, atom_node):
# Note: a f-string is kept as a single literal up to here.
@@ -259,20 +258,20 @@
i += 1
fstr.current_index = i
+ space = astbuilder.space
literal = builder.build()
+ lgt = codepoints_in_utf8(literal)
if not fstr.raw_mode and '\\' in literal:
- space = astbuilder.space
literal = parsestring.decode_unicode_utf8(space, literal, 0,
len(literal))
- return unicodehelper.decode_unicode_escape(space, literal)
- else:
- return literal.decode('utf-8')
+ literal, lgt, pos = unicodehelper.decode_unicode_escape(space, literal)
+ return space.newtext(literal, lgt)
def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec):
- # Return a tuple with the next literal part, and optionally the
+ # Return a tuple with the next literal part as a W_Unicode, and optionally the
# following expression node. Updates the current index inside 'fstr'.
- literal = fstring_find_literal(astbuilder, fstr, atom_node, rec)
+ w_u = fstring_find_literal(astbuilder, fstr, atom_node, rec)
s = fstr.unparsed
i = fstr.current_index
@@ -284,7 +283,7 @@
# We must now be the start of an expression, on a '{'.
assert s[i] == '{'
expr = fstring_find_expr(astbuilder, fstr, atom_node, rec)
- return literal, expr
+ return w_u, expr
def parse_f_string(astbuilder, joined_pieces, fstr, atom_node, rec=0):
@@ -303,11 +302,11 @@
"really the case", atom_node)
while True:
- literal, expr = fstring_find_literal_and_expr(astbuilder, fstr,
+ w_u, expr = fstring_find_literal_and_expr(astbuilder, fstr,
atom_node, rec)
# add the literal part
- f_constant_string(astbuilder, joined_pieces, literal, atom_node)
+ f_constant_string(astbuilder, joined_pieces, w_u, atom_node)
if expr is None:
break # We're done with this f-string.
diff --git a/pypy/interpreter/astcompiler/misc.py b/pypy/interpreter/astcompiler/misc.py
--- a/pypy/interpreter/astcompiler/misc.py
+++ b/pypy/interpreter/astcompiler/misc.py
@@ -112,7 +112,7 @@
# only intern identifier-like strings
from pypy.objspace.std.unicodeobject import _isidentifier
if (space.is_w(space.type(w_const), space.w_unicode) and
- _isidentifier(space.unicode_w(w_const))):
+ _isidentifier(space.utf8_w(w_const))):
return space.new_interned_w_str(w_const)
return w_const
diff --git a/pypy/interpreter/astcompiler/optimize.py b/pypy/interpreter/astcompiler/optimize.py
--- a/pypy/interpreter/astcompiler/optimize.py
+++ b/pypy/interpreter/astcompiler/optimize.py
@@ -5,7 +5,7 @@
from pypy.tool import stdlib_opcode as ops
from pypy.interpreter.error import OperationError
from rpython.rlib.unroll import unrolling_iterable
-from rpython.rlib.runicode import MAXUNICODE
+from rpython.rlib.rutf8 import MAXUNICODE
from rpython.rlib.objectmodel import specialize
@@ -326,7 +326,7 @@
# produce compatible pycs.
if (self.space.isinstance_w(w_obj, self.space.w_unicode) and
self.space.isinstance_w(w_const, self.space.w_unicode)):
- #unistr = self.space.unicode_w(w_const)
+ #unistr = self.space.utf8_w(w_const)
#if len(unistr) == 1:
# ch = ord(unistr[0])
#else:
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -902,7 +902,7 @@
def test_flufl(self):
source = "x <> y"
- raises(SyntaxError, self.get_ast, source)
+ py.test.raises(SyntaxError, self.get_ast, source)
comp = self.get_first_expr(source,
flags=consts.CO_FUTURE_BARRY_AS_BDFL)
assert isinstance(comp, ast.Compare)
@@ -1130,7 +1130,7 @@
s = self.get_first_expr("b'hi' b' implicitly' b' extra'")
assert isinstance(s, ast.Bytes)
assert space.eq_w(s.s, space.newbytes("hi implicitly extra"))
- raises(SyntaxError, self.get_first_expr, "b'hello' 'world'")
+ py.test.raises(SyntaxError, self.get_first_expr, "b'hello' 'world'")
sentence = u"Die Männer ärgern sich!"
source = u"# coding: utf-7\nstuff = '%s'" % (sentence,)
info = pyparse.CompileInfo("<test>", "exec")
@@ -1325,8 +1325,8 @@
assert isinstance(if2, ast.Name)
def test_cpython_issue12983(self):
- raises(SyntaxError, self.get_ast, r"""b'\x'""")
- raises(SyntaxError, self.get_ast, r"""b'\x0'""")
+ py.test.raises(SyntaxError, self.get_ast, r"""b'\x'""")
+ py.test.raises(SyntaxError, self.get_ast, r"""b'\x0'""")
def test_matmul(self):
mod = self.get_ast("a @ b")
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1,5 +1,6 @@
from __future__ import division
import py, sys
+from pytest import raises
from pypy.interpreter.astcompiler import codegen, astbuilder, symtable, optimize
from pypy.interpreter.pyparser import pyparse
from pypy.interpreter.pyparser.test import expressions
@@ -76,7 +77,7 @@
space = self.space
pyco_expr = PyCode._from_code(space, co_expr)
w_res = pyco_expr.exec_host_bytecode(w_dict, w_dict)
- res = space.str_w(space.repr(w_res))
+ res = space.text_w(space.repr(w_res))
expected_repr = self.get_py3_repr(expected)
if isinstance(expected, float):
# Float representation can vary a bit between interpreter
@@ -1249,7 +1250,6 @@
def test_revdb_metavar(self):
from pypy.interpreter.reverse_debugging import dbstate, setup_revdb
- self.space.config.translation.reverse_debugger = True
self.space.reverse_debugging = True
try:
setup_revdb(self.space)
@@ -1264,9 +1264,6 @@
class AppTestCompiler:
- def setup_class(cls):
- cls.w_maxunicode = cls.space.wrap(sys.maxunicode)
-
def test_docstring_not_loaded(self):
import io, dis, sys
ns = {}
@@ -1428,7 +1425,7 @@
''', d)
return d['f'](5)
""")
- assert 'generator' in space.str_w(space.repr(w_generator))
+ assert 'generator' in space.text_w(space.repr(w_generator))
def test_folding_of_list_constants(self):
for source in (
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -3,7 +3,7 @@
from rpython.rlib.cache import Cache
from rpython.tool.uid import HUGEVAL_BYTES
-from rpython.rlib import jit, types
+from rpython.rlib import jit, types, rutf8
from rpython.rlib.debug import make_sure_not_resized
from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
compute_unique_id, specialize, not_rpython)
@@ -80,10 +80,10 @@
def getname(self, space):
try:
- return space.unicode_w(space.getattr(self, space.newtext('__name__')))
+ return space.utf8_w(space.getattr(self, space.newtext('__name__')))
except OperationError as e:
if e.match(space, space.w_TypeError) or e.match(space, space.w_AttributeError):
- return u'?'
+ return '?'
raise
def getaddrstring(self, space):
@@ -105,9 +105,9 @@
w_id = space.rshift(w_id, w_4)
return ''.join(addrstring)
- def getrepr(self, space, info, moreinfo=u''):
- addrstring = unicode(self.getaddrstring(space))
- return space.newunicode(u"<%s at 0x%s%s>" % (info, addrstring, moreinfo))
+ def getrepr(self, space, info, moreinfo=''):
+ addrstring = self.getaddrstring(space)
+ return space.newtext("<%s at 0x%s%s>" % (info, addrstring, moreinfo))
def getslotvalue(self, index):
raise NotImplementedError
@@ -245,11 +245,14 @@
def bytes_w(self, space):
self._typed_unwrap_error(space, "bytes")
- def unicode_w(self, space):
- self._typed_unwrap_error(space, "string")
+ def text_w(self, space):
+ self._typed_unwrap_error(space, "unicode")
- def text_w(self, space):
- self._typed_unwrap_error(space, "string")
+ def utf8_w(self, space):
+ self._typed_unwrap_error(space, "unicode")
+
+ def convert_to_w_unicode(self, space):
+ self._typed_unwrap_error(space, "unicode")
def bytearray_list_of_chars_w(self, space):
self._typed_unwrap_error(space, "bytearray")
@@ -420,7 +423,7 @@
self.builtin_modules = {}
self.reloading_modules = {}
- self.interned_strings = make_weak_value_dictionary(self, unicode, W_Root)
+ self.interned_strings = make_weak_value_dictionary(self, str, W_Root)
self.actionflag = ActionFlag() # changed by the signal module
self.check_signal_action = None # changed by the signal module
make_finalizer_queue(W_Root, self)
@@ -781,12 +784,12 @@
def setitem_str(self, w_obj, key, w_value):
# key is a "text", i.e. a byte string (in python3 it
- # represents a utf-8-encoded unicode)
+ # represents a valid utf-8-encoded unicode)
return self.setitem(w_obj, self.newtext(key), w_value)
def finditem_str(self, w_obj, key):
# key is a "text", i.e. a byte string (in python3 it
- # represents a utf-8-encoded unicode)
+ # represents a valid utf-8-encoded unicode)
return self.finditem(w_obj, self.newtext(key))
def finditem(self, w_obj, w_key):
@@ -820,9 +823,9 @@
def new_interned_w_str(self, w_u):
assert isinstance(w_u, W_Root) # and is not None
- u = self.unicode_w(w_u)
+ u = self.utf8_w(w_u)
if not we_are_translated():
- assert type(u) is unicode
+ assert type(u) is str
w_u1 = self.interned_strings.get(u)
if w_u1 is None:
w_u1 = w_u
@@ -835,12 +838,11 @@
# returns a "text" object (ie str in python2 and unicode in python3)
if not we_are_translated():
assert type(s) is str
- u = s.decode('utf-8')
- w_s1 = self.interned_strings.get(u)
+ w_s1 = self.interned_strings.get(s)
if w_s1 is None:
- w_s1 = self.newunicode(u)
+ w_s1 = self.newtext(s)
if self._side_effects_ok():
- self.interned_strings.set(u, w_s1)
+ self.interned_strings.set(s, w_s1)
return w_s1
def _revdb_startup(self):
@@ -879,11 +881,7 @@
# interface for marshal_impl
if not we_are_translated():
assert type(s) is str
- try:
- u = s.decode('utf-8')
- except UnicodeDecodeError:
- return None
- return self.interned_strings.get(u) # may be None
+ return self.interned_strings.get(s) # may be None
@specialize.arg(1)
def descr_self_interp_w(self, RequiredClass, w_obj):
@@ -1066,7 +1064,7 @@
"""
return None
- def listview_unicode(self, w_list):
+ def listview_utf8(self, w_list):
""" Return a list of unwrapped unicode out of a list of unicode. If the
argument is not a list or does not contain only unicode, return None.
May return None anyway.
@@ -1096,8 +1094,15 @@
def newlist_bytes(self, list_s):
return self.newlist([self.newbytes(s) for s in list_s])
- def newlist_unicode(self, list_u):
- return self.newlist([self.newunicode(u) for u in list_u])
+ def newlist_utf8(self, list_u, is_ascii):
+ l_w = [None] * len(list_u)
+ for i, item in enumerate(list_u):
+ if not is_ascii:
+ length = rutf8.check_utf8(item, True)
+ else:
+ length = len(item)
+ l_w[i] = self.newutf8(item, length)
+ return self.newlist(l_w)
def newlist_int(self, list_i):
return self.newlist([self.newint(i) for i in list_i])
@@ -1595,6 +1600,8 @@
else:
assert False
+ if self.isinstance_w(w_obj, self.w_unicode):
+ return w_obj.charbuf_w(self)
def text_or_none_w(self, w_obj):
return None if self.is_none(w_obj) else self.text_w(w_obj)
@@ -1617,18 +1624,22 @@
an utf-8 encoded rpython string.
"""
assert w_obj is not None
+ if not self.isinstance_w(w_obj, self.w_unicode):
+ w_obj._typed_unwrap_error(self, "unicode")
return w_obj.text_w(self)
@not_rpython # tests only; should be replaced with bytes_w or text_w
def str_w(self, w_obj):
"""
- if w_obj is unicode, call text_w() (i.e., return the UTF-8-nosg
+ if w_obj is unicode, call utf8_w() (i.e., return the UTF-8-nosg
encoded string). Else, call bytes_w().
We should kill str_w completely and manually substitute it with
text_w/bytes_w at all call sites. It remains for now for tests only.
"""
+ XXX # deprecated, leaving in place for clear errors
if self.isinstance_w(w_obj, self.w_unicode):
+ # XXX lo text_w, but better to deprecate str_w than to fix this
return w_obj.text_w(self)
else:
return w_obj.bytes_w(self)
@@ -1711,23 +1722,38 @@
assert w_obj is not None
return w_obj.float_w(self, allow_conversion)
- @specialize.argtype(1)
- def unicode_w(self, w_obj):
- assert w_obj is not None
- return w_obj.unicode_w(self)
+ def utf8_w(self, w_obj):
+ return w_obj.utf8_w(self)
- def unicode0_w(self, w_obj):
- "Like unicode_w, but rejects strings with NUL bytes."
+ def utf8_0_w(self, w_obj):
+ "Like utf8_w, but rejects strings with NUL bytes."
from rpython.rlib import rstring
- result = w_obj.unicode_w(self)
- if u'\x00' in result:
+ result = w_obj.utf8_w(self)
+ if '\x00' in result:
+ raise oefmt(self.w_TypeError,
+ "argument must be a string without NUL "
+ "characters")
+ return rstring.assert_str0(result)
+
+ def convert_to_w_unicode(self, w_obj):
+ return w_obj.convert_to_w_unicode(self)
+
+ def realunicode_w(self, w_obj):
+ from pypy.interpreter.unicodehelper import decode_utf8sp
+ utf8 = self.utf8_w(w_obj)
+ return decode_utf8sp(self, utf8)[0].decode('utf8')
+
+ def utf8_0_w(self, w_obj):
+ "Like utf8_w, but rejects strings with NUL bytes."
+ from rpython.rlib import rstring
+ result = w_obj.utf8_w(self)
+ if '\x00' in result:
raise oefmt(self.w_ValueError,
- "argument must be a unicode string without NUL "
+ "argument must be a utf8 string without NUL "
"characters")
return rstring.assert_str0(result)
realtext_w = text_w # Python 2 compatibility
- realunicode_w = unicode_w
def fsencode(space, w_obj):
from pypy.interpreter.unicodehelper import fsencode
@@ -1742,6 +1768,27 @@
w_obj = self.fsencode(w_obj)
return self.bytesbuf0_w(w_obj)
+ def convert_arg_to_w_unicode(self, w_obj, strict=None):
+ # XXX why does convert_to_w_unicode do something slightly different?
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ # for z_translation tests
+ if hasattr(self, 'is_fake_objspace'): return self.newtext("foobar")
+ return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict)
+
+ def utf8_len_w(self, w_obj):
+ w_obj = self.convert_arg_to_w_unicode(w_obj)
+ return w_obj._utf8, w_obj._len()
+
+ def realutf8_w(self, w_obj):
+ # Like utf8_w(), but only works if w_obj is really of type
+ # 'unicode'. On Python 3 this is the same as utf8_w().
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ # for z_translation tests
+ if hasattr(self, 'is_fake_objspace'): return self.newtext("foobar")
+ if not isinstance(w_obj, W_UnicodeObject):
+ raise oefmt(self.w_TypeError, "argument must be a unicode")
+ return self.utf8_w(w_obj)
+
def bytesbuf0_w(self, w_obj):
# Like bytes0_w(), but also accept a read-only buffer.
from rpython.rlib import rstring
@@ -1759,7 +1806,7 @@
def fsdecode_w(self, w_obj):
if self.isinstance_w(w_obj, self.w_bytes):
w_obj = self.fsdecode(w_obj)
- return self.unicode0_w(w_obj)
+ return self.utf8_w(w_obj)
def bool_w(self, w_obj):
# Unwraps a bool, also accepting an int for compatibility.
@@ -2087,7 +2134,7 @@
'float_w',
'uint_w',
'bigint_w',
- 'unicode_w',
+ 'utf8_w',
'unwrap',
'is_true',
'is_w',
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -9,8 +9,7 @@
from rpython.rlib.objectmodel import we_are_translated, specialize
from rpython.rlib.objectmodel import dont_inline, not_rpython
from rpython.rlib import rstack, rstackovf
-from rpython.rlib import rwin32
-from rpython.rlib import runicode
+from rpython.rlib import rwin32, rutf8
from pypy.interpreter import debug
@@ -21,7 +20,8 @@
def strerror(errno):
"""Translate an error code to a unicode message string."""
from pypy.module._codecs.locale import str_decode_locale_surrogateescape
- return str_decode_locale_surrogateescape(os.strerror(errno))
+ utf8, lgt = str_decode_locale_surrogateescape(os.strerror(errno))
+ return utf8, lgt
class OperationError(Exception):
"""Interpreter-level exception that signals an exception that should be
@@ -72,7 +72,7 @@
space = getattr(self.w_type, 'space', None)
if space is not None:
if self.__class__ is not OperationError and s is None:
- s = self._compute_value(space)
+ s, lgt = self._compute_value(space)
try:
s = space.text_w(s)
except Exception:
@@ -306,8 +306,8 @@
def get_w_value(self, space):
w_value = self._w_value
if w_value is None:
- value = self._compute_value(space)
- self._w_value = w_value = space.newunicode(value)
+ value, lgt = self._compute_value(space)
+ self._w_value = w_value = space.newtext(value, lgt)
return w_value
def _compute_value(self, space):
@@ -472,16 +472,7 @@
assert len(formats) > 0, "unsupported: no % command found"
return tuple(parts), tuple(formats)
-def _decode_utf8(string):
- # when building the error message, don't crash if the byte string
- # provided is not valid UTF-8
- assert isinstance(string, str)
- result, consumed = runicode.str_decode_utf_8(
- string, len(string), "replace", final=True)
- return result
-
def get_operrcls2(valuefmt):
- valuefmt = valuefmt.decode('ascii')
strings, formats = decompose_valuefmt(valuefmt)
assert len(strings) == len(formats) + 1
try:
@@ -501,30 +492,49 @@
def _compute_value(self, space):
lst = [None] * (len(formats) + len(formats) + 1)
+ lgt = 0
for i, fmt, attr in entries:
lst[i + i] = self.xstrings[i]
+ lgt += len(self.xstrings[i])
value = getattr(self, attr)
if fmt == 'd':
- result = str(value).decode('ascii')
+ result = str(value)
+ lgt += len(result)
elif fmt == 'R':
- result = space.unicode_w(space.repr(value))
+ result = space.utf8_w(space.repr(value))
+ lgt += len(result)
elif fmt == 'S':
- result = space.unicode_w(space.str(value))
+ result = space.utf8_w(space.str(value))
+ lgt += len(result)
elif fmt == 'T':
- result = _decode_utf8(space.type(value).name)
+ result = space.type(value).name
+ lgt += len(result)
elif fmt == 'N':
result = value.getname(space)
+ lgt += len(result)
elif fmt == '8':
- result = _decode_utf8(value)
+ # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
+ from pypy.interpreter import unicodehelper
+ result, _lgt, pos = unicodehelper.str_decode_utf8(
+ value, 'replace', True,
+ unicodehelper.decode_never_raise, True)
+ lgt += _lgt
+ elif isinstance(value, unicode):
+ # 's'
+ result = str(value.encode('utf-8'))
+ lgt += len(value)
else:
- if isinstance(value, unicode):
- result = value
- else:
- result = _decode_utf8(str(value))
+ result = str(value)
+ try:
+ lgt += rutf8.check_utf8(result, True)
+ except rutf8.CheckError as e:
+ lgt -= e.pos
lst[i + i + 1] = result
lst[-1] = self.xstrings[-1]
- return u''.join(lst)
- #
+ lgt += len(self.xstrings[-1])
+ retval = ''.join(lst)
+ return retval, lgt
+
_fmtcache2[formats] = OpErrFmt
return OpErrFmt, strings
@@ -534,7 +544,7 @@
self.setup(w_type)
def _compute_value(self, space):
- return self._value.decode('utf-8')
+ return self._value, len(self._value)
def async(self, space):
# also matches a RuntimeError("maximum rec.") if the stack is
@@ -565,8 +575,8 @@
%8 - The result of arg.decode('utf-8')
%N - The result of w_arg.getname(space)
- %R - The result of space.unicode_w(space.repr(w_arg))
- %S - The result of space.unicode_w(space.str(w_arg))
+ %R - The result of space.utf8_w(space.repr(w_arg))
+ %S - The result of space.utf8_w(space.str(w_arg))
%T - The result of space.type(w_arg).name
"""
@@ -621,12 +631,13 @@
if rwin32.WIN32 and isinstance(e, WindowsError):
winerror = e.winerror
try:
- msg = rwin32.FormatErrorW(winerror)
+ msg, lgt = rwin32.FormatErrorW(winerror)
except ValueError:
- msg = u'Windows Error %d' % winerror
+ msg = 'Windows Error %d' % winerror
+ lgt = len(msg)
w_errno = space.w_None
w_winerror = space.newint(winerror)
- w_msg = space.newunicode(msg)
+ w_msg = space.newtext(msg, lgt)
else:
errno = e.errno
if errno == EINTR:
@@ -635,12 +646,13 @@
return None
try:
- msg = strerror(errno)
+ msg, lgt = strerror(errno)
except ValueError:
- msg = u'error %d' % errno
+ msg = 'error %d' % errno
+ lgt = len(msg)
w_errno = space.newint(errno)
w_winerror = space.w_None
- w_msg = space.newunicode(msg)
+ w_msg = space.newtext(msg, lgt)
if w_filename is None:
w_filename = space.w_None
@@ -670,9 +682,9 @@
eintr_retry=eintr_retry)
def exception_from_errno(space, w_type, errno):
- msg = strerror(errno)
+ msg, lgt = strerror(errno)
w_error = space.call_function(w_type, space.newint(errno),
- space.newunicode(msg))
+ space.newtext(msg, lgt))
return OperationError(w_type, w_error)
def exception_from_saved_errno(space, w_type):
diff --git a/pypy/interpreter/function.py b/pypy/interpreter/function.py
--- a/pypy/interpreter/function.py
+++ b/pypy/interpreter/function.py
@@ -45,7 +45,8 @@
closure=None, w_ann=None, forcename=None, qualname=None):
self.space = space
self.name = forcename or code.co_name
- self.qualname = qualname or self.name.decode('utf-8')
+ self.qualname = qualname or self.name
+ assert isinstance(self.qualname, str)
self.w_doc = None # lazily read from code.getdocstring()
self.code = code # Code instance
self.w_func_globals = w_globals # the globals dictionary
@@ -255,7 +256,7 @@
return self.call_args(__args__)
def descr_function_repr(self):
- return self.getrepr(self.space, u'function %s' % self.qualname)
+ return self.getrepr(self.space, 'function %s' % self.qualname)
def _cleanup_(self):
@@ -313,7 +314,7 @@
tup_base = []
tup_state = [
space.newtext(self.name),
- space.newunicode(self.qualname),
+ space.newtext(self.qualname),
w_doc,
self.code,
w_func_globals,
@@ -337,7 +338,7 @@
self.space = space
self.name = space.text_w(w_name)
- self.qualname = space.unicode_w(w_qualname)
+ self.qualname = space.utf8_w(w_qualname)
self.code = space.interp_w(Code, w_code)
if not space.is_w(w_closure, space.w_None):
from pypy.interpreter.nestedscope import Cell
@@ -430,11 +431,11 @@
"__name__ must be set to a string object")
def fget_func_qualname(self, space):
- return space.newunicode(self.qualname)
+ return space.newtext(self.qualname)
def fset_func_qualname(self, space, w_name):
try:
- self.qualname = space.unicode_w(w_name)
+ self.qualname = space.realutf8_w(w_name)
except OperationError as e:
if e.match(space, space.w_TypeError):
raise oefmt(space.w_TypeError,
@@ -549,14 +550,14 @@
name = self.w_function.getname(self.space)
else:
try:
- name = space.unicode_w(w_name)
+ name = space.utf8_w(w_name)
except OperationError as e:
if not e.match(space, space.w_TypeError):
raise
- name = u'?'
- objrepr = space.unicode_w(space.repr(self.w_instance))
- s = u'<bound method %s of %s>' % (name, objrepr)
- return space.newunicode(s)
+ name = '?'
+ objrepr = space.utf8_w(space.repr(self.w_instance))
+ s = b'<bound method %s of %s>' % (name, objrepr)
+ return space.newtext(s)
def descr_method_getattribute(self, w_attr):
space = self.space
@@ -598,7 +599,7 @@
else:
w_builtins = space.getbuiltinmodule('builtins')
new_inst = space.getattr(w_builtins, space.newtext('getattr'))
- tup = [w_instance, space.newunicode(w_function.getname(space))]
+ tup = [w_instance, space.newtext(w_function.getname(space))]
return space.newtuple([new_inst, space.newtuple(tup)])
@@ -699,7 +700,7 @@
return self.space.newtext('<built-in function %s>' % (self.name,))
def descr__reduce__(self, space):
- return space.newunicode(self.qualname)
+ return space.newtext(self.qualname)
def is_builtin_code(w_func):
from pypy.interpreter.gateway import BuiltinCode
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -174,6 +174,9 @@
def visit_unicode(self, el, app_sig):
self.checked_space_method(el, app_sig)
+ def visit_utf8(self, el, app_sig):
+ self.checked_space_method(el, app_sig)
+
def visit_fsencode(self, el, app_sig):
self.checked_space_method(el, app_sig)
@@ -324,7 +327,10 @@
self.run_args.append("space.text0_w(%s)" % (self.scopenext(),))
def visit_unicode(self, typ):
- self.run_args.append("space.unicode_w(%s)" % (self.scopenext(),))
+ self.run_args.append("space.realunicode_w(%s)" % (self.scopenext(),))
+
+ def visit_utf8(self, typ):
+ self.run_args.append("space.utf8_w(%s)" % (self.scopenext(),))
def visit_fsencode(self, typ):
self.run_args.append("space.fsencode_w(%s)" % (self.scopenext(),))
@@ -492,11 +498,14 @@
self.unwrap.append("space.text_w(%s)" % (self.nextarg(),))
def visit_unicode(self, typ):
- self.unwrap.append("space.unicode_w(%s)" % (self.nextarg(),))
+ self.unwrap.append("space.realunicode_w(%s)" % (self.nextarg(),))
def visit_text0(self, typ):
self.unwrap.append("space.text0_w(%s)" % (self.nextarg(),))
+ def visit_utf8(self, typ):
+ self.unwrap.append("space.utf8_w(%s)" % (self.nextarg(),))
+
def visit_fsencode(self, typ):
self.unwrap.append("space.fsencode_w(%s)" % (self.nextarg(),))
@@ -567,8 +576,10 @@
assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong, bool)
if typ is r_int is r_longlong:
return 'gateway_r_longlong_w'
- elif typ in (str, unicode):
- return typ.__name__ + '_w'
+ elif typ is str:
+ return 'utf8_w'
+ elif typ is unicode:
+ return 'realunicode_w'
elif typ is bool:
# For argument clinic's "bool" specifier: accept any object, and
# convert it to a boolean value. If you don't want this
@@ -1113,7 +1124,7 @@
kw_defs_w = []
for name, w_def in sorted(alldefs_w.items()):
assert name in sig.kwonlyargnames
- w_name = space.newunicode(name.decode('utf-8'))
+ w_name = space.newtext(name)
kw_defs_w.append((w_name, w_def))
return defs_w, kw_defs_w
diff --git a/pypy/interpreter/generator.py b/pypy/interpreter/generator.py
--- a/pypy/interpreter/generator.py
+++ b/pypy/interpreter/generator.py
@@ -38,14 +38,12 @@
# 'qualname' is a unicode string
if self._qualname is not None:
return self._qualname
- return self.get_name().decode('utf-8')
+ return self.get_name()
def descr__repr__(self, space):
addrstring = self.getaddrstring(space)
- return space.newunicode(u"<%s object %s at 0x%s>" %
- (unicode(self.KIND),
- self.get_qualname(),
- unicode(addrstring)))
+ return space.newtext("<%s object %s at 0x%s>" %
+ (self.KIND, self.get_qualname(), addrstring))
def descr_send(self, w_arg):
"""send(arg) -> send 'arg' into generator/coroutine,
@@ -215,7 +213,7 @@
e2.record_context(space, space.getexecutioncontext())
raise e2
else:
- space.warn(space.newunicode(u"generator '%s' raised StopIteration"
+ space.warn(space.newtext("generator '%s' raised StopIteration"
% self.get_qualname()),
space.w_PendingDeprecationWarning)
@@ -308,11 +306,11 @@
"__name__ must be set to a string object")
def descr__qualname__(self, space):
- return space.newunicode(self.get_qualname())
+ return space.newtext(self.get_qualname())
def descr_set__qualname__(self, space, w_name):
try:
- self._qualname = space.unicode_w(w_name)
+ self._qualname = space.utf8_w(w_name)
except OperationError as e:
if e.match(space, space.w_TypeError):
raise oefmt(space.w_TypeError,
@@ -399,8 +397,8 @@
self.frame is not None and \
self.frame.last_instr == -1:
space = self.space
- msg = u"coroutine '%s' was never awaited" % self.get_qualname()
- space.warn(space.newunicode(msg), space.w_RuntimeWarning)
+ msg = "coroutine '%s' was never awaited" % self.get_qualname()
+ space.warn(space.newtext(msg), space.w_RuntimeWarning)
GeneratorOrCoroutine._finalize_(self)
diff --git a/pypy/interpreter/mixedmodule.py b/pypy/interpreter/mixedmodule.py
--- a/pypy/interpreter/mixedmodule.py
+++ b/pypy/interpreter/mixedmodule.py
@@ -130,7 +130,7 @@
bltin.w_module = self.w_name
func._builtinversion_ = bltin
bltin.name = name
- bltin.qualname = bltin.name.decode('utf-8')
+ bltin.qualname = bltin.name
w_value = bltin
space.setitem(self.w_dict, w_name, w_value)
return w_value
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -206,7 +206,7 @@
self.co_filename = '<builtin>/%s' % (basename,)
self.w_filename = self.space.newfilename(self.co_filename)
- co_names = property(lambda self: [self.space.str_w(w_name) for w_name in self.co_names_w]) # for trace
+ co_names = property(lambda self: [self.space.text_w(w_name) for w_name in self.co_names_w]) # for trace
def signature(self):
return self._signature
@@ -452,8 +452,8 @@
def repr(self, space):
space = self.space
# co_name should be an identifier
- name = self.co_name.decode('utf-8')
- fn = space.unicode_w(self.w_filename)
- return space.newunicode(u'<code object %s at 0x%s, file "%s", line %d>' % (
- name, unicode(self.getaddrstring(space)), fn,
+ name = self.co_name
+ fn = space.utf8_w(self.w_filename)
+ return space.newtext(b'<code object %s at 0x%s, file "%s", line %d>' % (
+ name, self.getaddrstring(space), fn,
-1 if self.co_firstlineno == 0 else self.co_firstlineno))
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -1081,8 +1081,8 @@
try:
w_pkgname = space.getattr(
w_module, space.newtext('__name__'))
- w_fullname = space.newunicode(u'%s.%s' %
- (space.unicode_w(w_pkgname), space.unicode_w(w_name)))
+ w_fullname = space.newtext(b'%s.%s' %
+ (space.utf8_w(w_pkgname), space.utf8_w(w_name)))
return space.getitem(space.sys.get('modules'), w_fullname)
except OperationError:
raise oefmt(
@@ -1333,7 +1333,7 @@
def _make_function(self, oparg, freevars=None):
space = self.space
w_qualname = self.popvalue()
- qualname = self.space.unicode_w(w_qualname)
+ qualname = self.space.utf8_w(w_qualname)
w_codeobj = self.popvalue()
codeobj = self.space.interp_w(PyCode, w_codeobj)
if freevars is not None:
@@ -1628,7 +1628,7 @@
if (oparg & consts.FVS_MASK) == consts.FVS_HAVE_SPEC:
w_spec = self.popvalue()
else:
- w_spec = space.newunicode(u'')
+ w_spec = space.newtext('')
w_value = self.popvalue()
#
conversion = oparg & consts.FVC_MASK
@@ -1649,9 +1649,9 @@
lst = []
for i in range(itemcount-1, -1, -1):
w_item = self.peekvalue(i)
- lst.append(space.unicode_w(w_item))
+ lst.append(space.utf8_w(w_item))
self.dropvalues(itemcount)
- w_res = space.newunicode(u''.join(lst))
+ w_res = space.newtext(''.join(lst))
self.pushvalue(w_res)
def _revdb_load_var(self, oparg):
diff --git a/pypy/interpreter/pyparser/error.py b/pypy/interpreter/pyparser/error.py
--- a/pypy/interpreter/pyparser/error.py
+++ b/pypy/interpreter/pyparser/error.py
@@ -29,20 +29,24 @@
except: # we can't allow any exceptions here!
return None""")
elif self.text is not None:
- from rpython.rlib.runicode import str_decode_utf_8
+ from rpython.rlib.runicode import str_decode_utf_8_impl
# self.text may not be UTF-8 in case of decoding errors.
# adjust the encoded text offset to a decoded offset
# XXX do the right thing about continuation lines, which
# XXX are their own fun, sometimes giving offset >
# XXX len(self.text) for example (right now, avoid crashing)
+ def replace_error_handler(errors, encoding, msg, s, startpos, endpos):
+ # must return unicode
+ return u'\ufffd', endpos
if offset > len(self.text):
offset = len(self.text)
- text, _ = str_decode_utf_8(self.text, offset, 'replace')
+ text, _ = str_decode_utf_8_impl(self.text, offset,
+ 'replace', False, replace_error_handler, True)
offset = len(text)
if len(self.text) != offset:
- text, _ = str_decode_utf_8(self.text, len(self.text),
- 'replace')
- w_text = space.newunicode(text)
+ text, _ = str_decode_utf_8_impl(self.text, len(self.text),
+ 'replace', False, replace_error_handler, True)
+ w_text = space.newtext(text.encode('utf8'), len(text))
return space.newtuple([
space.newtext(self.msg),
space.newtuple([
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -1,4 +1,5 @@
# coding: utf-8
+from rpython.rlib import rutf8
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter import unicodehelper
@@ -91,9 +92,11 @@
if encoding is None:
substr = s[ps:q]
else:
+ unicodehelper.check_utf8_or_raise(space, s, ps, q)
substr = decode_unicode_utf8(space, s, ps, q)
- v = unicodehelper.decode_unicode_escape(space, substr)
- return space.newunicode(v)
+ r = unicodehelper.decode_unicode_escape(space, substr)
+ v, length, pos = r
+ return space.newutf8(v, length)
assert 0 <= ps <= q
substr = s[ps : q]
@@ -111,8 +114,8 @@
elif saw_f:
return W_FString(substr, rawmode)
else:
- v = unicodehelper.decode_utf8(space, substr)
- return space.newunicode(v)
+ v = unicodehelper.str_decode_utf8(substr, 'strict', True, None)
+ return space.newtext(*v)
v = PyString_DecodeEscape(space, substr, 'strict', encoding)
return space.newbytes(v)
@@ -135,15 +138,12 @@
# the backslash we just wrote, we emit "\u005c"
# instead.
lis.append("u005c")
- if ord(s[ps]) & 0x80: # XXX inefficient
- w, ps = decode_utf8(space, s, ps, end)
- for c in w:
- # The equivalent of %08x, which is not supported by RPython.
- # 7 zeroes are enough for the unicode range, and the
- # result still fits in 32-bit.
- hexa = hex(ord(c) + 0x10000000)
- lis.append('\\U0')
- lis.append(hexa[3:]) # Skip 0x and the leading 1
+ if ord(s[ps]) & 0x80:
+ cp = rutf8.codepoint_at_pos(s, ps)
+ hexa = hex(cp + 0x10000000)
+ lis.append('\\U0')
+ lis.append(hexa[3:]) # Skip 0x and the leading 1
+ ps = rutf8.next_codepoint_pos(s, ps)
else:
lis.append(s[ps])
ps += 1
@@ -250,20 +250,29 @@
ch >= 'A' and ch <= 'F')
-def decode_utf8(space, s, ps, end):
+def check_utf8(space, s, ps, end):
assert ps >= 0
pt = ps
# while (s < end && *s != '\\') s++; */ /* inefficient for u".."
while ps < end and ord(s[ps]) & 0x80:
ps += 1
- u = unicodehelper.decode_utf8(space, s[pt:ps])
- return u, ps
+ try:
+ rutf8.check_utf8(s, True, pt, ps)
+ except rutf8.CheckError as e:
+ lgt, flag = rutf8.check_utf8(s, True, pt, e.pos)
+ unicodehelper.decode_error_handler(space)('strict', 'utf8',
+ 'invalid utf-8', s, pt + lgt, pt + lgt + 1)
+ return s[pt:ps]
def decode_utf8_recode(space, s, ps, end, recode_encoding):
- u, ps = decode_utf8(space, s, ps, end)
- w_v = unicodehelper.encode(space, space.newunicode(u), recode_encoding)
+ p = ps
+ while p < end and ord(s[p]) & 0x80:
+ p += 1
+ lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+ w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
+ recode_encoding)
v = space.bytes_w(w_v)
- return v, ps
+ return v, p
def raise_app_valueerror(space, msg):
raise OperationError(space.w_ValueError, space.newtext(msg))
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -6,6 +6,7 @@
from pypy.interpreter.pyparser.pytokenize import tabsize, alttabsize, whiteSpaceDFA, \
triple_quoted, endDFAs, single_quoted, pseudoDFA
from pypy.interpreter.astcompiler import consts
+from rpython.rlib import rutf8
NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
@@ -46,14 +47,9 @@
def verify_utf8(token):
- for c in token:
- if ord(c) >= 0x80:
- break
- else:
- return True
try:
- u = token.decode('utf-8')
- except UnicodeDecodeError:
+ rutf8.check_utf8(token, False)
+ except rutf8.CheckError:
return False
return True
@@ -69,17 +65,12 @@
def verify_identifier(token):
# 1=ok; 0=not an identifier; -1=bad utf-8
- for c in token:
- if ord(c) >= 0x80:
- break
- else:
- return 1
try:
- u = token.decode('utf-8')
- except UnicodeDecodeError:
+ rutf8.check_utf8(token, False)
+ except rutf8.CheckError:
return -1
from pypy.objspace.std.unicodeobject import _isidentifier
- return _isidentifier(u)
+ return _isidentifier(token)
DUMMY_DFA = automata.DFA([], [])
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -10,7 +10,7 @@
assert space.bytes_w(w_ret) == value
elif isinstance(value, unicode):
assert space.type(w_ret) == space.w_unicode
- assert space.unicode_w(w_ret) == value
+ assert space.utf8_w(w_ret).decode('utf8') == value
else:
assert False
@@ -61,7 +61,7 @@
s = "u'\x81'"
s = s.decode("koi8-u").encode("utf8")[1:]
w_ret = parsestring.parsestr(self.space, 'koi8-u', s)
- ret = space.unwrap(w_ret)
+ ret = w_ret._utf8.decode('utf8')
assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'")
def test_unicode_pep414(self):
@@ -112,14 +112,14 @@
space = self.space
s = '"""' + '\\' + '\n"""'
w_ret = parsestring.parsestr(space, None, s)
- assert space.str_w(w_ret) == ''
+ assert space.text_w(w_ret) == ''
def test_bug1(self):
space = self.space
expected = ['x', ' ', chr(0xc3), chr(0xa9), ' ', '\n']
input = ["'", 'x', ' ', chr(0xc3), chr(0xa9), ' ', chr(92), 'n', "'"]
w_ret = parsestring.parsestr(space, 'utf8', ''.join(input))
- assert space.str_w(w_ret) == ''.join(expected)
+ assert space.text_w(w_ret) == ''.join(expected)
def test_wide_unicode_in_source(self):
if sys.maxunicode == 65535:
@@ -131,7 +131,4 @@
def test_decode_unicode_utf8(self):
buf = parsestring.decode_unicode_utf8(self.space,
'u"\xf0\x9f\x92\x8b"', 2, 6)
- if sys.maxunicode == 65535:
- assert buf == r"\U0000d83d\U0000dc8b"
- else:
- assert buf == r"\U0001f48b"
+ assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_appinterp.py b/pypy/interpreter/test/test_appinterp.py
--- a/pypy/interpreter/test/test_appinterp.py
+++ b/pypy/interpreter/test/test_appinterp.py
@@ -155,7 +155,7 @@
w_mymod2 = MyModule(space2, space2.wrap('mymod'))
w_str = space1.getattr(w_mymod1, space1.wrap("hi"))
- assert space1.str_w(w_str) == "hello"
+ assert space1.text_w(w_str) == "hello"
class TestMixedModuleUnfreeze:
spaceconfig = dict(usemodules=('_socket',))
diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -55,6 +55,9 @@
pass
class DummySpace(object):
+ class sys:
+ defaultencoding = 'utf-8'
+
def newtuple(self, items):
return tuple(items)
@@ -92,16 +95,15 @@
def getitem(self, obj, key):
return obj[key]
- def wrap(self, obj):
+ def wrap(self, obj, lgt=-1):
return obj
newtext = wrap
- newunicode = wrap
def text_w(self, s):
- return self.unicode_w(s).encode('utf-8')
+ return self.utf8_w(s)
- def unicode_w(self, s):
- return unicode(s)
+ def utf8_w(self, s):
+ return s
def len(self, x):
return len(x)
@@ -135,7 +137,7 @@
def type(self, obj):
class Type:
def getname(self, space):
- return unicode(type(obj).__name__)
+ return type(obj).__name__
return Type()
@@ -343,14 +345,14 @@
def test_unwrap_error(self):
space = DummySpace()
valuedummy = object()
- def unicode_w(w):
+ def utf8_w(w):
if w is None:
raise OperationError(TypeError, None)
if w is valuedummy:
raise OperationError(ValueError, None)
- return str(w)
- space.unicode_w = unicode_w
- space.text_w = unicode_w
+ return bytes(w, 'utf-8')
+ space.utf8_w = utf8_w
+ space.text_w = utf8_w
excinfo = py.test.raises(OperationError, Arguments, space, [],
["a"], [1], w_starstararg={None: 1})
assert excinfo.value.w_type is TypeError
@@ -672,14 +674,14 @@
try:
Arguments(space, [], w_stararg=space.wrap(42))
except OperationError as e:
- msg = space.str_w(space.str(e.get_w_value(space)))
+ msg = space.text_w(space.str(e.get_w_value(space)))
assert msg == "argument after * must be an iterable, not int"
else:
assert 0, "did not raise"
try:
Arguments(space, [], w_starstararg=space.wrap(42))
except OperationError as e:
- msg = space.str_w(space.str(e.get_w_value(space)))
+ msg = space.text_w(space.str(e.get_w_value(space)))
assert msg == "argument after ** must be a mapping, not int"
else:
assert 0, "did not raise"
@@ -838,7 +840,6 @@
More information about the pypy-commit
mailing list