[pypy-commit] pypy default: Fix the general testing for newstr(utf8, length_in_number_of_chars),
arigo
pypy.commits at gmail.com
Sat Apr 13 10:00:48 EDT 2019
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r96454:1f16a5e43952
Date: 2019-04-13 15:36 +0200
http://bitbucket.org/pypy/pypy/changeset/1f16a5e43952/
Log: Fix the general check in newstr(utf8, length_in_number_of_chars),
which should *now* work and complain if we give it an invalid number of
chars.
Also fix a place in array.array where invalid utf8 strings were still
being created; it was found by the check above.
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1053,21 +1053,17 @@
code = r_uint(ord(item))
# cpython will allow values > sys.maxunicode
# while silently truncating the top bits
- if code <= r_uint(0x7F):
- # Encode ASCII
- item = chr(code)
- elif code <= r_uint(0x07FF):
- item = (chr((0xc0 | (code >> 6))) +
- chr((0x80 | (code & 0x3f))))
- elif code <= r_uint(0xFFFF):
- item = (chr((0xe0 | (code >> 12))) +
- chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
- else:
- item = (chr((0xf0 | (code >> 18)) & 0xff) +
- chr((0x80 | ((code >> 12) & 0x3f))) +
- chr((0x80 | ((code >> 6) & 0x3f))) +
- chr((0x80 | (code & 0x3f))))
+ # For now I (arigo) am going to ignore that and
+ # raise a ValueError always here, instead of getting
+ # some invalid utf8-encoded string which makes things
+ # potentially explode left and right.
+ try:
+ item = rutf8.unichr_as_utf8(code)
+ except rutf8.OutOfRange:
+ raise oefmt(space.w_ValueError,
+ "cannot operate on this array('u') because it contains"
+ " character %s not in range [U+0000; U+10ffff]"
+ " at index %d", 'U+%x' % code, idx)
return space.newutf8(item, 1)
assert 0, "unreachable"
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -851,7 +851,13 @@
a = self.array('u', input_unicode)
b = self.array('u', input_unicode)
b.byteswap()
- assert a != b
+ assert b[2] == u'\u0000'
+ raises(ValueError, "b[1]") # doesn't work
+ e = raises(ValueError, "a != b") # doesn't work
+ assert str(e.value) == (
+ "cannot operate on this array('u') because it contains"
+ " character U+1000000 not in range [U+0000; U+10ffff]"
+ " at index 0")
assert str(a) == "array('u', %r)" % (input_unicode,)
assert str(b) == ("array('u', <character U+1000000 is not in"
" range [U+0000; U+10ffff]>)")
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -42,13 +42,10 @@
self._length = length
self._index_storage = rutf8.null_storage()
if not we_are_translated():
- try:
- # best effort, too expensive to handle surrogates
- ulength = rutf8.codepoints_in_utf(utf8str)
- except:
- ulength = length
- assert ulength == length
-
+ # utf8str must always be a valid utf8 string, except maybe with
+ # explicit surrogate characters---which .decode('utf-8') doesn't
+ # special-case in Python 2, which is exactly what we want here
+ assert length == len(utf8str.decode('utf-8'))
@staticmethod
More information about the pypy-commit
mailing list