[pypy-commit] pypy default: Fix the general testing for newstr(utf8, length_in_number_of_chars),

Sat Apr 13 10:00:48 EDT 2019

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r96454:1f16a5e43952
Date: 2019-04-13 15:36 +0200
http://bitbucket.org/pypy/pypy/changeset/1f16a5e43952/

Log:	Fix the general testing for newstr(utf8, length_in_number_of_chars),
	which should *now* work and complain if we give it an invalid number
	of chars.

	Fix a place in array.array where invalid utf8 strings were still
	being made; it was found by the above testing.

diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1053,21 +1053,17 @@
                 code = r_uint(ord(item))
                 # cpython will allow values > sys.maxunicode
                 # while silently truncating the top bits
-                if code <= r_uint(0x7F):
-                    # Encode ASCII
-                    item = chr(code)
-                elif code <= r_uint(0x07FF):
-                    item = (chr((0xc0 | (code >> 6))) + 
-                            chr((0x80 | (code & 0x3f))))
-                elif code <= r_uint(0xFFFF):
-                    item = (chr((0xe0 | (code >> 12))) +
-                            chr((0x80 | ((code >> 6) & 0x3f))) +
-                            chr((0x80 | (code & 0x3f))))
-                else:
-                    item = (chr((0xf0 | (code >> 18)) & 0xff) +
-                            chr((0x80 | ((code >> 12) & 0x3f))) +
-                            chr((0x80 | ((code >> 6) & 0x3f))) +
-                            chr((0x80 | (code & 0x3f))))
+                # For now I (arigo) am going to ignore that and
+                # raise a ValueError always here, instead of getting
+                # some invalid utf8-encoded string which makes things
+                # potentially explode left and right.
+                try:
+                    item = rutf8.unichr_as_utf8(code)
+                except rutf8.OutOfRange:
+                    raise oefmt(space.w_ValueError,
+                        "cannot operate on this array('u') because it contains"
+                        " character %s not in range [U+0000; U+10ffff]"
+                        " at index %d", 'U+%x' % code, idx)
                 return space.newutf8(item, 1)
             assert 0, "unreachable"
 
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -851,7 +851,13 @@
         a = self.array('u', input_unicode)
         b = self.array('u', input_unicode)
         b.byteswap()
-        assert a != b
+        assert b[2] == u'\u0000'
+        raises(ValueError, "b[1]")        # doesn't work
+        e = raises(ValueError, "a != b")  # doesn't work
+        assert str(e.value) == (
+            "cannot operate on this array('u') because it contains"
+            " character U+1000000 not in range [U+0000; U+10ffff]"
+            " at index 0")
         assert str(a) == "array('u', %r)" % (input_unicode,)
         assert str(b) == ("array('u', <character U+1000000 is not in"
                           " range [U+0000; U+10ffff]>)")
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -42,13 +42,10 @@
         self._length = length
         self._index_storage = rutf8.null_storage()
         if not we_are_translated():
-            try:
-                # best effort, too expensive to handle surrogates
-                ulength = rutf8.codepoints_in_utf(utf8str)
-            except:
-                ulength = length 
-            assert ulength == length
-
+            # utf8str must always be a valid utf8 string, except maybe with
+            # explicit surrogate characters---which .decode('utf-8') doesn't
+            # special-case in Python 2, which is exactly what we want here
+            assert length == len(utf8str.decode('utf-8'))
 
 
     @staticmethod