[pypy-commit] pypy unicode-utf8: disallow invalid unicode from array

fijal pypy.commits at gmail.com
Sat Dec 9 07:46:38 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r93325:fe927a5758d2
Date: 2017-12-09 14:45 +0200
http://bitbucket.org/pypy/pypy/changeset/fe927a5758d2/

Log:	disallow invalid unicode from array

diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -380,6 +380,7 @@
         if len(s) % self.itemsize != 0:
             raise oefmt(self.space.w_ValueError,
                         "string length not a multiple of item size")
+        self.check_valid_unicode(space, s) # empty for non-u arrays
         oldlen = self.len
         new = len(s) / self.itemsize
         if not new:
@@ -710,6 +711,9 @@
             s = "array('%s', %s)" % (self.typecode, space.text_w(r))
             return space.newtext(s)
 
+    def check_valid_unicode(self, space, s):
+        pass # overwritten by u
+
 W_ArrayBase.typedef = TypeDef(
     'array.array',
     __new__ = interp2app(w_array),
@@ -870,6 +874,18 @@
         def get_buffer(self):
             return rffi.cast(mytype.arrayptrtype, self._buffer)
 
+        if mytype.unwrap == 'utf8_len_w':
+            def check_valid_unicode(self, space, s):
+                i = 0
+                while i < len(s):
+                    if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
+                        v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
+                             (ord(s[i + 2]) << 8) + ord(s[i + 3]))
+                        raise oefmt(space.w_ValueError,
+                            "Character U+%s is not in range [U+0000, U+10ffff]",
+                            hex(v)[2:])
+                    i += 4
+
         def item_w(self, w_item):
             space = self.space
             unwrap = getattr(space, mytype.unwrap)
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -844,13 +844,7 @@
         import sys
         if sys.maxunicode == 0xffff:
             skip("test for 32-bit unicodes")
-        a = self.array('u', b'\xff\xff\xff\xff')
-        assert len(a) == 1
-        assert repr(a[0]) == "u'\Uffffffff'"
-        if sys.maxint == 2147483647:
-            assert ord(a[0]) == -1
-        else:
-            assert ord(a[0]) == 4294967295
+        raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff')
 
     def test_weakref(self):
         import weakref


More information about the pypy-commit mailing list