[pypy-commit] pypy unicode-utf8-test: hg merge unicode-utf8

Sat Dec 9 09:05:21 EST 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8-test
Changeset: r93330:a31f4ea5722a
Date: 2017-12-09 14:04 +0000
http://bitbucket.org/pypy/pypy/changeset/a31f4ea5722a/

Log:	hg merge unicode-utf8

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -70,9 +70,6 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
-            if newpos < startpos:
-                raise oefmt(space.w_IndexError,
-                    "position %d from error handler did not progress", newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
             return w_replace._utf8, newpos
         return call_errorhandler
@@ -226,7 +223,7 @@
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
         start = w_obj._index_to_byte(start)
-        end = w_obj._index_to_byte(end)        
+        end = w_obj._index_to_byte(end)
         builder = StringBuilder()
         pos = start
         obj = w_obj._utf8
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -380,6 +380,7 @@
         if len(s) % self.itemsize != 0:
             raise oefmt(self.space.w_ValueError,
                         "string length not a multiple of item size")
+        self.check_valid_unicode(space, s) # empty for non-u arrays
         oldlen = self.len
         new = len(s) / self.itemsize
         if not new:
@@ -710,6 +711,9 @@
             s = "array('%s', %s)" % (self.typecode, space.text_w(r))
             return space.newtext(s)
 
+    def check_valid_unicode(self, space, s):
+        pass # overwritten by u
+
 W_ArrayBase.typedef = TypeDef(
     'array.array',
     __new__ = interp2app(w_array),
@@ -870,6 +874,18 @@
         def get_buffer(self):
             return rffi.cast(mytype.arrayptrtype, self._buffer)
 
+        if mytype.unwrap == 'utf8_len_w':
+            def check_valid_unicode(self, space, s):
+                i = 0
+                while i < len(s):
+                    if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
+                        v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
+                             (ord(s[i + 2]) << 8) + ord(s[i + 3]))
+                        raise oefmt(space.w_ValueError,
+                            "Character U+%s is not in range [U+0000, U+10ffff]",
+                            hex(v)[2:])
+                    i += 4
+
         def item_w(self, w_item):
             space = self.space
             unwrap = getattr(space, mytype.unwrap)
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -844,13 +844,7 @@
         import sys
         if sys.maxunicode == 0xffff:
             skip("test for 32-bit unicodes")
-        a = self.array('u', b'\xff\xff\xff\xff')
-        assert len(a) == 1
-        assert repr(a[0]) == "u'\Uffffffff'"
-        if sys.maxint == 2147483647:
-            assert ord(a[0]) == -1
-        else:
-            assert ord(a[0]) == 4294967295
+        raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff')
 
     def test_weakref(self):
         import weakref
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -587,21 +587,22 @@
 
     def UnknownEncodingHandler(self, space, name, info):
         # Yes, supports only 8bit encodings
-        translationmap = space.unicode_w(
+        translationmap, lgt = space.utf8_len_w(
             space.call_method(
                 space.newbytes(self.all_chars), "decode",
                 space.newtext(name), space.newtext("replace")))
 
-        if len(translationmap) != 256:
+        if lgt != 256:
             raise oefmt(space.w_ValueError,
                         "multi-byte encodings are not supported")
 
-        for i in range(256):
-            c = translationmap[i]
-            if c == u'\ufffd':
+        i = 0
+        for c in rutf8.Utf8StringIterator(translationmap):
+            if c == 0xfffd:
                 info.c_map[i] = rffi.cast(rffi.INT, -1)
             else:
                 info.c_map[i] = rffi.cast(rffi.INT, c)
+            i += 1
         info.c_data = lltype.nullptr(rffi.VOIDP.TO)
         info.c_convert = lltype.nullptr(rffi.VOIDP.TO)
         info.c_release = lltype.nullptr(rffi.VOIDP.TO)
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,11 +7,8 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import TypeDef, interp_attrproperty
 from rpython.rlib.rarithmetic import r_longlong
-from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.runicode import MAXUNICODE
 from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
-import sys
+from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8
 
 
 # Contants for Hangul characters
@@ -30,49 +27,17 @@
 # unicode code point.
 
 
-if MAXUNICODE > 0xFFFF:
-    # Target is wide build
-    def unichr_to_code_w(space, w_unichr):
-        if not space.isinstance_w(w_unichr, space.w_unicode):
-            raise oefmt(
-                space.w_TypeError, 'argument 1 must be unicode, not %T',
-                w_unichr)
+# Target is wide build
+def unichr_to_code_w(space, w_unichr):
+    if not space.isinstance_w(w_unichr, space.w_unicode):
+        raise oefmt(
+            space.w_TypeError, 'argument 1 must be unicode, not %T',
+            w_unichr)
 
-        if not we_are_translated() and sys.maxunicode == 0xFFFF:
-            # Host CPython is narrow build, accept surrogates
-            try:
-                return ord_accepts_surrogate(space.unicode_w(w_unichr))
-            except TypeError:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-        else:
-            if not space.len_w(w_unichr) == 1:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-            return space.int_w(space.ord(w_unichr))
-
-else:
-    # Target is narrow build
-    def unichr_to_code_w(space, w_unichr):
-        if not space.isinstance_w(w_unichr, space.w_unicode):
-            raise oefmt(
-                space.w_TypeError, 'argument 1 must be unicode, not %T',
-                w_unichr)
-
-        if not we_are_translated() and sys.maxunicode > 0xFFFF:
-            # Host CPython is wide build, forbid surrogates
-            if not space.len_w(w_unichr) == 1:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-            return space.int_w(space.ord(w_unichr))
-
-        else:
-            # Accept surrogates
-            try:
-                return ord_accepts_surrogate(space.unicode_w(w_unichr))
-            except TypeError:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
+    if not space.len_w(w_unichr) == 1:
+        raise oefmt(space.w_TypeError,
+                    "need a single Unicode character as parameter")
+    return space.int_w(space.ord(w_unichr))
 
 
 class UCD(W_Root):
@@ -110,7 +75,7 @@
         except KeyError:
             msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name))
             raise OperationError(space.w_KeyError, msg)
-        return space.newunicode(code_to_unichr(code))
+        return space.newutf8(unichr_as_utf8(code), 1)
 
     def name(self, space, w_unichr, w_default=None):
         code = unichr_to_code_w(space, w_unichr)
@@ -259,10 +224,10 @@
                 result[0] = ch
 
         if not composed: # If decomposed normalization we are done
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         if j <= 1:
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         current = result[0]
         starter_pos = 0
@@ -310,7 +275,13 @@
 
         result[starter_pos] = current
 
-        return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]]))
+        return self.build(space, result, stop=next_insert)
+
+    def build(self, space, r, stop):
+        builder = Utf8StringBuilder(stop * 3)
+        for i in range(stop):
+            builder.append_code(r[i])
+        return space.newutf8(builder.build(), stop)
 
 
 methods = {}
diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -1,3 +1,4 @@
+
 import pytest
 try:
     from hypothesis import given, strategies as st, example, settings
@@ -5,12 +6,14 @@
     pytest.skip("hypothesis required")
 
 from pypy.module.unicodedata.interp_ucd import ucd
+from rpython.rlib.rutf8 import get_utf8_length
 
 def make_normalization(space, NF_code):
     def normalize(s):
-        w_s = space.newunicode(s)
+        u = s.encode('utf8')
+        w_s = space.newutf8(u, get_utf8_length(u))
         w_res = ucd.normalize(space, NF_code, w_s)
-        return space.unicode_w(w_res)
+        return space.utf8_w(w_res).decode('utf8')
     return normalize
 
 all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']