[pypy-commit] pypy default: Unicode characters out of range(0x11000): fix a few docstrings, and try to more

Wed Mar 27 07:26:34 EDT 2019

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r96359:a59c3b47eec9
Date: 2019-03-27 12:26 +0100
http://bitbucket.org/pypy/pypy/changeset/a59c3b47eec9/

Log:	Unicode characters out of range(0x11000): fix a few docstrings, and
	try to more systematically test (and fix) various corner cases

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -537,14 +537,17 @@
 def wcharpsize2utf8(space, wcharp, size):
     """Safe version of rffi.wcharpsize2utf8.
 
-    Raises app-level rutf8.OutOfRange if any wchar value is outside the valid
+    Raises app-level ValueError if any wchar value is outside the valid
     codepoint range.
     """
     try:
         return rffi.wcharpsize2utf8(wcharp, size)
     except rutf8.OutOfRange as e:
-        raise oefmt(space.w_ValueError,
-            "character %s is not in range [U+0000; U+10ffff]", 'U+%x' % e.code)
+        raise wrap_unicode_out_of_range_error(space, e)
+
+def wrap_unicode_out_of_range_error(space, e):
+    raise oefmt(space.w_ValueError,
+        "character %s is not in range [U+0000; U+10ffff]", 'U+%x' % e.code)
 
 
 # ____________________________________________________________
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -5,6 +5,8 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec
 from pypy.interpreter.typedef import interp_attrproperty
 from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
+from pypy.interpreter.unicodehelper import wrap_unicode_out_of_range_error
 
 from rpython.rlib.clibffi import *
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -596,10 +598,13 @@
     if address == 0:
         return space.w_None
     wcharp_addr = rffi.cast(rffi.CWCHARP, address)
-    if maxlength == -1:
-        s, lgt = rffi.wcharp2utf8(wcharp_addr)
-    else:
-        s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+    try:
+        if maxlength == -1:
+            s, lgt = rffi.wcharp2utf8(wcharp_addr)
+        else:
+            s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+    except rutf8.OutOfRange as e:
+        raise wrap_unicode_out_of_range_error(space, e)
     return space.newutf8(s, lgt)
 
 @unwrap_spec(address=r_uint, maxlength=int)
@@ -613,7 +618,7 @@
 def wcharp2rawunicode(space, address, maxlength=-1):
     if maxlength == -1:
         return wcharp2unicode(space, address)
-    s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+    s = wcharpsize2utf8(space, rffi.cast(rffi.CWCHARP, address), maxlength)
     return space.newutf8(s, maxlength)
 
 @unwrap_spec(address=r_uint, newcontent='bufferstr')
diff --git a/pypy/module/_rawffi/test/test__rawffi.py b/pypy/module/_rawffi/test/test__rawffi.py
--- a/pypy/module/_rawffi/test/test__rawffi.py
+++ b/pypy/module/_rawffi/test/test__rawffi.py
@@ -1229,6 +1229,23 @@
         lib = _rawffi.CDLL(self.lib_name)
         assert lib.name == self.lib_name
 
+    def test_wcharp2rawunicode(self):
+        import _rawffi
+        A = _rawffi.Array('i')
+        arg = A(1)
+        arg[0] = 0x1234
+        u = _rawffi.wcharp2rawunicode(arg.itemaddress(0))
+        assert u == u'\u1234'
+        u = _rawffi.wcharp2rawunicode(arg.itemaddress(0), 1)
+        assert u == u'\u1234'
+        arg[0] = -1
+        raises(ValueError, _rawffi.wcharp2rawunicode, arg.itemaddress(0))
+        raises(ValueError, _rawffi.wcharp2rawunicode, arg.itemaddress(0), 1)
+        arg[0] = 0x110000
+        raises(ValueError, _rawffi.wcharp2rawunicode, arg.itemaddress(0))
+        raises(ValueError, _rawffi.wcharp2rawunicode, arg.itemaddress(0), 1)
+        arg.free()
+
 
 class AppTestAutoFree:
     spaceconfig = dict(usemodules=['_rawffi', 'struct'])
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -13,6 +13,7 @@
     interp2app, interpindirect2app, unwrap_spec)
 from pypy.interpreter.typedef import (
     GetSetProperty, TypeDef, make_weakref_descr)
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
 from pypy.module._file.interp_file import W_File
 
 
@@ -463,7 +464,8 @@
         """
         if self.typecode == 'u':
             buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
-            return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len)
+            utf8 = wcharpsize2utf8(space, buf, self.len)
+            return space.newutf8(utf8, self.len)
         else:
             raise oefmt(space.w_ValueError,
                         "tounicode() may only be called on type 'u' arrays")
@@ -714,8 +716,15 @@
             s = "array('%s', %s)" % (self.typecode, space.text_w(r))
             return space.newtext(s)
         elif self.typecode == "u":
-            r = space.repr(self.descr_tounicode(space))
-            s = "array('%s', %s)" % (self.typecode, space.text_w(r))
+            try:
+                w_unicode = self.descr_tounicode(space)
+            except OperationError as e:
+                if not e.match(space, space.w_ValueError):
+                    raise
+                r = '<character out of range>'
+            else:
+                r = space.text_w(space.repr(w_unicode))
+            s = "array('%s', %s)" % (self.typecode, r)
             return space.newtext(s)
         else:
             r = space.repr(self.descr_tolist(space))
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -847,10 +847,15 @@
         assert repr(mya('i', (1, 2, 3))) == "array('i', [1, 2, 3])"
 
     def test_unicode_outofrange(self):
-        a = self.array('u', u'\x01\u263a\x00\ufeff')
-        b = self.array('u', u'\x01\u263a\x00\ufeff')
+        input_unicode = u'\x01\u263a\x00\ufeff'
+        a = self.array('u', input_unicode)
+        b = self.array('u', input_unicode)
         b.byteswap()
         assert a != b
+        assert str(a) == "array('u', %r)" % (input_unicode,)
+        assert str(b) == "array('u', <character out of range>)"
+        assert a.tounicode() == input_unicode
+        raises(ValueError, b.tounicode)   # doesn't work
 
     def test_weakref(self):
         import weakref
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1023,7 +1023,7 @@
 def wcharpsize2utf8(w, size):
     """ Helper to convert WCHARP pointer to utf8 in one go.
     Equivalent to wcharpsize2unicode().encode("utf8")
-    Raises ValueError if characters are outside range(0x110000)!
+    Raises rutf8.OutOfRange if characters are outside range(0x110000)!
     """
     from rpython.rlib import rutf8
 
@@ -1033,6 +1033,9 @@
     return s.build()
 
 def wcharp2utf8(w):
+    """
+    Raises rutf8.OutOfRange if characters are outside range(0x110000)!
+    """
     from rpython.rlib import rutf8
 
     s = rutf8.Utf8StringBuilder()
@@ -1043,6 +1046,9 @@
     return s.build(), i
 
 def wcharp2utf8n(w, maxlen):
+    """
+    Raises rutf8.OutOfRange if characters are outside range(0x110000)!
+    """
     from rpython.rlib import rutf8
 
     s = rutf8.Utf8StringBuilder(maxlen)