[pypy-commit] pypy unicode-utf8: first attempt at fixing the unicode surrogate mess

Sat Nov 4 18:16:59 EDT 2017

Author: fijal
Branch: unicode-utf8
Changeset: r92937:0c93ee971f62
Date: 2017-11-04 19:07 +0100
http://bitbucket.org/pypy/pypy/changeset/0c93ee971f62/

Log:	first attempt at fixing the unicode surrogate mess

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,4 +1,3 @@
-* unskip tests in test_unicodeobject.py
 * rutf8.prev_codepoint_pos should use r_uint
 * find a better way to run "find" without creating the index storage,
   if one is not already readily available
@@ -9,3 +8,4 @@
 * find all the fast-paths that we want to do with utf8 (we only do
   utf-8 now, not UTF8 or utf8) for decode/encode
 * encode_error_handler has XXX
+* reenable list strategies for ascii-only unicode
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -72,8 +72,8 @@
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
-            utf, lgt = unicodehelper.decode_utf8(space, substr)
-            w_u = space.newutf8(utf, lgt)
+            utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr)
+            w_u = space.newutf8(utf, lgt, flag)
             w_v = unicodehelper.encode(space, w_u, encoding)
             return w_v
         else:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -45,14 +45,14 @@
 
 def _has_surrogate(u):
     for c in u:
-        if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F:
+        if 0xD800 <= ord(c) <= 0xDFFF:
             return True
     return False
 
 def _get_flag(u):
     flag = rutf8.FLAG_ASCII
     for c in u:
-        if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F:
+        if 0xD800 <= ord(c) <= 0xDFFF:
             return rutf8.FLAG_HAS_SURROGATES
         if ord(c) >= 0x80:
             flag = rutf8.FLAG_REGULAR
@@ -143,7 +143,7 @@
 def str_decode_ascii(s, slen, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, slen, len(s)
+        return s, slen, len(s), rutf8.FLAG_ASCII
     except rutf8.CheckError:
         w = DecodeWrapper((errorhandler))
         u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle)
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -30,8 +30,8 @@
         raise oefmt(space.w_ValueError, "unichr() arg out of range")
     if code < 0x80:
         flag = rutf8.FLAG_ASCII
-    elif 0xDB80 <= code <= 0xCBFF or 0xD800 <= code <= 0xDB7F:
-        flag = rutf8.FLAG_HAS_SURROGATE
+    elif 0xD800 <= code <= 0xDFFF:
+        flag = rutf8.FLAG_HAS_SURROGATES
     else:
         flag = rutf8.FLAG_REGULAR
     return space.newutf8(s, 1, flag)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -516,8 +516,9 @@
             return w_obj.listview_unicode()
         if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
             return w_obj.listview_unicode()
-        #if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
-        #    return w_obj.listview_unicode()
+        if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj)
+            and w_obj.is_ascii()):
+            return w_obj.listview_unicode()
         if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
             return w_obj.getitems_unicode()
         return None
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -27,7 +27,6 @@
         assert len(warnings) == 2
 
     def test_listview_unicode(self):
-        py.test.skip("skip for new")
         w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
         assert self.space.listview_unicode(w_str) == list(u"abcd")
 
@@ -662,7 +661,6 @@
         assert unicode('+AB', 'utf-7', 'replace') == u'\ufffd'
 
     def test_codecs_utf8(self):
-        skip("unskip this before merge")
         assert u''.encode('utf-8') == ''
         assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
         assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
@@ -695,7 +693,6 @@
         assert unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' 
 
     def test_codecs_errors(self):
-        skip("some nonsense in handling of ignore and replace")
         # Error handling (encoding)
         raises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
         raises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -93,6 +93,8 @@
         return space.text_w(space.str(self))
 
     def utf8_w(self, space):
+        if self._has_surrogates():
+            return rutf8.reencode_utf8_with_surrogates(self._utf8)
         return self._utf8
 
     def readbuf_w(self, space):
@@ -115,8 +117,8 @@
     charbuf_w = str_w
 
     def listview_unicode(self):
-        XXX # fix at some point
-        return _create_list_from_unicode(self._value)
+        assert self.is_ascii()
+        return _create_list_from_unicode(self._utf8)
 
     def ord(self, space):
         if self._len() != 1:
@@ -410,7 +412,7 @@
                                 "or unicode")
             try:
                 if codepoint >= 0x80:
-                    flag = self._combine_flags(flag, rutf8.FLAG_NORMAL)
+                    flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
                 rutf8.unichr_as_utf8_append(result, codepoint,
                                             allow_surrogates=True)
                 result_length += 1
@@ -632,7 +634,7 @@
         return rutf8.FLAG_REGULAR
 
     def _get_flag(self):
-        if self._is_ascii():
+        if self.is_ascii():
             return rutf8.FLAG_ASCII
         elif self._has_surrogates():
             return rutf8.FLAG_HAS_SURROGATES
@@ -977,7 +979,7 @@
         end = rutf8.next_codepoint_pos(self._utf8, start)
         return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag())
 
-    def _is_ascii(self):
+    def is_ascii(self):
         return self._index_storage is rutf8.UTF8_IS_ASCII
 
     def _has_surrogates(self):
@@ -986,7 +988,8 @@
                  self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES))
 
     def _index_to_byte(self, index):
-        if self._is_ascii():
+        if self.is_ascii():
+            assert index >= 0
             return index
         return rutf8.codepoint_position_at_index(
             self._utf8, self._get_index_storage(), index)
@@ -1195,7 +1198,7 @@
                 assert False, "always raises"
             return space.newbytes(s)
         if ((encoding is None and space.sys.defaultencoding == 'utf8') or
-             encoding == 'utf-8'):
+             encoding == 'utf-8' or encoding == 'utf8'):
             return space.newbytes(space.utf8_w(w_object))
     if w_encoder is None:
         from pypy.module._codecs.interp_codecs import lookup_codec
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -388,6 +388,34 @@
     assert pos - continuation_bytes >= 0
     return pos - continuation_bytes, flag
 
+def reencode_utf8_with_surrogates(utf8):
+    """ Receiving valid UTF8 which contains surrogates, combine surrogate
+    pairs into correct UTF8 with the pairs collapsed. This is a rare case
+    and you should not be using surrogate pairs in the first place,
+    so the performance here is a bit secondary.
+    """
+    s = StringBuilder(len(utf8))
+    stop = len(utf8)
+    i = 0
+    while i < stop:
+        uchr = codepoint_at_pos(utf8, i)
+        if 0xD800 <= uchr <= 0xDBFF:
+            high = uchr
+            i = next_codepoint_pos(utf8, i)
+            if i >= stop:
+                unichr_as_utf8_append(s, uchr, True)
+                break
+            low = codepoint_at_pos(utf8, i)
+            if 0xDC00 <= low <= 0xDFFF:
+                uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
+                i = next_codepoint_pos(utf8, i)                
+            # else not really a surrogate pair, just append high
+        else:
+            i = next_codepoint_pos(utf8, i)
+        unichr_as_utf8_append(s, uchr, True)
+    return s.build()
+
+
 @jit.elidable
 def codepoints_in_utf8(value, start=0, end=sys.maxint):
     """Return the number of codepoints in the UTF-8 byte string
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -40,9 +40,7 @@
 def _test_check_utf8(s, allow_surrogates):
     def _has_surrogates(s):
         for u in s.decode('utf8'):
-            if 0xD800 <= ord(u) <= 0xDB7F:
-                return True
-            if 0xDC00 <= ord(u) <= 0xDBFF:
+            if 0xD800 <= ord(u) <= 0xDFFF:
                 return True
         return False