[pypy-commit] pypy unicode-utf8: first attempt at fixing the unicode surrogate mess
fijal
pypy.commits at gmail.com
Sat Nov 4 18:16:59 EDT 2017
Author: fijal
Branch: unicode-utf8
Changeset: r92937:0c93ee971f62
Date: 2017-11-04 19:07 +0100
http://bitbucket.org/pypy/pypy/changeset/0c93ee971f62/
Log: first attempt at fixing the unicode surrogate mess
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -1,4 +1,3 @@
-* unskip tests in test_unicodeobject.py
* rutf8.prev_codepoint_pos should use r_uint
* find a better way to run "find" without creating the index storage,
if one is not already readily available
@@ -9,3 +8,4 @@
* find all the fast-paths that we want to do with utf8 (we only do
utf-8 now, not UTF8 or utf8) for decode/encode
* encode_error_handler has XXX
+* reenable list strategies for ascii-only unicode
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -72,8 +72,8 @@
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
- utf, lgt = unicodehelper.decode_utf8(space, substr)
- w_u = space.newutf8(utf, lgt)
+ utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr)
+ w_u = space.newutf8(utf, lgt, flag)
w_v = unicodehelper.encode(space, w_u, encoding)
return w_v
else:
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -45,14 +45,14 @@
def _has_surrogate(u):
for c in u:
- if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F:
+ if 0xD800 <= ord(c) <= 0xDFFF:
return True
return False
def _get_flag(u):
flag = rutf8.FLAG_ASCII
for c in u:
- if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F:
+ if 0xD800 <= ord(c) <= 0xDFFF:
return rutf8.FLAG_HAS_SURROGATES
if ord(c) >= 0x80:
flag = rutf8.FLAG_REGULAR
@@ -143,7 +143,7 @@
def str_decode_ascii(s, slen, errors, final, errorhandler):
try:
rutf8.check_ascii(s)
- return s, slen, len(s)
+ return s, slen, len(s), rutf8.FLAG_ASCII
except rutf8.CheckError:
w = DecodeWrapper((errorhandler))
u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle)
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -30,8 +30,8 @@
raise oefmt(space.w_ValueError, "unichr() arg out of range")
if code < 0x80:
flag = rutf8.FLAG_ASCII
- elif 0xDB80 <= code <= 0xCBFF or 0xD800 <= code <= 0xDB7F:
- flag = rutf8.FLAG_HAS_SURROGATE
+ elif 0xD800 <= code <= 0xDFFF:
+ flag = rutf8.FLAG_HAS_SURROGATES
else:
flag = rutf8.FLAG_REGULAR
return space.newutf8(s, 1, flag)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -516,8 +516,9 @@
return w_obj.listview_unicode()
if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject:
return w_obj.listview_unicode()
- #if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj):
- # return w_obj.listview_unicode()
+ if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj)
+ and w_obj.is_ascii()):
+ return w_obj.listview_unicode()
if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj):
return w_obj.getitems_unicode()
return None
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -27,7 +27,6 @@
assert len(warnings) == 2
def test_listview_unicode(self):
- py.test.skip("skip for new")
w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII)
assert self.space.listview_unicode(w_str) == list(u"abcd")
@@ -662,7 +661,6 @@
assert unicode('+AB', 'utf-7', 'replace') == u'\ufffd'
def test_codecs_utf8(self):
- skip("unskip this before merge")
assert u''.encode('utf-8') == ''
assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
@@ -695,7 +693,6 @@
assert unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac'
def test_codecs_errors(self):
- skip("some nonsense in handling of ignore and replace")
# Error handling (encoding)
raises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
raises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -93,6 +93,8 @@
return space.text_w(space.str(self))
def utf8_w(self, space):
+ if self._has_surrogates():
+ return rutf8.reencode_utf8_with_surrogates(self._utf8)
return self._utf8
def readbuf_w(self, space):
@@ -115,8 +117,8 @@
charbuf_w = str_w
def listview_unicode(self):
- XXX # fix at some point
- return _create_list_from_unicode(self._value)
+ assert self.is_ascii()
+ return _create_list_from_unicode(self._utf8)
def ord(self, space):
if self._len() != 1:
@@ -410,7 +412,7 @@
"or unicode")
try:
if codepoint >= 0x80:
- flag = self._combine_flags(flag, rutf8.FLAG_NORMAL)
+ flag = self._combine_flags(flag, rutf8.FLAG_REGULAR)
rutf8.unichr_as_utf8_append(result, codepoint,
allow_surrogates=True)
result_length += 1
@@ -632,7 +634,7 @@
return rutf8.FLAG_REGULAR
def _get_flag(self):
- if self._is_ascii():
+ if self.is_ascii():
return rutf8.FLAG_ASCII
elif self._has_surrogates():
return rutf8.FLAG_HAS_SURROGATES
@@ -977,7 +979,7 @@
end = rutf8.next_codepoint_pos(self._utf8, start)
return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag())
- def _is_ascii(self):
+ def is_ascii(self):
return self._index_storage is rutf8.UTF8_IS_ASCII
def _has_surrogates(self):
@@ -986,7 +988,8 @@
self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES))
def _index_to_byte(self, index):
- if self._is_ascii():
+ if self.is_ascii():
+ assert index >= 0
return index
return rutf8.codepoint_position_at_index(
self._utf8, self._get_index_storage(), index)
@@ -1195,7 +1198,7 @@
assert False, "always raises"
return space.newbytes(s)
if ((encoding is None and space.sys.defaultencoding == 'utf8') or
- encoding == 'utf-8'):
+ encoding == 'utf-8' or encoding == 'utf8'):
return space.newbytes(space.utf8_w(w_object))
if w_encoder is None:
from pypy.module._codecs.interp_codecs import lookup_codec
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -388,6 +388,34 @@
assert pos - continuation_bytes >= 0
return pos - continuation_bytes, flag
+def reencode_utf8_with_surrogates(utf8):
+ """ Receiving valid UTF8 which contains surrogates, combine surrogate
+ pairs into correct UTF8 with pairs collapsed. This is a rare case
+ and you should not be using surrogate pairs in the first place,
+ so the performance here is a bit secondary
+ """
+ s = StringBuilder(len(utf8))
+ stop = len(utf8)
+ i = 0
+ while i < stop:
+ uchr = codepoint_at_pos(utf8, i)
+ if 0xD800 <= uchr <= 0xDBFF:
+ high = uchr
+ i = next_codepoint_pos(utf8, i)
+ if i >= stop:
+ unichr_as_utf8_append(s, uchr, True)
+ break
+ low = codepoint_at_pos(utf8, i)
+ if 0xDC00 <= low <= 0xDFFF:
+ uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
+ i = next_codepoint_pos(utf8, i)
+ # else not really a surrogate pair, just append high
+ else:
+ i = next_codepoint_pos(utf8, i)
+ unichr_as_utf8_append(s, uchr, True)
+ return s.build()
+
+
@jit.elidable
def codepoints_in_utf8(value, start=0, end=sys.maxint):
"""Return the number of codepoints in the UTF-8 byte string
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -40,9 +40,7 @@
def _test_check_utf8(s, allow_surrogates):
def _has_surrogates(s):
for u in s.decode('utf8'):
- if 0xD800 <= ord(u) <= 0xDB7F:
- return True
- if 0xDC00 <= ord(u) <= 0xDBFF:
+ if 0xD800 <= ord(u) <= 0xDFFF:
return True
return False
More information about the pypy-commit
mailing list