[pypy-commit] pypy default: merge heads
cfbolz
pypy.commits at gmail.com
Thu Sep 12 10:53:57 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r97462:5e5857c2fae6
Date: 2019-09-12 16:52 +0200
http://bitbucket.org/pypy/pypy/changeset/5e5857c2fae6/
Log: merge heads
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -60,12 +60,6 @@
return encode_object(space, w_data, encoding, errors)
-def _has_surrogate(u):
- for c in u:
- if 0xD800 <= ord(c) <= 0xDFFF:
- return True
- return False
-
# These functions take and return unwrapped rpython strings
def decode_unicode_escape(space, string):
from pypy.module._codecs import interp_codecs
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -433,7 +433,10 @@
end = len(self.text)
else:
end = self.pos + limit
- pos = self.text.find(marker, self.pos, end)
+ pos = self.pos
+ assert pos >= 0
+ assert end >= 0
+ pos = self.text.find(marker, pos, end)
if pos >= 0:
self.pos = self.upos = pos + 1
return True
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -874,11 +874,6 @@
def is_ascii(self):
return self._length == len(self._utf8)
- def _has_surrogates(self):
- if self.is_ascii():
- return False
- return rutf8.has_surrogates(self._utf8)
-
def _index_to_byte(self, index):
if self.is_ascii():
assert index >= 0
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -435,10 +435,17 @@
return result
def has_surrogates(utf8):
- # XXX write a faster version maybe
- for ch in Utf8StringIterator(utf8):
- if 0xD800 <= ch <= 0xDBFF:
+ # a surrogate starts with 0xed in utf-8 encoding
+ pos = 0
+ while True:
+ pos = utf8.find("\xed", pos)
+ if pos < 0:
+ return False
+ assert pos <= len(utf8) - 1 # otherwise invalid utf-8
+ ordch2 = ord(utf8[pos + 1])
+ if _invalid_byte_2_of_3(0xed, ordch2, allow_surrogates=False):
return True
+ pos += 1
return False
def reencode_utf8_with_surrogates(utf8):
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -238,3 +238,17 @@
assert pos == i
i = rutf8.next_codepoint_pos(utf8s, i)
assert list(arg) == l
+
+
+@given(strategies.text(), strategies.integers(0xd800, 0xdfff))
+def test_has_surrogates(arg, surrogate):
+ b = (arg + unichr(surrogate) + arg).encode("utf-8")
+ assert not rutf8.has_surrogates(arg.encode("utf-8"))
+ assert rutf8.has_surrogates(unichr(surrogate).encode("utf-8"))
+ assert rutf8.has_surrogates(b)
+
+def test_has_surrogate_xed_no_surrogate():
+ u = unichr(55217) + unichr(54990)
+ b = u.encode("utf-8")
+ assert b.startswith(b"\xed")
+ assert not rutf8.has_surrogates(b)
More information about the pypy-commit
mailing list