[pypy-commit] pypy unicode-utf8: Handle errorhandlers that go backward
rlamy
pypy.commits at gmail.com
Sun Dec 10 00:17:32 EST 2017
Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8
Changeset: r93342:a4d68881a89d
Date: 2017-12-10 05:16 +0000
http://bitbucket.org/pypy/pypy/changeset/a4d68881a89d/
Log: Handle errorhandlers that go backward
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1101,19 +1101,16 @@
ru, newindex = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR(result, ord(ch), byteorder)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
pos = rutf8.next_codepoint_pos(s, pos)
@@ -1282,22 +1279,19 @@
ch = rutf8.codepoint_at_pos(s, pos)
pos = rutf8.next_codepoint_pos(s, pos)
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- ru, newindex = errorhandler(errors, public_encoding_name,
+ res_8, newindex = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- for j in range(newindex - index):
- pos = rutf8.next_codepoint_pos(s, pos)
- j = 0
- while j < len(ru):
- ch = rutf8.codepoint_at_pos(ru, j)
- if ord(ch) < 0xD800:
- _STORECHAR32(result, ord(ch), byteorder)
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
- j = rutf8.next_codepoint_pos(ru, j)
- index = newindex
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
_STORECHAR32(result, ch, byteorder)
index += 1
@@ -1459,9 +1453,7 @@
result.append(ch2)
if index != newindex: # Should be uncommon
index = newindex
- pos = 0
- for _ in range(newindex):
- pos = rutf8.next_codepoint_pos(s, pos)
+ pos = rutf8._pos_at_index(s, newindex)
continue
result.append(c)
index += 1
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -439,7 +439,7 @@
low = codepoint_at_pos(utf8, i)
if 0xDC00 <= low <= 0xDFFF:
uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
- i = next_codepoint_pos(utf8, i)
+ i = next_codepoint_pos(utf8, i)
# else not really a surrogate pair, just append high
else:
i = next_codepoint_pos(utf8, i)
@@ -537,6 +537,13 @@
else:
return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
+def _pos_at_index(utf8, index):
+ # Slow!
+ pos = 0
+ for _ in range(index):
+ pos = next_codepoint_pos(utf8, pos)
+ return pos
+
@jit.dont_look_inside
def codepoint_at_index(utf8, storage, index):
""" Return codepoint of a character inside utf8 encoded string, given
More information about the pypy-commit
mailing list