[pypy-commit] pypy unicode-utf8-py3: pos in encoding error handler is in unicode not utf8. refactor, use only utf8
mattip
pypy.commits at gmail.com
Thu Nov 15 04:12:04 EST 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95318:83e8a364e912
Date: 2018-11-15 01:11 -0800
http://bitbucket.org/pypy/pypy/changeset/83e8a364e912/
Log: The pos passed to the encoding error handler is a unicode codepoint index,
not a utf8 byte offset. Refactor to use only utf8.
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -241,8 +241,9 @@
pos = e.pos
assert pos >= 0
start = s[:pos]
+ upos = rutf8.codepoints_in_utf8(s, end=pos)
ru, lgt = errorhandler(errors, 'utf8',
- 'surrogates not allowed', s, pos, pos + 1)
+ 'surrogates not allowed', s, upos, upos + 1)
end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
allow_surrogates=allow_surrogates)
s = start + ru + end
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -379,10 +379,11 @@
def surrogatepass_errors(space, w_exc):
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+ utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
encoding = space.text_w(space.getattr(w_exc, space.newtext('encoding')))
+ msg = space.text_w(space.getattr(w_exc, space.newtext('reason')))
bytelength, code = get_standard_encoding(encoding)
if code == ENC_UNKNOWN:
# Not supported, fail with original exception
@@ -390,8 +391,12 @@
end = space.int_w(w_end)
builder = StringBuilder()
pos = start
+ # start, end are in codepoint indices
+ itr = rutf8.Utf8StringIterator(utf8)
+ for i in range(pos):
+ itr.next()
while pos < end:
- ch = ord(obj[pos])
+ ch = itr.next()
pos += 1
if ch < 0xd800 or ch > 0xdfff:
# Not a surrogate, fail with original exception
@@ -466,8 +471,11 @@
end = space.int_w(w_end)
res = ''
pos = start
+ itr = rutf8.Utf8StringIterator(utf8)
+ for i in range(pos):
+ itr.next()
while pos < end:
- ch = rutf8.codepoint_at_pos(utf8, pos)
+ ch = itr.next()
pos += 1
if ch < 0xdc80 or ch > 0xdcff:
# Not a UTF-8b surrogate, fail with original exception
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -648,6 +648,8 @@
assert u'\ud8ae'.encode('utf_16_be', 'surrogatepass') == b'\xd8\xae'
assert (u'\U0000d8ae'.encode('utf-32-be', 'surrogatepass') ==
b'\x00\x00\xd8\xae')
+ assert (u'\x80\ud800'.encode('utf8', 'surrogatepass') ==
+ b'\xc2\x80\xed\xa0\x80')
def test_badandgoodsurrogatepassexceptions(self):
import codecs
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1209,28 +1209,26 @@
if errors is None:
errors = 'strict'
pos = rutf8.surrogate_in_utf8(utf8)
- if pos >= 0:
- handled_error = True
- else:
- handled_error = False
state = space.fromcache(CodecState)
eh = state.encode_error_handler
- while pos >= 0:
- start = utf8[:pos]
- ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8,
- pos, pos + 1)
- upos = rutf8.next_codepoint_pos(utf8, _pos)
- end = utf8[upos:]
- utf8 = start + ru + end
- _pos = rutf8.surrogate_in_utf8(utf8)
- if _pos <= pos:
- # surrogatepass?
- break
- pos = _pos
- if errors == 'surrogateescape' and handled_error:
- #escape
- return space.newbytes(utf8)
- w_object = space.newtext(utf8)
+ if pos >= 0:
+ while pos >= 0:
+ start = utf8[:pos]
+ upos = rutf8.codepoints_in_utf8(utf8, end=pos)
+ ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8,
+ upos, upos + 1)
+ upos = rutf8.next_codepoint_pos(utf8, _pos)
+ end = utf8[upos:]
+ utf8 = start + ru + end
+ _pos = rutf8.surrogate_in_utf8(utf8)
+ if _pos <= pos:
+ # surrogatepass?
+ break
+ pos = _pos
+ if errors == 'surrogateescape':
+ #escape
+ return space.newbytes(utf8)
+ w_object = space.newtext(utf8)
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
#if rutf8.has_surrogates(utf8):
More information about the pypy-commit
mailing list