[pypy-commit] pypy unicode-utf8-py3: try one approach
mattip
pypy.commits at gmail.com
Sun Nov 18 22:36:37 EST 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95336:28269456b8aa
Date: 2018-11-16 12:28 -0800
http://bitbucket.org/pypy/pypy/changeset/28269456b8aa/
Log: try one approach
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1201,8 +1201,6 @@
def encode_object(space, w_object, encoding, errors, allow_surrogates=False):
- # TODO: refactor unnatrual use of error hanlders here,
- # we should make a single pass over the utf8 str
from pypy.module._codecs.interp_codecs import encode_text, CodecState
utf8 = space.utf8_w(w_object)
if not allow_surrogates:
@@ -1212,23 +1210,22 @@
state = space.fromcache(CodecState)
eh = state.encode_error_handler
if pos >= 0:
+ # remove surrogates in pieces, eh needs codepoint positions
+ res = []
while pos >= 0:
- start = utf8[:pos]
upos = rutf8.codepoints_in_utf8(utf8, end=pos)
- ru, _pos = eh(errors, "utf8", "surrogates not allowed", utf8,
+ ru, _pos = eh(errors, encoding, "surrogates not allowed", utf8,
upos, upos + 1)
- upos = rutf8.next_codepoint_pos(utf8, _pos)
- end = utf8[upos:]
- utf8 = start + ru + end
- _pos = rutf8.surrogate_in_utf8(utf8)
- if _pos <= pos:
- # surrogatepass?
- break
- pos = _pos
- if errors == 'surrogateescape':
- #escape
- return space.newbytes(utf8)
+ res.append(utf8[:pos])
+ res.append(ru)
+ utf8_pos = rutf8.next_codepoint_pos(utf8, _pos)
+ utf8 = utf8[utf8_pos:]
+ pos = rutf8.surrogate_in_utf8(utf8)
+ res.append(utf8)
+ utf8 = ''.join(res)
w_object = space.newtext(utf8)
+ # change the errors to only do the encoding now
+ errors = 'strict'
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
#if rutf8.has_surrogates(utf8):
@@ -1243,7 +1240,6 @@
a.pos, a.pos + 1)
assert False, "always raises"
return space.newbytes(utf8)
-
if encoding is None:
encoding = space.sys.defaultencoding
w_retval = encode_text(space, w_object, encoding, errors)
More information about the pypy-commit
mailing list