[pypy-commit] pypy unicode-utf8: some improvements for xmlcharrefreplace
fijal
pypy.commits at gmail.com
Tue Nov 21 09:20:39 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93112:fd1b64ce9b80
Date: 2017-11-21 15:19 +0100
http://bitbucket.org/pypy/pypy/changeset/fd1b64ce9b80/
Log: some improvements for xmlcharrefreplace
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -164,26 +164,31 @@
while i < size:
if ord(s[i]) <= 0x7F:
res.append(s[i])
+ i += 1
+ cur += 1
else:
oc = rutf8.codepoint_at_pos(s, i)
if oc <= 0xFF:
res.append(chr(oc))
- i += 1
+ cur += 1
+ i = rutf8.next_codepoint_pos(s, i)
else:
r, pos = errorhandler(errors, 'latin1',
'ordinal not in range(256)', s, cur,
cur + 1)
for j in range(pos - cur):
+ i = rutf8.next_codepoint_pos(s, i)
+
+ j = 0
+ while j < len(r):
c = rutf8.codepoint_at_pos(r, j)
if c > 0xFF:
errorhandler("strict", 'latin1',
'ordinal not in range(256)', s,
cur, cur + 1)
+ j = rutf8.next_codepoint_pos(r, j)
res.append(chr(c))
- i = rutf8.next_codepoint_pos(s, i)
cur = pos
- cur += 1
- i += 1
r = res.build()
return r
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -215,27 +215,30 @@
"don't know how to handle %T in error callback", w_exc)
def xmlcharrefreplace_errors(space, w_exc):
+ from pypy.interpreter import unicodehelper
+
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+ w_obj = space.getattr(w_exc, space.newtext('object'))
+ space.realutf8_w(w_obj) # weeoes
+ w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
- builder = UnicodeBuilder()
+ start = w_obj._index_to_byte(start)
+ end = w_obj._index_to_byte(end)
+ builder = StringBuilder()
pos = start
+ obj = w_obj._utf8
while pos < end:
- code = ord(obj[pos])
- if (MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and
- pos + 1 < end and 0xDC00 <= ord(obj[pos+1]) <= 0xDFFF):
- code = (code & 0x03FF) << 10
- code |= ord(obj[pos+1]) & 0x03FF
- code += 0x10000
- pos += 1
- builder.append(u"&#")
- builder.append(unicode(str(code)))
- builder.append(u";")
- pos += 1
- return space.newtuple([space.newunicode(builder.build()), w_end])
+ code = rutf8.codepoint_at_pos(obj, pos)
+ builder.append("&#")
+ builder.append(str(code))
+ builder.append(";")
+ pos = rutf8.next_codepoint_pos(obj, pos)
+ r = builder.build()
+ lgt, flag = rutf8.check_utf8(r, True)
+ return space.newtuple([space.newutf8(r, lgt, flag), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -750,3 +750,9 @@
assert _codecs.unicode_escape_decode(b) == (u'', 0)
assert _codecs.raw_unicode_escape_decode(b) == (u'', 0)
assert _codecs.unicode_internal_decode(b) == (u'', 0)
+
+ def test_xmlcharrefreplace(self):
+ r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('latin1', 'xmlcharrefreplace')
+ assert r == '&#4660;\x80&#9029;y\xab'
+ r = u'\u1234\u0080\u2345\u0079\u00AB'.encode('ascii', 'xmlcharrefreplace')
+ assert r == '&#4660;&#128;&#9029;y&#171;'
More information about the pypy-commit
mailing list