[pypy-commit] pypy default: Fix a corner case in multibytecodec: for stateful codecs, when encoding fails
arigo
pypy.commits at gmail.com
Thu Jan 23 05:38:34 EST 2020
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r98571:2ed84f7866b6
Date: 2020-01-23 11:37 +0100
http://bitbucket.org/pypy/pypy/changeset/2ed84f7866b6/
Log: Fix a corner case in multibytecodec: for stateful codecs, when
encoding fails and we use replacement, the replacement string must
be written in the output preserving the state.
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -194,17 +194,23 @@
rffi.SSIZE_T)
pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
[ENCODEBUF_P], MULTIBYTECODEC_P)
+pypy_cjk_enc_copystate = llexternal('pypy_cjk_enc_copystate',
+ [ENCODEBUF_P, ENCODEBUF_P], lltype.Void)
MBENC_FLUSH = 1
MBENC_RESET = 2
def encode(codec, unicodedata, length, errors="strict", errorcb=None,
- namecb=None):
+ namecb=None, copystate=lltype.nullptr(ENCODEBUF_P.TO)):
encodebuf = pypy_cjk_enc_new(codec)
if not encodebuf:
raise MemoryError
+ if copystate:
+ pypy_cjk_enc_copystate(encodebuf, copystate)
try:
return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
finally:
+ if copystate:
+ pypy_cjk_enc_copystate(copystate, encodebuf)
pypy_cjk_enc_free(encodebuf)
def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
@@ -258,18 +264,18 @@
elif errors == "ignore":
replace = ""
elif errors == "replace":
- codec = pypy_cjk_enc_getcodec(encodebuf)
- try:
- replace = encode(codec, "?", 1)
- except EncodeDecodeError:
- replace = "?"
+ replace = "?" # utf-8 unicode
else:
assert errorcb
- rets, end = errorcb(errors, namecb, reason,
+ replace, end = errorcb(errors, namecb, reason,
unicodedata, start, end)
+ if len(replace) > 0:
codec = pypy_cjk_enc_getcodec(encodebuf)
- lgt = rutf8.codepoints_in_utf8(rets)
- replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
+ lgt = rutf8.codepoints_in_utf8(replace)
+ replace = encode(codec, replace, lgt, copystate=encodebuf)
+ #else:
+ # replace is an empty utf-8 unicode, which we directly consider to
+ # encode as an empty byte string.
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
@@ -135,6 +135,11 @@
return d;
}
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src)
+{
+ dst->state = src->state;
+}
+
Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
Py_UNICODE *inbuf, Py_ssize_t inlen)
{
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
@@ -146,6 +146,8 @@
char *, pypymbc_ssize_t, pypymbc_ssize_t);
RPY_EXTERN
const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
+RPY_EXTERN
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src);
/* list of codecs defined in the .c files */
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -110,3 +110,33 @@
lambda e: ('\xc3', e.end))
raises(TypeError, u"\uDDA1".encode, "gbk",
"test.test_encode_custom_error_handler_type")
+
+ def test_encode_replacement_with_state(self):
+ import codecs
+ s = u'\u4ee4\u477c\u4ee4'.encode("iso-2022-jp", errors="replace")
+ assert s == '\x1b$BNa\x1b(B?\x1b$BNa\x1b(B'
+
+ def test_streaming_codec(self):
+ test_0 = u'\uc5fc\u76d0\u5869\u9e7d\u477c\u4e3d/\u3012'
+ test_1 = u'\u4ee4\u477c\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b'
+ test_2 = u' foo = "Quoted string ****\u4ee4\u477c" '
+
+ ereplace = {'errors': 'replace'}
+ exml = {'errors': 'xmlcharrefreplace'}
+ for codec in ("iso-2022-jp", "iso-2022-jp-ext", "iso-2022-jp-1",
+ "iso-2022-jp-2", "iso-2022-jp-3", "iso-2022-jp-2004",
+ "iso-2022-kr",
+ ):
+
+ out_1 = test_1.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_1.endswith(u'\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b')
+
+ out_0a = test_0.encode(codec, **ereplace).decode(codec, **ereplace)
+ for n, char in enumerate(out_0a):
+ assert char in (test_0[n], "?")
+
+ out_0b = test_0.encode(codec, **exml).decode(codec, **ereplace)
+ assert "&#18300;" in out_0b
+
+ out_2 = test_2.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_2.count('"') == 2
More information about the pypy-commit
mailing list