[pypy-commit] pypy py3.6: hg merge default
arigo
pypy.commits at gmail.com
Thu Jan 23 07:21:12 EST 2020
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.6
Changeset: r98578:73d20edb41e9
Date: 2020-01-23 13:20 +0100
http://bitbucket.org/pypy/pypy/changeset/73d20edb41e9/
Log: hg merge default
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -194,17 +194,23 @@
rffi.SSIZE_T)
pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
[ENCODEBUF_P], MULTIBYTECODEC_P)
+pypy_cjk_enc_copystate = llexternal('pypy_cjk_enc_copystate',
+ [ENCODEBUF_P, ENCODEBUF_P], lltype.Void)
MBENC_FLUSH = 1
MBENC_RESET = 2
def encode(codec, unicodedata, length, errors="strict", errorcb=None,
- namecb=None):
+ namecb=None, copystate=lltype.nullptr(ENCODEBUF_P.TO)):
encodebuf = pypy_cjk_enc_new(codec)
if not encodebuf:
raise MemoryError
+ if copystate:
+ pypy_cjk_enc_copystate(encodebuf, copystate)
try:
return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
finally:
+ if copystate:
+ pypy_cjk_enc_copystate(copystate, encodebuf)
pypy_cjk_enc_free(encodebuf)
def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
@@ -257,22 +263,21 @@
raise EncodeDecodeError(start, end, reason)
elif errors == "ignore":
replace = ""
+ rettype = 'b' # != 'u'
elif errors == "replace":
- codec = pypy_cjk_enc_getcodec(encodebuf)
- try:
- replace = encode(codec, "?", 1)
- except EncodeDecodeError:
- replace = "?"
+ replace = "?" # utf-8 unicode
+ rettype = 'u'
else:
assert errorcb
replace, end, rettype = errorcb(errors, namecb, reason,
unicodedata, start, end)
- if rettype == 'u':
- codec = pypy_cjk_enc_getcodec(encodebuf)
- lgt = rutf8.check_utf8(replace, False)
- replace = encode(codec, replace, lgt)
- lgt = len(replace)
+ if rettype == 'u':
+ codec = pypy_cjk_enc_getcodec(encodebuf)
+ lgt = rutf8.check_utf8(replace, False)
+ replace = encode(codec, replace, lgt, copystate=encodebuf)
+ #else:
+ # replace is meant to be a byte string already
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
- r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, lgt, end)
+ r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
if r == MBERR_NOMEMORY:
raise MemoryError
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
@@ -135,6 +135,11 @@
return d;
}
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src)
+{
+ dst->state = src->state;
+}
+
Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
Py_UNICODE *inbuf, Py_ssize_t inlen)
{
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
@@ -146,6 +146,8 @@
char *, pypymbc_ssize_t, pypymbc_ssize_t);
RPY_EXTERN
const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
+RPY_EXTERN
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src);
/* list of codecs defined in the .c files */
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -126,3 +126,33 @@
lambda e: (b'\xc3', e.end))
result = "\uDDA1".encode("gbk", "test.test_encode_custom_error_handler_type")
assert b'\xc3' in result
+
+ def test_encode_replacement_with_state(self):
+ import codecs
+ s = u'\u4ee4\u477c\u4ee4'.encode("iso-2022-jp", errors="replace")
+ assert s == b'\x1b$BNa\x1b(B?\x1b$BNa\x1b(B'
+
+ def test_streaming_codec(self):
+ test_0 = u'\uc5fc\u76d0\u5869\u9e7d\u477c\u4e3d/\u3012'
+ test_1 = u'\u4ee4\u477c\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b'
+ test_2 = u' foo = "Quoted string ****\u4ee4\u477c" '
+
+ ereplace = {'errors': 'replace'}
+ exml = {'errors': 'xmlcharrefreplace'}
+ for codec in ("iso-2022-jp", "iso-2022-jp-ext", "iso-2022-jp-1",
+ "iso-2022-jp-2", "iso-2022-jp-3", "iso-2022-jp-2004",
+ "iso-2022-kr",
+ ):
+
+ out_1 = test_1.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_1.endswith(u'\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b')
+
+ out_0a = test_0.encode(codec, **ereplace).decode(codec, **ereplace)
+ for n, char in enumerate(out_0a):
+ assert char in (test_0[n], "?")
+
+ out_0b = test_0.encode(codec, **exml).decode(codec, **ereplace)
+ assert "&#18300;" in out_0b
+
+ out_2 = test_2.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_2.count('"') == 2
diff --git a/pypy/module/cpyext/codecs.py b/pypy/module/cpyext/codecs.py
--- a/pypy/module/cpyext/codecs.py
+++ b/pypy/module/cpyext/codecs.py
@@ -20,3 +20,12 @@
else:
return space.call_method(w_codec, "incrementaldecoder")
+@cpython_api([CONST_STRING], PyObject)
+def PyCodec_Encoder(space, encoding):
+ w_codec = interp_codecs.lookup_codec(space, rffi.charp2str(encoding))
+ return space.getitem(w_codec, space.newint(0))
+
+@cpython_api([CONST_STRING], PyObject)
+def PyCodec_Decoder(space, encoding):
+ w_codec = interp_codecs.lookup_codec(space, rffi.charp2str(encoding))
+ return space.getitem(w_codec, space.newint(1))
diff --git a/pypy/module/cpyext/test/test_codecs.py b/pypy/module/cpyext/test/test_codecs.py
--- a/pypy/module/cpyext/test/test_codecs.py
+++ b/pypy/module/cpyext/test/test_codecs.py
@@ -2,7 +2,8 @@
from pypy.module.cpyext.test.test_api import BaseApiTest
from rpython.rtyper.lltypesystem import rffi
from pypy.module.cpyext.codecs import (
- PyCodec_IncrementalEncoder, PyCodec_IncrementalDecoder)
+ PyCodec_IncrementalEncoder, PyCodec_IncrementalDecoder,
+ PyCodec_Encoder, PyCodec_Decoder)
class TestCodecs(BaseApiTest):
def test_incremental(self, space):
@@ -13,3 +14,13 @@
w_decoded = space.call_method(w_decoder, 'decode', w_encoded)
assert space.utf8_w(w_decoded) == u'späm'.encode("utf-8")
rffi.free_charp(utf8)
+
+ def test_encoder_decoder(self, space):
+ utf8 = rffi.str2charp('utf-8')
+ w_encoder = PyCodec_Encoder(space, utf8)
+ w_decoder = PyCodec_Decoder(space, utf8)
+ rffi.free_charp(utf8)
+ space.appexec([w_encoder, w_decoder], r"""(encoder, decoder):
+ assert encoder(u"\u1234") == (b"\xe1\x88\xb4", 1)
+ assert decoder(b"\xe1\x88\xb4") == (u"\u1234", 3)
+ """)
More information about the pypy-commit
mailing list