[pypy-commit] pypy default: merge heads
arigo
pypy.commits at gmail.com
Thu Jan 23 07:21:08 EST 2020
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r98576:a7c7b4c7dcae
Date: 2020-01-23 13:08 +0100
http://bitbucket.org/pypy/pypy/changeset/a7c7b4c7dcae/
Log: merge heads
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -194,17 +194,23 @@
rffi.SSIZE_T)
pypy_cjk_enc_getcodec = llexternal('pypy_cjk_enc_getcodec',
[ENCODEBUF_P], MULTIBYTECODEC_P)
+pypy_cjk_enc_copystate = llexternal('pypy_cjk_enc_copystate',
+ [ENCODEBUF_P, ENCODEBUF_P], lltype.Void)
MBENC_FLUSH = 1
MBENC_RESET = 2
def encode(codec, unicodedata, length, errors="strict", errorcb=None,
- namecb=None):
+ namecb=None, copystate=lltype.nullptr(ENCODEBUF_P.TO)):
encodebuf = pypy_cjk_enc_new(codec)
if not encodebuf:
raise MemoryError
+ if copystate:
+ pypy_cjk_enc_copystate(encodebuf, copystate)
try:
return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
finally:
+ if copystate:
+ pypy_cjk_enc_copystate(copystate, encodebuf)
pypy_cjk_enc_free(encodebuf)
def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
@@ -258,18 +264,18 @@
elif errors == "ignore":
replace = ""
elif errors == "replace":
- codec = pypy_cjk_enc_getcodec(encodebuf)
- try:
- replace = encode(codec, "?", 1)
- except EncodeDecodeError:
- replace = "?"
+ replace = "?" # utf-8 unicode
else:
assert errorcb
- rets, end = errorcb(errors, namecb, reason,
+ replace, end = errorcb(errors, namecb, reason,
unicodedata, start, end)
+ if len(replace) > 0:
codec = pypy_cjk_enc_getcodec(encodebuf)
- lgt = rutf8.codepoints_in_utf8(rets)
- replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
+ lgt = rutf8.codepoints_in_utf8(replace)
+ replace = encode(codec, replace, lgt, copystate=encodebuf)
+ #else:
+ # replace is an empty utf-8 unicode, which we directly consider to
+ # encode as an empty byte string.
with rffi.scoped_nonmovingbuffer(replace) as inbuf:
r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.c
@@ -135,6 +135,11 @@
return d;
}
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src)
+{
+ dst->state = src->state;
+}
+
Py_ssize_t pypy_cjk_enc_init(struct pypy_cjk_enc_s *d,
Py_UNICODE *inbuf, Py_ssize_t inlen)
{
diff --git a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/src/cjkcodecs/multibytecodec.h
@@ -146,6 +146,8 @@
char *, pypymbc_ssize_t, pypymbc_ssize_t);
RPY_EXTERN
const MultibyteCodec *pypy_cjk_enc_getcodec(struct pypy_cjk_enc_s *);
+RPY_EXTERN
+void pypy_cjk_enc_copystate(struct pypy_cjk_enc_s *dst, struct pypy_cjk_enc_s *src);
/* list of codecs defined in the .c files */
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -110,3 +110,33 @@
lambda e: ('\xc3', e.end))
raises(TypeError, u"\uDDA1".encode, "gbk",
"test.test_encode_custom_error_handler_type")
+
+ def test_encode_replacement_with_state(self):
+ import codecs
+ s = u'\u4ee4\u477c\u4ee4'.encode("iso-2022-jp", errors="replace")
+ assert s == '\x1b$BNa\x1b(B?\x1b$BNa\x1b(B'
+
+ def test_streaming_codec(self):
+ test_0 = u'\uc5fc\u76d0\u5869\u9e7d\u477c\u4e3d/\u3012'
+ test_1 = u'\u4ee4\u477c\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b'
+ test_2 = u' foo = "Quoted string ****\u4ee4\u477c" '
+
+ ereplace = {'errors': 'replace'}
+ exml = {'errors': 'xmlcharrefreplace'}
+ for codec in ("iso-2022-jp", "iso-2022-jp-ext", "iso-2022-jp-1",
+ "iso-2022-jp-2", "iso-2022-jp-3", "iso-2022-jp-2004",
+ "iso-2022-kr",
+ ):
+
+ out_1 = test_1.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_1.endswith(u'\u3080\u304b\u3057\u3080\u304b\u3057\u3042\u308b\u3068\u3053\u308d\u306b')
+
+ out_0a = test_0.encode(codec, **ereplace).decode(codec, **ereplace)
+ for n, char in enumerate(out_0a):
+ assert char in (test_0[n], "?")
+
+ out_0b = test_0.encode(codec, **exml).decode(codec, **ereplace)
+ assert "&#18300;" in out_0b
+
+ out_2 = test_2.encode(codec, **ereplace).decode(codec, **ereplace)
+ assert out_2.count('"') == 2
diff --git a/pypy/module/cpyext/codecs.py b/pypy/module/cpyext/codecs.py
--- a/pypy/module/cpyext/codecs.py
+++ b/pypy/module/cpyext/codecs.py
@@ -20,3 +20,12 @@
else:
return space.call_method(w_codec, "incrementaldecoder")
+@cpython_api([CONST_STRING], PyObject)
+def PyCodec_Encoder(space, encoding):
+ w_codec = interp_codecs.lookup_codec(space, rffi.charp2str(encoding))
+ return space.getitem(w_codec, space.newint(0))
+
+@cpython_api([CONST_STRING], PyObject)
+def PyCodec_Decoder(space, encoding):
+ w_codec = interp_codecs.lookup_codec(space, rffi.charp2str(encoding))
+ return space.getitem(w_codec, space.newint(1))
diff --git a/pypy/module/cpyext/test/test_codecs.py b/pypy/module/cpyext/test/test_codecs.py
--- a/pypy/module/cpyext/test/test_codecs.py
+++ b/pypy/module/cpyext/test/test_codecs.py
@@ -2,7 +2,8 @@
from pypy.module.cpyext.test.test_api import BaseApiTest
from rpython.rtyper.lltypesystem import rffi
from pypy.module.cpyext.codecs import (
- PyCodec_IncrementalEncoder, PyCodec_IncrementalDecoder)
+ PyCodec_IncrementalEncoder, PyCodec_IncrementalDecoder,
+ PyCodec_Encoder, PyCodec_Decoder)
class TestCodecs(BaseApiTest):
def test_incremental(self, space):
@@ -13,3 +14,13 @@
w_decoded = space.call_method(w_decoder, 'decode', w_encoded)
assert space.utf8_w(w_decoded) == u'späm'.encode('utf8')
rffi.free_charp(utf8)
+
+ def test_encoder_decoder(self, space):
+ utf8 = rffi.str2charp('utf-8')
+ w_encoder = PyCodec_Encoder(space, utf8)
+ w_decoder = PyCodec_Decoder(space, utf8)
+ rffi.free_charp(utf8)
+ space.appexec([w_encoder, w_decoder], """(encoder, decoder):
+ assert encoder(u"\u1234") == ('\xe1\x88\xb4', 1)
+ assert decoder("\xe1\x88\xb4") == (u'\u1234', 3)
+ """)
More information about the pypy-commit
mailing list