[pypy-commit] pypy default: Progress.
arigo
noreply at buildbot.pypy.org
Tue May 10 22:58:29 CEST 2011
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r44056:c5e9d584bb8f
Date: 2011-05-10 21:47 +0200
http://bitbucket.org/pypy/pypy/changeset/c5e9d584bb8f/
Log: Progress.
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -7,6 +7,16 @@
from pypy.tool.autopath import pypydir
+class EncodeDecodeError(Exception):  # codec failure carrying the offending input range and a reason string
+ def __init__(self, start, end, reason):
+ self.start = start  # offset at which the offending input begins
+ self.end = end  # offset just past the offending input
+ self.reason = reason  # short message such as "illegal multibyte sequence"
+ def __str__(self):
+ return 'EncodeDecodeError(%r, %r, %r)' % (self.start, self.end,
+ self.reason)
+
+
srcdir = py.path.local(pypydir).join('module', '_multibytecodec', 'cjkcodecs')
eci = ExternalCompilationInfo(
@@ -21,6 +31,10 @@
],
)
+MBERR_TOOSMALL = -1 # insufficient output buffer space
+MBERR_TOOFEW = -2 # incomplete input buffer
+MBERR_INTERNAL = -3 # internal runtime error
+MBERR_NOMEMORY = -4 # out of memory
MULTIBYTECODEC_P = rffi.COpaquePtr('struct MultibyteCodec_s',
compilation_info=eci)
@@ -76,11 +90,15 @@
pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
lltype.Void)
pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
- lltype.Signed)
+ rffi.SSIZE_T)
pypy_cjk_dec_outbuf = llexternal('pypy_cjk_dec_outbuf', [DECODEBUF_P],
rffi.CWCHARP)
pypy_cjk_dec_outlen = llexternal('pypy_cjk_dec_outlen', [DECODEBUF_P],
rffi.SSIZE_T)
+pypy_cjk_dec_inbuf_remaining = llexternal('pypy_cjk_dec_inbuf_remaining',
+ [DECODEBUF_P], rffi.SSIZE_T)  # C side returns inbuf_end - inbuf
+pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
+ [DECODEBUF_P], rffi.SSIZE_T)  # C side returns inbuf - inbuf_start
def decode(codec, stringdata):
inleft = len(stringdata)
@@ -96,7 +114,7 @@
r = pypy_cjk_dec_chunk(decodebuf)
if r == 0:
break
- multibytecodec_decerror(xxx)
+ multibytecodec_decerror(decodebuf, r)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
return unicode_from_raw(src, length)
@@ -107,6 +125,25 @@
finally:
rffi.free_nonmovingbuffer(stringdata, inbuf)
+def multibytecodec_decerror(decodebuf, e):  # turn a nonzero pypy_cjk_dec_chunk() result into a Python exception; always raises
+ if e > 0:  # a positive result is used as the length of the illegal sequence
+ reason = "illegal multibyte sequence"
+ esize = e
+ elif e == MBERR_TOOFEW:  # input ended early: the bad span is everything still unread
+ reason = "incomplete multibyte sequence"
+ esize = pypy_cjk_dec_inbuf_remaining(decodebuf)
+ elif e == MBERR_NOMEMORY:
+ raise MemoryError
+ else:  # remaining codes, e.g. MBERR_INTERNAL
+ raise RuntimeError
+ #
+ # if errors == ERROR_REPLACE:...
+ # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+ start = pypy_cjk_dec_inbuf_consumed(decodebuf)  # input offset at which decoding stopped
+ end = start + esize
+ if 1: # errors == ERROR_STRICT:
+ raise EncodeDecodeError(start, end, reason)
+
# ____________________________________________________________
def unicode_from_raw(src, length):
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
@@ -12,6 +12,7 @@
goto errorexit;
d->codec = codec;
+ d->inbuf_start = inbuf;
d->inbuf = inbuf;
d->inbuf_end = inbuf + inlen;
d->outbuf_start = malloc(inlen * sizeof(Py_UNICODE));
@@ -32,14 +33,40 @@
free(d);
}
-long pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *d)
+static int expand_decodebuffer(struct pypy_cjk_dec_s *d, Py_ssize_t esize)  /* grow d's output buffer; returns 0 on success, -1 if realloc fails */
{
- Py_ssize_t inleft = (Py_ssize_t)(d->inbuf_end - d->inbuf);
- Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
- if (inleft == 0)
- return 0;
- return d->codec->decode(&d->state, d->codec->config,
- &d->inbuf, inleft, &d->outbuf, outleft);
+ Py_ssize_t orgpos, orgsize;
+ Py_UNICODE *newbuf;
+
+ orgpos = d->outbuf - d->outbuf_start;  /* write position kept as an offset so it can be rebased after realloc */
+ orgsize = d->outbuf_end - d->outbuf_start;  /* current capacity, in Py_UNICODE units */
+ esize = orgsize + (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);  /* new capacity: grow by the request, or by about half the current size if the request is smaller */
+ newbuf = realloc(d->outbuf_start, esize * sizeof(Py_UNICODE));
+ if (!newbuf)
+ return -1;  /* d's pointers are not modified on this path */
+ d->outbuf_start = newbuf;
+ d->outbuf = newbuf + orgpos;
+ d->outbuf_end = newbuf + esize;
+ return 0;
+}
+
+Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *d)  /* run the codec over the unread input; returns 0 when no input is left, else the codec's result or MBERR_NOMEMORY */
+{
+ while (1)
+ {
+ Py_ssize_t r;
+ Py_ssize_t inleft = (Py_ssize_t)(d->inbuf_end - d->inbuf);
+ Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+ if (inleft == 0)
+ return 0;
+ r = d->codec->decode(&d->state, d->codec->config,
+ &d->inbuf, inleft, &d->outbuf, outleft);
+ if (r != MBERR_TOOSMALL)
+ return r;  /* anything other than "output full" is handed back to the caller */
+ /* output buffer too small; grow it and continue. */
+ if (expand_decodebuffer(d, -1) == -1)  /* -1 is below any half-capacity, so the default growth step is used */
+ return MBERR_NOMEMORY;
+ }
}
Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *d)
@@ -51,3 +78,13 @@
{
return d->outbuf - d->outbuf_start;
}
+
+Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d)  /* number of input bytes between the read position and the end of input */
+{
+ return d->inbuf_end - d->inbuf;
+}
+
+Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d)  /* number of input bytes between the start of input and the read position */
+{
+ return d->inbuf - d->inbuf_start;
+}
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
@@ -59,23 +59,26 @@
#define MBERR_TOOSMALL (-1) /* insufficient output buffer space */
#define MBERR_TOOFEW (-2) /* incomplete input buffer */
#define MBERR_INTERNAL (-3) /* internal runtime error */
+#define MBERR_NOMEMORY (-4) /* out of memory */
#define MBENC_FLUSH 0x0001 /* encode all characters encodable */
#define MBENC_MAX MBENC_FLUSH
struct pypy_cjk_dec_s {
- MultibyteCodec *codec;
+ const MultibyteCodec *codec;  /* codec whose decode callback is invoked by pypy_cjk_dec_chunk() */
MultibyteCodec_State state;
- char *inbuf, *inbuf_end;
+ const unsigned char *inbuf_start, *inbuf, *inbuf_end;  /* start of input, current read position, end of input */
Py_UNICODE *outbuf_start, *outbuf, *outbuf_end;
};
struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
char *inbuf, Py_ssize_t inlen);
void pypy_cjk_dec_free(struct pypy_cjk_dec_s *);
-long pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);
+Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);
Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *);
Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
+Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);
+Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);
#endif
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,5 +1,6 @@
+import py
from pypy.module._multibytecodec.c_codecs import getcodec, codecs
-from pypy.module._multibytecodec.c_codecs import decode
+from pypy.module._multibytecodec.c_codecs import decode, EncodeDecodeError
def test_codecs_existence():
@@ -9,7 +10,28 @@
c = getcodec("foobar")
assert not c
-def test_gbk_simple():
+def test_decode_gbk():  # GBK: a two-byte sequence yields one character; ASCII input comes back unchanged
c = getcodec("gbk")
u = decode(c, "\xA1\xAA")
assert u == unichr(0x2014)
+ u = decode(c, "foobar")
+ assert u == u"foobar"
+
+def test_decode_hz():  # after the leading "~{", the four bytes "abc}" decode to two characters
+ # stateful
+ c = getcodec("hz")
+ u = decode(c, "~{abc}")
+ assert u == u'\u5f95\u6cef'
+
+def test_decode_hz_error():  # decode failures raise EncodeDecodeError carrying the failing input range
+ # error
+ c = getcodec("hz")
+ e = py.test.raises(EncodeDecodeError, decode, c, "~{}").value  # a single byte is left after "~{"
+ assert e.start == 2
+ assert e.end == 3
+ assert e.reason == "incomplete multibyte sequence"
+ #
+ e = py.test.raises(EncodeDecodeError, decode, c, "~{xyz}").value  # the two bytes "xy" at offsets 2-3 are rejected
+ assert e.start == 2
+ assert e.end == 4
+ assert e.reason == "illegal multibyte sequence"
More information about the pypy-commit
mailing list