[pypy-commit] pypy default: Progress.

arigo noreply at buildbot.pypy.org
Tue May 10 22:58:29 CEST 2011


Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44056:c5e9d584bb8f
Date: 2011-05-10 21:47 +0200
http://bitbucket.org/pypy/pypy/changeset/c5e9d584bb8f/

Log:	Progress.

diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -7,6 +7,16 @@
 from pypy.tool.autopath import pypydir
 
 
+class EncodeDecodeError(Exception):  # raised by multibytecodec_decerror() when the C decoder reports bad input
+    def __init__(self, start, end, reason):  # start/end: input offsets delimiting the offending bytes; reason: message string
+        self.start = start
+        self.end = end
+        self.reason = reason
+    def __str__(self):  # debugging representation showing all three fields
+        return 'EncodeDecodeError(%r, %r, %r)' % (self.start, self.end,
+                                                  self.reason)
+
+
 srcdir = py.path.local(pypydir).join('module', '_multibytecodec', 'cjkcodecs')
 
 eci = ExternalCompilationInfo(
@@ -21,6 +31,10 @@
     ],
 )
 
+MBERR_TOOSMALL = -1  # insufficient output buffer space (retried inside the C pypy_cjk_dec_chunk)
+MBERR_TOOFEW   = -2  # incomplete input buffer (reported as "incomplete multibyte sequence")
+MBERR_INTERNAL = -3  # internal runtime error (multibytecodec_decerror raises RuntimeError)
+MBERR_NOMEMORY = -4  # out of memory (multibytecodec_decerror raises MemoryError)
 
 MULTIBYTECODEC_P = rffi.COpaquePtr('struct MultibyteCodec_s',
                                    compilation_info=eci)
@@ -76,11 +90,15 @@
 pypy_cjk_dec_free = llexternal('pypy_cjk_dec_free', [DECODEBUF_P],
                                lltype.Void)
 pypy_cjk_dec_chunk = llexternal('pypy_cjk_dec_chunk', [DECODEBUF_P],
-                                lltype.Signed)
+                                rffi.SSIZE_T)
 pypy_cjk_dec_outbuf = llexternal('pypy_cjk_dec_outbuf', [DECODEBUF_P],
                                  rffi.CWCHARP)
 pypy_cjk_dec_outlen = llexternal('pypy_cjk_dec_outlen', [DECODEBUF_P],
                                  rffi.SSIZE_T)
+pypy_cjk_dec_inbuf_remaining = llexternal('pypy_cjk_dec_inbuf_remaining',
+                                          [DECODEBUF_P], rffi.SSIZE_T)  # input bytes not yet consumed (inbuf_end - inbuf)
+pypy_cjk_dec_inbuf_consumed = llexternal('pypy_cjk_dec_inbuf_consumed',
+                                         [DECODEBUF_P], rffi.SSIZE_T)  # input bytes already consumed (inbuf - inbuf_start)
 
 def decode(codec, stringdata):
     inleft = len(stringdata)
@@ -96,7 +114,7 @@
                 r = pypy_cjk_dec_chunk(decodebuf)
                 if r == 0:
                     break
-                multibytecodec_decerror(xxx)
+                multibytecodec_decerror(decodebuf, r)
             src = pypy_cjk_dec_outbuf(decodebuf)
             length = pypy_cjk_dec_outlen(decodebuf)
             return unicode_from_raw(src, length)
@@ -107,6 +125,25 @@
     finally:
         rffi.free_nonmovingbuffer(stringdata, inbuf)
 
+def multibytecodec_decerror(decodebuf, e):  # turn a nonzero result 'e' of pypy_cjk_dec_chunk() into an exception; never returns normally
+    if e > 0:  # positive result: used as the length of the offending input sequence
+        reason = "illegal multibyte sequence"
+        esize = e
+    elif e == MBERR_TOOFEW:  # the input stops in the middle of a sequence
+        reason = "incomplete multibyte sequence"
+        esize = pypy_cjk_dec_inbuf_remaining(decodebuf)  # all leftover input belongs to the error range
+    elif e == MBERR_NOMEMORY:  # the C side could not enlarge the output buffer
+        raise MemoryError
+    else:  # MBERR_INTERNAL or any other unexpected code
+        raise RuntimeError
+    # Only strict error handling exists so far; the other modes are sketched as placeholders:
+    # if errors == ERROR_REPLACE:...
+    # if errors == ERROR_IGNORE or errors == ERROR_REPLACE:...
+    start = pypy_cjk_dec_inbuf_consumed(decodebuf)  # input offset the decoder had reached when it stopped
+    end = start + esize
+    if 1:  # errors == ERROR_STRICT:
+        raise EncodeDecodeError(start, end, reason)
+
 # ____________________________________________________________
 
 def unicode_from_raw(src, length):
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.c
@@ -12,6 +12,7 @@
     goto errorexit;
 
   d->codec = codec;
+  d->inbuf_start = inbuf;
   d->inbuf = inbuf;
   d->inbuf_end = inbuf + inlen;
   d->outbuf_start = malloc(inlen * sizeof(Py_UNICODE));
@@ -32,14 +33,40 @@
   free(d);
 }
 
-long pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *d)
+static int expand_decodebuffer(struct pypy_cjk_dec_s *d, Py_ssize_t esize)  /* enlarge d's output buffer by at least esize items; returns 0 on success, -1 if realloc fails */
 {
-  Py_ssize_t inleft = (Py_ssize_t)(d->inbuf_end - d->inbuf);
-  Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
-  if (inleft == 0)
-    return 0;
-  return d->codec->decode(&d->state, d->codec->config,
-                          &d->inbuf, inleft, &d->outbuf, outleft);
+  Py_ssize_t orgpos, orgsize;
+  Py_UNICODE *newbuf;
+
+  orgpos = d->outbuf - d->outbuf_start;  /* items already written */
+  orgsize = d->outbuf_end - d->outbuf_start;  /* current capacity in items */
+  esize = orgsize + (esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize);  /* new capacity: grow by esize, or by half the old size (forced odd, so never 0) when esize is smaller */
+  newbuf = realloc(d->outbuf_start, esize * sizeof(Py_UNICODE));  /* NOTE(review): this multiplication is not checked for overflow */
+  if (!newbuf)
+    return -1;  /* d is left untouched and still points at the old buffer */
+  d->outbuf_start = newbuf;
+  d->outbuf = newbuf + orgpos;  /* restore the write position inside the (possibly moved) buffer */
+  d->outbuf_end = newbuf + esize;
+  return 0;
+}
+
+Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *d)  /* run the codec's decode function on the remaining input; returns 0 once no input is left, otherwise the decoder's result or MBERR_NOMEMORY */
+{
+  while (1)  /* iterates again only after the output buffer has been enlarged */
+    {
+      Py_ssize_t r;
+      Py_ssize_t inleft = (Py_ssize_t)(d->inbuf_end - d->inbuf);
+      Py_ssize_t outleft = (Py_ssize_t)(d->outbuf_end - d->outbuf);
+      if (inleft == 0)
+        return 0;  /* nothing left to decode */
+      r = d->codec->decode(&d->state, d->codec->config,
+                           &d->inbuf, inleft, &d->outbuf, outleft);  /* the decoder may advance d->inbuf and d->outbuf through these pointers */
+      if (r != MBERR_TOOSMALL)
+        return r;  /* any other result is left for the caller to interpret */
+      /* output buffer too small; grow it and continue. */
+      if (expand_decodebuffer(d, -1) == -1)  /* -1 is always below half the capacity, so the default growth step is used */
+        return MBERR_NOMEMORY;
+    }
 }
 
 Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *d)
@@ -51,3 +78,13 @@
 {
   return d->outbuf - d->outbuf_start;
 }
+
+Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d)  /* number of input bytes between the current read position and the end of the input */
+{
+  return d->inbuf_end - d->inbuf;
+}
+
+Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d)  /* number of input bytes between the start of the input and the current read position */
+{
+  return d->inbuf - d->inbuf_start;
+}
diff --git a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
--- a/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
+++ b/pypy/module/_multibytecodec/cjkcodecs/multibytecodec.h
@@ -59,23 +59,26 @@
 #define MBERR_TOOSMALL          (-1) /* insufficient output buffer space */
 #define MBERR_TOOFEW            (-2) /* incomplete input buffer */
 #define MBERR_INTERNAL          (-3) /* internal runtime error */
+#define MBERR_NOMEMORY          (-4) /* out of memory */
 
 #define MBENC_FLUSH             0x0001 /* encode all characters encodable */
 #define MBENC_MAX               MBENC_FLUSH
 
 
 struct pypy_cjk_dec_s {
-  MultibyteCodec *codec;
+  const MultibyteCodec *codec;  /* const, matching the codec parameter of pypy_cjk_dec_init() */
   MultibyteCodec_State state;
-  char *inbuf, *inbuf_end;
+  const unsigned char *inbuf_start, *inbuf, *inbuf_end;  /* start of the input, current read position, one past the last input byte */
   Py_UNICODE *outbuf_start, *outbuf, *outbuf_end;
 };
 
 struct pypy_cjk_dec_s *pypy_cjk_dec_init(const MultibyteCodec *codec,
                                          char *inbuf, Py_ssize_t inlen);
 void pypy_cjk_dec_free(struct pypy_cjk_dec_s *);
-long pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);
+Py_ssize_t pypy_cjk_dec_chunk(struct pypy_cjk_dec_s *);  /* 0 when the input is exhausted, otherwise the decoder's result or MBERR_NOMEMORY */
 Py_UNICODE *pypy_cjk_dec_outbuf(struct pypy_cjk_dec_s *);
 Py_ssize_t pypy_cjk_dec_outlen(struct pypy_cjk_dec_s *);
+Py_ssize_t pypy_cjk_dec_inbuf_remaining(struct pypy_cjk_dec_s *d);  /* inbuf_end - inbuf */
+Py_ssize_t pypy_cjk_dec_inbuf_consumed(struct pypy_cjk_dec_s* d);  /* inbuf - inbuf_start */
 
 #endif
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,5 +1,6 @@
+import py
 from pypy.module._multibytecodec.c_codecs import getcodec, codecs
-from pypy.module._multibytecodec.c_codecs import decode
+from pypy.module._multibytecodec.c_codecs import decode, EncodeDecodeError
 
 
 def test_codecs_existence():
@@ -9,7 +10,28 @@
     c = getcodec("foobar")
     assert not c
 
-def test_gbk_simple():
+def test_decode_gbk():  # decode a two-byte GBK sequence, then a plain ASCII string
     c = getcodec("gbk")
     u = decode(c, "\xA1\xAA")
     assert u == unichr(0x2014)
+    u = decode(c, "foobar")  # ASCII-only input is expected back unchanged
+    assert u == u"foobar"
+
+def test_decode_hz():  # successful decode through the "hz" codec
+    # stateful codec
+    c = getcodec("hz")
+    u = decode(c, "~{abc}")  # NOTE(review): presumably "~{" switches HZ into two-byte mode, so "ab" and "c}" form the two characters -- confirm
+    assert u == u'\u5f95\u6cef'
+
+def test_decode_hz_error():  # bad input must raise EncodeDecodeError carrying offsets and a reason
+    # incomplete sequence: the error covers the single byte at offset 2
+    c = getcodec("hz")
+    e = py.test.raises(EncodeDecodeError, decode, c, "~{}").value
+    assert e.start == 2
+    assert e.end == 3
+    assert e.reason == "incomplete multibyte sequence"
+    # illegal sequence: the error covers the two bytes at offsets 2 and 3
+    e = py.test.raises(EncodeDecodeError, decode, c, "~{xyz}").value
+    assert e.start == 2
+    assert e.end == 4
+    assert e.reason == "illegal multibyte sequence"


More information about the pypy-commit mailing list