[Python-checkins] r46945 - in python/trunk: Doc/api/concrete.tex Include/unicodeobject.h Lib/encodings/mbcs.py Misc/NEWS Modules/_codecsmodule.c Objects/unicodeobject.c

martin.v.loewis python-checkins at python.org
Wed Jun 14 07:21:06 CEST 2006


Author: martin.v.loewis
Date: Wed Jun 14 07:21:04 2006
New Revision: 46945

Modified:
   python/trunk/Doc/api/concrete.tex
   python/trunk/Include/unicodeobject.h
   python/trunk/Lib/encodings/mbcs.py
   python/trunk/Misc/NEWS
   python/trunk/Modules/_codecsmodule.c
   python/trunk/Objects/unicodeobject.c
Log:
Patch #1455898: Incremental mode for "mbcs" codec.

Modified: python/trunk/Doc/api/concrete.tex
==============================================================================
--- python/trunk/Doc/api/concrete.tex	(original)
+++ python/trunk/Doc/api/concrete.tex	Wed Jun 14 07:21:04 2006
@@ -1431,6 +1431,18 @@
   raised by the codec.
 \end{cfuncdesc}
 
+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
+                                               int size,
+                                               const char *errors,
+                                               int *consumed}
+  If \var{consumed} is \NULL{}, behave like
+  \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
+  \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode trailing lead
+  byte and the number of bytes that have been decoded will be stored in
+  \var{consumed}.
+  \versionadded{2.5}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
                                                Py_ssize_t size,
                                                const char *errors}

Modified: python/trunk/Include/unicodeobject.h
==============================================================================
--- python/trunk/Include/unicodeobject.h	(original)
+++ python/trunk/Include/unicodeobject.h	Wed Jun 14 07:21:04 2006
@@ -938,6 +938,13 @@
     const char *errors          /* error handling */
     );
 
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
+    const char *string,         /* MBCS encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
     PyObject *unicode           /* Unicode object */
     );

Modified: python/trunk/Lib/encodings/mbcs.py
==============================================================================
--- python/trunk/Lib/encodings/mbcs.py	(original)
+++ python/trunk/Lib/encodings/mbcs.py	Wed Jun 14 07:21:04 2006
@@ -22,9 +22,10 @@
     def encode(self, input, final=False):
         return codecs.mbcs_encode(input,self.errors)[0]
 
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.mbcs_decode(input,self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.mbcs_decode(input,self.errors,final)
+
 class StreamWriter(Codec,codecs.StreamWriter):
     pass
 

Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Wed Jun 14 07:21:04 2006
@@ -156,6 +156,9 @@
 Library
 -------
 
+- Patch #1455898: The MBCS codec now supports the incremental mode for
+  double-byte encodings.
+
 - ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
   guarantee that adjacent triples in the return list always describe
   non-adjacent blocks.  Previously, a pair of matching blocks could end

Modified: python/trunk/Modules/_codecsmodule.c
==============================================================================
--- python/trunk/Modules/_codecsmodule.c	(original)
+++ python/trunk/Modules/_codecsmodule.c	Wed Jun 14 07:21:04 2006
@@ -479,15 +479,20 @@
 	    PyObject *args)
 {
     const char *data;
-    Py_ssize_t size;
+    Py_ssize_t size, consumed;
     const char *errors = NULL;
+    int final = 1;
+    PyObject *decoded;
 
-    if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
-			  &data, &size, &errors))
+    if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
+			  &data, &size, &errors, &final))
 	return NULL;
 
-    return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
-		       size);
+    decoded = PyUnicode_DecodeMBCSStateful(
+	data, size, errors, final ? NULL : &consumed);
+    if (!decoded)
+	return NULL;
+    return codec_tuple(decoded, final ? size : consumed);
 }
 
 #endif /* MS_WINDOWS */

Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c	(original)
+++ python/trunk/Objects/unicodeobject.c	Wed Jun 14 07:21:04 2006
@@ -2820,65 +2820,199 @@
 
 /* --- MBCS codecs for Windows -------------------------------------------- */
 
-PyObject *PyUnicode_DecodeMBCS(const char *s,
-				Py_ssize_t size,
-				const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+   a) it assumes an incomplete character consists of a single byte, and
+   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+      encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+    const char *curr = s + offset;
+
+    if (IsDBCSLeadByte(*curr)) {
+	const char *prev = CharPrev(s, curr);
+	return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+    }
+    return 0;
+}
+
+/*
+ * Decode MBCS string into unicode object. If 'final' is set, converts
+ * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+			const char *s, /* MBCS string */
+			int size, /* sizeof MBCS string */
+			int final)
 {
-    PyUnicodeObject *v;
     Py_UNICODE *p;
-    DWORD usize;
+    Py_ssize_t n = 0;
+    int usize = 0;
+
+    assert(size >= 0);
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+	--size;
 
     /* First get the size of the result */
-    assert(size < INT_MAX);
-    usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
-    if (size > 0 && usize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+	if (usize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
 
-    v = _PyUnicode_New(usize);
-    if (v == NULL)
-        return NULL;
-    if (usize == 0)
-	return (PyObject *)v;
-    p = PyUnicode_AS_UNICODE(v);
-    if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
-        Py_DECREF(v);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (*v == NULL) {
+	/* Create unicode object */
+	*v = _PyUnicode_New(usize);
+	if (*v == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend unicode object */
+	n = PyUnicode_GET_SIZE(*v);
+	if (_PyUnicode_Resize(v, n + usize) < 0)
+	    return -1;
+    }
+
+    /* Do the conversion */
+    if (size > 0) {
+	p = PyUnicode_AS_UNICODE(*v) + n;
+	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
+
+    return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+					Py_ssize_t size,
+					const char *errors,
+					Py_ssize_t *consumed)
+{
+    PyUnicodeObject *v = NULL;
+    int done;
+
+    if (consumed)
+	*consumed = 0;
+
+#ifdef NEED_RETRY
+  retry:
+    if (size > INT_MAX)
+	done = decode_mbcs(&v, s, INT_MAX, 0);
+    else
+#endif
+	done = decode_mbcs(&v, s, (int)size, !consumed);
+
+    if (done < 0) {
+        Py_XDECREF(v);
+	return NULL;
     }
 
+    if (consumed)
+	*consumed += done;
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	s += done;
+	size -= done;
+	goto retry;
+    }
+#endif
+
     return (PyObject *)v;
 }
 
-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
 				Py_ssize_t size,
 				const char *errors)
 {
-    PyObject *repr;
-    char *s;
-    DWORD mbcssize;
-
-    /* If there are no characters, bail now! */
-    if (size==0)
-	    return PyString_FromString("");
+    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}
+
+/*
+ * Convert unicode into string object (MBCS).
+ * Returns 0 if succeed, -1 otherwise.
+ */
+static int encode_mbcs(PyObject **repr,
+			const Py_UNICODE *p, /* unicode */
+			int size) /* size of unicode */
+{
+    int mbcssize = 0;
+    Py_ssize_t n = 0;
+
+    assert(size >= 0);
 
     /* First get the size of the result */
-    assert(size<INT_MAX);
-    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
-    if (mbcssize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+	if (mbcssize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
 
-    repr = PyString_FromStringAndSize(NULL, mbcssize);
-    if (repr == NULL)
-        return NULL;
-    if (mbcssize == 0)
-        return repr;
+    if (*repr == NULL) {
+	/* Create string object */
+	*repr = PyString_FromStringAndSize(NULL, mbcssize);
+	if (*repr == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend string object */
+	n = PyString_Size(*repr);
+	if (_PyString_Resize(repr, n + mbcssize) < 0)
+	    return -1;
+    }
 
     /* Do the conversion */
-    s = PyString_AS_STRING(repr);
-    assert(size < INT_MAX);
-    if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
-        Py_DECREF(repr);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	char *s = PyString_AS_STRING(*repr) + n;
+	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
+
+    return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+				Py_ssize_t size,
+				const char *errors)
+{
+    PyObject *repr = NULL;
+    int ret;
+
+#ifdef NEED_RETRY
+ retry:
+    if (size > INT_MAX)
+	ret = encode_mbcs(&repr, p, INT_MAX);
+    else
+#endif
+	ret = encode_mbcs(&repr, p, (int)size);
+
+    if (ret < 0) {
+	Py_XDECREF(repr);
+	return NULL;
     }
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	p += INT_MAX;
+	size -= INT_MAX;
+	goto retry;
+    }
+#endif
+
     return repr;
 }
 
@@ -2893,6 +3027,8 @@
 				NULL);
 }
 
+#undef NEED_RETRY
+
 #endif /* MS_WINDOWS */
 
 /* --- Character Mapping Codec -------------------------------------------- */


More information about the Python-checkins mailing list