[Python-checkins] cpython (merge 3.3 -> default): Issue #13612: handle unknown encodings without a buffer overflow.

eli.bendersky python-checkins at python.org
Sat May 25 14:27:35 CEST 2013


http://hg.python.org/cpython/rev/47e719b11c46
changeset:   83920:47e719b11c46
parent:      83918:0bf4a6b56eb5
parent:      83919:f7b47fb30169
user:        Eli Bendersky <eliben at gmail.com>
date:        Sat May 25 05:27:10 2013 -0700
summary:
  Issue #13612: handle unknown encodings without a buffer overflow.

This affects pyexpat and _elementtree. PyExpat_CAPI now exposes a new
function - DefaultUnknownEncodingHandler.

Based on a patch by Serhiy Storchaka.

files:
  Include/pyexpat.h          |   4 +-
  Lib/test/test_xml_etree.py |  92 ++++++++++++++++++++++++++
  Modules/_elementtree.c     |  43 +-----------
  Modules/pyexpat.c          |  58 +++++++--------
  4 files changed, 123 insertions(+), 74 deletions(-)


diff --git a/Include/pyexpat.h b/Include/pyexpat.h
--- a/Include/pyexpat.h
+++ b/Include/pyexpat.h
@@ -6,7 +6,7 @@
 #define PyExpat_CAPI_MAGIC  "pyexpat.expat_CAPI 1.0"
 #define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
 
-struct PyExpat_CAPI 
+struct PyExpat_CAPI
 {
     char* magic; /* set to PyExpat_CAPI_MAGIC */
     int size; /* set to sizeof(struct PyExpat_CAPI) */
@@ -46,6 +46,8 @@
     void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
                                        XML_StartDoctypeDeclHandler start);
     enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
+    int (*DefaultUnknownEncodingHandler)(
+        void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
     /* always add new stuff to the end! */
 };
 
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -681,6 +681,98 @@
         check("cp437", '\u221a')
         check("mac-roman", '\u02da')
 
+        def xml(encoding):
+            return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+        def bxml(encoding):
+            return xml(encoding).encode(encoding)
+        supported_encodings = [
+            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+            'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+            'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+            'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+            'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+            'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+            'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+            'cp1257', 'cp1258',
+            'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+            'mac-roman', 'mac-turkish',
+            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+            'iso2022-jp-3', 'iso2022-jp-ext',
+            'koi8-r', 'koi8-u',
+            'hz', 'ptcp154',
+        ]
+        for encoding in supported_encodings:
+            self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+        unsupported_ascii_compatible_encodings = [
+            'big5', 'big5hkscs',
+            'cp932', 'cp949', 'cp950',
+            'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+            'gb2312', 'gbk', 'gb18030',
+            'iso2022-kr', 'johab',
+            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+            'utf-7',
+        ]
+        for encoding in unsupported_ascii_compatible_encodings:
+            self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+        unsupported_ascii_incompatible_encodings = [
+            'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+            'utf_32', 'utf_32_be', 'utf_32_le',
+        ]
+        for encoding in unsupported_ascii_incompatible_encodings:
+            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+        self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+        self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
+        def xml(encoding):
+            return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+        def bxml(encoding):
+            return xml(encoding).encode(encoding)
+        supported_encodings = [
+            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+            'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+            'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+            'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+            'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+            'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+            'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+            'cp1257', 'cp1258',
+            'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+            'mac-roman', 'mac-turkish',
+            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+            'iso2022-jp-3', 'iso2022-jp-ext',
+            'koi8-r', 'koi8-u',
+            'hz', 'ptcp154',
+        ]
+        for encoding in supported_encodings:
+            self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+        unsupported_ascii_compatible_encodings = [
+            'big5', 'big5hkscs',
+            'cp932', 'cp949', 'cp950',
+            'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+            'gb2312', 'gbk', 'gb18030',
+            'iso2022-kr', 'johab',
+            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+            'utf-7',
+        ]
+        for encoding in unsupported_ascii_compatible_encodings:
+            self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+        unsupported_ascii_incompatible_encodings = [
+            'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+            'utf_32', 'utf_32_be', 'utf_32_le',
+        ]
+        for encoding in unsupported_ascii_incompatible_encodings:
+            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+        self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+        self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
     def test_methods(self):
         # Test serialization methods.
 
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -3094,47 +3094,6 @@
     }
 }
 
-static int
-expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name,
-                               XML_Encoding *info)
-{
-    PyObject* u;
-    unsigned char s[256];
-    int i;
-    void *data;
-    unsigned int kind;
-
-    memset(info, 0, sizeof(XML_Encoding));
-
-    for (i = 0; i < 256; i++)
-        s[i] = i;
-
-    u = PyUnicode_Decode((char*) s, 256, name, "replace");
-    if (!u)
-        return XML_STATUS_ERROR;
-    if (PyUnicode_READY(u))
-        return XML_STATUS_ERROR;
-
-    if (PyUnicode_GET_LENGTH(u) != 256) {
-        Py_DECREF(u);
-        return XML_STATUS_ERROR;
-    }
-
-    kind = PyUnicode_KIND(u);
-    data = PyUnicode_DATA(u);
-    for (i = 0; i < 256; i++) {
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
-            info->map[i] = ch;
-        else
-            info->map[i] = -1;
-    }
-
-    Py_DECREF(u);
-
-    return XML_STATUS_OK;
-}
-
 /* -------------------------------------------------------------------- */
 
 static PyObject *
@@ -3236,7 +3195,7 @@
         );
     EXPAT(SetUnknownEncodingHandler)(
         self_xp->parser,
-        (XML_UnknownEncodingHandler) expat_unknown_encoding_handler, NULL
+        EXPAT(DefaultUnknownEncodingHandler), NULL
         );
 
     return 0;
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1111,53 +1111,49 @@
    Make it as simple as possible.
 */
 
-static char template_buffer[257];
-
-static void
-init_template_buffer(void)
-{
-    int i;
-    for (i = 0; i < 256; i++) {
-        template_buffer[i] = i;
-    }
-    template_buffer[256] = 0;
-}
-
 static int
 PyUnknownEncodingHandler(void *encodingHandlerData,
                          const XML_Char *name,
                          XML_Encoding *info)
 {
-    PyUnicodeObject *_u_string = NULL;
-    int result = 0;
+    static unsigned char template_buffer[256] = {0};
+    PyObject* u;
     int i;
-    int kind;
     void *data;
+    unsigned int kind;
 
-    /* Yes, supports only 8bit encodings */
-    _u_string = (PyUnicodeObject *)
-        PyUnicode_Decode(template_buffer, 256, name, "replace");
+    if (template_buffer[1] == 0) {
+        for (i = 0; i < 256; i++)
+            template_buffer[i] = i;
+    }
 
-    if (_u_string == NULL || PyUnicode_READY(_u_string) == -1)
-        return result;
+    u = PyUnicode_Decode((char*) template_buffer, 256, name, "replace");
+    if (u == NULL || PyUnicode_READY(u))
+        return XML_STATUS_ERROR;
 
-    kind = PyUnicode_KIND(_u_string);
-    data = PyUnicode_DATA(_u_string);
+    if (PyUnicode_GET_LENGTH(u) != 256) {
+        Py_DECREF(u);
+        PyErr_SetString(PyExc_ValueError,
+                        "multi-byte encodings are not supported");
+        return XML_STATUS_ERROR;
+    }
 
+    kind = PyUnicode_KIND(u);
+    data = PyUnicode_DATA(u);
     for (i = 0; i < 256; i++) {
-        /* Stupid to access directly, but fast */
-        Py_UCS4 c = PyUnicode_READ(kind, data, i);
-        if (c == Py_UNICODE_REPLACEMENT_CHARACTER)
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
+            info->map[i] = ch;
+        else
             info->map[i] = -1;
-        else
-            info->map[i] = c;
     }
+
     info->data = NULL;
     info->convert = NULL;
     info->release = NULL;
-    result = 1;
-    Py_DECREF(_u_string);
-    return result;
+    Py_DECREF(u);
+
+    return XML_STATUS_OK;
 }
 
 
@@ -1752,7 +1748,6 @@
                            Py_BuildValue("(iii)", info.major,
                                          info.minor, info.micro));
     }
-    init_template_buffer();
     /* XXX When Expat supports some way of figuring out how it was
        compiled, this should check and set native_encoding
        appropriately.
@@ -1938,6 +1933,7 @@
     capi.SetUserData = XML_SetUserData;
     capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
     capi.SetEncoding = XML_SetEncoding;
+    capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler;
 
     /* export using capsule */
     capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list