[Python-checkins] cpython: Close #20404: blacklist non-text encodings in io.TextIOWrapper

Tue Feb 4 13:14:11 CET 2014

http://hg.python.org/cpython/rev/f3ec00d2b75e
changeset:   88954:f3ec00d2b75e
user:        Nick Coghlan <ncoghlan at gmail.com>
date:        Tue Feb 04 22:11:18 2014 +1000
summary:
  Close #20404: blacklist non-text encodings in io.TextIOWrapper

- io.TextIOWrapper (and hence the open() builtin) now use the
  internal codec marking system added for issue #19619
- also tweaked the C code to only look up the encoding once,
  rather than multiple times
- the existing output type checks remain in place to deal with
  unmarked third party codecs.

files:
  Include/codecs.h     |  20 +++++++
  Lib/_pyio.py         |   5 +
  Lib/test/test_io.py  |  33 +++++++++--
  Misc/NEWS            |   6 ++
  Modules/_io/textio.c |  34 ++++++++----
  Python/codecs.c      |  88 +++++++++++++++++++++++--------
  6 files changed, 143 insertions(+), 43 deletions(-)

diff --git a/Include/codecs.h b/Include/codecs.h
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -104,7 +104,14 @@
    Please note that these APIs are internal and should not
    be used in Python C extensions.
 
+   XXX (ncoghlan): should we make these, or something like them, public
+   in Python 3.5+?
+
  */
+PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
+       const char *encoding,
+       const char *alternate_command
+       );
 
 PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
        PyObject *object,
@@ -117,6 +124,19 @@
        const char *encoding,
        const char *errors
        );
+
+/* These two aren't actually text encoding specific, but _io.TextIOWrapper
+ * is the only current API consumer.
+ */
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
+       PyObject *codec_info,
+       const char *errors
+       );
 #endif
 
 
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -1503,6 +1503,11 @@
         if not isinstance(encoding, str):
             raise ValueError("invalid encoding: %r" % encoding)
 
+        if not codecs.lookup(encoding)._is_text_encoding:
+            msg = ("%r is not a text encoding; "
+                   "use codecs.open() to handle arbitrary codecs")
+            raise LookupError(msg % encoding)
+
         if errors is None:
             errors = "strict"
         else:
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -1929,6 +1929,15 @@
         self.assertRaises(TypeError, t.__init__, b, newline=42)
         self.assertRaises(ValueError, t.__init__, b, newline='xyzzy')
 
+    def test_non_text_encoding_codecs_are_rejected(self):
+        # Ensure the constructor complains if passed a codec that isn't
+        # marked as a text encoding
+        # http://bugs.python.org/issue20404
+        r = self.BytesIO()
+        b = self.BufferedWriter(r)
+        with self.assertRaisesRegex(LookupError, "is not a text encoding"):
+            self.TextIOWrapper(b, encoding="hex")
+
     def test_detach(self):
         r = self.BytesIO()
         b = self.BufferedWriter(r)
@@ -2579,15 +2588,22 @@
 
     def test_illegal_decoder(self):
         # Issue #17106
+        # Bypass the early encoding check added in issue 20404
+        def _make_illegal_wrapper():
+            quopri = codecs.lookup("quopri")
+            quopri._is_text_encoding = True
+            try:
+                t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
+                                       newline='\n', encoding="quopri")
+            finally:
+                quopri._is_text_encoding = False
+            return t
         # Crash when decoder returns non-string
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
         self.assertRaises(TypeError, t.read, 1)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
         self.assertRaises(TypeError, t.readline)
-        t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
-                               encoding='quopri_codec')
+        t = _make_illegal_wrapper()
         self.assertRaises(TypeError, t.read)
 
     def _check_create_at_shutdown(self, **kwargs):
@@ -2616,8 +2632,7 @@
         if err:
             # Can error out with a RuntimeError if the module state
             # isn't found.
-            self.assertIn("RuntimeError: could not find io module state",
-                          err.decode())
+            self.assertIn(self.shutdown_error, err.decode())
         else:
             self.assertEqual("ok", out.decode().strip())
 
@@ -2630,6 +2645,7 @@
 
 class CTextIOWrapperTest(TextIOWrapperTest):
     io = io
+    shutdown_error = "RuntimeError: could not find io module state"
 
     def test_initialization(self):
         r = self.BytesIO(b"\xc3\xa9\n\n")
@@ -2674,6 +2690,7 @@
 
 class PyTextIOWrapperTest(TextIOWrapperTest):
     io = pyio
+    shutdown_error = "LookupError: unknown encoding: ascii"
 
 
 class IncrementalNewlineDecoderTest(unittest.TestCase):
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,12 @@
 Core and Builtins
 -----------------
 
+- Issue #20404: io.TextIOWrapper (and hence the open() builtin) now uses the
+  internal codec marking system added for issue #19619 to throw LookupError
+  for known non-text encodings at stream construction time. The existing
+  output type checks remain in place to deal with unmarked third party
+  codecs.
+
 - Issue #17162: Add PyType_GetSlot.
 
 - Issue #20162: Fix an alignment issue in the siphash24() hash function which
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -849,7 +849,7 @@
     char *kwlist[] = {"buffer", "encoding", "errors",
                       "newline", "line_buffering", "write_through",
                       NULL};
-    PyObject *buffer, *raw;
+    PyObject *buffer, *raw, *codec_info = NULL;
     char *encoding = NULL;
     char *errors = NULL;
     char *newline = NULL;
@@ -961,6 +961,17 @@
                         "could not determine default encoding");
     }
 
+    /* Check we have been asked for a real text encoding */
+    codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()");
+    if (codec_info == NULL) {
+        Py_CLEAR(self->encoding);
+        goto error;
+    }
+
+    /* XXX: Failures beyond this point have the potential to leak elements
+     * of the partially constructed object (like self->encoding)
+     */
+
     if (errors == NULL)
         errors = "strict";
     self->errors = PyBytes_FromString(errors);
@@ -975,7 +986,7 @@
     if (newline) {
         self->readnl = PyUnicode_FromString(newline);
         if (self->readnl == NULL)
-            return -1;
+            goto error;
     }
     self->writetranslate = (newline == NULL || newline[0] != '\0');
     if (!self->readuniversal && self->readnl) {
@@ -999,8 +1010,8 @@
     if (r == -1)
         goto error;
     if (r == 1) {
-        self->decoder = PyCodec_IncrementalDecoder(
-            encoding, errors);
+        self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info,
+                                                           errors);
         if (self->decoder == NULL)
             goto error;
 
@@ -1024,17 +1035,12 @@
     if (r == -1)
         goto error;
     if (r == 1) {
-        PyObject *ci;
-        self->encoder = PyCodec_IncrementalEncoder(
-            encoding, errors);
+        self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info,
+                                                           errors);
         if (self->encoder == NULL)
             goto error;
         /* Get the normalized named of the codec */
-        ci = _PyCodec_Lookup(encoding);
-        if (ci == NULL)
-            goto error;
-        res = _PyObject_GetAttrId(ci, &PyId_name);
-        Py_DECREF(ci);
+        res = _PyObject_GetAttrId(codec_info, &PyId_name);
         if (res == NULL) {
             if (PyErr_ExceptionMatches(PyExc_AttributeError))
                 PyErr_Clear();
@@ -1054,6 +1060,9 @@
         Py_XDECREF(res);
     }
 
+    /* Finished sorting out the codec details */
+    Py_DECREF(codec_info);
+
     self->buffer = buffer;
     Py_INCREF(buffer);
 
@@ -1116,6 +1125,7 @@
     return 0;
 
   error:
+    Py_XDECREF(codec_info);
     return -1;
 }
 
diff --git a/Python/codecs.c b/Python/codecs.c
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -243,20 +243,15 @@
     return v;
 }
 
-/* Helper function to create an incremental codec. */
+/* Helper functions to create an incremental codec. */
+static
+PyObject *codec_makeincrementalcodec(PyObject *codec_info,
+                                     const char *errors,
+                                     const char *attrname)
+{
+    PyObject *ret, *inccodec;
 
-static
-PyObject *codec_getincrementalcodec(const char *encoding,
-                                    const char *errors,
-                                    const char *attrname)
-{
-    PyObject *codecs, *ret, *inccodec;
-
-    codecs = _PyCodec_Lookup(encoding);
-    if (codecs == NULL)
-        return NULL;
-    inccodec = PyObject_GetAttrString(codecs, attrname);
-    Py_DECREF(codecs);
+    inccodec = PyObject_GetAttrString(codec_info, attrname);
     if (inccodec == NULL)
         return NULL;
     if (errors)
@@ -267,6 +262,21 @@
     return ret;
 }
 
+static
+PyObject *codec_getincrementalcodec(const char *encoding,
+                                    const char *errors,
+                                    const char *attrname)
+{
+    PyObject *codec_info, *ret;
+
+    codec_info = _PyCodec_Lookup(encoding);
+    if (codec_info == NULL)
+        return NULL;
+    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
+    Py_DECREF(codec_info);
+    return ret;
+}
+
 /* Helper function to create a stream codec. */
 
 static
@@ -290,6 +300,24 @@
     return streamcodec;
 }
 
+/* Helpers to work with the result of _PyCodec_Lookup
+
+ */
+PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementaldecoder");
+}
+
+PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
+                                             const char *errors)
+{
+    return codec_makeincrementalcodec(codec_info, errors,
+                                      "incrementalencoder");
+}
+
+
 /* Convenience APIs to query the Codec registry.
 
    All APIs return a codec object with incremented refcount.
@@ -467,15 +495,12 @@
 }
 
 /* Text encoding/decoding API */
-static
-PyObject *codec_getitem_checked(const char *encoding,
-                                const char *operation_name,
-                                int index)
+PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
+                                       const char *alternate_command)
 {
     _Py_IDENTIFIER(_is_text_encoding);
     PyObject *codec;
     PyObject *attr;
-    PyObject *v;
     int is_text_codec;
 
     codec = _PyCodec_Lookup(encoding);
@@ -502,27 +527,44 @@
                 Py_DECREF(codec);
                 PyErr_Format(PyExc_LookupError,
                              "'%.400s' is not a text encoding; "
-                             "use codecs.%s() to handle arbitrary codecs",
-                             encoding, operation_name);
+                             "use %s to handle arbitrary codecs",
+                             encoding, alternate_command);
                 return NULL;
             }
         }
     }
 
+    /* This appears to be a valid text encoding */
+    return codec;
+}
+
+
+static
+PyObject *codec_getitem_checked(const char *encoding,
+                                const char *alternate_command,
+                                int index)
+{
+    PyObject *codec;
+    PyObject *v;
+
+    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
+    if (codec == NULL)
+        return NULL;
+
     v = PyTuple_GET_ITEM(codec, index);
+    Py_INCREF(v);
     Py_DECREF(codec);
-    Py_INCREF(v);
     return v;
 }
 
 static PyObject * _PyCodec_TextEncoder(const char *encoding)
 {
-    return codec_getitem_checked(encoding, "encode", 0);
+    return codec_getitem_checked(encoding, "codecs.encode()", 0);
 }
 
 static PyObject * _PyCodec_TextDecoder(const char *encoding)
 {
-    return codec_getitem_checked(encoding, "decode", 1);
+    return codec_getitem_checked(encoding, "codecs.decode()", 1);
 }
 
 PyObject *_PyCodec_EncodeText(PyObject *object,

-- 
Repository URL: http://hg.python.org/cpython