[Python-checkins] cpython: Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec

Tue Sep 6 22:42:59 EDT 2016

https://hg.python.org/cpython/rev/c499690f606c
changeset:   103199:c499690f606c
user:        Steve Dower <steve.dower at microsoft.com>
date:        Tue Sep 06 19:42:27 2016 -0700
summary:
  Issue #27959: Adds oem encoding, alias ansi to mbcs, move aliasmbcs to codec lookup

files:
  Include/unicodeobject.h          |   2 +-
  Lib/encodings/__init__.py        |  10 ++
  Lib/encodings/aliases.py         |   1 +
  Lib/encodings/oem.py             |  41 ++++++++++
  Lib/site.py                      |  16 ---
  Lib/test/test_codecs.py          |  62 +++++++--------
  Modules/_codecsmodule.c          |  36 ++++++++
  Modules/clinic/_codecsmodule.c.h |  81 +++++++++++++++++++-
  8 files changed, 198 insertions(+), 51 deletions(-)

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1663,7 +1663,7 @@
 
 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
     const char *string,         /* MBCS encoded string */
-    Py_ssize_t length,              /* size of string */
+    Py_ssize_t length,          /* size of string */
     const char *errors          /* error handling */
     );
 
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -29,6 +29,7 @@
 """#"
 
 import codecs
+import sys
 from . import aliases
 
 _cache = {}
@@ -151,3 +152,12 @@
 
 # Register the search_function in the Python codec registry
 codecs.register(search_function)
+
+if sys.platform == 'win32':
+    def _alias_mbcs(encoding):
+        import _bootlocale
+        if encoding == _bootlocale.getpreferredencoding(False):
+            import encodings.mbcs
+            return encodings.mbcs.getregentry()
+
+    codecs.register(_alias_mbcs)
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -458,6 +458,7 @@
     'macturkish'         : 'mac_turkish',
 
     # mbcs codec
+    'ansi'               : 'mbcs',
     'dbcs'               : 'mbcs',
 
     # ptcp154 codec
diff --git a/Lib/encodings/oem.py b/Lib/encodings/oem.py
new file mode 100644
--- /dev/null
+++ b/Lib/encodings/oem.py
@@ -0,0 +1,41 @@
+""" Python 'oem' Codec for Windows
+
+"""
+# Import them explicitly to cause an ImportError
+# on non-Windows systems
+from codecs import oem_encode, oem_decode
+# for IncrementalDecoder, IncrementalEncoder, ...
+import codecs
+
+### Codec APIs
+
+encode = oem_encode
+
+def decode(input, errors='strict'):
+    return oem_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def encode(self, input, final=False):
+        return oem_encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    _buffer_decode = oem_decode
+
+class StreamWriter(codecs.StreamWriter):
+    encode = oem_encode
+
+class StreamReader(codecs.StreamReader):
+    decode = oem_decode
+
+### encodings module API
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='oem',
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
diff --git a/Lib/site.py b/Lib/site.py
--- a/Lib/site.py
+++ b/Lib/site.py
@@ -423,21 +423,6 @@
 
     sys.__interactivehook__ = register_readline
 
-def aliasmbcs():
-    """On Windows, some default encodings are not provided by Python,
-    while they are always available as "mbcs" in each locale. Make
-    them usable by aliasing to "mbcs" in such a case."""
-    if sys.platform == 'win32':
-        import _bootlocale, codecs
-        enc = _bootlocale.getpreferredencoding(False)
-        if enc.startswith('cp'):            # "cp***" ?
-            try:
-                codecs.lookup(enc)
-            except LookupError:
-                import encodings
-                encodings._cache[enc] = encodings._unknown
-                encodings.aliases.aliases[enc] = 'mbcs'
-
 CONFIG_LINE = r'^(?P<key>(\w|[-_])+)\s*=\s*(?P<value>.*)\s*$'
 
 def venv(known_paths):
@@ -560,7 +545,6 @@
     setcopyright()
     sethelper()
     enablerlcompleter()
-    aliasmbcs()
     execsitecustomize()
     if ENABLE_USER_SITE:
         execusercustomize()
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -8,11 +8,6 @@
 
 from test import support
 
-if sys.platform == 'win32':
-    VISTA_OR_LATER = (sys.getwindowsversion().major >= 6)
-else:
-    VISTA_OR_LATER = False
-
 try:
     import ctypes
 except ImportError:
@@ -841,18 +836,13 @@
             ('abc', 'strict', b'abc'),
             ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
             ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
+            ('\udc80', 'strict', None),
+            ('\udc80', 'ignore', b''),
+            ('\udc80', 'replace', b'?'),
+            ('\udc80', 'backslashreplace', b'\\udc80'),
+            ('\udc80', 'namereplace', b'\\udc80'),
+            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
         ]
-        if VISTA_OR_LATER:
-            tests.extend((
-                ('\udc80', 'strict', None),
-                ('\udc80', 'ignore', b''),
-                ('\udc80', 'replace', b'?'),
-                ('\udc80', 'backslashreplace', b'\\udc80'),
-                ('\udc80', 'namereplace', b'\\udc80'),
-                ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
-            ))
-        else:
-            tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))
         for text, errors, expected in tests:
             if expected is not None:
                 try:
@@ -879,17 +869,10 @@
             (b'[\xff]', 'ignore', '[]'),
             (b'[\xff]', 'replace', '[\ufffd]'),
             (b'[\xff]', 'surrogateescape', '[\udcff]'),
+            (b'[\xed\xb2\x80]', 'strict', None),
+            (b'[\xed\xb2\x80]', 'ignore', '[]'),
+            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
         ]
-        if VISTA_OR_LATER:
-            tests.extend((
-                (b'[\xed\xb2\x80]', 'strict', None),
-                (b'[\xed\xb2\x80]', 'ignore', '[]'),
-                (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
-            ))
-        else:
-            tests.extend((
-                (b'[\xed\xb2\x80]', 'strict', '[\udc80]'),
-            ))
         for raw, errors, expected in tests:
             if expected is not None:
                 try:
@@ -904,7 +887,6 @@
                 self.assertRaises(UnicodeDecodeError,
                     raw.decode, 'cp65001', errors)
 
-    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
     def test_lone_surrogates(self):
         self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
         self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
@@ -921,7 +903,6 @@
         self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                          b'[?]')
 
-    @unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
     def test_surrogatepass_handler(self):
         self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                          b"abc\xed\xa0\x80def")
@@ -1951,6 +1932,8 @@
 
 if hasattr(codecs, "mbcs_encode"):
     all_unicode_encodings.append("mbcs")
+if hasattr(codecs, "oem_encode"):
+    all_unicode_encodings.append("oem")
 
 # The following encoding is not tested, because it's not supposed
 # to work:
@@ -3119,11 +3102,10 @@
             (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
             (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
         ))
-        if VISTA_OR_LATER:
-            self.check_encode(self.CP_UTF8, (
-                ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
-                ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
-            ))
+        self.check_encode(self.CP_UTF8, (
+            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
+            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
+        ))
 
     def test_incremental(self):
         decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
@@ -3144,6 +3126,20 @@
                                           False)
         self.assertEqual(decoded, ('abc', 3))
 
+    def test_mbcs_alias(self):
+        # Check that looking up our 'default' codepage will return
+        # mbcs when we don't have a more specific one available
+        import _bootlocale
+        def _get_fake_codepage(*a):
+            return 'cp123'
+        old_getpreferredencoding = _bootlocale.getpreferredencoding
+        _bootlocale.getpreferredencoding = _get_fake_codepage
+        try:
+            codec = codecs.lookup('cp123')
+            self.assertEqual(codec.name, 'mbcs')
+        finally:
+            _bootlocale.getpreferredencoding = old_getpreferredencoding
+
 
 class ASCIITest(unittest.TestCase):
     def test_encode(self):
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -626,6 +626,25 @@
 }
 
 /*[clinic input]
+_codecs.oem_decode
+    data: Py_buffer
+    errors: str(accept={str, NoneType}) = NULL
+    final: int(c_default="0") = False
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
+                        const char *errors, int final)
+/*[clinic end generated code: output=da1617612f3fcad8 input=95b8a92c446b03cd]*/
+{
+    Py_ssize_t consumed = data->len;
+    PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
+        data->buf, data->len, errors, final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);
+}
+
+/*[clinic input]
 _codecs.code_page_decode
     codepage: int
     data: Py_buffer
@@ -971,6 +990,21 @@
 }
 
 /*[clinic input]
+_codecs.oem_encode
+    str: unicode
+    errors: str(accept={str, NoneType}) = NULL
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
+/*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
+{
+    return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
+        PyUnicode_GET_LENGTH(str));
+}
+
+/*[clinic input]
 _codecs.code_page_encode
     code_page: int
     str: unicode
@@ -1075,6 +1109,8 @@
     _CODECS_READBUFFER_ENCODE_METHODDEF
     _CODECS_MBCS_ENCODE_METHODDEF
     _CODECS_MBCS_DECODE_METHODDEF
+    _CODECS_OEM_ENCODE_METHODDEF
+    _CODECS_OEM_DECODE_METHODDEF
     _CODECS_CODE_PAGE_ENCODE_METHODDEF
     _CODECS_CODE_PAGE_DECODE_METHODDEF
     _CODECS_REGISTER_ERROR_METHODDEF
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -805,6 +805,45 @@
 
 #if defined(HAVE_MBCS)
 
+PyDoc_STRVAR(_codecs_oem_decode__doc__,
+"oem_decode($module, data, errors=None, final=False, /)\n"
+"--\n"
+"\n");
+
+#define _CODECS_OEM_DECODE_METHODDEF    \
+    {"oem_decode", (PyCFunction)_codecs_oem_decode, METH_VARARGS, _codecs_oem_decode__doc__},
+
+static PyObject *
+_codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
+                        const char *errors, int final);
+
+static PyObject *
+_codecs_oem_decode(PyObject *module, PyObject *args)
+{
+    PyObject *return_value = NULL;
+    Py_buffer data = {NULL, NULL};
+    const char *errors = NULL;
+    int final = 0;
+
+    if (!PyArg_ParseTuple(args, "y*|zi:oem_decode",
+        &data, &errors, &final)) {
+        goto exit;
+    }
+    return_value = _codecs_oem_decode_impl(module, &data, errors, final);
+
+exit:
+    /* Cleanup for data */
+    if (data.obj) {
+       PyBuffer_Release(&data);
+    }
+
+    return return_value;
+}
+
+#endif /* defined(HAVE_MBCS) */
+
+#if defined(HAVE_MBCS)
+
 PyDoc_STRVAR(_codecs_code_page_decode__doc__,
 "code_page_decode($module, codepage, data, errors=None, final=False, /)\n"
 "--\n"
@@ -1346,6 +1385,38 @@
 
 #if defined(HAVE_MBCS)
 
+PyDoc_STRVAR(_codecs_oem_encode__doc__,
+"oem_encode($module, str, errors=None, /)\n"
+"--\n"
+"\n");
+
+#define _CODECS_OEM_ENCODE_METHODDEF    \
+    {"oem_encode", (PyCFunction)_codecs_oem_encode, METH_VARARGS, _codecs_oem_encode__doc__},
+
+static PyObject *
+_codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors);
+
+static PyObject *
+_codecs_oem_encode(PyObject *module, PyObject *args)
+{
+    PyObject *return_value = NULL;
+    PyObject *str;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "U|z:oem_encode",
+        &str, &errors)) {
+        goto exit;
+    }
+    return_value = _codecs_oem_encode_impl(module, str, errors);
+
+exit:
+    return return_value;
+}
+
+#endif /* defined(HAVE_MBCS) */
+
+#if defined(HAVE_MBCS)
+
 PyDoc_STRVAR(_codecs_code_page_encode__doc__,
 "code_page_encode($module, code_page, str, errors=None, /)\n"
 "--\n"
@@ -1446,6 +1517,10 @@
     #define _CODECS_MBCS_DECODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
 
+#ifndef _CODECS_OEM_DECODE_METHODDEF
+    #define _CODECS_OEM_DECODE_METHODDEF
+#endif /* !defined(_CODECS_OEM_DECODE_METHODDEF) */
+
 #ifndef _CODECS_CODE_PAGE_DECODE_METHODDEF
     #define _CODECS_CODE_PAGE_DECODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_DECODE_METHODDEF) */
@@ -1454,7 +1529,11 @@
     #define _CODECS_MBCS_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_MBCS_ENCODE_METHODDEF) */
 
+#ifndef _CODECS_OEM_ENCODE_METHODDEF
+    #define _CODECS_OEM_ENCODE_METHODDEF
+#endif /* !defined(_CODECS_OEM_ENCODE_METHODDEF) */
+
 #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
     #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
 #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=0221e4eece62c905 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=7874e2d559d49368 input=a9049054013a1b77]*/

-- 
Repository URL: https://hg.python.org/cpython