[Python-checkins] bpo-36778: cp65001 encoding becomes an alias to utf_8 (GH-13230)

Thu May 9 21:19:57 EDT 2019

https://github.com/python/cpython/commit/d267ac20c309e37d85a986b4417aa8ab4d05dabc
commit: d267ac20c309e37d85a986b4417aa8ab4d05dabc
branch: master
author: Victor Stinner <vstinner at redhat.com>
committer: GitHub <noreply at github.com>
date: 2019-05-10T03:19:54+02:00
summary:

bpo-36778: cp65001 encoding becomes an alias to utf_8 (GH-13230)

files:
A Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst
D Lib/encodings/cp65001.py
M Doc/library/codecs.rst
M Lib/encodings/aliases.py
M Lib/test/test_codecs.py

diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index b3246376846d..8d3daa35d153 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1106,8 +1106,7 @@ particular, the following variants typically exist:
 +-----------------+--------------------------------+--------------------------------+
 | cp1258          | windows-1258                   | Vietnamese                     |
 +-----------------+--------------------------------+--------------------------------+
-| cp65001         |                                | Windows only: Windows UTF-8    |
-|                 |                                | (``CP_UTF8``)                  |
+| cp65001         |                                | Alias to ``utf_8`` encoding    |
 |                 |                                |                                |
 |                 |                                | .. versionadded:: 3.3          |
 +-----------------+--------------------------------+--------------------------------+
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
index 2e63c2f9495e..5ef40a3438b9 100644
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -534,6 +534,7 @@
     'utf8'               : 'utf_8',
     'utf8_ucs2'          : 'utf_8',
     'utf8_ucs4'          : 'utf_8',
+    'cp65001'            : 'utf_8',
 
     # uu_codec codec
     'uu'                 : 'uu_codec',
diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py
deleted file mode 100644
index 95cb2aecf0cc..000000000000
--- a/Lib/encodings/cp65001.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
-Code page 65001: Windows UTF-8 (CP_UTF8).
-"""
-
-import codecs
-import functools
-
-if not hasattr(codecs, 'code_page_encode'):
-    raise LookupError("cp65001 encoding is only available on Windows")
-
-### Codec APIs
-
-encode = functools.partial(codecs.code_page_encode, 65001)
-_decode = functools.partial(codecs.code_page_decode, 65001)
-
-def decode(input, errors='strict'):
-    return codecs.code_page_decode(65001, input, errors, True)
-
-class IncrementalEncoder(codecs.IncrementalEncoder):
-    def encode(self, input, final=False):
-        return encode(input, self.errors)[0]
-
-class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
-    _buffer_decode = _decode
-
-class StreamWriter(codecs.StreamWriter):
-    encode = encode
-
-class StreamReader(codecs.StreamReader):
-    decode = _decode
-
-### encodings module API
-
-def getregentry():
-    return codecs.CodecInfo(
-        name='cp65001',
-        encode=encode,
-        decode=decode,
-        incrementalencoder=IncrementalEncoder,
-        incrementaldecoder=IncrementalDecoder,
-        streamreader=StreamReader,
-        streamwriter=StreamWriter,
-    )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 027a84e275e3..8c14f5981d0b 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -875,95 +875,6 @@ def test_surrogatepass_handler(self):
             b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
 
 
- at unittest.skipUnless(sys.platform == 'win32',
-                     'cp65001 is a Windows-only codec')
-class CP65001Test(ReadTest, unittest.TestCase):
-    encoding = "cp65001"
-
-    def test_encode(self):
-        tests = [
-            ('abc', 'strict', b'abc'),
-            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
-            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
-            ('\udc80', 'strict', None),
-            ('\udc80', 'ignore', b''),
-            ('\udc80', 'replace', b'?'),
-            ('\udc80', 'backslashreplace', b'\\udc80'),
-            ('\udc80', 'namereplace', b'\\udc80'),
-            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
-        ]
-        for text, errors, expected in tests:
-            if expected is not None:
-                try:
-                    encoded = text.encode('cp65001', errors)
-                except UnicodeEncodeError as err:
-                    self.fail('Unable to encode %a to cp65001 with '
-                              'errors=%r: %s' % (text, errors, err))
-                self.assertEqual(encoded, expected,
-                    '%a.encode("cp65001", %r)=%a != %a'
-                    % (text, errors, encoded, expected))
-            else:
-                self.assertRaises(UnicodeEncodeError,
-                    text.encode, "cp65001", errors)
-
-    def test_decode(self):
-        tests = [
-            (b'abc', 'strict', 'abc'),
-            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
-            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
-            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
-            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
-            # invalid bytes
-            (b'[\xff]', 'strict', None),
-            (b'[\xff]', 'ignore', '[]'),
-            (b'[\xff]', 'replace', '[\ufffd]'),
-            (b'[\xff]', 'surrogateescape', '[\udcff]'),
-            (b'[\xed\xb2\x80]', 'strict', None),
-            (b'[\xed\xb2\x80]', 'ignore', '[]'),
-            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
-        ]
-        for raw, errors, expected in tests:
-            if expected is not None:
-                try:
-                    decoded = raw.decode('cp65001', errors)
-                except UnicodeDecodeError as err:
-                    self.fail('Unable to decode %a from cp65001 with '
-                              'errors=%r: %s' % (raw, errors, err))
-                self.assertEqual(decoded, expected,
-                    '%a.decode("cp65001", %r)=%a != %a'
-                    % (raw, errors, decoded, expected))
-            else:
-                self.assertRaises(UnicodeDecodeError,
-                    raw.decode, 'cp65001', errors)
-
-    def test_lone_surrogates(self):
-        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
-        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
-        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
-                         b'[\\udc80]')
-        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
-                         b'[\\udc80]')
-        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
-                         b'[�]')
-        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
-                         b'[\x80]')
-        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
-                         b'[]')
-        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
-                         b'[?]')
-
-    def test_surrogatepass_handler(self):
-        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
-                         b"abc\xed\xa0\x80def")
-        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
-                         "abc\ud800def")
-        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
-                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
-        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
-                         "\U00010fff\uD800")
-        self.assertTrue(codecs.lookup_error("surrogatepass"))
-
-
 class UTF7Test(ReadTest, unittest.TestCase):
     encoding = "utf-7"
 
diff --git a/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst
new file mode 100644
index 000000000000..5e594a33447c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-10-01-06-36.bpo-36778.GRqeiS.rst
@@ -0,0 +1,2 @@
+``cp65001`` encoding (Windows code page 65001) becomes an alias to ``utf_8``
+encoding.