[Python-checkins] bpo-28080: Add support for the fallback encoding in ZIP files (GH-32007)

serhiy-storchaka webhook-mailer at python.org
Tue Mar 22 05:53:25 EDT 2022


https://github.com/python/cpython/commit/a25a985535ccbb7df8caddc0017550ff4eae5855
commit: a25a985535ccbb7df8caddc0017550ff4eae5855
branch: main
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-03-22T11:52:55+02:00
summary:

bpo-28080: Add support for the fallback encoding in ZIP files (GH-32007)

* Add the metadata_encoding parameter in the zipfile.ZipFile constructor.
* Add the --metadata-encoding option in the zipfile CLI.

Co-authored-by: Stephen J. Turnbull <stephen at xemacs.org>

files:
A Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst
M Doc/library/zipfile.rst
M Doc/whatsnew/3.11.rst
M Lib/test/test_zipfile.py
M Lib/zipfile.py

diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst
index 9d0d894c05b57..bfcc883de6927 100644
--- a/Doc/library/zipfile.rst
+++ b/Doc/library/zipfile.rst
@@ -139,7 +139,8 @@ ZipFile Objects
 
 
 .. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \
-                   compresslevel=None, *, strict_timestamps=True)
+                   compresslevel=None, *, strict_timestamps=True,
+                   metadata_encoding=None)
 
    Open a ZIP file, where *file* can be a path to a file (a string), a
    file-like object or a :term:`path-like object`.
@@ -183,6 +184,10 @@ ZipFile Objects
    Similar behavior occurs with files newer than 2107-12-31,
    the timestamp is also set to the limit.
 
+   When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec,
+   which will be used to decode metadata such as the names of members and ZIP
+   comments.
+
    If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then
    :meth:`closed <close>` without adding any files to the archive, the appropriate
    ZIP structures for an empty archive will be written to the file.
@@ -194,6 +199,19 @@ ZipFile Objects
       with ZipFile('spam.zip', 'w') as myzip:
           myzip.write('eggs.txt')
 
+   .. note::
+
+      *metadata_encoding* is an instance-wide setting for the ZipFile.
+      It is not currently possible to set this on a per-member basis.
+
+      This attribute is a workaround for legacy implementations which produce
+      archives with names in the current locale encoding or code page (mostly
+      on Windows).  According to the .ZIP standard, the encoding of metadata
+      may be specified to be either IBM code page (default) or UTF-8 by a flag
+      in the archive header.
+      That flag takes precedence over *metadata_encoding*, which is
+      a Python-specific extension.
+
    .. versionadded:: 3.2
       Added the ability to use :class:`ZipFile` as a context manager.
 
@@ -220,6 +238,10 @@ ZipFile Objects
    .. versionadded:: 3.8
       The *strict_timestamps* keyword-only argument
 
+   .. versionchanged:: 3.11
+      Added support for specifying member name encoding for reading
+      metadata in the zipfile's directory and file headers.
+
 
 .. method:: ZipFile.close()
 
@@ -395,6 +417,15 @@ ZipFile Objects
    given.
    The archive must be open with mode ``'w'``, ``'x'`` or ``'a'``.
 
+   .. note::
+
+      The ZIP file standard historically did not specify a metadata encoding,
+      but strongly recommended CP437 (the original IBM PC encoding) for
+      interoperability.  Recent versions allow use of UTF-8 (only).  In this
+      module, UTF-8 will automatically be used to write the member names if
+      they contain any non-ASCII characters.  It is not possible to write
+      member names in any encoding other than ASCII or UTF-8.
+
    .. note::
 
       Archive names should be relative to the archive root, that is, they should not
@@ -868,6 +899,14 @@ Command-line options
 
    Test whether the zipfile is valid or not.
 
+.. cmdoption:: --metadata-encoding <encoding>
+
+   Specify encoding of member names for :option:`-l`, :option:`-e` and
+   :option:`-t`.
+
+   .. versionadded:: 3.11
+
+
 Decompression pitfalls
 ----------------------
 
diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
index 96db3a90cfce4..938a573d59574 100644
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@@ -432,6 +432,12 @@ venv
   Third party code that also creates new virtual environments should do the same.
   (Contributed by Miro Hrončok in :issue:`45413`.)
 
+zipfile
+-------
+
+* Added support for specifying member name encoding for reading
+  metadata in the zipfile's directory and file headers.
+  (Contributed by Stephen J. Turnbull and Serhiy Storchaka in :issue:`28080`.)
 
 fcntl
 -----
diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py
index 759a4abb9d4d4..26c40457e62a0 100644
--- a/Lib/test/test_zipfile.py
+++ b/Lib/test/test_zipfile.py
@@ -21,8 +21,10 @@
 from random import randint, random, randbytes
 
 from test.support import script_helper
-from test.support import (findfile, requires_zlib, requires_bz2,
-                          requires_lzma, captured_stdout, requires_subprocess)
+from test.support import (
+    findfile, requires_zlib, requires_bz2, requires_lzma,
+    captured_stdout, captured_stderr, requires_subprocess
+)
 from test.support.os_helper import (
     TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count
 )
@@ -3210,5 +3212,139 @@ def test_inheritance(self, alpharep):
             assert isinstance(file, cls)
 
 
+class EncodedMetadataTests(unittest.TestCase):
+    file_names = ['\u4e00', '\u4e8c', '\u4e09']  # Han 'one', 'two', 'three'
+    file_content = [
+        "This is pure ASCII.\n".encode('ascii'),
+        # This is modern Japanese. (UTF-8)
+        "\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8'),
+        # This is obsolete Japanese. (Shift JIS)
+        "\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'),
+    ]
+
+    def setUp(self):
+        self.addCleanup(unlink, TESTFN)
+        # Create .zip of 3 members with Han names encoded in Shift JIS.
+        # Each name is 1 Han character encoding to 2 bytes in Shift JIS.
+        # The ASCII names are arbitrary as long as they are length 2 and
+        # not otherwise contained in the zip file.
+        # Data elements are encoded bytes (ascii, utf-8, shift_jis).
+        placeholders = ["n1", "n2"] + self.file_names[2:]
+        with zipfile.ZipFile(TESTFN, mode="w") as tf:
+            for temp, content in zip(placeholders, self.file_content):
+                tf.writestr(temp, content, zipfile.ZIP_STORED)
+        # Hack in the Shift JIS names with flag bit 11 (UTF-8) unset.
+        with open(TESTFN, "rb") as tf:
+            data = tf.read()
+        for name, temp in zip(self.file_names, placeholders[:2]):
+            data = data.replace(temp.encode('ascii'),
+                                name.encode('shift_jis'))
+        with open(TESTFN, "wb") as tf:
+            tf.write(data)
+
+    def _test_read(self, zipfp, expected_names, expected_content):
+        # Check the namelist
+        names = zipfp.namelist()
+        self.assertEqual(sorted(names), sorted(expected_names))
+
+        # Check infolist
+        infos = zipfp.infolist()
+        names = [zi.filename for zi in infos]
+        self.assertEqual(sorted(names), sorted(expected_names))
+
+        # check getinfo
+        for name, content in zip(expected_names, expected_content):
+            info = zipfp.getinfo(name)
+            self.assertEqual(info.filename, name)
+            self.assertEqual(info.file_size, len(content))
+            self.assertEqual(zipfp.read(name), content)
+
+    def test_read_with_metadata_encoding(self):
+        # Read the ZIP archive with correct metadata_encoding
+        with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
+            self._test_read(zipfp, self.file_names, self.file_content)
+
+    def test_read_without_metadata_encoding(self):
+        # Read the ZIP archive without metadata_encoding
+        expected_names = [name.encode('shift_jis').decode('cp437')
+                          for name in self.file_names[:2]] + self.file_names[2:]
+        with zipfile.ZipFile(TESTFN, "r") as zipfp:
+            self._test_read(zipfp, expected_names, self.file_content)
+
+    def test_read_with_incorrect_metadata_encoding(self):
+        # Read the ZIP archive with incorrect metadata_encoding
+        expected_names = [name.encode('shift_jis').decode('koi8-u')
+                          for name in self.file_names[:2]] + self.file_names[2:]
+        with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp:
+            self._test_read(zipfp, expected_names, self.file_content)
+
+    def test_read_with_unsuitable_metadata_encoding(self):
+        # Read the ZIP archive with metadata_encoding unsuitable for
+        # decoding metadata
+        with self.assertRaises(UnicodeDecodeError):
+            zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii')
+        with self.assertRaises(UnicodeDecodeError):
+            zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8')
+
+    def test_read_after_append(self):
+        newname = '\u56db'  # Han 'four'
+        expected_names = [name.encode('shift_jis').decode('cp437')
+                          for name in self.file_names[:2]] + self.file_names[2:]
+        expected_names.append(newname)
+        expected_content = (*self.file_content, b"newcontent")
+
+        with zipfile.ZipFile(TESTFN, "a") as zipfp:
+            zipfp.writestr(newname, "newcontent")
+            self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
+
+        with zipfile.ZipFile(TESTFN, "r") as zipfp:
+            self._test_read(zipfp, expected_names, expected_content)
+
+        with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
+            self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
+            for i, (name, content) in enumerate(zip(expected_names, expected_content)):
+                info = zipfp.getinfo(name)
+                self.assertEqual(info.filename, name)
+                self.assertEqual(info.file_size, len(content))
+                if i < 2:
+                    with self.assertRaises(zipfile.BadZipFile):
+                        zipfp.read(name)
+                else:
+                    self.assertEqual(zipfp.read(name), content)
+
+    def test_write_with_metadata_encoding(self):
+        ZF = zipfile.ZipFile
+        for mode in ("w", "x", "a"):
+            with self.assertRaisesRegex(ValueError,
+                                        "^metadata_encoding is only"):
+                ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
+
+    def test_cli_with_metadata_encoding(self):
+        errmsg = "Non-conforming encodings not supported with -c."
+        args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]
+        with captured_stdout() as stdout:
+            with captured_stderr() as stderr:
+                self.assertRaises(SystemExit, zipfile.main, args)
+        self.assertEqual(stdout.getvalue(), "")
+        self.assertIn(errmsg, stderr.getvalue())
+
+        with captured_stdout() as stdout:
+            zipfile.main(["--metadata-encoding=shift_jis", "-t", TESTFN])
+        listing = stdout.getvalue()
+
+        with captured_stdout() as stdout:
+            zipfile.main(["--metadata-encoding=shift_jis", "-l", TESTFN])
+        listing = stdout.getvalue()
+        for name in self.file_names:
+            self.assertIn(name, listing)
+
+        os.mkdir(TESTFN2)
+        self.addCleanup(rmtree, TESTFN2)
+        zipfile.main(["--metadata-encoding=shift_jis", "-e", TESTFN, TESTFN2])
+        listing = os.listdir(TESTFN2)
+        for name in self.file_names:
+            self.assertIn(name, listing)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 385adc897317f..721834aff13a7 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -480,7 +480,7 @@ def FileHeader(self, zip64=None):
 
     def _encodeFilenameFlags(self):
         try:
-            return self.filename.encode('ascii'), self.flag_bits
+            return self.filename.encode('ascii'), self.flag_bits & ~_MASK_UTF_FILENAME
         except UnicodeEncodeError:
             return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
 
@@ -1240,7 +1240,7 @@ class ZipFile:
     _windows_illegal_name_trans_table = None
 
     def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
-                 compresslevel=None, *, strict_timestamps=True):
+                 compresslevel=None, *, strict_timestamps=True, metadata_encoding=None):
         """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
         or append 'a'."""
         if mode not in ('r', 'w', 'x', 'a'):
@@ -1259,6 +1259,12 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
         self.pwd = None
         self._comment = b''
         self._strict_timestamps = strict_timestamps
+        self.metadata_encoding = metadata_encoding
+
+        # Check that we don't try to write with nonconforming codecs
+        if self.metadata_encoding and mode != 'r':
+            raise ValueError(
+                "metadata_encoding is only supported for reading files")
 
         # Check if we were passed a file-like object
         if isinstance(file, os.PathLike):
@@ -1389,13 +1395,13 @@ def _RealGetContents(self):
             if self.debug > 2:
                 print(centdir)
             filename = fp.read(centdir[_CD_FILENAME_LENGTH])
-            flags = centdir[5]
+            flags = centdir[_CD_FLAG_BITS]
             if flags & _MASK_UTF_FILENAME:
                 # UTF-8 file names extension
                 filename = filename.decode('utf-8')
             else:
                 # Historical ZIP filename encoding
-                filename = filename.decode('cp437')
+                filename = filename.decode(self.metadata_encoding or 'cp437')
             # Create ZipInfo instance to store file information
             x = ZipInfo(filename)
             x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
@@ -1572,7 +1578,7 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False):
                 # UTF-8 filename
                 fname_str = fname.decode("utf-8")
             else:
-                fname_str = fname.decode("cp437")
+                fname_str = fname.decode(self.metadata_encoding or "cp437")
 
             if fname_str != zinfo.orig_filename:
                 raise BadZipFile(
@@ -2461,11 +2467,15 @@ def main(args=None):
                        help='Create zipfile from sources')
     group.add_argument('-t', '--test', metavar='<zipfile>',
                        help='Test if a zipfile is valid')
+    parser.add_argument('--metadata-encoding', metavar='<encoding>',
+                        help='Specify encoding of member names for -l, -e and -t')
     args = parser.parse_args(args)
 
+    encoding = args.metadata_encoding
+
     if args.test is not None:
         src = args.test
-        with ZipFile(src, 'r') as zf:
+        with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
             badfile = zf.testzip()
         if badfile:
             print("The following enclosed file is corrupted: {!r}".format(badfile))
@@ -2473,15 +2483,20 @@ def main(args=None):
 
     elif args.list is not None:
         src = args.list
-        with ZipFile(src, 'r') as zf:
+        with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
             zf.printdir()
 
     elif args.extract is not None:
         src, curdir = args.extract
-        with ZipFile(src, 'r') as zf:
+        with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
             zf.extractall(curdir)
 
     elif args.create is not None:
+        if encoding:
+            print("Non-conforming encodings not supported with -c.",
+                  file=sys.stderr)
+            sys.exit(1)
+
         zip_name = args.create.pop(0)
         files = args.create
 
diff --git a/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst b/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst
new file mode 100644
index 0000000000000..08428e63be3d4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-20-15-54-41.bpo-28080.kn35Vk.rst
@@ -0,0 +1,4 @@
+Add the *metadata_encoding* parameter in the :class:`zipfile.ZipFile`
+constructor and the ``--metadata-encoding`` option in the :mod:`zipfile`
+CLI to allow reading zipfiles using non-standard codecs to encode the
+filenames within the archive.



More information about the Python-checkins mailing list