[Python-checkins] bpo-39595: Improve zipfile.Path performance (#18406)

Jason R. Coombs webhook-mailer at python.org
Tue Feb 11 21:58:52 EST 2020


https://github.com/python/cpython/commit/e5bd73632e77dc5ab0cab77e48e94ca5e354be8a
commit: e5bd73632e77dc5ab0cab77e48e94ca5e354be8a
branch: master
author: Jason R. Coombs <jaraco at jaraco.com>
committer: GitHub <noreply at github.com>
date: 2020-02-11T21:58:47-05:00
summary:

bpo-39595: Improve zipfile.Path performance (#18406)

* Improve zipfile.Path performance on zipfiles with a large number of entries.

* 📜🤖 Added by blurb_it.

* Add bpo to blurb

* Sync with importlib_metadata 1.5 (6fe70ca)

* Update blurb.

* Remove compatibility code

* Add stubs module, omitted from earlier commit

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>

files:
A Lib/test/test_importlib/stubs.py
A Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst
M Lib/importlib/metadata.py
M Lib/test/test_importlib/fixtures.py
M Lib/test/test_importlib/test_main.py
M Lib/test/test_zipfile.py
M Lib/zipfile.py

diff --git a/Lib/importlib/metadata.py b/Lib/importlib/metadata.py
index ae8ecf9b8500c..831f593277ccd 100644
--- a/Lib/importlib/metadata.py
+++ b/Lib/importlib/metadata.py
@@ -391,6 +391,7 @@ class FastPath:
 
     def __init__(self, root):
         self.root = root
+        self.base = os.path.basename(root).lower()
 
     def joinpath(self, child):
         return pathlib.Path(self.root, child)
@@ -413,12 +414,11 @@ def zip_children(self):
             )
 
     def is_egg(self, search):
-        root_n_low = os.path.split(self.root)[1].lower()
-
+        base = self.base
         return (
-            root_n_low == search.normalized + '.egg'
-            or root_n_low.startswith(search.prefix)
-            and root_n_low.endswith('.egg'))
+            base == search.versionless_egg_name
+            or base.startswith(search.prefix)
+            and base.endswith('.egg'))
 
     def search(self, name):
         for child in self.children():
@@ -439,6 +439,7 @@ class Prepared:
     prefix = ''
     suffixes = '.dist-info', '.egg-info'
     exact_matches = [''][:0]
+    versionless_egg_name = ''
 
     def __init__(self, name):
         self.name = name
@@ -448,6 +449,7 @@ def __init__(self, name):
         self.prefix = self.normalized + '-'
         self.exact_matches = [
             self.normalized + suffix for suffix in self.suffixes]
+        self.versionless_egg_name = self.normalized + '.egg'
 
 
 class MetadataPathFinder(DistributionFinder):
diff --git a/Lib/test/test_importlib/fixtures.py b/Lib/test/test_importlib/fixtures.py
index 0b4ce18d5a6cd..695c92a786cb0 100644
--- a/Lib/test/test_importlib/fixtures.py
+++ b/Lib/test/test_importlib/fixtures.py
@@ -47,14 +47,28 @@ def tempdir_as_cwd():
             yield tmp
 
 
-class SiteDir:
+ at contextlib.contextmanager
+def install_finder(finder):
+    sys.meta_path.append(finder)
+    try:
+        yield
+    finally:
+        sys.meta_path.remove(finder)
+
+
+class Fixtures:
     def setUp(self):
         self.fixtures = ExitStack()
         self.addCleanup(self.fixtures.close)
+
+
+class SiteDir(Fixtures):
+    def setUp(self):
+        super(SiteDir, self).setUp()
         self.site_dir = self.fixtures.enter_context(tempdir())
 
 
-class OnSysPath:
+class OnSysPath(Fixtures):
     @staticmethod
     @contextlib.contextmanager
     def add_sys_path(dir):
@@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()):
 def DALS(str):
     "Dedent and left-strip"
     return textwrap.dedent(str).lstrip()
+
+
+class NullFinder:
+    def find_module(self, name):
+        pass
diff --git a/Lib/test/test_importlib/stubs.py b/Lib/test/test_importlib/stubs.py
new file mode 100644
index 0000000000000..e5b011c399fa9
--- /dev/null
+++ b/Lib/test/test_importlib/stubs.py
@@ -0,0 +1,10 @@
+import unittest
+
+
+class fake_filesystem_unittest:
+    """
+    Stubbed version of the pyfakefs module
+    """
+    class TestCase(unittest.TestCase):
+        def setUpPyfakefs(self):
+            self.skipTest("pyfakefs not available")
diff --git a/Lib/test/test_importlib/test_main.py b/Lib/test/test_importlib/test_main.py
index c5f1dbbae325e..42a79992ecc8c 100644
--- a/Lib/test/test_importlib/test_main.py
+++ b/Lib/test/test_importlib/test_main.py
@@ -7,6 +7,11 @@
 import unittest
 import importlib.metadata
 
+try:
+    import pyfakefs.fake_filesystem_unittest as ffs
+except ImportError:
+    from .stubs import fake_filesystem_unittest as ffs
+
 from . import fixtures
 from importlib.metadata import (
     Distribution, EntryPoint,
@@ -185,6 +190,33 @@ def test_egg(self):
                 version('foo')
 
 
+class MissingSysPath(fixtures.OnSysPath, unittest.TestCase):
+    site_dir = '/does-not-exist'
+
+    def test_discovery(self):
+        """
+        Discovering distributions should succeed even if
+        there is an invalid path on sys.path.
+        """
+        importlib.metadata.distributions()
+
+
+class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase):
+    site_dir = '/access-denied'
+
+    def setUp(self):
+        super(InaccessibleSysPath, self).setUp()
+        self.setUpPyfakefs()
+        self.fs.create_dir(self.site_dir, perm_bits=000)
+
+    def test_discovery(self):
+        """
+        Discovering distributions should succeed even if
+        there is an invalid path on sys.path.
+        """
+        list(importlib.metadata.distributions())
+
+
 class TestEntryPoints(unittest.TestCase):
     def __init__(self, *args):
         super(TestEntryPoints, self).__init__(*args)
diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py
index c334715f3d81b..09fc850600610 100644
--- a/Lib/test/test_zipfile.py
+++ b/Lib/test/test_zipfile.py
@@ -2724,16 +2724,71 @@ def test_extract_command(self):
                                 self.assertEqual(f.read(), zf.read(zi))
 
 
+class TestExecutablePrependedZip(unittest.TestCase):
+    """Test our ability to open zip files with an executable prepended."""
+
+    def setUp(self):
+        self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
+        self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
+
+    def _test_zip_works(self, name):
+        # bpo28494 sanity check: ensure is_zipfile works on these.
+        self.assertTrue(zipfile.is_zipfile(name),
+                        f'is_zipfile failed on {name}')
+        # Ensure we can operate on these via ZipFile.
+        with zipfile.ZipFile(name) as zipfp:
+            for n in zipfp.namelist():
+                data = zipfp.read(n)
+                self.assertIn(b'FAVORITE_NUMBER', data)
+
+    def test_read_zip_with_exe_prepended(self):
+        self._test_zip_works(self.exe_zip)
+
+    def test_read_zip64_with_exe_prepended(self):
+        self._test_zip_works(self.exe_zip64)
+
+    @unittest.skipUnless(sys.executable, 'sys.executable required.')
+    @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
+                         'Test relies on #!/bin/bash working.')
+    def test_execute_zip2(self):
+        output = subprocess.check_output([self.exe_zip, sys.executable])
+        self.assertIn(b'number in executable: 5', output)
+
+    @unittest.skipUnless(sys.executable, 'sys.executable required.')
+    @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
+                         'Test relies on #!/bin/bash working.')
+    def test_execute_zip64(self):
+        output = subprocess.check_output([self.exe_zip64, sys.executable])
+        self.assertIn(b'number in executable: 5', output)
+
+
 # Poor man's technique to consume a (smallish) iterable.
 consume = tuple
 
 
+# from jaraco.itertools 5.0
+class jaraco:
+    class itertools:
+        class Counter:
+            def __init__(self, i):
+                self.count = 0
+                self._orig_iter = iter(i)
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                result = next(self._orig_iter)
+                self.count += 1
+                return result
+
+
 def add_dirs(zf):
     """
     Given a writable zip file zf, inject directory entries for
     any directories implied by the presence of children.
     """
-    for name in zipfile.Path._implied_dirs(zf.namelist()):
+    for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()):
         zf.writestr(name, b"")
     return zf
 
@@ -2774,44 +2829,6 @@ def build_alpharep_fixture():
     return zf
 
 
-class TestExecutablePrependedZip(unittest.TestCase):
-    """Test our ability to open zip files with an executable prepended."""
-
-    def setUp(self):
-        self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
-        self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
-
-    def _test_zip_works(self, name):
-        # bpo-28494 sanity check: ensure is_zipfile works on these.
-        self.assertTrue(zipfile.is_zipfile(name),
-                        f'is_zipfile failed on {name}')
-        # Ensure we can operate on these via ZipFile.
-        with zipfile.ZipFile(name) as zipfp:
-            for n in zipfp.namelist():
-                data = zipfp.read(n)
-                self.assertIn(b'FAVORITE_NUMBER', data)
-
-    def test_read_zip_with_exe_prepended(self):
-        self._test_zip_works(self.exe_zip)
-
-    def test_read_zip64_with_exe_prepended(self):
-        self._test_zip_works(self.exe_zip64)
-
-    @unittest.skipUnless(sys.executable, 'sys.executable required.')
-    @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
-                         'Test relies on #!/bin/bash working.')
-    def test_execute_zip2(self):
-        output = subprocess.check_output([self.exe_zip, sys.executable])
-        self.assertIn(b'number in executable: 5', output)
-
-    @unittest.skipUnless(sys.executable, 'sys.executable required.')
-    @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
-                         'Test relies on #!/bin/bash working.')
-    def test_execute_zip64(self):
-        output = subprocess.check_output([self.exe_zip64, sys.executable])
-        self.assertIn(b'number in executable: 5', output)
-
-
 class TestPath(unittest.TestCase):
     def setUp(self):
         self.fixtures = contextlib.ExitStack()
@@ -2849,6 +2866,14 @@ def test_iterdir_and_types(self):
             i, = h.iterdir()
             assert i.is_file()
 
+    def test_subdir_is_dir(self):
+        for alpharep in self.zipfile_alpharep():
+            root = zipfile.Path(alpharep)
+            assert (root / 'b').is_dir()
+            assert (root / 'b/').is_dir()
+            assert (root / 'g').is_dir()
+            assert (root / 'g/').is_dir()
+
     def test_open(self):
         for alpharep in self.zipfile_alpharep():
             root = zipfile.Path(alpharep)
@@ -2910,6 +2935,45 @@ def test_missing_dir_parent(self):
             root = zipfile.Path(alpharep)
             assert (root / 'missing dir/').parent.at == ''
 
+    def test_mutability(self):
+        """
+        If the underlying zipfile is changed, the Path object should
+        reflect that change.
+        """
+        for alpharep in self.zipfile_alpharep():
+            root = zipfile.Path(alpharep)
+            a, b, g = root.iterdir()
+            alpharep.writestr('foo.txt', 'foo')
+            alpharep.writestr('bar/baz.txt', 'baz')
+            assert any(
+                child.name == 'foo.txt'
+                for child in root.iterdir())
+            assert (root / 'foo.txt').read_text() == 'foo'
+            baz, = (root / 'bar').iterdir()
+            assert baz.read_text() == 'baz'
+
+    HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13
+
+    def huge_zipfile(self):
+        """Create a read-only zipfile with a huge number of entries entries."""
+        strm = io.BytesIO()
+        zf = zipfile.ZipFile(strm, "w")
+        for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)):
+            zf.writestr(entry, entry)
+        zf.mode = 'r'
+        return zf
+
+    def test_joinpath_constant_time(self):
+        """
+        Ensure joinpath on items in zipfile is linear time.
+        """
+        root = zipfile.Path(self.huge_zipfile())
+        entries = jaraco.itertools.Counter(root.iterdir())
+        for entry in entries:
+            entry.joinpath('suffix')
+        # Check the file iterated all items
+        assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 2da87ef505e6e..4510fac250b97 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -16,6 +16,8 @@
 import sys
 import threading
 import time
+import contextlib
+from collections import OrderedDict
 
 try:
     import zlib # We may need its compression method
@@ -2159,6 +2161,79 @@ def _ancestry(path):
         path, tail = posixpath.split(path)
 
 
+class CompleteDirs(ZipFile):
+    """
+    A ZipFile subclass that ensures that implied directories
+    are always included in the namelist.
+    """
+
+    @staticmethod
+    def _implied_dirs(names):
+        parents = itertools.chain.from_iterable(map(_parents, names))
+        # Deduplicate entries in original order
+        implied_dirs = OrderedDict.fromkeys(
+            p + posixpath.sep for p in parents
+            # Cast names to a set for O(1) lookups
+            if p + posixpath.sep not in set(names)
+        )
+        return implied_dirs
+
+    def namelist(self):
+        names = super(CompleteDirs, self).namelist()
+        return names + list(self._implied_dirs(names))
+
+    def _name_set(self):
+        return set(self.namelist())
+
+    def resolve_dir(self, name):
+        """
+        If the name represents a directory, return that name
+        as a directory (with the trailing slash).
+        """
+        names = self._name_set()
+        dirname = name + '/'
+        dir_match = name not in names and dirname in names
+        return dirname if dir_match else name
+
+    @classmethod
+    def make(cls, source):
+        """
+        Given a source (filename or zipfile), return an
+        appropriate CompleteDirs subclass.
+        """
+        if isinstance(source, CompleteDirs):
+            return source
+
+        if not isinstance(source, ZipFile):
+            return cls(source)
+
+        # Only allow for FastPath when supplied zipfile is read-only
+        if 'r' not in source.mode:
+            cls = CompleteDirs
+
+        res = cls.__new__(cls)
+        vars(res).update(vars(source))
+        return res
+
+
+class FastLookup(CompleteDirs):
+    """
+    ZipFile subclass to ensure implicit
+    dirs exist and are resolved rapidly.
+    """
+    def namelist(self):
+        with contextlib.suppress(AttributeError):
+            return self.__names
+        self.__names = super(FastLookup, self).namelist()
+        return self.__names
+
+    def _name_set(self):
+        with contextlib.suppress(AttributeError):
+            return self.__lookup
+        self.__lookup = super(FastLookup, self)._name_set()
+        return self.__lookup
+
+
 class Path:
     """
     A pathlib-compatible interface for zip files.
@@ -2227,7 +2302,7 @@ class Path:
     __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
 
     def __init__(self, root, at=""):
-        self.root = root if isinstance(root, ZipFile) else ZipFile(root)
+        self.root = FastLookup.make(root)
         self.at = at
 
     @property
@@ -2259,12 +2334,12 @@ def is_file(self):
         return not self.is_dir()
 
     def exists(self):
-        return self.at in self._names()
+        return self.at in self.root._name_set()
 
     def iterdir(self):
         if not self.is_dir():
             raise ValueError("Can't listdir a file")
-        subs = map(self._next, self._names())
+        subs = map(self._next, self.root.namelist())
         return filter(self._is_child, subs)
 
     def __str__(self):
@@ -2275,25 +2350,10 @@ def __repr__(self):
 
     def joinpath(self, add):
         next = posixpath.join(self.at, add)
-        next_dir = posixpath.join(self.at, add, "")
-        names = self._names()
-        return self._next(next_dir if next not in names and next_dir in names else next)
+        return self._next(self.root.resolve_dir(next))
 
     __truediv__ = joinpath
 
-    @staticmethod
-    def _implied_dirs(names):
-        return _unique_everseen(
-            parent + "/"
-            for name in names
-            for parent in _parents(name)
-            if parent + "/" not in names
-        )
-
-    @classmethod
-    def _add_implied_dirs(cls, names):
-        return names + list(cls._implied_dirs(names))
-
     @property
     def parent(self):
         parent_at = posixpath.dirname(self.at.rstrip('/'))
@@ -2301,9 +2361,6 @@ def parent(self):
             parent_at += '/'
         return self._next(parent_at)
 
-    def _names(self):
-        return self._add_implied_dirs(self.root.namelist())
-
 
 def main(args=None):
     import argparse
@@ -2365,5 +2422,6 @@ def addToZip(zf, path, zippath):
                     zippath = ''
                 addToZip(zf, path, zippath)
 
+
 if __name__ == "__main__":
     main()
diff --git a/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst
new file mode 100644
index 0000000000000..3a461389af7d1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2020-02-07-23-14-14.bpo-39595.DHwddE.rst
@@ -0,0 +1 @@
+Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 <https://importlib-metadata.readthedocs.io/en/latest/changelog%20(links).html#v1-5-0>`_.



More information about the Python-checkins mailing list