[Python-checkins] [3.12] GH-105113: Improve performance of `pathlib.PurePath.match()` (GH-105114)

Wed May 31 16:37:44 EDT 2023

https://github.com/python/cpython/commit/e7cb216339e8aa8d00f7f55f9e37864b1720ca0f
commit: e7cb216339e8aa8d00f7f55f9e37864b1720ca0f
branch: 3.12
author: Barney Gale <barney.gale at gmail.com>
committer: barneygale <barney.gale at gmail.com>
date: 2023-05-31T21:37:37+01:00
summary:

[3.12] GH-105113: Improve performance of `pathlib.PurePath.match()` (GH-105114)

We now compile a `re.Pattern` object for the entire pattern. This is made
more difficult by `fnmatch` not treating directory separators as special
when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts
onto separate *lines* in a string, and ensure we don't set `re.DOTALL`.

Co-authored-by: Hugo van Kemenade <hugovk at users.noreply.github.com>
Co-authored-by: Alex Waygood <Alex.Waygood at Gmail.com>

files:
A Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst
M Doc/library/pathlib.rst
M Lib/pathlib.py

diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst
index 627f2df9263d..39b9eb84e7e0 100644
--- a/Doc/library/pathlib.rst
+++ b/Doc/library/pathlib.rst
@@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
       >>> PurePath('a/b.py').match('/*.py')
       False
 
+   The *pattern* may be another path object; this speeds up matching the same
+   pattern against multiple files::
+
+      >>> pattern = PurePath('*.py')
+      >>> PurePath('a/b.py').match(pattern)
+      True
+
    As with other methods, case-sensitivity follows platform defaults::
 
       >>> PurePosixPath('b.py').match('*.PY')
diff --git a/Lib/pathlib.py b/Lib/pathlib.py
index a42085e3a3f8..29517e4c74db 100644
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -54,6 +54,7 @@ def _ignore_error(exception):
             getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
 
 
+ at functools.cache
 def _is_case_sensitive(flavour):
     return flavour.normcase('Aa') == 'Aa'
 
@@ -61,6 +62,22 @@ def _is_case_sensitive(flavour):
 # Globbing helpers
 #
 
+
+# fnmatch.translate() returns a regular expression that includes a prefix and
+# a suffix, which enable matching newlines and ensure the end of the string is
+# matched, respectively. These features are undesirable for our implementation
+# of PurePatch.match(), which represents path separators as newlines and joins
+# pattern segments together. As a workaround, we define a slice object that
+# can remove the prefix and suffix from any translate() result. See the
+# _compile_pattern_lines() function for more details.
+_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
+_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
+_SWAP_SEP_AND_NEWLINE = {
+    '/': str.maketrans({'/': '\n', '\n': '/'}),
+    '\\': str.maketrans({'\\': '\n', '\n': '\\'}),
+}
+
+
 @functools.lru_cache()
 def _make_selector(pattern_parts, flavour, case_sensitive):
     pat = pattern_parts[0]
@@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive):
     return re.compile(fnmatch.translate(pat), flags).match
 
 
+ at functools.lru_cache()
+def _compile_pattern_lines(pattern_lines, case_sensitive):
+    """Compile the given pattern lines to an `re.Pattern` object.
+
+    The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with
+    its path separators and newlines swapped (e.g. '*\n*.py`). By using
+    newlines to separate path components, and not setting `re.DOTALL`, we
+    ensure that the `*` wildcard cannot match path separators.
+
+    The returned `re.Pattern` object may have its `match()` method called to
+    match a complete pattern, or `search()` to match from the right. The
+    argument supplied to these methods must also have its path separators and
+    newlines swapped.
+    """
+
+    # Match the start of the path, or just after a path separator
+    parts = ['^']
+    for part in pattern_lines.splitlines(keepends=True):
+        # We slice off the common prefix and suffix added by translate() to
+        # ensure that re.DOTALL is not set, and the end of the string not
+        # matched, respectively. With DOTALL not set, '*' wildcards will not
+        # match path separators, because the '.' characters in the pattern
+        # will not match newlines.
+        parts.append(fnmatch.translate(part)[_FNMATCH_SLICE])
+    # Match the end of the path, always.
+    parts.append(r'\Z')
+    flags = re.MULTILINE
+    if not case_sensitive:
+        flags |= re.IGNORECASE
+    return re.compile(''.join(parts), flags=flags)
+
+
 class _Selector:
     """A selector matches a specific glob pattern part against the children
     of a given path."""
@@ -274,6 +323,10 @@ class PurePath(object):
         # to implement comparison methods like `__lt__()`.
         '_parts_normcase_cached',
 
+        # The `_lines_cached` slot stores the string path with path separators
+        # and newlines swapped. This is used to implement `match()`.
+        '_lines_cached',
+
         # The `_hash` slot stores the hash of the case-normalized string
         # path. It's set when `__hash__()` is called for the first time.
         '_hash',
@@ -439,6 +492,16 @@ def _parts_normcase(self):
             self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
             return self._parts_normcase_cached
 
+    @property
+    def _lines(self):
+        # Path with separators and newlines swapped, for pattern matching.
+        try:
+            return self._lines_cached
+        except AttributeError:
+            trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
+            self._lines_cached = str(self).translate(trans)
+            return self._lines_cached
+
     def __eq__(self, other):
         if not isinstance(other, PurePath):
             return NotImplemented
@@ -695,23 +758,18 @@ def match(self, path_pattern, *, case_sensitive=None):
         """
         Return True if this path matches the given pattern.
         """
+        if not isinstance(path_pattern, PurePath):
+            path_pattern = self.with_segments(path_pattern)
         if case_sensitive is None:
             case_sensitive = _is_case_sensitive(self._flavour)
-        pat = self.with_segments(path_pattern)
-        if not pat.parts:
+        pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
+        if path_pattern.drive or path_pattern.root:
+            return pattern.match(self._lines) is not None
+        elif path_pattern._tail:
+            return pattern.search(self._lines) is not None
+        else:
             raise ValueError("empty pattern")
-        pat_parts = pat.parts
-        parts = self.parts
-        if pat.drive or pat.root:
-            if len(pat_parts) != len(parts):
-                return False
-        elif len(pat_parts) > len(parts):
-            return False
-        for part, pat in zip(reversed(parts), reversed(pat_parts)):
-            match = _compile_pattern(pat, case_sensitive)
-            if not match(part):
-                return False
-        return True
+
 
 # Can't subclass os.PathLike from PurePath and keep the constructor
 # optimizations in PurePath.__slots__.
diff --git a/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst b/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst
new file mode 100644
index 000000000000..59164ae4734e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-05-30-21-27-41.gh-issue-105113.bDUPl_.rst
@@ -0,0 +1,2 @@
+Improve performance of :meth:`pathlib.PurePath.match` by compiling an
+:class:`re.Pattern` object for the entire pattern.