[Python-checkins] cpython: Issue #13734: Add os.fwalk(), a directory walking function yielding file

charles-francois.natali python-checkins at python.org
Sun Feb 5 15:16:00 CET 2012


http://hg.python.org/cpython/rev/773a97b3927d
changeset:   74795:773a97b3927d
user:        Charles-François Natali <neologix at free.fr>
date:        Sun Feb 05 15:15:38 2012 +0100
summary:
  Issue #13734: Add os.fwalk(), a directory walking function yielding file
descriptors.

files:
  Doc/library/os.rst   |   51 +++++++++++++++
  Doc/whatsnew/3.3.rst |    4 +
  Lib/os.py            |  101 +++++++++++++++++++++++++++++-
  Lib/test/test_os.py  |   60 +++++++++++++++++-
  Misc/NEWS            |    3 +
  5 files changed, 212 insertions(+), 7 deletions(-)


diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -2251,6 +2251,57 @@
               os.rmdir(os.path.join(root, name))
 
 
+.. function:: fwalk(top, topdown=True, onerror=None, followlinks=False)
+
+   .. index::
+      single: directory; walking
+      single: directory; traversal
+
+   This behaves exactly like :func:`walk`, except that it yields a 4-tuple
+   ``(dirpath, dirnames, filenames, dirfd)``.
+
+   *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output,
+   and *dirfd* is a file descriptor referring to the directory *dirpath*.
+
+   .. note::
+
+      Since :func:`fwalk` yields file descriptors, those are only valid until
+      the next iteration step, so you should duplicate them (e.g. with
+      :func:`dup`) if you want to keep them longer.
+
+   This example displays the number of bytes taken by non-directory files in each
+   directory under the starting directory, except that it doesn't look under any
+   CVS subdirectory::
+
+      import os
+      for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+          print(root, "consumes", end="")
+          print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                end="")
+          print("bytes in", len(files), "non-directory files")
+          if 'CVS' in dirs:
+              dirs.remove('CVS')  # don't visit CVS directories
+
+   In the next example, walking the tree bottom-up is essential:
+   :func:`unlinkat` doesn't allow deleting a directory before the directory is
+   empty::
+
+      # Delete everything reachable from the directory named in "top",
+      # assuming there are no symbolic links.
+      # CAUTION:  This is dangerous!  For example, if top == '/', it
+      # could delete all your disk files.
+      import os
+      for root, dirs, files, rootfd in os.fwalk(top, topdown=False):
+          for name in files:
+              os.unlinkat(rootfd, name)
+          for name in dirs:
+              os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+
+   Availability: Unix.
+
+   .. versionadded:: 3.3
+
+
 .. _os-process:
 
 Process Management
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -497,6 +497,10 @@
 
   (Patch submitted by Giampaolo Rodolà in :issue:`10784`.)
 
+* The :mod:`os` module has a new :func:`~os.fwalk` function similar to
+  :func:`~os.walk` except that it also yields file descriptors referring to the
+  directories visited. This is especially useful to avoid symlink races.
+
 * "at" functions (:issue:`4761`):
 
   * :func:`~os.faccessat`
diff --git a/Lib/os.py b/Lib/os.py
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -24,6 +24,7 @@
 #'
 
 import sys, errno
+import stat as st
 
 _names = sys.builtin_module_names
 
@@ -32,6 +33,9 @@
            "defpath", "name", "path", "devnull",
            "SEEK_SET", "SEEK_CUR", "SEEK_END"]
 
+def _exists(name):
+    return name in globals()
+
 def _get_exports_list(module):
     try:
         return list(module.__all__)
@@ -120,7 +124,12 @@
     umask(mask)
     return mode & ~mask
 
-#'
+def _are_same_file(stat1, stat2):
+    """Helper function that checks whether two stat results refer to the same
+    file.
+    """
+    return (stat1.st_ino == stat2.st_ino and stat1.st_dev == stat2.st_dev)
+#
 
 # Super directory utilities.
 # (Inspired by Eric Raymond; the doc strings are mostly his)
@@ -151,7 +160,6 @@
     try:
         mkdir(name, mode)
     except OSError as e:
-        import stat as st
         if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and
                 st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)):
             raise
@@ -298,6 +306,92 @@
 
 __all__.append("walk")
 
+if _exists("openat"):
+
+    def fwalk(top, topdown=True, onerror=None, followlinks=False):
+        """Directory tree generator.
+
+        This behaves exactly like walk(), except that it yields a 4-tuple
+
+            dirpath, dirnames, filenames, dirfd
+
+        `dirpath`, `dirnames` and `filenames` are identical to walk() output,
+        and `dirfd` is a file descriptor referring to the directory `dirpath`.
+
+        The advantage of fwalk() over walk() is that it's safe against symlink
+        races (when followlinks is False).
+
+        Caution:
+        Since fwalk() yields file descriptors, those are only valid until the
+        next iteration step, so you should dup() them if you want to keep them
+        for a longer period.
+
+        Example:
+
+        import os
+        for root, dirs, files, rootfd in os.fwalk('python/Lib/email'):
+            print(root, "consumes", end="")
+            print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                  end="")
+            print("bytes in", len(files), "non-directory files")
+            if 'CVS' in dirs:
+                dirs.remove('CVS')  # don't visit CVS directories
+        """
+        # Note: To guard against symlink races, we use the standard
+        # lstat()/open()/fstat() trick.
+        orig_st = lstat(top)
+        topfd = open(top, O_RDONLY)
+        try:
+            if (followlinks or (st.S_ISDIR(orig_st.st_mode) and
+                               _are_same_file(orig_st, fstat(topfd)))):
+                for x in _fwalk(topfd, top, topdown, onerror, followlinks):
+                    yield x
+        finally:
+            close(topfd)
+
+    def _fwalk(topfd, toppath, topdown, onerror, followlinks):
+        # Note: This uses O(depth of the directory tree) file descriptors: if
+        # necessary, it can be adapted to only require O(1) FDs, see issue
+        # #13734.
+
+        # whether to follow symlinks
+        flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW
+
+        names = fdlistdir(topfd)
+        dirs, nondirs = [], []
+        for name in names:
+            # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with
+            # walk() which reports symlinks to directories as directories. We do
+            # however check for symlinks before recursing into a subdirectory.
+            if st.S_ISDIR(fstatat(topfd, name).st_mode):
+                dirs.append(name)
+            else:
+                nondirs.append(name)
+
+        if topdown:
+            yield toppath, dirs, nondirs, topfd
+
+        for name in dirs:
+            try:
+                orig_st = fstatat(topfd, name, flag)
+                dirfd = openat(topfd, name, O_RDONLY)
+            except error as err:
+                if onerror is not None:
+                    onerror(err)
+                return
+            try:
+                if followlinks or _are_same_file(orig_st, fstat(dirfd)):
+                    dirpath = path.join(toppath, name)
+                    for x in _fwalk(dirfd, dirpath, topdown, onerror, followlinks):
+                        yield x
+            finally:
+                close(dirfd)
+
+        if not topdown:
+            yield toppath, dirs, nondirs, topfd
+
+    __all__.append("fwalk")
+
 # Make sure os.environ exists, at least
 try:
     environ
@@ -598,9 +692,6 @@
 fsencode, fsdecode = _fscodec()
 del _fscodec
 
-def _exists(name):
-    return name in globals()
-
 # Supply spawn*() (probably only for Unix)
 if _exists("fork") and not _exists("spawnv") and _exists("execv"):
 
diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@@ -20,6 +20,8 @@
 import asyncore
 import asynchat
 import socket
+import itertools
+import stat
 try:
     import threading
 except ImportError:
@@ -159,7 +161,6 @@
         if not hasattr(os, "stat"):
             return
 
-        import stat
         result = os.stat(fname)
 
         # Make sure direct access works
@@ -476,7 +477,7 @@
 class WalkTests(unittest.TestCase):
     """Tests for os.walk()."""
 
-    def test_traversal(self):
+    def setUp(self):
         import os
         from os.path import join
 
@@ -586,6 +587,60 @@
                     os.remove(dirname)
         os.rmdir(support.TESTFN)
 
+
+@unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()")
+class FwalkTests(WalkTests):
+    """Tests for os.fwalk()."""
+
+    def test_compare_to_walk(self):
+        # compare with walk() results
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            expected = {}
+            for root, dirs, files in os.walk(*args):
+                expected[root] = (set(dirs), set(files))
+
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                self.assertIn(root, expected)
+                self.assertEqual(expected[root], (set(dirs), set(files)))
+
+    def test_dir_fd(self):
+        # check returned file descriptors
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            for root, dirs, files, rootfd in os.fwalk(*args):
+                # check that the FD is valid
+                os.fstat(rootfd)
+                # check that fdlistdir() returns consistent information
+                self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | set(files))
+
+    def test_fd_leak(self):
+        # Since we're opening a lot of FDs, we must be careful to avoid leaks:
+        # we both check that calling fwalk() a large number of times doesn't
+        # yield EMFILE, and that the minimum allocated FD hasn't changed.
+        minfd = os.dup(1)
+        os.close(minfd)
+        for i in range(256):
+            for x in os.fwalk(support.TESTFN):
+                pass
+        newfd = os.dup(1)
+        self.addCleanup(os.close, newfd)
+        self.assertEqual(newfd, minfd)
+
+    def tearDown(self):
+        # cleanup
+        for root, dirs, files, rootfd in os.fwalk(support.TESTFN, topdown=False):
+            for name in files:
+                os.unlinkat(rootfd, name)
+            for name in dirs:
+                st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW)
+                if stat.S_ISDIR(st.st_mode):
+                    os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+                else:
+                    os.unlinkat(rootfd, name)
+        os.rmdir(support.TESTFN)
+
+
 class MakedirTests(unittest.TestCase):
     def setUp(self):
         os.mkdir(support.TESTFN)
@@ -1700,6 +1755,7 @@
         StatAttributeTests,
         EnvironTests,
         WalkTests,
+        FwalkTests,
         MakedirTests,
         DevNullTests,
         URandomTests,
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -466,6 +466,9 @@
 Library
 -------
 
+- Issue #13734: Add os.fwalk(), a directory walking function yielding file
+  descriptors.
+
 - Issue #2945: Make the distutils upload command aware of bdist_rpm products.
 
 - Issue #13712: pysetup create should not convert package_data to extra_files.

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list