[issue13734] Add a generic directory walker method to avoid symlink attacks

Wed Jan 11 19:58:53 CET 2012

Charles-François Natali <neologix at free.fr> added the comment:

Here's an updated version.

Note that I'm not pushing towards changing the current behavior
pertaining to symlinks to directories, because changing it would
break existing code.
For example, to count the number of lines in all the files under a
directory, the code could look like this:

for root, dirs, files in os.walk(top):
    for file in files:
        f = open(file)
        for n, l in enumerate(f, 1):
            pass
        print(file, n)

If, suddenly, a symlink to a directory appeared in files, this will
break. So I'm not convinced it's worth changing this. A symlink to a
directory is not much closer to a file than to a directory, it really
depends on the use case.
I'm also fine with keeping fdwalk() consistent with this to make
porting easier (and also because it makes it easy to test: I just have
to compare fdwalk()'s output to walk()'s output).

----------
Added file: http://bugs.python.org/file24202/fdwalk-1.diff

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue13734>
_______________________________________
-------------- next part --------------

diff --git a/Doc/library/os.rst b/Doc/library/os.rst
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -2240,6 +2240,58 @@
               os.rmdir(os.path.join(root, name))
 
 
+.. function:: fdwalk(top, topdown=True, onerror=None, followlinks=False)
+
+   .. index::
+      single: directory; walking
+      single: directory; traversal
+
+   This behaves exactly like :func:`walk`, except that it yields a 4-tuple
+   ``(dirpath, dirnames, filenames, dirfd)``.
+
+   *dirpath*, *dirnames* and *filenames* are identical to :func:`walk` output,
+   and *dirfd* is a file descriptor referring to the directory *dirpath*.
+
+   .. note::
+
+      Since :func:`fdwalk` yields file descriptors, those are only valid until
+      the next iteration step, so you should duplicate them (e.g. with
+      :func:`dup`) if you want to keep them longer.
+
+   .. note::
+
+      Unlike :func:`walk`, modifying the *dirnames* list in-place won't
+      affect the directories traversed.
+
+   This example displays the number of bytes taken by non-directory files in each
+   directory under the starting directory::
+
+      import os
+      for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'):
+          print(root, "consumes", end="")
+          print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                end="")
+          print("bytes in", len(files), "non-directory files")
+
+   In the next example, walking the tree bottom-up is essential:
+   :func:`unlinkat` doesn't allow deleting a directory before the directory is
+   empty::
+
+      # Delete everything reachable from the directory named in "top",
+      # assuming there are no symbolic links.
+      # CAUTION:  This is dangerous!  For example, if top == '/', it
+      # could delete all your disk files.
+      import os
+      for root, dirs, files, rootfd in os.fdwalk(top, topdown=False):
+          for name in files:
+              os.unlinkat(rootfd, name)
+          for name in dirs:
+              os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+
+   Availability: Unix.
+
+   .. versionadded:: 3.3
+
 .. _os-process:
 
 Process Management
diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -478,6 +478,10 @@
 
   (Patch submitted by Giampaolo Rodolà in :issue:`10784`.)
 
+* The :mod:`os` module has a new :func:`~os.fdwalk` function similar to
+  :func:`~os.walk` except that it also yields file descriptors referring to the
+  directories visited. This is especially useful to avoid symlink races.
+
 * "at" functions (:issue:`4761`):
 
   * :func:`~os.faccessat`
diff --git a/Lib/os.py b/Lib/os.py
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -24,6 +24,7 @@
 #'
 
 import sys, errno
+import stat as st
 
 _names = sys.builtin_module_names
 
@@ -32,6 +33,9 @@
            "defpath", "name", "path", "devnull",
            "SEEK_SET", "SEEK_CUR", "SEEK_END"]
 
+def _exists(name):
+    return name in globals()
+
 def _get_exports_list(module):
     try:
         return list(module.__all__)
@@ -120,7 +124,13 @@
     umask(mask)
     return mode & ~mask
 
-#'
+def _are_same_file(stat1, stat2):
+    """Helper function that checks whether two stat results refer to the same
+    file.
+    """
+    return (stat1.st_mode == stat2.st_mode and stat1.st_ino == stat2.st_ino and
+            stat1.st_dev == stat2.st_dev)
+#
 
 # Super directory utilities.
 # (Inspired by Eric Raymond; the doc strings are mostly his)
@@ -151,7 +161,6 @@
     try:
         mkdir(name, mode)
     except OSError as e:
-        import stat as st
         if not (e.errno == errno.EEXIST and exist_ok and path.isdir(name) and
                 st.S_IMODE(lstat(name).st_mode) == _get_masked_mode(mode)):
             raise
@@ -298,6 +307,94 @@
 
 __all__.append("walk")
 
+if _exists("openat"):
+
+    def fdwalk(top, topdown=True, onerror=None, followlinks=False):
+        """Directory tree generator.
+
+        This behaves exactly like walk(), except that it yields a 4-tuple
+
+            dirpath, dirnames, filenames, dirfd
+
+        `dirpath`, `dirnames` and `filenames` are identical to walk() output,
+        and `dirfd` is a file descriptor referring to the directory `dirpath`.
+
+        The advantage of fdwalk() over walk() is that it's safe against symlink
+        races (when followlinks is False).
+
+        Caution:
+        Since fdwalk() yields file descriptors, those are only valid until the
+        next iteration step, so you should dup() them if you want to keep them
+        for a longer period.
+        Also, unlike walk(), modifying the `dirnames` list in-place won't
+        affect the directories traversed.
+
+        Example:
+
+        import os
+        for root, dirs, files, rootfd in os.fdwalk('python/Lib/email'):
+            print(root, "consumes", end="")
+            print(sum([os.fstatat(rootfd, name).st_size for name in files]),
+                  end="")
+            print("bytes in", len(files), "non-directory files")
+        """
+        # Note: To guard against symlink races, we use the standard
+        # lstat()/open()/fstat() trick.
+        orig_st = lstat(top)
+        topfd = open(top, O_RDONLY)
+        try:
+            if (followlinks or (st.S_ISDIR(orig_st.st_mode) and
+                               _are_same_file(orig_st, fstat(topfd)))):
+                for x in _fdwalk(topfd, top, topdown, onerror, followlinks):
+                    yield x
+        finally:
+            close(topfd)
+
+    def _fdwalk(topfd, toppath, topdown, onerror, followlinks):
+        try:
+            names = fdlistdir(topfd)
+        except error as err:
+            if onerror is not None:
+                onerror(err)
+            return
+
+        dirs, nondirs = [], []
+        for name in names:
+            # Here, we don't use AT_SYMLINK_NOFOLLOW to be consistent with
+            # walk() which reports symlinks to directories as directories. We do
+            # however check for symlinks before recursing into a subdirectory.
+            if st.S_ISDIR(fstatat(topfd, name).st_mode):
+                dirs.append(name)
+            else:
+                nondirs.append(name)
+
+        # whether to follow symlinks
+        flag = 0 if followlinks else AT_SYMLINK_NOFOLLOW
+
+        if topdown:
+            yield toppath, dirs, nondirs, topfd
+
+        for name in dirs:
+            try:
+                orig_st = fstatat(topfd, name, flag)
+                dirfd = openat(topfd, name, O_RDONLY)
+            except error as err:
+                if onerror is not None:
+                    onerror(err)
+                return
+            try:
+                if followlinks or _are_same_file(orig_st, fstat(dirfd)):
+                    dirpath = path.join(toppath, name)
+                    for x in _fdwalk(dirfd, dirpath, topdown, onerror, followlinks):
+                        yield x
+            finally:
+                close(dirfd)
+
+        if not topdown:
+            yield toppath, dirs, nondirs, topfd
+
+    __all__.append("fdwalk")
+
 # Make sure os.environ exists, at least
 try:
     environ
@@ -598,9 +695,6 @@
 fsencode, fsdecode = _fscodec()
 del _fscodec
 
-def _exists(name):
-    return name in globals()
-
 # Supply spawn*() (probably only for Unix)
 if _exists("fork") and not _exists("spawnv") and _exists("execv"):
 
diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py
--- a/Lib/test/test_os.py
+++ b/Lib/test/test_os.py
@@ -20,6 +20,8 @@
 import asyncore
 import asynchat
 import socket
+import itertools
+import stat
 try:
     import threading
 except ImportError:
@@ -147,7 +149,6 @@
         if not hasattr(os, "stat"):
             return
 
-        import stat
         result = os.stat(fname)
 
         # Make sure direct access works
@@ -464,7 +465,7 @@
 class WalkTests(unittest.TestCase):
     """Tests for os.walk()."""
 
-    def test_traversal(self):
+    def setUp(self):
         import os
         from os.path import join
 
@@ -569,6 +570,58 @@
                     os.remove(dirname)
         os.rmdir(support.TESTFN)
 
+@unittest.skipUnless(hasattr(os, 'fdwalk'), "Test needs os.fdwalk()")
+class FdWalkTests(WalkTests):
+    """Tests for os.fdwalk()."""
+
+    def test_compare_to_walk(self):
+        # compare with walk() results
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            expected = {}
+            for root, dirs, files in os.walk(*args):
+                expected[root] = (set(dirs), set(files))
+
+            for root, dirs, files, rootfd in os.fdwalk(*args):
+                self.assertIn(root, expected)
+                self.assertEqual(expected[root], (set(dirs), set(files)))
+
+    def test_dir_fd(self):
+        # check returned file descriptors
+        for topdown, followlinks in itertools.product((True, False), repeat=2):
+            args = support.TESTFN, topdown, None, followlinks
+            for root, dirs, files, rootfd in os.fdwalk(*args):
+                # check that the FD is valid
+                os.fstat(rootfd)
+                # check that fdlistdir() returns consistent information
+                self.assertEqual(set(os.fdlistdir(rootfd)), set(dirs) | set(files))
+
+    def test_fd_leak(self):
+        # Since we're opening a lot of FDs, we must be careful to avoid leaks:
+        # we both check that calling fdwalk() a large number of times doesn't
+        # yield EMFILE, and that the minimum allocated FD hasn't changed.
+        minfd = os.dup(1)
+        os.close(minfd)
+        for i in range(512):
+            for x in os.fdwalk(support.TESTFN):
+                pass
+        newfd = os.dup(1)
+        self.addCleanup(os.close, newfd)
+        self.assertEqual(newfd, minfd)
+
+    def tearDown(self):
+        # cleanup
+        for root, dirs, files, rootfd in os.fdwalk(support.TESTFN, topdown=False):
+            for name in files:
+                os.unlinkat(rootfd, name)
+            for name in dirs:
+                st = os.fstatat(rootfd, name, os.AT_SYMLINK_NOFOLLOW)
+                if stat.S_ISDIR(st.st_mode):
+                    os.unlinkat(rootfd, name, os.AT_REMOVEDIR)
+                else:
+                    os.unlinkat(rootfd, name)
+        os.rmdir(support.TESTFN)
+
 class MakedirTests(unittest.TestCase):
     def setUp(self):
         os.mkdir(support.TESTFN)
@@ -1683,6 +1736,7 @@
         StatAttributeTests,
         EnvironTests,
         WalkTests,
+        FdWalkTests,
         MakedirTests,
         DevNullTests,
         URandomTests,