[Python-checkins] bpo-20928: support base-URL and recursive includes in etree.ElementInclude (#5723)

Stefan Behnel webhook-mailer at python.org
Mon Nov 25 10:36:29 EST 2019


https://github.com/python/cpython/commit/c6a7bdb356835c9d7513b1ea6846683d446fe6c3
commit: c6a7bdb356835c9d7513b1ea6846683d446fe6c3
branch: master
author: Stefan Behnel <stefan_ml at behnel.de>
committer: GitHub <noreply at github.com>
date: 2019-11-25T16:36:25+01:00
summary:

bpo-20928: support base-URL and recursive includes in etree.ElementInclude (#5723)

* bpo-20928: bring elementtree's XInclude support en-par with the implementation in lxml by adding support for recursive includes and a base-URL.

* bpo-20928: Support xincluding the same file multiple times, just not recursively.

* bpo-20928: Add 'max_depth' parameter to xinclude that limits the maximum recursion depth to 6 by default.

* Add news entry for updated ElementInclude support

files:
A Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst
M Lib/test/test_xml_etree.py
M Lib/xml/etree/ElementInclude.py

diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 9fbb4ac1813ac..09c234ca6890a 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -1668,6 +1668,17 @@ def test_unknown_event(self):
 </document>
 """.format(html.escape(SIMPLE_XMLFILE, True))
 
+XINCLUDE["include_c1_repeated.xml"] = """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+  <p>The following is the source code of Recursive1.xml:</p>
+  <xi:include href="C1.xml"/>
+  <xi:include href="C1.xml"/>
+  <xi:include href="C1.xml"/>
+  <xi:include href="C1.xml"/>
+</document>
+"""
+
 #
 # badly formatted xi:include tags
 
@@ -1688,6 +1699,31 @@ def test_unknown_event(self):
 </div>
 """
 
+XINCLUDE["Recursive1.xml"] = """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+  <p>The following is the source code of Recursive2.xml:</p>
+  <xi:include href="Recursive2.xml"/>
+</document>
+"""
+
+XINCLUDE["Recursive2.xml"] = """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+  <p>The following is the source code of Recursive3.xml:</p>
+  <xi:include href="Recursive3.xml"/>
+</document>
+"""
+
+XINCLUDE["Recursive3.xml"] = """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+  <p>The following is the source code of Recursive1.xml:</p>
+  <xi:include href="Recursive1.xml"/>
+</document>
+"""
+
+
 class XIncludeTest(unittest.TestCase):
 
     def xinclude_loader(self, href, parse="xml", encoding=None):
@@ -1789,6 +1825,13 @@ def test_xinclude(self):
             '  </ns0:include>\n'
             '</div>') # C5
 
+    def test_xinclude_repeated(self):
+        from xml.etree import ElementInclude
+
+        document = self.xinclude_loader("include_c1_repeated.xml")
+        ElementInclude.include(document, self.xinclude_loader)
+        self.assertEqual(1+4*2, len(document.findall(".//p")))
+
     def test_xinclude_failures(self):
         from xml.etree import ElementInclude
 
@@ -1821,6 +1864,45 @@ def test_xinclude_failures(self):
                 "xi:fallback tag must be child of xi:include "
                 "('{http://www.w3.org/2001/XInclude}fallback')")
 
+        # Test infinitely recursive includes.
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.FatalIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader)
+        self.assertEqual(str(cm.exception),
+                "recursive include of Recursive2.xml")
+
+        # Test 'max_depth' limitation.
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.FatalIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader, max_depth=None)
+        self.assertEqual(str(cm.exception),
+                "recursive include of Recursive2.xml")
+
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader, max_depth=0)
+        self.assertEqual(str(cm.exception),
+                "maximum xinclude depth reached when including file Recursive2.xml")
+
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader, max_depth=1)
+        self.assertEqual(str(cm.exception),
+                "maximum xinclude depth reached when including file Recursive3.xml")
+
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.LimitedRecursiveIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader, max_depth=2)
+        self.assertEqual(str(cm.exception),
+                "maximum xinclude depth reached when including file Recursive1.xml")
+
+        document = self.xinclude_loader("Recursive1.xml")
+        with self.assertRaises(ElementInclude.FatalIncludeError) as cm:
+            ElementInclude.include(document, self.xinclude_loader, max_depth=3)
+        self.assertEqual(str(cm.exception),
+                "recursive include of Recursive2.xml")
+
+
 # --------------------------------------------------------------------
 # reported bugs
 
diff --git a/Lib/xml/etree/ElementInclude.py b/Lib/xml/etree/ElementInclude.py
index 963470e3b15c7..5303062716c47 100644
--- a/Lib/xml/etree/ElementInclude.py
+++ b/Lib/xml/etree/ElementInclude.py
@@ -50,18 +50,28 @@
 
 import copy
 from . import ElementTree
+from urllib.parse import urljoin
 
 XINCLUDE = "{http://www.w3.org/2001/XInclude}"
 
 XINCLUDE_INCLUDE = XINCLUDE + "include"
 XINCLUDE_FALLBACK = XINCLUDE + "fallback"
 
+# For security reasons, the inclusion depth is limited to this read-only value by default.
+DEFAULT_MAX_INCLUSION_DEPTH = 6
+
+
 ##
 # Fatal include error.
 
 class FatalIncludeError(SyntaxError):
     pass
 
+
+class LimitedRecursiveIncludeError(FatalIncludeError):
+    pass
+
+
 ##
 # Default loader.  This loader reads an included resource from disk.
 #
@@ -92,13 +102,33 @@ def default_loader(href, parse, encoding=None):
 # @param loader Optional resource loader.  If omitted, it defaults
 #     to {@link default_loader}.  If given, it should be a callable
 #     that implements the same interface as <b>default_loader</b>.
+# @param base_url The base URL of the original file, to resolve
+#     relative include file references.
+# @param max_depth The maximum number of recursive inclusions.
+#     Limited to reduce the risk of malicious content explosion.
+#     Pass a negative value to disable the limitation.
+# @throws LimitedRecursiveIncludeError If the {@link max_depth} was exceeded.
 # @throws FatalIncludeError If the function fails to include a given
 #     resource, or if the tree contains malformed XInclude elements.
-# @throws OSError If the function fails to load a given resource.
+# @throws IOError If the function fails to load a given resource.
+# @returns the node or its replacement if it was an XInclude node
 
-def include(elem, loader=None):
+def include(elem, loader=None, base_url=None,
+            max_depth=DEFAULT_MAX_INCLUSION_DEPTH):
+    if max_depth is None:
+        max_depth = -1
+    elif max_depth < 0:
+        raise ValueError("expected non-negative depth or None for 'max_depth', got %r" % max_depth)
+
+    if hasattr(elem, 'getroot'):
+        elem = elem.getroot()
     if loader is None:
         loader = default_loader
+
+    _include(elem, loader, base_url, max_depth, set())
+
+
+def _include(elem, loader, base_url, max_depth, _parent_hrefs):
     # look for xinclude elements
     i = 0
     while i < len(elem):
@@ -106,14 +136,24 @@ def include(elem, loader=None):
         if e.tag == XINCLUDE_INCLUDE:
             # process xinclude directive
             href = e.get("href")
+            if base_url:
+                href = urljoin(base_url, href)
             parse = e.get("parse", "xml")
             if parse == "xml":
+                if href in _parent_hrefs:
+                    raise FatalIncludeError("recursive include of %s" % href)
+                if max_depth == 0:
+                    raise LimitedRecursiveIncludeError(
+                        "maximum xinclude depth reached when including file %s" % href)
+                _parent_hrefs.add(href)
                 node = loader(href, parse)
                 if node is None:
                     raise FatalIncludeError(
                         "cannot load %r as %r" % (href, parse)
                         )
-                node = copy.copy(node)
+                node = copy.copy(node)  # FIXME: this makes little sense with recursive includes
+                _include(node, loader, href, max_depth - 1, _parent_hrefs)
+                _parent_hrefs.remove(href)
                 if e.tail:
                     node.tail = (node.tail or "") + e.tail
                 elem[i] = node
@@ -123,11 +163,13 @@ def include(elem, loader=None):
                     raise FatalIncludeError(
                         "cannot load %r as %r" % (href, parse)
                         )
+                if e.tail:
+                    text += e.tail
                 if i:
                     node = elem[i-1]
-                    node.tail = (node.tail or "") + text + (e.tail or "")
+                    node.tail = (node.tail or "") + text
                 else:
-                    elem.text = (elem.text or "") + text + (e.tail or "")
+                    elem.text = (elem.text or "") + text
                 del elem[i]
                 continue
             else:
@@ -139,5 +181,5 @@ def include(elem, loader=None):
                 "xi:fallback tag must be child of xi:include (%r)" % e.tag
                 )
         else:
-            include(e, loader)
-        i = i + 1
+            _include(e, loader, base_url, max_depth, _parent_hrefs)
+        i += 1
diff --git a/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst b/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst
new file mode 100644
index 0000000000000..2585400907799
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-03-30-16-18-12.bpo-20928.ieXu6I.rst
@@ -0,0 +1 @@
+ElementTree supports recursive XInclude processing.  Patch by Stefan Behnel.



More information about the Python-checkins mailing list