intolerant HTML parser

Phlip phlip2005 at gmail.com
Mon Feb 8 13:16:22 EST 2010


and the tweak is:

                parser = etree.HTMLParser(recover=False)
                return etree.HTML(xml, parser)

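That reduces tolerance: with recover=False the HTML parser reports
malformed markup as an error instead of silently repairing it. The
entire assert_xml() is (apologies for wrapping lines!):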

    def _xml_to_tree(self, xml):
        """Parse *xml* into an lxml tree and remember the source in ``self._xml``.

        Input whose first 200 characters contain ``'<html'`` is parsed as HTML
        with a non-recovering parser; anything else is parsed as XML.

        If parsing raises ``ValueError``, *xml* itself is returned unchanged
        and ``self._xml`` is replaced by ``str(xml)``.
        """
        from lxml import etree
        self._xml = xml

        try:
            if '<html' in xml[:200]:  #  NOTE the condition COULD suck more!
                # recover=False is the "tweak": it reduces the parser's
                # tolerance for broken markup.
                parser = etree.HTMLParser(recover=False)
                return etree.HTML(xml, parser)
            else:
                return etree.XML(xml)

        except ValueError:  #  TODO  don't rely on exceptions for normal control flow
            # NOTE(review): presumably this path is hit when `xml` is already
            # a parsed node (e.g. one returned by a previous assert_xml call)
            # rather than a string -- confirm against lxml's behavior.
            tree = xml
            self._xml = str(tree)  #  CONSIDER  does this reconstitute the nested XML ?
            return tree

    def assert_xml(self, xml, xpath, **kw):
        """Check that a given extent of XML or HTML contains a given XPath,
        and return its first node.

        xml   -- the markup to search; it is converted by _xml_to_tree().
        xpath -- the XPath expression to evaluate against the tree.
        kw    -- optional flags; a true ``verbose`` passes the matched node
                 to reveal_xml() for diagnosis.

        Fails the test (via assertTrue) when the XPath matches nothing.
        """
        tree = self._xml_to_tree(xml)
        nodes = tree.xpath(xpath)
        self.assertTrue(len(nodes) > 0, xpath + ' not found in ' + self._xml)
        node = nodes[0]
        #  "here have ye been? What have ye seen?"--Morgoth
        if kw.get('verbose', False):
            self.reveal_xml(node)
        return node

    def reveal_xml(self, node):
        """Spew an XML node as source, for diagnosis.

        node -- the lxml node to serialize and print to standard output.
        """
        from lxml import etree
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also compiles under Python 3.
        #  CONSIDER  does pretty_print work? why not?
        print(etree.tostring(node, pretty_print=True))

    def deny_xml(self, xml, xpath):
        """Check that a given extent of XML or HTML does not contain a
        given XPath.

        xml   -- the markup to search; it is converted by _xml_to_tree().
        xpath -- the XPath expression that must match nothing.

        Fails the test (via assertEqual) when the XPath matches any node.
        """
        tree = self._xml_to_tree(xml)
        nodes = tree.xpath(xpath)
        self.assertEqual(0, len(nodes),
                         xpath + ' should not appear in ' + self._xml)



More information about the Python-list mailing list