How to get xml.etree.ElementTree not to bomb on invalid characters in an XML file?

Tue May 4 03:01:35 EDT 2010

Hi,

I'm parsing XML files using ElementTree from xml.etree (see the code below, which is also attached as xml_parse_example.py).

However, I'm coming across input XML files (an example, tmp.xml, is attached) that include invalid characters, which produce the following traceback:

$ python xml_parse_example.py
Traceback (most recent call last):
  File "xml_parse_example.py", line 63, in <module>
    tree = xml2dict.open_and_parse_xml_file()
  File "xml_parse_example.py", line 14, in open_and_parse_xml_file
    tree = ElementTree.parse(f)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 862, in parse
    tree.parse(source, parser)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in parse
    parser.feed(data)
  File "c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in feed
    self._parser.Parse(data, 0)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, column 34

I read the documentation for xml.etree.ElementTree and see that ElementTree.parse() may take an optional parser parameter, but I don't know what this parser should be in order to ignore the invalid characters.

Could you suggest a way to call ElementTree so that it won't bomb on these invalid characters?

Thanks,
Ron.

________________________________

#!/usr/bin/env python

from xml.etree import ElementTree
import pprint

# Module-level switch read by XmlFileToDict: when true, every generated entry
# also stores the element's ``tail`` text under a 'tail' key; when false the
# key is left out.
compute_tail = False

class XmlFileToDict(object):
    """Convert an XML file into nested plain dicts and lists.

    Each element becomes ``{tag: entry}`` where ``entry`` holds the element's
    value, its attributes and (when the module-level ``compute_tail`` flag is
    true) its tail text.
    """

    def __init__(self, xml_file_path):
        """Remember the path of the XML file to convert.

        xml_file_path -- path handed to ``open()`` when parsing.
        """
        self.xml_file_path = xml_file_path

    def open_and_parse_xml_file(self):
        """Parse the file at ``self.xml_file_path`` and return the ElementTree.

        The file is opened in *binary* mode so the XML parser itself decodes
        the bytes according to the document's ``<?xml ... encoding=...?>``
        declaration.  In text mode the bytes would first be decoded with the
        platform's default encoding, which breaks (or silently corrupts)
        documents declared in any other encoding.

        Errors raised by ``open()`` or by ``ElementTree.parse()`` (e.g. the
        parser's "not well-formed" error) propagate unchanged; no recovery
        from malformed input is attempted here.
        """
        with open(self.xml_file_path, 'rb') as f:
            tree = ElementTree.parse(f)
        return tree

    def _make_entry(self, value, node, attrib_key='attributes'):
        """Build the per-element dict shared by dict_list and xml_to_dict.

        value      -- stored under 'value' (element text or list of children).
        node       -- element whose ``attrib`` (and optionally ``tail``) is used.
        attrib_key -- key under which ``node.attrib`` is stored.
        """
        entry = {'value': value, attrib_key: node.attrib}
        # compute_tail is looked up on every call, so changing the module
        # flag at runtime takes effect immediately.
        if compute_tail:
            entry['tail'] = node.tail
        return entry

    def dict_list(self, node):
        """Return ``{node.tag: entry}`` describing ``node`` and its subtree.

        ``entry['value']`` is the list filled in by ``xml_to_dict``.
        NOTE: the attributes are stored under 'attribs' here, whereas
        ``xml_to_dict`` uses 'attributes'.  The mismatch is kept on purpose so
        the shape of the output does not change for existing callers.
        """
        children = []
        self.xml_to_dict(node, children)
        return {node.tag: self._make_entry(children, node, attrib_key='attribs')}

    def xml_to_dict(self, node, res):
        """Append dict representations of ``node``'s content to ``res``.

        * If ``node`` has no children, one ``{node.tag: entry}`` dict is
          appended, with the element text as the entry's value.
        * Otherwise one ``{child.tag: entry}`` dict is appended per child, in
          document order; a child that itself has children gets the list of
          its converted children as the entry's value.

        ``res`` is modified in place; the method returns None.
        """
        # len() rather than truthiness: the check is about the child count.
        if not len(node):
            res.append({node.tag: self._make_entry(node.text, node)})
            return

        for child in node:
            converted = []
            self.xml_to_dict(child, converted)
            if len(child):
                # ``converted`` holds the grandchildren; wrap them under the
                # child's own tag.
                res.append({child.tag: self._make_entry(converted, child)})
            else:
                # A leaf child yields exactly one ready-made dict.
                res.append(converted[0])

if __name__ == '__main__':
    # Demo run: convert the sample file in the current directory and
    # pretty-print the resulting nested dict.
    xml_file_path = 'tmp.xml'
    xml2dict = XmlFileToDict(xml_file_path)

    # Parse first, then convert starting from the document's root element.
    tree = xml2dict.open_and_parse_xml_file()
    xml_dict = xml2dict.dict_list(tree.getroot())

    pprint.pprint(xml_dict)

________________________________

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20100504/be038d45/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tmp.xml
Type: application/xml
Size: 637 bytes
Desc: tmp.xml
URL: <http://mail.python.org/pipermail/python-list/attachments/20100504/be038d45/attachment.xml>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: xml_parse_example.py
Type: application/octet-stream
Size: 2294 bytes
Desc: xml_parse_example.py
URL: <http://mail.python.org/pipermail/python-list/attachments/20100504/be038d45/attachment.obj>