simple ElementTree based parser that allows entity definition map

Robin Becker robin at reportlab.com
Wed Dec 4 08:42:44 EST 2013


I'm tasked with writing a 'simple' ElementTree-based parser with support for 
unknown entities, e.g. &foo;.

This code, derived from FL's old documentation, fails in both Python 2 and 3.
########################
import xml.etree.ElementTree as ET
try:
     ascii
except:
     from future_builtins import ascii

class EchoTarget:
    """Parser target that prints each event it is handed.

    Every callback writes one line to stdout. Looking up any attribute
    the class does not define is also reported (see ``__getattr__``).
    """

    def start(self, tag, attrib):
        """Print the opening tag name and the ascii() form of its attributes."""
        message = "start %s %s" % (tag, ascii(attrib))
        print(message)

    def end(self, tag):
        """Print the closing tag name."""
        message = "end %s" % tag
        print(message)

    def data(self, data):
        """Print the ascii() form of a chunk of character data."""
        message = "data %s" % ascii(data)
        print(message)

    def close(self):
        """Print a marker when the target is closed."""
        print("close")

    def __getattr__(self, a):
        """Report a lookup of an attribute this class does not define.

        Only invoked when normal attribute lookup fails. The name is
        printed and ``None`` is handed back, so such lookups never raise
        ``AttributeError``.
        """
        message = 'target attempting to get attribute %s' % a
        print(message)
        return None

# Drive an XMLParser with the echoing target so every parse event is printed.
target = EchoTarget()
parser = ET.XMLParser(target=target)
# Register two custom entities in the parser's entity mapping; the value
# stored for 'foo' itself contains a reference to 'fum'.
parser.entity['foo'] = b'AAAA&fum;BBBB'
parser.entity['fum'] = b'CCCC'
# Show the entity mapping as configured, before any input is parsed.
print("parser.entity=%s" % ascii(parser.entity))
# Parse a small document whose text refers to the custom &foo; entity.
parser.feed("<element>some text &foo;</element>")
parser.feed("")  # feed an empty chunk before closing
parser.close()
########################

The entity value doesn't seem to get referenced.



I tried this derived from
http://stackoverflow.com/questions/7237466/python-elementtree-support-for-parsing-unknown-xml-entities

########################
# Names exported by ``from <module> import *``.
__all__ = (
    'Xml2TT',
    'EntityMap',
)
import xml.etree.ElementTree as ET
try:
     from StringIO import StringIO
except ImportError:
     from io import StringIO

class EntityMap(dict):
    """Entity mapping that echoes undefined entities back as references.

    Subscripting with a name that is present returns the stored
    replacement text. Subscripting with a name that is absent returns the
    reference itself (``'&' + key + ';'``) instead of raising ``KeyError``.
    """

    def __getitem__(self, key):
        """Return the replacement for *key*, or ``'&key;'`` if undefined."""
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            # Only a genuinely missing key falls back to the literal
            # reference; any other failure (e.g. raised while hashing the
            # key, or KeyboardInterrupt) is allowed to propagate.
            return '&' + key + ';'

class Xml2TT:
    """Callable that converts an XML string into a nested "tuple tree".

    Each element becomes ``(tag, attrib, children, None)`` where
    ``children`` is a list mixing text strings and child entries in
    document order. With ``mutable=True`` each entry is a 4-item list
    instead of a tuple.
    """

    def __init__(self, mutable=False, entityMap=None):
        """Create the parser and install the entity mapping.

        mutable   -- emit list entries rather than tuple entries
        entityMap -- object assigned to the parser's ``entity`` attribute
        """
        self._mutable = mutable
        parser = ET.XMLParser()
        self._parser = parser
        # NOTE(review): relies on the XMLParser exposing its underlying
        # expat parser as ``parser``; verify this attribute exists on the
        # Python version in use.
        parser.parser.UseForeignDTD(True)
        parser.entity = entityMap
        self._entityMap = entityMap

    def __call__(self, xml):
        """Parse the stripped *xml* string and return the root's entry."""
        source = StringIO(xml.strip())
        root = ET.ElementTree().parse(source, parser=self._parser)
        return self._mtt(root)[0]

    def _mtt(self, node):
        """Convert *node* recursively.

        Returns a list holding the node's entry, followed by the node's
        tail text when that is non-empty, so a parent can splice the
        result straight into its own children list.
        """
        children = []
        if node.text:
            children.append(node.text)
        for child in node:
            children += self._mtt(child)
        entry = (node.tag, node.attrib, children, None)
        if self._mutable:
            entry = list(entry)
        result = [entry]
        if node.tail:
            result.append(node.tail)
        return result

if __name__=='__main__':
    # Nested elements with text, an attribute and tail text; no entities.
    print(repr(Xml2TT()('<a>aaaaa<b>bbbb<c ca="123"/>22</b></a>')))
    # Only the predefined XML entities (&amp; &lt; &gt;).
    print(repr(Xml2TT()('<a>aaaaa=&amp;=bbbbb&lt; &gt;</a>')))
    # Predefined entities mixed with mapped (mu, foo) and unmapped (moo)
    # custom entities; the replacement for foo itself mentions &fum;.
    entities = EntityMap({'mu': '&#x85;', 'foo': 'AAA&fum;BBB', 'fum': 'CCC'})
    print(repr(Xml2TT(entityMap=entities)(
        '<a>amp=&amp; moo=&moo; lt=&lt; gt=&gt; mu=&mu; foo=&foo;</a>')))
########################

and it sort of works in python2, fails in python3 with

AttributeError: 'xml.etree.ElementTree.XMLParser' object has no attribute
'parser'

Even in python 2 there's a subtle bug as the output is

('a', {}, ['aaaaa', ('b', {}, ['bbbb', ('c', {'ca': '123'}, [], None), '22'], 
None)], None)
('a', {}, ['aaaaa=&=bbbbb< >'], None)
('a', {}, [u'amp=& moo=&moo; lt=< gt=> mu=&#x85; foo=AAA&fum;BBB'], None)

i.e. the result of the &foo; lookup is not re-parsed, so &fum; is not translated.

Is there a way to get a simple ElementTree based parser that can do what I want? 
I have several hundred entities and the size of the DTD would probably be larger 
than 99% of the strings I need to parse. I think I can live with the 
non-reparsing of the map output, but can I get Python 3 to do the UseForeignDTD 
thing?
-- 
Robin Becker




More information about the Python-list mailing list