[XML-SIG] Text in ElementTree?
Greg Wilson
gvwilson at cs.utoronto.ca
Fri Apr 15 19:47:12 CEST 2005
I'm trying to write an example of how to move elements around in a
document using ElementTree. The objective is to take things like this:
<em><h1>Heading</h1></em>
and turn them into:
<h1><em>Heading</em></h1>
i.e., put the emphasis inside the h1-h4 elements, instead of the other
way around.
It's almost working, but I'm still having trouble handling nodes whose
children are interspersed text and elements.
The script below shows the problem. Run it from the command line with
no argument, and it'll break on the 'Single Inversion' text. Run it
again with 'movetext' as its only argument, and it'll break on the last
test (in which the body element has strings, em+heading, and other
elements as children).
I think the root of my problem is that I don't understand how
ElementTree stores text --- if you have:
<p> a <b>c</b> d <e> f <g/></e> </p>
then what are p's children? What is p.text? What happens if you assign
a new value to p.text?
Thanks,
Greg
(Note: if your news reader breaks the 'Mixed Content' and 'Nested' test
cases across lines, you may have to edit them.)
import sys
from cElementTree import Element, fromstring, tostring
# from visitor import Visitor
class Visitor(object):
def __init__(self):
pass
def visit(self, root):
self.beforeAll(root)
self.traverse(root)
self.afterAll(root)
def traverse(self, current):
self.beforeNode(current)
self.atNode(current)
for child in current:
self.traverse(child)
self.afterNode(current)
def doNothing(self, node):
pass
beforeAll = doNothing
afterAll = doNothing
beforeNode = doNothing
afterNode = doNothing
atNode = doNothing
HeadingTags = ('h1', 'h2', 'h3', 'h4')
def containsOnlyHeading(node):
'''Does a node contain only a single heading?'''
return (len(node) == 1) and \
(node[0].tag in HeadingTags)
class Finder(Visitor):
'''Locate all nodes in a tree that have emphasized nodes containing
a single heading as children.'''
def beforeAll(self, root):
self.nodes = []
def atNode(self, node):
for child in node:
if (child.tag == 'em') and containsOnlyHeading(child):
self.nodes.append(node)
return
def transform(parent):
'''Transform a node that has emphasized children containing
headings.'''
print '..parent', tostring(parent)
# Helper function to locate a child in a parent.
def findIndex(parent, child):
for i in range(len(parent)):
if parent[i] is child:
return i
return -1
# Get all emphasized nodes, and filter to get the ones to be modified.
allEmph = parent.findall('em')
allEmph = [x for x in allEmph if containsOnlyHeading(x)]
assert allEmph
# Transform each in turn.
for emph in allEmph:
print '....emph', tostring(emph)
# Get the heading.
assert len(emph) == 1
heading = emph[0]
assert heading.tag in HeadingTags
print '....heading', tostring(heading)
# Take the heading out of the emphasized node.
emph.remove(heading)
print '....after removing heading, emph is', tostring(emph)
# Put the heading in the parent in the emphasized node's place.
loc = findIndex(parent, emph)
assert loc >= 0
parent[loc] = heading
print '....after putting heading in emph place, parent is',
tostring(parent)
# Move the heading's children and text to the emphasized node.
if 'movetext' in sys.argv[1:]:
emph.text = heading.text
heading.text = None
print '....after moving text, heading is',
tostring(heading), 'and emph is', tostring(emph)
else:
print '....not moving text'
while len(heading):
child = heading[0]
emph.append(child)
heading.remove(child)
print '......after moving', tostring(child), 'emph is',
tostring(emph), 'and heading is', tostring(heading)
# Make the emphasized node the heading's only child.
heading.append(emph)
print 'after attaching emph to heading, heading is',
tostring(heading)
def normalize(root):
'''Normalize an entire document.'''
f = Finder()
f.visit(root)
for node in f.nodes:
transform(node)
if __name__ == '__main__':
tests = (
('Empty',
'<empty />',
'<empty />'),
('Single',
'<single><child /></single>',
'<single><child /></single>'),
('Em Only',
'<html><em>unchanged</em></html>',
'<html><em>unchanged</em></html>'),
('H1 Only',
'<html><h1>unchanged</h1></html>',
'<html><h1>unchanged</h1></html>'),
('Already Normalized',
'<html><h1><em>unchanged</em></h1></html>',
'<html><h1><em>unchanged</em></h1></html>'),
('Single Inversion',
'<html><em><h1>changed</h1></em></html>',
'<html><h1><em>changed</em></h1></html>'),
('Mixed Content',
'<html><em><h1><b>change</b> this <b>and</b>
that</h1></em></html>',
'<html><h1><em><b>change</b> this <b>and</b>
that</em></h1></html>'),
('Nested',
'<html><body><em><h2>x</h2></em> <p>space</p>
<em><h3>y</h3></em> space</body></html>',
'<html><body><h2><em>x</em></h2> <p>space</p>
<h3><em>y</em></h3> space</body></html>')
)
for (name, input, expected) in tests:
print name
print 'INPUT', input
doc = fromstring(input)
normalize(doc)
actual = tostring(doc)
print 'EXPECTED', expected
print 'ACTUAL', actual
print
assert actual == expected
More information about the XML-SIG
mailing list