[XML-SIG] Text in ElementTree?

Fri Apr 15 19:47:12 CEST 2005

I'm trying to write an example of how to move elements around in a 
document using ElementTree.  The objective is to take things like this:

     <em><h1>Heading</h1></em>

and turn them into:

     <h1><em>Heading</em></h1>

i.e., put the emphasis inside the h1-h4 elements, instead of the other 
way around.

It's almost working, but I'm still having trouble handling nodes whose 
children are interspersed text and elements.

The script below shows the problem.  Run it from the command line with
no argument, and it'll break on the 'Single Inversion' text.  Run it 
again with 'movetext' as its only argument, and it'll break on the last
test (in which the body element has strings, em+heading, and other 
elements as children).

I think the root of my problem is that I don't understand how 
ElementTree stores text --- if you have:

     <p> a <b>c</b> d <e> f <g/></e> </p>

then what are p's children?  What is p.text?  What happens if you assign 
a new value to p.text?

Thanks,
Greg

(Note: if your news reader breaks the 'Mixed Content' and 'Nested' test 
cases across lines, you may have to edit them.)

import sys
from cElementTree import Element, fromstring, tostring

# from visitor import Visitor
class Visitor(object):

     def __init__(self):
         pass

     def visit(self, root):
         self.beforeAll(root)
         self.traverse(root)
         self.afterAll(root)

     def traverse(self, current):
         self.beforeNode(current)
         self.atNode(current)
         for child in current:
             self.traverse(child)
         self.afterNode(current)

     def doNothing(self, node):
         pass

     beforeAll = doNothing
     afterAll = doNothing
     beforeNode = doNothing
     afterNode = doNothing
     atNode = doNothing

HeadingTags = ('h1', 'h2', 'h3', 'h4')

def containsOnlyHeading(node):
     '''Does a node contain only a single heading?'''
     return (len(node) == 1) and \
            (node[0].tag in HeadingTags)

class Finder(Visitor):
     '''Locate all nodes in a tree that have emphasized nodes containing
     a single heading as children.'''

     def beforeAll(self, root):
         self.nodes = []

     def atNode(self, node):
         for child in node:
             if (child.tag == 'em') and containsOnlyHeading(child):
                 self.nodes.append(node)
                 return

def transform(parent):
     '''Transform a node that has emphasized children containing 
headings.'''

     print '..parent', tostring(parent)

     # Helper function to locate a child in a parent.
     def findIndex(parent, child):
         for i in range(len(parent)):
             if parent[i] is child:
                 return i
         return -1

     # Get all emphasized nodes, and filter to get the ones to be modified.
     allEmph = parent.findall('em')
     allEmph = [x for x in allEmph if containsOnlyHeading(x)]
     assert allEmph

     # Transform each in turn.
     for emph in allEmph:

         print '....emph', tostring(emph)

         # Get the heading.
         assert len(emph) == 1
         heading = emph[0]
         assert heading.tag in HeadingTags

         print '....heading', tostring(heading)

         # Take the heading out of the emphasized node.
         emph.remove(heading)

         print '....after removing heading, emph is', tostring(emph)

         # Put the heading in the parent in the emphasized node's place.
         loc = findIndex(parent, emph)
         assert loc >= 0
         parent[loc] = heading

         print '....after putting heading in emph place, parent is', 
tostring(parent)

         # Move the heading's children and text to the emphasized node.
         if 'movetext' in sys.argv[1:]:
             emph.text = heading.text
             heading.text = None
             print '....after moving text, heading is', 
tostring(heading), 'and emph is', tostring(emph)
         else:
             print '....not moving text'
         while len(heading):
             child = heading[0]
             emph.append(child)
             heading.remove(child)
             print '......after moving', tostring(child), 'emph is', 
tostring(emph), 'and heading is', tostring(heading)

         # Make the emphasized node the heading's only child.
         heading.append(emph)
         print 'after attaching emph to heading, heading is', 
tostring(heading)

def normalize(root):
     '''Normalize an entire document.'''
     f = Finder()
     f.visit(root)
     for node in f.nodes:
         transform(node)

if __name__ == '__main__':

     tests = (
         ('Empty',
          '<empty />',
          '<empty />'),

         ('Single',
          '<single><child /></single>',
          '<single><child /></single>'),

         ('Em Only',
          '<html><em>unchanged</em></html>',
          '<html><em>unchanged</em></html>'),

         ('H1 Only',
          '<html><h1>unchanged</h1></html>',
          '<html><h1>unchanged</h1></html>'),

         ('Already Normalized',
          '<html><h1><em>unchanged</em></h1></html>',
          '<html><h1><em>unchanged</em></h1></html>'),

         ('Single Inversion',
          '<html><em><h1>changed</h1></em></html>',
          '<html><h1><em>changed</em></h1></html>'),

         ('Mixed Content',
          '<html><em><h1><b>change</b> this <b>and</b> 
that</h1></em></html>',
          '<html><h1><em><b>change</b> this <b>and</b> 
that</em></h1></html>'),

         ('Nested',
          '<html><body><em><h2>x</h2></em> <p>space</p> 
<em><h3>y</h3></em> space</body></html>',
          '<html><body><h2><em>x</em></h2> <p>space</p> 
<h3><em>y</em></h3> space</body></html>')
     )

     for (name, input, expected) in tests:
         print name
         print 'INPUT', input
         doc = fromstring(input)
         normalize(doc)
         actual = tostring(doc)
         print 'EXPECTED', expected
         print 'ACTUAL', actual
         print
         assert actual == expected