Trying to get cleaner XML output from a text file

iainemsley iainemsley at googlemail.com
Fri May 29 13:09:10 EDT 2009


I'm using Python2.5 to try and convert some text files into XML using
xml.minidom. I'm currently doing some plays which have a structure
like
Scene 1
Act 1
blah blah
Act2
blah blah
Scene 2
Act 1
and so on.
I'm trying to turn it into
<div type="scene">1
  <div type="act">1
   <speech />
  </div>
  <div type="act">2
   <speech />
  </div>
</div>
(or ideally <div type="scene" id="1"> bit I can always come back to
this later)
I've currently got:
 <div id="" type="scene">
  <div id=" " type="act">
   <speech>
    II
   </speech>
  </div>
 </div>
 <div id="" type="scene">
  <div id=" " type="act">
   <speech>
    II
   </speech>
  </div>
 </div>
 <div id="" type="scene">
  <div id=" " type="act">
The code I'm currently working with is:
from itertools import groupby
from xml.dom.minidom import Document

import re

text = open('\\texts\\midsummer_nights_dream_gut.txt').read()

def paragraphs(lines, is_separator=str.isspace, joiner=''.join):
    for separator_group, lineiter in groupby(lines, key=is_separator):
        if not separator_group:
            yield joiner(lineiter)

def scene_node(scene):
    global docText
    docText = doc.createElement("div")
    #need to set the type to book, verse, drama
    docText.setAttribute("type", "scene")
    #need set the id to what ever break name or id: i.e. chapter 1 or
act 1
    docText.setAttribute("id", '')
    tei.appendChild(docText)
    for acts in actTxt.split(scene):
        act_node(acts)

def act_node(act):
    global actText
    actText = doc.createElement("div")
    #need to set the type to book, verse, drama
    actText.setAttribute("type", "act")
    #need set the id to what ever id: 1 or I
    actText.setAttribute("id", ' ')
    docText.appendChild(actText)
    for p in paragraphs(act.splitlines(True)):
        speech_node(p)

def speech_node(speech):
        para = doc.createElement("speech")
        actText.appendChild(para)
        ptext = doc.createTextNode(speech)
        para.appendChild(ptext)


doc = Document()
tei = doc.createElement("body")
doc.appendChild(tei)

sideTxt = re.compile(r"Scene\s+([1-9])", re.I)
actTxt = re.compile(r"Act\s+([1-9])", re.I)
for textStr in sideTxt.split(text):
    scene_node(textStr)

print doc.toprettyxml(indent = " ")
I'd be grateful for some pointers about getting a cleaner output.

Thanks,

Iain



More information about the Python-list mailing list