[XML-SIG] python, xml, html tags
Alexey U. Gudchenko
prog at goodok.ru
Tue Mar 29 00:31:03 CEST 2005
Здравствуйте, Necati.
Вы писали 28 марта 2005 г., 22:01:36:
ND> Hi,
ND> I can't do something with Python and XML.
ND> i have the following file;
ND> <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
ND> <test>
ND> <content> Hello </content>
ND> <content> <b> Hello </b> </content>
ND> </test>
ND> Ok. it is simple :)
ND> And i have the following python codes;
ND> #!/usr/bin/python
ND> from xml.dom import minidom
ND> file = open("test.xml","r")
ND> xml = minidom.parse(file)
ND> print
ND> xml.childNodes[0].getElementsByTagName("content")[0].firstChild.data
ND> print
ND> xml.childNodes[0].getElementsByTagName("content")[1].firstChild.data
ND> Again simple one :)
ND> But when i run these codes, i have the following output;
ND> Hello
ND> How can i access the second one. Yes, i know it contains html tags so it
ND> doesn't give me the result. I wanna get whole of the content as data.
ND> How can i do this?
Try this code and read the comments:
-----------------------------------
from xml.dom import minidom
file = open("test.xml","r")
oDoc = minidom.parse(file)
oRoot = oDoc.childNodes[0] # <test>
oContesnts = oRoot.getElementsByTagName("content") # [<content>, <content>]
# your code
print "\tyour code"
print oContesnts[0].firstChild.data
print oContesnts[1].firstChild.data
# " Hello
# "
# See manual: "data - The content of the _text_node_(!) as a string. "
# indeed:
print "\t node Types"
print oContesnts[0].firstChild.nodeType == oDoc.TEXT_NODE # True
print oContesnts[1].firstChild.nodeType == oDoc.TEXT_NODE # True
print
# for example if you'll try this xml
#<test>
# <content>{thisTextNnode1...} Hello </content>
# <content>{thisTextNnode2}<b> Hello Other </b> </content>
# </test>
# you get this output:
#"{thisTextNnode2}Hello
#{thisTextNnode2}"
#
# So minidom think each text between the tags as node (textNode)
# print "oContesnts[1] has %d childs - not 1 !" % len(oContesnts[1].childNodes)
# print
# Three childs are: text node {thisTextNnode2}, element node <b>..</b>,
# and text node of space between </b> and </content> closing tags/
# if you want to get the content as plain text then try recursive:
def getInnerText(oNode):
rc = ""
nodelist = oNode.childNodes
for node in nodelist:
if node.nodeType == node.TEXT_NODE:
rc = rc + node.data
elif node.nodeType==node.ELEMENT_NODE:
rc = rc + getInnerText(node) # recursive !!!
elif node.nodeType==node.CDATA_SECTION_NODE:
rc = rc + node.data
else:
# node.nodeType: PROCESSING_INSTRUCTION_NODE, COMMENT_NODE, DOCUMENT_NODE, NOTATION_NODE and so on
pass
return rc
# Show the flattened text of both <content> elements.
# print(...) with a single argument behaves identically on Python 2 and 3.
print("\tInnerText:")
print(getInnerText(oContesnts[0]))
print(getInnerText(oContesnts[1]))
print("")
# Expected output for the sample XML with the {thisTextNnode...} markers:
# "{thisTextNnode1}Hello
# {thisTextNnode2} Hello Other"
# And if you want to retrieve structure (innerHTML analog)
def getInnerHTML(oNode):
rc = ""
nodelist = oNode.childNodes
for node in nodelist:
rc = rc + node.toxml()
return rc
# Show the children of both <content> elements with their markup preserved.
# print(...) with a single argument behaves identically on Python 2 and 3.
print("\tInner as Structure:")
print(getInnerHTML(oContesnts[0]))
print(getInnerHTML(oContesnts[1]))
print("")
# Or, if you want the outer XML (which is itself well-formed XML), just:
print("\tFull nodes printing:")
print(oContesnts[0].toxml())  # or use the .toprettyxml() method
print(oContesnts[1].toxml())
print("")
-----------------------------------
Output:
==================================================
your code
{thisTextNnode1}Hello
{thisTextNnode2}
node Types
True
True
oContesnts[1] has 3 childs - not 1 !
InnerText:
{thisTextNnode1}Hello
{thisTextNnode2} Hello2
Inner as Structure:
{thisTextNnode1}Hello
{thisTextNnode2}<b> Hello2 </b>
Full nodes printing:
<content>{thisTextNnode1}Hello </content>
<content>{thisTextNnode2}<b> Hello2 </b> </content>
==================================================
--
С уважением,
Alexey mailto:prog at goodok.ru
More information about the XML-SIG
mailing list