[XML-SIG] python, xml, html tags

Tue Mar 29 00:31:03 CEST 2005

Здравствуйте, Necati.

Вы писали 28 марта 2005 г., 22:01:36:

ND> Hi,
ND> I can't do something with Python and XML.

ND> i have the following file;

ND> <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
ND>  <test>
ND>   <content> Hello </content>
ND>   <content> <b> Hello </b> </content>
ND>  </test>

ND> Ok. it is simple :)

ND> And i have the following python codes;

ND> #!/usr/bin/python
ND> from xml.dom import minidom

ND> file = open("test.xml","r")
ND> xml = minidom.parse(file)
ND> print
ND> xml.childNodes[0].getElementsByTagName("content")[0].firstChild.data
ND> print
ND> xml.childNodes[0].getElementsByTagName("content")[1].firstChild.data

ND> Again simple one :)

ND> But when i run these codes, i have the following output;
ND> Hello

ND> How can i access the second one. Yes, i know it contains html tags so it
ND> doesn't give me the result. I wanna get whole of the content as data.
ND> How can i do this?

Try this code and read the comments:
-----------------------------------
from xml.dom import minidom

file = open("test.xml","r")
oDoc = minidom.parse(file)
oRoot = oDoc.childNodes[0]      # <test>
oContesnts = oRoot.getElementsByTagName("content")  # [<content>, <content>]

# Your original code: print the .data of the first child of each <content>.
print "\tyour code"
print oContesnts[0].firstChild.data
print oContesnts[1].firstChild.data
# Output:
# " Hello
#        "
# See the manual: "data - The content of the _text_node_(!) as a string. "
# Indeed, the first child of each <content> element is a text node:
print "\t node Types"
print oContesnts[0].firstChild.nodeType == oDoc.TEXT_NODE # True
print oContesnts[1].firstChild.nodeType == oDoc.TEXT_NODE # True
print

# For example, if you try this XML:
#<test>
#  <content>{thisTextNnode1...} Hello </content>
#  <content>{thisTextNnode2}<b> Hello Other </b> </content>
# </test>
# you get this output:
#"{thisTextNnode1}Hello
#{thisTextNnode2}"
#
# So minidom treats each run of text between tags as a separate node (a text node).
# print "oContesnts[1] has %d childs - not 1 !" % len(oContesnts[1].childNodes)
# print
# The three children are: the text node {thisTextNnode2}, the element node <b>..</b>,
# and the text node for the space between the </b> and </content> closing tags.

# If you want to get the content as plain text, then try recursion:
def getInnerText(oNode):
    rc = ""
    nodelist = oNode.childNodes
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc = rc + node.data
        elif node.nodeType==node.ELEMENT_NODE:
            rc = rc + getInnerText(node)   # recursive !!!
        elif node.nodeType==node.CDATA_SECTION_NODE:
            rc = rc + node.data
        else:
            # node.nodeType: PROCESSING_INSTRUCTION_NODE, COMMENT_NODE, DOCUMENT_NODE, NOTATION_NODE and so on
           pass
    return rc

# Plain text of each <content> element, with the markup stripped.
print "\tInnerText:"
print getInnerText(oContesnts[0])
print getInnerText(oContesnts[1])
print
# Output for the example XML containing {thisTextNnode1} / {thisTextNnode2}:
#"{thisTextNnode1}Hello
#{thisTextNnode2} Hello Other"

# And if you want to retrieve the structure (an innerHTML analog):

def getInnerHTML(oNode):
    rc = ""
    nodelist = oNode.childNodes
    for node in nodelist:
        rc = rc + node.toxml()
    return rc   

# Serialized children of each <content> element, markup included.
print "\tInner as Structure:"
print getInnerHTML(oContesnts[0])
print getInnerHTML(oContesnts[1])
print

# Or, if you want the outer XML (and this will be true XML), just call toxml():
print "\tFull nodes printing:"
print oContesnts[0].toxml()   # or you can use the .toprettyxml() method
print oContesnts[1].toxml()
print
-----------------------------------

Output:
==================================================
        your code
{thisTextNnode1}Hello
{thisTextNnode2}
         node Types
True
True

oContesnts[1] has 3 childs - not 1 !

        InnerText:
{thisTextNnode1}Hello
{thisTextNnode2} Hello2

        Inner as Structure:
{thisTextNnode1}Hello
{thisTextNnode2}<b> Hello2 </b>

        Full nodes printing:
<content>{thisTextNnode1}Hello </content>
<content>{thisTextNnode2}<b> Hello2 </b> </content>
==================================================

-- 
С уважением,
 Alexey                          mailto:prog at goodok.ru