[XML-SIG] dom building, sax, and namespaces

Andrew Dalke dalke@acm.org
Wed, 23 Jan 2002 09:59:31 -0700


This is a multi-part message in MIME format.
--------------7135BB0CCAF7385C00ED9BB3
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Me:
> Is there some way I can build a proper namespace'd DOM without having
> the parser support feature_namespaces?  Some sort of adapter, perhaps?
> I couldn't find such in the codebase.  I did find code as in
> 
>    sax/drivers2/drv_xmlproc.py
> 
> which I think I could use to write such an adapter, but it would
> be better if someone pointed out to me existing code rather than
> chance my misunderstandings and errors.

Attached is the adapter code I copied and tweaked from drv_xmlproc.py.
It does seem to work in that the following gives me output.  Yippee!
(BTW, is there no built-in function to get the concatenation of all
the text nodes, like my get_text function, below?)

#######################
from xml.dom import pulldom
import SaxNSAdapter  # <<----- my adapter

# The DOM builder that will receive the namespace-aware (…NS) events.
real_builder = pulldom.SAX2DOM()

# Wrap it in the adapter so it can be driven with plain, namespace-unaware
# startElement/endElement events carrying raw "prefix:local" names.
builder = SaxNSAdapter.SaxNSAdapter(real_builder)

# Hand-written SAX event stream.  The root element carries the xmlns
# declaration that binds the "bioformat" prefix used by every element below.
builder.startDocument()
builder.startElement('bioformat:dataset',
             {'xmlns:bioformat': 'http://biopython.org/bioformat'})

# First record: a primary and an accession dbid, then some trailing text.
builder.startElement("bioformat:record", {})
builder.startElement("bioformat:dbid", {"type": "primary"})
builder.characters("100K_RAT")
builder.endElement("bioformat:dbid")
builder.startElement("bioformat:dbid", {"type": "accession"})
builder.characters("Q61294")
builder.endElement("bioformat:dbid")
builder.characters("Andrew")
builder.endElement("bioformat:record")

# Second record: only a primary dbid, then some trailing text.
builder.startElement("bioformat:record", {})
builder.startElement("bioformat:dbid", {"type": "primary"})
builder.characters("A1AT_BOMMO")
builder.endElement("bioformat:dbid")
builder.characters("Dalke")
builder.endElement("bioformat:record")
builder.endElement('bioformat:dataset')
builder.endDocument()  

# The finished document is read back from the wrapped builder, not the adapter.
dom_node = real_builder.document

from xml.xpath import Compile
from xml.xpath.Context import Context
# XPath expression selecting every bioformat:dbid element whose "type"
# attribute equals "primary", anywhere in the document.
path = Compile('//bioformat:dbid[@type="primary"]')
# Evaluate relative to the document built above.  processorNss presumably
# binds the "bioformat" prefix used in the expression to its namespace URI
# (PyXML xml.xpath API -- confirm against its documentation).
context = Context(dom_node,
           processorNss={'bioformat' :
'http://biopython.org/bioformat'})
node_set = path.evaluate(context)

def get_text(node):
    """Return the concatenated character data found at or below *node*.

    The subtree is walked in document order.  The data of every text node
    and every CDATA section is collected; comments, processing
    instructions and other node types contribute nothing.

    *node* may be an element, a document, a document fragment, or a text /
    CDATA node itself (in which case its own data is returned).  An empty
    string is returned when there is no character data.

    The walk is iterative, so deeply nested documents cannot hit the
    interpreter's recursion limit.
    """
    # Node types that carry character data directly.
    text_types = (node.TEXT_NODE, node.CDATA_SECTION_NODE)
    # Node types whose children must be visited.
    container_types = (node.ELEMENT_NODE,
                       node.DOCUMENT_NODE,
                       node.DOCUMENT_FRAGMENT_NODE)

    words = []
    pending = [node]  # explicit stack; the next node to visit is at the end
    while pending:
        current = pending.pop()
        if current.nodeType in text_types:
            words.append(current.data)
        elif current.nodeType in container_types:
            # Push children reversed so they are popped in document order.
            children = list(current.childNodes)
            children.reverse()
            pending.extend(children)
    return "".join(words)

# Print the concatenated text content of each node matched by the XPath query.
for node in node_set:
    print "-->", get_text(node)


					Andrew
					dalke@dalkescientific.com
--------------7135BB0CCAF7385C00ED9BB3
Content-Type: text/plain; charset=us-ascii;
 name="SaxNSAdapter.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="SaxNSAdapter.py"

from xml.sax import handler
from xml.sax.xmlreader import AttributesImpl, AttributesNSImpl

class SaxNSAdapter(handler.ContentHandler):
    """Turn namespace-unaware SAX events into namespace-aware ones.

    The adapter is fed ``startElement`` / ``endElement`` events carrying raw
    qualified names (``prefix:local``) together with ``xmlns`` and
    ``xmlns:prefix`` attributes.  It tracks the namespace declarations that
    are in scope and forwards ``startElementNS`` / ``endElementNS`` events,
    with ``(uri, localname)`` names, to the wrapped content handler.
    ``startDocument``, ``endDocument`` and ``characters`` are passed through
    unchanged.  No other ContentHandler events are forwarded.

    Public state:
      ns_map       -- dict mapping each in-scope prefix to its namespace URI
                      ("" is the default namespace, None meaning "none").
      ns_stack     -- one (old_ns, del_ns) entry per open element, used to
                      restore ns_map when that element is closed.
      rep_ns_attrs -- when true, xmlns attributes are also reported to the
                      wrapped handler as ordinary attributes.
    """

    def __init__(self, cont_handler, rep_ns_attrs=0):
        """Wrap *cont_handler*, which must accept the ...NS element events.

        *rep_ns_attrs* is the value ``self.rep_ns_attrs`` is (re)set to at
        every ``startDocument``; the default keeps xmlns attributes hidden.
        """
        handler.ContentHandler.__init__(self)
        self._cont_handler = cont_handler
        self._rep_ns_attrs_default = rep_ns_attrs
        # Set up the namespace state right away so element events work even
        # if the event source never sends startDocument.
        self._reset_ns_state()

    def startDocument(self):
        """Reset the namespace state and forward the event."""
        self._reset_ns_state()
        self._cont_handler.startDocument()

    def endDocument(self):
        """Forward the event unchanged."""
        self._cont_handler.endDocument()

    def characters(self, s):
        """Forward character data unchanged."""
        self._cont_handler.characters(s)

    def startElement(self, name, attrs):
        """Open a namespace scope and forward a ``startElementNS`` event.

        *name* is the raw qualified element name.  *attrs* is a dict or a
        SAX attributes object mapping raw attribute names to values; it is
        only read, never modified.

        Raises TypeError for a name with more than one colon, KeyError for
        a prefix that is not bound, and ValueError when two attributes
        resolve to the same (uri, localname) pair.  When any of these is
        raised the namespace state is left as it was before the call and
        nothing is forwarded.
        """
        declarations = []  # (prefix, uri) pairs declared on this element
        reported = {}      # raw attribute name -> value, to be forwarded

        # First pass: split declarations from ordinary attributes.  Nothing
        # is changed yet, so a malformed declaration leaves no trace.
        for (raw, value) in attrs.items():
            if raw[:6] == "xmlns:":
                prefix = raw[6:]
                if prefix.find(":") != -1:
                    raise TypeError("unknown double namespace: %r" % raw)
            elif raw == "xmlns":
                prefix = ""
            else:
                reported[raw] = value
                continue
            declarations.append((prefix, value))
            if self.rep_ns_attrs:
                reported[raw] = value

        # Second pass: apply the declarations, remembering how to undo them.
        old_ns = {}  # prefix -> URI to restore when this element is left
        del_ns = []  # prefixes to remove when this element is left
        for (prefix, uri) in declarations:
            if prefix in self.ns_map:
                old_ns[prefix] = self.ns_map[prefix]
                if not uri:
                    # An empty value undeclares the prefix for this scope.
                    del self.ns_map[prefix]
            elif uri:
                del_ns.append(prefix)
            if uri:
                self.ns_map[prefix] = uri
        self.ns_stack.append((old_ns, del_ns))

        # Resolve the element and attribute names within the new scope.
        try:
            cooked_name = self.__process_name(name)
            cooked_attrs = {}  # (uri, localname) -> value
            rawnames = {}      # (uri, localname) -> raw attribute name
            for (raw, value) in reported.items():
                aname = self.__process_name(raw, is_attr=1)
                if aname in cooked_attrs:
                    raise ValueError(
                        "duplicate attribute %r: %r and %r"
                        % (aname, rawnames[aname], raw))
                cooked_attrs[aname] = value
                rawnames[aname] = raw
        except (KeyError, TypeError, ValueError):
            # The element is never reported, so its scope must not linger.
            self._leave_scope()
            raise

        self._cont_handler.startElementNS(
            cooked_name, name, AttributesNSImpl(cooked_attrs, rawnames))

    def endElement(self, rawname):
        """Forward an ``endElementNS`` event and close the element's scope."""
        # Resolve first: the prefix may have been declared on this element.
        name = self.__process_name(rawname)
        self._leave_scope()
        self._cont_handler.endElementNS(name, rawname)

    # ------ internal

    def _reset_ns_state(self):
        """(Re)create the namespace bookkeeping for a fresh document."""
        # Current prefix -> URI map; "" is the default namespace.
        self.ns_map = {"": None,
                       "xml": "http://www.w3.org/XML/1998/namespace"}
        # Pushed for each element, used to maintain ns_map.
        self.ns_stack = []
        # Report xmlns attributes to the wrapped handler?
        self.rep_ns_attrs = self._rep_ns_attrs_default

    def _leave_scope(self):
        """Undo the namespace declarations of the innermost open element."""
        (old_ns, del_ns) = self.ns_stack.pop()
        for prefix in del_ns:
            del self.ns_map[prefix]
        self.ns_map.update(old_ns)

    def __process_name(self, name, default_to=None, is_attr=0):
        """Return the ``(uri, localname)`` pair for the raw name *name*.

        Prefixed names are resolved through ns_map; ``xmlns:...`` names are
        returned as ``(None, name)``.  Unprefixed attribute names
        (*is_attr* true) are in no namespace.  Other unprefixed names get
        *default_to* when it is given, else the current default namespace.

        Raises TypeError for more than one colon and KeyError for a prefix
        that is not bound.
        """
        parts = name.split(":")
        if len(parts) > 2:
            raise TypeError("Unsupported double namespace: %r" % name)
        if len(parts) == 2:
            (prefix, local) = parts
            if prefix == "xmlns":
                return (None, name)
            try:
                uri = self.ns_map[prefix]
            except KeyError:
                raise KeyError("undeclared namespace prefix in %r" % name)
            return (uri, local)
        if is_attr:
            return (None, name)
        if default_to is not None:
            return (default_to, name)
        if "" in self.ns_map and name != "xmlns":
            return (self.ns_map[""], name)
        return (None, name)

--------------7135BB0CCAF7385C00ED9BB3--