[XML-SIG] dom building, sax, and namespaces
Andrew Dalke
dalke@acm.org
Wed, 23 Jan 2002 09:59:31 -0700
This is a multi-part message in MIME format.
--------------7135BB0CCAF7385C00ED9BB3
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Me:
> Is there some way I can build a proper namespace'd DOM without having
> the parser support feature_namespaces? Some sort of adapter, perhaps?
> I couldn't find such in the codebase. I did find code as in
>
> sax/drivers2/drv_xmlproc.py
>
> which I think I could use to write such an adapter, but it would
> be better if someone pointed out to me existing code rather than
> chance my misunderstandings and errors.
Attached is the adapter code I copied and tweaked from drv_xmlproc.py.
It does seem to work in that the following gives me output. Yippee!
(BTW, is there no built-in function to get the concatenation of all
the text nodes, like my get_text function, below?)
#######################
from xml.dom import pulldom
import SaxNSAdapter # <<----- my adapter
# Wrap the pulldom DOM builder in the adapter so that it can be driven with
# plain (non-namespace) startElement/endElement events.
real_builder = pulldom.SAX2DOM()
builder = SaxNSAdapter.SaxNSAdapter(real_builder)
# Hand-feed the SAX events for a small document with two records.
builder.startDocument()
# The root element carries the declaration of the "bioformat" prefix that
# every element name below uses.
builder.startElement('bioformat:dataset',
{'xmlns:bioformat': 'http://biopython.org/bioformat'})
# First record: a "primary" dbid, an "accession" dbid, then loose text.
builder.startElement("bioformat:record", {})
builder.startElement("bioformat:dbid", {"type": "primary"})
builder.characters("100K_RAT")
builder.endElement("bioformat:dbid")
builder.startElement("bioformat:dbid", {"type": "accession"})
builder.characters("Q61294")
builder.endElement("bioformat:dbid")
builder.characters("Andrew")
builder.endElement("bioformat:record")
# Second record: a single "primary" dbid, then loose text.
builder.startElement("bioformat:record", {})
builder.startElement("bioformat:dbid", {"type": "primary"})
builder.characters("A1AT_BOMMO")
builder.endElement("bioformat:dbid")
builder.characters("Dalke")
builder.endElement("bioformat:record")
builder.endElement('bioformat:dataset')
builder.endDocument()
# The wrapped builder (not the adapter) holds the resulting document.
dom_node = real_builder.document
from xml.xpath import Compile
from xml.xpath.Context import Context
# Compile an XPath expression for the bioformat:dbid elements whose "type"
# attribute is "primary".
path = Compile('//bioformat:dbid[@type="primary"]')
# NOTE(review): processorNss is presumably the prefix -> namespace URI map
# used to resolve the "bioformat:" prefix inside the expression -- confirm
# against the xml.xpath documentation.
context = Context(dom_node,
processorNss={'bioformat' :
'http://biopython.org/bioformat'})
# Evaluate the compiled expression against the document built above.
node_set = path.evaluate(context)
def get_text(node):
    """Return the concatenated character data found at or below *node*.

    Text nodes and CDATA section nodes contribute their ``data``; element
    nodes are descended into; every other node type (comments, processing
    instructions, ...) is ignored.  The pieces are joined in document
    order.  An element without any character data yields ``""``.
    """
    words = []
    # Explicit stack instead of recursion: deep trees cannot hit the
    # interpreter's recursion limit, and no closure/default-argument trick
    # is needed to share the accumulator.
    pending = [node]
    while pending:
        current = pending.pop()
        kind = current.nodeType
        if kind == current.ELEMENT_NODE:
            # Push the children reversed so that they are popped (and thus
            # visited) in document order.
            children = list(current.childNodes)
            children.reverse()
            pending.extend(children)
        elif kind == current.TEXT_NODE or kind == current.CDATA_SECTION_NODE:
            # CDATA sections have their own node type; without this check
            # their character data would be silently dropped.
            words.append(current.data)
    return "".join(words)
# Show the text content of every node the XPath query selected.
for node in node_set:
    print("--> " + get_text(node))
Andrew
dalke@dalkescientific.com
--------------7135BB0CCAF7385C00ED9BB3
Content-Type: text/plain; charset=us-ascii;
name="SaxNSAdapter.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="SaxNSAdapter.py"
from xml.sax import handler
from xml.sax.xmlreader import AttributesImpl, AttributesNSImpl
class SaxNSAdapter(handler.ContentHandler):
    """Turn namespace-unaware SAX events into namespace-aware ones.

    The adapter receives plain ``startElement`` / ``endElement`` events,
    tracks the ``xmlns`` / ``xmlns:prefix`` declarations found in the
    attributes, and forwards ``startElementNS`` / ``endElementNS`` events
    (with ``(uri, localname)`` names) to the wrapped content handler.
    ``startDocument``, ``endDocument`` and ``characters`` are passed
    through unchanged.

    Attributes:
        ns_map: current prefix -> namespace URI mapping ("" is the default
            namespace).
        ns_stack: one ``(old_ns, del_ns)`` entry per open element, used to
            restore ``ns_map`` when the element is closed.
        rep_ns_attrs: when true, xmlns attributes are also reported as
            ordinary attributes; reset to 0 by ``startDocument``.
    """

    def __init__(self, cont_handler):
        """Wrap *cont_handler*, which must accept the ...NS element events."""
        handler.ContentHandler.__init__(self)
        self._cont_handler = cont_handler
        # Set up the namespace state here as well, so that events arriving
        # before startDocument() do not fail on missing attributes.
        self._reset_ns_state()

    def _reset_ns_state(self):
        """(Re)initialise the namespace bookkeeping for a new document."""
        # Current prefix - URI map; the "xml" prefix is always bound.
        self.ns_map = {"": None,
                       "xml": "http://www.w3.org/XML/1998/namespace"}
        # Pushed for each element, used to maintain ns_map.
        self.ns_stack = []
        # Report xmlns-attributes?
        self.rep_ns_attrs = 0

    def startDocument(self):
        """Reset the namespace state and forward the event."""
        self._reset_ns_state()
        self._cont_handler.startDocument()

    def endDocument(self):
        """Forward the event unchanged."""
        self._cont_handler.endDocument()

    def characters(self, s):
        """Forward character data unchanged."""
        self._cont_handler.characters(s)

    def startElement(self, name, attrs):
        """Resolve namespaces for an element and emit ``startElementNS``.

        *attrs* may be a plain dict or any object with an ``items()``
        method (such as ``AttributesImpl``); it is never modified.

        Raises:
            TypeError: a declared prefix or a name contains more than one
                colon, or two attributes resolve to the same
                ``(uri, localname)`` pair.
            KeyError: an element or attribute uses an undeclared prefix.
        """
        old_ns = {}  # Reset ns_map to these values when we leave this element
        del_ns = []  # Delete these prefixes from ns_map when we leave element
        plain_attrs = {}  # Attributes still to be reported, keyed by raw name

        # Find declarations, update self.ns_map and collect the undo data.
        for (a, v) in attrs.items():
            if a[:6] == "xmlns:":
                prefix = a[6:]
                if prefix.find(":") != -1:
                    raise TypeError("unknown double namespace: %r" % a)
            elif a == "xmlns":
                prefix = ""
            else:
                plain_attrs[a] = v
                continue

            was_bound = prefix in self.ns_map
            if was_bound:
                old_ns[prefix] = self.ns_map[prefix]
            if v:
                self.ns_map[prefix] = v
                if not was_bound:
                    # New in this scope: must disappear again when the
                    # element is closed, or the prefix would leak out.
                    del_ns.append(prefix)
            elif was_bound:
                # An empty value undeclares the prefix for this scope.
                del self.ns_map[prefix]

            if self.rep_ns_attrs:
                plain_attrs[a] = v

        self.ns_stack.append((old_ns, del_ns))

        # Process elem and attr names
        cooked_name = self.__process_name(name)
        cooked_attrs = {}
        rawnames = {}
        for (a, v) in plain_attrs.items():
            aname = self.__process_name(a, is_attr=1)
            if aname in cooked_attrs:
                # Two differently prefixed attributes bound to the same URI.
                raise TypeError(
                    "duplicate attribute after namespace processing: %r" % a)
            cooked_attrs[aname] = v
            rawnames[aname] = a

        # Report event
        self._cont_handler.startElementNS(
            cooked_name, name, AttributesNSImpl(cooked_attrs, rawnames))

    def endElement(self, rawname):
        """Emit ``endElementNS`` and leave the element's namespace scope."""
        # Resolve the name first: the element's own declarations are still
        # in effect for its end tag.
        name = self.__process_name(rawname)

        # Clean up self.ns_map and self.ns_stack
        (old_ns, del_ns) = self.ns_stack.pop()
        self.ns_map.update(old_ns)
        for prefix in del_ns:
            del self.ns_map[prefix]

        self._cont_handler.endElementNS(name, rawname)

    # ------ internal

    def __process_name(self, name, default_to=None, is_attr=0):
        """Return the ``(uri, localname)`` pair for the raw name *name*.

        Prefixed names are looked up in ``ns_map``; names in the ``xmlns``
        prefix are returned as ``(None, name)``.  Unprefixed attribute
        names (*is_attr* true) never get a namespace.  Unprefixed element
        names get *default_to* if it is given, otherwise the current
        default namespace.

        Raises:
            TypeError: *name* contains more than one colon.
            KeyError: the prefix of *name* is not declared.
        """
        n = name.split(":")
        if len(n) > 2:
            raise TypeError("Unsupported double namespace: %r" % name)
        if len(n) == 2:
            if n[0] == "xmlns":
                return (None, name)
            # An undeclared prefix surfaces as KeyError to the caller.
            return (self.ns_map[n[0]], n[1])
        if is_attr:
            return (None, name)
        if default_to is not None:
            return (default_to, name)
        if "" in self.ns_map and name != "xmlns":
            return (self.ns_map[""], name)
        return (None, name)
--------------7135BB0CCAF7385C00ED9BB3--