[XML-SIG] I am confused...

Martin v. Loewis martin@mira.cs.tu-berlin.de
Mon, 29 Jan 2001 17:34:20 +0100


> I do not remember if this was what I used for measuring, but
> this was my another effort to create query-mechanisms
> (It doesnt work anymore due to lack of xml.dom.utils)

Thanks. I've ported it to minidom, see the code below. Fortunately,
the DOM implementations follow the official API quite closely these
days, so it is easy to move from one implementation to another.

Using Uche's 640k document, I get the following timings:

minidom: 6.4s
4DOM: 45s
pDomlette: 8.9s

cDomlette fails since it does not support createElement (pDomlette
only has create*NS operations, so I added None as the namespace
everywhere).

Remember, this is the same machine where Uche's cDomlette/XPath query
took 0.5s. So it *does* matter how exactly you approach a certain task
(you can easily get a factor of 90 between solutions). However, if I
had had to guess in advance what the approximate outcome would be for
each of the solutions, I would have been totally wrong.

Regards,
Martin

#!/usr/local/bin/python

# Announce which script is running before any work is done.
print("1. simple")

from xml.dom import minidom
from string import split, index

def portr(node):
    """Print a one-line debug summary of a DOM node.

    Writes the node's type, value, name, attribute map and parent node to
    standard output, each preceded by a short tag ("t ", "v ", "n ", "a ",
    "p ").  Nothing is returned.
    """
    # Fetch every field before printing so that a failing attribute access
    # leaves no partial line on the output.
    node_type, node_value, node_name, attributes, parent = (
        node.nodeType,
        node.nodeValue,
        node.nodeName,
        node.attributes,
        node.parentNode,
    )
    print "t ", node_type, "v ", node_value, "n ", node_name, "a ", attributes, "p ", parent

class strstream:
    """Minimal file-like reader over an in-memory string.

    Only ``read`` and ``readline`` are provided, which is what the lexer
    used by ``_parse_query`` in this file needs from its input stream.

    The unread remainder of the text is kept in the ``str`` attribute.
    """

    def __init__(self, str):
        # The parameter name shadows the builtin; it is kept unchanged so
        # that existing callers passing it by keyword continue to work.
        self.str = str

    def read(self, n):
        """Consume and return up to ``n`` characters from the front.

        Returns an empty string once all text has been consumed.
        """
        chunk = self.str[:n]
        self.str = self.str[n:]
        return chunk

    def readline(self):
        """Consume and return the next line, including its newline.

        If no newline is left, the whole remainder is returned; once the
        text is exhausted an empty string is returned.  (Previously this
        returned the entire remaining text without consuming anything, so
        repeated calls never advanced.)
        """
        newline_at = self.str.find("\n")
        if newline_at < 0:
            return self.read(len(self.str))
        return self.read(newline_at + 1)

def _normalize_tokens(tl):
    """Combine raw lexer tokens into query tokens.

    Rules, applied left to right:

    * a pair of adjacent tokens listed in ``pair_rules`` becomes a single
      token (``/ /`` -> ``//``, ``! =`` -> ``$ne$``, ...);
    * ``$``, word, ``$`` is glued together into ``$word$``;
    * any resulting token listed in ``single_rules`` is replaced by its
      operator name (``=`` -> ``$eq$``, ...).

    The scan stops one position before the end of ``tl``, so a trailing
    token that was not absorbed into a pair or a ``$word$`` triple is
    dropped.  The queries in this file rely on that by ending with a
    throw-away token such as ``.``.

    Returns the new token list; ``tl`` itself is not modified.

    Raises ValueError when a ``$`` is not followed by two more tokens.
    (This used to raise a string exception, which Python no longer
    supports.)
    """
    pair_rules = {
        ("/", "/"): "//",
        (".", "/"): "./",
        ("!", "="): "$ne$",
        ("<", "="): "$le$",
        (">", "="): "$ge$",
        ("=", "~"): "$match$",
        ("!", "~"): "$no_match$",
        (";", ";"): ";",
    }

    single_rules = {
        "=": "$eq$",
        # NOTE(review): "!" maps to "$lt$" exactly as in the original table;
        # this looks like a copy/paste slip but is preserved - confirm.
        "!": "$lt$",
        "<": "$lt$",
        ">": "$gt$",
    }

    ntl = []
    i = 0
    last = len(tl) - 1
    while i < last:
        pair = tuple(tl[i:i + 2])
        if pair in pair_rules:
            toapp = pair_rules[pair]
            i = i + 2
        elif tl[i] == "$":
            if i + 2 >= len(tl):
                raise ValueError("Query error !!!" + repr(tl))
            toapp = tl[i] + tl[i + 1] + tl[i + 2]
            i = i + 3
        else:
            toapp = tl[i]
            i = i + 1
        # Single-token operators are renamed after any merging above.
        if toapp in single_rules:
            toapp = single_rules[toapp]
        ntl.append(toapp)
    return ntl

def _parse_query(q):
    """Tokenize the query string ``q`` and return its normalized tokens.

    The text is wrapped in a ``strstream``, split into raw tokens with
    ``shlex`` and the raw tokens are then handed to ``_normalize_tokens``.
    """
    from shlex import shlex

    lexer = shlex(strstream(q))
    raw_tokens = []
    while True:
        token = lexer.get_token()
        if not token:
            # A false token signals that the input is exhausted.
            break
        raw_tokens.append(token)
    return _normalize_tokens(raw_tokens)

def find_all_descendants(node, cond):
    """Placeholder for a descendant search; not implemented yet.

    Both arguments are ignored and ``None`` is always returned.
    """
    # XXX stub: no search is performed.
    return None

def find_all_children(node, cond):
    """Return the direct children of ``node`` accepted by a predicate.

    ``cond`` is Python source text which, when executed, must define a
    one-argument function named ``condition``.  Every entry of
    ``node.childNodes`` for which ``condition`` returns a true value is
    collected, in order, into the returned list.

    Raises NameError if ``cond`` does not define ``condition``.
    """
    # SECURITY: ``cond`` is executed as arbitrary Python code.  Never pass
    # text that comes from an untrusted source.
    #
    # The code runs against an explicit namespace instead of relying on a
    # bare exec creating a local variable, which does not work on Python 3.
    namespace = {}
    exec(cond, globals(), namespace)
    try:
        condition = namespace["condition"]
    except KeyError:
        raise NameError("cond must define a function named 'condition'")
    return [child for child in node.childNodes if condition(child)]

class PYQL:
    """Run simple slash-separated path queries against a minidom document.

    ``dom`` holds the parsed document and ``docel`` its document element.
    """

    def __init__(self, file):
        """Parse ``file`` with ``minidom.parse`` and remember the result."""
        self.dom = minidom.parse(file)
        self.docel = self.dom.documentElement

    def query(self, q):
        """Evaluate the query string ``q`` from the document element.

        Returns a new ``xql:result`` element created by the parsed
        document.  Its ``orig`` attribute holds the query text and, when
        the query matched, its single child is the matching tree.
        """
        matched = self._query(self.docel, _parse_query(q), self.dom)
        result = self.dom.createElement("xql:result")
        if matched is not None:
            result.appendChild(matched)
        result.setAttribute("orig", str(q))
        return result

    def _query(self, node, subq, qrdoc):
        """Match the token list ``subq`` against ``node``.

        ``subq`` is expected to look like ``["/", name, "/", name, ...]``.
        ``qrdoc`` is the document used to create the result elements.

        Returns ``None`` when nothing matches.  When ``node`` is the last
        step of the path, a deep copy of it is returned.  Otherwise a new
        element with the node's name and attributes is returned, holding
        the results of matching the rest of the path against each child.

        Matches are copied rather than returned directly: appending the
        original node to the result would detach it from the source tree
        while its parent's children are being iterated, which skips
        siblings and alters the parsed document.
        """
        # Empty or truncated queries cannot match anything.
        if len(subq) < 2:
            return None
        if subq[0] == "//":
            # Descendant steps are not supported; they never match.
            return None
        if subq[0] != "/" or subq[1] != node.nodeName:
            return None

        if len(subq) > 2 and subq[2] == "/":
            remaining = subq[2:]
            qel = qrdoc.createElement(node.nodeName)
            for attr_name in node.attributes.keys():
                qel.setAttribute(attr_name, node.attributes[attr_name].nodeValue)
            for child in node.childNodes:
                sub_result = self._query(child, remaining, qrdoc)
                if sub_result is not None:
                    qel.appendChild(sub_result)
            # An intermediate step with no matching children is no match.
            if not qel.childNodes:
                return None
            return qel

        # Last step of the path: hand back a copy of the whole subtree.
        return node.cloneNode(True)


# Parse the input document, then time one query plus its serialization.
a = PYQL('bigxml')
# Earlier experiments, kept for reference:
#  a.query('$or$ != 1.23E-4          /article/text/topic$')
#  print a.query('/article/text/topic.').toxml()
import time
start = time.time()
res = a.query('/article/author/name.').toxml()
# Elapsed wall-clock seconds, followed by the size of the serialized result.
print(time.time() - start)
print(len(res))
#   print a.query('//fig.').toxml()