[XML-SIG] Expat as xmllib

Paul Prescod paul@prescod.net
Mon, 24 Jan 2000 09:56:47 -0600


This is a multi-part message in MIME format.
--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

The attached library allows expat to be used as a basis for a parser
with the xmllib interface.

Performance:

Without any xmllib-specific optimization, xmllib on pyexpat runs almost as
fast as xmllib on sgmlop (the "fast xmllib" row):

raw sgmlop: 13222 items; 0.426 seconds; 1281.29 kbytes per second
fast xmllib: 13222 items; 1.445 seconds; 378.03 kbytes per second
slow xmllib: 13222 items; 6.651 seconds; 82.11 kbytes per second
pyexpat: 13210 items; 1.527 seconds; 357.68 kbytes per second

I can think of several optimizations that could speed it up quite a bit.
Also, compared to the xmllib in the standard distribution, the difference
is night and day, so if we bundle expat we are only improving things for
users of that module.

Conformance

Pyexpat caught more errors than xmllib, was more accepting of legal XML
input (e.g. <?foo?>) and handled entities (especially character
entities) in a manner consistent with the XML specification.

These explain the differences in the number of "items" above.

Backwards Compatibility

The only big compatibility difference between xmllib on pyexpat and
xmllib on sgmlop is that expat expands entity references like &amp; to
"&" in the character data instead of reporting them as a separate event.
This is actually a feature of expat
because it is doing entity expansion *for you*. The XML spec requires
this behavior.

The library and a test program are attached.

-- 
 Paul Prescod  - ISOGEN Consulting Engineer speaking for himself
Earth will soon support only survivor species -- dandelions, roaches, 
lizards, thistles, crows, rats. Not to mention 10 billion humans.
	- Planet of the Weeds, Harper's Magazine, October 1998
--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii;
 name="ExpatOp.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="ExpatOp.py"

from xml.parsers import xmllib
import pyexpat

# Pairs of (callback name looked up on the client object, handler attribute
# set on the pyexpat parser).  register() walks this table and assigns the
# client's callback -- or None when the client lacks it -- to the attribute.
handlerMap = [
	("finish_starttag", "StartElementHandler"),
	("finish_endtag", "EndElementHandler"),
	("handle_data", "CharacterDataHandler"),
	("handle_proc", "ProcessingInstructionHandler"),
]

class ExpatPretendingToBeSGMLOp:
	"""Adapter exposing register/feed/parse/close on top of a pyexpat parser.

	The wrapped expat parser object is available as the ``pyexpat``
	attribute.
	"""

	def __init__(self, encoding=None):
		"""Create the expat parser; a false *encoding* uses expat's default."""
		if not encoding:
			self.pyexpat = pyexpat.ParserCreate()
			return
		self.pyexpat = pyexpat.ParserCreate(encoding)

	def register( self, obj ):
		"""Install *obj*'s callbacks as expat handlers, following handlerMap.

		A callback that *obj* does not provide is installed as None.
		"""
		expat = self.pyexpat
		for pair in handlerMap:
			callback = getattr( obj, pair[0], None )
			setattr( expat, pair[1], callback )

	def feed( self, data ):
		"""Hand *data* to expat as a non-final chunk."""
		self.pyexpat.Parse( data, 0 )

	def parse( self, data ):
		"""Hand *data* to expat as the final chunk."""
		self.pyexpat.Parse( data, 1 )

	def close( self ):
		"""Tell expat the input is complete by sending an empty final chunk."""
		self.pyexpat.Parse( "", 1 )

class XMLParser( xmllib.FastXMLParser ):
	"""xmllib.FastXMLParser whose ``parser`` attribute is an expat adapter."""

	def reset( self ):
		"""Run the base-class reset, then install the expat-backed adapter."""
		xmllib.FastXMLParser.reset(self)
		adapter = ExpatPretendingToBeSGMLOp()
		self.parser = adapter
		# feed() is bound straight to expat's Parse, skipping the adapter's
		# own feed() wrapper.
		self.feed = adapter.pyexpat.Parse
		# Hook this object's callbacks up as the expat handlers.
		adapter.register( self )

if __name__=="__main__":
	# Demo driver: parse one document and dump every event to "out.tmp".
	import sys
	junk = open( "out.tmp","w")

	if len( sys.argv )>1:
		filename=sys.argv[1]
	else:
		filename="hamlet.xml"

	class myparser( XMLParser ):
		"""Writes one line per parse event to the ``junk`` file."""
		def handle_proc(self, target,data):
			junk.write( "\n?"+target+data )
		def handle_data( self, data):
			junk.write( "\n'"+data)
		def finish_starttag(self,gi,attrs):
			# repr() is the function spelling of the backtick operator.
			junk.write( "\n<>"+gi+ repr(attrs) )
		def finish_endtag( self, gi ):
			junk.write( "\n</>"+gi )

	try:
		source = open( filename )
		try:
			text = source.read()
		finally:
			# Release the input file even when reading fails.
			source.close()
		demo = myparser()
		demo.feed( text )
		# Finish the parse so that problems only detectable at end of input
		# get reported.  NOTE(review): relies on the inherited close()
		# finalising the expat parser -- confirm against xmllib.
		demo.close()
	finally:
		# Flush and close the event dump whatever happened above.
		junk.close()



--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii;
 name="testxml1.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="testxml1.py"

# basic tests

# NOTE(review): nothing in the visible script reads this flag; presumably a
# leftover switch -- confirm before relying on or removing it.
test_sgmlop = 1

import sys
import time, string
from xml.parsers import sgmlop, xmllib, ExpatOp

try:
    FILE, VERBOSE = sys.argv[1], 2
except IndexError:
    FILE, VERBOSE = "hamlet.xml", 1

print
print "test collecting parsers on", FILE
print

# --------------------------------------------------------------------
# sgmlop

class myCollector:
    """Event sink that records parser callbacks in a flat list.

    ``data`` collects one entry per markup event.  Character data is
    buffered in ``text`` and flushed into ``data`` as a single repr()'d
    string just before the next markup event is recorded.
    """
    def __init__(self):
        self.data = []
        self.text = []
    def _flush_text(self):
        """Move any buffered character data into ``data`` as one string."""
        if self.text:
            self.data.append(repr(string.join(self.text, "")))
            self.text = []
    def finish_starttag(self, tag, data):
        self._flush_text()
        # list.append() takes exactly one argument; the event is stored as
        # an explicit tuple (the multi-argument form only worked on very
        # old interpreters, where it appended the same tuple).
        self.data.append(("start", tag, data))
    def handle_proc(self, tag, data):
        self._flush_text()
        self.data.append(("pi", tag, data))
    def handle_special(self, data):
        self._flush_text()
        self.data.append(("special", data))
    def handle_entityref(self, data):
        self._flush_text()
        self.data.append(("entity", data))
    def handle_data(self, data):
        self.text.append(data)
    def handle_cdata(self, data):
        # CDATA sections are buffered like text, tagged with a prefix.
        self.text.append("CDATA" + data)

def doRawSGMLOp():
	global parser
	t = time.clock()
	for i in range(1):
	    out = myCollector()
	    fp = open(FILE)
	    parser = sgmlop.XMLParser()
	    parser.register(out)
	    b = 0
	    while 1:
		data = fp.read(512)
		if not data:
		    break
		parser.feed(data)
		b = b + len(data)
	    parser.close()
	t1 = time.clock() - t

	print "raw sgmlop:", len(out.data), "items;", round(t1, 3), "seconds;",
	print round(b / t1 / 512, 2), "kbytes per second"
	return t1

# --------------------------------------------------------------------
# xmllib

base=None

def makeparser( basecls ):
    """Return a collecting parser class derived from *basecls*.

    Instances record markup events in ``data`` and buffer character data
    in ``text``, flushing it into ``data`` as one repr()'d string before
    the next markup event.  The module-level ``base`` is still updated to
    *basecls* on every call, as before.
    """
    global base
    base = basecls

    class FastXMLParser(basecls):
        def __init__(self):
            # Initialise through the base this class was built from.  The
            # global ``base`` is overwritten by every makeparser() call, so
            # reading it here would pick the wrong base for classes made by
            # an earlier call.
            self._collector_base.__init__(self)
            self.data = []
            self.text = []
        def _flush_text(self):
            """Move any buffered character data into ``data`` as one string."""
            if self.text:
                self.data.append(repr(string.join(self.text, "")))
                self.text = []
        def unknown_starttag(self, tag, data):
            self._flush_text()
            # list.append() takes exactly one argument: store a tuple.
            self.data.append(("start", tag, data))
        def handle_proc(self, tag, data):
            self._flush_text()
            self.data.append(("pi", tag, data))
        def handle_special(self, data):
            self._flush_text()
            self.data.append(("special", data))
        def handle_entityref(self, data):
            self._flush_text()
            self.data.append(("entity", data))
        def handle_data(self, data):
            self.text.append(data)
        def handle_cdata(self, data):
            self.text.append("CDATA" + data)

    # Attached after the class statement so it works without nested scopes.
    FastXMLParser._collector_base = basecls
    return FastXMLParser

def doFastXMLLib():
	global parser2

	FastXMLParser = makeparser( xmllib.FastXMLParser )

	t = time.clock()
	for i in range(1):
	    fp = open(FILE)
	    parser2 = FastXMLParser()
	    b = 0
	    while 1:
		data = fp.read(512)
		if not data:
		    break
		parser2.feed(data)
		b = b + len(data)
	    parser2.close()
	t2 = time.clock() - t

	print "fast xmllib:", len(parser2.data), "items;", round(t2, 3), "seconds;",
	print round(b / t2 / 512, 2), "kbytes per second"
	return t2

class SlowXMLParser(xmllib.SlowXMLParser):
    """Collecting parser on top of xmllib.SlowXMLParser.

    Markup events are recorded in ``data``; character data is buffered in
    ``text`` and flushed into ``data`` as one repr()'d string before the
    next markup event.
    """
    def __init__(self):
        xmllib.SlowXMLParser.__init__(self)
        self.data = []
        self.text = []
    def _flush_text(self):
        """Move any buffered character data into ``data`` as one string."""
        if self.text:
            self.data.append(repr(string.join(self.text, "")))
            self.text = []
    def unknown_starttag(self, tag, data):
        self._flush_text()
        # list.append() takes exactly one argument: store a tuple.
        self.data.append(("start", tag, data))
    def handle_proc(self, tag, data):
        self._flush_text()
        self.data.append(("pi", tag, data))
    def handle_special(self, data):
        self._flush_text()
        self.data.append(("special", data))
    def handle_entityref(self, data):
        self._flush_text()
        self.data.append(("entity", data))
    def handle_data(self, data):
        self.text.append(data)
    def handle_cdata(self, data):
        self.text.append("CDATA" + data)
def doSlowXMLLib():
	global parser3
	t = time.clock()
	for i in range(1):
	    fp = open(FILE)
	    parser3 = SlowXMLParser()
	    b = 0
	    while 1:
		data = fp.read(512)
		if not data:
		    break
		parser3.feed(data)
		b = b + len(data)
	    parser3.close()
	t3 = time.clock() - t

	print "slow xmllib:", len(parser3.data), "items;", round(t3, 3), "seconds;",
	print round(b / t3 / 512, 2), "kbytes per second"
	return t3

def doPyExpat():
	global parser4
	# PyExpat
	FastXMLParser = makeparser( ExpatOp.XMLParser )

	t = time.clock()
	for i in range(1):
	    fp = open(FILE)
	    parser4 = FastXMLParser()
	    b = 0
	    while 1:
		data = fp.read(512)
		if not data:
		    break
		parser4.feed(data)
		b = b + len(data)
	    parser4.close()
	t4 = time.clock() - t

	print "pyexpat:", len(parser4.data), "items;", round(t4, 3), "seconds;",
	print round(b / t4 / 512, 2), "kbytes per second"
	return t4

t1=doRawSGMLOp()
t2=doFastXMLLib()
t3=doSlowXMLLib()
t4=doPyExpat()

print
print "normalized timing:"
print "slow xmllib", 1.0
print "fast xmllib", round(t2 / t3, 2), "(%sx)" % round(t3 / t2, 1)
print "sgmlop     ", round(t1 / t3, 2), "(%sx)" % round(t3 / t1, 1)
print "pyexpat ", round(t4 / t3, 2), "(%sx)" % round(t3 / t4, 1)
print

print "looking for differences:"

items = min(len(parser2.data), len(parser4.data))

for i in xrange(items):
    if parser2.data[i] != parser3.data[i]:
        for j in range(max(i-5, 0), min(i+5, items)):
            if parser2.data[j] != parser3.data[j]:
                print "+", j+1, parser2.data[j]
                print "*", j+1, parser3.data[j]
            else:
                print "=", j+1, parser2.data[j]
        break
else:
    print "(no differences)"


--------------E2834FEC56D5F06E9B5E259A--