[XML-SIG] The fastest XML parser around

Daniel Veillard veillard@redhat.com
Mon, 1 Apr 2002 05:08:02 -0500


--/Uq4LBwYP4y1W6pO
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

On Mon, Apr 01, 2002 at 11:19:55AM +0200, Hannu Krosing wrote:
> On Mon, 2002-04-01 at 10:34, Daniel Veillard wrote:
> >      - the conformance against the W3C/NIST regression test suite made
> >        of 1800+ documents is done as a Python script calling the libxml2
> >        bindings and takes approximately 3.5 seconds.
> 
> Is this test script available anywhere ?

  Well, it's in libxml CVS and should be in the distribution tarballs;
it's called check-xml-test-suite.py. It requires a checkout of the
W3C/OASIS/NIST test suite from the W3C CVS base under an xml-test-suite
subdir. I don't know if the suite has been announced yet, but at least
the list is public, as well as the W3C CVS base :-)
   http://lists.w3.org/Archives/Public/public-xml-testsuite/
   http://dev.w3.org/cvsweb/

----
paphio:~/XML -> cat xml-test-suite/CVS/Root 
dev.w3.org:/sources/public
paphio:~/XML -> cat xml-test-suite/CVS/Repository 
2001/XML-Test-Suite
paphio:~/XML -> 
----

  I attach a copy of the script to this mail; here is the output:

-----------------------------------------------------
paphio:~/XML -> ./check-xml-test-suite.py 
OASIS draft v1.0 Second Edition; with 15 March updates by Mary Brady)
=> James Clark XMLTEST cases, 18-Nov-1998
Test xml-test-suite/xmlconf/xmltest/invalid/009.xml missing: base xml-test-suite/xmlconf/xmltest/ uri invalid/009.xml
Test xml-test-suite/xmlconf/xmltest/invalid/010.xml missing: base xml-test-suite/xmlconf/xmltest/ uri invalid/010.xml
Test xml-test-suite/xmlconf/xmltest/invalid/011.xml missing: base xml-test-suite/xmlconf/xmltest/ uri invalid/011.xml
invalid-not-sa-022: error: Validity error not detected
   Ran 363 tests: 362 suceeded, 1 failed and 0 generated an error
=> Fuji Xerox Japanese Text Tests
   Ran 12 tests: 12 suceeded, 0 failed and 0 generated an error
=> Sun Microsystems XML Tests
sa03: warning: valid document reported an error
sa04: warning: valid document reported an error
Test xml-test-suite/xmlconf/sun/invalid/pe01.xml missing: base xml-test-suite/xmlconf/sun/ uri invalid/pe01.xml
uri01: warning: failed to parse the document but accepted
   Ran 164 tests: 164 suceeded, 0 failed and 0 generated an error
=> OASIS/NIST TESTS, 1-Nov-1998
o-p57pass1: error: Validity check failed
o-p58pass1: error: Validity check failed
   Ran 347 tests: 345 suceeded, 2 failed and 0 generated an error
=> IBM XML Tests
=> IBM XML Conformance Test Suite - invalid tests
=> IBM XML Conformance Test Suite - not-wf tests
=> IBM XML Conformance Test Suite - valid tests
ibm-valid-P11-ibm11v01.xml: warning: valid document reported an error
ibm-valid-P11-ibm11v02.xml: warning: valid document reported an error
ibm-valid-P56-ibm56v08.xml: warning: valid document reported an error
   Ran 928 tests: 928 suceeded, 0 failed and 0 generated an error
Ran 1814 tests: 1811 suceeded, 3 failed and 0 generated an error in 3.35 s.
paphio:~/XML -> 
-----------------------------------------------------

  It seems a few documents have been removed from the suite.
o-p57pass1 and o-p58pass1 are actually not valid; the status of
invalid-not-sa-022 was changed, it seems, and I haven't had a look at it yet.
  Sometimes libxml will report errors but keep the well-formedness or validity
status of the document; for example, it complains when Windows-style paths
are used for URIs (of course those don't pass libxml's internal RFC 2396
implementation).

Daniel

-- 
Daniel Veillard      | Red Hat Network https://rhn.redhat.com/
veillard@redhat.com  | libxml GNOME XML XSLT toolkit  http://xmlsoft.org/
http://veillard.com/ | Rpmfind RPM search engine http://rpmfind.net/

--/Uq4LBwYP4y1W6pO
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="check-xml-test-suite.py"

#!/usr/bin/python
import sys
import time
import os
import string
sys.path.append("python")
import libxml2

#
# The test suite description and the log file
#
# CONF is the path of the XML document describing the suite; it is loaded
# with loadNoentDoc() when the script starts running the tests.
CONF="xml-test-suite/xmlconf/xmlconf.xml"
# LOG is the file receiving the details of every test that did not pass
# cleanly.
LOG="check-xml-test-suite.log"

# Opened once at startup in write mode (any previous log is truncated);
# the test routines write to it through this module-level handle.
log = open(LOG, "w")

#
# Error and warning handlers
#
# Number of reported messages containing "error:" since the last reset
error_nr = 0
# Accumulated message text since the last reset (growth is capped, see below)
error_msg = ''
def errorHandler(ctx, str):
    """Record one piece of error text reported by the parser.

    ctx -- the context value supplied when the handler was registered
           (not used here)
    str -- the message text; text that does not follow a newline in the
           accumulated buffer is treated as the continuation of the
           previous line

    Updates the module-level error_nr and error_msg; returns None.
    """
    global error_nr
    global error_msg

    # String method instead of the deprecated string.find() function
    if str.find("error:") >= 0:
        error_nr = error_nr + 1
    # Stop accumulating once the buffer has reached 300 characters so a
    # noisy document cannot flood the log.
    if len(error_msg) < 300:
        if len(error_msg) == 0 or error_msg[-1] == '\n':
            # Beginning of a new line: prefix it with the log marker
            error_msg = error_msg + "   >>" + str
        else:
            error_msg = error_msg + str

# Install errorHandler as the libxml2 error callback. The second argument
# is presumably the value handed back to the handler as its ctx parameter
# (errorHandler ignores it) -- confirm against the binding documentation.
libxml2.registerErrorHandler(errorHandler, None)

#warning_nr = 0
#warning = ''
#def warningHandler(ctx, str):
#    global warning_nr
#    global warning
#
#    warning_nr = warning_nr + 1
#    warning = warning + str
#
#libxml2.registerWarningHandler(warningHandler, None)

#
# Used to load the XML testsuite description
#
def loadNoentDoc(filename):
    """Parse filename with entity substitution enabled.

    Returns the parsed document, or None when the parser context cannot
    be created, when no document could be obtained, or when the input is
    not well formed (any partial document is freed in that case).
    """
    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return None
    ctxt.replaceEntities(1)
    ctxt.parseDocument()
    try:
        doc = ctxt.doc()
    except:
        # Deliberately broad: the exception type raised by the binding is
        # not visible from this script; "no document" is all that matters.
        doc = None
    if ctxt.wellFormed() != 1:
        # doc may be None here, so only free what was actually built
        if doc is not None:
            doc.freeDoc()
        return None
    return doc

#
# The conformance testing routines
#

def testNotWf(filename, id):
    """Check that a document expected to be not well formed is rejected.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 when a well-formedness error was detected and reported,
    0 when it was missed, -1 when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None

    if error_nr == 0 or ctxt.wellFormed() != 0:
        print("%s: error: Well Formedness error not detected" % (id))
        log.write("%s: error: Well Formedness error not detected\n" % (id))
        res = 0
    else:
        res = 1
    # Release the document on every path, and only if one was built
    if doc is not None:
        doc.freeDoc()
    return res

def testNotWfEnt(filename, id):
    """Check that a not well formed document is rejected, with entity
    substitution enabled.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 when a well-formedness error was detected and reported,
    0 when it was missed, -1 when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.replaceEntities(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None

    if error_nr == 0 or ctxt.wellFormed() != 0:
        print("%s: error: Well Formedness error not detected" % (id))
        log.write("%s: error: Well Formedness error not detected\n" % (id))
        res = 0
    else:
        res = 1
    # Release the document on every path, and only if one was built
    if doc is not None:
        doc.freeDoc()
    return res

def testNotWfEntDtd(filename, id):
    """Check that a not well formed document is rejected, with entity
    substitution and subset loading (loadSubset) enabled.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 when a well-formedness error was detected and reported,
    0 when it was missed, -1 when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.replaceEntities(1)
    ctxt.loadSubset(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None

    if error_nr == 0 or ctxt.wellFormed() != 0:
        print("%s: error: Well Formedness error not detected" % (id))
        log.write("%s: error: Well Formedness error not detected\n" % (id))
        res = 0
    else:
        res = 1
    # Release the document on every path, and only if one was built
    if doc is not None:
        doc.freeDoc()
    return res

def testWfEntDtd(filename, id):
    """Check that a well formed document parses, with entity substitution
    and subset loading (loadSubset) enabled.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 on a clean parse, 2 when the document is well formed but an
    error message was emitted, 0 when the parser wrongly rejected it and
    -1 when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.replaceEntities(1)
    ctxt.loadSubset(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None

    if ctxt.wellFormed() == 0:
        print("%s: error: wrongly failed to parse the document" % (id))
        log.write("%s: error: wrongly failed to parse the document\n" % (id))
        res = 0
    elif error_nr != 0:
        # Result 2 is counted as a success by the caller, so the log uses
        # the same "warning" severity as the console message.
        print("%s: warning: WF document generated an error msg" % (id))
        log.write("%s: warning: WF document generated an error msg\n" % (id))
        res = 2
    else:
        res = 1
    # Release the document on every path, and only if one was built
    if doc is not None:
        doc.freeDoc()
    return res

def testError(filename, id):
    """Run a test of type "error", where either outcome is acceptable.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    The document is parsed with entity substitution and subset loading
    (loadSubset) enabled. Returns 1 on a clean parse, 2 (with a warning)
    when the document was rejected or produced an error message, and -1
    when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.replaceEntities(1)
    ctxt.loadSubset(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None

    if ctxt.wellFormed() == 0:
        print("%s: warning: failed to parse the document but accepted" % (id))
        log.write("%s: warning: failed to parse the document but accepted\n" % (id))
        res = 2
    elif error_nr != 0:
        # Logged with the same "warning" severity as the console message
        print("%s: warning: WF document generated an error msg" % (id))
        log.write("%s: warning: WF document generated an error msg\n" % (id))
        res = 2
    else:
        res = 1
    # Release the document on every path, and only if one was built
    if doc is not None:
        doc.freeDoc()
    return res

def testInvalid(filename, id):
    """Run a test whose document should parse but fail validation.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 when the validity error was detected and reported, 2 when
    the document was flagged invalid without any error message, 0 when no
    document was obtained or the document was accepted as valid, and -1
    when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_msg = ''
    error_nr = 0

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.validate(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        doc = None
    valid = ctxt.isValid()

    # Classify the outcome first, then report and clean up in one place.
    if doc is None:
        verdict, status = "error: wrongly failed to parse the document", 0
    elif valid == 1:
        verdict, status = "error: Validity error not detected", 0
    elif error_nr == 0:
        verdict, status = "warning: Validity error not reported", 2
    else:
        verdict, status = None, 1

    if verdict is not None:
        line = "%s: %s" % (id, verdict)
        print(line)
        log.write(line + "\n")
    if doc is not None:
        doc.freeDoc()
    return status

def testValid(filename, id):
    """Run a test whose document should parse and validate.

    filename -- path of the document to parse
    id       -- test identifier used in the console and log messages

    Returns 1 when the document validated cleanly, 2 when it validated
    but an error message was emitted, 0 when no document was obtained or
    validation failed, and -1 when no parser context could be created.
    """
    global error_nr
    global error_msg
    global log

    error_nr = 0
    error_msg = ''

    ctxt = libxml2.createFileParserCtxt(filename)
    if ctxt is None:
        return -1
    ctxt.validate(1)
    ctxt.parseDocument()

    try:
        doc = ctxt.doc()
    except:
        # Broad on purpose: only "no document available" matters here
        doc = None
    valid = ctxt.isValid()
    if doc is None:
        print("%s: error: wrongly failed to parse the document" % (id))
        log.write("%s: error: wrongly failed to parse the document\n" % (id))
        return 0

    if valid != 1:
        print("%s: error: Validity check failed" % (id))
        log.write("%s: error: Validity check failed\n" % (id))
        res = 0
    elif error_nr != 0:
        # valid is known to be 1 at this point, so only the error counter
        # needs checking.
        print("%s: warning: valid document reported an error" % (id))
        log.write("%s: warning: valid document reported an error\n" % (id))
        res = 2
    else:
        res = 1
    doc.freeDoc()
    return res

# Running totals over the whole suite, updated by runTest()
test_nr = 0
test_succeed = 0
test_failed = 0
test_error = 0
def runTest(test):
    """Run the test described by one TEST element and record its result.

    test -- node of the suite description carrying the URI, ID and TYPE
            attributes (plus ENTITIES for not-wf tests)

    Updates the module-level counters and writes the context of every
    test that did not return 1 to the log. Returns 0 once the test has
    been run and counted, -1 when it was skipped (missing attribute,
    unreadable file or unsupported TYPE).
    """
    global test_nr
    global test_failed
    global test_error
    global test_succeed
    global error_msg
    global log

    uri = test.prop('URI')
    test_id = test.prop('ID')
    if uri is None:
        print("Test without URI: %s" % (test_id))
        return -1
    if test_id is None:
        print("Test without ID: %s" % (uri))
        return -1
    base = test.getBase(None)
    URI = libxml2.buildURI(uri, base)
    if os.access(URI, os.R_OK) == 0:
        print("Test %s missing: base %s uri %s" % (URI, base, uri))
        return -1
    kind = test.prop('TYPE')
    if kind is None:
        print("Test %s missing TYPE" % (test_id))
        return -1

    extra = None
    if kind == "invalid":
        res = testInvalid(URI, test_id)
    elif kind == "valid":
        res = testValid(URI, test_id)
    elif kind == "not-wf":
        # ENTITIES is only recorded for the log: every not-wf test goes
        # through testNotWfEntDtd. The finer dispatch (ENTITIES 'none' ->
        # testNotWf, 'general' -> testNotWfEnt, 'both'/'parameter'/unset ->
        # testNotWfEntDtd) is currently disabled.
        extra = test.prop('ENTITIES')
        res = testNotWfEntDtd(URI, test_id)
    elif kind == "error":
        res = testError(URI, test_id)
    else:
        # TODO skipped for now
        return -1

    test_nr = test_nr + 1
    if res > 0:
        test_succeed = test_succeed + 1
    elif res == 0:
        test_failed = test_failed + 1
    else:
        test_error = test_error + 1

    # Log the context of every test that did not pass cleanly
    if res != 1:
        log.write("   File: %s\n" % (URI))
        # strip() already drops trailing newlines and is safe on an empty
        # description, where indexing the last character would fail.
        content = (test.content or '').strip()
        if extra is not None:
            log.write("   %s:%s:%s\n" % (kind, extra, content))
        else:
            log.write("   %s:%s\n\n" % (kind, content))
        if error_msg != '':
            log.write("   ----\n%s   ----\n" % (error_msg))
            error_msg = ''
        log.write("\n")

    return 0
	    

def runTestCases(case):
    """Run every TEST found below a TESTCASES element.

    The element's PROFILE attribute, when present, is printed as a
    section header unless it contains the
    "IBM XML Conformance Test Suite - Production" marker. Nested
    TESTCASES elements are handled recursively.
    """
    title = case.prop('PROFILE')
    if title is not None:
        if title.find("IBM XML Conformance Test Suite - Production") < 0:
            print("=> %s" % title)

    node = case.children
    while node is not None:
        tag = node.name
        if tag == 'TEST':
            runTest(node)
        elif tag == 'TESTCASES':
            runTestCases(node)
        node = node.next
        
#
# Main: load the suite description, run each top-level TESTCASES group
# and print per-group and overall statistics.
#
conf = loadNoentDoc(CONF)
if conf is None:
    print("Unable to load %s" % CONF)
    log.close()
    sys.exit(1)

testsuite = conf.getRootElement()
if testsuite.name != 'TESTSUITE':
    print("Expecting TESTSUITE root element: aborting")
    conf.freeDoc()
    log.close()
    sys.exit(1)

profile = testsuite.prop('PROFILE')
if profile is not None:
    print(profile)

start = time.time()

# The counters are module-level names, so no "global" declaration is
# needed (or allowed after assignment) at this level.
case = testsuite.children
while case is not None:
    if case.name == 'TESTCASES':
        # Snapshot the totals so the figures for this group alone can be
        # reported once it has run.
        old_test_nr = test_nr
        old_test_succeed = test_succeed
        old_test_failed = test_failed
        old_test_error = test_error
        runTestCases(case)
        print("   Ran %d tests: %d suceeded, %d failed and %d generated an error" % (
               test_nr - old_test_nr, test_succeed - old_test_succeed,
               test_failed - old_test_failed, test_error - old_test_error))
    case = case.next

conf.freeDoc()
log.close()

print("Ran %d tests: %d suceeded, %d failed and %d generated an error in %.2f s." % (
      test_nr, test_succeed, test_failed, test_error, time.time() - start))

--/Uq4LBwYP4y1W6pO--