Using sax libxml2 html parser

Fri Jan 5 06:09:30 EST 2007

Hi all,

I have created an example using libxml2, based on the code that appears
at http://xmlsoft.org/python.html.
My example processes enough HTML files to show that memory consumption
keeps rising until the process ends (I check it with the 'top'
command).

I don't know whether I am forgetting something in the code, as I have
not been able to find any example on the web.

Thanks in advance, Cesar

Note: I have also tried to put the cleanup functions inside the 'for'
loop.

**************************************** [ The Code ] ****************************************

#!/usr/bin/python -u
import libxml2

#------------------------------------------------------------------------------

# Memory debug specific: this call is made before any parsing so that the
# value read back from debugMemory(1) at the end of the script (reported
# there as a byte count) can be checked for leaks.
# NOTE(review): presumably the argument 1 switches libxml2's allocation
# tracking on -- confirm against the libxml2 Python binding docs.
libxml2.debugMemory(1)

#------------------------------------------------------------------------------

class callback:
    """Minimal SAX-style handler handed to the libxml2 HTML push parser.

    Every method except ``startDocument`` is a deliberate no-op: the script
    only wants to drive the parser over many files and watch memory use, so
    no event is acted upon.  ``warning``, ``error`` and ``fatalError`` drop
    their message on purpose, because real-world HTML is expected to
    produce plenty of diagnostics that are irrelevant to the memory test.
    """

    def startDocument(self):
        """Print a single dot as a progress marker for a new document."""
        # Call form of print: a single parenthesised string prints the same
        # text under Python 2 and is also valid Python 3.
        print(".")

    def endDocument(self):
        """Ignore the end-of-document event."""
        pass

    def startElement(self, tag, attrs):
        """Ignore an opening element named ``tag`` with attributes ``attrs``."""
        pass

    def endElement(self, tag):
        """Ignore the closing of element ``tag``."""
        pass

    def characters(self, data):
        """Ignore character data ``data``."""
        pass

    def warning(self, msg):
        """Discard the parser warning ``msg``."""
        pass

    def error(self, msg):
        """Discard the parser error ``msg``."""
        pass

    def fatalError(self, msg):
        """Discard the fatal parser error ``msg``."""
        pass

#------------------------------------------------------------------------------
#------------------------------------------------------------------------------

import os
import sys

# Command-line handling and input discovery.
#
# Usage: <script> <dir html files>
# Produces `inputPath` (the directory given on the command line) and
# `inputFileNames` (bare names of the .htm/.html files found in it).
# Exits with status 1 on a wrong argument count, a missing directory or an
# empty file list.
programName = os.path.basename(sys.argv[0])

# Exactly one argument is expected: the directory holding the HTML files.
if len(sys.argv) != 2:
    print("Use: %s <dir html files>" % programName)
    sys.exit(1)

inputPath = sys.argv[1]

# isdir() rather than exists(): a path naming a regular file passes
# exists() and would then make os.listdir() below raise instead of
# producing the intended error message.
if not os.path.isdir(inputPath):
    print("Error: directory does not exist")
    sys.exit(1)

# Keep only names that really end in ".htm" or ".html" (case-sensitive).
# The earlier rfind()-based test only checked that ".htm" started at the
# last dot, so names such as "a.htmx" or "a.htmlfoo" slipped through.
inputFileNames = [
    name for name in os.listdir(inputPath)
    if name.endswith((".htm", ".html"))
]

if not inputFileNames:
    print("Error: no input files")
    sys.exit(1)

# Parse every discovered file NUM_ITERS times with a fresh push parser each
# time, reusing one handler instance, so that any per-parse memory growth
# becomes visible in tools such as 'top'.
handler = callback()
NUM_ITERS = 5
for _ in range(NUM_ITERS):
    for inputFileName in inputFileNames:
        print(inputFileName)

        # os.path.join inserts the separator that plain concatenation
        # dropped whenever the directory argument had no trailing slash.
        inputFilePath = os.path.join(inputPath, inputFileName)

        # Binary mode keeps len(data) equal to the number of bytes handed
        # to the parser; 'with' closes the file even if read() raises.
        with open(inputFilePath, "rb") as f:
            data = f.read()

        # Create the parser with an empty initial chunk, then feed the whole
        # file in one call.
        # NOTE(review): the final argument 1 is presumably the
        # "terminate" flag of htmlParseChunk -- confirm against the
        # libxml2 Python binding docs.
        ctxt = libxml2.htmlCreatePushParser(handler, "", 0, inputFileName)
        ctxt.htmlParseChunk(data, len(data), 1)

        # Drop this script's only reference to the parser context before the
        # next file is processed.
        ctxt = None

# Memory debug specific: release the parser's global state, then report
# whatever libxml2 still counts as allocated.
libxml2.cleanupParser()

# Read the counter once so the value that is tested is exactly the value
# that gets reported.
leakedBytes = libxml2.debugMemory(1)
if leakedBytes == 0:
    print("OK")
else:
    print("Memory leak %d bytes" % leakedBytes)
    libxml2.dumpMemory()

# Other cleanup functions
#libxml2.cleanupCharEncodingHandlers()
#libxml2.cleanupEncodingAliases()
#libxml2.cleanupGlobals()
#libxml2.cleanupInputCallbacks()
#libxml2.cleanupOutputCallbacks()
#libxml2.cleanupPredefinedEntities()