Memory problems (garbage collection)

Thu Apr 23 08:09:46 EDT 2009

Thanks for the help.
I converted everything into the StringIO() format. Memory is still getting 
chewed up. I will look at ElementTree later but for now I believe the speed 
issue must be related to the amount of memory that is getting used. It is 
causing all of windows to slow to a crawl. gc.collect() still reports the 
same quantity as before.
Don't know what to try next. Updated program is below:

from xml.dom import minidom
import os
from cStringIO import StringIO

class xmlProcessing:
    """ General class for XML processing"""

    def process(self, filename="", xmlString=""):
        if xmlString:
            pass
        elif filename:
            xmldoc = minidom.parse(filename)
        self.parse( xmldoc.documentElement )

    def parseBranch(self, parentNode):
        """ Process an XML branch """
        for node in parentNode.childNodes:
            try:
                parseMethod = getattr(self, "parse_%s" % 
node.__class__.__name__)
            except AttributeError:
                continue
            if parseMethod(node):
                continue
            self.parseBranch(node)
            del node

    def parse_Document(self, node):
        pass

    def parse_Text(self, node):
        pass

    def parse_Comment(self, node):
        pass

    def parse_Element(self, node):
        try:
            handlerMethod = getattr(self, "do_%s" % node.tagName)
        except AttributeError:
            return False
        handlerMethod(node)
        return True

class reptorParsing(xmlProcessing):
    """ Specific class for generating a SQLalchemy program to create tables
    and populate them with data"""

    def __init__(self):
        self.schemaPreface = StringIO()
        self.schemaPreface.write("""from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
engine = create_engine('sqlite:///tutorial.db', echo=False)
metadata = MetaData()
Base = declarative_base()""")
        self.schemaTables = StringIO()
        self.schemaFields = StringIO()
        self.dataUpdate = StringIO()
        self.tableDict = {}
        self.tableName = StringIO()
        self.tables = StringIO()

    def parse(self, parentNode):
        """Main entry point to begin processing a XML document"""
        self.parseBranch(parentNode)
        # Properties such as schemaTables and .tables are populated by the 
various methods below
        fupdate=open(os.path.join(os.getcwd(), "update.py"), 'w')
        if self.schemaTables:
            fupdate.write("import schema\n")
            f=open(os.path.join(os.getcwd(), "schema.py"), 'w')
            f.write(self.schemaPreface+"\n"+self.schemaTables+
                '\n' + "metadata.create_all(engine)\n"+
                "print 'hello 2'")
            f.close()
        if self.tables:
            fupdate.write(self.tables)
        fupdate.close()

    def do_TABLES(self, tableNode):
        """Process schema for tables"""
        for node in tableNode.childNodes:
            self.tableName = node.tagName
            # Define a declaritive mapping class
            self.schemaTables.write("""\nclass %s(Base):
    __tablename__ = '%s'
""" % (self.tableName, self.tableName))
            self.schemaFields = StringIO()
            # allow for userA = users("Billy","Bob") via a __init__()
            self.schemaInitPreface = StringIO()
            self.schemaInitPreface.write("    def __init__(self")
            self.schemaInitBody = StringIO()
            self.parseBranch(node)
            self.schemaInitPreface.write("):\n")
            self.schemaTables.write(self.schemaFields.read() + "\n" + \
                self.schemaInitPreface.read() + \
                self.schemaInitBody.read() + "\n")

    def do_FIELDS(self, fieldsNode):
        """Process schema for fields within tables"""
        for node in fieldsNode.childNodes:
            if self.schemaFields:
                self.schemaFields.write("\n")
            cType = ""
            # The attribute type holds the type of field
            crType = node.attributes["type"].value
            if crType==u"C":
                cType = "String(length=%s)" % node.attributes["len"].value
            elif crType==u"N" and node.attributes["dec"].value==u'0':
                cType = "Integer"
            elif crType==u"N":
                cType = "Numeric(precision=%s, scale=%s)" % 
(node.attributes["len"].value,node.attributes["dec"].value)
            elif crType==u"L":
                cType = "Boolean"
            elif crType==u"T":
                cType = "DateTime"
            elif crType==u"D":
                cType = "Date"
            elif crType==u"M" or crType==u"G":
                cType = "Text"

            if node.attributes.getNamedItem("primary"):
                cType += ", primary_key=True"
            self.schemaFields.write("    %s = Column(%s)" % (node.tagName, 
cType))
            self.schemaInitPreface.write(", \\\n        %s" % 
(node.tagName))
            self.schemaInitBody.write("            self.%s = %s\n" % 
(node.tagName, node.tagName))
            self.tableDict[self.tableName + "." + node.tagName] = crType

    def do_DATA(self, dataNode):
        """This is for processing actual data to be pushed into the tables

        Layout is DATA -> TABLE_NAME key='primary_field' -> TUPLE -> 
FIELD_NAME -> VALUE"""
        for node in dataNode.childNodes:
            self.tableName = node.tagName
            self.dataUpdate=open(os.path.join(os.getcwd(), self.tableName + 
"_update.py"), 'w')
            self.dataUpdate.write("""
import time
from datetime import *
from sqlalchemy import *
from sqlalchemy.orm import *
engine = create_engine('sqlite:///tutorial.db', echo=False)
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()
""")
            self.keyValue = ""
            self.keyField = node.attributes["key"].value
            self.parseBranch(node)
            self.tables.write("\nimport %s_update.py" % (self.tableName))
#            f.write(self.dataUpdate)
            self.dataUpdate.close()

    def do_TUPLE(self, tupleNode):
        """ A TUPLE is what the XML file refers to a table row
        Sits below a DATA child"""
        self.dataUpdate.write("""
entry = %s()
session.add(entry)
""" % (self.tableName))
        for node in tupleNode.childNodes:
            for dataNode in node.childNodes:
                crType = self.tableDict[self.tableName + "." + node.tagName]

                if crType==u"C" or crType==u"M":
                    cValue = u'"""%s"""' % dataNode.data
                elif crType==u"T":
                    cValue = 'datetime.strptime("'+dataNode.data+'", 
"%Y-%m-%d %H:%M")'
                elif crType==u"D":
                    cValue = 'datetime.strptime("'+dataNode.data+'", 
"%Y-%m-%d")'
                else:
                    cValue = dataNode.data
                self.dataUpdate.write(u"\nentry."+node.tagName+ u" = " + 
cValue)

        self.dataUpdate.write("\nsession.commit()")

if __name__ == '__main__':
    replicate = reptorParsing()
    replicate.process(filename=os.path.join(os.getcwd(), "request.xml"))
    import update