Not sure why this is filling my sys memory

Vincent Davis vincent at vincentdavis.net
Sat Feb 20 20:07:59 EST 2010


> Code is below. The files are about 5 MB and 230,000 rows each. There are 43
> of them, and by the time I get to the 35th (reading it in) my system gets so
> slow that it is nearly unusable. I am on a Mac, and Activity Monitor shows
> that Python is using 2.99 GB of memory (of 4 GB) (Python 2.6, 64-bit).
> getsizeof() returns 6424 bytes for alldata, so I am not sure what is
> happening.
> Any ideas?
> Thanks
>
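A note on the getsizeof() figure: sys.getsizeof() reports only the size of the container object itself, not the row lists and strings it refers to, so a dict holding hundreds of thousands of rows can still report a few kilobytes. A minimal sketch (the row shape and count here are invented for illustration, not measured from the real files):

import sys

rows = [['x'] * 20 for i in range(230000)]   # stand-in for one file's rows
container = {'file_data': rows}

print sys.getsizeof(container)               # the dict object only; its contents are not counted
print sys.getsizeof(rows)                    # the outer list only; the row lists are not counted
print sum(sys.getsizeof(r) for r in rows)    # adds the row lists, but still ignores the strings

The original script follows below.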
import csv, os, glob
import sys

def read_data_file(filename):
    reader = csv.reader(open(filename, "U"), delimiter='\t')

    data = []
    mask = []
    outliers = []
    modified = []

    data_append = data.append
    mask_append = mask.append
    outliers_append = outliers.append
    modified_append = modified.append

    maskcount = 0
    outliercount = 0
    modifiedcount = 0
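    # The three counters track which section of the file the loop below is in:
    # rows before any marker go to data, rows after '[MASKS]' go to mask,
    # rows after '[OUTLIERS]' to outliers, and rows after '[MODIFIED]' to modified.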

    for row in reader:
        if '[MASKS]' in row:
            maskcount += 1
        if '[OUTLIERS]' in row:
            outliercount += 1
        if '[MODIFIED]' in row:
            modifiedcount += 1
        if not any((maskcount, outliercount, modifiedcount, not row)):
            data_append(row)
        elif not any((outliercount, modifiedcount, not row)):
            mask_append(row)
        elif not any((modifiedcount, not row)):
            outliers_append(row)
        else:
            if row: modified_append(row)

    data = data[1:]
    mask = mask[3:]
    outliers = outliers[3:]
    modified = modified[3:]

    return [data, mask, outliers, modified]


def ImportDataFrom(folder):
    print 'Importing files from: ', folder
    alldata = dict()
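    # alldata maps '<filename>_data', '<filename>_mask', '<filename>_outliers'
    # and '<filename>_modified' to the corresponding row lists for every file,
    # so all parsed rows stay referenced here.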
    infolder = glob.glob( os.path.join(folder, '*.txt') )
    numfiles = len(infolder)
    print 'Importing ' + str(numfiles) + ' files from: ', folder
    for infile in infolder:
        print "Loading into memory: " + os.path.split(infile)[1]
        fname = os.path.split(infile)[1]

        filedata = dict(zip([fname + '_data', fname + '_mask',
                             fname + '_outliers', fname + '_modified'],
                            read_data_file(infile)))
        print fname + ' has ' + str(len(filedata[fname + '_data'])) + ' rows of data'
        print fname + ' has ' + str(len(filedata[fname + '_mask'])) + ' rows of masked data'
        print fname + ' has ' + str(len(filedata[fname + '_outliers'])) + ' rows of outliers'
        print fname + ' has ' + str(len(filedata[fname + '_modified'])) + ' modified rows of data'
        print str(sys.getsizeof(filedata)) + ' bytes of memory used for ' + fname
        print ' '
        alldata.update(filedata)
        print str(len(alldata)/4) + ' files of ' + str(numfiles) + ' using ' + str(sys.getsizeof(alldata)) + ' bytes of memory'
    return alldata


ImportDataFrom("/Users/vmd/Dropbox/dna/data/rawdata")