CSV performance

Scott David Daniels Scott.Daniels at Acm.Org
Mon Apr 27 14:20:50 EDT 2009


psaffrey at googlemail.com wrote:
> Thanks for your replies. Many apologies for not including the right
> information first time around. More information is below....
Here is another way to try (untested):

import numpy
import time

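# Map each chromosome name to a single character, so a whole column of
# chromosome labels can be carried around as one compact string.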
chrommap = dict(chrY='y', chrX='x', chr13='c', chr12='b', chr11='a',
                 chr10='0', chr17='g', chr16='f', chr15='e', chr14='d',
                 chr19='i', chr18='h', chrM='m', chr22='l', chr20='j',
                 chr21='k', chr7='7', chr6='6', chr5='5', chr4='4',
                 chr3='3', chr2='2', chr1='1', chr9='9', chr8='8')

def consume_file(file_name, chunks):
    # Accumulate rows in plain lists and convert to numpy arrays every
    # `chunks` rows, so the Python lists never grow too large.
    lx = []   # one mapped chromosome letter per row
    cx = []   # coordinates
    px = []   # data points
    block = []
    with open(file_name) as fh:
        for line in fh:
            chrom, coord, point = line.split()
            lx.append(chrommap[chrom])
            cx.append(coord)
            px.append(point)
            if len(cx) >= chunks:
                block.append(''.join(lx))
                block.append(numpy.array(cx, dtype=int))
                block.append(numpy.array(px, dtype=float))
                lx = []
                cx = []
                px = []
        if lx:  # flush any remaining partial chunk
            block.append(''.join(lx))
            block.append(numpy.array(cx, dtype=int))
            block.append(numpy.array(px, dtype=float))

    # block holds [letters, coords, points, letters, coords, points, ...]
    return (''.join(block[0::3]),
            numpy.concatenate(block[1::3]),
            numpy.concatenate(block[2::3]))


# The following repeats 128, to avoid initial read issues.
# Treat the diff twixt the two 128s as read overhead.
for CHUNKS in 128, 128, 256, 1024, 4096, 16384:
    t0 = time.clock()
    letters, coords, points = consume_file("largefile.txt", CHUNKS)
    t1 = time.clock()
    print "finished %s rows in %s chunks: %.2f" % (
        len(letters), CHUNKS, t1 - t0)
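
The packed letter string lines up element-for-element with the two arrays,
so a single chromosome can be pulled out with a boolean mask.  Another
untested sketch, using the chrommap above:

letters, coords, points = consume_file("largefile.txt", 4096)
# One single-character string per row, aligned with coords and points.
mask = numpy.array(list(letters)) == chrommap['chr1']
chr1_coords = coords[mask]
chr1_points = points[mask]
print "chr1 rows:", len(chr1_coords)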


--Scott David Daniels
Scott.Daniels at Acm.Org


