find max and min values from a column of a csv file

Thu Jul 7 08:33:25 EDT 2011

prakash jp wrote:

> Could any one help to get max and min values from a specified column of a
> csv file. The* csv file is large* and hence the below code did go bad.
> *Alternate
> methods would be highly appreciated
> **
> minimum.py*:
> import csv
> filename = "testLog_4.csv"
> f = open(filename)
> def col_min(mincname):
>     with open(filename, 'rb') as f:
>         reader = csv.reader(f, delimiter=",")
>         #print reader
>         #print mincname
>         col_index = next(reader).index(mincname)
>         #print col_index
>         least = min(rec[col_index] for rec in reader)
>     print least
> 
> col_min(str("Server Latency"))
> col_min(str("Client Latency"))
> f.close()

I don't see a way to make this much faster as it is probably already i/o-
bound. You can try and calculate all four extrema at once, so you have to 
iterate over the file only once:

$ cat csv_minmax_chunked.py 
#!/usr/bin/env python       
import csv                  
import locale               
from itertools import islice

def make_key(index, type):
    def key(row):         
        return type(row[index])
    return key                 

class Aggregator(object):
    def __init__(self, name, index, type, key=None):
        self.name = name                            
        self.index = index                          
        self.type = type or float                   
        self.getvalue = make_key(index, type)       
        self.key = {"key": key} if key is not None else {}
    def __repr__(self):                                   
        return "{0}(name={1}, index={2}, type={3}".format(
            type(self).__name__, self.name, self.index, self.type)
    def __str__(self):                                            
        try:                                                      
            return "%s: %s...%s" % (self.name, self.min, self.max)
        except AttributeError:                                    
            return "%s: (no values)" % self.name                  
    def update(self, rows):                                       
        values = map(self.getvalue, rows)                         
        chunkmin = min(values, **self.key)                        
        chunkmax = max(values, **self.key)                        
        try:                                                      
            curmin = self.min                                     
            curmax = self.max                                     
        except AttributeError:                                    
            self.min = chunkmin                                   
            self.max = chunkmax                                   
        else:                                                     
            self.min = min(curmin, chunkmin, **self.key)          
            self.max = max(curmax, chunkmax, **self.key)          

def identity(s):
    return s    

convert = {
    "float": (float,),
    "int": (int,),    
    "text": (lambda s: s, locale.strxfrm),
    "": (identity,)
    }

def main():
    locale.setlocale(locale.LC_ALL, "")

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("csvfile")
    parser.add_argument("columns", nargs="+")
    parser.add_argument("--chunksize", "-s", type=int, default=2**12)
    args = parser.parse_args()

    ntpairs = (column.partition(":")[::2] for column in args.columns)
    with open(args.csvfile, "rb") as instream:
        rows = csv.reader(instream)
        header = dict((name, index) for index, name in 
enumerate(next(rows)))
        aggregators = [
            Aggregator(
                "{0}({1})".format(name, type or "str"),
                header[name],
                *convert[type])
            for name, type in ntpairs]
        chunksize = args.chunksize
        while True:
            chunk = list(islice(rows, chunksize))
            if not chunk:
                break
            for agg in aggregators:
                agg.update(chunk)
    for agg in aggregators:
        print agg

if __name__ == "__main__":
    main()
$ head sample.csv
alpha,beta,gamma,delta
69,-454,460960,ELIJAH
92,796,278885,SUFFICIENCY
42,895,934523,PIONEERED
88,-213,360633,reassert
63,-518,327010,mcmahon
84,604,540764,deprecatory
83,117,621832,VEGETARIANISM'S
61,411,844243,tan
29,-147,982043,plushest
$ ./csv_minmax_chunked.py sample.csv alpha:int beta:int beta gamma:float 
delta delta:text
alpha(int): 0...99
beta(int): -1000...999
beta(str): -1...999
gamma(float): 0.0...999997.0
delta(str): AALIYAH...études
delta(text): a...ZYUGANOV'S