find max and min values from a column of a csv file
Peter Otten
__peter__ at web.de
Thu Jul 7 08:33:25 EDT 2011
prakash jp wrote:
> Could any one help to get max and min values from a specified column of a
> csv file. The* csv file is large* and hence the below code did go bad.
> *Alternate
> methods would be highly appreciated
> **
> minimum.py*:
> import csv
> filename = "testLog_4.csv"
> f = open(filename)
> def col_min(mincname):
> with open(filename, 'rb') as f:
> reader = csv.reader(f, delimiter=",")
> #print reader
> #print mincname
> col_index = next(reader).index(mincname)
> #print col_index
> least = min(rec[col_index] for rec in reader)
> print least
>
> col_min(str("Server Latency"))
> col_min(str("Client Latency"))
> f.close()
I don't see a way to make this much faster as it is probably already i/o-
bound. You can try and calculate all four extrema at once, so you have to
iterate over the file only once:
$ cat csv_minmax_chunked.py
#!/usr/bin/env python
import csv
import locale
from itertools import islice
def make_key(index, type):
def key(row):
return type(row[index])
return key
class Aggregator(object):
def __init__(self, name, index, type, key=None):
self.name = name
self.index = index
self.type = type or float
self.getvalue = make_key(index, type)
self.key = {"key": key} if key is not None else {}
def __repr__(self):
return "{0}(name={1}, index={2}, type={3}".format(
type(self).__name__, self.name, self.index, self.type)
def __str__(self):
try:
return "%s: %s...%s" % (self.name, self.min, self.max)
except AttributeError:
return "%s: (no values)" % self.name
def update(self, rows):
values = map(self.getvalue, rows)
chunkmin = min(values, **self.key)
chunkmax = max(values, **self.key)
try:
curmin = self.min
curmax = self.max
except AttributeError:
self.min = chunkmin
self.max = chunkmax
else:
self.min = min(curmin, chunkmin, **self.key)
self.max = max(curmax, chunkmax, **self.key)
def identity(s):
return s
convert = {
"float": (float,),
"int": (int,),
"text": (lambda s: s, locale.strxfrm),
"": (identity,)
}
def main():
locale.setlocale(locale.LC_ALL, "")
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("csvfile")
parser.add_argument("columns", nargs="+")
parser.add_argument("--chunksize", "-s", type=int, default=2**12)
args = parser.parse_args()
ntpairs = (column.partition(":")[::2] for column in args.columns)
with open(args.csvfile, "rb") as instream:
rows = csv.reader(instream)
header = dict((name, index) for index, name in
enumerate(next(rows)))
aggregators = [
Aggregator(
"{0}({1})".format(name, type or "str"),
header[name],
*convert[type])
for name, type in ntpairs]
chunksize = args.chunksize
while True:
chunk = list(islice(rows, chunksize))
if not chunk:
break
for agg in aggregators:
agg.update(chunk)
for agg in aggregators:
print agg
if __name__ == "__main__":
main()
$ head sample.csv
alpha,beta,gamma,delta
69,-454,460960,ELIJAH
92,796,278885,SUFFICIENCY
42,895,934523,PIONEERED
88,-213,360633,reassert
63,-518,327010,mcmahon
84,604,540764,deprecatory
83,117,621832,VEGETARIANISM'S
61,411,844243,tan
29,-147,982043,plushest
$ ./csv_minmax_chunked.py sample.csv alpha:int beta:int beta gamma:float
delta delta:text
alpha(int): 0...99
beta(int): -1000...999
beta(str): -1...999
gamma(float): 0.0...999997.0
delta(str): AALIYAH...études
delta(text): a...ZYUGANOV'S
More information about the Python-list
mailing list