Flat file to associative array.

Tue Jul 22 09:59:19 EDT 2003

>>>>> "Alex" == Alex  <bogus at antispam.com> writes:

    Alex> Hello, What would be the best way to convert a flat file of
    Alex> products into a multi-dimensional array? The flat file looks
    Alex> like this:

    Alex> Beer|Molson|Dry|4.50 Beer|Molson|Export|4.50
    Alex> Shot|Scotch|Macallan18|18.50 Shot|Regular|Jameson|3.00

I work with CSV files a lot, and do something similar.  I have written
some helper functions and classes that allow you to slice through flat
files as a 2D array, where the indices of the slices can either be
normal integer indices, or string keys, as in 

   from csvhelpers import getrows, rows_to_array
   #split the data on the | symbol
   def splitpipe(x):
       if len(x.replace('|','').strip())==0: return [] # empty row
       return x.split('|')

   rows = getrows( file('beer.flat'), splitpipe)
   headers=('category', 'brand', 'type', 'price')
   data = rows_to_array( rows,  headers)

   print data[:,'price']  # prints all the prices (as floats)
   print data[0]          # prints the first row
   print data             # prints everything

The getrows function does some extra work: for example it ignores
empty rows, tries to convert the individual data cells to int then
float, and ignores comments if you pass it an extra comment arg as in

   rows = getrows( file('beer.flat'), splitpipe, comment='#')

With this call, your flat file can look like

   Beer|Molson|Dry|4.50
   Beer|Molson|Export|4.50

   # now for some booze
   Shot|Scotch|Macallan18|18.50
   Shot|Regular|Jameson|3.00

For many of the CSV files I work with, there are headers in the first
non empty row of the file.  If you pass 'headers=1', rows_to_array
will get the headers and allow you to index the array with the header
strings.  Also, I often have a unique key for each row, like a medical
ID number.  An optional arg rowKeyInd lets you index the data by row
key string as well.  Here is an example CSV file containing stock
quote information.  The row keys are in the first column

   ,,,
   ticker,date,open,price
   ADPT,2001-12-13,100.0,20.0
   ,,,
   QQQ,2001-12-13,100.0,103.0
   AAA,2001-12-13,100.0,10.2
   PPP,2001-12-13,100.0,1.3

Note that the ,,, thing is fairly typical of CSV files that started
their life as an Excel spreadsheet.  They represent the empty rows.
The first nonempty row contains the headers, and the first column the
row keys.

   # use csv module in real life to handle legit commas in fields, etc..
   def splitcomma(x):
       if len(x.replace(',','').strip())==0: return [] # empty row
       return  x.split(',')

   rows = getrows( file('test.csv'), splitcomma)
   data = rows_to_array( rows, headers=1, rowKeyInd=0)

   print data[:,'open']
   print data[:,'price']
   print data[:,'date']
   print data['ADPT',:]

I am using Numeric arrays under the hood, and data[:,'open] and
data[:,'price'] are Numeric arrays of Floats.  Thus you can do things
like

    change = data[:,'price'] / data[:,'open']

You can also use any valid slice, as in

  data[1:3,3]  

Hope this helps,
John Hunter

# csvhelpers.py
# Requires Numeric: http://pfdubois.com/numpy/ and python2.2
from __future__ import generators
from Numeric import array, Int, Float 

def iterable(obj):
   try: iter(obj)
   except: return 0
   return 1

def enumerate(seq):
     for i in range(len(seq)):
         yield i, seq[i]

def is_string_like(obj):
   try: obj + ''
   except (TypeError, ValueError): return 0
   return 1

class rows_to_array:
    def __init__(self, rows, headers, rowKeyInd=None):
        """
        rows is a generator that produces the rows of the flatfile,
        optionally including the header row.  See getrows.

        If headers is iterable, then it is a list containing the
        headers.  otherwise use the first row for the headers.
        headers must be unique

        rowKeyInd, if not None, is the column index of the row key so
        that rows can be identified by key
        """

        if not iterable(headers):
            headers = [ h.strip() for h in rows.next()]

        # make dict from header to col index
        self.headerd = {}
        for i,h in enumerate(headers):
            key = h.strip()
            if self.headerd.has_key(key):
                raise RuntimeError, 'Headers must be unique.  ' + \
                      'Found duplicate: ' +  key
            self.headerd[key] = i

        data = [row for row in rows]                

        # use the first row to determine types; this should be improved
        self.types = []
        for val in data[0]:
            if isinstance(val, int):
                self.types.append(Int)
            elif isinstance(val, float):
                self.types.append(Float)
            else:
                self.types.append('O')

        if rowKeyInd is not None:
            self.rowKeys = {}
            for i,row in enumerate(data):
                self.rowKeys[row[rowKeyInd]] = i
        self.data = array(data, typecode='O')

    def __getitem__(self, key):

        try:
            rowKey, colKey = key
        except TypeError:
            # row index only
            if is_string_like(key):
                rowSlice = self.rowKeys[key]
            else:
                rowSlice = key
            return self.data[rowSlice]

        type = None
        if is_string_like(rowKey):
            rowSlice = self.rowKeys[rowKey]
        else:
            rowSlice = rowKey
        if is_string_like(colKey):
            colSlice = self.headerd[colKey]
            type = self.types[colSlice]
        else:
            colSlice = colKey

        ret = self.data[rowSlice, colSlice]

        if type is not None:  return ret.astype(type)
        else: return ret

    def __str__(self):
        s = str(self.headerd.keys())  + '\n'
        s += str(self.data)
        return s

def try_to_num(val):
    try: return int(val)
    except ValueError:
        try: return float(val)
        except ValueError:
            return val

def getrows(fh, parseline, comment=None):
    """
    return the rows of the data in file object fh, converting to int
    or float if possible

    parseline returns the row as a list, ie, splits ths row.
    An empty row should be a list of length 0

    if comment is not None, ignore lines starting with comment symbol
    """
    while 1:
        line = fh.readline()
        if comment is not None and line.startswith(comment):
            continue
        if len(line)==0: return 
        if len(line.strip())==0: continue
        vals = parseline(line)
        if len(vals):
            maxLen = max([len(entry) for entry in vals])
            if maxLen==0: continue
            yield [ try_to_num(val) for val in vals]