Elegant solution needed: Data manipulation

Thu Jan 31 18:54:38 EST 2002

What follows is what I came up with from reading the re stuff
overnight and toying around a bit.  I hope it doesn't fall into the
Python "anti-idiom" trap.  I'm definitely open to a communal "code
review".
Regards,
Mark

from operator import *
import random
import re
#
#just a helper function to make up toy data
#
def randlist(dist, n):
    """Return a list of n rounded samples drawn from dist.

    dist is either
      * a string holding a Python expression, which is eval()'d once per
        sample (for example "random.gauss(6,1)"), or
      * a zero-argument callable, which is called once per sample.
    Passing a callable avoids eval() altogether.

    n is the number of samples; the result is a list of n values, each
    passed through round().

    NOTE: a string dist is executed with eval(); never pass text that
    comes from an untrusted source.
    """
    s = []
    for i in range(n):
        #a fresh sample is drawn on every pass through the loop
        if callable(dist):
            s.append(round(dist()))
        else:
            s.append(round(eval(dist)))
    return s
#
#a printer helper function
#
def myprint(in_data):
    """Print in_data as a table: one header line, then one line per row.

    in_data maps a column name to an indexable sequence of values.
    Every cell is written as str(value) followed by a space and two
    tabs, and every line ends with a newline.

    The number of rows is taken from the data itself (the length of the
    shortest column) instead of from a module-level counter, so the
    function can be used on any dictionary of columns.  An empty
    dictionary prints just the (empty) header line.
    """
    #take the keys once so the header and the rows use the same order
    names = list(in_data.keys())
    columns = [in_data[name] for name in names]

    #each line is assembled first and printed with a single-argument
    #print(...), which produces the same output on Python 2 and Python 3
    print("".join(["%s \t\t" % (name,) for name in names]))

    if columns:
        nrows = min([len(column) for column in columns])
    else:
        nrows = 0

    for row in range(nrows):
        print("".join(["%s \t\t" % (column[row],) for column in columns]))

#
#we'll assume these are available from another file that feeds the program
#that produces the equation (this is 100% reasonable for the ops list and
#80% reasonable for the vars list ... regardless, we could make a re or do
#list processing to get the var names ... I did it with list comprehensions)
#
#number of observations (rows) generated for every variable
numobs = 5
#names of the operator functions that may appear in the equation; the
#functions themselves come from "from operator import *"
ops = ["add", "mul"]
#names of the variables (columns); each one becomes a key of data
#NOTE(review): this name shadows the builtin vars() -- consider renaming
vars = ["shoesize", "height", "weight"]

#
#we'll make up some temporary data here
#
#one column of numobs gaussian draws per variable name
data = dict([(name, randlist("random.gauss(6,1)", numobs)) for name in vars])

#
#show the table as generated, so it can be compared with the printout
#made once the computed column has been added
#
myprint(data)

#
#read in the trouble making equation    
#
#only the first line of the file is used as the equation; try/finally
#makes sure the file is closed again even if the read fails
f = open("newdata0.names","r")
try:
    x = f.readline()
finally:
    f.close()

#x still carries its trailing newline, so a blank line follows it
print("equation:")
print(x)

regexps = {}
replacements = {}
#
#the following:
#  1. creates a regular expression for the ith operator: the operator
#     name as a whole word, optional whitespace, then "(" -- so both
#     "add(" and "add (" are recognised
#  2. creates a replacement string for this operator "map( add , " etc
#  3. performs the search and destroy (err, replace)
#each result is put back into the original string, so by the time the
#loop is done x has been modified by every one of the replacements
#
for i in ops:
    #raw strings keep the regex escapes intact, re.escape() protects any
    #special character in the name, and \b stops "add" from matching at
    #the end of a longer identifier such as "madd"
    regexps[i] = re.compile(r"\b" + re.escape(i) + r"\s*\(")
    replacements[i] = "map( " + i + " , "
    x = regexps[i].sub(replacements[i], x)

regexps = {}
replacements = {}
#
#similar format to that listed above
#except we're taking "feature1" to "data["feature1"]"
#
for i in vars:
    #match the name only as a whole word: without the \b anchors a short
    #name would also be replaced inside longer identifiers (or inside
    #"map" / "data" themselves); re.escape() keeps the name literal
    #NOTE: the passes run one after another, so a variable literally
    #named "data" would still collide with the inserted data[...] text
    regexps[i] = re.compile(r"\b" + re.escape(i) + r"\b")
    replacements[i] = "data[\"" + i + "\"]"
    x = regexps[i].sub(replacements[i], x)

print("executable string:")
print(x)

#
#with more than one equation string in the equation file, this would be
#within a large outer loop over all of those strings and we'd have to come
#up with some naming scheme for the new columns
#
#evaluate the rewritten equation string and store its value as the
#column "new"
#NOTE(review): eval() executes whatever expression was read from the
#equation file, with access to every name in this module -- only run
#this on equation files you trust
data["new"] = eval(x)

#
#do the "after" printout
#
myprint(data)