Elegant solution needed: Data manipulation
Mark
Aristotle_00 at yahoo.com
Thu Jan 31 18:54:38 EST 2002
What follows is what I came up with from reading the re stuff
overnight and toying around a bit. I hope it doesn't fall into the
Python "anti-idiom" trap. I'm definitely open to a communal "code
review".
Regards,
Mark
from operator import *
import random
import re
#
#just a helper function to make up toy data
#
def randlist(dist, n):
    """Return a list of ``n`` rounded draws from ``dist``.

    dist -- either a string holding a Python expression that is evaluated
            afresh for every draw (e.g. "random.gauss(6,1)"), or a
            zero-argument callable that returns one draw per call.
    n    -- number of draws to make; 0 yields an empty list.

    Each draw is passed through round() before being stored.
    """
    s = []
    if callable(dist):
        for i in range(n):
            s.append(round(dist()))
        return s
    # NOTE(security): eval() runs arbitrary code. Only pass expression
    # strings that come from a trusted source; prefer the callable form.
    # The expression is evaluated in this function's scope, so the local
    # names (dist, n, s, i) are kept as they were for any string that
    # happens to refer to them.
    for i in range(n):
        s.append(round(eval(dist)))
    return s
#
#a printer helper function
#
def myprint(in_data, nrows=None):
    """Print the columns of ``in_data`` as a simple table on stdout.

    in_data -- mapping of column name -> indexable sequence of values.
    nrows   -- number of rows to print; when omitted, the module-level
               ``numobs`` is used.

    The first output line holds the column names, followed by one line per
    row index. Every cell is written as the value, one space and two tab
    characters. Indexing a column that is shorter than the row count raises
    IndexError.
    """
    if nrows is None:
        # Fall back to the script-wide observation count.
        nrows = numobs
    names = list(in_data)
    # A single-argument print(...) produces the same output whether print is
    # a statement or a function, so each line is assembled first.
    print("".join(["%s \t\t" % (name,) for name in names]))
    for row in range(nrows):
        print("".join(["%s \t\t" % (in_data[name][row],) for name in names]))
#
#we'll assume these are available from another file that feeds the program
#that produces the equation (this is 100% reasonable for the ops list and 80%
#reasonable for the vars list ... regardless, we could make a re or do
#list processing to get the var names ... I did it with list comprehensions)
#
# Number of observations (rows) in every data column.
numobs = 5
# Operator names that may appear in the equation; they are expected to be
# resolvable as functions when the rewritten equation is evaluated.
ops = ["add", "mul"]
# NOTE: `vars` shadows the builtin of the same name. The name is kept
# because later statements in this script refer to it.
vars = ["shoesize", "height", "weight"]
#
#we'll make up some temporary data here: one column of numobs rounded
#Gaussian draws per variable name
#
data = {}
for i in vars:
    data[i] = randlist("random.gauss(6,1)", numobs)
#
#print out the data so we can do a before/after comparison
#
myprint(data)
#
#read in the trouble making equation
#
# Only the first line of the equation file is used. The context manager
# closes the handle even if the read fails.
with open("newdata0.names", "r") as f:
    x = f.readline()
# Single-argument print(...) behaves the same as the print statement.
print("equation:")
print(x)
regexps = {}
replacements = {}
#
#the following:
# 1. creates a regular expression for the ith operator ("add (", "add(" etc)
# 2. creates a replacement string for this operator ("map( add , " etc)
# 3. performs the search and replace
#each result is put back into the original string, so by the time the loop
#is done the string has been modified by every replacement
#
for i in ops:
    # Raw strings keep "\(" a regex escape rather than a string escape.
    # re.escape plus \b match the operator name literally and only as a
    # whole word; \s* accepts any spacing (including none) before the "(".
    regexps[i] = re.compile(r"\b" + re.escape(i) + r"\s*\(")
    replacements[i] = "map( " + i + " , "
    x = regexps[i].sub(replacements[i], x)
regexps = {}
replacements = {}
#
#similar format to that listed above
#except we're taking feature1 to data["feature1"]
#
for i in vars:
    # Whole-word, literal match: a variable name embedded in a longer
    # identifier is left untouched.
    regexps[i] = re.compile(r"\b" + re.escape(i) + r"\b")
    replacements[i] = "data[\"" + i + "\"]"
    x = regexps[i].sub(replacements[i], x)
print("executable string:")
print(x)
#
#with more than one equation string in the equation file, this would be
#within a large outer loop over all of those strings and we'd have to come
#up with some naming scheme for the new columns
#
# NOTE(security): eval() executes whatever expression x holds, and x was
# built from text read from a file. Only run this on equation files you
# trust.
# list(...) materialises the result so myprint can index it even where
# map() returns a lazy iterator, and so the new column never aliases an
# existing one.
data["new"] = list(eval(x))
#
#do the "after" printout
#
myprint(data)
More information about the Python-list
mailing list