[Edu-sig] [edupython] Python in Education Advocacy Article
Toby Donaldson
tjd at sfu.ca
Tue Mar 27 00:51:13 CEST 2007
Michael,
On the sigcse list there was recently a coding challenge
(http://www.cs.duke.edu/csed/code) that asked for solutions to a word
frequency problem in various languages. I believe the author is
planning to eventually list all the solutions received (entries came
in many different languages); that could make for some interesting
comparisons.
I wrote one solution in Python, and one in Java, which I give below;
needless to say, I found the Python version far easier to write.
Toby
-------------------------------------------------------------------------------------------------------------------------------
import sys
if __name__ == '__main__':
# get the path from the command-line
fname = sys.argv[1]
# read in the file as a list of tokens
tokens = [tok.strip().lower() for tok in open(fname, 'r').read().split()]
# calculate the frequency of each token
freq = {}
for tok in tokens:
if tok in freq:
freq[tok] += 1
else:
freq[tok] = 1
# Sort the list in highest frequency to lowest frequency,
# with ties sorted by lexicographics order of the words.
# Uses the Python sort-decorate-unsort idiom. We sort by
# the negation of the frequency values to get the proper
# ordering.
lst = [(-freq[tok], tok) for tok in freq] # decorate
lst.sort() # sort
lst = [(-freq, tok) for freq, tok in lst] # undecorate
# print the results
for freq, tok in lst:
print '%s\t%s' % (freq, tok)
-------------------------------------------------------------------------------------------------------------------------------
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.TreeSet;
public class Puzzle {
public static void main(String[] args) throws IOException {
// get the file to process
String fname = args[0];
Scanner sc = new Scanner(new File(fname));
// initialize the map for the words and counts;
// a TreeMap is always ordered by keys
TreeMap<String, Integer> map = new TreeMap<String, Integer>();
// process the file a line at a time
while (sc.hasNextLine()) {
// chop each line into its constituent tokens
// "\\s+" is a regular expression that matches
one or more
// whitespace characters
String[] tokens = sc.nextLine().split("\\s+");
// make all the strings lower case, and remove
any excess whitespace
for (int i = 0; i < tokens.length; ++i) {
tokens[i] = tokens[i].toLowerCase().trim();
}
// add each token to the map
for (String tok : tokens) {
if (map.containsKey(tok)) {
map.put(tok, map.get(tok) + 1);
} else {
map.put(tok, 1);
}
}
}
// remove the empty string if it is present
map.remove("");
// sort the data by storing each word that occurs the
same number of
// times in a TreeMap of sets keyed by count; TreeSet stores its
// values in sorted order
TreeMap<Integer, TreeSet<String>> sortMap = new TreeMap<Integer,
TreeSet<String>>();
for (String tok : map.keySet()) {
int count = map.get(tok);
if (sortMap.containsKey(count)) {
TreeSet<String> arr = sortMap.get(count);
arr.add(tok);
sortMap.put(count, arr);
} else {
TreeSet<String> arr = new TreeSet<String>();
arr.add(tok);
sortMap.put(count, arr);
}
}
// print the data
// first reverse the keys to print data in the proper order
ArrayList<Integer> idx = new ArrayList<Integer>();
idx.addAll(sortMap.keySet());
Collections.reverse(idx);
// print it to stdout
for (Integer key : idx) {
TreeSet<String> toks = sortMap.get(key);
for (String t : toks) {
System.out.printf("%s\t%s\n", key, t);
}
}
}
}
More information about the Edu-sig
mailing list