[Tutor] MapReduce
Steve Nelson
sanelson at gmail.com
Tue Feb 6 09:22:30 CET 2007
On 2/5/07, Kent Johnson <kent37 at tds.net> wrote:
> You can also do this operation easily with dicts (not tested!):
Thank you - the code is now complete and the tests are passing. I would
appreciate comments / criticisms. I did wonder if I should create a
UrlAnalyser class rather than have free-standing module-level functions:
#!/usr/bin/python
import unittest
def myMap(data, search):
    """Select the records whose url contains *search*, re-keyed by url.

    data is an iterable of (record_number, url) pairs and search is the
    substring to look for in each url.

    Returns a list of (url, record_number) tuples, in input order, one
    for every pair whose url contains search.
    """
    # Swap each pair so the url becomes the key used by the grouping step.
    return [(url, record) for record, url in data if search in url]
def myGroup(data):
    """Collect the record numbers seen for each url.

    data is an iterable of (url, record_number) pairs.

    Returns a list of (url, [record_number, ...]) tuples sorted by url;
    within each list the record numbers keep their input order.
    """
    groups = {}
    for url, record in data:
        # setdefault creates the empty list the first time a url is seen.
        groups.setdefault(url, []).append(record)
    return sorted(groups.items())
def myReduce(data):
    """Turn each url's list of record numbers into an occurrence count.

    data is an iterable of (url, record_number_list) pairs.

    Returns a list of (url, frequency) tuples in input order, where
    frequency is the length of that url's record number list.
    """
    return [(url, len(records)) for url, records in data]
class UnitTests(unittest.TestCase):
    """Do not taunt unit tests.

    Covers the three pipeline stages: myMap, myGroup and myReduce.
    No fixtures are needed, so the inherited no-op setUp/tearDown are used.
    """

    def testMapper(self):
        """Produce the intermediate key/value pairs: record content as
        key and record number as value, kept only if the search string
        occurs in the content."""
        test_pairs = [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'a'), (5, 'd')]
        intermediate_list = [('a', 1), ('a', 4)]
        self.assertEqual(myMap(test_pairs, "a"), intermediate_list)

    def testGrouper(self):
        """Group occurrences of a record together, sorted by key:
        [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)] ->
        [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]"""
        test_list = [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)]
        grouped_list = [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
        self.assertEqual(myGroup(test_list), grouped_list)

    def testReduce(self):
        """Aggregate the grouped results to produce each value and its
        frequency."""
        test_intermediate = [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
        test_summary = [('bill', 1), ('fred', 1), ('jim', 2)]
        self.assertEqual(myReduce(test_intermediate), test_summary)
def doTests():
    """Run the UnitTests suite and return the result object.

    The suite is run with a unittest.TextTestRunner; the returned value
    is whatever runner.run() produces, so callers can query
    wasSuccessful() on it.
    """
    # unittest.makeSuite was deprecated in Python 3.11 and removed in 3.13.
    # TestLoader is available on Python 2 and 3 alike, and its default
    # method prefix is 'test', matching the prefix passed previously.
    suite = unittest.TestLoader().loadTestsFromTestCase(UnitTests)
    runner = unittest.TextTestRunner()
    return runner.run(suite)
def main():
    """Print how often each url containing "beer" occurs in sample data.

    Feeds a hard-coded list of (record_number, url) pairs through
    myMap, myGroup and myReduce and prints the resulting list.
    """
    # A single parenthesised argument prints the same text whether print
    # is the Python 2 statement or the Python 3 function.
    print("Analysing URL data:\n")
    url_data = [
        (1, 'http://www.beer.com'),
        (2, 'http://www.ban-beer.com'),
        (3, 'http://www.bbc.co.uk'),
        (4, 'http://www.beer.com'),
        (5, 'http://wwww.kernel.org'),
    ]
    print(myReduce(myGroup(myMap(url_data, "beer"))))
if __name__ == "__main__":
    # Run the self-tests first; the demo only runs if they all pass.
    result = doTests()
    if result.wasSuccessful():
        main()
    else:
        # Parenthesised single argument works as both the Python 2 print
        # statement and the Python 3 print function.
        print("Error - check test output.")
More information about the Tutor
mailing list