[Tutor] MapReduce

Kent Johnson kent37 at tds.net
Tue Feb 6 12:12:49 CET 2007


Steve Nelson wrote:
> On 2/5/07, Kent Johnson <kent37 at tds.net> wrote:
>> You can also do this operation easily with dicts (not tested!):
> 
> Thank you - code now complete and tests passing.  Would appreciate
> comments / criticisms.  I did wonder if I should create a UrlAnalyser
> Class rather than have hanging methods:

NOOOOOOOO! Python is not Java! There is nothing wrong with a "hanging 
method". In Python we call them functions. :-)

The code looks fine to me.

Kent

> 
> #!/usr/bin/python
> import unittest
> 
> def myMap(data, search):
>   """Take list of tuples of record number and url accessed and return
> list of tuples keyed by url with record number as value if search is
> in the url"""
>   return [(value, key) for key, value in data if search in value]
> 
> def myGroup(data):
>   """Take list of tuples keyed by url with record number as value, and
> group together same urls with list of record numbers as value."""
>   groups = {}
>   for value, index in data:
>     groups.setdefault(value, []).append(index)
>   return sorted(groups.items())
> 
> def myReduce(data):
>   """Process list of tuples of url and record number list and return
> list of url and frequency of occurence."""
>   return [(value, len(occurences)) for value, occurences in data]
> 
> class UnitTests(unittest.TestCase):
>   """Do not taunt unit tests."""
> 
>   def setUp(self):
>     pass
> 
>   def tearDown(self):
>     pass
> 
>   def testMapper(self):
>     """Produce set of intermediate key value pairs, with record
> content as key and record number as value, if a condition is met."""
>     test_pairs = [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'a'), (5, 'd')]
>     intermediate_list = [('a', 1), ('a', 4)]
>     self.assertEqual(myMap(test_pairs, "a"), intermediate_list)
> 
>   def testGrouper(self):
>     """Group occurences of a record together:
> 
>     [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)] -> [(fred, 1),
> ('jim', [2, 4]), ('bill' ,3)]"""
> 
>     test_list = [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)]
>     grouped_list = [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
>     self.assertEqual(myGroup(test_list), grouped_list)
> 
>   def testReduce(self):
>     """Aggregate results of map and group functions to produce value
> and frequency."""
>     test_intermediate =  [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
>     test_summary = [('bill', 1), ('fred', 1), ('jim', 2)]
>     self.assertEqual(myReduce(test_intermediate), test_summary)
> 
> def doTests():
>   """Run our test suite"""
>   suite = unittest.makeSuite(UnitTests,'test')
>   runner = unittest.TextTestRunner()
>   result = runner.run(suite)
>   return result
> 
> def main():
>   """Main program here"""
>   print "Analysing URL data:\n"
>   url_data = [(1, 'http://www.beer.com'), (2,
> 'http://www.ban-beer.com'), (3, 'http://www.bbc.co.uk'), (4,
> 'http://www.beer.com'), (5, 'http://wwww.kernel.org')]
>   print myReduce(myGroup(myMap(url_data, "beer")))
> 
> if __name__ == "__main__":
>   result = doTests()
>   if result.wasSuccessful():
>     main()
>   else:
>     print "Error - check test output."
> 
> 




More information about the Tutor mailing list