[Tutor] MapReduce

Steve Nelson sanelson at gmail.com
Tue Feb 6 09:22:30 CET 2007


On 2/5/07, Kent Johnson <kent37 at tds.net> wrote:
> You can also do this operation easily with dicts (not tested!):

Thank you - the code is now complete and the tests are passing.  I would
appreciate comments / criticisms.  I did wonder if I should create a
UrlAnalyser class rather than have free-standing functions:

#!/usr/bin/python
import unittest

def myMap(data, search):
  """Invert (record number, url) pairs for every url containing search.

  data is an iterable of (record number, url) tuples and search is the
  text to look for.  Returns a list of (url, record number) tuples, in
  input order, for just those urls in which search occurs.
  """
  matches = []
  for record_number, url in data:
    if search in url:
      matches.append((url, record_number))
  return matches

def myGroup(data):
  """Collect the record numbers seen for each url.

  data is an iterable of (url, record number) tuples.  Returns a sorted
  list of (url, [record numbers]) tuples, one per distinct url, with each
  url's record numbers kept in the order they were encountered.
  """
  by_url = {}
  for url, record_number in data:
    if url not in by_url:
      by_url[url] = []
    by_url[url].append(record_number)
  grouped = list(by_url.items())
  grouped.sort()
  return grouped

def myReduce(data):
  """Count how often each url occurred.

  data is an iterable of (url, [record numbers]) tuples.  Returns a list
  of (url, frequency) tuples in the same order, where frequency is the
  length of that url's record number list.
  """
  summary = []
  for url, record_numbers in data:
    summary.append((url, len(record_numbers)))
  return summary

class UnitTests(unittest.TestCase):
  """Do not taunt unit tests."""

  def setUp(self):
    pass

  def tearDown(self):
    pass

  def testMapper(self):
    """Produce set of intermediate key value pairs, with record
content as key and record number as value, if a condition is met."""
    test_pairs = [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'a'), (5, 'd')]
    intermediate_list = [('a', 1), ('a', 4)]
    self.assertEqual(myMap(test_pairs, "a"), intermediate_list)

  def testGrouper(self):
    """Group occurences of a record together:

    [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)] -> [(fred, 1),
('jim', [2, 4]), ('bill' ,3)]"""

    test_list = [('fred', 1), ('jim', 2), ('bill', 3), ('jim', 4)]
    grouped_list = [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
    self.assertEqual(myGroup(test_list), grouped_list)

  def testReduce(self):
    """Aggregate results of map and group functions to produce value
and frequency."""
    test_intermediate =  [('bill', [3]), ('fred', [1]), ('jim', [2, 4])]
    test_summary = [('bill', 1), ('fred', 1), ('jim', 2)]
    self.assertEqual(myReduce(test_intermediate), test_summary)

def doTests():
  """Run every test in UnitTests and return the result object.

  The suite is built with TestLoader.loadTestsFromTestCase, which collects
  the methods whose names start with 'test' (the loader's default prefix).
  This replaces the obsolete unittest.makeSuite helper, which recent
  Python releases no longer provide.  A TextTestRunner runs the suite, and
  the result it returns is handed back so callers can inspect it, e.g.
  with wasSuccessful().
  """
  suite = unittest.TestLoader().loadTestsFromTestCase(UnitTests)
  runner = unittest.TextTestRunner()
  return runner.run(suite)

def main():
  """Print how often each url containing "beer" occurs in sample data.

  Feeds a fixed list of (record number, url) tuples through myMap,
  myGroup and myReduce and prints the resulting list of (url, frequency)
  tuples after a heading line.
  """
  # print is called with one parenthesised argument so the output is the
  # same whether print is a statement (Python 2) or a function (Python 3).
  print("Analysing URL data:\n")
  url_data = [
    (1, 'http://www.beer.com'),
    (2, 'http://www.ban-beer.com'),
    (3, 'http://www.bbc.co.uk'),
    (4, 'http://www.beer.com'),
    (5, 'http://wwww.kernel.org'),
  ]
  print(myReduce(myGroup(myMap(url_data, "beer"))))

if __name__ == "__main__":
  # Run the analysis only once the test suite has passed.
  result = doTests()
  if result.wasSuccessful():
    main()
  else:
    # Parenthesised single-argument print gives the same output whether
    # print is a statement (Python 2) or a function (Python 3).
    print("Error - check test output.")


More information about the Tutor mailing list