Parsing HTML

Thu Sep 23 07:44:57 EDT 2004

> I want to extract some info from some specific HTML pages, Microsoft's
> International Word List (e.g.
> http://msdn.microsoft.com/library/en-us/dnwue/html/swe_word_list.htm). I
> want to take all the words, both English and the other language, and create
> a dictionary, so that I can look up "About" and get "Om" as the answer.

BeautifulSoup (http://www.crummy.com/software/BeautifulSoup/) is perfect for
this job:

import urllib2, pprint
from BeautifulSoup import BeautifulSoup

def cellToWord(cell):
   """Given a table cell, return the word in that cell.

   If the cell contains a <b> element, the stripped text of the first one
   is returned.  Otherwise the cell text is expected to look like
   "<number>. <word>" and everything up to and including the first period
   is dropped.

   Only the first period is treated as the separator, so a word that
   itself contains periods is returned intact.  A cell with no period at
   all is returned as-is (stripped) rather than raising IndexError.
   """
   # Some words are in bold.
   if cell('b'):
      return cell.first('b').string.strip()      # Return the bold piece.

   # Split once only: later periods belong to the word, not the number.
   # Taking the last piece also covers text that has no number prefix.
   pieces = cell.string.split('.', 1)
   return pieces[-1].strip()

def parse(url):
   """Parse the given URL and return a dictionary mapping US words to
   foreign words.

   The page at `url` is downloaded and fed to BeautifulSoup.  The rows of
   the first table found are then walked, skipping the headings row.  For
   every row with exactly three cells, the word in the first cell becomes
   a key and the word in the second cell its value; the third cell is not
   used.  If the same US word appears more than once, the last occurrence
   wins.
   """

   # Read the URL, making sure the connection is closed even if the read
   # fails part-way through.
   response = urllib2.urlopen(url)
   try:
      html = response.read()
   finally:
      response.close()

   # Pass the page to BeautifulSoup.
   soup = BeautifulSoup()
   soup.feed(html)

   # Read the main table, extracting the words from the table cells.
   USToForeign = {}
   mainTable = soup.first('table')
   for row in mainTable('tr')[1:]:   # Exclude the first (headings) row.
      cells = row('td')
      if len(cells) != 3:            # Some rows have a single colspan="3" cell.
         continue
      US = cellToWord(cells[0])
      foreign = cellToWord(cells[1])
      USToForeign[US] = foreign

   return USToForeign

if __name__ == '__main__':
   # Demo: download one word list and pretty-print its (US, foreign) pairs.
   url = 'http://msdn.microsoft.com/library/en-us/dnwue/html/FRE_word_list.htm'

   def compareUSWords(first, second):
      """Order two (US, foreign) pairs by US word, ignoring case."""
      return cmp(first[0].lower(), second[0].lower())

   USToForeign = parse(url)
   pairs = USToForeign.items()
   pairs.sort(compareUSWords)      # Web page order
   pprint.pprint(pairs)

-- 
Richie Hindle
richie at entrian.com