Searching a string and extracting all occurrences of a substring

Nico Grubert nicogrubert at gmail.com
Thu Aug 31 12:26:29 EDT 2006


> Try Beautiful Soup, or if your input is simple enough, the re module.

Hi Gabriel,

I first tried "HTMLParser" and wrote this short script:

from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs

class MyDocParser(HTMLParser):
    """Collect the text found inside ``<parameter>`` elements.

    Feed markup with ``feed()``; every chunk of character data seen while
    a ``<parameter>`` tag is open is appended to ``self.paths``.  Entity and
    character references arrive as separate chunks, so one element may
    contribute several entries to the list.
    """

    def __init__(self):
        # Explicit base-class call (not super()): under Python 2 the
        # HTMLParser base is an old-style class.
        HTMLParser.__init__(self)
        self.paths = []             # collected text chunks, in document order
        self.readingpaths = False   # True while inside a <parameter> element

    def handle_starttag(self, tag, attrs):
        """Start collecting when a ``<parameter>`` tag opens."""
        if tag == 'parameter':
            self.readingpaths = True

    def handle_endtag(self, tag):
        """Stop collecting when a ``</parameter>`` tag closes."""
        if tag == 'parameter':
            self.readingpaths = False

    def handle_data(self, data):
        """Record *data* if it appears inside a ``<parameter>`` element."""
        if self.readingpaths:
            self.paths.append(data)

    def handle_entityref(self, name):
        """Handle named entities such as ``&amp;`` in 'Home &amp; Products'.

        Known entities are replaced by their ``entitydefs`` value; unknown
        ones are passed through unchanged as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Handle numeric references such as ``&#174;`` or ``&#xAE;``.

        *name* is the reference without the leading ``&#`` and trailing
        ``;``; a leading ``x``/``X`` marks a hexadecimal value.  Invalid
        references and code points outside 1..255 are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return

        if charnum < 1 or charnum > 255:
            return

        # The original validated the reference but never emitted it, so
        # valid characters were silently lost.
        self.handle_data(chr(charnum))

    def get_paths(self):
        """Return the list of text chunks collected so far."""
        return self.paths


def parse_content(content):
    """Run *content* through a fresh MyDocParser and return what it collected.

    Returns the parser's ``get_paths()`` list, i.e. the text chunks found
    inside ``<parameter>`` elements of *content*.
    """
    doc_parser = MyDocParser()
    doc_parser.feed(content)
    return doc_parser.get_paths()

# /end

This works as long as there are no other <parameter> tags in the content
that I parse.


Nico



More information about the Python-list mailing list