Searching a string and extracting all occurrences of a substring
Nico Grubert
nicogrubert at gmail.com
Thu Aug 31 12:26:29 EDT 2006
> Try Beautiful Soup, or if your input is simple enough, the re module.
Hi Gabriel,
I first tried "HTMLParser" and wrote this short script:
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
class MyDocParser(HTMLParser):
    """Collect the text found inside <parameter> elements.

    Every chunk of character data seen while a <parameter> tag is open is
    appended to ``self.paths``; data seen outside such a tag is discarded.
    """

    def __init__(self):
        self.paths = []  # text chunks collected so far
        self.readingpaths = False  # True while inside a <parameter> element
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting data when a <parameter> tag opens."""
        if tag == 'parameter':
            self.readingpaths = True

    def handle_endtag(self, tag):
        """Stop collecting data when a <parameter> tag closes."""
        if tag == 'parameter':
            self.readingpaths = False

    def handle_data(self, data):
        """Record *data* if it appears inside a <parameter> element."""
        if self.readingpaths:
            self.paths.append(data)

    def handle_entityref(self, name):
        """Handle named entities, e.g. the ``&amp;`` in 'Home &amp; Products'.

        Known names are replaced by their ``entitydefs`` value; unknown
        names are passed through unchanged as ``&name;``.
        """
        # ``in`` instead of ``has_key`` so this works on Python 2 and 3.
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Handle numeric references such as ``&#174;`` or ``&#xAE;``.

        *name* is the reference without ``&#`` and ``;`` (``'174'`` or
        ``'xAE'``).  Invalid references and code points outside 1..255
        are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                # Hexadecimal form: strip the leading 'x' before parsing.
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        # Route through handle_data so the <parameter> filter still applies.
        self.handle_data(chr(charnum))

    def get_paths(self):
        """Return the list of collected text chunks."""
        return self.paths
def parse_content(content):
    """Feed *content* to a fresh MyDocParser and return its collected paths."""
    doc_parser = MyDocParser()
    doc_parser.feed(content)
    return doc_parser.get_paths()
# /end
This works as long as there are no other <parameter> tags in the content
that I parse.
Nico
More information about the Python-list
mailing list