NEWBIE: Removing HTML/JavaScript from a webpage
Thomas Guettler
zopestoller at thomas-guettler.de
Mon Jul 22 06:54:12 EDT 2002
Owen Marshall wrote:
> Ok...here is my question. I have this bit of code:
>
> import urllib
>
> response =
> urllib.urlopen('http://movies.go.com/cgi/movielistings/request.dll?ZIPSPECIFIC&zip_code=40004&date=07/20/2002')
>
>
> resp = response.read()
Use the HTMLParser class from htmllib. The parser gives you an event for every
tag it encounters. Ignore the tags you don't want.
If you want to know the parent tags, use a list as a stack to
keep track of the open tags. Use append() when a start-tag occurs
and pop() when an end-tag occurs. The example below uses an in_td flag instead,
which is worse than using a stack.
Here is an example that parses the HTML output of Excel:
class MSExcelHTMLParser(htmllib.HTMLParser):
def __init__(self):
htmllib.HTMLParser.__init__(
self, formatter.NullFormatter())
self.content=''
self.translateDict={}
self.in_td=0
self.in_tr=0
self.column_count=0
self.table_header=0
def start_tr(self, attributes):
if self.in_tr:
raise "Parse Error: <tr> in <tr>"
self.currColumn=0
self.rowData=[]
def end_tr(self):
self.in_tr=0
len_rd=len(self.rowData)
if len_rd==0:
if self.table_header:
raise 'Parse Error: Two rows with no <td> tags'
else:
self.table_header=1
elif len_rd!=2:
raise 'Parse Error: Row with not 2 <td> tags'
# Hash "deutsch" --> "bulgarisch"
self.translateDict[self.rowData[0]]=self.rowData[1]
def start_td(self, attributes):
self.content=''
if self.currColumn>1:
raise "Parse Error: Table has more than 2 columns"
if self.in_td:
raise "Parse Error: <td> in <td>"
self.in_td=1
def end_td(self):
self.in_td=0
self.content=self.content.strip()
if len(self.content)==1 and ord(self.content)==160:
self.content=''
self.rowData.append(self.content)
self.currColumn+=1
def handle_data(self, content):
self.content+=content
# NOTE(review): `filename` is not defined in this excerpt; it has to be bound
# before this line runs.
fd = open(filename)


def main():
    """Parse the Excel HTML export read from the module-level ``fd``.

    Lines between ``<![if ...>`` and ``<![endif]>`` (Microsoft markup that is
    not standard-conforming HTML) are dropped; the remaining text is fed to
    ``MSExcelHTMLParser``.  ``fd`` is closed before returning, also when an
    error is raised while reading.

    Returns:
        The ``translateDict`` built by the parser.

    Raises:
        ValueError: if the file ends inside an ``<![if ...>`` section.
    """
    # Remove Microsoft's non standard conforming HTML-Tags
    kept = []
    try:
        # Use one iterator for both loops so no line is read twice or lost.
        lines = iter(fd)
        for line in lines:
            if not re.match(r'\s*<!\[if.*>$', line):
                kept.append(line)
                continue
            # Ignore non standard conforming HTML of MS-Excel
            for skipped in lines:
                if re.match(r'\s*<!\[endif]>', skipped):
                    break
            else:
                # The iterator ran dry before the closing tag was found.
                raise ValueError(
                    'Parsing error: EOF in Microsoft special tag')
    finally:
        fd.close()
    # Every line still carries its newline, so join without a separator;
    # a separator would insert extra characters into the document.
    content = ''.join(kept)
    parser = MSExcelHTMLParser()
    parser.feed(content)
    parser.close()
    return parser.translateDict
thomas
More information about the Python-list
mailing list