NEWBIE: Removing HTML/JavaScript from a webpage

Thomas Guettler zopestoller at thomas-guettler.de
Mon Jul 22 06:54:12 EDT 2002


Owen Marshall wrote:

> Ok...here is my question. I have this bit of code:
> 
> import urllib
> 
> response = 
> urllib.urlopen('http://movies.go.com/cgi/movielistings/request.dll?ZIPSPECIFIC&zip_code=40004&date=07/20/2002') 
> 
> 
> resp = response.read()


Use the HTMLParser class from the htmllib module. The parser calls a handler method
for every tag it encounters. Simply ignore the tags you don't care about.

If you need to know the parent tags, use a list as a stack to
keep track of them: call append() when a start tag occurs
and pop() when an end tag occurs. The example below uses flags such as in_td
instead, which is less robust than using a stack.

Here is an example that parses the HTML output of Excel:

class MSExcelHTMLParser(htmllib.HTMLParser):
    """Collect a two-column HTML table (as saved by MS Excel) into a dict.

    Each ``<tr>`` that contains exactly two ``<td>`` cells adds one entry to
    ``translateDict``: text of the first cell -> text of the second cell.
    A single row without any ``<td>`` cell is tolerated and remembered in
    ``table_header`` (presumably a header row built from ``<th>`` cells --
    confirm against the real Excel output).

    Raises:
        ValueError: on nested ``<tr>`` or ``<td>`` tags, on a row that does
            not have exactly two cells, or on a second row without cells.
            (The original raised plain strings, which later Python versions
            reject; the messages are unchanged.)
    """

    def __init__(self):
        # NullFormatter: only the tag callbacks matter, nothing is rendered.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.content = ''          # text collected for the current cell
        self.translateDict = {}    # first-column text -> second-column text
        self.in_td = 0
        self.in_tr = 0
        self.column_count = 0      # not read anywhere in this class
        self.table_header = 0      # set to 1 once a row without cells was seen
        # Per-row state. Initialised here as well so that a <td> appearing
        # outside of any <tr> does not fail with AttributeError.
        self.currColumn = 0
        self.rowData = []

    def start_tr(self, attributes):
        """Begin a new table row and reset the per-row state."""
        if self.in_tr:
            raise ValueError("Parse Error: <tr> in <tr>")
        # The flag must be set here, otherwise the check above never fires.
        self.in_tr = 1
        self.currColumn = 0
        self.rowData = []

    def end_tr(self):
        """Finish the current row and store its two cells as a dict entry."""
        self.in_tr = 0
        cell_count = len(self.rowData)
        if cell_count == 0:
            if self.table_header:
                raise ValueError('Parse Error: Two rows with no <td> tags')
            self.table_header = 1
            # Nothing to store for a row without cells; falling through
            # would index into the empty rowData list.
            return
        if cell_count != 2:
            raise ValueError('Parse Error: Row with not 2 <td> tags')
        # Map the first column to the second one
        # (per the original comment: German term -> Bulgarian term).
        self.translateDict[self.rowData[0]] = self.rowData[1]

    def start_td(self, attributes):
        """Begin a new cell and start collecting its text from scratch."""
        self.content = ''
        if self.currColumn > 1:
            raise ValueError("Parse Error: Table has more than 2 columns")
        if self.in_td:
            raise ValueError("Parse Error: <td> in <td>")
        self.in_td = 1

    def end_td(self):
        """Finish the current cell and append its stripped text to the row."""
        self.in_td = 0
        self.content = self.content.strip()
        # A cell consisting of a single character with code 160
        # (non-breaking space) is treated as empty.
        if len(self.content) == 1 and ord(self.content) == 160:
            self.content = ''
        self.rowData.append(self.content)
        self.currColumn += 1

    def handle_data(self, content):
        """Accumulate character data; start_td() resets the buffer."""
        self.content += content

def main():
    """Parse the Excel-generated HTML file named by ``filename``.

    Lines belonging to Microsoft's non-standard conditional blocks
    (``<![if ...>`` up to ``<![endif]>``) are dropped; everything else is
    fed to ``MSExcelHTMLParser``.

    ``filename`` is not defined in this snippet; it is expected to exist at
    module level.

    Returns:
        dict: the parser's ``translateDict`` (first column -> second column).

    Raises:
        ValueError: if the file ends inside a Microsoft conditional block,
            or if the parser rejects the table structure.
    """
    # Patterns are unchanged from the original; compiled once up front.
    ms_block_start = re.compile(r'\s*<!\[if.*>$')
    ms_block_end = re.compile(r'\s*<!\[endif]>')

    content = []
    fd = open(filename)
    try:
        # A single shared iterator lets the inner loop consume the lines of
        # a conditional block so the outer loop resumes right after it.
        lines = iter(fd)
        for line in lines:
            if not ms_block_start.match(line):
                content.append(line)
                continue
            # Skip MS Excel's non-standard HTML up to the closing marker.
            for line in lines:
                if ms_block_end.match(line):
                    break
            else:
                raise ValueError('Parsing error: EOF in Microsoft special tag')
    finally:
        # Close the file even when a parse error is raised.
        fd.close()

    parser = MSExcelHTMLParser()
    # Lines still carry their newline, so join without a separator
    # (string.join's default separator added a space before every line).
    parser.feed(''.join(content))
    parser.close()
    return parser.translateDict

  thomas




More information about the Python-list mailing list