how to get rid of html tags

Richie Hindle richie at entrian.com
Thu Oct 3 05:38:36 EDT 2002


Hi koko,

> I am trying to retrieve a web page.
> But I only want to keep the content of the webpage without the html tags.
> How can I  parse the webpage to get rid of the tags?

Use sgmllib.SGMLParser (htmllib looks like a more obvious choice, but
that's more for processing specific HTML tags, plus it's designed for
HTML 2.0).

Here's an example, html2text.py.  It uses sgmllib.SGMLParser to do
exactly what you're asking for, with various options.  If you want to
use it as a library rather than a program, just import the Parser class
and use it the way the __main__ block uses it.

----------------------------------------------------------------------

import sys, re, glob, cStringIO, sgmllib, htmlentitydefs

# Help text printed when the user asks for help on the command line.
# The attributes listed under --all must match what Parser.unknown_starttag
# actually outputs: img/alt, a/title and meta/description content.
usage = """
Usage:    html2text [options]           - Read from standard input
          html2text [options] files...  - Read from the given files/wildcards

Options:  -h, --help       - Display this help message

          -a, --all        - Include various user-visible attributes that
                             aren't in the page itself: img/alt, a/title and
                             the content of meta/description tags.

          -v, --verbose    - Print **** filename **** at the top of each
                             file's output.

Examples: html2text -a *.html | wc               - Count the words in your site
          html2text -a *.html | spell | sort -u  - Spell-check your site
          echo "Test <em>me</em>." | html2text   - Check that html2text works
"""

class Parser( sgmllib.SGMLParser ):
   """Parses the HTML and generates plain text.  Feed HTML to feed() and retrieve the text
   from getText()."""
   def __init__( self, all ):
      sgmllib.SGMLParser.__init__( self )
      self.all = all
      self.out = cStringIO.StringIO()
   
   def unknown_starttag( self, tag, attributes ):
      # If --all was given, output the extra user-visible attributes.
      if self.all:
         attributeMap = {}
         for a in attributes:
            attributeMap[ a[ 0 ] ] = a[ 1 ]
         if tag == 'a' and attributeMap.has_key( 'title' ):
            self.out.write( ' ' + attributeMap[ 'title' ] + ' ' )
         elif tag == 'img' and attributeMap.has_key( 'alt' ):
            self.out.write( ' ' + attributeMap[ 'alt' ] + ' ' )
         elif tag == 'meta' and attributeMap.has_key( 'content' ) and attributeMap.get( 'name', '' ) in [ 'description' ]:
            self.out.write( ' ' + attributeMap[ 'content' ] + ' ' )

   def handle_data( self, data ):
      self.out.write( data )
   
   def handle_entityref( self, text ):
      if text == 'nbsp':
         self.out.write( ' ' )
      else:
         self.out.write( htmlentitydefs.entitydefs[ text ] )

   def getText( self ):
      # Collapse runs of whitespace down to a single space, remove leading and trailing space
      # from each line, and collapse runs of three or more newlines down to two newlines.
      sgmllib.SGMLParser.close( self )
      text = re.sub( r'[ \t]{2,}', ' ', self.out.getvalue() )
      text = re.sub( r'[ \t]+\n', '\n', text )
      text = re.sub( r'\n[ \t]+', '\n', text )
      text = re.sub( r'\n{3,}', '\n\n', text )
      
      # Reset and return the text.
      self.out = cStringIO.StringIO()
      return text


if __name__ == '__main__':
   # Are they asking for help?
   if len( sys.argv ) > 1 and sys.argv[ 1 ] in [ "/?", "-?", "/h", "-h", "/help", "-help", "--help" ]:
      print usage
      sys.exit()

   # Process the command-line arguments.
   all = 0
   verbose = 0
   while len( sys.argv ) > 1 and sys.argv[ 1 ][ 0 ] == "-":
      if sys.argv[ 1 ] in [ "-a", "--all" ]:
         all = 1
         del sys.argv[ 1 ]
      elif sys.argv[ 1 ] in [ "-v", "--verbose" ]:
         verbose = 1
         del sys.argv[ 1 ]
      else:
         print "\nUnknown switch: '%s'  Use --help for help" % sys.argv[ 1 ]
         sys.exit()

   # Create a parser and feed the text into it.
   parser = Parser( all )
   if len( sys.argv ) == 1:
      # No files specified - use stdin.
      parser.feed( sys.stdin.read() )
      print parser.getText()
   else:
      # Some files or wildcards were specified.
      for pattern in sys.argv[ 1: ]:
         for filename in glob.glob( pattern ):
            if verbose:
               print "\n**** %s ****\n" % filename
            parser.feed( open( filename ).read() )
            print parser.getText()

-- 
Richie Hindle
richie at entrian.com




More information about the Python-list mailing list