how to get rid of html tags
Richie Hindle
richie at entrian.com
Thu Oct 3 05:38:36 EDT 2002
Hi koko,
> I am trying to retrieve a web page.
> But I only want to keep the content of the webpage without the html tags.
> How can I parse the webpage to get rid of the tags?
Use sgmllib.SGMLParser (htmllib looks like a more obvious choice, but
that's more for processing specific HTML tags, plus it's designed for
HTML 2.0).
Here's an example, html2text.py. It uses sgmllib.SGMLParser to do
exactly what you're asking for, with various options. If you want to
use it as a library rather than a program, just import the Parser class
and use it the way the __main__ block uses it.
----------------------------------------------------------------------
import sys, re, glob, cStringIO, sgmllib, htmlentitydefs
# Command-line help text; printed verbatim when a help switch is given.
# The --all description names img/alt because that is the attribute the
# parser actually emits for <img> tags.
usage = """
Usage: html2text [options] - Read from standard input
html2text [options] files... - Read from the given files/wildcards
Options: -h, --help - Display this help message
-a, --all - Include various user-visible attributes that
aren't in the page itself: img/alt, a/title and
the content of meta/description tags.
-v, --verbose - Print **** filename **** at the top of each
file's output.
Examples: html2text -a *.html | wc - Count the words in your site
html2text -a *.html | spell | sort -u - Spell-check your site
echo "Test <em>me</em>." | html2text - Check that html2text works
"""
class Parser( sgmllib.SGMLParser ):
"""Parses the HTML and generates plain text. Feed HTML to feed() and retrieve the text
from getText()."""
def __init__( self, all ):
sgmllib.SGMLParser.__init__( self )
self.all = all
self.out = cStringIO.StringIO()
def unknown_starttag( self, tag, attributes ):
# If --all was given, output the extra user-visible attributes.
if self.all:
attributeMap = {}
for a in attributes:
attributeMap[ a[ 0 ] ] = a[ 1 ]
if tag == 'a' and attributeMap.has_key( 'title' ):
self.out.write( ' ' + attributeMap[ 'title' ] + ' ' )
elif tag == 'img' and attributeMap.has_key( 'alt' ):
self.out.write( ' ' + attributeMap[ 'alt' ] + ' ' )
elif tag == 'meta' and attributeMap.has_key( 'content' ) and attributeMap.get( 'name', '' ) in [ 'description' ]:
self.out.write( ' ' + attributeMap[ 'content' ] + ' ' )
def handle_data( self, data ):
self.out.write( data )
def handle_entityref( self, text ):
if text == 'nbsp':
self.out.write( ' ' )
else:
self.out.write( htmlentitydefs.entitydefs[ text ] )
def getText( self ):
# Collapse runs of whitespace down to a single space, remove leading and trailing space
# from each line, and collapse runs of three or more newlines down to two newlines.
sgmllib.SGMLParser.close( self )
text = re.sub( r'[ \t]{2,}', ' ', self.out.getvalue() )
text = re.sub( r'[ \t]+\n', '\n', text )
text = re.sub( r'\n[ \t]+', '\n', text )
text = re.sub( r'\n{3,}', '\n\n', text )
# Reset and return the text.
self.out = cStringIO.StringIO()
return text
if __name__ == '__main__':
    # Command-line driver: convert stdin, or each file matching the given
    # names/wildcards, to plain text on stdout.
    # print is used in its single-argument parenthesised form throughout,
    # which behaves exactly like the print statement.

    # Are they asking for help?
    if len(sys.argv) > 1 and sys.argv[1] in ["/?", "-?", "/h", "-h", "/help", "-help", "--help"]:
        print(usage)
        sys.exit()

    # Process the command-line arguments.  Switches are removed from
    # sys.argv as they are consumed, leaving only file patterns behind.
    includeAll = 0
    verbose = 0
    # startswith() rather than [0] so an empty-string argument cannot
    # raise IndexError.
    while len(sys.argv) > 1 and sys.argv[1].startswith("-"):
        if sys.argv[1] in ["-a", "--all"]:
            includeAll = 1
            del sys.argv[1]
        elif sys.argv[1] in ["-v", "--verbose"]:
            verbose = 1
            del sys.argv[1]
        else:
            # Report on stderr and exit non-zero so the message does not
            # end up in piped output and callers can detect the failure.
            sys.stderr.write("\nUnknown switch: '%s' Use --help for help\n" % sys.argv[1])
            sys.exit(1)

    # Create a parser and feed the text into it.
    parser = Parser(includeAll)
    if len(sys.argv) == 1:
        # No files specified - use stdin.
        parser.feed(sys.stdin.read())
        print(parser.getText())
    else:
        # Some files or wildcards were specified.
        for pattern in sys.argv[1:]:
            for filename in glob.glob(pattern):
                if verbose:
                    print("\n**** %s ****\n" % filename)
                # Close each file explicitly instead of leaking the handle.
                source = open(filename)
                try:
                    parser.feed(source.read())
                finally:
                    source.close()
                print(parser.getText())
--
Richie Hindle
richie at entrian.com
More information about the Python-list
mailing list