[Tutor] Re: How do I get text from an HTML document.

Fri, 16 Aug 2002 09:09:37 +0530

Hi,

On Wed, 14 Aug 2002 SA spewed into the ether:
> Hi Everyone-
> 
> I have HTML docs that have text between the comment tags:
> <!--Story-->
> Some text here
> <!--Story-->
> 
> What would be the simplest way to get this text. The text will also have
> some common html tags mixed in like <p>. So I want to strip all the html
> tags from the text I get also.
[-- snippity --]

    This is a modified version of a script I found on the net
some time back (I think on the Python Cookbook site). It defines
a subclass of the sgmllib.SGMLParser class, and overrides the
unknown_starttag() and unknown_endtag() methods to strip
out unwanted tags. Here goes :

"""
import sgmllib

class StrippingParser(sgmllib.SGMLParser):

    # These are the HTML tags that we will leave intact
    valid_tags = ('b', 'i', 'p')

    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.result = []
        self.endTagList = []

    def handle_data(self, data):
        if data:
            self.result.append(data)

    def handle_charref(self, name):
        self.result.append("&#%s;" % name)

    def handle_entityref(self, name):
        x = '' # this breaks unstandard entities that end with ';'
        if self.entitydefs.has_key(name):
            x = ';'
        self.result.append("&%s%s" % (name, x))

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones """
        if tag in self.valid_tags:
            self.result.append('<%s' % tag)
            for k, v in attrs:
                if k[0:2].lower() != 'on' and v[0:10].lower() != 'javascript':
                    self.result.append(' %s="%s"' % (k, v))
            endTag = '</%s>' % tag
            self.endTagList.insert(0,endTag)
            self.result.append('>')

    def unknown_endtag(self, tag):
        if tag in self.valid_tags:
            self.result.append("</%s>" % tag)
            remTag = '</%s>' % tag
            self.endTagList.remove(remTag)

    def cleanup(self):
        """ Append missing closing tags """
        for i in self.endTagList :
            self.result.append(i)
        self.result = "".join(self.result)

def strip(s):
    """Return string s with every non-whitelisted HTML tag removed.

    Runs s through a fresh StrippingParser and returns the text it
    assembled, including any closing tags the parser had to add.
    """
    stripper = StrippingParser()
    stripper.feed(s)
    # close() flushes whatever the parser is still buffering; cleanup()
    # then turns the collected fragments into the final string.
    stripper.close()
    stripper.cleanup()
    cleaned = stripper.result
    return cleaned

if __name__ == "__main__" :
    # Command-line use: print the stripped contents of the file named
    # by the first argument.
    import sys
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit("usage: %s HTMLFILE" % sys.argv[0])
    path = sys.argv[1]  # not named 'file', which would shadow the builtin
    fd = open(path,'r')
    try:
        # try/finally so the handle is closed even if reading fails.
        contents = fd.read()
    finally:
        fd.close()
    res = strip(contents)
    print(res)

"""

    HTH,

pv.
-- 
Prahlad Vaidyanathan  <http://www.symonds.net/~prahladv/>

Children are like cats, they can tell when you don't like them.  That's
when they come over and violate your body space.