Undocumented unescape() method in HTMLParser?

Tobiah toby at tobiah.org
Fri May 25 11:49:17 EDT 2018

I came across its usage in StackOverflow somewhere, but didn't see
it in the docs.  I'm using 2.7.

I needed it while writing a class for generating text documents out of
HTML documents for attaching to emails, which lowers spam scores.  I lifted
the basis for this from the top answer here:  https://tinyurl.com/yb92x8ra

While not complete, I thought it might be of interest.  Improvements welcome.


from HTMLParser import HTMLParser

def main():
    """Print the text extracted from a small sample HTML snippet."""
    parser = TextExtractor()

    # <script> and <style> content is expected to be suppressed; <scrip> is
    # not a silent tag, so its text is kept.  The angle brackets in the last
    # line are written as entities so the parser reports them as entity
    # references instead of reading "<And this>" as a start tag.
    html = '''
                 <p>"Hi there!"</p>
                 <script> some javascript </script>
                 <style> class{style}</style>
                 <scrip>Print this</scrip>
                 <b>&lt;And this&gt;</b>
    '''

    # Parenthesised single-argument print behaves the same on Python 2.7
    # and is also valid on Python 3.
    print(parser.strip_tags(html))

class TextExtractor(HTMLParser):
    """Extract the visible text of an HTML document.

    Markup is discarded, and everything inside the elements listed in
    ``silent_tags`` (by default ``head``, ``script`` and ``style``) is
    suppressed.  Entity and character references are converted to the
    characters they stand for.
    """

    def __init__(self):
        # The base class must be initialised, otherwise feed() fails on
        # missing parser state (rawdata, etc.).
        HTMLParser.__init__(self)
        # Name of the silent element currently being skipped, or None.
        self.silent_tag = None
        # Text fragments collected so far.
        self.fed = []
        # Elements whose content must not appear in the output.
        self.silent_tags = ['head', 'script', 'style']

    def handle_starttag(self, tag, atts):
        """Begin suppressing output when a silent element opens."""
        # Only the outermost silent element is tracked, so a <script>
        # nested in <head> does not end the suppression early.
        if self.silent_tag is None and tag in self.silent_tags:
            self.silent_tag = tag

    def handle_endtag(self, tag):
        """Stop suppressing output when the tracked silent element closes."""
        if tag == self.silent_tag:
            self.silent_tag = None

    def handle_data(self, d):
        """Collect a run of text unless inside a silent element."""
        if not self.silent_tag:
            self.fed.append(d)

    def handle_entityref(self, name):
        """Collect the character for a named reference such as ``&lt;``."""
        if not self.silent_tag:
            self.fed.append(self.unescape("&%s;" % name))

    def handle_charref(self, name):
        """Collect the character for a numeric reference such as ``&#39;``."""
        if not self.silent_tag:
            self.fed.append(self.unescape("&#%s;" % name))

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

    def strip_tags(self, html):
        """Parse ``html`` and return its text content.

        The parser state is cleared first, so the same instance can be
        used for several documents, even after an unterminated one.
        """
        self.reset()
        self.silent_tag = None
        self.fed = []

        self.feed(html)
        # close() forces any text still buffered by the parser to be handled.
        self.close()

        data = self.get_data()
        self.fed = []
        return data




Output:

                 "Hi there!"

                 Print this
                 <And this>


