HTML to formatted text conversion function
Duncan Booth
duncan at NOSPAMrcp.co.uk
Wed Jul 25 06:52:55 EDT 2001
rupe at metro.yak.net (Rupert Scammell) wrote in
news:79179cf5.0107241216.7345d345 at posting.google.com:
> Recently I've been using a call like os.system("/usr/bin/lynx -dump
> http://www.sample.com > /tmp/site-text.txt") to grab formatted text
> versions of pages (without HTML) for subsequent processing. However,
> I don't like the fact that this technique introduces an additional
> dependency into my code (lynx). I was wondering if anyone could
> recommend an equivalent Python function or module that lets me do this
> without introducing a platform specific dependency?
>
> urllib.urlretrieve() gets back the raw HTML page, so it's not really
> helpful to me, except as a starting point for processing.
>
Is this what you need?
--- begin strip.py ---
# This example will convert a simple HTML file into a plain text
# equivalent. This is useful for readme files etc.
import sys,formatter,StringIO,htmllib,string
from urllib import urlretrieve,urlcleanup
# Strip all HTML formatting.
class MyParser(htmllib.HTMLParser):
    """HTML parser whose formatted output is collected in memory.

    The parser is wired to an AbstractFormatter driving a DumbWriter, and
    the writer's output goes into a StringIO buffer instead of a real
    file.  Feed HTML with feed(), call close(), then read the result with
    gettext().
    """

    def __init__(self):
        # The buffer is kept on the instance so gettext() can read it back.
        self.bodytext = StringIO.StringIO()
        plain_writer = formatter.DumbWriter(self.bodytext)
        text_formatter = formatter.AbstractFormatter(plain_writer)
        # Explicit base-class call, matching the style htmllib expects.
        htmllib.HTMLParser.__init__(self, text_formatter)

    def gettext(self):
        """Return everything the writer has put into the buffer so far."""
        collected = self.bodytext.getvalue()
        return collected
def GetPage(url):
    """Fetch *url* and return its contents as a string.

    The page is retrieved with urlretrieve(), which hands back the name
    of a local file holding the data; that file is then read in full.

    url -- the address to fetch.

    Returns the complete contents of the retrieved file.

    Any exception raised while retrieving or reading is propagated to the
    caller; urlcleanup() is still called in that case.
    """
    try:
        fn, h = urlretrieve(url)
        page = open(fn, "r")
        try:
            text = page.read()
        finally:
            # Close explicitly rather than relying on garbage collection
            # to release the file handle.
            page.close()
    finally:
        # Always let urllib tidy up after urlretrieve(), even on failure.
        urlcleanup()
    return text
if __name__=='__main__':
    # Command-line entry point: convert the HTML named by the first
    # argument (an http/https URL or a local file) to plain text, print
    # it, then print a numbered list of the anchors the parser collected.
    if len(sys.argv) < 2:
        # Give a usage hint instead of failing with a bare IndexError.
        sys.stderr.write("usage: strip.py <url-or-html-file>\n")
        sys.exit(2)
    arg = sys.argv[1]
    if arg[:7] == 'http://' or arg[:8] == 'https://':
        # Remote page: fetch it through GetPage().
        data = GetPage(arg)
    else:
        # Anything else is treated as the name of a local file.
        source = open(arg, 'r')
        try:
            data = source.read()
        finally:
            source.close()
    p = MyParser()
    p.feed(data)
    p.close()
    # Replace '\xa0' bytes with ordinary spaces (presumably non-breaking
    # spaces left in the output -- confirm against the input encoding).
    text = p.gettext().replace('\xa0', ' ')
    # Single-argument print(...) behaves the same as the print statement.
    print(text)
    # Number the collected anchors starting from 1.
    number = 0
    for anchor in p.anchorlist:
        number = number + 1
        print("[%d]: %s" % (number, anchor))
--- end strip.py ---
--
Duncan Booth duncan at rcp.co.uk
int month(char *p){return(124864/((p[0]+p[1]-p[2]&0x1f)+1)%12)["\5\x8\3"
"\6\7\xb\1\x9\xa\2\0\4"];} // Who said my code was obscure?
More information about the Python-list
mailing list