HTMLParser error
jonbutler88 at googlemail.com
jonbutler88 at googlemail.com
Thu May 22 04:22:35 EDT 2008
On May 22, 2:40 am, alex23 <wuwe... at gmail.com> wrote:
> On May 22, 8:18 am, jonbutle... at googlemail.com wrote:
>
> > Sorry, I'm new to both Python and newsgroups, this is all pretty
> > confusing. So I need a line in my __init__ function of my class? The
> > spider class I made inherits from HTMLParser. It's just using the
> > feed() function that produces errors though, the rest seems to work
> > fine.
>
> Let me repeat: it would make this a lot easier if you would paste
> actual code.
>
> As you say, your Spider class inherits from HTMLParser, so you need to
> make sure that you set it up correctly so that the HTMLParser
> functionality you've inherited will work correctly (or work as you
> want it to work). If you've added your own __init__ to Spider, then
> the __init__ on HTMLParser is no longer called unless you *explicitly*
> call it yourself.
>
> Unfortunately, my earlier advice wasn't totally correct... HTMLParser
> is an old-style object, whereas super() only works for new-style
> objects, I believe. (If you don't know about old- v new-style objects,
> see http://docs.python.org/ref/node33.html). So there are a couple of
> approaches that should work for you:
>
> class SpiderBroken(HTMLParser):
>     def __init__(self):
>         pass # don't do any ancestral setup
>
> class SpiderOldStyle(HTMLParser):
>     def __init__(self):
>         HTMLParser.__init__(self)
>
> class SpiderNewStyle(HTMLParser, object):
>     def __init__(self):
>         super(SpiderNewStyle, self).__init__()
>
> Python 2.5.1 (r251:54863, May 1 2007, 17:47:05) [MSC v.1310 32 bit
> (Intel)] on win32
> Type "help", "copyright", "credits" or "license" for more information.
> >>> html = open('temp.html','r').read()
> >>> from spider import *
> >>> sb = SpiderBroken()
> >>> sb.feed(html)
>
> Traceback (most recent call last):
> File "<stdin>", line 1, in <module>
> File "C:\Python25\lib\HTMLParser.py", line 107, in feed
> self.rawdata = self.rawdata + data
> AttributeError: SpiderBroken instance has no attribute 'rawdata'
>
> >>> so = SpiderOldStyle()
> >>> so.feed(html)
> >>> sn = SpiderNewStyle()
> >>> sn.feed(html)
>
> The old-style version is probably easiest, so putting this line in
> your __init__ should fix your issue:
>
> HTMLParser.__init__(self)
>
> If this still isn't clear, please let me know.
>
> - alex23
OK, here's what I have so far:
#!/usr/bin/env python
from HTMLParser import HTMLParser
from urllib2 import urlopen, HTTPError
class Spider(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.found = []
        self.queue = []
    def handle_starttag(self, tag, attrs):
        try:
            if tag == 'a':
                if attrs[0][0] == 'href':
                    self.queue.append(attrs[0][1])
        except HTMLParseError:
            print 'Error parsing HTML tags'
    def parse(self, page):
        try:
            self.feed(urlopen('http://' + page).read())
        except HTTPError:
            print 'Error getting page source'
    def crawl(self, site):
        self.queue.append(site)
        while 1:
            try:
                url = self.queue.pop(0)
                self.parse(url)
            except IndexError:
                break
            self.found.append(url)
        return self.found
if __name__ == '__main__':
    s = Spider()
    site = raw_input("What site would you like to scan? http://")
    s.crawl(site)
Still getting very odd errors though, this being the latest:
Traceback (most recent call last):
File "spider.py", line 38, in <module>
s.crawl(site)
File "spider.py", line 30, in crawl
self.parse(url)
File "spider.py", line 21, in parse
self.feed(urlopen('http://' + page).read())
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 124, in urlopen
return _opener.open(url, data)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 381, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 399, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 360, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 1107, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/urllib2.py", line 1064, in do_open
h = http_class(host) # will parse host:port
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/httplib.py", line 639, in __init__
self._set_hostport(host, port)
File "/Library/Frameworks/Python.framework/Versions/2.5/lib/
python2.5/httplib.py", line 651, in _set_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: ''
Also, could you explain why I needed to add that
HTMLParser.__init__(self) line? Does it matter that I have overridden
the __init__ method of Spider?
Thanks
More information about the Python-list
mailing list