htmllib samples
Doug Fort
dougfort at downright.com
Thu Mar 29 07:24:13 EST 2001
Oliver Vecernik wrote:
> Hi,
>
> does anyone have some samples showing how to use this module, or can
> anyone point me to where I could find them?
>
> Oliver
I've attached a simple parser we use. It may help.
Note that Fredrik Lundh modestly failed to mention his upcoming book on
'The Python Library'. There's an ebook version of his 'eff-bot' Guide,
which is the first place I look for code examples.
--
Doug Fort (dougfort at downright.com)
Senior Meat Manager
Downright Software LLC
http://www.dougfort.net
-------------- next part --------------
#!/usr/bin/env python
"""
FormFieldParser
This object parses HTML text and builds a dictionary of
dictionaries of form fields
$Id: formfieldparser.py,v 1.1 2001/01/26 15:18:30 dougfort Exp $
"""
# Module metadata.  __version__ is cut out of the RCS/CVS revision keyword
# string: the slice [11:-2] drops the 11 leading characters (the dollar
# sign, the word "Revision", the colon and a space) and the trailing
# space and dollar sign, leaving only the revision number ("1.1" here).
__author__="Downright Software LLC"
__version__="$Revision: 1.1 $"[11:-2]
import sgmllib
import string
import cStringIO
import urllib
import re
import webnudge.util.misc
import webnudge.util.document
class FormFieldParserException(Exception):
    """
    Exception type for form field parsing errors.

    Derives from Exception so that it can be raised and caught like
    any other error (including by generic 'except Exception' handlers).
    The message given to the constructor is kept in self._message and
    is what str() returns.
    """
    def __init__(self, message):
        """
        message -- text describing what went wrong
        """
        Exception.__init__(self, message)
        self._message = message
    def __str__(self):
        """
        Return the message as a string
        """
        # str() guards against a non-string message, which would
        # otherwise make __str__ itself fail with a TypeError.
        return str(self._message)
###########################################################
class FormFieldParser(sgmllib.SGMLParser):
    ###########################################################
    """
    FormFieldParser class. Parse a page from a website,
    creating a dictionary of dictionaries of form fields.

    The outer dictionary is keyed by a form counter that starts at 0
    and advances each time a </form> tag is seen.  Each inner
    dictionary maps a field's name attribute to its value attribute
    ("" when the tag carries no value attribute).
    """
    #----------------------------------------------------------
    def __init__(self):
    #----------------------------------------------------------
        """
        Constructor
        """
        sgmllib.SGMLParser.__init__(self)
        # key under which the current (or next) form is stored
        self._formcount = 0
        # form key -> {field name: field value}
        self._formdict = {}
        # name attribute of the most recent <select>, used to label
        # the <option> tags that follow it
        self._selectname = None
    #----------------------------------------------------------
    def parse(self, text):
    #----------------------------------------------------------
        """
        Feed text to the parser, close the parser, and return the
        dictionary of form dictionaries collected so far.

        text -- the HTML source to parse
        """
        self.feed(text)
        self.close()
        return self._formdict
    #----------------------------------------------------------
    def start_form(self,attributes):
    #----------------------------------------------------------
        """
        start a form: create an empty field dictionary for it

        attributes -- list of (name, value) pairs from the <form>
                      tag; not used
        """
        self._formdict[self._formcount] = {}
        self._selectname = None
    #----------------------------------------------------------
    def end_form(self):
    #----------------------------------------------------------
        """
        end a form: advance the counter so that the next <form>
        gets its own dictionary
        """
        self._formcount += 1
        self._selectname = None
    #----------------------------------------------------------
    def _storeformfield(self,attributes,multivalue=0,defaultname=None):
    #----------------------------------------------------------
        """
        Capture name and value attributes of a form field

        attributes  -- list of (name, value) pairs from the tag
        multivalue  -- when true, store the field only if the tag
                       also carries a 'selected' attribute
        defaultname -- name to store the value under when the tag
                       has no name attribute of its own

        Fields that have no usable name, or that appear while no
        form is open, are ignored.  A later field with the same name
        replaces an earlier one.
        """
        tagname = None
        tagvalue = ""
        selected = 0
        for key, value in attributes:
            if key == "name":
                tagname = value
            elif key == "value":
                tagvalue = value
            elif key == "selected":
                selected = 1
        if multivalue and not selected:
            return
        if not tagname:
            tagname = defaultname
        if not tagname:
            return
        # No dictionary exists for the current key before the first
        # <form> or after a </form>; skip the field rather than fail
        # with a KeyError on stray markup.
        fields = self._formdict.get(self._formcount)
        if fields is None:
            return
        fields[tagname] = tagvalue
    #----------------------------------------------------------
    def do_input(self,attributes):
    #----------------------------------------------------------
        """
        Capture <input> element
        """
        self._storeformfield(attributes)
    #----------------------------------------------------------
    def do_option(self,attributes):
    #----------------------------------------------------------
        """
        Capture <option> element

        Only an option carrying 'selected' is stored.  An option
        normally has no name of its own, so its value is stored
        under the name of the most recent <select>.
        """
        self._storeformfield(attributes, multivalue=1,
                             defaultname=self._selectname)
    #----------------------------------------------------------
    def do_select(self,attributes):
    #----------------------------------------------------------
        """
        Capture <select> element

        Remembers the select's name for the <option> tags that
        follow; the select itself is stored only if it carries a
        'selected' attribute.
        """
        self._selectname = None
        for key, value in attributes:
            if key == "name":
                self._selectname = value
        self._storeformfield(attributes, multivalue=1)
    #----------------------------------------------------------
    def do_textarea(self,attributes):
    #----------------------------------------------------------
        """
        Capture <textarea> element

        Only the tag's attributes are examined; the text between
        <textarea> and </textarea> is not captured.
        """
        self._storeformfield(attributes)
#----------------------------------------------------------
if __name__ == "__main__":
    #----------------------------------------------------------
    # Code for commandline testing: load the page at the URL given as
    # the only argument, parse it, and write the repr() of the
    # resulting form dictionary to stdout.
    import sys
    if len(sys.argv) != 2:
        # diagnostics go to stderr so stdout carries only the result
        sys.stderr.write("Usage: formfieldparser.py <url>\n")
        sys.exit(-1)
    import webnudge.util.rawhtmlpage
    page = webnudge.util.rawhtmlpage.RawHTMLPage()
    page.load(sys.argv[1])
    # NOTE(review): presumably RawHTMLPage tests false when the load
    # failed and then exposes the reason as _message -- confirm
    # against webnudge.util.rawhtmlpage.
    if not page:
        sys.stderr.write("*** Error *** %s\n" % (page._message,))
        sys.exit(-1)
    result = FormFieldParser().parse(page._data)
    sys.stdout.write(repr(result))
More information about the Python-list mailing list