htmllib samples

Doug Fort dougfort at downright.com
Thu Mar 29 07:24:13 EST 2001


Oliver Vecernik wrote:

> Hi,
>
> has anyone got some samples or point me to where to find them on how to
> implement this module?
>
> Oliver

I've attached a simple parser we use.  It may help.

Note that Fredrik Lundh modestly failed to mention his upcoming book on
'The Python Library'.  There's an ebook version of his 'eff-bot' Guide,
which is the first place I look for code examples.

--
Doug Fort (dougfort at downright.com)
Senior Meat Manager
Downright Software LLC
http://www.dougfort.net


-------------- next part --------------
#!/usr/bin/env python
"""
FormFieldParser

This object parses HTML text and builds a dictionary of
dictionaries of form fields

$Id: formfieldparser.py,v 1.1 2001/01/26 15:18:30 dougfort Exp $
"""
__author__="Downright Software LLC"
__version__="$Revision: 1.1 $"[11:-2]

import sgmllib
import string
import cStringIO
import urllib
import re

import webnudge.util.misc
import webnudge.util.document

class FormFieldParserException(Exception):
    """
    Error type for form field parsing problems.

    Derives from Exception so that it can be raised and caught like any
    other exception, including by generic "except Exception" handlers.

    message -- human readable description; returned by str()
    """
    def __init__(self, message):
        Exception.__init__(self, message)
        # Kept as an attribute for code that reads _message directly.
        self._message = message
    def __str__(self):
        return self._message

###########################################################
class FormFieldParser(sgmllib.SGMLParser):
###########################################################
    """
    FormFieldParser class. Parse a page from a website,
    creating a dictionary of dictionaries of form fields.

    The outer dictionary is keyed by form index (0 for the first
    <form> seen, 1 for the next, ...).  Each inner dictionary maps a
    field's "name" attribute to its "value" attribute, or to "" when
    the tag carries no value attribute.

    Only tag attributes are inspected: text between <textarea> and
    </textarea>, or between <option> and </option>, is not captured.
    Fields that appear outside any <form> are ignored.
    """

    #----------------------------------------------------------
    def __init__(self):
        """
        Constructor
        """
        sgmllib.SGMLParser.__init__(self)

        # index of the form currently being filled (or of the next
        # form to be opened, when no form is open)
        self._formcount = 0
        # form index -> {field name: field value}
        self._formdict = {}
        # name attribute of the most recent <select>; <option> tags
        # have no name of their own and are stored under this one
        self._selectname = None

    #----------------------------------------------------------
    def parse(self, text):
        """
        Feed text to the parser, close it, and return the dictionary
        of forms collected so far.

        The returned object is the parser's own dictionary, not a copy.
        """
        self.feed(text)
        self.close()
        return self._formdict

    #----------------------------------------------------------
    def start_form(self,attributes):
        """
        start a form
        """
        if self._formcount in self._formdict:
            # The previous <form> was never closed.  Move on to a new
            # index instead of wiping the fields already collected.
            self._formcount += 1
        self._formdict[self._formcount] = {}

    #----------------------------------------------------------
    def end_form(self):
        """
        end a form
        """
        self._formcount += 1

    #----------------------------------------------------------
    def unknown_endtag(self, tag):
        """
        Forget the current <select> name when </select> is seen.

        <select> is handled by do_select, so there is no end_select
        handler and sgmllib reports its end tag here.
        """
        if tag == "select":
            self._selectname = None

    #----------------------------------------------------------
    def _attribute(self, attributes, wanted, default=None):
        """
        Return the value of the last attribute named wanted in the
        list of (name, value) pairs, or default if there is none.
        """
        found = default
        for key, value in attributes:
            if key == wanted:
                found = value
        return found

    #----------------------------------------------------------
    def _storeformfield(self,attributes,multivalue=0):
        """
        Capture name and value attributes of a form field

        attributes -- list of (name, value) pairs for the tag
        multivalue -- when true, the field is stored only if the tag
                      has a "selected" attribute

        Nothing is stored when the tag has no (non-empty) name or when
        no form is currently open.  A later field with the same name
        replaces an earlier one.
        """
        tagname = self._attribute(attributes, "name")
        tagvalue = self._attribute(attributes, "value", "")
        selected = self._attribute(attributes, "selected") is not None

        if multivalue and not selected:
            return
        if not tagname:
            return

        # Fields before the first <form> or after </form> have no
        # dictionary to go into; skip them rather than raise KeyError.
        form = self._formdict.get(self._formcount)
        if form is None:
            return
        form[tagname] = tagvalue

    #----------------------------------------------------------
    def do_input(self,attributes):
        """
        Capture <input> element
        """
        self._storeformfield(attributes)

    #----------------------------------------------------------
    def do_option(self,attributes):
        """
        Capture <option> element

        A selected option is stored under the name of the enclosing
        <select>, unless the option carries a name of its own.
        """
        if self._selectname and not self._attribute(attributes, "name"):
            # Appended last so it wins over an empty name attribute.
            attributes = list(attributes) + [("name", self._selectname)]
        self._storeformfield(attributes, multivalue=1)

    #----------------------------------------------------------
    def do_select(self,attributes):
        """
        Capture <select> element

        Remembers the select's name so that the options which follow
        can be stored under it.
        """
        self._selectname = self._attribute(attributes, "name")
        self._storeformfield(attributes, multivalue=1)

    #----------------------------------------------------------
    def do_textarea(self,attributes):
        """
        Capture <textarea> element
        """
        self._storeformfield(attributes)

#----------------------------------------------------------
if __name__ == "__main__":
#----------------------------------------------------------
    # Code for commandline testing: load the page at the URL given as
    # the only argument and write the repr() of the parsed form
    # dictionary to stdout.
    import sys
    if len(sys.argv) != 2:
        # Diagnostics go to stderr so stdout carries only the result.
        sys.stderr.write("Usage:  formfieldparser.py <url>\n")
        sys.exit(-1)

    import webnudge.util.rawhtmlpage
    page = webnudge.util.rawhtmlpage.RawHTMLPage()
    page.load(sys.argv[1])
    # NOTE(review): presumably RawHTMLPage is false when the load
    # failed and keeps the reason in _message -- confirm against
    # webnudge.util.rawhtmlpage.
    if not page:
        sys.stderr.write("*** Error *** %s\n" % (page._message,))
        sys.exit(-1)

    result = FormFieldParser().parse(page._data)

    sys.stdout.write(repr(result))




More information about the Python-list mailing list