urllib2.UserAgent [was: Re: [Web-SIG] So what's missing?]

John J Lee jjl at pobox.com
Sun Nov 2 09:34:11 EST 2003


How about something like this (unfinished, untested!)?

Sorry for the 1.5.2-isms.


class UserAgent(OpenerDirector):
    """Convenient user-agent class.

    Wraps an OpenerDirector with setter methods that install, replace or
    remove handlers by key.  Every handler installed through this class is
    remembered in ``self._handlers`` (key -> handler instance) so that it can
    later be swapped out or removed.

    Do not modify the addheaders attribute directly; use
    set_persistent_headers() instead.

    """
    # XXX
    # AbstractHTTPHandler should be updated to use HTTP{S,}Connection.
    # Either AbstractHTTPHandler or auth (/proxy?) classes need to use
    #  httpx and this interface adjusted as appropriate.
    # Conditional fetches??

    # XXX should this be public?
    # Maps a key to the handler class that implements it.  Keys that do not
    # start with "_" are URL schemes; keys starting with "_" name optional
    # features and can never be passed to set_handled_schemes().
    handler_classes = {
        "http": HTTPHandler,
        "https": HTTPSHandler,
        "ftp": CacheFTPHandler,
        # XXX etc.

        # XXX
        # rest of auth
        # proxies
        "_authen": HTTPBasicAuthHandler,
        "_cookies": HTTPCookieProcessor,
        "_robots": RobotRulesProcessor,
        "_refresh": HTTPRefreshProcessor,
        "_equiv": HTTPEquivProcessor,
        "_seek": SeekableProcessor,

        "_debug_redirect": HTTPRedirectDebugProcessor,
        "_debug_response_body": HTTPResponseDebugProcessor,
        }
    # Keys (into handler_classes) that are installed by __init__.
    default_schemes = ["http", "https", "ftp"]
    default_handlers = ["_authen"]

    def __init__(self):
        """Create the opener and install the default handlers."""
        OpenerDirector.__init__(self)

        # key -> handler instance currently installed for that key
        self._handlers = {}
        for key in self.default_schemes + self.default_handlers:
            self._set_handler(key)

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_connection_cache(self, conn_cache):
##         self._conn_cache = conn_cache
##     def set_cache(self, cache):
##         self._cache = cache

    def set_handled_schemes(self, schemes):
        """Set sequence of protocol scheme strings.

        After the call, exactly the given schemes have a scheme handler
        installed; handlers for other schemes are removed and handlers for
        schemes that were already handled are left untouched.

        Raises ValueError, before anything is changed, if a scheme starts
        with "_" (those keys are reserved for non-scheme handlers) or has no
        entry in handler_classes.

        """
        wanted = []
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("invalid scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            if scheme not in wanted:
                wanted.append(scheme)

        # get rid of scheme handlers we don't want
        # (iterate over a copy: _set_handler mutates self._handlers)
        for scheme in list(self._handlers.keys()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme not in wanted:
                self._set_handler(scheme, handle=False)
        # add the scheme handlers that are missing
        for scheme in wanted:
            if scheme not in self._handlers:
                self._set_handler(scheme)

    def set_persistent_headers(self, headers):
        """Set sequence of header name, value pairs.

        These headers are sent with every request, as long as they are not
        overridden in the Request.  Header names are normalised with
        str.capitalize(); if a name occurs more than once, the last value
        wins.

        >>> ua = UserAgent()
        >>> ua.set_persistent_headers(
        ...  [("User-agent", "Mozilla/5.0 (compatible)"),
        ...   ("From", "responsible.person@example.com")])

        """
        # XXX tie in with robots stuff
        d = {}
        for name, value in headers:
            d[name.capitalize()] = value
        # addheaders is a list of pairs, so materialise the items
        self.addheaders = list(d.items())

    def _set_handler(self, key, obj=None, handle=True):
        """Install, replace or remove the handler registered under key.

        key: a key of handler_classes.
        obj: if not None, passed as the single constructor argument of the
            handler class; otherwise the class is called with no arguments.
        handle: if false, any handler currently installed for key is removed
            and nothing is installed in its place.

        Raises KeyError if a handler is to be installed and key is not in
        handler_classes.

        """
        oldhandler = self._handlers.get(key)
        newhandler = None
        if handle:
            handler_class = self.handler_classes[key]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class()
        self._replace_handler(oldhandler, newhandler)
        # keep the bookkeeping in step with what is actually installed
        if newhandler is None:
            self._handlers.pop(key, None)
        else:
            self._handlers[key] = newhandler

    def set_cookiejar(self, cookiejar):
        """Set a ClientCookie.CookieJar, or None.

        With None, the cookie handler is constructed without arguments.

        """
        self._set_handler("_cookies", cookiejar)

    def set_robotfileparser(self, rfp):
        """Set a robots.RobotFileParser, or None.

        With None, the robots handler is constructed without arguments.

        """
        self._set_handler("_robots", rfp)

    def set_credentials(self, credentials):
        """Set a urllib2.HTTPPasswordMgr, or None.

        With None, the authentication handler is constructed without
        arguments.

        """
        # XXX httpx?
        self._set_handler("_authen", credentials)

    # these methods all take a boolean parameter: true installs the
    # corresponding handler, false removes it
    def set_handle_refresh(self, handle):
        """Set whether to handle HTTP Refresh headers."""
        self._set_handler("_refresh", handle=handle)

    def set_handle_equiv(self, handle):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Implies seekable responses: turning this on also turns on
        set_seekable_responses().  Turning it off leaves seekable responses
        as they are.

        """
        if handle:
            self.set_seekable_responses(True)
        self._set_handler("_equiv", handle=handle)

    def set_seekable_responses(self, handle):
        """Set whether to make response objects .seek()able."""
        self._set_handler("_seek", handle=handle)

    # XXX haven't thought through debugging...
    def set_debug_redirects(self, handle):
        """Set whether to print information about HTTP redirects."""
        self._set_handler("_debug_redirect", handle=handle)

    def set_debug_responses(self, handle):
        """Set whether to print HTTP response bodies."""
        self._set_handler("_debug_response_body", handle=handle)

    def http_get(self, fullurl, ranges=None, conditions=None):
        """HTTP GET.

        ranges: sequence of pairs of byte ranges (start, end) to fetch;
            None (the default) or an empty sequence fetches the whole
            entity.
        conditions: not implemented yet; currently ignored.

        Ranges follow the usual Python rules (the start byte is included,
        the end byte is not; negative numbers count back from the end of
        the entity; start None means start of entity; end None means end of
        entity).  There are restrictions, though: end must not be negative,
        and if start is negative, end must be None.  Empty ranges
        (start == end) are skipped.

        Raises ValueError for a non-HTTP URL or an invalid range.

        >>> ua.http_get("http://www.example.com/big.dat",
        ...             [(0, 10), (-10, None)])  # first and last 10 bytes
        >>> ua.http_get("http://www.example.com/big.dat",
        ...             [(50000, None)])  # from byte 50000 to the end

        """
        # NOTE(review): _request is assumed to be supplied by the
        # OpenerDirector base class in use -- confirm.  A GET has no body,
        # hence data is None.
        req = self._request(fullurl, None)
        if req.get_type() != "http":
            raise ValueError("http_get for non-HTTP URI")
        specs = self._range_specs(ranges)
        if specs:
            req.add_header("Range", "bytes=%s" % ",".join(specs))
        return self.open(req)

    def _range_specs(self, ranges):
        """Translate Python-style (start, end) pairs to HTTP byte-range specs.

        Returns a list of strings such as "0-9" (first ten bytes), "500-"
        (from byte 500 to the end) or "-10" (last ten bytes).  HTTP ranges
        have an inclusive last byte, so end is decremented by one.

        """
        specs = []
        for start, end in ranges or ():
            if start is None:
                start = 0
            if start < 0:
                # suffix range: only "the last N bytes" can be expressed
                if end is not None:
                    raise ValueError("invalid range")
                specs.append("-%d" % -start)
            elif end is None:
                specs.append("%d-" % start)
            elif end < start:
                # also rejects negative end, since start >= 0 here
                raise ValueError("invalid range")
            elif start != end:
                specs.append("%d-%d" % (start, end - 1))
            # start == end is an empty range: nothing to request
        return specs

    # XXX how to support these methods using Request class?
##     def http_head(self, fullurl):

##     def http_put(self, fullurl, data=None):
##         # XXX what about 30x handling?

    def _replace_handler(self, handler, newhandler=None):
        """Remove handler (if not None), then add newhandler (if not None).

        Only the opener's registries are touched; callers are responsible
        for keeping self._handlers up to date.

        """
        # first, if handler was previously added, remove it
        if handler is not None:
            for registry in [self.handlers,
                             self.handle_open, self.handle_error,
                             self.process_request, self.process_response]:
                self._discard_handler(registry, handler)
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)

    def _discard_handler(self, registry, handler):
        """Remove every occurrence of handler from registry, in place.

        registry is either a list of handlers or a dict whose values are
        themselves registries; which one depends on the OpenerDirector
        implementation, so both shapes (including nested dicts) are handled.
        Handlers are compared by identity.

        """
        if isinstance(registry, dict):
            for value in registry.values():
                self._discard_handler(value, handler)
        else:
            # rebuild in place rather than deleting while indexing, which
            # would skip elements or run off the end of the list
            registry[:] = [h for h in registry if h is not handler]



More information about the Web-SIG mailing list