urllib2.UserAgent [was: Re: [Web-SIG] So what's missing?]
John J Lee
jjl at pobox.com
Sun Nov 2 09:34:11 EST 2003
How about something like this (unfinished, untested!)?
Sorry for the 1.5.2-isms.
class UserAgent(OpenerDirector):
    """Convenient user-agent class.

    Wraps an OpenerDirector and lets individual features (scheme handlers,
    cookies, robots.txt, authentication, ...) be switched on, off or
    reconfigured through ``set_*`` methods.  Every handler installed by this
    class is tracked in ``self._handlers`` under its ``handler_classes`` key
    so that it can later be replaced or removed.

    Do not modify the addheaders attribute directly; use
    set_persistent_headers() instead.
    """

    # XXX
    # AbstractHTTPHandler should be updated to use HTTP{S,}Connection.
    # Either AbstractHTTPHandler or auth (/proxy?) classes need to use
    # httpx and this interface adjusted as appropriate.
    # Conditional fetches??

    # XXX should this be public?
    # Maps a key to the handler class that implements it.  Keys without a
    # leading underscore are URL schemes; keys with one are optional features.
    handler_classes = {
        "http": HTTPHandler,
        "https": HTTPSHandler,
        "ftp": CacheFTPHandler,
        # XXX etc.

        # XXX
        # rest of auth
        # proxies
        "_authen": HTTPBasicAuthHandler,
        "_cookies": HTTPCookieProcessor,
        "_robots": RobotRulesProcessor,
        "_refresh": HTTPRefreshProcessor,
        "_equiv": HTTPEquivProcessor,
        "_seek": SeekableProcessor,
        "_debug_redirect": HTTPRedirectDebugProcessor,
        "_debug_response_body": HTTPResponseDebugProcessor,
    }

    # Keys from handler_classes that every new UserAgent starts out with.
    default_schemes = ["http", "https", "ftp"]
    default_handlers = ["_authen"]

    def __init__(self):
        """Create the opener and install the default handlers."""
        OpenerDirector.__init__(self)
        # key (see handler_classes) -> handler instance currently installed
        self._handlers = {}
        for key in self.default_schemes + self.default_handlers:
            self._set_handler(key)

    # XXX
    ## def set_timeout(self, timeout):
    ##     self._timeout = timeout
    ## def set_connection_cache(self, conn_cache):
    ##     self._conn_cache = conn_cache
    ## def set_cache(self, cache):
    ##     self._cache = cache

    def set_handled_schemes(self, schemes):
        """Set sequence of protocol scheme strings.

        Handlers for schemes not listed are removed, handlers for listed
        schemes that are not yet installed are added, and handlers that are
        already installed are left untouched.

        Raises ValueError, before anything is changed, if a scheme starts
        with "_" (those keys name optional features, not schemes) or has no
        entry in handler_classes.
        """
        wanted = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("invalid scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            wanted[scheme] = None

        # get rid of scheme handlers we don't want
        # (iterate over a copy: entries are deleted inside the loop)
        for scheme, oldhandler in list(self._handlers.items()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme in wanted:
                # already installed, nothing to add later
                del wanted[scheme]
            else:
                self._replace_handler(oldhandler, None)
                del self._handlers[scheme]

        # add the scheme handlers that are missing
        for scheme in wanted.keys():
            self._set_handler(scheme)

    def set_persistent_headers(self, headers):
        """Set sequence of header name, value pairs.

        These headers are sent with every request, as long as they are not
        overridden in the Request.  Names are normalised with
        str.capitalize(); if a name occurs more than once the last value
        wins.

        >>> ua = UserAgent()
        >>> ua.set_persistent_headers(
        ...     [("User-agent", "Mozilla/5.0 (compatible)"),
        ...      ("From", "responsible.person@example.com")])

        """
        # XXX tie in with robots stuff
        by_name = {}
        for name, value in headers:
            by_name[name.capitalize()] = value
        # addheaders must be a real list, not a dict view
        self.addheaders = list(by_name.items())

    def _set_handler(self, key, obj=None, handle=True):
        """Install, replace or remove the handler registered under key.

        key: a key of handler_classes.
        obj: if not None, passed as the single constructor argument of the
            handler class; otherwise the class is called without arguments.
        handle: if false, any handler currently installed under key is
            removed and nothing is installed in its place.
        """
        oldhandler = self._handlers.get(key)
        if handle:
            handler_class = self.handler_classes[key]
            # Build the replacement first so that a failing constructor
            # leaves the currently installed handler in place.
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class()
        else:
            newhandler = None
        self._replace_handler(oldhandler, newhandler)
        # keep the bookkeeping in step with what the opener now holds
        if newhandler is not None:
            self._handlers[key] = newhandler
        elif key in self._handlers:
            del self._handlers[key]

    def set_cookiejar(self, cookiejar):
        """Set a ClientCookie.CookieJar, or None.

        None installs the cookie handler with its default constructor
        arguments.
        """
        self._set_handler("_cookies", cookiejar)

    def set_robotfileparser(self, rfp):
        """Set a robots.RobotFileParser, or None.

        None installs the robots handler with its default constructor
        arguments.
        """
        self._set_handler("_robots", rfp)

    def set_credentials(self, credentials):
        """Set a urllib2.HTTPPasswordMgr, or None.

        None installs the authentication handler with its default
        constructor arguments.
        """
        # XXX httpx?
        self._set_handler("_authen", credentials)

    # these methods all take a boolean parameter

    def set_handle_refresh(self, handle):
        """Set whether to handle HTTP Refresh headers."""
        self._set_handler("_refresh", handle=handle)

    def set_handle_equiv(self, handle):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Implies seekable responses.  Turning this off leaves seekable
        responses as they are, since they may have been requested
        independently.
        """
        if handle:
            self.set_seekable_responses(True)
        self._set_handler("_equiv", handle=handle)

    def set_seekable_responses(self, handle):
        """Set whether to make response objects .seek()able."""
        self._set_handler("_seek", handle=handle)

    # XXX haven't thought through debugging...

    def set_debug_redirects(self, handle):
        """Set whether to print information about HTTP redirects."""
        self._set_handler("_debug_redirect", handle=handle)

    def set_debug_responses(self, handle):
        """Set whether to print HTTP response bodies."""
        self._set_handler("_debug_response_body", handle=handle)

    def http_get(self, fullurl, ranges=None, conditions=None):
        """HTTP GET.

        fullurl: URL string or Request object.

        ranges: sequence of pairs of byte ranges (start, end) to fetch;
        None or an empty sequence fetches the whole entity.

        Ranges follow the usual Python rules (the start byte is included,
        the end byte is not; negative numbers count back from the end of
        the entity; start None means start of entity; end None means end of
        entity).  There are restrictions, though: end must not be negative,
        and if start is negative, end must be None.  Empty ranges
        (start == end) are silently dropped.

        conditions: reserved for conditional fetches; currently ignored.

        Raises ValueError for a non-HTTP(S) URL or an invalid range.

        >>> ua.http_get("http://www.example.com/big.dat",
        ...             [(0, 10), (-10, None)])  # first and last 10 bytes
        >>> ua.http_get("http://www.example.com/big.dat",
        ...             [(50000, None)])  # from byte 50000 to the end

        """
        # NOTE(review): _request is not defined in this class; it is
        # assumed to be supplied by the OpenerDirector base -- confirm.
        # A GET carries no request body, hence data=None.
        req = self._request(fullurl, None)
        if req.get_type() not in ("http", "https"):
            raise ValueError("http_get for non-HTTP URI")

        specs = []
        for start, end in ranges or ():
            spec = self._byte_range_spec(start, end)
            if spec is not None:
                specs.append(spec)
        # Only send a Range header if there is something to put in it.
        if specs:
            req.add_header("Range", "bytes=%s" % ", ".join(specs))
        return self.open(req)

    def _byte_range_spec(self, start, end):
        """Return the HTTP byte-range-spec for one Python-style range.

        Returns None for an empty range; raises ValueError for a range that
        cannot be expressed (see http_get for the rules).
        """
        if start is None:
            start = 0
        if end is not None and end < 0:
            raise ValueError("invalid range (%r, %r)" % (start, end))
        if start < 0:
            if end is not None:
                raise ValueError("invalid range (%r, %r)" % (start, end))
            # suffix form: "-N" asks for the last N bytes
            return "-%d" % -start
        if end is None:
            # open-ended form: from start to the end of the entity
            return "%d-" % start
        if start > end:
            raise ValueError("invalid range (%r, %r)" % (start, end))
        if start == end:
            return None
        # Python's end is exclusive, HTTP's last-byte-pos is inclusive
        return "%d-%d" % (start, end - 1)

    # XXX how to support these methods using Request class?
    ## def http_head(self, fullurl):
    ## def http_put(self, fullurl, data=None):
    ##     # XXX what about 30x handling?

    def _replace_handler(self, handler, newhandler=None):
        """Remove handler from the opener (if not None), then add newhandler.

        Either argument may be None: a None handler means there is nothing
        to remove, a None newhandler means nothing is added.  This method
        does not touch self._handlers; callers keep that mapping current.
        """
        # first, if handler was previously added, remove it
        if handler is not None:
            for registry in (self.handlers,
                             self.handle_open, self.handle_error,
                             self.process_request, self.process_response):
                self._discard_handler(registry, handler)
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)

    def _discard_handler(self, registry, handler):
        """Remove every occurrence of handler from an opener registry.

        registry is either a list of handlers or a dict whose values are
        themselves registries.  The recursion copes with any nesting depth:
        in the standard library's OpenerDirector, handlers is a plain list,
        handle_open maps to lists and handle_error maps to dicts of lists.
        """
        if isinstance(registry, dict):
            for nested in registry.values():
                self._discard_handler(nested, handler)
        else:
            # Rebuild in place (other references to the list stay valid)
            # rather than deleting by index while iterating.
            registry[:] = [h for h in registry if h is not handler]
More information about the Web-SIG
mailing list