[3] | 1 | """Convenient HTTP UserAgent class. |
---|
| 2 | |
---|
| 3 | This is a subclass of urllib2.OpenerDirector. |
---|
| 4 | |
---|
| 5 | |
---|
| 6 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
---|
| 7 | |
---|
| 8 | This code is free software; you can redistribute it and/or modify it under |
---|
| 9 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt |
---|
| 10 | included with the distribution). |
---|
| 11 | |
---|
| 12 | """ |
---|
| 13 | |
---|
| 14 | import sys, warnings, urllib2 |
---|
| 15 | |
---|
| 16 | import _opener |
---|
| 17 | import _urllib2 |
---|
| 18 | import _auth |
---|
| 19 | import _gzip |
---|
| 20 | import _response |
---|
| 21 | |
---|
| 22 | |
---|
class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt with
    by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows switching
    off the adding of a .seek() method to responses.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
     every request, unless they are overridden in the Request instance.

    >>> ua = UserAgentBase()
    >>> ua.addheaders = [
    ...  ("User-agent", "Mozilla/5.0 (compatible)"),
    ...  ("From", "responsible.person@example.com")]

    """

    # Maps a handler name to the class instantiated for it.  Names without a
    # leading underscore are URL schemes; names with one are internal
    # features.
    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
        }

    # Handler names installed by __init__.
    default_schemes = ["http", "ftp", "file"]
    default_others = ["_unknown", "_http_error", "_http_request_upgrade",
                      "_http_default_error",
                      ]
    default_features = ["_redirect", "_cookies",
                        "_refresh", "_equiv",
                        "_basicauth", "_digestauth",
                        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
                        "_robots",
                        ]
    # HTTPS is only offered when _urllib2 provides a handler for it.
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        """Create the opener and install all default handlers."""
        _opener.OpenerDirector.__init__(self)

        # Registry of the handlers managed by this class, keyed by the names
        # used in handler_classes.
        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes+
                       self.default_others+
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.values():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
            "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        """Close the underlying opener and drop the handler registry."""
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_http_connection_cache(self, conn_cache):
##         self._http_conn_cache = conn_cache
##     def set_ftp_connection_cache(self, conn_cache):
##         # XXX ATM, FTP has cache as part of handler; should it be separate?
##         self._ftp_conn_cache = conn_cache

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        """
        # Validate everything first so a bad name leaves the agent untouched.
        # (A dict is used as a set.)
        want = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            want[scheme] = None

        # get rid of scheme handlers we don't want
        for scheme, oldhandler in list(self._ua_handlers.items()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme not in want:
                self._replace_handler(scheme, None)
            elif oldhandler is not None and oldhandler in self.handlers:
                # Only skip schemes whose handler is really installed: a
                # registry entry can outlive a handler that was removed
                # earlier, and such a scheme must be re-added below.
                del want[scheme]  # already got it
        # add the scheme handlers that are missing
        for scheme in list(want.keys()):
            self._set_handler(scheme, True)
            if scheme == "https":
                # A freshly built HTTPS handler does not know about the
                # client certificate manager configured earlier.
                cert_manager = getattr(self, "_client_cert_manager", None)
                if cert_manager is not None:
                    self.set_client_cert_manager(cert_manager)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies):
        """Set a dictionary mapping URL scheme to proxy specification, or None.

        e.g. {"http": "joe:password@myproxy.example.com:3128",
              "ftp": "proxy.example.com"}

        """
        self._set_handler("_proxy", obj=proxies)

    def add_password(self, url, user, password, realm=None):
        """Add credentials for url to the current password manager."""
        self._password_manager.add_password(realm, url, user, password)

    def add_proxy_password(self, user, password, hostport=None, realm=None):
        """Add proxy credentials to the current proxy password manager."""
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.  At
        the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)

    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)

    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPClientCertMgr, or None.

        Raises KeyError if no "https" handler is registered.

        """
        self._client_cert_manager = cert_manager
        handler = self._ua_handlers["https"]
        handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)

    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)

    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers.

        max_time and honor_time are passed as keyword arguments to the
        refresh handler's constructor.

        """
        self._set_handler("_refresh", handle, constructor_kwds=
                          {"max_time": max_time, "honor_time": honor_time})

    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)

    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)

    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        eg:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies" (or "cookielib" if running Python 2.4)

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)

    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)

    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds=None):
        """Install, replace or remove the handler registered under name.

        handle: true to install a new handler, false to remove the current
         one; if None, a handler is installed exactly when obj is not None.
        obj: if not None, passed as the sole argument to the handler class.
        constructor_args, constructor_kwds: arguments for the handler class
         when obj is None.

        """
        # None sentinel rather than a shared mutable default dict.
        if constructor_kwds is None:
            constructor_kwds = {}
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(*constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        """Remove the handler registered under name, then add newhandler.

        If newhandler is None the old handler is only taken out of
        self.handlers; its registry entry is left as it was.

        """
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    # was already removed from the active handler list
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler
| 327 | |
---|
| 328 | |
---|
class UserAgent(UserAgentBase):
    """UserAgentBase with optional .seek()able responses (off by default)."""

    def __init__(self):
        UserAgentBase.__init__(self)
        # Responses are returned unwrapped until the caller opts in.
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None):
        """Open fullurl, wrapping the response for seeking when enabled."""
        if not self._seekable:
            return UserAgentBase.open(self, fullurl, data)

        # wrapped_open is handed a plain callable, so bind self here.
        def base_open(fullurl, data=None):
            return UserAgentBase.open(self, fullurl, data)

        return _opener.wrapped_open(
            base_open, _response.seek_wrapped_response, fullurl, data)