1 | """Convenient HTTP UserAgent class. |
---|
2 | |
---|
3 | This is a subclass of urllib2.OpenerDirector. |
---|
4 | |
---|
5 | |
---|
6 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
---|
7 | |
---|
8 | This code is free software; you can redistribute it and/or modify it under |
---|
9 | the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt |
---|
10 | included with the distribution). |
---|
11 | |
---|
12 | """ |
---|
13 | |
---|
14 | import sys, warnings, urllib2 |
---|
15 | |
---|
16 | import _opener |
---|
17 | import _urllib2 |
---|
18 | import _auth |
---|
19 | import _gzip |
---|
20 | import _response |
---|
21 | |
---|
22 | |
---|
class UserAgentBase(_opener.OpenerDirector):
    """Convenient user-agent class.

    Do not use .add_handler() to add a handler for something already dealt with
    by this code.

    The only reason at present for the distinction between UserAgent and
    UserAgentBase is so that classes that depend on .seek()able responses
    (e.g. mechanize.Browser) can inherit from UserAgentBase.  The subclass
    UserAgent exposes a .set_seekable_responses() method that allows switching
    off the adding of a .seek() method to responses.

    Handlers managed by this class are tracked in self._ua_handlers, a dict
    mapping a name from handler_classes (a URL scheme, or a "_"-prefixed
    feature name) to the handler instance most recently installed under that
    name.

    Public attributes:

    addheaders: list of (name, value) pairs specifying headers to send with
     every request, unless they are overridden in the Request instance.

     >>> ua = UserAgentBase()
     >>> ua.addheaders = [
     ...  ("User-agent", "Mozilla/5.0 (compatible)"),
     ...  ("From", "responsible.person@example.com")]

    """

    # Maps handler name -> handler class.  Names without a leading underscore
    # are URL schemes; names with one are non-scheme ("feature") handlers.
    handler_classes = {
        # scheme handlers
        "http": _urllib2.HTTPHandler,
        # CacheFTPHandler is buggy, at least in 2.3, so we don't use it
        "ftp": _urllib2.FTPHandler,
        "file": _urllib2.FileHandler,

        # other handlers
        "_unknown": _urllib2.UnknownHandler,
        # HTTP{S,}Handler depend on HTTPErrorProcessor too
        "_http_error": _urllib2.HTTPErrorProcessor,
        "_http_request_upgrade": _urllib2.HTTPRequestUpgradeProcessor,
        "_http_default_error": _urllib2.HTTPDefaultErrorHandler,

        # feature handlers
        "_basicauth": _urllib2.HTTPBasicAuthHandler,
        "_digestauth": _urllib2.HTTPDigestAuthHandler,
        "_redirect": _urllib2.HTTPRedirectHandler,
        "_cookies": _urllib2.HTTPCookieProcessor,
        "_refresh": _urllib2.HTTPRefreshProcessor,
        "_equiv": _urllib2.HTTPEquivProcessor,
        "_proxy": _urllib2.ProxyHandler,
        "_proxy_basicauth": _urllib2.ProxyBasicAuthHandler,
        "_proxy_digestauth": _urllib2.ProxyDigestAuthHandler,
        "_robots": _urllib2.HTTPRobotRulesProcessor,
        "_gzip": _gzip.HTTPGzipProcessor,  # experimental!

        # debug handlers
        "_debug_redirect": _urllib2.HTTPRedirectDebugProcessor,
        "_debug_response_body": _urllib2.HTTPResponseDebugProcessor,
        }

    # Names of the handlers installed by __init__.
    default_schemes = ["http", "ftp", "file"]
    default_others = ["_unknown", "_http_error", "_http_request_upgrade",
                      "_http_default_error",
                      ]
    default_features = ["_redirect", "_cookies",
                        "_refresh", "_equiv",
                        "_basicauth", "_digestauth",
                        "_proxy", "_proxy_basicauth", "_proxy_digestauth",
                        "_robots",
                        ]
    # HTTPS is only offered when _urllib2 provides a handler for it.
    if hasattr(_urllib2, 'HTTPSHandler'):
        handler_classes["https"] = _urllib2.HTTPSHandler
        default_schemes.append("https")

    def __init__(self):
        """Install the default scheme, "other" and feature handlers."""
        _opener.OpenerDirector.__init__(self)

        ua_handlers = self._ua_handlers = {}
        for scheme in (self.default_schemes +
                       self.default_others +
                       self.default_features):
            klass = self.handler_classes[scheme]
            ua_handlers[scheme] = klass()
        for handler in ua_handlers.values():
            self.add_handler(handler)

        # Yuck.
        # Ensure correct default constructor args were passed to
        # HTTPRefreshProcessor and HTTPEquivProcessor.
        if "_refresh" in ua_handlers:
            self.set_handle_refresh(True)
        if "_equiv" in ua_handlers:
            self.set_handle_equiv(True)
        # Ensure default password managers are installed.
        pm = ppm = None
        if "_basicauth" in ua_handlers or "_digestauth" in ua_handlers:
            pm = _urllib2.HTTPPasswordMgrWithDefaultRealm()
        if ("_proxy_basicauth" in ua_handlers or
            "_proxy_digestauth" in ua_handlers):
            ppm = _auth.HTTPProxyPasswordMgr()
        self.set_password_manager(pm)
        self.set_proxy_password_manager(ppm)
        # set default certificate manager
        if "https" in ua_handlers:
            cm = _urllib2.HTTPSClientCertMgr()
            self.set_client_cert_manager(cm)

    def close(self):
        """Close the underlying opener and drop the handler registry."""
        _opener.OpenerDirector.close(self)
        self._ua_handlers = None

    # XXX
##     def set_timeout(self, timeout):
##         self._timeout = timeout
##     def set_http_connection_cache(self, conn_cache):
##         self._http_conn_cache = conn_cache
##     def set_ftp_connection_cache(self, conn_cache):
##         # XXX ATM, FTP has cache as part of handler; should it be separate?
##         self._ftp_conn_cache = conn_cache

    def set_handled_schemes(self, schemes):
        """Set sequence of URL scheme (protocol) strings.

        For example: ua.set_handled_schemes(["http", "ftp"])

        If this fails (with ValueError) because you've passed an unknown
        scheme, the set of handled schemes will not be changed.

        A scheme that was dropped by an earlier call is installed again (with
        a freshly constructed handler) if it is listed in schemes.

        """
        # Validate everything up front so a bad name leaves state untouched.
        want = {}
        for scheme in schemes:
            if scheme.startswith("_"):
                raise ValueError("not a scheme '%s'" % scheme)
            if scheme not in self.handler_classes:
                raise ValueError("unknown scheme '%s'" % scheme)
            want[scheme] = None

        # get rid of scheme handlers we don't want
        # (iterate over a snapshot: _replace_handler may touch the registry)
        for scheme, oldhandler in list(self._ua_handlers.items()):
            if scheme.startswith("_"):
                continue  # not a scheme handler
            if scheme not in want:
                self._replace_handler(scheme, None)
            elif oldhandler is not None and oldhandler in self.handlers:
                # Only skip schemes whose handler is really still installed;
                # the registry can hold an entry for a handler that was
                # removed earlier, and that scheme must be added again.
                del want[scheme]  # already got it
        # add the scheme handlers that are missing
        for scheme in list(want.keys()):
            self._set_handler(scheme, True)

    def set_cookiejar(self, cookiejar):
        """Set a mechanize.CookieJar, or None."""
        self._set_handler("_cookies", obj=cookiejar)

    # XXX could use Greg Stein's httpx for some of this instead?
    # or httplib2??
    def set_proxies(self, proxies):
        """Set a dictionary mapping URL scheme to proxy specification, or None.

        e.g. {"http": "joe:password@myproxy.example.com:3128",
              "ftp": "proxy.example.com"}

        """
        self._set_handler("_proxy", obj=proxies)

    def add_password(self, url, user, password, realm=None):
        """Register credentials for url with the current password manager."""
        self._password_manager.add_password(realm, url, user, password)

    def add_proxy_password(self, user, password, hostport=None, realm=None):
        """Register credentials with the current proxy password manager."""
        self._proxy_password_manager.add_password(
            realm, hostport, user, password)

    def add_client_certificate(self, url, key_file, cert_file):
        """Add an SSL client certificate, for HTTPS client auth.

        key_file and cert_file must be filenames of the key and certificate
        files, in PEM format.  You can use e.g. OpenSSL to convert a p12 (PKCS
        12) file to PEM format:

        openssl pkcs12 -clcerts -nokeys -in cert.p12 -out cert.pem
        openssl pkcs12 -nocerts -in cert.p12 -out key.pem


        Note that client certificate password input is very inflexible ATM.  At
        the moment this seems to be console only, which is presumably the
        default behaviour of libopenssl.  In future mechanize may support
        third-party libraries that (I assume) allow more options here.

        """
        self._client_cert_manager.add_key_cert(url, key_file, cert_file)

    # the following are rarely useful -- use add_password / add_proxy_password
    # instead
    def set_password_manager(self, password_manager):
        """Set a mechanize.HTTPPasswordMgrWithDefaultRealm, or None."""
        self._password_manager = password_manager
        self._set_handler("_basicauth", obj=password_manager)
        self._set_handler("_digestauth", obj=password_manager)

    def set_proxy_password_manager(self, password_manager):
        """Set a mechanize.HTTPProxyPasswordMgr, or None."""
        self._proxy_password_manager = password_manager
        self._set_handler("_proxy_basicauth", obj=password_manager)
        self._set_handler("_proxy_digestauth", obj=password_manager)

    def set_client_cert_manager(self, cert_manager):
        """Set a mechanize.HTTPClientCertMgr, or None.

        Requires an "https" entry in the handler registry (KeyError
        otherwise); the manager is attached to that handler.

        """
        self._client_cert_manager = cert_manager
        handler = self._ua_handlers["https"]
        handler.client_cert_manager = cert_manager

    # these methods all take a boolean parameter
    def set_handle_robots(self, handle):
        """Set whether to observe rules from robots.txt."""
        self._set_handler("_robots", handle)

    def set_handle_redirect(self, handle):
        """Set whether to handle HTTP 30x redirections."""
        self._set_handler("_redirect", handle)

    def set_handle_refresh(self, handle, max_time=None, honor_time=True):
        """Set whether to handle HTTP Refresh headers.

        max_time and honor_time are passed as keyword arguments to the
        constructor of the "_refresh" handler class.

        """
        self._set_handler("_refresh", handle, constructor_kwds=
                          {"max_time": max_time, "honor_time": honor_time})

    def set_handle_equiv(self, handle, head_parser_class=None):
        """Set whether to treat HTML http-equiv headers like HTTP headers.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        head_parser_class, when not None, is passed as a keyword argument to
        the constructor of the "_equiv" handler class.

        """
        if head_parser_class is not None:
            constructor_kwds = {"head_parser_class": head_parser_class}
        else:
            constructor_kwds = {}
        self._set_handler("_equiv", handle, constructor_kwds=constructor_kwds)

    def set_handle_gzip(self, handle):
        """Handle gzip transfer encoding.

        Emits a warning when switched on, because the feature is experimental.

        """
        if handle:
            warnings.warn(
                "gzip transfer encoding is experimental!", stacklevel=2)
        self._set_handler("_gzip", handle)

    def set_debug_redirects(self, handle):
        """Log information about HTTP redirects (including refreshes).

        Logging is performed using module logging.  The logger name is
        "mechanize.http_redirects".  To actually print some debug output,
        eg:

        import sys, logging
        logger = logging.getLogger("mechanize.http_redirects")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        Other logger names relevant to this module:

        "mechanize.http_responses"
        "mechanize.cookies" (or "cookielib" if running Python 2.4)

        To turn on everything:

        import sys, logging
        logger = logging.getLogger("mechanize")
        logger.addHandler(logging.StreamHandler(sys.stdout))
        logger.setLevel(logging.INFO)

        """
        self._set_handler("_debug_redirect", handle)

    def set_debug_responses(self, handle):
        """Log HTTP response bodies.

        See docstring for .set_debug_redirects() for details of logging.

        Response objects may be .seek()able if this is set (currently returned
        responses are, raised HTTPError exception responses are not).

        """
        self._set_handler("_debug_response_body", handle)

    def set_debug_http(self, handle):
        """Print HTTP headers to sys.stdout."""
        level = int(bool(handle))
        for scheme in "http", "https":
            h = self._ua_handlers.get(scheme)
            if h is not None:
                h.set_http_debuglevel(level)

    def _set_handler(self, name, handle=None, obj=None,
                     constructor_args=(), constructor_kwds=None):
        """Install, replace or remove the handler registered under name.

        name: key into handler_classes.
        handle: true to install a new handler, false to remove the current
         one; when None it defaults to whether obj was supplied.
        obj: if not None, passed as the sole constructor argument of the
         handler class (constructor_args / constructor_kwds are then unused).
        constructor_args, constructor_kwds: constructor arguments used when
         obj is None.

        """
        # None sentinel instead of a shared mutable {} default.
        if constructor_kwds is None:
            constructor_kwds = {}
        if handle is None:
            handle = obj is not None
        if handle:
            handler_class = self.handler_classes[name]
            if obj is not None:
                newhandler = handler_class(obj)
            else:
                newhandler = handler_class(
                    *constructor_args, **constructor_kwds)
        else:
            newhandler = None
        self._replace_handler(name, newhandler)

    def _replace_handler(self, name, newhandler=None):
        """Remove the handler registered under name, then add newhandler.

        With newhandler None the old handler is only removed from
        self.handlers; its registry entry is left as it was.

        """
        # first, if handler was previously added, remove it
        if name is not None:
            handler = self._ua_handlers.get(name)
            if handler is not None:
                try:
                    self.handlers.remove(handler)
                except ValueError:
                    # registry entry was stale: handler already removed
                    pass
        # then add the replacement, if any
        if newhandler is not None:
            self.add_handler(newhandler)
            self._ua_handlers[name] = newhandler
327 | |
---|
328 | |
---|
class UserAgent(UserAgentBase):
    """UserAgentBase with switchable .seek()able responses (off by default)."""

    def __init__(self):
        UserAgentBase.__init__(self)
        # Responses are returned unwrapped until the caller opts in.
        self._seekable = False

    def set_seekable_responses(self, handle):
        """Make response objects .seek()able."""
        self._seekable = bool(handle)

    def open(self, fullurl, data=None):
        """Open fullurl, wrapping the response when seekability is enabled.

        With seekable responses off this simply delegates to
        UserAgentBase.open(); otherwise the same call is routed through
        _opener.wrapped_open() with _response.seek_wrapped_response.

        """
        if not self._seekable:
            return UserAgentBase.open(self, fullurl, data)

        def base_open(fullurl, data=None):
            # Call the base-class open directly, not this override.
            return UserAgentBase.open(self, fullurl, data)

        return _opener.wrapped_open(
            base_open, _response.seek_wrapped_response, fullurl, data)