1 | """HTTP related handlers. |
---|
2 | |
---|
3 | Note that some other HTTP handlers live in more specific modules: _auth.py, |
---|
4 | _gzip.py, etc. |
---|
5 | |
---|
6 | |
---|
7 | Copyright 2002-2006 John J Lee <jjl@pobox.com> |
---|
8 | |
---|
9 | This code is free software; you can redistribute it and/or modify it |
---|
10 | under the terms of the BSD or ZPL 2.1 licenses (see the file |
---|
11 | COPYING.txt included with the distribution). |
---|
12 | |
---|
13 | """ |
---|
14 | |
---|
15 | import copy, time, tempfile, htmlentitydefs, re, logging, socket, \ |
---|
16 | urllib2, urllib, httplib, sgmllib |
---|
17 | from urllib2 import URLError, HTTPError, BaseHandler |
---|
18 | from cStringIO import StringIO |
---|
19 | |
---|
20 | from _request import Request |
---|
21 | from _util import isstringlike |
---|
22 | from _response import closeable_response, response_seek_wrapper |
---|
23 | from _html import unescape, unescape_charref |
---|
24 | from _headersutil import is_html |
---|
25 | from _clientcookie import CookieJar, request_host |
---|
26 | import _rfc3986 |
---|
27 | |
---|
28 | debug = logging.getLogger("mechanize").debug |
---|
29 | |
---|
30 | # monkeypatch urllib2.HTTPError to show URL |
---|
31 | ## def urllib2_str(self): |
---|
32 | ## return 'HTTP Error %s: %s (%s)' % ( |
---|
33 | ## self.code, self.msg, self.geturl()) |
---|
34 | ## urllib2.HTTPError.__str__ = urllib2_str |
---|
35 | |
---|
36 | |
---|
37 | CHUNK = 1024 # size of chunks fed to HTML HEAD parser, in bytes |
---|
38 | DEFAULT_ENCODING = 'latin-1' |
---|
39 | |
---|
40 | |
---|
41 | # This adds "refresh" to the list of redirectables and provides a redirection |
---|
42 | # algorithm that doesn't go into a loop in the presence of cookies |
---|
43 | # (Python 2.4 has this new algorithm, 2.3 doesn't). |
---|
44 | class HTTPRedirectHandler(BaseHandler): |
---|
45 | # maximum number of redirections to any single URL |
---|
46 | # this is needed because of the state that cookies introduce |
---|
47 | max_repeats = 4 |
---|
48 | # maximum total number of redirections (regardless of URL) before |
---|
49 | # assuming we're in a loop |
---|
50 | max_redirections = 10 |
---|
51 | |
---|
52 | # Implementation notes: |
---|
53 | |
---|
54 | # To avoid the server sending us into an infinite loop, the request |
---|
55 | # object needs to track what URLs we have already seen. Do this by |
---|
56 | # adding a handler-specific attribute to the Request object. The value |
---|
57 | # of the dict is used to count the number of times the same URL has |
---|
58 | # been visited. This is needed because visiting the same URL twice |
---|
59 | # does not necessarily imply a loop, thanks to state introduced by |
---|
60 | # cookies. |
---|
61 | |
---|
62 | # Always unhandled redirection codes: |
---|
63 | # 300 Multiple Choices: should not handle this here. |
---|
64 | # 304 Not Modified: no need to handle here: only of interest to caches |
---|
65 | # that do conditional GETs |
---|
66 | # 305 Use Proxy: probably not worth dealing with here |
---|
67 | # 306 Unused: what was this for in the previous versions of protocol?? |
---|
68 | |
---|
69 | def redirect_request(self, newurl, req, fp, code, msg, headers): |
---|
70 | """Return a Request or None in response to a redirect. |
---|
71 | |
---|
72 | This is called by the http_error_30x methods when a redirection |
---|
73 | response is received. If a redirection should take place, return a |
---|
74 | new Request to allow http_error_30x to perform the redirect; |
---|
75 | otherwise, return None to indicate that an HTTPError should be |
---|
76 | raised. |
---|
77 | |
---|
78 | """ |
---|
79 | if code in (301, 302, 303, "refresh") or \ |
---|
80 | (code == 307 and not req.has_data()): |
---|
81 | # Strictly (according to RFC 2616), 301 or 302 in response to |
---|
82 | # a POST MUST NOT cause a redirection without confirmation |
---|
83 | # from the user (of urllib2, in this case). In practice, |
---|
84 | # essentially all clients do redirect in this case, so we do |
---|
85 | # the same. |
---|
86 | # XXX really refresh redirections should be visiting; tricky to |
---|
87 | # fix, so this will wait until post-stable release |
---|
88 | new = Request(newurl, |
---|
89 | headers=req.headers, |
---|
90 | origin_req_host=req.get_origin_req_host(), |
---|
91 | unverifiable=True, |
---|
92 | visit=False, |
---|
93 | ) |
---|
94 | new._origin_req = getattr(req, "_origin_req", req) |
---|
95 | return new |
---|
96 | else: |
---|
97 | raise HTTPError(req.get_full_url(), code, msg, headers, fp) |
---|
98 | |
---|
99 | def http_error_302(self, req, fp, code, msg, headers): |
---|
100 | # Some servers (incorrectly) return multiple Location headers |
---|
101 | # (so probably same goes for URI). Use first header. |
---|
102 | if headers.has_key('location'): |
---|
103 | newurl = headers.getheaders('location')[0] |
---|
104 | elif headers.has_key('uri'): |
---|
105 | newurl = headers.getheaders('uri')[0] |
---|
106 | else: |
---|
107 | return |
---|
108 | newurl = _rfc3986.clean_url(newurl, "latin-1") |
---|
109 | newurl = _rfc3986.urljoin(req.get_full_url(), newurl) |
---|
110 | |
---|
111 | # XXX Probably want to forget about the state of the current |
---|
112 | # request, although that might interact poorly with other |
---|
113 | # handlers that also use handler-specific request attributes |
---|
114 | new = self.redirect_request(newurl, req, fp, code, msg, headers) |
---|
115 | if new is None: |
---|
116 | return |
---|
117 | |
---|
118 | # loop detection |
---|
119 | # .redirect_dict has a key url if url was previously visited. |
---|
120 | if hasattr(req, 'redirect_dict'): |
---|
121 | visited = new.redirect_dict = req.redirect_dict |
---|
122 | if (visited.get(newurl, 0) >= self.max_repeats or |
---|
123 | len(visited) >= self.max_redirections): |
---|
124 | raise HTTPError(req.get_full_url(), code, |
---|
125 | self.inf_msg + msg, headers, fp) |
---|
126 | else: |
---|
127 | visited = new.redirect_dict = req.redirect_dict = {} |
---|
128 | visited[newurl] = visited.get(newurl, 0) + 1 |
---|
129 | |
---|
130 | # Don't close the fp until we are sure that we won't use it |
---|
131 | # with HTTPError. |
---|
132 | fp.read() |
---|
133 | fp.close() |
---|
134 | |
---|
135 | return self.parent.open(new) |
---|
136 | |
---|
137 | http_error_301 = http_error_303 = http_error_307 = http_error_302 |
---|
138 | http_error_refresh = http_error_302 |
---|
139 | |
---|
140 | inf_msg = "The HTTP server returned a redirect error that would " \ |
---|
141 | "lead to an infinite loop.\n" \ |
---|
142 | "The last 30x error message was:\n" |
---|
143 | |
---|
144 | |
---|
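# Illustrative sketch (not part of the module): the loop-detection
# bookkeeping used by HTTPRedirectHandler.http_error_302 above, reduced to
# its essentials.  In the real handler the counter dict lives on the Request
# object as .redirect_dict; the stand-alone dict, helper name and exception
# below are only for illustration.
#
#   visited = {}
#   def note_redirect(url, max_repeats=4, max_redirections=10):
#       if (visited.get(url, 0) >= max_repeats or
#           len(visited) >= max_redirections):
#           raise RuntimeError("redirect loop suspected")
#       visited[url] = visited.get(url, 0) + 1
#
# Visiting the same URL up to max_repeats times is tolerated (cookies can
# legitimately change what a URL returns); more repeats than that, or more
# than max_redirections distinct hops, is treated as a loop.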
# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)


try:
    import HTMLParser
except ImportError:
    pass
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):

    def _not_called(self):
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return parser.http_equiv

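# Illustrative sketch: pulling META HTTP-EQUIV pairs out of a document head
# with parse_head() and HeadParser.  The HTML snippet is made up for the
# example.
#
#   from cStringIO import StringIO
#   html = ('<html><head>'
#           '<meta http-equiv="refresh" content="5; url=/next.html">'
#           '</head><body></body></html>')
#   print parse_head(StringIO(html), HeadParser())
#   # expected, assuming the snippet parses cleanly:
#   # [('refresh', '5; url=/next.html')]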
class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response, self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response

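# Illustrative sketch (hypothetical page): once HTTPEquivProcessor has run on
# a response whose <head> contains
#     <meta http-equiv="refresh" content="0; url=/other">
# the pair is visible through the ordinary header interface:
#
#   response.info().getheaders("refresh")   # -> ["0; url=/other"]
#
# which is what lets HTTPRefreshProcessor (below) act on META-specified
# refreshes as well as genuine Refresh headers, provided both handlers are
# installed.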
class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

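# Illustrative sketch: sharing one CookieJar between HTTPCookieProcessor and
# other code, e.g. to inspect cookies after some requests have been made.
# The opener wiring is assumed, not shown.
#
#   jar = CookieJar()
#   cookie_processor = HTTPCookieProcessor(jar)
#   # ... install cookie_processor in an opener and fetch some URLs ...
#   for cookie in jar:
#       print cookie.name, cookie.value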
try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):

        def __init__(self, url='', opener=None):
            import _opener
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener

        def set_opener(self, opener=None):
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def read(self):
            """Reads the robots.txt URL and feeds it to the parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                pass
            except (IOError, socket.error, OSError), exc:
                robotparser._debug("ignoring error opening %r: %s" %
                                   (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                self.disallow_all = True
                robotparser._debug("disallow all")
            elif status >= 400:
                self.allow_all = True
                robotparser._debug("allow all")
            elif status == 200 and lines:
                robotparser._debug("parse lines")
                self.parse(lines)

    class RobotExclusionError(urllib2.HTTPError):
        def __init__(self, request, *args):
            apply(urllib2.HTTPError.__init__, (self,)+args)
            self.request = request

    class HTTPRobotRulesProcessor(BaseHandler):
        # before redirections, after everything else
        handler_order = 800

        try:
            from httplib import HTTPMessage
        except:
            from mimetools import Message
            http_response_class = Message
        else:
            http_response_class = HTTPMessage

        def __init__(self, rfp_class=MechanizeRobotFileParser):
            self.rfp_class = rfp_class
            self.rfp = None
            self._host = None

        def http_request(self, request):
            scheme = request.get_type()
            if scheme not in ["http", "https"]:
                # robots exclusion only applies to HTTP
                return request

            if request.get_selector() == "/robots.txt":
                # /robots.txt is always OK to fetch
                return request

            host = request.get_host()

            # robots.txt requests don't need to be allowed by robots.txt :-)
            origin_req = getattr(request, "_origin_req", None)
            if (origin_req is not None and
                origin_req.get_selector() == "/robots.txt" and
                origin_req.get_host() == host
                ):
                return request

            if host != self._host:
                self.rfp = self.rfp_class()
                try:
                    self.rfp.set_opener(self.parent)
                except AttributeError:
                    debug("%r instance does not support set_opener" %
                          self.rfp.__class__)
                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
                self.rfp.read()
                self._host = host

            ua = request.get_header("User-agent", "")
            if self.rfp.can_fetch(ua, request.get_full_url()):
                return request
            else:
                # XXX This should really have raised URLError.  Too late now...
                msg = "request disallowed by robots.txt"
                raise RobotExclusionError(
                    request,
                    request.get_full_url(),
                    403, msg,
                    self.http_response_class(StringIO()), StringIO(msg))

        https_request = http_request

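# Illustrative sketch: a fetch disallowed by robots.txt surfaces from
# HTTPRobotRulesProcessor (above) as RobotExclusionError, an HTTPError
# subclass, so callers can trap it specifically.  `opener` and the URL are
# hypothetical.
#
#   try:
#       response = opener.open("http://www.example.com/private/page.html")
#   except RobotExclusionError, exc:
#       print "blocked by robots.txt:", exc.request.get_full_url()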
class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response


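# Illustrative sketch of the single-chain usage described in the
# HTTPRefererProcessor docstring above (URLs hypothetical; opener wiring
# assumed):
#
#   referer_processor = HTTPRefererProcessor()
#   # ... install referer_processor in an opener ...
#   opener.open("http://www.example.com/page1.html")
#   # the next request carries "Referer: http://www.example.com/page1.html"
#   opener.open("http://www.example.com/page2.html")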
def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """

    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl

class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response
            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    time.sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)

        return response

    https_response = http_response

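# Illustrative sketch: follow Refresh redirections with pauses of up to ten
# seconds, without actually sleeping for the advertised pause (opener wiring
# assumed):
#
#   refresh_processor = HTTPRefreshProcessor(max_time=10, honor_time=False)
#   # ... install refresh_processor in an opener as usual ...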
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response


class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the first
        # place rather than a response object, I don't know, but to avoid
        # multiple wrapping, we're discarding them

        if isinstance(fp, urllib2.HTTPError):
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
        assert code == response.code
        assert msg == response.msg
        assert hdrs == response.hdrs
        raise response


class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp


class HTTPHandler(AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            else:
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_
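

# Illustrative sketch (not executed): a hand-assembled opener built from the
# handlers defined in this module.  In normal use mechanize's build_opener()
# / Browser set up an equivalent, more complete stack for you; the URL below
# is hypothetical.
#
#   import _opener
#   opener = _opener.OpenerDirector()
#   for handler in [HTTPHandler(),
#                   HTTPCookieProcessor(),
#                   HTTPRefererProcessor(),
#                   HTTPEquivProcessor(),
#                   HTTPRefreshProcessor(max_time=1),
#                   HTTPErrorProcessor(),
#                   HTTPRedirectHandler(),
#                   HTTPDefaultErrorHandler()]:
#       opener.add_handler(handler)
#   response = opener.open("http://www.example.com/")
#   print response.geturl()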
---|