1 | """Integration with Python standard library module urllib2: OpenerDirector |
---|
2 | class. |
---|
3 | |
---|
4 | Copyright 2004-2006 John J Lee <jjl@pobox.com> |
---|
5 | |
---|
6 | This code is free software; you can redistribute it and/or modify it |
---|
7 | under the terms of the BSD or ZPL 2.1 licenses (see the file |
---|
8 | COPYING.txt included with the distribution). |
---|
9 | |
---|
10 | """ |
---|
11 | |
---|
12 | import os, urllib2, bisect, urllib, httplib, types, tempfile |
---|
13 | try: |
---|
14 | import threading as _threading |
---|
15 | except ImportError: |
---|
16 | import dummy_threading as _threading |
---|
17 | try: |
---|
18 | set |
---|
19 | except NameError: |
---|
20 | import sets |
---|
21 | set = sets.Set |
---|
22 | |
---|
23 | import _http |
---|
24 | import _upgrade |
---|
25 | import _rfc3986 |
---|
26 | import _response |
---|
27 | from _util import isstringlike |
---|
28 | from _request import Request |
---|
29 | |
---|
30 | |
---|
class ContentTooShortError(urllib2.URLError):
    """URLError raised when a download ends before the expected byte count.

    reason -- message handed on to urllib2.URLError
    result -- value the interrupted operation would have returned; kept on
        the exception as the .result attribute
    """

    def __init__(self, reason, result):
        # Store the partial result first; the base initialiser only deals
        # with the reason.
        self.result = result
        urllib2.URLError.__init__(self, reason)
---|
35 | |
---|
36 | |
---|
class OpenerDirector(urllib2.OpenerDirector):
    """urllib2.OpenerDirector with request/response processors and retrieve().

    On top of the urllib2 handler protocol (<scheme>_open and
    <scheme>_error... methods), handlers may define <scheme>_request /
    <scheme>_response methods, or scheme-independent any_request /
    any_response methods.  open() runs these before and after the actual
    fetch.
    """

    def __init__(self):
        urllib2.OpenerDirector.__init__(self)
        # really none of these are (sanely) public -- the lack of initial
        # underscore on some is just due to following urllib2
        self.process_response = {}
        self.process_request = {}
        self._any_request = {}
        self._any_response = {}
        # False means the lookup tables above are stale and must be rebuilt
        # from self.handlers before the next open().
        self._handler_index_valid = True
        # Temporary files created by retrieve(); removed again by close().
        self._tempfiles = []

    def add_handler(self, handler):
        """Register handler (no-op if it is already registered).

        The handler is inserted into the sorted self.handlers list and the
        method indexes are marked stale; they are rebuilt lazily by open().
        """
        if handler in self.handlers:
            return
        # XXX why does self.handlers need to be sorted?
        bisect.insort(self.handlers, handler)
        handler.add_parent(self)
        self._handler_index_valid = False

    def _maybe_reindex_handlers(self):
        """Rebuild the handler lookup tables if they are marked stale.

        Every attribute name of every handler is split at its first
        underscore into (scheme, condition) and indexed by condition:
        "open", "request", "response" or "error...".  Handlers that match
        nothing at all are removed from self.handlers.

        NOTE(review): nothing in this class sets _handler_index_valid back
        to True, so once a handler has been added the tables are rebuilt on
        every open().  Left untouched on purpose: code elsewhere may mutate
        self.handlers directly and rely on that -- confirm before caching.
        """
        if self._handler_index_valid:
            return

        handle_error = {}
        handle_open = {}
        process_request = {}
        process_response = {}
        any_request = set()
        any_response = set()
        unwanted = []

        for handler in self.handlers:
            added = False
            for meth in dir(handler):
                if meth in ["redirect_request", "do_open", "proxy_open"]:
                    # oops, coincidental match
                    continue

                if meth == "any_request":
                    any_request.add(handler)
                    added = True
                    continue
                elif meth == "any_response":
                    any_response.add(handler)
                    added = True
                    continue

                ii = meth.find("_")
                scheme = meth[:ii]
                condition = meth[ii+1:]

                if condition.startswith("error"):
                    # e.g. http_error_302: kind is whatever follows the
                    # second underscore, as an int when it parses as one
                    jj = meth[ii+1:].find("_") + ii + 1
                    kind = meth[jj+1:]
                    try:
                        kind = int(kind)
                    except ValueError:
                        pass
                    lookup = handle_error.setdefault(scheme, {})
                elif condition == "open":
                    kind = scheme
                    lookup = handle_open
                elif condition == "request":
                    kind = scheme
                    lookup = process_request
                elif condition == "response":
                    kind = scheme
                    lookup = process_response
                else:
                    continue

                lookup.setdefault(kind, set()).add(handler)
                added = True

            if not added:
                unwanted.append(handler)

        for handler in unwanted:
            self.handlers.remove(handler)

        # sort indexed methods
        # (process_request / process_response stay as sets: open() merges
        # them with the any_* sets and sorts the result itself)
        for scheme, lookup in handle_error.iteritems():
            for code, handlers in lookup.iteritems():
                handlers = list(handlers)
                handlers.sort()
                lookup[code] = handlers
        for scheme, handlers in handle_open.iteritems():
            handlers = list(handlers)
            handlers.sort()
            handle_open[scheme] = handlers

        # cache the indexes
        self.handle_error = handle_error
        self.handle_open = handle_open
        self.process_request = process_request
        self.process_response = process_response
        self._any_request = any_request
        self._any_response = any_response

    def _request(self, url_or_req, data, visit):
        """Return a request object for url_or_req.

        A string-like URL is wrapped in a new Request.  An existing request
        object is reused: data is attached when not None, a .visit attribute
        is created if missing, and it is overwritten when visit is not None.
        """
        if isstringlike(url_or_req):
            req = Request(url_or_req, data, visit=visit)
        else:
            # already a urllib2.Request or mechanize.Request instance
            req = url_or_req
            if data is not None:
                req.add_data(data)
            # XXX yuck, give request a .visit attribute if it doesn't have one
            try:
                req.visit
            except AttributeError:
                req.visit = None
            if visit is not None:
                req.visit = visit
        return req

    def open(self, fullurl, data=None):
        """Open fullurl (URL string or request object); return the response.

        Request processors run first, then the inherited opening machinery,
        then response processors.  Processors are selected by the scheme the
        request had before any request processor ran.
        """
        req = self._request(fullurl, data, None)
        req_scheme = req.get_type()

        self._maybe_reindex_handlers()

        # pre-process request
        # XXX should we allow a Processor to change the URL scheme
        #   of the request?
        request_processors = set(self.process_request.get(req_scheme, []))
        request_processors.update(self._any_request)
        request_processors = list(request_processors)
        request_processors.sort()
        for processor in request_processors:
            for meth_name in ["any_request", req_scheme+"_request"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    req = meth(req)

        # In Python >= 2.4, .open() supports processors already, so we must
        # call ._open() instead.
        urlopen = getattr(urllib2.OpenerDirector, "_open",
                          urllib2.OpenerDirector.open)
        response = urlopen(self, req, data)

        # post-process response
        response_processors = set(self.process_response.get(req_scheme, []))
        response_processors.update(self._any_response)
        response_processors = list(response_processors)
        response_processors.sort()
        for processor in response_processors:
            for meth_name in ["any_response", req_scheme+"_response"]:
                meth = getattr(processor, meth_name, None)
                if meth:
                    response = meth(req, response)

        return response

    def error(self, proto, *args):
        """Dispatch an error to the registered <proto>_error... handlers.

        For http and https the handlers indexed under 'http' are used and
        the handler kind is taken from args[2] (presumably the HTTP status
        code, following urllib2's calling convention -- confirm against
        callers).  If that chain yields nothing, the http_error_default
        chain is tried.  Returns the first true result, else None.
        """
        if proto in ['http', 'https']:
            # XXX http[s] protocols are special-cased
            lookup = self.handle_error['http'] # https is not different than http
            proto = args[2]  # YUCK!
            meth_name = 'http_error_%s' % proto
            http_err = 1
            orig_args = args
        else:
            lookup = self.handle_error
            meth_name = proto + '_error'
            http_err = 0
        result = self._call_chain(*((lookup, proto, meth_name) + args))
        if result:
            return result

        if http_err:
            fallback = (lookup, 'default', 'http_error_default') + orig_args
            return self._call_chain(*fallback)

    BLOCK_SIZE = 1024*8
    def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
        """Returns (filename, headers).

        For remote objects, the default filename will refer to a temporary
        file.  Temporary files are removed when the OpenerDirector.close()
        method is called.

        For file: URLs, at present the returned filename is None.  This may
        change in future.

        If reporthook is given it is called as reporthook(blocknum,
        blocksize, totalsize) once before the first read and once after each
        block; totalsize is -1 when there is no Content-Length header.

        If the actual number of bytes read is less than indicated by the
        Content-Length header, raises ContentTooShortError (a URLError
        subclass).  The exception's .result attribute contains the (filename,
        headers) that would have been returned.  This check is made whether
        or not a reporthook is supplied.

        """
        req = self._request(fullurl, data, False)
        scheme = req.get_type()
        fp = self.open(req)
        # Nested try/finally (rather than try/except/finally) keeps this
        # valid on the old interpreters this module supports, and makes sure
        # both files are closed even if a read, write or reporthook fails.
        try:
            headers = fp.info()
            if filename is None and scheme == 'file':
                # XXX req.get_selector() seems broken here, return None,
                #   pending sanity :-/
                return None, headers
            if filename:
                tfp = open(filename, 'wb')
            else:
                # fullurl may be a request object rather than a string, so
                # take the URL from the request when picking the suffix.
                path = _rfc3986.urlsplit(req.get_full_url())[2]
                suffix = os.path.splitext(path)[1]
                fd, filename = tempfile.mkstemp(suffix)
                self._tempfiles.append(filename)
                tfp = os.fdopen(fd, 'wb')

            try:
                result = filename, headers
                bs = self.BLOCK_SIZE
                size = -1
                read = 0
                blocknum = 0
                # Read the expected size even without a reporthook: the
                # short-read check below depends on it.
                if "content-length" in headers:
                    size = int(headers["Content-Length"])
                if reporthook:
                    reporthook(blocknum, bs, size)
                while 1:
                    block = fp.read(bs)
                    if not block:
                        break
                    read += len(block)
                    tfp.write(block)
                    blocknum += 1
                    if reporthook:
                        reporthook(blocknum, bs, size)
            finally:
                tfp.close()
        finally:
            fp.close()

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError(
                "retrieval incomplete: "
                "got only %i out of %i bytes" % (read, size),
                result
                )

        return result

    def close(self):
        """Close the opener and delete temporary files made by retrieve().

        After this call open, error, retrieve and add_handler are set to
        None on the instance.
        """
        urllib2.OpenerDirector.close(self)

        # make it very obvious this object is no longer supposed to be used
        self.open = self.error = self.retrieve = self.add_handler = None

        if self._tempfiles:
            for filename in self._tempfiles:
                # best effort: the file may already be gone
                try:
                    os.unlink(filename)
                except OSError:
                    pass
            del self._tempfiles[:]
---|
298 | |
---|
299 | |
---|
300 | def wrapped_open(urlopen, process_response_object, fullurl, data=None): |
---|
301 | success = True |
---|
302 | try: |
---|
303 | response = urlopen(fullurl, data) |
---|
304 | except urllib2.HTTPError, error: |
---|
305 | success = False |
---|
306 | if error.fp is None: # not a response |
---|
307 | raise |
---|
308 | response = error |
---|
309 | |
---|
310 | if response is not None: |
---|
311 | response = process_response_object(response) |
---|
312 | |
---|
313 | if not success: |
---|
314 | raise response |
---|
315 | return response |
---|
316 | |
---|
class ResponseProcessingOpener(OpenerDirector):
    """OpenerDirector whose responses all pass through a hook.

    Subclasses override process_response_object(); the base version hands
    the response back unchanged.
    """

    def open(self, fullurl, data=None):
        """Open fullurl via OpenerDirector.open, filtered by wrapped_open."""
        # Plain OpenerDirector.open bound to this instance, so wrapped_open
        # does not re-enter this override.
        plain_open = lambda url, payload=None: OpenerDirector.open(
            self, url, payload)
        return wrapped_open(
            plain_open, self.process_response_object, fullurl, data)

    def process_response_object(self, response):
        """Hook applied to each response; returns it as is here."""
        return response
---|
327 | |
---|
328 | |
---|
class SeekableResponseOpener(ResponseProcessingOpener):
    """ResponseProcessingOpener that wraps responses for seeking."""

    def process_response_object(self, response):
        """Return response wrapped by _response.seek_wrapped_response."""
        seekable = _response.seek_wrapped_response(response)
        return seekable
---|
332 | |
---|
333 | |
---|
class OpenerFactory:
    """This class's interface is quite likely to change."""

    # Classes instantiated (with no arguments) for every opener unless a
    # caller-supplied handler overrides them; see build_opener().
    default_classes = [
        # handlers
        urllib2.ProxyHandler,
        urllib2.UnknownHandler,
        _http.HTTPHandler,  # derived from new AbstractHTTPHandler
        _http.HTTPDefaultErrorHandler,
        _http.HTTPRedirectHandler,  # bugfixed
        urllib2.FTPHandler,
        urllib2.FileHandler,
        # processors
        _upgrade.HTTPRequestUpgradeProcessor,
        _http.HTTPCookieProcessor,
        _http.HTTPErrorProcessor,
        ]
    # HTTPS support is only added when httplib was built with it.
    if hasattr(httplib, 'HTTPS'):
        default_classes.append(_http.HTTPSHandler)
    # NOTE(review): the two lists below are not used anywhere in this
    # class; presumably kept for code elsewhere -- confirm before removing.
    handlers = []
    replacement_handlers = []

    def __init__(self, klass=OpenerDirector):
        """klass -- opener class that build_opener() instantiates."""
        self.klass = klass

    def build_opener(self, *handlers):
        """Create an opener object from a list of handlers and processors.

        The opener will use several default handlers and processors, including
        support for HTTP and FTP.

        If any of the handlers passed as arguments are subclasses of the
        default handlers, the default handlers will not be used.

        Each argument may be a handler instance or a handler class; classes
        (old-style or new-style) are instantiated with no arguments.

        """
        opener = self.klass()
        default_classes = list(self.default_classes)
        # Both flavours of class object: comparing type() against
        # types.ClassType alone misses new-style classes.
        class_types = (types.ClassType, type)
        skip = []
        for klass in default_classes:
            for check in handlers:
                if isinstance(check, class_types):
                    overrides = issubclass(check, klass)
                else:
                    overrides = isinstance(check, klass)
                if overrides:
                    # Record each default at most once: a duplicate entry
                    # would make the remove() below raise ValueError.
                    skip.append(klass)
                    break
        for klass in skip:
            default_classes.remove(klass)

        for klass in default_classes:
            opener.add_handler(klass())
        for h in handlers:
            if isinstance(h, class_types):
                h = h()
            opener.add_handler(h)

        return opener
---|
391 | |
---|
392 | |
---|
# Module-level convenience API built on one shared, lazily created opener.
build_opener = OpenerFactory().build_opener

# The shared opener; None until first use or until install_opener() is called.
_opener = None
# Serialises creation of the shared opener.
urlopen_lock = _threading.Lock()

def _get_default_opener():
    """Return the shared opener, building it with build_opener() if unset."""
    global _opener
    if _opener is None:
        urlopen_lock.acquire()
        try:
            # Check again now that the lock is held: another thread may have
            # built the opener while this one was waiting.
            if _opener is None:
                _opener = build_opener()
        finally:
            urlopen_lock.release()
    return _opener

def urlopen(url, data=None):
    """Open url with the shared opener; returns what its open() returns."""
    return _get_default_opener().open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    """Retrieve url with the shared opener's retrieve() method."""
    return _get_default_opener().retrieve(url, filename, reporthook, data)

def install_opener(opener):
    """Make opener the shared opener used by urlopen() and urlretrieve()."""
    global _opener
    _opener = opener
---|