| 1 | """HTTP related handlers. |
|---|
| 2 | |
|---|
| 3 | Note that some other HTTP handlers live in more specific modules: _auth.py, |
|---|
| 4 | _gzip.py, etc. |
|---|
| 5 | |
|---|
| 6 | |
|---|
| 7 | Copyright 2002-2006 John J Lee <jjl@pobox.com> |
|---|
| 8 | |
|---|
| 9 | This code is free software; you can redistribute it and/or modify it |
|---|
| 10 | under the terms of the BSD or ZPL 2.1 licenses (see the file |
|---|
| 11 | COPYING.txt included with the distribution). |
|---|
| 12 | |
|---|
| 13 | """ |
|---|
| 14 | |
|---|
| 15 | import copy, time, tempfile, htmlentitydefs, re, logging, socket, \ |
|---|
| 16 | urllib2, urllib, httplib, sgmllib |
|---|
| 17 | from urllib2 import URLError, HTTPError, BaseHandler |
|---|
| 18 | from cStringIO import StringIO |
|---|
| 19 | |
|---|
| 20 | from _request import Request |
|---|
| 21 | from _util import isstringlike |
|---|
| 22 | from _response import closeable_response, response_seek_wrapper |
|---|
| 23 | from _html import unescape, unescape_charref |
|---|
| 24 | from _headersutil import is_html |
|---|
| 25 | from _clientcookie import CookieJar, request_host |
|---|
| 26 | import _rfc3986 |
|---|
| 27 | |
|---|
| 28 | debug = logging.getLogger("mechanize").debug |
|---|

debug = logging.getLogger("mechanize").debug

# monkeypatch urllib2.HTTPError to show URL
## def urllib2_str(self):
##     return 'HTTP Error %s: %s (%s)' % (
##         self.code, self.msg, self.geturl())
## urllib2.HTTPError.__str__ = urllib2_str


CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
DEFAULT_ENCODING = 'latin-1'


# This adds "refresh" to the list of redirectables and provides a redirection
# algorithm that doesn't go into a loop in the presence of cookies
# (Python 2.4 has this new algorithm, 2.3 doesn't).
class HTTPRedirectHandler(BaseHandler):
    # maximum number of redirections to any single URL
    # this is needed because of the state that cookies introduce
    max_repeats = 4
    # maximum total number of redirections (regardless of URL) before
    # assuming we're in a loop
    max_redirections = 10

    # Implementation notes:

    # To avoid the server sending us into an infinite loop, the request
    # object needs to track what URLs we have already seen.  Do this by
    # adding a handler-specific attribute to the Request object: a dict
    # mapping each URL to the number of times it has been visited.  A
    # count is needed because visiting the same URL twice does not
    # necessarily imply a loop, thanks to state introduced by cookies.

    # Always unhandled redirection codes:
    # 300 Multiple Choices: should not be handled here.
    # 304 Not Modified: no need to handle here: only of interest to caches
    #     that do conditional GETs
    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: reserved in HTTP/1.1 (was "Switch Proxy" in an earlier
    #     draft of the protocol)

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.

        """
        if code in (301, 302, 303, "refresh") or \
           (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            # XXX refresh redirections really should be treated as visits;
            # that's tricky to fix, so it is deferred until after the
            # stable release
            new = Request(newurl,
                          headers=req.headers,
                          origin_req_host=req.get_origin_req_host(),
                          unverifiable=True,
                          visit=False,
                          )
            new._origin_req = getattr(req, "_origin_req", req)
            return new
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (the same presumably goes for URI).  Use the first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict maps each previously-visited URL to its visit count.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"

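# Illustrative sketch (not part of this module): how the loop detection
# above surfaces to callers.  build_opener is assumed to mirror
# urllib2.build_opener, and the URL is hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener(mechanize.HTTPRedirectHandler())
#     try:
#         opener.open("http://example.com/redirect-loop")
#     except mechanize.HTTPError, exc:
#         # raised with inf_msg prepended once any single URL is seen
#         # max_repeats times, or max_redirections is exceeded in total
#         print exc.msg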

# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        unescaped_attrs = {}
        for key, val in attrs.items():
            unescaped_attrs[key] = self.unescape_attr(val)
        return unescaped_attrs

    def unknown_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)


try:
    import HTMLParser
except ImportError:
    # HTMLParser may be missing on very old Pythons; in that case,
    # sniffing the HEAD can only raise SGML parse errors
    PARSE_ERRORS = (sgmllib.SGMLParseError,)
else:
    PARSE_ERRORS = (HTMLParser.HTMLParseError, sgmllib.SGMLParseError)

    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass  # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass  # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):

    def _not_called(self):
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # a short read means EOF: either the document has no HTML
            # body, or the whole of it fit in the chunks read so far
            break
    return parser.http_equiv

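# Illustrative sketch (not part of this module): extracting META
# http-equiv pseudo-headers from a document by hand.
#
#     from cStringIO import StringIO
#     html = ('<html><head>'
#             '<meta http-equiv="Refresh" content="5; url=/next">'
#             '</head><body></body></html>')
#     print parse_head(StringIO(html), HeadParser())
#     # -> [('Refresh', '5; url=/next')]
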
class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response,
                                              self.head_parser_class())
                finally:
                    response.seek(0)
            except PARSE_ERRORS:
                pass
            else:
                for hdr, val in html_headers:
                    # add the header to both the parsed dict and the raw
                    # header lines, so later handlers see it either way
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response

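# Illustrative sketch (not part of this module): with HTTPEquivProcessor
# installed, META http-equiv values appear alongside the real HTTP headers
# in response.info().  Assumes mechanize.build_opener (mirrors urllib2's);
# the URL is hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener(mechanize.HTTPEquivProcessor())
#     response = opener.open("http://example.com/meta-refresh-page")
#     print response.info().getheaders("refresh")
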
class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response

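# Illustrative sketch (not part of this module): sharing one CookieJar
# across fetches so cookies set by the first response are sent with the
# second request.  Assumes mechanize.build_opener; URLs are hypothetical.
#
#     import mechanize
#     jar = mechanize.CookieJar()
#     opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(jar))
#     opener.open("http://example.com/login")
#     opener.open("http://example.com/private")  # Cookie header added
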
try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):

        def __init__(self, url='', opener=None):
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener

        def set_opener(self, opener=None):
            import _opener
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def read(self):
            """Read the robots.txt URL and feed it to the parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                # HTTPError is file-like, so use it as the response and
                # pick up its status code below
                pass
            except (IOError, socket.error, OSError), exc:
                robotparser._debug("ignoring error opening %r: %s" %
                                   (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                self.disallow_all = True
                robotparser._debug("disallow all")
            elif status >= 400:
                self.allow_all = True
                robotparser._debug("allow all")
            elif status == 200 and lines:
                robotparser._debug("parse lines")
                self.parse(lines)

class RobotExclusionError(urllib2.HTTPError):
    def __init__(self, request, *args):
        urllib2.HTTPError.__init__(self, *args)
        self.request = request

class HTTPRobotRulesProcessor(BaseHandler):
    # before redirections, after everything else
    handler_order = 800

    try:
        from httplib import HTTPMessage
    except ImportError:
        from mimetools import Message
        http_response_class = Message
    else:
        http_response_class = HTTPMessage

    def __init__(self, rfp_class=MechanizeRobotFileParser):
        self.rfp_class = rfp_class
        self.rfp = None
        self._host = None

    def http_request(self, request):
        scheme = request.get_type()
        if scheme not in ["http", "https"]:
            # robots exclusion only applies to HTTP
            return request

        if request.get_selector() == "/robots.txt":
            # /robots.txt is always OK to fetch
            return request

        host = request.get_host()

        # robots.txt requests don't need to be allowed by robots.txt :-)
        origin_req = getattr(request, "_origin_req", None)
        if (origin_req is not None and
            origin_req.get_selector() == "/robots.txt" and
            origin_req.get_host() == host
            ):
            return request

        if host != self._host:
            self.rfp = self.rfp_class()
            try:
                self.rfp.set_opener(self.parent)
            except AttributeError:
                debug("%r instance does not support set_opener" %
                      self.rfp.__class__)
            self.rfp.set_url(scheme+"://"+host+"/robots.txt")
            self.rfp.read()
            self._host = host

        ua = request.get_header("User-agent", "")
        if self.rfp.can_fetch(ua, request.get_full_url()):
            return request
        else:
            # XXX This should really have raised URLError.  Too late now...
            msg = "request disallowed by robots.txt"
            raise RobotExclusionError(
                request,
                request.get_full_url(),
                403, msg,
                self.http_response_class(StringIO()), StringIO(msg))

    https_request = http_request

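# Illustrative sketch (not part of this module): a fetch forbidden by
# robots.txt raises RobotExclusionError, an HTTPError subclass carrying
# the offending request.  Assumes mechanize re-exports build_opener and
# RobotExclusionError; the URL is hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener(mechanize.HTTPRobotRulesProcessor())
#     try:
#         opener.open("http://example.com/disallowed-by-robots")
#     except mechanize.RobotExclusionError, exc:
#         print exc.code, exc.request.get_full_url()
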
class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests (so, for example, it will break if you use one
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response

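# Illustrative sketch (not part of this module): the processor records
# each response's URL and sends it as the Referer of the next request,
# which is why requests must form a single linear chain.  Assumes
# mechanize.build_opener; URLs are hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener(mechanize.HTTPRefererProcessor())
#     opener.open("http://example.com/a")
#     opener.open("http://example.com/b")  # sent with "Referer: .../a"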

def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """

    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl

class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response
            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    time.sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)

        return response

    https_response = http_response

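# Illustrative sketch (not part of this module): follow Refresh headers
# that request pauses of up to 10 seconds, skipping the actual sleep.
# Assumes mechanize.build_opener, whose default handlers include the
# HTTPRedirectHandler above to perform the resulting "refresh" redirect;
# the URL is hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener(
#         mechanize.HTTPRefreshProcessor(max_time=10, honor_time=False))
#     response = opener.open("http://example.com/page-with-refresh")
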
class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response

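# Illustrative sketch (not part of this module): with HTTPErrorProcessor
# and the HTTPDefaultErrorHandler below installed (build_opener includes
# both by default), a 404 surfaces as an HTTPError, which is itself a
# file-like response.  The URL is hypothetical.
#
#     import mechanize
#     opener = mechanize.build_opener()
#     try:
#         opener.open("http://example.com/no-such-page")
#     except mechanize.HTTPError, response:
#         print response.code  # 404
#         body = response.read()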

class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the
        # first place rather than a response object, I don't know, but to
        # avoid multiple wrapping, we're discarding them

        if isinstance(fp, urllib2.HTTPError):
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
        assert code == response.code
        assert msg == response.msg
        assert hdrs == response.hdrs
        raise response


class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
        - info(): return a mimetools.Message object for the headers
        - geturl(): return the original request URL
        - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp


class HTTPHandler(AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            else:
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_
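
# Illustrative sketch (not part of this module): any object with a
# find_key_cert(url) method returning a (key_file, cert_file) pair can
# serve as a client_cert_manager.  The manager class and file names below
# are hypothetical, and HTTPSHandler only exists when Python was built
# with SSL support.
#
#     import mechanize
#
#     class OneCertManager:
#         def find_key_cert(self, url):
#             return "client.key", "client.crt"
#
#     handler = mechanize.HTTPSHandler(client_cert_manager=OneCertManager())
#     opener = mechanize.build_opener(handler)
#     response = opener.open("https://example.com/needs-client-cert")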
|---|