root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_http.py @ 3

Revision 3, 25.2 KB (committer: kohda, 14 years ago)


1"""HTTP related handlers.
2
3Note that some other HTTP handlers live in more specific modules: _auth.py,
4_gzip.py, etc.
5
6
7Copyright 2002-2006 John J Lee <jjl@pobox.com>
8
9This code is free software; you can redistribute it and/or modify it
10under the terms of the BSD or ZPL 2.1 licenses (see the file
11COPYING.txt included with the distribution).
12
13"""
14
15import copy, time, tempfile, htmlentitydefs, re, logging, socket, \
16       urllib2, urllib, httplib, sgmllib
17from urllib2 import URLError, HTTPError, BaseHandler
18from cStringIO import StringIO
19
20from _request import Request
21from _util import isstringlike
22from _response import closeable_response, response_seek_wrapper
23from _html import unescape, unescape_charref
24from _headersutil import is_html
25from _clientcookie import CookieJar, request_host
26import _rfc3986
27
28debug = logging.getLogger("mechanize").debug
29
30# monkeypatch urllib2.HTTPError to show URL
31## def urllib2_str(self):
32##     return 'HTTP Error %s: %s (%s)' % (
33##         self.code, self.msg, self.geturl())
34## urllib2.HTTPError.__str__ = urllib2_str
35
36
37CHUNK = 1024  # size of chunks fed to HTML HEAD parser, in bytes
38DEFAULT_ENCODING = 'latin-1'
39
40
41# This adds "refresh" to the list of redirectables and provides a redirection
42# algorithm that doesn't go into a loop in the presence of cookies
43# (Python 2.4 has this new algorithm, 2.3 doesn't).
44class HTTPRedirectHandler(BaseHandler):
45    # maximum number of redirections to any single URL
46    # this is needed because of the state that cookies introduce
47    max_repeats = 4
48    # maximum total number of redirections (regardless of URL) before
49    # assuming we're in a loop
50    max_redirections = 10
51
52    # Implementation notes:
53
54    # To avoid the server sending us into an infinite loop, the request
55    # object needs to track what URLs we have already seen.  Do this by
56    # adding a handler-specific attribute to the Request object.  The value
57    # of the dict is used to count the number of times the same URL has
58    # been visited.  This is needed because visiting the same URL twice
59    # does not necessarily imply a loop, thanks to state introduced by
60    # cookies.
61
62    # Always unhandled redirection codes:
63    # 300 Multiple Choices: should not handle this here.
64    # 304 Not Modified: no need to handle here: only of interest to caches
65    #     that do conditional GETs
66    # 305 Use Proxy: probably not worth dealing with here
    # 306 Unused: what was this for in previous versions of the protocol?

    def redirect_request(self, newurl, req, fp, code, msg, headers):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a redirection
        response is received.  If a redirection should take place, return a
        new Request to allow http_error_30x to perform the redirect;
        otherwise, return None to indicate that an HTTPError should be
        raised.

        """
        if code in (301, 302, 303, "refresh") or \
               (code == 307 and not req.has_data()):
            # Strictly (according to RFC 2616), 301 or 302 in response to
            # a POST MUST NOT cause a redirection without confirmation
            # from the user (of urllib2, in this case).  In practice,
            # essentially all clients do redirect in this case, so we do
            # the same.
            # XXX really refresh redirections should be visiting; tricky to
            #  fix, so this will wait until post-stable release
            new = Request(newurl,
                          headers=req.headers,
                          origin_req_host=req.get_origin_req_host(),
                          unverifiable=True,
                          visit=False,
                          )
            new._origin_req = getattr(req, "_origin_req", req)
            return new
        else:
            raise HTTPError(req.get_full_url(), code, msg, headers, fp)

    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        if headers.has_key('location'):
            newurl = headers.getheaders('location')[0]
        elif headers.has_key('uri'):
            newurl = headers.getheaders('uri')[0]
        else:
            return
        newurl = _rfc3986.clean_url(newurl, "latin-1")
        newurl = _rfc3986.urljoin(req.get_full_url(), newurl)

        # XXX Probably want to forget about the state of the current
        # request, although that might interact poorly with other
        # handlers that also use handler-specific request attributes
        new = self.redirect_request(newurl, req, fp, code, msg, headers)
        if new is None:
            return

        # loop detection
        # .redirect_dict has a key url if url was previously visited.
        if hasattr(req, 'redirect_dict'):
            visited = new.redirect_dict = req.redirect_dict
            if (visited.get(newurl, 0) >= self.max_repeats or
                len(visited) >= self.max_redirections):
                raise HTTPError(req.get_full_url(), code,
                                self.inf_msg + msg, headers, fp)
        else:
            visited = new.redirect_dict = req.redirect_dict = {}
        visited[newurl] = visited.get(newurl, 0) + 1

        # Don't close the fp until we are sure that we won't use it
        # with HTTPError.
        fp.read()
        fp.close()

        return self.parent.open(new)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
    http_error_refresh = http_error_302

    inf_msg = "The HTTP server returned a redirect error that would " \
              "lead to an infinite loop.\n" \
              "The last 30x error message was:\n"
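
# A minimal sketch of the loop detection above (not part of the original
# module; the URL is illustrative).  The counters live on the request:
#
#     handler = HTTPRedirectHandler()
#     req = Request("http://example.com/")
#     # after following one redirect to /next, the new request carries:
#     #     req.redirect_dict == {"http://example.com/next": 1}
#     # once a URL's count reaches max_repeats (4), or the dict holds
#     # max_redirections (10) entries, http_error_302 raises HTTPError.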


# XXX would self.reset() work, instead of raising this exception?
class EndOfHeadError(Exception): pass
class AbstractHeadParser:
    # only these elements are allowed in or before HEAD of document
    head_elems = ("html", "head",
                  "title", "base",
                  "script", "style", "meta", "link", "object")
    _entitydefs = htmlentitydefs.name2codepoint
    _encoding = DEFAULT_ENCODING

    def __init__(self):
        self.http_equiv = []

    def start_meta(self, attrs):
        http_equiv = content = None
        for key, value in attrs:
            if key == "http-equiv":
                http_equiv = self.unescape_attr_if_required(value)
            elif key == "content":
                content = self.unescape_attr_if_required(value)
        if http_equiv is not None and content is not None:
            self.http_equiv.append((http_equiv, content))

    def end_head(self):
        raise EndOfHeadError()

    def handle_entityref(self, name):
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            escaped_attrs[key] = self.unescape_attr(val)
        return escaped_attrs

    def unknown_entityref(self, ref):
        self.handle_data("&%s;" % ref)

    def unknown_charref(self, ref):
        self.handle_data("&#%s;" % ref)


try:
    import HTMLParser
except ImportError:
    pass
else:
    class XHTMLCompatibleHeadParser(AbstractHeadParser,
                                    HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            AbstractHeadParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'start_' + tag)
            except AttributeError:
                try:
                    method = getattr(self, 'do_' + tag)
                except AttributeError:
                    pass # unknown tag
                else:
                    method(attrs)
            else:
                method(attrs)

        def handle_endtag(self, tag):
            if tag not in self.head_elems:
                raise EndOfHeadError()
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                pass # unknown tag
            else:
                method()

        def unescape(self, name):
            # Use the entitydefs passed into constructor, not
            # HTMLParser.HTMLParser's entitydefs.
            return self.unescape_attr(name)

        def unescape_attr_if_required(self, name):
            return name  # HTMLParser.HTMLParser already did it

class HeadParser(AbstractHeadParser, sgmllib.SGMLParser):

    def _not_called(self):
        assert False

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        AbstractHeadParser.__init__(self)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.head_elems:
            raise EndOfHeadError()
        if tag == "meta":
            method(attrs)

    def unknown_starttag(self, tag, attrs):
        self.handle_starttag(tag, self._not_called, attrs)

    def handle_endtag(self, tag, method):
        if tag in self.head_elems:
            method()
        else:
            raise EndOfHeadError()

    def unescape_attr_if_required(self, name):
        return self.unescape_attr(name)

def parse_head(fileobj, parser):
    """Return a list of key, value pairs."""
    while 1:
        data = fileobj.read(CHUNK)
        try:
            parser.feed(data)
        except EndOfHeadError:
            break
        if len(data) != CHUNK:
            # this should only happen if there is no HTML body, or if
            # CHUNK is big
            break
    return parser.http_equiv
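
# A minimal usage sketch for parse_head (not part of the original module;
# the markup is illustrative).  Feeding an HTML document yields its
# (http-equiv, content) pairs:
#
#     html = StringIO('<html><head>'
#                     '<meta http-equiv="refresh" content="5; url=/next">'
#                     '</head></html>')
#     parse_head(html, HeadParser())
#     # -> [('refresh', '5; url=/next')]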

class HTTPEquivProcessor(BaseHandler):
    """Append META HTTP-EQUIV headers to regular HTTP headers."""

    handler_order = 300  # before handlers that look at HTTP headers

    def __init__(self, head_parser_class=HeadParser,
                 i_want_broken_xhtml_support=False,
                 ):
        self.head_parser_class = head_parser_class
        self._allow_xhtml = i_want_broken_xhtml_support

    def http_response(self, request, response):
        if not hasattr(response, "seek"):
            response = response_seek_wrapper(response)
        http_message = response.info()
        url = response.geturl()
        ct_hdrs = http_message.getheaders("content-type")
        if is_html(ct_hdrs, url, self._allow_xhtml):
            try:
                try:
                    html_headers = parse_head(response, self.head_parser_class())
                finally:
                    response.seek(0)
            except (HTMLParser.HTMLParseError,
                    sgmllib.SGMLParseError):
                pass
            else:
                for hdr, val in html_headers:
                    # add a header
                    http_message.dict[hdr.lower()] = val
                    text = hdr + ": " + val
                    for line in text.split("\n"):
                        http_message.headers.append(line + "\n")
        return response

    https_response = http_response
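
# A minimal usage sketch (not part of the original module; the URL is
# illustrative).  Installed in an opener, HTTPEquivProcessor makes META
# HTTP-EQUIV values visible as ordinary response headers:
#
#     opener = urllib2.build_opener(HTTPEquivProcessor())
#     response = opener.open("http://example.com/")
#     response.info().getheaders("refresh")  # now includes any META refresh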

class HTTPCookieProcessor(BaseHandler):
    """Handle HTTP cookies.

    Public attributes:

    cookiejar: CookieJar instance

    """
    def __init__(self, cookiejar=None):
        if cookiejar is None:
            cookiejar = CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
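
# A minimal usage sketch (not part of the original module; the URL is
# illustrative).  Sharing one CookieJar between the processor and your own
# code lets you inspect cookies set during a fetch:
#
#     jar = CookieJar()
#     opener = urllib2.build_opener(HTTPCookieProcessor(jar))
#     opener.open("http://example.com/")
#     for cookie in jar:
#         print cookie.name, cookie.value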

try:
    import robotparser
except ImportError:
    pass
else:
    class MechanizeRobotFileParser(robotparser.RobotFileParser):

        def __init__(self, url='', opener=None):
            import _opener
            robotparser.RobotFileParser.__init__(self, url)
            self._opener = opener

        def set_opener(self, opener=None):
            if opener is None:
                opener = _opener.OpenerDirector()
            self._opener = opener

        def read(self):
            """Reads the robots.txt URL and feeds it to the parser."""
            if self._opener is None:
                self.set_opener()
            req = Request(self.url, unverifiable=True, visit=False)
            try:
                f = self._opener.open(req)
            except HTTPError, f:
                pass
            except (IOError, socket.error, OSError), exc:
                robotparser._debug("ignoring error opening %r: %s" %
                                   (self.url, exc))
                return
            lines = []
            line = f.readline()
            while line:
                lines.append(line.strip())
                line = f.readline()
            status = f.code
            if status == 401 or status == 403:
                self.disallow_all = True
                robotparser._debug("disallow all")
            elif status >= 400:
                self.allow_all = True
                robotparser._debug("allow all")
            elif status == 200 and lines:
                robotparser._debug("parse lines")
                self.parse(lines)

    class RobotExclusionError(urllib2.HTTPError):
        def __init__(self, request, *args):
            urllib2.HTTPError.__init__(self, *args)
            self.request = request

    class HTTPRobotRulesProcessor(BaseHandler):
        # before redirections, after everything else
        handler_order = 800

        try:
            from httplib import HTTPMessage
        except:
            from mimetools import Message
            http_response_class = Message
        else:
            http_response_class = HTTPMessage

        def __init__(self, rfp_class=MechanizeRobotFileParser):
            self.rfp_class = rfp_class
            self.rfp = None
            self._host = None

        def http_request(self, request):
            scheme = request.get_type()
            if scheme not in ["http", "https"]:
                # robots exclusion only applies to HTTP
                return request

            if request.get_selector() == "/robots.txt":
                # /robots.txt is always OK to fetch
                return request

            host = request.get_host()

            # robots.txt requests don't need to be allowed by robots.txt :-)
            origin_req = getattr(request, "_origin_req", None)
            if (origin_req is not None and
                origin_req.get_selector() == "/robots.txt" and
                origin_req.get_host() == host
                ):
                return request

            if host != self._host:
                self.rfp = self.rfp_class()
                try:
                    self.rfp.set_opener(self.parent)
                except AttributeError:
                    debug("%r instance does not support set_opener" %
                          self.rfp.__class__)
                self.rfp.set_url(scheme+"://"+host+"/robots.txt")
                self.rfp.read()
                self._host = host

            ua = request.get_header("User-agent", "")
            if self.rfp.can_fetch(ua, request.get_full_url()):
                return request
            else:
                # XXX This should really have raised URLError.  Too late now...
                msg = "request disallowed by robots.txt"
                raise RobotExclusionError(
                    request,
                    request.get_full_url(),
                    403, msg,
                    self.http_response_class(StringIO()), StringIO(msg))

        https_request = http_request
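
# A minimal usage sketch (not part of the original module; the URL is
# illustrative).  With HTTPRobotRulesProcessor installed, a fetch that
# robots.txt forbids raises RobotExclusionError (a subclass of HTTPError):
#
#     opener = urllib2.build_opener(HTTPRobotRulesProcessor())
#     try:
#         opener.open("http://example.com/private/")
#     except RobotExclusionError, exc:
#         print "disallowed:", exc.geturl()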

class HTTPRefererProcessor(BaseHandler):
    """Add Referer header to requests.

    This only makes sense if you use each RefererProcessor for a single
    chain of requests only (so, for example, if you use a single
    HTTPRefererProcessor to fetch a series of URLs extracted from a single
    page, this will break).

    There's a proper implementation of this in mechanize.Browser.

    """
    def __init__(self):
        self.referer = None

    def http_request(self, request):
        if ((self.referer is not None) and
            not request.has_header("Referer")):
            request.add_unredirected_header("Referer", self.referer)
        return request

    def http_response(self, request, response):
        self.referer = response.geturl()
        return response

    https_request = http_request
    https_response = http_response
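
# A minimal usage sketch (not part of the original module; the URLs are
# illustrative).  Because the processor remembers only the most recent
# response, it only suits a single linear chain of requests:
#
#     opener = urllib2.build_opener(HTTPRefererProcessor())
#     opener.open("http://example.com/page1")
#     # the second fetch is sent with "Referer: http://example.com/page1"
#     opener.open("http://example.com/page2")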


def clean_refresh_url(url):
    # e.g. Firefox 1.5 does (something like) this
    if ((url.startswith('"') and url.endswith('"')) or
        (url.startswith("'") and url.endswith("'"))):
        url = url[1:-1]
    return _rfc3986.clean_url(url, "latin-1")  # XXX encoding

def parse_refresh_header(refresh):
    """
    >>> parse_refresh_header("1; url=http://example.com/")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1; url='http://example.com/'")
    (1.0, 'http://example.com/')
    >>> parse_refresh_header("1")
    (1.0, None)
    >>> parse_refresh_header("blah")
    Traceback (most recent call last):
    ValueError: invalid literal for float(): blah

    """

    ii = refresh.find(";")
    if ii != -1:
        pause, newurl_spec = float(refresh[:ii]), refresh[ii+1:]
        jj = newurl_spec.find("=")
        key = None
        if jj != -1:
            key, newurl = newurl_spec[:jj], newurl_spec[jj+1:]
            newurl = clean_refresh_url(newurl)
        if key is None or key.strip().lower() != "url":
            raise ValueError()
    else:
        pause, newurl = float(refresh), None
    return pause, newurl

class HTTPRefreshProcessor(BaseHandler):
    """Perform HTTP Refresh redirections.

    Note that if a non-200 HTTP code has occurred (for example, a 30x
    redirect), this processor will do nothing.

    By default, only zero-time Refresh headers are redirected.  Use the
    max_time attribute / constructor argument to allow Refresh with longer
    pauses.  Use the honor_time attribute / constructor argument to control
    whether the requested pause is honoured (with a time.sleep()) or
    skipped in favour of immediate redirection.

    Public attributes:

    max_time: see above
    honor_time: see above

    """
    handler_order = 1000

    def __init__(self, max_time=0, honor_time=True):
        self.max_time = max_time
        self.honor_time = honor_time

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code == 200 and hdrs.has_key("refresh"):
            refresh = hdrs.getheaders("refresh")[0]
            try:
                pause, newurl = parse_refresh_header(refresh)
            except ValueError:
                debug("bad Refresh header: %r" % refresh)
                return response
            if newurl is None:
                newurl = response.geturl()
            if (self.max_time is None) or (pause <= self.max_time):
                if pause > 1E-3 and self.honor_time:
                    time.sleep(pause)
                hdrs["location"] = newurl
                # hardcoded http is NOT a bug
                response = self.parent.error(
                    "http", request, response,
                    "refresh", msg, hdrs)

        return response

    https_response = http_response
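
# A minimal usage sketch (not part of the original module; the values are
# illustrative).  This follows Refresh headers of up to ten seconds without
# sleeping; pairing it with this module's HTTPRedirectHandler supplies the
# http_error_refresh method that parent.error() dispatches to:
#
#     opener = urllib2.build_opener(HTTPRefreshProcessor(max_time=10,
#                                                        honor_time=False),
#                                   HTTPRedirectHandler())
#     response = opener.open("http://example.com/")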

class HTTPErrorProcessor(BaseHandler):
    """Process HTTP error responses.

    The purpose of this handler is to allow other response processors a
    look-in by removing the call to parent.error() from
    AbstractHTTPHandler.

    For non-200 error codes, this just passes the job on to the
    Handler.<proto>_error_<code> methods, via the OpenerDirector.error
    method.  Eventually, urllib2.HTTPDefaultErrorHandler will raise an
    HTTPError if no other handler handles the error.

    """
    handler_order = 1000  # after all other processors

    def http_response(self, request, response):
        code, msg, hdrs = response.code, response.msg, response.info()

        if code != 200:
            # hardcoded http is NOT a bug
            response = self.parent.error(
                "http", request, response, code, msg, hdrs)

        return response

    https_response = http_response


class HTTPDefaultErrorHandler(BaseHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        # why these error methods took the code, msg, headers args in the first
        # place rather than a response object, I don't know, but to avoid
        # multiple wrapping, we're discarding them

        if isinstance(fp, urllib2.HTTPError):
            response = fp
        else:
            response = urllib2.HTTPError(
                req.get_full_url(), code, msg, hdrs, fp)
        assert code == response.code
        assert msg == response.msg
        assert hdrs == response.hdrs
        raise response


class AbstractHTTPHandler(BaseHandler):

    def __init__(self, debuglevel=0):
        self._debuglevel = debuglevel

    def set_http_debuglevel(self, level):
        self._debuglevel = level

    def do_request_(self, request):
        host = request.get_host()
        if not host:
            raise URLError('no host given')

        if request.has_data():  # POST
            data = request.get_data()
            if not request.has_header('Content-type'):
                request.add_unredirected_header(
                    'Content-type',
                    'application/x-www-form-urlencoded')

        scheme, sel = urllib.splittype(request.get_selector())
        sel_host, sel_path = urllib.splithost(sel)
        if not request.has_header('Host'):
            request.add_unredirected_header('Host', sel_host or host)
        for name, value in self.parent.addheaders:
            name = name.capitalize()
            if not request.has_header(name):
                request.add_unredirected_header(name, value)

        return request

    def do_open(self, http_class, req):
        """Return an addinfourl object for the request, using http_class.

        http_class must implement the HTTPConnection API from httplib.
        The addinfourl return value is a file-like object.  It also
        has methods and attributes including:
            - info(): return a mimetools.Message object for the headers
            - geturl(): return the original request URL
            - code: HTTP status code
        """
        host = req.get_host()
        if not host:
            raise URLError('no host given')

        h = http_class(host) # will parse host:port
        h.set_debuglevel(self._debuglevel)

        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        # We want to make an HTTP/1.1 request, but the addinfourl
        # class isn't prepared to deal with a persistent connection.
        # It will try to read all remaining data from the socket,
        # which will block while the server waits for the next request.
        # So make sure the connection gets closed after the (only)
        # request.
        headers["Connection"] = "close"
        headers = dict(
            [(name.title(), val) for name, val in headers.items()])
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err: # XXX what error?
            raise URLError(err)

        # Pick apart the HTTPResponse object to get the addinfourl
        # object initialized properly.

        # Wrap the HTTPResponse object in socket's file object adapter
        # for Windows.  That adapter calls recv(), so delegate recv()
        # to read().  This weird wrapping allows the returned object to
        # have readline() and readlines() methods.

        # XXX It might be better to extract the read buffering code
        # out of socket._fileobject() and into a base class.

        r.recv = r.read
        fp = socket._fileobject(r)

        resp = closeable_response(fp, r.msg, req.get_full_url(),
                                  r.status, r.reason)
        return resp


class HTTPHandler(AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(httplib.HTTPConnection, req)

    http_request = AbstractHTTPHandler.do_request_

if hasattr(httplib, 'HTTPS'):

    class HTTPSConnectionFactory:
        def __init__(self, key_file, cert_file):
            self._key_file = key_file
            self._cert_file = cert_file
        def __call__(self, hostport):
            return httplib.HTTPSConnection(
                hostport,
                key_file=self._key_file, cert_file=self._cert_file)

    class HTTPSHandler(AbstractHTTPHandler):
        def __init__(self, client_cert_manager=None):
            AbstractHTTPHandler.__init__(self)
            self.client_cert_manager = client_cert_manager

        def https_open(self, req):
            if self.client_cert_manager is not None:
                key_file, cert_file = self.client_cert_manager.find_key_cert(
                    req.get_full_url())
                conn_factory = HTTPSConnectionFactory(key_file, cert_file)
            else:
                conn_factory = httplib.HTTPSConnection
            return self.do_open(conn_factory, req)

        https_request = AbstractHTTPHandler.do_request_