root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_html.py @ 3

リビジョン 3, 19.8 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""HTML handling.
2
3Copyright 2003-2006 John J. Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it under
6the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
7included with the distribution).
8
9"""
10
11import re, copy, htmlentitydefs
12import sgmllib, HTMLParser, ClientForm
13
14import _request
15from _headersutil import split_header_words, is_html as _is_html
16import _rfc3986
17
18DEFAULT_ENCODING = "latin-1"
19
20
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError):
    """Raised when a document cannot be parsed.

    Subclasses ClientForm.ParseError purely for backwards compatibility,
    so code catching the ClientForm exception keeps working.
    """
    pass
23
24
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    The wrapped iterable is consumed at most once; items already produced
    are replayed from an internal cache, so every call to the instance
    yields the full sequence from the start.
    """

    def __init__(self, iterable):
        # Items produced so far, replayed first on every call.
        self._cache = []
        # Hold an iterator rather than the iterable itself so repeated
        # calls can never restart the source and duplicate items.
        self._iterator = iter(iterable)

    def __call__(self):
        seen = self._cache
        for cached_item in seen:
            yield cached_item
        for fresh_item in self._iterator:
            seen.append(fresh_item)
            yield fresh_item
41
42
class EncodingFinder:
    """Determines the character encoding declared by a response.

    Falls back to a fixed default encoding when no charset is declared.
    """

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for content_type in response.info().getheaders("content-type"):
            params = split_header_words([content_type])[0]
            for key, value in params:
                if key == "charset":
                    return value
        return self._default_encoding
55
class ResponseTypeFinder:
    """Decides whether a response contains an HTML document."""

    def __init__(self, allow_xhtml):
        # When true, XHTML content types are also treated as HTML.
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        # XXX encoding is currently unused here
        content_types = response.info().getheaders("content-type")
        return _is_html(content_types, response.geturl(), self._allow_xhtml)
64
65
66# idea for this argument-processing trick is from Peter Otten
class Args:
    """Attribute-style access over a mapping (trick from Peter Otten).

    Attribute reads fall back to class attributes when the key is absent
    (raising AttributeError for names defined nowhere).  Attribute writes
    are stored in the underlying dictionary, so ``**args.dictionary``
    reflects later modifications.
    """

    def __init__(self, args_map):
        # Assign via __dict__ to bypass __setattr__, which routes every
        # other write into the map itself.
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        # Bug fix (matches the later upstream mechanize implementation):
        # previously, assignments such as RobustFormsFactory's
        # ``args.form_parser_class = RobustFormParser`` landed in the
        # instance __dict__ and were invisible to ``**args.dictionary``,
        # silently discarding the intended default.
        if key == "dictionary":
            raise AttributeError(key)
        self.dictionary[key] = value
75
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the form-parser keyword arguments into an Args object."""
    kwds = {
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        }
    return Args(kwds)
83
84
class Link:
    """A hyperlink extracted from an HTML document."""

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Links compare equal iff url, text, tag and attrs all match;
        # anything missing an attribute sorts as "less than".
        try:
            for attr_name in ("url", "text", "tag", "attrs"):
                if getattr(self, attr_name) != getattr(other, attr_name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
102
103
104class LinksFactory:
105
106    def __init__(self,
107                 link_parser_class=None,
108                 link_class=Link,
109                 urltags=None,
110                 ):
111        import _pullparser
112        if link_parser_class is None:
113            link_parser_class = _pullparser.TolerantPullParser
114        self.link_parser_class = link_parser_class
115        self.link_class = link_class
116        if urltags is None:
117            urltags = {
118                "a": "href",
119                "area": "href",
120                "frame": "src",
121                "iframe": "src",
122                }
123        self.urltags = urltags
124        self._response = None
125        self._encoding = None
126
127    def set_response(self, response, base_url, encoding):
128        self._response = response
129        self._encoding = encoding
130        self._base_url = base_url
131
132    def links(self):
133        """Return an iterator that provides links of the document."""
134        response = self._response
135        encoding = self._encoding
136        base_url = self._base_url
137        p = self.link_parser_class(response, encoding=encoding)
138
139        try:
140            for token in p.tags(*(self.urltags.keys()+["base"])):
141                if token.type == "endtag":
142                    continue
143                if token.data == "base":
144                    base_href = dict(token.attrs).get("href")
145                    if base_href is not None:
146                        base_url = base_href
147                    continue
148                attrs = dict(token.attrs)
149                tag = token.data
150                name = attrs.get("name")
151                text = None
152                # XXX use attr_encoding for ref'd doc if that doc does not
153                #  provide one by other means
154                #attr_encoding = attrs.get("charset")
155                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
156                if not url:
157                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
158                    # For our purposes a link is something with a URL, so
159                    # ignore this.
160                    continue
161
162                url = _rfc3986.clean_url(url, encoding)
163                if tag == "a":
164                    if token.type != "startendtag":
165                        # hmm, this'd break if end tag is missing
166                        text = p.get_compressed_text(("endtag", tag))
167                    # but this doesn't work for eg.
168                    # <a href="blah"><b>Andy</b></a>
169                    #text = p.get_compressed_text()
170
171                yield Link(base_url, url, text, tag, token.attrs)
172        except sgmllib.SGMLParseError, exc:
173            raise ParseError(exc)
174
175class FormsFactory:
176
177    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
178
179    After calling .forms(), the .global_form attribute is a form object
180    containing all controls not a descendant of any FORM element.
181
182    For constructor argument docs, see ClientForm.ParseResponse
183    argument docs.
184
185    """
186
187    def __init__(self,
188                 select_default=False,
189                 form_parser_class=None,
190                 request_class=None,
191                 backwards_compat=False,
192                 ):
193        import ClientForm
194        self.select_default = select_default
195        if form_parser_class is None:
196            form_parser_class = ClientForm.FormParser
197        self.form_parser_class = form_parser_class
198        if request_class is None:
199            request_class = _request.Request
200        self.request_class = request_class
201        self.backwards_compat = backwards_compat
202        self._response = None
203        self.encoding = None
204        self.global_form = None
205
206    def set_response(self, response, encoding):
207        self._response = response
208        self.encoding = encoding
209        self.global_form = None
210
211    def forms(self):
212        import ClientForm
213        encoding = self.encoding
214        try:
215            forms = ClientForm.ParseResponseEx(
216                self._response,
217                select_default=self.select_default,
218                form_parser_class=self.form_parser_class,
219                request_class=self.request_class,
220                encoding=encoding,
221                _urljoin=_rfc3986.urljoin,
222                _urlparse=_rfc3986.urlsplit,
223                _urlunparse=_rfc3986.urlunsplit,
224                )
225        except ClientForm.ParseError, exc:
226            raise ParseError(exc)
227        self.global_form = forms[0]
228        return forms[1:]
229
230class TitleFactory:
231    def __init__(self):
232        self._response = self._encoding = None
233
234    def set_response(self, response, encoding):
235        self._response = response
236        self._encoding = encoding
237
238    def title(self):
239        import _pullparser
240        p = _pullparser.TolerantPullParser(
241            self._response, encoding=self._encoding)
242        try:
243            try:
244                p.get_tag("title")
245            except _pullparser.NoMoreTokensError:
246                return None
247            else:
248                return p.get_text()
249        except sgmllib.SGMLParseError, exc:
250            raise ParseError(exc)
251
252
def unescape(data, entities, encoding):
    """Replace HTML entity and character references in `data`.

    entities: mapping of entity name (e.g. "amp") to Unicode code point.
    encoding: byte encoding for replacements; references that cannot be
     encoded (or are unknown) are left as their original entity text.
    Returns `data` unchanged when it is None or contains no "&".
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # Numeric character reference, e.g. &#38; or &#x26;
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            # Encode the Unicode replacement to a byte string; fall back
            # to the literal entity text if it cannot be encoded.
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            # Unknown entity name: leave the reference as-is.
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
275
def unescape_charref(data, encoding):
    """Decode the body of a numeric character reference (e.g. "38", "x26").

    Returns the decoded character, encoded to `encoding` when one is given;
    if encoding fails, the original "&#...;" reference text is returned.
    """
    if data.startswith("x"):
        # Hexadecimal form, e.g. &#x26;
        codepoint = int(data[1:], 16)
    else:
        codepoint = int(data, 10)
    uc = unichr(codepoint)
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
289
290
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
# Build form-parser classes backed by the two bundled BeautifulSoup
# flavours (the second is the extra-lenient "ICantBelieveIts..." variant).
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# (the replacement pattern also accepts hexadecimal character references)
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
300
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that decodes entity and character references
    using the document encoding supplied at construction time."""

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda match: match.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda match: '<!' + match.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        # Decode numeric references (&#...;) with the document encoding.
        decoded = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(decoded)

    def handle_entityref(self, ref):
        # Decode named references (&amp; etc.) with the document encoding.
        decoded = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(decoded)

    def unescape_attrs(self, attrs):
        # Return (key, value) pairs with entity-escaped values decoded.
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
328
class RobustLinksFactory:

    """BeautifulSoup-based equivalent of LinksFactory."""

    # Used to collapse runs of whitespace in link text.
    compress_re = re.compile(r"\s+")

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _beautifulsoup
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Iterate over links found in the soup set by set_soup()."""
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # (An unused duplicate call to recursiveChildGenerator() was
        # removed here; its result was discarded.)
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    # <base href="..."> rebases all subsequent links.
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                # Bug fix: honour the link_class constructor argument
                # instead of always instantiating Link directly.
                yield self.link_class(base_url, url, text, link.name, attrs)
392
393
class RobustFormsFactory(FormsFactory):
    """FormsFactory variant defaulting to the BeautifulSoup-backed
    RobustFormParser."""

    def __init__(self, *args, **kwds):
        # (Removed an unused local `import ClientForm`.)
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            # Bug fix: assign through the dictionary so the default
            # actually reaches FormsFactory.__init__ -- a plain attribute
            # assignment on Args lands in the instance __dict__ and is
            # invisible to **args.dictionary.
            args.dictionary["form_parser_class"] = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        # NOTE(review): unlike FormsFactory.set_response this does not
        # reset .global_form -- presumably intentional, but confirm.
        self._response = response
        self.encoding = encoding
405
406
class RobustTitleFactory:
    """Extracts the document title from a BeautifulSoup parse tree."""

    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the page title, or None if there is no <title> tag."""
        import _beautifulsoup
        first_title = self._bs.first("title")
        if first_title == _beautifulsoup.Null:
            return None
        return first_title.firstText(lambda t: True)
422
423
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # Attributes computed lazily in __getattr__ on first access and then
    # cached on the instance; set_response() deletes them again so they
    # are recomputed for the new response.
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    # NOTE: the default finder instances below are created once at class
    # definition time and shared across Factory instances; they hold no
    # per-response state, so sharing is safe.
    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        # Initialise all cached state to the "no response" condition.
        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        # Drop the cached forms/links generators for the old response.
        self._forms_genf = self._links_genf = None
        # NOTE(review): _get_title is set but never read in this file;
        # it appears vestigial -- confirm before removing.
        self._get_title = None
        # Delete any cached lazy attributes so __getattr__ recomputes them.
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Only reached when `name` is not found on the instance, i.e.
        # before a lazy attribute has been computed and cached.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        # Each branch assigns the computed value onto the instance, so
        # subsequent reads bypass __getattr__ entirely.
        # NOTE(review): copy.copy(response) is presumably so that sniffing
        # headers/data does not disturb the caller's response object --
        # relies on response objects supporting shallow copy; confirm.
        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # forms() sets self.global_form as a side effect.
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # Reset cached state before re-raising so a later call
                # starts from a clean slate.
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # Reset cached state before re-raising so a later call
                # starts from a clean slate.
                self.set_response(self._response)
                raise
        return self._links_genf()
555
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Hand each sub-factory its own copy of the response so that one
        # factory reading the data does not disturb the others.
        encoding = self.encoding
        self._forms_factory.set_response(copy.copy(response), encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), encoding)
        self._title_factory.set_response(copy.copy(response), encoding)
577
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        self._soup_class = MechanizeBs if soup_class is None else soup_class

    def set_response(self, response):
        import _beautifulsoup
        Factory.set_response(self, response)
        if response is None:
            return
        # Parse the whole document once and share the soup between the
        # links and title factories; the forms factory re-reads a copy.
        soup = self._soup_class(self.encoding, response.read())
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。