root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_pullparser.py

Revision 3, 12.2 KB (committer: kohda, 14 years ago)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Line numbers
1"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
2
3Examples
4
5This program extracts all links from a document.  It will print one
6line for each link, containing the URL and the textual description
7between the <A>...</A> tags:
8
9import pullparser, sys
10f = file(sys.argv[1])
11p = pullparser.PullParser(f)
12for token in p.tags("a"):
13    if token.type == "endtag": continue
14    url = dict(token.attrs).get("href", "-")
15    text = p.get_compressed_text(endat=("endtag", "a"))
16    print "%s\t%s" % (url, text)
17
18This program extracts the <TITLE> from the document:
19
20import pullparser, sys
21f = file(sys.argv[1])
22p = pullparser.PullParser(f)
23if p.get_tag("title"):
24    title = p.get_compressed_text()
25    print "Title: %s" % title
26
27
28Copyright 2003-2006 John J. Lee <jjl@pobox.com>
29Copyright 1998-2001 Gisle Aas (original libwww-perl code)
30
31This code is free software; you can redistribute it and/or modify it
32under the terms of the BSD or ZPL 2.1 licenses.
33
34"""
35
36import re, htmlentitydefs
37import sgmllib, HTMLParser
38
39from _html import unescape, unescape_charref
40
41
class NoMoreTokensError(Exception):
    """Raised by the pull API when the underlying stream is exhausted."""
    pass
43
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """

    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Iterating yields the three fields, making the token tuple-like.
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Unpack so that comparing against any 3-sequence works.
        type, data, attrs = other
        return (self.type == type and
                self.data == data and
                self.attrs == attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "%s(%r, %r, %r)" % (
            self.__class__.__name__, self.type, self.data, self.attrs)
87
def iter_until_exception(fn, exception, *args, **kwds):
    """Generator: yield fn(*args, **kwds) repeatedly until `exception` is raised.

    The exception is swallowed and iteration simply ends.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # PEP 479: raising StopIteration inside a generator is an error
            # on Python 3.7+; a bare return ends iteration identically
            # (and behaves the same on Python 2).
            return
94
95
class _AbstractParser:
    """Shared pull-API machinery for PullParser / TolerantPullParser.

    Subclasses mix this class with a concrete event-driven parser
    (HTMLParser.HTMLParser or sgmllib.SGMLParser) that supplies .feed() and
    invokes the handle_* / unknown_* callbacks below, which queue Token
    objects for the pull methods (.get_token(), .get_tag(), .get_text(), ...).
    """

    # Number of bytes read from the file-like object each time the token
    # queue runs dry in .get_token().
    chunk = 1024
    # Collapses runs of whitespace in .get_compressed_text().
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None,
                 encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text; defaults to
         {"img": "alt", "applet": "alt"}
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO queue of parsed Tokens
        # Build the default per instance rather than using a mutable default
        # argument: the old shared-dict default meant mutating one parser's
        # .textify silently changed the behavior of every other parser.
        if textify is None:
            textify = {"img": "alt", "applet": "alt"}
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self):
        return self

    def tags(self, *names):
        """Iterate over tag Tokens, optionally restricted to given names."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over Tokens, optionally restricted to given token types."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        """Python 2 iterator protocol: return the next Token."""
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    # Alias so instances are also usable as iterators on Python 3.
    __next__ = next

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            # Queue is empty: feed the underlying parser some more input.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok:
                    self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs,
                             self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            # Attribute value (if present) followed by the
                            # element name in brackets, eg. "some text[IMG]".
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    # The terminating tag is pushed back for the caller.
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # -- Parser callbacks: each queues one Token for the pull API. ----------

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))

    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))

    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))

    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))

    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))

    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))

    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))

    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))

    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))

    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity and character references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return attrs with every attribute value unescaped."""
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs
308
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict HTMLParser.HTMLParser."""

    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        # Override HTMLParser.HTMLParser's unescaping so that attribute
        # values are unescaped with the entitydefs mapping supplied to the
        # constructor rather than HTMLParser's built-in table.
        return self.unescape_attr(name)
317
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on the forgiving sgmllib.SGMLParser."""

    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        # SGMLParser does not unescape attribute values itself, so do it here.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
327
328
def _test():
    # Run the doctest examples embedded in this module's docstrings.
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)
332
if __name__ == "__main__":
    # Running the module directly executes its doctests.
    _test()
Note: See TracBrowser for help on using the repository browser.