1 | """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. |
---|
2 | |
---|
3 | Examples |
---|
4 | |
---|
5 | This program extracts all links from a document. It will print one |
---|
6 | line for each link, containing the URL and the textual description |
---|
7 | between the <A>...</A> tags: |
---|
8 | |
---|
9 | import pullparser, sys |
---|
10 | f = file(sys.argv[1]) |
---|
11 | p = pullparser.PullParser(f) |
---|
12 | for token in p.tags("a"): |
---|
13 | if token.type == "endtag": continue |
---|
14 | url = dict(token.attrs).get("href", "-") |
---|
15 | text = p.get_compressed_text(endat=("endtag", "a")) |
---|
16 | print "%s\t%s" % (url, text) |
---|
17 | |
---|
18 | This program extracts the <TITLE> from the document: |
---|
19 | |
---|
20 | import pullparser, sys |
---|
21 | f = file(sys.argv[1]) |
---|
22 | p = pullparser.PullParser(f) |
---|
23 | if p.get_tag("title"): |
---|
24 | title = p.get_compressed_text() |
---|
25 | print "Title: %s" % title |
---|
26 | |
---|
27 | |
---|
28 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
---|
29 | Copyright 1998-2001 Gisle Aas (original libwww-perl code) |
---|
30 | |
---|
31 | This code is free software; you can redistribute it and/or modify it |
---|
32 | under the terms of the BSD or ZPL 2.1 licenses. |
---|
33 | |
---|
34 | """ |
---|
35 | |
---|
36 | import re, htmlentitydefs |
---|
37 | import sgmllib, HTMLParser |
---|
38 | |
---|
39 | from _html import unescape, unescape_charref |
---|
40 | |
---|
41 | |
---|
42 | class NoMoreTokensError(Exception): pass |
---|
43 | |
---|
44 | class Token: |
---|
45 | """Represents an HTML tag, declaration, processing instruction etc. |
---|
46 | |
---|
47 | Behaves as both a tuple-like object (ie. iterable) and has attributes |
---|
48 | .type, .data and .attrs. |
---|
49 | |
---|
50 | >>> t = Token("starttag", "a", [("href", "http://www.python.org/")]) |
---|
51 | >>> t == ("starttag", "a", [("href", "http://www.python.org/")]) |
---|
52 | True |
---|
53 | >>> (t.type, t.data) == ("starttag", "a") |
---|
54 | True |
---|
55 | >>> t.attrs == [("href", "http://www.python.org/")] |
---|
56 | True |
---|
57 | |
---|
58 | Public attributes |
---|
59 | |
---|
60 | type: one of "starttag", "endtag", "startendtag", "charref", "entityref", |
---|
61 | "data", "comment", "decl", "pi", after the corresponding methods of |
---|
62 | HTMLParser.HTMLParser |
---|
63 | data: For a tag, the tag name; otherwise, the relevant data carried by the |
---|
64 | tag, as a string |
---|
65 | attrs: list of (name, value) pairs representing HTML attributes |
---|
66 | (or None if token does not represent an opening tag) |
---|
67 | |
---|
68 | """ |
---|
69 | def __init__(self, type, data, attrs=None): |
---|
70 | self.type = type |
---|
71 | self.data = data |
---|
72 | self.attrs = attrs |
---|
73 | def __iter__(self): |
---|
74 | return iter((self.type, self.data, self.attrs)) |
---|
75 | def __eq__(self, other): |
---|
76 | type, data, attrs = other |
---|
77 | if (self.type == type and |
---|
78 | self.data == data and |
---|
79 | self.attrs == attrs): |
---|
80 | return True |
---|
81 | else: |
---|
82 | return False |
---|
83 | def __ne__(self, other): return not self.__eq__(other) |
---|
84 | def __repr__(self): |
---|
85 | args = ", ".join(map(repr, [self.type, self.data, self.attrs])) |
---|
86 | return self.__class__.__name__+"(%s)" % args |
---|
87 | |
---|
88 | def iter_until_exception(fn, exception, *args, **kwds): |
---|
89 | while 1: |
---|
90 | try: |
---|
91 | yield fn(*args, **kwds) |
---|
92 | except exception: |
---|
93 | raise StopIteration |
---|
94 | |
---|
95 | |
---|
96 | class _AbstractParser: |
---|
97 | chunk = 1024 |
---|
98 | compress_re = re.compile(r"\s+") |
---|
99 | def __init__(self, fh, textify={"img": "alt", "applet": "alt"}, |
---|
100 | encoding="ascii", entitydefs=None): |
---|
101 | """ |
---|
102 | fh: file-like object (only a .read() method is required) from which to |
---|
103 | read HTML to be parsed |
---|
104 | textify: mapping used by .get_text() and .get_compressed_text() methods |
---|
105 | to represent opening tags as text |
---|
106 | encoding: encoding used to encode numeric character references by |
---|
107 | .get_text() and .get_compressed_text() ("ascii" by default) |
---|
108 | |
---|
109 | entitydefs: mapping like {"amp": "&", ...} containing HTML entity |
---|
110 | definitions (a sensible default is used). This is used to unescape |
---|
111 | entities in .get_text() (and .get_compressed_text()) and attribute |
---|
112 | values. If the encoding can not represent the character, the entity |
---|
113 | reference is left unescaped. Note that entity references (both |
---|
114 | numeric - e.g. { or ઼ - and non-numeric - e.g. &) are |
---|
115 | unescaped in attribute values and the return value of .get_text(), but |
---|
116 | not in data outside of tags. Instead, entity references outside of |
---|
117 | tags are represented as tokens. This is a bit odd, it's true :-/ |
---|
118 | |
---|
119 | If the element name of an opening tag matches a key in the textify |
---|
120 | mapping then that tag is converted to text. The corresponding value is |
---|
121 | used to specify which tag attribute to obtain the text from. textify |
---|
122 | maps from element names to either: |
---|
123 | |
---|
124 | - an HTML attribute name, in which case the HTML attribute value is |
---|
125 | used as its text value along with the element name in square |
---|
126 | brackets (eg."alt text goes here[IMG]", or, if the alt attribute |
---|
127 | were missing, just "[IMG]") |
---|
128 | - a callable object (eg. a function) which takes a Token and returns |
---|
129 | the string to be used as its text value |
---|
130 | |
---|
131 | If textify has no key for an element name, nothing is substituted for |
---|
132 | the opening tag. |
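
        For example (a rough sketch, not run as a doctest; assumes Python 2's
        StringIO module and the PullParser class defined below):

          from StringIO import StringIO
          p = PullParser(StringIO('Fig: <img src="spam.gif" alt="spam">'))
          p.get_compressed_text()  # -> "Fig: spam[IMG]"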

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)

    def next(self):
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens of other types will be skipped.
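
        For example (an illustrative sketch only), given a parser p:

          tok = p.get_token("starttag", "endtag")  # skips data, comments, etc.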

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the class docstring.

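        A rough usage sketch (not run as a doctest; assumes a parser p reading
        the markup "<p>Hello <b>brave</b> world</p>"):

          p.get_tag("p")                      # consume the <p> start tag
          p.get_text(endat=("endtag", "p"))   # -> "Hello brave world"
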
        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;"%tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.
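
        For example (an illustrative sketch, not a doctest):

          # where .get_text() would have returned "  Hello   brave  world "
          p.get_compressed_text()  # -> "Hello brave world"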

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        return unescape(name, self._entitydefs, self.encoding)
    def unescape_attrs(self, attrs):
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs

class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)

class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unknown_starttag(self, tag, attrs):
        attrs = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, attrs))
    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
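
# A minimal usage sketch (assumes Python 2's StringIO module; the HTML string
# is made up for illustration).  TolerantPullParser is built on sgmllib and is
# intended to cope with sloppier HTML than the HTMLParser-based PullParser:
#
#   from StringIO import StringIO
#   p = TolerantPullParser(StringIO("<p>Some <b>bold text</p>"))
#   p.get_tag("p")
#   print p.get_compressed_text(endat=("endtag", "p"))  # prints "Some bold text"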


def _test():
    import doctest, _pullparser
    return doctest.testmod(_pullparser)

if __name__ == "__main__":
    _test()