| 1 | """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. |
|---|
| 2 | |
|---|
| 3 | Examples |
|---|
| 4 | |
|---|
| 5 | This program extracts all links from a document. It will print one |
|---|
| 6 | line for each link, containing the URL and the textual description |
|---|
| 7 | between the <A>...</A> tags: |
|---|
| 8 | |
|---|
| 9 | import pullparser, sys |
|---|
| 10 | f = file(sys.argv[1]) |
|---|
| 11 | p = pullparser.PullParser(f) |
|---|
| 12 | for token in p.tags("a"): |
|---|
| 13 | if token.type == "endtag": continue |
|---|
| 14 | url = dict(token.attrs).get("href", "-") |
|---|
| 15 | text = p.get_compressed_text(endat=("endtag", "a")) |
|---|
| 16 | print "%s\t%s" % (url, text) |
|---|
| 17 | |
|---|
| 18 | This program extracts the <TITLE> from the document: |
|---|
| 19 | |
|---|
| 20 | import pullparser, sys |
|---|
| 21 | f = file(sys.argv[1]) |
|---|
| 22 | p = pullparser.PullParser(f) |
|---|
| 23 | if p.get_tag("title"): |
|---|
| 24 | title = p.get_compressed_text() |
|---|
| 25 | print "Title: %s" % title |
|---|
| 26 | |
|---|
| 27 | |
|---|
| 28 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> |
|---|
| 29 | Copyright 1998-2001 Gisle Aas (original libwww-perl code) |
|---|
| 30 | |
|---|
| 31 | This code is free software; you can redistribute it and/or modify it |
|---|
| 32 | under the terms of the BSD or ZPL 2.1 licenses. |
|---|
| 33 | |
|---|
| 34 | """ |
|---|
| 35 | |
|---|
| 36 | import re, htmlentitydefs |
|---|
| 37 | import sgmllib, HTMLParser |
|---|
| 38 | |
|---|
| 39 | from _html import unescape, unescape_charref |
|---|
| 40 | |
|---|
| 41 | |
|---|
class NoMoreTokensError(Exception):
    """Raised by the token-fetching methods when the input is exhausted."""
|---|
| 43 | |
|---|
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    A Token can be compared and unpacked like a 3-tuple, and also exposes
    the attributes .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
    "data", "comment", "decl", "pi", after the corresponding methods of
    HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
    tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
    (or None if token does not represent an opening tag)

    """

    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        # Support tuple-style unpacking: type, data, attrs = token.
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        # Unpack rather than index so any 3-element iterable compares equal.
        kind, data, attrs = other
        return self.type == kind and self.data == data and self.attrs == attrs

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        details = ", ".join([repr(self.type), repr(self.data), repr(self.attrs)])
        return "%s(%s)" % (self.__class__.__name__, details)
|---|
| 87 | |
|---|
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until it raises `exception`.

    The exception terminates the iteration and is swallowed.  A plain
    ``return`` is used to end the generator rather than ``raise
    StopIteration``: the two are equivalent here, but raising StopIteration
    inside a generator becomes a RuntimeError under PEP 479 (Python 3.7+).

    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            return
|---|
| 94 | |
|---|
| 95 | |
|---|
class _AbstractParser:
    """Shared implementation for the pull-parser classes below.

    Subclasses mix this in with a real event-driven HTML parser
    (HTMLParser.HTMLParser or sgmllib.SGMLParser).  The parser delivers
    events by calling the handle_* / unknown_* methods defined here, which
    queue Token objects on an internal FIFO; the .get_token() family pops
    tokens from that FIFO, feeding the parser more input on demand.

    """
    # Number of bytes requested from the file object each time the token
    # FIFO runs dry.
    chunk = 1024
    # Matches runs of whitespace; used by .get_compressed_text().
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
        read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
        to represent opening tags as text; defaults to
        {"img": "alt", "applet": "alt"}
        encoding: encoding used to encode numeric character references by
        .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
        definitions (a sensible default is used). This is used to unescape
        entities in .get_text() (and .get_compressed_text()) and attribute
        values. If the encoding can not represent the character, the entity
        reference is left unescaped. Note that entity references (both
        numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
        unescaped in attribute values and the return value of .get_text(), but
        not in data outside of tags. Instead, entity references outside of
        tags are represented as tokens. This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text. The corresponding value is
        used to specify which tag attribute to obtain the text from. textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        if textify is None:
            # Build a fresh dict per instance instead of using a mutable
            # default argument, which would be shared between all instances
            # (mutating one parser's .textify would affect every other).
            textify = {"img": "alt", "applet": "alt"}
        self._fh = fh
        self._tokenstack = []  # FIFO of Token objects
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Iterate over tags, optionally restricted to the given element names."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Iterate over tokens, optionally restricted to the given token types."""
        return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)

    def next(self):
        # Python 2 iterator protocol: translate our sentinel exception into
        # the one the for-loop machinery expects.
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens of other types will be skipped.  Token
        types must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            # Drain the FIFO first; only read more input when it is empty.
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ["starttag", "endtag", "startendtag"]:
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
        returned text); endtag is a tuple (type, name) where type is
        "starttag", "endtag" or "startendtag", and name is the element name of
        the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the class docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    # Translate the opening tag to text per the textify map.
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                # Stop on any tag when endat is None, or on the exact
                # (type, name) match; either way the tag goes back on the
                # stack so the caller can still retrieve it.
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser event handlers: each queues a Token on the FIFO.  Called by the
    # concrete parser base class (HTMLParser.HTMLParser / sgmllib.SGMLParser).
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity and character references in an attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return a copy of the (name, value) list with values unescaped."""
        escaped_attrs = []
        for key, val in attrs:
            escaped_attrs.append((key, self.unescape_attr(val)))
        return escaped_attrs
|---|
| 308 | |
|---|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on the strict HTMLParser.HTMLParser parser."""

    def __init__(self, *args, **kwargs):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unescape(self, name):
        """Unescape using the entitydefs passed into the constructor.

        Overrides HTMLParser.HTMLParser's own entitydefs-based unescaping.
        """
        return self.unescape_attr(name)
|---|
| 317 | |
|---|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on the more forgiving sgmllib.SGMLParser parser."""

    def __init__(self, *args, **kwargs):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwargs)

    def unknown_starttag(self, tag, attrs):
        # Attribute values arrive still escaped from sgmllib, so unescape
        # them before queueing the token.
        self._tokenstack.append(
            Token("starttag", tag, self.unescape_attrs(attrs)))

    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
|---|
| 327 | |
|---|
| 328 | |
|---|
def _test():
    """Run this module's doctests and return the (failures, tests) result."""
    import doctest
    import _pullparser
    return doctest.testmod(_pullparser)
|---|
| 332 | |
|---|
| 333 | if __name__ == "__main__": |
|---|
| 334 | _test() |
|---|