| 1 | """A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. | 
|---|
| 2 |  | 
|---|
| 3 | Examples | 
|---|
| 4 |  | 
|---|
| 5 | This program extracts all links from a document.  It will print one | 
|---|
| 6 | line for each link, containing the URL and the textual description | 
|---|
| 7 | between the <A>...</A> tags: | 
|---|
| 8 |  | 
|---|
| 9 | import pullparser, sys | 
|---|
| 10 | f = file(sys.argv[1]) | 
|---|
| 11 | p = pullparser.PullParser(f) | 
|---|
| 12 | for token in p.tags("a"): | 
|---|
| 13 | if token.type == "endtag": continue | 
|---|
| 14 | url = dict(token.attrs).get("href", "-") | 
|---|
| 15 | text = p.get_compressed_text(endat=("endtag", "a")) | 
|---|
| 16 | print "%s\t%s" % (url, text) | 
|---|
| 17 |  | 
|---|
| 18 | This program extracts the <TITLE> from the document: | 
|---|
| 19 |  | 
|---|
| 20 | import pullparser, sys | 
|---|
| 21 | f = file(sys.argv[1]) | 
|---|
| 22 | p = pullparser.PullParser(f) | 
|---|
| 23 | if p.get_tag("title"): | 
|---|
| 24 | title = p.get_compressed_text() | 
|---|
| 25 | print "Title: %s" % title | 
|---|
| 26 |  | 
|---|
| 27 |  | 
|---|
| 28 | Copyright 2003-2006 John J. Lee <jjl@pobox.com> | 
|---|
| 29 | Copyright 1998-2001 Gisle Aas (original libwww-perl code) | 
|---|
| 30 |  | 
|---|
| 31 | This code is free software; you can redistribute it and/or modify it | 
|---|
| 32 | under the terms of the BSD or ZPL 2.1 licenses. | 
|---|
| 33 |  | 
|---|
| 34 | """ | 
|---|
| 35 |  | 
|---|
| 36 | import re, htmlentitydefs | 
|---|
| 37 | import sgmllib, HTMLParser | 
|---|
| 38 |  | 
|---|
| 39 | from _html import unescape, unescape_charref | 
|---|
| 40 |  | 
|---|
| 41 |  | 
|---|
class NoMoreTokensError(Exception):
    """Raised when the input is exhausted and no further token is available."""
| 43 |  | 
|---|
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True
    >>> t == None
    False

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        """Iterate over (type, data, attrs), so a Token unpacks like a tuple."""
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        """Compare field by field with any iterable of exactly three items.

        Anything that cannot be unpacked into three items (None, numbers,
        sequences of another length, ...) compares unequal instead of
        raising.
        """
        try:
            other_type, other_data, other_attrs = other
        except (TypeError, ValueError):
            # TypeError: not iterable; ValueError: wrong number of items.
            return False
        return bool(self.type == other_type and
                    self.data == other_data and
                    self.attrs == other_attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        args = ", ".join(map(repr, [self.type, self.data, self.attrs]))
        return self.__class__.__name__+"(%s)" % args
| 87 |  | 
|---|
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) over and over until it raises `exception`.

    fn: callable invoked once per item produced
    exception: exception class (or tuple of classes) that marks the end of
     the sequence; it is swallowed and iteration simply stops
    args, kwds: passed through to fn unchanged on every call

    Any other exception raised by fn propagates to the caller.

    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # Finish the generator with a plain return: raising StopIteration
            # inside a generator body is turned into RuntimeError by PEP 479.
            return
| 94 |  | 
|---|
| 95 |  | 
|---|
class _AbstractParser:
    """Pull-style token queue shared by the concrete parsers.

    This class is meant to be mixed with a push-style parser that supplies
    .feed() and invokes the handle_*() callbacks defined here.  Each callback
    queues a Token; the get_*() methods hand those tokens out on demand and
    read more input from the file object whenever the queue runs dry.

    """
    chunk = 1024  # size passed to fh.read() each time more input is needed
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text ({"img": "alt", "applet": "alt"}
         if not given; every parser gets its own copy of that default)
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping like {"amp": "&", ...} containing HTML entity
         definitions (a sensible default is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg. "alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        self._fh = fh
        self._tokenstack = []  # FIFO
        if textify is None:
            # Build the default per instance: a dict literal in the signature
            # would be one object shared (and mutated) by every parser.
            textify = {"img": "alt", "applet": "alt"}
        self.textify = textify
        self.encoding = encoding
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Return an iterator over .get_tag(*names) until input runs out."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Return an iterator over .get_token(*tokentypes) until input runs out."""
        return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes)

    def next(self):
        """Iterator protocol: return the next token, StopIteration at the end."""
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    # Same method under the name the Python 3 iterator protocol looks up.
    __next__ = next

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Skipped tokens are discarded, not pushed back.

        Raises NoMoreTokensError.

        """
        while 1:
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if tokentypes:
                    if token.type in tokentypes:
                        return token
                else:
                    return token
            # Queue is empty: read another chunk and let the underlying
            # parser's callbacks refill it.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ("starttag", "endtag", "startendtag"):
                continue
            if names:
                if tok.data in names:
                    return tok
            else:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the constructor docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get).
                # NOTE: any text that token contributed is already in `text`.
                if tok is not None:
                    self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ("starttag", "endtag", "startendtag"):
                tag_name = tok.data
                if tok.type in ("starttag", "startendtag"):
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            # attribute value(s) first, then "[TAGNAME]"
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    # leave the terminating tag queued for the caller
                    self.unget_token(tok)
                    break
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one queues a Token describing what was parsed.

    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))

    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))

    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))

    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))

    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))

    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))

    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))

    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))

    def unknown_decl(self, data):
        # XXX should this call self.error("unknown declaration: ...") instead?
        self._tokenstack.append(Token("decl", data))

    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)

    def unescape_attrs(self, attrs):
        """Return a new list of (name, value) pairs with every value unescaped."""
        return [(key, self.unescape_attr(val)) for key, val in attrs]
| 308 |  | 
|---|
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser that tokenizes its input with HTMLParser.HTMLParser.

    Constructor arguments are those of _AbstractParser.__init__.

    """

    def __init__(self, *args, **kwds):
        # The tokenizer base takes no arguments; everything the caller
        # supplied goes to the pull-API base.
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unescape(self, name):
        """Unescape `name` via .unescape_attr().

        Overridden so that the entitydefs passed into the constructor are
        used, not HTMLParser.HTMLParser's entitydefs.

        """
        return self.unescape_attr(name)
| 317 |  | 
|---|
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser that tokenizes its input with sgmllib.SGMLParser.

    Constructor arguments are those of _AbstractParser.__init__.

    """

    def __init__(self, *args, **kwds):
        # The tokenizer base takes no arguments; everything the caller
        # supplied goes to the pull-API base.
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)

    def unknown_starttag(self, tag, attrs):
        """Queue a "starttag" Token whose attribute values are unescaped."""
        unescaped = self.unescape_attrs(attrs)
        start = Token("starttag", tag, unescaped)
        self._tokenstack.append(start)

    def unknown_endtag(self, tag):
        """Queue an "endtag" Token for `tag`."""
        end = Token("endtag", tag)
        self._tokenstack.append(end)
| 327 |  | 
|---|
| 328 |  | 
|---|
def _test():
    """Run the doctests of the _pullparser module; return testmod()'s result."""
    import doctest
    import _pullparser
    results = doctest.testmod(_pullparser)
    return results


# Executed as a script: run the doctests.
if __name__ == "__main__":
    _test()