Context Navigation

_pullparser.py @ 3

リビジョン 3, 12.2 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser.
2
3	Examples
4
5	This program extracts all links from a document. It will print one
6	line for each link, containing the URL and the textual description
7	between the <A>...</A> tags:
8
9	import pullparser, sys
10	f = file(sys.argv[1])
11	p = pullparser.PullParser(f)
12	for token in p.tags("a"):
13	    if token.type == "endtag": continue
14	    url = dict(token.attrs).get("href", "-")
15	    text = p.get_compressed_text(endat=("endtag", "a"))
16	    print "%s\t%s" % (url, text)
17
18	This program extracts the <TITLE> from the document:
19
20	import pullparser, sys
21	f = file(sys.argv[1])
22	p = pullparser.PullParser(f)
23	if p.get_tag("title"):
24	    title = p.get_compressed_text()
25	    print "Title: %s" % title
26
27
28	Copyright 2003-2006 John J. Lee <jjl@pobox.com>
29	Copyright 1998-2001 Gisle Aas (original libwww-perl code)
30
31	This code is free software; you can redistribute it and/or modify it
32	under the terms of the BSD or ZPL 2.1 licenses.
33
34	"""
35
36	import re, htmlentitydefs
37	import sgmllib, HTMLParser
38
39	from _html import unescape, unescape_charref
40
41
class NoMoreTokensError(Exception):
    """Raised when the input is exhausted and no parsed tokens remain."""
43
class Token:
    """Represents an HTML tag, declaration, processing instruction etc.

    Behaves as both a tuple-like object (ie. iterable) and has attributes
    .type, .data and .attrs.

    >>> t = Token("starttag", "a", [("href", "http://www.python.org/")])
    >>> t == ("starttag", "a", [("href", "http://www.python.org/")])
    True
    >>> (t.type, t.data) == ("starttag", "a")
    True
    >>> t.attrs == [("href", "http://www.python.org/")]
    True

    Comparing with something that cannot be unpacked into exactly three
    items is simply "not equal" rather than an error:

    >>> t == None
    False
    >>> t != ("starttag", "a")
    True

    Public attributes

    type: one of "starttag", "endtag", "startendtag", "charref", "entityref",
     "data", "comment", "decl", "pi", after the corresponding methods of
     HTMLParser.HTMLParser
    data: For a tag, the tag name; otherwise, the relevant data carried by the
     tag, as a string
    attrs: list of (name, value) pairs representing HTML attributes
     (or None if token does not represent an opening tag)

    """
    def __init__(self, type, data, attrs=None):
        # The parameter name `type` shadows the builtin, but it is part of
        # the public signature and so is kept as-is.
        self.type = type
        self.data = data
        self.attrs = attrs

    def __iter__(self):
        """Iterate over (type, data, attrs), so a Token unpacks like a tuple."""
        return iter((self.type, self.data, self.attrs))

    def __eq__(self, other):
        """Return True if other unpacks to an equal (type, data, attrs) triple.

        other may be another Token or any 3-item iterable.  Objects that are
        not iterable, or that do not yield exactly three items, compare
        unequal instead of raising.
        """
        try:
            other_type, other_data, other_attrs = other
        except (TypeError, ValueError):
            # TypeError: not iterable; ValueError: wrong number of items.
            return False
        return (self.type == other_type and
                self.data == other_data and
                self.attrs == other_attrs)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        args = ", ".join([repr(field) for field in self])
        return "%s(%s)" % (self.__class__.__name__, args)
87
def iter_until_exception(fn, exception, *args, **kwds):
    """Yield fn(*args, **kwds) repeatedly until it raises `exception`.

    fn: callable invoked once per item produced
    exception: exception class (or tuple of classes) that signals the end of
     the sequence; it is swallowed and the generator finishes normally
    args, kwds: passed through unchanged to every call of fn

    Any other exception raised by fn propagates to the caller.
    """
    while 1:
        try:
            yield fn(*args, **kwds)
        except exception:
            # A plain return ends the generator.  Raising StopIteration here
            # would be converted to RuntimeError under PEP 479.
            return
94
95
class _AbstractParser:
    """Pull-style token queue layered on top of a push-style parser.

    This class is a mixin: it calls self.feed(), which must be supplied by
    the parser class it is combined with, and it implements that parser's
    handle_*() callbacks by appending Token objects to an internal queue.
    Callers then pull tokens off the queue with .get_token(), .get_tag(),
    .get_text() etc.; more input is read from the file object, self.chunk
    bytes at a time, only when the queue runs dry.
    """
    chunk = 1024
    compress_re = re.compile(r"\s+")

    def __init__(self, fh, textify=None, encoding="ascii", entitydefs=None):
        """
        fh: file-like object (only a .read() method is required) from which to
         read HTML to be parsed
        textify: mapping used by .get_text() and .get_compressed_text() methods
         to represent opening tags as text (defaults to a fresh
         {"img": "alt", "applet": "alt"} for each parser)
        encoding: encoding used to encode numeric character references by
         .get_text() and .get_compressed_text() ("ascii" by default)

        entitydefs: mapping containing HTML entity definitions (by default,
         htmlentitydefs.name2codepoint is used).  This is used to unescape
         entities in .get_text() (and .get_compressed_text()) and attribute
         values.  If the encoding can not represent the character, the entity
         reference is left unescaped.  Note that entity references (both
         numeric - e.g. &#123; or &#xabc; - and non-numeric - e.g. &amp;) are
         unescaped in attribute values and the return value of .get_text(), but
         not in data outside of tags.  Instead, entity references outside of
         tags are represented as tokens.  This is a bit odd, it's true :-/

        If the element name of an opening tag matches a key in the textify
        mapping then that tag is converted to text.  The corresponding value is
        used to specify which tag attribute to obtain the text from.  textify
        maps from element names to either:

          - an HTML attribute name, in which case the HTML attribute value is
            used as its text value along with the element name in square
            brackets (eg."alt text goes here[IMG]", or, if the alt attribute
            were missing, just "[IMG]")
          - a callable object (eg. a function) which takes a Token and returns
            the string to be used as its text value

        If textify has no key for an element name, nothing is substituted for
        the opening tag.

        Public attributes:

        encoding and textify: see above

        """
        if textify is None:
            # Build the default per instance: .textify is a public attribute,
            # so a shared default dict would leak edits between parsers.
            textify = {"img": "alt", "applet": "alt"}
        if entitydefs is None:
            entitydefs = htmlentitydefs.name2codepoint
        self._fh = fh
        self._tokenstack = []  # FIFO
        self.textify = textify
        self.encoding = encoding
        self._entitydefs = entitydefs

    def __iter__(self): return self

    def tags(self, *names):
        """Return an iterator over .get_tag(*names) until input runs out."""
        return iter_until_exception(self.get_tag, NoMoreTokensError, *names)

    def tokens(self, *tokentypes):
        """Return an iterator over .get_token(*tokentypes) until input runs out."""
        return iter_until_exception(self.get_token, NoMoreTokensError,
                                    *tokentypes)

    def next(self):
        """Iterator protocol: return the next token of any type."""
        try:
            return self.get_token()
        except NoMoreTokensError:
            raise StopIteration()

    # Python 3 spelling of the iterator protocol method.
    __next__ = next

    def get_token(self, *tokentypes):
        """Pop the next Token object from the stack of parsed tokens.

        If arguments are given, they are taken to be token types in which the
        caller is interested: tokens representing other elements will be
        skipped.  Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            # Drain what has already been parsed, discarding unwanted types.
            while self._tokenstack:
                token = self._tokenstack.pop(0)
                if not tokentypes or token.type in tokentypes:
                    return token
            # Queue is empty: parse another chunk, which refills it through
            # the handle_*() callbacks below.
            data = self._fh.read(self.chunk)
            if not data:
                raise NoMoreTokensError()
            self.feed(data)

    def unget_token(self, token):
        """Push a Token back onto the stack."""
        self._tokenstack.insert(0, token)

    def get_tag(self, *names):
        """Return the next Token that represents an opening or closing tag.

        If arguments are given, they are taken to be element names in which the
        caller is interested: tags representing other elements will be skipped.
        Element names must be given in lower case.

        Raises NoMoreTokensError.

        """
        while 1:
            tok = self.get_token()
            if tok.type not in ("starttag", "endtag", "startendtag"):
                continue
            if not names or tok.data in names:
                return tok

    def get_text(self, endat=None):
        """Get some text.

        endat: stop reading text at this tag (the tag is included in the
         returned text); endat is a tuple (type, name) where type is
         "starttag", "endtag" or "startendtag", and name is the element name of
         the tag (element names must be given in lower case)

        If endat is not given, .get_text() will stop at the next opening or
        closing tag, or when there are no more tokens (no exception is raised).
        Note that .get_text() includes the text representation (if any) of the
        opening tag, but pushes the opening tag back onto the stack.  As a
        result, if you want to call .get_text() again, you need to call
        .get_tag() first (unless you want an empty string returned when you
        next call .get_text()).

        Entity references are translated using the value of the entitydefs
        constructor argument (a mapping from names to characters like that
        provided by the standard module htmlentitydefs).  Named entity
        references that are not in this mapping are left unchanged.

        The textify attribute is used to translate opening tags into text: see
        the constructor docstring.

        """
        text = []
        tok = None
        while 1:
            try:
                tok = self.get_token()
            except NoMoreTokensError:
                # unget last token (not the one we just failed to get)
                if tok: self.unget_token(tok)
                break
            if tok.type == "data":
                text.append(tok.data)
            elif tok.type == "entityref":
                t = unescape("&%s;" % tok.data, self._entitydefs, self.encoding)
                text.append(t)
            elif tok.type == "charref":
                t = unescape_charref(tok.data, self.encoding)
                text.append(t)
            elif tok.type in ["starttag", "endtag", "startendtag"]:
                tag_name = tok.data
                if tok.type in ["starttag", "startendtag"]:
                    alt = self.textify.get(tag_name)
                    if alt is not None:
                        if callable(alt):
                            text.append(alt(tok))
                        elif tok.attrs is not None:
                            for k, v in tok.attrs:
                                if k == alt:
                                    text.append(v)
                            # The bracketed element name is appended whether
                            # or not the attribute was present.
                            text.append("[%s]" % tag_name.upper())
                if endat is None or endat == (tok.type, tag_name):
                    self.unget_token(tok)
                    break
            # Tokens of any other type (comment, decl, pi) add no text.
        return "".join(text)

    def get_compressed_text(self, *args, **kwds):
        """
        As .get_text(), but collapses each group of contiguous whitespace to a
        single space character, and removes all initial and trailing
        whitespace.

        Accepts the same arguments as .get_text() (positionally or by
        keyword) and passes them through unchanged.

        """
        text = self.get_text(*args, **kwds)
        text = text.strip()
        return self.compress_re.sub(" ", text)

    # Parser callbacks: each one records the event as a Token on the queue.
    def handle_startendtag(self, tag, attrs):
        self._tokenstack.append(Token("startendtag", tag, attrs))
    def handle_starttag(self, tag, attrs):
        self._tokenstack.append(Token("starttag", tag, attrs))
    def handle_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
    def handle_charref(self, name):
        self._tokenstack.append(Token("charref", name))
    def handle_entityref(self, name):
        self._tokenstack.append(Token("entityref", name))
    def handle_data(self, data):
        self._tokenstack.append(Token("data", data))
    def handle_comment(self, data):
        self._tokenstack.append(Token("comment", data))
    def handle_decl(self, decl):
        self._tokenstack.append(Token("decl", decl))
    def unknown_decl(self, data):
        # XXX should this call self.error instead?
        #self.error("unknown declaration: " + `data`)
        self._tokenstack.append(Token("decl", data))
    def handle_pi(self, data):
        self._tokenstack.append(Token("pi", data))

    def unescape_attr(self, name):
        """Unescape entity references in one attribute value."""
        return unescape(name, self._entitydefs, self.encoding)
    def unescape_attrs(self, attrs):
        """Return a new list of (name, value) pairs with each value unescaped."""
        return [(key, self.unescape_attr(val)) for key, val in attrs]
308
class PullParser(_AbstractParser, HTMLParser.HTMLParser):
    """Pull parser built on HTMLParser.HTMLParser.

    Takes the same constructor arguments as _AbstractParser
    (fh, textify, encoding, entitydefs), positionally or by keyword.
    """
    def __init__(self, *args, **kwds):
        HTMLParser.HTMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unescape(self, name):
        # Use the entitydefs passed into constructor, not
        # HTMLParser.HTMLParser's entitydefs.
        return self.unescape_attr(name)
317
class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser):
    """Pull parser built on sgmllib.SGMLParser.

    Takes the same constructor arguments as _AbstractParser
    (fh, textify, encoding, entitydefs), positionally or by keyword.
    """
    def __init__(self, *args, **kwds):
        sgmllib.SGMLParser.__init__(self)
        _AbstractParser.__init__(self, *args, **kwds)
    def unknown_starttag(self, tag, attrs):
        # Attribute values are unescaped here, using the entitydefs and
        # encoding given to the constructor, before the token is queued.
        attrs = self.unescape_attrs(attrs)
        self._tokenstack.append(Token("starttag", tag, attrs))
    def unknown_endtag(self, tag):
        self._tokenstack.append(Token("endtag", tag))
327
328
def _test():
    """Run this module's doctests and return doctest.testmod()'s result."""
    import doctest
    import sys
    # Test the module object this code was actually loaded as.  Importing
    # the module again by its bare name would create a second copy when run
    # as a script, and fails when it is only reachable inside a package.
    return doctest.testmod(sys.modules[__name__])

if __name__ == "__main__":
    _test()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_pullparser.py @ 3

異なるフォーマットでダウンロード: