root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_html.py @ 3

リビジョン 3, 19.8 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""HTML handling.
2
3Copyright 2003-2006 John J. Lee <jjl@pobox.com>
4
5This code is free software; you can redistribute it and/or modify it under
6the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
7included with the distribution).
8
9"""
10
11import re, copy, htmlentitydefs
12import sgmllib, HTMLParser, ClientForm
13
14import _request
15from _headersutil import split_header_words, is_html as _is_html
16import _rfc3986
17
18DEFAULT_ENCODING = "latin-1"
19
20
# the base class is purely for backwards compatibility
class ParseError(ClientForm.ParseError):
    """Raised when a document cannot be parsed.

    Subclasses ClientForm.ParseError purely for backwards compatibility,
    so code catching the ClientForm exception keeps working.
    """
    pass
23
24
class CachingGeneratorFunction(object):
    """Caching wrapper around a no-arguments iterable.

    The wrapped iterable is consumed at most once; items already produced
    are replayed from an internal cache, so every call to the instance
    yields the full sequence from the start.
    """

    def __init__(self, iterable):
        # Items produced so far, replayed first on every call.
        self._cache = []
        # Hold an iterator rather than the iterable itself so repeated
        # calls can never restart the source and duplicate items.
        self._iterator = iter(iterable)

    def __call__(self):
        seen = self._cache
        for cached_item in seen:
            yield cached_item
        for fresh_item in self._iterator:
            seen.append(fresh_item)
            yield fresh_item
41
42
class EncodingFinder:
    """Determines the character encoding declared by a response.

    Falls back to a fixed default encoding when no charset is declared.
    """

    def __init__(self, default_encoding):
        self._default_encoding = default_encoding

    def encoding(self, response):
        # HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
        # headers may be in the response.  HTTP-EQUIV headers come last,
        # so try in order from first to last.
        for content_type in response.info().getheaders("content-type"):
            params = split_header_words([content_type])[0]
            for key, value in params:
                if key == "charset":
                    return value
        return self._default_encoding
55
class ResponseTypeFinder:
    """Decides whether a response contains an HTML document."""

    def __init__(self, allow_xhtml):
        # When true, XHTML content types are also treated as HTML.
        self._allow_xhtml = allow_xhtml

    def is_html(self, response, encoding):
        # XXX encoding is currently unused here
        content_types = response.info().getheaders("content-type")
        return _is_html(content_types, response.geturl(), self._allow_xhtml)
64
65
66# idea for this argument-processing trick is from Peter Otten
class Args:
    """Attribute-style access over a mapping (trick from Peter Otten).

    Attribute reads fall back to class attributes when the key is absent
    (raising AttributeError for names defined nowhere).  Attribute writes
    are stored in the underlying dictionary, so ``**args.dictionary``
    reflects later modifications.
    """

    def __init__(self, args_map):
        # Assign via __dict__ to bypass __setattr__, which routes every
        # other write into the map itself.
        self.__dict__["dictionary"] = dict(args_map)

    def __getattr__(self, key):
        try:
            return self.dictionary[key]
        except KeyError:
            return getattr(self.__class__, key)

    def __setattr__(self, key, value):
        # Bug fix (matches the later upstream mechanize implementation):
        # previously, assignments such as RobustFormsFactory's
        # ``args.form_parser_class = RobustFormParser`` landed in the
        # instance __dict__ and were invisible to ``**args.dictionary``,
        # silently discarding the intended default.
        if key == "dictionary":
            raise AttributeError(key)
        self.dictionary[key] = value
75
def form_parser_args(
    select_default=False,
    form_parser_class=None,
    request_class=None,
    backwards_compat=False,
    ):
    """Bundle the form-parser keyword arguments into an Args object."""
    kwds = {
        "select_default": select_default,
        "form_parser_class": form_parser_class,
        "request_class": request_class,
        "backwards_compat": backwards_compat,
        }
    return Args(kwds)
83
84
class Link:
    """A hyperlink extracted from an HTML document."""

    def __init__(self, base_url, url, text, tag, attrs):
        assert None not in [url, tag, attrs]
        self.base_url = base_url
        self.absolute_url = _rfc3986.urljoin(base_url, url)
        self.url = url
        self.text = text
        self.tag = tag
        self.attrs = attrs

    def __cmp__(self, other):
        # Links compare equal iff url, text, tag and attrs all match;
        # anything missing an attribute sorts as "less than".
        try:
            for attr_name in ("url", "text", "tag", "attrs"):
                if getattr(self, attr_name) != getattr(other, attr_name):
                    return -1
        except AttributeError:
            return -1
        return 0

    def __repr__(self):
        return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
            self.base_url, self.url, self.text, self.tag, self.attrs)
102
103
104class LinksFactory:
105
106    def __init__(self,
107                 link_parser_class=None,
108                 link_class=Link,
109                 urltags=None,
110                 ):
111        import _pullparser
112        if link_parser_class is None:
113            link_parser_class = _pullparser.TolerantPullParser
114        self.link_parser_class = link_parser_class
115        self.link_class = link_class
116        if urltags is None:
117            urltags = {
118                "a": "href",
119                "area": "href",
120                "frame": "src",
121                "iframe": "src",
122                }
123        self.urltags = urltags
124        self._response = None
125        self._encoding = None
126
127    def set_response(self, response, base_url, encoding):
128        self._response = response
129        self._encoding = encoding
130        self._base_url = base_url
131
132    def links(self):
133        """Return an iterator that provides links of the document."""
134        response = self._response
135        encoding = self._encoding
136        base_url = self._base_url
137        p = self.link_parser_class(response, encoding=encoding)
138
139        try:
140            for token in p.tags(*(self.urltags.keys()+["base"])):
141                if token.type == "endtag":
142                    continue
143                if token.data == "base":
144                    base_href = dict(token.attrs).get("href")
145                    if base_href is not None:
146                        base_url = base_href
147                    continue
148                attrs = dict(token.attrs)
149                tag = token.data
150                name = attrs.get("name")
151                text = None
152                # XXX use attr_encoding for ref'd doc if that doc does not
153                #  provide one by other means
154                #attr_encoding = attrs.get("charset")
155                url = attrs.get(self.urltags[tag])  # XXX is "" a valid URL?
156                if not url:
157                    # Probably an <A NAME="blah"> link or <AREA NOHREF...>.
158                    # For our purposes a link is something with a URL, so
159                    # ignore this.
160                    continue
161
162                url = _rfc3986.clean_url(url, encoding)
163                if tag == "a":
164                    if token.type != "startendtag":
165                        # hmm, this'd break if end tag is missing
166                        text = p.get_compressed_text(("endtag", tag))
167                    # but this doesn't work for eg.
168                    # <a href="blah"><b>Andy</b></a>
169                    #text = p.get_compressed_text()
170
171                yield Link(base_url, url, text, tag, token.attrs)
172        except sgmllib.SGMLParseError, exc:
173            raise ParseError(exc)
174
175class FormsFactory:
176
177    """Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
178
179    After calling .forms(), the .global_form attribute is a form object
180    containing all controls not a descendant of any FORM element.
181
182    For constructor argument docs, see ClientForm.ParseResponse
183    argument docs.
184
185    """
186
187    def __init__(self,
188                 select_default=False,
189                 form_parser_class=None,
190                 request_class=None,
191                 backwards_compat=False,
192                 ):
193        import ClientForm
194        self.select_default = select_default
195        if form_parser_class is None:
196            form_parser_class = ClientForm.FormParser
197        self.form_parser_class = form_parser_class
198        if request_class is None:
199            request_class = _request.Request
200        self.request_class = request_class
201        self.backwards_compat = backwards_compat
202        self._response = None
203        self.encoding = None
204        self.global_form = None
205
206    def set_response(self, response, encoding):
207        self._response = response
208        self.encoding = encoding
209        self.global_form = None
210
211    def forms(self):
212        import ClientForm
213        encoding = self.encoding
214        try:
215            forms = ClientForm.ParseResponseEx(
216                self._response,
217                select_default=self.select_default,
218                form_parser_class=self.form_parser_class,
219                request_class=self.request_class,
220                encoding=encoding,
221                _urljoin=_rfc3986.urljoin,
222                _urlparse=_rfc3986.urlsplit,
223                _urlunparse=_rfc3986.urlunsplit,
224                )
225        except ClientForm.ParseError, exc:
226            raise ParseError(exc)
227        self.global_form = forms[0]
228        return forms[1:]
229
230class TitleFactory:
231    def __init__(self):
232        self._response = self._encoding = None
233
234    def set_response(self, response, encoding):
235        self._response = response
236        self._encoding = encoding
237
238    def title(self):
239        import _pullparser
240        p = _pullparser.TolerantPullParser(
241            self._response, encoding=self._encoding)
242        try:
243            try:
244                p.get_tag("title")
245            except _pullparser.NoMoreTokensError:
246                return None
247            else:
248                return p.get_text()
249        except sgmllib.SGMLParseError, exc:
250            raise ParseError(exc)
251
252
def unescape(data, entities, encoding):
    """Replace HTML entity and character references in `data`.

    entities: mapping of entity name (e.g. "amp") to Unicode code point.
    encoding: byte encoding for replacements; references that cannot be
     encoded (or are unknown) are left as their original entity text.
    Returns `data` unchanged when it is None or contains no "&".
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match):
        ent = match.group()
        if ent[1] == "#":
            # Numeric character reference, e.g. &#38; or &#x26;
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent[1:-1])
        if repl is not None:
            repl = unichr(repl)
            # Encode the Unicode replacement to a byte string; fall back
            # to the literal entity text if it cannot be encoded.
            if type(repl) != type(""):
                try:
                    repl = repl.encode(encoding)
                except UnicodeError:
                    repl = ent
        else:
            # Unknown entity name: leave the reference as-is.
            repl = ent
        return repl

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
275
def unescape_charref(data, encoding):
    """Decode the body of a numeric character reference (e.g. "38", "x26").

    Returns the decoded character, encoded to `encoding` when one is given;
    if encoding fails, the original "&#...;" reference text is returned.
    """
    if data.startswith("x"):
        # Hexadecimal form, e.g. &#x26;
        codepoint = int(data[1:], 16)
    else:
        codepoint = int(data, 10)
    uc = unichr(codepoint)
    if encoding is None:
        return uc
    try:
        return uc.encode(encoding)
    except UnicodeError:
        return "&#%s;" % data
289
290
# bizarre import gymnastics for bundled BeautifulSoup
import _beautifulsoup
import ClientForm
# Build form-parser classes backed by the two bundled BeautifulSoup
# flavours (the second is the extra-lenient "ICantBelieveIts..." variant).
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# (the replacement pattern also accepts hexadecimal character references)
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
300
class MechanizeBs(_beautifulsoup.BeautifulSoup):
    """BeautifulSoup subclass that decodes entity and character references
    using the document encoding supplied at construction time."""

    _entitydefs = htmlentitydefs.name2codepoint
    # don't want the magic Microsoft-char workaround
    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda match: match.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda match: '<!' + match.group(1) + '>')
                      ]

    def __init__(self, encoding, text=None, avoidParserProblems=True,
                 initialTextIsEverything=True):
        self._encoding = encoding
        _beautifulsoup.BeautifulSoup.__init__(
            self, text, avoidParserProblems, initialTextIsEverything)

    def handle_charref(self, ref):
        # Decode numeric references (&#...;) with the document encoding.
        decoded = unescape("&#%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(decoded)

    def handle_entityref(self, ref):
        # Decode named references (&amp; etc.) with the document encoding.
        decoded = unescape("&%s;" % ref, self._entitydefs, self._encoding)
        self.handle_data(decoded)

    def unescape_attrs(self, attrs):
        # Return (key, value) pairs with entity-escaped values decoded.
        return [(key, unescape(val, self._entitydefs, self._encoding))
                for key, val in attrs]
328
class RobustLinksFactory:

    """BeautifulSoup-based equivalent of LinksFactory."""

    # Used to collapse runs of whitespace in link text.
    compress_re = re.compile(r"\s+")

    def __init__(self,
                 link_parser_class=None,
                 link_class=Link,
                 urltags=None,
                 ):
        import _beautifulsoup
        if link_parser_class is None:
            link_parser_class = MechanizeBs
        self.link_parser_class = link_parser_class
        self.link_class = link_class
        if urltags is None:
            urltags = {
                "a": "href",
                "area": "href",
                "frame": "src",
                "iframe": "src",
                }
        self.urltags = urltags
        self._bs = None
        self._encoding = None
        self._base_url = None

    def set_soup(self, soup, base_url, encoding):
        self._bs = soup
        self._base_url = base_url
        self._encoding = encoding

    def links(self):
        """Iterate over links found in the soup set by set_soup()."""
        import _beautifulsoup
        bs = self._bs
        base_url = self._base_url
        encoding = self._encoding
        # (An unused duplicate call to recursiveChildGenerator() was
        # removed here; its result was discarded.)
        for ch in bs.recursiveChildGenerator():
            if (isinstance(ch, _beautifulsoup.Tag) and
                ch.name in self.urltags.keys()+["base"]):
                link = ch
                attrs = bs.unescape_attrs(link.attrs)
                attrs_dict = dict(attrs)
                if link.name == "base":
                    # <base href="..."> rebases all subsequent links.
                    base_href = attrs_dict.get("href")
                    if base_href is not None:
                        base_url = base_href
                    continue
                url_attr = self.urltags[link.name]
                url = attrs_dict.get(url_attr)
                if not url:
                    continue
                url = _rfc3986.clean_url(url, encoding)
                text = link.fetchText(lambda t: True)
                if not text:
                    # follow _pullparser's weird behaviour rigidly
                    if link.name == "a":
                        text = ""
                    else:
                        text = None
                else:
                    text = self.compress_re.sub(" ", " ".join(text).strip())
                # Bug fix: honour the link_class constructor argument
                # instead of always instantiating Link directly.
                yield self.link_class(base_url, url, text, link.name, attrs)
392
393
class RobustFormsFactory(FormsFactory):
    """FormsFactory variant defaulting to the BeautifulSoup-backed
    RobustFormParser."""

    def __init__(self, *args, **kwds):
        # (Removed an unused local `import ClientForm`.)
        args = form_parser_args(*args, **kwds)
        if args.form_parser_class is None:
            # Bug fix: assign through the dictionary so the default
            # actually reaches FormsFactory.__init__ -- a plain attribute
            # assignment on Args lands in the instance __dict__ and is
            # invisible to **args.dictionary.
            args.dictionary["form_parser_class"] = RobustFormParser
        FormsFactory.__init__(self, **args.dictionary)

    def set_response(self, response, encoding):
        # NOTE(review): unlike FormsFactory.set_response this does not
        # reset .global_form -- presumably intentional, but confirm.
        self._response = response
        self.encoding = encoding
405
406
class RobustTitleFactory:
    """Extracts the document title from a BeautifulSoup parse tree."""

    def __init__(self):
        self._bs = self._encoding = None

    def set_soup(self, soup, encoding):
        self._bs = soup
        self._encoding = encoding

    def title(self):
        """Return the page title, or None if there is no <title> tag."""
        import _beautifulsoup
        first_title = self._bs.first("title")
        if first_title == _beautifulsoup.Null:
            return None
        return first_title.firstText(lambda t: True)
422
423
class Factory:
    """Factory for forms, links, etc.

    This interface may expand in future.

    Public methods:

    set_request_class(request_class)
    set_response(response)
    forms()
    links()

    Public attributes:

    Note that accessing these attributes may raise ParseError.

    encoding: string specifying the encoding of response if it contains a text
     document (this value is left unspecified for documents that do not have
     an encoding, e.g. an image file)
    is_html: true if response contains an HTML document (XHTML may be
     regarded as HTML too)
    title: page title, or None if no title or not HTML
    global_form: form object containing all controls that are not descendants
     of any FORM element, or None if the forms_factory does not support
     supplying a global form

    """

    # Attributes computed lazily in __getattr__ on first access and then
    # cached on the instance; set_response() deletes them again so they
    # are recomputed for the new response.
    LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]

    # NOTE: the default finder instances below are created once at class
    # definition time and shared across Factory instances; they hold no
    # per-response state, so sharing is safe.
    def __init__(self, forms_factory, links_factory, title_factory,
                 encoding_finder=EncodingFinder(DEFAULT_ENCODING),
                 response_type_finder=ResponseTypeFinder(allow_xhtml=False),
                 ):
        """

        Pass keyword arguments only.

        default_encoding: character encoding to use if encoding cannot be
         determined (or guessed) from the response.  You should turn on
         HTTP-EQUIV handling if you want the best chance of getting this right
         without resorting to this default.  The default value of this
         parameter (currently latin-1) may change in future.

        """
        self._forms_factory = forms_factory
        self._links_factory = links_factory
        self._title_factory = title_factory
        self._encoding_finder = encoding_finder
        self._response_type_finder = response_type_finder

        # Initialise all cached state to the "no response" condition.
        self.set_response(None)

    def set_request_class(self, request_class):
        """Set urllib2.Request class.

        ClientForm.HTMLForm instances returned by .forms() will return
        instances of this class when .click()ed.

        """
        self._forms_factory.request_class = request_class

    def set_response(self, response):
        """Set response.

        The response must either be None or implement the same interface as
        objects returned by urllib2.urlopen().

        """
        self._response = response
        # Drop the cached forms/links generators for the old response.
        self._forms_genf = self._links_genf = None
        # NOTE(review): _get_title is set but never read in this file;
        # it appears vestigial -- confirm before removing.
        self._get_title = None
        # Delete any cached lazy attributes so __getattr__ recomputes them.
        for name in self.LAZY_ATTRS:
            try:
                delattr(self, name)
            except AttributeError:
                pass

    def __getattr__(self, name):
        # Only reached when `name` is not found on the instance, i.e.
        # before a lazy attribute has been computed and cached.
        if name not in self.LAZY_ATTRS:
            return getattr(self.__class__, name)

        # Each branch assigns the computed value onto the instance, so
        # subsequent reads bypass __getattr__ entirely.
        # NOTE(review): copy.copy(response) is presumably so that sniffing
        # headers/data does not disturb the caller's response object --
        # relies on response objects supporting shallow copy; confirm.
        if name == "encoding":
            self.encoding = self._encoding_finder.encoding(
                copy.copy(self._response))
            return self.encoding
        elif name == "is_html":
            self.is_html = self._response_type_finder.is_html(
                copy.copy(self._response), self.encoding)
            return self.is_html
        elif name == "title":
            if self.is_html:
                self.title = self._title_factory.title()
            else:
                self.title = None
            return self.title
        elif name == "global_form":
            # forms() sets self.global_form as a side effect.
            self.forms()
            return self.global_form

    def forms(self):
        """Return iterable over ClientForm.HTMLForm-like objects.

        Raises mechanize.ParseError on failure.
        """
        # this implementation sets .global_form as a side-effect, for benefit
        # of __getattr__ impl
        if self._forms_genf is None:
            try:
                self._forms_genf = CachingGeneratorFunction(
                    self._forms_factory.forms())
            except:  # XXXX define exception!
                # Reset cached state before re-raising so a later call
                # starts from a clean slate.
                self.set_response(self._response)
                raise
            self.global_form = getattr(
                self._forms_factory, "global_form", None)
        return self._forms_genf()

    def links(self):
        """Return iterable over mechanize.Link-like objects.

        Raises mechanize.ParseError on failure.
        """
        if self._links_genf is None:
            try:
                self._links_genf = CachingGeneratorFunction(
                    self._links_factory.links())
            except:  # XXXX define exception!
                # Reset cached state before re-raising so a later call
                # starts from a clean slate.
                self.set_response(self._response)
                raise
        return self._links_genf()
555
class DefaultFactory(Factory):
    """Based on sgmllib."""

    def __init__(self, i_want_broken_xhtml_support=False):
        Factory.__init__(
            self,
            forms_factory=FormsFactory(),
            links_factory=LinksFactory(),
            title_factory=TitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )

    def set_response(self, response):
        Factory.set_response(self, response)
        if response is None:
            return
        # Hand each sub-factory its own copy of the response so that one
        # factory reading the data does not disturb the others.
        encoding = self.encoding
        self._forms_factory.set_response(copy.copy(response), encoding)
        self._links_factory.set_response(
            copy.copy(response), response.geturl(), encoding)
        self._title_factory.set_response(copy.copy(response), encoding)
577
class RobustFactory(Factory):
    """Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
    DefaultFactory.

    """

    def __init__(self, i_want_broken_xhtml_support=False,
                 soup_class=None):
        Factory.__init__(
            self,
            forms_factory=RobustFormsFactory(),
            links_factory=RobustLinksFactory(),
            title_factory=RobustTitleFactory(),
            response_type_finder=ResponseTypeFinder(
                allow_xhtml=i_want_broken_xhtml_support),
            )
        self._soup_class = MechanizeBs if soup_class is None else soup_class

    def set_response(self, response):
        import _beautifulsoup
        Factory.set_response(self, response)
        if response is None:
            return
        # Parse the whole document once and share the soup between the
        # links and title factories; the forms factory re-reads a copy.
        soup = self._soup_class(self.encoding, response.read())
        self._forms_factory.set_response(
            copy.copy(response), self.encoding)
        self._links_factory.set_soup(soup, response.geturl(), self.encoding)
        self._title_factory.set_soup(soup, self.encoding)
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。