Context Navigation

_html.py @ 3

リビジョン 3, 19.8 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Rev	行番号
[3]	1	"""HTML handling.
	2
	3	Copyright 2003-2006 John J. Lee <jjl@pobox.com>
	4
	5	This code is free software; you can redistribute it and/or modify it under
	6	the terms of the BSD or ZPL 2.1 licenses (see the file COPYING.txt
	7	included with the distribution).
	8
	9	"""
	10
	11	import re, copy, htmlentitydefs
	12	import sgmllib, HTMLParser, ClientForm
	13
	14	import _request
	15	from _headersutil import split_header_words, is_html as _is_html
	16	import _rfc3986
	17
	18	DEFAULT_ENCODING = "latin-1"
	19
	20
	21	# the base classe is purely for backwards compatibility
	22	class ParseError(ClientForm.ParseError): pass
	23
	24
	25	class CachingGeneratorFunction(object):
	26	"""Caching wrapper around a no-arguments iterable."""
	27
	28	def __init__(self, iterable):
	29	self._cache = []
	30	# wrap iterable to make it non-restartable (otherwise, repeated
	31	# __call__ would give incorrect results)
	32	self._iterator = iter(iterable)
	33
	34	def __call__(self):
	35	cache = self._cache
	36	for item in cache:
	37	yield item
	38	for item in self._iterator:
	39	cache.append(item)
	40	yield item
	41
	42
	43	class EncodingFinder:
	44	def __init__(self, default_encoding):
	45	self._default_encoding = default_encoding
	46	def encoding(self, response):
	47	# HTTPEquivProcessor may be in use, so both HTTP and HTTP-EQUIV
	48	# headers may be in the response. HTTP-EQUIV headers come last,
	49	# so try in order from first to last.
	50	for ct in response.info().getheaders("content-type"):
	51	for k, v in split_header_words([ct])[0]:
	52	if k == "charset":
	53	return v
	54	return self._default_encoding
	55
	56	class ResponseTypeFinder:
	57	def __init__(self, allow_xhtml):
	58	self._allow_xhtml = allow_xhtml
	59	def is_html(self, response, encoding):
	60	ct_hdrs = response.info().getheaders("content-type")
	61	url = response.geturl()
	62	# XXX encoding
	63	return _is_html(ct_hdrs, url, self._allow_xhtml)
	64
	65
	66	# idea for this argument-processing trick is from Peter Otten
	67	class Args:
	68	def __init__(self, args_map):
	69	self.dictionary = dict(args_map)
	70	def __getattr__(self, key):
	71	try:
	72	return self.dictionary[key]
	73	except KeyError:
	74	return getattr(self.__class__, key)
	75
	76	def form_parser_args(
	77	select_default=False,
	78	form_parser_class=None,
	79	request_class=None,
	80	backwards_compat=False,
	81	):
	82	return Args(locals())
	83
	84
	85	class Link:
	86	def __init__(self, base_url, url, text, tag, attrs):
	87	assert None not in [url, tag, attrs]
	88	self.base_url = base_url
	89	self.absolute_url = _rfc3986.urljoin(base_url, url)
	90	self.url, self.text, self.tag, self.attrs = url, text, tag, attrs
	91	def __cmp__(self, other):
	92	try:
	93	for name in "url", "text", "tag", "attrs":
	94	if getattr(self, name) != getattr(other, name):
	95	return -1
	96	except AttributeError:
	97	return -1
	98	return 0
	99	def __repr__(self):
	100	return "Link(base_url=%r, url=%r, text=%r, tag=%r, attrs=%r)" % (
	101	self.base_url, self.url, self.text, self.tag, self.attrs)
	102
	103
	104	class LinksFactory:
	105
	106	def __init__(self,
	107	link_parser_class=None,
	108	link_class=Link,
	109	urltags=None,
	110	):
	111	import _pullparser
	112	if link_parser_class is None:
	113	link_parser_class = _pullparser.TolerantPullParser
	114	self.link_parser_class = link_parser_class
	115	self.link_class = link_class
	116	if urltags is None:
	117	urltags = {
	118	"a": "href",
	119	"area": "href",
	120	"frame": "src",
	121	"iframe": "src",
	122	}
	123	self.urltags = urltags
	124	self._response = None
	125	self._encoding = None
	126
	127	def set_response(self, response, base_url, encoding):
	128	self._response = response
	129	self._encoding = encoding
	130	self._base_url = base_url
	131
	132	def links(self):
	133	"""Return an iterator that provides links of the document."""
	134	response = self._response
	135	encoding = self._encoding
	136	base_url = self._base_url
	137	p = self.link_parser_class(response, encoding=encoding)
	138
	139	try:
	140	for token in p.tags(*(self.urltags.keys()+["base"])):
	141	if token.type == "endtag":
	142	continue
	143	if token.data == "base":
	144	base_href = dict(token.attrs).get("href")
	145	if base_href is not None:
	146	base_url = base_href
	147	continue
	148	attrs = dict(token.attrs)
	149	tag = token.data
	150	name = attrs.get("name")
	151	text = None
	152	# XXX use attr_encoding for ref'd doc if that doc does not
	153	# provide one by other means
	154	#attr_encoding = attrs.get("charset")
	155	url = attrs.get(self.urltags[tag]) # XXX is "" a valid URL?
	156	if not url:
	157	# Probably an <A NAME="blah"> link or <AREA NOHREF...>.
	158	# For our purposes a link is something with a URL, so
	159	# ignore this.
	160	continue
	161
	162	url = _rfc3986.clean_url(url, encoding)
	163	if tag == "a":
	164	if token.type != "startendtag":
	165	# hmm, this'd break if end tag is missing
	166	text = p.get_compressed_text(("endtag", tag))
	167	# but this doesn't work for eg.
	168	# <a href="blah"><b>Andy</b></a>
	169	#text = p.get_compressed_text()
	170
	171	yield Link(base_url, url, text, tag, token.attrs)
	172	except sgmllib.SGMLParseError, exc:
	173	raise ParseError(exc)
	174
	175	class FormsFactory:
	176
	177	"""Makes a sequence of objects satisfying ClientForm.HTMLForm interface.
	178
	179	After calling .forms(), the .global_form attribute is a form object
	180	containing all controls not a descendant of any FORM element.
	181
	182	For constructor argument docs, see ClientForm.ParseResponse
	183	argument docs.
	184
	185	"""
	186
	187	def __init__(self,
	188	select_default=False,
	189	form_parser_class=None,
	190	request_class=None,
	191	backwards_compat=False,
	192	):
	193	import ClientForm
	194	self.select_default = select_default
	195	if form_parser_class is None:
	196	form_parser_class = ClientForm.FormParser
	197	self.form_parser_class = form_parser_class
	198	if request_class is None:
	199	request_class = _request.Request
	200	self.request_class = request_class
	201	self.backwards_compat = backwards_compat
	202	self._response = None
	203	self.encoding = None
	204	self.global_form = None
	205
	206	def set_response(self, response, encoding):
	207	self._response = response
	208	self.encoding = encoding
	209	self.global_form = None
	210
	211	def forms(self):
	212	import ClientForm
	213	encoding = self.encoding
	214	try:
	215	forms = ClientForm.ParseResponseEx(
	216	self._response,
	217	select_default=self.select_default,
	218	form_parser_class=self.form_parser_class,
	219	request_class=self.request_class,
	220	encoding=encoding,
	221	_urljoin=_rfc3986.urljoin,
	222	_urlparse=_rfc3986.urlsplit,
	223	_urlunparse=_rfc3986.urlunsplit,
	224	)
	225	except ClientForm.ParseError, exc:
	226	raise ParseError(exc)
	227	self.global_form = forms[0]
	228	return forms[1:]
	229
	230	class TitleFactory:
	231	def __init__(self):
	232	self._response = self._encoding = None
	233
	234	def set_response(self, response, encoding):
	235	self._response = response
	236	self._encoding = encoding
	237
	238	def title(self):
	239	import _pullparser
	240	p = _pullparser.TolerantPullParser(
	241	self._response, encoding=self._encoding)
	242	try:
	243	try:
	244	p.get_tag("title")
	245	except _pullparser.NoMoreTokensError:
	246	return None
	247	else:
	248	return p.get_text()
	249	except sgmllib.SGMLParseError, exc:
	250	raise ParseError(exc)
	251
	252
	253	def unescape(data, entities, encoding):
	254	if data is None or "&" not in data:
	255	return data
	256
	257	def replace_entities(match):
	258	ent = match.group()
	259	if ent[1] == "#":
	260	return unescape_charref(ent[2:-1], encoding)
	261
	262	repl = entities.get(ent[1:-1])
	263	if repl is not None:
	264	repl = unichr(repl)
	265	if type(repl) != type(""):
	266	try:
	267	repl = repl.encode(encoding)
	268	except UnicodeError:
	269	repl = ent
	270	else:
	271	repl = ent
	272	return repl
	273
	274	return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
	275
	276	def unescape_charref(data, encoding):
	277	name, base = data, 10
	278	if name.startswith("x"):
	279	name, base= name[1:], 16
	280	uc = unichr(int(name, base))
	281	if encoding is None:
	282	return uc
	283	else:
	284	try:
	285	repl = uc.encode(encoding)
	286	except UnicodeError:
	287	repl = "&#%s;" % data
	288	return repl
	289
	290
# bizarre import gymnastics for bundled BeautifulSoup
# The two form parser classes are built by ClientForm from the bundled
# BeautifulSoup classes and published here as module-level names.
import _beautifulsoup
import ClientForm
RobustFormParser, NestingRobustFormParser = ClientForm._create_bs_classes(
    _beautifulsoup.BeautifulSoup, _beautifulsoup.ICantBelieveItsBeautifulSoup
    )
# monkeypatch sgmllib to fix http://www.python.org/sf/803422 :-(
# NOTE: this rebinds sgmllib.charref on the shared module object, so it
# affects every user of sgmllib in the process.  The replacement pattern
# captures an optional "x" followed by hexadecimal digits and requires one
# terminating non-hex character.
import sgmllib
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
	300
	301	class MechanizeBs(_beautifulsoup.BeautifulSoup):
	302	_entitydefs = htmlentitydefs.name2codepoint
	303	# don't want the magic Microsoft-char workaround
	304	PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
	305	lambda(x):x.group(1) + ' />'),
	306	(re.compile('<!\s+([^<>]*)>'),
	307	lambda(x):'<!' + x.group(1) + '>')
	308	]
	309
	310	def __init__(self, encoding, text=None, avoidParserProblems=True,
	311	initialTextIsEverything=True):
	312	self._encoding = encoding
	313	_beautifulsoup.BeautifulSoup.__init__(
	314	self, text, avoidParserProblems, initialTextIsEverything)
	315
	316	def handle_charref(self, ref):
	317	t = unescape("&#%s;"%ref, self._entitydefs, self._encoding)
	318	self.handle_data(t)
	319	def handle_entityref(self, ref):
	320	t = unescape("&%s;"%ref, self._entitydefs, self._encoding)
	321	self.handle_data(t)
	322	def unescape_attrs(self, attrs):
	323	escaped_attrs = []
	324	for key, val in attrs:
	325	val = unescape(val, self._entitydefs, self._encoding)
	326	escaped_attrs.append((key, val))
	327	return escaped_attrs
	328
	329	class RobustLinksFactory:
	330
	331	compress_re = re.compile(r"\s+")
	332
	333	def __init__(self,
	334	link_parser_class=None,
	335	link_class=Link,
	336	urltags=None,
	337	):
	338	import _beautifulsoup
	339	if link_parser_class is None:
	340	link_parser_class = MechanizeBs
	341	self.link_parser_class = link_parser_class
	342	self.link_class = link_class
	343	if urltags is None:
	344	urltags = {
	345	"a": "href",
	346	"area": "href",
	347	"frame": "src",
	348	"iframe": "src",
	349	}
	350	self.urltags = urltags
	351	self._bs = None
	352	self._encoding = None
	353	self._base_url = None
	354
	355	def set_soup(self, soup, base_url, encoding):
	356	self._bs = soup
	357	self._base_url = base_url
	358	self._encoding = encoding
	359
	360	def links(self):
	361	import _beautifulsoup
	362	bs = self._bs
	363	base_url = self._base_url
	364	encoding = self._encoding
	365	gen = bs.recursiveChildGenerator()
	366	for ch in bs.recursiveChildGenerator():
	367	if (isinstance(ch, _beautifulsoup.Tag) and
	368	ch.name in self.urltags.keys()+["base"]):
	369	link = ch
	370	attrs = bs.unescape_attrs(link.attrs)
	371	attrs_dict = dict(attrs)
	372	if link.name == "base":
	373	base_href = attrs_dict.get("href")
	374	if base_href is not None:
	375	base_url = base_href
	376	continue
	377	url_attr = self.urltags[link.name]
	378	url = attrs_dict.get(url_attr)
	379	if not url:
	380	continue
	381	url = _rfc3986.clean_url(url, encoding)
	382	text = link.fetchText(lambda t: True)
	383	if not text:
	384	# follow _pullparser's weird behaviour rigidly
	385	if link.name == "a":
	386	text = ""
	387	else:
	388	text = None
	389	else:
	390	text = self.compress_re.sub(" ", " ".join(text).strip())
	391	yield Link(base_url, url, text, link.name, attrs)
	392
	393
	394	class RobustFormsFactory(FormsFactory):
	395	def __init__(self, args, *kwds):
	396	import ClientForm
	397	args = form_parser_args(args, *kwds)
	398	if args.form_parser_class is None:
	399	args.form_parser_class = RobustFormParser
	400	FormsFactory.__init__(self, **args.dictionary)
	401
	402	def set_response(self, response, encoding):
	403	self._response = response
	404	self.encoding = encoding
	405
	406
	407	class RobustTitleFactory:
	408	def __init__(self):
	409	self._bs = self._encoding = None
	410
	411	def set_soup(self, soup, encoding):
	412	self._bs = soup
	413	self._encoding = encoding
	414
	415	def title(self):
	416	import _beautifulsoup
	417	title = self._bs.first("title")
	418	if title == _beautifulsoup.Null:
	419	return None
	420	else:
	421	return title.firstText(lambda t: True)
	422
	423
	424	class Factory:
	425	"""Factory for forms, links, etc.
	426
	427	This interface may expand in future.
	428
	429	Public methods:
	430
	431	set_request_class(request_class)
	432	set_response(response)
	433	forms()
	434	links()
	435
	436	Public attributes:
	437
	438	Note that accessing these attributes may raise ParseError.
	439
	440	encoding: string specifying the encoding of response if it contains a text
	441	document (this value is left unspecified for documents that do not have
	442	an encoding, e.g. an image file)
	443	is_html: true if response contains an HTML document (XHTML may be
	444	regarded as HTML too)
	445	title: page title, or None if no title or not HTML
	446	global_form: form object containing all controls that are not descendants
	447	of any FORM element, or None if the forms_factory does not support
	448	supplying a global form
	449
	450	"""
	451
	452	LAZY_ATTRS = ["encoding", "is_html", "title", "global_form"]
	453
	454	def __init__(self, forms_factory, links_factory, title_factory,
	455	encoding_finder=EncodingFinder(DEFAULT_ENCODING),
	456	response_type_finder=ResponseTypeFinder(allow_xhtml=False),
	457	):
	458	"""
	459
	460	Pass keyword arguments only.
	461
	462	default_encoding: character encoding to use if encoding cannot be
	463	determined (or guessed) from the response. You should turn on
	464	HTTP-EQUIV handling if you want the best chance of getting this right
	465	without resorting to this default. The default value of this
	466	parameter (currently latin-1) may change in future.
	467
	468	"""
	469	self._forms_factory = forms_factory
	470	self._links_factory = links_factory
	471	self._title_factory = title_factory
	472	self._encoding_finder = encoding_finder
	473	self._response_type_finder = response_type_finder
	474
	475	self.set_response(None)
	476
	477	def set_request_class(self, request_class):
	478	"""Set urllib2.Request class.
	479
	480	ClientForm.HTMLForm instances returned by .forms() will return
	481	instances of this class when .click()ed.
	482
	483	"""
	484	self._forms_factory.request_class = request_class
	485
	486	def set_response(self, response):
	487	"""Set response.
	488
	489	The response must either be None or implement the same interface as
	490	objects returned by urllib2.urlopen().
	491
	492	"""
	493	self._response = response
	494	self._forms_genf = self._links_genf = None
	495	self._get_title = None
	496	for name in self.LAZY_ATTRS:
	497	try:
	498	delattr(self, name)
	499	except AttributeError:
	500	pass
	501
	502	def __getattr__(self, name):
	503	if name not in self.LAZY_ATTRS:
	504	return getattr(self.__class__, name)
	505
	506	if name == "encoding":
	507	self.encoding = self._encoding_finder.encoding(
	508	copy.copy(self._response))
	509	return self.encoding
	510	elif name == "is_html":
	511	self.is_html = self._response_type_finder.is_html(
	512	copy.copy(self._response), self.encoding)
	513	return self.is_html
	514	elif name == "title":
	515	if self.is_html:
	516	self.title = self._title_factory.title()
	517	else:
	518	self.title = None
	519	return self.title
	520	elif name == "global_form":
	521	self.forms()
	522	return self.global_form
	523
	524	def forms(self):
	525	"""Return iterable over ClientForm.HTMLForm-like objects.
	526
	527	Raises mechanize.ParseError on failure.
	528	"""
	529	# this implementation sets .global_form as a side-effect, for benefit
	530	# of __getattr__ impl
	531	if self._forms_genf is None:
	532	try:
	533	self._forms_genf = CachingGeneratorFunction(
	534	self._forms_factory.forms())
	535	except: # XXXX define exception!
	536	self.set_response(self._response)
	537	raise
	538	self.global_form = getattr(
	539	self._forms_factory, "global_form", None)
	540	return self._forms_genf()
	541
	542	def links(self):
	543	"""Return iterable over mechanize.Link-like objects.
	544
	545	Raises mechanize.ParseError on failure.
	546	"""
	547	if self._links_genf is None:
	548	try:
	549	self._links_genf = CachingGeneratorFunction(
	550	self._links_factory.links())
	551	except: # XXXX define exception!
	552	self.set_response(self._response)
	553	raise
	554	return self._links_genf()
	555
	556	class DefaultFactory(Factory):
	557	"""Based on sgmllib."""
	558	def __init__(self, i_want_broken_xhtml_support=False):
	559	Factory.__init__(
	560	self,
	561	forms_factory=FormsFactory(),
	562	links_factory=LinksFactory(),
	563	title_factory=TitleFactory(),
	564	response_type_finder=ResponseTypeFinder(
	565	allow_xhtml=i_want_broken_xhtml_support),
	566	)
	567
	568	def set_response(self, response):
	569	Factory.set_response(self, response)
	570	if response is not None:
	571	self._forms_factory.set_response(
	572	copy.copy(response), self.encoding)
	573	self._links_factory.set_response(
	574	copy.copy(response), response.geturl(), self.encoding)
	575	self._title_factory.set_response(
	576	copy.copy(response), self.encoding)
	577
	578	class RobustFactory(Factory):
	579	"""Based on BeautifulSoup, hopefully a bit more robust to bad HTML than is
	580	DefaultFactory.
	581
	582	"""
	583	def __init__(self, i_want_broken_xhtml_support=False,
	584	soup_class=None):
	585	Factory.__init__(
	586	self,
	587	forms_factory=RobustFormsFactory(),
	588	links_factory=RobustLinksFactory(),
	589	title_factory=RobustTitleFactory(),
	590	response_type_finder=ResponseTypeFinder(
	591	allow_xhtml=i_want_broken_xhtml_support),
	592	)
	593	if soup_class is None:
	594	soup_class = MechanizeBs
	595	self._soup_class = soup_class
	596
	597	def set_response(self, response):
	598	import _beautifulsoup
	599	Factory.set_response(self, response)
	600	if response is not None:
	601	data = response.read()
	602	soup = self._soup_class(self.encoding, data)
	603	self._forms_factory.set_response(
	604	copy.copy(response), self.encoding)
	605	self._links_factory.set_soup(
	606	soup, response.geturl(), self.encoding)
	607	self._title_factory.set_soup(soup, self.encoding)

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_html.py @ 3

異なるフォーマットでダウンロード: