Context Navigation

BeautifulSoup.py @ 2

リビジョン 2, 69.9 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	"""Beautiful Soup
	2	Elixir and Tonic
	3	"The Screen-Scraper's Friend"
	4	http://www.crummy.com/software/BeautifulSoup/
	5
	6	Beautiful Soup parses a (possibly invalid) XML or HTML document into a
	7	tree representation. It provides methods and Pythonic idioms that make
	8	it easy to navigate, search, and modify the tree.
	9
	10	A well-structured XML/HTML document yields a well-behaved data
	11	structure. An ill-structured XML/HTML document yields a
	12	correspondingly ill-behaved data structure. If your document is only
	13	locally well-structured, you can use this library to find and process
	14	the well-structured part of it.
	15
	16	Beautiful Soup works with Python 2.2 and up. It has no external
	17	dependencies, but you'll have more success at converting data to UTF-8
	18	if you also install these three packages:
	19
	20	* chardet, for auto-detecting character encodings
	21	http://chardet.feedparser.org/
	22	* cjkcodecs and iconv_codec, which add more encodings to the ones supported
	23	by stock Python.
	24	http://cjkpython.i18n.org/
	25
	26	Beautiful Soup defines classes for two main parsing strategies:
	27
	28	* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
	29	language that kind of looks like XML.
	30
	31	* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
	32	or invalid. This class has web browser-like heuristics for
	33	obtaining a sensible parse tree in the face of common HTML errors.
	34
	35	Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
	36	the encoding of an HTML or XML document, and converting it to
	37	Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
	38	Parser.
	39
	40	For more than you ever wanted to know about Beautiful Soup, see the
	41	documentation:
	42	http://www.crummy.com/software/BeautifulSoup/documentation.html
	43	"""
	44	from __future__ import generators
	45
	46	__author__ = "Leonard Richardson (crummy.com)"
	47	__contributors__ = ["Sam Ruby (intertwingly.net)",
	48	"the unwitting Mark Pilgrim (diveintomark.org)",
	49	"http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
	50	__version__ = "3.0.3"
	51	__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
	52	__license__ = "PSF"
	53
	54	from sgmllib import SGMLParser, SGMLParseError
	55	import codecs
	56	import types
	57	import re
	58	import sgmllib
	59	from htmlentitydefs import name2codepoint
	60
	61	# This RE makes Beautiful Soup able to parse XML with namespaces.
	62	sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
	63
	64	# This RE makes Beautiful Soup capable of recognizing numeric character
	65	# references that use hexadecimal.
	66	sgmllib.charref = re.compile('&#(\d+\|x[0-9a-fA-F]+);')
	67
	68	DEFAULT_OUTPUT_ENCODING = "utf-8"
	69
	70	# First, the classes that represent markup elements.
	71
	72	class PageElement:
	73	"""Contains the navigational information for some part of the page
	74	(either a tag or a piece of text)"""
	75
	76	def setup(self, parent=None, previous=None):
	77	"""Sets up the initial relations between this element and
	78	other elements."""
	79	self.parent = parent
	80	self.previous = previous
	81	self.next = None
	82	self.previousSibling = None
	83	self.nextSibling = None
	84	if self.parent and self.parent.contents:
	85	self.previousSibling = self.parent.contents[-1]
	86	self.previousSibling.nextSibling = self
	87
	88	def replaceWith(self, replaceWith):
	89	oldParent = self.parent
	90	myIndex = self.parent.contents.index(self)
	91	if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
	92	# We're replacing this element with one of its siblings.
	93	index = self.parent.contents.index(replaceWith)
	94	if index and index < myIndex:
	95	# Furthermore, it comes before this element. That
	96	# means that when we extract it, the index of this
	97	# element will change.
	98	myIndex = myIndex - 1
	99	self.extract()
	100	oldParent.insert(myIndex, replaceWith)
	101
	102	def extract(self):
	103	"""Destructively rips this element out of the tree."""
	104	if self.parent:
	105	try:
	106	self.parent.contents.remove(self)
	107	except ValueError:
	108	pass
	109
	110	#Find the two elements that would be next to each other if
	111	#this element (and any children) hadn't been parsed. Connect
	112	#the two.
	113	lastChild = self._lastRecursiveChild()
	114	nextElement = lastChild.next
	115
	116	if self.previous:
	117	self.previous.next = nextElement
	118	if nextElement:
	119	nextElement.previous = self.previous
	120	self.previous = None
	121	lastChild.next = None
	122
	123	self.parent = None
	124	if self.previousSibling:
	125	self.previousSibling.nextSibling = self.nextSibling
	126	if self.nextSibling:
	127	self.nextSibling.previousSibling = self.previousSibling
	128	self.previousSibling = self.nextSibling = None
	129
	130	def _lastRecursiveChild(self):
	131	"Finds the last element beneath this object to be parsed."
	132	lastChild = self
	133	while hasattr(lastChild, 'contents') and lastChild.contents:
	134	lastChild = lastChild.contents[-1]
	135	return lastChild
	136
	137	def insert(self, position, newChild):
	138	if (isinstance(newChild, basestring)
	139	or isinstance(newChild, unicode)) \
	140	and not isinstance(newChild, NavigableString):
	141	newChild = NavigableString(newChild)
	142
	143	position = min(position, len(self.contents))
	144	if hasattr(newChild, 'parent') and newChild.parent != None:
	145	# We're 'inserting' an element that's already one
	146	# of this object's children.
	147	if newChild.parent == self:
	148	index = self.find(newChild)
	149	if index and index < position:
	150	# Furthermore we're moving it further down the
	151	# list of this object's children. That means that
	152	# when we extract this element, our target index
	153	# will jump down one.
	154	position = position - 1
	155	newChild.extract()
	156
	157	newChild.parent = self
	158	previousChild = None
	159	if position == 0:
	160	newChild.previousSibling = None
	161	newChild.previous = self
	162	else:
	163	previousChild = self.contents[position-1]
	164	newChild.previousSibling = previousChild
	165	newChild.previousSibling.nextSibling = newChild
	166	newChild.previous = previousChild._lastRecursiveChild()
	167	if newChild.previous:
	168	newChild.previous.next = newChild
	169
	170	newChildsLastElement = newChild._lastRecursiveChild()
	171
	172	if position >= len(self.contents):
	173	newChild.nextSibling = None
	174
	175	parent = self
	176	parentsNextSibling = None
	177	while not parentsNextSibling:
	178	parentsNextSibling = parent.nextSibling
	179	parent = parent.parent
	180	if not parent: # This is the last element in the document.
	181	break
	182	if parentsNextSibling:
	183	newChildsLastElement.next = parentsNextSibling
	184	else:
	185	newChildsLastElement.next = None
	186	else:
	187	nextChild = self.contents[position]
	188	newChild.nextSibling = nextChild
	189	if newChild.nextSibling:
	190	newChild.nextSibling.previousSibling = newChild
	191	newChildsLastElement.next = nextChild
	192
	193	if newChildsLastElement.next:
	194	newChildsLastElement.next.previous = newChildsLastElement
	195	self.contents.insert(position, newChild)
	196
	197	def findNext(self, name=None, attrs={}, text=None, **kwargs):
	198	"""Returns the first item that matches the given criteria and
	199	appears after this Tag in the document."""
	200	return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
	201
	202	def findAllNext(self, name=None, attrs={}, text=None, limit=None,
	203	**kwargs):
	204	"""Returns all items that match the given criteria and appear
	205	before after Tag in the document."""
	206	return self._findAll(name, attrs, text, limit, self.nextGenerator)
	207
	208	def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
	209	"""Returns the closest sibling to this Tag that matches the
	210	given criteria and appears after this Tag in the document."""
	211	return self._findOne(self.findNextSiblings, name, attrs, text,
	212	**kwargs)
	213
	214	def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
	215	**kwargs):
	216	"""Returns the siblings of this Tag that match the given
	217	criteria and appear after this Tag in the document."""
	218	return self._findAll(name, attrs, text, limit,
	219	self.nextSiblingGenerator, **kwargs)
	220	fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
	221
	222	def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
	223	"""Returns the first item that matches the given criteria and
	224	appears before this Tag in the document."""
	225	return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
	226
	227	def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
	228	**kwargs):
	229	"""Returns all items that match the given criteria and appear
	230	before this Tag in the document."""
	231	return self._findAll(name, attrs, text, limit, self.previousGenerator,
	232	**kwargs)
	233	fetchPrevious = findAllPrevious # Compatibility with pre-3.x
	234
	235	def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
	236	"""Returns the closest sibling to this Tag that matches the
	237	given criteria and appears before this Tag in the document."""
	238	return self._findOne(self.findPreviousSiblings, name, attrs, text,
	239	**kwargs)
	240
	241	def findPreviousSiblings(self, name=None, attrs={}, text=None,
	242	limit=None, **kwargs):
	243	"""Returns the siblings of this Tag that match the given
	244	criteria and appear before this Tag in the document."""
	245	return self._findAll(name, attrs, text, limit,
	246	self.previousSiblingGenerator, **kwargs)
	247	fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
	248
	249	def findParent(self, name=None, attrs={}, **kwargs):
	250	"""Returns the closest parent of this Tag that matches the given
	251	criteria."""
	252	# NOTE: We can't use _findOne because findParents takes a different
	253	# set of arguments.
	254	r = None
	255	l = self.findParents(name, attrs, 1)
	256	if l:
	257	r = l[0]
	258	return r
	259
	260	def findParents(self, name=None, attrs={}, limit=None, **kwargs):
	261	"""Returns the parents of this Tag that match the given
	262	criteria."""
	263
	264	return self._findAll(name, attrs, None, limit, self.parentGenerator,
	265	**kwargs)
	266	fetchParents = findParents # Compatibility with pre-3.x
	267
	268	#These methods do the real heavy lifting.
	269
	270	def _findOne(self, method, name, attrs, text, **kwargs):
	271	r = None
	272	l = method(name, attrs, text, 1, **kwargs)
	273	if l:
	274	r = l[0]
	275	return r
	276
	277	def _findAll(self, name, attrs, text, limit, generator, **kwargs):
	278	"Iterates over a generator looking for things that match."
	279
	280	if isinstance(name, SoupStrainer):
	281	strainer = name
	282	else:
	283	# Build a SoupStrainer
	284	strainer = SoupStrainer(name, attrs, text, **kwargs)
	285	results = ResultSet(strainer)
	286	g = generator()
	287	while True:
	288	try:
	289	i = g.next()
	290	except StopIteration:
	291	break
	292	if i:
	293	found = strainer.search(i)
	294	if found:
	295	results.append(found)
	296	if limit and len(results) >= limit:
	297	break
	298	return results
	299
	300	#These Generators can be used to navigate starting from both
	301	#NavigableStrings and Tags.
	302	def nextGenerator(self):
	303	i = self
	304	while i:
	305	i = i.next
	306	yield i
	307
	308	def nextSiblingGenerator(self):
	309	i = self
	310	while i:
	311	i = i.nextSibling
	312	yield i
	313
	314	def previousGenerator(self):
	315	i = self
	316	while i:
	317	i = i.previous
	318	yield i
	319
	320	def previousSiblingGenerator(self):
	321	i = self
	322	while i:
	323	i = i.previousSibling
	324	yield i
	325
	326	def parentGenerator(self):
	327	i = self
	328	while i:
	329	i = i.parent
	330	yield i
	331
	332	# Utility methods
	333	def substituteEncoding(self, str, encoding=None):
	334	encoding = encoding or "utf-8"
	335	return str.replace("%SOUP-ENCODING%", encoding)
	336
	337	def toEncoding(self, s, encoding=None):
	338	"""Encodes an object to a string in some encoding, or to Unicode.
	339	."""
	340	if isinstance(s, unicode):
	341	if encoding:
	342	s = s.encode(encoding)
	343	elif isinstance(s, str):
	344	if encoding:
	345	s = s.encode(encoding)
	346	else:
	347	s = unicode(s)
	348	else:
	349	if encoding:
	350	s = self.toEncoding(str(s), encoding)
	351	else:
	352	s = unicode(s)
	353	return s
	354
	355	class NavigableString(unicode, PageElement):
	356
	357	def __getattr__(self, attr):
	358	"""text.string gives you text. This is for backwards
	359	compatibility for NavigableString, but for CData it lets you
	360	get the string without the CData wrapper."""
	361	if attr == 'string':
	362	return self
	363	else:
	364	raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
	365
	366	def __unicode__(self):
	367	return __str__(self, None)
	368
	369	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	370	if encoding:
	371	return self.encode(encoding)
	372	else:
	373	return self
	374
	375	class CData(NavigableString):
	376
	377	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	378	return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
	379
	380	class ProcessingInstruction(NavigableString):
	381	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	382	output = self
	383	if "%SOUP-ENCODING%" in output:
	384	output = self.substituteEncoding(output, encoding)
	385	return "<?%s?>" % self.toEncoding(output, encoding)
	386
	387	class Comment(NavigableString):
	388	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	389	return "<!--%s-->" % NavigableString.__str__(self, encoding)
	390
	391	class Declaration(NavigableString):
	392	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	393	return "<!%s>" % NavigableString.__str__(self, encoding)
	394
	395	class Tag(PageElement):
	396	"""Represents a found HTML tag with its attributes and contents."""
	397
	398	XML_ENTITIES_TO_CHARS = { 'apos' : "'",
	399	"quot" : '"',
	400	"amp" : "&",
	401	"lt" : "<",
	402	"gt" : ">"
	403	}
	404	# An RE for finding ampersands that aren't the start of of a
	405	# numeric entity.
	406	BARE_AMPERSAND = re.compile("&(?!#\d+;\|#x[0-9a-fA-F]+;\|\w+;)")
	407
	408	def __init__(self, parser, name, attrs=None, parent=None,
	409	previous=None):
	410	"Basic constructor."
	411
	412	# We don't actually store the parser object: that lets extracted
	413	# chunks be garbage-collected
	414	self.parserClass = parser.__class__
	415	self.isSelfClosing = parser.isSelfClosingTag(name)
	416	self.convertHTMLEntities = parser.convertHTMLEntities
	417	self.name = name
	418	if attrs == None:
	419	attrs = []
	420	self.attrs = attrs
	421	self.contents = []
	422	self.setup(parent, previous)
	423	self.hidden = False
	424	self.containsSubstitutions = False
	425
	426	def get(self, key, default=None):
	427	"""Returns the value of the 'key' attribute for the tag, or
	428	the value given for 'default' if it doesn't have that
	429	attribute."""
	430	return self._getAttrMap().get(key, default)
	431
	432	def has_key(self, key):
	433	return self._getAttrMap().has_key(key)
	434
	435	def __getitem__(self, key):
	436	"""tag[key] returns the value of the 'key' attribute for the tag,
	437	and throws an exception if it's not there."""
	438	return self._getAttrMap()[key]
	439
	440	def __iter__(self):
	441	"Iterating over a tag iterates over its contents."
	442	return iter(self.contents)
	443
	444	def __len__(self):
	445	"The length of a tag is the length of its list of contents."
	446	return len(self.contents)
	447
	448	def __contains__(self, x):
	449	return x in self.contents
	450
	451	def __nonzero__(self):
	452	"A tag is non-None even if it has no contents."
	453	return True
	454
	455	def __setitem__(self, key, value):
	456	"""Setting tag[key] sets the value of the 'key' attribute for the
	457	tag."""
	458	self._getAttrMap()
	459	self.attrMap[key] = value
	460	found = False
	461	for i in range(0, len(self.attrs)):
	462	if self.attrs[i][0] == key:
	463	self.attrs[i] = (key, value)
	464	found = True
	465	if not found:
	466	self.attrs.append((key, value))
	467	self._getAttrMap()[key] = value
	468
	469	def __delitem__(self, key):
	470	"Deleting tag[key] deletes all 'key' attributes for the tag."
	471	for item in self.attrs:
	472	if item[0] == key:
	473	self.attrs.remove(item)
	474	#We don't break because bad HTML can define the same
	475	#attribute multiple times.
	476	self._getAttrMap()
	477	if self.attrMap.has_key(key):
	478	del self.attrMap[key]
	479
	480	def __call__(self, args, *kwargs):
	481	"""Calling a tag like a function is the same as calling its
	482	findAll() method. Eg. tag('a') returns a list of all the A tags
	483	found within this tag."""
	484	return apply(self.findAll, args, kwargs)
	485
	486	def __getattr__(self, tag):
	487	#print "Getattr %s.%s" % (self.__class__, tag)
	488	if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
	489	return self.find(tag[:-3])
	490	elif tag.find('__') != 0:
	491	return self.find(tag)
	492
	493	def __eq__(self, other):
	494	"""Returns true iff this tag has the same name, the same attributes,
	495	and the same contents (recursively) as the given tag.
	496
	497	NOTE: right now this will return false if two tags have the
	498	same attributes in a different order. Should this be fixed?"""
	499	if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
	500	return False
	501	for i in range(0, len(self.contents)):
	502	if self.contents[i] != other.contents[i]:
	503	return False
	504	return True
	505
	506	def __ne__(self, other):
	507	"""Returns true iff this tag is not identical to the other tag,
	508	as defined in __eq__."""
	509	return not self == other
	510
	511	def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
	512	"""Renders this tag as a string."""
	513	return self.__str__(encoding)
	514
	515	def __unicode__(self):
	516	return self.__str__(None)
	517
	518	def _convertEntities(self, match):
	519	x = match.group(1)
	520	if x in name2codepoint:
	521	return unichr(name2codepoint[x])
	522	elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
	523	return '&%s;' % x
	524	else:
	525	return '&%s;' % x
	526
	527	def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
	528	prettyPrint=False, indentLevel=0):
	529	"""Returns a string or Unicode representation of this tag and
	530	its contents. To get Unicode, pass None for encoding.
	531
	532	NOTE: since Python's HTML parser consumes whitespace, this
	533	method is not certain to reproduce the whitespace present in
	534	the original string."""
	535
	536	encodedName = self.toEncoding(self.name, encoding)
	537
	538	attrs = []
	539	if self.attrs:
	540	for key, val in self.attrs:
	541	fmt = '%s="%s"'
	542	if isString(val):
	543	if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
	544	val = self.substituteEncoding(val, encoding)
	545
	546	# The attribute value either:
	547	#
	548	# * Contains no embedded double quotes or single quotes.
	549	# No problem: we enclose it in double quotes.
	550	# * Contains embedded single quotes. No problem:
	551	# double quotes work here too.
	552	# * Contains embedded double quotes. No problem:
	553	# we enclose it in single quotes.
	554	# * Embeds both single _and_ double quotes. This
	555	# can't happen naturally, but it can happen if
	556	# you modify an attribute value after parsing
	557	# the document. Now we have a bit of a
	558	# problem. We solve it by enclosing the
	559	# attribute in single quotes, and escaping any
	560	# embedded single quotes to XML entities.
	561	if '"' in val:
	562	# This can't happen naturally, but it can happen
	563	# if you modify an attribute value after parsing.
	564	if "'" in val:
	565	val = val.replace('"', """)
	566	else:
	567	fmt = "%s='%s'"
	568
	569	# Optionally convert any HTML entities
	570	if self.convertHTMLEntities:
	571	val = re.sub("&(\w+);", self._convertEntities, val)
	572
	573	# Now we're okay w/r/t quotes. But the attribute
	574	# value might also contain angle brackets, or
	575	# ampersands that aren't part of entities. We need
	576	# to escape those to XML entities too.
	577	val = val.replace("<", "<").replace(">", ">")
	578	val = self.BARE_AMPERSAND.sub("&", val)
	579
	580
	581	attrs.append(fmt % (self.toEncoding(key, encoding),
	582	self.toEncoding(val, encoding)))
	583	close = ''
	584	closeTag = ''
	585	if self.isSelfClosing:
	586	close = ' /'
	587	else:
	588	closeTag = '</%s>' % encodedName
	589
	590	indentTag, indentContents = 0, 0
	591	if prettyPrint:
	592	indentTag = indentLevel
	593	space = (' ' * (indentTag-1))
	594	indentContents = indentTag + 1
	595	contents = self.renderContents(encoding, prettyPrint, indentContents)
	596	if self.hidden:
	597	s = contents
	598	else:
	599	s = []
	600	attributeString = ''
	601	if attrs:
	602	attributeString = ' ' + ' '.join(attrs)
	603	if prettyPrint:
	604	s.append(space)
	605	s.append('<%s%s%s>' % (encodedName, attributeString, close))
	606	if prettyPrint:
	607	s.append("\n")
	608	s.append(contents)
	609	if prettyPrint and contents and contents[-1] != "\n":
	610	s.append("\n")
	611	if prettyPrint and closeTag:
	612	s.append(space)
	613	s.append(closeTag)
	614	if prettyPrint and closeTag and self.nextSibling:
	615	s.append("\n")
	616	s = ''.join(s)
	617	return s
	618
	619	def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
	620	return self.__str__(encoding, True)
	621
	622	def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
	623	prettyPrint=False, indentLevel=0):
	624	"""Renders the contents of this tag as a string in the given
	625	encoding. If encoding is None, returns a Unicode string.."""
	626	s=[]
	627	for c in self:
	628	text = None
	629	if isinstance(c, NavigableString):
	630	text = c.__str__(encoding)
	631	elif isinstance(c, Tag):
	632	s.append(c.__str__(encoding, prettyPrint, indentLevel))
	633	if text and prettyPrint:
	634	text = text.strip()
	635	if text:
	636	if prettyPrint:
	637	s.append(" " * (indentLevel-1))
	638	s.append(text)
	639	if prettyPrint:
	640	s.append("\n")
	641	return ''.join(s)
	642
	643	#Soup methods
	644
	645	def find(self, name=None, attrs={}, recursive=True, text=None,
	646	**kwargs):
	647	"""Return only the first child of this Tag matching the given
	648	criteria."""
	649	r = None
	650	l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
	651	if l:
	652	r = l[0]
	653	return r
	654	findChild = find
	655
	656	def findAll(self, name=None, attrs={}, recursive=True, text=None,
	657	limit=None, **kwargs):
	658	"""Extracts a list of Tag objects that match the given
	659	criteria. You can specify the name of the Tag and any
	660	attributes you want the Tag to have.
	661
	662	The value of a key-value pair in the 'attrs' map can be a
	663	string, a list of strings, a regular expression object, or a
	664	callable that takes a string and returns whether or not the
	665	string matches for some custom definition of 'matches'. The
	666	same is true of the tag name."""
	667	generator = self.recursiveChildGenerator
	668	if not recursive:
	669	generator = self.childGenerator
	670	return self._findAll(name, attrs, text, limit, generator, **kwargs)
	671	findChildren = findAll
	672
	673	# Pre-3.x compatibility methods
	674	first = find
	675	fetch = findAll
	676
	677	def fetchText(self, text=None, recursive=True, limit=None):
	678	return self.findAll(text=text, recursive=recursive, limit=limit)
	679
	680	def firstText(self, text=None, recursive=True):
	681	return self.find(text=text, recursive=recursive)
	682
	683	#Utility methods
	684
	685	def append(self, tag):
	686	"""Appends the given tag to the contents of this tag."""
	687	self.contents.append(tag)
	688
	689	#Private methods
	690
	691	def _getAttrMap(self):
	692	"""Initializes a map representation of this tag's attributes,
	693	if not already initialized."""
	694	if not getattr(self, 'attrMap'):
	695	self.attrMap = {}
	696	for (key, value) in self.attrs:
	697	self.attrMap[key] = value
	698	return self.attrMap
	699
	700	#Generator methods
	701	def childGenerator(self):
	702	for i in range(0, len(self.contents)):
	703	yield self.contents[i]
	704	raise StopIteration
	705
	706	def recursiveChildGenerator(self):
	707	stack = [(self, 0)]
	708	while stack:
	709	tag, start = stack.pop()
	710	if isinstance(tag, Tag):
	711	for i in range(start, len(tag.contents)):
	712	a = tag.contents[i]
	713	yield a
	714	if isinstance(a, Tag) and tag.contents:
	715	if i < len(tag.contents) - 1:
	716	stack.append((tag, i+1))
	717	stack.append((a, 0))
	718	break
	719	raise StopIteration
	720
	721	# Next, a couple classes to represent queries and their results.
	722	class SoupStrainer:
	723	"""Encapsulates a number of ways of matching a markup element (tag or
	724	text)."""
	725
	726	def __init__(self, name=None, attrs={}, text=None, **kwargs):
	727	self.name = name
	728	if isString(attrs):
	729	kwargs['class'] = attrs
	730	attrs = None
	731	if kwargs:
	732	if attrs:
	733	attrs = attrs.copy()
	734	attrs.update(kwargs)
	735	else:
	736	attrs = kwargs
	737	self.attrs = attrs
	738	self.text = text
	739
	740	def __str__(self):
	741	if self.text:
	742	return self.text
	743	else:
	744	return "%s\|%s" % (self.name, self.attrs)
	745
	746	def searchTag(self, markupName=None, markupAttrs={}):
	747	found = None
	748	markup = None
	749	if isinstance(markupName, Tag):
	750	markup = markupName
	751	markupAttrs = markup
	752	callFunctionWithTagData = callable(self.name) \
	753	and not isinstance(markupName, Tag)
	754
	755	if (not self.name) \
	756	or callFunctionWithTagData \
	757	or (markup and self._matches(markup, self.name)) \
	758	or (not markup and self._matches(markupName, self.name)):
	759	if callFunctionWithTagData:
	760	match = self.name(markupName, markupAttrs)
	761	else:
	762	match = True
	763	markupAttrMap = None
	764	for attr, matchAgainst in self.attrs.items():
	765	if not markupAttrMap:
	766	if hasattr(markupAttrs, 'get'):
	767	markupAttrMap = markupAttrs
	768	else:
	769	markupAttrMap = {}
	770	for k,v in markupAttrs:
	771	markupAttrMap[k] = v
	772	attrValue = markupAttrMap.get(attr)
	773	if not self._matches(attrValue, matchAgainst):
	774	match = False
	775	break
	776	if match:
	777	if markup:
	778	found = markup
	779	else:
	780	found = markupName
	781	return found
	782
	783	def search(self, markup):
	784	#print 'looking for %s in %s' % (self, markup)
	785	found = None
	786	# If given a list of items, scan it for a text element that
	787	# matches.
	788	if isList(markup) and not isinstance(markup, Tag):
	789	for element in markup:
	790	if isinstance(element, NavigableString) \
	791	and self.search(element):
	792	found = element
	793	break
	794	# If it's a Tag, make sure its name or attributes match.
	795	# Don't bother with Tags if we're searching for text.
	796	elif isinstance(markup, Tag):
	797	if not self.text:
	798	found = self.searchTag(markup)
	799	# If it's text, make sure the text matches.
	800	elif isinstance(markup, NavigableString) or \
	801	isString(markup):
	802	if self._matches(markup, self.text):
	803	found = markup
	804	else:
	805	raise Exception, "I don't know how to match against a %s" \
	806	% markup.__class__
	807	return found
	808
	809	def _matches(self, markup, matchAgainst):
	810	#print "Matching %s against %s" % (markup, matchAgainst)
	811	result = False
	812	if matchAgainst == True and type(matchAgainst) == types.BooleanType:
	813	result = markup != None
	814	elif callable(matchAgainst):
	815	result = matchAgainst(markup)
	816	else:
	817	#Custom match methods take the tag as an argument, but all
	818	#other ways of matching match the tag name as a string.
	819	if isinstance(markup, Tag):
	820	markup = markup.name
	821	if markup and not isString(markup):
	822	markup = unicode(markup)
	823	#Now we know that chunk is either a string, or None.
	824	if hasattr(matchAgainst, 'match'):
	825	# It's a regexp object.
	826	result = markup and matchAgainst.search(markup)
	827	elif isList(matchAgainst):
	828	result = markup in matchAgainst
	829	elif hasattr(matchAgainst, 'items'):
	830	result = markup.has_key(matchAgainst)
	831	elif matchAgainst and isString(markup):
	832	if isinstance(markup, unicode):
	833	matchAgainst = unicode(matchAgainst)
	834	else:
	835	matchAgainst = str(matchAgainst)
	836
	837	if not result:
	838	result = matchAgainst == markup
	839	return result
	840
	841	class ResultSet(list):
	842	"""A ResultSet is just a list that keeps track of the SoupStrainer
	843	that created it."""
	844	def __init__(self, source):
	845	list.__init__([])
	846	self.source = source
	847
	848	# Now, some helper functions.
	849
	850	def isList(l):
	851	"""Convenience method that works with all 2.x versions of Python
	852	to determine whether or not something is listlike."""
	853	return hasattr(l, '__iter__') \
	854	or (type(l) in (types.ListType, types.TupleType))
	855
	856	def isString(s):
	857	"""Convenience method that works with all 2.x versions of Python
	858	to determine whether or not something is stringlike."""
	859	try:
	860	return isinstance(s, unicode) or isintance(s, basestring)
	861	except NameError:
	862	return isinstance(s, str)
	863
	864	def buildTagMap(default, *args):
	865	"""Turns a list of maps, lists, or scalars into a single map.
	866	Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
	867	NESTING_RESET_TAGS maps out of lists and partial maps."""
	868	built = {}
	869	for portion in args:
	870	if hasattr(portion, 'items'):
	871	#It's a map. Merge it.
	872	for k,v in portion.items():
	873	built[k] = v
	874	elif isList(portion):
	875	#It's a list. Map each item to the default.
	876	for k in portion:
	877	built[k] = default
	878	else:
	879	#It's a scalar. Map it to the default.
	880	built[portion] = default
	881	return built
	882
	883	# Now, the parser classes.
	884
	885	class BeautifulStoneSoup(Tag, SGMLParser):
	886
	887	"""This class contains the basic parser and search code. It defines
	888	a parser that knows nothing about tag behavior except for the
	889	following:
	890
	891	You can't close a tag without closing all the tags it encloses.
	892	That is, "<foo><bar></foo>" actually means
	893	"<foo><bar></bar></foo>".
	894
	895	[Another possible explanation is "<foo><bar /></foo>", but since
	896	this class defines no SELF_CLOSING_TAGS, it will never use that
	897	explanation.]
	898
	899	This class is useful for parsing XML or made-up markup languages,
	900	or when BeautifulSoup makes an assumption counter to what you were
	901	expecting."""
	902
	903	SELF_CLOSING_TAGS = {}
	904	NESTABLE_TAGS = {}
	905	RESET_NESTING_TAGS = {}
	906	QUOTE_TAGS = {}
	907
	908	MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
	909	lambda x: x.group(1) + ' />'),
	910	(re.compile('<!\s+([^<>]*)>'),
	911	lambda x: '<!' + x.group(1) + '>')
	912	]
	913
	914	ROOT_TAG_NAME = u'[document]'
	915
	916	HTML_ENTITIES = "html"
	917	XML_ENTITIES = "xml"
	918	ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
	919
	920	def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
	921	markupMassage=True, smartQuotesTo=XML_ENTITIES,
	922	convertEntities=None, selfClosingTags=None):
	923	"""The Soup object is initialized as the 'root tag', and the
	924	provided markup (which can be a string or a file-like object)
	925	is fed into the underlying parser.
	926
	927	sgmllib will process most bad HTML, and the BeautifulSoup
	928	class has some tricks for dealing with some HTML that kills
	929	sgmllib, but Beautiful Soup can nonetheless choke or lose data
	930	if your data uses self-closing tags or declarations
	931	incorrectly.
	932
	933	By default, Beautiful Soup uses regexes to sanitize input,
	934	avoiding the vast majority of these problems. If the problems
	935	don't apply to you, pass in False for markupMassage, and
	936	you'll get better performance.
	937
	938	The default parser massage techniques fix the two most common
	939	instances of invalid HTML that choke sgmllib:
	940
	941	<br/> (No space between name of closing tag and tag close)
	942	<! --Comment--> (Extraneous whitespace in declaration)
	943
	944	You can pass in a custom list of (RE object, replace method)
	945	tuples to get Beautiful Soup to scrub your input the way you
	946	want."""
	947
	948	self.parseOnlyThese = parseOnlyThese
	949	self.fromEncoding = fromEncoding
	950	self.smartQuotesTo = smartQuotesTo
	951
	952	if convertEntities:
	953	# It doesn't make sense to convert encoded characters to
	954	# entities even while you're converting entities to Unicode.
	955	# Just convert it all to Unicode.
	956	self.smartQuotesTo = None
	957
	958	if isList(convertEntities):
	959	self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
	960	self.convertXMLEntities = self.XML_ENTITIES in convertEntities
	961	else:
	962	self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
	963	self.convertXMLEntities = self.XML_ENTITIES == convertEntities
	964
	965	self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
	966	SGMLParser.__init__(self)
	967
	968	if hasattr(markup, 'read'): # It's a file-type object.
	969	markup = markup.read()
	970	self.markup = markup
	971	self.markupMassage = markupMassage
	972	try:
	973	self._feed()
	974	except StopParsing:
	975	pass
	976	self.markup = None # The markup can now be GCed
	977
	978	def _feed(self, inDocumentEncoding=None):
	979	# Convert the document to Unicode.
	980	markup = self.markup
	981	if isinstance(markup, unicode):
	982	if not hasattr(self, 'originalEncoding'):
	983	self.originalEncoding = None
	984	else:
	985	dammit = UnicodeDammit\
	986	(markup, [self.fromEncoding, inDocumentEncoding],
	987	smartQuotesTo=self.smartQuotesTo)
	988	markup = dammit.unicode
	989	self.originalEncoding = dammit.originalEncoding
	990	if markup:
	991	if self.markupMassage:
	992	if not isList(self.markupMassage):
	993	self.markupMassage = self.MARKUP_MASSAGE
	994	for fix, m in self.markupMassage:
	995	markup = fix.sub(m, markup)
	996	self.reset()
	997
	998	SGMLParser.feed(self, markup or "")
	999	SGMLParser.close(self)
	1000	# Close out any unfinished strings and close all the open tags.
	1001	self.endData()
	1002	while self.currentTag.name != self.ROOT_TAG_NAME:
	1003	self.popTag()
	1004
	1005	def __getattr__(self, methodName):
	1006	"""This method routes method call requests to either the SGMLParser
	1007	superclass or the Tag superclass, depending on the method name."""
	1008	#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
	1009
	1010	if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
	1011	or methodName.find('do_') == 0:
	1012	return SGMLParser.__getattr__(self, methodName)
	1013	elif methodName.find('__') != 0:
	1014	return Tag.__getattr__(self, methodName)
	1015	else:
	1016	raise AttributeError
	1017
	1018	def isSelfClosingTag(self, name):
	1019	"""Returns true iff the given string is the name of a
	1020	self-closing tag according to this parser."""
	1021	return self.SELF_CLOSING_TAGS.has_key(name) \
	1022	or self.instanceSelfClosingTags.has_key(name)
	1023
	1024	def reset(self):
	1025	Tag.__init__(self, self, self.ROOT_TAG_NAME)
	1026	self.hidden = 1
	1027	SGMLParser.reset(self)
	1028	self.currentData = []
	1029	self.currentTag = None
	1030	self.tagStack = []
	1031	self.quoteStack = []
	1032	self.pushTag(self)
	1033
	1034	def popTag(self):
	1035	tag = self.tagStack.pop()
	1036	# Tags with just one string-owning child get the child as a
	1037	# 'string' property, so that soup.tag.string is shorthand for
	1038	# soup.tag.contents[0]
	1039	if len(self.currentTag.contents) == 1 and \
	1040	isinstance(self.currentTag.contents[0], NavigableString):
	1041	self.currentTag.string = self.currentTag.contents[0]
	1042
	1043	#print "Pop", tag.name
	1044	if self.tagStack:
	1045	self.currentTag = self.tagStack[-1]
	1046	return self.currentTag
	1047
	1048	def pushTag(self, tag):
	1049	#print "Push", tag.name
	1050	if self.currentTag:
	1051	self.currentTag.append(tag)
	1052	self.tagStack.append(tag)
	1053	self.currentTag = self.tagStack[-1]
	1054
	1055	def endData(self, containerClass=NavigableString):
	1056	if self.currentData:
	1057	currentData = ''.join(self.currentData)
	1058	if currentData.endswith('<') and self.convertHTMLEntities:
	1059	currentData = currentData[:-1] + '<'
	1060	if not currentData.strip():
	1061	if '\n' in currentData:
	1062	currentData = '\n'
	1063	else:
	1064	currentData = ' '
	1065	self.currentData = []
	1066	if self.parseOnlyThese and len(self.tagStack) <= 1 and \
	1067	(not self.parseOnlyThese.text or \
	1068	not self.parseOnlyThese.search(currentData)):
	1069	return
	1070	o = containerClass(currentData)
	1071	o.setup(self.currentTag, self.previous)
	1072	if self.previous:
	1073	self.previous.next = o
	1074	self.previous = o
	1075	self.currentTag.contents.append(o)
	1076
	1077
	1078	def _popToTag(self, name, inclusivePop=True):
	1079	"""Pops the tag stack up to and including the most recent
	1080	instance of the given tag. If inclusivePop is false, pops the tag
	1081	stack up to but not including the most recent instqance of
	1082	the given tag."""
	1083	#print "Popping to %s" % name
	1084	if name == self.ROOT_TAG_NAME:
	1085	return
	1086
	1087	numPops = 0
	1088	mostRecentTag = None
	1089	for i in range(len(self.tagStack)-1, 0, -1):
	1090	if name == self.tagStack[i].name:
	1091	numPops = len(self.tagStack)-i
	1092	break
	1093	if not inclusivePop:
	1094	numPops = numPops - 1
	1095
	1096	for i in range(0, numPops):
	1097	mostRecentTag = self.popTag()
	1098	return mostRecentTag
	1099
	1100	def _smartPop(self, name):
	1101
	1102	"""We need to pop up to the previous tag of this type, unless
	1103	one of this tag's nesting reset triggers comes between this
	1104	tag and the previous tag of this type, OR unless this tag is a
	1105	generic nesting trigger and another generic nesting trigger
	1106	comes between this tag and the previous tag of this type.
	1107
	1108	Examples:
	1109	<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
	1110	<p>Foo<table>Bar<p> should pop to 'table', not 'p'.
	1111	<p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
	1112	<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
	1113
	1114	<li><ul><li> <li> should pop to 'ul', not the first 'li'.
	1115	<tr><table><tr> <tr> should pop to 'table', not the first 'tr'
	1116	<td><tr><td> <td> should pop to 'tr', not the first 'td'
	1117	"""
	1118
	1119	nestingResetTriggers = self.NESTABLE_TAGS.get(name)
	1120	isNestable = nestingResetTriggers != None
	1121	isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
	1122	popTo = None
	1123	inclusive = True
	1124	for i in range(len(self.tagStack)-1, 0, -1):
	1125	p = self.tagStack[i]
	1126	if (not p or p.name == name) and not isNestable:
	1127	#Non-nestable tags get popped to the top or to their
	1128	#last occurance.
	1129	popTo = name
	1130	break
	1131	if (nestingResetTriggers != None
	1132	and p.name in nestingResetTriggers) \
	1133	or (nestingResetTriggers == None and isResetNesting
	1134	and self.RESET_NESTING_TAGS.has_key(p.name)):
	1135
	1136	#If we encounter one of the nesting reset triggers
	1137	#peculiar to this tag, or we encounter another tag
	1138	#that causes nesting to reset, pop up to but not
	1139	#including that tag.
	1140	popTo = p.name
	1141	inclusive = False
	1142	break
	1143	p = p.parent
	1144	if popTo:
	1145	self._popToTag(popTo, inclusive)
	1146
	1147	def unknown_starttag(self, name, attrs, selfClosing=0):
	1148	#print "Start tag %s: %s" % (name, attrs)
	1149	if self.quoteStack:
	1150	#This is not a real tag.
	1151	#print "<%s> is not real!" % name
	1152	attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
	1153	self.currentData.append('<%s%s>' % (name, attrs))
	1154	return
	1155	self.endData()
	1156
	1157	if not self.isSelfClosingTag(name) and not selfClosing:
	1158	self._smartPop(name)
	1159
	1160	if self.parseOnlyThese and len(self.tagStack) <= 1 \
	1161	and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
	1162	return
	1163
	1164	tag = Tag(self, name, attrs, self.currentTag, self.previous)
	1165	if self.previous:
	1166	self.previous.next = tag
	1167	self.previous = tag
	1168	self.pushTag(tag)
	1169	if selfClosing or self.isSelfClosingTag(name):
	1170	self.popTag()
	1171	if name in self.QUOTE_TAGS:
	1172	#print "Beginning quote (%s)" % name
	1173	self.quoteStack.append(name)
	1174	self.literal = 1
	1175	return tag
	1176
	1177	def unknown_endtag(self, name):
	1178	#print "End tag %s" % name
	1179	if self.quoteStack and self.quoteStack[-1] != name:
	1180	#This is not a real end tag.
	1181	#print "</%s> is not real!" % name
	1182	self.currentData.append('</%s>' % name)
	1183	return
	1184	self.endData()
	1185	self._popToTag(name)
	1186	if self.quoteStack and self.quoteStack[-1] == name:
	1187	self.quoteStack.pop()
	1188	self.literal = (len(self.quoteStack) > 0)
	1189
	1190	def handle_data(self, data):
	1191	if self.convertHTMLEntities:
	1192	if data[0] == '&':
	1193	data = self.BARE_AMPERSAND.sub("&",data)
	1194	else:
	1195	data = data.replace('&','&') \
	1196	.replace('<','<') \
	1197	.replace('>','>')
	1198	self.currentData.append(data)
	1199
	1200	def _toStringSubclass(self, text, subclass):
	1201	"""Adds a certain piece of text to the tree as a NavigableString
	1202	subclass."""
	1203	self.endData()
	1204	self.handle_data(text)
	1205	self.endData(subclass)
	1206
	1207	def handle_pi(self, text):
	1208	"""Handle a processing instruction as a ProcessingInstruction
	1209	object, possibly one with a %SOUP-ENCODING% slot into which an
	1210	encoding will be plugged later."""
	1211	if text[:3] == "xml":
	1212	text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
	1213	self._toStringSubclass(text, ProcessingInstruction)
	1214
	1215	def handle_comment(self, text):
	1216	"Handle comments as Comment objects."
	1217	self._toStringSubclass(text, Comment)
	1218
	1219	def handle_charref(self, ref):
	1220	"Handle character references as data."
	1221	if ref[0] == 'x':
	1222	data = unichr(int(ref[1:],16))
	1223	else:
	1224	data = unichr(int(ref))
	1225
	1226	if u'\x80' <= data <= u'\x9F':
	1227	data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
	1228	elif not self.convertHTMLEntities and not self.convertXMLEntities:
	1229	data = '&#%s;' % ref
	1230
	1231	self.handle_data(data)
	1232
	1233	def handle_entityref(self, ref):
	1234	"""Handle entity references as data, possibly converting known
	1235	HTML entity references to the corresponding Unicode
	1236	characters."""
	1237	replaceWithXMLEntity = self.convertXMLEntities and \
	1238	self.XML_ENTITIES_TO_CHARS.has_key(ref)
	1239	if self.convertHTMLEntities or replaceWithXMLEntity:
	1240	try:
	1241	data = unichr(name2codepoint[ref])
	1242	except KeyError:
	1243	if replaceWithXMLEntity:
	1244	data = self.XML_ENTITIES_TO_CHARS.get(ref)
	1245	else:
	1246	data="&%s" % ref
	1247	else:
	1248	data = '&%s;' % ref
	1249	self.handle_data(data)
	1250
	1251	def handle_decl(self, data):
	1252	"Handle DOCTYPEs and the like as Declaration objects."
	1253	self._toStringSubclass(data, Declaration)
	1254
	1255	def parse_declaration(self, i):
	1256	"""Treat a bogus SGML declaration as raw data. Treat a CDATA
	1257	declaration as a CData object."""
	1258	j = None
	1259	if self.rawdata[i:i+9] == '<![CDATA[':
	1260	k = self.rawdata.find(']]>', i)
	1261	if k == -1:
	1262	k = len(self.rawdata)
	1263	data = self.rawdata[i+9:k]
	1264	j = k+3
	1265	self._toStringSubclass(data, CData)
	1266	else:
	1267	try:
	1268	j = SGMLParser.parse_declaration(self, i)
	1269	except SGMLParseError:
	1270	toHandle = self.rawdata[i:]
	1271	self.handle_data(toHandle)
	1272	j = i + len(toHandle)
	1273	return j
	1274
	1275	class BeautifulSoup(BeautifulStoneSoup):
	1276
	1277	"""This parser knows the following facts about HTML:
	1278
	1279	* Some tags have no closing tag and should be interpreted as being
	1280	closed as soon as they are encountered.
	1281
	1282	* The text inside some tags (ie. 'script') may contain tags which
	1283	are not really part of the document and which should be parsed
	1284	as text, not tags. If you want to parse the text as tags, you can
	1285	always fetch it and parse it explicitly.
	1286
	1287	* Tag nesting rules:
	1288
	1289	Most tags can't be nested at all. For instance, the occurance of
	1290	a <p> tag should implicitly close the previous <p> tag.
	1291
	1292	<p>Para1<p>Para2
	1293	should be transformed into:
	1294	<p>Para1</p><p>Para2
	1295
	1296	Some tags can be nested arbitrarily. For instance, the occurance
	1297	of a <blockquote> tag should _not_ implicitly close the previous
	1298	<blockquote> tag.
	1299
	1300	Alice said: <blockquote>Bob said: <blockquote>Blah
	1301	should NOT be transformed into:
	1302	Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
	1303
	1304	Some tags can be nested, but the nesting is reset by the
	1305	interposition of other tags. For instance, a <tr> tag should
	1306	implicitly close the previous <tr> tag within the same <table>,
	1307	but not close a <tr> tag in another table.
	1308
	1309	<table><tr>Blah<tr>Blah
	1310	should be transformed into:
	1311	<table><tr>Blah</tr><tr>Blah
	1312	but,
	1313	<tr>Blah<table><tr>Blah
	1314	should NOT be transformed into
	1315	<tr>Blah<table></tr><tr>Blah
	1316
	1317	Differing assumptions about tag nesting rules are a major source
	1318	of problems with the BeautifulSoup class. If BeautifulSoup is not
	1319	treating as nestable a tag your page author treats as nestable,
	1320	try ICantBelieveItsBeautifulSoup, MinimalSoup, or
	1321	BeautifulStoneSoup before writing your own subclass."""
	1322
	1323	def __init__(self, args, *kwargs):
	1324	if not kwargs.has_key('smartQuotesTo'):
	1325	kwargs['smartQuotesTo'] = self.HTML_ENTITIES
	1326	BeautifulStoneSoup.__init__(self, args, *kwargs)
	1327
	1328	SELF_CLOSING_TAGS = buildTagMap(None,
	1329	['br' , 'hr', 'input', 'img', 'meta',
	1330	'spacer', 'link', 'frame', 'base'])
	1331
	1332	QUOTE_TAGS = {'script': None}
	1333
	1334	#According to the HTML standard, each of these inline tags can
	1335	#contain another tag of the same type. Furthermore, it's common
	1336	#to actually use these tags this way.
	1337	NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
	1338	'center']
	1339
	1340	#According to the HTML standard, these block tags can contain
	1341	#another tag of the same type. Furthermore, it's common
	1342	#to actually use these tags this way.
	1343	NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
	1344
	1345	#Lists can contain other lists, but there are restrictions.
	1346	NESTABLE_LIST_TAGS = { 'ol' : [],
	1347	'ul' : [],
	1348	'li' : ['ul', 'ol'],
	1349	'dl' : [],
	1350	'dd' : ['dl'],
	1351	'dt' : ['dl'] }
	1352
	1353	#Tables can contain other tables, but there are restrictions.
	1354	NESTABLE_TABLE_TAGS = {'table' : [],
	1355	'tr' : ['table', 'tbody', 'tfoot', 'thead'],
	1356	'td' : ['tr'],
	1357	'th' : ['tr'],
	1358	'thead' : ['table'],
	1359	'tbody' : ['table'],
	1360	'tfoot' : ['table'],
	1361	}
	1362
	1363	NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
	1364
	1365	#If one of these tags is encountered, all tags up to the next tag of
	1366	#this type are popped.
	1367	RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
	1368	NON_NESTABLE_BLOCK_TAGS,
	1369	NESTABLE_LIST_TAGS,
	1370	NESTABLE_TABLE_TAGS)
	1371
	1372	NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
	1373	NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
	1374
	1375	# Used to detect the charset in a META tag; see start_meta
	1376	CHARSET_RE = re.compile("((^\|;)\scharset=)([^;])")
	1377
	1378	def start_meta(self, attrs):
	1379	"""Beautiful Soup can detect a charset included in a META tag,
	1380	try to convert the document to that charset, and re-parse the
	1381	document from the beginning."""
	1382	httpEquiv = None
	1383	contentType = None
	1384	contentTypeIndex = None
	1385	tagNeedsEncodingSubstitution = False
	1386
	1387	for i in range(0, len(attrs)):
	1388	key, value = attrs[i]
	1389	key = key.lower()
	1390	if key == 'http-equiv':
	1391	httpEquiv = value
	1392	elif key == 'content':
	1393	contentType = value
	1394	contentTypeIndex = i
	1395
	1396	if httpEquiv and contentType: # It's an interesting meta tag.
	1397	match = self.CHARSET_RE.search(contentType)
	1398	if match:
	1399	if getattr(self, 'declaredHTMLEncoding') or \
	1400	(self.originalEncoding == self.fromEncoding):
	1401	# This is our second pass through the document, or
	1402	# else an encoding was specified explicitly and it
	1403	# worked. Rewrite the meta tag.
	1404	newAttr = self.CHARSET_RE.sub\
	1405	(lambda(match):match.group(1) +
	1406	"%SOUP-ENCODING%", value)
	1407	attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
	1408	newAttr)
	1409	tagNeedsEncodingSubstitution = True
	1410	else:
	1411	# This is our first pass through the document.
	1412	# Go through it again with the new information.
	1413	newCharset = match.group(3)
	1414	if newCharset and newCharset != self.originalEncoding:
	1415	self.declaredHTMLEncoding = newCharset
	1416	self._feed(self.declaredHTMLEncoding)
	1417	raise StopParsing
	1418	tag = self.unknown_starttag("meta", attrs)
	1419	if tag and tagNeedsEncodingSubstitution:
	1420	tag.containsSubstitutions = True
	1421
	1422	class StopParsing(Exception):
	1423	pass
	1424
	1425	class ICantBelieveItsBeautifulSoup(BeautifulSoup):
	1426
	1427	"""The BeautifulSoup class is oriented towards skipping over
	1428	common HTML errors like unclosed tags. However, sometimes it makes
	1429	errors of its own. For instance, consider this fragment:
	1430
	1431	<b>Foo<b>Bar</b></b>
	1432
	1433	This is perfectly valid (if bizarre) HTML. However, the
	1434	BeautifulSoup class will implicitly close the first b tag when it
	1435	encounters the second 'b'. It will think the author wrote
	1436	"<b>Foo<b>Bar", and didn't close the first 'b' tag, because
	1437	there's no real-world reason to bold something that's already
	1438	bold. When it encounters '</b></b>' it will close two more 'b'
	1439	tags, for a grand total of three tags closed instead of two. This
	1440	can throw off the rest of your document structure. The same is
	1441	true of a number of other tags, listed below.
	1442
	1443	It's much more common for someone to forget to close a 'b' tag
	1444	than to actually use nested 'b' tags, and the BeautifulSoup class
	1445	handles the common case. This class handles the not-co-common
	1446	case: where you can't believe someone wrote what they did, but
	1447	it's valid HTML and BeautifulSoup screwed up by assuming it
	1448	wouldn't be."""
	1449
	1450	I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
	1451	['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
	1452	'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
	1453	'big']
	1454
	1455	I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
	1456
	1457	NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
	1458	I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
	1459	I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
	1460
	1461	class MinimalSoup(BeautifulSoup):
	1462	"""The MinimalSoup class is for parsing HTML that contains
	1463	pathologically bad markup. It makes no assumptions about tag
	1464	nesting, but it does know which tags are self-closing, that
	1465	<script> tags contain Javascript and should not be parsed, that
	1466	META tags may contain encoding information, and so on.
	1467
	1468	This also makes it better for subclassing than BeautifulStoneSoup
	1469	or BeautifulSoup."""
	1470
	1471	RESET_NESTING_TAGS = buildTagMap('noscript')
	1472	NESTABLE_TAGS = {}
	1473
	1474	class BeautifulSOAP(BeautifulStoneSoup):
	1475	"""This class will push a tag with only a single string child into
	1476	the tag's parent as an attribute. The attribute's name is the tag
	1477	name, and the value is the string child. An example should give
	1478	the flavor of the change:
	1479
	1480	<foo><bar>baz</bar></foo>
	1481	=>
	1482	<foo bar="baz"><bar>baz</bar></foo>
	1483
	1484	You can then access fooTag['bar'] instead of fooTag.barTag.string.
	1485
	1486	This is, of course, useful for scraping structures that tend to
	1487	use subelements instead of attributes, such as SOAP messages. Note
	1488	that it modifies its input, so don't print the modified version
	1489	out.
	1490
	1491	I'm not sure how many people really want to use this class; let me
	1492	know if you do. Mainly I like the name."""
	1493
	1494	def popTag(self):
	1495	if len(self.tagStack) > 1:
	1496	tag = self.tagStack[-1]
	1497	parent = self.tagStack[-2]
	1498	parent._getAttrMap()
	1499	if (isinstance(tag, Tag) and len(tag.contents) == 1 and
	1500	isinstance(tag.contents[0], NavigableString) and
	1501	not parent.attrMap.has_key(tag.name)):
	1502	parent[tag.name] = tag.contents[0]
	1503	BeautifulStoneSoup.popTag(self)
	1504
	1505	#Enterprise class names! It has come to our attention that some people
	1506	#think the names of the Beautiful Soup parser classes are too silly
	1507	#and "unprofessional" for use in enterprise screen-scraping. We feel
	1508	#your pain! For such-minded folk, the Beautiful Soup Consortium And
	1509	#All-Night Kosher Bakery recommends renaming this file to
	1510	#"RobustParser.py" (or, in cases of extreme enterprisitude,
	1511	#"RobustParserBeanInterface.class") and using the following
	1512	#enterprise-friendly class aliases:
	1513	class RobustXMLParser(BeautifulStoneSoup):
	1514	pass
	1515	class RobustHTMLParser(BeautifulSoup):
	1516	pass
	1517	class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
	1518	pass
	1519	class RobustInsanelyWackAssHTMLParser(MinimalSoup):
	1520	pass
	1521	class SimplifyingSOAPParser(BeautifulSOAP):
	1522	pass
	1523
	1524	######################################################
	1525	#
	1526	# Bonus library: Unicode, Dammit
	1527	#
	1528	# This class forces XML data into a standard format (usually to UTF-8
	1529	# or Unicode). It is heavily based on code from Mark Pilgrim's
	1530	# Universal Feed Parser. It does not rewrite the XML or HTML to
	1531	# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
	1532	# (XML) and BeautifulSoup.start_meta (HTML).
	1533
	1534	# Autodetects character encodings.
	1535	# Download from http://chardet.feedparser.org/
	1536	try:
	1537	import chardet
	1538	# import chardet.constants
	1539	# chardet.constants._debug = 1
	1540	except:
	1541	chardet = None
	1542	chardet = None
	1543
	1544	# cjkcodecs and iconv_codec make Python know about more character encodings.
	1545	# Both are available from http://cjkpython.i18n.org/
	1546	# They're built in if you use Python 2.4.
	1547	try:
	1548	import cjkcodecs.aliases
	1549	except:
	1550	pass
	1551	try:
	1552	import iconv_codec
	1553	except:
	1554	pass
	1555
	1556	class UnicodeDammit:
	1557	"""A class for detecting the encoding of a *ML document and
	1558	converting it to a Unicode string. If the source encoding is
	1559	windows-1252, can replace MS smart quotes with their HTML or XML
	1560	equivalents."""
	1561
	1562	# This dictionary maps commonly seen values for "charset" in HTML
	1563	# meta tags to the corresponding Python codec names. It only covers
	1564	# values that aren't in Python's aliases and can't be determined
	1565	# by the heuristics in find_codec.
	1566	CHARSET_ALIASES = { "macintosh" : "mac-roman",
	1567	"x-sjis" : "shift-jis" }
	1568
	1569	def __init__(self, markup, overrideEncodings=[],
	1570	smartQuotesTo='xml'):
	1571	self.markup, documentEncoding, sniffedEncoding = \
	1572	self._detectEncoding(markup)
	1573	self.smartQuotesTo = smartQuotesTo
	1574	self.triedEncodings = []
	1575	if isinstance(markup, unicode):
	1576	return markup
	1577
	1578	u = None
	1579	for proposedEncoding in overrideEncodings:
	1580	u = self._convertFrom(proposedEncoding)
	1581	if u: break
	1582	if not u:
	1583	for proposedEncoding in (documentEncoding, sniffedEncoding):
	1584	u = self._convertFrom(proposedEncoding)
	1585	if u: break
	1586
	1587	# If no luck and we have auto-detection library, try that:
	1588	if not u and chardet and not isinstance(self.markup, unicode):
	1589	u = self._convertFrom(chardet.detect(self.markup)['encoding'])
	1590
	1591	# As a last resort, try utf-8 and windows-1252:
	1592	if not u:
	1593	for proposed_encoding in ("utf-8", "windows-1252"):
	1594	u = self._convertFrom(proposed_encoding)
	1595	if u: break
	1596	self.unicode = u
	1597	if not u: self.originalEncoding = None
	1598
	1599	def subMSChar(orig, smartQuotesTo):
	1600	"""Changes a MS smart quote character to an XML or HTML
	1601	entity."""
	1602	sub = UnicodeDammit.MS_CHARS.get(orig)
	1603	if type(sub) == types.TupleType:
	1604	if smartQuotesTo == 'xml':
	1605	sub = '&#x%s;' % sub[1]
	1606	elif smartQuotesTo == 'html':
	1607	sub = '&%s;' % sub[0]
	1608	else:
	1609	sub = unichr(int(sub[1],16))
	1610	return sub
	1611	subMSChar = staticmethod(subMSChar)
	1612
	1613	def _convertFrom(self, proposed):
	1614	proposed = self.find_codec(proposed)
	1615	if not proposed or proposed in self.triedEncodings:
	1616	return None
	1617	self.triedEncodings.append(proposed)
	1618	markup = self.markup
	1619
	1620	# Convert smart quotes to HTML if coming from an encoding
	1621	# that might have them.
	1622	if self.smartQuotesTo and proposed in("windows-1252",
	1623	"ISO-8859-1",
	1624	"ISO-8859-2"):
	1625	markup = re.compile("([\x80-\x9f])").sub \
	1626	(lambda(x): self.subMSChar(x.group(1),self.smartQuotesTo),
	1627	markup)
	1628
	1629	try:
	1630	# print "Trying to convert document to %s" % proposed
	1631	u = self._toUnicode(markup, proposed)
	1632	self.markup = u
	1633	self.originalEncoding = proposed
	1634	except Exception, e:
	1635	# print "That didn't work!"
	1636	# print e
	1637	return None
	1638	#print "Correct encoding: %s" % proposed
	1639	return self.markup
	1640
	1641	def _toUnicode(self, data, encoding):
	1642	'''Given a string and its encoding, decodes the string into Unicode.
	1643	%encoding is a string recognized by encodings.aliases'''
	1644
	1645	# strip Byte Order Mark (if present)
	1646	if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
	1647	and (data[2:4] != '\x00\x00'):
	1648	encoding = 'utf-16be'
	1649	data = data[2:]
	1650	elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
	1651	and (data[2:4] != '\x00\x00'):
	1652	encoding = 'utf-16le'
	1653	data = data[2:]
	1654	elif data[:3] == '\xef\xbb\xbf':
	1655	encoding = 'utf-8'
	1656	data = data[3:]
	1657	elif data[:4] == '\x00\x00\xfe\xff':
	1658	encoding = 'utf-32be'
	1659	data = data[4:]
	1660	elif data[:4] == '\xff\xfe\x00\x00':
	1661	encoding = 'utf-32le'
	1662	data = data[4:]
	1663	newdata = unicode(data, encoding)
	1664	return newdata
	1665
	1666	def _detectEncoding(self, xml_data):
	1667	"""Given a document, tries to detect its XML encoding."""
	1668	xml_encoding = sniffed_xml_encoding = None
	1669	try:
	1670	if xml_data[:4] == '\x4c\x6f\xa7\x94':
	1671	# EBCDIC
	1672	xml_data = self._ebcdic_to_ascii(xml_data)
	1673	elif xml_data[:4] == '\x00\x3c\x00\x3f':
	1674	# UTF-16BE
	1675	sniffed_xml_encoding = 'utf-16be'
	1676	xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
	1677	elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
	1678	and (xml_data[2:4] != '\x00\x00'):
	1679	# UTF-16BE with BOM
	1680	sniffed_xml_encoding = 'utf-16be'
	1681	xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
	1682	elif xml_data[:4] == '\x3c\x00\x3f\x00':
	1683	# UTF-16LE
	1684	sniffed_xml_encoding = 'utf-16le'
	1685	xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
	1686	elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
	1687	(xml_data[2:4] != '\x00\x00'):
	1688	# UTF-16LE with BOM
	1689	sniffed_xml_encoding = 'utf-16le'
	1690	xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
	1691	elif xml_data[:4] == '\x00\x00\x00\x3c':
	1692	# UTF-32BE
	1693	sniffed_xml_encoding = 'utf-32be'
	1694	xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
	1695	elif xml_data[:4] == '\x3c\x00\x00\x00':
	1696	# UTF-32LE
	1697	sniffed_xml_encoding = 'utf-32le'
	1698	xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
	1699	elif xml_data[:4] == '\x00\x00\xfe\xff':
	1700	# UTF-32BE with BOM
	1701	sniffed_xml_encoding = 'utf-32be'
	1702	xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
	1703	elif xml_data[:4] == '\xff\xfe\x00\x00':
	1704	# UTF-32LE with BOM
	1705	sniffed_xml_encoding = 'utf-32le'
	1706	xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
	1707	elif xml_data[:3] == '\xef\xbb\xbf':
	1708	# UTF-8 with BOM
	1709	sniffed_xml_encoding = 'utf-8'
	1710	xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
	1711	else:
	1712	sniffed_xml_encoding = 'ascii'
	1713	pass
	1714	xml_encoding_match = re.compile \
	1715	('^<\?.encoding=[\'"](.?)[\'"].*\?>')\
	1716	.match(xml_data)
	1717	except:
	1718	xml_encoding_match = None
	1719	if xml_encoding_match:
	1720	xml_encoding = xml_encoding_match.groups()[0].lower()
	1721	if sniffed_xml_encoding and \
	1722	(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
	1723	'iso-10646-ucs-4', 'ucs-4', 'csucs4',
	1724	'utf-16', 'utf-32', 'utf_16', 'utf_32',
	1725	'utf16', 'u16')):
	1726	xml_encoding = sniffed_xml_encoding
	1727	return xml_data, xml_encoding, sniffed_xml_encoding
	1728
	1729
	1730	def find_codec(self, charset):
	1731	return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
	1732	or (charset and self._codec(charset.replace("-", ""))) \
	1733	or (charset and self._codec(charset.replace("-", "_"))) \
	1734	or charset
	1735
	1736	def _codec(self, charset):
	1737	if not charset: return charset
	1738	codec = None
	1739	try:
	1740	codecs.lookup(charset)
	1741	codec = charset
	1742	except LookupError:
	1743	pass
	1744	return codec
	1745
	1746	EBCDIC_TO_ASCII_MAP = None
	1747	def _ebcdic_to_ascii(self, s):
	1748	c = self.__class__
	1749	if not c.EBCDIC_TO_ASCII_MAP:
	1750	emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
	1751	16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
	1752	128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
	1753	144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
	1754	32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
	1755	38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
	1756	45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
	1757	186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
	1758	195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
	1759	201,202,106,107,108,109,110,111,112,113,114,203,204,205,
	1760	206,207,208,209,126,115,116,117,118,119,120,121,122,210,
	1761	211,212,213,214,215,216,217,218,219,220,221,222,223,224,
	1762	225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
	1763	73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
	1764	82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
	1765	90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
	1766	250,251,252,253,254,255)
	1767	import string
	1768	c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
	1769	''.join(map(chr, range(256))), ''.join(map(chr, emap)))
	1770	return s.translate(c.EBCDIC_TO_ASCII_MAP)
	1771
	1772	MS_CHARS = { '\x80' : ('euro', '20AC'),
	1773	'\x81' : ' ',
	1774	'\x82' : ('sbquo', '201A'),
	1775	'\x83' : ('fnof', '192'),
	1776	'\x84' : ('bdquo', '201E'),
	1777	'\x85' : ('hellip', '2026'),
	1778	'\x86' : ('dagger', '2020'),
	1779	'\x87' : ('Dagger', '2021'),
	1780	'\x88' : ('circ', '2C6'),
	1781	'\x89' : ('permil', '2030'),
	1782	'\x8A' : ('Scaron', '160'),
	1783	'\x8B' : ('lsaquo', '2039'),
	1784	'\x8C' : ('OElig', '152'),
	1785	'\x8D' : '?',
	1786	'\x8E' : ('#x17D', '17D'),
	1787	'\x8F' : '?',
	1788	'\x90' : '?',
	1789	'\x91' : ('lsquo', '2018'),
	1790	'\x92' : ('rsquo', '2019'),
	1791	'\x93' : ('ldquo', '201C'),
	1792	'\x94' : ('rdquo', '201D'),
	1793	'\x95' : ('bull', '2022'),
	1794	'\x96' : ('ndash', '2013'),
	1795	'\x97' : ('mdash', '2014'),
	1796	'\x98' : ('tilde', '2DC'),
	1797	'\x99' : ('trade', '2122'),
	1798	'\x9a' : ('scaron', '161'),
	1799	'\x9b' : ('rsaquo', '203A'),
	1800	'\x9c' : ('oelig', '153'),
	1801	'\x9d' : '?',
	1802	'\x9e' : ('#x17E', '17E'),
	1803	'\x9f' : ('Yuml', '178'),}
	1804
	1805	#######################################################################
	1806
	1807
	1808	#By default, act as an HTML pretty-printer.
	1809	if __name__ == '__main__':
	1810	import sys
	1811	soup = BeautifulSoup(sys.stdin.read())
	1812	print soup.prettify()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/scripts/microbes/BeautifulSoup.py @ 2

異なるフォーマットでダウンロード: