1 | """Beautiful Soup |
---|
2 | Elixir and Tonic |
---|
3 | "The Screen-Scraper's Friend" |
---|
4 | http://www.crummy.com/software/BeautifulSoup/ |
---|
5 | |
---|
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a |
---|
7 | tree representation. It provides methods and Pythonic idioms that make |
---|
8 | it easy to navigate, search, and modify the tree. |
---|
9 | |
---|
10 | A well-structured XML/HTML document yields a well-behaved data |
---|
11 | structure. An ill-structured XML/HTML document yields a |
---|
12 | correspondingly ill-behaved data structure. If your document is only |
---|
13 | locally well-structured, you can use this library to find and process |
---|
14 | the well-structured part of it. |
---|
15 | |
---|
16 | Beautiful Soup works with Python 2.2 and up. It has no external |
---|
17 | dependencies, but you'll have more success at converting data to UTF-8 |
---|
18 | if you also install these three packages: |
---|
19 | |
---|
20 | * chardet, for auto-detecting character encodings |
---|
21 | http://chardet.feedparser.org/ |
---|
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported |
---|
23 | by stock Python. |
---|
24 | http://cjkpython.i18n.org/ |
---|
25 | |
---|
26 | Beautiful Soup defines classes for two main parsing strategies: |
---|
27 | |
---|
28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
---|
29 | language that kind of looks like XML. |
---|
30 | |
---|
31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
---|
32 | or invalid. This class has web browser-like heuristics for |
---|
33 | obtaining a sensible parse tree in the face of common HTML errors. |
---|
34 | |
---|
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting |
---|
36 | the encoding of an HTML or XML document, and converting it to |
---|
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed |
---|
38 | Parser. |
---|
39 | |
---|
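A quick, illustrative example (the markup here is invented):

  from BeautifulSoup import BeautifulSoup
  soup = BeautifulSoup('<html><p id="main">Some <b>bold</b> text.</p></html>')
  soup.find('p', {'id' : 'main'})   # the <p> Tag
  soup.p.b.string                   # u'bold'
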
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html
"""
from __future__ import generators

__author__ = "Leonard Richardson (crummy.com)"
__contributors__ = ["Sam Ruby (intertwingly.net)",
                    "the unwitting Mark Pilgrim (diveintomark.org)",
                    "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
__version__ = "3.0.3"
__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
__license__ = "PSF"

from sgmllib import SGMLParser, SGMLParseError
import codecs
import types
import re
import sgmllib
from htmlentitydefs import name2codepoint

# This RE makes Beautiful Soup able to parse XML with namespaces.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# This RE makes Beautiful Soup capable of recognizing numeric character
# references that use hexadecimal.
sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')

DEFAULT_OUTPUT_ENCODING = "utf-8"

# First, the classes that represent markup elements.

class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        oldParent = self.parent
        myIndex = self.parent.contents.index(self)
        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
            # We're replacing this element with one of its siblings.
            index = self.parent.contents.index(replaceWith)
            if index and index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                self.parent.contents.remove(self)
            except ValueError:
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        if (isinstance(newChild, basestring)
            or isinstance(newChild, unicode)) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent != None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent == self:
                index = self.find(newChild)
                if index and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

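    # A brief sketch of typical tree surgery with extract() and insert()
    # (the tag names and the 'soup' object here are hypothetical):
    #
    #   tag = soup.find('b')
    #   tag.extract()                    # remove <b> (and its children) from the tree
    #   soup.find('p').insert(0, tag)    # re-attach it somewhere else
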
    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s

class NavigableString(unicode, PageElement):

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        if encoding:
            return self.encode(encoding)
        else:
            return self

class CData(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)

class ProcessingInstruction(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)

class Comment(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!--%s-->" % NavigableString.__str__(self, encoding)

class Declaration(NavigableString):
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!%s>" % NavigableString.__str__(self, encoding)

class Tag(PageElement):
    """Represents a found HTML tag with its attributes and contents."""

    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
                              "quot" : '"',
                              "amp" : "&",
                              "lt" : "<",
                              "gt" : ">"
                              }
    # An RE for finding ampersands that aren't the start of a
    # numeric entity.
    BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        for item in self.attrs:
            if item[0] == key:
                self.attrs.remove(item)
                #We don't break because bad HTML can define the same
                #attribute multiple times.
            self._getAttrMap()
            if self.attrMap.has_key(key):
                del self.attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return apply(self.findAll, args, kwargs)

    def __getattr__(self, tag):
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    def _convertEntities(self, match):
        x = match.group(1)
        if x in name2codepoint:
            return unichr(name2codepoint[x])
        elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
            return '&%s;' % x
        else:
            return '&%s;' % x

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        # This can't happen naturally, but it can happen
                        # if you modify an attribute value after parsing.
                        if "'" in val:
                            val = val.replace('"', "&quot;")
                        else:
                            fmt = "%s='%s'"

                    # Optionally convert any HTML entities
                    if self.convertHTMLEntities:
                        val = re.sub("&(\w+);", self._convertEntities, val)

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = val.replace("<", "&lt;").replace(">", "&gt;")
                    val = self.BARE_AMPERSAND.sub("&amp;", val)


                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

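    # A brief sketch of how these matching arguments can be combined (the
    # markup, attribute values, and 'soup' object here are hypothetical):
    #
    #   soup.findAll('a')                              # every <a> tag
    #   soup.findAll('a', {'class' : 'external'})      # <a class="external">
    #   soup.findAll(re.compile('^h[1-6]$'))           # any heading tag
    #   soup.findAll('img', src=re.compile('\.png$'))  # keyword args match attributes
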
    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        for i in range(0, len(self.contents)):
            yield self.contents[i]
        raise StopIteration

    def recursiveChildGenerator(self):
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
        raise StopIteration

# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

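    # For example (markup and variable names invented for illustration),
    # a strainer can be passed as 'parseOnlyThese' so that only matching
    # elements are added to the tree:
    #
    #   links = SoupStrainer('a', href=re.compile('^http'))
    #   soup = BeautifulSoup(html, parseOnlyThese=links)
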
    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
            result = markup != None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isString(markup):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        list.__init__([])
        self.source = source

# Now, some helper functions.

def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    return hasattr(l, '__iter__') \
           or (type(l) in (types.ListType, types.TupleType))

def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k,v in portion.items():
                built[k] = v
        elif isList(portion):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

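# For instance (the values here are invented for illustration),
#   buildTagMap([], ['span', 'font'], {'td' : ['tr']})
# would return {'span' : [], 'font' : [], 'td' : ['tr']}.
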
# Now, the parser classes.

class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

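    # A minimal sketch of XML parsing with this class (the markup is
    # invented for illustration):
    #
    #   soup = BeautifulStoneSoup("<doc><tag attr='value'>text</tag></doc>")
    #   soup.tag['attr']    # u'value'
    #   soup.tag.string     # u'text'
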
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}

    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

        <br/> (No space between name of closing tag and tag close)
        <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo

        if convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None

        if isList(convertEntities):
            self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
            self.convertXMLEntities = self.XML_ENTITIES in convertEntities
        else:
            self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
            self.convertXMLEntities = self.XML_ENTITIES == convertEntities

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'): # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed()
        except StopParsing:
            pass
        self.markup = None # The markup can now be GCed

    def _feed(self, inDocumentEncoding=None):
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
        if markup:
            if self.markupMassage:
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
        self.reset()

        SGMLParser.feed(self, markup or "")
        SGMLParser.close(self)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
               or methodName.find('do_') == 0:
            return SGMLParser.__getattr__(self, methodName)
        elif methodName.find('__') != 0:
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        tag = self.tagStack.pop()
        # Tags with just one string-owning child get the child as a
        # 'string' property, so that soup.tag.string is shorthand for
        # soup.tag.contents[0]
        if len(self.currentTag.contents) == 1 and \
           isinstance(self.currentTag.contents[0], NavigableString):
            self.currentTag.string = self.currentTag.contents[0]

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = ''.join(self.currentData)
            if currentData.endswith('<') and self.convertHTMLEntities:
                currentData = currentData[:-1] + '&lt;'
            if not currentData.strip():
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.currentData.append('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.currentData.append('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        if self.convertHTMLEntities:
            if data[0] == '&':
                data = self.BARE_AMPERSAND.sub("&amp;",data)
            else:
                data = data.replace('&','&amp;') \
                           .replace('<','&lt;') \
                           .replace('>','&gt;')
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if ref[0] == 'x':
            data = unichr(int(ref[1:],16))
        else:
            data = unichr(int(ref))

        if u'\x80' <= data <= u'\x9F':
            data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
        elif not self.convertHTMLEntities and not self.convertXMLEntities:
            data = '&#%s;' % ref

        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML entity references to the corresponding Unicode
        characters."""
        replaceWithXMLEntity = self.convertXMLEntities and \
                               self.XML_ENTITIES_TO_CHARS.has_key(ref)
        if self.convertHTMLEntities or replaceWithXMLEntity:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                if replaceWithXMLEntity:
                    data = self.XML_ENTITIES_TO_CHARS.get(ref)
                else:
                    data = "&amp;%s" % ref
        else:
            data = '&%s;' % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

1328 | SELF_CLOSING_TAGS = buildTagMap(None, |
---|
1329 | ['br' , 'hr', 'input', 'img', 'meta', |
---|
1330 | 'spacer', 'link', 'frame', 'base']) |
---|
1331 | |
---|
1332 | QUOTE_TAGS = {'script': None} |
---|
1333 | |
---|
1334 | #According to the HTML standard, each of these inline tags can |
---|
1335 | #contain another tag of the same type. Furthermore, it's common |
---|
1336 | #to actually use these tags this way. |
---|
1337 | NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', |
---|
1338 | 'center'] |
---|
1339 | |
---|
1340 | #According to the HTML standard, these block tags can contain |
---|
1341 | #another tag of the same type. Furthermore, it's common |
---|
1342 | #to actually use these tags this way. |
---|
1343 | NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] |
---|
1344 | |
---|
1345 | #Lists can contain other lists, but there are restrictions. |
---|
1346 | NESTABLE_LIST_TAGS = { 'ol' : [], |
---|
1347 | 'ul' : [], |
---|
1348 | 'li' : ['ul', 'ol'], |
---|
1349 | 'dl' : [], |
---|
1350 | 'dd' : ['dl'], |
---|
1351 | 'dt' : ['dl'] } |
---|
1352 | |
---|
1353 | #Tables can contain other tables, but there are restrictions. |
---|
1354 | NESTABLE_TABLE_TAGS = {'table' : [], |
---|
1355 | 'tr' : ['table', 'tbody', 'tfoot', 'thead'], |
---|
1356 | 'td' : ['tr'], |
---|
1357 | 'th' : ['tr'], |
---|
1358 | 'thead' : ['table'], |
---|
1359 | 'tbody' : ['table'], |
---|
1360 | 'tfoot' : ['table'], |
---|
1361 | } |
---|
1362 | |
---|
1363 | NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] |
---|
1364 | |
---|
1365 | #If one of these tags is encountered, all tags up to the next tag of |
---|
1366 | #this type are popped. |
---|
1367 | RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', |
---|
1368 | NON_NESTABLE_BLOCK_TAGS, |
---|
1369 | NESTABLE_LIST_TAGS, |
---|
1370 | NESTABLE_TABLE_TAGS) |
---|
1371 | |
---|
1372 | NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, |
---|
1373 | NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) |
---|
1374 | |
---|
1375 | # Used to detect the charset in a META tag; see start_meta |
---|
1376 | CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)") |
---|
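    # For example (illustrative):
    #   CHARSET_RE.search("text/html; charset=ISO-8859-1").group(3)
    #   # -> 'ISO-8859-1'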

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if getattr(self, 'declaredHTMLEncoding', None) or \
                       (self.originalEncoding == self.fromEncoding):
                    # This is our second pass through the document, or
                    # else an encoding was specified explicitly and it
                    # worked. Rewrite the meta tag.
                    newAttr = self.CHARSET_RE.sub(
                        lambda match: match.group(1) + "%SOUP-ENCODING%",
                        value)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the new information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

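# Illustrative sketch of the behaviour described in start_meta (an assumption
# about the end-to-end result, not a quote from the documentation): on the
# second pass the charset value in the meta tag is replaced with the
# %SOUP-ENCODING% placeholder, and the real output encoding is substituted
# when the tag is rendered.
#
#   soup = BeautifulSoup('<meta http-equiv="Content-Type" '
#                        'content="text/html; charset=ISO-8859-1">')
#   # soup.originalEncoding  -> encoding the document was actually decoded from
#   # str(soup)              -> the meta tag should now advertise the output
#   #                           encoding (DEFAULT_OUTPUT_ENCODING, utf-8)
#   #                           instead of ISO-8859-1
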
class StopParsing(Exception):
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

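# Illustrative comparison (assuming this file is importable as the
# "BeautifulSoup" module):
#
#   from BeautifulSoup import BeautifulSoup, ICantBelieveItsBeautifulSoup
#   markup = "<b>Foo<b>Bar</b></b>"
#   BeautifulSoup(markup)                 # second <b> treated as a sibling
#   ICantBelieveItsBeautifulSoup(markup)  # second <b> kept nested inside the first
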
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}

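# Illustrative sketch (an assumption based on the empty NESTABLE_TAGS map
# above, not a documented guarantee): MinimalSoup applies no nesting
# heuristics, so markup like "<p>one<p>two" is not rewritten the way
# BeautifulSoup rewrites it.
#
#   from BeautifulSoup import BeautifulSoup, MinimalSoup
#   print BeautifulSoup("<p>one<p>two").prettify()  # second <p> closes the first
#   print MinimalSoup("<p>one<p>two").prettify()    # no implicit close is forced
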
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)

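# Illustrative usage, restating the docstring's own example:
#
#   soup = BeautifulSOAP("<foo><bar>baz</bar></foo>")
#   soup.fooTag['bar']         # 'baz' -- promoted from the single string child
#   soup.fooTag.barTag.string  # 'baz' -- the original subelement is still there
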
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisitude,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None
# To ignore chardet even when it's installed, uncomment this line:
#chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if isinstance(markup, unicode):
            # The document is already Unicode; record it and skip detection.
            self.unicode = markup
            self.originalEncoding = None
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break
        self.unicode = u
        if not u: self.originalEncoding = None

    def subMSChar(orig, smartQuotesTo):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = UnicodeDammit.MS_CHARS.get(orig)
        if type(sub) == types.TupleType:
            if smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            elif smartQuotesTo == 'html':
                sub = '&%s;' % sub[0]
            else:
                sub = unichr(int(sub[1], 16))
        return sub
    subMSChar = staticmethod(subMSChar)

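    # For example (illustrative):
    #   UnicodeDammit.subMSChar('\x93', 'xml')   # '&#x201C;' (left double quote)
    #   UnicodeDammit.subMSChar('\x93', 'html')  # '&ldquo;'
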
    def _convertFrom(self, proposed):
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed in ("windows-1252",
                                               "ISO-8859-1",
                                               "ISO-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub(
                lambda x: self.subMSChar(x.group(1), self.smartQuotesTo),
                markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = re.compile(
                '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        except:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


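    # For example (illustrative): an explicit XML declaration is honoured
    # unless it names a UTF-16/UTF-32 family whose byte order the sniffing
    # above has already pinned down.
    #
    #   dammit = UnicodeDammit('<a/>')
    #   dammit._detectEncoding('<?xml version="1.0" encoding="ISO-8859-1"?><a/>')
    #   # -> (the data unchanged, 'iso-8859-1', 'ascii')
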
    def find_codec(self, charset):
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

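    # For example (illustrative):
    #   UnicodeDammit('').find_codec('macintosh')  # 'mac-roman', via CHARSET_ALIASES
    #   UnicodeDammit('').find_codec('UTF-8')      # 'UTF-8', already a known codec
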
    def _codec(self, charset):
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except LookupError:
            pass
        return codec

    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', '178'),}

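# Illustrative standalone usage of UnicodeDammit (independent of the parser
# classes; the exact detected encoding depends on what's installed):
#
#   converted = UnicodeDammit("Sacr\xe9 bleu!")
#   converted.unicode            # u'Sacr\xe9 bleu!'
#   converted.originalEncoding   # e.g. 'windows-1252'
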
#######################################################################


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin.read())
    print soup.prettify()
---|