root/galaxy-central/scripts/microbes/BeautifulSoup.py

Revision 2, 69.9 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

Line numbers
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-structured XML/HTML document yields a well-behaved data
11structure. An ill-structured XML/HTML document yields a
12correspondingly ill-behaved data structure. If your document is only
13locally well-structured, you can use this library to find and process
14the well-structured part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27   
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
38Parser.
39
40For more than you ever wanted to know about Beautiful Soup, see the
41documentation:
42http://www.crummy.com/software/BeautifulSoup/documentation.html
43"""
44from __future__ import generators
45
46__author__ = "Leonard Richardson (crummy.com)"
47__contributors__ = ["Sam Ruby (intertwingly.net)",
48                    "the unwitting Mark Pilgrim (diveintomark.org)",
49                    "http://www.crummy.com/software/BeautifulSoup/AUTHORS.html"]
50__version__ = "3.0.3"
51__copyright__ = "Copyright (c) 2004-2006 Leonard Richardson"
52__license__ = "PSF"
53
54from sgmllib import SGMLParser, SGMLParseError
55import codecs
56import types
57import re
58import sgmllib
59from htmlentitydefs import name2codepoint
60
61# This RE makes Beautiful Soup able to parse XML with namespaces.
62sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64# This RE makes Beautiful Soup capable of recognizing numeric character
65# references that use hexadecimal.
66sgmllib.charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
67
68DEFAULT_OUTPUT_ENCODING = "utf-8"
69
70# First, the classes that represent markup elements.
71
72class PageElement:
73    """Contains the navigational information for some part of the page
74    (either a tag or a piece of text)"""
75
76    def setup(self, parent=None, previous=None):
77        """Sets up the initial relations between this element and
78        other elements."""       
79        self.parent = parent
80        self.previous = previous
81        self.next = None
82        self.previousSibling = None
83        self.nextSibling = None
84        if self.parent and self.parent.contents:
85            self.previousSibling = self.parent.contents[-1]
86            self.previousSibling.nextSibling = self
87
88    def replaceWith(self, replaceWith):       
89        oldParent = self.parent
90        myIndex = self.parent.contents.index(self)
91        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
92            # We're replacing this element with one of its siblings.
93            index = self.parent.contents.index(replaceWith)
94            if index and index < myIndex:
95                # Furthermore, it comes before this element. That
96                # means that when we extract it, the index of this
97                # element will change.
98                myIndex = myIndex - 1
99        self.extract()       
100        oldParent.insert(myIndex, replaceWith)
101       
102    def extract(self):
103        """Destructively rips this element out of the tree."""       
104        if self.parent:
105            try:
106                self.parent.contents.remove(self)
107            except ValueError:
108                pass
109
110        #Find the two elements that would be next to each other if
111        #this element (and any children) hadn't been parsed. Connect
112        #the two.       
113        lastChild = self._lastRecursiveChild()
114        nextElement = lastChild.next
115
116        if self.previous:
117            self.previous.next = nextElement
118        if nextElement:
119            nextElement.previous = self.previous
120        self.previous = None
121        lastChild.next = None
122
123        self.parent = None       
124        if self.previousSibling:
125            self.previousSibling.nextSibling = self.nextSibling
126        if self.nextSibling:
127            self.nextSibling.previousSibling = self.previousSibling
128        self.previousSibling = self.nextSibling = None       
129
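    # A small sketch of how extract() and replaceWith() are typically used
    # (markup is illustrative):
    #
    #   soup = BeautifulSoup('<p>keep <b>drop</b> me</p>')
    #   b = soup.find('b')
    #   b.extract()            # <b>drop</b> is detached from the tree
    #   soup.p.contents        # [u'keep ', u' me']
    #   soup.p.contents[0].replaceWith('kept ')   # swap in a new string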
130    def _lastRecursiveChild(self):
131        "Finds the last element beneath this object to be parsed."
132        lastChild = self
133        while hasattr(lastChild, 'contents') and lastChild.contents:
134            lastChild = lastChild.contents[-1]
135        return lastChild
136
137    def insert(self, position, newChild):
138        if (isinstance(newChild, basestring)
139            or isinstance(newChild, unicode)) \
140            and not isinstance(newChild, NavigableString):
141            newChild = NavigableString(newChild)       
142
143        position =  min(position, len(self.contents))
144        if hasattr(newChild, 'parent') and newChild.parent != None:
145            # We're 'inserting' an element that's already one
146            # of this object's children.
147            if newChild.parent == self:
148                index = self.find(newChild)
149                if index and index < position:
150                    # Furthermore we're moving it further down the
151                    # list of this object's children. That means that
152                    # when we extract this element, our target index
153                    # will jump down one.
154                    position = position - 1
155            newChild.extract()
156           
157        newChild.parent = self
158        previousChild = None
159        if position == 0:
160            newChild.previousSibling = None
161            newChild.previous = self
162        else:
163            previousChild = self.contents[position-1]
164            newChild.previousSibling = previousChild
165            newChild.previousSibling.nextSibling = newChild
166            newChild.previous = previousChild._lastRecursiveChild()
167        if newChild.previous:
168            newChild.previous.next = newChild       
169
170        newChildsLastElement = newChild._lastRecursiveChild()
171
172        if position >= len(self.contents):
173            newChild.nextSibling = None
174           
175            parent = self
176            parentsNextSibling = None
177            while not parentsNextSibling:
178                parentsNextSibling = parent.nextSibling
179                parent = parent.parent
180                if not parent: # This is the last element in the document.
181                    break
182            if parentsNextSibling:
183                newChildsLastElement.next = parentsNextSibling
184            else:
185                newChildsLastElement.next = None
186        else:
187            nextChild = self.contents[position]           
188            newChild.nextSibling = nextChild           
189            if newChild.nextSibling:
190                newChild.nextSibling.previousSibling = newChild
191            newChildsLastElement.next = nextChild
192
193        if newChildsLastElement.next:
194            newChildsLastElement.next.previous = newChildsLastElement
195        self.contents.insert(position, newChild)
196
197    def findNext(self, name=None, attrs={}, text=None, **kwargs):
198        """Returns the first item that matches the given criteria and
199        appears after this Tag in the document."""
200        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
201
202    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
203                    **kwargs):
204        """Returns all items that match the given criteria and appear
205        after this Tag in the document."""
206        return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs)
207
208    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
209        """Returns the closest sibling to this Tag that matches the
210        given criteria and appears after this Tag in the document."""
211        return self._findOne(self.findNextSiblings, name, attrs, text,
212                             **kwargs)
213
214    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
215                         **kwargs):
216        """Returns the siblings of this Tag that match the given
217        criteria and appear after this Tag in the document."""
218        return self._findAll(name, attrs, text, limit,
219                             self.nextSiblingGenerator, **kwargs)
220    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
221
222    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
223        """Returns the first item that matches the given criteria and
224        appears before this Tag in the document."""
225        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
226
227    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
228                        **kwargs):
229        """Returns all items that match the given criteria and appear
230        before this Tag in the document."""
231        return self._findAll(name, attrs, text, limit, self.previousGenerator,
232                           **kwargs)
233    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
234
235    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
236        """Returns the closest sibling to this Tag that matches the
237        given criteria and appears before this Tag in the document."""
238        return self._findOne(self.findPreviousSiblings, name, attrs, text,
239                             **kwargs)
240
241    def findPreviousSiblings(self, name=None, attrs={}, text=None,
242                             limit=None, **kwargs):
243        """Returns the siblings of this Tag that match the given
244        criteria and appear before this Tag in the document."""
245        return self._findAll(name, attrs, text, limit,
246                             self.previousSiblingGenerator, **kwargs)
247    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
248
249    def findParent(self, name=None, attrs={}, **kwargs):
250        """Returns the closest parent of this Tag that matches the given
251        criteria."""
252        # NOTE: We can't use _findOne because findParents takes a different
253        # set of arguments.
254        r = None
255        l = self.findParents(name, attrs, 1)
256        if l:
257            r = l[0]
258        return r
259
260    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
261        """Returns the parents of this Tag that match the given
262        criteria."""
263
264        return self._findAll(name, attrs, None, limit, self.parentGenerator,
265                             **kwargs)
266    fetchParents = findParents # Compatibility with pre-3.x
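    # Directional search sketch (illustrative markup):
    #
    #   soup = BeautifulSoup('<h1>Title</h1><p>One</p><p>Two</p>')
    #   first_p = soup.find('p')
    #   first_p.findNextSibling('p')    # <p>Two</p>
    #   first_p.findPrevious('h1')      # <h1>Title</h1>
    #   first_p.findParent('body')      # None; this fragment has no <body> ancestor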
267
268    #These methods do the real heavy lifting.
269
270    def _findOne(self, method, name, attrs, text, **kwargs):
271        r = None
272        l = method(name, attrs, text, 1, **kwargs)
273        if l:
274            r = l[0]
275        return r
276   
277    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
278        "Iterates over a generator looking for things that match."
279
280        if isinstance(name, SoupStrainer):
281            strainer = name
282        else:
283            # Build a SoupStrainer
284            strainer = SoupStrainer(name, attrs, text, **kwargs)
285        results = ResultSet(strainer)
286        g = generator()
287        while True:
288            try:
289                i = g.next()
290            except StopIteration:
291                break
292            if i:
293                found = strainer.search(i)
294                if found:
295                    results.append(found)
296                    if limit and len(results) >= limit:
297                        break
298        return results
299
300    #These Generators can be used to navigate starting from both
301    #NavigableStrings and Tags.               
302    def nextGenerator(self):
303        i = self
304        while i:
305            i = i.next
306            yield i
307
308    def nextSiblingGenerator(self):
309        i = self
310        while i:
311            i = i.nextSibling
312            yield i
313
314    def previousGenerator(self):
315        i = self
316        while i:
317            i = i.previous
318            yield i
319
320    def previousSiblingGenerator(self):
321        i = self
322        while i:
323            i = i.previousSibling
324            yield i
325
326    def parentGenerator(self):
327        i = self
328        while i:
329            i = i.parent
330            yield i
331
332    # Utility methods
333    def substituteEncoding(self, str, encoding=None):
334        encoding = encoding or "utf-8"
335        return str.replace("%SOUP-ENCODING%", encoding)   
336
337    def toEncoding(self, s, encoding=None):
338        """Encodes an object to a string in some encoding, or to Unicode.
339        ."""
340        if isinstance(s, unicode):
341            if encoding:
342                s = s.encode(encoding)
343        elif isinstance(s, str):
344            if encoding:
345                s = s.encode(encoding)
346            else:
347                s = unicode(s)
348        else:
349            if encoding:
350                s  = self.toEncoding(str(s), encoding)
351            else:
352                s = unicode(s)
353        return s
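    # For example (illustrative): toEncoding(u'caf\xe9', 'utf-8') returns the
    # byte string 'caf\xc3\xa9', while toEncoding('abc') with no encoding
    # returns u'abc'.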
354
355class NavigableString(unicode, PageElement):
356
357    def __getattr__(self, attr):
358        """text.string gives you text. This is for backwards
359        compatibility for Navigable*String, but for CData* it lets you
360        get the string without the CData wrapper."""
361        if attr == 'string':
362            return self
363        else:
364            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
365
366    def __unicode__(self):
367        return self.__str__(None)
368
369    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
370        if encoding:
371            return self.encode(encoding)
372        else:
373            return self
374       
375class CData(NavigableString):
376
377    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
378        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
379
380class ProcessingInstruction(NavigableString):
381    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
382        output = self
383        if "%SOUP-ENCODING%" in output:
384            output = self.substituteEncoding(output, encoding)
385        return "<?%s?>" % self.toEncoding(output, encoding)
386
387class Comment(NavigableString):
388    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
389        return "<!--%s-->" % NavigableString.__str__(self, encoding)   
390
391class Declaration(NavigableString):
392    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
393        return "<!%s>" % NavigableString.__str__(self, encoding)       
394
395class Tag(PageElement):
396    """Represents a found HTML tag with its attributes and contents."""
397
398    XML_ENTITIES_TO_CHARS = { 'apos' : "'",
399                              "quot" : '"',
400                              "amp" : "&",
401                              "lt" : "<",
402                              "gt" : ">"
403                              }
404    # An RE for finding ampersands that aren't the start of a
405    # numeric entity.
406    BARE_AMPERSAND = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
407
408    def __init__(self, parser, name, attrs=None, parent=None,
409                 previous=None):
410        "Basic constructor."
411
412        # We don't actually store the parser object: that lets extracted
413        # chunks be garbage-collected
414        self.parserClass = parser.__class__
415        self.isSelfClosing = parser.isSelfClosingTag(name)
416        self.convertHTMLEntities = parser.convertHTMLEntities
417        self.name = name
418        if attrs == None:
419            attrs = []
420        self.attrs = attrs
421        self.contents = []
422        self.setup(parent, previous)
423        self.hidden = False
424        self.containsSubstitutions = False
425
426    def get(self, key, default=None):
427        """Returns the value of the 'key' attribute for the tag, or
428        the value given for 'default' if it doesn't have that
429        attribute."""
430        return self._getAttrMap().get(key, default)   
431
432    def has_key(self, key):
433        return self._getAttrMap().has_key(key)
434
435    def __getitem__(self, key):
436        """tag[key] returns the value of the 'key' attribute for the tag,
437        and throws an exception if it's not there."""
438        return self._getAttrMap()[key]
439
440    def __iter__(self):
441        "Iterating over a tag iterates over its contents."
442        return iter(self.contents)
443
444    def __len__(self):
445        "The length of a tag is the length of its list of contents."
446        return len(self.contents)
447
448    def __contains__(self, x):
449        return x in self.contents
450
451    def __nonzero__(self):
452        "A tag is non-None even if it has no contents."
453        return True
454
455    def __setitem__(self, key, value):       
456        """Setting tag[key] sets the value of the 'key' attribute for the
457        tag."""
458        self._getAttrMap()
459        self.attrMap[key] = value
460        found = False
461        for i in range(0, len(self.attrs)):
462            if self.attrs[i][0] == key:
463                self.attrs[i] = (key, value)
464                found = True
465        if not found:
466            self.attrs.append((key, value))
467        self._getAttrMap()[key] = value
468
469    def __delitem__(self, key):
470        "Deleting tag[key] deletes all 'key' attributes for the tag."
471        for item in self.attrs:
472            if item[0] == key:
473                self.attrs.remove(item)
474                #We don't break because bad HTML can define the same
475                #attribute multiple times.
476            self._getAttrMap()
477            if self.attrMap.has_key(key):
478                del self.attrMap[key]
479
480    def __call__(self, *args, **kwargs):
481        """Calling a tag like a function is the same as calling its
482        findAll() method. Eg. tag('a') returns a list of all the A tags
483        found within this tag."""
484        return apply(self.findAll, args, kwargs)
485
486    def __getattr__(self, tag):
487        #print "Getattr %s.%s" % (self.__class__, tag)
488        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
489            return self.find(tag[:-3])
490        elif tag.find('__') != 0:
491            return self.find(tag)
492
493    def __eq__(self, other):
494        """Returns true iff this tag has the same name, the same attributes,
495        and the same contents (recursively) as the given tag.
496
497        NOTE: right now this will return false if two tags have the
498        same attributes in a different order. Should this be fixed?"""
499        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
500            return False
501        for i in range(0, len(self.contents)):
502            if self.contents[i] != other.contents[i]:
503                return False
504        return True
505
506    def __ne__(self, other):
507        """Returns true iff this tag is not identical to the other tag,
508        as defined in __eq__."""
509        return not self == other
510
511    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
512        """Renders this tag as a string."""
513        return self.__str__(encoding)
514
515    def __unicode__(self):
516        return self.__str__(None)
517
518    def _convertEntities(self, match):
519        x = match.group(1)
520        if x in name2codepoint:
521            return unichr(name2codepoint[x])           
522        elif "&" + x + ";" in self.XML_ENTITIES_TO_CHARS:
523            return '&%s;' % x
524        else:
525            return '&amp;%s;' % x
526
527    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
528                prettyPrint=False, indentLevel=0):
529        """Returns a string or Unicode representation of this tag and
530        its contents. To get Unicode, pass None for encoding.
531
532        NOTE: since Python's HTML parser consumes whitespace, this
533        method is not certain to reproduce the whitespace present in
534        the original string."""
535
536        encodedName = self.toEncoding(self.name, encoding)
537
538        attrs = []
539        if self.attrs:
540            for key, val in self.attrs:
541                fmt = '%s="%s"'
542                if isString(val):                   
543                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
544                        val = self.substituteEncoding(val, encoding)
545
546                    # The attribute value either:
547                    #
548                    # * Contains no embedded double quotes or single quotes.
549                    #   No problem: we enclose it in double quotes.
550                    # * Contains embedded single quotes. No problem:
551                    #   double quotes work here too.
552                    # * Contains embedded double quotes. No problem:
553                    #   we enclose it in single quotes.
554                    # * Embeds both single _and_ double quotes. This
555                    #   can't happen naturally, but it can happen if
556                    #   you modify an attribute value after parsing
557                    #   the document. Now we have a bit of a
558                    #   problem. We solve it by enclosing the
559                    #   attribute in single quotes, and escaping any
560                    #   embedded single quotes to XML entities.
561                    if '"' in val:
562                        # This can't happen naturally, but it can happen
563                        # if you modify an attribute value after parsing.
564                        if "'" in val:
565                            val = val.replace('"', "&quot;")
566                        else:
567                            fmt = "%s='%s'"
568
569                    # Optionally convert any HTML entities
570                    if self.convertHTMLEntities:
571                        val = re.sub("&(\w+);", self._convertEntities, val)
572
573                    # Now we're okay w/r/t quotes. But the attribute
574                    # value might also contain angle brackets, or
575                    # ampersands that aren't part of entities. We need
576                    # to escape those to XML entities too.
577                    val = val.replace("<", "&lt;").replace(">", "&gt;")
578                    val = self.BARE_AMPERSAND.sub("&amp;", val)
579
580                                     
581                attrs.append(fmt % (self.toEncoding(key, encoding),
582                                    self.toEncoding(val, encoding)))
583        close = ''
584        closeTag = ''
585        if self.isSelfClosing:
586            close = ' /'
587        else:
588            closeTag = '</%s>' % encodedName
589
590        indentTag, indentContents = 0, 0
591        if prettyPrint:
592            indentTag = indentLevel
593            space = (' ' * (indentTag-1))
594            indentContents = indentTag + 1
595        contents = self.renderContents(encoding, prettyPrint, indentContents)
596        if self.hidden:
597            s = contents
598        else:
599            s = []
600            attributeString = ''
601            if attrs:
602                attributeString = ' ' + ' '.join(attrs)           
603            if prettyPrint:
604                s.append(space)
605            s.append('<%s%s%s>' % (encodedName, attributeString, close))
606            if prettyPrint:
607                s.append("\n")
608            s.append(contents)
609            if prettyPrint and contents and contents[-1] != "\n":
610                s.append("\n")
611            if prettyPrint and closeTag:
612                s.append(space)
613            s.append(closeTag)
614            if prettyPrint and closeTag and self.nextSibling:
615                s.append("\n")
616            s = ''.join(s)
617        return s
618
619    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
620        return self.__str__(encoding, True)
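    # Rendering sketch (illustrative): str(tag) encodes to DEFAULT_OUTPUT_ENCODING,
    # unicode(tag) returns a Unicode string, and prettify() adds indentation:
    #
    #   tag = BeautifulSoup('<p class="x">hi</p>').p
    #   str(tag)        # '<p class="x">hi</p>' as a UTF-8 byte string
    #   unicode(tag)    # u'<p class="x">hi</p>'
    #   print tag.prettify()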
621
622    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
623                       prettyPrint=False, indentLevel=0):
624        """Renders the contents of this tag as a string in the given
625        encoding. If encoding is None, returns a Unicode string."""
626        s=[]
627        for c in self:
628            text = None
629            if isinstance(c, NavigableString):
630                text = c.__str__(encoding)
631            elif isinstance(c, Tag):
632                s.append(c.__str__(encoding, prettyPrint, indentLevel))
633            if text and prettyPrint:
634                text = text.strip()             
635            if text:
636                if prettyPrint:
637                    s.append(" " * (indentLevel-1))
638                s.append(text)
639                if prettyPrint:
640                    s.append("\n")
641        return ''.join(s)   
642
643    #Soup methods
644
645    def find(self, name=None, attrs={}, recursive=True, text=None,
646             **kwargs):
647        """Return only the first child of this Tag matching the given
648        criteria."""
649        r = None
650        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
651        if l:
652            r = l[0]
653        return r
654    findChild = find
655
656    def findAll(self, name=None, attrs={}, recursive=True, text=None,
657                limit=None, **kwargs):
658        """Extracts a list of Tag objects that match the given
659        criteria.  You can specify the name of the Tag and any
660        attributes you want the Tag to have.
661
662        The value of a key-value pair in the 'attrs' map can be a
663        string, a list of strings, a regular expression object, or a
664        callable that takes a string and returns whether or not the
665        string matches for some custom definition of 'matches'. The
666        same is true of the tag name."""
667        generator = self.recursiveChildGenerator
668        if not recursive:
669            generator = self.childGenerator
670        return self._findAll(name, attrs, text, limit, generator, **kwargs)
671    findChildren = findAll
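    # Sketch of the matching criteria findAll accepts (values are illustrative):
    #
    #   soup.findAll('a')                             # by tag name
    #   soup.findAll('a', {'class': 'nav'})           # by attribute dict
    #   soup.findAll('a', href=re.compile('^http'))   # keyword arg + regexp value
    #   soup.findAll(re.compile('^h[1-6]$'))          # tag name matched by regexp
    #   soup.findAll(lambda tag: not tag.attrs)       # tag matched by a callable
    #   soup.findAll(text='Home')                     # NavigableStrings by text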
672
673    # Pre-3.x compatibility methods
674    first = find
675    fetch = findAll
676   
677    def fetchText(self, text=None, recursive=True, limit=None):
678        return self.findAll(text=text, recursive=recursive, limit=limit)
679
680    def firstText(self, text=None, recursive=True):
681        return self.find(text=text, recursive=recursive)
682   
683    #Utility methods
684
685    def append(self, tag):
686        """Appends the given tag to the contents of this tag."""
687        self.contents.append(tag)
688
689    #Private methods
690
691    def _getAttrMap(self):
692        """Initializes a map representation of this tag's attributes,
693        if not already initialized."""
694        if not getattr(self, 'attrMap'):
695            self.attrMap = {}
696            for (key, value) in self.attrs:
697                self.attrMap[key] = value
698        return self.attrMap
699
700    #Generator methods
701    def childGenerator(self):
702        for i in range(0, len(self.contents)):
703            yield self.contents[i]
704        raise StopIteration
705   
706    def recursiveChildGenerator(self):
707        stack = [(self, 0)]
708        while stack:
709            tag, start = stack.pop()
710            if isinstance(tag, Tag):           
711                for i in range(start, len(tag.contents)):
712                    a = tag.contents[i]
713                    yield a
714                    if isinstance(a, Tag) and tag.contents:
715                        if i < len(tag.contents) - 1:
716                            stack.append((tag, i+1))
717                        stack.append((a, 0))
718                        break
719        raise StopIteration
720
721# Next, a couple classes to represent queries and their results.
722class SoupStrainer:
723    """Encapsulates a number of ways of matching a markup element (tag or
724    text)."""
725
726    def __init__(self, name=None, attrs={}, text=None, **kwargs):
727        self.name = name
728        if isString(attrs):
729            kwargs['class'] = attrs
730            attrs = None
731        if kwargs:
732            if attrs:
733                attrs = attrs.copy()
734                attrs.update(kwargs)
735            else:
736                attrs = kwargs
737        self.attrs = attrs
738        self.text = text
739
740    def __str__(self):
741        if self.text:
742            return self.text
743        else:
744            return "%s|%s" % (self.name, self.attrs)
745   
746    def searchTag(self, markupName=None, markupAttrs={}):
747        found = None
748        markup = None
749        if isinstance(markupName, Tag):
750            markup = markupName
751            markupAttrs = markup
752        callFunctionWithTagData = callable(self.name) \
753                                and not isinstance(markupName, Tag)
754
755        if (not self.name) \
756               or callFunctionWithTagData \
757               or (markup and self._matches(markup, self.name)) \
758               or (not markup and self._matches(markupName, self.name)):
759            if callFunctionWithTagData:
760                match = self.name(markupName, markupAttrs)
761            else:
762                match = True           
763                markupAttrMap = None
764                for attr, matchAgainst in self.attrs.items():
765                    if not markupAttrMap:
766                         if hasattr(markupAttrs, 'get'):
767                            markupAttrMap = markupAttrs
768                         else:
769                            markupAttrMap = {}
770                            for k,v in markupAttrs:
771                                markupAttrMap[k] = v
772                    attrValue = markupAttrMap.get(attr)
773                    if not self._matches(attrValue, matchAgainst):
774                        match = False
775                        break
776            if match:
777                if markup:
778                    found = markup
779                else:
780                    found = markupName
781        return found
782
783    def search(self, markup):
784        #print 'looking for %s in %s' % (self, markup)
785        found = None
786        # If given a list of items, scan it for a text element that
787        # matches.       
788        if isList(markup) and not isinstance(markup, Tag):
789            for element in markup:
790                if isinstance(element, NavigableString) \
791                       and self.search(element):
792                    found = element
793                    break
794        # If it's a Tag, make sure its name or attributes match.
795        # Don't bother with Tags if we're searching for text.
796        elif isinstance(markup, Tag):
797            if not self.text:
798                found = self.searchTag(markup)
799        # If it's text, make sure the text matches.
800        elif isinstance(markup, NavigableString) or \
801                 isString(markup):
802            if self._matches(markup, self.text):
803                found = markup
804        else:
805            raise Exception, "I don't know how to match against a %s" \
806                  % markup.__class__
807        return found
808       
809    def _matches(self, markup, matchAgainst):   
810        #print "Matching %s against %s" % (markup, matchAgainst)
811        result = False
812        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
813            result = markup != None
814        elif callable(matchAgainst):
815            result = matchAgainst(markup)
816        else:
817            #Custom match methods take the tag as an argument, but all
818            #other ways of matching match the tag name as a string.
819            if isinstance(markup, Tag):
820                markup = markup.name
821            if markup and not isString(markup):
822                markup = unicode(markup)
823            #Now we know that chunk is either a string, or None.
824            if hasattr(matchAgainst, 'match'):
825                # It's a regexp object.
826                result = markup and matchAgainst.search(markup)
827            elif isList(matchAgainst):
828                result = markup in matchAgainst
829            elif hasattr(matchAgainst, 'items'):
830                result = markup.has_key(matchAgainst)
831            elif matchAgainst and isString(markup):
832                if isinstance(markup, unicode):
833                    matchAgainst = unicode(matchAgainst)
834                else:
835                    matchAgainst = str(matchAgainst)
836
837            if not result:
838                result = matchAgainst == markup
839        return result
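    # SoupStrainer sketch: a strainer can be passed as parseOnlyThese to limit
    # what gets parsed, or handed directly to findAll (values illustrative):
    #
    #   links = SoupStrainer('a', href=re.compile('^https?://'))
    #   soup = BeautifulSoup(markup, parseOnlyThese=links)  # builds only <a> tags
    #   soup.findAll(links)                                 # reuses the criteria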
840
841class ResultSet(list):
842    """A ResultSet is just a list that keeps track of the SoupStrainer
843    that created it."""
844    def __init__(self, source):
845        list.__init__([])
846        self.source = source
847
848# Now, some helper functions.
849
850def isList(l):
851    """Convenience method that works with all 2.x versions of Python
852    to determine whether or not something is listlike."""
853    return hasattr(l, '__iter__') \
854           or (type(l) in (types.ListType, types.TupleType))
855
856def isString(s):
857    """Convenience method that works with all 2.x versions of Python
858    to determine whether or not something is stringlike."""
859    try:
860        return isinstance(s, unicode) or isinstance(s, basestring)
861    except NameError:
862        return isinstance(s, str)
863
864def buildTagMap(default, *args):
865    """Turns a list of maps, lists, or scalars into a single map.
866    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
867    NESTING_RESET_TAGS maps out of lists and partial maps."""
868    built = {}
869    for portion in args:
870        if hasattr(portion, 'items'):
871            #It's a map. Merge it.
872            for k,v in portion.items():
873                built[k] = v
874        elif isList(portion):
875            #It's a list. Map each item to the default.
876            for k in portion:
877                built[k] = default
878        else:
879            #It's a scalar. Map it to the default.
880            built[portion] = default
881    return built
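# For example (illustrative):
#
#   buildTagMap(None, ['br', 'hr'], {'td': ['tr']}, 'p')
#   # equivalent to {'br': None, 'hr': None, 'td': ['tr'], 'p': None}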
882
883# Now, the parser classes.
884
885class BeautifulStoneSoup(Tag, SGMLParser):
886
887    """This class contains the basic parser and search code. It defines
888    a parser that knows nothing about tag behavior except for the
889    following:
890   
891      You can't close a tag without closing all the tags it encloses.
892      That is, "<foo><bar></foo>" actually means
893      "<foo><bar></bar></foo>".
894
895    [Another possible explanation is "<foo><bar /></foo>", but since
896    this class defines no SELF_CLOSING_TAGS, it will never use that
897    explanation.]
898
899    This class is useful for parsing XML or made-up markup languages,
900    or when BeautifulSoup makes an assumption counter to what you were
901    expecting."""
902
903    SELF_CLOSING_TAGS = {}
904    NESTABLE_TAGS = {}
905    RESET_NESTING_TAGS = {}
906    QUOTE_TAGS = {}
907
908    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
909                       lambda x: x.group(1) + ' />'),
910                      (re.compile('<!\s+([^<>]*)>'),
911                       lambda x: '<!' + x.group(1) + '>')
912                      ]
913
914    ROOT_TAG_NAME = u'[document]'
915
916    HTML_ENTITIES = "html"
917    XML_ENTITIES = "xml"
918    ALL_ENTITIES = [HTML_ENTITIES, XML_ENTITIES]
919
920    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
921                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
922                 convertEntities=None, selfClosingTags=None):
923        """The Soup object is initialized as the 'root tag', and the
924        provided markup (which can be a string or a file-like object)
925        is fed into the underlying parser.
926
927        sgmllib will process most bad HTML, and the BeautifulSoup
928        class has some tricks for dealing with some HTML that kills
929        sgmllib, but Beautiful Soup can nonetheless choke or lose data
930        if your data uses self-closing tags or declarations
931        incorrectly.
932
933        By default, Beautiful Soup uses regexes to sanitize input,
934        avoiding the vast majority of these problems. If the problems
935        don't apply to you, pass in False for markupMassage, and
936        you'll get better performance.
937
938        The default parser massage techniques fix the two most common
939        instances of invalid HTML that choke sgmllib:
940
941         <br/> (No space between name of closing tag and tag close)
942         <! --Comment--> (Extraneous whitespace in declaration)
943
944        You can pass in a custom list of (RE object, replace method)
945        tuples to get Beautiful Soup to scrub your input the way you
946        want."""
947
948        self.parseOnlyThese = parseOnlyThese
949        self.fromEncoding = fromEncoding
950        self.smartQuotesTo = smartQuotesTo
951
952        if convertEntities:
953            # It doesn't make sense to convert encoded characters to
954            # entities even while you're converting entities to Unicode.
955            # Just convert it all to Unicode.
956            self.smartQuotesTo = None
957
958        if isList(convertEntities):
959            self.convertHTMLEntities = self.HTML_ENTITIES in convertEntities
960            self.convertXMLEntities = self.XML_ENTITIES in convertEntities
961        else:
962            self.convertHTMLEntities = self.HTML_ENTITIES == convertEntities
963            self.convertXMLEntities = self.XML_ENTITIES == convertEntities
964
965        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
966        SGMLParser.__init__(self)
967           
968        if hasattr(markup, 'read'):        # It's a file-type object.
969            markup = markup.read()
970        self.markup = markup
971        self.markupMassage = markupMassage
972        try:
973            self._feed()
974        except StopParsing:
975            pass
976        self.markup = None                 # The markup can now be GCed
977
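    # Constructor sketch (illustrative): markupMassage can be disabled or replaced
    # with a custom list of (regexp, substitution) pairs, and convertEntities
    # selects which entity sets are converted to Unicode characters:
    #
    #   BeautifulStoneSoup(xml_text, markupMassage=False)
    #   BeautifulStoneSoup(xml_text,
    #                      convertEntities=BeautifulStoneSoup.XML_ENTITIES)
    #   BeautifulSoup(html_text, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)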
978    def _feed(self, inDocumentEncoding=None):
979        # Convert the document to Unicode.
980        markup = self.markup
981        if isinstance(markup, unicode):
982            if not hasattr(self, 'originalEncoding'):
983                self.originalEncoding = None
984        else:
985            dammit = UnicodeDammit\
986                     (markup, [self.fromEncoding, inDocumentEncoding],
987                      smartQuotesTo=self.smartQuotesTo)
988            markup = dammit.unicode
989            self.originalEncoding = dammit.originalEncoding
990        if markup:
991            if self.markupMassage:
992                if not isList(self.markupMassage):
993                    self.markupMassage = self.MARKUP_MASSAGE           
994                for fix, m in self.markupMassage:
995                    markup = fix.sub(m, markup)
996        self.reset()
997
998        SGMLParser.feed(self, markup or "")
999        SGMLParser.close(self)
1000        # Close out any unfinished strings and close all the open tags.
1001        self.endData()
1002        while self.currentTag.name != self.ROOT_TAG_NAME:
1003            self.popTag()
1004
1005    def __getattr__(self, methodName):
1006        """This method routes method call requests to either the SGMLParser
1007        superclass or the Tag superclass, depending on the method name."""
1008        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1009
1010        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1011               or methodName.find('do_') == 0:
1012            return SGMLParser.__getattr__(self, methodName)
1013        elif methodName.find('__') != 0:
1014            return Tag.__getattr__(self, methodName)
1015        else:
1016            raise AttributeError
1017
1018    def isSelfClosingTag(self, name):
1019        """Returns true iff the given string is the name of a
1020        self-closing tag according to this parser."""
1021        return self.SELF_CLOSING_TAGS.has_key(name) \
1022               or self.instanceSelfClosingTags.has_key(name)
1023           
1024    def reset(self):
1025        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1026        self.hidden = 1
1027        SGMLParser.reset(self)
1028        self.currentData = []
1029        self.currentTag = None
1030        self.tagStack = []
1031        self.quoteStack = []
1032        self.pushTag(self)
1033   
1034    def popTag(self):
1035        tag = self.tagStack.pop()
1036        # Tags with just one string-owning child get the child as a
1037        # 'string' property, so that soup.tag.string is shorthand for
1038        # soup.tag.contents[0]
1039        if len(self.currentTag.contents) == 1 and \
1040           isinstance(self.currentTag.contents[0], NavigableString):
1041            self.currentTag.string = self.currentTag.contents[0]
1042
1043        #print "Pop", tag.name
1044        if self.tagStack:
1045            self.currentTag = self.tagStack[-1]
1046        return self.currentTag
1047
1048    def pushTag(self, tag):
1049        #print "Push", tag.name
1050        if self.currentTag:
1051            self.currentTag.append(tag)
1052        self.tagStack.append(tag)
1053        self.currentTag = self.tagStack[-1]
1054
1055    def endData(self, containerClass=NavigableString):
1056        if self.currentData:
1057            currentData = ''.join(self.currentData)
1058            if currentData.endswith('<') and self.convertHTMLEntities:
1059                currentData = currentData[:-1] + '&lt;'
1060            if not currentData.strip():
1061                if '\n' in currentData:
1062                    currentData = '\n'
1063                else:
1064                    currentData = ' '
1065            self.currentData = []
1066            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1067                   (not self.parseOnlyThese.text or \
1068                    not self.parseOnlyThese.search(currentData)):
1069                return
1070            o = containerClass(currentData)
1071            o.setup(self.currentTag, self.previous)
1072            if self.previous:
1073                self.previous.next = o
1074            self.previous = o
1075            self.currentTag.contents.append(o)
1076
1077
1078    def _popToTag(self, name, inclusivePop=True):
1079        """Pops the tag stack up to and including the most recent
1080        instance of the given tag. If inclusivePop is false, pops the tag
1081        stack up to but *not* including the most recent instance of
1082        the given tag."""
1083        #print "Popping to %s" % name
1084        if name == self.ROOT_TAG_NAME:
1085            return           
1086
1087        numPops = 0
1088        mostRecentTag = None
1089        for i in range(len(self.tagStack)-1, 0, -1):
1090            if name == self.tagStack[i].name:
1091                numPops = len(self.tagStack)-i
1092                break
1093        if not inclusivePop:
1094            numPops = numPops - 1
1095
1096        for i in range(0, numPops):
1097            mostRecentTag = self.popTag()
1098        return mostRecentTag   
1099
1100    def _smartPop(self, name):
1101
1102        """We need to pop up to the previous tag of this type, unless
1103        one of this tag's nesting reset triggers comes between this
1104        tag and the previous tag of this type, OR unless this tag is a
1105        generic nesting trigger and another generic nesting trigger
1106        comes between this tag and the previous tag of this type.
1107
1108        Examples:
1109         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1110         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1111         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1112         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1113
1114         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1115         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1116         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1117        """
1118
1119        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1120        isNestable = nestingResetTriggers != None
1121        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1122        popTo = None
1123        inclusive = True
1124        for i in range(len(self.tagStack)-1, 0, -1):
1125            p = self.tagStack[i]
1126            if (not p or p.name == name) and not isNestable:
1127                #Non-nestable tags get popped to the top or to their
1128                #last occurrence.
1129                popTo = name
1130                break
1131            if (nestingResetTriggers != None
1132                and p.name in nestingResetTriggers) \
1133                or (nestingResetTriggers == None and isResetNesting
1134                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1135               
1136                #If we encounter one of the nesting reset triggers
1137                #peculiar to this tag, or we encounter another tag
1138                #that causes nesting to reset, pop up to but not
1139                #including that tag.
1140                popTo = p.name
1141                inclusive = False
1142                break
1143            p = p.parent
1144        if popTo:
1145            self._popToTag(popTo, inclusive)
1146
1147    def unknown_starttag(self, name, attrs, selfClosing=0):
1148        #print "Start tag %s: %s" % (name, attrs)
1149        if self.quoteStack:
1150            #This is not a real tag.
1151            #print "<%s> is not real!" % name
1152            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1153            self.currentData.append('<%s%s>' % (name, attrs))
1154            return       
1155        self.endData()
1156
1157        if not self.isSelfClosingTag(name) and not selfClosing:
1158            self._smartPop(name)
1159
1160        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1161               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1162            return
1163
1164        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1165        if self.previous:
1166            self.previous.next = tag
1167        self.previous = tag
1168        self.pushTag(tag)
1169        if selfClosing or self.isSelfClosingTag(name):
1170            self.popTag()               
1171        if name in self.QUOTE_TAGS:
1172            #print "Beginning quote (%s)" % name
1173            self.quoteStack.append(name)
1174            self.literal = 1
1175        return tag
1176
1177    def unknown_endtag(self, name):
1178        #print "End tag %s" % name
1179        if self.quoteStack and self.quoteStack[-1] != name:
1180            #This is not a real end tag.
1181            #print "</%s> is not real!" % name
1182            self.currentData.append('</%s>' % name)
1183            return
1184        self.endData()
1185        self._popToTag(name)
1186        if self.quoteStack and self.quoteStack[-1] == name:
1187            self.quoteStack.pop()
1188            self.literal = (len(self.quoteStack) > 0)
1189
1190    def handle_data(self, data):
1191        if self.convertHTMLEntities:
1192            if data[0] == '&':
1193                data = self.BARE_AMPERSAND.sub("&amp;",data)
1194            else:
1195                data = data.replace('&','&amp;') \
1196                           .replace('<','&lt;') \
1197                           .replace('>','&gt;')
1198        self.currentData.append(data)
1199
1200    def _toStringSubclass(self, text, subclass):
1201        """Adds a certain piece of text to the tree as a NavigableString
1202        subclass."""
1203        self.endData()
1204        self.handle_data(text)
1205        self.endData(subclass)
1206
1207    def handle_pi(self, text):
1208        """Handle a processing instruction as a ProcessingInstruction
1209        object, possibly one with a %SOUP-ENCODING% slot into which an
1210        encoding will be plugged later."""
1211        if text[:3] == "xml":
1212            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1213        self._toStringSubclass(text, ProcessingInstruction)
1214
1215    def handle_comment(self, text):
1216        "Handle comments as Comment objects."
1217        self._toStringSubclass(text, Comment)
1218
1219    def handle_charref(self, ref):
1220        "Handle character references as data."
1221        if ref[0] == 'x':
1222            data = unichr(int(ref[1:],16))
1223        else:
1224            data = unichr(int(ref))
1225       
1226        if u'\x80' <= data <= u'\x9F':
1227            data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
1228        elif not self.convertHTMLEntities and not self.convertXMLEntities:
1229            data = '&#%s;' % ref
1230
1231        self.handle_data(data)
1232
1233    def handle_entityref(self, ref):
1234        """Handle entity references as data, possibly converting known
1235        HTML entity references to the corresponding Unicode
1236        characters."""
1237        replaceWithXMLEntity = self.convertXMLEntities and \
1238                               self.XML_ENTITIES_TO_CHARS.has_key(ref)
1239        if self.convertHTMLEntities or replaceWithXMLEntity:
1240            try:
1241                data = unichr(name2codepoint[ref])
1242            except KeyError:
1243                if replaceWithXMLEntity:
1244                    data = self.XML_ENTITIES_TO_CHARS.get(ref)
1245                else:
1246                    data="&amp;%s" % ref
1247        else:
1248            data = '&%s;' % ref
1249        self.handle_data(data)
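    # Entity-handling sketch (illustrative): with convertEntities set to
    # HTML_ENTITIES, '&eacute;' in the input becomes u'\xe9' in the tree;
    # without conversion the reference is passed through as '&eacute;'.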
1250       
1251    def handle_decl(self, data):
1252        "Handle DOCTYPEs and the like as Declaration objects."
1253        self._toStringSubclass(data, Declaration)
1254
1255    def parse_declaration(self, i):
1256        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1257        declaration as a CData object."""
1258        j = None
1259        if self.rawdata[i:i+9] == '<![CDATA[':
1260             k = self.rawdata.find(']]>', i)
1261             if k == -1:
1262                 k = len(self.rawdata)
1263             data = self.rawdata[i+9:k]
1264             j = k+3
1265             self._toStringSubclass(data, CData)
1266        else:
1267            try:
1268                j = SGMLParser.parse_declaration(self, i)
1269            except SGMLParseError:
1270                toHandle = self.rawdata[i:]
1271                self.handle_data(toHandle)
1272                j = i + len(toHandle)
1273        return j
1274
1275class BeautifulSoup(BeautifulStoneSoup):
1276
1277    """This parser knows the following facts about HTML:
1278
1279    * Some tags have no closing tag and should be interpreted as being
1280      closed as soon as they are encountered.
1281
1282    * The text inside some tags (e.g. 'script') may contain tags which
1283      are not really part of the document and which should be parsed
1284      as text, not tags. If you want to parse the text as tags, you can
1285      always fetch it and parse it explicitly.
1286
1287    * Tag nesting rules:
1288
1289      Most tags can't be nested at all. For instance, the occurrence of
1290      a <p> tag should implicitly close the previous <p> tag.
1291
1292       <p>Para1<p>Para2
1293        should be transformed into:
1294       <p>Para1</p><p>Para2
1295
1296      Some tags can be nested arbitrarily. For instance, the occurrence
1297      of a <blockquote> tag should _not_ implicitly close the previous
1298      <blockquote> tag.
1299
1300       Alice said: <blockquote>Bob said: <blockquote>Blah
1301        should NOT be transformed into:
1302       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1303
1304      Some tags can be nested, but the nesting is reset by the
1305      interposition of other tags. For instance, a <tr> tag should
1306      implicitly close the previous <tr> tag within the same <table>,
1307      but not close a <tr> tag in another table.
1308
1309       <table><tr>Blah<tr>Blah
1310        should be transformed into:
1311       <table><tr>Blah</tr><tr>Blah
1312        but,
1313       <tr>Blah<table><tr>Blah
1314        should NOT be transformed into
1315       <tr>Blah<table></tr><tr>Blah
1316
1317    Differing assumptions about tag nesting rules are a major source
1318    of problems with the BeautifulSoup class. If BeautifulSoup is not
1319    treating as nestable a tag your page author treats as nestable,
1320    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1321    BeautifulStoneSoup before writing your own subclass."""
1322
1323    def __init__(self, *args, **kwargs):
1324        if not kwargs.has_key('smartQuotesTo'):
1325            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1326        BeautifulStoneSoup.__init__(self, *args, **kwargs)
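    # Nesting sketch (illustrative): repeated <p> tags close each other, while
    # <blockquote> tags nest, as described above:
    #
    #   str(BeautifulSoup('<p>One<p>Two'))
    #   # '<p>One</p><p>Two</p>'
    #   str(BeautifulSoup('<blockquote>a<blockquote>b'))
    #   # '<blockquote>a<blockquote>b</blockquote></blockquote>'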
1327
1328    SELF_CLOSING_TAGS = buildTagMap(None,
1329                                    ['br' , 'hr', 'input', 'img', 'meta',
1330                                    'spacer', 'link', 'frame', 'base'])
1331
1332    QUOTE_TAGS = {'script': None}
1333   
1334    #According to the HTML standard, each of these inline tags can
1335    #contain another tag of the same type. Furthermore, it's common
1336    #to actually use these tags this way.
1337    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1338                            'center']
1339
1340    #According to the HTML standard, these block tags can contain
1341    #another tag of the same type. Furthermore, it's common
1342    #to actually use these tags this way.
1343    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1344
1345    #Lists can contain other lists, but there are restrictions.   
1346    NESTABLE_LIST_TAGS = { 'ol' : [],
1347                           'ul' : [],
1348                           'li' : ['ul', 'ol'],
1349                           'dl' : [],
1350                           'dd' : ['dl'],
1351                           'dt' : ['dl'] }
1352
1353    #Tables can contain other tables, but there are restrictions.   
1354    NESTABLE_TABLE_TAGS = {'table' : [],
1355                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1356                           'td' : ['tr'],
1357                           'th' : ['tr'],
1358                           'thead' : ['table'],
1359                           'tbody' : ['table'],
1360                           'tfoot' : ['table'],
1361                           }
1362
1363    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1364
1365    #If one of these tags is encountered, all tags up to the next tag of
1366    #this type are popped.
1367    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1368                                     NON_NESTABLE_BLOCK_TAGS,
1369                                     NESTABLE_LIST_TAGS,
1370                                     NESTABLE_TABLE_TAGS)
1371
1372    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1373                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1374
1375    # Used to detect the charset in a META tag; see start_meta
1376    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1377
1378    def start_meta(self, attrs):
1379        """Beautiful Soup can detect a charset included in a META tag,
1380        try to convert the document to that charset, and re-parse the
1381        document from the beginning."""
1382        httpEquiv = None
1383        contentType = None
1384        contentTypeIndex = None
1385        tagNeedsEncodingSubstitution = False
1386
1387        for i in range(0, len(attrs)):
1388            key, value = attrs[i]
1389            key = key.lower()
1390            if key == 'http-equiv':
1391                httpEquiv = value
1392            elif key == 'content':
1393                contentType = value
1394                contentTypeIndex = i
1395
1396        if httpEquiv and contentType: # It's an interesting meta tag.
1397            match = self.CHARSET_RE.search(contentType)
1398            if match:
1399                if getattr(self, 'declaredHTMLEncoding', None) or \
1400                       (self.originalEncoding == self.fromEncoding):
1401                    # This is our second pass through the document, or
1402                    # else an encoding was specified explicitly and it
1403                    # worked. Rewrite the meta tag.
1404                    newAttr = self.CHARSET_RE.sub(
1405                        lambda match: match.group(1) + "%SOUP-ENCODING%",
1406                        value)
1407                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1408                                               newAttr)
1409                    tagNeedsEncodingSubstitution = True
1410                else:
1411                    # This is our first pass through the document.
1412                    # Go through it again with the new information.
1413                    newCharset = match.group(3)
1414                    if newCharset and newCharset != self.originalEncoding:
1415                        self.declaredHTMLEncoding = newCharset
1416                        self._feed(self.declaredHTMLEncoding)
1417                        raise StopParsing
1418        tag = self.unknown_starttag("meta", attrs)
1419        if tag and tagNeedsEncodingSubstitution:
1420            tag.containsSubstitutions = True
1421
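# Illustrative sketch (not part of the original library): a tiny demo of the
# tag nesting rules documented in the BeautifulSoup docstring above. The
# helper name _demo_nesting_rules is hypothetical and is never called on import.
def _demo_nesting_rules():
    # A second <p> implicitly closes the first, so the two end up as siblings.
    soup = BeautifulSoup("<p>Para1<p>Para2")
    assert len(soup.findAll('p')) == 2
    assert soup.findAll('p')[0].find('p') is None
    # <blockquote> is treated as nestable, so the second one stays inside
    # the first.
    soup = BeautifulSoup("Alice said: <blockquote>Bob said: <blockquote>Blah")
    assert soup.blockquote.blockquote is not None
    return soup
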
1422class StopParsing(Exception):
1423    pass
1424   
1425class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1426
1427    """The BeautifulSoup class is oriented towards skipping over
1428    common HTML errors like unclosed tags. However, sometimes it makes
1429    errors of its own. For instance, consider this fragment:
1430
1431     <b>Foo<b>Bar</b></b>
1432
1433    This is perfectly valid (if bizarre) HTML. However, the
1434    BeautifulSoup class will implicitly close the first b tag when it
1435    encounters the second 'b'. It will think the author wrote
1436    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1437    there's no real-world reason to bold something that's already
1438    bold. When it encounters '</b></b>' it will close two more 'b'
1439    tags, for a grand total of three tags closed instead of two. This
1440    can throw off the rest of your document structure. The same is
1441    true of a number of other tags, listed below.
1442
1443    It's much more common for someone to forget to close a 'b' tag
1444    than to actually use nested 'b' tags, and the BeautifulSoup class
1445    handles the common case. This class handles the not-so-common
1446    case: where you can't believe someone wrote what they did, but
1447    it's valid HTML and BeautifulSoup screwed up by assuming it
1448    wouldn't be."""
1449
1450    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1451     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1452      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']
1454
1455    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1456
1457    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1458                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1459                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1460
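# Illustrative sketch (not part of the original library): the difference the
# class above makes for genuinely nested inline tags. The helper name
# _demo_nested_b is hypothetical and is never called on import.
def _demo_nested_b():
    markup = "<b>Foo<b>Bar</b></b>"
    # Plain BeautifulSoup closes the first <b> when it sees the second,
    # so the two tags come out as siblings.
    plain = BeautifulSoup(markup)
    assert plain.b.find('b') is None
    # ICantBelieveItsBeautifulSoup keeps the nesting the author wrote.
    nested = ICantBelieveItsBeautifulSoup(markup)
    assert nested.b.find('b') is not None
    return plain, nested
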
1461class MinimalSoup(BeautifulSoup):
1462    """The MinimalSoup class is for parsing HTML that contains
1463    pathologically bad markup. It makes no assumptions about tag
1464    nesting, but it does know which tags are self-closing, that
1465    <script> tags contain Javascript and should not be parsed, that
1466    META tags may contain encoding information, and so on.
1467
1468    This also makes it better for subclassing than BeautifulStoneSoup
1469    or BeautifulSoup."""
1470   
1471    RESET_NESTING_TAGS = buildTagMap('noscript')
1472    NESTABLE_TAGS = {}
1473
1474class BeautifulSOAP(BeautifulStoneSoup):
1475    """This class will push a tag with only a single string child into
1476    the tag's parent as an attribute. The attribute's name is the tag
1477    name, and the value is the string child. An example should give
1478    the flavor of the change:
1479
1480    <foo><bar>baz</bar></foo>
1481     =>
1482    <foo bar="baz"><bar>baz</bar></foo>
1483
1484    You can then access fooTag['bar'] instead of fooTag.barTag.string.
1485
1486    This is, of course, useful for scraping structures that tend to
1487    use subelements instead of attributes, such as SOAP messages. Note
1488    that it modifies its input, so don't print the modified version
1489    out.
1490
1491    I'm not sure how many people really want to use this class; let me
1492    know if you do. Mainly I like the name."""
1493
1494    def popTag(self):
1495        if len(self.tagStack) > 1:
1496            tag = self.tagStack[-1]
1497            parent = self.tagStack[-2]
1498            parent._getAttrMap()
1499            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1500                isinstance(tag.contents[0], NavigableString) and
1501                not parent.attrMap.has_key(tag.name)):
1502                parent[tag.name] = tag.contents[0]
1503        BeautifulStoneSoup.popTag(self)
1504
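# Illustrative sketch (not part of the original library), following the
# example in the BeautifulSOAP docstring above. The helper name _demo_soap
# is hypothetical and is never called on import.
def _demo_soap():
    soup = BeautifulSOAP("<foo><bar>baz</bar></foo>")
    # The single-string child <bar> is also copied onto <foo> as an attribute.
    assert soup.foo['bar'] == u'baz'
    assert soup.foo.bar.string == u'baz'
    return soup
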
1505#Enterprise class names! It has come to our attention that some people
1506#think the names of the Beautiful Soup parser classes are too silly
1507#and "unprofessional" for use in enterprise screen-scraping. We feel
1508#your pain! For such-minded folk, the Beautiful Soup Consortium And
1509#All-Night Kosher Bakery recommends renaming this file to
1510#"RobustParser.py" (or, in cases of extreme enterprisitude,
1511#"RobustParserBeanInterface.class") and using the following
1512#enterprise-friendly class aliases:
1513class RobustXMLParser(BeautifulStoneSoup):
1514    pass
1515class RobustHTMLParser(BeautifulSoup):
1516    pass
1517class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1518    pass
1519class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1520    pass
1521class SimplifyingSOAPParser(BeautifulSOAP):
1522    pass
1523
1524######################################################
1525#
1526# Bonus library: Unicode, Dammit
1527#
1528# This class forces XML data into a standard format (usually UTF-8
1529# or Unicode).  It is heavily based on code from Mark Pilgrim's
1530# Universal Feed Parser. It does not rewrite the XML or HTML to
1531# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1532# (XML) and BeautifulSoup.start_meta (HTML).
1533
1534# Autodetects character encodings.
1535# Download from http://chardet.feedparser.org/
1536try:
1537    import chardet
1538#    import chardet.constants
1539#    chardet.constants._debug = 1
1540except ImportError:
1541    chardet = None
1542chardet = None  # explicitly disabled, overriding the import above
1543
1544# cjkcodecs and iconv_codec make Python know about more character encodings.
1545# Both are available from http://cjkpython.i18n.org/
1546# They're built in if you use Python 2.4.
1547try:
1548    import cjkcodecs.aliases
1549except ImportError:
1550    pass
1551try:
1552    import iconv_codec
1553except ImportError:
1554    pass
1555
1556class UnicodeDammit:
1557    """A class for detecting the encoding of a *ML document and
1558    converting it to a Unicode string. If the source encoding is
1559    windows-1252, can replace MS smart quotes with their HTML or XML
1560    equivalents."""
1561
1562    # This dictionary maps commonly seen values for "charset" in HTML
1563    # meta tags to the corresponding Python codec names. It only covers
1564    # values that aren't in Python's aliases and can't be determined
1565    # by the heuristics in find_codec.
1566    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1567                        "x-sjis" : "shift-jis" }
1568   
1569    def __init__(self, markup, overrideEncodings=[],
1570                 smartQuotesTo='xml'):
1571        self.markup, documentEncoding, sniffedEncoding = \
1572                     self._detectEncoding(markup)
1573        self.smartQuotesTo = smartQuotesTo
1574        self.triedEncodings = []
1575        if isinstance(markup, unicode):
1576            # Already Unicode: nothing to convert. (A bare "return markup"
            # here would make __init__ raise a TypeError.)
            self.unicode = markup
            self.originalEncoding = None
            return
1577
1578        u = None
1579        for proposedEncoding in overrideEncodings:
1580            u = self._convertFrom(proposedEncoding)
1581            if u: break
1582        if not u:
1583            for proposedEncoding in (documentEncoding, sniffedEncoding):
1584                u = self._convertFrom(proposedEncoding)
1585                if u: break
1586               
1587        # If no luck and we have an auto-detection library, try that:
1588        if not u and chardet and not isinstance(self.markup, unicode):
1589            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1590
1591        # As a last resort, try utf-8 and windows-1252:
1592        if not u:
1593            for proposed_encoding in ("utf-8", "windows-1252"):
1594                u = self._convertFrom(proposed_encoding)
1595                if u: break
1596        self.unicode = u
1597        if not u: self.originalEncoding = None
1598
1599    def subMSChar(orig, smartQuotesTo):
1600        """Changes a MS smart quote character to an XML or HTML
1601        entity."""
1602        sub = UnicodeDammit.MS_CHARS.get(orig)
1603        if type(sub) == types.TupleType:
1604            if smartQuotesTo == 'xml':
1605                sub = '&#x%s;' % sub[1]
1606            elif smartQuotesTo == 'html':
1607                sub = '&%s;' % sub[0]
1608            else:
1609                sub = unichr(int(sub[1],16))
1610        return sub           
1611    subMSChar = staticmethod(subMSChar)
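    # For example, subMSChar('\x93', 'xml') gives '&#x201C;' and
    # subMSChar('\x93', 'html') gives '&ldquo;' (see MS_CHARS below).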
1612
1613    def _convertFrom(self, proposed):       
1614        proposed = self.find_codec(proposed)
1615        if not proposed or proposed in self.triedEncodings:
1616            return None
1617        self.triedEncodings.append(proposed)
1618        markup = self.markup
1619
1620        # Convert smart quotes to HTML if coming from an encoding
1621        # that might have them.
1622        if self.smartQuotesTo and proposed in ("windows-1252",
1623                                               "ISO-8859-1",
1624                                               "ISO-8859-2"):
1625            markup = re.compile("([\x80-\x9f])").sub(
1626                lambda x: self.subMSChar(x.group(1), self.smartQuotesTo),
1627                markup)
1628
1629        try:
1630            # print "Trying to convert document to %s" % proposed
1631            u = self._toUnicode(markup, proposed)
1632            self.markup = u       
1633            self.originalEncoding = proposed
1634        except Exception, e:
1635            # print "That didn't work!"
1636            # print e
1637            return None       
1638        #print "Correct encoding: %s" % proposed
1639        return self.markup
1640
1641    def _toUnicode(self, data, encoding):
1642        '''Given a string and its encoding, decodes the string into Unicode.
1643        %encoding is a string recognized by encodings.aliases'''
1644
1645        # strip Byte Order Mark (if present)
1646        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1647               and (data[2:4] != '\x00\x00'):
1648            encoding = 'utf-16be'
1649            data = data[2:]
1650        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1651                 and (data[2:4] != '\x00\x00'):
1652            encoding = 'utf-16le'
1653            data = data[2:]
1654        elif data[:3] == '\xef\xbb\xbf':
1655            encoding = 'utf-8'
1656            data = data[3:]
1657        elif data[:4] == '\x00\x00\xfe\xff':
1658            encoding = 'utf-32be'
1659            data = data[4:]
1660        elif data[:4] == '\xff\xfe\x00\x00':
1661            encoding = 'utf-32le'
1662            data = data[4:]
1663        newdata = unicode(data, encoding)
1664        return newdata
1665   
1666    def _detectEncoding(self, xml_data):
1667        """Given a document, tries to detect its XML encoding."""
1668        xml_encoding = sniffed_xml_encoding = None
1669        try:
1670            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1671                # EBCDIC
1672                xml_data = self._ebcdic_to_ascii(xml_data)
1673            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1674                # UTF-16BE
1675                sniffed_xml_encoding = 'utf-16be'
1676                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1677            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1678                     and (xml_data[2:4] != '\x00\x00'):
1679                # UTF-16BE with BOM
1680                sniffed_xml_encoding = 'utf-16be'
1681                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1682            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1683                # UTF-16LE
1684                sniffed_xml_encoding = 'utf-16le'
1685                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1686            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1687                     (xml_data[2:4] != '\x00\x00'):
1688                # UTF-16LE with BOM
1689                sniffed_xml_encoding = 'utf-16le'
1690                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1691            elif xml_data[:4] == '\x00\x00\x00\x3c':
1692                # UTF-32BE
1693                sniffed_xml_encoding = 'utf-32be'
1694                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1695            elif xml_data[:4] == '\x3c\x00\x00\x00':
1696                # UTF-32LE
1697                sniffed_xml_encoding = 'utf-32le'
1698                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1699            elif xml_data[:4] == '\x00\x00\xfe\xff':
1700                # UTF-32BE with BOM
1701                sniffed_xml_encoding = 'utf-32be'
1702                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1703            elif xml_data[:4] == '\xff\xfe\x00\x00':
1704                # UTF-32LE with BOM
1705                sniffed_xml_encoding = 'utf-32le'
1706                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1707            elif xml_data[:3] == '\xef\xbb\xbf':
1708                # UTF-8 with BOM
1709                sniffed_xml_encoding = 'utf-8'
1710                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1711            else:
1712                sniffed_xml_encoding = 'ascii'
1714            xml_encoding_match = re.compile \
1715                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1716                                 .match(xml_data)
1717        except:
1718            xml_encoding_match = None
1719        if xml_encoding_match:
1720            xml_encoding = xml_encoding_match.groups()[0].lower()
1721            if sniffed_xml_encoding and \
1722               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1723                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1724                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1725                                 'utf16', 'u16')):
1726                xml_encoding = sniffed_xml_encoding
1727        return xml_data, xml_encoding, sniffed_xml_encoding
1728
1729
1730    def find_codec(self, charset):
1731        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1732               or (charset and self._codec(charset.replace("-", ""))) \
1733               or (charset and self._codec(charset.replace("-", "_"))) \
1734               or charset
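    # For example, find_codec("macintosh") should return "mac-roman" via
    # CHARSET_ALIASES, find_codec("utf-8") comes back unchanged, and a
    # charset Python doesn't recognize falls through as-is.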
1735
1736    def _codec(self, charset):
1737        if not charset: return charset
1738        codec = None
1739        try:
1740            codecs.lookup(charset)
1741            codec = charset
1742        except LookupError:
1743            pass
1744        return codec
1745
1746    EBCDIC_TO_ASCII_MAP = None
1747    def _ebcdic_to_ascii(self, s):
1748        c = self.__class__
1749        if not c.EBCDIC_TO_ASCII_MAP:
1750            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1751                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1752                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1753                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1754                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1755                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1756                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1757                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1758                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1759                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1760                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1761                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1762                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1763                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1764                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1765                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1766                    250,251,252,253,254,255)
1767            import string
1768            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1769            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1770        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1771
1772    MS_CHARS = { '\x80' : ('euro', '20AC'),
1773                 '\x81' : ' ',
1774                 '\x82' : ('sbquo', '201A'),
1775                 '\x83' : ('fnof', '192'),
1776                 '\x84' : ('bdquo', '201E'),
1777                 '\x85' : ('hellip', '2026'),
1778                 '\x86' : ('dagger', '2020'),
1779                 '\x87' : ('Dagger', '2021'),
1780                 '\x88' : ('circ', '2C6'),
1781                 '\x89' : ('permil', '2030'),
1782                 '\x8A' : ('Scaron', '160'),
1783                 '\x8B' : ('lsaquo', '2039'),
1784                 '\x8C' : ('OElig', '152'),
1785                 '\x8D' : '?',
1786                 '\x8E' : ('#x17D', '17D'),
1787                 '\x8F' : '?',
1788                 '\x90' : '?',
1789                 '\x91' : ('lsquo', '2018'),
1790                 '\x92' : ('rsquo', '2019'),
1791                 '\x93' : ('ldquo', '201C'),
1792                 '\x94' : ('rdquo', '201D'),
1793                 '\x95' : ('bull', '2022'),
1794                 '\x96' : ('ndash', '2013'),
1795                 '\x97' : ('mdash', '2014'),
1796                 '\x98' : ('tilde', '2DC'),
1797                 '\x99' : ('trade', '2122'),
1798                 '\x9a' : ('scaron', '161'),
1799                 '\x9b' : ('rsaquo', '203A'),
1800                 '\x9c' : ('oelig', '153'),
1801                 '\x9d' : '?',
1802                 '\x9e' : ('#x17E', '17E'),
1803                 '\x9f' : ('Yuml', '178'),}
1804
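# Illustrative sketch (not part of the original library): typical UnicodeDammit
# behaviour for a declared encoding and for windows-1252 smart quotes. The
# helper name _demo_unicode_dammit is hypothetical and is never called on import.
def _demo_unicode_dammit():
    # An encoding declared in the XML prolog is detected and used.
    declared = UnicodeDammit(
        '<?xml version="1.0" encoding="iso-8859-1"?><a>caf\xe9</a>')
    assert declared.originalEncoding == 'iso-8859-1'
    assert u'caf\xe9' in declared.unicode
    # MS smart quotes ('\x93'/'\x94') are turned into HTML entities when the
    # document turns out to be windows-1252.
    quoted = UnicodeDammit('\x93hello\x94', smartQuotesTo='html')
    assert quoted.unicode == u'&ldquo;hello&rdquo;'
    assert quoted.originalEncoding == 'windows-1252'
    return declared, quoted
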
1805#######################################################################
1806
1807
1808#By default, act as an HTML pretty-printer.
1809if __name__ == '__main__':
1810    import sys
1811    soup = BeautifulSoup(sys.stdin.read())
1812    print soup.prettify()