root/galaxy-central/eggs/twill-0.9-py2.6.egg/twill/other_packages/_mechanize_dist/_beautifulsoup.py @ 3

リビジョン 3, 39.9 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4v2.1.1
5http://www.crummy.com/software/BeautifulSoup/
6
7Beautiful Soup parses arbitrarily invalid XML- or HTML-like substance
8into a tree representation. It provides methods and Pythonic idioms
9that make it easy to search and modify the tree.
10
11A well-formed XML/HTML document will yield a well-formed data
12structure. An ill-formed XML/HTML document will yield a
13correspondingly ill-formed data structure. If your document is only
14locally well-formed, you can use this library to find and process the
15well-formed part of it. The BeautifulSoup class has heuristics for
16obtaining a sensible parse tree in the face of common HTML errors.
17
18Beautiful Soup has no external dependencies. It works with Python 2.2
19and up.
20
21Beautiful Soup defines classes for four different parsing strategies:
22
23 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
24   language that kind of looks like XML.
25
26 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
27   or invalid.
28
29 * ICantBelieveItsBeautifulSoup, for parsing valid but bizarre HTML
30   that trips up BeautifulSoup.
31
32 * BeautifulSOAP, for making it easier to parse XML documents that use
33   lots of subelements containing a single string, where you'd prefer
34   they put that string into an attribute (such as SOAP messages).
35
36You can subclass BeautifulStoneSoup or BeautifulSoup to create a
37parsing strategy specific to an XML schema or a particular bizarre
38HTML document. Typically your subclass would just override
39SELF_CLOSING_TAGS and/or NESTABLE_TAGS.
40""" #"
41from __future__ import generators
42
43__author__ = "Leonard Richardson (leonardr@segfault.org)"
44__version__ = "2.1.1"
45__date__ = "$Date: 2004/10/18 00:14:20 $"
46__copyright__ = "Copyright (c) 2004-2005 Leonard Richardson"
47__license__ = "PSF"
48
49from sgmllib import SGMLParser, SGMLParseError
50import types
51import re
52import sgmllib
53
54#This code makes Beautiful Soup able to parse XML with namespaces
55sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
56
class NullType(object):

    """A None-like type whose single instance, 'Null', absorbs any
    call, attribute access or item access and hands itself back.

    Example:
    >>> Null("send", "a", "message")("and one more",
    ...      "and what you get still") is Null
    True
    """

    def __new__(cls):
        # Constructing NullType never creates a second instance.
        return Null

    def __call__(self, *args, **kwargs):
        return Null

    # A __getstate__ returning Null was deliberately left disabled here.

    def __getattr__(self, attr):
        return Null

    def __getitem__(self, item):
        return Null

    def __setattr__(self, attr, value):
        # Attribute assignment is silently discarded.
        pass

    def __setitem__(self, item, value):
        # Item assignment is silently discarded.
        pass

    def __len__(self):
        # Zero length is also what makes Null falsy.
        return 0

    # FIXME: is this a python bug? otherwise ``for x in Null: pass``
    #        never terminates...
    def __iter__(self):
        return iter([])

    def __contains__(self, item):
        return False

    def __repr__(self):
        return "Null"

# The singleton is built with object.__new__ because NullType.__new__
# itself just returns Null.
Null = object.__new__(NullType)
82
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element carries five links, all initialised by setup():
    'parent', 'previous' and 'next' (document order), and
    'previousSibling' and 'nextSibling' (same parent). A missing link
    is the falsy 'Null' singleton rather than None.
    """

    def setup(self, parent=Null, previous=Null):
        """Sets up the initial relations between this element and
        other elements.

        Must be called before this element is appended to
        parent.contents: the parent's current last child is taken as
        this element's previous sibling."""
        self.parent = parent
        self.previous = previous
        self.next = Null
        self.previousSibling = Null
        self.nextSibling = Null
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def findNext(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document, or Null."""
        return self._first(self.fetchNext, name, attrs, text)
    firstNext = findNext

    def fetchNext(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextGenerator)

    def findNextSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document,
        or Null."""
        return self._first(self.fetchNextSiblings, name, attrs, text)
    firstNextSibling = findNextSibling

    def fetchNextSiblings(self, name=None, attrs={}, text=None, limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.nextSiblingGenerator)

    def findPrevious(self, name=None, attrs={}, text=None):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document, or Null."""
        return self._first(self.fetchPrevious, name, attrs, text)

    def fetchPrevious(self, name=None, attrs={}, text=None, limit=None):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._fetch(name, attrs, text, limit, self.previousGenerator)
    firstPrevious = findPrevious

    def findPreviousSibling(self, name=None, attrs={}, text=None):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document,
        or Null."""
        return self._first(self.fetchPreviousSiblings, name, attrs, text)
    firstPreviousSibling = findPreviousSibling

    def fetchPreviousSiblings(self, name=None, attrs={}, text=None,
                              limit=None):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._fetch(name, attrs, text, limit,
                           self.previousSiblingGenerator)

    def findParent(self, name=None, attrs={}):
        """Returns the closest parent of this Tag that matches the given
        criteria, or Null."""
        r = Null
        l = self.fetchParents(name, attrs, 1)
        if l:
            r = l[0]
        return r
    firstParent = findParent

    def fetchParents(self, name=None, attrs={}, limit=None):
        """Returns the parents of this Tag that match the given
        criteria."""
        return self._fetch(name, attrs, None, limit, self.parentGenerator)

    #These methods do the real heavy lifting.

    def _first(self, method, name, attrs, text):
        """Calls one of the fetch* methods with a limit of 1 and returns
        the single result, or Null if nothing matched."""
        r = Null
        l = method(name, attrs, text, 1)
        if l:
            r = l[0]
        return r

    def _fetch(self, name, attrs, text, limit, generator):
        """Iterates over a generator looking for things that match.

        'generator' is a zero-argument callable returning the elements
        to examine. Tags are matched on 'name' and 'attrs' (only when
        no 'text' criterion is given); everything else is matched on
        'text'. Returns a list of at most 'limit' matches (no cap if
        'limit' is falsy)."""
        if not hasattr(attrs, 'items'):
            #A non-map 'attrs' is shorthand for a match on the CSS class.
            attrs = {'class' : attrs}

        results = []
        for i in generator():
            found = None
            if isinstance(i, Tag):
                if not text:
                    if not name or self._matches(i, name):
                        match = True
                        for attr, matchAgainst in attrs.items():
                            check = i.get(attr)
                            if not self._matches(check, matchAgainst):
                                match = False
                                break
                        if match:
                            found = i
            elif text:
                if self._matches(i, text):
                    found = i
            if found:
                results.append(found)
                if limit and len(results) >= limit:
                    break
        return results

    #Generators that can be used to navigate starting from both
    #NavigableTexts and Tags. Each one also yields the terminating
    #falsy link (normally Null) as its final item; _fetch never
    #treats that item as a match.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    def _matches(self, chunk, howToMatch):
        """Returns a true value iff 'chunk' satisfies 'howToMatch'.

        'chunk' may be a Tag, a piece of text, an attribute value or a
        list of elements. 'howToMatch' may be a callable, a regular
        expression object, a list, a map or a plain string."""
        #print 'looking for %s in %s' % (howToMatch, chunk)
        #
        # If given a list of items, return true if the list contains a
        # text element that matches.
        if isList(chunk) and not isinstance(chunk, Tag):
            for tag in chunk:
                if isinstance(tag, NavigableText) and self._matches(tag, howToMatch):
                    return True
            return False
        if callable(howToMatch):
            return howToMatch(chunk)
        if isinstance(chunk, Tag):
            #Custom match methods take the tag as an argument, but all other
            #ways of matching match the tag name as a string
            chunk = chunk.name
        #Now we know that chunk is a string
        if not isinstance(chunk, basestring):
            # NOTE(review): a missing attribute reaches here as None
            # (Tag.get's default, passed in by _fetch) and becomes the
            # string 'None', so it can match the literal 'None' or a
            # regexp such as 'N'. Left unchanged; confirm whether
            # callers depend on it.
            chunk = str(chunk)
        if hasattr(howToMatch, 'match'):
            # It's a regexp object.
            return howToMatch.search(chunk)
        if isList(howToMatch):
            return chunk in howToMatch
        if hasattr(howToMatch, 'items'):
            return howToMatch.has_key(chunk)
        #It's just a string. Only stringify non-strings: calling str()
        #on a unicode object containing non-ASCII characters raises
        #UnicodeEncodeError.
        if not isinstance(howToMatch, basestring):
            howToMatch = str(howToMatch)
        return howToMatch == chunk
262
class NavigableText(PageElement):

    """A PageElement that is a piece of text rather than a tag."""

    def __getattr__(self, attr):
        """For backwards compatibility, text.string gives you text.

        Any other attribute that normal lookup failed to find raises
        AttributeError."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))
        return self
271       
class NavigableString(str, NavigableText):
    """A byte string that also carries PageElement navigation links."""
274
class NavigableUnicodeString(unicode, NavigableText):
    """A Unicode string that also carries PageElement navigation links."""
277
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    'attrs' is the list of (key, value) pairs in document order and
    may contain the same key more than once. 'attrMap' is a dict view
    of it, built lazily by _getAttrMap(). 'contents' is the list of
    child elements. A 'hidden' tag renders only its contents.
    """

    def __init__(self, name, attrs=None, parent=Null, previous=Null):
        "Basic constructor."
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests membership in the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag. Every existing occurrence of 'key' in the attribute list
        is overwritten; if there is none, the pair is appended."""
        attrMap = self._getAttrMap()
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        attrMap[key] = value

    def __delitem__(self, key):
        """Deleting tag[key] deletes all 'key' attributes for the tag.

        Nothing is raised if the tag has no such attribute."""
        #Bad HTML can define the same attribute multiple times, so
        #every occurrence has to go. Build the filtered list first:
        #removing items from a list while iterating over it skips the
        #item that follows each removal.
        remaining = [item for item in self.attrs if item[0] != key]
        if len(remaining) != len(self.attrs):
            #Slice-assign so that the list object itself is preserved.
            self.attrs[:] = remaining
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        fetch() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.fetch(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first 'foo' tag found
        beneath this tag (or Null).

        Names starting with a double underscore raise AttributeError,
        so that protocol lookups such as __getstate__ or __coerce__
        see a missing attribute instead of receiving None."""
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.first(tag[:-3])
        elif not tag.startswith('__'):
            return self.first(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents'):
            return False
        if self.name != other.name or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self):
        """Renders this tag as a string."""
        return str(self)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(1)

    def __str__(self, needUnicode=None, showStructureIndent=None):
        """Returns a string or Unicode representation of this tag and
        its contents.

        needUnicode: a true value forces a Unicode result, False (or
        0) forces a byte string, None leaves the natural type alone.
        showStructureIndent: None for plain output, otherwise the
        indentation level used for pretty-printing.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                attrs.append('%s="%s"' % (key, val))
        close = ''
        closeTag = ''
        if self.isSelfClosing():
            close = ' /'
        else:
            closeTag = '</%s>' % self.name
        indentIncrement = None
        if showStructureIndent is not None:
            indentIncrement = showStructureIndent
            if not self.hidden:
                #Children of a visible tag are indented one level deeper.
                indentIncrement += 1
        contents = self.renderContents(indentIncrement, needUnicode=needUnicode)
        #'space' must exist whenever pretty-printing is on, including
        #indent level 0: the closing-tag branch below tests
        #'is not None', not truthiness.
        space = ''
        if showStructureIndent is not None:
            space = '\n%s' % (' ' * showStructureIndent)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if showStructureIndent:
                s.append(space)
            s.append('<%s%s%s>' % (self.name, attributeString, close))
            s.append(contents)
            if closeTag and showStructureIndent is not None:
                s.append(space)
            s.append(closeTag)
            s = ''.join(s)
        isUnicode = type(s) == types.UnicodeType
        if needUnicode and not isUnicode:
            s = unicode(s)
        elif isUnicode and needUnicode==False:
            s = str(s)
        return s

    def prettify(self, needUnicode=None):
        "Renders this tag with structural indentation turned on."
        return self.__str__(needUnicode, showStructureIndent=True)

    def renderContents(self, showStructureIndent=None, needUnicode=None):
        """Renders the contents of this tag as a (possibly Unicode)
        string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableUnicodeString) or type(c) == types.UnicodeType:
                text = unicode(c)
            elif isinstance(c, Tag):
                s.append(c.__str__(needUnicode, showStructureIndent))
            elif needUnicode:
                text = unicode(c)
            else:
                text = str(c)
            if text:
                if showStructureIndent is not None:
                    #When pretty-printing, drop one trailing newline
                    #from each piece of text.
                    if text[-1] == '\n':
                        text = text[:-1]
                s.append(text)
        return ''.join(s)

    #Soup methods

    def firstText(self, text, recursive=True):
        """Convenience method to retrieve the first piece of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.first(recursive=recursive, text=text)

    def fetchText(self, text, recursive=True, limit=None):
        """Convenience method to retrieve all pieces of text matching the
        given criteria. 'text' can be a string, a regular expression object,
        a callable that takes a string and returns whether or not the
        string 'matches', etc."""
        return self.fetch(recursive=recursive, text=text, limit=limit)

    def first(self, name=None, attrs={}, recursive=True, text=None):
        """Return only the first child of this
        Tag matching the given criteria, or Null if there is none."""
        r = Null
        l = self.fetch(name, attrs, recursive, text, 1)
        if l:
            r = l[0]
        return r
    findChild = first

    def fetch(self, name=None, attrs={}, recursive=True, text=None,
              limit=None):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive false only direct children are examined."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._fetch(name, attrs, text, limit, generator)
    fetchChildren = fetch

    #Utility methods

    def isSelfClosing(self):
        """Returns true iff this is a self-closing tag as defined in the HTML
        standard.

        TODO: This is specific to BeautifulSoup and its subclasses, but it's
        used by __str__"""
        return self.name in BeautifulSoup.SELF_CLOSING_TAGS

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized, and returns it.

        An empty map is rebuilt from self.attrs on every call."""
        #Look in the instance dict directly. getattr(self, 'attrMap')
        #on a tag without a map falls into __getattr__, which runs a
        #recursive search for a child tag named 'attrMap'.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order
        (depth first), using an explicit stack instead of recursion."""
        #Each stack entry is (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        #Remember where to resume in this tag, then
                        #descend into the child first.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
540
541
def isList(l):
    """Tells whether 'l' is listlike, on any 2.x version of Python:
    anything with an __iter__ attribute, or a plain list or tuple."""
    if hasattr(l, '__iter__'):
        return True
    return type(l) in (types.ListType, types.TupleType)
547
def buildTagMap(default, *args):
    """Merges any number of maps, lists and scalars into one map.

    Maps are copied in as they are; every list item and every scalar
    becomes a key mapped to 'default'. Later arguments override
    earlier ones. Used to build the SELF_CLOSING_TAGS and
    NESTABLE_TAGS maps out of lists and partial maps."""
    tagMap = {}
    for part in args:
        if hasattr(part, 'items'):
            #A map: merge its pairs.
            for key, value in part.items():
                tagMap[key] = value
            continue
        if isList(part):
            #A list: each item becomes a key.
            keys = part
        else:
            #A scalar: it is the only key.
            keys = [part]
        for key in keys:
            tagMap[key] = default
    return tagMap
566
567class BeautifulStoneSoup(Tag, SGMLParser):
568
569    """This class contains the basic parser and fetch code. It defines
570    a parser that knows nothing about tag behavior except for the
571    following:
572   
573      You can't close a tag without closing all the tags it encloses.
574      That is, "<foo><bar></foo>" actually means
575      "<foo><bar></bar></foo>".
576
577    [Another possible explanation is "<foo><bar /></foo>", but since
578    this class defines no SELF_CLOSING_TAGS, it will never use that
579    explanation.]
580
581    This class is useful for parsing XML or made-up markup languages,
582    or when BeautifulSoup makes an assumption counter to what you were
583    expecting."""
584
585    SELF_CLOSING_TAGS = {}
586    NESTABLE_TAGS = {}
587    RESET_NESTING_TAGS = {}
588    QUOTE_TAGS = {}
589
590    #As a public service we will by default silently replace MS smart quotes
591    #and similar characters with their HTML or ASCII equivalents.
592    MS_CHARS = { '\x80' : '&euro;',
593                 '\x81' : ' ',
594                 '\x82' : '&sbquo;',
595                 '\x83' : '&fnof;',
596                 '\x84' : '&bdquo;',
597                 '\x85' : '&hellip;',
598                 '\x86' : '&dagger;',
599                 '\x87' : '&Dagger;',
600                 '\x88' : '&caret;',
601                 '\x89' : '%',
602                 '\x8A' : '&Scaron;',
603                 '\x8B' : '&lt;',
604                 '\x8C' : '&OElig;',
605                 '\x8D' : '?',
606                 '\x8E' : 'Z',
607                 '\x8F' : '?',
608                 '\x90' : '?',
609                 '\x91' : '&lsquo;',
610                 '\x92' : '&rsquo;',
611                 '\x93' : '&ldquo;',
612                 '\x94' : '&rdquo;',
613                 '\x95' : '&bull;',
614                 '\x96' : '&ndash;',
615                 '\x97' : '&mdash;',
616                 '\x98' : '&tilde;',
617                 '\x99' : '&trade;',
618                 '\x9a' : '&scaron;',
619                 '\x9b' : '&gt;',
620                 '\x9c' : '&oelig;',
621                 '\x9d' : '?',
622                 '\x9e' : 'z',
623                 '\x9f' : '&Yuml;',}
624
625    PARSER_MASSAGE = [(re.compile('(<[^<>]*)/>'),
626                       lambda(x):x.group(1) + ' />'),
627                      (re.compile('<!\s+([^<>]*)>'),
628                       lambda(x):'<!' + x.group(1) + '>'),
629                      (re.compile("([\x80-\x9f])"),
630                       lambda(x): BeautifulStoneSoup.MS_CHARS.get(x.group(1)))
631                      ]
632
633    ROOT_TAG_NAME = '[document]'
634
635    def __init__(self, text=None, avoidParserProblems=True,
636                 initialTextIsEverything=True):
637        """Initialize this as the 'root tag' and feed in any text to
638        the parser.
639
640        NOTE about avoidParserProblems: sgmllib will process most bad
641        HTML, and BeautifulSoup has tricks for dealing with some HTML
642        that kills sgmllib, but Beautiful Soup can nonetheless choke
643        or lose data if your data uses self-closing tags or
644        declarations incorrectly. By default, Beautiful Soup sanitizes
645        its input to avoid the vast majority of these problems. The
646        problems are relatively rare, even in bad HTML, so feel free
647        to pass in False to avoidParserProblems if they don't apply to
648        you, and you'll get better performance. The only reason I have
649        this turned on by default is so I don't get so many tech
650        support questions.
651
652        The two most common instances of invalid HTML that will choke
653        sgmllib are fixed by the default parser massage techniques:
654
655         <br/> (No space between name of closing tag and tag close)
656         <! --Comment--> (Extraneous whitespace in declaration)
657
658        You can pass in a custom list of (RE object, replace method)
659        tuples to get Beautiful Soup to scrub your input the way you
660        want."""
661        Tag.__init__(self, self.ROOT_TAG_NAME)
662        if avoidParserProblems \
663           and not isList(avoidParserProblems):
664            avoidParserProblems = self.PARSER_MASSAGE           
665        self.avoidParserProblems = avoidParserProblems
666        SGMLParser.__init__(self)
667        self.quoteStack = []
668        self.hidden = 1
669        self.reset()
670        if hasattr(text, 'read'):
671            #It's a file-type object.
672            text = text.read()
673        if text:
674            self.feed(text)
675        if initialTextIsEverything:
676            self.done()
677
678    def __getattr__(self, methodName):
679        """This method routes method call requests to either the SGMLParser
680        superclass or the Tag superclass, depending on the method name."""
681        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
682               or methodName.find('do_') == 0:
683            return SGMLParser.__getattr__(self, methodName)
684        elif methodName.find('__') != 0:
685            return Tag.__getattr__(self, methodName)
686        else:
687            raise AttributeError
688
689    def feed(self, text):
690        if self.avoidParserProblems:
691            for fix, m in self.avoidParserProblems:
692                text = fix.sub(m, text)
693        SGMLParser.feed(self, text)
694
695    def done(self):
696        """Called when you're done parsing, so that the unclosed tags can be
697        correctly processed."""
698        self.endData() #NEW
699        while self.currentTag.name != self.ROOT_TAG_NAME:
700            self.popTag()
701           
702    def reset(self):
703        SGMLParser.reset(self)
704        self.currentData = []
705        self.currentTag = None
706        self.tagStack = []
707        self.pushTag(self)       
708   
709    def popTag(self):
710        tag = self.tagStack.pop()
711        # Tags with just one string-owning child get the child as a
712        # 'string' property, so that soup.tag.string is shorthand for
713        # soup.tag.contents[0]
714        if len(self.currentTag.contents) == 1 and \
715           isinstance(self.currentTag.contents[0], NavigableText):
716            self.currentTag.string = self.currentTag.contents[0]
717
718        #print "Pop", tag.name
719        if self.tagStack:
720            self.currentTag = self.tagStack[-1]
721        return self.currentTag
722
723    def pushTag(self, tag):
724        #print "Push", tag.name
725        if self.currentTag:
726            self.currentTag.append(tag)
727        self.tagStack.append(tag)
728        self.currentTag = self.tagStack[-1]
729
730    def endData(self):
731        currentData = ''.join(self.currentData)
732        if currentData:
733            if not currentData.strip():
734                if '\n' in currentData:
735                    currentData = '\n'
736                else:
737                    currentData = ' '
738            c = NavigableString
739            if type(currentData) == types.UnicodeType:
740                c = NavigableUnicodeString
741            o = c(currentData)
742            o.setup(self.currentTag, self.previous)
743            if self.previous:
744                self.previous.next = o
745            self.previous = o
746            self.currentTag.contents.append(o)
747        self.currentData = []
748
749    def _popToTag(self, name, inclusivePop=True):
750        """Pops the tag stack up to and including the most recent
751        instance of the given tag. If inclusivePop is false, pops the tag
752        stack up to but *not* including the most recent instqance of
753        the given tag."""
754        if name == self.ROOT_TAG_NAME:
755            return           
756
757        numPops = 0
758        mostRecentTag = None
759        for i in range(len(self.tagStack)-1, 0, -1):
760            if name == self.tagStack[i].name:
761                numPops = len(self.tagStack)-i
762                break
763        if not inclusivePop:
764            numPops = numPops - 1
765
766        for i in range(0, numPops):
767            mostRecentTag = self.popTag()
768        return mostRecentTag   
769
770    def _smartPop(self, name):
771
772        """We need to pop up to the previous tag of this type, unless
773        one of this tag's nesting reset triggers comes between this
774        tag and the previous tag of this type, OR unless this tag is a
775        generic nesting trigger and another generic nesting trigger
776        comes between this tag and the previous tag of this type.
777
778        Examples:
779         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
780         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
781         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
782         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
783
784         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
785         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
786         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
787        """
788
789        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
790        isNestable = nestingResetTriggers != None
791        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
792        popTo = None
793        inclusive = True
794        for i in range(len(self.tagStack)-1, 0, -1):
795            p = self.tagStack[i]
796            if (not p or p.name == name) and not isNestable:
797                #Non-nestable tags get popped to the top or to their
798                #last occurance.
799                popTo = name
800                break
801            if (nestingResetTriggers != None
802                and p.name in nestingResetTriggers) \
803                or (nestingResetTriggers == None and isResetNesting
804                    and self.RESET_NESTING_TAGS.has_key(p.name)):
805               
806                #If we encounter one of the nesting reset triggers
807                #peculiar to this tag, or we encounter another tag
808                #that causes nesting to reset, pop up to but not
809                #including that tag.
810
811                popTo = p.name
812                inclusive = False
813                break
814            p = p.parent
815        if popTo:
816            self._popToTag(popTo, inclusive)
817
818    def unknown_starttag(self, name, attrs, selfClosing=0):
819        #print "Start tag %s" % name
820        if self.quoteStack:
821            #This is not a real tag.
822            #print "<%s> is not real!" % name
823            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
824            self.handle_data('<%s%s>' % (name, attrs))
825            return
826        self.endData()
827        if not name in self.SELF_CLOSING_TAGS and not selfClosing:
828            self._smartPop(name)
829        tag = Tag(name, attrs, self.currentTag, self.previous)       
830        if self.previous:
831            self.previous.next = tag
832        self.previous = tag
833        self.pushTag(tag)
834        if selfClosing or name in self.SELF_CLOSING_TAGS:
835            self.popTag()               
836        if name in self.QUOTE_TAGS:
837            #print "Beginning quote (%s)" % name
838            self.quoteStack.append(name)
839            self.literal = 1
840
841    def unknown_endtag(self, name):
842        if self.quoteStack and self.quoteStack[-1] != name:
843            #This is not a real end tag.
844            #print "</%s> is not real!" % name
845            self.handle_data('</%s>' % name)
846            return
847        self.endData()
848        self._popToTag(name)
849        if self.quoteStack and self.quoteStack[-1] == name:
850            self.quoteStack.pop()
851            self.literal = (len(self.quoteStack) > 0)
852
853    def handle_data(self, data):
854        self.currentData.append(data)
855
856    def handle_pi(self, text):
857        "Propagate processing instructions right through."
858        self.handle_data("<?%s>" % text)
859
860    def handle_comment(self, text):
861        "Propagate comments right through."
862        self.handle_data("<!--%s-->" % text)
863
864    def handle_charref(self, ref):
865        "Propagate char refs right through."
866        self.handle_data('&#%s;' % ref)
867
868    def handle_entityref(self, ref):
869        "Propagate entity refs right through."
870        self.handle_data('&%s;' % ref)
871       
872    def handle_decl(self, data):
873        "Propagate DOCTYPEs and the like right through."
874        self.handle_data('<!%s>' % data)
875
876    def parse_declaration(self, i):
877        """Treat a bogus SGML declaration as raw data. Treat a CDATA
878        declaration as regular data."""
879        j = None
880        if self.rawdata[i:i+9] == '<![CDATA[':
881             k = self.rawdata.find(']]>', i)
882             if k == -1:
883                 k = len(self.rawdata)
884             self.handle_data(self.rawdata[i+9:k])
885             j = k+3
886        else:
887            try:
888                j = SGMLParser.parse_declaration(self, i)
889            except SGMLParseError:
890                toHandle = self.rawdata[i:]
891                self.handle_data(toHandle)
892                j = i + len(toHandle)
893        return j
894
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup before writing your own
    subclass."""

    #Tags that are closed as soon as they are opened.
    SELF_CLOSING_TAGS = buildTagMap(
        None,
        ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame',
         'base'])

    #Tags whose contents are treated as text rather than markup.
    QUOTE_TAGS = {'script': None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = [
        'span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = [
        'blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    #Each key maps to the tags that reset its nesting.
    NESTABLE_LIST_TAGS = {
        'ol' : [],
        'ul' : [],
        'li' : ['ul', 'ol'],
        'dl' : [],
        'dd' : ['dl'],
        'dt' : ['dl'],
        }

    #Tables can contain other tables, but there are restrictions.
    #Each key maps to the tags that reset its nesting.
    NESTABLE_TABLE_TAGS = {
        'table' : [],
        'tr' : ['table', 'tbody', 'tfoot', 'thead'],
        'td' : ['tr'],
        'th' : ['tr'],
        }

    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(
        None,
        NESTABLE_BLOCK_TAGS,
        'noscript',
        NON_NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    #Every tag that may contain another tag of its own type.
    NESTABLE_TAGS = buildTagMap(
        [],
        NESTABLE_INLINE_TAGS,
        NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

985   
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close (eg.) a 'b'
    tag than to actually use nested 'b' tags, and the BeautifulSoup
    class handles the common case. This class handles the
    not-so-common case: where you can't believe someone wrote what
    they did, but it's valid HTML and BeautifulSoup screwed up by
    assuming it wouldn't be.

    If this doesn't do what you need, try subclassing this class or
    BeautifulSoup, and providing your own list of NESTABLE_TAGS."""

    #Inline tags that this class, unlike BeautifulSoup, treats as
    #nestable. ('strong' and 'big' each appear twice, as they always
    #have in this list.)
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = [
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big']

    #Block tags that this class, unlike BeautifulSoup, treats as
    #nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    #BeautifulSoup's nestable tags plus the extra ones listed above.
    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

1024
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pops the top tag, first copying its lone string child (if
        that is all it contains) onto its parent as an attribute.

        An attribute of the same name already on the parent is never
        overwritten. Returns whatever BeautifulStoneSoup.popTag()
        returns, so callers that use the result of popTag() see the
        same value as with the base class."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            #Called before parent.attrMap is read just below.
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableText) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        return BeautifulStoneSoup.popTag(self)

1055
1056#Enterprise class names! It has come to our attention that some people
1057#think the names of the Beautiful Soup parser classes are too silly
1058#and "unprofessional" for use in enterprise screen-scraping. We feel
1059#your pain! For such-minded folk, the Beautiful Soup Consortium And
1060#All-Night Kosher Bakery recommends renaming this file to
1061#"RobustParser.py" (or, in cases of extreme enterprisitude,
1062#"RobustParserBeanInterface.class") and using the following
1063#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1072
1073###
1074
1075
#By default, act as an HTML pretty-printer: read a document from
#standard input and write the prettified parse tree to standard output.
if __name__ == '__main__':
    import sys
    #Use the HTML-aware parser; BeautifulStoneSoup is the generic
    #XML/SGML parser and knows none of the HTML nesting rules.
    soup = BeautifulSoup(sys.stdin.read())
    #Parenthesised so the line works both as a print statement and as
    #a call to the print function.
    print(soup.prettify())
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。