source: exist/branches/proglue_production_rev4605_xquery4884/python/build/lib/ndgUtils/BeautifulSoup.py @ 4886

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/branches/proglue_production_rev4605_xquery4884/python/build/lib/ndgUtils/BeautifulSoup.py@4886
Revision 4886, 67.9 KB checked in by sdonegan, 10 years ago (diff)

Creating operational branch for ndgUtils as used on proglue

Line 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it. The BeautifulSoup class
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27   
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43"""
44from __future__ import generators
45
46__author__ = "Leonard Richardson (leonardr@segfault.org)"
47__version__ = "3.0.4"
48__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
49__license__ = "PSF"
50
51from sgmllib import SGMLParser, SGMLParseError
52import codecs
53import types
54import re
55import sgmllib
56try:
57  from htmlentitydefs import name2codepoint
58except ImportError:
59  name2codepoint = {}
60
61#This hack makes Beautiful Soup able to parse XML with namespaces
62sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64DEFAULT_OUTPUT_ENCODING = "utf-8"
65
66# First, the classes that represent markup elements.
67
68class PageElement:
69    """Contains the navigational information for some part of the page
70    (either a tag or a piece of text)"""
71
72    def setup(self, parent=None, previous=None):
73        """Sets up the initial relations between this element and
74        other elements."""       
75        self.parent = parent
76        self.previous = previous
77        self.next = None
78        self.previousSibling = None
79        self.nextSibling = None
80        if self.parent and self.parent.contents:
81            self.previousSibling = self.parent.contents[-1]
82            self.previousSibling.nextSibling = self
83
84    def replaceWith(self, replaceWith):       
85        oldParent = self.parent
86        myIndex = self.parent.contents.index(self)
87        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
88            # We're replacing this element with one of its siblings.
89            index = self.parent.contents.index(replaceWith)
90            if index and index < myIndex:
91                # Furthermore, it comes before this element. That
92                # means that when we extract it, the index of this
93                # element will change.
94                myIndex = myIndex - 1
95        self.extract()       
96        oldParent.insert(myIndex, replaceWith)
97       
98    def extract(self):
99        """Destructively rips this element out of the tree."""       
100        if self.parent:
101            try:
102                self.parent.contents.remove(self)
103            except ValueError:
104                pass
105
106        #Find the two elements that would be next to each other if
107        #this element (and any children) hadn't been parsed. Connect
108        #the two.       
109        lastChild = self._lastRecursiveChild()
110        nextElement = lastChild.next
111
112        if self.previous:
113            self.previous.next = nextElement
114        if nextElement:
115            nextElement.previous = self.previous
116        self.previous = None
117        lastChild.next = None
118
119        self.parent = None       
120        if self.previousSibling:
121            self.previousSibling.nextSibling = self.nextSibling
122        if self.nextSibling:
123            self.nextSibling.previousSibling = self.previousSibling
124        self.previousSibling = self.nextSibling = None       
125
126    def _lastRecursiveChild(self):
127        "Finds the last element beneath this object to be parsed."
128        lastChild = self
129        while hasattr(lastChild, 'contents') and lastChild.contents:
130            lastChild = lastChild.contents[-1]
131        return lastChild
132
133    def insert(self, position, newChild):
134        if (isinstance(newChild, basestring)
135            or isinstance(newChild, unicode)) \
136            and not isinstance(newChild, NavigableString):
137            newChild = NavigableString(newChild)       
138
139        position =  min(position, len(self.contents))
140        if hasattr(newChild, 'parent') and newChild.parent != None:
141            # We're 'inserting' an element that's already one
142            # of this object's children.
143            if newChild.parent == self:
144                index = self.find(newChild)
145                if index and index < position:
146                    # Furthermore we're moving it further down the
147                    # list of this object's children. That means that
148                    # when we extract this element, our target index
149                    # will jump down one.
150                    position = position - 1
151            newChild.extract()
152           
153        newChild.parent = self
154        previousChild = None
155        if position == 0:
156            newChild.previousSibling = None
157            newChild.previous = self
158        else:
159            previousChild = self.contents[position-1]
160            newChild.previousSibling = previousChild
161            newChild.previousSibling.nextSibling = newChild
162            newChild.previous = previousChild._lastRecursiveChild()
163        if newChild.previous:
164            newChild.previous.next = newChild       
165
166        newChildsLastElement = newChild._lastRecursiveChild()
167
168        if position >= len(self.contents):
169            newChild.nextSibling = None
170           
171            parent = self
172            parentsNextSibling = None
173            while not parentsNextSibling:
174                parentsNextSibling = parent.nextSibling
175                parent = parent.parent
176                if not parent: # This is the last element in the document.
177                    break
178            if parentsNextSibling:
179                newChildsLastElement.next = parentsNextSibling
180            else:
181                newChildsLastElement.next = None
182        else:
183            nextChild = self.contents[position]           
184            newChild.nextSibling = nextChild           
185            if newChild.nextSibling:
186                newChild.nextSibling.previousSibling = newChild
187            newChildsLastElement.next = nextChild
188
189        if newChildsLastElement.next:
190            newChildsLastElement.next.previous = newChildsLastElement
191        self.contents.insert(position, newChild)
192
193    def findNext(self, name=None, attrs={}, text=None, **kwargs):
194        """Returns the first item that matches the given criteria and
195        appears after this Tag in the document."""
196        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
197
198    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
199                    **kwargs):
200        """Returns all items that match the given criteria and appear
201        before after Tag in the document."""
202        return self._findAll(name, attrs, text, limit, self.nextGenerator)
203
204    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
205        """Returns the closest sibling to this Tag that matches the
206        given criteria and appears after this Tag in the document."""
207        return self._findOne(self.findNextSiblings, name, attrs, text,
208                             **kwargs)
209
210    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
211                         **kwargs):
212        """Returns the siblings of this Tag that match the given
213        criteria and appear after this Tag in the document."""
214        return self._findAll(name, attrs, text, limit,
215                             self.nextSiblingGenerator, **kwargs)
216    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
217
218    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
219        """Returns the first item that matches the given criteria and
220        appears before this Tag in the document."""
221        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
222
223    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
224                        **kwargs):
225        """Returns all items that match the given criteria and appear
226        before this Tag in the document."""
227        return self._findAll(name, attrs, text, limit, self.previousGenerator,
228                           **kwargs)
229    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
230
231    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
232        """Returns the closest sibling to this Tag that matches the
233        given criteria and appears before this Tag in the document."""
234        return self._findOne(self.findPreviousSiblings, name, attrs, text,
235                             **kwargs)
236
237    def findPreviousSiblings(self, name=None, attrs={}, text=None,
238                             limit=None, **kwargs):
239        """Returns the siblings of this Tag that match the given
240        criteria and appear before this Tag in the document."""
241        return self._findAll(name, attrs, text, limit,
242                             self.previousSiblingGenerator, **kwargs)
243    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
244
245    def findParent(self, name=None, attrs={}, **kwargs):
246        """Returns the closest parent of this Tag that matches the given
247        criteria."""
248        # NOTE: We can't use _findOne because findParents takes a different
249        # set of arguments.
250        r = None
251        l = self.findParents(name, attrs, 1)
252        if l:
253            r = l[0]
254        return r
255
256    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
257        """Returns the parents of this Tag that match the given
258        criteria."""
259
260        return self._findAll(name, attrs, None, limit, self.parentGenerator,
261                             **kwargs)
262    fetchParents = findParents # Compatibility with pre-3.x
263
264    #These methods do the real heavy lifting.
265
266    def _findOne(self, method, name, attrs, text, **kwargs):
267        r = None
268        l = method(name, attrs, text, 1, **kwargs)
269        if l:
270            r = l[0]
271        return r
272   
273    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
274        "Iterates over a generator looking for things that match."
275
276        if isinstance(name, SoupStrainer):
277            strainer = name
278        else:
279            # Build a SoupStrainer
280            strainer = SoupStrainer(name, attrs, text, **kwargs)
281        results = ResultSet(strainer)
282        g = generator()
283        while True:
284            try:
285                i = g.next()
286            except StopIteration:
287                break
288            if i:
289                found = strainer.search(i)
290                if found:
291                    results.append(found)
292                    if limit and len(results) >= limit:
293                        break
294        return results
295
296    #These Generators can be used to navigate starting from both
297    #NavigableStrings and Tags.               
298    def nextGenerator(self):
299        i = self
300        while i:
301            i = i.next
302            yield i
303
304    def nextSiblingGenerator(self):
305        i = self
306        while i:
307            i = i.nextSibling
308            yield i
309
310    def previousGenerator(self):
311        i = self
312        while i:
313            i = i.previous
314            yield i
315
316    def previousSiblingGenerator(self):
317        i = self
318        while i:
319            i = i.previousSibling
320            yield i
321
322    def parentGenerator(self):
323        i = self
324        while i:
325            i = i.parent
326            yield i
327
328    # Utility methods
329    def substituteEncoding(self, str, encoding=None):
330        encoding = encoding or "utf-8"
331        return str.replace("%SOUP-ENCODING%", encoding)   
332
333    def toEncoding(self, s, encoding=None):
334        """Encodes an object to a string in some encoding, or to Unicode.
335        ."""
336        if isinstance(s, unicode):
337            if encoding:
338                s = s.encode(encoding)
339        elif isinstance(s, str):
340            if encoding:
341                s = s.encode(encoding)
342            else:
343                s = unicode(s)
344        else:
345            if encoding:
346                s  = self.toEncoding(str(s), encoding)
347            else:
348                s = unicode(s)
349        return s
350
351class NavigableString(unicode, PageElement):
352
353    def __getattr__(self, attr):
354        """text.string gives you text. This is for backwards
355        compatibility for Navigable*String, but for CData* it lets you
356        get the string without the CData wrapper."""
357        if attr == 'string':
358            return self
359        else:
360            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
361
362    def __unicode__(self):
363        return self.__str__(None)
364
365    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
366        if encoding:
367            return self.encode(encoding)
368        else:
369            return self
370       
371class CData(NavigableString):
372
373    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
374        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
375
376class ProcessingInstruction(NavigableString):
377    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
378        output = self
379        if "%SOUP-ENCODING%" in output:
380            output = self.substituteEncoding(output, encoding)
381        return "<?%s?>" % self.toEncoding(output, encoding)
382
383class Comment(NavigableString):
384    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
385        return "<!--%s-->" % NavigableString.__str__(self, encoding)   
386
387class Declaration(NavigableString):
388    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
389        return "<!%s>" % NavigableString.__str__(self, encoding)       
390
391class Tag(PageElement):
392
393    """Represents a found HTML tag with its attributes and contents."""
394
395    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
396                                      '"' : "quote",
397                                      "&" : "amp",
398                                      "<" : "lt",
399                                      ">" : "gt" }
400
401    def __init__(self, parser, name, attrs=None, parent=None,
402                 previous=None):
403        "Basic constructor."
404
405        # We don't actually store the parser object: that lets extracted
406        # chunks be garbage-collected
407        self.parserClass = parser.__class__
408        self.isSelfClosing = parser.isSelfClosingTag(name)
409        self.name = name
410        if attrs == None:
411            attrs = []
412        self.attrs = attrs
413        self.contents = []
414        self.setup(parent, previous)
415        self.hidden = False
416        self.containsSubstitutions = False
417
418    def get(self, key, default=None):
419        """Returns the value of the 'key' attribute for the tag, or
420        the value given for 'default' if it doesn't have that
421        attribute."""
422        return self._getAttrMap().get(key, default)   
423
424    def has_key(self, key):
425        return self._getAttrMap().has_key(key)
426
427    def __getitem__(self, key):
428        """tag[key] returns the value of the 'key' attribute for the tag,
429        and throws an exception if it's not there."""
430        return self._getAttrMap()[key]
431
432    def __iter__(self):
433        "Iterating over a tag iterates over its contents."
434        return iter(self.contents)
435
436    def __len__(self):
437        "The length of a tag is the length of its list of contents."
438        return len(self.contents)
439
440    def __contains__(self, x):
441        return x in self.contents
442
443    def __nonzero__(self):
444        "A tag is non-None even if it has no contents."
445        return True
446
447    def __setitem__(self, key, value):       
448        """Setting tag[key] sets the value of the 'key' attribute for the
449        tag."""
450        self._getAttrMap()
451        self.attrMap[key] = value
452        found = False
453        for i in range(0, len(self.attrs)):
454            if self.attrs[i][0] == key:
455                self.attrs[i] = (key, value)
456                found = True
457        if not found:
458            self.attrs.append((key, value))
459        self._getAttrMap()[key] = value
460
461    def __delitem__(self, key):
462        "Deleting tag[key] deletes all 'key' attributes for the tag."
463        for item in self.attrs:
464            if item[0] == key:
465                self.attrs.remove(item)
466                #We don't break because bad HTML can define the same
467                #attribute multiple times.
468            self._getAttrMap()
469            if self.attrMap.has_key(key):
470                del self.attrMap[key]
471
472    def __call__(self, *args, **kwargs):
473        """Calling a tag like a function is the same as calling its
474        findAll() method. Eg. tag('a') returns a list of all the A tags
475        found within this tag."""
476        return apply(self.findAll, args, kwargs)
477
478    def __getattr__(self, tag):
479        #print "Getattr %s.%s" % (self.__class__, tag)
480        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
481            return self.find(tag[:-3])
482        elif tag.find('__') != 0:
483            return self.find(tag)
484
485    def __eq__(self, other):
486        """Returns true iff this tag has the same name, the same attributes,
487        and the same contents (recursively) as the given tag.
488
489        NOTE: right now this will return false if two tags have the
490        same attributes in a different order. Should this be fixed?"""
491        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
492            return False
493        for i in range(0, len(self.contents)):
494            if self.contents[i] != other.contents[i]:
495                return False
496        return True
497
498    def __ne__(self, other):
499        """Returns true iff this tag is not identical to the other tag,
500        as defined in __eq__."""
501        return not self == other
502
503    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
504        """Renders this tag as a string."""
505        return self.__str__(encoding)
506
507    def __unicode__(self):
508        return self.__str__(None)
509
510    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
511                prettyPrint=False, indentLevel=0):
512        """Returns a string or Unicode representation of this tag and
513        its contents. To get Unicode, pass None for encoding.
514
515        NOTE: since Python's HTML parser consumes whitespace, this
516        method is not certain to reproduce the whitespace present in
517        the original string."""
518
519        encodedName = self.toEncoding(self.name, encoding)
520
521        attrs = []
522        if self.attrs:
523            for key, val in self.attrs:
524                fmt = '%s="%s"'
525                if isString(val):                   
526                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
527                        val = self.substituteEncoding(val, encoding)
528
529                    # The attribute value either:
530                    #
531                    # * Contains no embedded double quotes or single quotes.
532                    #   No problem: we enclose it in double quotes.
533                    # * Contains embedded single quotes. No problem:
534                    #   double quotes work here too.
535                    # * Contains embedded double quotes. No problem:
536                    #   we enclose it in single quotes.
537                    # * Embeds both single _and_ double quotes. This
538                    #   can't happen naturally, but it can happen if
539                    #   you modify an attribute value after parsing
540                    #   the document. Now we have a bit of a
541                    #   problem. We solve it by enclosing the
542                    #   attribute in single quotes, and escaping any
543                    #   embedded single quotes to XML entities.
544                    if '"' in val:
545                        fmt = "%s='%s'"
546                        # This can't happen naturally, but it can happen
547                        # if you modify an attribute value after parsing.
548                        if "'" in val:
549                            val = val.replace("'", "&squot;")
550
551                    # Now we're okay w/r/t quotes. But the attribute
552                    # value might also contain angle brackets, or
553                    # ampersands that aren't part of entities. We need
554                    # to escape those to XML entities too.
555                    val = re.sub("([<>]|&(?![^\s]+;))",
556                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
557                                 val)
558                                     
559                attrs.append(fmt % (self.toEncoding(key, encoding),
560                                    self.toEncoding(val, encoding)))
561        close = ''
562        closeTag = ''
563        if self.isSelfClosing:
564            close = ' /'
565        else:
566            closeTag = '</%s>' % encodedName
567
568        indentTag, indentContents = 0, 0
569        if prettyPrint:
570            indentTag = indentLevel
571            space = (' ' * (indentTag-1))
572            indentContents = indentTag + 1
573        contents = self.renderContents(encoding, prettyPrint, indentContents)
574        if self.hidden:
575            s = contents
576        else:
577            s = []
578            attributeString = ''
579            if attrs:
580                attributeString = ' ' + ' '.join(attrs)           
581            if prettyPrint:
582                s.append(space)
583            s.append('<%s%s%s>' % (encodedName, attributeString, close))
584            if prettyPrint:
585                s.append("\n")
586            s.append(contents)
587            if prettyPrint and contents and contents[-1] != "\n":
588                s.append("\n")
589            if prettyPrint and closeTag:
590                s.append(space)
591            s.append(closeTag)
592            if prettyPrint and closeTag and self.nextSibling:
593                s.append("\n")
594            s = ''.join(s)
595        return s
596
597    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
598        return self.__str__(encoding, True)
599
600    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
601                       prettyPrint=False, indentLevel=0):
602        """Renders the contents of this tag as a string in the given
603        encoding. If encoding is None, returns a Unicode string.."""
604        s=[]
605        for c in self:
606            text = None
607            if isinstance(c, NavigableString):
608                text = c.__str__(encoding)
609            elif isinstance(c, Tag):
610                s.append(c.__str__(encoding, prettyPrint, indentLevel))
611            if text and prettyPrint:
612                text = text.strip()             
613            if text:
614                if prettyPrint:
615                    s.append(" " * (indentLevel-1))
616                s.append(text)
617                if prettyPrint:
618                    s.append("\n")
619        return ''.join(s)   
620
621    #Soup methods
622
623    def find(self, name=None, attrs={}, recursive=True, text=None,
624             **kwargs):
625        """Return only the first child of this Tag matching the given
626        criteria."""
627        r = None
628        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
629        if l:
630            r = l[0]
631        return r
632    findChild = find
633
634    def findAll(self, name=None, attrs={}, recursive=True, text=None,
635                limit=None, **kwargs):
636        """Extracts a list of Tag objects that match the given
637        criteria.  You can specify the name of the Tag and any
638        attributes you want the Tag to have.
639
640        The value of a key-value pair in the 'attrs' map can be a
641        string, a list of strings, a regular expression object, or a
642        callable that takes a string and returns whether or not the
643        string matches for some custom definition of 'matches'. The
644        same is true of the tag name."""
645        generator = self.recursiveChildGenerator
646        if not recursive:
647            generator = self.childGenerator
648        return self._findAll(name, attrs, text, limit, generator, **kwargs)
649    findChildren = findAll
650
651    # Pre-3.x compatibility methods
652    first = find
653    fetch = findAll
654   
655    def fetchText(self, text=None, recursive=True, limit=None):
656        return self.findAll(text=text, recursive=recursive, limit=limit)
657
658    def firstText(self, text=None, recursive=True):
659        return self.find(text=text, recursive=recursive)
660   
661    #Utility methods
662
663    def append(self, tag):
664        """Appends the given tag to the contents of this tag."""
665        self.contents.append(tag)
666
667    #Private methods
668
669    def _getAttrMap(self):
670        """Initializes a map representation of this tag's attributes,
671        if not already initialized."""
672        if not getattr(self, 'attrMap'):
673            self.attrMap = {}
674            for (key, value) in self.attrs:
675                self.attrMap[key] = value
676        return self.attrMap
677
678    #Generator methods
679    def childGenerator(self):
680        for i in range(0, len(self.contents)):
681            yield self.contents[i]
682        raise StopIteration
683   
684    def recursiveChildGenerator(self):
685        stack = [(self, 0)]
686        while stack:
687            tag, start = stack.pop()
688            if isinstance(tag, Tag):           
689                for i in range(start, len(tag.contents)):
690                    a = tag.contents[i]
691                    yield a
692                    if isinstance(a, Tag) and tag.contents:
693                        if i < len(tag.contents) - 1:
694                            stack.append((tag, i+1))
695                        stack.append((a, 0))
696                        break
697        raise StopIteration
698
699# Next, a couple classes to represent queries and their results.
700class SoupStrainer:
701    """Encapsulates a number of ways of matching a markup element (tag or
702    text)."""
703
704    def __init__(self, name=None, attrs={}, text=None, **kwargs):
705        self.name = name
706        if isString(attrs):
707            kwargs['class'] = attrs
708            attrs = None
709        if kwargs:
710            if attrs:
711                attrs = attrs.copy()
712                attrs.update(kwargs)
713            else:
714                attrs = kwargs
715        self.attrs = attrs
716        self.text = text
717
718    def __str__(self):
719        if self.text:
720            return self.text
721        else:
722            return "%s|%s" % (self.name, self.attrs)
723   
724    def searchTag(self, markupName=None, markupAttrs={}):
725        found = None
726        markup = None
727        if isinstance(markupName, Tag):
728            markup = markupName
729            markupAttrs = markup
730        callFunctionWithTagData = callable(self.name) \
731                                and not isinstance(markupName, Tag)
732
733        if (not self.name) \
734               or callFunctionWithTagData \
735               or (markup and self._matches(markup, self.name)) \
736               or (not markup and self._matches(markupName, self.name)):
737            if callFunctionWithTagData:
738                match = self.name(markupName, markupAttrs)
739            else:
740                match = True           
741                markupAttrMap = None
742                for attr, matchAgainst in self.attrs.items():
743                    if not markupAttrMap:
744                         if hasattr(markupAttrs, 'get'):
745                            markupAttrMap = markupAttrs
746                         else:
747                            markupAttrMap = {}
748                            for k,v in markupAttrs:
749                                markupAttrMap[k] = v
750                    attrValue = markupAttrMap.get(attr)
751                    if not self._matches(attrValue, matchAgainst):
752                        match = False
753                        break
754            if match:
755                if markup:
756                    found = markup
757                else:
758                    found = markupName
759        return found
760
761    def search(self, markup):
762        #print 'looking for %s in %s' % (self, markup)
763        found = None
764        # If given a list of items, scan it for a text element that
765        # matches.       
766        if isList(markup) and not isinstance(markup, Tag):
767            for element in markup:
768                if isinstance(element, NavigableString) \
769                       and self.search(element):
770                    found = element
771                    break
772        # If it's a Tag, make sure its name or attributes match.
773        # Don't bother with Tags if we're searching for text.
774        elif isinstance(markup, Tag):
775            if not self.text:
776                found = self.searchTag(markup)
777        # If it's text, make sure the text matches.
778        elif isinstance(markup, NavigableString) or \
779                 isString(markup):
780            if self._matches(markup, self.text):
781                found = markup
782        else:
783            raise Exception, "I don't know how to match against a %s" \
784                  % markup.__class__
785        return found
786       
787    def _matches(self, markup, matchAgainst):   
788        #print "Matching %s against %s" % (markup, matchAgainst)
789        result = False
790        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
791            result = markup != None
792        elif callable(matchAgainst):
793            result = matchAgainst(markup)
794        else:
795            #Custom match methods take the tag as an argument, but all
796            #other ways of matching match the tag name as a string.
797            if isinstance(markup, Tag):
798                markup = markup.name
799            if markup and not isString(markup):
800                markup = unicode(markup)
801            #Now we know that chunk is either a string, or None.
802            if hasattr(matchAgainst, 'match'):
803                # It's a regexp object.
804                result = markup and matchAgainst.search(markup)
805            elif isList(matchAgainst):
806                result = markup in matchAgainst
807            elif hasattr(matchAgainst, 'items'):
808                result = markup.has_key(matchAgainst)
809            elif matchAgainst and isString(markup):
810                if isinstance(markup, unicode):
811                    matchAgainst = unicode(matchAgainst)
812                else:
813                    matchAgainst = str(matchAgainst)
814
815            if not result:
816                result = matchAgainst == markup
817        return result
818
819class ResultSet(list):
820    """A ResultSet is just a list that keeps track of the SoupStrainer
821    that created it."""
822    def __init__(self, source):
823        list.__init__([])
824        self.source = source
825
826# Now, some helper functions.
827
828def isList(l):
829    """Convenience method that works with all 2.x versions of Python
830    to determine whether or not something is listlike."""
831    return hasattr(l, '__iter__') \
832           or (type(l) in (types.ListType, types.TupleType))
833
834def isString(s):
835    """Convenience method that works with all 2.x versions of Python
836    to determine whether or not something is stringlike."""
837    try:
838        return isinstance(s, unicode) or isintance(s, basestring) 
839    except NameError:
840        return isinstance(s, str)
841
842def buildTagMap(default, *args):
843    """Turns a list of maps, lists, or scalars into a single map.
844    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
845    NESTING_RESET_TAGS maps out of lists and partial maps."""
846    built = {}
847    for portion in args:
848        if hasattr(portion, 'items'):
849            #It's a map. Merge it.
850            for k,v in portion.items():
851                built[k] = v
852        elif isList(portion):
853            #It's a list. Map each item to the default.
854            for k in portion:
855                built[k] = default
856        else:
857            #It's a scalar. Map it to the default.
858            built[portion] = default
859    return built
860
861# Now, the parser classes.
862
863class BeautifulStoneSoup(Tag, SGMLParser):
864
865    """This class contains the basic parser and search code. It defines
866    a parser that knows nothing about tag behavior except for the
867    following:
868   
869      You can't close a tag without closing all the tags it encloses.
870      That is, "<foo><bar></foo>" actually means
871      "<foo><bar></bar></foo>".
872
873    [Another possible explanation is "<foo><bar /></foo>", but since
874    this class defines no SELF_CLOSING_TAGS, it will never use that
875    explanation.]
876
877    This class is useful for parsing XML or made-up markup languages,
878    or when BeautifulSoup makes an assumption counter to what you were
879    expecting."""
880
881    XML_ENTITY_LIST = {}
882    for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
883        XML_ENTITY_LIST[i] = True 
884
885    SELF_CLOSING_TAGS = {}
886    NESTABLE_TAGS = {}
887    RESET_NESTING_TAGS = {}
888    QUOTE_TAGS = {}
889
890    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
891                       lambda x: x.group(1) + ' />'),
892                      (re.compile('<!\s+([^<>]*)>'),
893                       lambda x: '<!' + x.group(1) + '>')
894                      ]
895
896    ROOT_TAG_NAME = u'[document]'
897
898    HTML_ENTITIES = "html"
899    XML_ENTITIES = "xml"
900
901    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
902                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
903                 convertEntities=None, selfClosingTags=None):
904        """The Soup object is initialized as the 'root tag', and the
905        provided markup (which can be a string or a file-like object)
906        is fed into the underlying parser.
907
908        sgmllib will process most bad HTML, and the BeautifulSoup
909        class has some tricks for dealing with some HTML that kills
910        sgmllib, but Beautiful Soup can nonetheless choke or lose data
911        if your data uses self-closing tags or declarations
912        incorrectly.
913
914        By default, Beautiful Soup uses regexes to sanitize input,
915        avoiding the vast majority of these problems. If the problems
916        don't apply to you, pass in False for markupMassage, and
917        you'll get better performance.
918
919        The default parser massage techniques fix the two most common
920        instances of invalid HTML that choke sgmllib:
921
922         <br/> (No space between name of closing tag and tag close)
923         <! --Comment--> (Extraneous whitespace in declaration)
924
925        You can pass in a custom list of (RE object, replace method)
926        tuples to get Beautiful Soup to scrub your input the way you
927        want."""
928
929        self.parseOnlyThese = parseOnlyThese
930        self.fromEncoding = fromEncoding
931        self.smartQuotesTo = smartQuotesTo
932        self.convertEntities = convertEntities
933        if self.convertEntities:
934            # It doesn't make sense to convert encoded characters to
935            # entities even while you're converting entities to Unicode.
936            # Just convert it all to Unicode.
937            self.smartQuotesTo = None
938        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
939        SGMLParser.__init__(self)
940           
941        if hasattr(markup, 'read'):        # It's a file-type object.
942            markup = markup.read()
943        self.markup = markup
944        self.markupMassage = markupMassage
945        try:
946            self._feed()
947        except StopParsing:
948            pass
949        self.markup = None                 # The markup can now be GCed
950       
951    def _feed(self, inDocumentEncoding=None):
952        # Convert the document to Unicode.
953        markup = self.markup
954        if isinstance(markup, unicode):
955            if not hasattr(self, 'originalEncoding'):
956                self.originalEncoding = None
957        else:
958            dammit = UnicodeDammit\
959                     (markup, [self.fromEncoding, inDocumentEncoding],
960                      smartQuotesTo=self.smartQuotesTo)
961            markup = dammit.unicode
962            self.originalEncoding = dammit.originalEncoding
963        if markup:
964            if self.markupMassage:
965                if not isList(self.markupMassage):
966                    self.markupMassage = self.MARKUP_MASSAGE           
967                for fix, m in self.markupMassage:
968                    markup = fix.sub(m, markup)
969        self.reset()
970
971        SGMLParser.feed(self, markup)
972        # Close out any unfinished strings and close all the open tags.
973        self.endData()
974        while self.currentTag.name != self.ROOT_TAG_NAME:
975            self.popTag()
976
977    def __getattr__(self, methodName):
978        """This method routes method call requests to either the SGMLParser
979        superclass or the Tag superclass, depending on the method name."""
980        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
981
982        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
983               or methodName.find('do_') == 0:
984            return SGMLParser.__getattr__(self, methodName)
985        elif methodName.find('__') != 0:
986            return Tag.__getattr__(self, methodName)
987        else:
988            raise AttributeError
989
990    def isSelfClosingTag(self, name):
991        """Returns true iff the given string is the name of a
992        self-closing tag according to this parser."""
993        return self.SELF_CLOSING_TAGS.has_key(name) \
994               or self.instanceSelfClosingTags.has_key(name)
995           
996    def reset(self):
997        Tag.__init__(self, self, self.ROOT_TAG_NAME)
998        self.hidden = 1
999        SGMLParser.reset(self)
1000        self.currentData = []
1001        self.currentTag = None
1002        self.tagStack = []
1003        self.quoteStack = []
1004        self.pushTag(self)
1005   
1006    def popTag(self):
1007        tag = self.tagStack.pop()
1008        # Tags with just one string-owning child get the child as a
1009        # 'string' property, so that soup.tag.string is shorthand for
1010        # soup.tag.contents[0]
1011        if len(self.currentTag.contents) == 1 and \
1012           isinstance(self.currentTag.contents[0], NavigableString):
1013            self.currentTag.string = self.currentTag.contents[0]
1014
1015        #print "Pop", tag.name
1016        if self.tagStack:
1017            self.currentTag = self.tagStack[-1]
1018        return self.currentTag
1019
1020    def pushTag(self, tag):
1021        #print "Push", tag.name
1022        if self.currentTag:
1023            self.currentTag.append(tag)
1024        self.tagStack.append(tag)
1025        self.currentTag = self.tagStack[-1]
1026
1027    def endData(self, containerClass=NavigableString):
1028        if self.currentData:
1029            currentData = ''.join(self.currentData)
1030            if not currentData.strip():
1031                if '\n' in currentData:
1032                    currentData = '\n'
1033                else:
1034                    currentData = ' '
1035            self.currentData = []
1036            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1037                   (not self.parseOnlyThese.text or \
1038                    not self.parseOnlyThese.search(currentData)):
1039                return
1040            o = containerClass(currentData)
1041            o.setup(self.currentTag, self.previous)
1042            if self.previous:
1043                self.previous.next = o
1044            self.previous = o
1045            self.currentTag.contents.append(o)
1046
1047
1048    def _popToTag(self, name, inclusivePop=True):
1049        """Pops the tag stack up to and including the most recent
1050        instance of the given tag. If inclusivePop is false, pops the tag
1051        stack up to but *not* including the most recent instqance of
1052        the given tag."""
1053        #print "Popping to %s" % name
1054        if name == self.ROOT_TAG_NAME:
1055            return           
1056
1057        numPops = 0
1058        mostRecentTag = None
1059        for i in range(len(self.tagStack)-1, 0, -1):
1060            if name == self.tagStack[i].name:
1061                numPops = len(self.tagStack)-i
1062                break
1063        if not inclusivePop:
1064            numPops = numPops - 1
1065
1066        for i in range(0, numPops):
1067            mostRecentTag = self.popTag()
1068        return mostRecentTag   
1069
1070    def _smartPop(self, name):
1071
1072        """We need to pop up to the previous tag of this type, unless
1073        one of this tag's nesting reset triggers comes between this
1074        tag and the previous tag of this type, OR unless this tag is a
1075        generic nesting trigger and another generic nesting trigger
1076        comes between this tag and the previous tag of this type.
1077
1078        Examples:
1079         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1080         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1081         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1082         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1083
1084         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1085         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1086         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1087        """
1088
1089        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1090        isNestable = nestingResetTriggers != None
1091        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1092        popTo = None
1093        inclusive = True
1094        for i in range(len(self.tagStack)-1, 0, -1):
1095            p = self.tagStack[i]
1096            if (not p or p.name == name) and not isNestable:
1097                #Non-nestable tags get popped to the top or to their
1098                #last occurance.
1099                popTo = name
1100                break
1101            if (nestingResetTriggers != None
1102                and p.name in nestingResetTriggers) \
1103                or (nestingResetTriggers == None and isResetNesting
1104                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1105               
1106                #If we encounter one of the nesting reset triggers
1107                #peculiar to this tag, or we encounter another tag
1108                #that causes nesting to reset, pop up to but not
1109                #including that tag.
1110                popTo = p.name
1111                inclusive = False
1112                break
1113            p = p.parent
1114        if popTo:
1115            self._popToTag(popTo, inclusive)
1116
1117    def unknown_starttag(self, name, attrs, selfClosing=0):
1118        #print "Start tag %s: %s" % (name, attrs)
1119        if self.quoteStack:
1120            #This is not a real tag.
1121            #print "<%s> is not real!" % name
1122            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123            self.handle_data('<%s%s>' % (name, attrs))
1124            return       
1125        self.endData()
1126
1127        if not self.isSelfClosingTag(name) and not selfClosing:
1128            self._smartPop(name)
1129
1130        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1131               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1132            return
1133
1134        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1135        if self.previous:
1136            self.previous.next = tag
1137        self.previous = tag
1138        self.pushTag(tag)
1139        if selfClosing or self.isSelfClosingTag(name):
1140            self.popTag()               
1141        if name in self.QUOTE_TAGS:
1142            #print "Beginning quote (%s)" % name
1143            self.quoteStack.append(name)
1144            self.literal = 1
1145        return tag
1146
1147    def unknown_endtag(self, name):
1148        #print "End tag %s" % name
1149        if self.quoteStack and self.quoteStack[-1] != name:
1150            #This is not a real end tag.
1151            #print "</%s> is not real!" % name
1152            self.handle_data('</%s>' % name)
1153            return
1154        self.endData()
1155        self._popToTag(name)
1156        if self.quoteStack and self.quoteStack[-1] == name:
1157            self.quoteStack.pop()
1158            self.literal = (len(self.quoteStack) > 0)
1159
1160    def handle_data(self, data):
1161        self.currentData.append(data)
1162
1163    def _toStringSubclass(self, text, subclass):
1164        """Adds a certain piece of text to the tree as a NavigableString
1165        subclass."""
1166        self.endData()
1167        self.handle_data(text)
1168        self.endData(subclass)
1169
1170    def handle_pi(self, text):
1171        """Handle a processing instruction as a ProcessingInstruction
1172        object, possibly one with a %SOUP-ENCODING% slot into which an
1173        encoding will be plugged later."""
1174        if text[:3] == "xml":
1175            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1176        self._toStringSubclass(text, ProcessingInstruction)
1177
1178    def handle_comment(self, text):
1179        "Handle comments as Comment objects."
1180        self._toStringSubclass(text, Comment)
1181
1182    def handle_charref(self, ref):
1183        "Handle character references as data."
1184        if self.convertEntities in [self.HTML_ENTITIES,
1185                                    self.XML_ENTITIES]:
1186            data = unichr(int(ref))
1187        else:
1188            data = '&#%s;' % ref
1189        self.handle_data(data)
1190
1191    def handle_entityref(self, ref):
1192        """Handle entity references as data, possibly converting known
1193        HTML entity references to the corresponding Unicode
1194        characters."""
1195        data = None
1196        if self.convertEntities == self.HTML_ENTITIES or \
1197               (self.convertEntities == self.XML_ENTITIES and \
1198                self.XML_ENTITY_LIST.get(ref)):
1199            try:
1200                data = unichr(name2codepoint[ref])
1201            except KeyError:
1202                pass
1203        if not data:
1204            data = '&%s;' % ref
1205        self.handle_data(data)
1206       
1207    def handle_decl(self, data):
1208        "Handle DOCTYPEs and the like as Declaration objects."
1209        self._toStringSubclass(data, Declaration)
1210
1211    def parse_declaration(self, i):
1212        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1213        declaration as a CData object."""
1214        j = None
1215        if self.rawdata[i:i+9] == '<![CDATA[':
1216             k = self.rawdata.find(']]>', i)
1217             if k == -1:
1218                 k = len(self.rawdata)
1219             data = self.rawdata[i+9:k]
1220             j = k+3
1221             self._toStringSubclass(data, CData)
1222        else:
1223            try:
1224                j = SGMLParser.parse_declaration(self, i)
1225            except SGMLParseError:
1226                toHandle = self.rawdata[i:]
1227                self.handle_data(toHandle)
1228                j = i + len(toHandle)
1229        return j
1230
1231class BeautifulSoup(BeautifulStoneSoup):
1232
1233    """This parser knows the following facts about HTML:
1234
1235    * Some tags have no closing tag and should be interpreted as being
1236      closed as soon as they are encountered.
1237
1238    * The text inside some tags (ie. 'script') may contain tags which
1239      are not really part of the document and which should be parsed
1240      as text, not tags. If you want to parse the text as tags, you can
1241      always fetch it and parse it explicitly.
1242
1243    * Tag nesting rules:
1244
1245      Most tags can't be nested at all. For instance, the occurance of
1246      a <p> tag should implicitly close the previous <p> tag.
1247
1248       <p>Para1<p>Para2
1249        should be transformed into:
1250       <p>Para1</p><p>Para2
1251
1252      Some tags can be nested arbitrarily. For instance, the occurance
1253      of a <blockquote> tag should _not_ implicitly close the previous
1254      <blockquote> tag.
1255
1256       Alice said: <blockquote>Bob said: <blockquote>Blah
1257        should NOT be transformed into:
1258       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1259
1260      Some tags can be nested, but the nesting is reset by the
1261      interposition of other tags. For instance, a <tr> tag should
1262      implicitly close the previous <tr> tag within the same <table>,
1263      but not close a <tr> tag in another table.
1264
1265       <table><tr>Blah<tr>Blah
1266        should be transformed into:
1267       <table><tr>Blah</tr><tr>Blah
1268        but,
1269       <tr>Blah<table><tr>Blah
1270        should NOT be transformed into
1271       <tr>Blah<table></tr><tr>Blah
1272
1273    Differing assumptions about tag nesting rules are a major source
1274    of problems with the BeautifulSoup class. If BeautifulSoup is not
1275    treating as nestable a tag your page author treats as nestable,
1276    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1277    BeautifulStoneSoup before writing your own subclass."""
1278
1279    def __init__(self, *args, **kwargs):
1280        if not kwargs.has_key('smartQuotesTo'):
1281            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1282        BeautifulStoneSoup.__init__(self, *args, **kwargs)
1283
1284    SELF_CLOSING_TAGS = buildTagMap(None,
1285                                    ['br' , 'hr', 'input', 'img', 'meta',
1286                                    'spacer', 'link', 'frame', 'base'])
1287
1288    QUOTE_TAGS = {'script': None}
1289   
1290    #According to the HTML standard, each of these inline tags can
1291    #contain another tag of the same type. Furthermore, it's common
1292    #to actually use these tags this way.
1293    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1294                            'center']
1295
1296    #According to the HTML standard, these block tags can contain
1297    #another tag of the same type. Furthermore, it's common
1298    #to actually use these tags this way.
1299    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1300
1301    #Lists can contain other lists, but there are restrictions.   
1302    NESTABLE_LIST_TAGS = { 'ol' : [],
1303                           'ul' : [],
1304                           'li' : ['ul', 'ol'],
1305                           'dl' : [],
1306                           'dd' : ['dl'],
1307                           'dt' : ['dl'] }
1308
1309    #Tables can contain other tables, but there are restrictions.   
1310    NESTABLE_TABLE_TAGS = {'table' : [], 
1311                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1312                           'td' : ['tr'],
1313                           'th' : ['tr'],
1314                           'thead' : ['table'],
1315                           'tbody' : ['table'],
1316                           'tfoot' : ['table'],
1317                           }
1318
1319    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1320
1321    #If one of these tags is encountered, all tags up to the next tag of
1322    #this type are popped.
1323    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1324                                     NON_NESTABLE_BLOCK_TAGS,
1325                                     NESTABLE_LIST_TAGS,
1326                                     NESTABLE_TABLE_TAGS)
1327
1328    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1329                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1330
1331    # Used to detect the charset in a META tag; see start_meta
1332    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1333
1334    def start_meta(self, attrs):
1335        """Beautiful Soup can detect a charset included in a META tag,
1336        try to convert the document to that charset, and re-parse the
1337        document from the beginning."""
1338        httpEquiv = None
1339        contentType = None
1340        contentTypeIndex = None
1341        tagNeedsEncodingSubstitution = False
1342
1343        for i in range(0, len(attrs)):
1344            key, value = attrs[i]
1345            key = key.lower()
1346            if key == 'http-equiv':
1347                httpEquiv = value
1348            elif key == 'content':
1349                contentType = value
1350                contentTypeIndex = i
1351
1352        if httpEquiv and contentType: # It's an interesting meta tag.
1353            match = self.CHARSET_RE.search(contentType)
1354            if match:
1355                if getattr(self, 'declaredHTMLEncoding') or \
1356                       (self.originalEncoding == self.fromEncoding):
1357                    # This is our second pass through the document, or
1358                    # else an encoding was specified explicitly and it
1359                    # worked. Rewrite the meta tag.
1360                    newAttr = self.CHARSET_RE.sub\
1361                              (lambda(match):match.group(1) +
1362                               "%SOUP-ENCODING%", value)
1363                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1364                                               newAttr)
1365                    tagNeedsEncodingSubstitution = True
1366                else:
1367                    # This is our first pass through the document.
1368                    # Go through it again with the new information.
1369                    newCharset = match.group(3)
1370                    if newCharset and newCharset != self.originalEncoding:
1371                        self.declaredHTMLEncoding = newCharset
1372                        self._feed(self.declaredHTMLEncoding)
1373                        raise StopParsing
1374        tag = self.unknown_starttag("meta", attrs)
1375        if tag and tagNeedsEncodingSubstitution:
1376            tag.containsSubstitutions = True
1377
1378class StopParsing(Exception):
1379    pass
1380   
1381class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1382
1383    """The BeautifulSoup class is oriented towards skipping over
1384    common HTML errors like unclosed tags. However, sometimes it makes
1385    errors of its own. For instance, consider this fragment:
1386
1387     <b>Foo<b>Bar</b></b>
1388
1389    This is perfectly valid (if bizarre) HTML. However, the
1390    BeautifulSoup class will implicitly close the first b tag when it
1391    encounters the second 'b'. It will think the author wrote
1392    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1393    there's no real-world reason to bold something that's already
1394    bold. When it encounters '</b></b>' it will close two more 'b'
1395    tags, for a grand total of three tags closed instead of two. This
1396    can throw off the rest of your document structure. The same is
1397    true of a number of other tags, listed below.
1398
1399    It's much more common for someone to forget to close a 'b' tag
1400    than to actually use nested 'b' tags, and the BeautifulSoup class
1401    handles the common case. This class handles the not-co-common
1402    case: where you can't believe someone wrote what they did, but
1403    it's valid HTML and BeautifulSoup screwed up by assuming it
1404    wouldn't be."""
1405
1406    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1407     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1408      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1409      'big']
1410
1411    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1412
1413    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1414                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1415                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1416
1417class MinimalSoup(BeautifulSoup):
1418    """The MinimalSoup class is for parsing HTML that contains
1419    pathologically bad markup. It makes no assumptions about tag
1420    nesting, but it does know which tags are self-closing, that
1421    <script> tags contain Javascript and should not be parsed, that
1422    META tags may contain encoding information, and so on.
1423
1424    This also makes it better for subclassing than BeautifulStoneSoup
1425    or BeautifulSoup."""
1426   
1427    RESET_NESTING_TAGS = buildTagMap('noscript')
1428    NESTABLE_TAGS = {}
1429
1430class BeautifulSOAP(BeautifulStoneSoup):
1431    """This class will push a tag with only a single string child into
1432    the tag's parent as an attribute. The attribute's name is the tag
1433    name, and the value is the string child. An example should give
1434    the flavor of the change:
1435
1436    <foo><bar>baz</bar></foo>
1437     =>
1438    <foo bar="baz"><bar>baz</bar></foo>
1439
1440    You can then access fooTag['bar'] instead of fooTag.barTag.string.
1441
1442    This is, of course, useful for scraping structures that tend to
1443    use subelements instead of attributes, such as SOAP messages. Note
1444    that it modifies its input, so don't print the modified version
1445    out.
1446
1447    I'm not sure how many people really want to use this class; let me
1448    know if you do. Mainly I like the name."""
1449
1450    def popTag(self):
1451        if len(self.tagStack) > 1:
1452            tag = self.tagStack[-1]
1453            parent = self.tagStack[-2]
1454            parent._getAttrMap()
1455            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1456                isinstance(tag.contents[0], NavigableString) and 
1457                not parent.attrMap.has_key(tag.name)):
1458                parent[tag.name] = tag.contents[0]
1459        BeautifulStoneSoup.popTag(self)
1460
1461#Enterprise class names! It has come to our attention that some people
1462#think the names of the Beautiful Soup parser classes are too silly
1463#and "unprofessional" for use in enterprise screen-scraping. We feel
1464#your pain! For such-minded folk, the Beautiful Soup Consortium And
1465#All-Night Kosher Bakery recommends renaming this file to
1466#"RobustParser.py" (or, in cases of extreme enterprisness,
1467#"RobustParserBeanInterface.class") and using the following
1468#enterprise-friendly class aliases:
1469class RobustXMLParser(BeautifulStoneSoup):
1470    pass
1471class RobustHTMLParser(BeautifulSoup):
1472    pass
1473class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1474    pass
1475class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1476    pass
1477class SimplifyingSOAPParser(BeautifulSOAP):
1478    pass
1479
1480######################################################
1481#
1482# Bonus library: Unicode, Dammit
1483#
1484# This class forces XML data into a standard format (usually to UTF-8
1485# or Unicode).  It is heavily based on code from Mark Pilgrim's
1486# Universal Feed Parser. It does not rewrite the XML or HTML to
1487# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1488# (XML) and BeautifulSoup.start_meta (HTML).
1489
1490# Autodetects character encodings.
1491# Download from http://chardet.feedparser.org/
1492try:
1493    import chardet
1494#    import chardet.constants
1495#    chardet.constants._debug = 1
1496except:
1497    chardet = None
1498chardet = None
1499
1500# cjkcodecs and iconv_codec make Python know about more character encodings.
1501# Both are available from http://cjkpython.i18n.org/
1502# They're built in if you use Python 2.4.
1503try:
1504    import cjkcodecs.aliases
1505except:
1506    pass
1507try:
1508    import iconv_codec
1509except:
1510    pass
1511
1512class UnicodeDammit:
1513    """A class for detecting the encoding of a *ML document and
1514    converting it to a Unicode string. If the source encoding is
1515    windows-1252, can replace MS smart quotes with their HTML or XML
1516    equivalents."""
1517
1518    # This dictionary maps commonly seen values for "charset" in HTML
1519    # meta tags to the corresponding Python codec names. It only covers
1520    # values that aren't in Python's aliases and can't be determined
1521    # by the heuristics in find_codec.
1522    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1523                        "x-sjis" : "shift-jis" }
1524
1525    def __init__(self, markup, overrideEncodings=[],
1526                 smartQuotesTo='xml'):
1527        self.markup, documentEncoding, sniffedEncoding = \
1528                     self._detectEncoding(markup)
1529        self.smartQuotesTo = smartQuotesTo
1530        self.triedEncodings = []
1531        if markup == '' or isinstance(markup, unicode):
1532            self.originalEncoding = None
1533            self.unicode = unicode(markup)           
1534            return
1535       
1536        u = None
1537        for proposedEncoding in overrideEncodings:
1538            u = self._convertFrom(proposedEncoding)
1539            if u: break
1540        if not u:
1541            for proposedEncoding in (documentEncoding, sniffedEncoding):
1542                u = self._convertFrom(proposedEncoding)
1543                if u: break
1544               
1545        # If no luck and we have auto-detection library, try that:
1546        if not u and chardet and not isinstance(self.markup, unicode):
1547            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1548
1549        # As a last resort, try utf-8 and windows-1252:
1550        if not u:
1551            for proposed_encoding in ("utf-8", "windows-1252"):
1552                u = self._convertFrom(proposed_encoding)
1553                if u: break
1554        self.unicode = u
1555        if not u: self.originalEncoding = None
1556
1557    def _subMSChar(self, orig):
1558        """Changes a MS smart quote character to an XML or HTML
1559        entity."""
1560        sub = self.MS_CHARS.get(orig)
1561        if type(sub) == types.TupleType:
1562            if self.smartQuotesTo == 'xml':
1563                sub = '&#x%s;' % sub[1]
1564            else:
1565                sub = '&%s;' % sub[0]
1566        return sub           
1567
1568    def _convertFrom(self, proposed):       
1569        proposed = self.find_codec(proposed)
1570        if not proposed or proposed in self.triedEncodings:
1571            return None
1572        self.triedEncodings.append(proposed)
1573        markup = self.markup
1574
1575        # Convert smart quotes to HTML if coming from an encoding
1576        # that might have them.
1577        if self.smartQuotesTo and proposed.lower() in("windows-1252",
1578                                                      "iso-8859-1",
1579                                                      "iso-8859-2"):
1580            markup = re.compile("([\x80-\x9f])").sub \
1581                     (lambda(x): self._subMSChar(x.group(1)),
1582                      markup)
1583
1584        try:
1585            # print "Trying to convert document to %s" % proposed
1586            u = self._toUnicode(markup, proposed)
1587            self.markup = u       
1588            self.originalEncoding = proposed
1589        except Exception, e:
1590            # print "That didn't work!"
1591            # print e
1592            return None       
1593        #print "Correct encoding: %s" % proposed
1594        return self.markup
1595
1596    def _toUnicode(self, data, encoding):
1597        '''Given a string and its encoding, decodes the string into Unicode.
1598        %encoding is a string recognized by encodings.aliases'''
1599
1600        # strip Byte Order Mark (if present)
1601        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1602               and (data[2:4] != '\x00\x00'):
1603            encoding = 'utf-16be'
1604            data = data[2:]
1605        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1606                 and (data[2:4] != '\x00\x00'):
1607            encoding = 'utf-16le'
1608            data = data[2:]
1609        elif data[:3] == '\xef\xbb\xbf':
1610            encoding = 'utf-8'
1611            data = data[3:]
1612        elif data[:4] == '\x00\x00\xfe\xff':
1613            encoding = 'utf-32be'
1614            data = data[4:]
1615        elif data[:4] == '\xff\xfe\x00\x00':
1616            encoding = 'utf-32le'
1617            data = data[4:]
1618        newdata = unicode(data, encoding)
1619        return newdata
1620   
1621    def _detectEncoding(self, xml_data):
1622        """Given a document, tries to detect its XML encoding."""
1623        xml_encoding = sniffed_xml_encoding = None
1624        try:
1625            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1626                # EBCDIC
1627                xml_data = self._ebcdic_to_ascii(xml_data)
1628            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1629                # UTF-16BE
1630                sniffed_xml_encoding = 'utf-16be'
1631                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1632            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1633                     and (xml_data[2:4] != '\x00\x00'):
1634                # UTF-16BE with BOM
1635                sniffed_xml_encoding = 'utf-16be'
1636                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1637            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1638                # UTF-16LE
1639                sniffed_xml_encoding = 'utf-16le'
1640                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1641            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1642                     (xml_data[2:4] != '\x00\x00'):
1643                # UTF-16LE with BOM
1644                sniffed_xml_encoding = 'utf-16le'
1645                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1646            elif xml_data[:4] == '\x00\x00\x00\x3c':
1647                # UTF-32BE
1648                sniffed_xml_encoding = 'utf-32be'
1649                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1650            elif xml_data[:4] == '\x3c\x00\x00\x00':
1651                # UTF-32LE
1652                sniffed_xml_encoding = 'utf-32le'
1653                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1654            elif xml_data[:4] == '\x00\x00\xfe\xff':
1655                # UTF-32BE with BOM
1656                sniffed_xml_encoding = 'utf-32be'
1657                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1658            elif xml_data[:4] == '\xff\xfe\x00\x00':
1659                # UTF-32LE with BOM
1660                sniffed_xml_encoding = 'utf-32le'
1661                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1662            elif xml_data[:3] == '\xef\xbb\xbf':
1663                # UTF-8 with BOM
1664                sniffed_xml_encoding = 'utf-8'
1665                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1666            else:
1667                sniffed_xml_encoding = 'ascii'
1668                pass
1669            xml_encoding_match = re.compile \
1670                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1671                                 .match(xml_data)
1672        except:
1673            xml_encoding_match = None
1674        if xml_encoding_match:
1675            xml_encoding = xml_encoding_match.groups()[0].lower()
1676            if sniffed_xml_encoding and \
1677               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1678                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1679                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1680                                 'utf16', 'u16')):
1681                xml_encoding = sniffed_xml_encoding
1682        return xml_data, xml_encoding, sniffed_xml_encoding
1683
1684
1685    def find_codec(self, charset):
1686        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1687               or (charset and self._codec(charset.replace("-", ""))) \
1688               or (charset and self._codec(charset.replace("-", "_"))) \
1689               or charset
1690
1691    def _codec(self, charset):
1692        if not charset: return charset
1693        codec = None
1694        try:
1695            codecs.lookup(charset)
1696            codec = charset
1697        except LookupError:
1698            pass
1699        return codec
1700
1701    EBCDIC_TO_ASCII_MAP = None
1702    def _ebcdic_to_ascii(self, s):
1703        c = self.__class__
1704        if not c.EBCDIC_TO_ASCII_MAP:
1705            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1706                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1707                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1708                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1709                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1710                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1711                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1712                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1713                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1714                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1715                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1716                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1717                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1718                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1719                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1720                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1721                    250,251,252,253,254,255)
1722            import string
1723            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1724            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1725        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1726
1727    MS_CHARS = { '\x80' : ('euro', '20AC'),
1728                 '\x81' : ' ',
1729                 '\x82' : ('sbquo', '201A'),
1730                 '\x83' : ('fnof', '192'),
1731                 '\x84' : ('bdquo', '201E'),
1732                 '\x85' : ('hellip', '2026'),
1733                 '\x86' : ('dagger', '2020'),
1734                 '\x87' : ('Dagger', '2021'),
1735                 '\x88' : ('circ', '2C6'),
1736                 '\x89' : ('permil', '2030'),
1737                 '\x8A' : ('Scaron', '160'),
1738                 '\x8B' : ('lsaquo', '2039'),
1739                 '\x8C' : ('OElig', '152'),
1740                 '\x8D' : '?',
1741                 '\x8E' : ('#x17D', '17D'),
1742                 '\x8F' : '?',
1743                 '\x90' : '?',
1744                 '\x91' : ('lsquo', '2018'),
1745                 '\x92' : ('rsquo', '2019'),
1746                 '\x93' : ('ldquo', '201C'),
1747                 '\x94' : ('rdquo', '201D'),
1748                 '\x95' : ('bull', '2022'),
1749                 '\x96' : ('ndash', '2013'),
1750                 '\x97' : ('mdash', '2014'),
1751                 '\x98' : ('tilde', '2DC'),
1752                 '\x99' : ('trade', '2122'),
1753                 '\x9a' : ('scaron', '161'),
1754                 '\x9b' : ('rsaquo', '203A'),
1755                 '\x9c' : ('oelig', '153'),
1756                 '\x9d' : '?',
1757                 '\x9e' : ('#x17E', '17E'),
1758                 '\x9f' : ('Yuml', ''),}
1759
1760#######################################################################
1761
1762
1763#By default, act as an HTML pretty-printer.
1764if __name__ == '__main__':
1765    import sys
1766    soup = BeautifulSoup(sys.stdin.read())
1767    print soup.prettify()
Note: See TracBrowser for help on using the repository browser.