source: TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/browse/ElementTree.py @ 1164

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/StubB/XSLT/browse/portal/cgi/browse/ElementTree.py@1182
Revision 1164, 39.9 KB checked in by lawrence, 14 years ago (diff)

Putting browse code into a module to make it easier
to coexist with other cgi scripts etc ...

Line 
1#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl   created (from various sources)
9# 2001-11-01 fl   return root from parse method
10# 2002-02-16 fl   sort attributes in lexical order
11# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl   finished TreeBuilder refactoring
13# 2002-07-14 fl   added basic namespace support to ElementTree.write
14# 2002-07-25 fl   added QName attribute support
15# 2002-10-20 fl   fixed encoding in write
16# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl   accept file objects or file names for parse/write
18# 2002-12-04 fl   moved XMLTreeBuilder back to this module
19# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl   added XML literal factory
21# 2003-02-21 fl   added ProcessingInstruction/PI factory
22# 2003-05-11 fl   added tostring/fromstring helpers
23# 2003-05-26 fl   added ElementPath support
24# 2003-07-05 fl   added makeelement factory method
25# 2003-07-28 fl   added more well-known namespace prefixes
26# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl   markup updates
29# 2003-11-15 fl   fixed nested namespace bug
30# 2004-03-28 fl   added XMLID helper
31# 2004-06-02 fl   added default support to findtext
32# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl   take advantage of post-2.1 expat features
34# 2005-02-01 fl   added iterparse implementation
35# 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh.  All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
# Names exported by "from ElementTree import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLTreeBuilder",
    ]
86
87##
88# The <b>Element</b> type is a flexible container object, designed to
89# store hierarchical data structures in memory. The type can be
90# described as a cross between a list and a dictionary.
91# <p>
92# Each element has a number of properties associated with it:
93# <ul>
94# <li>a <i>tag</i>. This is a string identifying what kind of data
95# this element represents (the element type, in other words).</li>
96# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
97# <li>a <i>text</i> string.</li>
98# <li>an optional <i>tail</i> string.</li>
99# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
100# </ul>
101#
102# To create an element instance, use the {@link #Element} or {@link
103# #SubElement} factory functions.
104# <p>
105# The {@link #ElementTree} class can be used to wrap an element
106# structure, and convert it from and to XML.
107##
108
109import string, sys, re
110
class _SimpleElementPath:
    # Stand-in installed under the name ElementPath when the real
    # ElementPath module cannot be imported.  It emulates the pre-1.2
    # find/findtext/findall behaviour: plain tag comparison against
    # the direct children of an element.

    def find(self, element, tag):
        # first direct child whose tag equals 'tag', or None
        for elem in element:
            if elem.tag == tag:
                return elem
        return None

    def findtext(self, element, tag, default=None):
        # text of the first direct child whose tag equals 'tag'; a
        # matching child without text yields "", no match yields
        # 'default'
        for elem in element:
            if elem.tag == tag:
                return elem.text or ""
        return default

    def findall(self, element, tag):
        # the only path syntax understood here is a leading ".//",
        # which is handed over to element.getiterator()
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        # otherwise: all direct children with a matching tag, in order
        result = []
        for elem in element:
            if elem.tag == tag:
                result.append(elem)
        return result
131
132try:
133    import ElementPath
134except ImportError:
135    # FIXME: issue warning in this case?
136    ElementPath = _SimpleElementPath()
137
138# TODO: add support for custom namespace resolvers/default namespaces
139# TODO: add improved support for incremental parsing
140
141VERSION = "1.2.6"
142
##
# Internal element class.  This class defines the Element interface,
# and provides a reference implementation of this interface.
# <p>
# You should not create instances of this class directly.  Use the
# appropriate factory functions instead, such as {@link #Element}
# and {@link #SubElement}.
#
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction

class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    ##
    # Initializes the element.  The attribute dictionary is stored as
    # given (it is not copied here); the child list starts out empty.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.

    def __init__(self, tag, attrib):
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # validate every member before touching the child list
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = list(elements)

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  The returned object is the element's own child list,
    # not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.
    # The attribute dictionary is emptied in place; the child list is
    # replaced by a new empty list.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        # "*" is a wildcard; treat it exactly like "no tag given"
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            nodes.append(self)
        # depth-first: each child contributes itself and its subtree
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes

# compatibility
_Element = _ElementInterface
428
##
# Element factory.  This function returns an object implementing the
# standard Element interface.  The exact class or type of that object
# is implementation dependent, but it will always be compatible with
# the {@link #_ElementInterface} class in this module.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def Element(tag, attrib={}, **extra):
    # work on a copy, so that neither the caller's dictionary nor the
    # shared default argument is modified by the update below
    attrib = attrib.copy()
    attrib.update(extra)
    return _ElementInterface(tag, attrib)
448
##
# Subelement factory.  This function creates an element instance, and
# appends it to an existing element.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # work on a copy, so that neither the caller's dictionary nor the
    # shared default argument is modified by the update below
    attrib = attrib.copy()
    attrib.update(extra)
    # let the parent decide what kind of element object to create
    element = parent.makeelement(tag, attrib)
    parent.append(element)
    return element
469
##
# Comment element factory.  This factory function creates a special
# element that will be serialized as an XML comment.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    # the factory function itself is used as the tag; the serializer
    # recognizes comments by checking "tag is Comment"
    element = Element(Comment)
    element.text = text
    return element
485
##
# PI element factory.  This factory function creates a special element
# that will be serialized as an XML processing instruction.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    # the factory function itself is used as the tag; the serializer
    # recognizes PIs by checking "tag is ProcessingInstruction"
    element = Element(ProcessingInstruction)
    # target and contents are stored together in the text attribute,
    # separated by a single space
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction
503
##
# QName wrapper.  This can be used to wrap a QName attribute value, in
# order to get proper namespace handling on output.
# <p>
# QName objects hash and compare like their text, so a QName is equal
# to another QName, or to a plain string, holding the same
# "{uri}local" value.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     an URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.

class QName:
    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri
    def __str__(self):
        return self.text
    def __hash__(self):
        return hash(self.text)
    def __cmp__(self, other):
        # used by interpreters that do not know rich comparisons
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)
    def _text_of(self, other):
        # comparison operand: the text of another QName, or the
        # object itself (usually a plain string)
        if isinstance(other, QName):
            return other.text
        return other
    # rich comparisons mirror __cmp__; without them, interpreters that
    # ignore __cmp__ would compare QNames by identity while __hash__
    # is value based, which breaks dictionary lookups
    def __eq__(self, other):
        return self.text == self._text_of(other)
    def __ne__(self, other):
        return self.text != self._text_of(other)
    def __lt__(self, other):
        return self.text < self._text_of(other)
    def __le__(self, other):
        return self.text <= self._text_of(other)
    def __gt__(self, other):
        return self.text > self._text_of(other)
    def __ge__(self, other):
        return self.text >= self._text_of(other)
527
##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened here and is closed
    # again before this method returns, also when parsing fails.  A
    # file object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in 32k chunks until the source runs dry
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what this method opened
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        # a leading "/" is rewritten to "./", so that the path is
        # evaluated relative to the root element
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened here and is closed
    # again before this method returns, also when serialization fails.
    # A file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for encodings
                # other than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what this method opened
            if opened_here:
                file.close()

    ##
    # (Internal) Serializes a node, including its subelements and its
    # tail text, to the given file object.
    #
    # @param file A file object opened for writing.
    # @param node The element to serialize.
    # @param encoding The output encoding.
    # @param namespaces Dictionary mapping namespace URIs to the
    #     prefixes currently in scope.  Updated while the node is being
    #     written; entries added for this node are removed again before
    #     this method returns.

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            # sort a private copy; this also works if items() returns
            # something that has no sort method
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        # values are only treated as qualified names
                        # if explicitly wrapped in a QName
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # declarations made on this element go out of scope here;
            # v is the namespace URI, which is the dictionary key
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
712
713# --------------------------------------------------------------------
714# helpers
715
##
# Checks if an object appears to be a valid element object.  An object
# is accepted if it is an instance of the element class in this
# module, or if it has a <b>tag</b> attribute.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, _ElementInterface) or hasattr(element, "tag")
727
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    if not isinstance(elem, ElementTree):
        # wrap a bare element, so that the tree writer can be used
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    # make sure the output ends with a newline
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
745
def _encode(s, encoding):
    # encode 's' using the given encoding; objects that have no
    # encode method are passed through unchanged
    try:
        return s.encode(encoding)
    except AttributeError:
        return s # 1.5.2: assume the string uses the right encoding
751
# pattern matching runs of characters that must be written as entities:
# the reserved characters & < > " and everything outside the ascii range
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # the unicode literal is built via eval of a constant string, so
    # that this module still compiles under 1.5.2
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

# named entities for the reserved characters
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# namespace uri -> prefix
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
771
def _raise_serialization_error(text):
    # always raises TypeError; the message shows the repr and the type
    # name of the object that could not be serialized
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )
776
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # called for each run of characters matched by the pattern;
        # 'map' is passed as a default argument to make it visible
        # inside the nested function
        out = []
        append = out.append
        for char in m.group():
            # use the named entity if there is one, otherwise a
            # decimal character reference
            text = map.get(char)
            if text is None:
                text = "&#%d;" % ord(char)
            append(text)
        return string.join(out, "")
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        # 'text' is not something the pattern can be applied to
        _raise_serialization_error(text)
792
#
# the following functions assume an ascii-compatible encoding
# (or "utf-16")

def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # the text cannot be represented in the encoding;
                # _encode_entity does all the escaping in this case
                return _encode_entity(text)
        # "&" must be replaced first, or the entities produced by the
        # other replacements would be escaped once more
        text = replace(text, "&", "&amp;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
811
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # the text cannot be represented in the encoding;
                # _encode_entity does all the escaping in this case
                return _encode_entity(text)
        # "&" must be replaced first, or the entities produced by the
        # other replacements would be escaped once more
        text = replace(text, "&", "&amp;")
        text = replace(text, "'", "&apos;") # FIXME: overkill
        text = replace(text, "\"", "&quot;")
        text = replace(text, "<", "&lt;")
        text = replace(text, ">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
828
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    #
    # 'namespaces' maps namespace uris to prefixes; a uri that is not
    # in there yet is added by this function.  the second return value
    # is an (attribute name, uri) tuple if a declaration has to be
    # written, and None otherwise.
    if isinstance(tag, QName):
        tag = tag.text
    # drop the leading "{" and split at the first "}"
    namespace_uri, tag = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # first use of this uri: pick a well-known prefix if there is
        # one, and generate a new prefix otherwise
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            # no declaration is emitted for the "xml" prefix
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        # prefix already known; no new declaration needed
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
848
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLTreeBuilder} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    # create an empty tree, and let it load the document
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
861
862##
863# Parses an XML document into an element tree incrementally, and reports
864# what's going on to the user.
865#
866# @param source A filename or file object containing XML data.
867# @param events A list of events to report back.  If omitted, only "end"
868#     events are reported.
869# @return A (event, elem) iterator.
870
class iterparse:
    # Incremental parser front-end.  The source is read in 16k chunks
    # which are fed to an XMLTreeBuilder; the expat callbacks installed
    # below queue up (event, payload) tuples, and next() hands them out
    # one at a time.  Once the source is exhausted, the finished tree is
    # available via the "root" attribute.
    #
    # Recognized event names are "start", "end", "start-ns" and "end-ns";
    # any other name in the events list is ignored.
    #
    # If source is a file name, the file is opened here, and closed again
    # when the end of the data is reached or when reading/parsing fails.
    # A file object passed in by the caller is never closed.

    def __init__(self, source, events=None):
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    # use new-style attribute handling, if supported
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # report the uri as a plain ascii string, if possible
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
        # open the source last, so that a failure in the parser setup
        # above cannot leave a file behind; remember whether the file
        # is ours to close
        self._close_file = 0
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = 1
        self._file = source

    def _release_file(self):
        # close the source file, but only if it was opened by __init__
        if self._close_file:
            self._close_file = 0
            self._file.close()

    def next(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # all data has been parsed and reported
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration in this interpreter; end the
                        # __getitem__-style loop instead
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                # release the file unless we know there is more to read;
                # this covers both end of data and read/parse errors
                release = 1
                try:
                    data = self._file.read(16384)
                    if data:
                        self._parser.feed(data)
                        release = 0
                    else:
                        self._root = self._parser.close()
                        self._parser = None
                finally:
                    if release:
                        self._release_file()
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol if available, and fall back on the
    # sequence protocol (which ignores the index) if it is not
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()
949
950##
951# Parses an XML document from a string constant.  This function can
952# be used to embed "XML literals" in Python code.
953#
# @param text A string containing XML data.
955# @return An Element instance.
956# @defreturn Element
957
def XML(text):
    # push the whole string through a fresh parser in one go, and
    # return whatever the parser hands back when it is closed
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
962
963##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element ids to elements.
#
# @param text A string containing XML data.
968# @return A tuple containing an Element instance and a dictionary.
969# @defreturn (Element, dictionary)
970
def XMLID(text):
    # parse the string
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # index every element that carries a non-empty "id" attribute; if
    # the same id shows up more than once, the element visited last wins
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id
981
982##
983# Parses an XML document from a string constant.  Same as {@link #XML}.
984#
985# @def fromstring(text)
# @param text A string containing XML data.
987# @return An Element instance.
988# @defreturn Element
989
# fromstring is a second name for the XML function object itself, not
# a wrapper around it
fromstring = XML
991
992##
993# Generates a string representation of an XML element, including all
994# subelements.
995#
996# @param element An Element instance.
997# @return An encoded string containing the XML data.
998# @defreturn string
999
def tostring(element, encoding=None):
    # the tree writer only needs an object with a "write" method, so
    # hand it a bare object whose write simply appends to a list, and
    # glue the collected pieces together afterwards
    chunks = []
    class sink:
        pass
    target = sink()
    target.write = chunks.append
    tree = ElementTree(element)
    tree.write(target, encoding)
    return string.join(chunks, "")
1008
1009##
1010# Generic element structure builder.  This builder converts a sequence
1011# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1012# #TreeBuilder.end} method calls to a well-formed element structure.
1013# <p>
1014# You can use this class to build an element structure using a custom XML
1015# parser, or a parser for some other XML-like format.
1016#
1017# @param element_factory Optional element factory.  This factory
1018#    is called to create new Element instances, as necessary.
1019
class TreeBuilder:
    # Internal state:
    #   _data     text chunks collected since the last start/end call
    #   _elem     stack of the elements that are currently open
    #   _last     the element that was most recently opened or closed
    #   _tail     true if _last has been closed, in which case pending
    #             text becomes its tail rather than its text
    #   _factory  callable used to create elements; called as
    #             factory(tag, attrs)

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Checks that the structure is complete, and returns the toplevel
    # document element.  An AssertionError is raised if there are
    # elements that have not been closed, or if no element was ever
    # created.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # use an identity test here; "!=" would call the comparison
        # operators of the element, which a custom element factory
        # is free to overload
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # attach the collected text chunks to the last element, either
        # as its text (still open) or as its tail (already closed);
        # text seen before the first element is dropped
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # make the new element a child of the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.  An AssertionError is raised if the
    # tag does not match the element being closed.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1096
1097##
1098# Element structure builder for XML source data, based on the
1099# <b>expat</b> parser.
1100#
1101# @keyparam target Target object.  If omitted, the builder uses an
1102#     instance of the standard {@link #TreeBuilder} class.
1103# @keyparam html Predefine HTML entities.  This flag is not supported
1104#     by the current implementation.
1105# @see #ElementTree
1106# @see #TreeBuilder
1107
class XMLTreeBuilder:
    # Drives an expat parser, and forwards what it reports to a target
    # object via its start(tag, attrib), data(text), end(tag) and close()
    # methods.
    #
    # The html argument is accepted, but not used by this implementation.
    #
    # The public "entity" dictionary maps entity names to replacement
    # text; it is consulted when the parser reports an entity reference
    # through the default handler.

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # ask expat to do namespace processing, using "}" to separate
        # the uri from the local name (see _fixname)
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {}

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # namespaced names arrive as "uri}local"; add the
                # opening brace to get the "{uri}local" form
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for old-style attributes (a dictionary)
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for new-style attributes (a flat list of
        # alternating names and values)
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # catch-all handler; used to resolve entities via the entity
        # dictionary, and to pick up the doctype declaration
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # tokens collected so far: name, PUBLIC/SYSTEM keyword,
                # then the quoted identifier(s)
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # strip the surrounding quotes from the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override this method in a subclass to get at the values.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # after this call, whether it succeeds or not.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
            tree = self._target.close()
        finally:
            # get rid of circular references, also if the final parse
            # step or the target reports an error
            del self._target, self._parser
        return tree
Note: See TracBrowser for help on using the repository browser.