source: exist/trunk/python/elementtree-1.3/elementtree/ElementTree.py @ 3150

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/elementtree-1.3/elementtree/ElementTree.py@3150
Revision 3150, 50.2 KB checked in by lawrence, 12 years ago (diff)

woops, we didn't include elementtree itself.

Line 
1#
2# ElementTree
3# $Id: ElementTree.py 3276 2007-09-12 06:52:30Z fredrik $
4#
5# light-weight XML support for Python 2.2 and later.
6#
7# history:
8# 2001-10-20 fl   created (from various sources)
9# 2001-11-01 fl   return root from parse method
10# 2002-02-16 fl   sort attributes in lexical order
11# 2002-04-06 fl   TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl   finished TreeBuilder refactoring
13# 2002-07-14 fl   added basic namespace support to ElementTree.write
14# 2002-07-25 fl   added QName attribute support
15# 2002-10-20 fl   fixed encoding in write
16# 2002-11-24 fl   changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl   accept file objects or file names for parse/write
18# 2002-12-04 fl   moved XMLTreeBuilder back to this module
19# 2003-01-11 fl   fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl   added XML literal factory
21# 2003-02-21 fl   added ProcessingInstruction/PI factory
22# 2003-05-11 fl   added tostring/fromstring helpers
23# 2003-05-26 fl   added ElementPath support
24# 2003-07-05 fl   added makeelement factory method
25# 2003-07-28 fl   added more well-known namespace prefixes
26# 2003-08-15 fl   fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl   fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl   markup updates
29# 2003-11-15 fl   fixed nested namespace bug
30# 2004-03-28 fl   added XMLID helper
31# 2004-06-02 fl   added default support to findtext
32# 2004-06-08 fl   fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl   take advantage of post-2.1 expat features
34# 2004-09-03 fl   made Element class visible; removed factory
35# 2005-02-01 fl   added iterparse implementation
36# 2005-03-02 fl   fixed iterparse support for pre-2.2 versions
37# 2005-11-12 fl   added tostringlist/fromstringlist helpers
38# 2006-07-05 fl   merged in selected changes from the 1.3 sandbox
39# 2006-07-05 fl   removed support for 2.1 and earlier
40# 2007-06-21 fl   added deprecation/future warnings
41# 2007-08-25 fl   added doctype hook, added parser version attribute etc
42# 2007-08-26 fl   added new serializer code (better namespace handling, etc)
43# 2007-08-27 fl   warn for broken /tag searches on tree level
44# 2007-09-02 fl   added html/text methods to serializer (experimental)
45# 2007-09-05 fl   added method argument to tostring/tostringlist
46# 2007-09-06 fl   improved error handling
47#
48# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
49#
50# fredrik@pythonware.com
51# http://www.pythonware.com
52#
53# --------------------------------------------------------------------
54# The ElementTree toolkit is
55#
56# Copyright (c) 1999-2007 by Fredrik Lundh
57#
58# By obtaining, using, and/or copying this software and/or its
59# associated documentation, you agree that you have read, understood,
60# and will comply with the following terms and conditions:
61#
62# Permission to use, copy, modify, and distribute this software and
63# its associated documentation for any purpose and without fee is
64# hereby granted, provided that the above copyright notice appears in
65# all copies, and that both that copyright notice and this permission
66# notice appear in supporting documentation, and that the name of
67# Secret Labs AB or the author not be used in advertising or publicity
68# pertaining to distribution of the software without specific, written
69# prior permission.
70#
71# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
72# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
73# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
74# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
75# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
76# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
77# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
78# OF THIS SOFTWARE.
79# --------------------------------------------------------------------
80
81from __future__ import generators
82
# Public API of this module: the names listed here are the ones exported
# by a star-import.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser", "XMLTreeBuilder",
    ]
100
101##
102# The <b>Element</b> type is a flexible container object, designed to
103# store hierarchical data structures in memory. The type can be
104# described as a cross between a list and a dictionary.
105# <p>
106# Each element has a number of properties associated with it:
107# <ul>
108# <li>a <i>tag</i>. This is a string identifying what kind of data
109# this element represents (the element type, in other words).</li>
110# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
111# <li>a <i>text</i> string.</li>
112# <li>an optional <i>tail</i> string.</li>
113# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
114# </ul>
115#
116# To create an element instance, use the {@link #Element} constructor
117# or the {@link #SubElement} factory function.
118# <p>
119# The {@link #ElementTree} class can be used to wrap an element
120# structure, and convert it from and to XML.
121##
122
123import sys, re
124
125class _SimpleElementPath(object):
126    # emulate pre-1.2 find/findtext/findall behaviour
127    def find(self, element, tag):
128        for elem in element:
129            if elem.tag == tag:
130                return elem
131        return None
132    def findtext(self, element, tag, default=None):
133        for elem in element:
134            if elem.tag == tag:
135                return elem.text or ""
136        return default
137    def findall(self, element, tag):
138        if tag[:3] == ".//":
139            return element.getiterator(tag[3:])
140        result = []
141        for elem in element:
142            if elem.tag == tag:
143                result.append(elem)
144        return result
145
# Use the full ElementPath implementation when it can be imported;
# otherwise fall back on the simple emulator defined above, which is
# bound to the same name so the Element find methods can call either one.
try:
    import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    ElementPath = _SimpleElementPath()

# version string for this module (listed in __all__)
VERSION = "1.3a2"
153
class ParseError(SyntaxError):
    """Parse error exception exported by this module.

    A plain SyntaxError subclass that adds no behaviour of its own.
    """
156
157# --------------------------------------------------------------------
158
159##
160# Checks if an object appears to be a valid element object.
161#
# @param element An element instance.
163# @return A true value if this is an element object.
164# @defreturn flag
165
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    # Real Element instances always qualify ...
    if isinstance(element, Element):
        return True
    # ... and so does any other object that exposes a "tag" attribute.
    return hasattr(element, "tag")
170
171##
172# Element class.  This class defines the Element interface, and
173# provides a reference implementation of this interface.
174# <p>
175# The element name, attribute names, and attribute values can be
176# either 8-bit ASCII strings or Unicode strings.
177#
178# @param tag The element name.
179# @param attrib An optional dictionary, containing element attributes.
180# @param **extra Additional attributes, given as keyword arguments.
181# @see Element
182# @see SubElement
183# @see Comment
184# @see ProcessingInstruction
185
class Element(object):
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #Element.get},
    # {@link #Element.set},
    # {@link #Element.keys}, and
    # {@link #Element.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    def __init__(self, tag, attrib={}, **extra):
        # work on a copy, so that neither the caller's dictionary nor
        # the shared default argument is ever modified
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (repr(self.tag), id(self))

    ##
    # Creates a new element object of the same type as this element.
    # The new element is not attached to this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        # use the instance's own class, so that subclasses (and
        # SubElement, which calls this method) create subclass instances
        return self.__class__(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    def __nonzero__(self):
        import warnings
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning
            )
        return len(self._children) != 0 # emulate old behaviour

    ##
    # Returns the given subelement.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence or iterator with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # materialize first: validating a one-shot iterator would
        # otherwise exhaust it before the assignment below
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Appends subelements from a sequence.
    #
    # @param elements A sequence or iterator with zero or more elements.
    # @exception AssertionError If a subelement is not a valid object.
    # @since 1.3

    def extend(self, elements):
        # materialize first: validating a one-shot iterator would
        # otherwise exhaust it, and nothing would be appended
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children.extend(elements)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # (Deprecated) Returns all subelements.  The elements are returned
    # in document order.  Note that the returned list is the element's
    # internal child list, not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        import warnings
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning
            )
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, new or removed
    # elements may or may not be included.  To get a stable set, use the
    # list() function on the iterator, and loop over the resulting list.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return An iterator containing all the matching elements.
    # @defreturn iterator

    def iter(self, tag=None):
        if tag == "*":
            tag = None # "*" is a wildcard, same as no filter
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for match in child.iter(tag):
                yield match

    # compatibility (FIXME: preserve list behaviour too?  the old
    # getiterator returned a list, this alias returns an iterator)
    getiterator = iter

    ##
    # Creates a text iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all inner
    # text.
    #
    # @return An iterator containing all inner text.
    # @defreturn iterator

    def itertext(self):
        if self.text:
            yield self.text
        for e in self:
            for s in e.itertext():
                yield s
            # a child's tail belongs to this element's inner text
            if e.tail:
                yield e.tail
507
508# compatibility
509_Element = _ElementInterface = Element
510
511##
512# Subelement factory.  This function creates an element instance, and
513# appends it to an existing element.
514# <p>
515# The element name, attribute names, and attribute values can be
516# either 8-bit ASCII strings or Unicode strings.
517#
518# @param parent The parent element.
519# @param tag The subelement name.
520# @param attrib An optional dictionary, containing element attributes.
521# @param **extra Additional attributes, given as keyword arguments.
522# @return An element instance.
523# @defreturn Element
524
def SubElement(parent, tag, attrib={}, **extra):
    # Merge the keyword attributes into a private copy, so the caller's
    # dictionary (and the shared default) stay untouched.
    attributes = attrib.copy()
    attributes.update(extra)
    # Let the parent build the child, then attach it at the end.
    child = parent.makeelement(tag, attributes)
    parent.append(child)
    return child
531
532##
533# Comment element factory.  This factory function creates a special
534# element that will be serialized as an XML comment by the standard
535# serializer.
536# <p>
537# The comment string can be either an 8-bit ASCII string or a Unicode
538# string.
539#
540# @param text A string containing the comment string.
541# @return An element instance, representing a comment.
542# @defreturn Element
543
def Comment(text=None):
    # The factory function itself is used as the tag; the serializer
    # recognizes comments by testing "tag is Comment".
    comment = Element(Comment)
    comment.text = text
    return comment
548
549##
550# PI element factory.  This factory function creates a special element
551# that will be serialized as an XML processing instruction by the standard
552# serializer.
553#
554# @param target A string containing the PI target.
555# @param text A string containing the PI contents, if any.
556# @return An element instance, representing a PI.
557# @defreturn Element
558
def ProcessingInstruction(target, text=None):
    # The factory function itself is used as the tag; the element text
    # holds the target, followed by a space and the contents, if any.
    pi = Element(ProcessingInstruction)
    if text:
        pi.text = target + " " + text
    else:
        pi.text = target
    return pi

# short alias for the factory above
PI = ProcessingInstruction
567
568##
569# QName wrapper.  This can be used to wrap a QName attribute value, in
570# order to get proper namespace handling on output.
571#
572# @param text A string containing the QName value, in the form {uri}local,
573#     or, if the tag argument is given, the URI part of a QName.
574# @param tag Optional tag.  If given, the first argument is interpreted as
575#     an URI, and this argument is interpreted as a local name.
576# @return An opaque object, representing the QName.
577
class QName(object):
    # Wraps a qualified name, stored in "{uri}local" form in the text
    # attribute.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: combine URI and local name
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            # one-argument form: the value is used as given
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped string
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text against other QNames, and against the raw
        # value for anything else
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
591
592# --------------------------------------------------------------------
593
594##
595# ElementTree wrapper class.  This class represents an entire element
596# hierarchy, and adds some extra support for serialization to and from
597# standard XML.
598#
599# @param element Optional root element.
600# @keyparam file Optional file handle or file name.  If given, the
601#     tree is initialized with the contents of this XML file.
602
class ElementTree(object):

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened by this method and
    # closed again before the method returns (also on errors).  A file
    # object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @keyparam parser An optional parser instance.  If not given, the
    #     standard {@link XMLParser} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True # we opened it, so we close it
        try:
            if not parser:
                parser = XMLParser(target=TreeBuilder())
            # feed the parser in chunks, to keep memory use bounded
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def iter(self, tag=None):
        assert self._root is not None
        return self._root.iter(tag)

    getiterator = iter

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # a leading slash is searched relative to the root element;
            # warn, and tell the caller which path is actually used
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            # see find() for the handling of a leading slash
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # see find() for the handling of a leading slash
            path = "." + path
            import warnings
            warnings.warn(
                "This search is broken in 1.3 and earlier; if you rely "
                "on the current behaviour, change it to %r" % path,
                FutureWarning
                )
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened by this method and
    # closed again before the method returns (also on errors).  A file
    # object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @keyparam encoding Optional output encoding (default is US-ASCII).
    # @keyparam xml_declaration Controls if an XML declaration should
    #     be added to the file.  Use False for never, True for always,
    #     None for only if not US-ASCII or UTF-8.  None is default.
    # @keyparam default_namespace Optional namespace URI that is mapped
    #     to the empty prefix when the namespace table is built.
    # @keyparam method Optional output method ("xml", "html" or "text";
    #     default is "xml").
    # @exception ValueError If the output method is not known.

    def write(self, file,
              # keyword arguments
              encoding="us-ascii",
              xml_declaration=None,
              default_namespace=None,
              method=None):
        assert self._root is not None
        close_file = False
        if not hasattr(file, "write"):
            file = open(file, "wb")
            close_file = True # we opened it, so we close it
        try:
            write = file.write
            if not method:
                method = "xml"
            if not encoding:
                encoding = "us-ascii"
            elif xml_declaration or (xml_declaration is None and
                                     encoding not in ("utf-8", "us-ascii")):
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                if method == "xml":
                    _serialize_xml(
                        write, self._root, encoding, qnames, namespaces
                        )
                elif method == "html":
                    _serialize_html(
                        write, self._root, encoding, qnames, namespaces
                        )
                else:
                    raise ValueError("unknown method %r" % method)
        finally:
            if close_file:
                file.close()
776
777# --------------------------------------------------------------------
778# serialization support
779
def _namespaces(elem, encoding, default_namespace=None):
    # identify namespaces used in this tree
    #
    # Walks elem and all its subelements, and returns a tuple
    # (qnames, namespaces) with the two tables described below.  The
    # tables are handed to the serializers by ElementTree.write.

    # maps qnames to *encoded* prefix:local names
    # (seeded with None, so that elements with a None tag can be looked up)
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        # the default namespace is represented by the empty prefix
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                # universal name: "{uri}local"
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # not seen before; try the module-level table of
                    # known prefixes, then invent an "nsN" prefix
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never added to the table
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            # the name could not be processed as a string
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName) and tag.text not in qnames:
            add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            # anything but None, a comment or a PI cannot be serialized
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            # QName attribute values are registered as well
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        # ... and so is a QName used as element text
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
846
def _serialize_xml(write, elem, encoding, qnames, namespaces):
    # Serializes elem and its subelements as XML, passing each output
    # fragment to write().  qnames maps tags and attribute names to
    # their serialized form.  namespaces, if given, is written as xmlns
    # declarations on this element; the recursive calls pass None.
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        name = qnames[tag]
        if name is not None:
            write("<" + name)
            attributes = elem.items()
            if attributes or namespaces:
                attributes.sort() # lexical order
                for key, value in attributes:
                    if isinstance(key, QName):
                        key = key.text
                    if not isinstance(value, QName):
                        value = _escape_attrib(value, encoding)
                    else:
                        value = qnames[value.text]
                    write(" %s=\"%s\"" % (qnames[key], value))
                if namespaces:
                    declarations = namespaces.items()
                    declarations.sort(key=lambda x: x[1]) # sort on prefix
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix.encode(encoding),
                            _escape_attrib(uri, encoding)
                            ))
            if not (text or len(elem)):
                # no content at all: use the short form
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text, encoding))
                for child in elem:
                    _serialize_xml(write, child, encoding, qnames, None)
                write("</" + name + ">")
        else:
            # no tag to write: emit the content only
            if text:
                write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_xml(write, child, encoding, qnames, None)
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
895
# Names of the HTML elements that _serialize_html writes without an
# end tag.  Every name needs its own comma: adjacent string literals
# are concatenated, which used to turn "meta" and "param" into the
# single bogus entry "metaparam".
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    # use a set for the membership tests, where the type exists
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass # no built-in set type; keep the tuple
903
def _serialize_html(write, elem, encoding, qnames, namespaces):
    # Writes elem and its subtree as HTML through the write callable,
    # followed by the element's tail text.  Unlike the XML serializer,
    # the start tag is always closed with ">", attribute values go
    # through _escape_attrib_html, the text of script and style
    # elements is written without escaping, and elements listed in
    # HTML_EMPTY get no end tag.
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # qnames maps this tag to None: write the text and the
            # children only, without any markup for the element itself
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items.sort() # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    items = namespaces.items()
                    items.sort(key=lambda x: x[1]) # sort on prefix
                    for v, k in items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            write(">")
            # The lower-cased name is used for the comparisons only.
            # The tag itself is left alone, so that the end tag is
            # spelled exactly like the start tag written above.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
955
def _serialize_text(write, elem, encoding):
    # Writes every string produced by elem.itertext(), and then the
    # element's own tail (if any), each encoded with the given encoding.
    for fragment in elem.itertext():
        write(fragment.encode(encoding))
    trailing = elem.tail
    if trailing:
        write(trailing.encode(encoding))
961
962##
963# Registers a namespace prefix.  The registry is global, and any
964# existing mapping for either the given prefix or the namespace URI
965# will be removed.
966#
967# @param prefix Namespace prefix.
968# @param uri Namespace uri.  Tags and attributes in this namespace
969#     will be serialized with the given prefix, if at all possible.
970# @raise ValueError If the prefix is reserved, or is otherwise
971#     invalid.
972
def register_namespace(prefix, uri):
    # Prefixes of the form "ns" followed by digits are refused; see the
    # error message below.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Entries are deleted inside the loop, so iterate over a snapshot.
    # Deleting from a dictionary while iterating over its live items
    # raises RuntimeError on Python 3.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

# Global registry, updated by register_namespace.  Keys are namespace
# URIs, values are the prefixes registered for them.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
993
def _raise_serialization_error(text):
    # Always raises TypeError; the message shows the repr of the
    # offending value and the name of its type.
    message = "cannot serialize %r (type %s)" % (text, type(text).__name__)
    raise TypeError(message)
998
def _encode(text, encoding):
    # Encodes text without any escaping; characters the encoding cannot
    # represent are written as character references.  Values that
    # cannot be encoded at all are reported as a serialization error.
    try:
        encoded = text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
    else:
        return encoded
1004
def _escape_cdata(text, encoding):
    # escape character data
    try:
        # Every replace is guarded by a membership test: avoiding the
        # do-nothing calls is worth it for strings shorter than 500
        # characters or so, assumed to be by far the most common case.
        # "&" has to come first, or the entities produced by the other
        # replacements would be escaped a second time.
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if char in text:
                text = text.replace(char, entity)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1020
def _escape_attrib(text, encoding):
    # escape attribute value
    try:
        # "&" has to come first, or the entities produced by the other
        # replacements would be escaped a second time
        replacements = (
            ("&", "&amp;"),
            ("<", "&lt;"),
            (">", "&gt;"),
            ("\"", "&quot;"),
            ("\n", "&#10;"),
            )
        for char, entity in replacements:
            if char in text:
                text = text.replace(char, entity)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1037
def _escape_attrib_html(text, encoding):
    # escape attribute value; unlike _escape_attrib, "<" and newlines
    # are passed through as they are
    try:
        # "&" has to come first, or the entities produced by the other
        # replacements would be escaped a second time
        for char, entity in (("&", "&amp;"), (">", "&gt;"), ("\"", "&quot;")):
            if char in text:
                text = text.replace(char, entity)
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1050
1051# --------------------------------------------------------------------
1052
1053##
1054# Generates a string representation of an XML element, including all
1055# subelements.
1056#
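# @param element An Element instance.
# @param encoding Optional output encoding, passed on to the
#     ElementTree write method.
# @param method Optional output method, passed on to the
#     ElementTree write method.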
1058# @return An encoded string containing the XML data.
1059# @defreturn string
1060
def tostring(element, encoding=None, method=None):
    # The tree is written to a minimal stand-in for a file: an object
    # whose write method appends each fragment to a list.  The
    # fragments are joined into a single string at the end.
    class _Sink:
        pass
    fragments = []
    sink = _Sink()
    sink.write = fragments.append
    tree = ElementTree(element)
    tree.write(sink, encoding, method=method)
    return "".join(fragments)
1069
1070##
1071# Generates a string representation of an XML element, including all
1072# subelements.  The string is returned as a sequence of string fragments.
1073#
1074# @param element An Element instance.
1075# @return A sequence object containing the XML data.
1076# @defreturn sequence
1077# @since 1.3
1078
def tostringlist(element, encoding=None, method=None):
    # Same as tostring, but the fragments handed to write are returned
    # as a list instead of being joined.
    #
    # method is optional, for parity with tostring.  It is only passed
    # on to the write method when given, so calls that do not use it
    # behave exactly as before.
    class _Sink:
        pass
    fragments = []
    sink = _Sink()
    sink.write = fragments.append
    tree = ElementTree(element)
    if method is None:
        tree.write(sink, encoding)
    else:
        tree.write(sink, encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return fragments
1088
1089##
1090# Writes an element tree or element structure to sys.stdout.  This
1091# function should be used for debugging only.
1092# <p>
1093# The exact output format is implementation dependent.  In this
1094# version, it's written as an ordinary XML file.
1095#
1096# @param elem An element tree or an individual element.
1097
def dump(elem):
    # debugging aid: accepts a tree or a bare element, which is wrapped
    # in a tree first
    tree = elem
    if not isinstance(tree, ElementTree):
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    # finish the output with a newline, unless the root element's tail
    # already ends with one
    trailing = tree.getroot().tail
    if trailing and trailing[-1] == "\n":
        return
    sys.stdout.write("\n")
1106
1107# --------------------------------------------------------------------
1108# parsing
1109
1110##
1111# Parses an XML document into an element tree.
1112#
1113# @param source A filename or file object containing XML data.
1114# @param parser An optional parser instance.  If not given, the
1115#     standard {@link XMLParser} parser is used.
1116# @return An ElementTree instance
1117
def parse(source, parser=None):
    # create an empty tree, let it load the source, and hand it back
    document = ElementTree()
    document.parse(source, parser)
    return document
1122
1123##
1124# Parses an XML document into an element tree incrementally, and reports
1125# what's going on to the user.
1126#
1127# @param source A filename or file object containing XML data.
1128# @param events A list of events to report back.  If omitted, only "end"
1129#     events are reported.
1130# @param parser An optional parser instance.  If not given, the
1131#     standard {@link XMLParser} parser is used.
1132# @return A (event, elem) iterator.
1133
def iterparse(source, events=None, parser=None):
    # A file opened here (source given as a file name) is owned by the
    # returned iterator, which closes it when parsing ends or fails.
    # A file object passed in by the caller is never closed.
    close_source = 0
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = 1
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except:
        # setting up the parser failed: do not leak the file
        if close_source:
            source.close()
        raise

class _IterParseIterator(object):
    # Iterator returned by iterparse.  It reads the source in chunks,
    # feeds them to the parser, and hands out the (event, value) pairs
    # collected by the handlers installed in __init__.  The root
    # attribute is None until the iteration is exhausted, and holds the
    # value returned by the parser's close method afterwards.

    def __init__(self, source, events, parser, close_source=0):
        self._file = source
        self._owns_file = close_source # true if we have to close _file
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = parser
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the dictionary based start handler
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = uri.encode("ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def _release_source(self):
        # close the source, but only if iterparse opened it, and only once
        if self._owns_file:
            self._owns_file = 0
            self._file.close()

    def next(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    self.root = self._root
                    raise StopIteration
                # load event buffer; the list is emptied in place,
                # since the handlers hold on to its append method
                del self._events[:]
                self._index = 0
                try:
                    data = self._file.read(16384)
                    if data:
                        self._parser.feed(data)
                    else:
                        self._root = self._parser.close()
                        self._parser = None
                except:
                    # reading or parsing failed; nothing more will be
                    # read from the source
                    self._release_source()
                    raise
                if self._parser is None:
                    # end of data reached; the events collected by
                    # close() are still handed out by the loop
                    self._release_source()
            else:
                self._index = self._index + 1
                return item

    # the same method under the name used by the Python 3 iterator
    # protocol
    __next__ = next

    def __iter__(self):
        return self
1209
1210##
1211# Parses an XML document from a string constant.  This function can
1212# be used to embed "XML literals" in Python code.
1213#
# @param text A string containing XML data.
1215# @param parser An optional parser instance.  If not given, the
1216#     standard {@link XMLParser} parser is used.
1217# @return An Element instance.
1218# @defreturn Element
1219
def XML(text, parser=None):
    # fall back on the standard parser, feed it the complete text in
    # one go, and return whatever its close method returns
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    return root
1225
1226##
1227# Parses an XML document from a string constant, and also returns
1228# a dictionary which maps from element id:s to elements.
1229#
# @param text A string containing XML data.
1231# @param parser An optional parser instance.  If not given, the
1232#     standard {@link XMLParser} parser is used.
1233# @return A tuple containing an Element instance and a dictionary.
1234# @defreturn (Element, dictionary)
1235
def XMLID(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    # map each non-empty "id" attribute value to its element; if a
    # value occurs more than once, the element seen last wins
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id
1247
1248##
1249# Parses an XML document from a string constant.  Same as {@link #XML}.
1250#
1251# @def fromstring(text)
# @param text A string containing XML data.
1253# @return An Element instance.
1254# @defreturn Element
1255
fromstring = XML # second name for the very same function object
1257
1258##
1259# Parses an XML document from a sequence of string fragments.
1260#
1261# @param sequence A list or other sequence containing XML data fragments.
1262# @param parser An optional parser instance.  If not given, the
1263#     standard {@link XMLParser} parser is used.
1264# @return An Element instance.
1265# @defreturn Element
1266# @since 1.3
1267
def fromstringlist(sequence, parser=None):
    # fall back on the standard parser, feed it the fragments one by
    # one in the given order, and return whatever its close method
    # returns
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    root = parser.close()
    return root
1274
1275# --------------------------------------------------------------------
1276
1277##
1278# Generic element structure builder.  This builder converts a sequence
1279# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1280# #TreeBuilder.end} method calls to a well-formed element structure.
1281# <p>
1282# You can use this class to build an element structure using a custom XML
1283# parser, or a parser for some other XML-like format.
1284#
1285# @param element_factory Optional element factory.  This factory
1286#    is called to create new Element instances, as necessary.
1287
class TreeBuilder(object):

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    ##
    # Returns the toplevel document element.  All elements must have
    # been closed, and at least one element must have been created;
    # this is checked with assertions.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: "!= None" would go through the comparison
        # methods of whatever objects the element factory creates
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Attaches the collected character data to the last element: as its
    # tail if that element has been closed already, as its text
    # otherwise.  Data seen before the first element is dropped.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # not the toplevel element: add it to the innermost open one
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1364
1365##
1366# Element structure builder for XML source data, based on the
1367# <b>expat</b> parser.
1368#
1369# @keyparam target Target object.  If omitted, the builder uses an
1370#     instance of the standard {@link #TreeBuilder} class.
1371# @keyparam html Predefine HTML entities.  This flag is not supported
1372#     by the current implementation.
1373# @keyparam encoding Optional encoding.  If given, the value overrides
1374#     the encoding specified in the XML file.
1375# @see #ElementTree
1376# @see #TreeBuilder
1377
class XMLParser(object):

    def __init__(self, html=0, target=None, encoding=None):
        # html is accepted, but not used by this implementation
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat; expat = pyexpat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is used as the namespace separator, see _fixname
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    # Re-raises an expat error as a ParseError that carries the
    # original error code and position.

    def _raiseerror(self, value):
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return text.encode("ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # a name containing the separator is turned into the
                # "{...}..." form by adding the opening brace
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    # start handler; attrib_in is a dictionary

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = fixtext(value)
        return self.target.start(tag, attrib)

    # start handler; attrib_in is a flat sequence of alternating
    # names and values

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
        return self.target.start(tag, attrib)

    def _data(self, text):
        return self.target.data(self._fixtext(text))

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    # Handles what the other callbacks do not cover: entity references,
    # which are looked up in the entity dictionary, and the doctype
    # declaration, whose pieces are collected until they can be
    # reported to the target's doctype method (if it has one).

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self.target.data(self.entity[text[1:-1]])
            except KeyError:
                # self._error is the error class of the expat module
                # chosen in __init__.  Importing xml.parsers.expat
                # again at this point would fail whenever __init__ had
                # to fall back on pyexpat.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # the identifiers are reported without their first and
                # last characters
                if pubid:
                    pubid = pubid[1:-1]
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        try:
            self._parser.Parse(data, 0)
        except self._error:
            # sys.exc_info() rather than "except E, v": this spelling
            # is accepted by Python 2 and Python 3 alike
            self._raiseerror(sys.exc_info()[1])

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
        except self._error:
            self._raiseerror(sys.exc_info()[1])
        tree = self.target.close()
        del self.target, self._parser # get rid of circular references
        return tree
1540
# compatibility: XMLTreeBuilder is a second name for the XMLParser class
XMLTreeBuilder = XMLParser
Note: See TracBrowser for help on using the repository browser.