source: exist/trunk/python/elementtree-1.3a6-20070310-badc/elementtree/ElementC14N.py @ 3578

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/elementtree-1.3a6-20070310-badc/elementtree/ElementC14N.py@3578
Revision 3578, 13.0 KB checked in by pjkersha, 11 years ago (diff)

Latest releases from Fredrik Lundh. 10 March release has exclusive C14N support with namespace prefixes.

Line 
1#
2# ElementTree
3# $Id: ElementC14N.py 3392 2008-03-10 23:32:45Z fredrik $
4#
5# canonicalisation (c14n) support for element trees
6#
7# history:
8# 2007-12-14 fl   created (normalized version)
9# 2008-02-12 fl   roundtrip support
10# 2008-03-03 fl   fixed parent map and scope setting/sorting bugs
11# 2008-03-05 fl   fixed namespace declarations in exclusive mode
12# 2008-03-10 fl   added inclusive subset support
13#
14# Copyright (c) 2007-2008 by Fredrik Lundh.  All rights reserved.
15#
16# fredrik@pythonware.com
17# http://www.pythonware.com
18#
19# --------------------------------------------------------------------
20# The ElementTree toolkit is
21#
22# Copyright (c) 1999-2008 by Fredrik Lundh
23#
24# By obtaining, using, and/or copying this software and/or its
25# associated documentation, you agree that you have read, understood,
26# and will comply with the following terms and conditions:
27#
28# Permission to use, copy, modify, and distribute this software and
29# its associated documentation for any purpose and without fee is
30# hereby granted, provided that the above copyright notice appears in
31# all copies, and that both that copyright notice and this permission
32# notice appear in supporting documentation, and that the name of
33# Secret Labs AB or the author not be used in advertising or publicity
34# pertaining to distribution of the software without specific, written
35# prior permission.
36#
37# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
38# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
39# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
40# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
41# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
42# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
43# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
44# OF THIS SOFTWARE.
45# --------------------------------------------------------------------
46
47from ElementTree import QName
48from ElementTree import ElementTree, iterparse
49from ElementTree import _namespaces, _raise_serialization_error
50
51# 2.3 compatibility
52
try:
    sorted
except NameError:
    def sorted(iterable, key=None):
        """Minimal stand-in for the built-in sorted() on Python 2.3.

        Returns a new sorted list.  The input is copied first so the
        caller's sequence is left untouched, which is what the real
        built-in does; callers in this module pass lists that belong to
        the tree's scope data.
        """
        items = list(iterable)
        if key:
            items.sort(lambda a, b: cmp(key(a), key(b)))
        else:
            items.sort()
        return items

try:
    set
except NameError:
    # no built-in set type; fall back to the sets module
    from sets import Set as set
67
68# C14N escape methods
69
70def _escape_cdata_c14n(text):
71    # escape character data
72    try:
73        # it's worth avoiding do-nothing calls for strings that are
74        # shorter than 500 character, or so.  assume that's, by far,
75        # the most common case in most applications.
76        if "&" in text:
77            text = text.replace("&", "&")
78        if "<" in text:
79            text = text.replace("<", "&lt;")
80        if ">" in text:
81            text = text.replace(">", "&gt;")
82        if "\r" in text:
83            text = text.replace("\n", "&#xD;")
84        return text.encode("utf-8")
85    except (TypeError, AttributeError):
86        _raise_serialization_error(text)
87
88def _escape_attrib_c14n(text):
89    # escape attribute value
90    try:
91        if "&" in text:
92            text = text.replace("&", "&amp;")
93        if "<" in text:
94            text = text.replace("<", "&lt;")
95        if "\"" in text:
96            text = text.replace("\"", "&quot;")
97        if "\t" in text:
98            text = text.replace("\t", "&#x9;")
99        if "\n" in text:
100            text = text.replace("\n", "&#xA;")
101        if "\r" in text:
102            text = text.replace("\r", "&#xD;")
103        return text.encode("utf-8")
104    except (TypeError, AttributeError):
105        _raise_serialization_error(text)
106
class WriteC14N:
    """Serializer target that renders start/data/end events as markup.

    Every fragment is handed to the write callable supplied at
    construction time.
    """

    def __init__(self, write):
        # callable that receives each output fragment
        self.write = write

    def start(self, tag, attrs):
        """Write a start tag.

        attrs is a sequence of (name, value) pairs and is written in
        the order given; the caller is responsible for sorting.
        """
        # FIXME: pass in prefix/uri/tag triples instead?
        emit = self.write
        emit("<" + tag.encode("utf-8"))
        for name, value in attrs:
            encoded_name = name.encode("utf-8")
            emit(" %s=\"%s\"" % (encoded_name, _escape_attrib_c14n(value)))
        emit(">")

    def data(self, data):
        """Write escaped character data."""
        escaped = _escape_cdata_c14n(data)
        self.write(escaped)

    def end(self, tag):
        """Write an end tag."""
        closing = "</" + tag.encode("utf-8") + ">"
        self.write(closing)
127
128def _serialize(elem, target, qnames, namespaces):
129
130    # event generator
131    def emit(elem, namespaces=None):
132        tag = qnames[elem.tag]
133        attrib = []
134        # namespaces first, sorted by prefix
135        if namespaces:
136            for v, k in sorted(namespaces.items(), key=lambda x: x[1]):
137                attrib.append(("xmlns:" + k, v))
138        # attributes next, sorted by (uri, local)
139        for k, v in sorted(elem.attrib.items()):
140            attrib.append((qnames[k], v))
141        target.start(tag, attrib)
142        if elem.text:
143            target.data(elem.text)
144        for e in elem:
145            emit(e)
146        target.end(tag)
147        if elem.tail:
148            target.data(elem.tail)
149
150    emit(elem, namespaces)
151
152def _serialize_inclusive(elem, target, scope, parent, nsmap):
153
154    def qname(elem, qname):
155        if qname[:1] == "{":
156            uri, tag = qname[1:].split("}", 1)
157            for prefix, u in _listscopes(elem, scope, parent):
158                if u == uri:
159                    break
160            else:
161                raise IOError("%s not in scope" % uri) # FIXME
162            if prefix == "":
163                return tag # default namespace
164            return prefix + ":" + tag
165        else:
166            return qname
167
168    def emit(elem, nsmap):
169        tag = qname(elem, elem.tag)
170        attrib = []
171        # namespaces first, sorted by prefix
172        namespaces = scope.get(elem)
173        if namespaces or nsmap:
174            if not namespaces:
175                namespaces = []
176            if nsmap:
177                nsdict = dict(namespaces)
178                for p, u in nsmap:
179                    if p not in nsdict:
180                        namespaces.append((p, u))
181            for p, u in sorted(namespaces):
182                if p:
183                    attrib.append(("xmlns:" + p, u))
184        # attributes next, sorted by (uri, local)
185        for k, v in sorted(elem.attrib.items()):
186            attrib.append((qname(elem, k), v))
187        target.start(tag, attrib)
188        if elem.text:
189            target.data(elem.text)
190        for e in elem:
191            emit(e, None)
192        target.end(tag)
193        if elem.tail:
194            target.data(elem.tail)
195
196    emit(elem, nsmap)
197
198def _serialize_exclusive(elem, target, scope, parent, nsinclude):
199
200    def qname(elem, qname):
201        if qname[:1] == "{":
202            uri, tag = qname[1:].split("}", 1)
203            for prefix, u in _listscopes(elem, scope, parent):
204                if u == uri:
205                    break
206            else:
207                raise IOError("%s not in scope" % uri)
208            return prefix, uri, prefix + ":" + tag
209        else:
210            return None, None, qname
211
212    stack = [{}]
213
214    def emit(elem):
215        # identify target namespaces
216        namespaces = {}
217        rendered = stack[-1].copy()
218        # element tag
219        prefix, uri, tag = qname(elem, elem.tag)
220        if prefix:
221            namespaces[prefix] = uri
222        # attributes
223        attrib = []
224        for k, v in sorted(elem.attrib.items()):
225            prefix, uri, k = qname(elem, k)
226            if prefix:
227                namespaces[prefix] = uri
228            attrib.append((k, v))
229        # explicitly included namespaces
230        if nsinclude and elem in scope:
231            for p, u in scope[elem]:
232                if p not in namespaces and p in nsinclude:
233                    namespaces[p] = u
234        # build namespace attribute list
235        xmlns = []
236        for p, u in sorted(namespaces.items()):
237            if p and rendered.get(p) != u:
238                xmlns.append(("xmlns:" + p, u))
239            rendered[p] = u
240        # serialize
241        target.start(tag, xmlns + attrib)
242        if elem.text:
243            target.data(elem.text)
244        stack.append(rendered)
245        for e in elem:
246            emit(e)
247        stack.pop()
248        target.end(tag)
249        if elem.tail:
250            target.data(elem.tail)
251
252    emit(elem)
253
254##
255# (Internal) Hook used by ElementTree's c14n output method
256
257def _serialize_c14n(write, elem, encoding, qnames, namespaces):
258    if encoding != "utf-8":
259        raise ValueError("invalid encoding (%s)" % encoding)
260    _serialize(elem, WriteC14N(write), qnames, namespaces)
261
262##
263# Writes a canonicalized document.
264#
265# @def write(elem, file, subset=None, **options)
266# @param elem Element or ElementTree.  If passed a tree created by {@link
267#     parse}, the function attempts to preserve existing prefixes.
268#     Otherwise, new prefixes are allocated.
269# @param file Output file.  Can be either a filename or a file-like object.
270# @param subset Subset element, if applicable.
271# @param **options Options, given as keyword arguments.
272# @keyparam exclusive Use exclusive C14N.  In this mode, namespaces
273#     declarations are moved to the first element (in document order)
274#     that actually uses the namespace.
275# @keyparam inclusive_namespaces If given, a list or set of prefixes
276#     that should be retained in the serialized document, even if
277#     they're not used.  This applies to exclusive serialization only
278#     (for inclusive subsets, all prefixes are always included).
279
def write(elem, file_or_filename, subset=None,
          exclusive=False, inclusive_namespaces=None):
    """Write a canonicalized document.

    elem is an Element or ElementTree.  An object carrying a _scope
    attribute (as attached by parse in this module) is treated as a
    scoped tree and its recorded prefixes are used; for anything else
    new prefixes are allocated up front.

    file_or_filename is either an object with a write method or a file
    name.  A file opened here is closed again before returning; a file
    object passed in is left open.

    subset, if given, is the element to serialize instead of the root.
    It is only supported for scoped trees; ValueError is raised
    otherwise.

    exclusive selects exclusive serialization for scoped trees, and
    inclusive_namespaces is an optional list or set of prefixes to
    keep in exclusive mode even if unused.
    """
    if hasattr(file_or_filename, "write"):
        stream = file_or_filename
    else:
        stream = open(file_or_filename, "wb")
    try:
        out = WriteC14N(stream.write)
        if not hasattr(elem, "_scope"):
            # ordinary tree; allocate new prefixes up front
            if subset is not None:
                raise ValueError("subset only works for scoped trees")
            # The serializer reads elem.tag, so a tree wrapper has to be
            # replaced by its root element first.
            if hasattr(elem, "getroot"):
                elem = elem.getroot()
            qnames, namespaces = _namespaces(elem, "utf-8")
            _serialize(elem, out, qnames, namespaces)
        else:
            # scoped tree
            scope = elem._scope
            parent = elem._parent
            if subset is None:
                top = elem.getroot()
            else:
                top = subset
            if exclusive:
                # exclusive mode
                nsinclude = set(inclusive_namespaces or [])
                _serialize_exclusive(top, out, scope, parent, nsinclude)
            else:
                # inclusive mode
                nsmap = []
                if subset is not None:
                    # bring the namespaces visible at the subset element
                    # into scope; the nearest declaration of a prefix wins
                    visible = {}
                    for p, u in _listscopes(top, scope, parent):
                        if p not in visible:
                            visible[p] = u
                    nsmap = list(visible.items())
                _serialize_inclusive(top, out, scope, parent, nsmap)
    finally:
        # only close what was opened here
        if stream is not file_or_filename:
            stream.close()
323
324##
325# Parses an XML file, and builds a tree annotated with scope and parent
326# information.  To parse from a string, use the StringIO module.
327#
328# @param file A file name or file object.
329# @return An extended ElementTree, with extra scope and parent information
330#    attached to the ElementTree object.
331
def parse(file):
    """Parse an XML file into a tree annotated with scope and parent data.

    file is passed straight to iterparse.  The returned ElementTree has
    two extra attributes:

    _scope maps an element to the list of (prefix, uri) pairs reported
    by the "start-ns" events that immediately preceded its "start"
    event; elements with no such events have no entry.

    _parent maps every element except the root to its parent element.
    """

    events = "start", "start-ns", "end"

    root = None
    pending = [] # declarations seen since the last "start" event

    scope = {}
    parent = {}

    stack = [] # currently open elements, innermost last

    for event, elem in iterparse(file, events):

        if event == "start-ns":
            pending.append(elem)

        elif event == "start":
            if stack:
                parent[elem] = stack[-1]
            stack.append(elem)
            if root is None:
                root = elem
            if pending:
                scope[elem] = pending
                pending = []

        elif event == "end":
            stack.pop()

    # The parent map is built incrementally above, so no second pass
    # over the finished tree is made to cross-check it.
    tree = ElementTree(root)
    tree._scope = scope
    tree._parent = parent

    return tree
370
371##
372# (Internal) Finds undefined URI:s in a scoped tree.
373
374def _find_open_uris(elem, scope, parent):
375    uris = {} # set of open URIs
376    stack = [{}] # stack of namespace maps
377    def qname(qname):
378        if qname[:1] == "{":
379            uri, tag = qname[1:].split("}", 1)
380            if uri not in stack[-1]:
381                uris[uri] = None
382    def check(elem):
383        ns = stack[-1].copy()
384        if elem in scope:
385            for prefix, uri in scope[elem]:
386                ns[uri] = prefix
387        stack.append(ns)
388        qname(elem.tag)
389        map(qname, elem.keys())
390        map(check, elem)
391        stack.pop()
392    check(elem)
393    return uris.keys()
394
395##
396# (Internal) Returns a sequence of (prefix, uri) pairs.
397
398def _listscopes(elem, scope, parent):
399    while elem is not None:
400        ns = scope.get(elem)
401        if ns:
402            for prefix_uri in ns:
403                yield prefix_uri
404        elem = parent.get(elem)
405
406##
407# (Internal) Finds prefix for given URI in a scoped tree.
408
def _findprefix(elem, scope, parent, uri):
    """Return the innermost prefix bound to uri at elem, or None."""
    found = None
    for prefix, bound_uri in _listscopes(elem, scope, parent):
        if bound_uri == uri:
            found = prefix
            break
    return found
414
415
Note: See TracBrowser for help on using the repository browser.