source: exist/trunk/python/elementtree-1.3a6-20070312-badc/elementtree/ElementC14N.py @ 3638

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/elementtree-1.3a6-20070312-badc/elementtree/ElementC14N.py@3638
Revision 3638, 13.3 KB checked in by pjkersha, 13 years ago (diff)

Latest ET snapshot includes fix to Exclusive C14N inclusive namespace functionality

Line 
1#
2# ElementTree
3# $Id: ElementC14N.py 3394 2008-03-16 23:16:19Z fredrik $
4#
5# canonicalisation (c14n) support for element trees
6#
7# history:
8# 2007-12-14 fl   created (normalized version)
9# 2008-02-12 fl   roundtrip support
10# 2008-03-03 fl   fixed parent map and scope setting/sorting bugs
11# 2008-03-05 fl   fixed namespace declarations in exclusive mode
12# 2008-03-10 fl   added inclusive subset support
13# 2008-03-12 fl   fixed scope import in inclusive subset mode
14#
15# Copyright (c) 2007-2008 by Fredrik Lundh.  All rights reserved.
16#
17# fredrik@pythonware.com
18# http://www.pythonware.com
19#
20# --------------------------------------------------------------------
21# The ElementTree toolkit is
22#
23# Copyright (c) 1999-2008 by Fredrik Lundh
24#
25# By obtaining, using, and/or copying this software and/or its
26# associated documentation, you agree that you have read, understood,
27# and will comply with the following terms and conditions:
28#
29# Permission to use, copy, modify, and distribute this software and
30# its associated documentation for any purpose and without fee is
31# hereby granted, provided that the above copyright notice appears in
32# all copies, and that both that copyright notice and this permission
33# notice appear in supporting documentation, and that the name of
34# Secret Labs AB or the author not be used in advertising or publicity
35# pertaining to distribution of the software without specific, written
36# prior permission.
37#
38# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
39# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
40# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
41# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
42# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
43# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
44# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
45# OF THIS SOFTWARE.
46# --------------------------------------------------------------------
47
48from ElementTree import QName
49from ElementTree import ElementTree, iterparse
50from ElementTree import _namespaces, _raise_serialization_error
51
52# 2.3 compatibility
53
try:
    sorted
except NameError:
    def sorted(list, key=None):
        """Stand-in for the ``sorted`` builtin on interpreters that lack it.

        Returns a new sorted list built from the items of ``list`` (any
        iterable), optionally ordered by ``key(item)``.  The argument
        itself is left untouched, as with the real builtin.
        """
        # NOTE: the parameter keeps its historical name, which hides the
        # ``list`` builtin inside this function; hence the copy is made
        # with a comprehension rather than a ``list(...)`` call.
        items = [item for item in list]
        if key is not None:
            items.sort(lambda a, b: cmp(key(a), key(b)))
        else:
            items.sort()
        return items

try:
    set
except NameError:
    # fall back to the pure-Python set type of the ``sets`` module
    from sets import Set as set
68
69# C14N escape methods
70
71def _escape_cdata_c14n(text):
72    # escape character data
73    try:
74        # it's worth avoiding do-nothing calls for strings that are
75        # shorter than 500 character, or so.  assume that's, by far,
76        # the most common case in most applications.
77        if "&" in text:
78            text = text.replace("&", "&")
79        if "<" in text:
80            text = text.replace("<", "&lt;")
81        if ">" in text:
82            text = text.replace(">", "&gt;")
83        if "\r" in text:
84            text = text.replace("\n", "&#xD;")
85        return text.encode("utf-8")
86    except (TypeError, AttributeError):
87        _raise_serialization_error(text)
88
89def _escape_attrib_c14n(text):
90    # escape attribute value
91    try:
92        if "&" in text:
93            text = text.replace("&", "&amp;")
94        if "<" in text:
95            text = text.replace("<", "&lt;")
96        if "\"" in text:
97            text = text.replace("\"", "&quot;")
98        if "\t" in text:
99            text = text.replace("\t", "&#x9;")
100        if "\n" in text:
101            text = text.replace("\n", "&#xA;")
102        if "\r" in text:
103            text = text.replace("\r", "&#xD;")
104        return text.encode("utf-8")
105    except (TypeError, AttributeError):
106        _raise_serialization_error(text)
107
class WriteC14N:
    """Serialization target that renders start/data/end events as C14N
    markup and hands each piece to the ``write`` callable given to the
    constructor."""

    def __init__(self, write):
        self.write = write

    def start(self, tag, attrs):
        """Write a start tag with its attributes.

        Expects to get the attributes as a list of (name, value) pairs,
        *in order*; they are written exactly in the order given.
        """
        # FIXME: pass in prefix/uri/tag triples instead?
        emit = self.write
        emit("<" + tag.encode("utf-8"))
        for name, value in attrs:
            encoded_name = name.encode("utf-8")
            escaped_value = _escape_attrib_c14n(value)
            emit(" %s=\"%s\"" % (encoded_name, escaped_value))
        emit(">")

    def data(self, data):
        """Write character data, escaped for C14N."""
        escaped = _escape_cdata_c14n(data)
        self.write(escaped)

    def end(self, tag):
        """Write the end tag for ``tag``."""
        closing = "</" + tag.encode("utf-8") + ">"
        self.write(closing)
128
129def _serialize(elem, target, qnames, namespaces):
130
131    # event generator
132    def emit(elem, namespaces=None):
133        tag = qnames[elem.tag]
134        attrib = []
135        # namespaces first, sorted by prefix
136        if namespaces:
137            for v, k in sorted(namespaces.items(), key=lambda x: x[1]):
138                attrib.append(("xmlns:" + k, v))
139        # attributes next, sorted by (uri, local)
140        for k, v in sorted(elem.attrib.items()):
141            attrib.append((qnames[k], v))
142        target.start(tag, attrib)
143        if elem.text:
144            target.data(elem.text)
145        for e in elem:
146            emit(e)
147        target.end(tag)
148        if elem.tail:
149            target.data(elem.tail)
150
151    emit(elem, namespaces)
152
153def _serialize_inclusive(elem, target, scope, parent, nsmap):
154
155    def qname(elem, qname):
156        if qname[:1] == "{":
157            uri, tag = qname[1:].split("}", 1)
158            for prefix, u in _listscopes(elem, scope, parent):
159                if u == uri:
160                    break
161            else:
162                raise IOError("%s not in scope" % uri) # FIXME
163            if prefix == "":
164                return tag # default namespace
165            return prefix + ":" + tag
166        else:
167            return qname
168
169    def emit(elem, nsmap):
170        tag = qname(elem, elem.tag)
171        attrib = []
172        # namespaces first, sorted by prefix
173        namespaces = scope.get(elem)
174        if namespaces or nsmap:
175            if not namespaces:
176                namespaces = []
177            if nsmap:
178                nsdict = dict(namespaces)
179                for p, u in nsmap:
180                    if p not in nsdict:
181                        namespaces.append((p, u))
182            for p, u in sorted(namespaces):
183                if p:
184                    attrib.append(("xmlns:" + p, u))
185        # attributes next, sorted by (uri, local)
186        for k, v in sorted(elem.attrib.items()):
187            attrib.append((qname(elem, k), v))
188        target.start(tag, attrib)
189        if elem.text:
190            target.data(elem.text)
191        for e in elem:
192            emit(e, None)
193        target.end(tag)
194        if elem.tail:
195            target.data(elem.tail)
196
197    emit(elem, nsmap)
198
199def _serialize_exclusive(elem, target, scope, parent, nsmap, nsinclude):
200
201    def qname(elem, qname):
202        if qname[:1] == "{":
203            uri, tag = qname[1:].split("}", 1)
204            for prefix, u in _listscopes(elem, scope, parent):
205                if u == uri:
206                    break
207            else:
208                raise IOError("%s not in scope" % uri)
209            return prefix, uri, prefix + ":" + tag
210        else:
211            return None, None, qname
212
213    stack = [{}]
214
215    def emit(elem, nsmap):
216        # identify target namespaces
217        namespaces = {}
218        rendered = stack[-1].copy()
219        # element tag
220        prefix, uri, tag = qname(elem, elem.tag)
221        if prefix:
222            namespaces[prefix] = uri
223        # attributes
224        attrib = []
225        for k, v in sorted(elem.attrib.items()):
226            prefix, uri, k = qname(elem, k)
227            if prefix:
228                namespaces[prefix] = uri
229            attrib.append((k, v))
230        # explicitly included namespaces
231        if nsinclude:
232            if nsmap:
233                for p, u in nsmap:
234                    if p not in namespaces and p in nsinclude:
235                        namespaces[p] = u
236            if elem in scope:
237                for p, u in scope[elem]:
238                    if p not in namespaces and p in nsinclude:
239                        namespaces[p] = u
240        # build namespace attribute list
241        xmlns = []
242        for p, u in sorted(namespaces.items()):
243            if p and rendered.get(p) != u:
244                xmlns.append(("xmlns:" + p, u))
245            rendered[p] = u
246        # serialize
247        target.start(tag, xmlns + attrib)
248        if elem.text:
249            target.data(elem.text)
250        stack.append(rendered)
251        for e in elem:
252            emit(e, None)
253        stack.pop()
254        target.end(tag)
255        if elem.tail:
256            target.data(elem.tail)
257
258    emit(elem, nsmap)
259
260##
261# (Internal) Hook used by ElementTree's c14n output method
262
263def _serialize_c14n(write, elem, encoding, qnames, namespaces):
264    if encoding != "utf-8":
265        raise ValueError("invalid encoding (%s)" % encoding)
266    _serialize(elem, WriteC14N(write), qnames, namespaces)
267
268##
269# Writes a canonicalized document.
270#
271# @def write(elem, file, subset=None, **options)
272# @param elem Element or ElementTree.  If passed a tree created by {@link
273#     parse}, the function attempts to preserve existing prefixes.
274#     Otherwise, new prefixes are allocated.
275# @param file Output file.  Can be either a filename or a file-like object.
276# @param subset Subset element, if applicable.
277# @param **options Options, given as keyword arguments.
278# @keyparam exclusive Use exclusive C14N.  In this mode, namespaces
279#     declarations are moved to the first element (in document order)
280#     that actually uses the namespace.
281# @keyparam inclusive_namespaces If given, a list or set of prefixes
282#     that should be retained in the serialized document, even if
283#     they're not used.  This applies to exclusive serialization only
284#     (for inclusive subsets, all prefixes are always included).
285
def write(elem, file_or_filename, subset=None,
          exclusive=False, inclusive_namespaces=None):
    """Write a canonicalized document.

    ``elem`` is either an ordinary element/tree, for which new prefixes
    are allocated up front, or a scoped tree (an object carrying
    ``_scope`` and ``_parent``), whose recorded declarations are used.

    ``file_or_filename`` is used directly if it has a ``write`` method;
    otherwise it is opened as a binary file, and closed again when done.

    ``subset`` selects the element to serialize within a scoped tree;
    passing it together with an ordinary tree raises ValueError.
    ``exclusive`` selects exclusive mode, in which
    ``inclusive_namespaces`` lists prefixes to declare even if unused.
    """
    opened_here = not hasattr(file_or_filename, "write")
    if opened_here:
        stream = open(file_or_filename, "wb")
    else:
        stream = file_or_filename
    target = WriteC14N(stream.write)
    try:
        if not hasattr(elem, "_scope"):
            # ordinary tree; allocate new prefixes up front
            if subset is not None:
                raise ValueError("subset only works for scoped trees")
            qnames, namespaces = _namespaces(elem, "utf-8")
            _serialize(elem, target, qnames, namespaces)
            return

        # scoped tree
        scope = elem._scope
        parent = elem._parent

        if subset is None:
            top = elem.getroot()
            nsmap = []
        else:
            # collect the declarations visible from the subset element;
            # the first binding found for a prefix wins
            inherited = {}
            for p, u in _listscopes(subset, scope, parent):
                if p not in inherited:
                    inherited[p] = u
            nsmap = inherited.items()
            top = subset

        if exclusive:
            nsinclude = set(inclusive_namespaces or [])
            _serialize_exclusive(top, target, scope, parent, nsmap, nsinclude)
        else:
            _serialize_inclusive(top, target, scope, parent, nsmap)

    finally:
        # only close what was opened here
        if opened_here:
            stream.close()
330
331##
332# Parses an XML file, and builds a tree annotated with scope and parent
333# information.  To parse from a string, use the StringIO module.
334#
335# @param file A file name or file object.
336# @return An extended ElementTree, with extra scope and parent information
337#    attached to the ElementTree object.
338
def parse(file):
    """Parse ``file`` and build a tree annotated with scope and parent
    information.

    ``file`` is handed to iterparse as-is.  Returns an ElementTree with
    two extra attributes: ``_scope``, mapping each element that has
    namespace declarations to the list of "start-ns" payloads reported
    just before it, and ``_parent``, mapping every non-root element to
    its parent element.
    """

    events = "start", "start-ns", "end"

    root = None
    scope = {}
    parent = {}

    pending = []     # declarations reported since the last start event
    open_elems = []  # elements that have started but not yet ended

    for event, payload in iterparse(file, events):

        if event == "end":
            open_elems.pop()

        elif event == "start-ns":
            pending.append(payload)

        elif event == "start":
            if open_elems:
                parent[payload] = open_elems[-1]
            open_elems.append(payload)
            if root is None:
                root = payload
            if pending:
                # the declarations collected so far belong to this element
                scope[payload] = pending
                pending = []

    tree = ElementTree(root)
    tree._scope = scope
    tree._parent = parent

    return tree
374
375##
376# (Internal) Finds undefined URI:s in a scoped tree.
377
378def _find_open_uris(elem, scope, parent):
379    uris = {} # set of open URIs
380    stack = [{}] # stack of namespace maps
381    def qname(qname):
382        if qname[:1] == "{":
383            uri, tag = qname[1:].split("}", 1)
384            if uri not in stack[-1]:
385                uris[uri] = None
386    def check(elem):
387        ns = stack[-1].copy()
388        if elem in scope:
389            for prefix, uri in scope[elem]:
390                ns[uri] = prefix
391        stack.append(ns)
392        qname(elem.tag)
393        map(qname, elem.keys())
394        map(check, elem)
395        stack.pop()
396    check(elem)
397    return uris.keys()
398
399##
400# (Internal) Returns a sequence of (prefix, uri) pairs.
401
402def _listscopes(elem, scope, parent):
403    while elem is not None:
404        ns = scope.get(elem)
405        if ns:
406            for prefix_uri in ns:
407                yield prefix_uri
408        elem = parent.get(elem)
409
410##
411# (Internal) Finds prefix for given URI in a scoped tree.
412
def _findprefix(elem, scope, parent, uri):
    """Return the prefix bound to ``uri`` as seen from ``elem``.

    The declarations on ``elem`` are searched first, then those of its
    ancestors; the first prefix bound to ``uri`` is returned.  Returns
    None if the URI is not bound anywhere along that path.
    """
    for prefix, bound_uri in _listscopes(elem, scope, parent):
        if bound_uri != uri:
            continue
        return prefix
    return None
418
419
Note: See TracBrowser for help on using the repository browser.