source: TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py @ 1925

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py@1925
Revision 1925, 7.5 KB checked in by lawrence, 14 years ago (diff)

Further modifications to wsgi discovery and browse. Discovery now links
to the Dublin Core discovery elements, and some browse functions are
beginning to emerge, but it's still rather broken (and insecure).

Line 
1# Copyright Bryan Lawrence, CCLRC, 2006
2
3try:
4    from xml.etree import ElementTree as ET
5except ImportError:
6    try:
7        import ElementTree as ET
8    except ImportError:
9        # For some reason when I install ElementTree with easyinstall it
10        # is called "elementree".
11        try:
12            import elementtree.ElementTree as ET
13        except ImportError:
14            from leonardo.thirdParty import ElementTree as ET
15
16import StringIO
17
18
class xmlHandler:

    """Parse an XML document and render it as pretty-printed text and HTML.

    Namespaces are shown with the prefixes used in the source document rather
    than in the Clark notation ('{uri}local') that ElementTree uses internally.
    """

    def __init__(self, arg, nsMap=None, string=0, tagDict=None):
        """Load an XML file or XML string into an element tree and render it.

        Parameters:
            arg     -- a filename or file-like object, or (when string=1) a
                       string holding the XML itself.
            nsMap   -- accepted for backward compatibility only; it is
                       ignored, the namespace map is always rebuilt from the
                       document being parsed.
            string  -- set to 1 when arg is the XML text rather than a file.
            tagDict -- optional mapping of CSS class names used in the html
                       rendition, with the four required keys:
                           'e'  (element div class),
                           'et' (tag span class),
                           'at' (attribute tag span class),
                           'av' (attribute value span class).

        Attributes set on the instance:
            tree    -- the root element; the namespace map is also attached
                       to it as tree.nsMap for use by external code.
            nsMap   -- dictionary mapping namespace uri to prefix ('' for the
                       default namespace).
            html    -- html rendition: every element is wrapped in a div, and
                       element tags, attribute names and attribute values are
                       wrapped in spans, with classes taken from tagDict.
                       Character data and attribute values are escaped.
            text    -- indented plain text rendition, wrapped at pw columns.

        The renditions are created during the initial parse, so this class
        cannot be used on a pre-existing element tree, although the nsMap
        can be used externally if the tree is subsequently modified.

        Known problems:
            in text rendering, the attribute line widths are not controlled
            comments are stripped
            empty elements do not preserve the <blah/> syntax
            namespaces declared below the root element are resolved to their
            prefixes but no xmlns attribute is shown for them
        """

        if tagDict is None:
            self.tagDict = {'e': 'xmlElem', 'et': 'xmlElemTag',
                            'at': 'xmlAttrTyp', 'av': 'xmlAttrVal'}
        else:
            self.tagDict = tagDict

        f = arg
        if string == 1:
            f = StringIO.StringIO(arg)
        self.html = ''
        self.text = ''
        self.tab = '   '
        self.indent = []
        self.nsMap = {}
        self.pw = 72

        # iterparse leaves elem.text and elem.tail undefined while it is still
        # streaming, so run the parse to completion before rendering anything.
        # The events are then replayed in their original order.
        events = list(ET.iterparse(f, events=('start', 'end', 'start-ns')))

        first = True
        for event, elem in events:
            if first and event == 'start':
                # The root element also carries the xmlns declarations
                # collected so far.
                self.__parse(event, elem, self.__extra())
                first = False
            else:
                self.__parse(event, elem)
        # The last event is always the end of the root element.
        self.tree = elem
        self.tree.nsMap = self.nsMap

    def __escape(self, value, attribute=False):
        """Escape value for inclusion in the html rendition.

        Escapes &, < and >; double quotes are escaped as well when
        attribute is true.
        """
        # Imported here so that the module-level imports need not change.
        from xml.sax.saxutils import escape
        if attribute:
            return escape(value, {'"': '&quot;'})
        return escape(value)

    def __parse(self, event, elem, nsextra=()):
        """Handle one parse event, appending to self.html and self.text.

        For 'start-ns' events elem is a (prefix, uri) tuple, otherwise it is
        an element. nsextra is a sequence of (name, value) pairs rendered as
        extra attributes ahead of the element's own attributes.
        """
        lt, gt = '<b>&lt;</b>', '<b>&gt;</b>'
        divs = '<div class="%s">' % self.tagDict['e']
        dive = '</div>'

        def etag(s):
            return '<span class="%s">%s</span>' % (self.tagDict['et'],
                                                   self.__escape(s))

        if event == 'start-ns':
            prefix, uri = elem
            self.nsMap[uri] = prefix
        elif event == 'start':
            tag = self._parseClark(elem.tag)
            self.html += ''.join([divs, lt, etag(tag),
                                  self.__HTMLattribs(nsextra),
                                  self.__HTMLattribs(elem.items()), gt])
            self.text += ''.join(self.indent)  # ie tabulate the elements
            self.text += ''.join(['<', tag,
                                  self.__textattribs(nsextra),
                                  self.__textattribs(elem.items()), '>\n'])
            self.indent.append(self.tab)
            if elem.text is not None:
                self.html += self.__escape(elem.text)
                self.text += self.__lines(elem.text)
        elif event == 'end':
            tag = self._parseClark(elem.tag)
            self.html += ''.join([lt, '/', etag(tag), gt, dive])
            self.indent.pop()
            self.text += ''.join(self.indent)
            self.text += '</%s>\n' % tag
            if elem.tail is not None:
                self.text += self.__lines(elem.tail)
                self.html += self.__escape(elem.tail)

    def __HTMLattribs(self, tuplePairs):
        """Render (name, value) pairs as html attributes wrapped in spans."""
        atclass = self.tagDict['at']
        avclass = self.tagDict['av']
        parts = []
        for name, value in tuplePairs:
            newatt = self.__escape(self._parseClark(name))
            newval = self.__escape(value, attribute=True)
            parts.append(' <span class="%s">%s</span>="<span class="%s">%s</span>"'
                         % (atclass, newatt, avclass, newval))
        return ''.join(parts)

    def __textattribs(self, tuplePairs):
        """Render (name, value) pairs as plain text attributes."""
        parts = []
        for name, value in tuplePairs:
            parts.append(' %s="%s"' % (self._parseClark(name), value))
        return ''.join(parts)

    def _parseClark(self, name):
        """Convert a name in Clark notation to prefix:local form.

        Names without a namespace are returned as they are, names in the
        default namespace are returned without a prefix, and names whose
        namespace is not in self.nsMap are returned unchanged.
        """
        if not name.startswith('{'):
            return name
        ns, local = name[1:].split('}', 1)
        if ns not in self.nsMap:
            return name
        prefix = self.nsMap[ns]
        if prefix != '':
            return '%s:%s' % (prefix, local)
        return local

    def __extra(self):
        """Return the known namespaces as (attribute name, uri) pairs."""
        name = 'xmlns'
        tuplePairs = []
        for uri in self.nsMap:
            prefix = self.nsMap[uri]
            if prefix != '':
                tuplePairs.append(('%s:%s' % (name, prefix), uri))
            else:
                tuplePairs.append((name, uri))
        return tuplePairs

    def __lines(self, text):
        """Word-wrap text to self.pw columns at the current indentation.

        Based on http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
        but without the reduce. Any run of whitespace, including line breaks
        inside the text, is treated as a single word separator. Every line
        returned ends with a newline; text holding only whitespace gives ''.
        A word longer than the available width is placed on a line of its
        own rather than being broken.
        """
        words = text.split()
        if not words:
            return ''
        indent = ''.join(self.indent)
        lines = []
        line = ''
        for word in words:
            if not line:
                line = indent + word
            elif len(line) + 1 + len(word) <= self.pw:
                line += ' ' + word
            else:
                lines.append(line)
                line = indent + word
        lines.append(line)
        return '\n'.join(lines) + '\n'
166           
def stripNamespace(tagName):
    """Convenience function: remove the namespace from a Clark notation tag.

    A name of the form '{uri}local' gives 'local'; a name containing no '}'
    is returned unchanged.
    """
    pieces = tagName.split('}')
    if len(pieces) > 1:
        return pieces[1]
    return tagName
173
174import unittest
class TestCase(unittest.TestCase):
    """Try to generate HTML and TXT from the test xml.

    These are smoke tests: each one writes a rendition of the input file to
    the current working directory so that it can be inspected by hand.
    """

    inputFile = 'instance0.xml'

    def setUp(self):
        # If pkg_resources is available assume the module is eggified and
        # get a stream to the input data from the egg.
        try:
            import pkg_resources
            f = pkg_resources.resource_stream(__name__, self.inputFile)
        except ImportError:
            # Else take the input file from the directory holding this module.
            import os
            f = open(os.path.join(os.path.dirname(__file__), self.inputFile),
                     'rb')

        # The handler reads the whole document in its constructor, so the
        # stream can be closed as soon as it returns.
        try:
            self.xml = xmlHandler(f)
        finally:
            f.close()

    def testHtml(self):
        g = open('instance0.html', 'w')
        try:
            g.write(self.xml.html)
        finally:
            g.close()

    def testTxt(self):
        h = open('instance0.txt', 'w')
        try:
            h.write(self.xml.text)
        finally:
            h.close()
201
# Run the unit tests when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.