source: TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py @ 2487

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py
Revision 2487, 7.9 KB checked in by lawrence, 13 years ago (diff)

Modifications to support cut-down stub-b xquery.
Slight rearrangement of page positioning in discovery. (Needs CSS
support)

Line 
1# Copyright Bryan Lawrence, CCLRC, 2006
2
3try:
4    from xml.etree import cElementTree as ET
5except ImportError:
6    try:
7        import ElementTree as ET
8    except ImportError:
9        # For some reason when I install ElementTree with easyinstall it
10        # is called "elementree".
11        try:
12            import elementtree.ElementTree as ET
13        except ImportError:
14            from leonardo.thirdParty import ElementTree as ET
15from xml.parsers.expat import ExpatError
16import StringIO
17
18
class xmlHandler:
    ''' Takes an xml file or string and works out how to handle the various
    namespaces in a nicer way than element tree alone will do: the Clark
    notation ({uri}tag) is replaced by the prefixes declared in the document,
    and html and plain text renditions of the document are produced.

    Attributes set by the constructor:
        tree    -- the element delivered by the last parse event (the root)
        nsMap   -- dictionary of namespace uri -> prefix ('' for the default)
        html    -- html rendition of the document
        text    -- indented plain text rendition of the document
        tagDict -- css class names used in the html rendition
        tab, pw -- indent unit and page width used for the text rendition
    '''

    def __init__(self, arg, nsMap=None, string=0, tagDict=None):
        """ Take an xml file or xml string (string=1) argument (arg), and load
        it into an element tree instance which becomes the (.tree) attribute of
        the class. The namespace map found while parsing is available as the
        (.nsMap) attribute, for use by external code.

        At the same time, create html and text renditions of the xml.
        This usage is for text or html pretty printing. Note that this text and
        html is created on the initial parse, so one can't use this method on a
        pre-existing element-tree object, although the nsMap which is handed
        out can be used externally if the tree is subsequently modified. These
        renditions are available as .html and .text attributes of this class.

        Note that in the html case, the element and attribute tags, as well as
        attribute values are wrapped in spans with specific classes given in
        the internal tagDict dictionary. Every element is wrapped in a div
        element, with class name also in the tagDict. The caller can modify
        these class names by providing an alternative tagDict with the four
        required keys:
            'e' (element div class),
            'et' (tag span class),
            'at' (attribute tag span class)
            'av' (attribute value span class)
        Names, attribute values and character data are html-escaped in the
        html rendition (they are left as parsed in the text rendition).

        Arguments:
            arg     -- file name or file object, or the xml itself if string=1
            nsMap   -- accepted for backward compatibility only; it is ignored
                       and the map is always rebuilt from the document
            string  -- set to 1 if arg holds the xml rather than a file
            tagDict -- optional replacement css class names (see above)

        Raises ValueError if the xml cannot be parsed.

        Known problems:
            in text parsing, the attribute line widths are not controlled
            comments are stripped
            empty elements do not preserve the <blah/> syntax
        """
        if tagDict is None:
            tagDict = {'e': 'xmlElem', 'et': 'xmlElemTag',
                       'at': 'xmlAttrTyp', 'av': 'xmlAttrVal'}
        self.tagDict = tagDict

        source = arg
        if string == 1:
            # The parser wants bytes. Unicode is encoded as utf-8 (we should
            # really use the encoding specified in the document, but we don't
            # know it properly); byte strings are handed over untouched so that
            # non-ascii input does not blow up in an implicit ascii decode.
            from io import BytesIO
            if isinstance(arg, bytes):
                source = BytesIO(arg)
            else:
                source = BytesIO(arg.encode('utf-8'))

        self.html = ''
        self.text = ''
        self.tab = '   '
        self.indent = []
        self.nsMap = {}
        self.pw = 72
        # The renditions are collected piecewise and joined once at the end;
        # repeated += on an attribute copies the whole string every time.
        self._htmlParts = []
        self._textParts = []
        # (element, 'text' or 'tail') whose character data has still to be
        # written out, see __flush.
        self._pending = None

        # ElementTree 1.3 and later (python 2.7, 3.x) report malformed xml
        # with their own ParseError rather than with expat's ExpatError.
        parseErrors = (ExpatError, getattr(ET, 'ParseError', ExpatError))
        isRoot = True
        try:
            for event, elem in ET.iterparse(source, events=('start', 'end', 'start-ns')):
                if isRoot and event == 'start':
                    # the namespace declarations seen so far are shown as
                    # attributes of the root element
                    self.__parse(event, elem, self.__extra())
                    isRoot = False
                else:
                    self.__parse(event, elem)
        except parseErrors as e:
            raise ValueError('XML Parsing error in xmlHandler:%s' % e)
        self.__flush()
        self.html = ''.join(self._htmlParts)
        self.text = ''.join(self._textParts)
        # the last event is the end of the root element
        self.tree = elem

    def __parse(self, event, elem, nsextra=()):
        ''' parse each event and create appropriate html and text. nsextra
        holds extra (name,value) pairs to show as attributes of the element. '''
        if event == 'start-ns':
            # for this event elem is a (prefix,uri) tuple
            prefix, uri = elem
            self.nsMap[uri] = prefix
            return
        lt, gt = '<b>&lt;</b>', '<b>&gt;</b>'
        tag = self._parseClark(elem.tag)
        etag = '<span class="%s">%s</span>' % (self.tagDict['et'], self.__escape(tag))
        # Whatever character data precedes this tag is complete by now.
        self.__flush()
        if event == 'start':
            self._htmlParts.append(''.join([
                '<div class="%s">' % self.tagDict['e'], lt, etag,
                self.__HTMLattribs(nsextra), self.__HTMLattribs(elem.items()), gt]))
            self._textParts.append(''.join(self.indent))  # ie tabulate the elements
            self._textParts.append(''.join([
                '<', tag,
                self.__textattribs(nsextra), self.__textattribs(elem.items()), '>\n']))
            self.indent.append(self.tab)
            # iterparse does not guarantee that elem.text is known yet, so it
            # is written when the next tag shows up.
            self._pending = (elem, 'text')
        elif event == 'end':
            self._htmlParts.append(''.join([lt, '/', etag, gt, '</div>']))
            self.indent.pop()
            self._textParts.append(''.join(self.indent))
            self._textParts.append('</%s>\n' % tag)
            # likewise elem.tail is only reliable once the next tag is seen
            self._pending = (elem, 'tail')

    def __flush(self):
        ''' write out the text or tail which was deferred by __parse '''
        if self._pending is None:
            return
        elem, attr = self._pending
        self._pending = None
        content = getattr(elem, attr)
        if content is not None:
            self._htmlParts.append(self.__escape(content))
            self._textParts.append(self.__lines(content))

    def __escape(self, s):
        ''' escape the characters which have a special meaning in html '''
        return (s.replace('&', '&amp;').replace('<', '&lt;')
                 .replace('>', '&gt;').replace('"', '&quot;'))

    def __HTMLattribs(self, tuplePairs):
        ''' encodes attributes from the tuplePairs as html spans '''
        parts = []
        for name, value in tuplePairs:
            atag = '<span class="%s">%s</span>' % (
                self.tagDict['at'], self.__escape(self._parseClark(name)))
            aval = '<span class="%s">%s</span>' % (
                self.tagDict['av'], self.__escape(value))
            parts.append(' %s="%s"' % (atag, aval))
        return ''.join(parts)

    def __textattribs(self, tuplePairs):
        ''' encodes attributes from the tuplePairs as plain text '''
        return ''.join([' %s="%s"' % (self._parseClark(name), value)
                        for name, value in tuplePairs])

    def _parseClark(self, name):
        ''' parses the Clark notation to identify namespaces and return them
        appropriately: prefix:local for a known namespace with a prefix, local
        for the default namespace, and the name unchanged otherwise '''
        if not name.startswith('{'):
            return name
        ns, brace, local = name[1:].partition('}')
        if not brace or ns not in self.nsMap:
            return name
        prefix = self.nsMap[ns]
        if prefix != '':
            return '%s:%s' % (prefix, local)
        return local

    def __extra(self):
        ''' Provides tuplePairs of the namespaces themselves for subsequent encoding'''
        name = 'xmlns'
        tuplePairs = []
        for uri, prefix in self.nsMap.items():
            if prefix != '':
                tuplePairs.append(('%s:%s' % (name, prefix), uri))
            else:
                tuplePairs.append((name, uri))
        return tuplePairs

    def __lines(self, text):
        """ Based on http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
        but avoiding the reduce. Returns the words of text, separated by
        single blanks, on lines which start with the current indent and are
        no wider than self.pw (a single word which is too long gets a line to
        itself). Every line ends with a newline; internal line breaks are
        treated like any other white space. Returns '' if there are no words.
        Note the possibility of using concepts of recipe 358117 if unicode is an issue """
        words = text.split()
        if not words:
            return ''
        indent = ''.join(self.indent)
        lines = []
        line = indent + words[0]
        for word in words[1:]:
            # +1 for the blank which separates the words
            if len(line) + 1 + len(word) <= self.pw:
                line += ' ' + word
            else:
                lines.append(line)
                line = indent + word
        lines.append(line)
        return '\n'.join(lines) + '\n'

def stripNamespace(tagName):
    ''' Convenience function: given a tag name in Clark notation ({uri}local)
    drop the namespace and return the piece which follows the first '}' (up to
    any further '}'). A name which holds no '}' is returned as it is. '''
    pieces = tagName.split('}')
    return pieces[1] if len(pieces) > 1 else tagName

179
import unittest


class TestCase(unittest.TestCase):
    """Try to generate HTML and TXT from the test xml.

    The renditions are written to instance0.html and instance0.txt in the
    current directory so that they can be inspected by eye.
    """

    inputFile = 'examples/neodc.eg1.dif'

    def setUp(self):
        """Parse the example input into self.xml."""
        # If pkg_resources is available assume the module is eggified and
        # get a stream to the input data from the egg.
        try:
            import pkg_resources
        except ImportError:
            # Else take the input file from the directory holding this module
            import os
            f = open(os.path.join(os.path.dirname(__file__), self.inputFile), 'rb')
        else:
            f = pkg_resources.resource_stream(__name__, self.inputFile)

        # the handler reads everything in its constructor, so the stream can
        # be closed straight away
        try:
            self.xml = xmlHandler(f)
        finally:
            f.close()

    def testHtml(self):
        """The html rendition is not empty and can be written to a file."""
        self.assertTrue(self.xml.html)
        with open('instance0.html', 'w') as g:
            g.write(self.xml.html)

    def testTxt(self):
        """The text rendition is not empty and can be written to a file."""
        self.assertTrue(self.xml.text)
        with open('instance0.txt', 'w') as h:
            h.write(self.xml.text)


# Run the tests when the module is executed as a script.
if __name__ == "__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.