source: TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py @ 2330

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py@2330
Revision 2330, 7.9 KB checked in by lawrence, 15 years ago (diff)

Sundry test cases plus a bug fix for ticket:666 ... it's done, but I
don't like this bug fix because it shows that I don't really understand
unicode properly. Need to revisit unicode again ... later.

Line 
1# Copyright Bryan Lawrence, CCLRC, 2006
2
3try:
4    from xml.etree import ElementTree as ET
5except ImportError:
6    try:
7        import ElementTree as ET
8    except ImportError:
9        # For some reason, when ElementTree is installed with easy_install
10        # the package is called "elementtree".
11        try:
12            import elementtree.ElementTree as ET
13        except ImportError:
14            from leonardo.thirdParty import ElementTree as ET
15from xml.parsers.expat import ExpatError
16import StringIO
17
18
class xmlHandler:

    ''' Takes an xml file or string, loads it into an element tree, and works
    out how to handle the various namespaces in a nicer way than element tree
    alone will do. Pretty-printed html and text renditions of the document are
    built during the parse. '''

    def __init__(self, arg, nsMap=None, string=0, tagDict=None):

        """ Take an xml file or xml string (string=1) argument (arg), and load it
        into an element tree instance which becomes the (.tree) attribute of the
        class. The namespace map collected during the parse ({uri: prefix}, with
        '' as the prefix of the default namespace) is available as the (.nsMap)
        attribute of this class and, where the Element implementation allows
        ad-hoc attributes, is also attached to the tree for use by external code.

        At the same time, create html and text renditions of the xml, for
        pretty printing. These are created on the initial parse, so one can't
        use this method on a pre-existing element-tree object, although the
        nsMap which is handed out can be used externally if the tree is
        subsequently modified. The renditions are available as the .html and
        .text attributes of this class.

        In the html case, the element and attribute tags, as well as attribute
        values, are wrapped in spans whose class names are given in the tagDict
        dictionary. Every element is wrapped in a div element, with class name
        also in the tagDict. Element text and attribute values are
        markup-escaped in the html rendition. The caller can modify the class
        names by providing an alternative tagDict with the four required keys:
            'e' (element div class),
            'et' (tag span class),
            'at' (attribute tag span class)
            'av' (attribute value span class)

        Arguments:
            arg     - a filename or file-like object, or (when string=1) the
                      xml document itself as a text or byte string.
            nsMap   - accepted for backward compatibility only; it is not used.
            string  - set to 1 when arg holds the document rather than a file.
            tagDict - optional replacement for the default css class names.

        Raises ValueError if the document cannot be parsed.

        Known problems:
            in text parsing, the attribute line widths are not controlled
            comments are stripped
            empty elements do not preserve the <blah/> syntax
            text strings are always encoded as utf-8 before parsing, whatever
                encoding their xml declaration claims
            element text is read on the 'start' event and tails on the 'end'
                event; the ElementTree documentation does not guarantee that
                they are populated at that point, so text straddling a parser
                read-chunk boundary may be missed (NOTE(review): confirm)
        """
        # Imported locally so the class does not depend on the python-2-only
        # StringIO module.
        import io

        if tagDict is None:
            self.tagDict = {'e': 'xmlElem', 'et': 'xmlElemTag',
                            'at': 'xmlAttrTyp', 'av': 'xmlAttrVal'}
        else:
            self.tagDict = tagDict

        f = arg
        if string == 1:
            # Only text needs encoding; byte strings are handed to the parser
            # untouched (encoding an already-encoded non-ascii byte string is
            # what used to blow up).
            if not isinstance(arg, bytes):
                arg = arg.encode('utf-8')
            f = io.BytesIO(arg)

        self.html = ''
        self.text = ''
        self.tab = '   '
        self.indent = []
        self.nsMap = {}
        self.pw = 72
        # The renditions are accumulated as lists of fragments and joined once
        # at the end, rather than by repeated string concatenation.
        self.__htmlParts = []
        self.__textParts = []

        # Older ElementTree releases raise ExpatError; newer ones raise their
        # own ParseError. Both must be reported as ValueError.
        parseErrors = (ExpatError, getattr(ET, 'ParseError', ExpatError))
        root = None
        try:
            for event, elem in ET.iterparse(f, events=('start', 'end', 'start-ns')):
                if root is None and event == 'start':
                    # The root element additionally carries the namespace
                    # declarations seen so far as xmlns attributes.
                    root = elem
                    self.__parse(event, elem, self.__extra())
                else:
                    self.__parse(event, elem)
        except parseErrors as e:
            raise ValueError('XML Parsing error in xmlHandler:%s' % e)
        if root is None:
            raise ValueError('XML Parsing error in xmlHandler:no element found')

        self.html = ''.join(self.__htmlParts)
        self.text = ''.join(self.__textParts)
        self.__htmlParts = []
        self.__textParts = []

        self.tree = root
        try:
            self.tree.nsMap = self.nsMap
        except AttributeError:
            # Some Element implementations (the C accelerated ones) refuse
            # ad-hoc attributes; the map remains available as self.nsMap.
            pass

    def __parse(self, event, elem, nsextra=()):
        ''' Parse one iterparse event and append the corresponding html and
        text fragments. nsextra is a sequence of extra (name, value) attribute
        pairs (the namespace declarations) to show on the element. '''
        from xml.sax.saxutils import escape

        if event == 'start-ns':
            # elem is a (prefix, uri) pair for this event
            prefix, uri = elem
            self.nsMap[uri] = prefix
            return

        lt, gt = '<b>&lt;</b>', '<b>&gt;</b>'
        tag = self._parseClark(elem.tag)
        htmlTag = '<span class="%s">%s</span>' % (self.tagDict['et'], escape(tag))
        html, text = self.__htmlParts, self.__textParts

        if event == 'start':
            html.append(''.join(['<div class="%s">' % self.tagDict['e'],
                                 lt, htmlTag,
                                 self.__HTMLattribs(nsextra),
                                 self.__HTMLattribs(elem.items()),
                                 gt]))
            text.append(''.join(self.indent))  # ie tabulate the elements
            text.append(''.join(['<', tag,
                                 self.__textattribs(nsextra),
                                 self.__textattribs(elem.items()),
                                 '>\n']))
            self.indent.append(self.tab)
            if elem.text is not None:
                # the parser has unescaped the content, so it must be escaped
                # again before being embedded in html
                html.append(escape(elem.text))
                text.append(self.__lines(elem.text))
        elif event == 'end':
            html.append(''.join([lt, '/', htmlTag, gt, '</div>']))
            self.indent.pop()
            text.append(''.join(self.indent))
            text.append('</%s>\n' % tag)
            if elem.tail is not None:
                text.append(self.__lines(elem.tail))
                html.append(escape(elem.tail))

    def __HTMLattribs(self, tuplePairs):
        ''' Encodes the (name, value) attributes in tuplePairs as html, with
        names and values wrapped in spans and markup-escaped. '''
        from xml.sax.saxutils import escape
        template = ' <span class="%s">%%s</span>="<span class="%s">%%s</span>"' % (
            self.tagDict['at'], self.tagDict['av'])
        quote = {'"': '&quot;'}
        return ''.join([template % (escape(self._parseClark(name)),
                                    escape(value, quote))
                        for name, value in tuplePairs])

    def __textattribs(self, tuplePairs):
        ''' Encodes the (name, value) attributes in tuplePairs as plain text. '''
        return ''.join([' %s="%s"' % (self._parseClark(name), value)
                        for name, value in tuplePairs])

    def _parseClark(self, name):
        ''' Parses the Clark notation ({uri}local) to identify namespaces and
        return the name as prefix:local (or just local for the default
        namespace). Names which are not in Clark notation, or whose namespace
        is not in the nsMap, are returned unchanged. '''
        if not name.startswith('{'):
            return name
        ns, closing, local = name[1:].partition('}')
        if not closing or ns not in self.nsMap:
            return name
        prefix = self.nsMap[ns]
        if prefix != '':
            return '%s:%s' % (prefix, local)
        return local

    def __extra(self):
        ''' Provides tuplePairs of the namespace declarations known so far,
        as xmlns attributes, for subsequent encoding. '''
        name = 'xmlns'
        tuplePairs = []
        for uri in self.nsMap:
            prefix = self.nsMap[uri]
            if prefix != '':
                tuplePairs.append(('%s:%s' % (name, prefix), uri))
            else:
                tuplePairs.append((name, uri))
        return tuplePairs

    def __lines(self, text):
        """ Word-wrap text to the page width (self.pw), indenting every line
        by the current indent, and return it with a trailing newline on each
        line. Whitespace only text returns an empty string. Runs of whitespace
        (including internal line breaks) are collapsed to single spaces.
        Loosely based on
        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061 """
        words = text.split()
        if not words:
            return ''
        indent = ''.join(self.indent)
        lines = []
        line = indent
        for word in words:
            if line == indent:
                # the first word on a line is always placed, even if it is
                # too long, so that no indent-only lines are produced
                line += word
            elif len(line) + 1 + len(word) <= self.pw:
                line += ' ' + word
            else:
                lines.append(line)
                line = indent + word
        lines.append(line)
        return '\n'.join(lines) + '\n'
173           
def stripNamespace(tagName):
    ''' Convenience function: given a tag name in Clark notation
    ({namespace}local), return it with the namespace removed. A name
    containing no '}' is returned unchanged. '''
    pieces = tagName.split('}')
    if len(pieces) > 1:
        return pieces[1]
    return tagName
180
181import unittest
class TestCase(unittest.TestCase):
    """Try to generate HTML and TXT from the test xml.

    The input document is parsed once per test in setUp; each test writes one
    of the renditions to a file in the current working directory.
    """

    inputFile = 'examples/neodc.eg1.dif'

    def setUp(self):
        # If pkg_resources is available assume the module is eggified and
        # get a stream to the input data from the egg.
        try:
            import pkg_resources
            f = pkg_resources.resource_stream(__name__, self.inputFile)
        except ImportError:
            # Else take the input file from the directory holding this module
            import os
            f = open(os.path.join(os.path.dirname(__file__), self.inputFile), 'rb')

        # The handler reads the whole document in its constructor, so the
        # stream can be closed as soon as it returns (or fails).
        try:
            self.xml = xmlHandler(f)
        finally:
            f.close()

    def testHtml(self):
        # Write the html rendition out for inspection
        with open('instance0.html', 'w') as g:
            g.write(self.xml.html)

    def testTxt(self):
        # Write the text rendition out for inspection
        with open('instance0.txt', 'w') as h:
            h.write(self.xml.text)
208
# Run the unit tests defined in this module when it is executed as a script.
if __name__=="__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.