source: TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py @ 2315

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/xmlHandler.py@2315
Revision 2315, 7.7 KB checked in by lawrence, 14 years ago (diff)

Improved parameter handling, and better test cases.

Line 
1# Copyright Bryan Lawrence, CCLRC, 2006
2
3try:
4    from xml.etree import ElementTree as ET
5except ImportError:
6    try:
7        import ElementTree as ET
8    except ImportError:
9        # For some reason when I install ElementTree with easyinstall it
10        # is called "elementree".
11        try:
12            import elementtree.ElementTree as ET
13        except ImportError:
14            from leonardo.thirdParty import ElementTree as ET
15from xml.parsers.expat import ExpatError
16import StringIO
17
18
class xmlHandler:

    ''' Takes an xml file or string and works out how to handle the various
    namespaces in a nicer way than element tree alone will do.

    After construction the instance provides:
        .tree   the root element of the parsed document
        .nsMap  dictionary mapping each namespace uri to its declared prefix
        .html   an html rendition of the document
        .text   an indented plain text rendition of the document '''

    def __init__(self,arg,nsMap=None,string=0,tagDict=None):

        """ Take an xml file or xml string (string=1) argument (arg), and load it into an
        element tree instance which becomes the (.tree) attribute of the class. The namespace
        map found in the document is available as the (.nsMap) attribute of this class and,
        where the element implementation allows it, as an attribute of the tree as well.

        At the same time, create html and text renditions of the xml.
        This usage is for text or html pretty printing. Note that this text and html
        is created on the initial parse, so one can't use this method on a pre-existing
        element-tree object, although the nsMap which is handed out can be used externally
        if the tree is subsequently modified. These renditions are available as .html and
        .text attributes of this class.

        Note that in the html case, the element and attribute tags, as well as attribute
        values are wrapped in spans with specific tags given in the internal tagDict dictionary.
        Every element is wrapped in a div element, with class name also in the tagDict. The
        caller can modify these class names by providing an alternative tagDict with the
        four required keys:
            'e' (element div class),
            'et' (tag span class),
            'at' (attribute tag span class)
            'av' (attribute value span class)

        Everything taken from the document (tags, attribute values, text) is escaped
        (&, < and >) before it is placed in the html rendition.

        The nsMap argument is accepted for backward compatibility only; it is not used.

        Raises ValueError if the xml cannot be parsed.

        Known problems:
            in text parsing, the attribute line widths are not controlled
            comments are stripped
            empty elements do not preserve the <blah/> syntax
            the text rendition is not re-escaped
        """

        # Imported here so the class does not depend on the python 2 only StringIO module.
        import io

        if tagDict is None:
            self.tagDict={'e':'xmlElem','et':'xmlElemTag','at':'xmlAttrTyp','av':'xmlAttrVal'}
        else:
            self.tagDict=tagDict

        f=arg
        if string==1:
            if isinstance(arg,bytes): f=io.BytesIO(arg)
            else: f=io.StringIO(arg)
        self.html=''
        self.text=''
        self.tab='   '
        self.indent=[]
        self.nsMap={}
        self.pw=72      # maximum width of a wrapped line in the text rendition
        # The renditions are collected as lists of fragments and joined once at the end.
        self.__htmlParts=[]
        self.__textParts=[]
        # (element, 'text' or 'tail') whose character data has still to be emitted.
        self.__pending=None
        root=None

        # ElementTree 1.3 onwards raises its own ParseError rather than ExpatError.
        parseErrors=(ExpatError,getattr(ET,'ParseError',ExpatError))
        try:
            for event,elem in ET.iterparse(f,events=('start','end','start-ns')):
                if root is None and event=='start':
                    # only the root element gets the xmlns declarations written out
                    root=elem
                    self.__parse(event,elem,self.__extra())
                else:
                    self.__parse(event,elem)
        except parseErrors as e:
            raise ValueError('XML Parsing error in xmlHandler:%s'%e)
        self.__flush()
        self.html=''.join(self.__htmlParts)
        self.text=''.join(self.__textParts)
        self.__htmlParts=[]
        self.__textParts=[]
        self.tree=root
        try:
            self.tree.nsMap=self.nsMap
        except AttributeError:
            # The C implementation of Element does not accept new attributes;
            # the map is still available as the nsMap attribute of this class.
            pass

    def __parse(self,event,elem,nsextra=()):
        ''' parse each event and create appropriate html and text. nsextra is a
        sequence of extra (name,value) attribute pairs to write before the
        element's own attributes. '''
        if event=='start-ns':
            prefix,uri=elem
            self.nsMap[uri]=prefix
            return
        if event not in ('start','end'): return
        # Anything left over from the previous event is complete by now.
        self.__flush()
        lt,gt='<b>&lt;</b>','<b>&gt;</b>'
        tag=self._parseClark(elem.tag)
        etag='<span class="%s">%s</span>'%(self.tagDict['et'],self.__escape(tag))
        if event=='start':
            self.__htmlParts.append(''.join(['<div class="%s">'%self.tagDict['e'],lt,etag,
                                self.__HTMLattribs(nsextra),self.__HTMLattribs(elem.items()),gt]))
            # the leading indent tabulates the elements
            self.__textParts.append(''.join([''.join(self.indent),'<',tag,
                                self.__textattribs(nsextra),self.__textattribs(elem.items()),'>\n']))
            self.indent.append(self.tab)
            # iterparse does not promise that elem.text is set yet, so emit it later
            self.__pending=(elem,'text')
        else:
            self.__htmlParts.append(''.join([lt,'/',etag,gt,'</div>']))
            self.indent.pop()
            self.__textParts.append('%s</%s>\n'%(''.join(self.indent),tag))
            # likewise elem.tail is only known once the next event has been seen
            self.__pending=(elem,'tail')

    def __flush(self):
        ''' emit the text or tail which was deferred by the previous event '''
        if self.__pending is None: return
        elem,which=self.__pending
        self.__pending=None
        content=getattr(elem,which)
        if content is not None:
            self.__htmlParts.append(self.__escape(content))
            self.__textParts.append(self.__lines(content))

    def __escape(self,s):
        ''' escape &, < and > so that s can be placed in the html rendition '''
        from xml.sax.saxutils import escape
        return escape(s)

    def __HTMLattribs(self,tuplePairs):
        ''' encodes attributes from the tuplePairs as html, returning a string
        with one leading space per attribute '''
        atClass,avClass=self.tagDict['at'],self.tagDict['av']
        parts=[]
        for name,value in tuplePairs:
            newatt=self.__escape(self._parseClark(name))
            parts.append(' <span class="%s">%s</span>="<span class="%s">%s</span>"'%(
                atClass,newatt,avClass,self.__escape(value)))
        return ''.join(parts)

    def __textattribs(self,tuplePairs):
        ''' encodes attributes from the tuplePairs as plain text, returning a
        string with one leading space per attribute '''
        return ''.join([' %s="%s"'%(self._parseClark(name),value)
                        for name,value in tuplePairs])

    def _parseClark(self,name):
        ''' parses the Clark notation ({uri}local) to identify namespaces and return
        them appropriately: prefix:local for a known uri with a prefix, local
        for the default namespace, and the name unchanged otherwise '''
        if name[:1]!='{': return name
        ns,local=name[1:].split('}',1)
        if ns not in self.nsMap: return name
        prefix=self.nsMap[ns]
        if prefix!='':
            return '%s:%s'%(prefix,local)
        return local

    def __extra(self):
        ''' Provides tuplePairs of the namespaces themselves for subsequent encoding'''
        name='xmlns'
        tuplePairs=[]
        for uri in self.nsMap:
            prefix=self.nsMap[uri]
            if prefix!='':
                tuplePairs.append(('%s:%s'%(name,prefix),uri))
            else:
                tuplePairs.append((name,uri))
        return tuplePairs

    def __lines(self,text):
        """ Based on http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
        Wraps text into lines of at most self.pw characters where possible, each
        starting with the current indent and ending with a newline. Any run of
        white space, including line breaks, counts as a single space. A word which
        is too long for a line is left whole on a line of its own. Returns the
        empty string if there is nothing but white space.
        Note the possibility of using concepts of recipe 358117 if unicode is an issue """
        words=text.split()
        if not words: return ''
        indent=''.join(self.indent)
        nl='\n'
        lines=[]
        line=indent+words[0]
        for word in words[1:]:
            # the 1 is for the space between words
            if len(line)+1+len(word)<=self.pw:
                line+=' '+word
            else:
                lines.append(line)
                line=indent+word
        lines.append(line)
        return nl.join(lines)+nl
171           
def stripNamespace(tagName):
    ''' Convenience function: remove the namespace from a tag name given in
    Clark notation ({uri}local), returning just the local part. A name
    without a closing brace is returned unchanged. '''
    head,brace,rest=tagName.partition('}')
    if not brace:
        return tagName
    # only what lies between the first and any second closing brace is kept
    return rest.split('}')[0]
178
179import unittest
class TestCase(unittest.TestCase):
    """Try to generate HTML and TXT from the test xml.

    Each test writes one rendition of the example document to a file in the
    current directory (instance0.html or instance0.txt).
    """

    inputFile = 'examples/neodc.eg1.dif'

    def setUp(self):
        """Parse the example document into self.xml."""
        # If pkg_resources is available assume the module is eggified and
        # get a stream to the input data from the egg.
        try:
            import pkg_resources
        except ImportError:
            # Else take the input file from the directory holding __file__
            import os
            f = open(os.path.join(os.path.dirname(__file__), self.inputFile), 'rb')
        else:
            f = pkg_resources.resource_stream(__name__, self.inputFile)

        # The renditions are built during construction, so the stream can be
        # closed as soon as the handler exists.
        try:
            self.xml = xmlHandler(f)
        finally:
            f.close()

    def _write(self, filename, content):
        """Write content to filename, closing the file even if the write fails."""
        out = open(filename, 'w')
        try:
            out.write(content)
        finally:
            out.close()

    def testHtml(self):
        """The html rendition is not empty and can be written out."""
        self.assertTrue(self.xml.html)
        self._write('instance0.html', self.xml.html)

    def testTxt(self):
        """The text rendition is not empty and can be written out."""
        self.assertTrue(self.xml.text)
        self._write('instance0.txt', self.xml.text)
206
# Run the unit tests defined above when this file is executed as a script.
if __name__=="__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.