source: exist/trunk/python/ndgUtils/ETxmlView.py @ 3494

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/ETxmlView.py@3494
Revision 3494, 8.1 KB checked in by cbyrom, 12 years ago (diff)

Add undo function missing from this branch compared with code trunk.

Line 
1# Copyright Bryan Lawrence, Rutherford Appleton Laboratory, CCLRC, 2006
2#
3# This code is made available under the GPL, if you don't know what
4# that means, you have no rights to copy it or use it!
5#
6# Code to support xml and elementTree viewing as text and html
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16import re
17
class subAI:
    ''' This is Alan Iwi's substitute and replace orphan <> and & code.

    sub() escapes stray &, < and > characters in a string while leaving
    anything that still looks like a tag or an &amp; / &lt; / &gt; entity
    alone; undo() turns those three entities back into plain characters. '''
    def __init__(self):
        # a '<' followed by a run without '>' that ends at another '<' or at
        # the end of the string, i.e. an opening bracket that is never closed
        self.r1=re.compile('<([^>]*(<|$))')
        # a '>' preceded by a run without '<' that starts at a previous '>' or
        # at the start of the string, i.e. a closing bracket never opened
        self.r2=re.compile('((^|>)[^<]*)>')
        # an '&' which does not begin one of &amp; &lt; &gt;
        self.r3=re.compile('&(?!(amp|lt|gt);)')
    def sub(self,s):
        ''' Return s with orphan &, < and > replaced by &amp;, &lt; and &gt; '''
        s=self.r3.sub(r'&amp;',s)
        # NOTE(review): historical clean-up step kept as it was; it also
        # rewrites a legitimate '&amp;amp;' to '&amp;;' - confirm it is needed
        s=s.replace(';amp',';')
        # each pass can expose further orphans, so repeat until nothing changes
        old=''
        while s != old:
            old=s
            s=self.r1.sub(r'&lt;\1',s)
            s=self.r2.sub(r'\1&gt;',s)
        return s
    def undo(self,s):
        ''' Return s with &gt;, &lt; and &amp; expanded to >, < and & '''
        # &amp; has to be expanded last: doing it first turns '&amp;gt;' (an
        # escaped literal "&gt;") into '&gt;' which would then wrongly become '>'
        s=s.replace('&gt;','>')
        s=s.replace('&lt;','<')
        s=s.replace('&amp;','&')
        return s
38       
class nsdumb:
    ''' provides an xpath interface to element tree nodes which
    is namespace agnostic: callers pass plain tag paths and the default
    namespace worked out at construction time is added for them '''
    def __init__(self,root=None,encoding='utf-8'):
        ''' Provide a root element with namespace definitions when
        instantiating. With no root the namespace is the empty string.
        encoding is only stored on the instance. '''
        self.encoding=encoding
        self.cleanup=subAI()
        self.xmlns=''
        if root is None:
            return
        # an explicit attribute wins: the first of these which is present
        # supplies the namespace (first space separated token of its value)
        candidates=('xmlns','{http://www.w3.org/2001/XMLSchema-instance}schemaLocation')
        for key in candidates:
            if key in root.attrib:
                self.xmlns='{%s}'%root.attrib[key].split(' ')[0]
                return
        # otherwise take it from a '{uri}local' style root tag, if any;
        # slicing keeps an empty tag from raising IndexError
        tag=root.tag
        if tag[:1]=='{':
            self.xmlns='{%s}'%tag[1:].split('}')[0]

    def __str__(self):
        return 'Element Tree namespace helper with namespace: [%s]'%self.xmlns

    def __distributens(self,xpathExpression):
        ''' Prefix each step of a '/' separated path with the namespace.
        Actually we only support tag finding in this. '''
        if not self.xmlns:
            return xpathExpression
        new=[]
        for step in xpathExpression.split('/'):
            if step in ('','.','..'):
                # path syntax rather than a tag name: must not be qualified
                new.append(step)
            else:
                new.append('%s%s'%(self.xmlns,step))
        return '/'.join(new)

    def getText(self,elem,xpathExpression,multiple=0):
        ''' Get a text object sensibly: the cleaned-up text of the first
        match, or with multiple set a list with the text of every match.
        Missing elements and missing text both come back as '' '''
        if elem is None:
            if multiple:
                return ['',]
            return ''
        path=self.__distributens(xpathExpression)
        if multiple:
            found=elem.findall(path)
        else:
            found=[elem.find(path),]
        texts=[]
        for node in found:
            if node is not None:
                texts.append(self.cleanup.sub(node.text or ''))
            else:
                texts.append('')
        if multiple:
            return texts
        return texts[0]

    def find(self,elem,xpathExpression):
        ''' Return relevant subelement (or None if there is no match) '''
        # the '' for a missing parent is kept as is for existing callers
        if elem is None: return ''
        return elem.find(self.__distributens(xpathExpression))

    def findall(self,elem,xpathExpression):
        ''' Return all relevant subelements '''
        if elem is None: return []
        return elem.findall(self.__distributens(xpathExpression))

    def strip(self,tag):
        ''' Given a tag, strip the default namespace '''
        # str.lstrip() removes a *set of characters*, not a prefix, and would
        # also eat the start of the local name ('{http://a.b/c}title' gave
        # 'itle'), so remove the exact prefix instead
        if self.xmlns and tag.startswith(self.xmlns):
            return tag[len(self.xmlns):]
        return tag
105
def et2text(elem,indent='',html=0,space='   ',helper=None):
    '''Lightweight pretty printing of elementTree elements.

    elem is rendered as <tag attributes>text, followed by one line per child
    (indented by a further space for each level) and the closing tag.
    indent is the prefix for this element's own lines, helper is an object
    with a strip(tag) method (an nsdumb built from elem if not given).
    html is accepted but not used. Text and attribute values are written
    as they are, without any escaping.'''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()
    ns=helper
    if ns is None: ns=nsdumb(elem)
    tag=ns.strip(elem.tag)
    parts=['%s<%s'%(indent,tag)]
    for att in elem.attrib:
        parts.append(' %s="%s"'%(att,elem.attrib[att]))
    parts.append('>%s'%estrip(elem.text))
    children=list(elem)
    if children:
        last=len(children)-1
        for n,item in enumerate(children):
            # space is handed down so a custom indent applies at every depth
            parts.append('\n'+et2text(item,indent=indent+space,space=space,helper=ns))
            if n<last:
                # text following a child used to be lost for all but the
                # last child; keep it directly after the child it follows
                parts.append(estrip(item.tail))
        # the last child's tail goes in front of the closing tag
        parts.append('\n%s%s</%s>'%(indent,estrip(children[last].tail),tag))
    else:
        parts.append('</%s>'%tag)
    return ''.join(parts)
128
129
def et2html(elem,matchList=None,number=0,helper=None):
    #this method is NDG code ... copyright CCLRC ...
    '''Lightweight HTML pretty printing of elementTree elements + highlight
    any words which occur in the element text (and tails) which occur in matchList,
    and formatted using a css something like this:
    ===
    DIV.xmlElem {PADDING-LEFT: 20px;}
    .xmlAttrVal {COLOR:Red; }
    .xmlAttrTyp {COLOR:Green; }
    .xmlElemTag {COLOR:Blue; }
    .highlight {BACKGROUND-COLOR:Yellow; }
    ===
    Entries of matchList are used as regular expressions; an entry which is
    not a valid expression is matched literally instead. Text, tails, tag
    names and attributes are escaped for HTML. helper is an object with a
    strip(tag) method (an nsdumb built from elem if not given).
    Line number is not yet implemented.
    '''
    from xml.sax.saxutils import escape
    def esc(x): return escape(x,{'"':'&quot;'})
    def span(x,c): return '<span class="%s">%s</span>'%(c,x)
    def div(x,c): return '<div class="%s">%s</div>'%(c,x)

    # compile the words once, for the whole tree
    patterns=[]
    for w in matchList or []:
        if not w: continue
        try:
            patterns.append(re.compile(w))
        except re.error:
            # e.g. a search term like '(x': treat it as plain text
            patterns.append(re.compile(re.escape(w)))

    def match(x):
        ''' escape x for HTML and wrap every matched word in a highlight span '''
        if x is None: return ''
        # (text, highlighted) pieces; only pieces not yet highlighted are
        # searched, so a later word can never match inside inserted markup
        pieces=[(x,0)]
        for pattern in patterns:
            split=[]
            for text,hit in pieces:
                if hit:
                    split.append((text,hit))
                    continue
                pos=0
                for m in pattern.finditer(text):
                    if m.end()==m.start(): continue
                    if m.start()>pos: split.append((text[pos:m.start()],0))
                    split.append((m.group(0),1))
                    pos=m.end()
                if pos<len(text): split.append((text[pos:],0))
            pieces=split
        out=[]
        for text,hit in pieces:
            if hit: out.append(span(esc(text),'highlight'))
            else: out.append(esc(text))
        return ''.join(out)

    lt,gt='<b>&lt;</b>','<b>&gt;</b>'
    ns=helper
    if ns is None: ns=nsdumb(elem)

    def render(node):
        ''' one xmlElem div for node, with its children nested inside '''
        tag=span(esc(ns.strip(node.tag)),'xmlElemTag')
        out=[lt,tag]
        for att in node.attrib:
            out.append(' %s="%s"'%(span(esc(att),'xmlAttrTyp'),
                                   span(esc(node.attrib[att]),'xmlAttrVal')))
        out.append(gt)
        out.append(match(node.text))
        for item in node:
            out.append(render(item))
            # every child's tail is kept, not only the last one's
            out.append(match(item.tail))
        out.extend([lt,'/',tag,gt])
        return div(''.join(out),'xmlElem')

    return render(elem)
165   
def loadET(inputString):
    ''' This method returns an elementtree object after some cleaning
    of the string, essentially a hack to make sure that xml doesn't contain any
    naughty & characters alone (typically from URL copies), or unescaped orphan
    < or > signs ... and that the unicode has been processed to something that
    might work.

    inputString is expected to be a text string. None is returned for a None
    input, and when none of the encodings could be applied. If the cleaned up
    string still does not parse, it is printed (as an HTML paragraph, one
    LINE: per input line) and the parser's exception is re-raised. '''

    if inputString is None: return None
    # escape every & which does not start an entity; numeric character
    # references (&#169; / &#xA9;) count as entities and are left alone
    inputString=re.sub(r'&(?!(?:\w+|#[0-9]+|#[xX][0-9a-fA-F]+);)', '&amp;', inputString)

    # first just try and do it so we don't waste time if we don't need to ...
    try:
        return ET.fromstring(inputString)
    except Exception:
        # not parseable yet: carry on with the heavier clean-up
        pass

    #ok, let's deal with orphan > and < signs then ...
    cleaned=subAI().sub(inputString)

    #now let's sort out an encoding
    encodings=['utf-8','latin-1','iso-8859-1','ascii',]
    elem=None
    for option in encodings:
        try:
            encoded=cleaned.encode(option,'replace')
            try:
                elem=ET.fromstring(encoded)
            except Exception:
                # show what could not be parsed, then let the error out
                print('<p>%s</p>'%cleaned.replace('\n','<br/>LINE: '))
                raise
        except UnicodeError:
            # this encoding cannot be used, try the next one
            continue
        break
    return elem
205
def xml2text(xmlString):
    ''' Parse xmlString with loadET and return the et2text rendering
    of the resulting element '''
    return et2text(loadET(xmlString))
209
def xml2HTML(xmlString,**kw):
    ''' Parse xmlString with loadET and return the et2html rendering of the
    resulting element; keyword arguments are passed on to et2html '''
    tree=loadET(xmlString)
    # the renderer is spelt et2html: the previous call to et2HTML raised
    # a NameError every time this function was used
    return et2html(tree,**kw)
213
214import unittest
215from os.path import splitext
216
class TestCase(unittest.TestCase):
    """ Tests for loadET and nsdumb, run against the example metadata files
    (paths are relative to the current working directory)
    """
    def setUp(self):
        ''' Load example files for testing, and get pointers to their dataset identifiers '''
        inputFiles = ['examples/neodc.eg1.dif','examples/ukho.eg1.mdip',
                      'examples/bodc.eg2.edmed.dif']
        # the element holding the dataset identifier, by file extension
        indexes={'.dif':'Entry_ID','.mdip':'DatasetIdentifier'}
        # the identifier expected in each input file, in the same order
        self.results=['NOCSDAT192','RSDRA2006000377384','grid.bodc.nerc.ac.uk__DIF__EDMED1048006']
        self.ids=[indexes[splitext(f)[1]] for f in inputFiles]
        self.strings=[]
        for name in inputFiles:
            # open() rather than file(), and always close the handle again
            f=open(name,'r')
            try:
                self.strings.append(f.read())
            finally:
                f.close()

    def testEntries(self):
        ''' Testing the file objects can be loaded using loadET'''
        for s in self.strings:
            tree=loadET(s)
            self.assertTrue(tree is not None)

    def testnsDump(self):
        ''' Test that nsdumb can mediate access to identifiers '''
        for s,tag,expected in zip(self.strings,self.ids,self.results):
            tree=loadET(s)
            helper=nsdumb(tree)
            self.assertEqual(helper.getText(tree,tag),expected)
247
248
if __name__ == '__main__':
    # run the unit tests in this file when it is executed as a script
    unittest.main()
Note: See TracBrowser for help on using the repository browser.