source: ndgCommon/trunk/ndg/common/src/lib/ETxmlView.py @ 4793

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/ETxmlView.py@4991
Revision 4793, 7.9 KB checked in by cbyrom, 11 years ago (diff)

Checking in initial codebase for ndgUtils restructure.

Line 
# Copyright Bryan Lawrence, Rutherford Appleton Laboratory, CCLRC, 2006
#
# This code is made available under the GPL, if you don't know what
# that means, you have no rights to copy it or use it!
#
# Code to support xml and elementTree viewing as text and html
import re
from xml.etree import ElementTree as ET
from xml.sax.saxutils import escape
9
class subAI:
    ''' Alan Iwi's substitute-and-replace code for orphan <, > and &.

    sub() escapes characters that would otherwise break XML parsing while
    leaving anything that already looks like a tag or like one of the
    entities &amp; &lt; &gt; alone.  undo() turns those three entities
    back into plain characters.
    '''
    def __init__(self):
        # A '<' that runs into another '<' (or the end of the string)
        # before meeting any '>' cannot be opening a tag.
        self.r1=re.compile(r'<([^>]*(<|$))')
        # A '>' reached from the start of the string (or from a previous
        # '>') without an intervening '<' cannot be closing a tag.
        self.r2=re.compile(r'((^|>)[^<]*)>')
        # An '&' that does not introduce &amp; &lt; or &gt;
        self.r3=re.compile(r'&(?!(amp|lt|gt);)')

    def sub(self,s):
        ''' Return s with orphan &, < and > replaced by entities. '''
        s=self.r3.sub(r'&amp;',s)
        # NOTE(review): this also rewrites any literal ';amp' in the text.
        # The original intent is not evident from the code, so it is kept
        # unchanged - confirm before removing.
        s=s.replace(';amp',';')
        # Each pass can expose a new orphan (escaping one '<' may leave the
        # next one unmatched), so repeat until the string stops changing.
        old=''
        while s != old:
            old=s
            s=self.r1.sub(r'&lt;\1',s)
            s=self.r2.sub(r'\1&gt;',s)
        return s

    def undo(self,s):
        ''' Return s with &gt; &lt; and &amp; turned back into characters. '''
        # &amp; has to be decoded last: decoding it first would turn an
        # escaped entity such as '&amp;lt;' into '&lt;' and then into '<'.
        s=s.replace('&gt;','>')
        s=s.replace('&lt;','<')
        s=s.replace('&amp;','&')
        return s
30       
class nsdumb:
    ''' Provides an xpath-like interface to element tree nodes which
    is namespace agnostic: callers write paths using bare tag names and
    the default namespace held in self.xmlns is added for them. '''

    def __init__(self,root=None,encoding='utf-8'):
        ''' Provide a root element with namespace definitions when
        instantiating.

        The namespace is taken from, in order of preference: an 'xmlns'
        attribute on root, the first token of its xsi:schemaLocation
        attribute, or the namespace part of root.tag.  Without a root (or
        without any of these) the namespace is the empty string. '''
        self.encoding=encoding
        self.cleanup=subAI()
        self.xmlns=''
        if root is None:
            return
        candidates=('xmlns',
                    '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation')
        for key in candidates:
            if key in root.attrib:
                self.xmlns='{%s}'%root.attrib[key].split(' ')[0]
                return
        # No usable attribute: fall back on the '{uri}local' form of the tag
        tag=root.tag
        if tag.startswith('{'):
            self.xmlns='{%s}'%tag[1:].split('}',1)[0]

    def __str__(self):
        return 'Element Tree namespace helper with namespace: [%s]'%self.xmlns

    def __distributens(self,xpathExpression):
        ''' Prefix every tag name in a '/' separated path with the
        namespace.  Actually we only support tag finding in this. '''
        steps=[]
        for step in xpathExpression.split('/'):
            # '', '.' and '..' come from '//', './' and '../'; they are
            # axes, not tag names, and must stay unqualified.
            if step in ('','.','..'):
                steps.append(step)
            else:
                steps.append('%s%s'%(self.xmlns,step))
        return '/'.join(steps)

    def getText(self,elem,xpathExpression,multiple=0):
        ''' Get a text object sensibly.

        Returns the cleaned-up text of the first match, or a list with the
        text of every match when multiple is true.  Missing elements and
        elements without text give '' (a one-item list [''] when elem
        itself is None and multiple is true). '''
        if elem is None:
            if multiple:
                return ['',]
            return ''
        path=self.__distributens(xpathExpression)
        if multiple:
            found=elem.findall(path)
        else:
            found=[elem.find(path),]
        texts=[]
        for node in found:
            if node is None:
                texts.append('')
            else:
                texts.append(self.cleanup.sub(node.text or ''))
        if multiple:
            return texts
        return texts[0]

    def find(self,elem,xpathExpression):
        ''' Return relevant subelement (None if there is no match).
        A None elem gives '' rather than None; kept for existing callers. '''
        if elem is None: return ''
        return elem.find(self.__distributens(xpathExpression))

    def findall(self,elem,xpathExpression):
        ''' Return all relevant subelements ([] when elem is None) '''
        if elem is None: return []
        return elem.findall(self.__distributens(xpathExpression))

    def strip(self,tag):
        ''' Given a tag, strip the default namespace '''
        return tag.replace(self.xmlns, '', 1)
97
def et2text(elem,indent='',html=0,space='   ',helper=None):
    '''Lightweight pretty printing of elementTree elements.

    elem   - the element to print, together with its descendants
    indent - text put in front of the line(s) belonging to elem
    html   - accepted for compatibility; it does not alter the output
    space  - added to indent for each level of nesting
    helper - object whose strip(tag) removes the namespace from a tag;
             an nsdumb built from elem is used when none is given

    Returns the text, one element per line; leaf elements are opened and
    closed on the same line.  Attribute names and values are written as
    they are (no namespace stripping, no escaping).
    '''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()

    ns=helper
    if ns is None: ns=nsdumb(elem)
    tag=ns.strip(elem.tag)
    strAttrib=''.join([' %s="%s"'%(att,elem.attrib[att]) for att in elem.attrib])
    lines=['%s<%s%s>%s'%(indent,tag,strAttrib,estrip(elem.text))]
    children=list(elem)
    if not children:
        lines[0]+='</%s>'%tag
        return lines[0]
    last=children[-1]
    for item in children:
        # space (and html) must travel down with the recursion, otherwise
        # a custom indent unit is lost below the first level.
        line=et2text(item,indent=indent+space,html=html,space=space,helper=ns)
        if item is not last:
            # Text following a child would otherwise be lost; the last
            # child's tail goes on the closing-tag line instead.
            line+=estrip(item.tail)
        lines.append(line)
    lines.append('%s%s</%s>'%(indent,estrip(last.tail),tag))
    return '\n'.join(lines)
120
121
def et2html(elem,matchList=(),number=0,helper=None):
    #this method is NDG code ... copyright CCLRC ...
    '''Lightweight HTML pretty printing of elementTree elements + highlight
    any words which occur in the element text (and tails) which occur in matchList,
    and formatted using a css something like this:
    ===
    DIV.xmlElem {PADDING-LEFT: 20px;}
    .xmlAttrVal {COLOR:Red; }
    .xmlAttrTyp {COLOR:Green; }
    .xmlElemTag {COLOR:Blue; }
    .highlight {BACKGROUND-COLOR:Yellow; }
    ===
    elem      - the element to render, together with its descendants
    matchList - words to highlight; they are matched literally
    number    - line numbering is not yet implemented; the value is unused
    helper    - object whose strip(tag) removes the namespace from a tag;
                an nsdumb built from elem is used when none is given

    Returns one <div class="xmlElem"> per element, nested like the tree.
    Text, tails, tag names and attributes are HTML-escaped so that the
    document content cannot be mistaken for markup.
    '''
    def span(x,c): return '<span class="%s">%s</span>'%(c,x)
    def div(x,c): return '<div class="%s">%s</div>'%(c,x)

    # One alternation over all the words, longest first so that a word wins
    # over its own prefix.  Doing the highlighting in a single pass keeps
    # later words from matching inside the markup added for earlier ones.
    words=sorted([w for w in matchList if w],key=len,reverse=True)
    pattern=None
    if words:
        pattern=re.compile('(%s)'%'|'.join([re.escape(w) for w in words]))

    def match(x):
        ''' Escape x for HTML, wrapping each matched word in a highlight '''
        if x is None: return ''
        if pattern is None: return escape(x)
        out=[]
        # split() with one capturing group alternates plain text (even
        # positions) and matched words (odd positions)
        for i,piece in enumerate(pattern.split(x)):
            piece=escape(piece)
            if i%2: piece=span(piece,'highlight')
            out.append(piece)
        return ''.join(out)

    lt,gt='<b>&lt;</b>','<b>&gt;</b>'
    ns=helper
    if ns is None: ns=nsdumb(elem)

    def render(node):
        ''' Build the div for one element and, recursively, its children '''
        tag=span(escape(ns.strip(node.tag)),'xmlElemTag')
        result=lt+tag
        for att in node.attrib:
            result+=' %s="%s"'%(span(escape(att),'xmlAttrTyp'),
                                span(escape(node.attrib[att]),'xmlAttrVal'))
        result+=gt+match(node.text)
        for item in node:
            # every child's tail is emitted, not only the last one's
            result+=render(item)+match(item.tail)
        result+='%s/%s%s'%(lt,tag,gt)
        return div(result,'xmlElem')

    return render(elem)
157   
def loadET(inputString):
    ''' This method returns an elementtree object after some cleaning
    of the string, essentially a hack to make sure that xml doesn't contain any
    naughty & characters alone (typically from URL copies), or unescaped orphan
    < or > signs ... and that the unicode has been processed to something that
    might work.

    Returns None when inputString is None, or when the string could not be
    encoded with any of the encodings tried.  If the cleaned-up string still
    does not parse, the string is printed (as an HTML paragraph) and the
    parser's exception is re-raised. '''

    if inputString is None: return None
    # Escape every '&' that does not start an entity or a character
    # reference.  The optional '#' keeps numeric references such as &#65;
    # from being double-escaped.
    inputString=re.sub(r'&(?!#?\w+;)', '&amp;', inputString)

    # first just try and do it so we don't waste time if we don't need to ...
    try:
        return ET.fromstring(inputString)
    except Exception:
        # deliberate best effort: fall through to the clean-up path
        pass

    #ok, let's deal with orphan > and < signs then ...
    s=subAI().sub(inputString)

    #now let's sort out an encoding
    for option in ('utf-8','latin-1','iso-8859-1','ascii'):
        try:
            encoded=s.encode(option,'replace')
            try:
                return ET.fromstring(encoded)
            except Exception:
                # Show what we tried to parse, one marker per line.  Use the
                # byte string where that is the native str type (Python 2)
                # and the text form otherwise.
                if isinstance(encoded,str): shown=encoded
                else: shown=s
                print('<p>%s</p>'%re.sub('\n','<br/>LINE: ',shown))
                raise
        except UnicodeError:
            # this encoding is no good, try the next one
            continue
    return None
197
def xml2text(xmlString):
    ''' Parse xmlString with loadET and pretty print it with et2text.
    Returns '' when loadET produces no tree (it returns None for a None
    input, for instance) instead of failing inside et2text. '''
    tree=loadET(xmlString)
    if tree is None:
        return ''
    return et2text(tree)
201
def xml2HTML(xmlString,**kw):
    ''' Parse xmlString with loadET and render it with et2html; keyword
    arguments are handed on to et2html.  Returns '' when loadET produces
    no tree (it returns None for a None input, for instance). '''
    tree=loadET(xmlString)
    if tree is None:
        return ''
    # the renderer is called et2html; 'et2HTML' does not exist in this module
    return et2html(tree,**kw)
205
206import unittest
207from os.path import splitext
208
class TestCase(unittest.TestCase):
    """
    Checks loadET and nsdumb against the example metadata files, which
    are read from an 'examples' directory relative to the working directory.
    """
    def setUp(self):
        ''' Load example files for testing, and get pointers to their dataset identifiers '''
        inputFiles = ['examples/neodc.eg1.dif','examples/ukho.eg1.mdip',
                      'examples/bodc.eg2.edmed.dif']
        # tag holding the dataset identifier, keyed by file extension
        indexes={'.dif':'Entry_ID','.mdip':'DatasetIdentifier'}
        self.results=['NOCSDAT192','RSDRA2006000377384','grid.bodc.nerc.ac.uk__DIF__EDMED1048006']
        self.ids=[indexes[splitext(f)[1]] for f in inputFiles]
        self.strings=[]
        for name in inputFiles:
            handle=open(name,'r')
            # close every file, even if reading it fails
            try:
                self.strings.append(handle.read())
            finally:
                handle.close()

    def testEntries(self):
        ''' Testing the file objects can be loaded using loadET'''
        for s in self.strings:
            tree=loadET(s)
            self.assertTrue(tree is not None)

    def testnsDump(self):
        ''' Test that nsdumb can mediate access to identifiers '''
        for text,tag,expected in zip(self.strings,self.ids,self.results):
            tree=loadET(text)
            helper=nsdumb(tree)
            self.assertEqual(helper.getText(tree,tag),expected)
239
240
# Run the unit tests above when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.