source: TI07-MOLES/trunk/PythonCode/wsgi/ETxmlView.py @ 2433

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI07-MOLES/trunk/PythonCode/wsgi/ETxmlView.py@2433
Revision 2433, 8.0 KB checked in by lawrence, 12 years ago (diff)

Fixes for ticket:722, changing to internal unicode (probably not complete),
with decode on the edge coming out ... fixes 722 but may have some
other problems now ...

Line 
1# Copyright Bryan Lawrence, Rutherford Appleton Laboratory, CCLRC, 2006
2#
3# This code is made available under the GPL, if you don't know what
4# that means, you have no rights to copy it or use it!
5#
6# Code to support xml and elementTree viewing as text and html
7try: #python 2.5
8    from xml.etree import ElementTree as ET
9except ImportError:
10    try:
11        # if you've installed it yourself it comes this way
12        import ElementTree as ET
13    except ImportError:
14        # if you've egged it this is the way it comes
15        from elementtree import ElementTree as ET
16import re
17
class subAI:
    ''' Alan Iwi's helper which escapes orphan < and > characters (angle
    brackets which do not pair up into a tag) as &lt; and &gt; '''
    def __init__(self):
        # a '<' which meets another '<', or the end of the string, before
        # it meets any '>'
        self.r1=re.compile('<([^>]*(<|$))')
        # a '>' which follows a previous '>', or the start of the string,
        # with no '<' in between
        self.r2=re.compile('((^|>)[^<]*)>')

    def sub(self,s):
        ''' Return s with the orphan brackets escaped. Both substitutions
        are repeated until a complete pass leaves the string unchanged,
        because each pass only handles non-overlapping matches. '''
        while True:
            before=s
            s=self.r2.sub(r'\1&gt;',self.r1.sub(r'&lt;\1',before))
            if s==before:
                return s
30       
class nsdumb:
    ''' Provides an xpath-like interface to element tree nodes which
    is namespace agnostic: the namespace found on the root element is
    put in front of every tag of the path expressions given to the
    finder methods, so callers can use bare tag names. '''
    def __init__(self,root=None,encoding='utf-8'):
        ''' Provide a root element with namespace definitions when
        instantiating. The namespace is taken, in order of preference,
        from an xmlns attribute, from the first entry of an
        xsi:schemaLocation attribute, or from the namespace part of the
        root tag; it is the empty string when none of these is present
        or when no root is given. encoding is only recorded on the
        instance. '''
        self.encoding=encoding
        # always defined, so __str__ and strip work whatever happens below
        self.xmlns=''
        if root is None:
            return
        declaring=['xmlns','{http://www.w3.org/2001/XMLSchema-instance}schemaLocation']
        for name in declaring:
            if name in root.keys():
                self.xmlns='{%s}'%root.attrib[name].split(' ')[0]
                return
        # no declaring attribute, fall back on the {namespace} of the tag;
        # split only once so an odd '}' in the local part cannot raise
        if root.tag.startswith('{'):
            self.xmlns='{%s}'%root.tag[1:].split('}',1)[0]

    def __str__(self):
        return 'Element Tree namespace helper with namespace: [%s]'%self.xmlns

    def __distributens(self,xpathExpression):
        ''' Put the namespace in front of every / separated step.
        Actually we only support tag finding in this '''
        return '/'.join([self.xmlns+step for step in xpathExpression.split('/')])

    def getText(self,elem,xpathExpression,multiple=0):
        ''' Get a text object sensibly: missing elements and missing text
        come back as '' rather than None. With multiple set a list with
        one string per matching element is returned (empty when nothing
        matches, [''] when elem itself is None), otherwise the text of
        the first match. '''
        if elem is None:
            if multiple:
                return ['',]
            return ''
        path=self.__distributens(xpathExpression)
        if multiple:
            return [node.text or '' for node in elem.findall(path)]
        node=elem.find(path)
        # compare with None: an element without children is false
        if node is None:
            return ''
        return node.text or ''

    def find(self,elem,xpathExpression):
        ''' Return relevant subelement (None when there is no match).
        NOTE(review): a None elem gives '' and not None; kept as it
        was because callers may rely on it. '''
        if elem is None: return ''
        return elem.find(self.__distributens(xpathExpression))

    def findall(self,elem,xpathExpression):
        ''' Return all relevant subelements, [] when elem is None '''
        if elem is None: return []
        return elem.findall(self.__distributens(xpathExpression))

    def strip(self,tag):
        ''' Given a tag, strip the default namespace '''
        # str.lstrip would treat the namespace as a set of characters and
        # also eat the start of the local name, so remove it as a prefix
        if self.xmlns and tag.startswith(self.xmlns):
            return tag[len(self.xmlns):]
        return tag
102
def et2text(elem,indent='',html=0,space='   ',helper=None):
    '''Lightweight pretty printing of elementTree elements.
    Each element starts a new line, children are indented by a further
    space relative to indent. helper is the object used to strip the
    namespace from the tags, by default an nsdumb built from elem.
    html is accepted but not used. Only the tail of the last child is
    written, just before the closing tag.'''
    def estrip(text):
        ''' Just want to get rid of unwanted whitespace '''
        if text is None:
            return ''
        return text.strip()

    ns=helper
    if ns is None: ns=nsdumb(elem)
    tag=ns.strip(elem.tag)
    attributes=''.join([' %s="%s"'%(key,elem.attrib[key]) for key in elem.attrib])
    pieces=['%s<%s%s>%s'%(indent,tag,attributes,estrip(elem.text))]
    last=None
    for last in elem:
        # the recursion falls back on the default space, as it always did
        pieces.append('\n'+et2text(last,indent=indent+space,helper=ns))
    if last is None:
        # no children, close on the same line
        pieces.append('</%s>'%tag)
    else:
        pieces.append('\n%s%s</%s>'%(indent,estrip(last.tail),tag))
    return ''.join(pieces)
125
126
def et2html(elem,matchList=None,number=0,helper=None):
    #this method is NDG code ... copyright CCLRC ...
    '''Lightweight HTML pretty printing of elementTree elements + highlight
    any words which occur in the element text (and tails) which occur in matchList,
    and formatted using a css something like this:
    ===
    DIV.xmlElem {PADDING-LEFT: 20px;}
    .xmlAttrVal {COLOR:Red; }
    .xmlAttrTyp {COLOR:Green; }
    .xmlElemTag {COLOR:Blue; }
    .highlight {BACKGROUND-COLOR:Yellow; }
    ===
    Line number is not yet implemented (number is accepted and ignored).
    helper is the object used to strip the namespace from the tags, by
    default an nsdumb built from elem.
    The words of matchList are matched literally and case sensitively.
    Element text, tails, tags and attributes are html escaped, since
    the parser has already turned entities such as &lt; back into the
    raw characters.
    NOTE(review): only the tail of the last child is written, as before.
    '''
    def escape(text):
        ''' Make parsed text safe to put in html, also inside "..." '''
        for raw,safe in (('&','&amp;'),('<','&lt;'),('>','&gt;'),('"','&quot;')):
            text=text.replace(raw,safe)
        return text
    def span(x,c): return '<span class="%s">%s</span>'%(c,x)
    def div(x,c): return '<div class="%s">%s</div>'%(c,x)

    # One alternation of the escaped words, longest first so that a word
    # wins over its own prefixes. Doing a single pass means a later word
    # can never match inside the markup written for an earlier one.
    words=[w for w in (matchList or []) if w]
    splitter=None
    if words:
        words.sort(key=len,reverse=True)
        splitter=re.compile('(%s)'%'|'.join([re.escape(w) for w in words]))

    def match(x):
        ''' Escape x and wrap the matched words in a highlight span '''
        if x is None: return ''
        if splitter is None: return escape(x)
        out=[]
        hit=False
        # with one capturing group split alternates text, word, text, ...
        for part in splitter.split(x):
            if hit:
                out.append(span(escape(part),'highlight'))
            else:
                out.append(escape(part))
            hit=not hit
        return ''.join(out)

    lt,gt='<b>&lt;</b>','<b>&gt;</b>'
    ns=helper
    if ns is None: ns=nsdumb(elem)
    tag=span(escape(ns.strip(elem.tag)),'xmlElemTag')
    strAttrib=''
    for att in elem.attrib:
        strAttrib+=' %s="%s"'%(span(escape(att),'xmlAttrTyp'),
                               span(escape(elem.attrib[att]),'xmlAttrVal'))
    result='%s%s%s%s%s'%(lt,tag,strAttrib,gt,match(elem.text))
    last=None
    for last in elem:
        result+=et2html(last,matchList,helper=ns)
    if last is not None:
        result+=match(last.tail)
    result+='%s/%s%s'%(lt,tag,gt)
    return div(result,'xmlElem')
162   
def loadET(inputString):
    ''' This method returns an elementtree object after some cleaning
    of the string, essentially a hack to make sure that xml doesn't contain any
    naughty & characters alone (typically from URL copies), or unescaped orphan
    < or > signs ... and that the unicode has been processed to something that
    might work.
    Returns None when inputString is None, or when the cleaned string
    could not be encoded with any of the candidate encodings. A parse
    failure of the cleaned string is re-raised after the offending text
    has been printed. '''

    if inputString is None: return None
    # escape the bare ampersands, but leave named (&amp;) and numeric
    # (&#38; or &#x26;) references as they are
    inputString=re.sub(r'&(?!#?\w+;)', '&amp;', inputString)

    # first just try and do it so we don't waste time if we don't need to ...
    try:
        return ET.fromstring(inputString)
    except Exception:
        # any failure at all sends us down the cleaning route below
        pass

    #ok, let's deal with orphan > and < signs then ...
    cleaned=subAI().sub(inputString)

    #now let's sort out an encoding
    encodings=['utf-8','latin-1','iso-8859-1','ascii',]
    elem=None
    for option in encodings:
        try:
            # always start from the cleaned text, never from the result
            # of a previous attempt
            encoded=cleaned.encode(option,'replace')
            try:
                elem=ET.fromstring(encoded)
            except Exception:
                # show the text line by line before passing the error on;
                # built on a copy so the data being parsed is not altered
                print('<p>%s</p>'%cleaned.replace('\n','<br/>LINE: '))
                raise
        except UnicodeError:
            continue
        break
    return elem
202
def xml2text(xmlString):
    ''' Parse xmlString with loadET and return what et2text makes of
    the resulting tree '''
    return et2text(loadET(xmlString))
206
def xml2HTML(xmlString,**kw):
    ''' Parse xmlString with loadET and return the et2html rendering of
    the resulting tree; keyword arguments are handed on to et2html '''
    tree=loadET(xmlString)
    # the renderer is spelt et2html, et2HTML does not exist in this module
    return et2html(tree,**kw)
210
211import unittest
212from os.path import splitext
213
class TestCase(unittest.TestCase):
    """ Runs loadET and nsdumb over the example metadata files, which are
    expected in an examples directory below the working directory.
    """
    def setUp(self):
        ''' Load example files for testing, and get pointers to their dataset identifiers '''
        inputFiles = ['examples/neodc.eg1.dif','examples/ukho.eg1.mdip',
                      'examples/bodc.eg2.edmed.dif']
        # tag holding the dataset identifier, by file extension
        indexes={'.dif':'Entry_ID','.mdip':'DatasetIdentifier'}
        self.results=['NOCSDAT192','RSDRA2006000377384','grid.bodc.nerc.ac.uk__DIF__EDMED1048006']
        self.ids=[indexes[splitext(f)[1]] for f in inputFiles]
        self.strings=[]
        for name in inputFiles:
            # open rather than the python 2 only file builtin, and make
            # sure every handle is closed again
            handle=open(name,'r')
            try:
                self.strings.append(handle.read())
            finally:
                handle.close()

    def testEntries(self):
        ''' Testing the file objects can be loaded using loadET'''
        # smoke test only: passes as long as loadET does not raise
        for s in self.strings:
            tree=loadET(s)

    def testnsDump(self):
        ''' Test that nsdumb can mediate access to identifiers '''
        for text,tag,expected in zip(self.strings,self.ids,self.results):
            tree=loadET(text)
            helper=nsdumb(tree)
            self.assertEqual(helper.getText(tree,tag),expected)
244       
245
# Run the unit tests of this module when the file is executed as a script
if __name__=="__main__":
    unittest.main()
Note: See TracBrowser for help on using the repository browser.