source: exist/trunk/python/ndgUtils/xmlHandler2.py @ 4696

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/xmlHandler2.py@5371
Revision 4696, 8.5 KB checked in by cbyrom, 11 years ago (diff)

Adjust Atom and MolesEntity data models to properly use namespaces when
dealing with xpath queries - rather than having these stripped out. This
avoids problems when namespaces are given arbitrary names and is a more
exact, hence robust, approach.
Create new test class to put the xmlhandler2 tests separately in.
Add delete function to granulite - to allow data granules, and their
connections to data entities, to be removed + add 'roll back' functionality
to cope with scenarios when granulite replace/delete fails to complete
properly. Add new methods to the existdbclient to allow the restore/delete/backup
functionality.
Extend test suite to exercise new functionality.

Line 
1try:
2    from xml.etree import cElementTree as ET
3    from xml.etree import ElementTree as pET
4except ImportError:
5    try:
6        import cElementTree as ET
7        import ElementTree as pET
8    except ImportError:
9        # For some reason when I install ElementTree with easyinstall it
10        # is called "elementree".
11        import elementtree.ElementTree as ET
12        pET=ET
13       
14from xml.parsers.expat import ExpatError
15import StringIO, re, logging
16XMLHDR='<?xml version="1.0"'
17from ETxmlView import subAI
18
class xmlHandler:
    ''' Load an XML document into an element tree and remember the
    namespace prefixes declared on its root element, so that the document
    can be queried (getText), re-serialised (str) and pretty printed
    (tohtml) with the prefixes the author used rather than the ns0, ns1 ...
    that ElementTree would otherwise invent.

    Attributes set by the constructor:
       xmls   - the cleaned, encoded document text
       tree   - the root element produced by ElementTree
       realns - dictionary mapping namespace uri -> prefix ('default' is
                used as the prefix of the default namespace)
       defns  - uri of the default namespace, or None
       root   - 1 if the input started with an xml declaration, else 0 '''

    # an xml declaration at the very start of the document
    _XML_DECL=re.compile(r'<\?xml\s')
    # the encoding pseudo-attribute of that declaration
    _ENCODING_DECL=re.compile(
        r'''<\?xml[^>]*?\sencoding\s*=\s*["']([A-Za-z][A-Za-z0-9._\-]*)["']''')
    # anything that may legally precede the root element
    _PROLOG_ITEM=re.compile(
        r'\s*(?:<\?.*?\?>|<!--.*?-->|<!DOCTYPE(?:[^>\[]|\[.*?\])*>)',re.S)
    # a start tag; group 1 is its run of attributes
    _START_TAG=re.compile(
        r'''\s*<[^\s<>/]+((?:\s+[^\s=<>/]+\s*=\s*(?:"[^"]*"|'[^']*'))*)\s*/?>''')
    # a single attribute: name, double quoted value, single quoted value
    _ATTR=re.compile(r'''\s+([^\s=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')
    # a tag in serialised output, and its quoted / unquoted pieces
    _TAG=re.compile(r'<[^<>]*>')
    _TAG_PART=re.compile(r'''"[^"]*"|'[^']*'|[^"']+''')
    # the artificial prefix used for the default namespace
    _DEFAULT_PREFIX=re.compile(r'(?<![\w.\-])default:')
    # a '/' that separates path steps, i.e. one not inside a {uri}
    _PATH_SEP=re.compile(r'/(?![^{}]*\})')

    def __init__(self,xml,string=0):
        ''' Open an xml file (or string) and
           - Correct nasty characters and/or orphans before passing to ET
           - load up an element-tree
           - collect a namespace map
        @param xml: name of the file to read, or the document text itself
        if 'string' is true
        @keyword string: true if 'xml' holds the document text
        '''
        logging.info("Reading in XML file - to create an elementtree object")
        self.cleanup=subAI()

        if string:
            #we better hope that xml is a unicode or string object otherwise odd things happen
            self.xmls=xml
        else:
            xmlfile=open(xml,'r')
            try:
                self.xmls=xmlfile.read()
            finally:
                # do not leave the handle for the garbage collector
                xmlfile.close()

        #this ought to be relatively efficient
        self.xmls=self.cleanup.sub(self.xmls)

        # Unfortunately we never know whether the incoming xml content is
        # unicode or a string ... so decode it if that is possible, then
        # encode it again so that ET always receives bytes which agree with
        # the encoding named in the xml declaration.
        encoding=self.__sniffencoding(self.xmls)
        try:
            self.xmls=unicode(self.xmls,encoding)
        except TypeError:
            # already a unicode object - nothing to decode
            pass
        except UnicodeError:
            # bytes do not match the declared encoding: keep what we can
            self.xmls=unicode(self.xmls,encoding,'replace')
        self.xmls=self.xmls.encode(encoding,'replace')

        self.realns={}
        self.__getns()

        self.tree=ET.XML(self.xmls)
        self.__updatens()
        logging.info("XML file read into elementtree object")

    def __sniffencoding(self,text):
        ''' Return the encoding named in the xml declaration of text, or
        utf-8 if there is no declaration or it names an unknown codec '''
        try:
            found=self._ENCODING_DECL.match(text)
        except TypeError:
            return 'utf-8'
        if found is None:
            return 'utf-8'
        encoding=found.group(1)
        try:
            'x'.encode(encoding)
        except LookupError:
            return 'utf-8'
        return encoding

    def __getns(self):
        ''' Get what the user intended out of elementtree namespaces: read
        the xmlns declarations of the root start tag straight from the
        document text into self.realns / self.defns, and note in self.root
        whether the document began with an xml declaration '''
        # better yet, use iterparse in the first place, but that seemed slow.
        self.defns=None
        text=self.xmls
        self.root=0
        if self._XML_DECL.match(text):
            self.root=1

        # skip the declaration, comments, processing instructions and
        # doctype so that we really are looking at the root element
        pos=0
        item=self._PROLOG_ITEM.match(text,pos)
        while item:
            pos=item.end()
            item=self._PROLOG_ITEM.match(text,pos)

        tag=self._START_TAG.match(text,pos)
        if tag is None:
            return
        for name,dquoted,squoted in self._ATTR.findall(tag.group(1)):
            uri=dquoted or squoted
            if name=='xmlns':
                self.realns[uri]='default'
                self.defns=uri
            elif name[0:6]=='xmlns:':
                self.realns[uri]=name[6:]

    def __html(self,text,attribute=0):
        ''' Escape text for inclusion in html; also escape double quotes
        if it is going inside an attribute value '''
        from xml.sax.saxutils import escape
        if attribute:
            return escape(text,{'"':'&quot;'})
        return escape(text)

    def __prefixed(self,name):
        ''' Turn an elementtree name, '{uri}local', into 'prefix:local' (or
        just 'local' for the default namespace) using self.realns. Names
        without a namespace, or in an unknown one, are returned unchanged '''
        if name[0:1]!='{':
            return name
        close=name.find('}')
        if close<0:
            return name
        uri=name[1:close]
        if uri not in self.realns:
            return name
        prefix=self.realns[uri]
        if prefix=='default':
            return name[close+1:]
        return '%s:%s'%(prefix,name[close+1:])

    def tohtml(self):
        '''Lightweight HTML pretty printing of elementTree elements
           and formatted using a css something like this:
            ===
            DIV.xmlElem {PADDING-LEFT: 20px;}
            .xmlAttrVal {COLOR:Red; }
            .xmlAttrTyp {COLOR:Green; }
            .xmlElemTag {COLOR:Blue; }
            .highlight {BACKGROUND-COLOR:Yellow; }
            ===
            Line number is not yet implemented.
            @return: the document as an html fragment (a div of class xmlDoc)
            '''
        lt,gt='<b>&lt;</b>','<b>&gt;</b>'
        def span(x,c): return '<span class="%s">%s</span>'%(c,x)
        def div(x,c): return '<div class="%s">%s</div>'%(c,x)
        def fix(x): return self.__html(x or '')
        def name(x): return self.__html(self.__prefixed(x))
        def et2html(elem):
            tag=span(name(elem.tag),'xmlElemTag')
            parts=[lt,tag]
            for att in elem.attrib:
                value=self.__html(elem.attrib[att],1)
                parts.append(' %s="%s"'%(span(name(att),'xmlAttrTyp'),span(value,'xmlAttrVal')))
            parts.append(gt)
            parts.append(fix(elem.text))
            for item in elem:
                parts.append(et2html(item))
                # the tail is the text which follows the child's end tag
                parts.append(fix(item.tail))
            parts.append('%s/%s%s'%(lt,tag,gt))
            return div(''.join(parts),'xmlElem')

        ss=et2html(self.tree)
        h=''
        if self.root:h='%s?xml %s="%s" %s="%s"?%s'%(
            lt,span('version','xmlAttrTyp'),'1.0',span('encoding','xmlAttrTyp'),'utf-8',gt)
        if self.realns=={}:
            r=h+ss
        else: r= h+self.__nsfixpretty(ss,span)
        return '<div class="xmlDoc">%s</div>'%r

    def __nsfixpretty(self,s,span):
        ''' Put the namespace declarations back into the pretty printed
        root element
        @param s: html produced by tohtml for the root element
        @param span: function wrapping text in a css classed span
        @return: s with the declarations added after the root tag name '''
        if self.realns=={}: return s
        nslist={}
        for ns in self.realns: nslist[self.realns[ns]]=ns
        r=''
        if 'default' in nslist:
            r=' %s="%s"'%(span('xmlns','xmlAttrTyp'),
                          span(self.__html(nslist['default'],1),'xmlAttrVal'))
        prefixes=[p for p in nslist if p!='default']
        prefixes.sort()
        for p in prefixes:
            r+=' %s="%s"'%(span(self.__html('xmlns:%s'%p),'xmlAttrTyp'),
                           span(self.__html(nslist[p],1),'xmlAttrVal'))
        # the first span to close is the one around the root tag name, and
        # just after it is where we do want the namespace list
        marker='</span>'
        at=s.find(marker)
        if at<0: return s
        at+=len(marker)
        return s[:at]+r+s[at:]

    def __updatens(self):
        ''' Update the element tree namespace map with our own map '''
        # *c*ElementTree doesn't have this update method (or at
        # least I can't find it), so you have to import ElementTree and call it on
        # that, then it all mysteriously works in cElementTree...
        # NB the map is shared by every document in the process.

        pET._namespace_map.update(self.realns)

    def __str__(self):
        ''' Serialise the document, using the original namespace prefixes
        and with an xml declaration if the input had one '''
        ### actually we should consider whether this was in the input or not
        h=''
        if self.root:h='<?xml version="1.0" encoding="utf-8"?>'
        # another document may have claimed our uris for its own prefixes
        # since we were loaded, so claim them back before serialising
        self.__updatens()
        ss=ET.tostring(self.tree)
        if not isinstance(ss,str):
            # some ElementTree versions hand back bytes rather than str
            ss=ss.decode('utf-8')
        if self.realns=={}: return h+ss
        return self.__fixns(h,ss)

    def __fixns(self,h,ss):
        ''' Fix the namespaces after ET has produced a string: drop the
        artificial 'default:' prefix and the declarations ET wrote, then
        declare every namespace once on the root element
        @param h: xml declaration (possibly empty) to put in front
        @param ss: document as serialised by ET
        @return: h followed by the corrected document '''
        written=['xmlns:%s="%s"'%(self.realns[ns],ns) for ns in self.realns]
        def unprefix(found):
            part=found.group(0)
            if part[0] in '"\'':
                # an attribute value - leave well alone
                return part
            return self._DEFAULT_PREFIX.sub('',part)
        def cleantag(found):
            tag=found.group(0)
            if tag[1:2] in ('!','?'):
                # comment or processing instruction
                return tag
            for decl in written:
                tag=tag.replace(' '+decl,'')
            return self._TAG_PART.sub(unprefix,tag)
        # only touch markup, so text which happens to contain 'default:'
        # survives
        ss=self._TAG.sub(cleantag,ss)

        #now fix the namespaces back in the first element
        #reorder dictionary (I know I didn't need to do it but
        #code readability is worth a millisecond or two.
        nslist={}
        for ns in self.realns: nslist[self.realns[ns]]=ns
        r=''
        if 'default' in nslist: r=' xmlns="%s"'%nslist['default']
        prefixes=[p for p in nslist if p!='default']
        prefixes.sort()
        for p in prefixes:
            r+=' xmlns:%s="%s"'%(p,nslist[p])
        end=ss.find('>')
        if end<0:
            return h+ss
        if end>0 and ss[end-1]=='/':
            # self closing root: the declarations go before the '/'
            end-=1
        return h+ss[:end].rstrip()+r+ss[end:]

    def _distributens(self, xpathExpression, alternativeNS=None):
        '''
        Update xpath expression to include namespaces.  NB, the
        default doc NS is used if available and if the keyword, 'alternativeNS'
        has not been set to specify a different namespace.  Steps which
        already carry a {namespace}, and the steps '', '.', '..' and '*',
        are left as they are.
        @param xpathExpression: xpath expression to update with namespace
        @type xpathExpression: str
        @keyword alternativeNS: alternative namespace to decorate the xpath
        expression with
        @type alternativeNS: str
        @return: xpath expression with name space decoration
        '''
        logging.debug("Adding namespace to xpath expression, '%s'",
                      xpathExpression)
        ns=(alternativeNS or self.defns)
        if not ns:
            logging.debug("- no namespace specified - returning")
            return xpathExpression

        steps=[]
        # namespace uris are full of '/', so only split outside the braces
        for t in self._PATH_SEP.split(xpathExpression):
            if t in ('','.','..','*') or t[0]=='{':
                steps.append(t)
            else:
                steps.append('{%s}%s'%(ns,t))
        new='/'.join(steps)
        logging.debug("- added namespace, '%s'",ns)
        logging.debug("- final xpath expression, '%s'",new)
        return new

    def getText(self,xpathExpression,multiple=0):
        ''' Get a text object sensibly, given ET API for xml doesn't handle
        namespaces gracefully
        @param xpathExpression: path, relative to the root element, of the
        element(s) wanted; it is namespaced with _distributens
        @keyword multiple: if true return the text of every match
        @return: if multiple, a list with the text of each matching element,
        otherwise the text of the first match.  Missing elements and
        elements without text give '' '''
        path=self._distributens(xpathExpression)
        if multiple:
            return [(elem.text or '') for elem in self.tree.findall(path)]
        found=self.tree.find(path)
        if found is None:
            return ''
        return found.text or ''
227               
Note: See TracBrowser for help on using the repository browser.