try:
    from xml.etree import cElementTree as ET
    from xml.etree import ElementTree as pET
except ImportError:
    try:
        # Python 3.9 removed xml.etree.cElementTree; the plain ElementTree
        # module is C-accelerated there, so it serves both roles.
        from xml.etree import ElementTree as ET
        pET = ET
    except ImportError:
        try:
            import cElementTree as ET
            import ElementTree as pET
        except ImportError:
            # For some reason when I install ElementTree with easyinstall it
            # is called "elementtree".
            import elementtree.ElementTree as ET
            pET = ET

import re
from xml.parsers.expat import ExpatError

try:
    import StringIO
except ImportError:
    # Python 3 has no StringIO module; io offers a StringIO.StringIO class.
    import io as StringIO

XMLHDR = '<?xml version="1.0"'


class xmlHandler:
    ''' Load an XML document into an element tree while remembering the
    namespace prefixes declared on its root element, so that the document
    can be written back out (as XML or as lightweight HTML) with those
    prefixes instead of the ones ElementTree would invent.

    Attributes set by the constructor:
        xmls   -- the document text (repaired if the raw text did not parse)
        tree   -- the root Element
        realns -- {namespace uri: prefix}; the default namespace is stored
                  under the pseudo prefix 'default'
        defns  -- uri of the default namespace, or None if there is none
        root   -- 1 if the input started with an XML declaration, else 0
    '''

    # Comments and processing instructions are alternatives of the pattern so
    # that markup inside them is consumed and never taken for the root tag.
    _ROOT_TAG = re.compile(r'<!--.*?-->|<\?.*?\?>|<([^\s!?/<>][^<>]*)>', re.S)
    # name = "value" or name = 'value', whitespace around '=' allowed.
    _ATTRIBUTE = re.compile(r'''([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')
    # One tag in ElementTree output (it escapes '<' and '>' everywhere else).
    _MARKUP = re.compile(r'<[^<>]+>')
    # 'default:' used as a tag or attribute prefix; quoted attribute values
    # are matched first (group 1) so their content is left untouched.
    _DEFAULT_PREFIX = re.compile(r'("[^"]*")|(?<=[<\s/])default:')
    # A '/' that is not inside a {uri} part of a path step.
    _PATH_SEPARATOR = re.compile(r'/(?![^{}]*\})')

    def __init__(self, xml, string=0):
        ''' Open an xml file (or string) and
             - if necessary correct nasty characters and/or orphans before
               passing to ET
             - load up an element-tree
             - collect a namespace map

        xml    -- a file name, or the document itself when string is true
        string -- true if xml holds the document text rather than a path

        Raises whatever ET.XML raises if the text is still not well formed
        after the repair attempt. '''

        self.r1 = None  # we only compile the regexes if we need them

        if string:
            raw = xml
        else:
            # Read bytes and decode explicitly; the handle is always closed.
            with open(xml, 'rb') as handle:
                raw = handle.read()

        self.xmls = self._native_text(raw)

        self.realns = {}
        self.__getns()

        try:
            self.tree = ET.XML(self.xmls)
        except (SyntaxError, ExpatError):
            # cElementTree / modern ElementTree raise a SyntaxError subclass,
            # the old pure-python ElementTree raises ExpatError.
            self.xmls = self.__fixXML(self.xmls)
            self.tree = ET.XML(self.xmls)

        self.__updatens()

    @staticmethod
    def _native_text(data):
        ''' Return data as this interpreter's native str type: a utf-8
        encoded byte string on Python 2, text on Python 3.  Bytes that are
        not valid utf-8 are replaced rather than raising. '''
        if isinstance(data, bytes):
            data = data.decode('utf-8', 'replace')
        if str is bytes:  # Python 2: ET wants an encoded byte string
            data = data.encode('utf-8')
        return data

    @staticmethod
    def _escape(text):
        ''' Escape the three characters that are markup in XML and HTML. '''
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;'))

    def __getns(self):
        ''' Get what the user intended out of elementtree namespaces: read
        the xmlns declarations from the root start tag into self.realns and
        self.defns, and note in self.root whether an XML declaration led the
        document. '''
        self.root = int(self.xmls.startswith(XMLHDR))
        self.defns = None
        for found in self._ROOT_TAG.finditer(self.xmls):
            start_tag = found.group(1)
            if start_tag is None:
                continue  # a comment or processing instruction, skip it
            for name, dquoted, squoted in self._ATTRIBUTE.findall(start_tag):
                uri = dquoted or squoted
                if not uri:
                    continue  # xmlns="" declares nothing we could register
                if name == 'xmlns':
                    self.realns[uri] = 'default'
                    self.defns = uri
                elif name.startswith('xmlns:'):
                    self.realns[uri] = name[6:]
            break  # only the first real start tag is the root

    def _declarations(self):
        ''' Return [(attribute name, uri)] for every namespace in realns,
        default namespace first, then prefixes in alphabetical order. '''
        pairs = sorted((prefix, uri) for uri, prefix in self.realns.items())
        first = [('xmlns', uri) for prefix, uri in pairs
                 if prefix == 'default']
        rest = [('xmlns:%s' % prefix, uri) for prefix, uri in pairs
                if prefix != 'default']
        return first + rest

    def _qname(self, name):
        ''' Turn ElementTree's {uri}local into prefix:local (or plain local
        for the default namespace).  Names in namespaces that are not in
        realns are returned unchanged. '''
        if name[:1] != '{':
            return name
        uri, _, local = name[1:].partition('}')
        prefix = self.realns.get(uri)
        if prefix is None:
            return name
        if prefix == 'default':
            return local
        return '%s:%s' % (prefix, local)

    def tohtml(self):
        '''Lightweight HTML pretty printing of elementTree elements
        and formatted using a css something like this:
        ===
        DIV.xmlElem {PADDING-LEFT: 20px;}
        .xmlAttrVal {COLOR:Red; }
        .xmlAttrTyp {COLOR:Green; }
        .xmlElemTag {COLOR:Blue; }
        . highlight {BACKGROUND-COLOR:Yellow; }
        ===
        Line number is not yet implemented.

        Returns the markup as one string wrapped in <div class="xmlDoc">.
        Names, attribute values and text are escaped as they are inserted.
        '''
        lt, gt = '<b>&lt;</b>', '<b>&gt;</b>'
        escape = self._escape
        qname = self._qname

        def span(x, c):
            return '<span class="%s">%s</span>' % (c, x)

        def div(x, c):
            return '<div class="%s">%s</div>' % (c, x)

        def et2html(elem):
            tag = span(escape(qname(elem.tag)), 'xmlElemTag')
            parts = [lt, tag]
            for att, value in elem.attrib.items():
                parts.append(' %s="%s"' % (
                    span(escape(qname(att)), 'xmlAttrTyp'),
                    span(escape(value), 'xmlAttrVal')))
            parts.append(gt)
            parts.append(escape(elem.text or ''))
            for item in elem:
                parts.append(et2html(item))
                # A child's tail is the text that follows its end tag.
                parts.append(escape(item.tail or ''))
            parts.extend([lt, '/', tag, gt])
            return div(''.join(parts), 'xmlElem')

        ss = et2html(self.tree)
        h = ''
        if self.root:
            h = '%s%s %s="%s" %s="%s"?%s' % (
                lt, '?xml', span('version', 'xmlAttrTyp'), '1.0',
                span('encoding', 'xmlAttrTyp'), 'utf-8', gt)
        if self.realns:
            ss = self.__nsfixpretty(ss, span)
        return '<div class="xmlDoc">%s</div>' % (h + ss)

    def __nsfixpretty(self, s, span):
        ''' Insert the namespace declarations from realns into the HTML s,
        directly after the span holding the root tag name.  span is the
        formatter used by tohtml. '''
        if not self.realns:
            return s
        marker = '</span>'
        at = s.find(marker)  # the first span closed is the root tag name
        if at < 0:
            return s
        at += len(marker)
        r = ''.join(' %s="%s"' % (span(self._escape(name), 'xmlAttrTyp'),
                                  span(self._escape(uri), 'xmlAttrVal'))
                    for name, uri in self._declarations())
        return s[:at] + r + s[at:]

    def __updatens(self):
        ''' Update the element tree namespace map with our own map '''
        # *c*ElementTree doesn't have this update method (or at
        # least I can't find it), so you have to import ElementTree and call
        # it on that, then it all mysteriously works in cElementTree...
        # NOTE: the map is module-global, so every handler shares it.
        pET._namespace_map.update(self.realns)

    def __str__(self):
        ''' Return the document as XML text using the original prefixes.
        The XML declaration is written only if the input had one. '''
        ### actually we should consider whether this was in the input or not
        h = ''
        if self.root:
            h = '<?xml version="1.0" encoding="utf-8"?>'
        ss = ET.tostring(self.tree)
        if not isinstance(ss, str):
            # Python 3 hands back us-ascii bytes (character references are
            # used for everything else), so this decode cannot fail.
            ss = ss.decode('ascii')
        if not self.realns:
            return h + ss
        return self.__fixns(h, ss)

    def __fixns(self, h, ss):
        ''' Fix the namespaces after ET has produced a string: drop the
        declarations ET wrote for our prefixes, drop the artificial
        'default:' prefix, and put the original declarations back on the
        root start tag.  h is the (possibly empty) XML declaration that is
        prefixed to the result. '''
        if not self.realns:
            return h + ss
        prefixes = sorted(set(self.realns.values()))
        declared = re.compile(r'\s+xmlns:(?:%s)="[^"]*"'
                              % '|'.join(re.escape(p) for p in prefixes))
        has_default = 'default' in prefixes
        default_prefix = self._DEFAULT_PREFIX

        def clean(found):
            # Only markup is edited, text content is never touched.
            tag = declared.sub('', found.group(0))
            if has_default:
                tag = default_prefix.sub(lambda m: m.group(1) or '', tag)
            return tag

        ss = self._MARKUP.sub(clean, ss)

        # now fix the namespaces back in the first element
        close = ss.find('>')
        if close < 0:
            return h + ss
        cut = close
        if ss[close - 1:close] == '/':  # self-closing root: stay inside '/>'
            cut = close - 1
        head, tail = ss[:cut].rstrip(), ss[cut:]
        if tail.startswith('/'):
            tail = ' ' + tail
        r = ''.join(' %s="%s"' % (name,
                                  self._escape(uri).replace('"', '&quot;'))
                    for name, uri in self._declarations())
        return h + head + r + tail

    def __fixXML(self, s):
        ''' Return s with bare ampersands and orphan angle brackets replaced
        by entities, so that slightly broken input can still be parsed. '''
        #first those nasty ampersands (entities and character references
        #such as &amp; or &#160; are left alone)
        s = re.sub(r'&(?!#?\w+;)', '&amp;', s)
        #and now orphan > < signs
        if self.r1 is None:
            self.r1 = re.compile('<([^>]*(<|$))')
            self.r2 = re.compile('((^|>)[^<]*)>')
        old = ''
        # One pass can swallow the '<' that ends a match, so repeat until
        # nothing changes any more.
        while s != old:
            old = s
            s = self.r1.sub(r'&lt;\1', s)
            s = self.r2.sub(r'\1&gt;', s)
        return s

    def _distributens(self, xpathExpression):
        ''' Actually we only support tag finding in this.

        Return xpathExpression with every step qualified for ElementTree:
        'prefix:tag' steps use the uri registered for that prefix, bare tags
        get the default namespace (if the document has one), and steps that
        are already {uri}tag, empty, '.', '..' or '*' are left alone. '''
        by_prefix = dict((prefix, uri)
                         for uri, prefix in self.realns.items())
        steps = []
        for step in self._PATH_SEPARATOR.split(xpathExpression):
            if step in ('', '.', '..', '*') or step.startswith('{'):
                steps.append(step)
                continue
            prefix, colon, local = step.partition(':')
            if colon and prefix in by_prefix:
                steps.append('{%s}%s' % (by_prefix[prefix], local))
            elif self.defns is not None:
                steps.append('{%s}%s' % (self.defns, step))
            else:
                steps.append(step)
        return '/'.join(steps)

    def getText(self, xpathExpression, multiple=0):
        ''' Get a text object sensibly, given ET API for xml doesn't handle
        namespaces gracefully.

        Returns the text of the first match ('' if nothing matches or the
        element is empty); with multiple true, a list with the text of every
        match ('' for empty elements, [] if nothing matches). '''
        path = self._distributens(xpathExpression)
        if multiple:
            return [found.text or '' for found in self.tree.findall(path)]
        found = self.tree.find(path)
        if found is None:
            return ''
        return found.text or ''


if __name__ == "__main__":

    # Self-test for xmlHandler; run this file as a script to execute it.
    import unittest

    class TestCase(unittest.TestCase):

        def setUp(self):
            # unittest calls setUp before each test, so tests need not.
            self.ss = '''<?xml version="1.0" encoding="UTF-8"?>
<Dataset xmlns:swe="http://www.opengis.net/swe" xmlns:gml="http://www.opengis.net/gml"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:moles="http://ndg.nerc.ac.uk/moles"
xmlns:om="http://www.opengis.net/om" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://ndg.nerc.ac.uk/csml"
id="FGPfF9i0"><CSMLFeatureCollection gml:id="AfEj15o6"/><om:blah>blahvalue</om:blah><foo>foovalue</foo></Dataset>'''

        def testns(self):
            ''' Make sure we extract the namespaces correctly '''
            x = xmlHandler(self.ss, string=1)
            self.assertEqual(x.realns, {
                'http://www.opengis.net/om': 'om',
                'http://www.opengis.net/gml': 'gml',
                'http://ndg.nerc.ac.uk/csml': 'default',
                'http://www.opengis.net/swe': 'swe',
                'http://www.w3.org/1999/xlink': 'xlink',
                'http://www.w3.org/2001/XMLSchema-instance': 'xsi',
                'http://ndg.nerc.ac.uk/moles': 'moles'})

        def teststr(self):
            ''' Make sure we can get a string version after loading '''
            # Attribute and declaration order depend on the ElementTree
            # version, so check the content rather than one exact string.
            x = xmlHandler(self.ss, string=1)
            out = str(x)
            self.assertTrue(
                out.startswith('<?xml version="1.0" encoding="utf-8"'))
            self.assertTrue('default:' not in out)
            self.assertEqual(
                out.count('xmlns="http://ndg.nerc.ac.uk/csml"'), 1)
            for uri, prefix in x.realns.items():
                if prefix != 'default':
                    self.assertTrue('xmlns:%s="%s"' % (prefix, uri) in out)
            # What we wrote must load again with the same namespaces.
            y = xmlHandler(out[out.index('<Dataset'):], string=1)
            self.assertEqual(x.realns, y.realns)
            self.assertEqual('foovalue', y.getText('foo'))

        def testorphans(self):
            ''' Make sure we can handle orphan characters properly '''
            s = '<data> 1<2</data>'
            x = xmlHandler(s, string=1)
            self.assertEqual('<data> 1&lt;2</data>', str(x))

        def testAmpersand1(self):
            ''' Can we load unescaped ampersands?'''
            s = '<data> a & b </data>'
            x = xmlHandler(s, string=1)
            self.assertEqual('<data> a &amp; b </data>', str(x))

        def testAmpersand2(self):
            ''' Do we output proper things? '''
            s = '<data> 2 & 3 < 8 </data>'
            x = xmlHandler(s, string=1)
            self.assertEqual('<data> 2 &amp; 3 &lt; 8 </data>', str(x))

        def testPrettyPrint(self):
            ''' Test a simple pretty print '''
            s = '<?xml version="1.0" encoding="utf-8"?><data><element>stuff</element></data>'
            x = xmlHandler(s, string=1)
            h = x.tohtml()  # only testing the mechanics, not the result
            self.assertTrue('stuff' in h)

        #turn off the test (unittest only collects names starting "test")
        def AtestRealDIF(self):
            ''' Test a real DIF from the ndgRetrieve stable '''
            f = 'ndgRetrieve.badc.nerc.ac.uk__DIF__dataent_11738019833217179.debug.xml'
            x = xmlHandler(f)
            y = str(x)  # only testing the mechanics, not the result

        def testDIF(self):
            ''' A DIF fragment: default namespace plus one unused prefix '''
            s = '''<DIF xmlns="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><Entry_ID>badc.nerc.ac.uk__DIF__dataent_11738019833217179</Entry_ID></DIF>'''
            x = xmlHandler(s, string=1)
            self.assertEqual(x.realns, {
                'http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/': 'default',
                'http://www.w3.org/2001/XMLSchema-instance': 'xsi'})
            self.assertEqual(
                'badc.nerc.ac.uk__DIF__dataent_11738019833217179',
                x.getText('Entry_ID'))
            self.assertTrue('Entry_ID' in str(x))
            self.assertTrue('Entry_ID' in x.tohtml())

    unittest.main()