1 | #!/usr/bin/env python |
---|
2 | ''' |
---|
3 | Class representing the contents of a row in the metadata_record postgres DB table |
---|
4 | C Byrom Apr 08 |
---|
5 | ''' |
---|
6 | try: #python 2.5 |
---|
7 | from xml.etree import ElementTree as ET |
---|
8 | except ImportError: |
---|
9 | try: |
---|
10 | # if you've installed it yourself it comes this way |
---|
11 | import ElementTree as ET |
---|
12 | except ImportError: |
---|
13 | # if you've egged it this is the way it comes |
---|
14 | from elementtree import ElementTree as ET |
---|
15 | #this is a fix to the ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc. |
---|
16 | #ET._namespace_map.update({'http://www.oceannet.org/mdip/xml': 'mdip', 'http://www.w3.org/1999/xlink':'xlink'}) |
---|
17 | |
---|
18 | import os, sys, logging |
---|
19 | from ETxmlView import loadET, nsdumb |
---|
20 | import molesReadWrite as MRW |
---|
21 | from ndgUtils.ndgObject import ndgObject |
---|
22 | from FileUtilities import FileUtilities |
---|
23 | |
---|
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table

    On construction the input file is read, transformed into every format listed in
    documentTypes (via XQuery transforms run through saxon) and scanned for
    spatiotemporal information.

    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: If true, discovery_id is parsed with ndgObject to obtain the
    local ID and repository; otherwise discovery_id and datacentre_namespace are used as is
    @param datacentre_groups: Groups passed to the keyword adder as keywords; an empty
    value means no keywords are added
    @param datacentre_namespace: Namespace of the datacentre providing the record
    @param discovery_id: Discovery ID of the record
    @param xq: Object providing the XQuery text via its actual() method
    @param docType: Format of the input file; only 'DIF' and 'MDIP' can be transformed
    into moles format
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'MDIP', 'ISO19139']

    # XQuery to run to produce each document type from the moles document
    _xqueryNames = {'DIF': 'moles2dif', 'MOLES': 'moles', 'DC': 'moles2DC',
                    'MDIP': 'moles2mdip', 'ISO19139': 'moles2iso19139'}

    # instance attribute in which each transformed document type is cached
    _formatAttributes = {'DIF': '_difFormat', 'MOLES': '_molesFormat', 'DC': '_dcFormat',
                         'MDIP': '_mdipFormat', 'ISO19139': '_iso19139Format'}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value
        self._allDocs = []          # stores all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        self._dir, _separator, self._shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the handle is released again
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # we use loadET to protect ourselves from scummy characters and unicode problems
        # DO WE NEED TO DO THIS??
        self.correctedFormat = loadET(self.originalFormat)

        # initialise the various record fields
        # NOTE(review): the transforms below cache their results in the underscore-prefixed
        # attributes named in _formatAttributes, not in these public fields - confirm
        # whether any caller still reads these
        self.db_id = None   # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()

    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")

    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the input file's
        directory - to allow for use in the various xqueries. If the moles transform has not
        been run yet, it is run first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" % self._molesDir)

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        NB, exits the program (sys.exit) if saxon fails or the temporary query file
        cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if 'moles2' in xQueryType:
            if self._molesDir is None:
                self.createMolesFile()
            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp saxon9.jar net.sf.saxon.Query " + xqFile
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()   # None means the command exited successfully

        if status is not None:
            # stderr is merged into the output, so log it to show why saxon failed;
            # the query file is deliberately left in place to help with debugging
            logging.error(output)
            sys.exit("Failed at running the XQuery")
        logging.debug(output)

        # now remove the temp xquery file; os.unlink signals failure by raising, not by its
        # return value
        try:
            os.unlink(xqFile)
        except OSError:
            sys.exit("Failed to remove the temporary xquery file, " + xqFile)

        logging.info("Transform completed successfully")
        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        NB, exits the program (sys.exit) if the input document type has no moles transform
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = {"DIF": "dif2moles", "MDIP": "mdip2moles"}.get(self.docType)
        if xqName is None:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format"
                     % self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required; this must come after the transform since the keywords
        # are added to the moles document produced above
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The current moles document is written to a temporary directory, processed by
        keywordAdder and the result read back into the record.
        '''
        # NOTE(review): the file never imports keywordAdder although this method has always
        # referenced it; assumed to be a project module of that name - confirm
        import keywordAdder

        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions. The input dir is a dedicated
        # subdirectory rather than the working directory itself, since it is cleaned afterwards
        workingDir = os.getcwd()
        tmpDir = workingDir + "/keywordsInput/"
        tmpKeywordsDir = workingDir + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        convertedFile = open(tmpKeywordsDir + tmpFile, 'r')
        try:
            self._molesFormat = convertedFile.read()
        finally:
            convertedFile.close()

        # Finally, tidy up temp dirs
        # NOTE(review): the second call was previously spelt clearDir; cleanDir is assumed to
        # be the real FileUtilities method - confirm
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform. NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return - one of documentTypes
        @return: the document in the requested format
        @raise KeyError: if docType is not a known document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = self._xqueryNames[docType]
        attributeName = self._formatAttributes[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        # the doc type doesn't exist - so run the xquery and remember the result
        logging.info("Document not available - creating new transformed document")
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs, in documentTypes order
        '''
        if not self._allDocs:
            for docType in self.documentTypes:
                self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs

    @staticmethod
    def listify(item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]

    @staticmethod
    def _parseLimit(rawLimit, positiveSuffix, negativeSuffix):
        '''
        Convert a bounding box limit into a plain signed number string
        @param rawLimit: limit as read from the document, e.g. '10', '-10', '10W'
        @param positiveSuffix: hemisphere letter which leaves the sign alone ('E' or 'N')
        @param negativeSuffix: hemisphere letter which makes the value negative ('W' or 'S')
        @return: the limit as a string acceptable to float(), or None if it is not valid
        '''
        try:
            limit = rawLimit.strip()
        except AttributeError:
            # not a string at all - e.g. the element was missing
            return None

        if limit.endswith(positiveSuffix):
            limit = limit[:-1].strip()
        elif limit.endswith(negativeSuffix):
            limit = limit[:-1].strip()
            if not limit.startswith('-'):
                limit = "-" + limit

        try:
            float(limit)
        except ValueError:
            return None
        return limit

    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the record's file, which is parsed as a moles document.
        Sets west, south, east and north (left as 'null' when no valid bounding box is found) and
        startdate and enddate (left as 'nostartdate'/'noenddate' when not found). Only the first
        bounding box is used.
        '''
        ET._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink': 'xlink'})
        self.east = 'null'
        self.west = 'null'
        self.north = 'null'
        self.south = 'null'
        self.startdate = 'nostartdate'
        self.enddate = 'noenddate'

        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(ET.ElementTree(file=self.filename).getroot())
        except Exception:
            logging.warning("WARNING: Cannot parse the XML moles document %s. Will not process" % self.filename)
            return

        # the coverage elements are optional, so a failed lookup just means 'not present'
        bbox = None
        try:
            bboxList = self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
            if bboxList:
                # only use the first bounding box
                bbox = bboxList[0]
        except Exception:
            bbox = None
        if bbox is None:
            logging.info("XML moles document " + self.filename +
                         " does not contain a bounding box.")

        dates = None
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
            logging.debug("startdate = %s", startdate)
            logging.debug("enddate = %s", enddate)
        except Exception:
            dates = None
            logging.info("XML moles document " + self.filename + " does not contain temporal info.")

        if bbox is None and dates is None:
            logging.info("XML moles document " + self.filename + " does not contain any spatiotemporal info.")
            return

        if dates is not None:
            if startdate is None or startdate == 'None':
                startdate = "nostartdate"
            if enddate is None or enddate == 'None':
                enddate = "noenddate"
            self.startdate = startdate
            self.enddate = enddate

        if bbox is not None:
            # parse the coordinates somewhat; nothing is stored unless all four limits are valid
            limits = {}
            for name, elementName, positiveSuffix, negativeSuffix in (
                    ('west', 'LimitWest', 'E', 'W'),
                    ('east', 'LimitEast', 'E', 'W'),
                    ('north', 'LimitNorth', 'N', 'S'),
                    ('south', 'LimitSouth', 'N', 'S')):
                limit = self._parseLimit(getattr(bbox, elementName, None), positiveSuffix, negativeSuffix)
                if limit is None:
                    logging.error("ERROR: Will not process File %s. Contains incorrect %s bounding box limit."
                                  % (self.filename, name.capitalize()))
                    return
                limits[name] = limit

            self.west = limits['west']
            self.east = limits['east']
            self.north = limits['north']
            self.south = limits['south']

        logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                     self.west, self.south, self.east, self.north)
        logging.info("Temporal info: startdate %s, enddate %s", self.startdate, self.enddate)

    def hasNullCoords(self):
        '''
        Check whether any of the bounding box limits is undefined
        @return: True if any of west, south, east or north is 'null', False otherwise
        '''
        coords = [str(coord) for coord in (self.west, self.south, self.east, self.north)]
        return 'null' in coords
---|
412 | |
---|