1 | #!/usr/bin/env python |
---|
2 | ''' |
---|
3 | Class representing the contents of a row in the metadata_record postgres DB table |
---|
4 | C Byrom Apr 08 |
---|
5 | ''' |
---|
6 | try: #python 2.5 |
---|
7 | from xml.etree import cElementTree |
---|
8 | except ImportError: |
---|
9 | try: |
---|
10 | # if you've installed it yourself it comes this way |
---|
11 | import cElementTree |
---|
12 | except ImportError: |
---|
13 | # if you've egged it this is the way it comes |
---|
14 | from ndgUtils.elementtree import cElementTree |
---|
15 | |
---|
16 | import os, sys, logging, re |
---|
17 | import molesReadWrite as MRW |
---|
18 | from ndgUtils.ndgObject import ndgObject |
---|
19 | from FileUtilities import FileUtilities |
---|
20 | from SpatioTemporalData import SpatioTemporalData |
---|
21 | import keywordAdder |
---|
22 | |
---|
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table

    The constructor reads the original metadata file from disk; the other
    document formats are produced on demand by XQuery transforms and stored
    on the instance once created.

    @param filename: path of the file to use as the metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to
        obtain the local ID and the repository ID of the record
    @param datacentre_groups: datacentre groups to add as keywords to the moles
        document; an empty string means no keywords are added
    @param datacentre_namespace: namespace of the datacentre - stored as the
        repository of the record
    @param discovery_id: discovery ID of the record
    @param xq: object whose actual() method supplies the XQuery text for a
        named transform
    @param docType: format of the original document, e.g. 'DIF' or 'MDIP'
    '''
    # document formats produced for every record; NB, MDIP is currently commented out
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']
---|
30 | |
---|
def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
    '''
    Set up the record from a metadata file on disk
    @param filename: path of the file to use as the metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to
        obtain the local ID and repository ID
    @param datacentre_groups: datacentre groups to add as keywords; "" for none
    @param datacentre_namespace: namespace of the datacentre
    @param discovery_id: discovery ID of the record
    @param xq: object supplying the XQuery text via its actual() method
    @param docType: format of the original document
    @raise IOError: if the file cannot be opened or read
    '''
    logging.info("Setting up Postgres record for file, " + filename)
    self.filename = filename

    # NB, if we're dealing with an NDG data provider, the details are slightly different
    if ndg_dataprovider:
        discObj = ndgObject(discovery_id)
        self._local_id = discObj.localID
        self._repository_local_id = discObj.repository
    else:
        self._local_id = discovery_id
        self._repository_local_id = datacentre_namespace

    self._datacentre_groups = datacentre_groups
    self._repository = datacentre_namespace
    self.discovery_id = discovery_id
    self._xq = xq
    self.docType = docType

    self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
    self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

    self._fileUtils = FileUtilities()

    # get the dir of the file - needed by the xquery to use as the target collection
    pathParts = filename.split('/')
    self._dir = '/'.join(pathParts[:-1])
    self._shortFilename = pathParts[-1]

    # dir to store a temp copy of the moles file, when produced - for use by other transforms
    self._molesDir = None

    # firstly load contents of file; the context manager guarantees the
    # handle is closed even if the read fails
    with open(filename) as sourceFile:
        contents = sourceFile.read()

    # escape any apostrophes
    self.originalFormat = self.escapeSpecialCharacters(contents)

    # initialise the various record fields
    # NOTE(review): getDocumentFormat stores its results under underscore-prefixed
    # names (_dcFormat etc.), not the names initialised here - confirm which
    # set the DAO reads
    self.db_id = None    # the DB ID of the record, for easy reference when it is created
    self.molesFormat = None
    self.dcFormat = None
    self.mdipFormat = None
    self.iso19139Format = None
    self.scn = 1    # system change number - keeps track of number of mods to a particular row

    # spatiotemporal data object
    self.stData = None
---|
79 | |
---|
def escapeSpecialCharacters(self, inputString):
    '''
    Put a backslash in front of every apostrophe in a string, so that the
    apostrophes do not interfere with string or DB operations
    @param inputString: string to correct
    @return: copy of the string with each ' replaced by \\'
    '''
    apostrophePattern = re.compile(r'\'')
    escapedApostrophe = '\\\''
    return apostrophePattern.sub(escapedApostrophe, inputString)
---|
88 | |
---|
89 | |
---|
def doRecordTransforms(self):
    '''
    Populate the record with every document format listed in documentTypes,
    by requesting each one in turn via getDocumentFormat
    '''
    logging.info("Running transforms for all document types")

    # the returned documents are not needed here - the call is made for
    # its effect on the record
    for requiredType in self.documentTypes:
        self.getDocumentFormat(requiredType)

    logging.info("Transforms complete")
---|
100 | |
---|
101 | |
---|
def createMolesFile(self):
    '''
    Write the moles version of the record to a 'moles' sub-directory of the
    directory holding the original file - to allow for use in the various
    xqueries. If the moles transform has not been run yet, it is run first.
    '''
    logging.info("Creating moles file on system - for use with other xquery transforms")
    molesDir = self._dir + "/moles/"
    self._molesDir = molesDir
    self._fileUtils.setUpDir(molesDir)

    # no moles doc yet - produce it before trying to write it out
    if self._molesFormat is None:
        self.doMolesTransform()

    molesPath = molesDir + self._shortFilename
    self._fileUtils.createFile(molesPath, self._molesFormat)
    logging.info("Moles file created - at %s", molesDir)
---|
116 | |
---|
117 | |
---|
def doTransform(self, xQueryType):
    '''
    Transform the record according to the specified XQuery type, by writing
    the query to a temporary file and running saxon on it
    @param xQueryType: XQuery doc to use to do the transform
    @return: the metadata record in the required transformed format - i.e. the
        combined stdout/stderr output of the saxon command
    @raise SystemError: if the saxon command exits with a non-zero status
    @raise OSError: if the temporary query file cannot be removed
    '''
    logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

    # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
    # moles file available for the transform - and use the correct dir for the xquery collection
    collectionDir = self._dir
    if xQueryType.find('moles2') > -1:
        if self._molesDir is None:
            self.createMolesFile()

        collectionDir = self._molesDir

    # get the query and set this up to use properly
    xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

    # sort out the input ID stuff
    xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
    xquery = xquery.replace('repository_localid', self._repository)

    # strip out the eXist reference to the libraries; these files should be available in the
    # running dir - as set up by oai_ingest.py
    xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
    xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

    # write the query to file, to make it easier to input
    # NB, running directly at the command line leads to problems with the interpretation of $ characters
    xqFile = "currentQuery.xq"
    self._fileUtils.createFile(xqFile, xquery)

    # Now do the transform
    # NB, this replaces PATH for child processes with a fixed, installation specific value
    os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
    xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
    logging.debug("Running saxon command: " + xqCommand)
    try:
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        # close() returns None when the command exited with status 0
        status = pipe.close()
    finally:
        # always remove the temp xquery file - including when the transform
        # fails; os.unlink raises OSError itself if the removal fails, so
        # there is no return status to check
        os.unlink(xqFile)

    if status is not None:
        # keep the saxon output, since it usually explains the failure
        logging.error("XQuery transform, %s, failed with status %s; output was:\n%s",
                      xQueryType, status, output)
        raise SystemError('Failed at running the XQuery')

    logging.info("Transform completed successfully")

    return output
---|
171 | |
---|
172 | |
---|
def doMolesTransform(self):
    '''
    Produce the moles version of the record from the originally ingested
    document and store it, apostrophe-escaped, in _molesFormat. Datacentre
    keywords are added first when datacentre groups were specified.
    NB, exits the interpreter (via sys.exit) if the original document type
    has no XQuery to convert it to moles
    '''
    logging.info("Creating moles document - for use with other transforms")

    # pick the xquery matching the type of the original doc
    knownQueries = (("DIF", "dif2moles"), ("MDIP", "mdip2moles"))
    for sourceType, queryName in knownQueries:
        if self.docType == sourceType:
            xqName = queryName
            break
    else:
        sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                 % self.docType)

    # now run the appropriate transform and store the result
    self._molesFormat = self.doTransform(xqName)

    # add keywords, if required
    if self._datacentre_groups != "":
        self.addKeywords()

    # escape any apostrophes
    escapedMoles = self.escapeSpecialCharacters(self._molesFormat)
    self._molesFormat = escapedMoles

    logging.info("moles document created")
---|
198 | |
---|
199 | |
---|
def addKeywords(self):
    '''
    If datacentre groups have been specified, these need to be added as keywords
    - NB, this is rather clumsy approach but uses old code to achieve the result

    The current moles doc is written to a temporary directory, keywordAdder
    is run over that directory and the resulting file is read back into
    _molesFormat
    '''
    logging.info("Adding datacentre keywords to moles file")

    # NB, use temporary directories to do the keyword additions
    tmpDir = os.getcwd() + "/tmp/"
    tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
    self._fileUtils.setUpDir(tmpDir)
    self._fileUtils.setUpDir(tmpKeywordsDir)
    tmpFile = 'tmpFile.xml'
    self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

    # presumably writes a file of the same name into tmpKeywordsDir - verify
    # against keywordAdder
    keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

    # Now load in the converted file
    # NB, a stray sys.exit() used to sit here, terminating the whole ingest and
    # leaving the rest of this method unreachable
    convertedFile = open(os.path.join(tmpKeywordsDir, tmpFile), 'r')
    try:
        self._molesFormat = convertedFile.read()
    finally:
        convertedFile.close()

    # Finally, tidy up temp dirs
    # NOTE(review): two different method names are used here (cleanDir and
    # clearDir) - confirm both exist on FileUtilities
    self._fileUtils.cleanDir(tmpDir)
    self._fileUtils.clearDir(tmpKeywordsDir)
    logging.info("Completed adding keywords")
---|
227 | |
---|
228 | |
---|
def getDocumentFormat(self, docType):
    '''
    Lookup document format; if it is already defined then return it, else do the required XQuery
    transform. NB, transforms are ran on the molesFormat document - so ensure this is available
    @param docType: format of document to return - one of 'DIF', 'MOLES', 'DC',
        'MDIP' or 'ISO19139'
    @return: the document in the requested format
    @raise KeyError: if docType is not one of the known formats
    '''
    logging.info("Retrieving document type, " + docType)
    xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
    attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

    # check we have the moles format available; if not create it
    if self._molesFormat is None:
        self.doMolesTransform()
        self.createMolesFile()

    # check the document isn't already defined; the attribute only exists once
    # the transform has been run, so a missing attribute is treated like None
    # (rather than swallowing every exception with a bare except)
    doc = getattr(self, attributeName, None)
    if doc is not None:
        logging.info("Found existing document - returning this now")
        return doc

    logging.info("Document not available - creating new transformed document")

    # the doc type doesn't exist - so run the xquery and keep the result
    transformedDoc = self.doTransform(xqName)
    setattr(self, attributeName, transformedDoc)
    return transformedDoc
---|
257 | |
---|
258 | |
---|
def getAllDocs(self):
    '''
    Return a list of all the available doc types in the record
    @return: list of [docType, document] pairs - one per entry in documentTypes
    '''
    # if the stored docs array is the same size as the array of all doc types
    # assume all transforms have been done - and just return these
    if len(self._allDocs) != len(self.documentTypes):
        # build the complete list before storing it: appending to a partially
        # filled list (e.g. after an earlier call failed half way) would add
        # duplicates and the size check above would then never succeed.
        # Slice assignment keeps the same list object for existing references
        self._allDocs[:] = [[docType, self.getDocumentFormat(docType)]
                            for docType in self.documentTypes]

    return self._allDocs
---|
272 | |
---|
273 | |
---|
def getTemporalData(self):
    '''
    Return the temporal data of the record, parsing the spatiotemporal data
    first if that has not been done yet
    @return: whatever the getTemporalData method of the stData object returns
    '''
    notYetParsed = self.stData is None
    if notYetParsed:
        self.getSpatioTemporalData()

    spatioTemporal = self.stData
    return spatioTemporal.getTemporalData()
---|
284 | |
---|
285 | |
---|
def getSpatialData(self):
    '''
    Return the spatial data of the record, parsing the spatiotemporal data
    first if that has not been done yet
    @return: whatever the getSpatialData method of the stData object returns
    '''
    notYetParsed = self.stData is None
    if notYetParsed:
        self.getSpatioTemporalData()

    spatioTemporal = self.stData
    return spatioTemporal.getSpatialData()
---|
296 | |
---|
297 | |
---|
def listify(self, item):
    '''
    listify checks if an item is a list, if it isn't it puts it
    inside a list and returns it. Always returns a list object.
    @param item: object to check
    @return: item itself if it is a list (or an instance of a list subclass),
        otherwise a new single element list holding item
    '''
    # isinstance, rather than an exact type comparison, so that list subclasses
    # are returned as they are instead of being wrapped in a second list
    if isinstance(item, list):
        return item
    return [item]
---|
309 | |
---|
310 | |
---|
def getSpatioTemporalData(self):
    '''
    Extract spatio temporal data from the moles version of the document and
    store it in a new SpatioTemporalData object on stData
    @raise SystemError: if the moles file cannot be parsed
    '''
    # the data is read from the moles file on disk - so make sure this has been
    # created; without this check a record which has not been transformed yet
    # fails below with a TypeError when building the file name
    if self._molesDir is None:
        self.createMolesFile()

    # initialise the various spatiotemporal arrays used to extract data to
    self.stData = SpatioTemporalData()

    molesFile = self._molesDir + self._shortFilename
    logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

    # load in the moles file and put this into an object for direct access to the xml elements
    dgMeta = MRW.dgMetadata()
    try:
        dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
    except Exception as detail:
        raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

    # do quick checks to see if the relevant data exists
    dataSummary = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
    if not dataSummary:
        logging.info("No data summary elements found - assuming no spatiotemporal data available")
        return

    dataCoverage = dataSummary.dgDataCoverage
    if not dataCoverage:
        logging.info("No data coverage elements found - assuming no spatiotemporal data available")
        return

    # spatial and temporal coverage are independent - either may be missing
    if not dataCoverage.dgSpatialCoverage:
        logging.info("No spatial coverage elements found - assuming no spatial data available")
    else:
        self.getCoordData(dgMeta)

    if not dataCoverage.dgTemporalCoverage:
        logging.info("No temporal coverage elements found - assuming no temporal data available")
    else:
        self.getTimeRangeData(dgMeta)
---|
346 | |
---|
347 | |
---|
def getTimeRangeData(self, dgMeta):
    '''
    Parse an xml tree and add any time range data found to stData
    NB, this is best effort: any problem met whilst reading the dates is
    logged and not raised
    @param dgMeta: object giving attribute access to the parsed moles document
    '''
    logging.info("Extracting time range info")
    try:
        dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

        if not dates:
            logging.info("No temporal info found for document")
            # nothing to add - stop here rather than carrying on and
            # failing on the empty value
            return

        for date in self.listify(dates):
            startdate = date.DateRangeStart
            enddate = date.DateRangeEnd
            # missing dates are recorded as the string "null"
            if startdate is None or startdate == 'None':
                startdate = "null"
            if enddate is None or enddate == 'None':
                enddate = "null"

            self.stData.addTimeRange(startdate, enddate)
            # use logging's own formatting, so that a non-string date cannot
            # abort the loop after the range has already been added
            logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
    except Exception as detail:
        logging.info("Document does not contain temporal info.")
        logging.info(detail)
---|
375 | |
---|
376 | |
---|
def getCoordData(self, dgMeta):
    '''
    Read the bounding boxes of a parsed moles document and add the
    coordinates of each one to stData
    NB, any problem met whilst reading the boxes is logged as a warning and
    not raised
    @param dgMeta: object giving attribute access to the parsed moles document
    '''
    logging.info("Extracting bounding box info")
    try:
        spatialCoverage = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage
        bboxes = spatialCoverage.BoundingBox

        if bboxes:
            # a single box is handled in the same way as a list of boxes
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)

                summaryParts = ["Spatial info: west= ", west, ",south ", south,
                                ", east ", east, ", north ", north]
                logging.info("".join(summaryParts))
        else:
            logging.info("No bounding box info found for document")

    except Exception as detail:
        warning = "Problem encountered whilst parsing bounding box info - this may lead \n" + \
                  "to an incomplete set of metadata being ingested. \nDetail: %s" %detail
        logging.warning(warning)
---|
405 | |
---|
406 | |
---|
def parseCoord(self, coordValue, minField, maxField):
    '''
    Take a coordinate value extracted from a molefile bbox limit - together with
    the appropriate max/min limits and extract the correct value from it
    @param coordValue: the contents of the bbox limit tag
    @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
    @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
    @return: coord - the value of the coordinate as a string; values given
        with the min field suffix are returned as negative numbers
    @raise SyntaxError: if the value cannot be interpreted as a coordinate
    '''
    logging.debug("Parsing document coordinates")
    try:
        coord = coordValue.strip()
        # NB, work on the stripped value throughout - splitting the original
        # value left leading whitespace behind, so e.g. ' 10S' became '- 10'
        # and was rejected
        if coord.endswith(maxField):
            coord = coord.split(maxField)[0]
        elif coord.endswith(minField):
            coord = coord.split(minField)[0]
            # an explicit minus sign is kept as it is - not doubled up
            if not coord.startswith('-'):
                coord = "-" + coord

        return '%s' % float(coord)
    except Exception:
        # format the value rather than concatenating it, so that a non-string
        # value (e.g. None) is still reported as a SyntaxError
        raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % (coordValue,))
---|
430 | |
---|
431 | |
---|
def hasNullCoords(self):
    '''
    Checks a record to determine whether it has any coordinates set to null
    @return: True if the string form of west, south, east or north is 'null',
        otherwise False
    '''
    # NB, the self parameter was missing from the signature although the body
    # reads attributes from it - so the method could not be called
    # NOTE(review): west/south/east/north are not set anywhere in the visible
    # class - confirm where these attributes are meant to come from
    for limitName in ('west', 'south', 'east', 'north'):
        if str(getattr(self, limitName)) == 'null':
            return True
    return False
---|
443 | |
---|