1 | #!/usr/bin/env python |
---|
2 | ''' |
---|
3 | Class representing the contents of a row in the metadata_record postgres DB table |
---|
4 | C Byrom Apr 08 |
---|
5 | ''' |
---|
6 | try: #python 2.5 |
---|
7 | from xml.etree import cElementTree |
---|
8 | except ImportError: |
---|
9 | try: |
---|
10 | # if you've installed it yourself it comes this way |
---|
11 | import cElementTree |
---|
12 | except ImportError: |
---|
13 | # if you've egged it this is the way it comes |
---|
14 | from ndgUtils.elementtree import cElementTree |
---|
15 | |
---|
16 | import os, sys, logging |
---|
17 | #from ETxmlView import loadET, nsdumb |
---|
18 | import molesReadWrite as MRW |
---|
19 | from ndgUtils.ndgObject import ndgObject |
---|
20 | from FileUtilities import FileUtilities |
---|
21 | from SpatioTemporalData import SpatioTemporalData |
---|
22 | |
---|
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table
    @param filename: Name of file to use a metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to obtain
    the local ID and repository; otherwise discovery_id and datacentre_namespace are used directly
    @param datacentre_groups: datacentre groups to add as keywords to the moles doc;
    nothing is added if this is empty
    @param datacentre_namespace: namespace of the datacentre - stored as the repository
    @param discovery_id: discovery ID of the record
    @param xq: object whose actual() method returns the text of a named XQuery
    @param docType: type of the original document; only 'DIF' and 'MDIP' can be
    transformed into moles format
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        # initialise every transformed-document slot, so lookups never need to
        # rely on catching AttributeError
        self._molesFormat = None
        self._difFormat = None
        self._dcFormat = None
        self._mdipFormat = None
        self._iso19139Format = None
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        pathParts = filename.split('/')
        self._dir = '/'.join(pathParts[:-1])
        self._shortFilename = pathParts[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - closing the handle explicitly
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # initialise the various record fields
        # NOTE(review): the public *Format fields below are never populated by this
        # class - the transformed docs live in the underscore-prefixed attributes;
        # they are retained in case external code reads them
        self.db_id = None  # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1  # system change number - keeps track of number of mods to a particular row

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the original
        file's directory - to allow for use in the various xqueries. If the moles
        transform has not been ran yet, it is ran first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" % self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command exits with a non-zero status
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if 'moles2' in xQueryType:
            if self._molesDir is None:
                self.createMolesFile()
            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        # NB, this replaces the PATH seen by child processes with a hard-coded value
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        # close() returns None on success, or the exit status on failure
        if status is not None:
            # stderr was merged into the output - log it so the failure can be diagnosed;
            # NB, the query file is left in place in this case
            logging.error("XQuery command output:\n%s", output)
            raise SystemError('Failed at running the XQuery')

        # now remove the temp xquery file; os.unlink raises OSError itself on failure
        os.unlink(xqFile)

        logging.info("Transform completed successfully")
        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested.
        Exits the program (via sys.exit) if the input document type is not DIF or MDIP.
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     % self.docType)

        # run the appropriate transform first - the keyword addition below works on
        # the resulting moles doc, so must not be ran before it exists
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        Replaces the stored moles doc with the version produced by keywordAdder.
        '''
        # NOTE(review): keywordAdder is not imported anywhere in this file; the module
        # name is taken from the call below - confirm its location
        import keywordAdder

        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        # NOTE(review): tmpDir is the current working directory itself and is passed to
        # cleanDir below - confirm this does not remove files needed elsewhere
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + "/" + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        convertedFile = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = convertedFile.read()
        finally:
            convertedFile.close()

        # Finally, tidy up temp dirs
        # NOTE(review): two different method names (cleanDir / clearDir) are used here -
        # verify both exist on FileUtilities
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform. NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not one of DIF, MOLES, DC, MDIP or ISO19139
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery, store the result and return it
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of [docType, document] pairs for every type in documentTypes;
        the list is built on first use and re-used afterwards
        '''
        if self._allDocs:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if type(item) is list:
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store it in self.stData
        @raise SystemError: if the moles file cannot be parsed
        '''
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # ensure the moles file has been written out before trying to read it
        if self._molesDir is None:
            self.createMolesFile()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' % molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception as detail:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' % (molesFile, detail))

        # do quick checks to see if the relevant data exists
        summary = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not coverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not coverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s" % (startdate, enddate))
        except Exception as detail:
            # best effort - a document without usable temporal info is not an error
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            # parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s" \
                             % (west, south, east, north))

        except Exception as detail:
            # best effort - report the problem but carry on with the ingest
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" % detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                # max-side suffix (N/E): drop the suffix, keep the sign as given
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                # min-side suffix (S/W): value is negative - add the sign if not already there
                value = coord.split(minField)[0]
                if coord.startswith('-'):
                    coord = value
                else:
                    coord = "-" + value

            return '%s' % float(coord)
        except Exception:
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % coordValue)


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NOTE(review): the west/south/east/north attributes are not set anywhere in
        this class - presumably they are assigned by the caller; confirm
        @return: True if any of the four coordinates is the string 'null'
        '''
        return any(str(getattr(self, name)) == 'null'
                   for name in ('west', 'south', 'east', 'north'))