source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3912

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3912
Revision 3912, 17.5 KB checked in by cbyrom, 11 years ago

Adjust logging in oai_document_ingester + adjust exception handling,
avoiding use of sys.exit() to allow processing workflow to continue
with other files + general tidy up and small fixes of several files.

#!/usr/bin/env python
'''
Class representing the contents of a row in the metadata_record postgres DB table
C Byrom Apr 08
'''
try: #python 2.5
    from xml.etree import cElementTree
except ImportError:
    try:
        # if you've installed it yourself it comes this way
        import cElementTree
    except ImportError:
        # if you've egged it this is the way it comes
        from ndgUtils.elementtree import cElementTree

import os, sys, logging, re
import molesReadWrite as MRW
from ndgUtils.ndgObject import ndgObject
from FileUtilities import FileUtilities
from SpatioTemporalData import SpatioTemporalData
import keywordAdder

class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table
    @param filename: Name of file to use as a metadata record
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj=ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp)-1])
        self._shortFilename = tmp[len(tmp)-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file
        self.originalFormat = file(filename).read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r'\'', '\\\'', inputString)


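    # Worked example of escapeSpecialCharacters (illustrative only, not part of
    # the original module): each apostrophe gains a preceding backslash so the
    # text can be embedded safely in a quoted SQL string, e.g.
    #   escapeSpecialCharacters("O'Brien's data")  ->  "O\'Brien\'s data"
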
    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been run and then produce this file - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed to run the XQuery'

        # now remove the temp xquery file
        status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile

        logging.info("Transform completed successfully")

        return output


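    # Illustrative command line built by doTransform() above (the jar path and
    # query file name are taken straight from the code; actual paths may differ
    # per deployment):
    #
    #   java -cp ./lib/saxon9.jar net.sf.saxon.Query currentQuery.xq !omit-xml-declaration=yes
    #
    # The trailing !omit-xml-declaration=yes is a Saxon command-line serialization
    # parameter, used here to stop the transformed output starting with an XML
    # declaration.
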
    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType

        # now run the appropriate transform and set the attribute
        setattr(self, "_molesFormat", self.doTransform(xqName))

        # add keywords, if required
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # Finally, tidy up temp dirs
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except AttributeError:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs


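    # Shape of the value returned by getAllDocs() (illustrative only - the
    # second element of each pair is the full transformed document string):
    #   [['MOLES', '...'], ['DIF', '...'], ['DC', '...'], ['ISO19139', '...']]
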
    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list; if it isn't, it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if type(item) is list:
            return item
        else:
            return [item]


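    # Illustrative behaviour of listify() (not part of the original module):
    #   listify([1, 2]) -> [1, 2]
    #   listify('abc')  -> ['abc']
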
    def getSpatioTemporalData(self):
        '''
        Extract spatiotemporal data from the original document
        '''
        # initialise the spatiotemporal data object used to collect the extracted data
        self.stData = SpatioTemporalData()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)

        # do quick checks to see if the relevant data exists
        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


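    # Element path inspected by getSpatioTemporalData(), as navigated via the
    # molesReadWrite object above (a sketch of the moles layout implied by the
    # attribute names used in this class, not a schema reference):
    #
    #   dgMetadataRecord
    #     dgDataEntity
    #       dgDataSummary
    #         dgDataCoverage
    #           dgSpatialCoverage / BoundingBox  (LimitNorth, LimitSouth, LimitEast, LimitWest)
    #           dgTemporalCoverage / DateRange   (DateRangeStart, DateRangeEnd)
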
    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate==None or startdate=='None':
                    startdate="null"
                if enddate==None or enddate=='None':
                    enddate="null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate)
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list=self.listify(bboxes)
            # parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a moles file bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord=coordValue.split(maxField)[0]
            elif coord.endswith(minField):
                if coord.startswith('-'):
                    coord = coordValue.split(minField)[0]
                else:
                    coord = "-" + coordValue.split(minField)[0]

            return '%s' % float(coord)
        except:
            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue


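    # Worked examples of parseCoord() (illustrative only): a min-field suffix
    # ('W' or 'S') flips the sign, a max-field suffix is simply stripped, and
    # the result is returned as the string form of a float:
    #   parseCoord('10W', 'W', 'E')   -> '-10.0'
    #   parseCoord('30.5N', 'S', 'N') -> '30.5'
    #   parseCoord('-5S', 'S', 'N')   -> '-5.0'
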
    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        '''
        if str(self.west)=='null' or \
            str(self.south)=='null' or \
            str(self.east)=='null' or \
            str(self.north)=='null':
            return True
        else:
            return False

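# ---------------------------------------------------------------------------
# Illustrative usage of PostgresRecord (a sketch, not part of the original
# module). The file path, namespace and discovery id are hypothetical, and
# xqueryHelper stands in for whatever xquery-provider object the ingester
# normally passes in (it must expose the actual() method used in doTransform()).
#
#   record = PostgresRecord('/datacentre/dif/mydoc.xml',
#                           ndg_dataprovider=False,
#                           datacentre_groups='',
#                           datacentre_namespace='badc.nerc.ac.uk',
#                           discovery_id='mydoc_discovery_id',
#                           xq=xqueryHelper,
#                           docType='DIF')
#   record.doRecordTransforms()          # run all the configured transforms
#   docs = record.getAllDocs()           # [[docType, transformedDoc], ...]
#   bboxes = record.getSpatialData()     # Coords objects from the moles doc
#   dates = record.getTemporalData()     # TimeRange objects from the moles doc
# ---------------------------------------------------------------------------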