source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3967

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3967
Revision 3967, 21.8 KB checked in by cbyrom, 11 years ago (diff)

Update code to allow the extraction of authors, parameters and scope
from moles files + adjust the data model to handle these new data.

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21import keywordAdder
22
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table.

    A record holds the original document, the versions of it produced by the
    XQuery transforms, and the spatiotemporal, author, parameter and scope
    details parsed out of the moles version.

    @param filename: Name of file to use a metadata record
    @param ndg_dataprovider: if true, the local and repository IDs are taken from
        the parsed discovery ID rather than from the other arguments
    @param datacentre_groups: groups to add to the moles doc as keywords; an empty
        string means no keywords are added
    @param datacentre_namespace: stored as the repository (and, for non NDG data
        providers, also as the repository local ID)
    @param discovery_id: substituted for 'Input_Entry_ID' in the xqueries
    @param xq: object whose actual() method supplies the text of a named xquery
    @param docType - type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        '''
        Set up the record fields and read in (and escape) the original document
        @raise IOError: if the file cannot be opened or read
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        # NB, a filename with no '/' gives an empty dir and the whole name as short filename
        self._dir, _, self._shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file - closing the handle again straight away
        with open(filename) as sourceFile:
            contents = sourceFile.read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(contents)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations.  Currently this puts a backslash in front of every apostrophe.
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace("'", "\\'")


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' dir beside the original file
        - running the moles transform first if it has not been ran yet - and load the
        written file into self.dgMeta, to allow for use in the various xqueries and parsers.
        @raise SystemError: if the written moles file cannot be parsed
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        molesFile = molesDir + self._shortFilename
        self._fileUtils.createFile(molesFile, self._molesFormat)
        # only publish the dir once the file is really there; doTransform uses a set
        # _molesDir as its signal that the moles file is available
        self._molesDir = molesDir
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        # NB, parse into a local first so a failed parse does not leave a half loaded self.dgMeta
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception as detail:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))
        self.dgMeta = dgMeta


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type, by writing the
        query to a temporary file and running saxon on it
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command reports a failure
        @raise OSError: if the temporary xquery file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        collectionDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            collectionDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        try:
            # Now do the transform
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # close() gives None when the command succeeded, else its exit status
            status = pipe.close()
        finally:
            # remove the temp xquery file whether or not the transform worked;
            # os.unlink raises OSError itself if the removal fails
            os.unlink(xqFile)

        if status is not None:
            # stderr was folded into the output, so this is the only record of what went wrong
            logging.error("Saxon output from failed XQuery:\n%s", output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested -
        adding the datacentre keywords, if any, and escaping apostrophes in the result
        @raise TypeError: if there is no xquery to turn the input doc type into moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result:
        the moles doc is written to a temp dir, keywordAdder is ran over that dir and
        the converted file is read back in as the new moles doc
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file - making sure the handle really is closed
        with open(tmpKeywordsDir + "/" + tmpFile, 'r') as convertedFile:
            self._molesFormat = convertedFile.read()

        # Finally, tidy up temp dirs
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not one of the known document types
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined; most of the format attributes are
        # only created by this method, so they may not exist yet
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one per entry in documentTypes
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        # rebuild from scratch, rather than appending, so that a previous run which was
        # interrupted part way through cannot leave duplicate entries behind
        self._allDocs = [[docType, self.getDocumentFormat(docType)] for docType in self.documentTypes]
        return self._allDocs


    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document, loading
        the moles file first if necessary, and store it in self.stData
        '''
        logging.info('Retrieving spatiotemporal info from moles file')

        # load the moles doc before touching self.stData, so that a failure here does not
        # leave an empty data object cached as though the parsing had been done
        if self.dgMeta is None:
            self.createMolesFile()

        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # do quick checks to see if the relevant data exists
        summary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not coverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        if not coverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)


    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file - this is the cited authors followed
        by the data creators, separated by a space
        @return: the authors string, which is also stored in self.authors
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        # NB, the lookups are deliberately best effort - a doc without these elements is valid
        try:
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator
            logging.info("Found creator information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        # format, rather than concatenate, so that an element which resolved to None (or
        # to something other than a string) cannot raise a TypeError here
        self.authors = "%s %s" % (authors or "", creators or "")
        return self.authors


    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: space separated string of the parameter summary text, which is also
            stored in self.parameters; empty if no parameter text was found
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            summary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary
            logging.info("Found parameter information - adding this to record")

            # NOTE(review): only plain text entries are folded into the string; the layout of
            # structured summary elements is not visible from this class, so these are left
            # out for now - TODO confirm which child elements should be included
            stringTypes = (str, type(u''))
            params = " ".join([entry.strip() for entry in self.listify(summary)
                               if isinstance(entry, stringTypes)])

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters


    def getScopeInfo(self):
        '''
        Extract scope info from the moles file - i.e. the valid term of every structured
        keyword whose parent list ID starts with the NDG data provider vocab URL
        @return: the scope string (each term preceded by a space), which is also
            stored in self.scope
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            for keyword in self.listify(keywords):
                if not keyword.dgValidTermID:
                    continue

                if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                    logging.info("Found scope value - adding this to record")
                    scope += " " + keyword.dgValidTerm.strip()

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, any terms found before a problem was hit are still kept
        self.scope = scope
        return self.scope


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found to self.stData
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                # NB, let logging do the formatting - so an odd value cannot raise here
                # and stop the remaining ranges being added
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception as detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData.  Problems with
        the bounding boxes - including bad limits - are logged as warnings, not raised
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception as detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n"
                            "to an incomplete set of metadata being ingested. \nDetail: %s", detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it.  A value
        ending in the min field is made negative, unless it already has a minus sign.
        @param coordValue: the contents of the bbox limit tage
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be turned into a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, always work from the stripped value - otherwise leading white space
            # ends up between the minus sign and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                number = coord.split(minField)[0]
                if coord.startswith('-'):
                    coord = number
                else:
                    coord = "-" + number

            return '%s' % float(coord)
        except Exception:
            # format the value in - it may not be a string (e.g. None for a missing limit)
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % coordValue)


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        @return: True if any of the west, south, east or north attributes of the record
            is 'null'; an attribute which has never been set counts as null as well
        '''
        # NB, nothing else in this class sets these attributes - so look them up defensively
        for limit in ('west', 'south', 'east', 'north'):
            if str(getattr(self, limit, 'null')) == 'null':
                return True
        return False
552       
Note: See TracBrowser for help on using the repository browser.