source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5248

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5248
Revision 5248, 23.4 KB checked in by cbyrom, 10 years ago

Update ingest scripts for use with the OAIInfoEditor harvest
functionality: allow config and jar file resources to be
retrieved even when not running in the ingest package, and allow
a harvest directory and format different from those in the config
file to be specified.

#!/usr/bin/env python
'''
Class representing a document to be ingested into the postgres DB table
C Byrom Apr 08
'''
from xml.etree import cElementTree
import os, sys, logging, re, pkg_resources
import csml.csml2Moles.molesReadWrite as MRW
from ndg.common.src.models.ndgObject import ndgObject
from ndg.common.src.lib.ndgresources import ndgResources
import ndg.common.src.lib.fileutilities as FileUtilities
from SpatioTemporalData import SpatioTemporalData
import keywordAdder

SAXON_JAR_FILE = 'lib/saxon9.jar'

class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as the metadata record
    @param ndg_dataprovider
    @param datacentre_groups
    @param datacentre_namespace
    @param discovery_id
    @param xq
    @param docType - type of doc to process
    '''
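    # Illustrative usage (argument values here are hypothetical placeholders, not from a real ingest run):
    #    record = PostgresRecord(filename, False, datacentre_groups, datacentre_namespace,
    #                            discovery_id, xq, 'DIF')
    #    difDoc = record.getDocumentFormat('DIF')
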
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139', 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, \
                 datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj=ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp)-1])
        self.shortFilename = tmp[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        self.originalFormat = file(filename).read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
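        # NB, just backslash-escape apostrophes here, so the text can safely be embedded in quoted DB statements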
        return re.sub(r'\'', '\\\'', inputString)


    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
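        # NB, currently only URL-encoded spaces (%20) are converted back to plain spaces here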
        return re.sub(r'%20', ' ', inputString)


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been run and then produce this file - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta=MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly
        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery" + xQueryType + ".xq"
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)

        # Now do the transform
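        # NB, the PATH is set explicitly so the JDK, eXist client and OAIBatch locations are available to the saxon command below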
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        '''status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType

        # now run the appropriate transform and set the attribute
        setattr(self, "_molesFormat", self.doTransform(xqName))

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        FileUtilities.setUpDir(tmpDir)
        FileUtilities.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # Finally, tidy up temp dirs
        FileUtilities.cleanDir(tmpDir)
        FileUtilities.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
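        # map the requested doc type to the xquery used to produce it and to the attribute used to cache the result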
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs


    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list; if it isn't, it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if type(item) is list:
            return item
        else:
            return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatiotemporal data from the original document
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below - this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)


    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors


    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            parameters_list = self.listify(parameters)
            for parameter in parameters_list:
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters


    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores - leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate=date.DateRangeStart
                enddate= date.DateRangeEnd
                if startdate==None or startdate=='None':
                    startdate="null"
                if enddate==None or enddate=='None':
                    enddate="null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate)
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list=self.listify(bboxes)
            #parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ", south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a moles file bbox limit, together with
        the appropriate max/min hemisphere labels, and extract the correct numeric value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        '''
        logging.debug("Parsing document coordinates")
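        # NB, limit values are expected to end with a hemisphere letter; values in the minField hemisphere ('S' or 'W') are returned as negative numbers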
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord=coordValue.split(maxField)[0]
            elif coord.endswith(minField):
                if coord.startswith('-'):
                    coord = coordValue.split(minField)[0]
                else:
                    coord = "-" + coordValue.split(minField)[0]

            return '%s' % float(coord)
        except:
            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        '''
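        # NB, assumes the north, south, east and west attributes have already been set on the record elsewhere - they are not initialised in __init__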
        if str(self.west)=='null' or \
            str(self.south)=='null' or \
            str(self.east)=='null' or \
            str(self.north)=='null':
            return True
        else:
            return False
