source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5524

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5524
Revision 5524, 23.9 KB, checked in by sdonegan, 11 years ago

Updated to allow extraction of end date into special column to allow ordering by this

#!/usr/bin/env python
'''
Class representing a document to be ingested into the postgres DB table
C Byrom Apr 08
'''
from xml.etree import cElementTree
import os, sys, logging, re, pkg_resources
import csml.csml2Moles.molesReadWrite as MRW
from ndg.common.src.models.ndgObject import ndgObject
from ndg.common.src.lib.ndgresources import ndgResources
import ndg.common.src.lib.fileutilities as FileUtilities
from SpatioTemporalData import SpatioTemporalData
import keywordAdder

SAXON_JAR_FILE = 'lib/saxon9.jar'

class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: name of file to use as the metadata record
    @param ndg_dataprovider: flag indicating whether the record comes from an NDG data provider
    @param datacentre_groups: datacentre groups to add as keywords
    @param datacentre_namespace: namespace of the datacentre
    @param discovery_id: discovery ID of the record
    @param datasetName: name of the dataset
    @param datacentreName: name of the datacentre
    @param datasetLastEditUpdateDate: last edit/update date of the dataset
    @param datasetStartDateNom: nominal start date of the dataset
    @param datasetEndDateNom: nominal end date of the dataset - extracted into its own column to allow ordering
    @param xq: xquery object
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    #def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id,
                 datasetName, datacentreName, datasetLastEditUpdateDate, datasetStartDateNom, datasetEndDateNom,
                 xq, docType):

        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        # simplify processing by uppercasing format at initialisation
        self.docType = docType.upper()

        self.dataset_name = datasetName
        self.dataset_lastEdit = datasetLastEditUpdateDate
        self.datacentre_name = datacentreName
        self.datasetStartNom = datasetStartDateNom
        self.datasetEndNom = datasetEndDateNom

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp)-1])
        self.shortFilename = tmp[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        self.originalFormat = file(filename).read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r'\'', '\\\'', inputString)

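    # A rough illustration of escapeSpecialCharacters (assumption, inferred from the regex above):
    # apostrophes gain a leading backslash so the text can sit inside a quoted SQL string,
    # e.g. "it's a test" becomes "it\'s a test".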

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r'%20', ' ', inputString)


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been run, and then produce this file - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly
        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery" + xQueryType + ".xq"
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)

        # Now do the transform
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        '''status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''

        logging.info("Transform completed successfully")

        return output
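    # For illustration only (doTransform): with xQueryType 'dif2moles' the assembled command looks
    # roughly like the following - the saxon jar path is resolved via pkg_resources, so the path
    # shown here is hypothetical:
    #   java -cp /usr/local/WSClients/OAIBatch/lib/saxon9.jar net.sf.saxon.Query currentQuerydif2moles.xq !omit-xml-declaration=yes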


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType

        # now run the appropriate transform and set the attribute
        setattr(self, "_molesFormat", self.doTransform(xqName))

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        FileUtilities.setUpDir(tmpDir)
        FileUtilities.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # Finally, tidy up temp dirs
        FileUtilities.cleanDir(tmpDir)
        FileUtilities.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except AttributeError:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs


    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list; if it isn't, it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if type(item) is list:
            return item
        else:
            return [item]
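    # listify in practice (illustrative, from the type check above):
    #   listify("a")        ->  ["a"]
    #   listify(["a", "b"]) ->  ["a", "b"]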


    def getSpatioTemporalData(self):
        '''
        Extract spatiotemporal data from the original document
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the spatiotemporal data object used to collect the extracted data
        self.stData = SpatioTemporalData()

        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below - this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)


    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors


    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            parameters_list = self.listify(parameters)
            for parameter in parameters_list:
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters


    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores - leading to, e.g., NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope
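    # For illustration (getScopeInfo): the substitution above turns, e.g., 'NERC_DDC' into
    # 'NERCUNDERSCOREDDC', so to_tsvector keeps it as a single token instead of splitting
    # it into 'NERC' and 'DDC'.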


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate)
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list = self.listify(bboxes)
            # parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west " + west + ", south " + south + ", east " + \
                    east + ", north " + north)

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a moles file bbox limit - together with
        the appropriate max/min limits - and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coordValue.split(maxField)[0]
            elif coord.endswith(minField):
                if coord.startswith('-'):
                    coord = coordValue.split(minField)[0]
                else:
                    coord = "-" + coordValue.split(minField)[0]

            return '%s' % float(coord)
        except:
            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
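    # parseCoord examples (illustrative, traced from the logic above):
    #   parseCoord('30.5W', 'W', 'E')  ->  '-30.5'
    #   parseCoord('45N', 'S', 'N')    ->  '45.0'
    #   parseCoord('12.25', 'S', 'N')  ->  '12.25'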


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        '''
        if str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null':
            return True
        else:
            return False

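# Minimal usage sketch (illustrative only - the file path, IDs, names and dates below are
# hypothetical placeholders; in the real ingest pipeline records are created and persisted by
# the OAIBatch scripts, e.g. oai_ingest.py and PostgresDAO):
#
#   record = PostgresRecord('/data/datacentre/dataset1.xml',  # harvested metadata file
#                           False,                            # ndg_dataprovider flag
#                           ['GROUP_A'],                      # datacentre_groups
#                           'datacentre.namespace',           # datacentre_namespace
#                           'dataset1_discovery_id',          # discovery_id
#                           'Dataset One',                    # datasetName
#                           'Example Data Centre',            # datacentreName
#                           '2009-01-01',                     # datasetLastEditUpdateDate
#                           '1990-01-01',                     # datasetStartDateNom
#                           '2000-12-31',                     # datasetEndDateNom
#                           None,                             # xq - not used directly in this class
#                           'DIF')                            # docType
#
#   docs = record.getAllDocs()             # [docType, transformedDoc] pairs for each format
#   timeRanges = record.getTemporalData()  # TimeRange objects parsed from the moles file
#   bboxes = record.getSpatialData()       # Coords objects parsed from the moles file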