source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5464

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5464
Revision 5464, 23.8 KB, checked in by sdonegan

Allow the start date to be inserted into a new column in the original doc, to allow ordering by date even if the record has no date entry.

#!/usr/bin/env python
'''
Class representing a document to be ingested into the postgres DB table
C Byrom Apr 08
'''
from xml.etree import cElementTree
import os, sys, logging, re, pkg_resources
import csml.csml2Moles.molesReadWrite as MRW
from ndg.common.src.models.ndgObject import ndgObject
from ndg.common.src.lib.ndgresources import ndgResources
import ndg.common.src.lib.fileutilities as FileUtilities
from SpatioTemporalData import SpatioTemporalData
import keywordAdder

SAXON_JAR_FILE = 'lib/saxon9.jar'

class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: name of the file to use as the metadata record
    @param ndg_dataprovider: flag indicating whether the record comes from an NDG data provider
    @param datacentre_groups: datacentre groups to add to the record as keywords
    @param datacentre_namespace: namespace (repository id) of the datacentre
    @param discovery_id: discovery ID of the record
    @param datasetName: name of the dataset
    @param datacentreName: name of the datacentre
    @param datasetLastEditUpdateDate: date on which the dataset was last edited/updated
    @param datasetStartDateNom: nominal start date of the dataset
    @param xq: xquery object (no longer used directly - transforms now use ndgResources)
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    #def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace,
                 discovery_id, datasetName, datacentreName, datasetLastEditUpdateDate,
                 datasetStartDateNom, xq, docType):

        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        # simplify processing by uppercasing format at initialisation
        self.docType = docType.upper()

        self.dataset_name = datasetName
        self.dataset_lastEdit = datasetLastEditUpdateDate
        self.datacentre_name = datacentreName
        self.datasetStartNom = datasetStartDateNom

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp)-1])
        self.shortFilename = tmp[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        self.originalFormat = file(filename).read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None
    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r'\'', '\\\'', inputString)

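    # Illustrative example of the escaping (the input string below is assumed, not
    # taken from a real record): passing  O'Brien's data  through
    # escapeSpecialCharacters() yields
    #   O\'Brien\'s data
    # i.e. each apostrophe gains a preceding backslash, so the text can sit inside
    # a quoted SQL literal - presumably when the DAO builds its statements.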

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
        correctedString = re.sub(r'%20', ' ', inputString)
        return correctedString


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been run, and produce this file - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly

        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery" + xQueryType + ".xq"
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)

        # Now do the transform
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        '''status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType

        # now run the appropriate transform and set the attribute
        setattr(self, "_molesFormat", self.doTransform(xqName))

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but it uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        FileUtilities.setUpDir(tmpDir)
        FileUtilities.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # Finally, tidy up temp dirs
        FileUtilities.cleanDir(tmpDir)
        FileUtilities.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs

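    # Illustrative shape of the list returned by getAllDocs() (the XML snippets are
    # assumed placeholders, not taken from a real record) - each entry pairs a doc
    # type from documentTypes with the full transformed document string:
    #   [['MOLES', '<dgMetadata>...</dgMetadata>'],
    #    ['DIF', '<DIF>...</DIF>'],
    #    ['DC', '...'],
    #    ['ISO19139', '...']]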

    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list; if it isn't, it wraps it in a list
        and returns that. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if type(item) is list:
            return item
        else:
            return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the original document
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)


    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors


    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            parameters_list = self.listify(parameters)
            for parameter in parameters_list:
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters


    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores - leading to, e.g., NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate)
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list = self.listify(bboxes)
            # parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a moles file bbox limit - together with
        the appropriate max/min limits - and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coordValue.split(maxField)[0]
            elif coord.endswith(minField):
                if coord.startswith('-'):
                    coord = coordValue.split(minField)[0]
                else:
                    coord = "-" + coordValue.split(minField)[0]

            return '%s' % float(coord)
        except:
            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue

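    # Worked examples of parseCoord() behaviour (input values assumed for
    # illustration only):
    #   parseCoord('45N', 'S', 'N')  ->  '45.0'   (max-field suffix stripped)
    #   parseCoord('30W', 'W', 'E')  ->  '-30.0'  (min-field suffix stripped and sign flipped)
    #   parseCoord('-30W', 'W', 'E') ->  '-30.0'  (already negative, so only the suffix is removed)
    #   parseCoord('12.5', 'S', 'N') ->  '12.5'   (plain numeric values pass straight through)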

    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        '''
        if str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null':
            return True
        else:
            return False

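
# A minimal usage sketch (all argument values below are hypothetical - in practice
# the record is constructed by the OAIBatch ingest script, e.g. oai_ingest.py,
# which supplies the datacentre configuration and discovery ID at runtime):
#
#   record = PostgresRecord('/datacentre/data/example_record.xml',   # assumed path
#                           ndg_dataprovider=False,
#                           datacentre_groups=['EXAMPLE_GROUP'],     # assumed group list
#                           datacentre_namespace='example.ac.uk',
#                           discovery_id='example.ac.uk__DIF__example_record',
#                           datasetName='Example dataset',
#                           datacentreName='Example Data Centre',
#                           datasetLastEditUpdateDate='2009-01-01',
#                           datasetStartDateNom='2009-01-01',
#                           xq=None,
#                           docType='DIF')
#   record.doRecordTransforms()        # run the moles/DIF/DC/ISO19139 transforms
#   docs = record.getAllDocs()         # [[docType, transformedDoc], ...] for the DAO
#   authors = record.getAuthorsInfo()
#   spatial = record.getSpatialData()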