source: TI01-discovery/branches/ingestion-MEDIN/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 6364

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestion-MEDIN/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@6364
Revision 6364, 24.8 KB checked in by sdonegan, 11 years ago (diff)

sync dirs

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re, pkg_resources
8import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10from ndg.common.src.lib.ndgresources import ndgResources
11import ndg.common.src.lib.fileutilities as FileUtilities
12from SpatioTemporalData import SpatioTemporalData
13import keywordAdder
14
15SAXON_JAR_FILE = 'lib/saxon9.jar'
16
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: True if the record comes from an NDG data provider
        (the discovery id is then decomposed via ndgObject)
    @param datacentre_groups: datacentre groups to add as keywords to the moles doc
    @param datacentre_namespace: namespace (repository id) of the datacentre
    @param isoDataModel: parsed ISO data model object - the discovery id, dataset
        name, revision date and bounding dates are read from this
    @param xq: xquery object (retained on the record; transforms actually run via ndgResources)
    @param doctype - type of doc to process (e.g. 'DIF', 'MDIP')
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"
34    #def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
35    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, isoDataModel, xq, docType):
36                 
37                 
38        logging.info("Setting up Postgres record for file, " + filename)
39       
40        self.filename = filename
41   
42        self.isoDataModel = isoDataModel
43       
44        #note method of extracting info from isoDataModel - nested lists, so if one value then use [0][0]
45        discovery_id = self.isoDataModel.datasetID[0][0]
46   
47        # NB, if we're dealing with an NDG data provider, the details are slightly different
48        if ndg_dataprovider:
49            discObj=ndgObject(discovery_id)
50            self._local_id = discObj.localID
51            self._repository_local_id = discObj.repository
52        else:
53            self._local_id = discovery_id
54            self._repository_local_id = datacentre_namespace
55           
56        self._datacentre_groups = datacentre_groups
57        self._repository = datacentre_namespace
58        self.discovery_id = discovery_id # just a single val..
59        self._xq = xq
60        # simplify processing by uppercasing format at initialisation
61        self.docType = docType.upper()   
62       
63       
64        #make sure we escape any special characters in this field... SJD 20/10/09       
65        #self.dataset_name = self.escapeSpecialCharacters(self.isoDataModel.datasetName[0])
66        self.dataset_name = self.escapeSpecialCharacters(self.isoDataModel.datasetName[0][0])
67       
68        #self.dataset_lastEdit = datasetLastEditUpdateDate   
69        self.dataset_lastEdit = self.isoDataModel.revisionDate[0][0]
70       
71        #for nominal start and end dates need to get extreme if multiple values in there
72        # - just create list of all values present, sort it by date then select min and max
73       
74       
75        #self.datasetStartNom = datasetStartDateNom
76        self.datasetStartNom = self.isoDataModel.boundingDates['start'] #dictionary method!
77       
78        #self.datasetEndNom = datasetEndDateNom
79        self.datasetEndNom = self.isoDataModel.boundingDates['end'] #dictionary method!
80               
81        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
82        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO
83
84        # get the dir of the file - needed by the xquery to use as the target collection
85        tmp = filename.split('/')
86        self._dir = '/'.join(tmp[0:len(tmp)-1])
87        self.shortFilename = tmp[-1]
88       
89        # dir to store a temp copy of the moles file, when produced - for use by other transforms
90        self._molesDir = None
91        # object to hold the moles file - this will be loaded in when it is created - in order to extract
92        # spatiotemporal data, etc
93        self.dgMeta = None
94
95        # firstly load contents of file
96        self.originalFormat = file(filename).read()
97       
98        # escape any apostrophes
99        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)
100
101        # initialise the various record fields
102        self.db_id = None    # the DB ID of the record, for easy reference when it is created
103        self.molesFormat = None
104        self.dcFormat = None
105        self.mdipFormat = None
106        self.iso19139Format = None
107        self.scn = 1    # system change number - keeps track of number of mods to a particular row
108       
109        # spatiotemporal data object
110        self.stData = None
111       
112        # fields to hold author, parameter and scope data
113        self.authors = None
114        self.parameters = None
115        self.scope = None
116       
117       
118
119    def escapeSpecialCharacters(self, inputString):
120        '''
121        Adjust the input string to escape any characters that would interfere with string or DB
122        operations
123        @param inputString: string to correct
124        @return: corrected string
125        '''
126        return re.sub(r'\'', '\\\'', inputString)
127
128
129    def unescapeSpecialCharacters(self, inputString):
130        '''
131        Adjust the input string to remove escaped characters that would interfere with string or DB
132        operations
133        @param inputString: string to correct
134        @return: corrected string
135        '''
136        str = re.sub(r'%20', ' ', inputString)
137        return 
138   
139   
140    def doRecordTransforms(self):
141        '''
142        Run various transforms on the original doc, to populate the record with
143        the other types of doc used elsewhere
144        '''
145        logging.info("Running transforms for all document types")
146        for docType in self.documentTypes:
147            self.getDocumentFormat(docType)
148           
149        logging.info("Transforms complete")
150
151
    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been ran and then produce this file - to allow for use in the various xqueries.
        Also parses the created file into self.dgMeta - used later to extract
        spatiotemporal, author and parameter data.
        @raise SystemError: if the created moles file cannot be parsed as XML
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        # run the moles transform first, if it has not already been done
        if self._molesFormat is None:
            self.doMolesTransform()

        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta=MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
178
179
    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the external saxon XQuery invocation fails
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly

        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType,dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff - substitute the real ids into the query text
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery" + xQueryType + ".xq"
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)

        # Now do the transform
        # NOTE(review): this PATH hard-codes a specific JDK install location -
        # confirm it matches the deployment host before reuse
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        # popen().close() returns None on success, an exit status otherwise
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        '''status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''

        logging.info("Transform completed successfully")

        return output
242
243
244    def doMolesTransform(self):
245        '''
246        Set up the basic moles doc - according to the type of document first ingested
247        '''
248        logging.info("Creating moles document - for use with other transforms")
249        xqName = None
250        if self.docType == "DIF":
251            xqName = "dif2moles"
252        elif self.docType == "MDIP":
253            xqName = "mdip2moles"
254        else:
255            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
256                     %self.docType
257
258        # now run the appropriate transform and set the attribute
259        setattr(self, "_molesFormat", self.doTransform(xqName))
260
261        # add keywords, if required
262        if self._datacentre_groups:
263            self.addKeywords()
264       
265        # escape any apostrophes
266        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)
267
268        logging.info("moles document created")
269       
270
271    def addKeywords(self):
272        '''
273        If datacentre groups have been specified, these need to be added as keywords
274        - NB, this is rather clumsy approach but uses old code to achieve the result
275        '''
276        logging.info("Adding datacentre keywords to moles file")
277
278        # NB, use temporary directories to do the keyword additions
279        tmpDir = os.getcwd() + "/tmp/"
280        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
281        FileUtilities.setUpDir(tmpDir)
282        FileUtilities.setUpDir(tmpKeywordsDir)
283        tmpFile = 'tmpFile.xml'
284        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)
285
286        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)
287
288        # Now load in the converted file
289        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
290        self._molesFormat = f.read()
291        f.close
292       
293        # Finally, tidy up temp dirs
294        FileUtilities.cleanDir(tmpDir)
295        FileUtilities.cleanDir(tmpKeywordsDir)
296        logging.info("Completed adding keywords")
297       
298
299    def getDocumentFormat(self, docType):
300        '''
301        Lookup document format; if it is already defined then return it, else do the required XQuery
302        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
303        @param docType: format of document to return
304        '''
305        logging.info("Retrieving document type, " + docType)
306        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
307        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]
308       
309        # check we have the moles format available; if not create it
310        if self._molesFormat is None:
311            self.doMolesTransform()
312            self.createMolesFile()
313       
314        # check the document isn't already defined
315        try:
316            doc = getattr(self, attributeName)
317            if doc is not None:
318                logging.info("Found existing document - returning this now")
319                return doc
320        except:
321            logging.info("Document not available - creating new transformed document")
322
323        # the doc type doesn't exist - so run the xquery
324        transformedDoc = self.doTransform(xqName)
325        setattr(self, attributeName, transformedDoc)
326        return transformedDoc
327       
328   
329    def getAllDocs(self):
330        '''
331        Return a list of all the available doc types in the record
332        '''
333        # if the stored docs array is the same size as the array of all doc types
334        # assume all transforms have been done - and just return these
335        if len(self._allDocs) == len(self.documentTypes):
336            return self._allDocs
337       
338        for docType in self.documentTypes:
339            self._allDocs.append([docType, self.getDocumentFormat(docType)])
340
341        return self._allDocs
342       
343   
344    def getTemporalData(self):
345        '''
346        Retrieves the temporal data for the record; if this hasn't been discovered yet,
347        do the necessary parsing
348        @return: TimeRange object array with temporal data
349        '''
350        if self.stData is None:
351            self.getSpatioTemporalData()
352       
353        return self.stData.getTemporalData()
354       
355   
356    def getSpatialData(self):
357        '''
358        Retrieves the spatial data for the record; if this hasn't been discovered yet,
359        do the necessary parsing
360        @return: Coords object array with spatial data
361        '''
362        if self.stData is None:
363            self.getSpatioTemporalData()
364       
365        return self.stData.getSpatialData()
366       
367
368    def listify(self, item):
369        '''
370        listify checks if an item is a list, if it isn't it puts it
371        inside a list and returns it. Always returns a list object.
372        @param item: object to check
373        @return: item as a list object
374        '''
375        if type(item) is list:
376            return item
377        else:
378            return [item]
379       
380   
    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the original document
        - populates self.stData as a side effect; returns nothing.
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # ensure the parsed moles doc is available
        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        # spatial and temporal coverage are handled independently - either may
        # be absent without preventing the other being extracted
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)
411
412   
    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        @return: space-separated concatenation of cited authors and creators
                 (also stored on self.authors); empty-ish string if none found
        '''
        logging.info('Retrieving authors info from moles file')

        # ensure the parsed moles doc is available
        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        # missing elements in the moles doc surface as exceptions - treat as "no data"
        except Exception, detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors
447   
448   
449    def getParametersInfo(self):
450        '''
451        Extract parameters info from the moles file
452        '''
453        logging.info('Retrieving parameters info from moles file')
454       
455        if self.dgMeta is None:
456            self.createMolesFile()
457           
458        params = ""
459        try:
460            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
461            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
462            parameters_list = self.listify(parameters)
463            for parameter in parameters_list:
464                if parameters.dgValidTerm:
465                    logging.info("Found parameter information - adding this to record")
466                    params += " " + parameters.dgValidTerm
467           
468           
469        except Exception, detail:
470            logging.info("Exception thrown whilst trying to find parameter information:")
471            logging.info(detail)
472            logging.info("- this suggests document does not contain parameter information.")
473       
474        self.parameters = params
475        return self.parameters
476   
477   
    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        @return: space-prefixed concatenation of scope terms, with underscores
                 replaced by 'UNDERSCORE' (also stored on self.scope)
        '''
        logging.info('Retrieving scope info from moles file')

        # ensure the parsed moles doc is available
        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            # only keywords whose vocab list comes from the NDG data provider
            # vocab count as scope values
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        # missing elements in the moles doc surface as exceptions - treat as "no data"
        except Exception, detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope
508           
509           
510    def getTimeRangeData(self, dgMeta):
511        '''
512        Parse an xml tree and add any time range data found
513        @param dgMeta: xml fragment for the time range
514        '''
515        logging.info("Extracting time range info")
516        try:
517            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
518           
519            if not dates:
520                logging.info("No temporal info found for document")
521               
522            dates_list = self.listify(dates)
523            for date in dates_list:
524                startdate=date.DateRangeStart
525                enddate= date.DateRangeEnd
526                if startdate==None or startdate=='None':
527                    startdate="null"
528                if enddate==None or enddate=='None':
529                    enddate="null"
530                   
531                self.stData.addTimeRange(startdate, enddate)
532                logging.info("Temporal info: startdate " + \
533                             startdate + ", enddate " + enddate) 
534        except Exception, detail:
535            logging.info("Document does not contain temporal info.")
536            logging.info(detail)
537
538       
    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData.
        @param dgMeta: moles document object holding the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list=self.listify(bboxes)
            #parse the list of coordinates
            for bbox in bbox_list:
                # parseCoord normalises values such as '10S' into signed floats
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        # deliberately best-effort: a bad bbox is logged as a warning rather
        # than aborting the whole ingest
        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)
567
568
569    def parseCoord(self, coordValue, minField, maxField):
570        '''
571        Take a coordinate value extracted from a molefile bbox limit - together with
572        the appropriate max/min limits and extract the correct value from it
573        @param coordValue: the contents of the bbox limit tage
574        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
575        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
576        @return: coord - the value of the coordinate as a string   
577        '''
578        logging.debug("Parsing document coordinates")
579        try:
580            coord = coordValue.strip()
581            if coord.endswith(maxField):
582                coord=coordValue.split(maxField)[0]
583            elif coord.endswith(minField):
584                if coord.startswith('-'):
585                    coord = coordValue.split(minField)[0]
586                else:
587                    coord = "-" + coordValue.split(minField)[0]
588   
589            return '%s' % float(coord)
590        except:
591            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
592
593           
594    def hasNullCoords():
595        '''
596        Checks a record to determine whether it has any coordinates set to null
597        '''
598        if str(self.west)=='null' or \
599            str(self.south)=='null' or \
600            str(self.east)=='null' or \
601            str(self.north)=='null':
602            return True;
603        else:
604            return False;
605       
Note: See TracBrowser for help on using the repository browser.