source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 4257

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@4257
Revision 4257, 23.0 KB checked in by cbyrom, 12 years ago (diff)

Fix handling of ndg hosted data - properly reading config settings from file
in ingest + improve use of default value checking.

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import csml.csml2Moles.molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21import keywordAdder
22
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table.

    On construction the original metadata file is read and stored (with
    apostrophes escaped); the other document formats are produced on demand by
    running XQuery transforms via saxon, and the generated moles document is
    parsed to extract spatiotemporal, author, parameter and scope data.

    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: if true, the local and repository IDs are taken
    from the parsed discovery_id rather than from the raw arguments
    @param datacentre_groups: if set, these are added as keywords to the moles doc
    @param datacentre_namespace: namespace (repository) of the data centre
    @param discovery_id: discovery ID of the record
    @param xq: object providing the XQuery text, via its actual() method
    @param docType: type of doc to process - NB, only 'DIF' and 'MDIP' can be
    transformed into moles format
    '''
    # Formats produced for every record.
    # TODO MDIP transforms do not work very well for lots of files - NB, 'MDIP' is
    # nevertheless still included in this list, so these are not currently hidden
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139', 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    # for each doc type: (name of the xquery producing it, attribute caching the result)
    _formatLookup = {
        'DIF': ('moles2dif', '_difFormat'),
        'MOLES': ('moles', '_molesFormat'),
        'DC': ('moles2DC', '_dcFormat'),
        'MDIP': ('moles2mdip', '_mdipFormat'),
        'ISO19139': ('moles2iso19139', '_iso19139Format'),
    }

    def __init__(self, filename, ndg_dataprovider, datacentre_groups,
                 datacentre_namespace, discovery_id, xq, docType):
        '''
        Set up the record and load the contents of the original file
        - see the class documentation for a description of the parameters
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        # caches for the transformed docs - initialised here so that they can always
        # be read directly, without needing to trap missing attributes
        self._molesFormat = None
        self._difFormat = None
        self._dcFormat = None
        self._mdipFormat = None
        self._iso19139Format = None
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        # NB, everything before the last '/' is the dir; the remainder is the short name
        self._dir, _separator, self.shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file - closing the file handle once read
        with open(filename) as sourceFile:
            rawContents = sourceFile.read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(rawContents)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        # NB, the four fields below are not used by any method of this class; they are
        # retained as other code may reference them - TODO confirm whether still needed
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations - currently this puts a backslash in front of every apostrophe
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace("'", "\\'")

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations - currently this turns every '%20' back into a space
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace('%20', ' ')

    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")

    def createMolesFile(self):
        '''
        Write the moles format of the record to a 'moles' sub-directory of the original file's
        directory - to allow for use in the various xqueries - running the moles transform
        first if this has not been done yet.  The file is then parsed and stored in
        self.dgMeta, for use when extracting spatiotemporal, author and parameter data.
        @raise SystemError: if the moles file cannot be parsed
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        molesFile = self._molesDir + self.shortFilename
        self._fileUtils.createFile(molesFile, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        # NB, only store the object once it has parsed ok - otherwise the 'dgMeta is None'
        # checks elsewhere would treat a failed parse as a loaded document
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception as detail:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

        self.dgMeta = dgMeta

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command does not complete successfully
        @raise OSError: if the temporary xquery file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if 'moles2' in xQueryType:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        try:
            # Now do the transform
            # NB(review): the PATH below is hard coded for one particular installation
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # close() returns None when the command exited with a zero status
            status = pipe.close()

            if status is not None:
                # stderr was redirected into the output - so log it to show why the command failed
                logging.error("Saxon command failed with output:\n%s", output)
                raise SystemError('Failed at running the XQuery')
        finally:
            # now remove the temp xquery file - whether or not the transform worked
            # NB, os.unlink raises an OSError itself if the file cannot be removed
            os.unlink(xqFile)

        logging.info("Transform completed successfully")

        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        - adding datacentre keywords, if any groups were specified, and escaping apostrophes
        @raise TypeError: if there is no xquery to turn the input doc type into moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result:
        the moles doc is written to a temp dir, processed by keywordAdder and the
        converted file then read back in to replace the moles doc
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        workingDir = os.getcwd()
        tmpDir = workingDir + "/tmp/"
        tmpKeywordsDir = workingDir + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'

        try:
            self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

            keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

            # Now load in the converted file - ensuring it gets closed afterwards
            with open(tmpKeywordsDir + tmpFile, 'r') as convertedFile:
                self._molesFormat = convertedFile.read()
        finally:
            # Finally, tidy up temp dirs - even if the keyword addition failed
            self._fileUtils.cleanDir(tmpDir)
            self._fileUtils.cleanDir(tmpKeywordsDir)

        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return - one of the documentTypes values
        @return: the document in the requested format
        @raise KeyError: if docType is not a recognised document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName, attributeName = self._formatLookup[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        # NB, the default covers instances on which the cache attribute was never set up
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each of the documentTypes
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        # NB, build the complete list before storing it - so a transform failing part way
        # through cannot leave a partial list behind to be added to again on the next call
        allDocs = [[docType, self.getDocumentFormat(docType)] for docType in self.documentTypes]
        self._allDocs = allDocs
        return self._allDocs

    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: the result of getTemporalData() on the spatiotemporal data object
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()

    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: the result of getSpatialData() on the spatiotemporal data object
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()

    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item

        return [item]

    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles document - creating and loading
        the moles file first, if this has not been done yet - and store this in self.stData
        '''
        logging.info('Retrieving spatiotemporal info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        # initialise the various spatiotemporal arrays used to extract data to
        # NB, done after loading the moles file, so a failure there does not leave an
        # empty object behind that would look like already parsed data
        self.stData = SpatioTemporalData()

        # do quick checks to see if the relevant data exists
        dataSummary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not dataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        dataCoverage = dataSummary.dgDataCoverage
        if not dataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not dataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not dataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)

    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file - both the cited authors and the data creator
        @return: string with the cited authors followed by the creators, separated by a space
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        # NB, treat an element that is present but has no value as an empty string
        self.authors = (authors or "") + " " + (creators or "")
        return self.authors

    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: string with the valid term of each parameter found, each preceded by a space
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            for parameter in self.listify(parameters):
                # NB, check each individual parameter - not the collection they came from
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters

    def getScopeInfo(self):
        '''
        Extract scope info from the moles file - i.e. the valid terms of the structured
        keywords whose parent list belongs to the NDG data provider vocab
        @return: string with the scope values found, each preceded by a space and with
        underscores replaced by 'UNDERSCORE'
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            for keyword in self.listify(keywords):
                if not keyword.dgValidTermID:
                    continue

                if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                    logging.info("Found scope value - adding this to record")
                    scope += " " + keyword.dgValidTerm.strip()

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = scope.replace('_', 'UNDERSCORE')
        return self.scope

    def getTimeRangeData(self, dgMeta):
        '''
        Parse the moles object and add any time range data found to self.stData
        - NB, missing start or end dates are stored as the string 'null'
        @param dgMeta: moles object holding the temporal coverage data
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                # NB, let logging do the formatting - so that a non string date cannot
                # abort the loop after the time range has already been added
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception as detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)

    def getCoordData(self, dgMeta):
        '''
        Parse the moles object and add any coord data found to self.stData
        - NB, problems with the data are logged as a warning, rather than raised
        @param dgMeta: moles object holding the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception as detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n"
                            "to an incomplete set of metadata being ingested. \nDetail: %s", detail)

    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        - e.g. '10N' gives '10.0' whilst both '10S' and '-10S' give '-10.0'
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a coordinate
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - so surrounding whitespace
            # cannot end up between the sign and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                coord = coord.split(minField)[0]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except Exception:
            # NB, format the value rather than concatenating it - it may not be a string
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % (coordValue,))

    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        - NB, the west, south, east and north attributes are not set up anywhere in this
        class; presumably they are assigned by the calling code - TODO confirm
        @return: True if any of the coords has the string value 'null', False otherwise
        '''
        for coord in (self.west, self.south, self.east, self.north):
            if str(coord) == 'null':
                return True

        return False
575       
Note: See TracBrowser for help on using the repository browser.