source: TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresRecord.py @ 4888

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresRecord.py@4888
Revision 4888, 23.8 KB checked in by sdonegan, 12 years ago (diff)

Create tagged release of stable developed version for ingestion and ingestion reporting. Note - this is pre-Calum's restructuring of ndgUtils and update to use the atom feed. Will need to merge the two at some stage.

Line 
1#!/usr/bin/env python
2'''
3Class representing the a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import csml.csml2Moles.molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21import keywordAdder
22
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: name of file to use as the metadata record
    @param ndg_dataprovider: whether the datacentre is an NDG data provider
    @param datacentre_groups: keyword groups to add to the moles document
    @param datacentre_namespace: repository namespace of the datacentre
    @param discovery_id: discovery ID of the record
    @param xq: xquery lookup object used to fetch the transform queries
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139','MDIP']
   
    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"
40    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
41        logging.info("Setting up Postgres record for file, " + filename)
42        self.filename = filename
43   
44        # NB, if we're dealing with an NDG data provider, the details are slightly different
45        if ndg_dataprovider:
46            discObj=ndgObject(discovery_id)
47            self._local_id = discObj.localID
48            self._repository_local_id = discObj.repository
49        else:
50            self._local_id = discovery_id
51            self._repository_local_id = datacentre_namespace
52           
53        self._datacentre_groups = datacentre_groups
54        self._repository = datacentre_namespace
55        self.discovery_id = discovery_id
56        self._xq = xq
57        self.docType = docType
58
59        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
60        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO
61
62        self._fileUtils = FileUtilities()
63
64        # get the dir of the file - needed by the xquery to use as the target collection
65        tmp = filename.split('/')
66        self._dir = '/'.join(tmp[0:len(tmp)-1])
67        self.shortFilename = tmp[len(tmp)-1]
68       
69        # dir to store a temp copy of the moles file, when produced - for use by other transforms
70        self._molesDir = None
71        # object to hold the moles file - this will be loaded in when it is created - in order to extract
72        # spatiotemporal data, etc
73        self.dgMeta = None
74
75        # firstly load contents of file
76        self.originalFormat = file(filename).read()
77       
78        # escape any apostrophes
79        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)
80
81        # initialise the various record fields
82        self.db_id = None    # the DB ID of the record, for easy reference when it is created
83        self.molesFormat = None
84        self.dcFormat = None
85        self.mdipFormat = None
86        self.iso19139Format = None
87        self.scn = 1    # system change number - keeps track of number of mods to a particular row
88       
89        # spatiotemporal data object
90        self.stData = None
91       
92        # fields to hold author, parameter and scope data
93        self.authors = None
94        self.parameters = None
95        self.scope = None
96
97    def escapeSpecialCharacters(self, inputString):
98        '''
99        Adjust the input string to escape any characters that would interfere with string or DB
100        operations
101        @param inputString: string to correct
102        @return: corrected string
103        '''
104        return re.sub(r'\'', '\\\'', inputString)
105
106
107    def unescapeSpecialCharacters(self, inputString):
108        '''
109        Adjust the input string to remove escaped characters that would interfere with string or DB
110        operations
111        @param inputString: string to correct
112        @return: corrected string
113        '''
114        str = re.sub(r'%20', ' ', inputString)
115        return 
116   
117   
118    def doRecordTransforms(self):
119        '''
120        Run various transforms on the original doc, to populate the record with
121        the other types of doc used elsewhere
122        '''
123        logging.info("Running transforms for all document types")
124        for docType in self.documentTypes:
125            self.getDocumentFormat(docType)
126           
127        logging.info("Transforms complete")
128
129
    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been ran and then produce this file - to allow for use in the various xqueries
        - NB, as a side effect this also parses the moles file into self.dgMeta,
        for use by the spatiotemporal/author/parameter extraction methods
        @raise SystemError: if the produced moles file cannot be parsed as XML
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        # only run the moles transform if it hasn't already been done
        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta=MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
157
    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the external java/saxon XQuery command fails
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        #xqFile = "currentQuery.xq"
        xqFile="currentQuery_" +xQueryType + ".xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform - NB, requires saxon9.jar in ./lib and a java
        # runtime available on the PATH set up here
        #os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_06/bin:/usr/java/jdk1.5.0_06:/usr/java/jdk1.5.0_06/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        # redirect stderr into stdout so any java error detail appears in 'output'
        pipe = os.popen(xqCommand + " 2>&1")

        output = pipe.read()
        # NB, popen close() returns None on success, or the exit status on failure
        status = pipe.close()

        #for debug - write output to a local file for checking
        #xqOpLocalFile= xQueryType + "_results.xml"
        #print "++++++++++++++++++++++++++++++++++++++++++++++SJD: Printing xq op to local file: " + xqOpLocalFile
        #self._fileUtils.createFile(xqOpLocalFile,output)
        #file=open(xqOpLocalFile)
        #file.writelines(output)
        #file.close

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        #status = os.unlink(xqFile)
        #if status is not None:
         #   raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile

        logging.info("Transform completed successfully")

        return output
222
223
224    def doMolesTransform(self):
225        '''
226        Set up the basic moles doc - according to the type of document first ingested
227        '''
228        logging.info("Creating moles document - for use with other transforms")
229        xqName = None
230        if self.docType == "DIF":
231            xqName = "dif2moles"
232        elif self.docType == "MDIP":
233            xqName = "mdip2moles"
234        else:
235            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
236                     %self.docType
237
238        # now run the appropriate transform and set the attribute
239        setattr(self, "_molesFormat", self.doTransform(xqName))
240
241        # add keywords, if required
242        if self._datacentre_groups != "":
243            self.addKeywords()
244       
245        # escape any apostrophes
246        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)
247
248        logging.info("moles document created")
249       
250
251    def addKeywords(self):
252        '''
253        If datacentre groups have been specified, these need to be added as keywords
254        - NB, this is rather clumsy approach but uses old code to achieve the result
255        '''
256        logging.info("Adding datacentre keywords to moles file")
257
258        # NB, use temporary directories to do the keyword additions
259        tmpDir = os.getcwd() + "/tmp/"
260        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
261        self._fileUtils.setUpDir(tmpDir)
262        self._fileUtils.setUpDir(tmpKeywordsDir)
263        tmpFile = 'tmpFile.xml'
264        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)
265
266        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)
267
268        # Now load in the converted file
269        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
270        self._molesFormat = f.read()
271        f.close
272       
273        # Finally, tidy up temp dirs
274        self._fileUtils.cleanDir(tmpDir)
275        self._fileUtils.cleanDir(tmpKeywordsDir)
276        logging.info("Completed adding keywords")
277       
278
279    def getDocumentFormat(self, docType):
280        '''
281        Lookup document format; if it is already defined then return it, else do the required XQuery
282        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
283        @param docType: format of document to return
284        '''
285        logging.info("Retrieving document type, " + docType)
286        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
287        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]
288       
289        # check we have the moles format available; if not create it
290        if self._molesFormat is None:
291            self.doMolesTransform()
292            self.createMolesFile()
293       
294        # check the document isn't already defined
295        try:
296            doc = getattr(self, attributeName)
297            if doc is not None:
298                logging.info("Found existing document - returning this now")
299                return doc
300        except:
301            logging.info("Document not available - creating new transformed document")
302
303        # the doc type doesn't exist - so run the xquery
304        transformedDoc = self.doTransform(xqName)
305        setattr(self, attributeName, transformedDoc)
306        return transformedDoc
307       
308   
309    def getAllDocs(self):
310        '''
311        Return a list of all the available doc types in the record
312        '''
313        # if the stored docs array is the same size as the array of all doc types
314        # assume all transforms have been done - and just return these
315        if len(self._allDocs) == len(self.documentTypes):
316            return self._allDocs
317       
318        for docType in self.documentTypes:
319            self._allDocs.append([docType, self.getDocumentFormat(docType)])
320
321        return self._allDocs
322       
323   
324    def getTemporalData(self):
325        '''
326        Retrieves the temporal data for the record; if this hasn't been discovered yet,
327        do the necessary parsing
328        @return: TimeRange object array with temporal data
329        '''
330       
331        if self.stData is None:
332           
333            self.getSpatioTemporalData()
334 
335        return self.stData.getTemporalData()
336       
337   
338    def getSpatialData(self):
339        '''
340        Retrieves the spatial data for the record; if this hasn't been discovered yet,
341        do the necessary parsing
342        @return: Coords object array with spatial data
343        '''
344        if self.stData is None:
345            self.getSpatioTemporalData()
346       
347        return self.stData.getSpatialData()
348       
349
350    def listify(self, item):
351        '''
352        listify checks if an item is a list, if it isn't it puts it
353        inside a list and returns it. Always returns a list object.
354        @param item: object to check
355        @return: item as a list object
356        '''
357        if type(item) is list:
358            return item
359        else:
360            return [item]
361       
362   
363    def getSpatioTemporalData(self):
364        '''
365        Extract spatio temporal data from the original document
366        '''
367       
368       
369        logging.info('Retrieving spatiotemporal info from moles file')
370        # initialise the various spatiotemporal arrays used to extract data to
371        self.stData = SpatioTemporalData()
372       
373        logging.info('ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg : ' + self.stData)
374       
375        if self.dgMeta is None:
376            self.createMolesFile()
377           
378        # do quick checks to see if the relevant data exists
379        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
380            logging.info("No data summary elements found - assuming no spatiotemporal data available")
381            return
382       
383        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
384            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
385            return
386       
387        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
388            logging.info("No spatial coverage elements found - assuming no spatial data available")
389        else:
390            self.getCoordData(self.dgMeta)
391
392       
393
394        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
395        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
396           
397            logging.info("No temporal coverage elements found - assuming no temporal data available")
398        else:
399           
400            self.getTimeRangeData(self.dgMeta)
401
402   
    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        - combines the cited authors and the creator local identifier
        @return: authors string (also stored on self.authors)
        '''
        logging.info('Retrieving authors info from moles file')

        # the moles doc must be parsed (self.dgMeta) before data can be extracted
        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors
437   
438   
439    def getParametersInfo(self):
440        '''
441        Extract parameters info from the moles file
442        '''
443        logging.info('Retrieving parameters info from moles file')
444       
445        if self.dgMeta is None:
446            self.createMolesFile()
447           
448        params = ""
449        try:
450            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
451            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
452            parameters_list = self.listify(parameters)
453            for parameter in parameters_list:
454                if parameters.dgValidTerm:
455                    logging.info("Found parameter information - adding this to record")
456                    params += " " + parameters.dgValidTerm
457           
458           
459        except Exception, detail:
460            logging.info("Exception thrown whilst trying to find parameter information:")
461            logging.info(detail)
462            logging.info("- this suggests document does not contain parameter information.")
463       
464        self.parameters = params
465        return self.parameters
466   
467   
    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        - scope values are keywords whose vocab term ID sits under ndg_data_provider_vocab
        @return: space separated scope string (also stored on self.scope)
        '''
        logging.info('Retrieving scope info from moles file')

        # the moles doc must be parsed (self.dgMeta) before data can be extracted
        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    # only terms from the NDG data provider vocab count as scope values
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        except Exception, detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope
498           
499           
    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found to self.stData
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            # NB, no early return here - if dates is None the attribute access in
            # the loop below raises and is handled by the except clause
            if not dates:
                logging.info("No temporal info found for document")

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate=date.DateRangeStart
                enddate= date.DateRangeEnd
                # normalise missing dates to the string "null"
                if startdate==None or startdate=='None':
                    startdate="null"
                if enddate==None or enddate=='None':
                    enddate="null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate) 
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)
527
528       
    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list=self.listify(bboxes)
            #parse the list of coordinates
            for bbox in bbox_list:
                # convert each limit into a signed numeric string via parseCoord
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)
558
559
560    def parseCoord(self, coordValue, minField, maxField):
561        '''
562        Take a coordinate value extracted from a molefile bbox limit - together with
563        the appropriate max/min limits and extract the correct value from it
564        @param coordValue: the contents of the bbox limit tage
565        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
566        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
567        @return: coord - the value of the coordinate as a string   
568        '''
569        logging.debug("Parsing document coordinates")
570        try:
571            coord = coordValue.strip()
572            if coord.endswith(maxField):
573                coord=coordValue.split(maxField)[0]
574            elif coord.endswith(minField):
575                if coord.startswith('-'):
576                    coord = coordValue.split(minField)[0]
577                else:
578                    coord = "-" + coordValue.split(minField)[0]
579   
580            return '%s' % float(coord)
581        except:
582            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
583
584           
585    def hasNullCoords():
586        '''
587        Checks a record to determine whether it has any coordinates set to null
588        '''
589        if str(self.west)=='null' or \
590            str(self.south)=='null' or \
591            str(self.east)=='null' or \
592            str(self.north)=='null':
593            return True;
594        else:
595            return False;
596       
Note: See TracBrowser for help on using the repository browser.