source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 4854

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@4854
Revision 4854, 22.7 KB checked in by cbyrom, 11 years ago (diff)

Add new ingest script - to allow ingest of DIF docs from an eXist hosted
atom feed. NB, this required a restructure of the original OAI harvester
to allow re-use of shared code - by abstracting this out into a new class,
abstractdocumentingester.

Add new documentation and tidy up codebase removing dependencies where possible to simplify things.

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re
8#import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10import ndg.common.src.lib.fileutilities as FileUtilities
11from SpatioTemporalData import SpatioTemporalData
12import keywordAdder
13
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table.
    The record holds the original document together with the versions of it
    produced by running XQuery transforms (see documentTypes), plus the
    spatiotemporal, author, parameter and scope data parsed from the moles
    version of the document.

    NB, the code is deliberately written in syntax that is valid on both
    python 2 and python 3 - hence sys.exc_info() in the exception handlers
    and try/finally rather than 'with'.

    @param filename: Name of file to use a metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed as an NDG object ID
    to get the local and repository IDs
    @param datacentre_groups: groups to add to the moles doc as keywords (optional)
    @param datacentre_namespace: namespace of the datacentre providing the doc
    @param discovery_id: ID of the document
    @param xq: object providing the XQuery text, via its actual() method
    @param docType - type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139', 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    # lookups, keyed on doc type: the xquery producing that doc type and the
    # attribute of this record that caches the result
    _xqNames = {'DIF': 'moles2dif', 'MOLES': 'moles', 'DC': 'moles2DC',
                'MDIP': 'moles2mdip', 'ISO19139': 'moles2iso19139'}
    _attributeNames = {'DIF': '_difFormat', 'MOLES': '_molesFormat', 'DC': '_dcFormat',
                       'MDIP': '_mdipFormat', 'ISO19139': '_iso19139Format'}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups,
                 datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, %s", filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        pathBits = filename.split('/')
        self._dir = '/'.join(pathBits[:-1])
        self.shortFilename = pathBits[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file - closing the handle explicitly rather
        # than leaving it to the garbage collector
        inputFile = open(filename)
        try:
            contents = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(contents)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations - i.e. put a backslash in front of every apostrophe
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace("'", "\\'")

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations - currently this turns every '%20' into a space
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace('%20', ' ')

    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")

    def createMolesFile(self):
        '''
        Write the moles version of the doc to the 'moles' subdir of the dir holding
        the original file - running the moles transform first, if it has not been ran
        yet - to allow for use in the various xqueries.  The file is then parsed
        into self.dgMeta
        @raise SystemError: if the moles file cannot be parsed
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        molesFile = self._molesDir + self.shortFilename
        FileUtilities.createFile(molesFile, self._molesFormat)
        logging.info("Moles file created - at %s", self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        logging.info('Retrieving spatiotemporal info from moles file, %s', molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        # NOTE(review): MRW is not imported by this module - the import of
        # csml.csml2Moles.molesReadWrite is commented out - so, unless the name is
        # made available some other way, this line raises a NameError.  Confirm
        # whether the import should be restored
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s'
                              % (molesFile, sys.exc_info()[1]))

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command reports a failure
        '''
        logging.info("Running XQuery transform, %s to create transformed document", xQueryType)

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        FileUtilities.createFile(xqFile, xquery)

        try:
            # Now do the transform
            os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: %s", xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # NB, close() returns None if the command exited successfully
            status = pipe.close()
        finally:
            # remove the temp xquery file whether or not the transform worked;
            # os.unlink raises OSError itself if the removal fails
            if os.path.exists(xqFile):
                os.unlink(xqFile)

        if status is not None:
            # stderr is redirected to stdout, so the output holds the reason for the failure
            logging.error("XQuery transform, %s, failed. Output:\n%s", xQueryType, output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")
        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise TypeError: if there is no transform from the input doc type to moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format"
                            % self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        FileUtilities.setUpDir(tmpDir)
        FileUtilities.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file - making sure it is closed again
        convertedFile = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = convertedFile.read()
        finally:
            convertedFile.close()

        # Finally, tidy up temp dirs
        FileUtilities.cleanDir(tmpDir)
        FileUtilities.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not one of the known document types
        '''
        logging.info("Retrieving document type, %s", docType)
        xqName = self._xqNames[docType]
        attributeName = self._attributeNames[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined - NB, the attribute itself
        # may not have been set yet, hence the default
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one per entry in documentTypes
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        # build the list up separately and only store it once complete - so a
        # transform failing part way through cannot leave duplicates behind
        allDocs = []
        for docType in self.documentTypes:
            allDocs.append([docType, self.getDocumentFormat(docType)])

        self._allDocs = allDocs
        return self._allDocs

    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()

    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()

    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item

        return [item]

    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store this in self.stData
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        summary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if coverage.dgSpatialCoverage:
            self.getCoordData(self.dgMeta)
        else:
            logging.info("No spatial coverage elements found - assuming no spatial data available")

        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if coverage.dgTemporalCoverage:
            self.getTimeRangeData(self.dgMeta)
        else:
            logging.info("No temporal coverage elements found - assuming no temporal data available")

    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        @return: string with the cited authors followed by the creators
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain cited author information.")

        # NB, guard against elements that are present but empty
        self.authors = (authors or "") + " " + (creators or "")
        return self.authors

    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: string with the valid term of each parameter found
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            for parameter in self.listify(parameters):
                # NB, look at each parameter in turn - not at the list as a whole
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters

    def getScopeInfo(self):
        '''
        Extract scope info from the moles file - i.e. the valid terms of the
        keywords whose parent list starts with ndg_data_provider_vocab
        @return: string with the scope values, with underscores replaced by 'UNDERSCORE'
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            for keyword in self.listify(keywords):
                if not keyword.dgValidTermID:
                    continue

                if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                    logging.info("Found scope value - adding this to record")
                    scope += " " + keyword.dgValidTerm.strip()

        except Exception:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = scope.replace('_', 'UNDERSCORE')
        return self.scope

    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found to self.stData.
        NB, problems are logged rather than raised
        @param dgMeta: object holding the parsed moles document
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception:
            logging.info("Document does not contain temporal info.")
            logging.info(sys.exc_info()[1])

    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData.
        NB, problems are logged as warnings rather than raised
        @param dgMeta: object holding the parsed moles document
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" % sys.exc_info()[1])

    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it.
        A value ending in the max field is positive; a value ending in the min
        field is made negative, e.g. '10N' -> '10.0' and '10S' -> '-10.0'
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be turned into a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - so that surrounding
            # whitespace cannot end up between the sign and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord[:-len(maxField)]
            elif coord.endswith(minField):
                coord = coord[:-len(minField)]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except (AttributeError, TypeError, ValueError):
            # NB, use string formatting here as coordValue may not be a string
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s'
                              % (coordValue,))

    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NOTE(review): nothing in this class sets the west/south/east/north
        attributes read here - confirm where these are expected to come from
        @return: True if any of the four coordinates is 'null', False otherwise
        '''
        limits = (self.west, self.south, self.east, self.north)
        for limit in limits:
            if str(limit) == 'null':
                return True

        return False
563       
Note: See TracBrowser for help on using the repository browser.