source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5415

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5415
Revision 5415, 23.8 KB checked in by sdonegan, 10 years ago (diff)

Support extra abstractdocumentingester.py functionality - deletions and updates

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re, pkg_resources
8import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10from ndg.common.src.lib.ndgresources import ndgResources
11import ndg.common.src.lib.fileutilities as FileUtilities
12from SpatioTemporalData import SpatioTemporalData
13import keywordAdder
14
15SAXON_JAR_FILE = 'lib/saxon9.jar'
16
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: if True, discovery_id is treated as an ndgObject
           URL and the local/repository ids are parsed out of it
    @param datacentre_groups: keyword groups to add to the moles doc (may be empty)
    @param datacentre_namespace: namespace/repository id of the datacentre
    @param discovery_id: discovery id of the record
    @param datasetName: name of the dataset
    @param datacentreName: display name of the datacentre
    @param datasetLastEditUpdateDate: last edit/update date of the dataset
    @param xq: xquery resources object - NB, no longer used directly (an
           ndgResources object is created in doTransform) but kept in the
           constructor signature for backward compatibility
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, datasetName, datacentreName, datasetLastEditUpdateDate, xq, docType):

        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        # simplify processing by uppercasing format at initialisation
        self.docType = docType.upper()

        self.dataset_name = datasetName
        self.dataset_lastEdit = datasetLastEditUpdateDate
        self.datacentre_name = datacentreName

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[:-1])
        self.shortFilename = tmp[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        # NB, use open() rather than the py2-only file() builtin, and make sure
        # the handle is closed again rather than leaked
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations - i.e. prefix each apostrophe with a backslash
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r"'", r"\'", inputString)

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations - currently just replaces '%20' escapes with spaces
        @param inputString: string to correct
        @return: corrected string
        '''
        # NB, previously the result was bound to a local (shadowing the builtin
        # str) and the method returned None - fixed to return the corrected string
        return re.sub(r'%20', ' ', inputString)

    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")

    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been ran and then produce this file - to allow for use in the various xqueries
        @raise SystemError: if the produced moles file cannot be parsed as XML
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        FileUtilities.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        molesFile = self._molesDir + self.shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception as detail:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the external saxon XQuery command fails
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        # NB, use targetDir rather than shadowing the builtin dir()
        targetDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        #xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery" + xQueryType + ".xq" 
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)

        # Now do the transform
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        # NB, os.popen().close() returns None on success, the exit status otherwise
        if status is not None:
            raise SystemError('Failed at running the XQuery')

        # now remove the temp xquery file
        #status = os.unlink(xqFile)
        #if status is not None:
        #    raise OSError('Failed to remove the temporary xquery file, ' + xqFile)

        logging.info("Transform completed successfully")

        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise TypeError: if no XQuery exists for the input doc type
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        FileUtilities.setUpDir(tmpDir)
        FileUtilities.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        # NB, previously 'f.close' was referenced without calling it, leaking
        # the file handle - ensure it is actually closed
        f = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = f.read()
        finally:
            f.close()

        # Finally, tidy up temp dirs
        FileUtilities.cleanDir(tmpDir)
        FileUtilities.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        # NB, only AttributeError can arise from the getattr call - don't mask
        # anything else with a bare except
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except AttributeError:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs

    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()

    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()

    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        else:
            return [item]

    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the original document
        '''
        logging.info('Retrieving spatiotemporal info from moles file')
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        if self.dgMeta is None:
            self.createMolesFile()

        # do quick checks to see if the relevant data exists
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)

    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        @return: space-joined string of cited authors and creators
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors

    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: space-joined string of the valid parameter terms found
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            parameters_list = self.listify(parameters)
            # NB, previously the loop body referenced 'parameters' (the whole
            # collection) instead of the loop variable - so only the collection
            # level term was ever inspected; fixed to use each 'parameter'
            for parameter in parameters_list:
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters

    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        @return: scope string, with underscores replaced by the literal
        'UNDERSCORE' to survive to_tsvector tokenisation
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            keywords_list = self.listify(keywords)
            for keyword in keywords_list:
                if keyword.dgValidTermID:
                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                        logging.info("Found scope value - adding this to record")
                        scope += " " + keyword.dgValidTerm.strip()

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope

    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                # normalise missing values to the string "null" for the DB
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate) 
        except Exception as detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)

    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list = self.listify(bboxes)
            #parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception as detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)

    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tage
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the limit cannot be interpreted as a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coordValue.split(maxField)[0]
            elif coord.endswith(minField):
                # min-field (S/W) values are negated, unless already negative
                if coord.startswith('-'):
                    coord = coordValue.split(minField)[0]
                else:
                    coord = "-" + coordValue.split(minField)[0]

            return '%s' % float(coord)
        except Exception:
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: ' + coordValue)

    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NB, previously this method was missing its self parameter, so any call
        on an instance raised a TypeError
        @return: True if any of the four coords is the string 'null'
        '''
        if str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null':
            return True
        else:
            return False
582       
Note: See TracBrowser for help on using the repository browser.