source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5252

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5326
Revision 5252, 23.5 KB checked in by cbyrom, 11 years ago (diff)

Simplify error handling, improve output logging + standardise use of
upper case doc formats + switch off MDIP again since this mostly
breaks things.

Line 
1#!/usr/bin/env python
2'''
3Class representing the a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re, pkg_resources
8import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10from ndg.common.src.lib.ndgresources import ndgResources
11import ndg.common.src.lib.fileutilities as FileUtilities
12from SpatioTemporalData import SpatioTemporalData
13import keywordAdder
14
15SAXON_JAR_FILE = 'lib/saxon9.jar'
16
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: if True, discovery_id is an NDG URI and is parsed with ndgObject
    @param datacentre_groups: keyword groups to add to the moles doc (may be empty)
    @param datacentre_namespace: namespace of the datacentre - used as the repository id
    @param discovery_id: discovery ID of the record
    @param xq: xquery lib object (kept for compatibility - transforms now use ndgResources)
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, \
                 datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        # simplify processing by uppercasing format at initialisation
        self.docType = docType.upper()

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        self._dir, self.shortFilename = os.path.split(filename)

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        # NB, use open() rather than the deprecated file() builtin, and close
        # the handle explicitly rather than leaking it
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None
90
91    def escapeSpecialCharacters(self, inputString):
92        '''
93        Adjust the input string to escape any characters that would interfere with string or DB
94        operations
95        @param inputString: string to correct
96        @return: corrected string
97        '''
98        return re.sub(r'\'', '\\\'', inputString)
99
100
101    def unescapeSpecialCharacters(self, inputString):
102        '''
103        Adjust the input string to remove escaped characters that would interfere with string or DB
104        operations
105        @param inputString: string to correct
106        @return: corrected string
107        '''
108        str = re.sub(r'%20', ' ', inputString)
109        return 
110   
111   
112    def doRecordTransforms(self):
113        '''
114        Run various transforms on the original doc, to populate the record with
115        the other types of doc used elsewhere
116        '''
117        logging.info("Running transforms for all document types")
118        for docType in self.documentTypes:
119            self.getDocumentFormat(docType)
120           
121        logging.info("Transforms complete")
122
123
124    def createMolesFile(self):
125        '''
126        Check if a moles file exists on the system; if not, assume the moles transform has not
127        been ran and then produce this file - to allow for use in the various xqueries
128        '''
129        logging.info("Creating moles file on system - for use with other xquery transforms")
130        self._molesDir = self._dir + "/moles/"
131        FileUtilities.setUpDir(self._molesDir)
132       
133        if self._molesFormat is None:
134            self.doMolesTransform()
135           
136        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
137        logging.info("Moles file created - at %s" %self._molesDir)
138       
139        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on       
140        molesFile = self._molesDir + self.shortFilename
141        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)
142       
143        # load in the moles file and put this into an object for direct access to the xml elements
144       
145        self.dgMeta=MRW.dgMetadata()
146        try:
147            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
148        except Exception, detail:
149            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
150
151
152    def doTransform(self, xQueryType):
153        '''
154        Transform the record according to the specified XQuery type
155        @param xQueryType: XQuery doc to use to do the transform
156        @return: the metadata record in the required transformed format
157        '''
158        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")
159
160        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
161        # moles file available for the transform - and use the correct dir for the xquery collection
162        dir = self._dir
163        if xQueryType.find('moles2') > -1:
164            if self._molesDir is None:
165                self.createMolesFile()
166               
167            dir = self._molesDir
168           
169        # get the query and set this up to use properly
170       
171        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
172        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
173        self.xqueryLib = ndgResources()       
174        xquery = self.xqueryLib.createXQuery(xQueryType,dir, self._repository_local_id, self._local_id)
175     
176        # sort out the input ID stuff
177        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
178        xquery=xquery.replace('repository_localid', self._repository)
179
180        # strip out the eXist reference to the libraries; these files should be available in the
181        # running dir - as set up by oai_ingest.py
182        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
183        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')
184
185        # write the query to file, to make it easier to input
186        # NB, running directly at the command line leads to problems with the interpretation of $ characters
187        xqFile = "currentQuery" + xQueryType + ".xq" 
188        FileUtilities.createFile(xqFile, xquery)
189       
190        # ensure the jar file is available - NB, this may be running from a different
191        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
192        # saxon file directly on its filesystem
193        jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)
194
195        # Now do the transform
196        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
197        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
198        logging.debug("Running saxon command: " + xqCommand)
199        pipe = os.popen(xqCommand + " 2>&1")
200        output = pipe.read()
201        status = pipe.close()
202
203        if status is not None:
204            raise SystemError, 'Failed at running the XQuery'
205
206        # now remove the temp xquery file
207        '''status = os.unlink(xqFile)
208        if status is not None:
209            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''
210       
211        logging.info("Transform completed successfully")
212
213        return output
214
215
216    def doMolesTransform(self):
217        '''
218        Set up the basic moles doc - according to the type of document first ingested
219        '''
220        logging.info("Creating moles document - for use with other transforms")
221        xqName = None
222        if self.docType == "DIF":
223            xqName = "dif2moles"
224        elif self.docType == "MDIP":
225            xqName = "mdip2moles"
226        else:
227            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
228                     %self.docType
229
230        # now run the appropriate transform and set the attribute
231        setattr(self, "_molesFormat", self.doTransform(xqName))
232
233        # add keywords, if required
234        if self._datacentre_groups:
235            self.addKeywords()
236       
237        # escape any apostrophes
238        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)
239
240        logging.info("moles document created")
241       
242
243    def addKeywords(self):
244        '''
245        If datacentre groups have been specified, these need to be added as keywords
246        - NB, this is rather clumsy approach but uses old code to achieve the result
247        '''
248        logging.info("Adding datacentre keywords to moles file")
249
250        # NB, use temporary directories to do the keyword additions
251        tmpDir = os.getcwd() + "/tmp/"
252        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
253        FileUtilities.setUpDir(tmpDir)
254        FileUtilities.setUpDir(tmpKeywordsDir)
255        tmpFile = 'tmpFile.xml'
256        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)
257
258        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)
259
260        # Now load in the converted file
261        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
262        self._molesFormat = f.read()
263        f.close
264       
265        # Finally, tidy up temp dirs
266        FileUtilities.cleanDir(tmpDir)
267        FileUtilities.cleanDir(tmpKeywordsDir)
268        logging.info("Completed adding keywords")
269       
270
271    def getDocumentFormat(self, docType):
272        '''
273        Lookup document format; if it is already defined then return it, else do the required XQuery
274        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
275        @param docType: format of document to return
276        '''
277        logging.info("Retrieving document type, " + docType)
278        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
279        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]
280       
281        # check we have the moles format available; if not create it
282        if self._molesFormat is None:
283            self.doMolesTransform()
284            self.createMolesFile()
285       
286        # check the document isn't already defined
287        try:
288            doc = getattr(self, attributeName)
289            if doc is not None:
290                logging.info("Found existing document - returning this now")
291                return doc
292        except:
293            logging.info("Document not available - creating new transformed document")
294
295        # the doc type doesn't exist - so run the xquery
296        transformedDoc = self.doTransform(xqName)
297        setattr(self, attributeName, transformedDoc)
298        return transformedDoc
299       
300   
301    def getAllDocs(self):
302        '''
303        Return a list of all the available doc types in the record
304        '''
305        # if the stored docs array is the same size as the array of all doc types
306        # assume all transforms have been done - and just return these
307        if len(self._allDocs) == len(self.documentTypes):
308            return self._allDocs
309       
310        for docType in self.documentTypes:
311            self._allDocs.append([docType, self.getDocumentFormat(docType)])
312
313        return self._allDocs
314       
315   
316    def getTemporalData(self):
317        '''
318        Retrieves the temporal data for the record; if this hasn't been discovered yet,
319        do the necessary parsing
320        @return: TimeRange object array with temporal data
321        '''
322        if self.stData is None:
323            self.getSpatioTemporalData()
324       
325        return self.stData.getTemporalData()
326       
327   
328    def getSpatialData(self):
329        '''
330        Retrieves the spatial data for the record; if this hasn't been discovered yet,
331        do the necessary parsing
332        @return: Coords object array with spatial data
333        '''
334        if self.stData is None:
335            self.getSpatioTemporalData()
336       
337        return self.stData.getSpatialData()
338       
339
340    def listify(self, item):
341        '''
342        listify checks if an item is a list, if it isn't it puts it
343        inside a list and returns it. Always returns a list object.
344        @param item: object to check
345        @return: item as a list object
346        '''
347        if type(item) is list:
348            return item
349        else:
350            return [item]
351       
352   
353    def getSpatioTemporalData(self):
354        '''
355        Extract spatio temporal data from the original document
356        '''
357        logging.info('Retrieving spatiotemporal info from moles file')
358        # initialise the various spatiotemporal arrays used to extract data to
359        self.stData = SpatioTemporalData()
360       
361        if self.dgMeta is None:
362            self.createMolesFile()
363           
364        # do quick checks to see if the relevant data exists
365        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
366            logging.info("No data summary elements found - assuming no spatiotemporal data available")
367            return
368       
369        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
370            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
371            return
372       
373        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
374            logging.info("No spatial coverage elements found - assuming no spatial data available")
375        else:
376            self.getCoordData(self.dgMeta)
377
378        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
379        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
380            logging.info("No temporal coverage elements found - assuming no temporal data available")
381        else:
382            self.getTimeRangeData(self.dgMeta)
383
384   
385    def getAuthorsInfo(self):
386        '''
387        Extract authors info from the moles file
388        '''
389        logging.info('Retrieving authors info from moles file')
390       
391        if self.dgMeta is None:
392            self.createMolesFile()
393           
394        logging.info("Extracting author info")
395        creators = ""
396        authors = ""
397        try:
398            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
399            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
400            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
401            logging.info("Found creator information - adding this to authors record")
402           
403        except Exception, detail:
404            logging.info("Exception thrown whilst trying to find creator information:")
405            logging.info(detail)
406            logging.info("- this suggests document does not contain creator information.")
407
408        try:
409            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
410            logging.info("Found cited author information - adding this to authors record")
411           
412        except Exception, detail:
413            logging.info("Exception thrown whilst trying to find cited author information:")
414            logging.info(detail)
415            logging.info("- this suggests document does not contain cited author information.")
416       
417        self.authors = authors + " " + creators
418        return self.authors
419   
420   
421    def getParametersInfo(self):
422        '''
423        Extract parameters info from the moles file
424        '''
425        logging.info('Retrieving parameters info from moles file')
426       
427        if self.dgMeta is None:
428            self.createMolesFile()
429           
430        params = ""
431        try:
432            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
433            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
434            parameters_list = self.listify(parameters)
435            for parameter in parameters_list:
436                if parameters.dgValidTerm:
437                    logging.info("Found parameter information - adding this to record")
438                    params += " " + parameters.dgValidTerm
439           
440           
441        except Exception, detail:
442            logging.info("Exception thrown whilst trying to find parameter information:")
443            logging.info(detail)
444            logging.info("- this suggests document does not contain parameter information.")
445       
446        self.parameters = params
447        return self.parameters
448   
449   
450    def getScopeInfo(self):
451        '''
452        Extract scope info from the moles file
453        '''
454        logging.info('Retrieving scope info from moles file')
455       
456        if self.dgMeta is None:
457            self.createMolesFile()
458           
459        scope = ""
460        try:
461            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
462            logging.info("Found keyword information - parsing this for scope")
463
464            keywords_list = self.listify(keywords)
465            for keyword in keywords_list:
466                if keyword.dgValidTermID:
467                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
468                        logging.info("Found scope value - adding this to record")
469                        scope += " " + keyword.dgValidTerm.strip()
470           
471        except Exception, detail:
472            logging.info("Exception thrown whilst trying to find scope information:")
473            logging.info(detail)
474            logging.info("- this suggests document does not contain scope information.")
475
476        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
477        # - to avoid this, use the following delimiter
478        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
479        return self.scope
480           
481           
482    def getTimeRangeData(self, dgMeta):
483        '''
484        Parse an xml tree and add any time range data found
485        @param dgMeta: xml fragment for the time range
486        '''
487        logging.info("Extracting time range info")
488        try:
489            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
490           
491            if not dates:
492                logging.info("No temporal info found for document")
493               
494            dates_list = self.listify(dates)
495            for date in dates_list:
496                startdate=date.DateRangeStart
497                enddate= date.DateRangeEnd
498                if startdate==None or startdate=='None':
499                    startdate="null"
500                if enddate==None or enddate=='None':
501                    enddate="null"
502                   
503                self.stData.addTimeRange(startdate, enddate)
504                logging.info("Temporal info: startdate " + \
505                             startdate + ", enddate " + enddate) 
506        except Exception, detail:
507            logging.info("Document does not contain temporal info.")
508            logging.info(detail)
509
510       
511    def getCoordData(self, dgMeta):
512        '''
513        Parse an xml tree and add any coord data found
514        @param dgMeta: xml fragment for the bounding boxes
515        '''
516        logging.info("Extracting bounding box info")
517        try:
518
519            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
520           
521            if not bboxes:
522                logging.info("No bounding box info found for document")
523                return
524               
525            bbox_list=self.listify(bboxes)
526            #parse the list of coordinates
527            for bbox in bbox_list:
528                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
529                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
530                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
531                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
532                self.stData.addCoords(north, south, east, west)
533                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
534                    east + ", north " + north + "")
535               
536        except Exception, detail:
537            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
538                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)
539
540
541    def parseCoord(self, coordValue, minField, maxField):
542        '''
543        Take a coordinate value extracted from a molefile bbox limit - together with
544        the appropriate max/min limits and extract the correct value from it
545        @param coordValue: the contents of the bbox limit tage
546        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
547        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
548        @return: coord - the value of the coordinate as a string   
549        '''
550        logging.debug("Parsing document coordinates")
551        try:
552            coord = coordValue.strip()
553            if coord.endswith(maxField):
554                coord=coordValue.split(maxField)[0]
555            elif coord.endswith(minField):
556                if coord.startswith('-'):
557                    coord = coordValue.split(minField)[0]
558                else:
559                    coord = "-" + coordValue.split(minField)[0]
560   
561            return '%s' % float(coord)
562        except:
563            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
564
565           
566    def hasNullCoords():
567        '''
568        Checks a record to determine whether it has any coordinates set to null
569        '''
570        if str(self.west)=='null' or \
571            str(self.south)=='null' or \
572            str(self.east)=='null' or \
573            str(self.north)=='null':
574            return True;
575        else:
576            return False;
577       
Note: See TracBrowser for help on using the repository browser.