source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3972

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3972
Revision 3972, 22.9 KB checked in by cbyrom, 13 years ago (diff)

Use the short filename in the postgres DB for storing the original
document filename.
Add fix to allow proper handling of scope fields as a ts_vector.
Add TODO comments to highlight areas of concern + update docs.

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21import keywordAdder
22
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table.
    The original document is loaded on construction; the other document formats
    (see documentTypes) are produced on demand by running XQuery transforms.

    @param filename: name of the file to use as the metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed as an NDG identifier
        (via ndgObject) to obtain the local and repository IDs
    @param datacentre_groups: datacentre groups to add as keywords to the moles
        document; an empty string means that no keywords are added
    @param datacentre_namespace: namespace of the datacentre - stored as the repository
    @param discovery_id: discovery ID of the record
    @param xq: object supplying the XQuery text through its actual() method
    @param docType: type of the original doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        '''
        Set up the record and load (and escape) the contents of the original file
        @raise IOError: if the file cannot be opened or read
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        pathParts = filename.split('/')
        self._dir = '/'.join(pathParts[:-1])
        self.shortFilename = pathParts[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file - closing the handle explicitly rather than
        # leaving it to the garbage collector
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        # NB, getDocumentFormat stores the transformed docs under the underscore-prefixed
        # names (_dcFormat, etc); the fields below are kept for any external readers
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None


    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations - currently each apostrophe is prefixed with a backslash
        @param inputString: string to correct
        @return: corrected string
        '''
        return re.sub(r'\'', '\\\'', inputString)


    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations - currently each '%20' is replaced by a space
        @param inputString: string to correct
        @return: corrected string
        '''
        # NB, the result was previously computed and then dropped, so None was returned
        return re.sub(r'%20', ' ', inputString)


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the original
        file's directory - running the moles transform first if this has not been done -
        and then load the written file into self.dgMeta for direct access to its elements
        @raise SystemError: if the written moles file cannot be parsed
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        molesFile = self._molesDir + self.shortFilename
        self._fileUtils.createFile(molesFile, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            # sys.exc_info is used so the same source parses on old and new interpreters
            detail = sys.exc_info()[1]
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command reports a non-zero exit status
        @raise OSError: if the temporary query file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        collectionDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            collectionDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        try:
            # Now do the transform
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # close() returns None when the command exited with status 0
            status = pipe.close()
        finally:
            # always remove the temp xquery file - including when the query fails;
            # os.unlink raises OSError itself if the removal is not possible
            os.unlink(xqFile)

        if status is not None:
            # stderr was redirected into the output, so this holds the saxon error text
            logging.error("XQuery failed with status %s; output was: %s", status, output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise TypeError: if the original doc type is neither DIF nor MDIP
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result:
        the moles doc is written to a temp dir, keywordAdder is ran across this and the
        converted file is then read back into the record
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file - making sure the handle is really closed
        # (the close method was previously referenced but never called)
        keywordFile = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = keywordFile.read()
        finally:
            keywordFile.close()

        # Finally, tidy up temp dirs
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not one of the known document types
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined - the attribute will not exist at all
        # until the first transform of this type has been ran
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        # the doc type doesn't exist - so run the xquery
        logging.info("Document not available - creating new transformed document")
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each entry in documentTypes
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        # build the complete list before storing it, so that entries left over from an
        # earlier, failed attempt are replaced rather than duplicated
        docs = [[docType, self.getDocumentFormat(docType)] for docType in self.documentTypes]
        self._allDocs[:] = docs
        return self._allDocs


    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        # isinstance, so that subclasses of list are passed through rather than wrapped
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store this in self.stData
        '''
        logging.info('Retrieving spatiotemporal info from moles file')

        # make sure the moles doc is loaded BEFORE setting stData; if this step fails, stData
        # is then left untouched and a later call will retry rather than return empty data
        if self.dgMeta is None:
            self.createMolesFile()

        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # do quick checks to see if the relevant data exists
        summary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not coverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        if not coverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)


    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        @return: string made up of the cited authors, a space and the data creators;
        a missing value contributes an empty string
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors


    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: string containing each valid parameter term found, each one preceded
        by a space; an empty string if none are found
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            for parameter in self.listify(parameters):
                # NB, use the loop item here; reading the term from the enclosing collection
                # fails as soon as more than one parameter is present
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters


    def getScopeInfo(self):
        '''
        Extract scope info from the moles file - i.e. the terms of those structured
        keywords whose parent list ID starts with the NDG data provider vocab URL
        @return: string of scope values, each preceded by a space and with any
        underscores replaced by the text 'UNDERSCORE'
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            for keyword in self.listify(keywords):
                if not keyword.dgValidTermID:
                    continue
                if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                    logging.info("Found scope value - adding this to record")
                    scope += " " + keyword.dgValidTerm.strip()

        except Exception:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(sys.exc_info()[1])
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
        return self.scope


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found to self.stData
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                # nothing to parse - stop here, in the same way as getCoordData does
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                # let logging do the formatting, so a non-string date cannot raise here
                # and cut short the processing of the remaining ranges
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception:
            logging.info("Document does not contain temporal info.")
            logging.info(sys.exc_info()[1])


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData
        NB, problems with individual limits are logged as warnings and not raised
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n"
                            "to an incomplete set of metadata being ingested. \nDetail: %s",
                            sys.exc_info()[1])


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a coordinate
        '''
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                # a min field (W or S) means a negative value - unless already signed
                coord = coord.split(minField)[0]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except (AttributeError, TypeError, ValueError):
            # NB, format the value rather than concatenate it, so that a missing (None)
            # limit is also reported as a SyntaxError
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % coordValue)


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        @return: True if the west, south, east or north value is 'null', else False
        '''
        # NOTE(review): west/south/east/north are not assigned anywhere in this class -
        # presumably they are set by the calling code; confirm before relying on this
        return str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null'
Note: See TracBrowser for help on using the repository browser.