source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 5167

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@5167
Revision 5167, 23.0 KB checked in by sdonegan, 11 years ago (diff)

updates to allow oai_document_ingester to work

Line 
1#!/usr/bin/env python
2'''
3Class representing the a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re
8import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10from ndg.common.src.lib.ndgresources import ndgResources
11import ndg.common.src.lib.fileutilities as FileUtilities
12from SpatioTemporalData import SpatioTemporalData
13import keywordAdder
14
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: when true, discovery_id is parsed with ndgObject to
        obtain the local and repository IDs; otherwise discovery_id and
        datacentre_namespace are used directly
    @param datacentre_groups: when set, these are added to the moles document
        as keywords (see addKeywords)
    @param datacentre_namespace: stored as the repository of the record
    @param discovery_id: ID of the record; substituted into the XQuery text
    @param xq: stored on the record as _xq
    @param docType - type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    # NB, 'MDIP' is nevertheless still present in the list below
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139', 'MDIP']

    # vocab server - used for finding scope values in the moles files
    # (getScopeInfo matches keyword ParentListID values against this prefix)
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"
31       
32    def __init__(self, filename, ndg_dataprovider, datacentre_groups, \
33                 datacentre_namespace, discovery_id, xq, docType):
34        logging.info("Setting up Postgres record for file, " + filename)
35        self.filename = filename
36   
37        # NB, if we're dealing with an NDG data provider, the details are slightly different
38        if ndg_dataprovider:
39            discObj=ndgObject(discovery_id)
40            self._local_id = discObj.localID
41            self._repository_local_id = discObj.repository
42        else:
43            self._local_id = discovery_id
44            self._repository_local_id = datacentre_namespace
45           
46        self._datacentre_groups = datacentre_groups
47        self._repository = datacentre_namespace
48        self.discovery_id = discovery_id
49        self._xq = xq
50        self.docType = docType
51
52        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
53        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO
54
55        # get the dir of the file - needed by the xquery to use as the target collection
56        tmp = filename.split('/')
57        self._dir = '/'.join(tmp[0:len(tmp)-1])
58        self.shortFilename = tmp[-1]
59       
60        # dir to store a temp copy of the moles file, when produced - for use by other transforms
61        self._molesDir = None
62        # object to hold the moles file - this will be loaded in when it is created - in order to extract
63        # spatiotemporal data, etc
64        self.dgMeta = None
65
66        # firstly load contents of file
67        self.originalFormat = file(filename).read()
68       
69        # escape any apostrophes
70        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)
71
72        # initialise the various record fields
73        self.db_id = None    # the DB ID of the record, for easy reference when it is created
74        self.molesFormat = None
75        self.dcFormat = None
76        self.mdipFormat = None
77        self.iso19139Format = None
78        self.scn = 1    # system change number - keeps track of number of mods to a particular row
79       
80        # spatiotemporal data object
81        self.stData = None
82       
83        # fields to hold author, parameter and scope data
84        self.authors = None
85        self.parameters = None
86        self.scope = None
87
88    def escapeSpecialCharacters(self, inputString):
89        '''
90        Adjust the input string to escape any characters that would interfere with string or DB
91        operations
92        @param inputString: string to correct
93        @return: corrected string
94        '''
95        return re.sub(r'\'', '\\\'', inputString)
96
97
98    def unescapeSpecialCharacters(self, inputString):
99        '''
100        Adjust the input string to remove escaped characters that would interfere with string or DB
101        operations
102        @param inputString: string to correct
103        @return: corrected string
104        '''
105        str = re.sub(r'%20', ' ', inputString)
106        return 
107   
108   
109    def doRecordTransforms(self):
110        '''
111        Run various transforms on the original doc, to populate the record with
112        the other types of doc used elsewhere
113        '''
114        logging.info("Running transforms for all document types")
115        for docType in self.documentTypes:
116            self.getDocumentFormat(docType)
117           
118        logging.info("Transforms complete")
119
120
121    def createMolesFile(self):
122        '''
123        Check if a moles file exists on the system; if not, assume the moles transform has not
124        been ran and then produce this file - to allow for use in the various xqueries
125        '''
126        logging.info("Creating moles file on system - for use with other xquery transforms")
127        self._molesDir = self._dir + "/moles/"
128        FileUtilities.setUpDir(self._molesDir)
129       
130        if self._molesFormat is None:
131            self.doMolesTransform()
132           
133        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
134        logging.info("Moles file created - at %s" %self._molesDir)
135       
136        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on       
137        molesFile = self._molesDir + self.shortFilename
138        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)
139       
140        # load in the moles file and put this into an object for direct access to the xml elements
141       
142        self.dgMeta=MRW.dgMetadata()
143        try:
144            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
145        except Exception, detail:
146            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
147
148
149    def doTransform(self, xQueryType):
150        '''
151        Transform the record according to the specified XQuery type
152        @param xQueryType: XQuery doc to use to do the transform
153        @return: the metadata record in the required transformed format
154        '''
155        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")
156
157        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
158        # moles file available for the transform - and use the correct dir for the xquery collection
159        dir = self._dir
160        if xQueryType.find('moles2') > -1:
161            if self._molesDir is None:
162                self.createMolesFile()
163               
164            dir = self._molesDir
165           
166        # get the query and set this up to use properly
167       
168        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
169        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
170        self.xqueryLib = ndgResources()       
171        xquery = self.xqueryLib.createXQuery(xQueryType,dir, self._repository_local_id, self._local_id)
172     
173        # sort out the input ID stuff
174        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
175        xquery=xquery.replace('repository_localid', self._repository)
176
177        # strip out the eXist reference to the libraries; these files should be available in the
178        # running dir - as set up by oai_ingest.py
179        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
180        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')
181
182        # write the query to file, to make it easier to input
183        # NB, running directly at the command line leads to problems with the interpretation of $ characters
184        xqFile = "currentQuery" + xQueryType + ".xq" 
185        FileUtilities.createFile(xqFile, xquery)
186
187        # Now do the transform
188        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
189        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
190        logging.debug("Running saxon command: " + xqCommand)
191        pipe = os.popen(xqCommand + " 2>&1")
192        output = pipe.read()
193        status = pipe.close()
194
195        if status is not None:
196            raise SystemError, 'Failed at running the XQuery'
197
198        # now remove the temp xquery file
199        '''status = os.unlink(xqFile)
200        if status is not None:
201            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''
202       
203        logging.info("Transform completed successfully")
204
205        return output
206
207
208    def doMolesTransform(self):
209        '''
210        Set up the basic moles doc - according to the type of document first ingested
211        '''
212        logging.info("Creating moles document - for use with other transforms")
213        xqName = None
214        if self.docType == "DIF":
215            xqName = "dif2moles"
216        elif self.docType == "MDIP":
217            xqName = "mdip2moles"
218        else:
219            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
220                     %self.docType
221
222        # now run the appropriate transform and set the attribute
223        setattr(self, "_molesFormat", self.doTransform(xqName))
224
225        # add keywords, if required
226        if self._datacentre_groups:
227            self.addKeywords()
228       
229        # escape any apostrophes
230        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)
231
232        logging.info("moles document created")
233       
234
235    def addKeywords(self):
236        '''
237        If datacentre groups have been specified, these need to be added as keywords
238        - NB, this is rather clumsy approach but uses old code to achieve the result
239        '''
240        logging.info("Adding datacentre keywords to moles file")
241
242        # NB, use temporary directories to do the keyword additions
243        tmpDir = os.getcwd() + "/tmp/"
244        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
245        FileUtilities.setUpDir(tmpDir)
246        FileUtilities.setUpDir(tmpKeywordsDir)
247        tmpFile = 'tmpFile.xml'
248        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)
249
250        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)
251
252        # Now load in the converted file
253        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
254        self._molesFormat = f.read()
255        f.close
256       
257        # Finally, tidy up temp dirs
258        FileUtilities.cleanDir(tmpDir)
259        FileUtilities.cleanDir(tmpKeywordsDir)
260        logging.info("Completed adding keywords")
261       
262
263    def getDocumentFormat(self, docType):
264        '''
265        Lookup document format; if it is already defined then return it, else do the required XQuery
266        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
267        @param docType: format of document to return
268        '''
269        logging.info("Retrieving document type, " + docType)
270        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
271        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]
272       
273        # check we have the moles format available; if not create it
274        if self._molesFormat is None:
275            self.doMolesTransform()
276            self.createMolesFile()
277       
278        # check the document isn't already defined
279        try:
280            doc = getattr(self, attributeName)
281            if doc is not None:
282                logging.info("Found existing document - returning this now")
283                return doc
284        except:
285            logging.info("Document not available - creating new transformed document")
286
287        # the doc type doesn't exist - so run the xquery
288        transformedDoc = self.doTransform(xqName)
289        setattr(self, attributeName, transformedDoc)
290        return transformedDoc
291       
292   
293    def getAllDocs(self):
294        '''
295        Return a list of all the available doc types in the record
296        '''
297        # if the stored docs array is the same size as the array of all doc types
298        # assume all transforms have been done - and just return these
299        if len(self._allDocs) == len(self.documentTypes):
300            return self._allDocs
301       
302        for docType in self.documentTypes:
303            self._allDocs.append([docType, self.getDocumentFormat(docType)])
304
305        return self._allDocs
306       
307   
308    def getTemporalData(self):
309        '''
310        Retrieves the temporal data for the record; if this hasn't been discovered yet,
311        do the necessary parsing
312        @return: TimeRange object array with temporal data
313        '''
314        if self.stData is None:
315            self.getSpatioTemporalData()
316       
317        return self.stData.getTemporalData()
318       
319   
320    def getSpatialData(self):
321        '''
322        Retrieves the spatial data for the record; if this hasn't been discovered yet,
323        do the necessary parsing
324        @return: Coords object array with spatial data
325        '''
326        if self.stData is None:
327            self.getSpatioTemporalData()
328       
329        return self.stData.getSpatialData()
330       
331
332    def listify(self, item):
333        '''
334        listify checks if an item is a list, if it isn't it puts it
335        inside a list and returns it. Always returns a list object.
336        @param item: object to check
337        @return: item as a list object
338        '''
339        if type(item) is list:
340            return item
341        else:
342            return [item]
343       
344   
345    def getSpatioTemporalData(self):
346        '''
347        Extract spatio temporal data from the original document
348        '''
349        logging.info('Retrieving spatiotemporal info from moles file')
350        # initialise the various spatiotemporal arrays used to extract data to
351        self.stData = SpatioTemporalData()
352       
353        if self.dgMeta is None:
354            self.createMolesFile()
355           
356        # do quick checks to see if the relevant data exists
357        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
358            logging.info("No data summary elements found - assuming no spatiotemporal data available")
359            return
360       
361        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
362            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
363            return
364       
365        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
366            logging.info("No spatial coverage elements found - assuming no spatial data available")
367        else:
368            self.getCoordData(self.dgMeta)
369
370        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
371        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
372            logging.info("No temporal coverage elements found - assuming no temporal data available")
373        else:
374            self.getTimeRangeData(self.dgMeta)
375
376   
377    def getAuthorsInfo(self):
378        '''
379        Extract authors info from the moles file
380        '''
381        logging.info('Retrieving authors info from moles file')
382       
383        if self.dgMeta is None:
384            self.createMolesFile()
385           
386        logging.info("Extracting author info")
387        creators = ""
388        authors = ""
389        try:
390            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
391            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
392            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
393            logging.info("Found creator information - adding this to authors record")
394           
395        except Exception, detail:
396            logging.info("Exception thrown whilst trying to find creator information:")
397            logging.info(detail)
398            logging.info("- this suggests document does not contain creator information.")
399
400        try:
401            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
402            logging.info("Found cited author information - adding this to authors record")
403           
404        except Exception, detail:
405            logging.info("Exception thrown whilst trying to find cited author information:")
406            logging.info(detail)
407            logging.info("- this suggests document does not contain cited author information.")
408       
409        self.authors = authors + " " + creators
410        return self.authors
411   
412   
413    def getParametersInfo(self):
414        '''
415        Extract parameters info from the moles file
416        '''
417        logging.info('Retrieving parameters info from moles file')
418       
419        if self.dgMeta is None:
420            self.createMolesFile()
421           
422        params = ""
423        try:
424            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
425            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
426            parameters_list = self.listify(parameters)
427            for parameter in parameters_list:
428                if parameters.dgValidTerm:
429                    logging.info("Found parameter information - adding this to record")
430                    params += " " + parameters.dgValidTerm
431           
432           
433        except Exception, detail:
434            logging.info("Exception thrown whilst trying to find parameter information:")
435            logging.info(detail)
436            logging.info("- this suggests document does not contain parameter information.")
437       
438        self.parameters = params
439        return self.parameters
440   
441   
442    def getScopeInfo(self):
443        '''
444        Extract scope info from the moles file
445        '''
446        logging.info('Retrieving scope info from moles file')
447       
448        if self.dgMeta is None:
449            self.createMolesFile()
450           
451        scope = ""
452        try:
453            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
454            logging.info("Found keyword information - parsing this for scope")
455
456            keywords_list = self.listify(keywords)
457            for keyword in keywords_list:
458                if keyword.dgValidTermID:
459                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
460                        logging.info("Found scope value - adding this to record")
461                        scope += " " + keyword.dgValidTerm.strip()
462           
463        except Exception, detail:
464            logging.info("Exception thrown whilst trying to find scope information:")
465            logging.info(detail)
466            logging.info("- this suggests document does not contain scope information.")
467
468        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
469        # - to avoid this, use the following delimiter
470        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
471        return self.scope
472           
473           
474    def getTimeRangeData(self, dgMeta):
475        '''
476        Parse an xml tree and add any time range data found
477        @param dgMeta: xml fragment for the time range
478        '''
479        logging.info("Extracting time range info")
480        try:
481            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
482           
483            if not dates:
484                logging.info("No temporal info found for document")
485               
486            dates_list = self.listify(dates)
487            for date in dates_list:
488                startdate=date.DateRangeStart
489                enddate= date.DateRangeEnd
490                if startdate==None or startdate=='None':
491                    startdate="null"
492                if enddate==None or enddate=='None':
493                    enddate="null"
494                   
495                self.stData.addTimeRange(startdate, enddate)
496                logging.info("Temporal info: startdate " + \
497                             startdate + ", enddate " + enddate) 
498        except Exception, detail:
499            logging.info("Document does not contain temporal info.")
500            logging.info(detail)
501
502       
503    def getCoordData(self, dgMeta):
504        '''
505        Parse an xml tree and add any coord data found
506        @param dgMeta: xml fragment for the bounding boxes
507        '''
508        logging.info("Extracting bounding box info")
509        try:
510
511            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
512           
513            if not bboxes:
514                logging.info("No bounding box info found for document")
515                return
516               
517            bbox_list=self.listify(bboxes)
518            #parse the list of coordinates
519            for bbox in bbox_list:
520                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
521                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
522                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
523                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
524                self.stData.addCoords(north, south, east, west)
525                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
526                    east + ", north " + north + "")
527               
528        except Exception, detail:
529            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
530                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)
531
532
533    def parseCoord(self, coordValue, minField, maxField):
534        '''
535        Take a coordinate value extracted from a molefile bbox limit - together with
536        the appropriate max/min limits and extract the correct value from it
537        @param coordValue: the contents of the bbox limit tage
538        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
539        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
540        @return: coord - the value of the coordinate as a string   
541        '''
542        logging.debug("Parsing document coordinates")
543        try:
544            coord = coordValue.strip()
545            if coord.endswith(maxField):
546                coord=coordValue.split(maxField)[0]
547            elif coord.endswith(minField):
548                if coord.startswith('-'):
549                    coord = coordValue.split(minField)[0]
550                else:
551                    coord = "-" + coordValue.split(minField)[0]
552   
553            return '%s' % float(coord)
554        except:
555            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
556
557           
558    def hasNullCoords():
559        '''
560        Checks a record to determine whether it has any coordinates set to null
561        '''
562        if str(self.west)=='null' or \
563            str(self.south)=='null' or \
564            str(self.east)=='null' or \
565            str(self.north)=='null':
566            return True;
567        else:
568            return False;
569       
Note: See TracBrowser for help on using the repository browser.