source: TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresRecord.py @ 5853

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresRecord.py@5853
Revision 5853, 24.1 KB checked in by sdonegan, 11 years ago (diff)

Escape special chars in dataset title

Line 
1#!/usr/bin/env python
2'''
3Class representing a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import csml.csml2Moles.molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21import keywordAdder
22
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table.
    The original file is loaded on construction; the other document formats
    are produced on demand by running XQuery transforms (via saxon).
    @param filename: Name of file to use as the metadata record
    @param ndg_dataprovider: if true, the local and repository IDs are taken
    from an ndgObject built from discovery_id; otherwise discovery_id and
    datacentre_namespace are used directly
    @param datacentre_groups: groups to add as keywords to the moles doc; an
    empty string means no keywords are added
    @param datacentre_namespace: namespace of the datacentre
    @param discovery_id: ID of the record
    @param datasetName: dataset name - stored with apostrophes escaped
    @param datacentreName: datacentre name - stored with apostrophes escaped
    @param datasetLastEditUpdateDate: stored, unchanged, as dataset_lastEdit
    @param xq: object whose actual() method returns the text of a named xquery
    @param docType: type of doc to process
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139', 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, datasetName, datacentreName, datasetLastEditUpdateDate, xq, docType):
        '''
        Set up the record and load (and escape) the contents of the input file
        @raise IOError: if the input file cannot be read
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        # escape special chars in title & name fields
        # NB, the datacentre name used to be re-assigned, unescaped, further
        # down - which silently threw away the escaping done here
        self.dataset_name = self.escapeSpecialCharacters(datasetName)
        self.datacentre_name = self.escapeSpecialCharacters(datacentreName)

        self.dataset_lastEdit = datasetLastEditUpdateDate

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        # NB, rpartition gives '' for the dir if the filename has no '/' in it
        self._dir, _, self.shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file - escaping any apostrophes;
        # use a context manager so the file handle is always released
        with open(filename) as sourceFile:
            self.originalFormat = self.escapeSpecialCharacters(sourceFile.read())

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations - currently this puts a backslash in front of every apostrophe
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace("'", "\\'")

    def unescapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to remove escaped characters that would interfere with string or DB
        operations - currently this turns every '%20' into a space
        @param inputString: string to correct
        @return: corrected string
        '''
        # NB, this previously worked out the new string but then returned None
        return inputString.replace('%20', ' ')

    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")

    def createMolesFile(self):
        '''
        Write the moles version of the doc to a 'moles' subdirectory of the input file's
        dir (running the moles transform first, if it has not been ran yet) - to allow for
        use in the various xqueries.  The file is then parsed into self.dgMeta.
        @raise SystemError: if the moles file cannot be parsed
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        molesFile = self._molesDir + self.shortFilename
        self._fileUtils.createFile(molesFile, self._molesFormat)
        logging.info("Moles file created - at %s" % self._molesDir)

        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on
        logging.info('Retrieving spatiotemporal info from moles file, %s' % molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        self.dgMeta = MRW.dgMetadata()
        try:
            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception as detail:
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' % (molesFile, detail))

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command reports a failure
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        collectionDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            collectionDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        # NB, this file is not removed after the transform has ran
        xqFile = "currentQuery_" + xQueryType + ".xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        # NB, the PATH given to the java command is hard coded to a particular machine set up
        os.putenv('PATH', ':/usr/java/jdk1.5.0_06/bin:/usr/java/jdk1.5.0_06:/usr/java/jdk1.5.0_06/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")

        output = pipe.read()
        # close() returns None if the command exited cleanly
        status = pipe.close()

        if status is not None:
            # stderr was merged into the output - so log it to show why the command failed
            logging.error("XQuery command failed (status %s) - output was:\n%s", status, output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")

        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise TypeError: if there is no xquery to turn the input doc type into moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            raise TypeError("ERROR: No XQuery exists to transform input document type, %s, into moles format"
                            % self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        # NB, use a context manager to make sure the file is closed (previously
        # 'f.close' was referenced, without being called, so it never was)
        with open(os.path.join(tmpKeywordsDir, tmpFile), 'r') as convertedFile:
            self._molesFormat = convertedFile.read()

        # Finally, tidy up temp dirs
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.cleanDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not one of the known document types
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        # NB, most of these attributes only exist once the transform has been ran - so
        # use a default here, rather than catching every possible exception
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each entry in documentTypes
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        # build the full list before storing it - so a failed transform cannot leave
        # a part filled list behind, to get duplicate entries added on the next call
        self._allDocs = [[docType, self.getDocumentFormat(docType)]
                         for docType in self.documentTypes]

        return self._allDocs

    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()

    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()

    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]

    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store this in self.stData
        '''
        logging.info('Retrieving spatiotemporal info from moles file')

        # make sure the moles doc is loaded before setting up stData - otherwise a
        # failure here would leave an empty stData that looks like a completed parse
        if self.dgMeta is None:
            self.createMolesFile()

        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # do quick checks to see if the relevant data exists
        summary = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not coverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(self.dgMeta)

        if not coverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(self.dgMeta)

    def getAuthorsInfo(self):
        '''
        Extract authors info from the moles file
        @return: string of the cited authors followed by the creators, separated by a space
        '''
        logging.info('Retrieving authors info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        logging.info("Extracting author info")
        creators = ""
        authors = ""
        try:
            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
            logging.info("Found creator information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find creator information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain creator information.")

        try:
            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
            logging.info("Found cited author information - adding this to authors record")

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find cited author information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain cited author information.")

        self.authors = authors + " " + creators
        return self.authors

    def getParametersInfo(self):
        '''
        Extract parameters info from the moles file
        @return: string with each valid term found, each one preceded by a space
        '''
        logging.info('Retrieving parameters info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        params = ""
        try:
            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
            for parameter in self.listify(parameters):
                # NB, look at each individual parameter - not the containing object; doing
                # the latter meant docs with more than one parameter ended up with none
                if parameter.dgValidTerm:
                    logging.info("Found parameter information - adding this to record")
                    params += " " + parameter.dgValidTerm

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find parameter information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain parameter information.")

        self.parameters = params
        return self.parameters

    def getScopeInfo(self):
        '''
        Extract scope info from the moles file
        @return: string of the scope values found, with any underscores replaced by 'UNDERSCORE'
        '''
        logging.info('Retrieving scope info from moles file')

        if self.dgMeta is None:
            self.createMolesFile()

        scope = ""
        try:
            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
            logging.info("Found keyword information - parsing this for scope")

            for keyword in self.listify(keywords):
                if not keyword.dgValidTermID:
                    continue
                # only keywords from the data provider vocab list count as scope values
                if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
                    logging.info("Found scope value - adding this to record")
                    scope += " " + keyword.dgValidTerm.strip()

        except Exception as detail:
            logging.info("Exception thrown whilst trying to find scope information:")
            logging.info(detail)
            logging.info("- this suggests document does not contain scope information.")

        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
        # - to avoid this, use the following delimiter
        self.scope = scope.replace('_', 'UNDERSCORE')
        return self.scope

    def getTimeRangeData(self, dgMeta):
        '''
        Parse the moles object and add any time range data found to self.stData
        @param dgMeta: moles object to look for the time ranges in
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                # nothing to parse - so stop here, rather than rely on the exception handler
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                # missing dates are passed on as the string, 'null'
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception as detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)

    def getCoordData(self, dgMeta):
        '''
        Parse the moles object and add any coord data found to self.stData
        @param dgMeta: moles object to look for the bounding boxes in
        '''
        logging.info("Extracting bounding box info")

        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception as detail:
            # NB, this is best effort - a bad bounding box is reported but does not stop the ingest
            logging.info("No bounding box info found for document")
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" % detail)
            return

    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be turned into a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - splitting the unstripped
            # value meant, e.g. ' 10S' became '- 10', which is not a valid number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                coord = coord.split(minField)[0]
                # values in the min direction are negative
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except (AttributeError, TypeError, ValueError):
            # NB, use %s - since the value may not be a string (e.g. None)
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % coordValue)

    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        @return: True if any of west, south, east or north is 'null', False otherwise
        '''
        # NOTE(review): west/south/east/north are not set anywhere in this class - they
        # need to be assigned by the caller before this is used; TODO confirm
        limits = (self.west, self.south, self.east, self.north)
        return any(str(limit) == 'null' for limit in limits)
603       
Note: See TracBrowser for help on using the repository browser.