source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 6135

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@6135
Revision 6135, 24.2 KB checked in by sdonegan, 10 years ago (diff)

Further debugging w.r.t. non-local file handling

Line 
1#!/usr/bin/env python
2'''
3Class representing the a document to be ingested into the postgres DB table
4C Byrom Apr 08
5'''
6from xml.etree import cElementTree
7import os, sys, logging, re, pkg_resources
8import csml.csml2Moles.molesReadWrite as MRW
9from ndg.common.src.models.ndgObject import ndgObject
10from ndg.common.src.lib.ndgresources import ndgResources
11import ndg.common.src.lib.fileutilities as FileUtilities
12from SpatioTemporalData import SpatioTemporalData
13import keywordAdder
14
15SAXON_JAR_FILE = 'lib/saxon9.jar'
16
class PostgresRecord:
    '''
    Class representing a document to be ingested into the postgres DB table
    @param filename: Name of file to use as metadata record
    @param ndg_dataprovider: if True, discovery_id is parsed as an ndgObject URI
    @param datacentre_groups: keyword groups to add to the moles doc (may be empty)
    @param datacentre_namespace: namespace/repository id of the datacentre
    @param discovery_id: discovery ID of the record
    @param datasetName: name of the dataset
    @param datacentreName: display name of the datacentre
    @param datasetLastEditUpdateDate: last edit/update date of the dataset
    @param datasetStartDateNom: nominal start date of the dataset
    @param datasetEndDateNom: nominal end date of the dataset
    @param xq: xquery object
    @param docType: type of doc to process
    @param baseDir: base working directory (used for temp xquery files)
    @param codeDir: directory holding the ingest code - and the saxon jar
    '''
    # TODO MDIP transforms do not work very well for lots of files - so currently hiding these
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # vocab server - used for finding scope values in the moles files
    ndg_data_provider_vocab = "http://vocab.ndg.nerc.ac.uk/term/N010"

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, datasetName, datacentreName, datasetLastEditUpdateDate, datasetStartDateNom, datasetEndDateNom, xq, docType, baseDir, codeDir):

        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.base_dir = baseDir
        self.code_dir = codeDir

        # simplify processing by uppercasing format at initialisation
        self.docType = docType.upper()

        # make sure we escape any special characters in these fields... SJD 20/10/09
        self.dataset_name = self.escapeSpecialCharacters(datasetName)
        self.datacentre_name = self.escapeSpecialCharacters(datacentreName)

        self.dataset_lastEdit = datasetLastEditUpdateDate
        self.datasetStartNom = datasetStartDateNom
        self.datasetEndNom = datasetEndDateNom

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp) - 1])
        self.shortFilename = tmp[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None
        # object to hold the moles file - this will be loaded in when it is created - in order to extract
        # spatiotemporal data, etc
        self.dgMeta = None

        # firstly load contents of file
        # NB, fixed to close the handle explicitly - file(filename).read() leaked it
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

        # fields to hold author, parameter and scope data
        self.authors = None
        self.parameters = None
        self.scope = None
102
103    def escapeSpecialCharacters(self, inputString):
104        '''
105        Adjust the input string to escape any characters that would interfere with string or DB
106        operations
107        @param inputString: string to correct
108        @return: corrected string
109        '''
110        return re.sub(r'\'', '\\\'', inputString)
111
112
113    def unescapeSpecialCharacters(self, inputString):
114        '''
115        Adjust the input string to remove escaped characters that would interfere with string or DB
116        operations
117        @param inputString: string to correct
118        @return: corrected string
119        '''
120        str = re.sub(r'%20', ' ', inputString)
121        return 
122   
123   
124    def doRecordTransforms(self):
125        '''
126        Run various transforms on the original doc, to populate the record with
127        the other types of doc used elsewhere
128        '''
129        logging.info("Running transforms for all document types")
130        for docType in self.documentTypes:
131            self.getDocumentFormat(docType)
132           
133        logging.info("Transforms complete")
134
135
136    def createMolesFile(self):
137        '''
138        Check if a moles file exists on the system; if not, assume the moles transform has not
139        been ran and then produce this file - to allow for use in the various xqueries
140        '''
141        logging.info("Creating moles file on system - for use with other xquery transforms")
142        self._molesDir = self._dir + "/moles/"
143        FileUtilities.setUpDir(self._molesDir)
144       
145        if self._molesFormat is None:
146            self.doMolesTransform()
147           
148        FileUtilities.createFile(self._molesDir + self.shortFilename, self._molesFormat)
149        logging.info("Moles file created - at %s" %self._molesDir)
150       
151        # now load this moles file, for use when parsing out spatiotemporal, author and parameters data later on       
152        molesFile = self._molesDir + self.shortFilename
153        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)
154       
155        # load in the moles file and put this into an object for direct access to the xml elements
156       
157        self.dgMeta=MRW.dgMetadata()
158        try:
159            self.dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
160        except Exception, detail:
161            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)
162
163
    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform (e.g. 'dif2moles',
        'moles2dif' - looked up via the ndgResources xquery library)
        @return: the metadata record in the required transformed format
        @raise SystemError: if the external saxon XQuery run exits with an error
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly

        #xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)
        #SJD - added this bit in (missed?) to upgrade to ndgCommon.
        self.xqueryLib = ndgResources()
        xquery = self.xqueryLib.createXQuery(xQueryType,dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff - substitute this record's IDs into the query template
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = self.base_dir + "currentQuery" + xQueryType + ".xq" 
        FileUtilities.createFile(xqFile, xquery)

        # ensure the jar file is available - NB, this may be running from a different
        # location - e.g. the OAIInfoEditor.lib.harvester - and this won't have the
        # saxon file directly on its filesystem
        #jarFile = pkg_resources.resource_filename('OAIBatch', SAXON_JAR_FILE)
        jarFile = self.code_dir + SAXON_JAR_FILE

        # Now do the transform
        # NOTE(review): PATH is overwritten with a hard-coded JDK 1.5 install location
        # - this assumes the deployment host layout; confirm before reuse elsewhere
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp %s net.sf.saxon.Query %s !omit-xml-declaration=yes" %(jarFile, xqFile)
        logging.debug("Running saxon command: " + xqCommand)
        # run saxon, capturing stderr along with stdout; pipe.close() returns None
        # on success, or the child's exit status on failure
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file
        '''status = os.unlink(xqFile)
        if status is not None:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile'''

        logging.info("Transform completed successfully")

        return output
227
228
229    def doMolesTransform(self):
230        '''
231        Set up the basic moles doc - according to the type of document first ingested
232        '''
233        logging.info("Creating moles document - for use with other transforms")
234        xqName = None
235        if self.docType == "DIF":
236            xqName = "dif2moles"
237        elif self.docType == "MDIP":
238            xqName = "mdip2moles"
239        else:
240            raise TypeError, "ERROR: No XQuery exists to transform input document type, %s, into moles format" \
241                     %self.docType
242
243        # now run the appropriate transform and set the attribute
244        setattr(self, "_molesFormat", self.doTransform(xqName))
245
246        # add keywords, if required
247        if self._datacentre_groups:
248            self.addKeywords()
249       
250        # escape any apostrophes
251        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)
252
253        logging.info("moles document created")
254       
255
256    def addKeywords(self):
257        '''
258        If datacentre groups have been specified, these need to be added as keywords
259        - NB, this is rather clumsy approach but uses old code to achieve the result
260        '''
261        logging.info("Adding datacentre keywords to moles file")
262
263        # NB, use temporary directories to do the keyword additions
264        tmpDir = self.baseDir + "/tmp/"
265        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
266        FileUtilities.setUpDir(tmpDir)
267        FileUtilities.setUpDir(tmpKeywordsDir)
268        tmpFile = 'tmpFile.xml'
269        FileUtilities.createFile(tmpDir + tmpFile, self._molesFormat)
270
271        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)
272
273        # Now load in the converted file
274        f=open(tmpKeywordsDir + "/" + tmpFile, 'r')
275        self._molesFormat = f.read()
276        f.close
277       
278        # Finally, tidy up temp dirs
279        FileUtilities.cleanDir(tmpDir)
280        FileUtilities.cleanDir(tmpKeywordsDir)
281        logging.info("Completed adding keywords")
282       
283
284    def getDocumentFormat(self, docType):
285        '''
286        Lookup document format; if it is already defined then return it, else do the required XQuery
287        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
288        @param docType: format of document to return
289        '''
290        logging.info("Retrieving document type, " + docType)
291        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
292        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]
293       
294        # check we have the moles format available; if not create it
295        if self._molesFormat is None:
296            self.doMolesTransform()
297            self.createMolesFile()
298       
299        # check the document isn't already defined
300        try:
301            doc = getattr(self, attributeName)
302            if doc is not None:
303                logging.info("Found existing document - returning this now")
304                return doc
305        except:
306            logging.info("Document not available - creating new transformed document")
307
308        # the doc type doesn't exist - so run the xquery
309        transformedDoc = self.doTransform(xqName)
310        setattr(self, attributeName, transformedDoc)
311        return transformedDoc
312       
313   
314    def getAllDocs(self):
315        '''
316        Return a list of all the available doc types in the record
317        '''
318        # if the stored docs array is the same size as the array of all doc types
319        # assume all transforms have been done - and just return these
320        if len(self._allDocs) == len(self.documentTypes):
321            return self._allDocs
322       
323        for docType in self.documentTypes:
324            self._allDocs.append([docType, self.getDocumentFormat(docType)])
325
326        return self._allDocs
327       
328   
329    def getTemporalData(self):
330        '''
331        Retrieves the temporal data for the record; if this hasn't been discovered yet,
332        do the necessary parsing
333        @return: TimeRange object array with temporal data
334        '''
335        if self.stData is None:
336            self.getSpatioTemporalData()
337       
338        return self.stData.getTemporalData()
339       
340   
341    def getSpatialData(self):
342        '''
343        Retrieves the spatial data for the record; if this hasn't been discovered yet,
344        do the necessary parsing
345        @return: Coords object array with spatial data
346        '''
347        if self.stData is None:
348            self.getSpatioTemporalData()
349       
350        return self.stData.getSpatialData()
351       
352
353    def listify(self, item):
354        '''
355        listify checks if an item is a list, if it isn't it puts it
356        inside a list and returns it. Always returns a list object.
357        @param item: object to check
358        @return: item as a list object
359        '''
360        if type(item) is list:
361            return item
362        else:
363            return [item]
364       
365   
366    def getSpatioTemporalData(self):
367        '''
368        Extract spatio temporal data from the original document
369        '''
370        logging.info('Retrieving spatiotemporal info from moles file')
371        # initialise the various spatiotemporal arrays used to extract data to
372        self.stData = SpatioTemporalData()
373       
374        if self.dgMeta is None:
375            self.createMolesFile()
376           
377        # do quick checks to see if the relevant data exists
378        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
379            logging.info("No data summary elements found - assuming no spatiotemporal data available")
380            return
381       
382        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
383            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
384            return
385       
386        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
387            logging.info("No spatial coverage elements found - assuming no spatial data available")
388        else:
389            self.getCoordData(self.dgMeta)
390
391        #SJD error with line below- this is where 23/09/08 edit in PostgresDAO fudge sorts...
392        if not self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
393            logging.info("No temporal coverage elements found - assuming no temporal data available")
394        else:
395            self.getTimeRangeData(self.dgMeta)
396
397   
398    def getAuthorsInfo(self):
399        '''
400        Extract authors info from the moles file
401        '''
402        logging.info('Retrieving authors info from moles file')
403       
404        if self.dgMeta is None:
405            self.createMolesFile()
406           
407        logging.info("Extracting author info")
408        creators = ""
409        authors = ""
410        try:
411            # TODO: check this is the correct path for author data - NB, this is not obvious from example files
412            # nb, if this is correct, need to escape out the %20 and %3 characters else it doesn't work - see unescape.. fn
413            creators = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataRoles.dgDataCreator.dgRoleHolder.dgMetadataID.localIdentifier
414            logging.info("Found creator information - adding this to authors record")
415           
416        except Exception, detail:
417            logging.info("Exception thrown whilst trying to find creator information:")
418            logging.info(detail)
419            logging.info("- this suggests document does not contain creator information.")
420
421        try:
422            authors = self.dgMeta.dgMetadataRecord.dgMetadataDescription.abstract.abstractOnlineReference.dgCitation.authors
423            logging.info("Found cited author information - adding this to authors record")
424           
425        except Exception, detail:
426            logging.info("Exception thrown whilst trying to find cited author information:")
427            logging.info(detail)
428            logging.info("- this suggests document does not contain cited author information.")
429       
430        self.authors = authors + " " + creators
431        return self.authors
432   
433   
434    def getParametersInfo(self):
435        '''
436        Extract parameters info from the moles file
437        '''
438        logging.info('Retrieving parameters info from moles file')
439       
440        if self.dgMeta is None:
441            self.createMolesFile()
442           
443        params = ""
444        try:
445            # TODO: check this is the correct path for parameters data - NB, this is not obvious from example files
446            parameters = self.dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgParameterSummary.dgStdParameterMeasured
447            parameters_list = self.listify(parameters)
448            for parameter in parameters_list:
449                if parameters.dgValidTerm:
450                    logging.info("Found parameter information - adding this to record")
451                    params += " " + parameters.dgValidTerm
452           
453           
454        except Exception, detail:
455            logging.info("Exception thrown whilst trying to find parameter information:")
456            logging.info(detail)
457            logging.info("- this suggests document does not contain parameter information.")
458       
459        self.parameters = params
460        return self.parameters
461   
462   
463    def getScopeInfo(self):
464        '''
465        Extract scope info from the moles file
466        '''
467        logging.info('Retrieving scope info from moles file')
468       
469        if self.dgMeta is None:
470            self.createMolesFile()
471           
472        scope = ""
473        try:
474            keywords = self.dgMeta.dgMetadataRecord.dgStructuredKeyword
475            logging.info("Found keyword information - parsing this for scope")
476
477            keywords_list = self.listify(keywords)
478            for keyword in keywords_list:
479                if keyword.dgValidTermID:
480                    if keyword.dgValidTermID.ParentListID.strip().startswith(self.ndg_data_provider_vocab):
481                        logging.info("Found scope value - adding this to record")
482                        scope += " " + keyword.dgValidTerm.strip()
483           
484        except Exception, detail:
485            logging.info("Exception thrown whilst trying to find scope information:")
486            logging.info(detail)
487            logging.info("- this suggests document does not contain scope information.")
488
489        # NB, to_tsvector will remove any underscores -leading to, e.g. NERC_DDC becoming tokenised as 'NERC' and 'DDC'
490        # - to avoid this, use the following delimiter
491        self.scope = re.sub(r'_', 'UNDERSCORE', scope)
492        return self.scope
493           
494           
495    def getTimeRangeData(self, dgMeta):
496        '''
497        Parse an xml tree and add any time range data found
498        @param dgMeta: xml fragment for the time range
499        '''
500        logging.info("Extracting time range info")
501        try:
502            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
503           
504            if not dates:
505                logging.info("No temporal info found for document")
506               
507            dates_list = self.listify(dates)
508            for date in dates_list:
509                startdate=date.DateRangeStart
510                enddate= date.DateRangeEnd
511                if startdate==None or startdate=='None':
512                    startdate="null"
513                if enddate==None or enddate=='None':
514                    enddate="null"
515                   
516                self.stData.addTimeRange(startdate, enddate)
517                logging.info("Temporal info: startdate " + \
518                             startdate + ", enddate " + enddate) 
519        except Exception, detail:
520            logging.info("Document does not contain temporal info.")
521            logging.info(detail)
522
523       
524    def getCoordData(self, dgMeta):
525        '''
526        Parse an xml tree and add any coord data found
527        @param dgMeta: xml fragment for the bounding boxes
528        '''
529        logging.info("Extracting bounding box info")
530        try:
531
532            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
533           
534            if not bboxes:
535                logging.info("No bounding box info found for document")
536                return
537               
538            bbox_list=self.listify(bboxes)
539            #parse the list of coordinates
540            for bbox in bbox_list:
541                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
542                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
543                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
544                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
545                self.stData.addCoords(north, south, east, west)
546                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
547                    east + ", north " + north + "")
548               
549        except Exception, detail:
550            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
551                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)
552
553
554    def parseCoord(self, coordValue, minField, maxField):
555        '''
556        Take a coordinate value extracted from a molefile bbox limit - together with
557        the appropriate max/min limits and extract the correct value from it
558        @param coordValue: the contents of the bbox limit tage
559        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
560        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
561        @return: coord - the value of the coordinate as a string   
562        '''
563        logging.debug("Parsing document coordinates")
564        try:
565            coord = coordValue.strip()
566            if coord.endswith(maxField):
567                coord=coordValue.split(maxField)[0]
568            elif coord.endswith(minField):
569                if coord.startswith('-'):
570                    coord = coordValue.split(minField)[0]
571                else:
572                    coord = "-" + coordValue.split(minField)[0]
573   
574            return '%s' % float(coord)
575        except:
576            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue
577
578           
579    def hasNullCoords():
580        '''
581        Checks a record to determine whether it has any coordinates set to null
582        '''
583        if str(self.west)=='null' or \
584            str(self.south)=='null' or \
585            str(self.east)=='null' or \
586            str(self.north)=='null':
587            return True;
588        else:
589            return False;
590       
Note: See TracBrowser for help on using the repository browser.