source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3869

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3869
Revision 3869, 17.5 KB checked in by cbyrom, 13 years ago

Add code to delete records if they weren't properly ingested - to allow
clean reruns of the ingest + fix some problems with the keyword adding
code.

#!/usr/bin/env python
'''
Class representing the contents of a row in the metadata_record postgres DB table
C Byrom Apr 08
'''
try: #python 2.5
    from xml.etree import cElementTree
except ImportError:
    try:
        # if you've installed it yourself it comes this way
        import cElementTree
    except ImportError:
        # if you've egged it this is the way it comes
        from ndgUtils.elementtree import cElementTree

import os, sys, logging, re
import molesReadWrite as MRW
from ndgUtils.ndgObject import ndgObject
from FileUtilities import FileUtilities
from SpatioTemporalData import SpatioTemporalData
import keywordAdder

class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table
    @param filename: Name of file to use as the metadata record
    @param ndg_dataprovider: flag indicating whether the record comes from an NDG data provider
    @param datacentre_groups: datacentre groups to add to the record as keywords, if any
    @param datacentre_namespace: namespace of the datacentre providing the record
    @param discovery_id: discovery ID of the record
    @param xq: xquery helper object used to look up the transform queries
    @param docType: format of the original ingested document, e.g. 'DIF' or 'MDIP'
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj=ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        tmp = filename.split('/')
        self._dir = '/'.join(tmp[0:len(tmp)-1])
        self._shortFilename = tmp[len(tmp)-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file
        self.originalFormat = file(filename).read()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # spatiotemporal data object
        self.stData = None

    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any apostrophes that would interfere with string or DB
        operations
        @param inputString: string to correct
        @return: corrected string
        '''
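        # e.g. "it's" becomes "it\'s" - NB, only apostrophes are escaped here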
        return re.sub(r'\'', '\\\'', inputString)


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Check if a moles file exists on the system; if not, assume the moles transform has not
        been run and then produce this file - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        dir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            dir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, dir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery=xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery=xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery=xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
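        # the resulting command has the form:
        #   java -cp ./lib/saxon9.jar net.sf.saxon.Query currentQuery.xq !omit-xml-declaration=yes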
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        if status is not None:
            raise SystemError, 'Failed at running the XQuery'

        # now remove the temp xquery file - NB, os.unlink signals failure by raising, not by a return value
        try:
            os.unlink(xqFile)
        except OSError:
            raise OSError, 'Failed to remove the temporary xquery file, ' + xqFile

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")

        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/tmp/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f = open(tmpKeywordsDir + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # Finally, tidy up temp dirs
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
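        # map the requested docType to the xquery used to produce it and to the attribute the
        # result is cached on - e.g. 'DC' uses the 'moles2DC' xquery and is stored as self._dcFormat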
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        try:
            doc = getattr(self, attributeName)
            if doc is not None:
                logging.info("Found existing document - returning this now")
                return doc
        except AttributeError:
            logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery
        transformedDoc = self.doTransform(xqName)
        setattr(self, attributeName, transformedDoc)
        return transformedDoc


    def getAllDocs(self):
        '''
        Return a list of [docType, document] pairs for all the available doc types in the record
        '''
        # if the stored docs array is the same size as the array of all doc types
        # assume all transforms have been done - and just return these
        if len(self._allDocs) == len(self.documentTypes):
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs


    def getTemporalData(self):
        '''
        Retrieves the temporal data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: TimeRange object array with temporal data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getTemporalData()


    def getSpatialData(self):
        '''
        Retrieves the spatial data for the record; if this hasn't been discovered yet,
        do the necessary parsing
        @return: Coords object array with spatial data
        '''
        if self.stData is None:
            self.getSpatioTemporalData()

        return self.stData.getSpatialData()


    def listify(self, item):
        '''
        Check whether an item is a list; if it isn't, wrap it in a list.
        Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
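        # e.g. listify('a') -> ['a'], listify(['a', 'b']) -> ['a', 'b']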
        if type(item) is list:
            return item
        else:
            return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatiotemporal data from the moles version of the document
        '''
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta=MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception, detail:
            raise SystemError, 'Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail)

        # do quick checks to see if the relevant data exists
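        # the checks below walk the moles element path:
        #   dgMetadataRecord/dgDataEntity/dgDataSummary/dgDataCoverage/dgSpatialCoverage (BoundingBox)
        #   dgMetadataRecord/dgDataEntity/dgDataSummary/dgDataCoverage/dgTemporalCoverage (DateRange)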
        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            dates_list = self.listify(dates)
            for date in dates_list:
                startdate=date.DateRangeStart
                enddate= date.DateRangeEnd
                if startdate==None or startdate=='None':
                    startdate="null"
                if enddate==None or enddate=='None':
                    enddate="null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate " + \
                             startdate + ", enddate " + enddate)
        except Exception, detail:
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:

            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            bbox_list=self.listify(bboxes)
            #parse the list of coordinates
            for bbox in bbox_list:
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= " + west + ",south " + south + ", east " + \
                    east + ", north " + north + "")

        except Exception, detail:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a moles file bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        '''
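        # e.g. parseCoord('45.5N', 'S', 'N') -> '45.5'; parseCoord('30.0S', 'S', 'N') -> '-30.0'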
        logging.debug("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            # NB, split the stripped value so stray whitespace can't break the float conversion below
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                if coord.startswith('-'):
                    coord = coord.split(minField)[0]
                else:
                    coord = "-" + coord.split(minField)[0]

            return '%s' % float(coord)
        except:
            raise SyntaxError, 'Will not process File: contains incorrect bounding box limit: ' + coordValue


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        '''
        if str(self.west)=='null' or \
            str(self.south)=='null' or \
            str(self.east)=='null' or \
            str(self.north)=='null':
            return True
        else:
            return False

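
# Illustrative usage sketch (not part of the original module).  The file path, datacentre
# details and discovery id below are hypothetical placeholders, and 'xq' stands in for the
# xquery helper object (providing the actual() method used in doTransform) that the real
# ingest scripts supply.
#
#if __name__ == "__main__":
#    xq = ...    # hypothetical: xquery helper passed in by the ingest scripts
#    record = PostgresRecord("/data/example_record.xml", False, "", "example.datacentre.ac.uk",
#                            "example.datacentre.ac.uk__DIF__dataent_1", xq, "DIF")
#    record.doRecordTransforms()
#    for docType, doc in record.getAllDocs():
#        print docType, len(doc)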