source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3853

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3853
Revision 3853, 16.1 KB checked in by cbyrom, 13 years ago (diff)

Update the Python scripts to call the new stored procedures, to handle SCN
(system change number) values in the DB properly, and to report the status
of each record update/create so that the 'number of ingested records' result
can be displayed at the end. Also include a method to delete existing
spatiotemporal data when doing updates, and add additional logging.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging
17#from ETxmlView import loadET, nsdumb
18import molesReadWrite as MRW
19from ndgUtils.ndgObject import ndgObject
20from FileUtilities import FileUtilities
21from SpatioTemporalData import SpatioTemporalData
22
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table.

    On construction the original document is read from file, transformed into
    each of the formats listed in documentTypes (via XQuery) and the
    spatiotemporal data is extracted from the resulting moles document.

    NB, the code is kept compatible with python 2.5 (no 'with' statements or
    'except ... as' clauses).
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        '''
        Set up the record, run all the document transforms and extract the
        spatiotemporal data
        @param filename: Name of file to use as the metadata record
        @param ndg_dataprovider: if true, the local and repository IDs are taken
        from the ndgObject built from discovery_id, rather than from
        discovery_id/datacentre_namespace directly
        @param datacentre_groups: groups to add as keywords to the moles doc; an
        empty string means no keywords are added
        @param datacentre_namespace: namespace of the data centre
        @param discovery_id: ID of the record
        @param xq: object whose actual() method returns the text of a named xquery
        @param docType: format of the input document - "DIF" or "MDIP"
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        pathParts = filename.split('/')
        self._dir = '/'.join(pathParts[:-1])
        self._shortFilename = pathParts[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - closing the handle explicitly
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the
        input file's directory - to allow for use in the various xqueries.  If
        the moles transform has not been ran yet, it is ran first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command returns a non-zero exit status
        @raise OSError: if the temporary xquery file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        collectionDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            collectionDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        try:
            # Now do the transform
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # NB, close() returns None on success and the exit status otherwise
            status = pipe.close()
        finally:
            # always remove the temp xquery file - even if the transform failed;
            # os.unlink raises OSError itself if the removal fails
            os.unlink(xqFile)

        if status is not None:
            # include the command output (stderr is merged in) to help diagnose the failure
            raise SystemError('Failed at running the XQuery (exit status %s). Output:\n%s' \
                              %(status, output))

        logging.info("Transform completed successfully")
        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested.
        NB, exits (via sys.exit) if the input document type is not DIF or MDIP
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required - NB, this must be done after the transform
        # since addKeywords works on (and replaces) the contents of _molesFormat
        if self._datacentre_groups != "":
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The current moles doc is written to a temp file, processed by keywordAdder
        and then read back in to replace the moles doc.
        '''
        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        # NOTE(review): tmpDir is the current working dir itself and is cleaned
        # at the end of this method - confirm this is intended
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + "/" + tmpFile, self._molesFormat)

        # NOTE(review): keywordAdder is not imported in the visible part of this
        # file - confirm the import exists, otherwise this raises a NameError
        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        convertedFile = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = convertedFile.read()
        finally:
            convertedFile.close()

        # Finally, tidy up temp dirs
        # NOTE(review): two different method names are used here (cleanDir and
        # clearDir) - verify both exist on FileUtilities
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return - one of 'DIF', 'MOLES', 'DC',
        'MDIP' or 'ISO19139'
        @return: the document in the requested format
        @raise KeyError: if docType is not one of the known formats
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined; NB, the attribute may not
        # exist at all yet, so default it to None
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery, store the result and return it
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one per entry in documentTypes.
        NB, the list is built once and then reused on later calls
        '''
        if len(self._allDocs) > 0:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store it in self.stData
        @raise SystemError: if the moles file cannot be parsed
        '''
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            # NB, use exc_info to retrieve the error - for python 2.5 compatibility
            detail = sys.exc_info()[1]
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

        # do quick checks to see if the relevant data exists
        dataSummary = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not dataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        dataCoverage = dataSummary.dgDataCoverage
        if not dataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not dataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not dataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found to self.stData.
        NB, any problems encountered are logged rather than raised
        @param dgMeta: moles metadata object containing the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                # missing dates are passed on as the string "null"
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception:
            logging.info("Document does not contain temporal info.")
            logging.info(sys.exc_info()[1])


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found to self.stData.
        NB, any problems encountered are logged as warnings rather than raised
        @param dgMeta: moles metadata object containing the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s",
                            sys.exc_info()[1])


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it.
        A trailing maxField is dropped; a trailing minField is dropped and the
        value made negative (unless it already is), e.g. '45S' -> '-45.0'
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a coordinate
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - so surrounding
            # whitespace cannot end up between the sign and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord[:len(coord) - len(maxField)]
            elif coord.endswith(minField):
                coord = coord[:len(coord) - len(minField)]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except Exception:
            # use %s formatting so that non-string values (e.g. None) can be reported too
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' \
                              %(coordValue,))


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NB, the west/south/east/north attributes are not set anywhere in the
        visible part of this class - so they must have been set on the record
        before calling this (an AttributeError results otherwise)
        @return: True if any of west, south, east or north is 'null', False otherwise
        '''
        for limit in ('west', 'south', 'east', 'north'):
            if str(getattr(self, limit)) == 'null':
                return True
        return False
Note: See TracBrowser for help on using the repository browser.