source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3847

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3847
Revision 3847, 16.0 KB checked in by cbyrom, 12 years ago (diff)

Create new object, SpatioTemporalData, to wrap temporal and spatial data
+ change PostgresRecord to include all spatiotemporal data from a moles file
using this new object.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging
17#from ETxmlView import loadET, nsdumb
18import molesReadWrite as MRW
19from ndgUtils.ndgObject import ndgObject
20from FileUtilities import FileUtilities
21from SpatioTemporalData import SpatioTemporalData
22
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table.

    On construction the original document is read from disk, transformed (via
    XQuery) into every format listed in documentTypes, and the spatiotemporal
    data is extracted from the resulting moles file.

    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to
    obtain the local and repository IDs
    @param datacentre_groups: datacentre groups to add as keywords to the moles
    doc; an empty value means no keywords are added
    @param datacentre_namespace: namespace of the datacentre
    @param discovery_id: discovery ID of the record
    @param xq: object supplying the XQuery text via its actual() method
    @param docType: format of the input document - 'DIF' or 'MDIP'
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # lookup tables: document type -> XQuery name / attribute holding the doc
    _xqNames = {'DIF': 'moles2dif', 'MOLES': 'moles', 'DC': 'moles2DC',
                'MDIP': 'moles2mdip', 'ISO19139': 'moles2iso19139'}
    _formatAttributes = {'DIF': '_difFormat', 'MOLES': '_molesFormat',
                         'DC': '_dcFormat', 'MDIP': '_mdipFormat',
                         'ISO19139': '_iso19139Format'}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        # initialise every transformed-doc attribute, so we can guarantee a value
        self._molesFormat = None
        self._difFormat = None
        self._dcFormat = None
        self._mdipFormat = None
        self._iso19139Format = None
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        # (everything before the last '/' is the dir; the remainder is the file name)
        self._dir, _sep, self._shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the handle is closed again
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        # NB, these un-prefixed fields are not used within this class; they are
        # retained in case external code reads them
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the
        input file's directory - to allow for use in the various xqueries.
        If the moles transform has not been run yet, it is run first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" % self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command exits with a non-zero status
        @raise OSError: if the temporary xquery file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        try:
            # Now do the transform
            # NB, the PATH below is hard-coded and site specific
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # close() returns None on success, else the exit status of the command
            status = pipe.close()
        finally:
            # always remove the temp xquery file - even if the transform failed.
            # os.unlink raises OSError itself on failure, so no status check is needed
            if os.path.exists(xqFile):
                os.unlink(xqFile)

        if status is not None:
            # stderr was redirected into the output, so log it to aid diagnosis
            logging.error("XQuery transform failed; output was:\n%s", output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")
        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested.
        NB, exits the program (via sys.exit) if the input document type has no
        xquery available to convert it to moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     % self.docType)

        # run the appropriate transform and set the attribute; this must be done
        # before adding keywords, since these are added to the moles doc itself
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The moles doc is written to a scratch dir, processed by keywordAdder into
        a second scratch dir and the result is read back into the record
        '''
        import shutil
        import tempfile
        # NOTE(review): keywordAdder is not imported at module level; it is assumed
        # to be the keywordAdder module alongside this one - confirm
        import keywordAdder

        logging.info("Adding datacentre keywords to moles file")
        # NB, use dedicated temporary directories to do the keyword additions, so
        # that nothing in the current working dir is processed or removed
        tmpFile = 'tmpFile.xml'
        tmpDir = tempfile.mkdtemp() + "/"
        tmpKeywordsDir = tempfile.mkdtemp() + "/"
        try:
            self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

            keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

            # Now load in the converted file
            f = open(tmpKeywordsDir + tmpFile, 'r')
            try:
                self._molesFormat = f.read()
            finally:
                f.close()
        finally:
            # Finally, tidy up temp dirs - ignoring any errors whilst doing so
            shutil.rmtree(tmpDir, True)
            shutil.rmtree(tmpKeywordsDir, True)

        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not a recognised document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = self._xqNames[docType]
        attributeName = self._formatAttributes[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        # the doc type doesn't exist - so run the xquery, store and return the result
        logging.info("Document not available - creating new transformed document")
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each entry in documentTypes
        '''
        if self._allDocs:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store it in self.stData
        @raise SystemError: if the moles file cannot be parsed
        '''
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # make sure the moles file has been written out before trying to read it
        if self._molesDir is None:
            self.createMolesFile()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' % molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            # NB, sys.exc_info is used to get the error detail so that this
            # clause parses on both old and new versions of python
            detail = sys.exc_info()[1]
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' % (molesFile, detail))

        # do quick checks to see if the relevant data exists
        summary = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not summary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        coverage = summary.dgDataCoverage
        if not coverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not coverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not coverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        NB, this is best effort: any problem encountered is logged, not raised
        @param dgMeta: moles metadata object to retrieve the time range info from
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                if startdate is None or startdate == 'None':
                    startdate = "nostartdate"
                if enddate is None or enddate == 'None':
                    enddate = "noenddate"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception:
            logging.info("Document does not contain temporal info.")
            logging.info(sys.exc_info()[1])


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        NB, this is best effort: any problem encountered - including an invalid
        bounding box limit reported by parseCoord - is logged as a warning, not raised
        @param dgMeta: moles metadata object to retrieve the bounding boxes from
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception:
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" % sys.exc_info()[1])


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it.
        A value ending in the max field is taken as positive; a value ending in the
        min field is made negative (unless it already is), e.g. '10S' gives '-10.0'
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a coordinate
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - otherwise leading
            # whitespace ends up between the added '-' and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                coord = coord.split(minField)[0]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except (AttributeError, TypeError, ValueError):
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' % coordValue)


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NOTE(review): the west/south/east/north attributes are not set anywhere
        in this class - confirm they are set by the caller before this is used
        @return: True if any of the four coordinates has the string value 'null'
        '''
        for limit in (self.west, self.south, self.east, self.north):
            if str(limit) == 'null':
                return True
        return False
Note: See TracBrowser for help on using the repository browser.