source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3860

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3860
Revision 3860, 16.5 KB checked in by cbyrom, 12 years ago (diff)

Add new function, escapeSpecialCharacters(), to PostgresRecord to correct
the input docs and moles docs - so that they don't contain unescaped
apostrophes - to avoid errors when running the PL/SQL stuff.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging, re
17import molesReadWrite as MRW
18from ndgUtils.ndgObject import ndgObject
19from FileUtilities import FileUtilities
20from SpatioTemporalData import SpatioTemporalData
21
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table.

    On construction the input file is loaded, apostrophes are escaped, the
    document is transformed (via XQuery) into each of the formats listed in
    documentTypes and any spatiotemporal data is extracted from the moles doc.

    @param filename: name of the file to use as the metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to
        obtain the local ID and repository
    @param datacentre_groups: datacentre groups to add as keywords; "" for none
    @param datacentre_namespace: namespace of the datacentre (stored as the repository)
    @param discovery_id: discovery ID of the record
    @param xq: object providing an actual() method which returns the xquery text
    @param docType: format of the input document - 'DIF' or 'MDIP' can be
        transformed into moles format
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        pathParts = filename.split('/')
        self._dir = '/'.join(pathParts[:-1])
        self._shortFilename = pathParts[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the file handle is always released
        inputFile = open(filename)
        try:
            contents = inputFile.read()
        finally:
            inputFile.close()

        # escape any apostrophes
        self.originalFormat = self.escapeSpecialCharacters(contents)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None
        self.scn = 1    # system change number - keeps track of number of mods to a particular row

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def escapeSpecialCharacters(self, inputString):
        '''
        Adjust the input string to escape any characters that would interfere with string or DB
        operations.  Currently every apostrophe is prefixed with a backslash.
        NB, the input is not checked for existing escapes - an already escaped
        apostrophe gains a further backslash.
        @param inputString: string to correct
        @return: corrected string
        '''
        return inputString.replace("'", "\\'")


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles format of the record to a 'moles' sub directory of the
        input file's directory - to allow for use in the various xqueries.
        If the moles transform has not been ran yet, it is ran first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command exits with a non-zero status
        @raise OSError: if the temporary xquery file cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        # close() returns None when the command exited with status 0
        status = pipe.close()

        if status is not None:
            # stderr was redirected into the output, so log it to show why the query failed
            logging.error("XQuery command failed with status %s. Output:\n%s", status, output)
            raise SystemError('Failed at running the XQuery')

        # now remove the temp xquery file; os.unlink raises OSError itself on failure
        os.unlink(xqFile)

        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested.
        The result is stored, with apostrophes escaped, in self._molesFormat.
        NB, exits the program (via sys.exit) if the input docType is neither DIF nor MDIP
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required - NB, this works on the contents of self._molesFormat
        # so can only be done once the transform has been ran
        if self._datacentre_groups != "":
            self.addKeywords()

        # escape any apostrophes
        self._molesFormat = self.escapeSpecialCharacters(self._molesFormat)

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        Reads and then replaces the contents of self._molesFormat
        '''
        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        # NOTE(review): tmpDir is the current working directory and is passed to cleanDir
        # below - confirm that this cannot remove files needed elsewhere
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + "/" + tmpFile, self._molesFormat)

        # NOTE(review): keywordAdder is not imported in the visible part of this module -
        # confirm that it is made available before this method is used
        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        f = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = f.read()
        finally:
            f.close()

        # Finally, tidy up temp dirs
        # NOTE(review): cleanDir and clearDir are both used here - confirm that FileUtilities
        # provides both methods
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not a recognised document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined; the attribute may not have been set at all
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery, store the result and hand it back
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - cached after the first call
        '''
        if len(self._allDocs) > 0:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles file of the record and store
        it in self.stData
        @raise SystemError: if the moles file cannot be parsed
        '''
        # initialise the various spatiotemporal arrays used to extract data to
        self.stData = SpatioTemporalData()

        # the moles file is produced on demand - so ensure it exists before trying to read it
        if self._molesDir is None:
            self.createMolesFile()

        molesFile = self._molesDir + self._shortFilename
        logging.info('Retrieving spatiotemporal info from moles file, %s' %molesFile)

        # load in the moles file and put this into an object for direct access to the xml elements
        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            # NB, sys.exc_info is used as it is valid on both python 2.5 and python 3
            detail = sys.exc_info()[1]
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

        # do quick checks to see if the relevant data exists
        dataSummary = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary
        if not dataSummary:
            logging.info("No data summary elements found - assuming no spatiotemporal data available")
            return

        dataCoverage = dataSummary.dgDataCoverage
        if not dataCoverage:
            logging.info("No data coverage elements found - assuming no spatiotemporal data available")
            return

        if not dataCoverage.dgSpatialCoverage:
            logging.info("No spatial coverage elements found - assuming no spatial data available")
        else:
            self.getCoordData(dgMeta)

        if not dataCoverage.dgTemporalCoverage:
            logging.info("No temporal coverage elements found - assuming no temporal data available")
        else:
            self.getTimeRangeData(dgMeta)


    def getTimeRangeData(self, dgMeta):
        '''
        Parse an xml tree and add any time range data found
        NB, problems are logged rather than raised
        @param dgMeta: xml fragment for the time range
        '''
        logging.info("Extracting time range info")
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange

            if not dates:
                logging.info("No temporal info found for document")
                return

            for date in self.listify(dates):
                startdate = date.DateRangeStart
                enddate = date.DateRangeEnd
                # missing dates are stored as the string "null"
                if startdate is None or startdate == 'None':
                    startdate = "null"
                if enddate is None or enddate == 'None':
                    enddate = "null"

                self.stData.addTimeRange(startdate, enddate)
                logging.info("Temporal info: startdate %s, enddate %s", startdate, enddate)
        except Exception:
            # NB, sys.exc_info is used as it is valid on both python 2.5 and python 3
            detail = sys.exc_info()[1]
            logging.info("Document does not contain temporal info.")
            logging.info(detail)


    def getCoordData(self, dgMeta):
        '''
        Parse an xml tree and add any coord data found
        NB, problems - including invalid bounding box limits - are logged as
        warnings rather than raised
        @param dgMeta: xml fragment for the bounding boxes
        '''
        logging.info("Extracting bounding box info")
        try:
            bboxes = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox

            if not bboxes:
                logging.info("No bounding box info found for document")
                return

            #parse the list of coordinates
            for bbox in self.listify(bboxes):
                north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
                east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.stData.addCoords(north, south, east, west)
                logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                             west, south, east, north)

        except Exception:
            # NB, sys.exc_info is used as it is valid on both python 2.5 and python 3
            detail = sys.exc_info()[1]
            logging.warning("Problem encountered whilst parsing bounding box info - this may lead \n" + \
                            "to an incomplete set of metadata being ingested. \nDetail: %s" %detail)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it.
        A value ending in the max field is returned as is; a value ending in the
        min field is returned as a negative number.
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a number
        '''
        logging.debug("Parsing document coordinates")
        try:
            # NB, work on the stripped value throughout - so surrounding whitespace
            # cannot end up between the sign and the number
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord.split(maxField)[0]
            elif coord.endswith(minField):
                coord = coord.split(minField)[0]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except (AttributeError, TypeError, ValueError):
            # use formatting rather than concatenation, so non string values can be reported
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' \
                              % (coordValue,))


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        NB, the west, south, east and north attributes are not set by this class -
        presumably they are set by the calling code; TODO confirm
        @return: True if any of the four coordinates is 'null', False otherwise
        '''
        return str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null'

Note: See TracBrowser for help on using the repository browser.