source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3846

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3846
Revision 3846, 14.5 KB checked in by cbyrom, 12 years ago (diff)

Adjust DAO and Record classes to throw errors rather than catching them

  • to allow processing of multiple files (wrapped by oai_ingest)

to continue more cleanly.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging
17#from ETxmlView import loadET, nsdumb
18import molesReadWrite as MRW
19from ndgUtils.ndgObject import ndgObject
20from FileUtilities import FileUtilities
21
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table.

    On construction the input file is read, transformed (via XQuery) into each of
    the formats listed in documentTypes, and the spatio-temporal coverage is
    extracted from the resulting moles document.

    @param filename: Name of file to use a metadata record
    @param ndg_dataprovider: if true, the local and repository IDs are taken from
    the parsed discovery_id rather than from the discovery_id/namespace directly
    @param datacentre_groups: datacentre groups to add as keywords (empty for none)
    @param datacentre_namespace: namespace of the datacentre, used as the repository
    @param discovery_id: discovery ID of the record
    @param xq: XQuery helper - must provide actual(queryType, dir, repositoryID, localID)
    @param docType: format of the input file; 'DIF' and 'MDIP' can be converted to moles
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # document type -> (xquery used to produce it, attribute caching the result)
    _docTypeInfo = {'DIF': ('moles2dif', '_difFormat'),
                    'MOLES': ('moles', '_molesFormat'),
                    'DC': ('moles2DC', '_dcFormat'),
                    'MDIP': ('moles2mdip', '_mdipFormat'),
                    'ISO19139': ('moles2iso19139', '_iso19139Format')}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        # initialise every cached format, so we can guarantee a value for each of them
        self._molesFormat = None
        self._difFormat = None
        self._dcFormat = None
        self._mdipFormat = None
        self._iso19139Format = None
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        self._dir = os.path.dirname(filename)
        self._shortFilename = os.path.basename(filename)

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the handle is closed again
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # initialise the various record fields
        # NOTE(review): the public *Format fields below are never populated in this
        # class - the transformed docs are held in the underscore-prefixed attributes
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory next to the
        input file - running the moles transform first, if this has not been done
        yet - to allow for use in the various xqueries
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        # NB, keep the trailing separator - the filename is appended directly to this
        self._molesDir = os.path.join(self._dir, "moles", "")
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemError: if the saxon command reports a failure
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if 'moles2' in xQueryType:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        try:
            os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
            xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
            logging.debug("Running saxon command: " + xqCommand)
            pipe = os.popen(xqCommand + " 2>&1")
            output = pipe.read()
            # close() returns None on success, else the exit status of the command
            status = pipe.close()
        finally:
            # always remove the temp xquery file - whether or not the query worked
            # NB, os.unlink signals failure by raising OSError itself
            if os.path.exists(xqFile):
                os.unlink(xqFile)

        if status is not None:
            # stderr was redirected into the output, so log it to help diagnose the failure
            logging.error("XQuery transform %s failed with status %s - output:\n%s",
                          xQueryType, status, output)
            raise SystemError('Failed at running the XQuery')

        logging.info("Transform completed successfully")
        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise ValueError: if there is no XQuery to convert the input doc type to moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = {'DIF': 'dif2moles', 'MDIP': 'mdip2moles'}.get(self.docType)
        if xqName is None:
            # raise, rather than exit, so the calling code can carry on with other files
            raise ValueError("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                             %self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required - NB, this works on the moles doc, so must
        # be done after the transform has been ran
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The current moles doc is replaced by the version with the keywords added.
        '''
        # local imports - these are only needed here
        import shutil
        import tempfile
        # NOTE(review): keywordAdder was referenced, but never imported, by the original
        # code - this assumes it is a module available on the python path; confirm
        import keywordAdder

        logging.info("Adding datacentre keywords to moles file")
        # NB, use private temporary directories to do the keyword additions, so no
        # other files are picked up and the tidy up cannot remove anything else
        tmpFile = 'tmpFile.xml'
        tmpBase = tempfile.mkdtemp()
        try:
            tmpDir = tmpBase + "/input/"
            tmpKeywordsDir = tmpBase + "/keywordsAdded/"
            os.mkdir(tmpDir)
            os.mkdir(tmpKeywordsDir)
            self._fileUtils.createFile(tmpDir + tmpFile, self._molesFormat)

            # presumably writes a same-named file, with keywords added, to the second dir
            keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

            # Now load in the converted file
            f = open(tmpKeywordsDir + tmpFile, 'r')
            try:
                self._molesFormat = f.read()
            finally:
                f.close()
        finally:
            # Finally, tidy up temp dirs
            shutil.rmtree(tmpBase, ignore_errors=True)

        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the record in the specified format
        @raise KeyError: if the docType is not a known document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName, attributeName = self._docTypeInfo[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        # the doc type doesn't exist - so run the xquery, and store the result for next time
        logging.info("Document not available - creating new transformed document")
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each entry in documentTypes
        '''
        if not self._allDocs:
            # build up the full list before storing it - so a failed transform
            # cannot leave a partial list behind
            self._allDocs = [[docType, self.getDocumentFormat(docType)]
                             for docType in self.documentTypes]
        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and store
        it in the west/east/north/south and startdate/enddate fields.  Coords which are
        not available are left as 'null'; missing dates are left as 'nostartdate' and
        'noenddate'.
        NB, if several bounding boxes are present, the values of the last one are used
        @raise SystemError: if the moles document cannot be parsed
        '''
        self.east = 'null'
        self.west = 'null'
        self.north = 'null'
        self.south = 'null'
        self.startdate = 'nostartdate'
        self.enddate = 'noenddate'

        # ensure the moles file is available to read from
        if self._molesDir is None:
            self.createMolesFile()
        molesFile = self._molesDir + self._shortFilename

        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            # NB, sys.exc_info is used to get the detail in a python-version neutral way
            detail = sys.exc_info()[1]
            raise SystemError('Cannot parse the XML moles document %s. Detail:\n%s' %(molesFile, detail))

        no_bbox = False
        no_dates = False

        try:
            logging.info("Extracting bounding box info")
            bbox_list = self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
            #parse the list of coordinates
            for bbox in bbox_list:
                self.west = self.parseCoord(bbox.LimitWest, 'W', 'E')
                self.east = self.parseCoord(bbox.LimitEast, 'W', 'E')
                self.north = self.parseCoord(bbox.LimitNorth, 'S', 'N')
                self.south = self.parseCoord(bbox.LimitSouth, 'S', 'N')
            if not bbox_list:
                no_bbox = True
        except Exception:
            detail = sys.exc_info()[1]
            logging.info("XML moles document " + molesFile + " does not contain a bounding box.")
            logging.debug(detail)
            no_bbox = True
            # don't leave a partly filled in bounding box behind
            self.east = self.west = self.north = self.south = 'null'

        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
        except Exception:
            logging.info("XML moles document " + molesFile + " does not contain temporal info.")
            no_dates = True
        else:
            logging.debug("startdate = %s", startdate)
            logging.debug("enddate = %s", enddate)
            # an undefined date may come through as None or as the string 'None'
            if startdate is not None and startdate != 'None':
                self.startdate = startdate
            if enddate is not None and enddate != 'None':
                self.enddate = enddate

        if no_bbox and no_dates:
            logging.info("XML moles document " + molesFile + " does not contain any spatiotemporal info.")
            return

        logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                     self.west, self.south, self.east, self.north)
        logging.info("Temporal info: startdate %s, enddate %s", self.startdate, self.enddate)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise SyntaxError: if the value cannot be interpreted as a coordinate
        '''
        logging.info("Parsing document coordinates")
        try:
            coord = coordValue.strip()
            if coord.endswith(maxField):
                coord = coord[:-len(maxField)]
            elif coord.endswith(minField):
                # a min field value lies on the negative side of the axis
                coord = coord[:-len(minField)]
                if not coord.startswith('-'):
                    coord = "-" + coord

            return '%s' % float(coord)
        except Exception:
            # NB, use formatting for the message, so non-string values are reported properly
            raise SyntaxError('Will not process File: contains incorrect bounding box limit: %s' %(coordValue,))


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        @return: True if any of the west, south, east or north fields is 'null'
        '''
        coords = (self.west, self.south, self.east, self.north)
        return 'null' in [str(coord) for coord in coords]
368       
Note: See TracBrowser for help on using the repository browser.