source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3821

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3821
Revision 3821, 16.1 KB checked in by cbyrom, 12 years ago (diff)

Fix a few problems - including referencing the xquery libraries; these
have now been added to the ndgUtils egg and are extracted locally and
referenced directly. Also add functionality to deal with the moles -> other
transforms + add new utility methods and tidy up code and add more logging.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import ElementTree as ET
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import ElementTree as ET
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from elementtree import ElementTree as ET
15#this is a fix to the  ElementTree namespace problem that namespaces are usually represented as ns0, ns1, ns2 etc.
16#ET._namespace_map.update({'http://www.oceannet.org/mdip/xml': 'mdip', 'http://www.w3.org/1999/xlink':'xlink'})
17
18import os, sys, logging
19from ETxmlView import loadET, nsdumb
20import molesReadWrite as MRW
21from ndgUtils.ndgObject import ndgObject
22from FileUtilities import FileUtilities
23
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table

    @param filename: Name of file to use as the metadata record
    @param ndg_dataprovider: if true, discovery_id is parsed with ndgObject to obtain
    the local ID and repository; otherwise discovery_id and datacentre_namespace
    are used directly
    @param datacentre_groups: datacentre groups to add as keywords to the moles doc
    (nothing is added if this is empty)
    @param datacentre_namespace: namespace of the datacentre providing the record
    @param discovery_id: discovery ID of the record
    @param xq: object whose actual() method returns the text of a named XQuery
    @param docType: format of the input file; only 'DIF' and 'MDIP' can be
    transformed into moles format
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'MDIP', 'ISO19139']

    # lookups used by getDocumentFormat: doc type -> xquery name / instance attribute
    _xqueryNames = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}
    _attributeNames = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        # (everything before the last '/' is the dir, everything after is the file name)
        self._dir, _separator, self._shortFilename = filename.rpartition('/')

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the handle is closed again
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # we use loadET to protect ourselves from scummy characters and unicode problems
        # DO WE NEED TO DO THIS??
        self.correctedFormat = loadET(self.originalFormat)

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created

        # the transformed docs, as looked up by getDocumentFormat; initialise them all
        # so there is always a value available
        self._molesFormat = None
        self._difFormat = None
        self._dcFormat = None
        self._mdipFormat = None
        self._iso19139Format = None

        # public names set by earlier versions of this class; retained so that any
        # code reading them does not break - NB, they are never populated here
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the
        input file's directory - to allow for use in the various xqueries.  If the
        moles transform has not been run yet, it is run first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        @raise SystemExit: if the XQuery fails or the temporary query file
        cannot be removed
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        collectionDir = self._dir
        if 'moles2' in xQueryType:
            if self._molesDir is None:
                self.createMolesFile()

            collectionDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, collectionDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp saxon9.jar net.sf.saxon.Query " + xqFile
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        # NB, close() returns None when the command exited successfully
        status = pipe.close()

        # now remove the temp xquery file - whether or not the transform worked, so a
        # failed run does not leave it lying around.  os.unlink signals failure by
        # raising OSError (it always returns None)
        removedQueryFile = True
        try:
            os.unlink(xqFile)
        except OSError:
            removedQueryFile = False

        if status is not None:
            # stderr was redirected into the output, so this is the saxon error text
            logging.error("XQuery transform, %s, failed with output:\n%s", xQueryType, output)
            sys.exit("Failed at running the XQuery")

        if not removedQueryFile:
            sys.exit("Failed to remove the temporary xquery file, " + xqFile)

        logging.debug("XQuery output:\n%s", output)
        logging.info("Transform completed successfully")

        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        @raise SystemExit: if there is no XQuery to turn the input doc type into moles
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = {'DIF': 'dif2moles', 'MDIP': 'mdip2moles'}.get(self.docType)
        if xqName is None:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required - NB, this must happen after the transform since
        # addKeywords reads and then rewrites the moles doc
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The moles doc is written to a temp file, processed by keywordAdder and then
        read back in from the keywords dir.
        '''
        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        # NOTE(review): tmpDir is the current working dir itself, so the tidy up at the
        # end of this method is applied to the working dir - confirm this is intended
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(os.path.join(tmpDir, tmpFile), self._molesFormat)

        # NOTE(review): keywordAdder is not imported anywhere in the visible part of
        # this module - an import needs adding before this call can work
        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        convertedFile = open(os.path.join(tmpKeywordsDir, tmpFile), 'r')
        try:
            self._molesFormat = convertedFile.read()
        finally:
            convertedFile.close()

        # Finally, tidy up temp dirs
        # NOTE(review): two different method names are used here (cleanDir / clearDir);
        # check which one FileUtilities actually provides
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return - one of documentTypes
        @return: the document in the requested format
        @raise KeyError: if docType is not a recognised document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = self._xqueryNames[docType]
        attributeName = self._attributeNames[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery, store the result and return it
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one for each entry in documentTypes
        '''
        # the list is only built once; after that the stored copy is returned
        if len(self._allDocs) > 0:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs


    @staticmethod
    def listify(item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    @staticmethod
    def _parseLimit(rawLimit, positiveSuffix, negativeSuffix):
        '''
        Turn a bounding box limit, eg '10E', '10W' or ' -10 ', into a signed number string
        @param rawLimit: limit value as read from the document
        @param positiveSuffix: trailing letter that is just dropped ('E' or 'N')
        @param negativeSuffix: trailing letter meaning the value is negative ('W' or 'S')
        @return: the limit as a string that float() accepts
        @raise AttributeError: if rawLimit is not a string (eg None)
        @raise ValueError: if the limit is not a valid number
        '''
        limit = rawLimit.strip()
        if limit.endswith(positiveSuffix):
            limit = limit[:-len(positiveSuffix)].strip()
        elif limit.endswith(negativeSuffix):
            limit = limit[:-len(negativeSuffix)].strip()
            # avoid a double negative for values like '-10W'
            if not limit.startswith('-'):
                limit = "-" + limit

        float(limit)    # validation only - the value is stored as a string
        return limit


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the original document.  Sets the attributes
        west, south, east, north (default 'null') and startdate, enddate (defaults
        'nostartdate' / 'noenddate').  Only the first bounding box is used and the
        coordinates are only set if all four limits are valid.
        '''
        ET._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
        no_bbox = False
        no_dates = False
        self.east = 'null'
        self.west = 'null'
        self.north = 'null'
        self.south = 'null'
        self.startdate='nostartdate'
        self.enddate='noenddate'

        dgMeta=MRW.dgMetadata()
        try:
            # NB, use the ElementTree implementation imported by this module
            dgMeta.fromXML(ET.ElementTree(file=self.filename).getroot())
        except Exception:
            logging.warning("WARNING: Cannot parse the XML moles document %s. Will not process" %self.filename)
            return

        # any failure walking down the metadata structure means the info is not there
        try:
            bbox_list = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox
            if not isinstance(bbox_list, list):
                bbox_list = [bbox_list]
            if not bbox_list:
                raise ValueError("empty bounding box list")
        except Exception:
            logging.info("XML moles document " + self.filename + \
                " does not contain a bounding box.")
            no_bbox=True

        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
            logging.debug("startdate = %s", startdate)
            logging.debug("enddate = %s", enddate)
        except Exception:
            logging.info("XML moles document " + self.filename + " does not contain temporal info.")
            no_dates=True

        if no_bbox and no_dates:
            logging.info("XML moles document " + self.filename + " does not contain any spatiotemporal info.")
            return

        if not no_dates:
            if startdate is None or startdate == 'None':
                startdate = "nostartdate"
            if enddate is None or enddate == 'None':
                enddate = "noenddate"
            self.startdate = startdate
            self.enddate = enddate

        if not no_bbox:
            #parse the coordinates somewhat - only use the first bounding box.
            bbox = bbox_list[0]
            limits = {}
            for name, attribute, positiveSuffix, negativeSuffix in (
                    ('West', 'LimitWest', 'E', 'W'),
                    ('East', 'LimitEast', 'E', 'W'),
                    ('North', 'LimitNorth', 'N', 'S'),
                    ('South', 'LimitSouth', 'N', 'S')):
                try:
                    limits[name] = self._parseLimit(getattr(bbox, attribute), positiveSuffix, negativeSuffix)
                except (AttributeError, TypeError, ValueError):
                    logging.error("ERROR:  Will not process File %s. Contains incorrect %s bounding box limit." \
                                  %(self.filename, name))
                    return

            self.west = limits['West']
            self.east = limits['East']
            self.north = limits['North']
            self.south = limits['South']

        logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                     self.west, self.south, self.east, self.north)
        logging.info("Temporal info: startdate %s, enddate %s", self.startdate, self.enddate)


    def hasNullCoords(self):
        '''
        Check whether any of the bounding box coordinates is still unset
        @return: True if any of west, south, east or north is 'null', False otherwise
        '''
        coords = [str(coord) for coord in (self.west, self.south, self.east, self.north)]
        return 'null' in coords
Note: See TracBrowser for help on using the repository browser.