source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3839

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3839
Revision 3839, 15.2 KB checked in by cbyrom, 12 years ago (diff)

Add script to run ingest for all available config files.
Make oai_ingest_new2 a proper object.
Adjust db_funcs - now pass in details to set up database connection

  • although defaults available, if not done.

Simplify coord parsing in PostgresRecord by using a reusable function.
+ various tidy ups and fixes.

Line 
1#!/usr/bin/env python
2'''
3Class representing the contents of a row in the metadata_record postgres DB table
4C Byrom Apr 08
5'''
6try: #python 2.5
7    from xml.etree import cElementTree
8except ImportError:
9    try:
10        # if you've installed it yourself it comes this way
11        import cElementTree
12    except ImportError:
13        # if you've egged it this is the way it comes
14        from ndgUtils.elementtree import cElementTree
15
16import os, sys, logging
17#from ETxmlView import loadET, nsdumb
18import molesReadWrite as MRW
19from ndgUtils.ndgObject import ndgObject
20from FileUtilities import FileUtilities
21
class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table.

    On construction the original metadata file is read, the XQuery transforms for
    every entry in documentTypes are run and the spatio-temporal coverage is
    extracted from the moles file written alongside the input file.

    @param filename: Name of file to use as a metadata record
    @param ndg_dataprovider: if true, the local/repository ids are taken from the
    ndgObject built from discovery_id; otherwise discovery_id and
    datacentre_namespace are used directly
    @param datacentre_groups: datacentre groups to add as keywords to the moles doc
    @param datacentre_namespace: namespace of the datacentre (used as repository id)
    @param discovery_id: discovery id of the record
    @param xq: object providing the xqueries via its actual() method
    @param docType: format of the original document - 'DIF' or 'MDIP'
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'ISO19139']#, 'MDIP']

    # lookup of the xquery used to produce each document type from the moles doc
    _xqNames = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC',
                'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}

    # lookup of the instance attribute in which each document type is cached
    _attributeNames = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat',
                       'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}

    def __init__(self, filename, ndg_dataprovider, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        '''
        Read in the original document, then run all transforms and extract the
        spatio-temporal data - see the class documentation for the parameters
        '''
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename

        # NB, if we're dealing with an NDG data provider, the details are slightly different
        if ndg_dataprovider:
            discObj = ndgObject(discovery_id)
            self._local_id = discObj.localID
            self._repository_local_id = discObj.repository
        else:
            self._local_id = discovery_id
            self._repository_local_id = datacentre_namespace

        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        self._fileUtils = FileUtilities()

        # get the dir of the file - needed by the xquery to use as the target collection
        pathParts = filename.split('/')
        self._dir = '/'.join(pathParts[:-1])
        self._shortFilename = pathParts[-1]

        # dir to store a temp copy of the moles file, when produced - for use by other transforms
        self._molesDir = None

        # firstly load contents of file - making sure the file handle is released
        inputFile = open(filename)
        try:
            self.originalFormat = inputFile.read()
        finally:
            inputFile.close()

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
        self.doRecordTransforms()
        self.getSpatioTemporalData()


    def doRecordTransforms(self):
        '''
        Run various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)

        logging.info("Transforms complete")


    def createMolesFile(self):
        '''
        Write the moles version of the record to a 'moles' subdirectory of the
        input file's directory - to allow for use in the various xqueries.  If the
        moles transform has not been run yet, it is run first.
        '''
        logging.info("Creating moles file on system - for use with other xquery transforms")
        self._molesDir = self._dir + "/moles/"
        self._fileUtils.setUpDir(self._molesDir)

        if self._molesFormat is None:
            self.doMolesTransform()

        self._fileUtils.createFile(self._molesDir + self._shortFilename, self._molesFormat)
        logging.info("Moles file created - at %s" %self._molesDir)


    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type.  NB, the
        process is exited (sys.exit) if the saxon command fails or the temporary
        query file cannot be removed.
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + " to create transformed document")

        # firstly, check if this is a moles -> something else query; if so, ensure there is a valid
        # moles file available for the transform - and use the correct dir for the xquery collection
        targetDir = self._dir
        if xQueryType.find('moles2') > -1:
            if self._molesDir is None:
                self.createMolesFile()

            targetDir = self._molesDir

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, targetDir, self._repository_local_id, self._local_id)

        # sort out the input ID stuff
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # strip out the eXist reference to the libraries; these files should be available in the
        # running dir - as set up by oai_ingest.py
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Vocabs/', '')
        xquery = xquery.replace('xmldb:exist:///db/xqueryLib/Utilities/', '')

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        self._fileUtils.createFile(xqFile, xquery)

        # Now do the transform
        os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp ./lib/saxon9.jar net.sf.saxon.Query " + xqFile + " !omit-xml-declaration=yes"
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        # close() returns None on success and the exit status otherwise
        if status is not None:
            # stderr was redirected into the output, so log it to help diagnose the failure
            logging.error("XQuery transform %s failed; saxon output was: %s", xQueryType, output)
            sys.exit("Failed at running the XQuery")

        # now remove the temp xquery file; os.unlink signals failure by raising
        # OSError rather than via its return value
        try:
            os.unlink(xqFile)
        except OSError:
            sys.exit("Failed to remove the temporary xquery file, " + xqFile)

        logging.info("Transform completed successfully")
        return output


    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested.
        NB, the process is exited (sys.exit) if no xquery exists for the document type.
        '''
        logging.info("Creating moles document - for use with other transforms")
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required; this has to happen after the transform since
        # addKeywords works on - and replaces - the moles document itself
        if self._datacentre_groups:
            self.addKeywords()

        logging.info("moles document created")


    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is rather clumsy approach but uses old code to achieve the result.
        The current moles document is written to a temporary file, passed through
        keywordAdder and the result read back in as the new moles document.
        '''
        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/kewordsAdded/"
        self._fileUtils.setUpDir(tmpDir)
        self._fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        self._fileUtils.createFile(tmpDir + "/" + tmpFile, self._molesFormat)

        # NOTE(review): keywordAdder is not imported in the visible part of this
        # module - confirm it is brought into scope elsewhere
        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # Now load in the converted file
        keywordsFile = open(tmpKeywordsDir + "/" + tmpFile, 'r')
        try:
            self._molesFormat = keywordsFile.read()
        finally:
            keywordsFile.close()

        # Finally, tidy up temp dirs
        # NOTE(review): two different FileUtilities methods (cleanDir / clearDir)
        # are used here and tmpDir is the current working dir - confirm both are intended
        self._fileUtils.cleanDir(tmpDir)
        self._fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")


    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are ran on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        @return: the document in the requested format
        @raise KeyError: if docType is not a known document type
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = self._xqNames[docType]
        attributeName = self._attributeNames[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()
            self.createMolesFile()

        # check the document isn't already defined; the attribute may not have
        # been created at all yet, hence the default
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        logging.info("Document not available - creating new transformed document")

        # the doc type doesn't exist - so run the xquery, cache and return the result
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc


    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        @return: list of [docType, document] pairs - one per entry in documentTypes
        '''
        if not self._allDocs:
            for docType in self.documentTypes:
                self._allDocs.append([docType, self.getDocumentFormat(docType)])

        return self._allDocs


    def listify(self, item):
        '''
        listify checks if an item is a list, if it isn't it puts it
        inside a list and returns it. Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        return [item]


    def getSpatioTemporalData(self):
        '''
        Extract spatio temporal data from the moles version of the document and
        store it on the record as west/south/east/north and startdate/enddate.
        Where data is missing, the coords are left as 'null' and the dates as
        'nostartdate'/'noenddate'.  Only the first bounding box is used.
        '''
        self.east = 'null'
        self.west = 'null'
        self.north = 'null'
        self.south = 'null'
        self.startdate = 'nostartdate'
        self.enddate = 'noenddate'

        # the moles file is needed on disk to parse - make sure it has been produced
        if self._molesDir is None:
            self.createMolesFile()
        molesFile = self._molesDir + self._shortFilename

        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(cElementTree.ElementTree(file=molesFile).getroot())
        except Exception:
            logging.warning("Cannot parse the XML moles document %s. Will not process" %molesFile)
            logging.debug(sys.exc_info()[1])
            return

        bbox_list = []
        try:
            bbox_list = self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
        except Exception:
            logging.info("XML moles document " + molesFile + " does not contain a bounding box.")
            logging.debug(sys.exc_info()[1])
        no_bbox = len(bbox_list) == 0

        no_dates = False
        startdate = None
        enddate = None
        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
            logging.debug("startdate = %s", startdate)
            logging.debug("enddate = %s", enddate)
        except Exception:
            logging.info("XML moles document " + molesFile + " does not contain temporal info.")
            no_dates = True

        if no_bbox and no_dates:
            logging.info("XML moles document " + molesFile + " does not contain any spatiotemporal info.")
            return

        if not no_dates:
            if startdate is None or startdate == 'None':
                startdate = "nostartdate"
            if enddate is None or enddate == 'None':
                enddate = "noenddate"
            self.startdate = startdate
            self.enddate = enddate

        if not no_bbox:
            #parse the coordinates somewhat - only use the first bounding box.
            bbox = bbox_list[0]
            limits = (('west', 'LimitWest', 'West', 'W', 'E'),
                      ('east', 'LimitEast', 'East', 'W', 'E'),
                      ('north', 'LimitNorth', 'North', 'S', 'N'),
                      ('south', 'LimitSouth', 'South', 'S', 'N'))
            for attrName, limitName, label, minField, maxField in limits:
                try:
                    setattr(self, attrName,
                            self.parseCoord(getattr(bbox, limitName), minField, maxField))
                except Exception:
                    logging.error("Will not process File %s. Contains incorrect %s bounding box limit." \
                                  %(molesFile, label))
                    return

        # NB, use formatting rather than concatenation - the dates are not guaranteed to be strings
        logging.info("Spatial info: west= %s,south %s, east %s, north %s",
                     self.west, self.south, self.east, self.north)
        logging.info("Temporal info: startdate %s, enddate %s", self.startdate, self.enddate)


    def parseCoord(self, coordValue, minField, maxField):
        '''
        Take a coordinate value extracted from a molefile bbox limit - together with
        the appropriate max/min limits and extract the correct value from it
        @param coordValue: the contents of the bbox limit tag
        @param minField: the expected min field of the coord range - i.e. 'W' or 'S'
        @param maxField: the expected max field of the coord range - i.e. 'E' or 'N'
        @return: coord - the value of the coordinate as a string
        @raise ValueError: if the remaining value cannot be converted to a float
        '''
        # NB, work on the stripped value throughout so surrounding whitespace
        # cannot end up between the sign and the number
        coord = coordValue.strip()
        if coord.endswith(maxField):
            coord = coord.split(maxField)[0]
        elif coord.endswith(minField):
            # values in the min direction are negative - add the sign if it is missing
            coord = coord.split(minField)[0]
            if not coord.startswith('-'):
                coord = "-" + coord

        return '%s' % float(coord)


    def hasNullCoords(self):
        '''
        Checks a record to determine whether it has any coordinates set to null
        @return: True if any of west, south, east or north is 'null', False otherwise
        '''
        coords = (self.west, self.south, self.east, self.north)
        for coord in coords:
            if str(coord) == 'null':
                return True
        return False
Note: See TracBrowser for help on using the repository browser.