source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py @ 3816

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/PostgresRecord.py@3816
Revision 3816, 14.6 KB checked in by cbyrom, 13 years ago (diff)

Add workflow to allow creation of the moles doc from whatever starting point
when creating a PostgresRecord; also add and extend logging support and
tidy up the code.

#!/usr/bin/env python
'''
Class representing the contents of a row in the metadata_record postgres DB table
C Byrom Apr 08
'''
try: # python 2.5
    from xml.etree import ElementTree as ET
except ImportError:
    try:
        # if you've installed it yourself it comes this way
        import ElementTree as ET
    except ImportError:
        # if you've egged it this is the way it comes
        from elementtree import ElementTree as ET
# Fix for the ElementTree namespace problem, whereby namespaces are otherwise
# rendered as ns0, ns1, ns2 etc.
#ET._namespace_map.update({'http://www.oceannet.org/mdip/xml': 'mdip', 'http://www.w3.org/1999/xlink':'xlink'})

import os, sys, logging
from ETxmlView import loadET, nsdumb
import molesReadWrite as MRW
# NB, addKeywords() below relies on the local keywordAdder and FileUtilities
# helpers - the import paths here are assumed
import keywordAdder
from FileUtilities import FileUtilities

class PostgresRecord:
    '''
    Class representing the contents of a row in the metadata_record postgres DB table
    @param filename: name of file to use as the metadata record
    @param datacentre_groups: datacentre groups to add to the moles doc as keywords
    @param datacentre_namespace: namespace of the datacentre - used as the repository id
    @param discovery_id: discovery ID of the record
    @param xq: xquery helper used to look up the transform queries
    @param docType: type of the original document, e.g. 'DIF' or 'MDIP'
    '''
    documentTypes = ['MOLES', 'DIF', 'DC', 'MDIP', 'ISO19139']

    def __init__(self, filename, datacentre_groups, datacentre_namespace, discovery_id, xq, docType):
        logging.info("Setting up Postgres record for file, " + filename)
        self.filename = filename
        self._datacentre_groups = datacentre_groups
        self._repository = datacentre_namespace
        self.discovery_id = discovery_id
        self._xq = xq
        self.docType = docType

        self._molesFormat = None    # initialise this, so we can guarantee a value - to avoid using getattr
        self._allDocs = []  # array to store all the transformed docs - for easy retrieval by the DAO

        # firstly load contents of file
        self.originalFormat = open(filename).read()

        # we use loadET to protect ourselves from scummy characters and unicode problems
        # DO WE NEED TO DO THIS??
        self.correctedFormat = loadET(self.originalFormat)

        # debugging stuff
#        self.logger.printOutput("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
#        print self.correctedFormat
#        print self.originalFormat.keys()
#        for i in self.originalFormat: print i.tag
#        print dir(self.originalFormat)
#        self.logger.printOutput("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
        # we use nsdumb in case the namespace causes difficulties ...
#        helper=nsdumb(self.originalFormat)
        #print helper
#        self.id=helper.getText(self.originalFormat,'DatasetIdentifier')

        # initialise the various record fields
        self.db_id = None    # the DB ID of the record, for easy reference when it is created
        self.molesFormat = None
        self.dcFormat = None
        self.mdipFormat = None
        self.iso19139Format = None

        # do some initial setting up of record
#        self.setUpRecord()
        self.doRecordTransforms()
        self.getSpatioTemporalData()

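    # NB, typical usage from calling ingest code is roughly:
    #     record = PostgresRecord(filename, groups, namespace, discoveryID, xq, 'DIF')
    #     docs = record.getAllDocs()
    # - the constructor immediately runs all the transforms and extracts the
    # spatiotemporal data, so the record is fully populated once created.  The
    # variable names above are illustrative only.
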
    def doRecordTransforms(self):
        '''
        Run the various transforms on the original doc, to populate the record with
        the other types of doc used elsewhere
        '''
        logging.info("Running transforms for all document types")
        for docType in self.documentTypes:
            self.getDocumentFormat(docType)
        logging.info("Transforms complete")

    def doTransform(self, xQueryType):
        '''
        Transform the record according to the specified XQuery type
        @param xQueryType: XQuery doc to use to do the transform
        @return: the metadata record in the required transformed format
        '''
        logging.info("Running XQuery transform, " + xQueryType + ", to create transformed document")

        # get the query and set this up to use properly
        xquery = self._xq.actual(xQueryType, self.filename, self._repository, self.discovery_id)

        # sort out the input ID placeholders
        xquery = xquery.replace('Input_Entry_ID', self.discovery_id)
        xquery = xquery.replace('repository_localid', self._repository)

        # write the query to file, to make it easier to input
        # NB, running directly at the command line leads to problems with the interpretation of $ characters
        xqFile = "currentQuery.xq"
        f = open(xqFile, 'w')
        f.write(xquery)
        f.close()

        # now run the transform with the saxon java client
#        xqCommand = "java -cp /home/users/cbyrom/opt/saxonsa/saxon9sa.jar:/home/users/cbyrom/opt/saxonsa net.sf.saxon.Query " + \
        os.putenv('PATH', ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.')
        xqCommand = "java -cp saxon9.jar net.sf.saxon.Query " + xqFile
        logging.debug("Running saxon command: " + xqCommand)
        pipe = os.popen(xqCommand + " 2>&1")
        output = pipe.read()
        status = pipe.close()

        logging.debug("Saxon output: " + output)
        logging.debug("Saxon exit status: " + str(status))
        if status is not None:
            sys.exit("Failed at running the XQuery")

        # now remove the temp xquery file
        # NB, os.unlink returns None on success and raises OSError on failure
        try:
            os.unlink(xqFile)
        except OSError:
            sys.exit("Failed to remove the temporary xquery file, " + xqFile)

        logging.info("Transform completed successfully")

        return output

    def doMolesTransform(self):
        '''
        Set up the basic moles doc - according to the type of document first ingested
        '''
        logging.info("Creating moles document - for use with other transforms")
        xqName = None
        if self.docType == "DIF":
            xqName = "dif2moles"
        elif self.docType == "MDIP":
            xqName = "mdip2moles"
        else:
            sys.exit("ERROR: No XQuery exists to transform input document type, %s, into moles format" \
                     %self.docType)

        # now run the appropriate transform and set the attribute
        self._molesFormat = self.doTransform(xqName)

        # add keywords, if required - NB, this operates on the newly created moles doc,
        # so must run after the transform
        if self._datacentre_groups != "":
            self.addKeywords()

        logging.info("moles document created")

    def addKeywords(self):
        '''
        If datacentre groups have been specified, these need to be added as keywords
        - NB, this is a rather clumsy approach but uses old code to achieve the result
        '''
        logging.info("Adding datacentre keywords to moles file")
        # NB, use temporary directories to do the keyword additions
        tmpDir = os.getcwd() + "/"
        tmpKeywordsDir = os.getcwd() + "/keywordsAdded/"
        fileUtils = FileUtilities(self.logger)
        fileUtils.setUpDir(tmpDir)
        fileUtils.setUpDir(tmpKeywordsDir)
        tmpFile = 'tmpFile.xml'
        f = open(tmpDir + tmpFile, 'w')
        f.write(self._molesFormat)
        f.close()

        keywordAdder.main(tmpDir, tmpKeywordsDir, self._datacentre_groups)

        # now load in the converted file
        f = open(tmpKeywordsDir + tmpFile, 'r')
        self._molesFormat = f.read()
        f.close()

        # finally, tidy up temp dirs
        fileUtils.cleanDir(tmpDir)
        fileUtils.clearDir(tmpKeywordsDir)
        logging.info("Completed adding keywords")

    def getDocumentFormat(self, docType):
        '''
        Lookup document format; if it is already defined then return it, else do the required XQuery
        transform.  NB, transforms are run on the molesFormat document - so ensure this is available
        @param docType: format of document to return
        '''
        logging.info("Retrieving document type, " + docType)
        xqName = {'DIF':'moles2dif', 'MOLES':'moles', 'DC':'moles2DC', 'MDIP':'moles2mdip', 'ISO19139':'moles2iso19139'}[docType]
        attributeName = {'DIF':'_difFormat', 'MOLES':'_molesFormat', 'DC':'_dcFormat', 'MDIP':'_mdipFormat', 'ISO19139':'_iso19139Format'}[docType]

        # check we have the moles format available; if not create it
        if self._molesFormat is None:
            self.doMolesTransform()

        # check the document isn't already defined
        doc = getattr(self, attributeName, None)
        if doc is not None:
            logging.info("Found existing document - returning this now")
            return doc

        # the doc type doesn't exist - so run the xquery and store the result
        logging.info("Creating new transformed document")
        doc = self.doTransform(xqName)
        setattr(self, attributeName, doc)
        return doc

    def getAllDocs(self):
        '''
        Return a list of all the available doc types in the record
        '''
        if len(self._allDocs) > 0:
            return self._allDocs

        for docType in self.documentTypes:
            self._allDocs.append([docType, self.getDocumentFormat(docType)])
        return self._allDocs

    def listify(self, item):
        '''
        listify checks if an item is a list; if it isn't, it puts it
        inside a list and returns it.  Always returns a list object.
        @param item: object to check
        @return: item as a list object
        '''
        if isinstance(item, list):
            return item
        else:
            return [item]

    def getSpatioTemporalData(self):
        '''
        Extract spatiotemporal data from the original document
        '''
        ET._namespace_map.update({'http://ndg.nerc.ac.uk/moles': 'moles', 'http://www.w3.org/1999/xlink':'xlink'})
        no_bbox = False
        no_dates = False
        self.east = 'null'
        self.west = 'null'
        self.north = 'null'
        self.south = 'null'
        self.startdate = 'nostartdate'
        self.enddate = 'noenddate'

        dgMeta = MRW.dgMetadata()
        try:
            dgMeta.fromXML(ET.ElementTree(file=self.filename).getroot())
        except:
            logging.warning("Cannot parse the XML moles document %s. Will not process" %self.filename)
            return
        try:
            bbox_list = self.listify(dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgSpatialCoverage.BoundingBox)
        except AttributeError:
            logging.info("XML moles document " + self.filename + \
                " does not contain a bounding box.")
            no_bbox = True

        try:
            dates = dgMeta.dgMetadataRecord.dgDataEntity.dgDataSummary.dgDataCoverage.dgTemporalCoverage.DateRange
            logging.debug("startdate = %s" %dates.DateRangeStart)
            logging.debug("enddate = %s" %dates.DateRangeEnd)
        except AttributeError:
            logging.info("XML moles document " + self.filename + " does not contain temporal info.")
            no_dates = True

        if no_bbox and no_dates:
            logging.info("XML moles document " + self.filename + " does not contain any spatiotemporal info.")
            return

        if not no_dates:
            startdate = dates.DateRangeStart
            enddate = dates.DateRangeEnd
            if startdate is None or startdate == 'None':
                startdate = "nostartdate"
            if enddate is None or enddate == 'None':
                enddate = "noenddate"
            self.startdate = startdate
            self.enddate = enddate

        if not no_bbox:
            # parse the coordinates somewhat - only use the first bounding box
            # NB, limits ending in 'W' or 'S' are converted to negative decimal
            # values, e.g. '10.5W' -> '-10.5'
            bbox = bbox_list[0]
            try:
                west = bbox.LimitWest.strip()
            except AttributeError:
                logging.error("Will not process file %s. Contains incorrect West bounding box limit." %self.filename)
                return
            if west.endswith('E'):
                west = west.split('E')[0]
            elif west.endswith('W'):
                if west.startswith('-'):
                    west = west.split('W')[0]
                else:
                    west = "-" + west.split('W')[0]
            try:
                float(west)
            except ValueError:
                logging.error("Will not process file %s. Contains incorrect West bounding box limit." %self.filename)
                return
            self.west = west

            try:
                east = bbox.LimitEast.strip()
            except AttributeError:
                logging.error("Will not process file %s. Contains incorrect East bounding box limit." %self.filename)
                return
            if east.endswith('E'):
                east = east.split('E')[0]
            elif east.endswith('W'):
                if east.startswith('-'):
                    east = east.split('W')[0]
                else:
                    east = "-" + east.split('W')[0]
            try:
                float(east)
            except ValueError:
                logging.error("Will not process file %s. Contains incorrect East bounding box limit." %self.filename)
                return
            self.east = east

            try:
                north = bbox.LimitNorth.strip()
            except AttributeError:
                logging.error("Will not process file %s. Contains incorrect North bounding box limit." %self.filename)
                return
            if north.endswith('N'):
                north = north.split('N')[0]
            elif north.endswith('S'):
                if north.startswith('-'):
                    north = north.split('S')[0]
                else:
                    north = "-" + north.split('S')[0]
            try:
                float(north)
            except ValueError:
                logging.error("Will not process file %s. Contains incorrect North bounding box limit." %self.filename)
                return
            self.north = north

            try:
                south = bbox.LimitSouth.strip()
            except AttributeError:
                logging.error("Will not process file %s. Contains incorrect South bounding box limit." %self.filename)
                return
            if south.endswith('N'):
                south = south.split('N')[0]
            elif south.endswith('S'):
                if south.startswith('-'):
                    south = south.split('S')[0]
                else:
                    south = "-" + south.split('S')[0]
            try:
                float(south)
            except ValueError:
                logging.error("Will not process file %s. Contains incorrect South bounding box limit." %self.filename)
                return
            self.south = south

        logging.info("Spatial info: west = " + self.west + ", south = " + self.south + ", east = " + \
                    self.east + ", north = " + self.north)
        logging.info("Temporal info: startdate = " + self.startdate + ", enddate = " + self.enddate)

    def hasNullCoords(self):
        '''
        Return True if any of the bounding box coordinates are null
        '''
        if str(self.west) == 'null' or \
            str(self.south) == 'null' or \
            str(self.east) == 'null' or \
            str(self.north) == 'null':
            return True
        else:
            return False

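# -----------------------------------------------------------------------------
# NB, the block below is an illustrative sketch only and is not used by
# PostgresRecord.  It shows, in isolation, the bounding box limit normalisation
# applied in getSpatioTemporalData(): limits such as '10.5W' or '20S' are
# converted to plain signed decimal strings.  The function name and the demo
# values are hypothetical.
def _normaliseLimitSketch(value, positiveSuffix, negativeSuffix):
    '''
    Strip a hemisphere suffix from a bounding box limit and apply the
    appropriate sign, e.g. ('10.5W', 'E', 'W') -> '-10.5'
    '''
    value = value.strip()
    if value.endswith(positiveSuffix):
        value = value.split(positiveSuffix)[0]
    elif value.endswith(negativeSuffix):
        if value.startswith('-'):
            value = value.split(negativeSuffix)[0]
        else:
            value = "-" + value.split(negativeSuffix)[0]
    float(value)    # raises ValueError if the limit is not numeric
    return value

if __name__ == "__main__":
    # simple demonstration of the normalisation rules sketched above
    print _normaliseLimitSketch('10.5W', 'E', 'W')    # -> -10.5
    print _normaliseLimitSketch('10.5E', 'E', 'W')    # -> 10.5
    print _normaliseLimitSketch('-20S', 'N', 'S')     # -> -20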