source: exist/trunk/python/ndgUtils/lib/granulite.py @ 4582

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/granulite.py@4582
Revision 4582, 22.9 KB checked in by cbyrom, 11 years ago (diff)

Add logic to cope with changes to the Atom structure - to allow granulite files and logos data to be stored + improve error handling.

Line 
1#!/usr/bin/env python
2'''
3Data model representing a granulite file - together with utilities to
4augment atom data with the contents of that file
5
6 @author: C Byrom, Tessella Nov 08
7'''
8import os, sys, string, getopt, logging, re, datetime, cgi
9from ndgUtils.eXistConnector import eXistConnector
10from ndgUtils.models import Atom
11import ndgUtils.lib.utilities as utilities
12from ndgUtils.vocabtermdata import VocabTermData as VTD
13import ndgUtils.lib.existdbclient as edc
14
class granulite(object):
    '''
    Granulite data model

    Parses a granulite file (or its contents), applies the data found to a
    granule Atom, ingests any associated CSML/CDML file and stores the
    results via the eXist DB client.
    '''
    # expected sections in a granulite file
    AUTHORS_SECTION = "authors"
    FILE_SECTION = "files"
    PARAMETERS_SECTION = "parameters"
    COVERAGE_SECTION = "coverage"
    CSML_SECTION = "csml_file"
    CDML_SECTION = "cdml_file"
    GRANULE_INFO_SECTION = "granule_info"
    GRANULE_AUTHORS_SECTION = "granule_authors"
    SUMMARY_SECTION = "summary"
    DATA_ENTITY_SECTION = "data_entity_id"
    LOGO_SECTION = "logo"
    RELATED_LINKS_SECTION = "related_links"

    # need to distinguish coverage data between spatial and temporal data - use these variables to do so
    TEMPORAL_DATA = "temporal_data"
    SPATIAL_DATA = "spatial_data"

    # group the data together as either single valued or array valued - to ease setting attributes on Atom
    singleVals = {}
    arrayVals = {AUTHORS_SECTION: 'authors',
                 FILE_SECTION: 'files',
                 GRANULE_AUTHORS_SECTION: 'atomAuthors',
                 SUMMARY_SECTION: 'summary'}

    # config file with eXist DB details
    DBCONFIG_FILE = "exist.config"

    # default title given to CSML files by csmlscan
    DEFAULT_CSML_TITLE = "NAME OF DATASET GOES HERE"

    # flag to use when running in test mode
    TEST_MODE = "granulite_test_mode"

    # eXist DB client
    _eXist = None

    # flag to mark mode of operation
    _isOverride = False

    # info on specified CDML + CSML
    _cdmlFileName = None
    _cdmlTimeAxis = None
    _datasetID = None
    _csmlFileName = None

    # info on the datasets to attach the granule to
    # NB, this class-level list is only ever rebound per instance, never
    # mutated in place - so it is not shared between instances
    _dataEntityIDs = []

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"


    def __init__(self, granulite, granuleAtom = None,
                 eXistClient = None, csmlOrCdmlFile = None,
                 aggregateCoverage = False, useCSMLID = False,
                 timeAxis = 'time', datasetID = None):
        '''
        Constructor for granulite object - NB, the csml/cdml keywords allow use
        in a webapp - outside of this, this data should be specified in the granulite
        file

        @param granulite: either the granulite filename or contents
        @keyword granuleAtom: an existing granule atom - to add granulite data to
        - NB, if not specified, a new atom is used
        @keyword eXistClient: an eXist connection - if not specified, a new client
        is created using the DBCONFIG_FILE config file
        @keyword csmlOrCdmlFile: a cgi.FieldStorage object with CSML or CDML data
        @keyword aggregateCoverage: if set to True, only coverage data that extends the
        atom coverage data will be added - NB, this keyword is accepted but is not
        referenced anywhere else in this class
        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
        this should only be True if creating a new atom - e.g. from a granulite
        @keyword timeAxis: if using a CDML file, specify the time axis to use - 'time'
        is the default
        @keyword datasetID: if using a CDML file, specify the ID of the dataset to use
        - otherwise one will be randomly generated
        @raise ValueError: if csmlOrCdmlFile is not a cgi.FieldStorage, if its
        content is not recognised as CSML/CDML, or if granuleAtom is not an Atom
        '''
        logging.info("Creating granulite data model")

        self._granulite = granulite

        # set up connection to eXist
        if eXistClient:
            self._eXist = eXistClient
        else:
            self._eXist = edc.eXistDBClient(configFile = self.DBCONFIG_FILE)

        # override CSML/CDML data specified in the granulite file with data input directly
        self.ingestGranuliteFiles = True
        if csmlOrCdmlFile is not None:
            if not isinstance(csmlOrCdmlFile, cgi.FieldStorage):
                raise ValueError("Specified CSML/CDML file is not a cgi.FieldStorage object")
            self.__addCSMLOrCDMLData(csmlOrCdmlFile.filename, csmlOrCdmlFile.value)
            self.ingestGranuliteFiles = False

        if granuleAtom:
            if not isinstance(granuleAtom, Atom.Atom):
                raise ValueError("Specified granule atom is not an Atom object")
            self._atom = granuleAtom
        else:
            # create a skeleton granule
            self._initialiseGranule()

        self.useCSMLID = useCSMLID
        self._cdmlTimeAxis = timeAxis
        self._datasetID = datasetID

        logging.info("Granulite data model set up")


    def _getSectionName(self, str):
        '''
        Checks an input string to see if it contains a section title; if so
        return this title name, otherwise return None
        @param str: string to parse for section name - NB, the parameter name
        shadows the builtin but is retained for backwards compatibility
        @return: the text before the '::' delimiter if the string contains
        exactly one '::', None otherwise
        '''
        if str.count('::') != 1:
            return None
        return str.partition('::')[0]


    def _readGranuliteLines(self):
        '''
        Return the granulite data as a list of lines
        - if the granulite input contains a line break, it is treated as the
        file contents; otherwise it is treated as the name of a file to read
        @return: list of (unstripped) lines
        @raise ValueError: if the input is treated as a filename and no such
        file exists
        '''
        # assume we've already read in the file if it is multi-line
        # - cope with unix and dos end of lines; NB, for '\r\n' data the
        # trailing '\r' is removed when each line is stripped during parsing
        for newLine in ('\n', '\r'):
            if self._granulite.find(newLine) > -1:
                return self._granulite.split(newLine)

        # Check this file exists
        if not os.path.isfile(self._granulite):
            raise ValueError("ERROR: Could not find the config file, %s; please specify " \
                     "a valid file" %self._granulite)

        # use a context manager so the file is closed even if the read fails
        with open(self._granulite, "r") as granulite_file:
            return granulite_file.readlines()


    def _parseGranuliteLines(self, lines):
        '''
        Group the lines of a granulite file by the section they belong to
        @param lines: iterable of lines from the granulite file
        @return: dictionary keyed on section name, with a list of the
        (stripped) lines in each section; NB, lines in the coverage section
        are stored under the SPATIAL_DATA or TEMPORAL_DATA keys instead
        '''
        granulite_data = {}

        # variable to indicate what section we're currently parsing in the granulite file data
        sectionName = ""
        for line in lines:
            line = line.strip()
            # avoid comments
            if line.startswith('#'):
                continue

            # avoid empty lines - except in the summary section where these may be appropriate
            if not line and sectionName != self.SUMMARY_SECTION:
                continue

            # if a section name is returned, we're at a new section of the file - so change the dictionary key
            section = self._getSectionName(line)
            if section:
                sectionName = section
                continue

            # cope with case where there is rubbish header info in the file
            if not sectionName:
                continue

            # initialise dictionary array if required
            sectionData = granulite_data.setdefault(sectionName, [])

            if sectionName != self.COVERAGE_SECTION:
                logging.debug("Adding %s data: -%s-" %(sectionName, line))
                sectionData.append(line)
                continue

            # if it's coverage data, determine if it is spatial or temporal data
            coverageType = self.SPATIAL_DATA
            if line.lower().find('time') > -1:
                coverageType = self.TEMPORAL_DATA

            logging.debug("Adding %s data: %s" %(coverageType, line))
            granulite_data.setdefault(coverageType, []).append(line)

        return granulite_data


    def _getGranuliteDetails(self):
        '''
        Load the granulite config file and extract the required data, setting
        this on the granule atom
        @raise ValueError: if the granulite file cannot be found, has no
        granule_info section, has no provider ID, specifies both a CSML and
        a CDML file, or has coverage data in an unsupported format
        '''
        logging.info("Retrieving data from granulite config file")
        granuliteData = self._readGranuliteLines()

        # initialise vocab term data lookup object
        self.VTD = VTD()

        # create a dictionary of all data in file - then use this to get the required data
        logging.info("Parsing granulite file...")
        granulite_data = self._parseGranuliteLines(granuliteData)
        logging.info("Finished parsing granulite file")

        logging.info("Setting up input data")
        # add the single value attributes to the granule
        for attribute in self.singleVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.singleVals[attribute], granulite_data[attribute][0])

        # NB, explicitly set the related links before running the arrayVals loop, since
        # this will replace any file Links created if it occurs after these have been
        # created (was originally in arrayVals, but this doesn't enforce any specific ordering)
        if self.RELATED_LINKS_SECTION in granulite_data:
            self._atom.setAttribute('relatedLinks',
                                    granulite_data[self.RELATED_LINKS_SECTION])

        # now add the arrays data
        for attribute in self.arrayVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.arrayVals[attribute], granulite_data[attribute])

        # add the general granule info
        if self.GRANULE_INFO_SECTION not in granulite_data:
            raise ValueError("Need granule_info section in granulite input file")

        data = utilities.getTripleData(granulite_data[self.GRANULE_INFO_SECTION][0])
        if not data[0]:
            raise ValueError("Provider ID is missing for granule; please add this info to the " + \
                             self.GRANULE_INFO_SECTION + " section of the granulite config file")
        self._atom.addMolesEntityData(None, data[0], None)
        self._atom.setDatasetID(data[1])
        self._atom.title = data[2]

        if self.LOGO_SECTION in granulite_data:
            for logo in granulite_data[self.LOGO_SECTION]:
                self._atom.addRelatedLinks(logo + " | Logo | " + \
                                           self.VTD.getTermCurrentVocabURL(VTD.LOGO_TERM))

        # add the parameters data via the method - since this does some tidying up of the data
        if self.PARAMETERS_SECTION in granulite_data:
            self._atom.addParameters(granulite_data[self.PARAMETERS_SECTION])

        # NB, if running from the web, ignore the CSML/CDML files specified in the granulite
        if self.ingestGranuliteFiles:
            # check for CSML/CDML file input - these data are changed before adding to the granule
            # - since the file will be stored and referenced in eXist by then
            if not self._csmlFileName:
                if self.CSML_SECTION in granulite_data:
                    self.__addCSMLOrCDMLData(granulite_data[self.CSML_SECTION][0], None)

            if not self._cdmlFileName:
                if self.CDML_SECTION in granulite_data:
                    if self._csmlFileName:
                        raise ValueError("Cannot specify both CDML and CSML file in granulite config file" + \
                                         "\nNB, CSML file is generated from the specified CDML file")

                    data = utilities.getTripleData(granulite_data[self.CDML_SECTION][0])
                    self.__addCSMLOrCDMLData(data[0], None)
                    self._datasetID = data[1]
                    self._cdmlTimeAxis = data[2]

        if self.DATA_ENTITY_SECTION in granulite_data:
            self._dataEntityIDs = granulite_data[self.DATA_ENTITY_SECTION]

        # now add any coverage data - NB, only the first entry of each type is used
        if self.SPATIAL_DATA in granulite_data:
            self._extractSpatialData(granulite_data[self.SPATIAL_DATA][0])
        if self.TEMPORAL_DATA in granulite_data:
            self._extractTemporalData(granulite_data[self.TEMPORAL_DATA][0])

        logging.info("All input data set up")


    def _extractSpatialData(self, geomString):
        '''
        Extract bounding box info from the specified geometry string and set
        this on the atom (minX, minY, maxX, maxY - stored as strings)
        @param geomString: A string holding geometry info
        NB, currently the method supports parsing of POLYGONs
        @raise ValueError: if the string does not contain 'POLYGON'
        '''
        if geomString.upper().find('POLYGON') == -1:
            errorMessage = "Spatial coverage data not stored in POLYGON format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        logging.debug("Spatial data specified in POLYGON format - extracting data from this")
        # NB, raw string used to avoid invalid escape sequences in the pattern
        vals = re.findall(r'([\-\d\.]+)', geomString)
        # assume we're dealing with a rectangle normal to the equator...
        # - i.e. five x/y pairs, with the first and third points being
        # opposite corners
        if len(vals) == 10:
            self._atom.minX = vals[0]
            self._atom.minY = vals[1]
            self._atom.maxX = vals[4]
            self._atom.maxY = vals[5]
        else:
            logging.warning("Expected 10 coordinate values in POLYGON but found %s "
                            "- spatial coverage not set" %len(vals))


    def _extractTemporalData(self, dateRangeString):
        '''
        Extract temporal info from the specified daterange string and set
        this on the atom (t1 and, if a second value is found, t2)
        @param dateRangeString: A string holding temporal info
        NB, currently the method supports parsing of TIMERANGE objects
        @raise ValueError: if the string does not contain 'TIMERANGE'
        '''
        if dateRangeString.upper().find('TIMERANGE') == -1:
            errorMessage = "Temporal data not stored in TIMERANGE() format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        # NB, each value must start with a digit - so the letters of
        # 'TIMERANGE' itself are never picked up
        vals = re.findall(r'([0-9][0-9\-:TZ ]+)', dateRangeString)
        if not vals:
            return

        logging.debug("Adding start time: %s" %vals[0])
        self._atom.t1 = vals[0]
        if len(vals) > 1:
            logging.debug("Adding finish time: %s" %vals[1])
            self._atom.t2 = vals[1]


    def _addGranuleToDataEntityRecords(self):
        '''
        Augment any data entity records, specified in the granulite, with a reference to
        the granule
        NB, this is a best effort operation: a failure updating one data entity
        is logged and processing continues with the next
        '''
        logging.info("Adding granule info to data entities")
        if not self._dataEntityIDs:
            logging.info("No data entity IDs were specified in granulite - so will not add granule data to cedarmoles DB")
            return

        # now set up the granule links to the data entities specified
        for entityID in self._dataEntityIDs:
            # only the first element of the triple (the entity ID) is needed
            dataEntityID = utilities.getTripleData(entityID)[0]

            try:
                self.__updateDataEntity(dataEntityID)
            except Exception:
                # NB, avoid a bare except - so KeyboardInterrupt/SystemExit
                # are not swallowed; logging.exception records the traceback
                logging.exception("Exception thrown - detail: ")
                logging.info("Continue processing other data entities")

        logging.info("Granule data added to data entities")


    def __updateDataEntity(self, dataEntityID):
        '''
        Retrieve the specified data entity and add a link to the current
        data granule, if required, then update the atom in eXist
        @param dataEntityID: ID of the data entity to augment with granule link
        '''
        logging.debug("Retrieving data entity atom - to attach granule to")
        doc = self._eXist.getAtom('dataent_' + dataEntityID)
        logging.debug("DE retrieved - now adding link to granule")
        de = Atom.Atom(xmlString=str(doc))
        noLinks = len(de.relatedLinks)
        de.addRelatedLinks(self._atom.atomBrowseURL + " | " + \
                           self._atom.title + " | " + \
                           self._atom.VTD.getTermCurrentVocabURL(VTD.GRANULE_TERM))

        # only save if need be - i.e. if the number of links has changed
        if len(de.relatedLinks) == noLinks:
            logging.info("- data entity already contains link to this granule - skpping")
            return

        logging.debug("Now, save the updated DE atom back to eXist")
        self._eXist.createAtomInExist(de)
        logging.debug("DE atom updated")


    def _initialiseGranule(self):
        '''
        Create an Atom object ready to populate with data
        '''
        self._atom = Atom.Atom(VTD.GRANULE_TERM)


    def __processCSMLFile(self):
        '''
        Add the data from the current CSML file to the granule atom and store
        the resulting CSML doc in eXist, in the provider's collection
        '''
        logging.info("Processing CSML file")
        # only keep file name, if full path specified
        fileName = self._csmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        csmlDoc = self._atom.addCSMLData(self._csmlFileName,
                                         self._csmlContent, useCSMLID = self.useCSMLID)
        logging.info("Adding CSML file to eXist")
        self._eXist.createOrUpdateEXistFile(csmlDoc.toPrettyXML(),
                                            eXistConnector.NDG_A_COLLECTION_PATH + \
                                            self._atom.ME.providerID + '/',
                                            fileName)
        logging.info("CSML file added to eXist")
        logging.info("Finished processing CSML file")


    def __processCDMLFile(self):
        '''
        Store the current CDML file in eXist, then create a CSML file from it
        and register this as the CSML data to process
        NB, temporary local copies of the CDML and CSML files are created in
        the current working directory and removed again afterwards
        '''
        logging.info("Processing CDML file")
        # only keep file name, if full path specified
        fileName = self._cdmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        # firstly, save the doc to eXist
        # remove DOCTYPE tags - as these will prevent eXist from ingesting the doc
        # NB, the match is greedy within a line - so anything following the
        # DOCTYPE on the same line, up to the last '>', is removed too
        self._cdmlContent = re.sub(r'<!DOCTYPE.*>', '', self._cdmlContent)
        logging.info("CDML file loaded")

        logging.info("Adding CDML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._cdmlContent,
                                            eXistConnector.NDG_A_COLLECTION_PATH + \
                                            self._atom.ME.providerID + '/',
                                            fileName)
        # NOTE(review): this path omits the provider ID used when storing the
        # file above - confirm this is intended
        self._atom.cdmlFile = eXistConnector.NDG_A_COLLECTION_PATH + fileName

        # create a local copy of the CDML file - NB, this is required if running
        # from web app
        fn = os.path.basename(str(datetime.datetime.today().microsecond) + fileName)
        try:
            with open(fn, 'wb') as cdmlFile:
                cdmlFile.write(self._cdmlContent)

            logging.info("Create CSML file from the CDML file - NB, this will be stored in eXist too " + \
                         "and will be used to extract dataset information from")
            csmlFileName = utilities.createCSMLFile(fn, self._datasetID, self._cdmlTimeAxis)
        finally:
            # ensure the local copy is tidied up even if the conversion fails
            if os.path.exists(fn):
                os.remove(fn)

        logging.info("CSML file successfully created - now processing this")
        try:
            self.__addCSMLOrCDMLData(csmlFileName, None)
        finally:
            # NB, can remove the CSML file now since the contents are read into memory
            if os.path.exists(csmlFileName):
                os.remove(csmlFileName)
        logging.info("Finished processing CDML file")


    def __addCSMLOrCDMLData(self, fileName, fileContent):
        '''
        Given an unknown file type, determine whether it is CSML or CDML and
        store the file name and content accordingly, for later processing
        @param fileName: name of the file to ingest data from
        @param fileContent: content of the file - NB, if this is not set, an
        attempt is made to read the content from the local file, fileName
        @raise ValueError: if no content is available or if the content is
        neither CSML nor CDML
        '''
        logging.info("Determining file type to add data from")
        if not fileContent:
            logging.info("- NB, file contents not provided - attempt to load")
            try:
                # context manager ensures the file is actually closed
                with open(fileName, 'r') as f:
                    fileContent = f.read()
            except IOError as e:
                logging.error(str(e))

            if not fileContent:
                raise ValueError("Could not load data from file, '%s'" %fileName)

        # record which type of file we're dealing with - NB, CDML files are
        # converted to CSML later on, when they are processed
        if utilities.isCSMLFile(fileContent):
            self._csmlFileName = fileName
            self._csmlContent = fileContent
        elif utilities.isCDMLFile(fileContent):
            self._cdmlFileName = fileName
            self._cdmlContent = fileContent
        else:
            raise ValueError("Unrecognised file type, '%s'" %fileName)

        logging.info("Finished determining file type")


    def processCSMLOrCDMLFile(self):
        '''
        If a CSML or CDML file has been specified, process it - i.e.:
        - extract required data
        - add to eXist
        @return atom: Atom object of created data granule with CSML/CDML data
        added
        '''
        logging.info("Processing CSML/CDML data")
        # NB, if a CDML file is specified, a CSML file will be created from it and this will be used to
        # extract the required dataset info - so the CDML file must be processed first
        if self._cdmlFileName:
            self.__processCDMLFile()

        if self._csmlFileName:
            self.__processCSMLFile()
        logging.info("Finished processing CSML/CDML data")
        return self._atom


    def processGranulite(self):
        '''
        Complete the required granulite actions
        - ingest granulite data + add to granule atom
        - ingest CSML or CDML data + add to granule atom
        - save CSML/CDML data to eXist
        - add granule atom to eXist
        - add references to granule atom to specified data entity atoms

        @return atom: Atom object of created data granule
        @raise ValueError: if the granulite data is invalid or if no dataset
        ID is available for the granule
        '''
        logging.info("Processing granulite data")
        # load in the granulite details
        self._getGranuliteDetails()

        # check for CSML or CDML file and load details
        self.processCSMLOrCDMLFile()

        # add the granule to eXist
        # check if we've got a valid datasetid at this point - otherwise we won't have a valid filename or DB PK to use
        if not self._atom.datasetID:
            errorMessage = "No dataset ID specified for the granule - this needs " + \
                "to be set in the granulite file or in \nthe CSML/CDML files.  " + \
                "NB, without this cannot create valid entry in the eXist " + \
                "DB - so escaping now"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        logging.info("Creating granule atom, '%s', in eXist DB" %self._atom.atomName)
        self._eXist.createOrUpdateEXistFile(self._atom.toPrettyXML(),
                                            self._atom.getDefaultCollectionPath(),
                                            self._atom.atomName)

        # now add the granule data to the data entity in eXist
        self._addGranuleToDataEntityRecords()

        logging.info("granulite processing complete")
        return self._atom
541
542
if __name__=="__main__":
    # Command line entry point: process the granulite file given as the
    # first positional argument.  NB, the parsed options are not used here.
    try:
        opts, args = getopt.getopt(sys.argv[1:], '-oxuvd')
    except getopt.GetoptError as e:
        # report invalid options cleanly rather than with a traceback
        sys.stderr.write("Error: %s\n" % e)
        sys.exit(2)

    if len(args) < 1:
        sys.stderr.write("Error: Please specify a granulite data file to process\n")
        sys.exit(2)

    logging.basicConfig(level = logging.DEBUG,
                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
    g = granulite(args[0])
    g.processGranulite()
Note: See TracBrowser for help on using the repository browser.