source: exist/trunk/python/ndgUtils/lib/granulite.py @ 4591

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/granulite.py@4591
Revision 4591, 23.9 KB checked in by cbyrom, 11 years ago

Correctly handle scenario where granulite files are specified online
but with no CSML/CDML file + simplify creation of CSML file.

#!/usr/bin/env python
'''
Data model representing a granulite file - together with utilities to
augment atom data with its contents

 @author: C Byrom, Tessella Nov 08
'''
import os, sys, string, getopt, logging, re, datetime, cgi
from ndgUtils.eXistConnector import eXistConnector
from ndgUtils.models import Atom
import ndgUtils.lib.utilities as utilities
from ndgUtils.vocabtermdata import VocabTermData as VTD
import ndgUtils.lib.existdbclient as edc

class granulite(object):
    '''
    Granulite data model
    '''
    # expected sections in a granulite file
    AUTHORS_SECTION = "authors"
    FILE_SECTION = "files"
    PARAMETERS_SECTION = "parameters"
    COVERAGE_SECTION = "coverage"
    CSML_SECTION = "csml_file"
    CDML_SECTION = "cdml_file"
    GRANULE_INFO_SECTION = "granule_info"
    GRANULE_AUTHORS_SECTION = "granule_authors"
    SUMMARY_SECTION = "summary"
    DATA_ENTITY_SECTION = "data_entity_id"
    LOGO_SECTION = "logo"
    RELATED_LINKS_SECTION = "related_links"

    # need to distinguish between spatial and temporal coverage data - use these variables to do so
    TEMPORAL_DATA = "temporal_data"
    SPATIAL_DATA = "spatial_data"

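    # Illustrative granulite file layout (a sketch only, inferred from the
    # parsing code below - section headers are lines of the form "<name>::" and
    # the ' | ' field separator is an assumption based on how triples are built
    # elsewhere in this module):
    #
    #     granule_info::
    #     myprovider.ac.uk | my_dataset_id | My dataset title
    #     authors::
    #     A. N. Author
    #     coverage::
    #     POLYGON((-180 -90, -180 90, 180 90, 180 -90, -180 -90))
    #     TIMERANGE(2008-01-01T00:00:00Z, 2008-12-31T23:59:59Z)
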
    # group the data together as either single valued or array valued - to ease setting attributes on Atom
    singleVals = {}
    arrayVals = {AUTHORS_SECTION:'authors', \
                 FILE_SECTION:'files', GRANULE_AUTHORS_SECTION:'atomAuthors', \
                 SUMMARY_SECTION:'summary'}

    # config file with eXist DB details
    DBCONFIG_FILE = "exist.config"

    # default title given to CSML files by csmlscan
    DEFAULT_CSML_TITLE = "NAME OF DATASET GOES HERE"

    # flag to use when running in test mode
    TEST_MODE = "granulite_test_mode"

    # eXist DB client
    _eXist = None

    # flag to mark mode of operation
    _isOverride = False

    # info on specified CDML + CSML
    _cdmlFileName = None
    _cdmlTimeAxis = None
    _datasetID = None
    _csmlFileName = None

    # info on the datasets to attach the granule to
    _dataEntityIDs = []

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"


    def __init__(self, granulite, granuleAtom = None, \
                  eXistClient = None, csmlOrCdmlFile = None, \
                  aggregateCoverage = False, useCSMLID = False, 
                  timeAxis = 'time', datasetID = None):
        '''
        Constructor for granulite object - NB, the csmlOrCdmlFile keyword allows use
        in a webapp; outside of this, the CSML/CDML data should be specified in the
        granulite file

        @param granulite: either the granulite filename or its contents
        @keyword granuleAtom: an existing granule atom to add granulite data to
        - NB, if not specified, a new atom is used
        @keyword eXistClient: an eXist connection
        @keyword csmlOrCdmlFile: a cgi.FieldStorage object with CSML or CDML data
        @keyword aggregateCoverage: if set to True, only coverage data that extends the
        atom coverage data will be added
        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
        this should only be True if creating a new atom - e.g. from a granulite
        @keyword timeAxis: if using a CDML file, specify the time axis to use - 'time'
        is the default
        @keyword datasetID: if using a CDML file, specify the ID of the dataset to use
        - otherwise one will be randomly generated
        '''
        logging.info("Creating granulite data model")

        self._granulite = granulite

        # set up connection to eXist
        if eXistClient:
            self._eXist = eXistClient
        else:
            self._eXist = edc.eXistDBClient(configFile = self.DBCONFIG_FILE)

        # override CSML/CDML data specified in the granulite file with data input directly
        self.ingestGranuliteFiles = True
        # NB, empty FieldStorage fields end up as empty strings
        if csmlOrCdmlFile is not None:
            self.ingestGranuliteFiles = False
            if csmlOrCdmlFile != '':
                if not isinstance(csmlOrCdmlFile, cgi.FieldStorage):
                    raise ValueError("Specified CSML/CDML file is not a cgi.FieldStorage object")
                self.__addCSMLOrCDMLData(csmlOrCdmlFile.filename, csmlOrCdmlFile.value)


        if granuleAtom:
            if not isinstance(granuleAtom, Atom.Atom):
                raise ValueError("Specified granule atom is not an Atom object")
            self._atom = granuleAtom
        else:
            # create a skeleton granule
            self._initialiseGranule()

        self.useCSMLID = useCSMLID
        self._cdmlTimeAxis = timeAxis
        self._datasetID = datasetID

        logging.info("Granulite data model set up")

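    # Illustrative only (the file names are assumptions): the granulite argument
    # may be either a path to a granulite file or the file contents themselves -
    # multi-line input is treated as already-read content:
    #
    #     g = granulite('/path/to/dataset.granulite')        # read from disk
    #     g = granulite(open('dataset.granulite').read())    # pass contents directly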

    def _getSectionName(self, str):
        '''
        Checks an input string to see if it contains a section title; if so
        return this title name, otherwise return None
        @param str: string to parse for section name
        @return: section name, if found, None otherwise
        '''
        sectionName = None
        if str.count('::') == 1:
            sectionName = str.partition('::')[0]
        return sectionName


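    # e.g. _getSectionName("authors::") returns "authors", while
    # _getSectionName("some ordinary line") returns None
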
    def __getGranuliteDetails(self):
        '''
        Load the granulite config file and extract the required data
        '''
        logging.info("Retrieving data from granulite config file")

        # assume we've already read in the file if it is multi-line
        newLine = None
        # cope with unix ('\n'), dos ('\r\n') and old mac ('\r') line endings
        if self._granulite.find('\n') > -1:
            newLine = '\n'
        elif self._granulite.find('\r') > -1:
            newLine = '\r'

        if newLine:
            granuliteData = self._granulite.split(newLine)
        else:
            # Check this file exists
            if not os.path.isfile(self._granulite):
                raise ValueError("ERROR: Could not find the config file, %s; please specify " \
                         "a valid file" %self._granulite)

            granulite_file = open(self._granulite, "r")
            granuliteData = granulite_file.readlines()
            granulite_file.close()

        # create a dictionary of all data in file - then use this to get the required data
        granulite_data = {}

        # initialise vocab term data lookup object
        self.VTD = VTD()

        sectionName = "" # tracks which section of the granulite file we're currently parsing
        logging.info("Parsing granulite file...")
        for line in granuliteData:

            line = line.strip()
            # avoid comments
            if (line.startswith('#')):
                continue

            # avoid empty lines - except in the summary section where these may be appropriate
            if not line and sectionName != self.SUMMARY_SECTION:
                continue

            section = self._getSectionName(line)

            # if a section name is returned, we're at a new section of the file - so change the dictionary key
            if section:
                sectionName = section
                continue

            # cope with case where there is rubbish header info in the file
            if not sectionName:
                continue

            # initialise dictionary array if required
            if sectionName not in granulite_data:
                granulite_data[sectionName] = []

            if sectionName != self.COVERAGE_SECTION:
                logging.debug("Adding %s data: -%s-" %(sectionName, line))
                granulite_data[sectionName].append(line)
            else:
                # if it's coverage data, determine if it is spatial or temporal data
                coverageType = self.SPATIAL_DATA
                if line.lower().find('time') > -1:
                    coverageType = self.TEMPORAL_DATA

                # initialise dictionary array if required
                if coverageType not in granulite_data:
                    granulite_data[coverageType] = []
                logging.debug("Adding %s data: %s" %(coverageType, line))
                granulite_data[coverageType].append(line)

        logging.info("Finished parsing granulite file")
        logging.info("- returning dict of ingested data")
        return granulite_data


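    # Illustrative shape of the dict returned by __getGranuliteDetails (keys are
    # section names, values are lists of the raw lines found in each section;
    # coverage lines are re-keyed as spatial_data/temporal_data - the pipe-separated
    # granule_info fields are an assumption based on the triple format used elsewhere):
    #
    #     {'granule_info': ['myprovider.ac.uk | my_dataset_id | My dataset title'],
    #      'spatial_data': ['POLYGON((-180 -90, -180 90, 180 90, 180 -90, -180 -90))'],
    #      'temporal_data': ['TIMERANGE(2008-01-01T00:00:00Z, 2008-12-31T23:59:59Z)']}
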
    def __applyCoreGranuliteDetails(self, granulite_data):
        '''
        Apply the essential data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        '''
        # add the general granule info
        if self.GRANULE_INFO_SECTION not in granulite_data:
            raise ValueError("Need granule_info section in granulite input file")

        data = utilities.getTripleData(granulite_data[self.GRANULE_INFO_SECTION][0])
        if not data[0]:
            raise ValueError("Provider ID is missing for granule; please add this info to the " + \
                             self.GRANULE_INFO_SECTION + " section of the granulite config file")
        self._atom.addMolesEntityData(None, data[0], None)
        self._atom.setDatasetID(data[1])
        self._atom.title = data[2]

        # NB, if running from the web, ignore the CSML/CDML files specified in the granulite
        if self.ingestGranuliteFiles:
            # check for CSML/CDML file input - these data are changed before adding to the granule
            # - since the file will be stored and referenced in eXist by then
            if not self._csmlFileName:
                if self.CSML_SECTION in granulite_data:
                    self.__addCSMLOrCDMLData(granulite_data[self.CSML_SECTION][0], None)

            if not self._cdmlFileName:
                if self.CDML_SECTION in granulite_data:
                    if self._csmlFileName:
                        raise ValueError("Cannot specify both CDML and CSML file in granulite config file" + \
                                         "\nNB, CSML file is generated from the specified CDML file")

                    data = utilities.getTripleData(granulite_data[self.CDML_SECTION][0])
                    self.__addCSMLOrCDMLData(data[0], None)
                    self._datasetID = data[1]
                    self._cdmlTimeAxis = data[2]


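    # Illustrative granule_info line and how __applyCoreGranuliteDetails uses its
    # fields (assuming utilities.getTripleData splits on '|', as the related-links
    # format elsewhere in this module suggests):
    #
    #     myprovider.ac.uk | my_dataset_id | My dataset title
    #     -> data[0] = provider ID, data[1] = dataset ID, data[2] = atom title
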
    def __applyGranuliteDetails(self, granulite_data):
        '''
        Apply the data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        '''
        logging.info("Applying granulite data to atom")
        # add the single value attributes to the granule
        for attribute in self.singleVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.singleVals[attribute], \
                                        granulite_data[attribute][0])

        # NB, explicitly set the related links before running the arrayVals loop, since
        # setting them afterwards would replace any file links created by that loop
        # (this was originally in arrayVals, but dict iteration doesn't enforce any specific ordering)
        if self.RELATED_LINKS_SECTION in granulite_data:
            self._atom.setAttribute('relatedLinks', \
                                    granulite_data[self.RELATED_LINKS_SECTION])

        # now add the arrays data
        for attribute in self.arrayVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.arrayVals[attribute], \
                                        granulite_data[attribute])

        if self.LOGO_SECTION in granulite_data:
            for logo in granulite_data[self.LOGO_SECTION]:
                self._atom.addRelatedLinks(logo + " | Logo | " + \
                                           self.VTD.getTermCurrentVocabURL(VTD.LOGO_TERM))

        # add the parameters data via the method - since this does some tidying up of the data
        if self.PARAMETERS_SECTION in granulite_data:
            self._atom.addParameters(granulite_data[self.PARAMETERS_SECTION])

        if self.DATA_ENTITY_SECTION in granulite_data:
            self._dataEntityIDs = granulite_data[self.DATA_ENTITY_SECTION]

        # now add any coverage data
        if self.SPATIAL_DATA in granulite_data:
            self._extractSpatialData(granulite_data[self.SPATIAL_DATA][0])
        if self.TEMPORAL_DATA in granulite_data:
            self._extractTemporalData(granulite_data[self.TEMPORAL_DATA][0])

        logging.info("All input data set up")


    def _extractSpatialData(self, geomString):
        '''
        Extract bounding box info from the specified geometry string
        @param geomString: A string holding geometry info
        NB, currently the method supports parsing of POLYGONs
        '''
        if geomString.upper().find('POLYGON') > -1:
            logging.debug("Spatial data specified in POLYGON format - extracting data from this")
            vals = re.findall('([\-\d\.]+)', geomString)
            # assume we're dealing with a rectangle normal to the equator...
            if len(vals) == 10:
                self._atom.minX = vals[0]
                self._atom.minY = vals[1]
                self._atom.maxX = vals[4]
                self._atom.maxY = vals[5]
        else:
            errorMessage = "Spatial coverage data not stored in POLYGON format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)


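    # Worked example: for
    #     POLYGON((-180 -90, -180 90, 180 90, 180 -90, -180 -90))
    # the regex yields ten numbers; minX/minY come from the first corner
    # (-180, -90) and maxX/maxY from the third corner (180, 90).
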
    def _extractTemporalData(self, dateRangeString):
        '''
        Extract temporal info from the specified daterange string
        @param dateRangeString: A string holding temporal info
        NB, currently the method supports parsing of TIMERANGE objects
        '''
        if dateRangeString.upper().find('TIMERANGE') == -1:
            errorMessage = "Temporal data not stored in TIMERANGE() format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        vals = re.findall('([0-9][0-9\-:TZ ]+)', dateRangeString)
        if vals:
            logging.debug("Adding start time: %s" %vals[0])
            self._atom.t1 = vals[0]
            if len(vals) > 1:
                logging.debug("Adding finish time: %s" %vals[1])
                self._atom.t2 = vals[1]


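    # Worked example: TIMERANGE(2008-01-01T00:00:00Z, 2008-12-31T23:59:59Z)
    # sets t1 = '2008-01-01T00:00:00Z' and t2 = '2008-12-31T23:59:59Z'.
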
    def _addGranuleToDataEntityRecords(self):
        '''
        Augment any data entity records, specified in the granulite, with a reference to
        the granule
        '''
        logging.info("Adding granule info to data entities")
        if not self._dataEntityIDs:
            logging.info("No data entity IDs were specified in granulite - so will not add granule data to cedarmoles DB")
            return

        # now set up the granule links to the data entities specified
        for entityID in self._dataEntityIDs:
            data = utilities.getTripleData(entityID)
            dataEntityID = data[0]
            dataEntityProviderID = data[1]

            try:
                self.__updateDataEntity(dataEntityID)
            except:
                logging.error("Exception thrown - detail: ")
                logging.error(sys.exc_info())
                logging.info("Continue processing other data entities")

        logging.info("Granule data added to data entities")


    def __updateDataEntity(self, dataEntityID):
        '''
        Retrieve the specified data entity and add a link to the current
        data granule, if required, then update the atom in eXist
        @param dataEntityID: ID of the data entity to augment with granule link
        '''
        logging.debug("Retrieving data entity atom - to attach granule to")
        doc = self._eXist.getAtom('dataent_' + dataEntityID)
        logging.debug("DE retrieved - now adding link to granule")
        de = Atom.Atom(xmlString=str(doc))
        noLinks = len(de.relatedLinks)
        de.addRelatedLinks(self._atom.atomBrowseURL + " | " + \
                           self._atom.title + " | " + \
                           self._atom.VTD.getTermCurrentVocabURL(VTD.GRANULE_TERM))

        # only save if need be
        if len(de.relatedLinks) == noLinks:
            logging.info("- data entity already contains link to this granule - skipping")
            return

        logging.debug("Now, save the updated DE atom back to eXist")
        self._eXist.createAtomInExist(de)
        logging.debug("DE atom updated")


    def _initialiseGranule(self):
        '''
        Create an Atom object ready to populate with data
        '''
        self._atom = Atom.Atom(VTD.GRANULE_TERM)


    def __processCSMLFile(self):
        '''
        Add the CSML data to the granule atom and store the CSML file in eXist
        '''
        logging.info("Processing CSML file")
        # only keep file name, if full path specified
        fileName = self._csmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        self._atom.addCSMLData(self._csmlFileName, \
                               self._csmlContent, useCSMLID = self.useCSMLID)
        logging.info("Adding CSML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._csmlContent, \
                                        eXistConnector.NDG_A_COLLECTION_PATH + \
                                        self._atom.ME.providerID + '/', \
                                        fileName)
        logging.info("CSML file added to eXist")
        logging.info("Finished processing CSML file")


    def __processCDMLFile(self):
        '''
        Store the CDML file in eXist, then create a CSML file from it and
        process that to extract the dataset information
        '''
        logging.info("Processing CDML file")
        # only keep file name, if full path specified
        fileName = self._cdmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        # firstly, save the doc to eXist
        # remove DOCTYPE tags - as these will prevent eXist from ingesting the doc
        self._cdmlContent = re.sub(r'<!DOCTYPE.*>', '', self._cdmlContent)
        logging.info("CDML file loaded")

        logging.info("Adding CDML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._cdmlContent, \
                                            eXistConnector.NDG_A_COLLECTION_PATH + \
                                            self._atom.ME.providerID + '/',\
                                            fileName)
        self._atom.cdmlFile = eXistConnector.NDG_A_COLLECTION_PATH + fileName

        # create a local copy of the CDML file - NB, this is required if running
        # from web app
        fn = os.path.basename(str(datetime.datetime.today().microsecond) + fileName)
        cdmlFile = open(fn, 'wb')
        cdmlFile.write(self._cdmlContent)
        cdmlFile.close()
        message = 'The file "' + fn + '" was uploaded successfully'

        logging.info("Create CSML file from the CDML file - NB, this will be stored in eXist too " + \
                     "and will be used to extract dataset information")
        csmlFileName = utilities.createCSMLFile(fn, self._datasetID, self._cdmlTimeAxis)
        os.remove(fn)

        logging.info("CSML file successfully created - now processing this")
        self.__addCSMLOrCDMLData(csmlFileName, None)

        # NB, can remove the CSML file now since the contents are read into memory
        os.remove(csmlFileName)
        logging.info("Finished processing CDML file")


    def __addCSMLOrCDMLData(self, fileName, fileContent):
        '''
        Given an unknown file type, determine whether it is CSML or CDML; if it
        is either, ingest the data appropriately; if it is neither, raise a
        ValueError. NB, CDML docs are converted into CSML ones to allow the data ingest
        @param fileName: name of the file to ingest data from
        @param fileContent: content of the file - NB, if this is set to None and the
        file, fileName, is available locally, CsmlParser.Dataset will read in the file
        directly
        '''
        logging.info("Determining file type to add data from")
        if not fileContent:
            logging.info("- NB, file contents not provided - attempt to load")
            try:
                f = open(fileName, 'r')
                fileContent = f.read()
                f.close()
            except IOError, e:
                logging.error(e.message)

            if not fileContent:
                raise ValueError("Could not load data from file, '%s'" %fileName)

        # if we're dealing with a CDML file, process this to produce a CSML file
        if utilities.isCSMLFile(fileContent):
            self._csmlFileName = fileName
            self._csmlContent = fileContent
        elif utilities.isCDMLFile(fileContent):
            self._cdmlFileName = fileName
            self._cdmlContent = fileContent
        else:
            raise ValueError("Unrecognised file type, '%s'" %fileName)

        logging.info("Finished determining file type")


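    # e.g. __addCSMLOrCDMLData('dataset.xml', None) reads the file from disk and,
    # depending on utilities.isCSMLFile/isCDMLFile, records its name and content as
    # either CSML or CDML; unrecognised content raises a ValueError.
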
    def processCSMLOrCDMLFile(self):
        '''
        If a CSML or CDML file has been specified, process it - i.e.:
        - extract required data
        - add to eXist
        @return atom: Atom object of created data granule with CSML/CDML data
        added
        '''
        logging.info("Processing CSML/CDML data")
        # NB, if a CDML file is specified, a CSML file will be created from it and this will be used to
        # extract the required dataset info
        if self._cdmlFileName:
            self.__processCDMLFile()

        if self._csmlFileName:
            self.__processCSMLFile()
        logging.info("Finished processing CSML/CDML data")
        return self._atom


    def processGranulite(self):
        '''
        Complete the required granulite actions
        - ingest granulite data + add to granule atom
        - ingest CSML or CDML data + add to granule atom
        - save CSML/CDML data to eXist
        - add granule atom to eXist
        - add references to granule atom to specified data entity atoms

        @return atom: Atom object of created data granule
        '''
        logging.info("Processing granulite data")
        # load in the granulite details
        inputs = self.__getGranuliteDetails()

        # apply the basic, essential data
        self.__applyCoreGranuliteDetails(inputs)

        # check for CSML or CDML file and load details
        self.processCSMLOrCDMLFile()

        # apply any granulite data; NB, this overrides/augments the
        # CSML/CDML data by default
        self.__applyGranuliteDetails(inputs)

        # add the granule to eXist
        # check if we've got a valid dataset ID at this point - otherwise we won't have a valid filename or DB PK to use
        if not self._atom.datasetID:
            errorMessage = "No dataset ID specified for the granule - this needs " + \
                "to be set in the granulite file or in \nthe CSML/CDML files.  " + \
                "NB, without this a valid entry cannot be created in the eXist " + \
                "DB - so aborting now"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        logging.info("Creating granule atom, '%s', in eXist DB" %self._atom.atomName)
        self._eXist.createOrUpdateEXistFile(self._atom.toPrettyXML(), \
                                            self._atom.getDefaultCollectionPath(), \
                                            self._atom.atomName)

        # now add the granule data to the data entity records in eXist
        self._addGranuleToDataEntityRecords()

        logging.info("granulite processing complete")
        return self._atom


if __name__=="__main__":
    opts, args = getopt.getopt(sys.argv[1:], '-oxuvd')
    if len(args) < 1:
        print "Error: Please specify a granulite data file to process"
        sys.exit(2)

    logging.basicConfig(level = logging.DEBUG,
                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
    g = granulite(args[0])
    g.processGranulite()
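
# Typical command-line invocation (the file name is illustrative; an
# 'exist.config' file with eXist DB connection details is expected in the
# working directory, per DBCONFIG_FILE above):
#
#     python granulite.py mydata.granulite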