source: exist/trunk/python/ndgUtils/lib/granulite.py @ 4627

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/granulite.py@4627
Revision 4627, 24.0 KB checked in by cbyrom, 11 years ago (diff)

Improve input parameter checking + improve scoping of methods + add more structure to validator.

Line 
1#!/usr/bin/env python
2'''
3Data model representing a granulite file - together with utilities to
4augment atom data with
5
6 @author: C Byrom, Tessella Nov 08
7'''
8import os, sys, string, getopt, logging, re, datetime, cgi
9from ndgUtils.eXistConnector import eXistConnector
10from ndgUtils.models import Atom
11import ndgUtils.lib.utilities as utilities
12from ndgUtils.vocabtermdata import VocabTermData as VTD
13import ndgUtils.lib.existdbclient as edc
14
class granulite(object):
    '''
    Granulite data model - parses a granulite config file and applies its
    contents (plus any associated CSML/CDML data) to a granule atom, which
    is then stored in the eXist DB and linked back to its parent data
    entity records
    '''
    # expected sections in a granulite file
    AUTHORS_SECTION = "authors"
    FILE_SECTION = "files"
    PARAMETERS_SECTION = "parameters"
    COVERAGE_SECTION = "coverage"
    CSML_SECTION = "csml_file"
    CDML_SECTION = "cdml_file"
    GRANULE_INFO_SECTION = "granule_info"
    GRANULE_AUTHORS_SECTION = "granule_authors"
    SUMMARY_SECTION = "summary"
    DATA_ENTITY_SECTION = "data_entity_id"
    LOGO_SECTION = "logo"
    RELATED_LINKS_SECTION = "related_links"

    # need to distinguish coverage data between spatial and temporal data
    # - use these variables to do so
    TEMPORAL_DATA = "temporal_data"
    SPATIAL_DATA = "spatial_data"

    # group the data together as either single valued or array valued - to
    # ease setting attributes on Atom; keys are granulite section names,
    # values are the matching Atom attribute names
    singleVals = {}
    arrayVals = {AUTHORS_SECTION: 'authors',
                 FILE_SECTION: 'files',
                 GRANULE_AUTHORS_SECTION: 'atomAuthors',
                 SUMMARY_SECTION: 'summary'}

    # config file with eXist DB details
    DBCONFIG_FILE = "exist.config"

    # default title given to CSML files by csmlscan
    DEFAULT_CSML_TITLE = "NAME OF DATASET GOES HERE"

    # flag to use when running in test mode
    TEST_MODE = "granulite_test_mode"

    # eXist DB client
    _eXist = None

    # flag to mark mode of operation
    _isOverride = False

    # info on specified CDML + CSML
    _cdmlFileName = None
    _cdmlTimeAxis = None
    _datasetID = None
    _csmlFileName = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"


    def __init__(self, granulite, granuleAtom = None,
                 eXistClient = None, csmlOrCdmlFile = None,
                 aggregateCoverage = False, useCSMLID = False,
                 timeAxis = 'time', datasetID = None):
        '''
        Constructor for granulite object - NB, the csml/cdml keywords allow use
        in a webapp - outside of this, this data should be specified in the granulite
        file

        @param granulite: either the granulite filename or contents
        @keyword granuleAtom: an existing granule atom - to add granulite data to
        - NB, if not specified, a new atom is used
        @keyword eXistClient: an eXist connection
        @keyword csmlOrCdmlFile: a cgi.FieldStorage object with CSML or CDML data
        @keyword aggregateCoverage: if set to True, only coverage data that extends the
        atom coverage data will be added - NB, not currently referenced in this class
        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
        this should only be True if creating a new atom - e.g. from a granulite
        @keyword timeAxis: if using a CDML file, specify the time axis to use - 'time'
        is the default
        @keyword datasetID: if using a CDML file, specify the ID of the dataset to use
        - otherwise one will be randomly generated
        '''
        logging.info("Creating granulite data model")

        self._granulite = granulite

        # IDs of the data entities to attach the granule to - NB, this must be
        # initialised per instance; a mutable class level default would be
        # shared (and so accumulate entries) across all granulite instances
        self._dataEntityIDs = []

        # set up connection to eXist
        if eXistClient:
            self._eXist = eXistClient
        else:
            self._eXist = edc.eXistDBClient(configFile = self.DBCONFIG_FILE)

        # override CSML/CDML data specified in the granulite file with data input directly
        self.ingestGranuliteFiles = True
        # NB, empty FieldStorage fields end up as empty strings
        if csmlOrCdmlFile is not None:
            self.ingestGranuliteFiles = False
            if csmlOrCdmlFile != '':
                if not isinstance(csmlOrCdmlFile, cgi.FieldStorage):
                    raise ValueError("Specified CSML/CDML file is not a cgi.FieldStorage object")
                self.__addCSMLOrCDMLData(csmlOrCdmlFile.filename, csmlOrCdmlFile.value)

        if granuleAtom:
            if not isinstance(granuleAtom, Atom.Atom):
                raise ValueError("Specified granule atom is not an Atom object")
            self._atom = granuleAtom
        else:
            # create a skeleton granule
            self.__initialiseGranule()

        self.useCSMLID = useCSMLID
        self._cdmlTimeAxis = timeAxis
        self._datasetID = datasetID

        logging.info("Granulite data model set up")


    def __getSectionName(self, line):
        '''
        Checks an input string to see if it contains a section title; if so
        return this title name, otherwise return None
        @param line: string to parse for section name - NB, renamed from 'str'
        to avoid shadowing the builtin
        @return: section name, if found, None otherwise
        '''
        sectionName = None
        # a section header is of the form 'name::' - i.e. exactly one '::'
        if line.count('::') == 1:
            sectionName = line.partition('::')[0]
        return sectionName


    def __getGranuliteDetails(self):
        '''
        Load the granulite config file and extract the required data
        @return: dict keyed on section name (with coverage data split out
        under the SPATIAL_DATA/TEMPORAL_DATA keys), each value a list of the
        section's data lines
        '''
        logging.info("Retrieving data from granulite config file")

        # assume we've already read in the file if it is multi-line
        newLine = None
        # cope with unix and dos end of lines
        if self._granulite.find('\n') > -1:
            newLine = '\n'
        elif self._granulite.find('\r') > -1:
            newLine = '\r'

        if newLine:
            granuliteData = self._granulite.split(newLine)
        else:
            # Check this file exists
            if not os.path.isfile(self._granulite):
                raise ValueError("ERROR: Could not find the config file, %s; please specify " \
                         "a valid file" %self._granulite)

            granulite_file = open(self._granulite, "r")
            granuliteData = granulite_file.readlines()
            granulite_file.close()

        # create a dictionary of all data in file - then use this to get the required data
        granulite_data = {}

        # initialise vocab term data lookup object
        self.VTD = VTD()

        sectionName = "" # variable to indicate what section we're currently parsing in the granulite file data
        logging.info("Parsing granulite file...")
        for line in granuliteData:

            line = line.strip()
            # avoid comments
            if (line.startswith('#')):
                continue

            # avoid empty lines - except in the summary section where these may be appropriate
            if not line and sectionName != self.SUMMARY_SECTION:
                continue

            section = self.__getSectionName(line)

            # if a section name is returned, we're at a new section of the file - so change the dictionary key
            if section:
                sectionName = section
                continue

            # cope with case where there is rubbish header info in the file
            if not sectionName:
                continue

            # initialise dictionary array if required
            if sectionName not in granulite_data:
                granulite_data[sectionName] = []

            if sectionName != self.COVERAGE_SECTION:
                logging.debug("Adding %s data: -%s-" %(sectionName, line))
                granulite_data[sectionName].append(line)
            else:
                # if it's coverage data, determine if it is spatial or temporal data
                coverageType = self.SPATIAL_DATA
                if line.lower().find('time') > -1:
                    coverageType = self.TEMPORAL_DATA

                # initialise dictionary array if required
                if coverageType not in granulite_data:
                    granulite_data[coverageType] = []
                logging.debug("Adding %s data: %s" %(coverageType, line))
                granulite_data[coverageType].append(line)

        logging.info("Finished parsing granulite file")
        logging.info("- returning dict of ingested data")
        return granulite_data


    def __applyCoreGranuliteDetails(self, granulite_data):
        '''
        Apply the essential data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        @raise ValueError: if the granule_info section or its provider ID is
        missing, or if both CSML and CDML files are specified
        '''
        # add the general granule info
        if self.GRANULE_INFO_SECTION not in granulite_data:
            raise ValueError("Need granule_info section in granulite input file")

        # granule_info is a triple: providerID | datasetID | title
        data = utilities.getTripleData(granulite_data[self.GRANULE_INFO_SECTION][0])
        if not data[0]:
            raise ValueError("Provider ID is missing for granule; please add this info to the " + \
                             self.GRANULE_INFO_SECTION + " section of the granulite config file")
        self._atom.addMolesEntityData(None, data[0], None)
        self._atom.setDatasetID(data[1])
        self._atom.title = data[2]

        # NB, if running from the web, ignore the CSML/CDML files specified in the granulite
        if self.ingestGranuliteFiles:
            # check for CSML/CDML file input - these data are changed before adding to the granule
            # - since the file will be stored and referenced in eXist by then
            if not self._csmlFileName:
                if self.CSML_SECTION in granulite_data:
                    self.__addCSMLOrCDMLData(granulite_data[self.CSML_SECTION][0], None)

            if not self._cdmlFileName:
                if self.CDML_SECTION in granulite_data:
                    if self._csmlFileName:
                        raise ValueError("Cannot specify both CDML and CSML file in granulite config file" + \
                                         "\nNB, CSML file is generated from the specified CDML file")

                    # cdml_file is a triple: fileName | datasetID | timeAxis
                    data = utilities.getTripleData(granulite_data[self.CDML_SECTION][0])
                    self.__addCSMLOrCDMLData(data[0], None)
                    self._datasetID = data[1]
                    self._cdmlTimeAxis = data[2]


    def __applyGranuliteDetails(self, granulite_data):
        '''
        Apply the data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        '''
        logging.info("Applying granulite data to atom")
        # add the single value attributes to the granule
        for attribute in self.singleVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.singleVals[attribute], \
                                        granulite_data[attribute][0])

        # NB, explicitly set the related links before running the arrayVals loop, since
        # this will replace any file Links created if it occurs after these have been
        # created (was originally in arrayVals, but this doesn't enforce any specific ordering)
        if self.RELATED_LINKS_SECTION in granulite_data:
            self._atom.setAttribute('relatedLinks', \
                                    granulite_data[self.RELATED_LINKS_SECTION])

        # now add the arrays data
        for attribute in self.arrayVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.arrayVals[attribute], \
                                        granulite_data[attribute])

        if self.LOGO_SECTION in granulite_data:
            for logo in granulite_data[self.LOGO_SECTION]:
                self._atom.addRelatedLinks(logo + " | Logo | " + \
                                           self.VTD.getTermCurrentVocabURL(VTD.LOGO_TERM))

        # add the parameters data via the method - since this does some tidying up of the data
        if self.PARAMETERS_SECTION in granulite_data:
            self._atom.addParameters(granulite_data[self.PARAMETERS_SECTION])

        if self.DATA_ENTITY_SECTION in granulite_data:
            self._dataEntityIDs = granulite_data[self.DATA_ENTITY_SECTION]

        # now add any coverage data
        if self.SPATIAL_DATA in granulite_data:
            self._extractSpatialData(granulite_data[self.SPATIAL_DATA][0])
        if self.TEMPORAL_DATA in granulite_data:
            self._extractTemporalData(granulite_data[self.TEMPORAL_DATA][0])

        logging.info("All input data set up")


    def _extractSpatialData(self, geomString):
        '''
        Extract bounding box info from the specified geometry string
        @param geomString: A string holding geometry info
        NB, currently the method supports parsing of POLYGONs
        @raise ValueError: if the data is not in POLYGON format
        '''
        if geomString.upper().find('POLYGON') > -1:
            logging.debug("Spatial data specified in POLYGON format - extracting data from this")
            vals = re.findall('([\-\d\.]+)', geomString)
            # assume we're dealing with a rectangle normal to the equator...
            # - i.e. a closed 5 point ring, so vals[0]/vals[1] are the first
            # corner and vals[4]/vals[5] the diagonally opposite corner
            if len(vals) == 10:
                self._atom.minX = vals[0]
                self._atom.minY = vals[1]
                self._atom.maxX = vals[4]
                self._atom.maxY = vals[5]
        else:
            errorMessage = "Spatial coverage data not stored in POLYGON format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)


    def _extractTemporalData(self, dateRangeString):
        '''
        Extract temporal info from the specified daterange string
        @param dateRangeString: A string holding temporal info
        NB, currently the method supports parsing of TIMERANGE objects
        @raise ValueError: if the data is not in TIMERANGE() format
        '''
        if dateRangeString.upper().find('TIMERANGE') == -1:
            errorMessage = "Temporal data not stored in TIMERANGE() format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        # pick out date/time-like runs of characters; first is the start
        # time, second (if present) the finish time
        vals = re.findall('([0-9][0-9\-:TZ ]+)', dateRangeString)
        if vals:
            logging.debug("Adding start time: %s" %vals[0])
            self._atom.t1 = vals[0]
            if len(vals) > 1:
                logging.debug("Adding finish time: %s" %vals[1])
                self._atom.t2 = vals[1]


    def __addGranuleToDataEntityRecords(self):
        '''
        Augment any data entity records, specified in the granulite, with a reference to
        the granule - NB, failure on one data entity is logged and the
        remainder are still processed
        '''
        logging.info("Adding granule info to data entities")
        if not self._dataEntityIDs:
            logging.info("No data entity IDs were specified in granulite - so will not add granule data to cedarmoles DB")
            return

        # now set up the granule links to the data entities specified
        for entityID in self._dataEntityIDs:
            data = utilities.getTripleData(entityID)
            dataEntityID = data[0]
            # NB, the provider ID is extracted but not currently used
            dataEntityProviderID = data[1]

            # NB, was a bare 'except:' - narrowed so that SystemExit/
            # KeyboardInterrupt are no longer swallowed
            try:
                self.__updateDataEntity(dataEntityID)
            except Exception:
                logging.error("Exception thrown - detail: ")
                logging.error(sys.exc_info())
                logging.info("Continue processing other data entities")

        logging.info("Granule data added to data entities")


    def __updateDataEntity(self, dataEntityID):
        '''
        Retrieve the specified data entity and add a link to the current
        data granule, if required, then update the atom in eXist
        @param dataEntityID: ID of the data entity to augment with granule link
        '''
        logging.debug("Retrieving data entity atom - to attach granule to")
#        doc = self._eXist.getAtom('dataent_' + dataEntityID)
        doc = self._eXist.getAtom(dataEntityID)
        logging.debug("DE retrieved - now adding link to granule")
        de = Atom.Atom(xmlString=str(doc))
        noLinks = len(de.relatedLinks)
        de.addRelatedLinks(self._atom.atomBrowseURL + " | " + \
                           self._atom.title + " | " + \
                           self._atom.VTD.getTermCurrentVocabURL(VTD.GRANULE_TERM))

        # only save if need be - addRelatedLinks leaves the count unchanged
        # if the link was already present
        if len(de.relatedLinks) == noLinks:
            logging.info("- data entity already contains link to this granule - skipping")
            return

        logging.debug("Now, save the updated DE atom back to eXist")
        self._eXist.createAtomInExist(de)
        logging.debug("DE atom updated")


    def __initialiseGranule(self):
        '''
        Create an Atom object ready to populate with data
        '''
        self._atom = Atom.Atom(VTD.GRANULE_TERM)


    def __processCSMLFile(self):
        '''
        Add the CSML data to the atom and store the CSML file in eXist under
        the atom provider's collection
        '''
        logging.info("Processing CSML file")
        # only keep file name, if full path specified
        fileName = self._csmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        self._atom.addCSMLData(self._csmlFileName, \
                               self._csmlContent, useCSMLID = self.useCSMLID)
        logging.info("Adding CSML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._csmlContent, \
                                        eXistConnector.NDG_A_COLLECTION_PATH + \
                                        self._atom.ME.providerID + '/', \
                                        fileName)
        logging.info("CSML file added to eXist")
        logging.info("Finished processing CSML file")


    def __processCDMLFile(self):
        '''
        Store the CDML file in eXist, generate a CSML file from it and ingest
        the data from that CSML file
        '''
        logging.info("Processing CDML file")
        # only keep file name, if full path specified
        fileName = self._cdmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        # firstly, save the doc to eXist
        # remove DOCTYPE tags - as these will prevent eXist from ingesting the doc
        self._cdmlContent = re.sub(r'<!DOCTYPE.*>', '', self._cdmlContent)
        logging.info("CDML file loaded")

        logging.info("Adding CDML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._cdmlContent, \
                                            eXistConnector.NDG_A_COLLECTION_PATH + \
                                            self._atom.ME.providerID + '/',\
                                            fileName)
        self._atom.cdmlFile = eXistConnector.NDG_A_COLLECTION_PATH + fileName

        # create a local copy of the CDML file - NB, this is required if running
        # from web app; prefix with the current microsecond to avoid clashes
        fn = os.path.basename(str(datetime.datetime.today().microsecond) + fileName)
        cdmlFile = open(fn, 'wb')
        cdmlFile.write(self._cdmlContent)
        cdmlFile.close()

        logging.info("Create CSML file from the CDML file - NB, this will be stored in eXist too " + \
                     "and will be used to extract dataset information from")
        csmlFileName = utilities.createCSMLFile(fn, self._datasetID, self._cdmlTimeAxis)
        os.remove(fn)

        logging.info("CSML file successfully created - now processing this")
        self.__addCSMLOrCDMLData(csmlFileName, None)

        # NB, can remove the CSML file now since the contents are read into memory
        os.remove(csmlFileName)
        logging.info("Finished processing CDML file")


    def __addCSMLOrCDMLData(self, fileName, fileContent):
        '''
        Given an unknown file type, determine whether it is CSML or CDML; if it
        is either, ingest the data appropriately; if not, just skip. NB, CDML docs
        are converted into CSML ones to allow the data ingest
        @param fileName: name of the file to ingest data from
        @param fileContent: content of the file - NB, if this is set to None and the
        file, fileName, is available locally, CsmlParser.Dataset will read in the file
        directly
        @raise ValueError: if the file contents cannot be loaded or are of an
        unrecognised type
        '''
        logging.info("Determining file type to add data from")
        if not fileContent:
            logging.info("- NB, file contents not provided - attempt to load")
            try:
                f = open(fileName, 'r')
                fileContent = f.read()
                # NB, fixed: this was 'f.close' without parentheses - a no-op
                # which leaked the file handle
                f.close()
            except IOError as e:
                logging.error(str(e))

            if not fileContent:
                raise ValueError("Could not load data from file, '%s'" %fileName)

        # if we're dealing with a CDML file, process this to produce a CSML file
        if utilities.isCSMLFile(fileContent):
            self._csmlFileName = fileName
            self._csmlContent = fileContent
        elif utilities.isCDMLFile(fileContent):
            self._cdmlFileName = fileName
            self._cdmlContent = fileContent
        else:
            raise ValueError("Unrecognised file type, '%s'" %fileName)

        logging.info("Finished determining file type")


    def processCSMLOrCDMLFile(self):
        '''
        If a CSML or CDML file has been specified, process it - i.e.:
        - extract required data
        - add to eXist
        @return atom: Atom object of created data granule with CSML/CDML data
        added
        '''
        logging.info("Processing CSML/CDML data")
        # NB, if a CDML file is specified, a CSML file will be created from it and this will be used to
        # extract the required dataset info
        if self._cdmlFileName:
            self.__processCDMLFile()

        if self._csmlFileName:
            self.__processCSMLFile()
        logging.info("Finished processing CSML/CDML data")
        return self._atom


    def processGranulite(self):
        '''
        Complete the required granulite actions
        - ingest granulite data + add to granule atom
        - ingest CSML or CDML data + add to granule atom
        - save CSML/CDML data to eXist
        - add granule atom to eXist
        - add references to granule atom to specified data entity atoms

        @return atom: Atom object of created data granule
        @raise ValueError: if no dataset ID is available for the granule
        '''
        logging.info("Processing granulite data")
        # load in the granulite details
        inputs = self.__getGranuliteDetails()

        # apply the basic, essential data
        self.__applyCoreGranuliteDetails(inputs)

        # check for CSML or CDML file and load details
        self.processCSMLOrCDMLFile()

        # apply any granulite data; NB, this overrides/augments the
        # CSML/CDML data by default
        self.__applyGranuliteDetails(inputs)

        # add the granule to eXist
        # check if we've got a valid datasetid at this point - otherwise we won't have a valid filename or DB PK to use
        if not self._atom.datasetID:
            errorMessage = "No dataset ID specified for the granule - this needs " + \
                "to be set in the granulite file or in \nthe CSML/CDML files.  " + \
                "NB, without this cannot create valid entry in the eXist " + \
                "DB - so escaping now"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        logging.info("Creating granule atom, '%s', in eXist DB" %self._atom.atomName)
        self._eXist.createOrUpdateEXistFile(self._atom.toPrettyXML(), \
                                            self._atom.getDefaultCollectionPath(), \
                                            self._atom.atomName)

        # now add the granule data to the data entity in eXist
        self.__addGranuleToDataEntityRecords()

        logging.info("granulite processing complete")
        return self._atom
569
if __name__=="__main__":
    # command-line entry point: expects the granulite file as the single
    # positional argument
    # NB(review): 'opts' is parsed but never inspected, so the '-oxuvd'
    # flags currently have no effect - presumably legacy; confirm before use
    opts, args = getopt.getopt(sys.argv[1:], '-oxuvd')
    if len(args) < 1:
        print "Error: Please specify a granulite data file to process"
        sys.exit(2)

    # verbose logging with file/line info - useful when tracing an ingest run
    logging.basicConfig(level = logging.DEBUG,
                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
    # process the specified granulite and store the resulting granule atom
    g = granulite(args[0])
    g.processGranulite()
Note: See TracBrowser for help on using the repository browser.