source: exist/trunk/python/ndgUtils/lib/granulite.py @ 4585

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/granulite.py@4585
Revision 4585, 23.9 KB, checked in by cbyrom, 11 years ago

Restructure granulite, creating smaller methods, to allow the bulk of the
granulite data to override or augment that in the CSML/CDML files.

#!/usr/bin/env python
'''
Data model representing a granulite file - together with utilities to
augment atom data with its contents

 @author: C Byrom, Tessella Nov 08
'''
import os, sys, string, getopt, logging, re, datetime, cgi
from ndgUtils.eXistConnector import eXistConnector
from ndgUtils.models import Atom
import ndgUtils.lib.utilities as utilities
from ndgUtils.vocabtermdata import VocabTermData as VTD
import ndgUtils.lib.existdbclient as edc

class granulite(object):
    '''
    Granulite data model
    '''
    # expected sections in a granulite file
    AUTHORS_SECTION = "authors"
    FILE_SECTION = "files"
    PARAMETERS_SECTION = "parameters"
    COVERAGE_SECTION = "coverage"
    CSML_SECTION = "csml_file"
    CDML_SECTION = "cdml_file"
    GRANULE_INFO_SECTION = "granule_info"
    GRANULE_AUTHORS_SECTION = "granule_authors"
    SUMMARY_SECTION = "summary"
    DATA_ENTITY_SECTION = "data_entity_id"
    LOGO_SECTION = "logo"
    RELATED_LINKS_SECTION = "related_links"

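    # Illustrative sketch of a granulite file layout, inferred from the parsing
    # code below: a line containing a single '::' marks the start of a section
    # (named by the text before the '::'), and the lines that follow belong to
    # that section.  The section names match the constants above; the data
    # lines shown are made-up examples, not definitive syntax:
    #
    #   granule_info::
    #   <provider ID> | <dataset ID> | <granule title>
    #   authors::
    #   A. N. Other
    #   summary::
    #   Free text summary of the granule.
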
    # need to distinguish between spatial and temporal coverage data - use these variables to do so
    TEMPORAL_DATA = "temporal_data"
    SPATIAL_DATA = "spatial_data"

    # group the data together as either single valued or array valued - to ease setting attributes on Atom
    singleVals = {}
    arrayVals = {AUTHORS_SECTION:'authors', \
                 FILE_SECTION:'files', GRANULE_AUTHORS_SECTION:'atomAuthors', \
                 SUMMARY_SECTION:'summary'}

    # config file with eXist DB details
    DBCONFIG_FILE = "exist.config"

    # default title given to CSML files by csmlscan
    DEFAULT_CSML_TITLE = "NAME OF DATASET GOES HERE"

    # flag to use when running in test mode
    TEST_MODE = "granulite_test_mode"

    # eXist DB client
    _eXist = None

    # flag to mark mode of operation
    _isOverride = False

    # info on specified CDML + CSML
    _cdmlFileName = None
    _cdmlTimeAxis = None
    _datasetID = None
    _csmlFileName = None

    # info on the datasets to attach the granule to
    _dataEntityIDs = []

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"


70       
71    def __init__(self, granulite, granuleAtom = None, \
72                  eXistClient = None, csmlOrCdmlFile = None, \
73                  aggregateCoverage = False, useCSMLID = False, 
74                  timeAxis = 'time', datasetID = None):
75        '''
76        Constructor for granulite object - NB, the csml/cdml keywords allow use
77        in a webapp - outside of this, this data should be specified in the granulite
78        file
79       
80        @param granulite: either the granulite filename or contents
81        @keyword granuleAtom: an existing granule atom - to add granulite data to
82        - NB, if not specified, a new atom is used
83        @keyword eXistClient: an eXist connection
84        @keyword csmlOrCdmlFile: a cgi.FieldStorage object with CSML or CDML data
85        @keyword aggregateCoverage: if set to True, only coverage data that extends the
86        atom coverage data will be added
87        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
88        this should only be True if creating a new atom - e.g. from a granulite
89        @keyword timeAxis: if using a CDML file, specify the time axis to use - 'time'
90        is the default
91        @keyword datasetID: if using a CDML file, specify the ID of the dataset to use
92        - otherwise one will be randomly generated
93        @return csmlDoc, cdmlDoc: the CsmlParser.Dataset object with the csml data in and,
94        if we're dealing with a CDML doc, return this too
95        '''
96        logging.info("Creating granulite data model")
97       
98        self._granulite = granulite
99       
100        # set up connection to eXist
101        if eXistClient:
102            self._eXist = eXistClient
103        else:
104            self._eXist = edc.eXistDBClient(configFile = self.DBCONFIG_FILE)
105       
106        # override CSML/CDML data specified in the granulite file with data input directly
107        self.ingestGranuliteFiles = True
108        if csmlOrCdmlFile is not None:
109            if not isinstance(csmlOrCdmlFile, cgi.FieldStorage):
110                raise ValueError("Specified CSML/CDML file is not a cgi.FieldStorage object")
111            self.__addCSMLOrCDMLData(csmlOrCdmlFile.filename, csmlOrCdmlFile.value)
112            self.ingestGranuliteFiles = False
113
114        if granuleAtom:
115            if not isinstance(granuleAtom, Atom.Atom):
116                raise ValueError("Specified granule atom is not an Atom object")
117            self._atom = granuleAtom
118        else:
119            # create a skeleton granule
120            self._initialiseGranule()
121
122        self.useCSMLID = useCSMLID
123        self._cdmlTimeAxis = timeAxis
124        self._datasetID = datasetID
125           
126        logging.info("Granulite data model set up")
127

    def _getSectionName(self, str):
        '''
        Checks an input string to see if it contains a section title; if so,
        return this title name, otherwise return None
        @param str: string to parse for section name
        @return: section name, if found, None otherwise
        '''
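        # e.g. (illustrative usage): _getSectionName("authors::") -> "authors"
        #      _getSectionName("a plain data line")               -> None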
        sectionName = None
        if str.count('::') == 1:
            sectionName = str.partition('::')[0]
        return sectionName


    def __getGranuliteDetails(self):
        '''
        Load the granulite config file and extract the required data
        '''
        logging.info("Retrieving data from granulite config file")

        # assume we've already read in the file if it is multi-line
        newLine = None
        # cope with unix and dos end of lines
        if self._granulite.find('\n') > -1:
            newLine = '\n'
        elif self._granulite.find('\r') > -1:
            newLine = '\r'

        if newLine:
            granuliteData = self._granulite.split(newLine)
        else:
            # Check this file exists
            if not os.path.isfile(self._granulite):
                raise ValueError("ERROR: Could not find the config file, %s; please specify " \
                         "a valid file" %self._granulite)

            granulite_file = open(self._granulite, "r")
            granuliteData = granulite_file.readlines()
            granulite_file.close()

        # create a dictionary of all data in file - then use this to get the required data
        granulite_data = {}

        # initialise vocab term data lookup object
        self.VTD = VTD()

        sectionName = "" # variable to indicate what section we're currently parsing in the granulite file data
        logging.info("Parsing granulite file...")
        for line in granuliteData:

            line = line.strip()
            # avoid comments
            if (line.startswith('#')):
                continue

            # avoid empty lines - except in the summary section where these may be appropriate
            if not line and sectionName != self.SUMMARY_SECTION:
                continue

            section = self._getSectionName(line)

            # if a section name is returned, we're at a new section of the file - so change the dictionary key
            if section:
                sectionName = section
                continue

            # cope with case where there is rubbish header info in the file
            if not sectionName:
                continue

            # initialise dictionary array if required
            if sectionName not in granulite_data:
                granulite_data[sectionName] = []

            if sectionName != self.COVERAGE_SECTION:
                logging.debug("Adding %s data: -%s-" %(sectionName, line))
                granulite_data[sectionName].append(line)
            else:
                # if it's coverage data, determine if it is spatial or temporal data
                coverageType = self.SPATIAL_DATA
                if line.lower().find('time') > -1:
                    coverageType = self.TEMPORAL_DATA

                # initialise dictionary array if required
                if coverageType not in granulite_data:
                    granulite_data[coverageType] = []
                logging.debug("Adding %s data: %s" %(coverageType, line))
                granulite_data[coverageType].append(line)

        logging.info("Finished parsing granulite file")
        logging.info("- returning dict of ingested data")
        return granulite_data


    def __applyCoreGranuliteDetails(self, granulite_data):
        '''
        Apply the essential data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        '''
        # add the general granule info
        if self.GRANULE_INFO_SECTION not in granulite_data:
            raise ValueError("Need granule_info section in granulite input file")

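        # e.g. (illustrative): the granule_info line is read as a
        # '<provider ID> | <dataset ID> | <granule title>' triple - the ' | '
        # separator is assumed here, based on getTripleData usage elsewhere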
        data = utilities.getTripleData(granulite_data[self.GRANULE_INFO_SECTION][0])
        if not data[0]:
            raise ValueError("Provider ID is missing for granule; please add this info to the " + \
                             self.GRANULE_INFO_SECTION + " section of the granulite config file")
        self._atom.addMolesEntityData(None, data[0], None)
        self._atom.setDatasetID(data[1])
        self._atom.title = data[2]

        # NB, if running from the web, ignore the CSML/CDML files specified in the granulite
        if self.ingestGranuliteFiles:
            # check for CSML/CDML file input - these data are changed before adding to the granule
            # - since the file will be stored and referenced in eXist by then
            if not self._csmlFileName:
                if self.CSML_SECTION in granulite_data:
                    self.__addCSMLOrCDMLData(granulite_data[self.CSML_SECTION][0], None)

            if not self._cdmlFileName:
                if self.CDML_SECTION in granulite_data:
                    if self._csmlFileName:
                        raise ValueError("Cannot specify both CDML and CSML file in granulite config file" + \
                                         "\nNB, CSML file is generated from the specified CDML file")

                    data = utilities.getTripleData(granulite_data[self.CDML_SECTION][0])
                    self.__addCSMLOrCDMLData(data[0], None)
                    self._datasetID = data[1]
                    self._cdmlTimeAxis = data[2]


    def __applyGranuliteDetails(self, granulite_data):
        '''
        Apply the data read in from the granulite file to the atom data model
        being processed by the granulite
        @param granulite_data: the dict produced by __getGranuliteDetails
        '''
        logging.info("Applying granulite data to atom")
        # add the single value attributes to the granule
        for attribute in self.singleVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.singleVals[attribute], \
                                        granulite_data[attribute][0])

        # NB, explicitly set the related links before running the arrayVals loop, since
        # setting them later would replace any file links already created (this was
        # originally in arrayVals, but that doesn't enforce any specific ordering)
        if self.RELATED_LINKS_SECTION in granulite_data:
            self._atom.setAttribute('relatedLinks', \
                                    granulite_data[self.RELATED_LINKS_SECTION])

        # now add the array data
        for attribute in self.arrayVals:
            if attribute in granulite_data:
                self._atom.setAttribute(self.arrayVals[attribute], \
                                        granulite_data[attribute])

        if self.LOGO_SECTION in granulite_data:
            for logo in granulite_data[self.LOGO_SECTION]:
                self._atom.addRelatedLinks(logo + " | Logo | " + \
                                           self.VTD.getTermCurrentVocabURL(VTD.LOGO_TERM))

        # add the parameters data via the method - since this does some tidying up of the data
        if self.PARAMETERS_SECTION in granulite_data:
            self._atom.addParameters(granulite_data[self.PARAMETERS_SECTION])

        if self.DATA_ENTITY_SECTION in granulite_data:
            self._dataEntityIDs = granulite_data[self.DATA_ENTITY_SECTION]

        # now add any coverage data
        if self.SPATIAL_DATA in granulite_data:
            self._extractSpatialData(granulite_data[self.SPATIAL_DATA][0])
        if self.TEMPORAL_DATA in granulite_data:
            self._extractTemporalData(granulite_data[self.TEMPORAL_DATA][0])

        logging.info("All input data set up")


    def _extractSpatialData(self, geomString):
        '''
        Extract bounding box info from the specified geometry string
        @param geomString: A string holding geometry info
        NB, currently the method supports parsing of POLYGONs
        '''
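        # e.g. (illustrative): a WKT-style rectangle such as
        #   POLYGON((-10.0 50.0, 10.0 50.0, 10.0 60.0, -10.0 60.0, -10.0 50.0))
        # yields ten numbers below, with vals[0]/vals[1] giving the lower-left
        # corner and vals[4]/vals[5] the upper-right corner of the bounding box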
        if geomString.upper().find('POLYGON') > -1:
            logging.debug("Spatial data specified in POLYGON format - extracting data from this")
            vals = re.findall('([\-\d\.]+)', geomString)
            # assume we're dealing with a rectangle normal to the equator...
            if len(vals) == 10:
                self._atom.minX = vals[0]
                self._atom.minY = vals[1]
                self._atom.maxX = vals[4]
                self._atom.maxY = vals[5]
        else:
            errorMessage = "Spatial coverage data not stored in POLYGON format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)


    def _extractTemporalData(self, dateRangeString):
        '''
        Extract temporal info from the specified daterange string
        @param dateRangeString: A string holding temporal info
        NB, currently the method supports parsing of TIMERANGE objects
        '''
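        # e.g. (illustrative): TIMERANGE(2008-01-01T00:00:00Z, 2008-12-31T23:59:59Z)
        # - the regexp below picks out the two datetime strings as start/end times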
        if dateRangeString.upper().find('TIMERANGE') == -1:
            errorMessage = "Temporal data not stored in TIMERANGE() format - please correct and rerun"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        vals = re.findall('([0-9][0-9\-:TZ ]+)', dateRangeString)
        if vals:
            logging.debug("Adding start time: %s" %vals[0])
            self._atom.t1 = vals[0]
            if len(vals) > 1:
                logging.debug("Adding finish time: %s" %vals[1])
                self._atom.t2 = vals[1]


    def _addGranuleToDataEntityRecords(self):
        '''
        Augment any data entity records, specified in the granulite, with a reference to
        the granule
        '''
        logging.info("Adding granule info to data entities")
        if not self._dataEntityIDs:
            logging.info("No data entity IDs were specified in granulite - so will not add granule data to cedarmoles DB")
            return

        # now set up the granule links to the data entities specified
        for entityID in self._dataEntityIDs:
            data = utilities.getTripleData(entityID)
            dataEntityID = data[0]
            dataEntityProviderID = data[1]

            try:
                self.__updateDataEntity(dataEntityID)
            except:
                logging.error("Exception thrown - detail: ")
                logging.error(sys.exc_info())
                logging.info("Continue processing other data entities")

        logging.info("Granule data added to data entities")


    def __updateDataEntity(self, dataEntityID):
        '''
        Retrieve the specified data entity and add a link to the current
        data granule, if required, then update the atom in eXist
        @param dataEntityID: ID of the data entity to augment with granule link
        '''
        logging.debug("Retrieving data entity atom - to attach granule to")
        doc = self._eXist.getAtom('dataent_' + dataEntityID)
        logging.debug("DE retrieved - now adding link to granule")
        de = Atom.Atom(xmlString=str(doc))
        noLinks = len(de.relatedLinks)
        de.addRelatedLinks(self._atom.atomBrowseURL + " | " + \
                           self._atom.title + " | " + \
                           self._atom.VTD.getTermCurrentVocabURL(VTD.GRANULE_TERM))

        # only save if need be
        if len(de.relatedLinks) == noLinks:
            logging.info("- data entity already contains link to this granule - skipping")
            return

        logging.debug("Now, save the updated DE atom back to eXist")
        self._eXist.createAtomInExist(de)
        logging.debug("DE atom updated")


    def _initialiseGranule(self):
        '''
        Create an Atom object ready to populate with data
        '''
        self._atom = Atom.Atom(VTD.GRANULE_TERM)


    def __processCSMLFile(self):
        logging.info("Processing CSML file")
        # only keep file name, if full path specified
        fileName = self._csmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        csmlDoc = self._atom.addCSMLData(self._csmlFileName, \
                                         self._csmlContent, useCSMLID = self.useCSMLID)
        logging.info("Adding CSML file to eXist")
        self._eXist.createOrUpdateEXistFile(csmlDoc.toPrettyXML(), \
                                        eXistConnector.NDG_A_COLLECTION_PATH + \
                                        self._atom.ME.providerID + '/', \
                                        fileName)
        logging.info("CSML file added to eXist")
        logging.info("Finished processing CSML file")


    def __processCDMLFile(self):
        logging.info("Processing CDML file")
        # only keep file name, if full path specified
        fileName = self._cdmlFileName
        fileName = fileName.split('/')[-1]
        fileName = fileName.split('\\')[-1]

        # firstly, save the doc to eXist
        # remove DOCTYPE tags - as these will prevent eXist from ingesting the doc
        self._cdmlContent = re.sub(r'<!DOCTYPE.*>', '', self._cdmlContent)
        logging.info("CDML file loaded")

        logging.info("Adding CDML file to eXist")
        self._eXist.createOrUpdateEXistFile(self._cdmlContent, \
                                            eXistConnector.NDG_A_COLLECTION_PATH + \
                                            self._atom.ME.providerID + '/',\
                                            fileName)
        self._atom.cdmlFile = eXistConnector.NDG_A_COLLECTION_PATH + fileName

        # create a local copy of the CDML file - NB, this is required if running
        # from web app
        fn = os.path.basename(str(datetime.datetime.today().microsecond) + fileName)
        cdmlFile = open(fn, 'wb')
        cdmlFile.write(self._cdmlContent)
        cdmlFile.close()
        message = 'The file "' + fn + '" was uploaded successfully'

        logging.info("Create CSML file from the CDML file - NB, this will be stored in eXist too " + \
                     "and will be used to extract dataset information from")
        csmlFileName = utilities.createCSMLFile(fn, self._datasetID, self._cdmlTimeAxis)
        os.remove(fn)

        logging.info("CSML file successfully created - now processing this")
        self.__addCSMLOrCDMLData(csmlFileName, None)

        # NB, can remove the CSML file now since the contents are read into memory
        os.remove(csmlFileName)
        logging.info("Finished processing CDML file")


    def __addCSMLOrCDMLData(self, fileName, fileContent):
        '''
        Given an unknown file type, determine whether it is CSML or CDML; if it
        is either, ingest the data appropriately; if not, just skip. NB, CDML docs
        are converted into CSML ones to allow the data ingest
        @param fileName: name of the file to ingest data from
        @param fileContent: content of the file - NB, if this is set to None and the
        file, fileName, is available locally, CsmlParser.Dataset will read in the file
        directly
        '''
        logging.info("Determining file type to add data from")
        if not fileContent:
            logging.info("- NB, file contents not provided - attempt to load")
            try:
                f = open(fileName, 'r')
                fileContent = f.read()
                f.close()
            except IOError, e:
                logging.error(e.message)

            if not fileContent:
                raise ValueError("Could not load data from file, '%s'" %fileName)

        # if we're dealing with a CDML file, process this to produce a CSML file
        if utilities.isCSMLFile(fileContent):
            self._csmlFileName = fileName
            self._csmlContent = fileContent
        elif utilities.isCDMLFile(fileContent):
            self._cdmlFileName = fileName
            self._cdmlContent = fileContent
        else:
            raise ValueError("Unrecognised file type, '%s'" %fileName)

        logging.info("Finished determining file type")



    def processCSMLOrCDMLFile(self):
        '''
        If a CSML or CDML file has been specified, process it - i.e.:
        - extract required data
        - add to eXist
        @return atom: Atom object of created data granule with CSML/CDML data
        added
        '''
        logging.info("Processing CSML/CDML data")
        # NB, if a CDML file is specified, a CSML file will be created from it and this will be used to
        # extract the required dataset info
        if self._cdmlFileName:
            self.__processCDMLFile()

        if self._csmlFileName:
            self.__processCSMLFile()
        logging.info("Finished processing CSML/CDML data")
        return self._atom


    def processGranulite(self):
        '''
        Complete the required granulite actions
        - ingest granulite data + add to granule atom
        - ingest CSML or CDML data + add to granule atom
        - save CSML/CDML data to eXist
        - add granule atom to eXist
        - add references to granule atom to specified data entity atoms

        @return atom: Atom object of created data granule
        '''
        logging.info("Processing granulite data")
        # load in the granulite details
        inputs = self.__getGranuliteDetails()

        # apply the basic, essential data
        self.__applyCoreGranuliteDetails(inputs)

        # check for CSML or CDML file and load details
        self.processCSMLOrCDMLFile()

        # apply any granulite data; NB, this overrides/augments the
        # CSML/CDML data by default
        self.__applyGranuliteDetails(inputs)

        # add the granule to eXist
        # check if we've got a valid dataset ID at this point - otherwise we won't have a valid filename or DB PK to use
        if not self._atom.datasetID:
            errorMessage = "No dataset ID specified for the granule - this needs " + \
                "to be set in the granulite file or in \nthe CSML/CDML files.  " + \
                "NB, without this cannot create valid entry in the eXist " + \
                "DB - so escaping now"
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        logging.info("Creating granule atom, '%s', in eXist DB" %self._atom.atomName)
        self._eXist.createOrUpdateEXistFile(self._atom.toPrettyXML(), \
                                            self._atom.getDefaultCollectionPath(), \
                                            self._atom.atomName)

        # now add the granule data to the data entity in eXist
        self._addGranuleToDataEntityRecords()

        logging.info("granulite processing complete")
        return self._atom


if __name__=="__main__":
    opts, args = getopt.getopt(sys.argv[1:], '-oxuvd')
    if len(args) < 1:
        print "Error: Please specify a granulite data file to process"
        sys.exit(2)

    logging.basicConfig(level = logging.DEBUG,
                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
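    # e.g. (illustrative): invoked as "python granulite.py mydata.granulite";
    # NB, the getopt flags parsed above are accepted but not acted on here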
    g = granulite(args[0])
    g.processGranulite()