source: exist/trunk/python/ndgUtils/lib/granulite.py @ 4679

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/granulite.py@4679
Revision 4679, 24.8 KB checked in by cbyrom, 11 years ago (diff)

Extend granulite to allow command line operation - with input options
to specify the logging level and a 'replace atom' mode - which uses
command line input to ask users whether they want to replace duplicated data.

Line 
1#!/usr/bin/env python
2'''
Data model representing a granulite file - together with utilities to
4augment atom data with
5
6 @author: C Byrom, Tessella Nov 08
7'''
8import os, sys, string, getopt, logging, re, datetime, cgi
9from ndgUtils.eXistConnector import eXistConnector
10from ndgUtils.models import Atom
11import ndgUtils.lib.utilities as utilities
12from ndgUtils.models.vocabtermdata import VocabTermData as VTD
13import ndgUtils.lib.existdbclient as edc
14
class granulite(object):
    '''
    Granulite data model

    Parses a granulite config file and applies its contents to a data
    granule Atom, storing the results (plus any CSML/CDML files) in an
    eXist DB.
    '''
    # expected sections in a granulite file
    AUTHORS_SECTION = "authors"
    FILE_SECTION = "files"
    PARAMETERS_SECTION = "parameters"
    COVERAGE_SECTION = "coverage"
    CSML_SECTION = "csml_file"
    CDML_SECTION = "cdml_file"
    GRANULE_INFO_SECTION = "granule_info"
    GRANULE_AUTHORS_SECTION = "granule_authors"
    SUMMARY_SECTION = "summary"
    DATA_ENTITY_SECTION = "data_entity_id"
    LOGO_SECTION = "logo"
    RELATED_LINKS_SECTION = "related_links"

    # need to distinguish coverage data between spatial and temporal data - use these variables to do so
    TEMPORAL_DATA = "temporal_data"
    SPATIAL_DATA = "spatial_data"

    # group the data together as either single valued or array valued - to ease setting attributes on Atom
    # - each dict maps granulite section name -> Atom attribute name
    singleVals = {} 
    arrayVals = {AUTHORS_SECTION:'authors', \
                 FILE_SECTION:'files', GRANULE_AUTHORS_SECTION:'atomAuthors', \
                 SUMMARY_SECTION:'summary'}
       
    # config file with eXist DB details
    DBCONFIG_FILE = "exist.config"

    # default title given to CSML files by csmlscan
    DEFAULT_CSML_TITLE = "NAME OF DATASET GOES HERE"
   
    # flag to use when running in test mode
    TEST_MODE = "granulite_test_mode"
   
    # eXist DB client - set up in __init__
    _eXist = None
   
    # flag to mark mode of operation
    # NOTE(review): not referenced elsewhere in this module - confirm still needed
    _isOverride = False

    # info on specified CDML + CSML
    _cdmlFileName = None
    _cdmlTimeAxis = None
    _datasetID = None
    _csmlFileName = None

    # info on the datasets to attach the granule to
    # NOTE(review): class-level mutable default, shared between instances until
    # reassigned - it is replaced (not mutated) in __applyGranuliteDetails
    _dataEntityIDs = []
   
    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"
70       
    def __init__(self, granulite, granuleAtom = None, \
                  eXistClient = None, csmlOrCdmlFile = None, \
                  aggregateCoverage = False, useCSMLID = False, 
                  timeAxis = 'time', datasetID = None, replaceAtom = False):
        '''
        Constructor for granulite object - NB, the csml/cdml keywords allow use
        in a webapp - outside of this, this data should be specified in the granulite
        file
       
        @param granulite: either the granulite filename or contents
        @keyword granuleAtom: an existing granule atom - to add granulite data to
        - NB, if not specified, a new atom is used
        @keyword eXistClient: an eXist connection - NB, if not specified, a new
        client is created using the DBCONFIG_FILE config file
        @keyword csmlOrCdmlFile: a cgi.FieldStorage object with CSML or CDML data
        @keyword aggregateCoverage: if set to True, only coverage data that extends the
        atom coverage data will be added
        - NOTE(review): this keyword is accepted but never stored or read in this
        module - confirm whether it is still required
        @keyword useCSMLID: if True, use the CSML doc ID as the dataset ID - NB,
        this should only be True if creating a new atom - e.g. from a granulite
        @keyword timeAxis: if using a CDML file, specify the time axis to use - 'time'
        is the default
        @keyword datasetID: if using a CDML file, specify the ID of the dataset to use
        - otherwise one will be randomly generated
        @keyword replaceAtom: if True, and if a granule atom is found with the ID of
        the atom to be created by the granulite, automatically overwrite the older
        atom
        '''
        logging.info("Creating granulite data model")
       
        # either the granulite filename or its full contents - resolved in
        # __getGranuliteDetails
        self._granulite = granulite
       
        # set up connection to eXist
        if eXistClient:
            self._eXist = eXistClient
        else:
            self._eXist = edc.eXistDBClient(configFile = self.DBCONFIG_FILE)
       
        # override CSML/CDML data specified in the granulite file with data input directly
        self.ingestGranuliteFiles = True
        # NB, empty FieldStorage fields end up as empty strings
        if csmlOrCdmlFile is not None:
            self.ingestGranuliteFiles = False
            if csmlOrCdmlFile != '':
                if not isinstance(csmlOrCdmlFile, cgi.FieldStorage):
                    raise ValueError("Specified CSML/CDML file is not a cgi.FieldStorage object")
                self.__addCSMLOrCDMLData(csmlOrCdmlFile.filename, csmlOrCdmlFile.value)
       

        if granuleAtom:
            if not isinstance(granuleAtom, Atom.Atom):
                raise ValueError("Specified granule atom is not an Atom object")
            self._atom = granuleAtom
        else:
            # create a skeleton granule
            self.__initialiseGranule()
           
        self._replaceAtom = replaceAtom
        self.useCSMLID = useCSMLID
        self._cdmlTimeAxis = timeAxis
        self._datasetID = datasetID
           
        logging.info("Granulite data model set up")
132
133       
134    def __getSectionName(self, str):
135        '''
136        Checks an input string to see if it contains a section title; if so
137        return this title name, otherwise return None
138        @param str: string to parse for section name
139        @return: section name, if found, None otherwise
140        '''
141        sectionName = None
142        if str.count('::') == 1:
143            sectionName = str.partition('::')[0]
144        return sectionName
145
146
147    def __getGranuliteDetails(self):
148        '''
149        Load the granulite config file and extract the required data 
150        '''
151        logging.info("Retrieving data from granulite config file")
152       
153        # assume we've already read in the file if it is multi-line
154        newLine = None
155        # cope with unix and dos end of lines
156        if self._granulite.find('\n') > -1:
157            newLine = '\n'
158        elif self._granulite.find('\r') > -1:
159            newLine = '\r'
160       
161        if newLine:
162            granuliteData = self._granulite.split(newLine)
163        else:
164            # Check this file exists
165            if not os.path.isfile(self._granulite):
166                raise ValueError("ERROR: Could not find the config file, %s; please specify " \
167                         "a valid file" %self._granulite)
168               
169            granulite_file = open(self._granulite, "r")
170            granuliteData = granulite_file.readlines()
171            granulite_file.close()
172       
173        # create a dictionary of all data in file - then use this to get the required data
174        granulite_data = {}
175       
176        # initialise vocab term data lookup object
177        self.VTD = VTD()
178
179        sectionName = "" # variable to indicate what section we're currently parsing in the granulite file data
180        logging.info("Parsing granulite file...")
181        for line in granuliteData:
182           
183            line = line.strip()
184            # avoid comments
185            if (line.startswith('#')):
186                continue
187           
188            # avoid empty lines - except in the summary section where these may be appropriate
189            if not line and sectionName != self.SUMMARY_SECTION:
190                continue
191           
192            section = self.__getSectionName(line)
193
194            # if a section name is returned, we're at a new section of the file - so change the dictionary key
195            if section:
196                sectionName = section
197                continue
198           
199            # cope with case where there is rubbish header info in the file
200            if not sectionName:
201                continue
202           
203            # initialise dictionary array if required
204            if sectionName not in granulite_data:
205                granulite_data[sectionName] = []
206           
207            if sectionName != self.COVERAGE_SECTION:
208                logging.debug("Adding %s data: -%s-" %(sectionName, line))
209                granulite_data[sectionName].append(line)
210            else:
211                # if it's coverage data, determine if it is spatial or temporal data
212                coverageType = self.SPATIAL_DATA
213                if line.lower().find('time') > -1:
214                    coverageType = self.TEMPORAL_DATA
215               
216                # initialise dictionary array if required
217                if coverageType not in granulite_data:
218                    granulite_data[coverageType] = []
219                logging.debug("Adding %s data: %s" %(coverageType, line))
220                granulite_data[coverageType].append(line)
221                   
222        logging.info("Finished parsing granulite file")
223        logging.info("- returning dict of ingested data")
224        return granulite_data
225       
226       
227    def __applyCoreGranuliteDetails(self, granulite_data):
228        '''
229        Apply the essential data read in from the granulite file to the atom data model
230        being processed by the granulite
231        @param granulite_data: the dict produced by __getGranuliteDetails
232        '''
233        # add the general granule info
234        if self.GRANULE_INFO_SECTION not in granulite_data:
235            raise ValueError("Need granule_info section in granulite input file")
236       
237        data = utilities.getTripleData(granulite_data[self.GRANULE_INFO_SECTION][0])
238        if not data[0]:
239            raise ValueError("Provider ID is missing for granule; please add this info to the " + \
240                             self.GRANULE_INFO_SECTION + " section of the granulite config file")
241        self._atom.addMolesEntityData(None, data[0], None)
242        self._atom.setDatasetID(data[1])
243        self._atom.title = data[2]
244
245        # NB, if running from the web, ignore the CSML/CDML files specified in the granulite
246        if self.ingestGranuliteFiles:
247            # check for CSML/CDML file input - these data are changed before adding to the granule
248            # - since the file will be stored and referenced in eXist by then
249            if not self._csmlFileName:
250                if self.CSML_SECTION in granulite_data:
251                    self.__addCSMLOrCDMLData(granulite_data[self.CSML_SECTION][0], None)
252   
253            if not self._cdmlFileName:
254                if self.CDML_SECTION in granulite_data:
255                    if self._csmlFileName:
256                        raise ValueError("Cannot specify both CDML and CSML file in granulite config file" + \
257                                         "\nNB, CSML file is generated from the specified CDML file")
258               
259                    data = utilities.getTripleData(granulite_data[self.CDML_SECTION][0])
260                    self.__addCSMLOrCDMLData(data[0], None)
261                    self._datasetID = data[1]
262                    self._cdmlTimeAxis = data[2]
263       
264       
265    def __applyGranuliteDetails(self, granulite_data):
266        '''
267        Apply the data read in from the granulite file to the atom data model
268        being processed by the granulite
269        @param granulite_data: the dict produced by __getGranuliteDetails
270        '''
271        logging.info("Applying granulite data to atom")
272        # add the single value attributes to the granule
273        for attribute in self.singleVals:
274            if attribute in granulite_data:
275                self._atom.setAttribute(self.singleVals[attribute], \
276                                        granulite_data[attribute][0])
277
278        # NB, explicitly set the related links before running the arrayVals loop, since
279        # this will replace any file Links created if it occurs after these have been
280        # created (was originally in arrayVals, but this doesn't enforce any specific ordering)
281        if self.RELATED_LINKS_SECTION in granulite_data:
282            self._atom.setAttribute('relatedLinks', \
283                                    granulite_data[self.RELATED_LINKS_SECTION])
284
285        # now add the arrays data
286        for attribute in self.arrayVals:
287            if attribute in granulite_data:
288                self._atom.setAttribute(self.arrayVals[attribute], \
289                                        granulite_data[attribute])
290       
291        if self.LOGO_SECTION in granulite_data:
292            for logo in granulite_data[self.LOGO_SECTION]:
293                self._atom.addRelatedLinks(logo + " | Logo | " + \
294                                           self.VTD.getTermCurrentVocabURL(VTD.LOGO_TERM))
295
296        # add the parameters data via the method - since this does some tidying up of the data
297        if self.PARAMETERS_SECTION in granulite_data:
298            self._atom.addParameters(granulite_data[self.PARAMETERS_SECTION])
299           
300        if self.DATA_ENTITY_SECTION in granulite_data:
301            self._dataEntityIDs = granulite_data[self.DATA_ENTITY_SECTION] 
302
303        # now add any coverage data
304        if self.SPATIAL_DATA in granulite_data:
305            self._extractSpatialData(granulite_data[self.SPATIAL_DATA][0])
306        if self.TEMPORAL_DATA in granulite_data:
307            self._extractTemporalData(granulite_data[self.TEMPORAL_DATA][0])
308
309        logging.info("All input data set up")
310       
311
312    def _extractSpatialData(self, geomString):
313        '''
314        Extract bounding box info from the specified geometry string
315        @param geomString: A string holding geometry info
316        NB, currently the method supports parsing of POLYGONs
317        '''
318        if geomString.upper().find('POLYGON') > -1:
319            logging.debug("Spatial data specified in POLYGON format - extracting data from this")
320            vals = re.findall('([\-\d\.]+)', geomString)
321            # assume we're dealing with a rectangle normal to the equator...
322            if len(vals) == 10:
323                self._atom.minX = vals[0]
324                self._atom.minY = vals[1]
325                self._atom.maxX = vals[4]
326                self._atom.maxY = vals[5]
327        else:
328            errorMessage = "Spatial coverage data not stored in POLYGON format - please correct and rerun"
329            logging.error(errorMessage)
330            raise ValueError(errorMessage)
331
332
333    def _extractTemporalData(self, dateRangeString):
334        '''
335        Extract temporal info from the specified daterange string
336        @param dateRangeString: A string holding temporal info
337        NB, currently the method supports parsing of TIMERANGE objects
338        '''
339        if dateRangeString.upper().find('TIMERANGE') == -1:
340            errorMessage = "Temporal data not stored in TIMERANGE() format - please correct and rerun"
341            logging.error(errorMessage)
342            raise ValueError(errorMessage)
343       
344        vals = re.findall('([0-9][0-9\-:TZ ]+)', dateRangeString)
345        if vals:
346            logging.debug("Adding start time: %s" %vals[0])
347            self._atom.t1 = vals[0]
348            if len(vals) > 1:
349                logging.debug("Adding finish time: %s" %vals[1])
350                self._atom.t2 = vals[1]
351     
352   
353    def __addGranuleToDataEntityRecords(self):
354        '''
355        Augment any data entity records, specified in the granulite, with a reference to
356        the granule
357        '''
358        logging.info("Adding granule info to data entities")
359        if not self._dataEntityIDs:
360            logging.info("No data entity IDs were specified in granulite - so will not add granule data to cedarmoles DB")
361            return
362       
363        # now set up the granule links to the data entities specified
364        for entityID in self._dataEntityIDs:
365            data = utilities.getTripleData(entityID)
366            dataEntityID = data[0]
367            dataEntityProviderID = data[1]
368           
369            try:
370                self.__updateDataEntity(dataEntityID)
371            except:
372                logging.error("Exception thrown - detail: ")
373                logging.error(sys.exc_info())
374                logging.info("Continue processing other data entities")
375
376        logging.info("Granule data added to data entities")
377
378
379    def __updateDataEntity(self, dataEntityID):
380        '''
381        Retrieve the specified data entity and add a link to the current
382        data granule, if required, then update the atom in eXist
383        @param dataEntityID: ID of the data entity to augment with granule link
384        '''
385        logging.debug("Retrieving data entity atom - to attach granule to")
386        doc = self._eXist.getAtom('dataent_' + dataEntityID)
387        logging.debug("DE retrieved - now adding link to granule")
388        de = Atom.Atom(xmlString=str(doc))
389        noLinks = len(de.relatedLinks)
390        de.addRelatedLinks(self._atom.atomBrowseURL + " | " + \
391                           self._atom.title + " | " + \
392                           self._atom.VTD.getTermCurrentVocabURL(VTD.GRANULE_TERM))
393       
394        # only save if need be
395        if len(de.relatedLinks) == noLinks:
396            logging.info("- data entity already contains link to this granule - skpping")
397            return
398       
399        logging.debug("Now, save the updated DE atom back to eXist")
400        self._eXist.createAtomInExist(de)
401        logging.debug("DE atom updated")
402
403
    def __initialiseGranule(self):
        '''
        Create a skeleton granule Atom object - ready to populate with data
        '''
        self._atom = Atom.Atom(VTD.GRANULE_TERM)
409
410
411    def __processCSMLFile(self):
412        logging.info("Processing CSML file")
413        # only keep file name, if full path specified
414        fileName = self._csmlFileName
415        fileName = fileName.split('/')[-1]
416        fileName = fileName.split('\\')[-1]
417       
418        self._atom.addCSMLData(self._csmlFileName, \
419                               self._csmlContent, useCSMLID = self.useCSMLID)
420        logging.info("Adding CSML file to eXist")
421        self._eXist.createOrUpdateEXistFile(self._csmlContent, \
422                                        eXistConnector.NDG_A_COLLECTION_PATH + \
423                                        self._atom.ME.providerID + '/', \
424                                        fileName)
425        logging.info("CSML file added to eXist")
426        logging.info("Finished processing CSML file")
427           
428
429    def __processCDMLFile(self):
430        logging.info("Processing CDML file")
431        # only keep file name, if full path specified
432        fileName = self._cdmlFileName
433        fileName = fileName.split('/')[-1]
434        fileName = fileName.split('\\')[-1]
435       
436        # firstly, save the doc to eXist
437        # remove DOCTYPE tags - as these will prevent eXist from ingesting the doc
438        self._cdmlContent = re.sub(r'<!DOCTYPE.*>', '', self._cdmlContent)
439        logging.info("CDML file loaded")
440
441        logging.info("Adding CDML file to eXist")
442        self._eXist.createOrUpdateEXistFile(self._cdmlContent, \
443                                            eXistConnector.NDG_A_COLLECTION_PATH + \
444                                            self._atom.ME.providerID + '/',\
445                                            fileName)
446        self._atom.cdmlFile = eXistConnector.NDG_A_COLLECTION_PATH + fileName
447       
448        # create a local copy of the CDML file - NB, this is required if running
449        # from web app
450        fn = os.path.basename(str(datetime.datetime.today().microsecond) + fileName)
451        cdmlFile = open(fn, 'wb')
452        cdmlFile.write(self._cdmlContent)
453        cdmlFile.close()
454        message = 'The file "' + fn + '" was uploaded successfully'
455       
456        logging.info("Create CSML file from the CDML file - NB, this will be stored in eXist too " + \
457                     "and will be used to extract dataset information from")
458        csmlFileName = utilities.createCSMLFile(fn, self._datasetID, self._cdmlTimeAxis)
459        os.remove(fn)
460       
461        logging.info("CSML file successfully created - now processing this")
462        self.__addCSMLOrCDMLData(csmlFileName, None)
463       
464        # NB, can remove the CSML file now since the contents are read into memory
465        os.remove(csmlFileName)
466        logging.info("Finished processing CDML file")
467
468
469    def __addCSMLOrCDMLData(self, fileName, fileContent):
470        '''
471        Given an unknown file type, determine whether it is CSML or CDML; if it
472        is either, ingest the data appropriately; if not, just skip. NB, CDML docs
473        are converted into CSML ones to allow the data ingest
474        @param fileName: name of the file to ingest data from
475        @param fileContent: content of the file - NB, if this is set to None and the
476        file, fileName, is available locally, CsmlParser.Dataset will read in the file
477        directly
478        '''
479        logging.info("Determining file type to add data from")
480        if not fileContent:
481            logging.info("- NB, file contents not provided - attempt to load")
482            try:
483                f = open(fileName, 'r')
484                fileContent = f.read()
485                f.close
486            except IOError, e:
487                logging.error(e.message)
488               
489            if not fileContent:
490                raise ValueError("Could not load data from file, '%s'" %fileName)
491
492        # if we're dealing with a CDML file, process this to produce a CSML file
493        if utilities.isCSMLFile(fileContent):
494            self._csmlFileName = fileName
495            self._csmlContent = fileContent
496        elif utilities.isCDMLFile(fileContent):
497            self._cdmlFileName = fileName
498            self._cdmlContent = fileContent
499        else:
500            raise ValueError("Unrecognised file type, '%s'" %fileName)
501       
502        logging.info("Finished determining file type")
503       
504
505
    def processCSMLOrCDMLFile(self):
        '''
        If a CSML or CDML file has been specified, process it - i.e.:
        - extract required data
        - add to eXist
        @return atom: Atom object of created data granule with CSML/CDML data
        added
        '''
        logging.info("Processing CSML/CDML data")
        # NB, if a CDML file is specified, a CSML file will be created from it and this will be used to
        # extract the required dataset info
        if self._cdmlFileName:
            # NB, this sets _csmlFileName from the generated file - so the
            # CSML branch below then runs as well
            self.__processCDMLFile()
           
        if self._csmlFileName:
            self.__processCSMLFile()
        logging.info("Finished processing CSML/CDML data")
        return self._atom
524
525       
    def processGranulite(self, replaceAtom = False):
        '''
        Complete the required granulite actions
        - ingest granulite data + add to granule atom
        - ingest CSML or CDML data + add to granule atom
        - save CSML/CDML data to eXist
        - add granule atom to eXist
        - add references to granule atom to specified data entity atoms
        @keyword replaceAtom: if True, allow granule atoms to replace existing
        atoms with the same ID - if False, throw a DuplicationError
        @raise DuplicationError: if replaceAtom is False (both here and in the
        constructor) and an atom is found with the same ID as that to be created
        @return atom: Atom object of created data granule
        '''
        logging.info("Processing granulite data")
        # load in the granulite details
        inputs = self.__getGranuliteDetails()
       
        # apply the basic, essential data
        self.__applyCoreGranuliteDetails(inputs)
       
        # check for CSML or CDML file and load details
        self.processCSMLOrCDMLFile()

        # apply any granulite data; NB, this overrides/augments the
        # CSML/CDML data by default
        self.__applyGranuliteDetails(inputs)
       
        # add the granule to eXist - if this exists already a DuplicationError
        # will be thrown if backups are not allowed
        # NB, the keyword here is OR-ed with the flag set in the constructor
        doReplace = replaceAtom or self._replaceAtom
        logging.info("Creating granule atom, '%s', in eXist DB" %self._atom.atomName)
        self._eXist.createAtomInExist(self._atom, replaceAtom = doReplace)

        # now add the granule data to the data entity in eXist
        self.__addGranuleToDataEntityRecords()
       
        logging.info("granulite processing complete")
        return self._atom
565
566
567if __name__=="__main__":
568    opts, args = getopt.getopt(sys.argv[1:], '-rxvd')
569    if len(args) < 1:
570        print "Error: Please specify a granulite data file to process"
571        sys.exit(2)
572       
573    loggingLevel = logging.WARNING
574    isReplace = False
575    for o, a in opts:
576        if o == "-v":
577            print " - Verbose mode ON"
578            loggingLevel = logging.INFO
579        elif o == "-d":
580            print " - Debug mode ON"
581            loggingLevel = logging.DEBUG
582        elif o == "-r":
583            print " - Replace mode ON"
584            isReplace = True
585        elif o == "-x":
586            print " - Delete mode ON"
587            isDelete = True
588   
589    logging.basicConfig(level = loggingLevel,
590                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
591    g = granulite(args[0], replaceAtom = isReplace)
592   
593    try:
594        g.processGranulite()
595    except edc.DuplicateError, e:
596        # allow the user to specify if they want to overwrite the duplicated atom
597        print e.message
598        input = raw_input("- do you want to overwrite this with the granulite data? (y/n) ")
599        if input.strip().upper() == 'Y':
600            print "OK - replacing old granule atom..."
601            g.processGranulite(replaceAtom = True)
602        else:
603            print "Exiting...."
Note: See TracBrowser for help on using the repository browser.