source: exist/trunk/python/ndgUtils/vocabtermdata.py @ 4236

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/vocabtermdata.py@4236
Revision 4236, 19.7 KB checked in by cbyrom, 11 years ago (diff)

Improve the creation of new atom entries in eXist - generating unique
IDs, and checking these, where necessary + fix update date updates +
add methods to the vocab data class to retrieve lists of data categories
+ subtypes - for use in the UI dropdowns.

Line 
1#!/usr/bin/env python
2'''
3 Class to store and access the various vocab term data
4 
5 @author: C Byrom, Tessella Jul 2008
6'''
7import sys, logging, commands, string, os, time, re
8import urllib
9
class VocabTermItem(object):
    '''
    Class representing single vocab term item

    @ivar vocabURL: base URL of the vocab list the term belongs to (or a
    placeholder string for terms not yet defined on the vocab server)
    @ivar termID: ID of the term within its vocab list
    @ivar title: optional human readable label for the term
    '''
    def __init__(self, vocabURL, termID, title=None):
        self.vocabURL = vocabURL
        self.termID = termID
        self.title = title

    def __repr__(self):
        return "%s(%r, %r, title=%r)" % (self.__class__.__name__,
                                         self.vocabURL, self.termID,
                                         self.title)


class VocabTermData(object):
    '''
    Class representing vocab term data - including
    methods to look these up to ensure they are current

    The vocab server page listing the available vocab list versions is
    cached on the instance and re-fetched at most once every ONE_HOUR
    seconds; term versions derived from that page are cached in
    latestTermVersion and discarded whenever the page is re-fetched.
    '''

    URI_TERM = 'URI'
    LOGO_TERM = 'LOGO'
    NUM_SIM_TERM = 'NumSim'
    OPENDAP_TERM = 'OPENDAP'
    THREDDS_TERM = 'THREDDS'
    WMS_TERM = 'WMS'
    WCS_TERM = 'WCS'
    WFS_TERM = 'WFS'
    LAS_TERM = 'LAS'
    DATA_EXTRACTOR_TERM = 'DataExtractor'
    FILE_BROWSER_TERM = 'FileBrowser'
    CSML_TERM = 'CSML'

    ACTIVITY_TERM = 'ACTIVITY'
    DPT_TERM = 'DPT'
    OBS_TERM = 'OBS'
    GRANULE_TERM = 'GRANULE'
    DE_TERM = "DE"

    # dpt subtypes
    LIDAR_TERM = "dgLidar"
    RADAR_TERM = "dgRadar"
    SONDE_TERM = "dgSonde"
    NAVIGATION_TERM = "dgNavigation"
    GAS_CHROMATOGRAPH_TERM = "dgGasChromatograph"
    SPECTROMETER_TERM = "dgSpectrometer"
    MASS_SPECTROMETER_TERM = "dgMassSpectrometer"
    MET_SENSOR_TERM = "dgMetSensor"
    DOAS_TERM = "dgDOAS"
    ASOZ_TERM = "dgASOZ"
    RADIOMETER_TERM = "dgRadiometer"
    FAGE_TERM = "dgFAGE"
    IMAGER_TERM = "dgImager"
    FILTER_TERM = "dgFilter"
    PARTICLE_COUNTER_TERM = "dgParticleCounter"
    SAMPLER_TERM = "dgSampler"
    OTHER_INSTRUMENT_TYPE_TERM = "dgOtherInstrumentType"
    MODEL_TERM = "dgModel"
    INSTRUMENT_TERM = "dgInstrument"

    # de subtypes
    SIMULATION_TERM = "dgSimulation"
    ANALYSIS_TERM = "dgAnalysis"
    MEASUREMENT_TERM = "dgMeasurement"

    # activity subtypes
    DATA_COLLECTION_TERM = "dgActivityDataCollection"
    DATA_PROJECT_TERM = "dgActivityDataProject"
    DATA_CAMPAIGN_TERM = "dgActivityDataCampaign"
    DATA_INVESTIGATION_TERM = "dgActivityDataInvestigation"
    FLIGHT_TERM = "dgFlight"
    CRUISE_TERM = "dgCruise"
    FUNDING_PROGRAM_TERM = "dgFundingProgram"

    # obs subtypes
    STATIONARY_PLATFORM_TERM = "dgStationaryPlatform"
    MOVING_PLATFORM_TERM = "dgMovingPlatform"
    LAND_STATION_TERM = "dgLandStation"
    MOORING_TERM = "dgMooring"
    STATION_GROUP_TERM = "dgStationGroup"
    SHIP_TERM = "dgShip"
    AIRCRAFT_TERM = "dgAircraft"
    SATELLITE_TERM = "dgSatellite"
    COMPUTER_TERM = "dgComputer"

    # provider types
    BADC_TERM = 'badc.nerc.ac.uk'
    NEODC_TERM = 'neodc.nerc.ac.uk'

    # Lookup of term name -> VocabTermItem.  Entries whose vocabURL contains
    # 'NOT YET SET UP' are placeholders for terms without a vocab list.
    # NB, the titles are matched on by getTermFromTitle, so their text
    # (including the 'Atomspheric' spelling) is deliberately left untouched.
    TERM_DATA = {
        URI_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/N041', '3', title = 'Data Home Page'),
        BADC_TERM: VocabTermItem('NOT YET SET UP', BADC_TERM, title = 'British Atomspheric Data Centre'),
        NEODC_TERM: VocabTermItem('NOT YET SET UP', NEODC_TERM, title = 'NERC Earth Observation Data Centre'),
        LOGO_TERM: VocabTermItem('LOGO', 'LOGO', title = 'Logo'),
        NUM_SIM_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/N041', '7', title = 'NumSim description'),
        OPENDAP_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU010', title = 'GET DATA > OPENDAP DATA (DODS)'),
        THREDDS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU013', title = 'GET DATA > THREDDS DATA'),
        WMS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU026', title = 'GET SERVICE > GET WEB MAP SERVICE (WMS)'),
        WCS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU024', title = 'GET SERVICE > GET WEB COVERAGE SERVICE (WCS)'),
        WFS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU025', title = 'GET SERVICE > GET WEB FEATURE SERVICE (WFS)'),
        LAS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/P201', 'GCMDU006', title = 'GET DATA > LAS'),
        DATA_EXTRACTOR_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/N041', '8', title = 'NDG DataExtractor'),
        FILE_BROWSER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/N041', '9', title = 'File Browser'),
        CSML_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/N021', '1'),
        ACTIVITY_TERM: VocabTermItem('Activity - NOT YET SET UP', ACTIVITY_TERM, title = 'Activity'),
        DPT_TERM: VocabTermItem('DPT - NOT YET SET UP', DPT_TERM, title = 'Data Production Tool'),
        OBS_TERM: VocabTermItem('OBS - NOT YET SET UP', OBS_TERM, title = 'Observation Station'),
        GRANULE_TERM: VocabTermItem('GRAN - NOT YET SET UP', GRANULE_TERM, title = 'Data Granule'),
        DE_TERM: VocabTermItem('DE - NOT YET SET UP', DE_TERM, title = 'Data Entity'),

        LIDAR_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG02', title = LIDAR_TERM),
        RADAR_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG03', title = RADAR_TERM),
        SONDE_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG04', title = SONDE_TERM),
        NAVIGATION_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG05', title = NAVIGATION_TERM),
        GAS_CHROMATOGRAPH_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG06', title = GAS_CHROMATOGRAPH_TERM),
        SPECTROMETER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG07', title = SPECTROMETER_TERM),
        MASS_SPECTROMETER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG08', title = MASS_SPECTROMETER_TERM),
        MET_SENSOR_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG09', title = MET_SENSOR_TERM),
        DOAS_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG10', title = DOAS_TERM),
        ASOZ_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG11', title = ASOZ_TERM),
        RADIOMETER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG12', title = RADIOMETER_TERM),
        FAGE_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG13', title = FAGE_TERM),
        IMAGER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG14', title = IMAGER_TERM),
        FILTER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG15', title = FILTER_TERM),
        PARTICLE_COUNTER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG16', title = PARTICLE_COUNTER_TERM),
        SAMPLER_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG17', title = SAMPLER_TERM),
        OTHER_INSTRUMENT_TYPE_TERM: VocabTermItem('http://vocab.ndg.nerc.ac.uk/term/C330', 'NG99', title = OTHER_INSTRUMENT_TYPE_TERM),
        MODEL_TERM: VocabTermItem('dgModel - NOT YET SET UP', MODEL_TERM, title = MODEL_TERM),
        INSTRUMENT_TERM: VocabTermItem('dgInstrument - NOT YET SET UP', INSTRUMENT_TERM, title = INSTRUMENT_TERM),

        SIMULATION_TERM: VocabTermItem('NOT YET SET UP', SIMULATION_TERM, title = SIMULATION_TERM),
        ANALYSIS_TERM: VocabTermItem('NOT YET SET UP', ANALYSIS_TERM, title = ANALYSIS_TERM),
        MEASUREMENT_TERM: VocabTermItem('NOT YET SET UP', MEASUREMENT_TERM, title = MEASUREMENT_TERM),
        DATA_COLLECTION_TERM: VocabTermItem('NOT YET SET UP', DATA_COLLECTION_TERM, title = DATA_COLLECTION_TERM),
        DATA_PROJECT_TERM: VocabTermItem('NOT YET SET UP', DATA_PROJECT_TERM, title = DATA_PROJECT_TERM),
        DATA_CAMPAIGN_TERM: VocabTermItem('NOT YET SET UP', DATA_CAMPAIGN_TERM, title = DATA_CAMPAIGN_TERM),
        DATA_INVESTIGATION_TERM: VocabTermItem('NOT YET SET UP', DATA_INVESTIGATION_TERM, title = DATA_INVESTIGATION_TERM),
        FLIGHT_TERM: VocabTermItem('NOT YET SET UP', FLIGHT_TERM, title = FLIGHT_TERM),
        CRUISE_TERM: VocabTermItem('NOT YET SET UP', CRUISE_TERM, title = CRUISE_TERM),
        FUNDING_PROGRAM_TERM: VocabTermItem('NOT YET SET UP', FUNDING_PROGRAM_TERM, title = FUNDING_PROGRAM_TERM),

        STATIONARY_PLATFORM_TERM: VocabTermItem('NOT YET SET UP', STATIONARY_PLATFORM_TERM, title = STATIONARY_PLATFORM_TERM),
        MOVING_PLATFORM_TERM: VocabTermItem('NOT YET SET UP', MOVING_PLATFORM_TERM, title = MOVING_PLATFORM_TERM),
        LAND_STATION_TERM: VocabTermItem('NOT YET SET UP', LAND_STATION_TERM, title = LAND_STATION_TERM),
        MOORING_TERM: VocabTermItem('NOT YET SET UP', MOORING_TERM, title = MOORING_TERM),
        STATION_GROUP_TERM: VocabTermItem('NOT YET SET UP', STATION_GROUP_TERM, title = STATION_GROUP_TERM),
        SHIP_TERM: VocabTermItem('NOT YET SET UP', SHIP_TERM, title = SHIP_TERM),
        AIRCRAFT_TERM: VocabTermItem('NOT YET SET UP', AIRCRAFT_TERM, title = AIRCRAFT_TERM),
        SATELLITE_TERM: VocabTermItem('NOT YET SET UP', SATELLITE_TERM, title = SATELLITE_TERM),
        COMPUTER_TERM: VocabTermItem('NOT YET SET UP', COMPUTER_TERM, title = COMPUTER_TERM),
    }

    ATOM_TYPES = [ACTIVITY_TERM, DE_TERM, DPT_TERM, GRANULE_TERM, OBS_TERM]

    PROVIDER_TYPES = [BADC_TERM, NEODC_TERM]

    ATOM_CATEGORY = "atom"
    PROVIDER_CATEGORY = "provider"

    # A dictionary to group the various valid subtypes of atoms - grouped by their
    # main type
    SUBTYPE_TERMS = {
        DPT_TERM: [
            LIDAR_TERM, RADAR_TERM,
            SONDE_TERM, NAVIGATION_TERM,
            GAS_CHROMATOGRAPH_TERM, SPECTROMETER_TERM,
            MASS_SPECTROMETER_TERM,
            MET_SENSOR_TERM, DOAS_TERM,
            ASOZ_TERM, RADIOMETER_TERM,
            FAGE_TERM, IMAGER_TERM,
            FILTER_TERM, PARTICLE_COUNTER_TERM,
            SAMPLER_TERM, OTHER_INSTRUMENT_TYPE_TERM,
            MODEL_TERM, INSTRUMENT_TERM,
        ],
        DE_TERM: [
            SIMULATION_TERM, ANALYSIS_TERM,
            MEASUREMENT_TERM,
        ],
        ACTIVITY_TERM: [
            DATA_COLLECTION_TERM, DATA_PROJECT_TERM,
            DATA_CAMPAIGN_TERM, DATA_INVESTIGATION_TERM,
            FLIGHT_TERM, CRUISE_TERM,
            FUNDING_PROGRAM_TERM,
        ],
        OBS_TERM: [
            STATIONARY_PLATFORM_TERM, MOVING_PLATFORM_TERM,
            LAND_STATION_TERM, MOORING_TERM,
            STATION_GROUP_TERM, SHIP_TERM,
            AIRCRAFT_TERM, SATELLITE_TERM,
            COMPUTER_TERM,
        ],
    }

    # lifetime, in seconds, of the cached vocab server page
    ONE_HOUR = 3600.0

    VOCAB_SERVER_URL = 'http://vocab.ndg.nerc.ac.uk/clients/whatLists'

    BROWSE_ROOT_URL = "http://localhost:5000/view/"

    def __init__(self):
        logging.info("Setting up VocabTermData object")
        # cached copy of the vocab server page - fetched lazily on first use
        self.VOCAB_DATA_PAGE = None
        # time after which the cached page is regarded as stale; starting at
        # 'now' means the first lookup triggers a fetch
        self.REFRESH_TIME = time.time()
        # term name -> latest version (as a string; '' if no version applies)
        self.latestTermVersion = {}
        logging.info("VocabTermData object set up")


    def isValidSubType(self, mainType, subType):
        '''
        Determine whether a specified subtype is valid for a particular
        main type
        @param mainType: term ID of the main type of the data
        @param subType: term ID of the subtype of the data
        @return: True if the subtype is valid
        @raise ValueError: if the mainType is not recognised or the subType is not valid
        '''
        # NB, use 'in' rather than dict.has_key - the latter no longer
        # exists in python 3
        if mainType not in self.SUBTYPE_TERMS:
            errorMessage = "Error: unrecognised data type: '%s'" %mainType
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        if subType in self.SUBTYPE_TERMS[mainType]:
            return True

        errorMessage = "Error: subtype, '%s' is not valid for data type, '%s'" \
            %(subType, mainType)
        logging.error(errorMessage)
        raise ValueError(errorMessage)


    def getValidTypes(self, category):
        '''
        Return a list of the valid types available for a particular data category
        @param category: type of term info to look up - e.g. atoms or providers
        - this should be specified using the ..._CATEGORY class variables
        defined above
        @return: list of VocabTermItem objects for the specified category
        @raise ValueError: if the category is not recognised
        '''
        logging.debug("Lookup up list of valid %s types" %category)
        if category == self.ATOM_CATEGORY:
            catList = self.ATOM_TYPES
        elif category == self.PROVIDER_CATEGORY:
            catList = self.PROVIDER_TYPES
        else:
            errorMessage = "Unrecognised data category, '%s'" %category
            # log as an error, in line with the other validation failures
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        return [self.TERM_DATA[st] for st in catList]


    @staticmethod
    def _tidyTitle(title):
        '''
        Convert a 'dg' prefixed, camelcase title into a display label,
        e.g. 'dgGasChromatograph' -> 'Gas Chromatograph'
        @param title: title to tidy up
        @return: tidied title; titles not starting with 'dg' (or empty/None
        titles) are returned unchanged
        '''
        if not title or not title.startswith('dg'):
            return title
        # only split at a lower case -> upper case boundary, so that acronyms
        # are kept together, e.g. 'dgDOAS' -> 'DOAS' rather than 'D O A S'
        return re.sub(r'([a-z0-9])([A-Z])', r'\1 \2', title[2:]).strip()


    def getValidSubTypes(self, atomType):
        '''
        Get list of subtypes that are valid wrt a specified atom type
        @param atomType: term ID for the atom type whose subtypes need to
        be looked up
        @return: list of VocabTermItem objects for the valid subtypes, with
        their titles tidied up for display; an empty list if the atom type
        has no subtypes.  NB, the items are copies - the shared TERM_DATA
        entries are not altered.
        '''
        logging.debug("Lookup up subtypes for atom type, '%s'" %atomType)
        subTypes = self.SUBTYPE_TERMS.get(atomType) or []
        types = []
        for st in subTypes:
            item = self.TERM_DATA[st]
            # return a tidied copy - editing the item in place would change
            # the class level TERM_DATA for every user of this class and make
            # title lookups depend on whether this method had been called
            types.append(VocabTermItem(item.vocabURL, item.termID,
                                       title = self._tidyTitle(item.title)))
        logging.debug("Found subtypes: %s" %subTypes)
        return types


    def getTermFromTitle(self, title):
        '''
        Given a term title/label, get back the related term id
        @param title: title/label of term id to retrieve - either as defined
        in TERM_DATA or in the tidied up form returned by getValidSubTypes
        @return: term ID of the matching term
        @raise ValueError: if more than one title or no title is returned
        '''
        termID = [val.termID for val in self.TERM_DATA.values()
                  if title in (val.title, self._tidyTitle(val.title))]

        if len(termID) != 1:
            errorMessage = "Error: could not accurately determine the vocab term " \
                "ID for the label, '%s' - %s values returned" %(title, len(termID))
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        return termID[0]


    def _versionIsCurrent(self, termName):
        '''
        Determine whether a usable cached version exists for a term
        @param termName: name of term to check
        @return: True if a version is cached for the term and the vocab
        server data it was derived from has not yet expired
        '''
        return termName in self.latestTermVersion and \
            self.REFRESH_TIME >= time.time()


    def _getVocabDataPage(self):
        '''
        Getter method to allow regular refreshing of data
        @return: contents of the vocab server page
        '''
        if self.REFRESH_TIME < time.time():
            self.getVocabServerData()

        return self.VOCAB_DATA_PAGE


    def getVocabServerData(self):
        '''
        Retrieve the web page containing the versioning info for the various vocab terms
        - the page is cached for ONE_HOUR and any previously cached term
        versions are discarded
        '''
        logging.info("Retrieving vocab server data page")
        # urlopen lives in different places in python 2 and 3
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        f = urlopen(self.VOCAB_SERVER_URL)
        try:
            pageData = f.read()
        finally:
            # ensure the connection is released even if the read fails
            f.close()

        # python 3 returns bytes here - decode so the page can be searched
        # using text patterns
        if not isinstance(pageData, str):
            pageData = pageData.decode('utf-8', 'replace')

        self.VOCAB_DATA_PAGE = pageData
        self.REFRESH_TIME = time.time() + self.ONE_HOUR
        # versions worked out from the old page may now be out of date
        self.latestTermVersion = {}
        logging.info("Vocab server data retrieved")


    def getLatestTermVersions(self):
        '''
        Retrieve the latest versions of the vocab terms required
        - NB, refreshes the data on an hourly basis, if necessary
        '''
        # now need to parse the returned data to get the current version number
        for termName in self.TERM_DATA:
            if not self._versionIsCurrent(termName):
                self.getLatestTermVersion(termName)


    def getLatestTermVersion(self, termName):
        '''
        Parse the vocab server data and determine the latest version number of the term with the specified name
        - the result is stored in latestTermVersion as a string; an empty
        string is stored if the term's URI has no separate version or was not
        found on the vocab server
        @param termName: name of term whose current version needs to be established
        '''
        logging.info("Getting latest term version for term, '%s'" %termName)
        # check for recent data - NB, the term will probably share a base url with other terms
        # so may have already have the correct URL set up
        if not self._versionIsCurrent(termName):
            uri = self.TERM_DATA[termName].vocabURL
            currentVersion = self._getURIVersion(uri)

            # avoid storing the text 'None' when the uri was not found
            if currentVersion is None:
                currentVersion = ''
            self.latestTermVersion[termName] = str(currentVersion)
        logging.info("Latest term version for term, '%s' retrieved" %termName)


    def _getURIVersion(self, uri):
        '''
        Parse the server data for a specified uri and return the latest version number of it
        @param uri: uri to look for
        @return: version of vocab uri (as an int), '' if uri contains latest version and None if uri not found
        NB, a ValueError is not currently raised when the uri is not found -
        see the TODO below
        '''
        # allow the data to be refreshed on an hourly basis
        pageData = self._getVocabDataPage() or ''

        uri = uri.replace('/term/', '/list/').rstrip('/')
        # NB, the uri may already have a version number included in it - so allow this in the search
        # - and escape the uri so that characters such as '.' are matched literally
        regExp = re.compile(re.escape(uri) + r'(?:/(\d+))?')
        currentVersion = None
        for match in regExp.finditer(pageData):
            version = match.group(1)
            # version number must be included in original uri, if match found without a version number
            # being split out - so ignore this - since the original uri is already specified in full
            if not version:
                currentVersion = ''
                break

            # TODO: check how the versioning system will be done - i.e. 3.2.4 type versioning will cause this to break
            version = int(version)
            if currentVersion is None or version > currentVersion:
                currentVersion = version

        if currentVersion is None:
            errorMessage = "Could not find information in vocab server for uri, '%s' - exiting" %uri
            logging.error(errorMessage)
            # TODO: uncomment the ValueError once all the vocab terms have been defined
            #raise ValueError(errorMessage)

        return currentVersion


    def getTermCurrentVocabURL(self, termName):
        '''
        Get the current URL on the vocab server to the specified term
        @param termName: name of term whose URL to return
        @return: URL of the form <vocabURL>/<version>/<termID>; the version
        part is left out if no version could be established for the term
        @raise ValueError: if the term is not one of those defined in TERM_DATA
        '''
        logging.debug("Looking up vocab data for term: '%s'" %termName)

        if termName not in self.TERM_DATA:
            errorMessage = "Could not find term, '%s' in defined list of valid vocab terms - exiting" %termName
            logging.error(errorMessage)
            raise ValueError(errorMessage)

        if not self._versionIsCurrent(termName):
            self.getLatestTermVersion(termName)

        term = self.TERM_DATA[termName]
        urlParts = [term.vocabURL]
        version = self.latestTermVersion[termName]
        # skip the version if there isn't one - to avoid 'url//id' type urls
        if version:
            urlParts.append(version)
        urlParts.append(term.termID)
        uri = "/".join(urlParts)
        logging.debug("Returning vocab URL: '%s'" %uri)
        return uri


    def getCurrentVocabURI(self, uri):
        '''
        Look up a specified URI and return the current version of it
        @param uri: uri to look up
        @return: uri with the latest version number appended, or the uri
        unchanged if no version number could be established for it
        '''
        logging.debug("Looking up current version of uri: '%s'" %uri)
        currentVersion = self._getURIVersion(uri)
        logging.debug("URI version looked up")
        if currentVersion is None or currentVersion == '':
            return uri

        if not uri.endswith('/'):
            uri += '/'
        return uri + str(currentVersion)
Note: See TracBrowser for help on using the repository browser.