source: exist/trunk/python/ndgUtils/models/utilities.py @ 4427

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/utilities.py@4427
Revision 4427, 7.4 KB checked in by cbyrom, 11 years ago (diff)

Upgrade the various 'list' xqueries - to nest all results in a single
root element - so that only one document need be retrieved to get all
results + adjust ndgDirectory to cope with processing the new results +
fix the various namespaces mentioned in the codebase to map to the
current atom/moles ones.

RevLine 
[4414]1import os, sys, logging, re, cgi
[4209]2from ndgUtils.ETxmlView import subAI
[4419]3import csml.csmlscan as CsmlScan
[4209]4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
9_subtool = subAI()    # tool to escape angular brackets - enable as field variable for easy reuse
10
# Pattern for a (possibly truncated) ISO 8601 date/time, anchored on a four digit year.
# Each finer-grained component is optional, but only if the coarser one is present.
# Capture groups of interest:
#   1 = year, 3 = month, 5 = day, 7 = hours, 8 = minutes, 10 = seconds,
#   12 = fractional seconds, 13 = timezone ('Z' or offset),
#   15 = offset sign, 16 = offset hours, 17 = offset minutes
# NB, raw strings are used so that '\s' and '\.' reach the regex engine untouched,
# without relying on python passing unknown escape sequences through
ISO8601_RE = (
    r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"
)

# compiled once at import time for reuse by the date helper methods
regExp = re.compile(ISO8601_RE)
15
# Regular expression alternation of the full, upper case month names - used to
# strip months out of parameters
MONTHS = "|".join((
    "JANUARY",
    "FEBRUARY",
    "MARCH",
    "APRIL",
    "MAY",
    "JUNE",
    "JULY",
    "AUGUST",
    "SEPTEMBER",
    "OCTOBER",
    "NOVEMBER",
    "DECEMBER",
))
18
19esc_chars = {'\xb0':'°','°':'°'}
20
def getTripleData(tripleString):
    '''
    Split a '|' delimited string into its component values, strip surrounding
    whitespace from each one and make each one XML friendly
    NB, if fewer than three values are present, the remaining slots are left
    as empty strings
    @param tripleString: string containing the triple data
    @return 1-D array with three elements, representing the data in the triple
    @raise ValueError: if the string splits into more than three values
    '''
    logging.debug("Splitting triple: %s" %tripleString)
    returnData = ["", "", ""]
    for position, rawValue in enumerate(tripleString.split('|')):
        # a fourth value means there is a stray delimiter in the input
        if position >= 3:
            raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)
        returnData[position] = escapeSpecialCharacters(rawValue.strip())

    return returnData
40
41
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and replace any non-ASCII characters it contains
    with the html numerical code (e.g. '&#176;') required to display the data
    Both byte strings and text (unicode) strings are accepted
    @param inputString: string to encode
    @return encoded string, containing ASCII characters only, as a native string
    '''
    # Only byte strings need decoding - calling decode on text either does not
    # exist (python 3) or triggers an implicit ASCII encode that fails on the
    # very characters this method is meant to handle (python 2)
    # NB, the latin coding accepts unicode up to 255
    if isinstance(inputString, bytes):
        inputString = inputString.decode('latin-1')

    # the XMLCHARREFREPLACE does the required character replacement
    encodedString = inputString.encode('ascii', 'xmlcharrefreplace')

    # encode yields bytes on python 3 - hand back a native string so the result
    # can be compared with, and concatenated to, ordinary strings
    if not isinstance(encodedString, str):
        encodedString = encodedString.decode('ascii')
    return encodedString
[4209]55   
56
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters: '&', '<' and '>' are replaced by
    their entities and non-ASCII characters by html numerical codes
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # NB, xml.sax.saxutils.escape replaces the same three characters as the
    # cgi.escape call previously used here; cgi.escape is deprecated and no
    # longer available in recent python versions
    # The markup characters must be escaped before the numerical codes are
    # introduced - otherwise the '&' of each new code would itself be escaped
    correctedString = escape(inputString)
    correctedString = encodeIntoHTMLNumericalCodes(correctedString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')", inputString, correctedString)
    return correctedString
70
71       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Run the csmlscan tool against the specified CDML file, asking it to write
    its output to a file named after the dataset ID
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced ('<datasetID>_csml.xml')
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")
    outputName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # mimic a command line invocation - the first entry stands in for the program name
    scanArgs = [
        'csmlscan',
        '-i', datasetID,
        '-t', timeAxis,
        '-o', outputName,
        CDMLFilePath,
    ]
    CsmlScan.main(scanArgs)

    logging.info("Created CSML file: %s" %outputName)
    return outputName
86
87
def getISO8601Date(datestring):
    '''
    Normalise an input datestring that starts with an ISO8601 style date
    The output is always of the form <date>T<time>Z, where the date and time
    parts only contain the components found in the input (so the time part
    may be empty); fractional seconds and timezone info are not carried over
    @param datestring: string containing date in some format
    @return: normalised date string - or an empty string if the input is empty
    @raise ValueError: if the start of the string cannot be matched as an ISO8601 date
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if not match:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # (separator, regex group) pairs for the optional components either side of the 'T'
    dateComponents = (("-", 3), ("-", 5))           # months, days
    timeComponents = (("", 7), (":", 8), (":", 10)) # hours, minutes, seconds

    # the year is always present when the pattern has matched
    pieces = [match.group(1)]
    for separator, groupNumber in dateComponents:
        value = match.group(groupNumber)
        if value:
            pieces.append(separator + value)

    pieces.append("T")
    for separator, groupNumber in timeComponents:
        value = match.group(groupNumber)
        if value:
            pieces.append(separator + value)

    pieces.append("Z")
    return "".join(pieces)
131   
132
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and attempt to return it in YYYY-MM-DD format
    The string is first normalised by getISO8601Date; if the result is longer
    than 11 characters, only its first 10 characters are returned, otherwise
    the normalised string is returned unchanged
    @param dateString - a string containing datetime info
    @return string with the (possibly truncated) normalised date
    @raise ValueError: if getISO8601Date cannot parse the input
    '''
    isoDate = getISO8601Date(dateString)
    return isoDate[:10] if len(isoDate) > 11 else isoDate
144       
145   
146   
def _tidyParameterText(text):
    '''
    Upper case a piece of parameter text, strip out any month names and
    collapse surrounding/repeated whitespace
    @param text: string to tidy
    @return tidied string
    '''
    # Strip out any months in the string + uppercase everything
    text = re.sub(MONTHS, '', text.upper())

    # Now remove any trailing spaces + any unnecessary inner spaces
    return re.sub(r'\s{2,}', ' ', text.strip())


def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    The tidying is done on the raw text and the result is made XML friendly
    exactly once, at the end - so entities such as '&amp;' are neither upper
    cased nor escaped a second time
    @param params_string: string with either a single parameter or a '|'
    delimited triple
    @return tidied, XML friendly string; triples are rejoined with ' | '
    @raise ValueError: if the string contains more than two '|' characters
    '''
    logging.debug("Tidying up parameters string, %s", params_string)

    fields = [field.strip() for field in params_string.split('|')]
    if len(fields) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %params_string)

    # pad to three entries so that each element keeps its own position, even
    # when the first and/or last element is missing or empty
    fields += [""] * (3 - len(fields))
    name, url, lastElement = fields

    # avoid processing the url, if it has been set
    hasLastElement = bool(lastElement)
    name = _tidyParameterText(name)
    lastElement = _tidyParameterText(lastElement)

    # now, recreate the parameters string
    tidied = name
    if '|' in params_string:
        tidied += " | " + url
        if hasLastElement:
            tidied += " | " + lastElement

    # lastly, remove any special characters
    tidied = escapeSpecialCharacters(tidied)
    logging.debug("Tidied parameter string is now: %s", tidied)
    return tidied
181
182
183   
class coverageAggregate:
    ''' Collects the spatiotemporal ranges of individual granules so that a single,
    overall coverage element can be built - each distinct (bbox, time) pairing is
    only recorded once '''

    def __init__(self, M):
        # M supplies the dgCoverage/dgSpatioTemporalCoverage factories used by makeElement
        self.M = M
        self.spaceTime = []     # (bbox, time) pairs already recorded
        self.coverageList = []  # range objects, in the order they were first added

    def add(self, bbox, time, coverage):
        ''' Record the range held by the coverage object, unless this (bbox, time)
        pairing has been added before '''
        key = (bbox, time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        spatioTemporalRange = coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange
        self.coverageList.append(spatioTemporalRange)

    def makeElement(self):
        ''' Build the overall coverage element from everything added so far
        @return: result of M.dgCoverage - or None if nothing has been added '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        spatioTemporalCoverage = self.M.dgSpatioTemporalCoverage(
            dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporalCoverage)
206
207
class parameterAggregate:
    ''' Accumulates parameter summaries, keeping only the first summary seen for
    each parameter name, so that subsets can be aggregated without duplication '''

    def __init__(self):
        self.paramSummaries = []  # the summaries kept, in the order first seen
        self.paramNameIndex = []  # names of the summaries kept, in the same order

    def add(self, subset):
        ''' Add every summary in the subset whose ParameterName has not been seen before '''
        for summary in subset:
            name = summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        ''' Return the list of summaries collected so far '''
        return self.paramSummaries
Note: See TracBrowser for help on using the repository browser.