source: exist/trunk/python/ndgUtils/models/utilities.py @ 4419

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/utilities.py@4419
Revision 4419, 7.4 KB checked in by cbyrom, 11 years ago (diff)

Add method to atom to allow ingest of CSML data + add inputs check for
dbclient method.

Line 
1import os, sys, logging, re, cgi
2from ndgUtils.ETxmlView import subAI
3import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
# Tool to escape angular brackets - held at module level for easy reuse
_subtool = subAI()

# Pattern describing a (possibly partial) ISO8601 date-time: a four digit year,
# optionally followed by month, day, hours:minutes, seconds, fractional seconds
# and a timezone designator ('Z' or a +/-hh:mm offset).
# Written as raw strings so that the \s and \. escapes reach the regex engine
# untouched instead of relying on Python passing unknown string escapes through
ISO8601_RE = (
    r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"
)

# Compiled once at import time so it can be reused by every call
regExp = re.compile(ISO8601_RE)

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# Replacement table for the degree sign
# NOTE(review): both entries map onto the literal degree sign; presumably a
# numeric character reference was intended as the value - confirm before use
esc_chars = {'\xb0':'°','°':'°'}

# NOTE(review): despite the name this describes a full YYYY-MM-DD date,
# presumably for use with strftime/strptime - confirm against callers
YEAR_FORMAT = '%Y-%m-%d'
22   
def getTripleData(tripleString):
    '''
    Break a '|' delimited string up into its (at most) three elements.
    Every element has its surrounding whitespace removed and is then passed
    through escapeSpecialCharacters; elements missing from the input are
    returned as empty strings
    @param tripleString: string containing the triple data
    @return: list of exactly three strings, representing the data in the triple
    @raise ValueError: if the input splits into more than three elements
    '''
    logging.debug("Splitting triple: %s" %tripleString)
    triple = ["", "", ""]
    for position, rawValue in enumerate(tripleString.split('|')):
        # reaching a fourth element means there was one separator too many
        if position >= len(triple):
            raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)
        triple[position] = escapeSpecialCharacters(rawValue.strip())

    return triple
42
43
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and replace every non-ASCII character it contains
    with the html numerical code required to display it (e.g. the degree
    sign becomes &#176;)
    @param inputString: string to encode; byte strings are interpreted as
    latin-1, text (unicode) strings are used as they are
    @return: ASCII-only byte string with the numerical codes substituted
    '''
    if isinstance(inputString, bytes):
        # NB, latin-1 maps each byte value 0-255 straight onto the code point
        # with the same number, so this decode cannot fail
        text = inputString.decode('latin-1')
    else:
        # already text - decoding it again would force an implicit ASCII
        # encode first, which blows up on any non-ASCII character
        text = inputString

    # the xmlcharrefreplace error handler does the required character replacement
    return text.encode('ascii', 'xmlcharrefreplace')
57   
58
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters: &, < and > are replaced by their
    entities and the result is then passed through
    encodeIntoHTMLNumericalCodes. A note is logged whenever the output
    differs from the input
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # saxutils.escape replaces &, < and > just as cgi.escape did, but unlike
    # cgi.escape it is not deprecated / removed in newer Python releases
    correctedString = escape(inputString)
    correctedString = encodeIntoHTMLNumericalCodes(correctedString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')",
                     inputString, correctedString)
    return correctedString
72
73       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Produce a CSML file from a CDML file by invoking csmlscan's main routine
    with command line style arguments. The output file name handed to
    csmlscan is the dataset ID with '_csml.xml' appended
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: name of the CSML file passed to csmlscan as its output
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")
    outputFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # same layout as a command line call: program name, options, input file
    scanArguments = [
        'csmlscan',
        '-i', datasetID,
        '-t', timeAxis,
        '-o', outputFileName,
        CDMLFilePath,
    ]
    CsmlScan.main(scanArguments)

    logging.info("Created CSML file: %s" %outputFileName)
    return outputFileName
88
89
def getISO8601Date(datestring):
    '''
    Rebuild a date string from the fields picked out by the module's ISO8601
    regular expression. The year, month, day, hours, minutes and whole seconds
    found are copied to the output; a 'T' is always placed after the date
    part and a 'Z' is always appended, whatever the input contained.
    NB, the expression is only matched against the start of the input
    @param datestring: string containing date in some format
    @return: rebuilt date string, or an empty string if the input is empty
    @raise ValueError: if the start of the string does not match the expression
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if match is None:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # pull out the fields of interest by their group position in the pattern
    year, month, day = match.group(1), match.group(3), match.group(5)
    hours, minutes, seconds = match.group(7), match.group(8), match.group(10)

    # the year is always present; everything else is added only if matched
    pieces = [year]
    if month:
        pieces.append("-" + month)
    if day:
        pieces.append("-" + day)

    # separator between the date and time parts is added unconditionally
    pieces.append("T")
    if hours:
        pieces.append(hours)
    if minutes:
        pieces.append(":" + minutes)
    if seconds:
        pieces.append(":" + seconds)

    # as is the trailing zone designator
    pieces.append("Z")

    return "".join(pieces)
133   
134
def formatDateYYYYMMDD(dateString):
    '''
    Normalise a date string with getISO8601Date and cut the result down to
    its leading YYYY-MM-DD part
    @param dateString: a string containing datetime info
    @return: the first ten characters of the normalised date when it is more
    than eleven characters long, otherwise the normalised date as it stands
    (so an empty input gives an empty string)
    @raise ValueError: passed on from getISO8601Date for unparseable input
    '''
    isoDate = getISO8601Date(dateString)

    # short results hold no complete date + time to trim, so hand them back as is
    if len(isoDate) <= 11:
        return isoDate

    return isoDate[:10]
146       
147   
148   
def _tidyParameterText(text):
    '''
    Upper case the text, strip out any month names and tidy up the spacing
    @param text: string to tidy
    @return: tidied string
    '''
    # Strip out any months in the string + uppercase everything
    # NOTE(review): month names are removed wherever they occur, including
    # inside longer words - confirm this is acceptable for the input data
    text = re.sub(MONTHS, '', text.upper())

    # Now remove any surrounding spaces + any unnecessary inner spaces
    return re.sub(r'\s{2,}', ' ', text.strip())


def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: parameter string - either a single value or a
    '|' delimited triple
    @return: tidied string with XML unfriendly characters escaped; triple
    elements are rejoined with ' | ' in their original positions
    @raise ValueError: if the string holds more than three '|' delimited elements
    '''
    logging.debug("Tidying up parameters string, %s", params_string)

    # split the raw text here, rather than via getTripleData, so that the
    # data is escaped exactly once (at the end) - escaping first would let
    # the upper casing mangle the entities and the final escape double them up
    elements = [element.strip() for element in params_string.split('|')]
    if len(elements) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %params_string)
    elements.extend([""] * (3 - len(elements)))

    # avoid processing the url (second element), if it has been set
    first = _tidyParameterText(elements[0])
    url = elements[1]
    third = _tidyParameterText(elements[2])

    # now, recreate the parameters string - keeping every element in its
    # original position, even when the first one is empty
    tidied = first
    if "|" in params_string:
        tidied += " | " + url
        if third:
            tidied += " | " + third

    # lastly, remove any special characters
    tidied = escapeSpecialCharacters(tidied)
    logging.debug("Tidied parameter string is now: %s", tidied)
    return tidied
183
184
185   
class coverageAggregate:
    ''' Collects the spatiotemporal ranges of individual granules so that a single
    overall coverage element can be built from them; a range is only recorded
    the first time its particular (bbox, time) combination is seen '''
    def __init__(self,M):
        # M supplies the dgCoverage / dgSpatioTemporalCoverage constructors
        # used by makeElement
        self.M=M
        # (bbox, time) pairs already recorded, used to spot repeats
        self.spaceTime=[]
        # ranges to include in the overall coverage element
        self.coverageList=[]

    def add(self,bbox,time,coverage):
        ''' Record the range held by coverage, unless this bbox/time pair
        has been added before '''
        key = (bbox,time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        spatioTemporalRange = coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange
        self.coverageList.append(spatioTemporalRange)

    def makeElement(self):
        ''' Build the overall coverage element from the recorded ranges;
        returns None if nothing has been added '''
        logging.info("Setting up coverage element")
        if self.spaceTime!=[]:
            innerCoverage = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
            return self.M.dgCoverage(dgSpatioTemporalCoverage=innerCoverage)
        return None
208
209
class parameterAggregate:
    ''' Gathers parameter summaries from any number of subsets, keeping only the
    first summary seen for each parameter name so the aggregate has no duplicates '''
    def __init__(self):
        # names already recorded, in the order they were first seen
        self.paramNameIndex=[]
        # the summaries kept, in step with paramNameIndex
        self.paramSummaries=[]

    def add(self,subset):
        ''' Add every summary in subset whose ParameterName is not yet known '''
        for summary in subset:
            name = summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        ''' Return the list of summaries collected so far '''
        return self.paramSummaries
Note: See TracBrowser for help on using the repository browser.