source: exist/trunk/python/ndgUtils/models/utilities.py @ 4444

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/utilities.py@4444
Revision 4444, 7.6 KB checked in by cbyrom, 12 years ago (diff)

Add new xquery to lookup deployment atoms associated with an atom ID +
add code to models to allow the use of this to retrieve this information
+ update tests + avoid doubly escaping special characters.

Line 
1import os, sys, logging, re, cgi
2from ndgUtils.ETxmlView import subAI
3import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
# Single shared subAI instance, built once at import time so it can be reused
# rather than re-created per call (original note: tool to escape angular
# brackets).  It is not referenced anywhere else in the visible part of this
# module - presumably other modules import it; verify before removing.
_subtool = subAI()
10
# Pattern for (possibly partial) ISO 8601 date-times.  Only the four-digit
# year is mandatory; each later component is optional and nested inside the
# previous one, so a time can only appear after a full date.
# Capture groups of interest:
#   1 = year, 3 = month, 5 = day, 7 = hours, 8 = minutes, 10 = seconds,
#   12 = fractional seconds, 13 = timezone designator ('Z' or +hh:mm / -hh:mm)
# Raw strings are used so that the regex escapes (\s, \.) are not treated as
# (invalid) Python string escapes.
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" + \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# Compiled once at import time; NB, it is applied with match(), so it is only
# anchored at the start of the string
regExp = re.compile(ISO8601_RE)

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# Replacement table for the degree sign; in the visible code it is only
# referenced by a commented-out call in escapeSpecialCharacters.
# NOTE(review): the values look as if they were numeric character references
# that have been decoded somewhere along the way - confirm against the
# repository copy before relying on this table.
esc_chars = {'\xb0':'°','°':'°'}
20
def getTripleData(tripleString, doEscape=True):
    '''
    Take a string as input and extract triple data into an array
    NB, if data not fully in triple form, the missing elements are returned
    as empty strings
    @param tripleString: string containing the triple data, with the elements
    separated by '|' characters
    @keyword doEscape: if True, escape special characters - e.g. '&' - before
    splitting the string (default True)
    @return 1-D array with three elements, representing the data in the
    triple; each element has surrounding whitespace removed
    @raise ValueError: if the string contains more than two '|' separators
    '''
    logging.debug("Getting triple data: %s", tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    fields = [field.strip() for field in tripleString.split('|')]
    if len(fields) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out with empty elements so callers can always index [0]..[2]
    returnData = fields + [""] * (3 - len(fields))

    logging.debug("- returning triple data in array")
    return returnData
44
45
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and replace every non-ASCII character it contains
    with the html numerical code (&#NNN;) required to display the data
    @param inputString: string to encode; a byte string is interpreted as
    latin-1, a text (unicode) string is used as it stands
    @return encoded string, containing ASCII characters only, as the native
    string type of the running interpreter
    '''
    # Only byte strings need decoding: calling decode() on text either forces
    # an implicit ASCII encode first (which fails on the very characters this
    # function exists to handle) or is not available at all
    if isinstance(inputString, bytes):
        # NB, latin-1 maps every byte value 0-255 onto the same code point,
        # so this step cannot fail
        inputString = inputString.decode('latin-1')

    # the XMLCHARREFREPLACE handler does the required character replacement
    encoded = inputString.encode('ascii', 'xmlcharrefreplace')

    # encode() yields a byte string; where that is not the native string
    # type, convert back so callers can keep treating the result as text
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded
59   
60
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters: '&', '<' and '>' are replaced by
    their entities, then any non-ASCII character is replaced by its
    numerical code via encodeIntoHTMLNumericalCodes
    NB, input that is already escaped will be escaped again ('&amp;' becomes
    '&amp;amp;'), so only call this once per string
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # escape() is xml.sax.saxutils.escape, imported at the top of the module;
    # it makes the same three replacements as cgi.escape, which is no longer
    # available in current Python releases
    correctedString = escape(inputString)
    correctedString = encodeIntoHTMLNumericalCodes(correctedString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')", inputString, correctedString)
    return correctedString
74
75       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Produce a CSML file from a CDML file by invoking the csmlscan tool
    in-process, using the same arguments as its command line
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile; also used to
    build the output file name, "<datasetID>_csml.xml"
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # first entry stands in for the program name, as in sys.argv
    scanArguments = [
        'csmlscan',
        '-i', datasetID,
        '-t', timeAxis,
        '-o', CSMLFileName,
        CDMLFilePath,
    ]
    CsmlScan.main(scanArguments)

    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
90
91
def getISO8601Date(datestring):
    '''
    Rebuild an input datestring from the components recognised by the
    module level ISO8601 regular expression
    NB, the 'T' separator and a trailing 'Z' are always added, whichever
    components were present; fractional seconds and any timezone offset in
    the input are not carried over to the output
    @param datestring: string containing date in some format
    @return: string of the form YYYY[-MM[-DD]]T[hh:mm[:ss]]Z, or an empty
    string if the input is empty
    @raise ValueError: if string does not start with a four digit year
    '''
    if not datestring:
        return ''

    found = regExp.match(datestring)
    if found is None:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = found.group(1, 3, 5)
    hours, minutes, seconds = found.group(7, 8, 10)

    # date part - the year is always present, month and day are optional
    pieces = [year]
    if month:
        pieces.append("-" + month)
    if day:
        pieces.append("-" + day)
    pieces.append("T")

    # time part - all optional
    if hours:
        pieces.append(hours)
    if minutes:
        pieces.append(":" + minutes)
    if seconds:
        pieces.append(":" + seconds)
    pieces.append("Z")

    return "".join(pieces)
135   
136
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and cut the result back to its YYYY-MM-DD part
    @param dateString - a string containing datetime info
    @return the first ten characters of the getISO8601Date result when that
    result is longer than eleven characters; otherwise the result is
    returned as it stands (an empty string for empty input)
    @raise ValueError: passed on from getISO8601Date if the string cannot
    be parsed
    '''
    isoDate = getISO8601Date(dateString)
    if len(isoDate) <= 11:
        return isoDate
    return isoDate[:10]
148       
149   
150   
def _tidyParameterTerm(term):
    ''' Upper case a term, strip out month names and tidy up its whitespace '''
    # Strip out any months in the string + uppercase everything
    term = re.sub(MONTHS, '', term.upper())

    # Now remove any surrounding spaces + any unnecessary inner spaces
    return re.sub(r'\s{2,}', ' ', term.strip())


def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: string with either a single term or up to three
    terms separated by '|' characters
    @return: tidied string; the terms stay in their original positions and
    are joined by ' | '.  If the input has no '|' only the tidied first term
    is returned (an empty string for empty input)
    @raise ValueError: from getTripleData, if there are more than two '|'
    characters in the input
    '''
    logging.debug("Tidying up parameters string, %s", params_string)

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    # NOTE(review): because the escaping happens before the upper casing, an
    # escaped '&amp;' ends up as '&AMP;' - confirm this is what consumers want
    first, url, third = getTripleData(params_string)

    # now, recreate the parameters string - keeping every term in its own
    # position, even when the first one is empty
    r = _tidyParameterTerm(first)
    if "|" in params_string:
        r += " | " + url
        if third:
            r += " | " + _tidyParameterTerm(third)

    logging.debug("Tidied parameter string is now: %s", r)
    return r
184
185
186   
class coverageAggregate:
    ''' Collects the distinct (bbox, time) pairs seen across granules, together with the
    spatio temporal range of each, so that one overall coverage element can be built '''
    def __init__(self,M):
        # M provides the dgCoverage and dgSpatioTemporalCoverage factories used in makeElement
        self.M=M
        self.coverageList=[]
        self.spaceTime=[]

    def add(self,bbox,time,coverage):
        ''' Store the range held by coverage, unless this (bbox, time) pair was added before '''
        key=(bbox,time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        stRange=coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange
        self.coverageList.append(stRange)

    def makeElement(self):
        ''' Return a dgCoverage wrapping all the stored ranges, or None if nothing was added '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        buildCoverage=self.M.dgCoverage
        stCoverage=self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return buildCoverage(dgSpatioTemporalCoverage=stCoverage)
209
210
class parameterAggregate:
    ''' Accumulates parameter summaries, keeping only the first summary seen for each
    parameter name; the names already seen are tracked in a parallel list '''
    def __init__(self):
        self.paramSummaries=[]
        self.paramNameIndex=[]

    def add(self,subset):
        ''' Add every entry of subset whose ParameterName has not been seen yet '''
        for summary in subset:
            name=summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        ''' Return the list of summaries collected so far (the list itself, not a copy) '''
        return self.paramSummaries
Note: See TracBrowser for help on using the repository browser.