source: exist/trunk/python/ndgUtils/models/utilities.py @ 4414

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/models/utilities.py@4414
Revision 4414, 7.4 KB checked in by cbyrom, 11 years ago (diff)

Add new code to allow bulk loading of data from eXist - to improve performance
when doing data ingest.

Line 
1import os, sys, logging, re, cgi
2from ndgUtils.ETxmlView import subAI
3#import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
# Module-level subAI instance (imported from ndgUtils.ETxmlView), created once
# when this module is imported so the same object can be reused.
# NOTE(review): nothing in the visible part of this module references
# _subtool - confirm it is still needed before relying on it.
_subtool = subAI()    # tool to escape angular brackets - enable as field variable for easy reuse
10
# Pattern for an ISO 8601 style date/time: a mandatory four digit year, then
# optionally -MM, -DD, a time (hh:mm with optional :ss and fractional seconds,
# introduced by an optional 'T' or whitespace) and a timezone ('Z' or +-hh:mm).
# Capture groups used by getISO8601Date:
#   1 year, 3 month, 5 day, 7 hours, 8 minutes, 10 seconds, 12 fraction,
#   13 timezone ('Z' or offset), 14 numeric offset, 15 sign, 16/17 offset hh/mm
# Raw strings keep '\s' and '\.' as regex escapes; in ordinary string literals
# they are invalid escape sequences that newer Pythons warn about.
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" + \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# Compiled once at import time; note that it is not anchored at the end, so
# match() accepts any string that merely starts with a four digit year.
regExp = re.compile(ISO8601_RE)
15
# Regular expression string to allow months to be stripped out of parameters.
# It is an alternation of the twelve upper-case English month names;
# tidyUpParameters applies it with re.sub() to upper-cased text.
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# Replacement table keyed on the degree sign. The only mention of it in this
# module is a commented-out escape(..., entities=esc_chars) call in
# escapeSpecialCharacters, so it currently has no effect.
# NOTE(review): the values were presumably numeric character references (such
# as the one for the degree sign) rather than the literal character - confirm
# against the repository copy before re-enabling.
esc_chars = {'\xb0':'°','°':'°'}

# Format string with year, month and day directives (despite the name it
# describes a complete date). Not referenced in the visible part of this module.
YEAR_FORMAT = '%Y-%m-%d'
22   
def getTripleData(tripleString):
    '''
    Split a '|' separated string into the three parts of a triple.
    Each part is stripped of surrounding whitespace and passed through
    escapeSpecialCharacters; parts that are missing from the input are
    returned as empty strings.
    @param tripleString: string containing the triple data
    @return: list of exactly three strings
    @raise ValueError: if the input holds more than three '|' separated parts
    '''
    logging.debug("Splitting triple: %s" %tripleString)
    triple = ["", "", ""]
    for position, rawPart in enumerate(tripleString.split('|')):
        # a fourth part means the input has one separator too many
        if position > 2:
            raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)
        triple[position] = escapeSpecialCharacters(rawPart.strip())

    return triple
42
43
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Replace every non-ASCII character of a string by its numeric character
    reference (for instance the degree sign becomes "&#176;") so the data
    can be displayed from a pure ASCII document.
    Byte strings are interpreted as latin-1; text (unicode) strings are used
    as they are - previously these failed because decode() was always called.
    @param inputString: byte string or text string to encode
    @return: encoded, ASCII-only string (native str type)
    '''
    if isinstance(inputString, bytes):
        # latin-1 maps each byte value 0-255 onto the code point with the same
        # number, so this decode cannot fail
        text = inputString.decode('latin-1')
    else:
        text = inputString

    # the 'xmlcharrefreplace' error handler does the required character replacement
    encoded = text.encode('ascii', 'xmlcharrefreplace')
    if not isinstance(encoded, str):
        # encode() produced a bytes object distinct from str (Python 3):
        # hand back a native string so callers can keep using str methods
        encoded = encoded.decode('ascii')
    return encoded
57   
58
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters: '&', '<' and '>' are replaced by
    their entities, then non-ASCII characters are replaced by numeric
    character references via encodeIntoHTMLNumericalCodes.
    An info message is logged whenever the input had to be altered.
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # xml.sax.saxutils.escape (imported at the top of this module) performs the
    # same three replacements as cgi.escape did; cgi.escape no longer exists
    # in current Python versions.
    correctedString = escape(inputString)
    # must run after the entity escaping, otherwise the '&' of every numeric
    # reference would itself be escaped
    correctedString = encodeIntoHTMLNumericalCodes(correctedString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')", inputString, correctedString)
    return correctedString
72
73       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Create a CSML file by running csmlscan against the specified CDML file.
    The output file is named "<datasetID>_csml.xml".
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced
    @raise ImportError: if the csml package is not available
    '''
    # The module-level import of csmlscan is commented out, which left the
    # name CsmlScan undefined and made this function fail with a NameError.
    # Importing here keeps the rest of the module usable without csml.
    import csml.csmlscan as CsmlScan

    logging.info("Creating CSML file from CDML file by running csmlscan")
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s", datasetID, timeAxis)
    # csmlscan is driven through its command line entry point, hence the
    # argv-style list with the program name first
    CsmlScan.main(['csmlscan', '-i', datasetID, '-t', timeAxis, '-o', CSMLFileName, CDMLFilePath])
    logging.info("Created CSML file: %s", CSMLFileName)
    return CSMLFileName
88
89
def getISO8601Date(datestring):
    '''
    Converts an input datestring to the ISO8601 standard, if possible.
    The result is as precise as the input: YYYY, YYYY-MM, YYYY-MM-DD or
    YYYY-MM-DDThh:mm[:ss]Z. The time designator 'T' and the 'Z' suffix are
    only written when the input contains a time (previously a date without
    time came back as e.g. "2008-01-02TZ").
    A time with an explicit numeric offset (+hh:mm / -hh:mm) is shifted to
    UTC before the 'Z' is added; a time without any timezone is taken to be
    UTC already. Fractional seconds are dropped, and any text following the
    recognised date is ignored.
    @param datestring: string containing date in some format
    @return: string in ISO8601 format
    @raise ValueError: if string not in ISO8601 format - or cannot be parsed into this format
    '''
    import datetime    # only needed here, for the offset arithmetic

    d = regExp.match(datestring)
    if not d:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = d.group(1), d.group(3), d.group(5)
    hours, minutes, seconds = d.group(7), d.group(8), d.group(10)

    outDate = year
    if month:
        outDate += "-" + month
    if day:
        outDate += "-" + day

    # no time given: return the (possibly partial) date on its own
    if not hours:
        return outDate

    # group 14 is only set for a numeric offset (not for a literal 'Z')
    if d.group(14):
        offset = datetime.timedelta(hours=int(d.group(16)), minutes=int(d.group(17)))
        if d.group(15) == '+':
            # local time is ahead of UTC, so UTC = local time - offset
            offset = -offset
        try:
            utc = datetime.datetime(int(year), int(month), int(day),
                                    int(hours), int(minutes), int(seconds or 0)) + offset
        except (ValueError, OverflowError):
            errorMessage = "Datestring, '%s', cannot be converted to UTC - invalid date or time" %datestring
            logging.error(errorMessage)
            raise ValueError(errorMessage)
        outDate = "%04d-%02d-%02d" %(utc.year, utc.month, utc.day)
        hours = "%02d" %utc.hour
        minutes = "%02d" %utc.minute
        if seconds:
            seconds = "%02d" %utc.second

    # the pattern only accepts hours together with minutes
    outDate += "T" + hours + ":" + minutes
    if seconds:
        outDate += ":" + seconds

    return outDate + "Z"
130   
131
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and attempt to return it in YYYY-MM-DD format.
    The string is first normalised by getISO8601Date; the leading ten
    characters are returned when they form a complete year-month-day date.
    Inputs that only hold a year or a year and month cannot be expressed as
    YYYY-MM-DD and give None (previously they leaked values like "2008TZ").
    @param dateString - a string containing datetime info
    @return string with date in format YYYY-MM-DD or None, if format not possible
    @raise ValueError: propagated from getISO8601Date when the string does
    not start with an ISO8601 date
    '''
    newDate = getISO8601Date(dateString)

    # a complete date always occupies the first ten characters
    datePart = newDate[0:10]
    if re.match(r'[0-9]{4}-[0-9]{2}-[0-9]{2}$', datePart):
        return datePart

    logging.warning("Date, '%s', is incomplete - cannot format it as YYYY-MM-DD", dateString)
    return None
143       
144   
145   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies.
    NB, if we're dealing with parameter triples ("first | url | third"),
    the (second) url entry is left alone when doing month removal/upper
    casing. The elements keep their positions, so an empty first element no
    longer raises an IndexError or lets the third element take its place.
    XML escaping is done exactly once, on the finished string; escaping the
    parts first and the result again turned '&' into '&amp;AMP;'.
    @param params_string: parameter string, either a single value or up to
    three '|' separated elements
    @return: tidied, XML escaped parameter string
    @raise ValueError: if the string holds more than three '|' separated parts
    '''
    logging.debug("Tidying up parameters string, %s", params_string)

    # split on the raw text: upper-casing must happen before entities are
    # introduced, otherwise the entities themselves get upper-cased
    parts = [part.strip() for part in params_string.split('|')]
    if len(parts) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %params_string)
    parts.extend([""] * (3 - len(parts)))
    first, url, third = parts

    def tidy(text):
        # Strip out any months in the string + uppercase everything
        text = re.sub(MONTHS, '', text.upper())
        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', text.strip())

    # now, recreate the parameters string
    r = tidy(first)
    if "|" in params_string:
        r += " | " + url
        if third:
            r += " | " + tidy(third)

    # lastly, remove any special characters
    r = escapeSpecialCharacters(r)
    logging.debug("Tidied parameter string is now: %s", r)
    return r
180
181
182   
class coverageAggregate:
    ''' Collects the spatio-temporal ranges of granules into one overall coverage.
    A range is only recorded the first time its (bbox, time) combination is seen,
    so identical boxes are not repeated. '''
    def __init__(self,M):
        # M supplies the dgCoverage and dgSpatioTemporalCoverage factories used by makeElement
        self.M = M
        # ranges collected so far, in order of first appearance
        self.coverageList = []
        # (bbox, time) pairs already recorded
        self.spaceTime = []

    def add(self,bbox,time,coverage):
        ''' Record the range held by coverage unless this bbox/time pair is already known '''
        key = (bbox, time)
        if key in self.spaceTime:
            return
        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        ''' Return a dgCoverage element wrapping all recorded ranges, or None if there are none '''
        logging.info("Setting up coverage element")
        if self.spaceTime != []:
            factory = self.M
            buildCoverage = factory.dgCoverage
            spatioTemporal = factory.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
            return buildCoverage(dgSpatioTemporalCoverage=spatioTemporal)
        return None
205
206
class parameterAggregate:
    ''' Accumulates parameter summaries, keeping only the first summary seen
    for each parameter name so that aggregation does not create duplicates. '''
    def __init__(self):
        # summaries kept so far, in order of first appearance
        self.paramSummaries = []
        # names of the summaries kept so far (same order as paramSummaries)
        self.paramNameIndex = []

    def add(self,subset):
        ''' Add every summary of subset whose ParameterName has not been seen yet '''
        knownNames = self.paramNameIndex
        for summary in subset:
            name = summary.ParameterName
            if name in knownNames:
                continue
            knownNames.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        ''' Return the list of summaries collected so far '''
        return self.paramSummaries
221       
222
Note: See TracBrowser for help on using the repository browser.