source: exist/trunk/python/ndgUtils/lib/utilities.py @ 4492

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/utilities.py@4492
Revision 4492, 13.7 KB checked in by cbyrom, 12 years ago (diff)

Adjust visibility of normaliseLongitude method.

Line 
1import os, sys, logging, re, cgi, urlparse, httplib, time
2from ndgUtils.ETxmlView import subAI
3#import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
9_subtool = subAI()    # tool to escape angular brackets - enable as field variable for easy reuse
10
# Pattern for ISO8601 style date/times: a mandatory four digit year followed by
# progressively optional month, day, time (hh:mm[:ss[.fraction]]) and timezone
# ('Z' or +/-hh:mm) parts.
# NB, written as raw strings so that '\s' and '\.' reach the regular expression
# engine as written, rather than relying on python leaving unrecognised string
# escapes untouched
ISO8601_RE = (
    r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"
)

# Compiled version of the ISO8601 pattern
regExp = re.compile(ISO8601_RE)

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# NOTE(review): as shown here the values are identical to the keys; they may
# originally have been numeric character references (e.g. '&#176;') - confirm
# against the repository before relying on this table.  It is only referenced
# from commented out code in the visible part of this file.
esc_chars = {'\xb0':'°','°':'°'}
20   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the final
    return code being in the 2xx range.  NB, will also try to resolve
    redirects - up to a limit of 10.
    @param url:  absolute url to lookup, e.g. 'http://somesite.com/...'
    @return: 1 if url is valid, 0 if not - including when the lookup itself fails
    '''
    def getRequestHead(url):
        '''
        Issue a HEAD request for the specified URL
        @param url: absolute url to retrieve HEAD from
        @return tuple of (status code, value of the 'location' header or None)
        @raise ValueError: if the url does not include a host
        '''
        logging.debug("Getting request HEAD for url, '%s'", url)
        scheme, host, path, params, query = urlparse.urlparse(url)[0:5]
        if not host:
            raise ValueError("Invalid url - must be of format, 'http://somesite.com/...'")

        # an empty path is equivalent to the site root; keep any parameters and
        # query string since they are part of the resource being checked
        target = path or "/"
        if params:
            target += ";" + params
        if query:
            target += "?" + query

        if scheme.lower() == "https":
            connection = httplib.HTTPSConnection(host)
        else:
            connection = httplib.HTTPConnection(host)

        try:
            connection.request("HEAD", target)
            resp = connection.getresponse()
            # extract what is needed before the connection is released
            return resp.status, resp.getheader('location')
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'", url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        status, location = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while 300 <= status <= 399 and redirectCounter > 0:
            redirectCounter -= 1

            if not location:
                logging.info("Redirect response received without a new location - link broken")
                return 0

            # lookup redirected location - NB, this may be given relative to
            # the url which issued the redirect
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'", url)
            status, location = getRequestHead(url)

        if 200 <= status <= 299:
            logging.info("URL resolved successfully")
            return 1

        logging.info("Invalid return code received (%s) - link broken", status)
        return 0
    except Exception as e:
        # deliberately broad - any failure to complete the lookup (bad url,
        # network error, ...) just means the link could not be confirmed
        logging.error("Failed to lookup URL: '%s'", e)
        return 0
71
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into the three elements of a triple
    NB, if fewer than three elements are present, the missing ones are
    returned as empty strings
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, escape special characters - e.g. '&' - before
    splitting the string (default True)
    @return list with three elements, each stripped of surrounding whitespace
    @raise ValueError: if the string contains more than three elements
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    elements = [element.strip() for element in tripleString.split('|')]
    if len(elements) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out any missing elements
    elements.extend([""] * (3 - len(elements)))

    logging.debug("- returning triple data in array")
    return elements
95
96
def getString(obj):
    '''
    Convert obj into a byte string
    @param obj: a byte string or unicode string
    @return: str(obj) if that succeeds, otherwise the utf-8 encoding of the
    unicode form of obj
    '''
    try:
        asString = str(obj)
    except UnicodeEncodeError:
        # str() could not encode the contents, i.e. obj is unicode - so encode
        # it explicitly
        asString = unicode(obj).encode('utf-8')
    return asString
107
108
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and replace any non-ascii characters it contains with
    the html numerical code required to display them - e.g. the degree
    symbol becomes '&#176;'
    @param inputString: string to encode - either a unicode string or a utf-8
    encoded byte string; any other object is converted to its string form first
    @return byte string containing only ascii characters
    @raise UnicodeDecodeError: if a byte string is input which is not valid utf-8
    '''
    textType = type(u'')    # unicode on python 2, str on python 3
    if isinstance(inputString, bytes):
        text = inputString.decode('utf-8')
    elif isinstance(inputString, textType):
        text = inputString
    else:
        text = textType(inputString)

    # NB, the target encoding must be ascii for XMLCHARREFREPLACE to take
    # effect - utf-8 can represent every character so nothing would be replaced
    return text.encode('ascii', 'xmlcharrefreplace')
125
126
def isValidUnicode(inputString):
    '''
    Checks the input string to ensure that it can be validly represented
    by utf-8 encoded unicode
    @param inputString: byte string or unicode string to check
    @return True if valid - or if the input is empty/None, False otherwise;
    NB, non-empty inputs which are not strings are reported as invalid
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid", inputString)
    isValid = False
    try:
        if isinstance(inputString, bytes):
            # byte string - must be decodable as utf-8
            inputString.decode('utf-8')
            isValid = True
        elif isinstance(inputString, type(u'')):
            # unicode string - must be encodable as utf-8
            inputString.encode('utf-8')
            isValid = True
    except UnicodeError as e:
        # only conversion problems mean 'invalid' - anything else is a genuine
        # error and is allowed to propagate
        logging.debug("%s", e)

    if isValid:
        logging.debug(" - valid")
    else:
        logging.debug("- invalid")
    return isValid
153   
154
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters - i.e. '&', '<' and '>'
    @param inputString: string whose value to correct
    @return: corrected string
    @raise ValueError: if the input string is not valid utf-8/unicode data
    '''
    # check the input before doing any work on it
    if not isValidUnicode(inputString):
        raise ValueError("Input string, '%s', contains illegal characters" %inputString)

    # NB, xml.sax.saxutils.escape (already imported by this module) does the
    # same three replacements as the deprecated cgi.escape
    correctedString = escape(inputString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')", inputString, correctedString)
    return correctedString
170
171       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Create a CSML file by running csmlscan.py against the specified CDML file
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced - '<datasetID>_csml.xml'
    @raise ImportError: if the csml package is not available
    '''
    # NB, the module level import of csmlscan is commented out, which left
    # CsmlScan undefined here - import it only when it is actually needed so
    # that the rest of this module can still be used without csml installed
    import csml.csmlscan as CsmlScan

    logging.info("Creating CSML file from CDML file by running csmlscan")
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s", datasetID, timeAxis)
    CsmlScan.main(['csmlscan', '-i', datasetID, '-t', timeAxis, '-o', CSMLFileName, CDMLFilePath])
    logging.info("Created CSML file: %s", CSMLFileName)
    return CSMLFileName
186
187
def getISO8601Date(datestring):
    '''
    Converts an input datestring to the ISO8601 standard, if possible
    NB, only the parts present in the input are output - so a date with no
    time info is returned as a date only, e.g. '2008-01-02', and a date with
    time info is returned as, e.g., '2008-01-02T03:04:05Z'.  Fractions of
    seconds are dropped.
    @param datestring: string containing date in some format
    @return: string in ISO8601 format - or '' if the input is empty
    @raise ValueError: if string not in ISO8601 format - or cannot be parsed into this format
    '''
    if not datestring:
        return ''

    d = regExp.match(datestring)
    if not d:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = d.group(1), d.group(3), d.group(5)
    hours, minutes, seconds = d.group(7), d.group(8), d.group(10)

    outDate = year
    if month:
        outDate += "-" + month
    if day:
        outDate += "-" + day

    # only add the time designator and zone when there is a time to go with
    # them - a bare trailing 'TZ' is not valid ISO8601
    # NB, the pattern only matches hours when minutes are also present
    if hours:
        outDate += "T" + hours + ":" + minutes
        if seconds:
            outDate += ":" + seconds
        # NOTE(review): any timezone offset in the input is discarded and the
        # time labelled as UTC without being shifted - confirm this is intended
        outDate += "Z"

    return outDate
231   
232
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and attempt to return it in YYYY-MM-DD format
    NB, if the input only specifies a year, or a year and month, only those
    parts are returned - e.g. '2008' or '2008-01'
    @param dateString - a string containing datetime info
    @return string with the date part of the input - or '' if the input is empty
    @raise ValueError: if the input cannot be parsed as an ISO8601 date
    '''
    newDate = getISO8601Date(dateString)

    # discard everything from the time designator onwards - this copes with
    # full datetimes and with date only values which end in a bare 'TZ'
    return newDate.split('T')[0]
244
245
def formatDateYYYY(dateString):
    '''
    Simple date manipulation on a string, if it is understood - returns the
    year from a string of format year-mon-day or day-mon-year; the larger of
    the first and last numbers is taken to be the year
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string containing the date
    @return: string with the year, or the input string unchanged if its format
    is not understood
    '''
    parts = dateString.split('-')
    if len(parts) != 3:
        return dateString   # unknown format as yet ...

    first, last = parts[0], parts[2]
    try:
        firstIsYear = int(first) > int(last)
    except ValueError:
        # not numbers, e.g. a time is included - so treat as unknown format
        # rather than falling over
        return dateString

    if firstIsYear:
        return first
    return last
260   
261   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: string with a parameter - either on its own or as
    the first element of a '|' delimited triple
    @return: tidied string, with special characters escaped; triples are
    rejoined using ' | ', with the third element only included if it was set
    @raise ValueError: if the string has more than three elements or contains
    illegal characters
    '''
    logging.debug("Tidying up parameters string, %s", params_string)

    def tidy(text):
        # Strip out any months in the string + uppercase everything
        text = re.sub(MONTHS, '', text.upper())
        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', text.strip())

    # NB, special characters are escaped after the tidying up - otherwise the
    # upper casing turns entities such as '&amp;' into the invalid '&AMP;'
    data = getTripleData(params_string, doEscape=False)

    # avoid processing the url, if it has been set - and keep each element in
    # its own position, even if the first one is empty
    first = escapeSpecialCharacters(tidy(data[0]))
    url = escapeSpecialCharacters(data[1])
    third = escapeSpecialCharacters(tidy(data[2]))

    # now, recreate the parameters string
    r = first
    if params_string.find("|") > -1:
        r += " | " + url
        if data[2]:
            r += " | " + third

    logging.debug("Tidied parameter string is now: %s", r)
    return r
295
296
297   
class coverageAggregate:
    '''
    Granules have spatiotemporal boxes, but an overall spatiotemporal box is
    wanted too - so collect the ranges of the granules, only recording each
    distinct combination of bounding box and time once
    '''
    def __init__(self, M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        constructors used to build the coverage element
        '''
        self.M = M
        self.spaceTime = []
        self.coverageList = []

    def add(self, bbox, time, coverage):
        '''
        Record the range of the coverage, unless one has already been added
        for the same bbox and time
        @param bbox: bounding box of the coverage
        @param time: time info of the coverage
        @param coverage: object whose dgSpatioTemporalCoverage.dgSpatioTemporalRange
        is to be recorded
        '''
        key = (bbox, time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        @return: coverage element wrapping all the ranges added so far - or
        None if nothing has been added
        '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        makeCoverage = self.M.dgCoverage
        spatioTemporalCoverage = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return makeCoverage(dgSpatioTemporalCoverage=spatioTemporalCoverage)
320
321
class parameterAggregate:
    '''
    Provides a set of parameter summaries, and an index of parameter names to
    allow aggregation without duplication - i.e. only the first summary seen
    for each parameter name is kept
    '''
    def __init__(self):
        self.paramNameIndex = []
        self.paramSummaries = []

    def add(self, subset):
        '''
        Add a subset
        @param subset: iterable of parameter summaries - each must have a
        ParameterName attribute
        '''
        for summary in subset:
            if summary.ParameterName in self.paramNameIndex:
                continue
            self.paramNameIndex.append(summary.ParameterName)
            self.paramSummaries.append(summary)

    def get(self):
        '''
        @return: list of the parameter summaries collected so far
        '''
        return self.paramSummaries
336
337
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get a text object, so that missing elements or
    missing text result in an empty string rather than an error or None
    @param element: ET element to search from - may be None
    @param xpathExpression: path of the element(s) whose text is wanted
    @keyword multiple: if true, return the text of all matching elements as a
    list; otherwise just return the text of the first match (default)
    @return: string with the text found - '' if there is none - or, if
    multiple is set, a list of such strings (NB, [''] if element is None)
    '''
    if element is None:
        if multiple:
            return ['',]
        return ''

    if multiple:
        matches = element.findall(xpathExpression)
    else:
        # NB, find returns None if nothing matches
        matches = [element.find(xpathExpression),]

    texts = []
    for match in matches:
        text = None
        if match is not None:
            text = match.text
        if text is None:
            text = ''
        texts.append(text)

    if multiple:
        return texts

    return texts[0]
368
369
370# Format a datetime through its full proleptic Gregorian date range.
371#
372# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
373# '1850/00/02 was a Friday'
374# >>>
375# - NB, this is required since native python strftime doesn't work
376# on dates before 1900
# Matches a '%s' directive in a format string - i.e. one which is not escaped
# as '%%s'
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find every position at which substr occurs in text
    @return list of start indices, in increasing order
    '''
    # Also finds overlaps
    sites = []
    i = 0
    while 1:
        j = text.find(substr, i)
        if j == -1:
            break
        sites.append(j)
        i = j + 1
    return sites

# Every 28 years the calendar repeats, except through century leap
# years where it's 6 years.  But only if you're using the Gregorian
# calendar.  ;)

def strftime(dt, fmt):
    '''
    Format a date/datetime, including those with years before 1900
    The date is formatted twice, using two stand-in years 28 years apart;
    wherever the stand-in year appears at the same place in both results, it
    is replaced with the real year.
    NB, only four digit years in the output are corrected.
    @param dt: date or datetime object to format
    @param fmt: format string, as used by time.strftime
    @return: formatted string - years are zero padded to four digits
    @raise TypeError: if the format includes the unsupported '%s' directive
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    year = dt.year
    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    delta = 2000 - year
    off = 6*(delta // 100 + delta // 400)
    year = year + off

    # Move to around the year 2000
    year = year + ((2000 - year)//28)*28
    timetuple = dt.timetuple()
    s1 = time.strftime(fmt, (year,) + timetuple[1:])
    sites1 = _findall(s1, str(year))

    s2 = time.strftime(fmt, (year+28,) + timetuple[1:])
    sites2 = _findall(s2, str(year+28))

    # only positions holding the year in both results are really the year
    sites = [site for site in sites1 if site in sites2]

    s = s1
    # NB, zero pad so that years before 1000 still fill the four characters
    # being replaced with digits, rather than with leading spaces
    syear = "%04d" % (dt.year,)
    for site in sites:
        s = s[:site] + syear + s[site+4:]
    return s
427
428       
def normaliseLongitude(self, w, e=None):
    '''
    Take a 0,360 bounding box and force into -180,180
    NB, this is a module level function, so the leading 'self' argument is a
    leftover from when it was a method.  To avoid breaking existing callers it
    is retained, but the function may now also be called with just the two
    longitudes - i.e. both normaliseLongitude(None, w, e) and
    normaliseLongitude(w, e) are accepted.
    @param self: ignored when three arguments are supplied; the west longitude
    when only two are
    @param w: west longitude (east longitude when only two arguments are supplied)
    @keyword e: east longitude
    @return: tuple of (west, east) longitudes as floats
    '''
    if e is None:
        # called with just the two longitudes
        w, e = self, w

    ww, ee = float(w), float(e)
    if ww < 180.0 and ee > 180.0:
        # NOTE(review): a box straddling 180 is shifted by 180 degrees, which
        # suits the global 0,360 case but moves e.g. 170,190 to -10,10 -
        # confirm this is intended
        return ww-180.0, ee-180.0

    if ww > 180.0:
        # box lies wholly in the 180,360 half, so wrap it round
        return ww-360., ee-360.

    return ww, ee
Note: See TracBrowser for help on using the repository browser.