source: exist/trunk/python/ndgUtils/lib/utilities.py @ 4499

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/utilities.py@4499
Revision 4499, 15.1 KB checked in by cbyrom, 12 years ago (diff)

Improve error logging when testing for valid unicode.

Line 
1import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, socket
2from ndgUtils.ETxmlView import subAI
3import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
'''
Various helper methods for use with the granulator command line tool
@author: C Byrom
'''
# Single subAI instance (imported from ndgUtils.ETxmlView), created once at import time
# and kept at module level so that it can be reused
_subtool = subAI()    # tool to escape angular brackets - enable as field variable for easy reuse
10
# Pattern describing an ISO8601 date/time: a mandatory four digit year followed by
# optional, nested, month / day / time-of-day / timezone parts.
# Groups of interest: 1 = year, 3 = month, 5 = day, 7 = hours, 8 = minutes, 10 = seconds,
# 12 = fractional seconds, 13 = timezone ('Z' or a signed hh:mm offset)
# NB, raw strings are used so that the regex escapes (\s, \.) are not treated as
# string escapes
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# compiled once at import time for reuse by the date helpers
regExp = re.compile(ISO8601_RE)
15
# Regular expression string to allow months to be stripped out of parameters
# - an alternation of the upper case month names, applied with re.sub() to upper cased
# text by tidyUpParameters; NB, there are no word boundaries in the pattern
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# Lookup of degree sign representations ('\xb0' is the degree sign as a single byte).
# NOTE(review): as written, both entries map to the degree sign itself - possibly the
# replacement values were originally something else (e.g. an HTML entity) - TODO confirm.
# NOTE(review): the literals below are non-ASCII, so this module needs a source encoding
# declaration to be importable under python 2 - TODO confirm
esc_chars = {'\xb0':'°','°':'°'}
20   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    NB, this sets the default socket timeout for the whole process to 5 seconds

    @param uri: vocab term uri to check
    @return: 1 if valid, 0 otherwise
    @raise ValueError: if the uri (or a redirect location) is not in a format that
    urllib2 can open
    '''
    def fetchStatus(target):
        '''
        Open the target and return its status code and any 'location' header,
        making sure that the response is closed again
        '''
        page = urllib2.urlopen(target)
        try:
            return page.code, page.info().get('location')
        finally:
            page.close()

    logging.debug("Checking validity of uri, '%s'" %uri)
    # set the socket timeout period
    socket.setdefaulttimeout(5)

    try:
        redirectCounter = 10
        currentUrl = uri
        status, location = fetchStatus(currentUrl)

        # check for redirection - NB, only do this a limited number of times
        while (status >= 300) and (status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            if not location:
                logging.info("Redirect response received without a new location - giving up")
                break

            # lookup redirected location - NB, this may be relative to the current url
            currentUrl = urlparse.urljoin(currentUrl, location)
            logging.info("Redirect response received - checking new location, '%s'" %currentUrl)
            status, location = fetchStatus(currentUrl)

        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1
        else:
            logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - with the original traceback intact
        raise
    except Exception as e:
        # NB, not every exception provides a useful 'message' attribute, so log the
        # string form of the exception instead
        logging.error("Exception thrown whilst verifying uri: '%s'" %e)

    logging.debug("- url appears to be invalid")
    return 0
63
64   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects.
    @param url:  url to lookup
    @return: 1 if url is valid, 0 if not
    @raise ValueError: if the url does not include both a host and a path
    '''
    def getRequestHead(url):
        '''
        Create a HTTP(S) connection to a specified URL and return the message HEAD
        @param url: url to retrieve HEAD from
        @return Response object relating to the HEAD retrieval
        @raise ValueError: if the url does not include both a host and a path
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        scheme, host, path, params, query = urlparse.urlparse(url)[0:5]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # request the full resource - not just the path part of it
        if params:
            path += ";" + params
        if query:
            path += "?" + query

        if scheme == 'https':
            connection = httplib.HTTPSConnection(host)
        else:
            connection = httplib.HTTPConnection(host)

        try:
            connection.request("HEAD", path)
            # NB, status and headers are read by getresponse, so remain available
            # after the connection has been closed
            return connection.getresponse()
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        resp = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while (resp.status >= 300) and (resp.status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            location = resp.getheader('location')
            if not location:
                logging.info("Redirect response received without a new location - giving up")
                break

            # lookup redirected location - NB, this may be relative to the current url
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            resp = getRequestHead(url)

        if resp.status >= 200 and resp.status <= 299:
            logging.info("URL resolved successfully")
            return 1

        else:
            logging.info("Invalid return code received (%s) - link broken" %resp.status)
            return 0
    except ValueError:
        # propagate invalid format errors - with the original traceback intact
        raise
    except Exception as e:
        # NB, not every exception provides a useful 'message' attribute, so log the
        # string form of the exception instead
        logging.error("Failed to lookup URL: '%s'" %e)
        return 0
119
120
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into its (up to) three component parts
    NB, if fewer than three parts are present, the missing ones are returned as
    empty strings
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, escape special characters - e.g. '&' (default True)
    @return list of three strings, each stripped of surrounding whitespace
    @raise ValueError: if the string contains more than two '|' characters
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    fields = tripleString.split('|')
    if len(fields) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # tidy up the values found, then pad out to three elements
    returnData = [field.strip() for field in fields]
    returnData.extend([""] * (3 - len(returnData)))

    logging.debug("- returning triple data in array")
    return returnData
144
145
def getString(obj):
    '''
    Convert obj to a byte string
    NB, str() is tried first; if this fails because obj has characters which cannot
    be encoded by default, the utf-8 encoding of its unicode form is used instead
    @param obj: a byte string or unicode string
    @return byte string representation of obj
    '''
    try:
        byteString = str(obj)
    except UnicodeEncodeError:
        # obj is unicode
        byteString = unicode(obj).encode('utf-8')
    return byteString
156
157
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and replace any non-ASCII characters it contains with the
    html numerical code (e.g. '&#176;') required to display the data
    @param inputString: byte string or unicode string to encode; byte strings are
    read as utf-8, falling back to latin-1 if they are not valid utf-8
    @return encoded, ASCII only, byte string
    '''
    # NB, the character replacement only works on unicode data - calling encode on a
    # byte string forces an implicit ASCII decode, which fails for exactly the
    # characters that need replacing - so get hold of the unicode form first
    if isinstance(inputString, unicode):
        text = inputString
    else:
        rawBytes = str(inputString)
        try:
            text = rawBytes.decode('utf-8')
        except UnicodeDecodeError:
            # NB, the latin coding accepts every byte value
            text = rawBytes.decode('latin-1')

    # the XMLCHARREFREPLACE does the required character replacement - NB, this
    # must target ASCII, since utf-8 can represent everything and so replaces nothing
    return text.encode('ascii', 'xmlcharrefreplace')
172
173
def isValidUnicode(inputString):
    '''
    Check whether the input can be converted to/from utf-8: unicode strings are
    encoded as utf-8, anything else is decoded from utf-8
    NB, empty input is treated as valid; any error during conversion is logged at
    debug level and leads to the input being treated as invalid
    @param inputString: byte string or unicode string to check
    @return True if valid, False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    isValid = False
    try:
        if isinstance(inputString, unicode):
            converted = inputString.encode('utf-8')
        else:
            converted = unicode(inputString, 'utf-8')
        isValid = bool(converted)
    except UnicodeDecodeError as e:
        logging.debug("Error encountered: %s" %e.reason)
    except Exception as e:
        logging.debug(e.message)

    if not isValid:
        logging.debug("- invalid")
    else:
        logging.debug(" - valid")
    return isValid
202   
203
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters - i.e. '&', '<' and '>'
    NB, a note is logged if the data is changed as a result
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # NB, uses the escape function imported from xml.sax.saxutils - this does the same
    # replacements as cgi.escape, which is deprecated and no longer available in
    # recent versions of python
    correctedString = escape(inputString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, correctedString))
    return correctedString
215
216       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Run csmlscan against the specified CDML file to produce a CSML file; the output
    file is named after the dataset ID, with '_csml.xml' appended
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # set up the arguments as if csmlscan were being run from the command line
    scanArgs = ['csmlscan',
                '-i', datasetID,
                '-t', timeAxis,
                '-o', CSMLFileName,
                CDMLFilePath]
    CsmlScan.main(scanArgs)

    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
231
232
def getISO8601Date(datestring):
    '''
    Rebuild a date string from the ISO8601 components found at its start
    NB, the output always has a 'T' after the date part and always ends in 'Z' -
    even if no time was found, e.g. '2008-01-02' gives '2008-01-02TZ'; any fractional
    seconds or timezone offset in the input are not included in the output
    @param datestring: string containing date in some format
    @return: string built as described above - or '' if no datestring is provided
    @raise ValueError: if the start of the string does not match the ISO8601 pattern
    (i.e. it does not begin with a four digit year)
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if match is None:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = match.group(1, 3, 5)
    hours, minutes, seconds = match.group(7, 8, 10)

    # date part - the year is always available
    pieces = [year]
    if month:
        pieces.append("-" + month)
    if day:
        pieces.append("-" + day)
    pieces.append("T")

    # time part
    if hours:
        pieces.append(hours)
    if minutes:
        pieces.append(":" + minutes)
    if seconds:
        pieces.append(":" + seconds)
    pieces.append("Z")

    return "".join(pieces)
276   
277
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string, using getISO8601Date, and cut the result down to its first
    ten characters (i.e. YYYY-MM-DD) if it is more than eleven characters long;
    shorter results are returned as they are
    @param dateString - a string containing datetime info
    @return string as described above - or '' if no dateString is provided
    @raise ValueError: if raised by getISO8601Date for the input
    '''
    isoDate = getISO8601Date(dateString)
    if len(isoDate) <= 11:
        return isoDate

    return isoDate[:10]
289
290
def formatDateYYYY(dateString):
    '''
    Simple date manipulations on a string, if it is understood ...
    - if the string is made up of three '-' separated numbers (expecting
    year,mon,day or day,mon,year) return the year, taken to be the larger of the
    first and last numbers; anything else is returned unchanged
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string containing the date
    @return string with the year, or the input string if the format is not understood
    '''
    s = dateString.split('-')
    if len(s) != 3:
        return dateString # unknown format as yet ...

    try:
        first, last = int(s[0]), int(s[2])
    except ValueError:
        # non numeric parts, e.g. a time is included - also an unknown format
        return dateString

    if first > last:
        return s[0]
    return s[2]
305   
306   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    NB, month names are removed wherever they occur - including inside longer words
    @param params_string: string with the parameter info - either a single term or
    a '|' delimited triple
    @return tidied string; if the input had a '|' in it, the parts are rejoined
    with ' | ', in their original positions
    @raise ValueError: if raised by getTripleData for the input
    '''
    def tidyTerm(term):
        '''
        Upper case a term, remove months and sort out the spacing
        '''
        if not term:
            return ""
        # Strip out any months in the string + uppercase everything
        term = re.sub(MONTHS, '', term.upper())
        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', term.strip())

    logging.debug("Tidying up parameters string, %s" %params_string)

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    data = getTripleData(params_string)

    # NB, tidy the first and third terms separately, so that an empty first term
    # neither causes an error nor gets replaced by the third term
    r = tidyTerm(data[0])

    # now, recreate the parameters string
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + tidyTerm(data[2])

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
340
341
342   
class coverageAggregate:
    '''
    Granules have spatiotemporal boxes, but we want an overall spatio temporal box too, if the
    boxes are the same, but respecting space and time differently
    - collects the spatio temporal range of each distinct (bbox, time) combination added
    '''
    def __init__(self,M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        callables used by makeElement to build the coverage element
        '''
        self.M=M
        # (bbox, time) combinations already added
        self.spaceTime=[]
        # spatio temporal range of each of these combinations
        self.coverageList=[]

    def add(self,bbox,time,coverage):
        '''
        Add coverage data - unless data for this bbox and time has already been added
        @param bbox: bounding box of the coverage
        @param time: time range of the coverage
        @param coverage: object whose dgSpatioTemporalCoverage.dgSpatioTemporalRange
        is the data to store
        '''
        key = (bbox,time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        # NB, wrap the values in a tuple, so that the formatting also works if they
        # are tuples themselves
        logging.info("- bbox, '%s'" %(bbox,))
        logging.info("- time, '%s'" %(time,))
        self.spaceTime.append(key)
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        Create a coverage element from the data added
        @return result of M.dgCoverage, wrapping all the ranges added - or None if
        nothing has been added
        '''
        logging.info("Setting up coverage element")
        if self.spaceTime==[]:
            return None

        spatioTemporalCoverage = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporalCoverage)
365
366
class parameterAggregate:
    '''
    Collects parameter summaries, keeping only the first summary seen for each
    parameter name - so that aggregation is done without duplication
    '''
    def __init__(self):
        # names of the parameters collected so far + the matching summaries
        self.paramNameIndex = []
        self.paramSummaries = []

    def add(self, subset):
        '''
        Add a subset - skipping any parameters whose name has been seen before
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for param in subset:
            name = param.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(param)

    def get(self):
        '''
        @return list of the parameter summaries collected, in the order added
        '''
        return self.paramSummaries
381
382
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get a text object, so that missing elements and elements
    without any text give an empty string instead of an error or None
    @param element: ET element to search from - may be None
    @param xpathExpression: path expression, passed to element.find/findall
    @keyword multiple: if true, get the text of all matching elements
    (default 0 - i.e. first match only)
    @return if multiple is set, a list with the text of each matching element (NB,
    this is [''] if element is None and [] if there are no matches); otherwise the
    text of the first match, or '' if there is none
    '''
    def none2txt(i):
        if i is None:
            return ''
        return i

    if element is None:
        if multiple:
            return ['',]
        else:
            return ''

    if multiple:
        r=element.findall(xpathExpression)
    else:
        r=[element.find(xpathExpression),]

    rr = []
    for i in r:
        # NB, find returns None if there is no match
        if i is None:
            rr.append('')
        else:
            rr.append(none2txt(i.text))

    if multiple:
        return rr

    return rr[0]
413
414
# Format a datetime through its full proleptic Gregorian date range.
#
# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
# '1850/00/02 was a Friday'
# >>>
# - NB, this is required since native python strftime doesn't work
# on dates before 1900
# - NB, a '%s' directive in the format string is not supported
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find the start index of every occurrence of substr in text
    - NB, overlapping occurrences are included
    @return list of indexes, in ascending order
    '''
    sites = []
    start = 0
    while True:
        found = text.find(substr, start)
        if found == -1:
            return sites
        sites.append(found)
        # only move on by one character, so that overlaps are picked up
        start = found + 1

# Every 28 years the calendar repeats, except through century leap
# years where it's 6 years.  But only if you're using the Gregorian
# calendar.  ;)

def strftime(dt, fmt):
    '''
    Format a date or datetime - including those from years that the native strftime
    cannot handle
    @param dt: object with a year attribute and strftime and timetuple methods
    @param fmt: format string - as used by the native strftime
    @return formatted string
    @raise TypeError: if fmt includes the '%s' directive
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    delta = 2000 - dt.year
    year = dt.year + 6*(delta // 100 + delta // 400)

    # Move to around the year 2000
    year += ((2000 - year)//28)*28

    # format the date using two stand-in years, 28 years apart; wherever the stand-in
    # year turns up at the same place in both results, the real year belongs
    remainder = dt.timetuple()[1:]
    s1 = time.strftime(fmt, (year,) + remainder)
    s2 = time.strftime(fmt, (year+28,) + remainder)
    sites2 = _findall(s2, str(year+28))
    sites = [site for site in _findall(s1, str(year)) if site in sites2]

    # swap the stand-in year for the real one
    syear = "%4d" % (dt.year,)
    s = s1
    for site in sites:
        s = s[:site] + syear + s[site+4:]
    return s
472
473       
def normaliseLongitude(self, w,e):
    '''
    Take a 0,360 bounding box and force into -180,180
    - a box running from under 180 to over 180 is moved back by 180 degrees
    - any other box with a limit over 180 is moved back by 360 degrees
    - otherwise the box is returned as it is
    @param self: not used - NB, this is a module level function
    @param w: western limit of the box - number, or string representing a number
    @param e: eastern limit of the box - number, or string representing a number
    @return tuple of floats (west, east)
    @raise ValueError: if w or e cannot be converted to float
    '''
    ww,ee=float(w),float(e)
    if ww<180.0 and ee>180.0:
        return ww-180.0,ee-180.0

    # NB, also check the eastern limit here, so that a box starting at exactly 180
    # is not returned with its eastern limit outside the -180,180 range
    if ww>180.0 or ee>180.0:
        return ww-360.,ee-360.

    return ww,ee
Note: See TracBrowser for help on using the repository browser.