source: exist/trunk/python/ndgUtils/lib/utilities.py @ 4494

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/utilities.py@4494
Revision 4494, 15.3 KB checked in by cbyrom, 12 years ago (diff)

Add new methods to lookup simple URLs and vocab term urls. NB, problems were encountered using the checkURL method, which uses httplib, when running with proxy server. Implement usage of new methods + fix small bug with keeping too many related links + tidy up unused imports.

Line 
1import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, socket
2from ndgUtils.ETxmlView import subAI
3import csml.csmlscan as CsmlScan
4from xml.sax.saxutils import escape
5'''
6Various helper methods for use with the granulator command line tool
7@author: C Byrom
8'''
9_subtool = subAI()    # tool to escape angular brackets - enable as field variable for easy reuse
10
# Pattern for a (possibly partial) ISO8601 date/time.  Capture groups used
# elsewhere in this module: 1 = year, 3 = month, 5 = day, 7 = hours,
# 8 = minutes, 10 = seconds.  Raw strings keep the regex escapes literal.
ISO8601_RE = (
    r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"
)

# compiled once at import time for reuse
regExp = re.compile(ISO8601_RE)
15
16# Regular expression string to allow months to be stripped out of parameters
17MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"
18
19esc_chars = {'\xb0':'°','°':'°'}
20   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    NB, urllib2's default handlers normally follow redirects themselves, so the
    redirect loop below is only a fallback for responses that still come back
    with a 3xx status.

    @param uri: vocab term uri to check
    @return: 1 if valid, 0 otherwise
    @raise ValueError: if the uri is rejected by urllib2 as badly formatted
    '''
    logging.debug("Checking validity of uri, '%s'" %uri)
    # set the socket timeout period
    # - NB, this changes the default for every socket created after this call
    socket.setdefaulttimeout(5)

    try:
        redirectCounter = 10
        page = urllib2.urlopen(uri)
        try:
            status = page.code

            # check for redirection - NB, only do this a limited number of times
            while (status >= 300) and (status <= 399) and redirectCounter > 0:
                redirectCounter -= 1

                # lookup redirected location
                url = page.info().get('location')
                logging.info("Redirect response received - checking new location, '%s'" %url)

                # release the previous response, then follow the redirect itself
                # (previously the original uri was re-opened here, so the new
                # location was never actually checked)
                page.close()
                page = urllib2.urlopen(url)
                status = page.code
        finally:
            page.close()

        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1
        else:
            logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - bare raise keeps the original traceback
        raise
    except Exception as e:
        # format the exception itself: its 'message' attribute is frequently empty
        logging.error("Exception thrown whilst verifying uri: '%s'" %e)

    logging.debug("- url appears to be invalid")
    return 0
63
64   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects (up to 10).
    @param url:  url to lookup
    @return: 1 if url is valid, 0 if not
    @raise ValueError: if the url (or a redirect target) has no host or no path
    '''
    def getRequestHead(url):
        '''
        Create a HTTP(S) connection to a specified URL and return the message HEAD
        @param url: url to retrieve HEAD from
        @return Response object relating to the HEAD retrieval
        @raise ValueError: if the url has no host or no path component
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        scheme, host, path, query = urlparse.urlsplit(url)[0:4]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # the query string is part of the resource being checked - keep it
        if query:
            path += "?" + query

        if scheme == 'https':
            connection = httplib.HTTPSConnection(host)
        else:
            connection = httplib.HTTPConnection(host)

        try:
            connection.request("HEAD", path)
            # status + headers are read by getresponse(), so the socket can be
            # released straight away
            return connection.getresponse()
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        resp = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while (resp.status >= 300) and (resp.status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            location = resp.getheader('location')
            if not location:
                logging.info("Redirect response received without a location - link broken")
                return 0

            # the location may be relative - resolve it against the current url
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            resp = getRequestHead(url)

        if resp.status >= 200 and resp.status <= 299:
            logging.info("URL resolved successfully")
            return 1

        else:
            logging.info("Invalid return code received (%s) - link broken" %resp.status)
            return 0
    except ValueError:
        # propagate invalid format errors - bare raise keeps the original traceback
        raise
    except Exception as e:
        # format the exception itself: its 'message' attribute is frequently empty
        logging.error("Failed to lookup URL: '%s'" %e)
        return 0
119
120
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into its (up to) three components.
    Missing components are returned as empty strings; every component has
    surrounding whitespace removed.
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, pass the string through escapeSpecialCharacters
    before splitting (default True)
    @return list of exactly three strings
    @raise ValueError: if the string contains more than two '|' separators
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    parts = [part.strip() for part in tripleString.split('|')]
    if len(parts) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out to three elements
    triple = parts + [""] * (3 - len(parts))

    logging.debug("- returning triple data in array")
    return triple
144
145
def getString(obj):
    '''
    Convert obj to a byte string.
    @param obj: a byte string or unicode string
    @return str(obj) where that works; otherwise the utf-8 encoding of
    unicode(obj)
    '''
    try:
        result = str(obj)
    except UnicodeEncodeError:
        # str() could not encode the characters - encode explicitly as utf-8
        result = unicode(obj).encode('utf-8')
    return result
156
157
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Convert the input to a byte string (via getString) and encode it as
    utf-8 using the 'xmlcharrefreplace' error handler.
    @param inputString: string to encode
    @return encoded string
    '''
    # NOTE(review): utf-8 can represent virtually every character, so the
    # xmlcharrefreplace handler is unlikely ever to be triggered here; earlier
    # (commented out) variants used 'ascii' / 'latin-1' - confirm which was intended
    byteString = getString(inputString)
    return byteString.encode('utf-8', 'xmlcharrefreplace')
172
173
def isValidUnicode(inputString):
    '''
    Check whether the input can be represented as utf-8 encoded unicode:
    unicode objects must encode to utf-8, anything else must decode from utf-8.
    @param inputString: byte string or unicode string to check
    @return True if valid (or if the input is empty/None), False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    try:
        if isinstance(inputString, unicode):
            converted = inputString.encode('utf-8')
        else:
            converted = unicode(inputString, 'utf-8')
        isValid = bool(converted)
    except Exception as e:
        # any conversion failure simply marks the string as invalid
        logging.debug(e.message)
        isValid = False

    if not isValid:
        logging.debug("- invalid")
    else:
        logging.debug(" - valid")
    return isValid
200   
201
def escapeSpecialCharacters(inputString):
    '''
    Make a string XML friendly by running it through cgi.escape, after
    which the original input is checked with isValidUnicode.
    @param inputString: string whose value to correct
    @return: corrected string
    @raise ValueError: if isValidUnicode rejects the input string
    '''
    escaped = cgi.escape(inputString)
    if not isValidUnicode(inputString):
        raise ValueError("Input string, '%s', contains illegal characters" %inputString)

    # only report when the escaping actually altered something
    if inputString != escaped:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, escaped))
    return escaped
217
218       
def createCSMLFile(CDMLFilePath, datasetID, timeAxis):
    '''
    Run csmlscan against the specified CDML file to produce a CSML file.
    The output file is named '<datasetID>_csml.xml'.
    @param CDMLFilePath: string path to CDML file
    @param datasetID: string dataset ID to use in CSMLfile
    @param timeAxis: string name of time axis to use in CDML file
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # argument list in the same form as the csmlscan command line
    scanArgs = ['csmlscan',
                '-i', datasetID,
                '-t', timeAxis,
                '-o', CSMLFileName,
                CDMLFilePath]
    CsmlScan.main(scanArgs)

    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
233
234
def getISO8601Date(datestring):
    '''
    Rebuild a datestring from the components matched by the module's
    ISO8601 regular expression (regExp).

    NB, the pattern is applied with match(), so only the start of the string
    has to fit - any trailing text is ignored.
    NB, the 'T' separator and a trailing 'Z' are always added, even if the
    input has no time part (e.g. '2008-01-02' gives '2008-01-02TZ'), and any
    timezone offset or fractional seconds in the input are dropped.
    NOTE(review): unclear whether callers rely on this exact output form - confirm
    before changing it.

    @param datestring: string containing date in some format
    @return: the rebuilt date string, or '' if the input is empty
    @raise ValueError: if the start of the string does not match the pattern
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if not match:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = match.group(1, 3, 5)
    hours, minutes, seconds = match.group(7, 8, 10)

    # date part
    pieces = [year]
    if month:
        pieces.append("-" + month)
    if day:
        pieces.append("-" + day)

    # time part
    pieces.append("T")
    if hours:
        pieces.append(hours)
    if minutes:
        pieces.append(":" + minutes)
    if seconds:
        pieces.append(":" + seconds)
    pieces.append("Z")

    return "".join(pieces)
278   
279
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string with getISO8601Date and cut the result down to its
    first ten characters (YYYY-MM-DD) when it is longer than 11 characters.
    NB, shorter results (e.g. from a year-only input) are returned as
    produced by getISO8601Date, and an empty input gives ''.
    @param dateString - a string containing datetime info
    @return string holding the (possibly truncated) date
    @raise ValueError: propagated from getISO8601Date if the string cannot be parsed
    '''
    isoDate = getISO8601Date(dateString)
    if len(isoDate) <= 11:
        return isoDate

    return isoDate[0:10]
291
292
def formatDateYYYY(dateString):
    '''
    Simple date manipulation on a string, if it is understood: for a string of
    three '-' separated numbers (year-mon-day or day-mon-year) return the year,
    taken to be whichever of the first and last components is numerically larger.
    Any other string - including one whose first/last component is not a plain
    integer, e.g. '2008-01-02T03:04:05' - is returned unchanged.
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string containing the date
    @return: the year component, or the input string if the format is not understood
    '''
    parts = dateString.split('-')
    if len(parts) != 3:
        return dateString # unknown format as yet ...

    try:
        first, last = int(parts[0]), int(parts[2])
    except ValueError:
        # non-numeric component (e.g. a time suffix) - treat as unknown format
        # rather than letting the conversion error escape
        return dateString

    if first > last:
        return parts[0]
    return parts[2]
307   
308   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    NB, the elements keep their position in the triple, so an empty first
    element no longer causes an error or gets replaced by the third element
    @param params_string: parameter string, optionally in 'name | url | name' triple form
    @return: tidied parameter string
    '''
    logging.debug("Tidying up parameters string, %s" %params_string)

    def tidy(value):
        '''
        Upper case a value, remove month names and squash surplus whitespace
        '''
        if not value:
            return value
        # Strip out any months in the string + uppercase everything
        # NOTE(review): MONTHS is matched as a plain substring, so a month name
        # inside a longer word is removed as well - confirm this is acceptable
        value = re.sub(MONTHS, '', value.upper())
        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', value.strip())

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    data = getTripleData(params_string)
    first = tidy(data[0])
    third = tidy(data[2])

    # now, recreate the parameters string
    r = first
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + third

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
342
343
344   
class coverageAggregate:
    '''
    Collects the spatiotemporal coverage of granules so that an overall
    coverage element can be built, skipping any (bbox, time) combination that
    has already been recorded.
    '''
    def __init__(self,M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        factories used by makeElement
        '''
        self.M=M
        # (bbox, time) pairs seen so far
        self.spaceTime=[]
        # coverage ranges collected for those pairs
        self.coverageList=[]

    def add(self,bbox,time,coverage):
        '''
        Record a coverage, unless its (bbox, time) pair is already known
        @param bbox: bounding box of the coverage
        @param time: time info of the coverage
        @param coverage: object whose dgSpatioTemporalCoverage.dgSpatioTemporalRange
        is stored
        '''
        key = (bbox,time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        stCoverage = coverage.dgSpatioTemporalCoverage
        self.coverageList.append(stCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        @return: dgCoverage element wrapping all collected ranges, or None if
        nothing has been added
        '''
        logging.info("Setting up coverage element")
        if self.spaceTime != []:
            stCoverage = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
            return self.M.dgCoverage(dgSpatioTemporalCoverage=stCoverage)
        return None
367
368
class parameterAggregate:
    '''
    Accumulates parameter summaries, using an index of parameter names so
    that each name is only stored once (first occurrence wins).
    '''
    def __init__(self):
        # summaries kept, in order of first appearance
        self.paramSummaries=[]
        # names of the summaries kept - same order as paramSummaries
        self.paramNameIndex=[]

    def add(self,subset):
        '''
        Add every summary in subset whose ParameterName has not been seen yet
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for summary in subset:
            name = summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        '''
        @return: list of the summaries collected so far
        '''
        return self.paramSummaries
383
384
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get a text object in an error handler: missing
    elements and elements without text yield '' instead of an error or None.
    @param element: ET element to search from (may be None)
    @param xpathExpression: path expression handed to find/findall
    @keyword multiple: if true, look up all matches and return a list
    @return: text of the first match ('' if there is none) - or, if multiple
    is set, a list with the text of every match ([''] if element is None)
    '''
    if element is None:
        if multiple:
            return ['',]
        return ''

    if multiple:
        found = element.findall(xpathExpression)
    else:
        found = [element.find(xpathExpression),]

    texts = []
    for node in found:
        # find() gives None when nothing matches, and a matched element may have
        # no text - both cases map to an empty string
        text = getattr(node, 'text', None)
        if text is None:
            text = ''
        texts.append(text)

    if multiple:
        return texts

    return texts[0]
415
416
417# Format a datetime through its full proleptic Gregorian date range.
418#
419# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
420# '1850/00/02 was a Friday'
421# >>>
422# - NB, this is required since native python strftime doesn't work
423# on dates before 1900
# matches an (unescaped) %s directive - not supported by strftime below
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find every position at which substr occurs in text - including
    overlapping occurrences
    @return: list of start indices, in ascending order
    '''
    # Also finds overlaps
    sites = []
    i = 0
    while 1:
        j = text.find(substr, i)
        if j == -1:
            break
        sites.append(j)
        i = j + 1
    return sites

# Every 28 years the calendar repeats, except through century leap
# years where it's 6 years.  But only if you're using the Gregorian
# calendar.  ;)

def strftime(dt, fmt):
    '''
    Format a date/datetime with a strftime format string, also for years that
    the native strftime cannot handle.  For years up to 1900 the date is
    formatted twice with stand-in years 28 years apart; the positions where
    both results show their stand-in year are then overwritten with the real
    (zero padded, four digit) year.
    @param dt: date or datetime object to format
    @param fmt: strftime format string - must not contain the %s directive
    @return: formatted string
    @raise TypeError: if fmt contains %s
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    year = dt.year
    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    delta = 2000 - year
    off = 6*(delta // 100 + delta // 400)
    year = year + off

    # Move to around the year 2000
    year = year + ((2000 - year)//28)*28
    timetuple = dt.timetuple()
    s1 = time.strftime(fmt, (year,) + timetuple[1:])
    sites1 = _findall(s1, str(year))

    s2 = time.strftime(fmt, (year+28,) + timetuple[1:])
    sites2 = _findall(s2, str(year+28))

    # only positions showing the stand-in year in both renderings really are
    # the year - anything else is a coincidental digit match
    sites = [site for site in sites1 if site in sites2]

    s = s1
    # zero pad, so that years before 1000 still occupy four digits
    # (previously these were padded with spaces)
    syear = "%04d" % (dt.year,)
    for site in sites:
        s = s[:site] + syear + s[site+4:]
    return s
474
475       
def normaliseLongitude(self, w,e):
    '''
    Take a 0,360 bounding box and force into -180,180:
    - west below 180 and east above 180: both values are reduced by 180
    - west above 180: both values are reduced by 360
    - otherwise the values are returned as they are (converted to float)
    NOTE(review): 'self' is not used although this is a module level function -
    presumably carried over from a class; callers must still pass it.
    NOTE(review): a west value of exactly 180 falls into the 'unchanged' case -
    confirm that this is intended.
    @param w: western longitude (number or numeric string)
    @param e: eastern longitude (number or numeric string)
    @return: tuple of floats, (west, east)
    '''
    west, east = float(w), float(e)

    if west < 180.0 and east > 180.0:
        shift = 180.0
    elif west > 180.0:
        shift = 360.
    else:
        return west, east

    return west - shift, east - shift
Note: See TracBrowser for help on using the repository browser.