source: ndgCommon/trunk/ndg/common/src/lib/utilities.py @ 4977

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/utilities.py@4977
Revision 4977, 22.0 KB checked in by cbyrom, 11 years ago (diff)

Add new utility methods to determine the default proxies to use, by checking
the relevant environment variables, and to check whether the proxy is actually needed for a given url (i.e. excluding urls that match the no_proxy setting). Adjust the codebase to use these methods, both when doing simple url lookups and when opening normal urls.

Line 
1'''
2Various helper methods for use across different applications
3@author: C Byrom
4'''
5import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, urllib, socket, uuid
6from ndg.common.src.models.ndgObject import ndgObject
7import csml.csmlscan as CsmlScan
8
# Environment variables to check when looking up default proxy set ups
PROXY_KEY = 'http_proxy'
NO_PROXY_KEY = 'no_proxy'

# Pattern for (possibly partial) ISO8601 date/times.  Written as raw strings
# so that the '\s' and '\.' escapes reach the regex engine untouched.
# Groups read elsewhere in this module: 1 = year, 3 = month, 5 = day,
# 7 = hours, 8 = minutes, 10 = seconds.  Fractional seconds (group 12) and
# the timezone designator (group 13) are matched as well.
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" + \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# Compiled once at import time
regExp = re.compile(ISO8601_RE)

YEAR_FORMAT = '%Y-%m-%d'    # format to use when parsing dates

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# Mapping of special characters to their replacements.
# NOTE(review): both values are a literal degree sign here, which makes the
# mapping a no-op; presumably the values were originally an HTML entity that
# has been decoded somewhere along the way - confirm against the repository.
esc_chars = {'\xb0':'°','°':'°'}

# Flag checked by simpleURLCheck before it installs a urllib2 opener
URLIB2_INITIALISED = False
26
27       
class edict(dict):
    '''
    An extended dictionary which allows one to set and get values
    as attributes (kudos Joe Gregorio's 1812).
    That is,
       d.fred
    is the same as
       d['fred']
    Names starting with an underscore are never mapped onto dictionary keys;
    they are stored as ordinary instance attributes instead.
    '''
    def __init__(self, *args, **kw):
        '''
        Populate the dictionary
        @param args: optional mapping or iterable of key/value pairs, as
        accepted by the dict constructor
        @keyword kw: initial key/value pairs
        '''
        dict.__init__(self, *args, **kw)

    def __getattr__(self, key):
        '''
        Look up a missing attribute as a dictionary key
        @param key: name of the attribute being looked up
        @raise AttributeError: if key starts with '_' or is not in the dictionary
        @return value stored under key
        '''
        # NB, python only calls __getattr__ once the normal attribute lookup
        # (including the instance __dict__) has failed
        if key.startswith('_'):
            raise AttributeError("object has no attribute '%s'" % key)
        try:
            return self[key]
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        '''
        Store a value - as a real attribute for underscore-prefixed names and
        as a dictionary entry otherwise
        @param key: name of the attribute being set
        @param value: value to store
        '''
        if key.startswith('_'):
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)
55
56
def loadConfigDetails(configFilename, dbName = None):
    '''
    Load the config file details for the DB and return the data relating
    to the specified db.  Each non-blank line of the file is expected to hold
    three whitespace separated fields: host, userid and password.
    @param configFilename: name of config file to use
    @keyword dbName: name of DB to use; if specified, must feature in the config file;
    if not specified, use first config data found in file
    @raise ValueError: if the config file does not exist, contains a line
    without exactly three fields, or has no entry for dbName
    @raise IOError: if the config file cannot be opened
    @return userID, password, hostname for specified DB
    '''
    logging.info("Loading DB config data")
    # Check this file exists
    if not os.path.isfile(configFilename):
        errorMessage = "Could not find the DB config file, %s; please make sure this " \
                 "is available from the running directory" %configFilename
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    f = open(configFilename, 'r')
    # the try/finally guarantees the file is closed on every exit path,
    # including the 'not found' and 'malformed line' errors
    try:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # split on any run of whitespace so repeated spaces/tabs are tolerated
            fields = line.split()
            if len(fields) != 3:
                # NB, the line itself is not logged since it may hold a password
                raise ValueError("Invalid line in DB config file, %s - expected "
                                 "'host userid password'" %configFilename)

            host, userid, password = fields
            if not dbName or dbName == host:
                logging.debug("Returning config file info for db, '%s'" %host)
                return (userid, password, host)
    finally:
        f.close()

    raise ValueError('Unable to find valid eXist config data')
89
90
def getDefaultProxy(url):
    '''
    Checks the OS environment to see if any proxies are specified; if so,
    return the proxy details in a dict
    @param url: url that is being looked up; NB, there may be a no_proxy env
    variable set - and this may affect how urls for certain machines will be
    proxied
    @return dict: with format key = proxy type, val = proxy value
    - e.g. 'http':'http://wwwcache.rl.ac.uk:8080'; an empty dict is returned
    if no proxy is set or if the url matches one of the no_proxy entries
    '''
    proxy = os.getenv(PROXY_KEY)
    if not proxy:
        return {}

    logging.debug("Found proxy setting - checking if this applies to the url being accessed")
    noProxyHosts = os.getenv(NO_PROXY_KEY)
    if noProxyHosts:
        lowerUrl = url.lower()
        # accept both the ';' separator originally used here and ','
        for host in re.split('[;,]', noProxyHosts):
            host = host.strip().lower()
            # NB, skip empty entries (e.g. from a trailing separator) - an
            # empty string is found in every url, which would disable the
            # proxy for everything
            if not host:
                continue

            # simple substring match against the whole url
            if lowerUrl.find(host) > -1:
                logging.debug("Proxies disabled for communication with uri machine ('%s')"
                              %host)
                return {}

    return {'http': proxy}
114
115
def openURLWithDefaultProxy(url):
    '''
    Read the contents of a url, going via whatever proxy the OS environment
    specifies for it (if any)
    @param url: url to open
    @return data read from the url
    '''
    logging.debug("Checking environment variables for proxies")
    return openURLWithProxy(url, getDefaultProxy(url))
125       
126
def openURLWithProxy(uri, proxy):
    '''
    Open a simple url connection using the specified proxy, and retrieve the contents
    @param uri: uri to read from
    @param proxy: dict with proxy info in format key = protocol, val = proxy host
    @return pageData: data read from uri
    '''
    logging.debug("Reading info from uri, '%s'" %uri)
    f = urllib.urlopen(uri, proxies = proxy)
    # make sure the connection is released even if the read fails
    try:
        pageData = f.read()
    finally:
        f.close()
    logging.debug("- returning info from uri")
    return pageData
140
141   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    @param uri: vocab term uri to check
    @raise ValueError: if urllib2 rejects the uri as badly formed
    @return: 1 if valid, 0 otherwise
    '''
    logging.debug("Checking validity of uri, '%s'" %uri)

    # NB, nothing visible in this module ever sets URLIB2_INITIALISED, so the
    # opener is rebuilt on every call - this keeps the proxy/no_proxy decision
    # specific to the uri being checked
    if not URLIB2_INITIALISED:
        # set the socket timeout period
        socket.setdefaulttimeout(120)

        proxy = getDefaultProxy(uri)
        proxy_support = urllib2.ProxyHandler(proxy)

        # build a new opener using the proxy settings and install it
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

    try:
        redirectCounter = 10
        currentUrl = uri
        page = urllib2.urlopen(currentUrl)
        status = page.code

        # check for redirection - NB, only do this a limited number of times
        while (status >= 300) and (status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            location = page.info().get('location')
            page.close()
            if not location:
                # nowhere to go - fall through to the invalid status handling
                break

            # the location may be relative to the page that issued the redirect
            currentUrl = urlparse.urljoin(currentUrl, location)
            logging.info("Redirect response received - checking new location, '%s'" %currentUrl)
            page = urllib2.urlopen(currentUrl)
            status = page.code

        page.close()

        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1
        else:
            logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - with the original traceback
        raise
    except Exception as e:
        # NB, format the exception itself; e.message is empty for urllib2.URLError
        logging.error("Exception thrown whilst verifying uri: '%s'" %e)

    logging.debug("- url appears to be invalid")
    return 0
195
196   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects (up to 10).
    @param url:  url to lookup
    @raise ValueError: if the url (or a redirect location) has no host or no path
    @return: 1 if url is valid, 0 if not
    '''
    def getRequestHead(url):
        '''
        Create a HTTP(S) connection to a specified URL and return the message HEAD
        @param url: url to retrieve HEAD from
        @return Response object relating to the HEAD retrieval
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        scheme, host, path, params, query = urlparse.urlparse(url)[0:5]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # request the full resource - not just the bare path
        if params:
            path += ';' + params
        if query:
            path += '?' + query

        if scheme.lower() == 'https':
            connection = httplib.HTTPSConnection(host)
        else:
            connection = httplib.HTTPConnection(host)

        # NB, a HEAD response has no body, so the status and headers remain
        # available on the response after the connection has been closed
        try:
            connection.request("HEAD", path)
            return connection.getresponse()
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        resp = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while (resp.status >= 300) and (resp.status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            location = resp.getheader('location')
            if not location:
                # nowhere to go - treat as a broken link below
                break

            # the location may be relative to the url that issued the redirect
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            resp = getRequestHead(url)

        if resp.status >= 200 and resp.status <= 299:
            logging.info("URL resolved successfully")
            return 1

        else:
            logging.info("Invalid return code received (%s) - link broken" %resp.status)
            return 0
    except ValueError:
        # propagate invalid format errors - with the original traceback
        raise
    except Exception as e:
        # NB, format the exception itself rather than the deprecated e.message
        logging.error("Failed to lookup URL: '%s'" %e)
        return 0
251
252
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into the (up to) three elements of a triple;
    any elements not supplied are returned as empty strings
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, escape special characters - e.g. '&' (default True)
    @raise ValueError: if the string contains more than two '|' characters
    @return 1-D array with three elements, representing the data in the triple
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    returnData = ["", "", ""]
    for position, part in enumerate(tripleString.split('|')):
        # a fourth element means there was one '|' too many
        if position > 2:
            raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)
        returnData[position] = part.strip()

    logging.debug("- returning triple data in array")
    return returnData
276
277
def getString(obj):
    '''
    Convert obj to a string via str(); if that fails with a
    UnicodeEncodeError, fall back to the utf-8 encoding of its unicode form
    @param obj: a byte string or unicode string
    @return string representation of obj
    '''
    try:
        text = str(obj)
    except UnicodeEncodeError:
        # obj is unicode holding characters str() cannot encode
        text = unicode(obj).encode('utf-8')
    return text
288
289
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and adjust any non-ASCII characters it contains into
    the required html numerical code (e.g. '&#176;') to display the data
    @param inputString: byte string or unicode string to encode
    @return encoded string - a pure ASCII byte string
    '''
    if isinstance(inputString, unicode):
        text = inputString
    else:
        byteString = getString(inputString)
        # NB, byte strings must be decoded explicitly; calling encode() on
        # them directly triggers an implicit ASCII decode, which fails for
        # any non-ASCII data
        try:
            text = byteString.decode('utf-8')
        except UnicodeDecodeError:
            # not valid utf-8; latin-1 maps every byte to a character
            text = byteString.decode('latin-1')

    # the XMLCHARREFREPLACE does the required character replacement - NB, this
    # only kicks in when encoding to a charset that cannot represent the
    # character, hence 'ascii' rather than 'utf-8'
    return text.encode('ascii', 'xmlcharrefreplace')
304
305
def isValidUnicode(inputString):
    '''
    Checks whether the input string can be represented as utf-8: unicode
    strings are encoded to utf-8 and anything else is decoded from utf-8.
    Empty/None input is treated as valid.
    @param inputString: byte string or unicode string to check
    @return True if valid, False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    isValid = False
    try:
        if isinstance(inputString, unicode):
            converted = inputString.encode('utf-8')
        else:
            converted = unicode(inputString, 'utf-8')

        # a non-empty conversion result marks the input as valid
        if converted:
            isValid = True
    except UnicodeDecodeError as e:
        logging.debug("Error encountered: %s" %e.reason)
    except Exception as e:
        logging.debug(e.message)

    if not isValid:
        logging.debug("- invalid")
    else:
        logging.debug(" - valid")
    return isValid
334   
335
def escapeSpecialCharacters(inputString):
    '''
    Run the input through cgi.escape to make it XML friendly, logging
    the before/after values whenever the escaping changed anything
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    escaped = cgi.escape(inputString)

    wasChanged = inputString != escaped
    if wasChanged:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, escaped))
    return escaped
347
348       
def createCSMLFile(CDMLFilePath, timeAxis, datasetID = None):
    '''
    Create a CSML file by running csmlscan against the specified CDML file.
    The output file is named '<datasetID>_csml.xml'.
    @param CDMLFilePath: string path to CDML file
    @param timeAxis: string name of time axis to use in CDML file; the '-t'
    option is only passed to csmlscan if this is set
    @keyword datasetID: string dataset ID to use in CSMLfile - if not set, a
    uuid1 based ID will be generated instead
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")

    # fall back to a generated ID when none is supplied
    datasetID = datasetID or str(uuid.uuid1())
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # assemble the command line style arguments expected by csmlscan;
    # NB, datasetID is always set by this point so '-i' is always passed
    inputVals = ['csmlscan', '-o', CSMLFileName, '-i', datasetID]
    if timeAxis:
        inputVals += ['-t', timeAxis]
    inputVals.append(CDMLFilePath)

    CsmlScan.main(inputVals)
    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
375
376
def isCSMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CSML or not - judged
    by the content featuring the CSML namespace or one of the CSML element names
    @param fileContent: content of the file to check - as a string
    @return True if CSML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CSML file")
    csmlMarkers = (ndgObject.CSML_NS, 'CSMLFeatureCollection', 'CSMLStorageDescriptor')

    # any one of the markers is enough to identify the content as CSML
    if any(fileContent.find(marker) > -1 for marker in csmlMarkers):
        logging.info("- file is of CSML format")
        return True

    logging.info("- file is not of CSML format")
    return False
392
393
def isCDMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CDML or not - judged
    by the content featuring either the CDML DTD or all of the strings
    'dataset', 'attr' and 'axis'
    @param fileContent: content of the file to check - as a string
    @return True if CDML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CDML file")
    # NB, this is a bit of a fudge - there may be a better way to check for
    # CDML-ness
    # NB, every find() result must be compared with -1; a bare find() is
    # truthy when the text is missing (-1) and falsy when found at index 0
    if fileContent.find(ndgObject.CDML_DTD) > -1 or \
        (fileContent.find('dataset') > -1 and \
        fileContent.find('attr') > -1 and \
        fileContent.find('axis') > -1):
        logging.info("- file is of CDML format")
        return True

    logging.info("- file is not of CDML format")
    return False
412
413
def getISO8601Date(datestring):
    '''
    Rebuilds an input datestring from the components matched by the module's
    ISO8601 regular expression.  NB, the output always contains a 'T' after the
    date part and always ends in 'Z' - whether or not a time was matched;
    matched fractional seconds and timezone offsets are not copied to the output
    @param datestring: string containing date in some format
    @return: rebuilt date string, or '' if datestring is empty
    @raise ValueError: if the start of the string does not match the ISO8601 pattern
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if not match:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    year, month, day = match.group(1, 3, 5)
    hours, minutes, seconds = match.group(7, 8, 10)

    # date part - the year is always present once the pattern has matched
    pieces = [year]
    if month:
        pieces.append("-" + month)
    if day:
        pieces.append("-" + day)
    pieces.append("T")

    # time part
    if hours:
        pieces.append(hours)
    if minutes:
        pieces.append(":" + minutes)
    if seconds:
        pieces.append(":" + seconds)
    pieces.append("Z")

    return "".join(pieces)
457   
458
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and attempt to return it in YYYY-MM-DD format.  The
    string is first normalised by getISO8601Date; results longer than 11
    characters are then cut down to their first 10 characters.
    NB, shorter results are returned exactly as getISO8601Date produced them,
    so a partial date such as '2009-01' comes back as '2009-01TZ'
    @param dateString - a string containing datetime info
    @raise ValueError: propagated from getISO8601Date for unparseable strings
    @return string with date in format YYYY-MM-DD where a full date was
    available; '' for empty input
    '''
    isoDate = getISO8601Date(dateString)
    if len(isoDate) > 11:
        return isoDate[0:10]

    return isoDate
470
471
def formatDateYYYY(dateString):
    '''
    Simple date manipulation on a string, if it is understood: for a string
    made up of three '-' separated parts, return whichever of the first and
    last parts is numerically larger - taken to be the year.
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string expected as year-mon-day or day-mon-year
    @return the year part of the string; or the unchanged dateString if the
    format is not understood
    '''
    parts = dateString.split('-')
    if len(parts) != 3:
        return dateString # unknown format as yet ...

    # expecting year,mon,day or day,mon,year ...
    try:
        first, last = int(parts[0]), int(parts[2])
    except ValueError:
        # non-numeric components (e.g. a trailing time) - also unknown format
        return dateString

    if first > last:
        return parts[0]
    return parts[2]
486   
487   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: parameter string - either a single term or a '|'
    delimited triple
    @raise ValueError: propagated from getTripleData if there are too many '|'
    @return tidied string; triples are rebuilt using ' | ' as the delimiter,
    with each element kept in its original position
    '''
    logging.debug("Tidying up parameters string, %s" %params_string)

    def tidyTerm(term):
        # Strip out any months in the string + uppercase everything
        # NB, month names are removed wherever they occur - even inside
        # longer words
        term = re.sub(MONTHS, '', term.upper())

        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', term.strip())

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    data = getTripleData(params_string)

    # now, recreate the parameters string - NB, the elements are handled by
    # position, so an empty first element can neither cause a lookup failure
    # nor let the third element take its place
    r = tidyTerm(data[0])
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + tidyTerm(data[2])

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
521
522
523   
class coverageAggregate:
    '''
    Granules have spatiotemporal boxes, but we want an overall spatio temporal
    box too.  Collects the spatio temporal range of every distinct
    (bbox, time) combination added, ready to be wrapped in a single
    coverage element.
    '''
    def __init__(self,M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        factories used by makeElement
        '''
        self.M=M
        self.spaceTime=[]      # (bbox, time) pairs seen so far
        self.coverageList=[]   # spatio temporal range for each new pair

    def add(self,bbox,time,coverage):
        '''
        Record the coverage range - unless this bbox/time pair has already been seen
        @param bbox: bounding box of the coverage
        @param time: time range of the coverage
        @param coverage: coverage object to take the spatio temporal range from
        '''
        key = (bbox,time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        @return coverage element wrapping all the ranges collected; None if
        nothing has been added
        '''
        logging.info("Setting up coverage element")
        if self.spaceTime==[]:
            return None

        spatioTemporal = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporal)
546
547
class parameterAggregate:
    '''
    Provides a set of parameter summaries, and an index of parameter names to allow
    aggregation without duplication. Pretty brain dead at the moment.
    '''
    def __init__(self):
        self.paramSummaries=[]   # first summary seen for each parameter name
        self.paramNameIndex=[]   # parameter names already aggregated

    def add(self,subset):
        '''
        Add a subset - skipping any parameter whose name has been seen before
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for summary in subset:
            name = summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        '''
        @return list of the parameter summaries collected so far
        '''
        return self.paramSummaries
562
563
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get the text of the element(s) matching an
    expression, returning empty values rather than failing when nothing is found
    @param element: ElementTree element to search from; may be None
    @param xpathExpression: path expression passed on to find/findall
    @keyword multiple: if set, return the text of all matches as a list,
    otherwise return the text of the first match only
    @return if multiple: list of text values ([''] if element is None, [] if
    nothing matched); otherwise a single text value ('' if element is None or
    nothing matched).  Elements without any text contribute ''
    '''
    if element is None:
        if multiple:
            return ['',]
        return ''

    if multiple:
        found = element.findall(xpathExpression)
    else:
        found = [element.find(xpathExpression),]

    # NB, compare with None explicitly - ElementTree elements without child
    # elements evaluate as False, so a plain truth test skips leaf elements
    texts = [elem.text or '' for elem in found if elem is not None]

    if multiple:
        return texts

    if texts:
        return texts[0]
    return ''
590
591
592# Format a datetime through its full proleptic Gregorian date range.
593#
594# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
595# '1850/00/02 was a Friday'
596# >>>
597# - NB, this is required since native python strftime doesn't work
598# on dates before 1900
# Matches a '%s' directive in a format string (i.e. one that is not part of
# an escaped '%%' sequence)
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find the start index of every occurrence of substr in text
    - NB, also finds overlaps
    '''
    positions = []
    start = text.find(substr)
    while start != -1:
        positions.append(start)
        start = text.find(substr, start + 1)
    return positions

# Every 28 years the calendar repeats, except through century leap
# years where it's 6 years.  But only if you're using the Gregorian
# calendar.  ;)

def strftime(dt, fmt):
    '''
    Format a date/datetime - including those from 1900 or earlier.  Early
    dates are formatted using two stand-in years, 28 years apart, and the
    stand-in year is then swapped for the real one wherever it appears at
    the same position in both outputs.
    @param dt: date or datetime object to format
    @param fmt: strftime style format string; must not use '%s'
    @raise TypeError: if fmt contains a '%s' directive
    @return formatted string
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    delta = 2000 - dt.year
    standInYear = dt.year + 6*(delta // 100 + delta // 400)

    # Move to around the year 2000
    standInYear += ((2000 - standInYear)//28)*28
    remainder = dt.timetuple()[1:]

    firstPass = time.strftime(fmt, (standInYear,) + remainder)
    firstSites = _findall(firstPass, str(standInYear))

    secondPass = time.strftime(fmt, (standInYear+28,) + remainder)
    secondSites = _findall(secondPass, str(standInYear+28))

    # only positions holding the year in both passes are genuine year fields
    yearSites = [site for site in firstSites if site in secondSites]

    realYear = "%4d" % (dt.year,)
    result = firstPass
    for site in yearSites:
        result = result[:site] + realYear + result[site+4:]
    return result
649
650       
def normaliseLongitude(w,e):
    '''
    Take a 0,360 bounding box and force into -180,180
    NOTE(review): a box straddling 180 has both edges shifted by 180 rather
    than 360 - confirm this is the intended behaviour
    @param w: western longitude - number or numeric string
    @param e: eastern longitude - number or numeric string
    @return tuple of (west, east) as floats
    '''
    west = float(w)
    east = float(e)

    if west < 180.0 and east > 180.0:
        # box straddles 180
        shift = 180.0
    elif west > 180.0:
        # western edge beyond 180
        shift = 360.
    else:
        # nothing to adjust
        return west, east

    return west - shift, east - shift
663
664
def findElementIndex(tree, elementName, isLast = False):
    '''
    Given an elementree object and a name of an element, determine the index
    of the element amongst the direct children of the tree
    @param tree: tree to search for the element in
    @param elementName: tag name of element to find index of
    @keyword isLast: if False get the index of the first occurrence, otherwise get
    the last occurrence
    @return: index value or -1 if not found
    '''
    # not sure if there is a better way of doing this - seems to be no other way
    # of determining the correct index - NB, order is important for DIF schema
    # validation
    logging.debug("Looking for index of element, '%s'" %elementName)
    index = -1
    for position, child in enumerate(tree):
        if child.tag != elementName:
            continue

        logging.debug("Element found (index = %s)" %position)
        index = position
        # keep scanning only when the last occurrence is wanted
        if not isLast:
            break

    if index < 0:
        logging.debug("Element not found")
    return index
Note: See TracBrowser for help on using the repository browser.