source: ndgCommon/trunk/ndg/common/src/lib/utilities.py @ 4991

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/utilities.py@4991
Revision 4991, 22.1 KB checked in by cbyrom, 11 years ago (diff)

Move generic 'httpify' method to utilities + adjust utilities method
to retrieve default proxy to cope with standard formatting + set this
up on each call to urlopen - since different urls may use different
proxies + tidy up and extend test suite.

Line 
1'''
2Various helper methods for use across different applications
3@author: C Byrom
4'''
5import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, urllib, socket, uuid
6from ndg.common.src.models.ndgObject import ndgObject
7import csml.csmlscan as CsmlScan
8
# environment variables to check when looking up default proxy set ups
PROXY_KEY = 'http_proxy'
NO_PROXY_KEY = 'no_proxy'

# Pattern matching (the leading part of) an ISO8601 date/time string.  The
# capture groups of interest are: 1 = year, 3 = month, 5 = day, 7 = hours,
# 8 = minutes, 10 = seconds.
# NB, raw strings are used so that the regex escapes (\s, \.) are passed to the
# re module untouched rather than being treated as python string escapes
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" + \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# compiled once at import time since it is reused on every date lookup
regExp = re.compile(ISO8601_RE)

YEAR_FORMAT = '%Y-%m-%d'    # format to use when parsing dates

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# NOTE(review): both entries map onto the degree sign - confirm the intended
# replacement values
esc_chars = {'\xb0':'°','°':'°'}
24   
25       
class edict(dict):
    '''An extended dictionary which allows one to set and get values
    as attributes (kudos Joe Gregorio's 1812)
    The extended part allows you to get and set values as attributes.
    That is,
       d.fred
    is the same as
       d['fred']
    NB, attribute names starting with an underscore are treated as ordinary
    instance attributes and are never stored in, or read from, the dictionary
    '''
    def __init__(self, **kw):
        '''
        @param kw: initial key/value pairs to store in the dictionary
        '''
        for key, value in kw.items():
            self[key] = value

    def __getattr__(self, key):
        '''
        Look up an attribute that normal attribute resolution could not find
        @param key: name of the attribute being requested
        @return value stored in the instance dict or in the dictionary itself
        @raise AttributeError: if the key is private (starts with '_') and
        unset, or is not featured in the dictionary
        '''
        try:
            return self.__dict__[key]
        except KeyError:
            pass

        # private names never fall through to the dictionary content; NB, an
        # explicit check is used (rather than an assert) so that the behaviour
        # is unchanged when python is run with optimisation switched on
        if key.startswith('_'):
            raise AttributeError("object has no attribute '%s'" % key)

        try:
            return self[key]
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        '''
        Store private names on the instance and everything else as a
        dictionary item
        @param key: name of the attribute being set
        @param value: value to store
        '''
        if key.startswith('_'):
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)
53
54
def httpify(url):
    '''
    Ensure a url has an http prefix
    @param url: url to check and, if required, to add 'http' prefix to
    @return url - with 'http://' prefix added, if required; urls which already
    start with 'http://' or 'https://' (in any case) are returned unchanged
    '''
    # NB, check for the full scheme rather than just 'http' - otherwise hosts
    # such as 'httpd.apache.org' are wrongly treated as already being prefixed
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
64
65
def loadConfigDetails(configFilename, dbName = None):
    '''
    Load the config file details for the DB and return the data relating
    to the specified db.  Each non blank line of the config file is expected
    to hold three whitespace separated fields: host, userid and password
    @param configFilename: name of config file to use
    @keyword dbName: name of DB to use; if specified, must feature in the config file;
    if not specified, use first config data found in file
    @raise ValueError: if the config file does not exist, if a line does not
    hold exactly three fields or if no (matching) entry is found in the file
    @raise IOError: if config file cannot be opened
    @return userID, password, hostname for specified DB
    '''
    logging.info("Loading DB config data")
    # Check this file exists
    if not os.path.isfile(configFilename):
        errorMessage = "Could not find the DB config file, %s; please make sure this " \
                 "is available from the running directory" %configFilename
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    f = open(configFilename, 'r')
    # NB, the try/finally ensures the file is closed on every exit path -
    # including when no matching entry is found
    try:
        for line in f:
            # split on any whitespace so that repeated spaces/tabs are tolerated
            fields = line.split()
            if not fields:
                continue

            if len(fields) != 3:
                # NB, the line content is deliberately not included in the
                # message since it is likely to contain a password
                raise ValueError("Invalid entry in the DB config file, %s; expected "
                                 "'host userid password'" %configFilename)

            host, userid, password = fields
            if not dbName or dbName == host:
                logging.debug("Returning config file info for db, '%s'" %host)
                return (userid, password, host)
    finally:
        f.close()

    raise ValueError('Unable to find valid eXist config data')
98
99
def getDefaultProxy(url):
    '''
    Checks the OS environment to see if any proxies are specified; if so,
    return the proxy details in a dict
    @param url: url that is being looked up; NB, there may be a no_proxy env
    variable set - and this may affect how urls for certain machines will be
    proxied
    @return dict: with format key = proxy type, val = proxy value
    - e.g. 'http':'http://wwwcache.rl.ac.uk:8080'; the dict is empty if no
    proxy is set or if the url features one of the no_proxy hosts
    '''
    proxy = os.getenv(PROXY_KEY)
    if not proxy:
        return {}

    logging.debug("Found proxy setting - checking if this applies to the url being accessed")
    noProxyHosts = os.getenv(NO_PROXY_KEY)
    if noProxyHosts:
        lowerUrl = url.lower()
        for host in noProxyHosts.split(','):
            host = host.strip().lower()
            # skip empty entries (e.g. caused by a trailing comma) - an empty
            # string is found in every url and so would disable the proxy for
            # all lookups
            if host and lowerUrl.find(host) > -1:
                logging.debug("Proxies disabled for communication with uri machine ('%s')"
                              %host)
                return {}

    return {'http': proxy}
123
124
def openURLWithDefaultProxy(url):
    '''
    Open a url using whatever proxy settings are found in the OS environment
    for that url
    @param url: url to open
    @return data read from the url - as returned by openURLWithProxy
    '''
    logging.debug("Checking environment variables for proxies")
    return openURLWithProxy(url, getDefaultProxy(url))
134       
135
def openURLWithProxy(uri, proxy):
    '''
    Open a simple url connection using the specified proxy, and retrieve the contents
    @param uri: uri to read from
    @param proxy: dict with proxy info in format key = protocol, val = proxy host
    @return pageData: data read from uri
    '''
    logging.debug("Reading info from uri, '%s'" %uri)
    f = urllib.urlopen(uri, proxies = proxy)
    # ensure the connection is released even if the read fails part way through
    try:
        pageData = f.read()
    finally:
        f.close()
    logging.debug("- returning info from uri")
    return pageData
149
150   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    @param uri: vocab term uri to check
    @raise ValueError: if the uri is rejected as badly formatted by urllib2
    @return: 1 if valid, 0 otherwise
    '''
    logging.debug("Checking validity of uri, '%s'" %uri)

    # set the socket timeout period - NB, this is a process wide setting
    socket.setdefaulttimeout(120)

    proxy = getDefaultProxy(uri)
    proxy_support = urllib2.ProxyHandler(proxy)

    # build a new opener that uses the proxy settings relevant to this uri
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)

    # install it - NB, this replaces the default opener used by urllib2.urlopen
    urllib2.install_opener(opener)

    try:
        redirectCounter = 10
        page = urllib2.urlopen(uri)
        status = page.code

        # check for redirection - NB, only do this a limited number of times
        while (status >= 300) and (status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            url = page.info().get('location')
            page.close()
            if not url:
                logging.info("Redirect response received without a new location")
                break

            logging.info("Redirect response received - checking new location, '%s'" %url)
            # NB, follow the redirect rather than re-requesting the original uri
            page = urllib2.urlopen(url)
            status = page.code

        page.close()
        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1
        else:
            logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - with the original traceback intact
        raise
    except Exception as e:
        # NB, use the exception itself, rather than its 'message' attribute -
        # which is not reliably set
        logging.error("Exception thrown whilst verifying uri: '%s'" %e)

    logging.debug("- url appears to be invalid")
    return 0
203
204   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects - up to a
    limit of 10.
    @param url:  url to lookup
    @raise ValueError: if the url (or a redirected location) has no host or path
    @return: 1 if url is valid, 0 if not
    '''
    def getRequestHead(url):
        '''
        Create a HTTP connection to a specified URL and retrieve the message HEAD
        @param url: url to retrieve HEAD from
        @raise ValueError: if the url has no host or path
        @return tuple (status code, value of 'location' header or None)
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        parts = urlparse.urlparse(url)
        scheme, host, path = parts[0:3]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # include any params/query string in the request - without these a
        # different resource to the one specified would be checked
        requestPath = urlparse.urlunparse(('', '', path, parts[3], parts[4], ''))

        if scheme.lower() == 'https':
            connection = httplib.HTTPSConnection(host)
        else:
            connection = httplib.HTTPConnection(host)

        # NB, read what is needed from the response before closing the connection
        try:
            connection.request("HEAD", requestPath)
            response = connection.getresponse()
            return response.status, response.getheader('location')
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        status, location = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while (status >= 300) and (status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            if not location:
                logging.info("Redirect response received without a new location")
                break

            # lookup redirected location - NB, this may be relative to the
            # current url, so resolve it before using it
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            status, location = getRequestHead(url)

        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1

        logging.info("Invalid return code received (%s) - link broken" %status)
        return 0
    except ValueError:
        # propagate invalid format errors - with the original traceback intact
        raise
    except Exception as e:
        logging.error("Failed to lookup URL: '%s'" %e)
        return 0
259
260
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into the three parts of a triple.
    NB, parts missing from the input are returned as empty strings
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, escape special characters - e.g. '&' - before
    splitting the string (default True)
    @raise ValueError: if the string contains more than three '|' separated parts
    @return list of three strings, each stripped of surrounding whitespace
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    parts = [part.strip() for part in tripleString.split('|')]
    if len(parts) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out any missing entries so that three elements are always returned
    tripleData = parts + [""] * (3 - len(parts))

    logging.debug("- returning triple data in array")
    return tripleData
284
285
def getString(obj):
    '''
    Convert obj to a byte string; if the plain str() conversion fails with an
    encoding error, obj is converted to unicode and encoded as utf-8 instead
    @param obj: a byte string or unicode string
    @return string representation of obj
    '''
    try:
        converted = str(obj)
    except UnicodeEncodeError:
        # obj is unicode
        converted = unicode(obj).encode('utf-8')
    return converted
296
297
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and adjust any non ascii characters it contains into
    the required html numerical code (e.g. '&#176;') to display the data
    @param inputString: string to encode - either unicode or a utf-8 encoded
    byte string
    @raise UnicodeDecodeError: if a byte string is input which is not valid utf-8
    @return encoded (ascii only) byte string
    '''
    if isinstance(inputString, unicode):
        text = inputString
    else:
        # NB, the character replacement can only be done on unicode data -
        # calling encode directly on a byte string triggers an implicit ascii
        # decode, which fails for any non ascii content
        text = getString(inputString).decode('utf-8')

    # the XMLCHARREFREPLACE does the required character replacement
    return text.encode('ascii', 'xmlcharrefreplace')
312
313
def isValidUnicode(inputString):
    '''
    Check whether the input can be converted to/from utf-8: unicode input is
    encoded to utf-8, anything else is decoded from utf-8.  NB, empty input
    is always treated as valid
    @param inputString: byte string or unicode string to check
    @return True if valid, False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    validity = False
    try:
        if isinstance(inputString, unicode):
            converted = inputString.encode('utf-8')
        else:
            converted = unicode(inputString, 'utf-8')

        if converted:
            validity = True
    except UnicodeDecodeError as e:
        logging.debug("Error encountered: %s" %e.reason)
    except Exception as e:
        logging.debug(e.message)

    logging.debug(" - valid" if validity else "- invalid")
    return validity
342   
343
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters - i.e. '&', '<' and '>'
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # NB, imported here, rather than relying on cgi.escape, since the cgi
    # version is deprecated and is not available in newer python versions;
    # the output is identical
    from xml.sax.saxutils import escape

    correctedString = escape(inputString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, correctedString))
    return correctedString
355
356       
def createCSMLFile(CDMLFilePath, timeAxis, datasetID = None):
    '''
    Run csmlscan against the specified CDML file to produce a CSML file, named
    '<datasetID>_csml.xml'
    @param CDMLFilePath: string path to CDML file
    @param timeAxis: string name of time axis to use in CDML file; only passed
    on to csmlscan if set
    @keyword datasetID: string dataset ID to use in CSMLfile - if not set, a
    uuid based name will be generated instead
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")

    # fall back to a generated ID, if none has been provided
    datasetID = datasetID or str(uuid.uuid1())
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # NB, the dataset ID is always available by this point, so is always passed
    scanArgs = ['csmlscan', '-o', CSMLFileName, '-i', datasetID]
    if timeAxis:
        scanArgs += ['-t', timeAxis]
    scanArgs.append(CDMLFilePath)

    CsmlScan.main(scanArgs)
    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
383
384
def isCSMLFile(fileContent):
    '''
    Decide whether some file content is CSML - judged by the content featuring
    the CSML namespace or one of the known CSML element names
    @param fileContent: content of the file to check - as a string
    @return True if CSML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CSML file")
    csmlMarkers = (ndgObject.CSML_NS, 'CSMLFeatureCollection', 'CSMLStorageDescriptor')
    for marker in csmlMarkers:
        if marker in fileContent:
            logging.info("- file is of CSML format")
            return True

    logging.info("- file is not of CSML format")
    return False
400
401
def isCDMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CDML or not - judged
    by the content featuring either the CDML DTD or all of the strings
    'dataset', 'attr' and 'axis'
    @param fileContent: content of the file to check - as a string
    @return True if CDML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CDML file")
    # NB, this is a bit of a fudge - there may be a better way to check for
    # CDML-ness
    # NB, every find result must be compared against -1: the bare result is
    # 'true' when the text is missing (-1) and 'false' when found at index 0
    if fileContent.find(ndgObject.CDML_DTD) > -1 or \
        (fileContent.find('dataset') > -1 and \
        fileContent.find('attr') > -1 and \
        fileContent.find('axis') > -1):
        logging.info("- file is of CDML format")
        return True

    logging.info("- file is not of CDML format")
    return False
420
421
def getISO8601Date(datestring):
    '''
    Rebuild a date string from the parts of it which match the ISO8601
    pattern.  The output is made up of the year, then the month and day (if
    found), a 'T', then the hours, minutes and seconds (if found) and a
    closing 'Z'.  NB, the 'T' and 'Z' are always added - even if no time
    info is found - and fractional seconds/timezone offsets are not output
    @param datestring: string containing date in some format
    @return: reformatted date string - or empty string if no input is provided
    @raise ValueError: if the start of the string does not match the ISO8601 pattern
    '''
    if not datestring:
        return ''

    match = regExp.match(datestring)
    if match is None:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # start with the year, then add the optional date parts (month, day)
    pieces = [match.group(1)]
    for groupIndex in (3, 5):
        value = match.group(groupIndex)
        if value:
            pieces.append("-" + value)

    pieces.append("T")

    # optional time parts - hours, minutes, seconds - with their separators
    for separator, groupIndex in (("", 7), (":", 8), (":", 10)):
        value = match.group(groupIndex)
        if value:
            pieces.append(separator + value)

    pieces.append("Z")
    return "".join(pieces)
465   
466
def formatDateYYYYMMDD(dateString):
    '''
    Convert a date string via getISO8601Date and, if the result is more than
    11 characters long, cut it down to its first 10 characters (YYYY-MM-DD)
    @param dateString - a string containing datetime info
    @return string with date cut down to YYYY-MM-DD, where long enough; shorter
    results from getISO8601Date are returned as is
    @raise ValueError: if getISO8601Date cannot match the input
    '''
    isoDate = getISO8601Date(dateString)
    return isoDate[0:10] if len(isoDate) > 11 else isoDate
478
479
def formatDateYYYY(dateString):
    '''
    Simple date manipulation on a string, if it is understood: return the year
    from a date made up of three '-' separated numbers - where the year is
    either the first or the last of these (it is taken to be the larger one)
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string containing the date
    @return the year part of the date - or the unaltered input string, if the
    format of this is not understood
    '''
    parts = dateString.split('-')
    if len(parts) != 3:
        return dateString # unknown format as yet ...

    # expecting year,mon,day or day,mon,year ...
    try:
        first, last = int(parts[0]), int(parts[2])
    except ValueError:
        # non numeric content - e.g. a time is included - so treat this as
        # another unknown format, rather than failing
        return dateString

    if first > last:
        return parts[0]
    return parts[2]
494   
495   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: parameter string - either a single term or a triple
    in the format 'term | url | term'
    @return tidied up parameter string; NB, the position of each part of a
    triple is retained - even if the first term is empty
    '''
    logging.debug("Tidying up parameters string, %s" %params_string)

    def tidyTerm(term):
        '''
        Strip out any months in a term, uppercase it and remove unnecessary spaces
        '''
        term = re.sub(MONTHS, '', term.upper())
        # remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', term.strip())

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    data = getTripleData(params_string)

    # NB, each term is tidied in place - rather than gathering the non empty
    # terms in a list - so that an empty first term can neither cause a failure
    # nor lead to the last term being output in its place
    r = tidyTerm(data[0])

    # now, recreate the parameters string
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + tidyTerm(data[2])

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
529
530
531   
class coverageAggregate:
    '''
    Collects the spatiotemporal ranges of granules - keeping only one entry
    per distinct (bbox, time) combination - so that an overall coverage
    element can be created from these
    '''
    def __init__(self, M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        constructors used by makeElement
        '''
        self.M = M
        self.spaceTime = []
        self.coverageList = []

    def add(self, bbox, time, coverage):
        '''
        Store the range of a coverage, unless its bbox + time have been seen before
        @param bbox: bounding box of the coverage
        @param time: time info of the coverage
        @param coverage: object whose dgSpatioTemporalCoverage.dgSpatioTemporalRange
        is stored
        '''
        key = (bbox, time)
        if key in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(key)
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        @return coverage element wrapping all the stored ranges - or None if
        nothing has been added
        '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        spatioTemporal = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporal)
554
555
class parameterAggregate:
    '''
    Gathers parameter summaries together, using an index of the parameter
    names to avoid storing more than one summary per name
    '''
    def __init__(self):
        self.paramNameIndex = []
        self.paramSummaries = []

    def add(self, subset):
        '''
        Add the summaries in a subset - ignoring any whose ParameterName has
        already been stored
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for summary in subset:
            if summary.ParameterName in self.paramNameIndex:
                continue
            self.paramNameIndex.append(summary.ParameterName)
            self.paramSummaries.append(summary)

    def get(self):
        '''
        @return list of the stored parameter summaries
        '''
        return self.paramSummaries
570
571
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get the text of the element(s) found by an xpath
    expression - coping with missing elements and missing text
    @param element: ET element to search from - may be None
    @param xpathExpression: expression passed to the element find/findall
    @keyword multiple: if set, return the text of all matching elements in a
    list; otherwise just return the text of the first match
    @return if multiple is set, a list of strings ([''] if element is None);
    otherwise a string - which is empty if nothing is found
    '''
    if element is None:
        if multiple:
            return ['',]
        else:
            return ''

    if multiple:
        found = element.findall(xpathExpression)
    else:
        found = [element.find(xpathExpression),]

    # NB, check explicitly against None: ET elements with no children evaluate
    # as False - which would lead to the text of all leaf elements being ignored
    texts = []
    for elem in found:
        if elem is not None:
            # elements with no text content have their text set to None
            if elem.text is None:
                texts.append('')
            else:
                texts.append(elem.text)

    if multiple:
        return texts

    if len(texts) > 0:
        return texts[0]
    return ''
598
599
600# Format a datetime through its full proleptic Gregorian date range.
601#
602# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
603# '1850/00/02 was a Friday'
604# >>>
605# - NB, this is required since native python strftime doesn't work
606# on dates before 1900
# matches an (unescaped) %s directive in a format string
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find the index of every occurrence of substr in text - NB, this also
    finds overlapping occurrences
    '''
    positions = []
    position = text.find(substr)
    while position != -1:
        positions.append(position)
        position = text.find(substr, position + 1)
    return positions

# Every 28 years the calendar repeats, except through century leap
# years where it's 6 years.  But only if you're using the Gregorian
# calendar.  ;)

def strftime(dt, fmt):
    '''
    Format a date/datetime - including those with years which the native
    strftime cannot handle.  For such years, the formatting is done using a
    substitute year and the real year is then written over this in the output
    @param dt: date or datetime object to format
    @param fmt: format string - NB, this must not include the %s directive
    @raise TypeError: if the format string includes the %s directive
    @return formatted string
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    gap = 2000 - dt.year
    standInYear = dt.year + 6*(gap // 100 + gap // 400)

    # Move to around the year 2000
    standInYear += ((2000 - standInYear)//28)*28
    remainder = dt.timetuple()[1:]

    # format using two different stand in years; only the places where both
    # of these years turn up in the output really represent the year
    firstPass = time.strftime(fmt, (standInYear,) + remainder)
    firstSites = _findall(firstPass, str(standInYear))

    secondPass = time.strftime(fmt, (standInYear+28,) + remainder)
    secondSites = _findall(secondPass, str(standInYear+28))

    yearSites = [site for site in firstSites if site in secondSites]

    # now write the real year over the stand in year
    realYear = "%4d" % (dt.year,)
    output = firstPass
    for site in yearSites:
        output = output[:site] + realYear + output[site+4:]
    return output
657
658       
def normaliseLongitude(w,e):
    '''
    Take a 0,360 bounding box and force into -180,180
    - if the box straddles 180 (i.e. west < 180 < east), both values are
    shifted by 180
    - if the box starts at, or to the east of, 180, both values are shifted by 360
    - otherwise, the values are left as they are
    @param w: western longitude - as a number or numeric string
    @param e: eastern longitude - as a number or numeric string
    @return tuple of floats: (west, east)
    '''
    west, east = float(w), float(e)
    if west < 180.0 and east > 180.0:
        return west - 180.0, east - 180.0

    # NB, a western edge of exactly 180 must also be shifted - otherwise the
    # box (e.g. 180,270) is returned outside of the -180,180 range
    if west >= 180.0:
        return west - 360., east - 360.

    return west, east
671
672
def findElementIndex(tree, elementName, isLast = False):
    '''
    Work out the position of a named element amongst the direct children of
    an elementtree object
    @param tree: tree to search for the element in
    @param elementName: name (tag) of element to find index of
    @keyword isLast: if False get the index of the first occurrence, otherwise get
    the last occurrence
    @return: index value or -1 if not found
    '''
    # not sure if there is a better way of doing this - seems to be no other way
    # of determining the correct index - NB, order is important for DIF schema
    # validation
    logging.debug("Looking for index of element, '%s'" %elementName)
    position = -1
    for candidate, child in enumerate(tree):
        if child.tag != elementName:
            continue

        logging.debug("Element found (index = %s)" %candidate)
        position = candidate
        # only keep on looking if the last occurrence is wanted
        if not isLast:
            break

    if position < 0:
        logging.debug("Element not found")
    return position
Note: See TracBrowser for help on using the repository browser.