source: ndgCommon/trunk/ndg/common/src/lib/utilities.py @ 5231

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/utilities.py@5231
Revision 5231, 23.0 KB checked in by cbyrom, 10 years ago (diff)

Extend ndgcommon code for use with the OAI Info Editor - fix checking
for non-empty ETs - if these only have an attribute set on them, they
won't be evaluated to 'True' in an 'if' statement. Add methods for
getting bools from strings + for setting up select lists.

Line 
1'''
2Various helper methods for use across different applications
3@author: C Byrom
4'''
5import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, urllib, socket, uuid
6from ndg.common.src.models.ndgObject import ndgObject
7import csml.csmlscan as CsmlScan
8
# environment variables to check when looking up default proxy set ups
PROXY_KEY = 'http_proxy'
NO_PROXY_KEY = 'no_proxy'

# Pattern for (possibly partial) ISO8601 datetimes: a mandatory four digit year,
# optionally followed by month, day, time of day and a timezone designator.
# Raw strings are used so that '\s' and '\.' reach the regex engine verbatim
# instead of relying on python leaving unknown string escapes untouched.
ISO8601_RE = r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?" + \
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"

# compiled form of ISO8601_RE - used by getISO8601Date
regExp = re.compile(ISO8601_RE)

YEAR_FORMAT = '%Y-%m-%d'    # format to use when parsing dates

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# NOTE(review): the non-ASCII literals below look like HTML entities (presumably
# something like '&#176;' / '&deg;') that were decoded when this listing was
# rendered - confirm against the repository copy before relying on them
esc_chars = {'\xb0':'°','°':'°'}

# full timestamp layout (date, 'T', time, trailing 'Z')
DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
26
class edict(dict):
    '''
    An extended dictionary which allows one to set and get values
    as attributes (kudos Joe Gregorio's 1812)
    The extended part allows you to get and set values as attributes.
    That is,
       d.fred
    is the same as
       d['fred']
    NB, names starting with an underscore are treated as ordinary instance
    attributes - they are never stored in, or read from, the dictionary itself
    '''
    def __init__(self, **kw):
        '''
        @param kw: initial key/value pairs to store in the dictionary
        '''
        for key in kw:
            self[key] = kw[key]

    def __getattr__(self, key):
        '''
        Fallback attribute lookup - resolve the name against the dictionary keys
        @param key: name of the attribute being looked up
        @raise AttributeError: if the name starts with an underscore or is not
        a key of the dictionary
        @return value stored in the dictionary under key
        '''
        try:
            return self.__dict__[key]
        except KeyError:
            pass

        # explicit check rather than an assert, so that the behaviour does not
        # change when python is run with assertions disabled (-O)
        if key.startswith('_'):
            raise AttributeError("object has no attribute '%s'" % key)

        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        '''
        Store underscore prefixed names as instance attributes; everything else
        goes into the dictionary
        @param key: name of attribute to set
        @param value: value to associate with the name
        '''
        if key.startswith('_'):
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)
54
55
def httpify(url):
    '''
    Ensure a url has an http (or https) scheme prefix
    NB, the full 'http://' / 'https://' prefix is checked for (ignoring case) so
    that host names which merely begin with the letters 'http' - e.g.
    'httpd.apache.org' - are still given a prefix
    @param url: url to check and, if required, to add 'http://' prefix to
    @return url - with 'http://' prefix added, if required
    '''
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
65
66
def loadConfigDetails(configFilename, dbName = None):
    '''
    Load the config file details for the DB and return the data relating
    to the specified db
    NB, each non-blank line of the file is expected to hold three whitespace
    separated fields: host, user ID and password
    @param configFilename: name of config file to use
    @keyword dbName: name of DB to use; if specified, must feature in the config file;
    if not specified, use first config data found in file
    @raise ValueError: if the config file does not exist, if a line does not
    contain exactly three fields or if dbName does not feature in the file
    @raise IOError: if the config file cannot be opened
    @return userID, password, hostname for specified DB
    '''
    logging.info("Loading DB config data")
    # Check this file exists
    if not os.path.isfile(configFilename):
        errorMessage = "Could not find the DB config file, %s; please make sure this " \
                 "is available from the running directory" %configFilename
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    f = open(configFilename, 'r')
    try:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # split on any run of whitespace so that extra spaces/tabs between
            # the fields do not break the parsing
            host, userid, password = line.split()
            if not dbName or dbName == host:
                logging.debug("Returning config file info for db, '%s'" %host)
                return (userid, password, host)
    finally:
        # ensure the file is released on every exit path - including when no
        # entry matches or a line is malformed
        f.close()

    raise ValueError('Unable to find valid eXist config data')
99
100
def getDefaultProxy(url):
    '''
    Checks the OS environment to see if any proxies are specified; if so,
    return the proxy details in a dict
    @param url: url that is being looked up; NB, there may be a no_proxy env
    variable set - and this may affect how urls for certain machines will be
    proxied
    @return dict: with format key = proxy type, val = proxy value
    - e.g. 'http':'http://wwwcache.rl.ac.uk:8080'
    - or an empty dict if no proxy is set or the url matches a no_proxy entry
    '''
    proxy = os.getenv(PROXY_KEY)
    if not proxy:
        return {}

    logging.debug("Found proxy setting - checking if this applies to the url being accessed")
    noProxyHosts = os.getenv(NO_PROXY_KEY)
    if noProxyHosts:
        lowerCaseUrl = url.lower()
        for host in noProxyHosts.split(','):
            hostName = host.lower().strip()
            # ignore blank entries (e.g. from a trailing comma) - an empty string
            # is found in every url and would switch the proxy off for everything
            if hostName and lowerCaseUrl.find(hostName) > -1:
                logging.debug("Proxies disabled for communication with uri machine ('%s')"
                              %host)
                return {}

    return {'http': proxy}
124
125
def openURLWithDefaultProxy(url):
    '''
    Read the contents of a url, routing the request via whatever proxy
    settings getDefaultProxy derives from the OS environment for that url
    @param url: url to open
    @return data read from the url - as returned by openURLWithProxy
    '''
    logging.debug("Checking environment variables for proxies")
    return openURLWithProxy(url, getDefaultProxy(url))
135       
136
def openURLWithProxy(uri, proxy):
    '''
    Open a simple url connection using the specified proxy, and retrieve the contents
    @param uri: uri to read from
    @param proxy: dict with proxy info in format key = protocol, val = proxy host;
    NB, a value of None (or any other empty value) is replaced by an empty dict
    @return pageData: data read from uri
    '''
    logging.debug("Reading info from uri, '%s'" %uri)
    if not proxy:
        proxy = {}
    f = urllib.urlopen(uri, proxies = proxy)
    try:
        pageData = f.read()
    finally:
        # always release the connection - even if the read fails part way through
        f.close()
    logging.debug("- returning info from uri")
    return pageData
152
153   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    NB, this sets the process wide default socket timeout and installs a
    process wide urllib2 opener as side effects

    @param uri: vocab term uri to check
    @raise ValueError: if urllib2 rejects the format of the uri
    @return: 1 if valid, 0 otherwise
    '''
    logging.debug("Checking validity of uri, '%s'" %uri)

    # set the socket timeout period
    socket.setdefaulttimeout(120)

    proxy = getDefaultProxy(uri)
    proxy_support = urllib2.ProxyHandler(proxy)

    # build a new opener that routes http requests via the proxy settings (if any)
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)

    # install it
    urllib2.install_opener(opener)

    try:
        redirectCounter = 10
        currentUri = uri
        page = urllib2.urlopen(currentUri)
        status = page.code

        # check for redirection - NB, only do this a limited number of times
        while (status >= 300) and (status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            url = page.info().get('location')
            page.close()
            if not url:
                logging.info("Redirect response received without a new location")
                break

            # the location may be relative - resolve it against the current uri
            currentUri = urlparse.urljoin(currentUri, url)
            logging.info("Redirect response received - checking new location, '%s'" %currentUri)
            # NB, follow the redirect - i.e. open the new location, not the original uri
            page = urllib2.urlopen(currentUri)
            status = page.code

        page.close()

        if status >= 200 and status <= 299:
            logging.info("URL resolved successfully")
            return 1
        else:
            logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - with the original traceback
        raise
    except Exception as e:
        logging.error("Exception thrown whilst verifying uri: '%s'" \
                      %(getattr(e, 'message', None) or e))

    logging.debug("- url appears to be invalid")
    return 0
206
207   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects (up to a
    limit of 10).
    NB, requests are always made over plain http
    @param url:  url to lookup
    @raise ValueError: if the url (or a redirect location) lacks a host or path
    @return: 1 if url is valid, 0 if not
    '''
    def getRequestHead(url):
        '''
        Create a HTTP connection to a specified URL and return the message HEAD
        @param url: url to retrieve HEAD from
        @raise ValueError: if the url has no host or no path
        @return Response object relating to the HEAD retrieval
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        urlParts = urlparse.urlparse(url)
        host, path, query = urlParts[1], urlParts[2], urlParts[4]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # keep any query string - without it a different resource would be checked
        if query:
            path += '?' + query

        connection = httplib.HTTPConnection(host)
        try:
            connection.request("HEAD", path)
            return connection.getresponse()
        finally:
            # only the status and headers of the response are used by the caller
            # so the socket can be released straight away
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        resp = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while (resp.status >= 300) and (resp.status <= 399) and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            location = resp.getheader('location')
            if not location:
                logging.info("Redirect response received without a new location")
                break

            # the location may be relative - resolve it against the current url
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            resp = getRequestHead(url)

        if resp.status >= 200 and resp.status <= 299:
            logging.info("URL resolved successfully")
            return 1

        else:
            logging.info("Invalid return code received (%s) - link broken" %resp.status)
            return 0
    except ValueError:
        # propagate invalid format errors - with the original traceback
        raise
    except Exception as e:
        # NB, the message attribute is often empty - fall back to the exception itself
        logging.error("Failed to lookup URL: '%s'" %(getattr(e, 'message', None) or e))
        return 0
262
263
def getTripleData(tripleString, doEscape=True):
    '''
    Split a '|' delimited string into its (up to) three component parts
    NB, parts missing from the input are returned as empty strings
    @param tripleString: string containing the triple data
    @keyword doEscape: if True, run the string through escapeSpecialCharacters
    before splitting it (default True)
    @raise ValueError: if the string holds more than three '|' separated parts
    @return list of three strings - each with surrounding whitespace removed
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    parts = [part.strip() for part in tripleString.split('|')]
    if len(parts) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out with empty strings so that three elements are always returned
    triple = parts + [""] * (3 - len(parts))

    logging.debug("- returning triple data in array")
    return triple
287
288
def getString(obj):
    '''
    Convert obj to a byte string; where the plain str() conversion fails
    with an encoding error, the utf-8 encoded form is used instead
    @param obj: a byte string or unicode string
    @return byte string version of obj
    '''
    try:
        byteString = str(obj)
    except UnicodeEncodeError:
        # str() could not encode the unicode data - encode it explicitly
        byteString = unicode(obj).encode('utf-8')
    return byteString
299
300
301
302
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and adjust any non-ascii characters it contains into
    the required html numerical code (e.g. '&#176;') to display the data
    @param inputString: string to encode - either a unicode or a byte string;
    NB, byte strings are decoded with the default codec and any bytes which
    cannot be decoded are substituted by the unicode replacement character
    @return encoded string - a byte string containing only ascii characters
    '''
    if isinstance(inputString, unicode):
        # NB, do not pre-process with 'unicode_escape' - this turns the
        # characters into backslash escapes (e.g. '\xb0') leaving nothing for
        # the numerical code replacement, below, to act upon
        correctedString = inputString
    else:
        # NOTE(review): non-ascii bytes are lost here (replaced, not converted) -
        # confirm whether the byte strings passed in are expected to be ascii only
        correctedString = unicode(inputString, errors = 'replace')

    # the XMLCHARREFREPLACE does the required character replacement
    return correctedString.encode('ascii', 'xmlcharrefreplace')
322
323
def isValidUnicode(inputString):
    '''
    Determine whether a string survives conversion to/from utf-8: unicode
    strings are encoded to utf-8 and byte strings are decoded from utf-8
    NB, any error raised by the conversion is logged and treated as 'invalid'
    @param inputString: byte string or unicode string to check
    @return True if the input is empty or converts cleanly, False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    isValid = False
    try:
        if isinstance(inputString, unicode):
            converted = inputString.encode('utf-8')
        else:
            converted = unicode(inputString, 'utf-8')
        # a non-empty conversion result marks the input as valid
        isValid = bool(converted)
    except UnicodeDecodeError as e:
        logging.debug("Error encountered: %s" %e.reason)
    except Exception as e:
        logging.debug(e.message)

    if not isValid:
        logging.debug("- invalid")
    else:
        logging.debug(" - valid")
    return isValid
352
353
def escapeSpecialCharacters(inputString):
    '''
    Make a string XML friendly by passing it through
    encodeIntoHTMLNumericalCodes; a message is logged when this alters the string
    - NB, do the escape/unescape of non-XML characters by
    the data models on export/import of data
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    escapedString = encodeIntoHTMLNumericalCodes(inputString)
    if inputString != escapedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, escapedString))
    return escapedString
368
369       
def createCSMLFile(CDMLFilePath, timeAxis, datasetID = None):
    '''
    Produce a CSML file from a CDML file by invoking the csmlscan main routine
    @param CDMLFilePath: string path to CDML file
    @param timeAxis: string name of time axis to use in CDML file; NB, only
    passed on to csmlscan if set
    @keyword datasetID: string dataset ID to use in CSMLfile - if not set, an
    ID generated by uuid.uuid1 is used instead
    @return: CSMLFileName: name of CSML file produced - this is the dataset ID
    with '_csml.xml' appended
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")

    datasetID = datasetID or str(uuid.uuid1())
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # NB, the dataset ID is always set by this point - so '-i' is always included
    scanArgs = ['csmlscan', '-o', CSMLFileName, '-i', datasetID]
    if timeAxis:
        scanArgs += ['-t', timeAxis]
    scanArgs.append(CDMLFilePath)

    CsmlScan.main(scanArgs)
    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
396
397
def isCSMLFile(fileContent):
    '''
    Decide whether file content is CSML - judged by it containing the CSML
    namespace or either of the 'CSMLFeatureCollection'/'CSMLStorageDescriptor'
    names
    @param fileContent: content of the file to check - as a string
    @return True if CSML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CSML file")
    csmlMarkers = (ndgObject.CSML_NS, 'CSMLFeatureCollection', 'CSMLStorageDescriptor')
    for marker in csmlMarkers:
        # one marker is enough to identify the content as CSML
        if fileContent.find(marker) > -1:
            logging.info("- file is of CSML format")
            return True

    logging.info("- file is not of CSML format")
    return False
413
414
def isCDMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CDML or not
    - judged by the content either referencing the CDML DTD or containing all
    of the strings, 'dataset', 'attr' and 'axis'
    @param fileContent: content of the file to check - as a string
    @return True if CDML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CDML file")
    # NB, this is a bit of a fudge - there may be a better way to check for
    # CDML-ness
    hasDTD = fileContent.find(ndgObject.CDML_DTD) > -1
    # NB, find returns -1 - which evaluates to True - when nothing is found, so
    # every result must be explicitly compared against -1
    hasKeywords = fileContent.find('dataset') > -1 and \
        fileContent.find('attr') > -1 and \
        fileContent.find('axis') > -1
    if hasDTD or hasKeywords:
        logging.info("- file is of CDML format")
        return True

    logging.info("- file is not of CDML format")
    return False
433
434
def getISO8601Date(datestring):
    '''
    Rebuild a date string from the parts picked out of it by the module
    ISO8601 regular expression
    NB, the output always has a 'T' after the date parts and always ends in
    'Z' - e.g. a date only input gives 'YYYY-MM-DDTZ'; any fractional seconds
    or timezone offset in the input are not carried over to the output
    @param datestring: string containing date in some format
    @return: rebuilt date string - or an empty string if the input is empty
    @raise ValueError: if the start of the string does not match the ISO8601 pattern
    '''
    if not datestring:
        return ''

    matched = regExp.match(datestring)
    if not matched:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # the year is always available when the pattern has matched
    pieces = [matched.group(1)]

    # months + days - each preceded by a dash
    for groupIndex in (3, 5):
        value = matched.group(groupIndex)
        if value:
            pieces.append("-" + value)
    pieces.append("T")

    # hours, minutes + seconds - with colons between them
    for separator, groupIndex in (("", 7), (":", 8), (":", 10)):
        value = matched.group(groupIndex)
        if value:
            pieces.append(separator + value)
    pieces.append("Z")

    return "".join(pieces)
478   
479
def formatDateYYYYMMDD(dateString):
    '''
    Pass a date string through getISO8601Date and cut the result down to its
    first ten characters - i.e. the YYYY-MM-DD part
    NB, results of 11 characters or fewer - e.g. where the input only has a
    year, or is empty - are returned as they come back from getISO8601Date
    @param dateString - a string containing datetime info
    @return string with date in format YYYY-MM-DD, where the input holds a full date
    '''
    isoDate = getISO8601Date(dateString)
    if len(isoDate) <= 11:
        return isoDate
    return isoDate[0:10]
491
492
def formatDateYYYY(dateString):
    '''
    Simple date manipulations on a string, if it is understood ...
       if instruction = YYYY, return the year
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method

    The string is split on '-'; if this gives three numeric parts, the larger
    of the first and last parts is taken to be the year.  Any other format
    - including three parts which are not all numeric - is returned unchanged
    @param dateString: string holding a date - e.g. 'YYYY-MM-DD' or 'DD-MM-YYYY'
    @return year part of the string - or the input string, if not understood
    '''
    s = dateString.split('-')
    if len(s) != 3:
        return dateString # unknown format as yet ...

    # expecting year,mon,day or day,mon,year ...
    try:
        first, last = int(s[0]), int(s[2])
    except ValueError:
        # non-numeric content - e.g. a time following the day - so treat as
        # an unknown format rather than failing
        return dateString

    if first > last:
        return s[0]
    return s[2]
507   
508   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: parameter string - either a single value or a '|'
    delimited triple
    @raise ValueError: if the string holds more than three '|' separated parts
    @return tidied string; NB, for triples the parts are rejoined using ' | '
    '''
    logging.debug("Tidying up parameters string, %s" %params_string)

    def tidy(value):
        '''
        Upper case a value, strip out any month names + surplus whitespace
        '''
        if not value:
            return value
        # Strip out any months in the string + uppercase everything
        value = re.sub(MONTHS, '', value.upper())
        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', value.strip())

    # avoid processing the url, if it has been set
    # - NB, no escaping of special characters is done at this point
    data = getTripleData(params_string, doEscape=False)

    # now, recreate the parameters string - NB, the first and last elements are
    # handled by position so that an empty first element cannot cause the last
    # element to be moved into its place (or an IndexError if both are empty)
    r = tidy(data[0])
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + tidy(data[2])

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
542
543
544   
class coverageAggregate:
    '''
    Collects the spatiotemporal ranges of granules - ignoring any whose bounding
    box + time combination has already been added - so that a single, overall
    coverage element can be built from them
    '''
    def __init__(self,M):
        '''
        @param M: object whose dgCoverage and dgSpatioTemporalCoverage callables
        are used by makeElement to construct the coverage element
        '''
        self.M=M
        self.spaceTime=[]       # (bbox, time) pairs added so far
        self.coverageList=[]    # range data - one entry per pair added

    def add(self,bbox,time,coverage):
        '''
        Register the range data of a coverage - unless the same bbox + time
        combination has been registered previously
        @param bbox: bounding box of the coverage
        @param time: time info of the coverage
        @param coverage: object to take the range data from
        '''
        boxAndTime = (bbox,time)
        if boxAndTime in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append(boxAndTime)
        rangeData = coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange
        self.coverageList.append(rangeData)

    def makeElement(self):
        '''
        @return coverage element wrapping all the range data added - or None
        if nothing has been added
        '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        spatioTemporalCoverage = self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporalCoverage)
567
568
class parameterAggregate:
    '''
    Builds up a list of parameter summaries - using an index of the parameter
    names already seen to avoid adding the same parameter more than once
    '''
    def __init__(self):
        self.paramNameIndex=[]  # names of the parameters added so far
        self.paramSummaries=[]  # the summaries - in the order they were added

    def add(self,subset):
        '''
        Add the summaries from a subset - skipping any whose ParameterName
        has been seen previously
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for summary in subset:
            name = summary.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(summary)

    def get(self):
        '''
        @return list of the summaries added so far
        '''
        return self.paramSummaries
583
584
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get the text of the element(s) matching an
    expression - returning empty values rather than failing if nothing is found
    NB, matches are tested using 'is not None' - elements without child elements
    evaluate to False in an 'if' statement, even if they have text
    @param element: element to search from - may be None
    @param xpathExpression: path expression to pass to the element find/findall
    @keyword multiple: if set, return the text of all matches in a list; if not,
    just return the text of the first match (default)
    @return if multiple, a list with the text of each match (NB, this is [''] if
    element is None and [] if there are no matches); otherwise the text of the
    first match or '' if there is none.  Matches without text contribute ''
    '''
    if element is None:
        if multiple:
            return ['',]
        else:
            return ''

    if multiple:
        matches = element.findall(xpathExpression)
    else:
        matches = [element.find(xpathExpression),]

    texts = []
    for match in matches:
        if match is not None:
            # use an empty string where the element has no text - in keeping
            # with the value returned when the element is missing altogether
            texts.append(match.text or '')

    if multiple:
        return texts

    if len(texts) > 0:
        return texts[0]
    return ''
611
612
613# Format a datetime through its full proleptic Gregorian date range.
614#
615# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
616# '1850/00/02 was a Friday'
617# >>>
618# - NB, this is required since native python strftime doesn't work
619# on dates before 1900
# matches a '%s' directive in a format string - i.e. a '%s' which is not
# itself part of an escaped ('%%') percent sign
_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")

def _findall(text, substr):
    '''
    Find the start index of every occurrence of substr in text
    - NB, overlapping occurrences are included
    @param text: string to search
    @param substr: string to search for
    @return list of indices - in ascending order
    '''
    positions = []
    position = text.find(substr)
    while position != -1:
        positions.append(position)
        # restart just one character on, so that overlaps are picked up
        position = text.find(substr, position + 1)
    return positions
633
634# Every 28 years the calendar repeats, except through century leap
635# years where it's 6 years.  But only if you're using the Gregorian
636# calendar.  ;)
637
def strftime(dt, fmt):
    '''
    Format a date/datetime according to fmt.  Years after 1900 are handed
    straight to the strftime method of the object; for earlier years the
    formatting is done using a stand-in year and the real year is then
    written back into the result
    @param dt: date or datetime object to format
    @param fmt: format string; NB, the '%s' directive is not supported
    @raise TypeError: if fmt contains a '%s' directive
    @return formatted string
    '''
    if _illegal_s.search(fmt):
        raise TypeError("This strftime implementation does not handle %s")
    if dt.year > 1900:
        return dt.strftime(fmt)

    # For every non-leap year century, advance by
    # 6 years to get into the 28-year repeat cycle
    yearsBefore2000 = 2000 - dt.year
    standInYear = dt.year + 6*(yearsBefore2000 // 100 + yearsBefore2000 // 400)

    # Move to around the year 2000
    standInYear += ((2000 - standInYear)//28)*28

    # format twice - using stand-in years 28 years apart - so that the places
    # where the year really appears in the output can be told apart from digits
    # which only happen to look like the stand-in year
    otherFields = dt.timetuple()[1:]
    firstResult = time.strftime(fmt, (standInYear,) + otherFields)
    firstSites = _findall(firstResult, str(standInYear))

    secondResult = time.strftime(fmt, (standInYear+28,) + otherFields)
    secondSites = _findall(secondResult, str(standInYear+28))

    # write the real year over each position found in both results
    realYear = "%4d" % (dt.year,)
    result = firstResult
    for site in firstSites:
        if site in secondSites:
            result = result[:site] + realYear + result[site+4:]
    return result
670
671       
def normaliseLongitude(w,e):
    '''
    Shift the west/east limits of a bounding box given in the 0,360 range
    towards the -180,180 range
    - limits lying either side of 180 both have 180 taken off them
    - limits with the west value over 180 both have 360 taken off them
    - anything else is returned as is
    @param w: west limit - number or numeric string
    @param e: east limit - number or numeric string
    @return tuple of floats - (west, east)
    '''
    west = float(w)
    east = float(e)

    if west < 180.0 and east > 180.0:
        return west - 180.0, east - 180.0

    if west > 180.0:
        return west - 360., east - 360.

    return west, east
684
685
def findElementIndex(tree, elementName, isLast = False):
    '''
    Work out the position of a named element amongst the children of an
    elementtree object
    @param tree: tree to search for the element in
    @param elementName: name (tag) of element to find index of
    @keyword isLast: if False get the index of the first occurrence, otherwise get
    the last occurrence
    @return: index value or -1 if not found
    '''
    # not sure if there is a better way of doing this - seems to be no other way
    # of determining the correct index - NB, order is important for DIF schema
    # validation
    logging.debug("Looking for index of element, '%s'" %elementName)
    foundAt = -1
    for position, child in enumerate(tree):
        if child.tag != elementName:
            continue

        logging.debug("Element found (index = %s)" %position)
        foundAt = position
        # when after the last occurrence, keep going to the end of the tree
        if not isLast:
            break

    if foundAt < 0:
        logging.debug("Element not found")
    return foundAt
711
712
def getBool(val):
    '''
    Return the boolean version of the input val
    NB, the values 'false', 'f', 'no', 'n', '0' and the empty string - ignoring
    case and surrounding whitespace - together with None, are taken to be
    False; anything else is taken to be True
    @param val: string (or bool) to determine the boolean value of
    @return True/False depending on the input val
    '''
    logging.debug("Determining boolean value of '%s'" %val)
    if val is True or val is False:
        return val

    # an unset value is treated in the same way as an empty string
    if val is None:
        return False

    val = str(val).strip().lower()
    return val not in ['false','f','no','n','0','']
725   
Note: See TracBrowser for help on using the repository browser.