source: ndgCommon/trunk/ndg/common/src/lib/utilities.py @ 4970

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/lib/utilities.py@4970
Revision 4970, 20.5 KB checked in by cbyrom, 12 years ago (diff)

Various fixes, tidy-ups and simplifications to the ndgCommon codebase.

Line 
1'''
2Various helper methods for use across different applications
3@author: C Byrom
4'''
5import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, urllib, socket, uuid
6from ndg.common.src.models.ndgObject import ndgObject
7import csml.csmlscan as CsmlScan
8
9
# Pattern (as a string) for ISO8601-style date/times.  Only the four digit
# year is mandatory; every later component is optional.  Capture groups:
#   1 = year, 3 = month, 5 = day, 7 = hours, 8 = minutes, 10 = seconds,
#   12 = fractional seconds, 13 = timezone designator ('Z' or +/-hh:mm)
# NB, raw strings are used so that '\s' and '\.' reach the regex engine
# untouched
ISO8601_RE = (
    r"([0-9]{4})(-([0-9]{2})(-([0-9]{2})([T\s]?([0-9]{2}):([0-9]{2})(:([0-9]{2})(\.([0-9]+))?)?"
    r"(Z|(([-+])([0-9]{2}):([0-9]{2})))?)?)?)?"
)

# Compiled version of the above - shared by the date helper methods
regExp = re.compile(ISO8601_RE)

YEAR_FORMAT = '%Y-%m-%d'    # format to use when parsing dates

# Regular expression string to allow months to be stripped out of parameters
MONTHS = "JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"

# NOTE(review): both keys map the degree sign onto itself here; presumably the
# values were HTML numeric entities before this text was rendered - confirm
# against the repository copy before relying on this mapping
esc_chars = {'\xb0':'°','°':'°'}

# Define proxies required when using urllib
PROXIES = {'http':'http://wwwcache.rl.ac.uk:8080'}

# Flag intended to record whether the urllib2 opener has been installed
URLIB2_INITIALISED = False
26
27       
class edict(dict):
    '''An extended dictionary which allows one to set and get values
    as attributes (kudos Joe Gregorio's 1812)
    The extended part allows you to get and set values as attributes.
    That is,
       d.fred
    is the same as
       d['fred']
    Names starting with an underscore are treated as ordinary instance
    attributes and are never stored in, or read from, the dictionary.
    '''
    def __init__(self, **kw):
        # assign item by item so that any overridden __setitem__ is honoured
        for a in kw:
            self[a] = kw[a]

    def __getattr__(self, key):
        '''
        Fallback attribute lookup - only invoked when normal lookup fails
        @param key: attribute name being looked up
        @raise AttributeError: if key is private or not in the dictionary
        @return value stored under key
        '''
        try:
            return self.__dict__[key]
        except KeyError:
            pass

        # private names are never looked up in the dictionary contents
        if key.startswith('_'):
            raise AttributeError("object has no attribute '%s'" % key)

        try:
            return self[key]
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        '''
        Store private names on the instance and everything else as a
        dictionary item
        @param key: attribute name
        @param value: value to store
        '''
        if key.startswith('_'):
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)
55
56
def loadConfigDetails(configFilename, dbName = None):
    '''
    Load the config file details for the DB and return the data relating
    to the specified db.  Each non-blank line of the file is expected to hold
    three whitespace separated fields: host, userid and password.
    @param configFilename: name of config file to use
    @keyword dbName: name of DB to use; if specified, must feature in the config file;
    if not specified, use first config data found in file
    @raise ValueError: if the config file is not found, a line is malformed,
    or dbName is not featured in the file
    @raise IOError: if the config file cannot be opened
    @return userID, password, hostname for specified DB
    '''
    logging.info("Loading DB config data")
    # Check this file exists
    if not os.path.isfile(configFilename):
        errorMessage = "Could not find the DB config file, %s; please make sure this " \
                 "is available from the running directory" %configFilename
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # the 'with' block guarantees the file is closed on every exit path -
    # including the 'not found' and malformed line cases
    with open(configFilename, 'r') as configFile:
        for line in configFile:
            line = line.strip()
            if not line:
                continue

            # split on any run of whitespace, so that repeated spaces or tabs
            # between the fields are tolerated
            fields = line.split()
            if len(fields) != 3:
                raise ValueError("Invalid line in DB config file, %s; expected " \
                                 "'host userid password'" %configFilename)

            host, userid, password = fields
            if not dbName or dbName == host:
                logging.debug("Returning config file info for db, '%s'" %host)
                return (userid, password, host)

    raise ValueError('Unable to find valid eXist config data')
89
90
def openURLWithProxy(uri):
    '''
    Open a simple url connection using the standard proxy, and retrieve the contents
    @param uri: uri to read from
    @return pageData: data read from uri
    '''
    logging.debug("Reading info from uri, '%s'" %uri)
    connection = urllib.urlopen(uri, proxies = PROXIES)
    try:
        pageData = connection.read()
    finally:
        # always release the connection - even if the read fails
        connection.close()
    logging.debug("- returning info from uri")
    return pageData
103
104   
def simpleURLCheck(uri):
    '''
    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach
    would be to use checkURL - which properly checks returned status codes, but can't
    get this working properly with proxies

    @param uri: vocab term uri to check
    @raise ValueError: if urllib2 rejects the uri as badly formatted
    @return: 1 if valid, 0 otherwise
    '''
    # the flag is module level state - declare it so that it can be updated
    global URLIB2_INITIALISED

    logging.debug("Checking validity of uri, '%s'" %uri)

    if not URLIB2_INITIALISED:
        # set the socket timeout period
        socket.setdefaulttimeout(120)

        proxy_support = urllib2.ProxyHandler(PROXIES)

        # build a new opener that routes requests via the proxy
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)

        # install it
        urllib2.install_opener(opener)

        # only do the above once per process
        URLIB2_INITIALISED = True

    try:
        redirectCounter = 10
        page = urllib2.urlopen(uri)
        status = page.code

        # check for redirection - NB, only do this a limited number of times
        while 300 <= status <= 399 and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location - and follow it
            url = page.info().get('location')
            page.close()
            logging.info("Redirect response received - checking new location, '%s'" %url)
            page = urllib2.urlopen(url)
            status = page.code

        page.close()

        if 200 <= status <= 299:
            logging.info("URL resolved successfully")
            return 1

        logging.info("Invalid return code received (%s)" %status)

    except ValueError:
        # propagate invalid format errors - NB, bare raise keeps the traceback
        raise
    except Exception as e:
        logging.error("Exception thrown whilst verifying uri: '%s'" %e)

    logging.debug("- url appears to be invalid")
    return 0
157
158   
def checkURL(url):
    '''
    Lookup a specified url and check if it is valid - judged by the return code
    being in the 2xx range.  NB, will also try to resolve redirects (up to 10).
    NB, requests are always made over plain http.
    @param url:  url to lookup
    @raise ValueError: if the url (or a redirect target) has no host or path
    @return: 1 if url is valid, 0 if not
    '''
    def getRequestHead(url):
        '''
        Create a HTTP connection to a specified URL and return the message HEAD
        @param url: url to retrieve HEAD from
        @return Response object relating to the HEAD retrieval
        '''
        logging.debug("Getting request HEAD for url, '%s'" %url)
        urlParts = urlparse.urlparse(url)
        host, path = urlParts[1:3]
        if not host or not path:
            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \
                             %url)

        # include the query string (element 4 of the parse result), so that
        # the resource actually requested is the one that gets checked
        query = urlParts[4]
        if query:
            path += '?' + query

        connection = httplib.HTTPConnection(host)
        try:
            connection.request("HEAD", path)
            # NB, status and headers are read when the response is created,
            # so they remain available after the connection is closed
            return connection.getresponse()
        finally:
            connection.close()

    logging.info("Checking validity of URL, '%s'" %url)

    try:
        # redirection limit, default of 10
        redirectCounter = 10

        # Retrieve HEAD
        resp = getRequestHead(url)

        # check for redirection - NB, only do this a limited number of times
        while 300 <= resp.status <= 399 and redirectCounter > 0:
            redirectCounter -= 1

            # lookup redirected location
            location = resp.getheader('location')
            if not location:
                logging.info("Redirect response received without a location - link broken")
                return 0

            # the location may be relative - resolve it against the current url
            url = urlparse.urljoin(url, location)
            logging.info("Redirect response received - checking new location, '%s'" %url)
            resp = getRequestHead(url)

        if 200 <= resp.status <= 299:
            logging.info("URL resolved successfully")
            return 1

        logging.info("Invalid return code received (%s) - link broken" %resp.status)
        return 0
    except ValueError:
        # propagate invalid format errors - NB, bare raise keeps the traceback
        raise
    except Exception as e:
        logging.error("Failed to lookup URL: '%s'" %e)
        return 0
213
214
def getTripleData(tripleString, doEscape=True):
    '''
    Take a string as input and extract triple data into an array
    NB, if data not fully in triple form, return empty elements
    @param tripleString: string containing the triple data - elements are
    separated by '|' characters
    @keyword doEscape: if True, escape special characters - e.g. '&' (default True)
    @raise ValueError: if the string contains more than three elements
    @return 1-D array with three elements, representing the data in the triple
    '''
    logging.debug("Getting triple data: %s" %tripleString)
    if doEscape:
        tripleString = escapeSpecialCharacters(tripleString)

    returnData = [val.strip() for val in tripleString.split('|')]
    if len(returnData) > 3:
        raise ValueError("Triple data has an extra '|' character in it (%s) - please fix and rerun" %tripleString)

    # pad out with empty elements if fewer than three were supplied
    returnData.extend([""] * (3 - len(returnData)))

    logging.debug("- returning triple data in array")
    return returnData
238
239
def getString(obj):
    '''
    Return the byte string representation of obj
    @param obj: a byte string or unicode string
    @return result of str(obj) - or, if that cannot be encoded, the utf-8
    encoding of the unicode form of obj
    '''
    try:
        return str(obj)
    except UnicodeEncodeError:
        # obj is unicode with characters the default codec cannot handle
        return unicode(obj).encode('utf-8')
250
251
def encodeIntoHTMLNumericalCodes(inputString):
    '''
    Take an input string and adjust any non-ASCII characters it contains into
    the required html numerical code (e.g. '&#176;') to display the data
    @param inputString: byte string or unicode string to encode; any other
    object is converted with str() first
    @return encoded, ASCII only, byte string
    '''
    text = inputString
    if not isinstance(text, bytes) and not hasattr(text, 'encode'):
        # not a string at all - use its string representation
        text = str(text)

    if isinstance(text, bytes):
        # byte strings must be decoded explicitly - relying on the implicit
        # (ASCII) decode fails for any non-ASCII byte
        try:
            text = text.decode('utf-8')
        except UnicodeDecodeError:
            # NB, the latin coding accepts every byte value up to 255
            text = text.decode('latin-1')

    # the XMLCHARREFREPLACE does the required character replacement - NB, this
    # only happens for characters the target codec cannot represent, so the
    # target must be ASCII rather than utf-8
    return text.encode('ascii', 'xmlcharrefreplace')
266
267
def isValidUnicode(inputString):
    '''
    Checks the input string to ensure that it can be validly represented
    by utf-8 encoded unicode
    @param inputString: byte string or unicode string to check
    @return True if valid (or if the input is empty), False otherwise
    '''
    if not inputString:
        return True

    logging.debug("Checking string, '%s' is valid" %inputString)
    isValid = False
    try:
        if isinstance(inputString, bytes):
            # byte string - must decode cleanly as utf-8
            inputString.decode('utf-8')
        else:
            # unicode string - must encode cleanly as utf-8; NB, anything
            # that is not a string fails here and is reported as invalid
            inputString.encode('utf-8')
        isValid = True
    except UnicodeDecodeError as e:
        logging.debug("Error encountered: %s" %e.reason)
    except Exception as e:
        logging.debug(str(e))

    if isValid:
        logging.debug(" - valid")
    else:
        logging.debug("- invalid")
    return isValid
296   
297
def escapeSpecialCharacters(inputString):
    '''
    Escape any XML unfriendly characters - i.e. '&', '<' and '>'
    @param inputString: string whose value to correct
    @return: corrected string
    '''
    # NB, imported locally; performs the same three replacements as
    # cgi.escape does when called without the quote flag
    from xml.sax.saxutils import escape

    correctedString = escape(inputString)

    if inputString != correctedString:
        logging.info("Note: input data made XML friendly (\nold:'%s' \nnew:'%s')" %(inputString, correctedString))
    return correctedString
309
310       
def createCSMLFile(CDMLFilePath, timeAxis, datasetID = None):
    '''
    Create a CSML file by running csmlscan.py against the specified CDML file
    @param CDMLFilePath: string path to CDML file
    @param timeAxis: string name of time axis to use in CDML file
    @keyword datasetID: string dataset ID to use in CSMLfile - if not set, a random
    name will be generated instead
    @return: CSMLFileName: name of CSML file produced
    '''
    logging.info("Creating CSML file from CDML file by running csmlscan")

    if not datasetID:
        datasetID = str(uuid.uuid1())
    CSMLFileName = datasetID + "_csml.xml"
    logging.debug("Inputs specified: datasetID = %s, timeAxis = %s" %(datasetID, timeAxis))

    # build up the argument list as it would appear on the command line
    # - NB, datasetID is always set by this point
    inputVals = ['csmlscan', '-o', CSMLFileName, '-i', datasetID]
    if timeAxis:
        inputVals.extend(['-t', timeAxis])

    inputVals.append(CDMLFilePath)

    CsmlScan.main(inputVals)
    logging.info("Created CSML file: %s" %CSMLFileName)
    return CSMLFileName
337
338
def isCSMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CSML or not
    - judged by the presence of the CSML namespace or of one of two
    CSML element names
    @param fileContent: content of the file to check - as a string
    @return True if CSML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CSML file")
    if ndgObject.CSML_NS in fileContent or \
        'CSMLFeatureCollection' in fileContent or \
        'CSMLStorageDescriptor' in fileContent:
        logging.info("- file is of CSML format")
        return True

    logging.info("- file is not of CSML format")
    return False
354
355
def isCDMLFile(fileContent):
    '''
    Given the contents of a file, determine whether it is CDML or not
    - judged by the presence of the CDML DTD reference or, failing that, by
    the words 'dataset', 'attr' and 'axis' all featuring in the content
    @param fileContent: content of the file to check - as a string
    @return True if CDML, False otherwise
    '''
    logging.info("Checking file content to see if it is a CDML file")
    # NB, this is a bit of a fudge - there may be a better way to check for
    # CDML-ness
    # NB, use 'in' for every check: a bare find() result is truthy when the
    # text is absent (-1) and falsy when it is at the very start (0)
    if ndgObject.CDML_DTD in fileContent or \
        ('dataset' in fileContent and \
        'attr' in fileContent and \
        'axis' in fileContent):
        logging.info("- file is of CDML format")
        return True

    logging.info("- file is not of CDML format")
    return False
374
375
def getISO8601Date(datestring):
    '''
    Converts an input datestring to the ISO8601 standard, if possible
    NB, the output is built from whichever of the year, month, day, hours,
    minutes and seconds are present in the input, with a 'T' always added
    after the date part and a 'Z' always added at the end - so a date only
    input of '2009-01-02' gives '2009-01-02TZ'.
    @param datestring: string containing date in some format
    @return: string in ISO8601 format - or empty string if no input given
    @raise ValueError: if string not in ISO8601 format - or cannot be parsed into this format
    '''
    if not datestring:
        return ''

    # NB, match() only anchors at the start - trailing text is ignored
    d = regExp.match(datestring)
    if not d:
        errorMessage = "Datestring, '%s', not in ISO8601 format" %datestring
        logging.error(errorMessage)
        raise ValueError(errorMessage)

    # years - always present if the match succeeded
    outDate = d.group(1)

    # months
    if d.group(3):
        outDate += "-" + d.group(3)

    # days
    if d.group(5):
        outDate += "-" + d.group(5)
    outDate += "T"

    # hours
    if d.group(7):
        outDate += d.group(7)

    # minutes
    if d.group(8):
        outDate += ":" + d.group(8)

    # seconds
    if d.group(10):
        outDate += ":" + d.group(10)

    # NOTE(review): fractional seconds and any timezone offset in the input
    # are not carried over - the output is always labelled 'Z'; confirm this
    # is what callers expect for inputs with a +/-hh:mm offset
    outDate += "Z"

    return outDate
419   
420
def formatDateYYYYMMDD(dateString):
    '''
    Parse a date string and attempt to return it in YYYY-MM-DD format
    @param dateString - a string containing datetime info
    @raise ValueError: if dateString cannot be parsed by getISO8601Date
    @return string with date in format YYYY-MM-DD or None, if format not possible
    (e.g. only the year and month are available); an empty input is returned
    as an empty string
    '''
    newDate = getISO8601Date(dateString)
    if not newDate:
        return newDate

    # getISO8601Date always separates the date and time parts with a 'T'
    datePart = newDate.split('T')[0]

    # anything shorter than YYYY-MM-DD is missing the month and/or day
    if len(datePart) != 10:
        return None

    return datePart
432
433
def formatDateYYYY(dateString):
    ''' Simple date manipulations on a string, if it is understood ...
       if instruction = YYYY, return the year
    NB, this is historical relic - used by DIF.  Unsure of format used by DIF - prob
    better replaced by formatDateYYYYMMDD method
    @param dateString: string expected to be of form year-mon-day or day-mon-year
    @return the year part of the string (taken to be the larger of the first
    and last numbers) - or the unaltered input if the format is not understood
    '''
    s = dateString.split('-')
    if len(s) != 3:
        return dateString # unknown format as yet ...

    # expecting year,mon,day or day,mon,year ...
    try:
        first, last = int(s[0]), int(s[2])
    except ValueError:
        # non numeric content - also an unknown format
        return dateString

    if first > last:
        return s[0]
    return s[2]
448   
449   
def tidyUpParameters(params_string):
    '''
    Parameters info may contain generic info - including months
    - also may include unnecessary spaces/cases - so strip out
    generic info and correct any funnies
    NB, if we're dealing with parameter triples, ignore the (second) url entry
    when doing month removal/upper casing
    @param params_string: string with a single parameter or a '|' separated triple
    @raise ValueError: if params_string has more than three '|' separated elements
    @return: tidied up parameters string
    '''
    logging.debug("Tidying up parameters string, %s" %params_string)

    def tidy(value):
        # Strip out any months in the string + uppercase everything
        value = re.sub(MONTHS, '', value.upper())

        # Now remove any trailing spaces + any unnecessary inner spaces
        return re.sub(r'\s{2,}', ' ', value.strip())

    # avoid processing the url, if it has been set
    # - NB, special characters are escaped by getTripleData
    data = getTripleData(params_string)

    # NB, tidy the first and third elements by position, so that an empty
    # first element cannot cause the third one to take its place
    r = tidy(data[0])

    # now, recreate the parameters string
    if params_string.find("|") > -1:
        r += " | " + data[1]
        if data[2]:
            r += " | " + tidy(data[2])

    logging.debug("Tidied parameter string is now: %s" %r)
    return r
483
484
485   
class coverageAggregate:
    ''' Granules have spatiotemporal boxes, but we want an overall spatio temporal box too, if the
    boxes are the same, but respecting space and time differently.
    Collects the coverage range of each distinct (bbox, time) pair added. '''
    def __init__(self,M):
        '''
        @param M: object providing the dgCoverage and dgSpatioTemporalCoverage
        callables used by makeElement
        '''
        self.spaceTime=[]       # (bbox, time) pairs seen so far
        self.coverageList=[]    # coverage range stored for each new pair
        self.M=M

    def add(self,bbox,time,coverage):
        '''
        Store the range of the coverage - unless this bbox/time pair has
        already been added
        @param bbox: bounding box data
        @param time: time data
        @param coverage: object whose dgSpatioTemporalCoverage.dgSpatioTemporalRange
        is stored
        '''
        if (bbox,time) in self.spaceTime:
            return

        logging.debug("Adding coverage data to moles doc:")
        logging.info("- bbox, '%s'" %bbox)
        logging.info("- time, '%s'" %time)
        self.spaceTime.append((bbox,time))
        self.coverageList.append(coverage.dgSpatioTemporalCoverage.dgSpatioTemporalRange)

    def makeElement(self):
        '''
        @return None if nothing has been added; otherwise the result of
        M.dgCoverage wrapping all of the collected ranges
        '''
        logging.info("Setting up coverage element")
        if not self.spaceTime:
            return None

        spatioTemporalCoverage = \
            self.M.dgSpatioTemporalCoverage(dgSpatioTemporalRange=self.coverageList)
        return self.M.dgCoverage(dgSpatioTemporalCoverage=spatioTemporalCoverage)
508
509
class parameterAggregate:
    ''' Provides a set of parameter summaries, and an index of parameter names to allow
    aggregation without duplication. Pretty brain dead at the moment. '''
    def __init__(self):
        self.paramNameIndex=[]    # names of the parameters stored so far
        self.paramSummaries=[]    # first summary seen for each name

    def add(self,subset):
        ''' Add a subset - skipping any parameter whose ParameterName has
        already been stored
        @param subset: iterable of objects with a ParameterName attribute
        '''
        for param in subset:
            name = param.ParameterName
            if name in self.paramNameIndex:
                continue
            self.paramNameIndex.append(name)
            self.paramSummaries.append(param)

    def get(self):
        ''' @return list of the stored parameter summaries '''
        return self.paramSummaries
524
525
def wrapGetText(element,xpathExpression,multiple=0):
    '''
    Wraps a call to ET to get a text object in an error handler
    @param element: ET element to search from - may be None
    @param xpathExpression: path expression to pass to find/findall
    @keyword multiple: if set, return the text of every match as a list;
    otherwise return the text of the first match only
    @return: if multiple, list of text values (a list holding one empty
    string if element is None); otherwise text of the match, or empty string
    if nothing was matched.  NB, a matched element with no text gives None.
    '''
    if element is None:
        if multiple:
            return ['',]
        return ''

    if multiple:
        matches = element.findall(xpathExpression)
    else:
        matches = [element.find(xpathExpression),]

    # NB, compare against None explicitly - the truth value of an element
    # reflects whether it has children, so leaf elements would be skipped
    # - and check for the text attribute rather than for a specific class, so
    # that any ET implementation is accepted
    texts = [elem.text for elem in matches
             if elem is not None and hasattr(elem, 'text')]

    if multiple:
        return texts

    if texts:
        return texts[0]
    return ''
552
553
554# Format a datetime through its full proleptic Gregorian date range.
555#
556# >>> strftime(datetime.date(1850, 8, 2), "%Y/%M/%d was a %A")
557# '1850/00/02 was a Friday'
558# >>>
559# - NB, this is required since native python strftime doesn't work
560# on dates before 1900
561_illegal_s = re.compile(r"((^|[^%])(%%)*%s)")
562
563def _findall(text, substr):
564     # Also finds overlaps
565     sites = []
566     i = 0
567     while 1:
568         j = text.find(substr, i)
569         if j == -1:
570             break
571         sites.append(j)
572         i=j+1
573     return sites
574
575# Every 28 years the calendar repeats, except through century leap
576# years where it's 6 years.  But only if you're using the Gregorian
577# calendar.  ;)
578
579def strftime(dt, fmt):
580    if _illegal_s.search(fmt):
581        raise TypeError("This strftime implementation does not handle %s")
582    if dt.year > 1900:
583        return dt.strftime(fmt)
584
585    year = dt.year
586    # For every non-leap year century, advance by
587    # 6 years to get into the 28-year repeat cycle
588    delta = 2000 - year
589    off = 6*(delta // 100 + delta // 400)
590    year = year + off
591
592    # Move to around the year 2000
593    year = year + ((2000 - year)//28)*28
594    timetuple = dt.timetuple()
595    s1 = time.strftime(fmt, (year,) + timetuple[1:])
596    sites1 = _findall(s1, str(year))
597   
598    s2 = time.strftime(fmt, (year+28,) + timetuple[1:])
599    sites2 = _findall(s2, str(year+28))
600
601    sites = []
602    for site in sites1:
603        if site in sites2:
604            sites.append(site)
605           
606    s = s1
607    syear = "%4d" % (dt.year,)
608    for site in sites:
609        s = s[:site] + syear + s[site+4:]
610    return s
611
612       
def normaliseLongitude(w,e):
    '''
    Take a 0,360 bounding box and force into -180,180
    @param w: western longitude - number or numeric string
    @param e: eastern longitude - number or numeric string
    @return: (west, east) tuple of floats
    '''
    ww,ee=float(w),float(e)

    # box straddles 180 - shift both edges by 180
    if ww<180.0 and ee>180.0:
        return ww-180.0,ee-180.0

    # box lies beyond 180 - shift both edges by a full turn
    if ww>180.0:
        return ww-360.,ee-360.

    # NOTE(review): a west edge of exactly 180 falls through to here and is
    # returned unaltered, whatever the east edge is - confirm this is intended
    return ww,ee
625
626
def findElementIndex(tree, elementName, isLast = False):
    '''
    Given an elementtree object and a name of an element, determine the index
    of the element amongst the direct children of the tree
    @param tree: tree to search for the element in
    @param elementName: name (tag) of element to find index of
    @keyword isLast: if False get the index of the first occurrence, otherwise get
    the last occurrence
    @return: index value or -1 if not found
    '''
    # not sure if there is a better way of doing this - seems to be no other way
    # of determining the correct index - NB, order is important for DIF schema
    # validation
    logging.debug("Looking for index of element, '%s'" %elementName)
    index = -1
    for position, child in enumerate(tree):
        if child.tag != elementName:
            continue

        logging.debug("Element found (index = %s)" %position)
        index = position
        if not isLast:
            # first occurrence is all that is wanted
            break

    if index < 0:
        logging.debug("Element not found")
    return index
Note: See TracBrowser for help on using the repository browser.