Changeset 4494


Ignore:
Timestamp:
26/11/08 13:29:20 (11 years ago)
Author:
cbyrom
Message:

Add new methods to look up simple URLs and vocab term URLs. NB: problems were encountered using the checkURL method, which uses httplib, when running behind a proxy server. Implement usage of the new methods, fix a small bug that kept too many related links, and tidy up unused imports.

Files:
7 edited

Legend:

Unmodified
Added
Removed
  • MILK/trunk/milk_server/milk_server/controllers/atom_editor/editatom.py

    r4491 r4494  
    305305                                        selected=c.atom.subtype) 
    306306         
    307          
    308307        self.addRelatedLinksDropDowns() 
    309308 
     
    426425                        link.href = inputs.get(keyStem + '.href') or "" 
    427426                        link.title = inputs.get(keyStem + '.title') or "" 
     427                         
     428                        if not link.hasValue(): 
     429                            continue 
    428430                             
    429431                        logging.info("Adding new online reference info") 
  • MILK/trunk/milk_server/milk_server/controllers/visualise/selectedItems.py

    r4487 r4494  
    1414from paste.request import parse_querystring 
    1515from milk_server.models.selectedItem import SelectedItem 
    16 import copy, urllib, logging 
     16import copy, logging 
    1717 
    1818class SelecteditemsController(BaseController): 
  • exist/trunk/python/ndgUtils/eXistConnector.py

    r4286 r4494  
    11# Code inspired by example on eXist website. 
    2 import urllib2, base64, urllib, urlparse, httplib, xmlrpclib, types, os, logging 
     2import urllib2, base64, xmlrpclib, logging 
    33 
    44class InstanceObject(object): 
  • exist/trunk/python/ndgUtils/lib/atomvalidator.py

    r4491 r4494  
    1313import logging, traceback, datetime 
    1414import ndgUtils.models.existdbclient as edc 
    15 from ndgUtils.vocabtermdata import VocabTermData as VTD 
     15from ndgUtils.vocabtermdata import isValidTermURI 
    1616from ndgUtils.models.Atom import Atom 
    1717from ndgUtils.ndgXqueries import ndgXqueries 
    18 from ndgUtils.lib.utilities import isValidUnicode, checkURL, strftime 
     18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime 
    1919 
    2020 
     
    217217        logging.info("Validating atom links") 
    218218        for link in self._atom.relatedLinks: 
    219             if not checkURL(link.href): 
    220                 self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %link.href) 
     219            if link.hasValue(): 
     220                try: 
     221                    if not simpleURLCheck(link.href): 
     222                        self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %link.href) 
     223                except Exception, e: 
     224                    self.__addError(self.BROKEN_LINKS, e.message) 
    221225 
    222226        logging.info("Completed link validation") 
     
    229233        logging.info("Validating atom vocab data") 
    230234        for category in self._atom.parameters: 
    231             if not checkURL(category.scheme): 
     235            if not isValidTermURI(category.scheme): 
    232236                self.__addError(self.INVALID_VOCAB_TERM, \ 
    233237                                "Invalid vocab term: '%s'" %category.scheme) 
     
    235239        # also check the terms used in the links 
    236240        for link in self._atom.relatedLinks: 
    237             if link.rel not in self.VALID_RELS: 
    238                 if not checkURL(link.rel): 
    239                     self.__addError(self.INVALID_VOCAB_TERM, \ 
    240                                     "Invalid vocab term: '%s'" %link.rel) 
     241            if link.hasValue(): 
     242                if link.rel not in self.VALID_RELS: 
     243                    if not isValidTermURI(link.rel): 
     244                        self.__addError(self.INVALID_VOCAB_TERM, \ 
     245                                        "Invalid vocab term: '%s'" %link.rel) 
    241246        logging.info("Completed link validation") 
    242247         
     
    256261             
    257262        except Exception, e: 
     263            # check for a meaningful error message 
     264            error = e.message 
     265            if not error: 
     266                error = e.faultString 
     267                 
    258268            errorMessage = "Problem experienced when validating against schema:%s'%s'" \ 
    259                 %(self._nl, e.message) 
     269                %(self._nl, error) 
    260270            traceback.format_exc() 
    261271            logging.error(errorMessage) 
  • exist/trunk/python/ndgUtils/lib/utilities.py

    r4492 r4494  
    1 import os, sys, logging, re, cgi, urlparse, httplib, time 
     1import os, sys, logging, re, cgi, urlparse, httplib, time, urllib2, socket 
    22from ndgUtils.ETxmlView import subAI 
    3 #import csml.csmlscan as CsmlScan 
     3import csml.csmlscan as CsmlScan 
    44from xml.sax.saxutils import escape 
    55''' 
     
    1818 
    1919esc_chars = {'\xb0':'°','°':'°'} 
     20     
     21def simpleURLCheck(uri): 
     22    ''' 
     23    Use urllib2.urlopen to check if a url can be accessed.  NB, a better approach 
     24    would be to use checkURL - which properly checks returned status codes, but can't 
     25    get this working properly with proxies 
     26     
     27    @param uri: vocab term uri to check 
     28    @return: 1 if valid, 0 otherwise 
     29    ''' 
     30    logging.debug("Checking validity of uri, '%s'" %uri) 
     31    # set the socket timeout period 
     32    socket.setdefaulttimeout(5) 
     33     
     34    try: 
     35        redirectCounter = 10 
     36        page = urllib2.urlopen(uri) 
     37        status = page.code 
     38     
     39        # check for redirection - NB, only do this a limited number of times 
     40        while (status >= 300) and (status <= 399) and redirectCounter > 0: 
     41            redirectCounter -= 1 
     42     
     43            # lookup redirected location 
     44            url = page.info().get('location') 
     45            logging.info("Redirect response received - checking new location, '%s'" %url) 
     46            page = urllib2.urlopen(uri) 
     47            status = page.code 
     48     
     49        if status >= 200 and status <= 299: 
     50            logging.info("URL resolved successfully") 
     51            return 1 
     52        else: 
     53            logging.info("Invalid return code received (%s)" %status) 
     54 
     55    except ValueError, e: 
     56        # propagate invalid format errors 
     57        raise e 
     58    except Exception, e: 
     59        logging.error("Exception thrown whilst verifying uri: '%s'" %e.message) 
     60 
     61    logging.debug("- url appears to be invalid") 
     62    return 0 
     63 
    2064     
    2165def checkURL(url): 
     
    3579        host, path = urlparse.urlparse(url)[1:3] 
    3680        if not host or not path: 
    37             raise ValueError("Invalid url - must be of format, 'http://somesite.com/...'") 
     81            raise ValueError("Invalid url (%s) - must be of format, 'http://somesite.com/...'" \ 
     82                             %url) 
    3883 
    3984        connection = httplib.HTTPConnection(host) 
     
    4691        # redirection limit, default of 10 
    4792        redirectCounter = 10 
    48      
     93 
    4994        # Retrieve HEAD 
    5095        resp = getRequestHead(url) 
     
    66111            logging.info("Invalid return code received (%s) - link broken" %resp.status) 
    67112            return 0 
     113    except ValueError, e: 
     114        # propagate invalid format errors 
     115        raise e 
    68116    except Exception, e: 
    69117        logging.error("Failed to lookup URL: '%s'" %e.message) 
    70118        return 0 
     119 
    71120 
    72121def getTripleData(tripleString, doEscape=True): 
     
    114163    @return encoded string 
    115164    ''' 
    116     import pdb 
    117     pdb.set_trace() 
    118165    # NB, the latin coding accepts unicode up to 255 
    119166    correctedString = getString(inputString)#inputString.decode('string_escape')#('latin-1') 
  • exist/trunk/python/ndgUtils/models/Atom.py

    r4490 r4494  
    539539        for relatedLink in self.relatedLinks: 
    540540            if relatedLink.hasValue(): 
    541                 import pdb 
    542                 pdb.set_trace() 
    543541                root.append(relatedLink.toXML()) 
    544542     
  • exist/trunk/python/ndgUtils/vocabtermdata.py

    r4488 r4494  
    55 @author: C Byrom, Tessella Jul 2008 
    66''' 
    7 import sys, logging, commands, string, os, time, re 
    8 import urllib 
     7import sys, logging, commands, string, os, time, re, urllib 
     8     
     9def isValidTermURI(uri): 
     10    ''' 
     11    Determines whether a specific vocab term uri is valid - NB, using the 
     12    utilities.checkURL method won't typically work since the RDF data is 
     13    exposed directly - i.e. without HEAD information which is usually looked 
     14    for 
     15    @param uri: vocab term uri to check 
     16    @return: True if valid, false otherwise 
     17    ''' 
     18    logging.debug("Checking vocab term uri, '%s'" %uri) 
     19    try: 
     20        page = urllib.urlopen(uri) 
     21        pageData = page.read() 
     22        if pageData.find('<rdf:RDF') > -1: 
     23            logging.debug("- found valid term") 
     24            return True 
     25         
     26    except Exception, e: 
     27        logging.error("Exception thrown whilst verifying uri: '%s'" %e.message) 
     28 
     29    logging.debug("- term appears to be invalid") 
     30    return False 
     31                       
    932 
    1033class VocabTermItem(object): 
     
    604627        logging.debug("- item is not a granule") 
    605628        return False 
    606                  
Note: See TracChangeset for help on using the changeset viewer.