source: exist/trunk/python/ndgUtils/lib/atomvalidator.py @ 4494

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/atomvalidator.py@4494
Revision 4494, 11.0 KB checked in by cbyrom, 12 years ago (diff)

Add new methods to lookup simple URLs and vocab term urls. NB, problems were encountered using the checkURL method, which uses httplib, when running with proxy server. Implement usage of new methods + fix small bug with keeping too many related links + tidy up unused imports.

Line 
1#!/usr/bin/env python
2'''
3 Helper class to use with the Atom data model - for data validation
4 Validates:
5 i) External links
6 ii) Vocab data
7 iii) Schema compliance
8 iv) unicode compliance - with utf-8 encoding
9 v) data consistency within the atom data model
10 
11 @author: C Byrom, Tessella Nov 2008
12'''
13import logging, traceback, datetime
14import ndgUtils.models.existdbclient as edc
15from ndgUtils.vocabtermdata import isValidTermURI
16from ndgUtils.models.Atom import Atom
17from ndgUtils.ndgXqueries import ndgXqueries
18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime
19
20
class ValidationError(Exception):
    """
    Exception raised when data validation fails.

    The exception message is always "Data validation error"; the detailed
    errors are kept in the dict passed to the constructor and can be
    retrieved via unpack_errors().
    """
    def __init__(self, errorDict):
        """
        Log the failure and store the error details.
        @param errorDict: dict of validation errors; each value is logged at
        error level and the dict itself is kept for later retrieval
        """
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)
        # values() rather than the Python-2-only itervalues(), so the class
        # behaves the same on Python 2 and Python 3
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        """
        @return: the error dict this exception was created with
        """
        return self._errorDict
35   
36   
class AtomValidator(object):
    '''
    Helper class for validating atom data.

    Runs link, vocab, schema, content and unicode checks against an atom and
    collects any failures in the 'errors' dict. Each dict key is an error
    label (one of the constants below, or an atom attribute name) and each
    value is a list of error message strings.
    '''
    # eXist DB client - only populated by setUpEXistDBConnection
    _eXist = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    # link 'rel' values that are accepted without a vocab term lookup
    VALID_RELS = ["self", "related"]

    NEW_LINE = "\n"


    def __init__(self, atom, dbConfigFile = None, raiseException = False, \
                 newLineChar= NEW_LINE):
        '''
        Set up validator object - with atom to validate
        @param atom: Atom object to validate
        @keyword dbConfigFile: config file to use with eXist DB connection; if
        not set, no eXist connection is created
        @keyword raiseException: if True, raise a ValidationError following a
        failed validation
        - NB, if not used, errors can be retrieved from the self.errors field
        @keyword newLineChar: separator inserted into multi-part error messages
        '''
        logging.info("Setting up atomValidation object")
        self._atom = atom
        self._nl = newLineChar

        # set up connection to the eXist DB, if a config file was provided
        if dbConfigFile:
            self.setUpEXistDBConnection(dbConfigFile)

        # setup the dictionary to store errors
        self.raiseException = raiseException
        self.errors = {}
        logging.info("atomValidator initialised")


    def setUpEXistDBConnection(self, dbConfFile):
        '''
        Get the default eXist DB connection - by reading in data from the db config file
        @param dbConfFile: config file passed on to the eXist DB client
        '''
        logging.info("Setting up connection to eXist DB")
        self._eXist = edc.eXistDBClient(configFile = dbConfFile, loadCollectionData=True)
        logging.info("eXist DB connection now set up")


    def setAtom(self, atom):
        '''
        Set the atom to use the validator with; any errors recorded for a
        previous atom are discarded
        @param atom: an Atom object to validate
        @raise ValueError: if the input is not an Atom object
        '''
        if not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object")
        logging.info("Setting new atom with validator (id=%s)" %atom.atomID)
        self.errors = {} # clear out any existing errors
        self._atom = atom


    def validateAtom(self):
        '''
        Run all the validation checks against the current atom. Any problems
        found are added to self.errors.
        @raise ValidationError: if errors were found and the validator was
        created with raiseException = True
        '''
        if not self._atom:
            logging.error("No atom specified to validate - skipping")
            # nothing to validate - return rather than fail on the missing atom
            return

        logging.info("Validating atom, '%s'" %self._atom.atomID)
        # firstly, check the links point to valid uris
        self.__validateLinks()

        # now check the vocab terms
        self.__validateVocabData()

        # check the atom conforms to the schema
        self.__validateSchemaCompliance()

        # validate the actual atom content - for more specific checks on data
        self.__validateAtomContent()

        # lastly check for non-unicode compliant characters
        self.__validateUnicode()

        logging.info("Atom validation completed")

        if not self.errors:
            logging.info("- atom is valid")
            return

        logging.info("- atom is invalid")
        if self.raiseException:
            logging.warning("Errors found in atom data: %s" %self.errors)
            raise ValidationError(self.errors)


    def __validateAtomContent(self):
        '''
        Check the data content of the atom is consistent - i.e. the title is
        set, the spatial coverage is complete and numerical and the temporal
        range is correctly formatted and ordered. Problems are added to the
        error dict under the 'title', 'spatialcoverage' and 'temporalrange' keys.
        '''
        logging.info("Validating the atom data model consistency")
        atom = self._atom
        if not atom.title:
            self.__addError('title', "Title attribute cannot be empty")

        # spatial coverage - only checked if at least one bound is set
        bounds = [atom.minX, atom.maxX, atom.minY, atom.maxY]
        if any(bounds):
            missingVals = False
            incorrectFormat = False
            for val in bounds:
                # treat an unset (None) bound as missing, rather than as
                # a badly formatted number
                if val is None or val == '':
                    missingVals = True
                    continue
                try:
                    float(val)
                except (TypeError, ValueError):
                    incorrectFormat = True

            spatialError = ""
            if missingVals:
                spatialError += "Incomplete spatial coverage data.%s"  %self._nl
            if incorrectFormat:
                spatialError += "Spatial coverage data not in numerical format."

            if spatialError:
                self.__addError('spatialcoverage', spatialError)

        # temporal range - each date that is set must parse with the atom's
        # own date format
        timeErrors = ''
        dates = {}
        for label, value in (('start', atom.t1), ('end', atom.t2)):
            if not value:
                continue
            try:
                dates[label] = datetime.datetime.strptime(value, atom.YEAR_FORMAT)
            except (TypeError, ValueError):
                timeErrors += "Incorrect %s date format - '%s' - c.f. '2008-04-12'. %s" \
                    %(label, value, self._nl)

        d1 = dates.get('start')
        d2 = dates.get('end')
        # the range can only be checked when both dates parsed successfully
        if d1 is not None and d2 is not None and d1 > d2:
            timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
                %(strftime(d1, atom.YEAR_FORMAT), strftime(d2, atom.YEAR_FORMAT))

        if timeErrors:
            self.__addError('temporalrange', timeErrors)

        logging.info("Atom model consistency validation completed")


    def __validateUnicode(self):
        '''
        Loop over the string attributes of the atom to look for non
        utf-8 compliant characters; NB, only attributes held directly on the
        atom are checked. Errors are stored under the attribute name.
        '''
        logging.info("Validating unicode UTF-8 compliance")
        for key, val in self._atom.__dict__.items():
            if not val or not isinstance(val, basestring):
                continue
            if not isValidUnicode(val):
                # use the standard error store so these entries are lists of
                # messages, like every other entry in the error dict
                self.__addError(key, "Illegal unicode found in string: '%s'." %val)
        logging.info("Completed validating unicode UTF-8 compliance")


    def __validateLinks(self):
        '''
        Check the external links contained in the atom and ensure they are valid;
        failures are stored under the BROKEN_LINKS key
        '''
        logging.info("Validating atom links")
        for link in self._atom.relatedLinks:
            if not link.hasValue():
                continue
            try:
                if not simpleURLCheck(link.href):
                    self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %link.href)
            except Exception as e:
                # include the url, since the exception text alone may be empty
                # and gives no indication of which link failed
                self.__addError(self.BROKEN_LINKS, "Broken link: '%s' - %s" \
                                %(link.href, self._exceptionText(e)))

        logging.info("Completed link validation")


    def __validateVocabData(self):
        '''
        Check the vocab data contained in the atom and ensure they are valid;
        failures are stored under the INVALID_VOCAB_TERM key
        '''
        logging.info("Validating atom vocab data")
        for category in self._atom.parameters:
            if not isValidTermURI(category.scheme):
                self.__addError(self.INVALID_VOCAB_TERM, \
                                "Invalid vocab term: '%s'" %category.scheme)

        # also check the terms used in the links - the standard rel values
        # are not vocab terms so are not looked up
        for link in self._atom.relatedLinks:
            if not link.hasValue() or link.rel in self.VALID_RELS:
                continue
            if not isValidTermURI(link.rel):
                self.__addError(self.INVALID_VOCAB_TERM, \
                                "Invalid vocab term: '%s'" %link.rel)
        logging.info("Completed vocab data validation")


    def __validateSchemaCompliance(self):
        '''
        Validate the atom, against the atom xsd, using eXist validation facilities;
        failures - including any problem running the check itself - are stored
        under the SCHEMA_VALIDATION_FAILURE key
        '''
        logging.info("Validating schema compliance")
        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
        try:
            errors = self._eXist.checkAtomSchemaCompliance(atomPath, atom = self._atom)
            for error in errors:
                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)

        except Exception as e:
            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
                %(self._nl, self._exceptionText(e))
            logging.error(errorMessage)
            # keep the stack trace available for diagnosis
            logging.debug(traceback.format_exc())
            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
        logging.info("Completed validating schema compliance")


    @staticmethod
    def _exceptionText(error):
        '''
        Get the most meaningful message available from an exception
        @param error: exception to get the message from
        @return: the 'message' or 'faultString' attribute of the exception, if
        one is set and not empty, otherwise the string form of the exception
        '''
        for attrName in ('message', 'faultString'):
            text = getattr(error, attrName, None)
            if text:
                return text
        return str(error)


    def __addError(self, errorLabel, errorMessage):
        '''
        Add an error with the specified label and message to the error dict
        @param errorLabel: type of error to add - used as the error dict key
        @param errorMessage: error message to add to the list for this label
        '''
        logging.debug("Adding error to error list")
        logging.debug(errorMessage)

        self.errors.setdefault(errorLabel, []).append(errorMessage)
        logging.debug("Error added")


    def logErrors(self):
        '''
        Outputs any errors caught during validation to log
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        logging.info('')
        logging.info("- atom ID '%s'" %self._atom.atomID)
        logging.info("--------------------------------------")
        for errors in self.errors.values():
            for error in errors:
                logging.info(error)
        logging.info("--------------------------------------")
Note: See TracBrowser for help on using the repository browser.