source: exist/trunk/python/ndgUtils/lib/atomvalidator.py @ 4589

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/atomvalidator.py@4589
Revision 4589, 13.0 KB checked in by cbyrom, 11 years ago (diff)

Add a debug mode to the validator to allow for more detailed output, add a
better check for author details, and expand the error messages for invalid
links to demonstrate the required formatting.

Line 
1#!/usr/bin/env python
2'''
3 Helper class to use with the Atom data model - for data validation
4 Validates:
5 i) External links
6 ii) Vocab data
7 iii) Schema compliance
8 iv) unicode compliance - with utf-8 encoding
9 v) data consistency within the atom data model
10 
11 @author: C Byrom, Tessella Nov 2008
12'''
13import logging, traceback, datetime, xmlrpclib, socket
14import ndgUtils.lib.existdbclient as edc
15from ndgUtils.vocabtermdata import isValidTermURI
16from ndgUtils.models.Atom import Atom
17from ndgUtils.ndgXqueries import ndgXqueries
18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime
19
20
class ValidationError(Exception):
    """
    Exception raised when atom validation fails.

    The exception message is always the fixed text "Data validation error";
    the individual problems are carried in the error dictionary passed to
    the constructor and can be retrieved with unpack_errors().
    """
    def __init__(self, errorDict):
        '''
        Log the validation failure and store the error details
        @param errorDict: dictionary of errors found during validation; a
        value of None is treated as an empty dictionary
        '''
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)

        if errorDict is None:
            errorDict = {}

        # NB, values() - rather than the python 2 only itervalues() - is
        # used so the class behaves the same on any python version
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        '''
        Retrieve the errors associated with this exception
        @return: the error dictionary handed to the constructor
        '''
        return self._errorDict
35   
36   
class AtomValidator(object):
    '''
    Helper class for validating atom data

    Validation problems are collected in the 'errors' dictionary; each key is
    an error label (one of the class constants below, or the name of the
    offending atom attribute) and each value is a list of error messages.
    '''
    # eXist DB client
    _eXist = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    # link 'rel' values which are accepted without a vocab lookup
    VALID_RELS = ["self", "related"]

    NEW_LINE = "\n"

    def __init__(self, atom, dbConfigFile, raiseException=False,
                 newLineChar=NEW_LINE, loadAllCollections=False,
                 isDebug=False):
        '''
        Set up validator object - with atom to validate
        @param atom: Atom object to validate
        @param dbConfigFile: config file to use with eXist DB connection
        @keyword raiseException: if True, raise a ValidationError following a
        failed validation
        - NB, if not used, errors can be retrieved from the self.errors field
        @keyword newLineChar: string used to separate the parts of multi-part
        error messages
        @keyword loadAllCollections: loads all collections info when
        initialising eXist connection, if True
        @keyword isDebug: if True, provide more detailed output
        '''
        logging.info("Setting up atomValidation object")
        self._atom = atom
        self._nl = newLineChar
        self._isDebug = isDebug

        # collections to effectively cache positive results - to avoid multiple
        # (time consuming) lookups of the same data
        self._validLinks = []
        self._validVocabTerms = []

        # set up connection to eXist
        self.__setUpEXistDBConnection(dbConfigFile,
                                      loadAllCollections=loadAllCollections)

        # setup the dictionary to store errors
        self.raiseException = raiseException
        self.errors = {}
        logging.info("atomValidator initialised")

    def __setUpEXistDBConnection(self, dbConfFile, loadAllCollections=False):
        '''
        Create the eXist DB client used for the schema validation
        @param dbConfFile: config file to use with eXist DB connection
        @keyword loadAllCollections: loads all collections info when
        initialising eXist connection, if True
        '''
        logging.info("Setting up connection to eXist DB")
        self._eXist = edc.eXistDBClient(configFile=dbConfFile,
                                        loadCollectionData=loadAllCollections)
        logging.info("eXist DB connection now set up")

    def setAtom(self, atom):
        '''
        Set the atom to use the validator with; any errors stored from a
        previous validation are discarded
        @param atom: an Atom object to validate
        @raise ValueError: if the input is not an Atom object
        '''
        if not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object")
        logging.info("Setting new atom with validator (id=%s)", atom.atomID)
        self.errors = {}  # clear out any existing errors
        self._atom = atom

    def validateAtom(self):
        '''
        Validate the atom currently set on the validator; any problems found
        are added to the self.errors dictionary
        @raise ValidationError: if errors were found and the validator was
        set up with raiseException = True
        '''
        logging.info("Validating atom, '%s'", self._atom.atomID)
        # firstly, check the links point to valid uris
        self.__validateLinks()

        # now check the vocab terms
        self.__validateVocabData()

        # check the atom conforms to the schema
        self.__validateSchemaCompliance()

        # validate the actual atom content - for more specific checks on data
        self.__validateAtomContent()

        # lastly check for non-unicode compliant characters
        self.__validateUnicode()

        logging.info("Atom validation completed")

        if not self.errors:
            logging.info("- atom is valid")
            return

        logging.info("- atom is invalid")
        if self.raiseException:
            logging.warning("Errors found in atom data: %s", self.errors)
            raise ValidationError(self.errors)

    def __validateAtomContent(self):
        '''
        Check the data content of the atom is consistent; any problems found
        with the title, author, spatial coverage or temporal range are added
        to the error dictionary
        '''
        logging.info("Validating the atom data model consistency")
        if not self._atom.title:
            self.__addError('title', "Title attribute cannot be empty")

        if not self._atom.author.hasValue():
            self.__addError('Author.0.name', "Author name cannot be empty")

        self.__validateSpatialCoverage()
        self.__validateTemporalRange()

        logging.info("Atom model consistency validation completed")

    def __validateSpatialCoverage(self):
        '''
        Check the bounding box values - if any one is set, all four must be
        present and convertible to a float
        '''
        coords = [self._atom.minX, self._atom.maxX,
                  self._atom.minY, self._atom.maxY]
        # spatial data is optional - only validate if some of it is set
        if not any(coords):
            return

        missingVals = False
        incorrectFormat = False
        for val in coords:
            # NB, treat None as missing data, too, rather than letting it
            # fall through to float() and be reported as a format problem
            if val is None or val == '':
                missingVals = True
                continue
            try:
                float(val)
            except (TypeError, ValueError):
                incorrectFormat = True

        spatialError = ""
        if missingVals:
            spatialError += "Incomplete spatial coverage data.%s" % self._nl
        if incorrectFormat:
            spatialError += "Spatial coverage data not in numerical format."

        if spatialError:
            self.__addError('spatialcoverage', spatialError)

    def __validateTemporalRange(self):
        '''
        Check the start (t1) and end (t2) dates - each must match the atom's
        YEAR_FORMAT and, if both are set, the start must not be after the end
        '''
        t1 = self._atom.t1
        t2 = self._atom.t2
        # temporal data is optional - only validate if some of it is set
        if not (t1 or t2):
            return

        timeErrors = ''
        d1 = None
        d2 = None
        if t1:
            d1 = self.__parseDate(t1)
            if d1 is None:
                timeErrors += "Incorrect start date format - '%s' - c.f. '2008-04-12'. %s" \
                    % (t1, self._nl)
        if t2:
            d2 = self.__parseDate(t2)
            if d2 is None:
                timeErrors += "Incorrect end date format - '%s' - c.f. '2008-04-12'. %s" \
                    % (t2, self._nl)

        if d1 is not None and d2 is not None and d1 > d2:
            timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
                % (strftime(d1, self._atom.YEAR_FORMAT),
                   strftime(d2, self._atom.YEAR_FORMAT))

        if timeErrors:
            self.__addError('temporalrange', timeErrors)

    def __parseDate(self, dateString):
        '''
        Parse a date string using the atom's YEAR_FORMAT
        @param dateString: date string to parse
        @return: datetime object, or None if the string could not be parsed
        '''
        try:
            return datetime.datetime.strptime(dateString,
                                              self._atom.YEAR_FORMAT)
        except (TypeError, ValueError):
            return None

    def __validateUnicode(self):
        '''
        Loop over the atom's instance attributes and check every non-empty
        string value for non utf-8 compliant characters
        '''
        logging.info("Validating unicode UTF-8 compliance")
        for key, val in self._atom.__dict__.items():
            if not val or not isinstance(val, basestring):
                continue
            if not isValidUnicode(val):
                # NB, use the standard error handling so the entry is a list
                # of messages, like every other entry in the error dict
                self.__addError(key,
                                "Illegal unicode found in string: '%s'." % val)
        logging.info("Completed validating unicode UTF-8 compliance")

    def __validateLinks(self):
        '''
        Check the external links contained in the atom and ensure they are
        valid; links which pass the check are cached so they are only looked
        up once
        '''
        logging.info("Validating atom links")
        for link in self._atom.relatedLinks:
            if not link.hasValue():
                continue

            try:
                # don't lookup link, if it has already been validated before
                if link.href in self._validLinks:
                    continue

                if simpleURLCheck(link.href):
                    self._validLinks.append(link.href)
                else:
                    self.__addError(self.BROKEN_LINKS,
                                    "Broken link: '%s'" % link.href)

            except Exception as e:
                errorMessage = self.__describeException(e)
                if errorMessage.startswith('unknown url type'):
                    errorMessage += " - NB, url must be of format, 'http://blah.co.uk'"
                self.__addError(self.BROKEN_LINKS, errorMessage)

        logging.info("Completed link validation")

    def __validateVocabData(self):
        '''
        Check the vocab data contained in the atom - i.e. the parameter
        schemes and the link 'rel' terms - and ensure they are valid
        '''
        logging.info("Validating atom vocab data")
        for category in self._atom.parameters:
            if category.hasValue():
                self.__validateTermURL(category.scheme)

        # also check the terms used in the links
        for link in self._atom.relatedLinks:
            if link.hasValue():
                self.__validateTermURL(link.rel)
        logging.info("Completed vocab data validation")

    def __validateTermURL(self, url):
        '''
        Check the specified vocab url - and add any encountered errors
        to the error dictionary.  Also add any validated urls to the valid
        term cache.
        @param url: url string representing a vocab term
        '''
        # don't lookup link, if it has already been validated before
        if url in self._validVocabTerms or url in self.VALID_RELS:
            logging.info("- term is valid")
            return

        if isValidTermURI(url):
            logging.info("- term is valid")
            self._validVocabTerms.append(url)
        else:
            logging.info("- term is invalid")
            self.__addError(self.INVALID_VOCAB_TERM,
                            "Invalid vocab term: '%s'" % url)

    def __validateSchemaCompliance(self):
        '''
        Validate the atom, against the atom xsd, using eXist validation
        facilities; both schema errors and problems encountered whilst
        running the validation are added to the error dictionary
        '''
        logging.info("Validating schema compliance")
        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
        try:
            errors = self._eXist.checkAtomSchemaCompliance(atomPath,
                                                           atom=self._atom,
                                                           isDebug=self._isDebug)
            for error in errors:
                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)

        except Exception as e:
            # check for a meaningful error message
            error = self.__describeException(e)
            if isinstance(e, xmlrpclib.Fault):
                # strip out the exception type - NB, this is usually native library code
                # and is of no real interest - and will just confuse viewers
                error = e.faultString.split(':')[-1]
            elif isinstance(e, socket.error) and len(e.args) > 1:
                # (errno, message) style error; NB, some socket errors, e.g.
                # timeouts, only carry a single argument - so check first
                error = e.args[1]

            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
                % (self._nl, error)

            logging.error(errorMessage)
            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
        logging.info("Completed validating schema compliance")

    @staticmethod
    def __describeException(e):
        '''
        Get a printable message for an exception
        @param e: exception to describe
        @return: the exception's 'message' attribute, if it has a non-empty
        one, otherwise its string representation
        '''
        message = getattr(e, 'message', '')
        if message:
            return message
        return str(e)

    def __addError(self, errorLabel, errorMessage):
        '''
        Add an error with the specified label and message to the error dict;
        messages with the same label are collected in a list
        @param errorLabel: type of error to add
        @param errorMessage: error message to add
        '''
        logging.debug("Adding error to error list")
        logging.debug(errorMessage)

        self.errors.setdefault(errorLabel, []).append(errorMessage)
        logging.debug("Error added")

    def logErrors(self):
        '''
        Outputs any errors caught during validation to log
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        logging.info('')
        logging.info("- atom ID '%s'", self._atom.atomID)
        logging.info("--------------------------------------")
        for errors in self.errors.values():
            for error in errors:
                logging.info(error)
        logging.info("--------------------------------------")
Note: See TracBrowser for help on using the repository browser.