source: exist/trunk/python/ndgUtils/lib/atomvalidator.py @ 4506

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/atomvalidator.py@4506
Revision 4506, 12.4 KB checked in by cbyrom, 12 years ago (diff)

Force atom validator to specify a db config file on construction + tidy up how some particular errors are formatted and retrieved.

Line 
1#!/usr/bin/env python
'''
 Helper class to use with the Atom data model - for data validation
 Validates:
 i) External links
 ii) Vocab data
 iii) Schema compliance
 iv) unicode compliance - with utf-8 encoding
 v) data consistency within the atom data model

 @author: C Byrom, Tessella Nov 2008
'''
13import logging, traceback, datetime
14import ndgUtils.models.existdbclient as edc
15from ndgUtils.vocabtermdata import isValidTermURI
16from ndgUtils.models.Atom import Atom
17from ndgUtils.ndgXqueries import ndgXqueries
18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime
19
20
class ValidationError(Exception):
    """
    Exception raised when atom data fails validation.

    The exception message is always the fixed text "Data validation error";
    the individual problems are carried in the error dictionary handed to the
    constructor and can be retrieved again via unpack_errors().
    """
    def __init__(self, errorDict):
        """
        Log the failure and store the error details
        @param errorDict: dictionary of validation errors (label -> message(s));
        None is treated as an empty dictionary
        """
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)

        # tolerate a missing dict - constructing the exception must never fail
        # itself, otherwise the real validation problem would be masked
        if errorDict is None:
            errorDict = {}

        # NB, values() rather than itervalues() - the latter only exists on
        # python 2 dictionaries
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        """
        Retrieve the errors this exception was raised with
        @return: the error dictionary passed to the constructor
        """
        return self._errorDict
35   
36   
class AtomValidator(object):
    '''
    Helper class for validating atom data

    Problems found during validation are collected in the self.errors
    dictionary; this maps an error label (one of the error key constants below,
    or the name of the offending atom attribute) to a list of error messages.
    '''
    # eXist DB client
    _eXist = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    # link 'rel' values which are accepted without a vocab lookup
    VALID_RELS = ["self", "related"]

    NEW_LINE = "\n"


    def __init__(self, atom, dbConfigFile, raiseException = False,
                 newLineChar= NEW_LINE, loadAllCollections = False):
        '''
        Set up validator object - with atom to validate
        @param atom: Atom object to validate
        @param dbConfigFile: config file to use with eXist DB connection
        @keyword raiseException: if True, raise a ValidationError following a failed validation
        - NB, if not used, errors can be retrieved from the self.errors field
        @keyword newLineChar: string used to separate messages which are combined
        into a single error entry
        @keyword loadAllCollections: loads all collections info when initialising eXist
        connection, if True
        '''
        logging.info("Setting up atomValidation object")
        self._atom = atom
        self._nl = newLineChar

        # collections to effectively cache positive results - to avoid multiple
        # (time consuming) lookups of the same data
        self._validLinks = []
        self._validVocabTerms = []

        # set up connection to eXist
        self.__setUpEXistDBConnection(dbConfigFile, loadAllCollections = loadAllCollections)

        # setup the dictionary to store errors
        self.raiseException = raiseException
        self.errors = {}
        logging.info("atomValidator initialised")


    def __setUpEXistDBConnection(self, dbConfFile, loadAllCollections = False):
        '''
        Set up the eXist DB connection - by reading in data from the db config file
        @param dbConfFile: config file to use with eXist DB connection
        @keyword loadAllCollections: loads all collections info when initialising eXist
        '''
        logging.info("Setting up connection to eXist DB")
        self._eXist = edc.eXistDBClient(configFile = dbConfFile,
                                        loadCollectionData=loadAllCollections)
        logging.info("eXist DB connection now set up")


    def setAtom(self, atom):
        '''
        Set the atom to use the validator with
        @param atom: an Atom object to validate
        @raise ValueError: if the input is not an Atom object
        '''
        if not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object")
        logging.info("Setting new atom with validator (id=%s)" %atom.atomID)
        self.errors = {} # clear out any existing errors
        self._atom = atom


    def validateAtom(self):
        '''
        Validate the contents of the atom currently set on the validator; any
        problems found are stored in self.errors
        @raise ValidationError: if errors are found and the validator was
        created with raiseException = True
        '''
        logging.info("Validating atom, '%s'" %self._atom.atomID)

        # start from a clean slate - otherwise validating the same atom twice
        # (e.g. after it has been corrected) would report stale/duplicate errors
        self.errors = {}

        # firstly, check the links point to valid uris
        self.__validateLinks()

        # now check the vocab terms
        self.__validateVocabData()

        # check the atom conforms to the schema
        self.__validateSchemaCompliance()

        # validate the actual atom content - for more specific checks on data
        self.__validateAtomContent()

        # lastly check for non-unicode compliant characters
        self.__validateUnicode()

        logging.info("Atom validation completed")

        if not self.errors:
            logging.info("- atom is valid")
            return

        logging.info("- atom is invalid")
        if self.raiseException:
            logging.warning("Errors found in atom data: %s" %self.errors)
            raise ValidationError(self.errors)


    def __validateAtomContent(self):
        '''
        Check the data content of the atom is consistent; any problems found
        are added to the error dict under the 'title', 'spatialcoverage' and
        'temporalrange' labels
        '''
        logging.info("Validating the atom data model consistency")
        if not self._atom.title:
            self.__addError('title', "Title attribute cannot be empty")

        self.__validateSpatialCoverage()
        self.__validateTemporalRange()

        logging.info("Atom model consistency validation completed")


    def __validateSpatialCoverage(self):
        '''
        Check the bounding box values are either all absent or all present
        and numerical
        '''
        coords = [self._atom.minX, self._atom.maxX, self._atom.minY, self._atom.maxY]
        # having no spatial coverage at all is fine
        if not any(coords):
            return

        missingVals = False
        incorrectFormat = False
        for val in coords:
            # NB, treat an unset (None) value in the same way as an empty string;
            # it is missing data rather than badly formatted data
            if val is None or val == '':
                missingVals = True
                continue
            try:
                float(val)
            except (TypeError, ValueError, OverflowError):
                incorrectFormat = True

        spatialError = ""
        if missingVals:
            spatialError += "Incomplete spatial coverage data.%s"  %self._nl
        if incorrectFormat:
            spatialError += "Spatial coverage data not in numerical format."

        if spatialError:
            self.__addError('spatialcoverage', spatialError)


    def __validateTemporalRange(self):
        '''
        Check the start/end dates, where given, parse using the atom's date
        format and that the start date does not fall after the end date
        '''
        t1 = self._atom.t1
        t2 = self._atom.t2
        if not (t1 or t2):
            return

        timeErrors = ''
        d1 = None
        d2 = None
        if t1:
            try:
                d1 = datetime.datetime.strptime(t1, self._atom.YEAR_FORMAT)
            except (TypeError, ValueError):
                timeErrors += "Incorrect start date format - '%s' - c.f. '2008-04-12'. %s" \
                    %(t1, self._nl)
        if t2:
            try:
                d2 = datetime.datetime.strptime(t2, self._atom.YEAR_FORMAT)
            except (TypeError, ValueError):
                timeErrors += "Incorrect end date format - '%s' - c.f. '2008-04-12'. %s" \
                    %(t2, self._nl)

        # only compare the dates if both were supplied and parsed correctly
        if d1 is not None and d2 is not None and d1 > d2:
            # NB, uses the strftime imported from the project utilities module
            # rather than the datetime method of the same name
            timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
                %(strftime(d1, self._atom.YEAR_FORMAT), strftime(d2, self._atom.YEAR_FORMAT))

        if timeErrors:
            self.__addError('temporalrange', timeErrors)


    def __validateUnicode(self):
        '''
        Do a quick pass over all the (top level) attributes of the atom to look
        for string values which isValidUnicode rejects; errors are stored
        against the name of the offending attribute
        '''
        logging.info("Validating unicode UTF-8 compliance")
        try:
            stringTypes = basestring    # python 2 - covers both str and unicode
        except NameError:
            stringTypes = str           # python 3

        for key, val in self._atom.__dict__.items():
            if not val or not isinstance(val, stringTypes):
                continue
            if not isValidUnicode(val):
                # NB, go via __addError so these errors are stored as a list of
                # messages - consistent with every other error in the dict
                self.__addError(key, "Illegal unicode found in string: '%s'." %val)
        logging.info("Completed validating unicode UTF-8 compliance")


    def __validateLinks(self):
        '''
        Check the external links contained in the atom and ensure they are valid
        '''
        logging.info("Validating atom links")
        for link in self._atom.relatedLinks:
            if not link.hasValue():
                continue
            try:
                href = link.href
                # don't lookup link, if it has already been validated before
                if href in self._validLinks:
                    continue

                if simpleURLCheck(href):
                    self._validLinks.append(href)
                else:
                    self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %href)

            except Exception as e:
                # NB, e.message is not always available/populated - so fall back
                # to the string representation of the exception
                self.__addError(self.BROKEN_LINKS, getattr(e, 'message', '') or str(e))

        logging.info("Completed link validation")


    def __validateVocabData(self):
        '''
        Check the vocab data contained in the atom and ensure they are valid
        '''
        logging.info("Validating atom vocab data")
        for category in self._atom.parameters:
            self.__validateTermURL(category.scheme)

        # also check the terms used in the links
        for link in self._atom.relatedLinks:
            if link.hasValue():
                self.__validateTermURL(link.rel)
        logging.info("Completed vocab data validation")


    def __validateTermURL(self, url):
        '''
        Check the specified vocab url - and add any encountered errors
        to the error collection.  Also add any validated urls
        to the valid term collection.
        @param url: url string representing a vocab term
        '''
        # don't lookup term, if it has already been validated before
        if url in self._validVocabTerms or url in self.VALID_RELS:
            logging.info("- term is valid")
            return

        if isValidTermURI(url):
            logging.info("- term is valid")
            self._validVocabTerms.append(url)
        else:
            logging.info("- term is invalid")
            self.__addError(self.INVALID_VOCAB_TERM,
                            "Invalid vocab term: '%s'" %url)


    def __validateSchemaCompliance(self):
        '''
        Validate the atom, against the atom xsd, using eXist validation facilities;
        the atom's location is derived from its default collection path and name
        '''
        logging.info("Validating schema compliance")
        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
        try:
            errors = self._eXist.checkAtomSchemaCompliance(atomPath, atom = self._atom)
            for error in errors:
                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)

        except Exception as e:
            # check for a meaningful error message
            error = getattr(e, 'message', '')
            if not error:
                fault = getattr(e, 'faultString', None)
                if fault:
                    # strip out the exception type - NB, this is usually native library code
                    # and is of no real interest - and will just confuse viewers
                    error = fault.split(':')[-1]
                else:
                    # not a fault style exception - use whatever detail there is
                    error = str(e)

            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
                %(self._nl, error)

            logging.error(errorMessage)
            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
        logging.info("Completed validating schema compliance")


    def __addError(self, errorLabel, errorMessage):
        '''
        Add an error with the specified label and message to the error dict;
        messages with the same label are accumulated in a list
        @param errorLabel: type of error to add
        @param errorMessage: error message to add
        '''
        logging.debug("Adding error to error list")
        logging.debug(errorMessage)

        self.errors.setdefault(errorLabel, []).append(errorMessage)
        logging.debug("Error added")


    def logErrors(self):
        '''
        Outputs any errors caught during validation to log
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        logging.info('')
        logging.info("- atom ID '%s'" %self._atom.atomID)
        logging.info("--------------------------------------")
        for errors in self.errors.values():
            for error in errors:
                logging.info(error)
        logging.info("--------------------------------------")
Note: See TracBrowser for help on using the repository browser.