source: exist/trunk/python/ndgUtils/lib/atomvalidator.py @ 4495

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/atomvalidator.py@4495
Revision 4495, 12.0 KB checked in by cbyrom, 12 years ago (diff)

Store cache of validated urls and vocab terms in validator - to avoid the need to do multiple lookups of the same data + simplify the term validation by creating a re-usable method for both the category and links data.

Line 
1#!/usr/bin/env python
2'''
3 Helper class to use with the Atom data model - for data validation
4 Validates:
5 i) External links
6 ii) Vocab data
7 iii) Schema compliance
8 iv) unicode compliance - with utf-8 encoding
9 v) data consistency within the atom data model
10 
11 @author: C Byrom, Tessella Nov 2008
12'''
13import logging, traceback, datetime
14import ndgUtils.models.existdbclient as edc
15from ndgUtils.vocabtermdata import isValidTermURI
16from ndgUtils.models.Atom import Atom
17from ndgUtils.ndgXqueries import ndgXqueries
18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime
19
20
class ValidationError(Exception):
    """
    Exception raised when data validation fails.

    The exception message is always "Data validation error"; the detailed
    errors are carried in the error dict supplied on construction and can be
    retrieved with unpack_errors().
    """
    def __init__(self, errorDict):
        """
        Create the exception and log the supplied errors
        @param errorDict: dict of errors found during validation; None is
        treated as an empty dict
        """
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)
        if errorDict is None:
            errorDict = {}
        # NB, values() rather than itervalues() - works with python 2 and 3
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        """
        Return the error dict this exception was created with
        """
        return self._errorDict
35   
36   
class AtomValidator(object):
    '''
    Helper class for validating atom data

    Runs a series of checks on an atom (external links, vocab terms, schema
    compliance, data model consistency and unicode compliance) and collects
    any problems found in the 'errors' dict.  Each key of this dict is an
    error label (one of the constants below, or an atom attribute name) and
    each value is a list of error messages.
    '''
    # eXist DB client - only set up if a db config file is provided
    _eXist = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    # link 'rel' values which are accepted without doing a vocab lookup
    VALID_RELS = ["self", "related"]

    NEW_LINE = "\n"


    def __init__(self, atom, dbConfigFile = None, raiseException = False,
                 newLineChar= NEW_LINE):
        '''
        Set up validator object - with atom to validate
        @param atom: Atom object to validate
        @keyword dbConfigFile: config file to use with eXist DB connection
        @keyword raiseException: if True, raise a ValidationError following a
        failed validation - NB, if not used, errors can be retrieved from the
        self.errors field
        @keyword newLineChar: string used to separate the different parts of
        a multi-part error message
        '''
        logging.info("Setting up atomValidation object")
        self._atom = atom
        self._nl = newLineChar

        # collections to effectively cache positive results - to avoid multiple
        # (time consuming) lookups of the same data
        self._validLinks = []
        self._validVocabTerms = []

        # set up connection to the eXist DB - only if a config file is given
        if dbConfigFile:
            self.setUpEXistDBConnection(dbConfigFile)

        # setup the dictionary to store errors
        self.raiseException = raiseException
        self.errors = {}
        logging.info("atomValidator initialised")


    def setUpEXistDBConnection(self, dbConfFile):
        '''
        Get the default eXist DB connection - by reading in data from the db config file
        @param dbConfFile: config file to use with eXist DB connection
        '''
        logging.info("Setting up connection to eXist DB")
        self._eXist = edc.eXistDBClient(configFile = dbConfFile, loadCollectionData=True)
        logging.info("eXist DB connection now set up")


    def setAtom(self, atom):
        '''
        Set the atom to use the validator with; any errors stored from a
        previous validation are cleared
        @param atom: an Atom object to validate
        @raise ValueError: if the input is not an Atom object
        '''
        if not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object")
        logging.info("Setting new atom with validator (id=%s)", atom.atomID)
        self.errors = {} # clear out any existing errors
        self._atom = atom


    def validateAtom(self):
        '''
        Run all the validation checks against the current atom; any problems
        found are added to the self.errors dict.  If no atom has been set,
        an error is logged and no checks are done.
        @raise ValidationError: if errors were found and the validator was
        set up with raiseException = True
        '''
        if not self._atom:
            logging.error("No atom specified to validate - skipping")
            # NB, must return here - the checks below all dereference the atom
            return

        logging.info("Validating atom, '%s'", self._atom.atomID)
        # firstly, check the links point to valid uris
        self.__validateLinks()

        # now check the vocab terms
        self.__validateVocabData()

        # check the atom conforms to the schema
        self.__validateSchemaCompliance()

        # validate the actual atom content - for more specific checks on data
        self.__validateAtomContent()

        # lastly check for non-unicode compliant characters
        self.__validateUnicode()

        logging.info("Atom validation completed")

        if not self.errors:
            logging.info("- atom is valid")
            return

        logging.info("- atom is invalid")
        if self.raiseException:
            logging.warning("Errors found in atom data: %s", self.errors)
            raise ValidationError(self.errors)


    def __validateAtomContent(self):
        '''
        Check the data content of the atom is consistent (title, spatial
        coverage and temporal range); any problems found are added to the
        error dict - NB, no exception is raised from here
        '''
        logging.info("Validating the atom data model consistency")
        if not self._atom.title:
            self.__addError('title', "Title attribute cannot be empty")

        self.__validateSpatialCoverage()
        self.__validateTemporalRange()
        logging.info("Atom model consistency validation completed")


    def __validateSpatialCoverage(self):
        '''
        Check that, if any spatial coverage value is set, all four values are
        present and numerical - adding a 'spatialcoverage' error if not
        '''
        coords = [self._atom.minX, self._atom.maxX, self._atom.minY, self._atom.maxY]
        if not any(coords):
            return

        missingVals = False
        incorrectFormat = False
        for val in coords:
            # an unset value is missing data, not a formatting problem
            if val is None or val == '':
                missingVals = True
                continue
            try:
                float(val)
            except (TypeError, ValueError, OverflowError):
                incorrectFormat = True

        spatialError = ""
        if missingVals:
            spatialError += "Incomplete spatial coverage data.%s" % self._nl
        if incorrectFormat:
            spatialError += "Spatial coverage data not in numerical format."

        if spatialError:
            self.__addError('spatialcoverage', spatialError)


    def __validateTemporalRange(self):
        '''
        Check that the start and end dates, where set, are in the atom's
        date format and that the start date is not after the end date -
        adding a 'temporalrange' error if not
        '''
        if not (self._atom.t1 or self._atom.t2):
            return

        timeErrors = ''
        dates = {}
        for label, value in (("start", self._atom.t1), ("end", self._atom.t2)):
            if not value:
                continue
            try:
                dates[label] = datetime.datetime.strptime(value, self._atom.YEAR_FORMAT)
            except (TypeError, ValueError):
                timeErrors += "Incorrect %s date format - '%s' - c.f. '2008-04-12'. %s" \
                    % (label, value, self._nl)

        d1 = dates.get("start")
        d2 = dates.get("end")
        # NB, equal start and end dates are accepted
        if d1 is not None and d2 is not None and d1 > d2:
            # use the utilities strftime rather than datetime.strftime
            timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
                % (strftime(d1, self._atom.YEAR_FORMAT), strftime(d2, self._atom.YEAR_FORMAT))

        if timeErrors:
            self.__addError('temporalrange', timeErrors)


    def __validateUnicode(self):
        '''
        Check all the string attributes set directly on the atom for non
        utf-8 compliant characters - NB, nested objects are not checked.
        Errors are added to the error dict, keyed by attribute name.
        '''
        logging.info("Validating unicode UTF-8 compliance")
        # basestring only exists in python 2
        try:
            stringTypes = basestring
        except NameError:
            stringTypes = str

        for key, val in vars(self._atom).items():
            if not val or not isinstance(val, stringTypes):
                continue
            if not isValidUnicode(val):
                # go via __addError so the entry is a list of messages, like
                # all other entries in the error dict
                self.__addError(key, "Illegal unicode found in string: '%s'" % val)
        logging.info("Completed validating unicode UTF-8 compliance")


    def __validateLinks(self):
        '''
        Check the external links contained in the atom and ensure they are valid.
        Valid links are cached, so they are only looked up once; broken links
        are added to the error dict under BROKEN_LINKS.
        '''
        logging.info("Validating atom links")
        for link in self._atom.relatedLinks:
            if not link.hasValue():
                continue
            try:
                href = link.href
                # don't lookup link, if it has already been validated before
                if href in self._validLinks:
                    continue

                if simpleURLCheck(href):
                    self._validLinks.append(href)
                else:
                    self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" % href)

            except Exception as e:
                # a failed lookup should not stop the other links being checked
                self.__addError(self.BROKEN_LINKS, self.__exceptionMessage(e))

        logging.info("Completed link validation")


    def __validateVocabData(self):
        '''
        Check the vocab data contained in the atom and ensure they are valid
        - this covers both the parameter schemes and the link rel values
        '''
        logging.info("Validating atom vocab data")
        for category in self._atom.parameters:
            self.__validateTermURL(category.scheme)

        # also check the terms used in the links
        for link in self._atom.relatedLinks:
            if link.hasValue():
                self.__validateTermURL(link.rel)
        logging.info("Completed vocab data validation")


    def __validateTermURL(self, url):
        '''
        Check the specified vocab url - and add any encountered errors
        to the global error collection.  Also add any validated urls
        to the global valid term collection.
        @param url: url string representing a vocab term
        '''
        # don't lookup link, if it has already been validated before
        if url in self._validVocabTerms or url in self.VALID_RELS:
            logging.info("- term is valid")
            return

        if isValidTermURI(url):
            logging.info("- term is valid")
            self._validVocabTerms.append(url)
        else:
            logging.info("- term is invalid")
            self.__addError(self.INVALID_VOCAB_TERM,
                            "Invalid vocab term: '%s'" % url)


    def __validateSchemaCompliance(self):
        '''
        Validate the atom, against the atom xsd, using eXist validation facilities.
        Any errors returned - or any problem doing the validation - are added
        to the error dict under SCHEMA_VALIDATION_FAILURE.
        '''
        logging.info("Validating schema compliance")
        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
        try:
            errors = self._eXist.checkAtomSchemaCompliance(atomPath, atom = self._atom)
            for error in errors:
                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)

        except Exception as e:
            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
                % (self._nl, self.__exceptionMessage(e))
            logging.error(errorMessage)
            # keep the stack trace available for debugging
            logging.debug(traceback.format_exc())
            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
        logging.info("Completed validating schema compliance")


    def __exceptionMessage(self, exc):
        '''
        Get the most meaningful message available from an exception
        @param exc: exception to get message from
        @return: the 'message' attribute, if set, otherwise the 'faultString'
        attribute, otherwise the string form of the exception
        '''
        # NB, not all exceptions have message/faultString attributes - so
        # look these up defensively and always fall back to something printable
        return getattr(exc, 'message', None) \
            or getattr(exc, 'faultString', None) \
            or str(exc) \
            or repr(exc)


    def __addError(self, errorLabel, errorMessage):
        '''
        Add an error with the specified label and message to the error dict;
        messages with the same label are collected in a list
        @param errorLabel: type of error to add
        @param errorMessage: error message to add
        '''
        logging.debug("Adding error to error list")
        logging.debug(errorMessage)
        self.errors.setdefault(errorLabel, []).append(errorMessage)
        logging.debug("Error added")


    def logErrors(self):
        '''
        Outputs any errors caught during validation to log
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        logging.info('')
        logging.info("- atom ID '%s'", self._atom.atomID)
        logging.info("--------------------------------------")
        for errors in self.errors.values():
            for error in errors:
                logging.info(error)
        logging.info("--------------------------------------")
Note: See TracBrowser for help on using the repository browser.