source: exist/trunk/python/ndgUtils/lib/atomvalidator.py @ 4512

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/exist/trunk/python/ndgUtils/lib/atomvalidator.py@4512
Revision 4512, 12.5 KB checked in by cbyrom, 11 years ago (diff)

Fix problem with retaining empty category data + improve output error logging + improve robustness of exist file retrieval.

Line 
1#!/usr/bin/env python
2'''
3 Helper class to use with the Atom data model - for data validation
4 Validates:
5 i) External links
6 ii) Vocab data
7 iii) Schema compliance
8 iv) unicode compliance - with utf-8 encoding
9 v) data consistency within the atom data model
10 
11 @author: C Byrom, Tessella Nov 2008
12'''
13import logging, traceback, datetime
14import ndgUtils.models.existdbclient as edc
15from ndgUtils.vocabtermdata import isValidTermURI
16from ndgUtils.models.Atom import Atom
17from ndgUtils.ndgXqueries import ndgXqueries
18from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime
19
20
class ValidationError(Exception):
    """
    Exception raised when data validation fails.

    The errors are logged at ERROR level when the exception is created and are
    kept on the exception, so callers can retrieve them with unpack_errors().
    """
    def __init__(self, errorDict):
        '''
        Log the validation errors and store them on the exception
        @param errorDict: dictionary of the errors found, keyed by error label
        '''
        msg = "Data validation error"
        logging.error(msg)
        Exception.__init__(self, msg)
        # values() - unlike the python 2 only itervalues() - is available on
        # every mapping type, under both python 2 and python 3
        for val in errorDict.values():
            logging.error(val)
        self._errorDict = errorDict

    def unpack_errors(self):
        '''
        Retrieve the errors that caused the exception
        @return: the error dictionary the exception was created with
        '''
        return self._errorDict
35   
36   
37class AtomValidator(object):
38    '''
39    Helper class for validating atom data
40    '''
41    # eXist DB client
42    _eXist = None
43   
44    # standard output delimiter
45    LINE_SEPARATOR = "-----------------------------"
46   
47    # constants to use as error dict keys
48    BROKEN_LINKS = 1
49    INVALID_VOCAB_TERM = 2
50    SCHEMA_VALIDATION_FAILURE = 4
51
52    VALID_RELS = ["self", "related"]
53   
54    NEW_LINE = "\n"
55
56       
57    def __init__(self, atom, dbConfigFile, raiseException = False, \
58                 newLineChar= NEW_LINE, loadAllCollections = False):
59        '''
60        Set up validator object - with atom to validate
61        @param atom: Atom object to validate
62        @keyword dbConfigFile: config file to use with eXist DB connection
63        @keyword raiseException: if True, raise a ValidationException following a failed validation
64        - NB, if not used, errors can be retrieved from the self.errors field
65        @keyword loadAllCollections: loads all collections info when initialising eXist
66        connection, if True
67        '''
68        logging.info("Setting up atomValidation object")
69        self._atom = atom
70        self._nl = newLineChar
71       
72        # collections to effectively cache positive results - to avoid multiple
73        # (time consuming) lookups of the same data
74        self._validLinks = []
75        self._validVocabTerms = []
76       
77        # set up connection to eXist
78        self.__setUpEXistDBConnection(dbConfigFile, loadAllCollections = loadAllCollections)
79
80        # setup the dictionary to store errors
81        self.raiseException = raiseException
82        self.errors = {}
83        logging.info("atomValidator initialised")
84
85   
86    def __setUpEXistDBConnection(self, dbConfFile, loadAllCollections = False):
87        '''
88        Get the default eXist DB connection - by reading in data from the db config file
89        @keyword dbConfigFile: config file to use with eXist DB connection
90        @keyword loadAllCollections: loads all collections info when initialising eXist
91        '''
92        logging.info("Setting up connection to eXist DB")
93        self._eXist = edc.eXistDBClient(configFile = dbConfFile, 
94                                        loadCollectionData=loadAllCollections)
95        logging.info("eXist DB connection now set up")
96
97
98    def setAtom(self, atom):
99        '''
100        Set the atom to use the validator with
101        @param atom: an Atom object to validate
102        '''
103        if not isinstance(atom, Atom):
104            raise ValueError("Input object is not an Atom object")
105        logging.info("Setting new atom with validator (id=%s)" %atom.atomID)
106        self.errors = {} # clear out any existing errors
107        self._atom = atom
108       
109
110    def validateAtom(self):
111        '''
112        Retrieve an atom from the specified path and validate the contents
113        @param atomPath: path to the atom in the eXist DB
114        '''
115        logging.info("Validating atom, '%s'" %self._atom.atomID)
116        # firstly, check the links point to valid uris
117        self.__validateLinks()
118       
119        # now check the vocab terms
120        self.__validateVocabData()
121       
122        # check the atom conforms to the schema
123        self.__validateSchemaCompliance()
124       
125        # validate the actual atom content - for more specific checks on data
126        self.__validateAtomContent()
127       
128        # lastly check for non-unicode compliant characters
129        self.__validateUnicode()
130           
131        logging.info("Atom validation completed")
132       
133        # remove the error dict entry if no errors receieved
134        if self.errors:
135            logging.info("- atom is invalid")
136           
137            if self.raiseException:
138                logging.warning("Errors found in atom data: %s" %self.errors)
139                raise ValidationError(self.errors)
140        else:
141            logging.info("- atom is valid")
142       
143
144    def __validateAtomContent(self):
145        '''
146        Check the data content of the atom is consistent; if an error with any of
147        these is found, raise a ValueError
148        @raise ValueError: if any atom attributes have a problem
149        '''
150        logging.info("Validating the atom data model consistency")
151        if not self._atom.title:
152            self.__addError('title', "Title attribute cannot be empty")
153           
154        if self._atom.minX or self._atom.maxX or self._atom.minY or self._atom.maxY:
155            missingVals = False
156            incorrectFormat = False 
157            for val in [self._atom.minX, self._atom.maxX, self._atom.minY, self._atom.maxY]:
158                if val == '':
159                    missingVals = True
160                else:
161                    try:
162                        float(val)
163                    except:
164                        incorrectFormat = True
165
166            spatialError = ""
167            if missingVals:
168                spatialError += "Incomplete spatial coverage data.%s"  %self._nl
169            if incorrectFormat:
170                spatialError += "Spatial coverage data not in numerical format."
171
172            if spatialError:
173                self.__addError('spatialcoverage', spatialError)
174
175        if self._atom.t1 or self._atom.t2:
176            timeErrors = ''
177            d1 = None
178            d2 = None
179            if self._atom.t1:
180                try:
181                    d1 = datetime.datetime.strptime(self._atom.t1, self._atom.YEAR_FORMAT)
182                except:
183                    timeErrors += "Incorrect start date format - '%s' - c.f. '2008-04-12'. %s" \
184                        %(self._atom.t1, self._nl)
185            if self._atom.t2:
186                try:
187                    d2 = datetime.datetime.strptime(self._atom.t2, self._atom.YEAR_FORMAT)
188                except:
189                    timeErrors += "Incorrect end date format - '%s' - c.f. '2008-04-12'. %s" \
190                        %(self._atom.t2, self._nl)
191
192            if d1 and d2:
193                if d1 > d2 or d2 < d1:
194                    timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
195                        %(strftime(d1, self._atom.YEAR_FORMAT), strftime(d2, self._atom.YEAR_FORMAT))
196#                        %(d1.strftime(self._atom.YEAR_FORMAT), d2.strftime(self._atom.YEAR_FORMAT))
197
198            if timeErrors:
199                self.__addError('temporalrange', timeErrors)
200
201        logging.info("Atom model consistency validation completed")
202
203
204    def __validateUnicode(self):
205        '''
206        Do a quick recursion over all the attributes to look for non
207        utf-8 compliant characters
208        '''
209        logging.info("Validating unicode UTF-8 compliance")
210        for key, val in self._atom.__dict__.items():
211            if val:
212                if isinstance(val, basestring):
213                    if not isValidUnicode(val):
214                        if not self.errors.has_key(key):
215                            self.errors[key] = ''
216                        self.errors[key] += "Illegal unicode found in string: '%s'.'%s'" %(val, nl)
217        logging.info("Completed validating unicode UTF-8 compliance")
218       
219
220    def __validateLinks(self):
221        '''
222        Check the external links contained in the atom and ensure they are valid
223        '''
224        logging.info("Validating atom links")
225        for link in self._atom.relatedLinks:
226            if link.hasValue():
227                try:
228                    # don't lookup link, if it has already been validated before
229                    if link.href in self._validLinks:
230                        continue
231                   
232                    if not simpleURLCheck(link.href):
233                        self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %link.href)
234                    else:
235                        self._validLinks.append(link.href)
236                       
237                except Exception, e:
238                    self.__addError(self.BROKEN_LINKS, e.message)
239
240        logging.info("Completed link validation")
241
242
243    def __validateVocabData(self):
244        '''
245        Check the vocab data contained in the atom and ensure they are valid
246        '''
247        logging.info("Validating atom vocab data")
248        for category in self._atom.parameters:
249            if category.hasValue():
250                self.__validateTermURL(category.scheme)
251
252        # also check the terms used in the links
253        for link in self._atom.relatedLinks:
254            if link.hasValue():
255                self.__validateTermURL(link.rel)
256        logging.info("Completed link validation")
257
258
259    def __validateTermURL(self, url):
260        '''
261        Check the specified vocab url - and add any encountered errors
262        to the global error collection.  Also add any validated urls
263        to the global valid term collection.
264        @param url: url string representing a vocab term
265        '''
266        # don't lookup link, if it has already been validated before
267        if url in self._validVocabTerms or url in self.VALID_RELS:
268            logging.info("- term is valid")
269            return
270       
271        if not isValidTermURI(url):
272            logging.info("- term is invalid")
273            self.__addError(self.INVALID_VOCAB_TERM, \
274                            "Invalid vocab term: '%s'" %url)
275        else:
276            logging.info("- term is valid")
277            self._validVocabTerms.append(url)
278       
279
280    def __validateSchemaCompliance(self):
281        '''
282        Validate the atom, against the atom xsd, using eXist validation facilities
283        @param atomPath: collection path to atom in eXist
284        @param atomID: atom ID
285        '''
286        logging.info("Validating schema compliance")
287        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
288        try:
289            errors = self._eXist.checkAtomSchemaCompliance(atomPath, atom = self._atom)
290            for error in errors:
291                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)
292           
293        except Exception, e:
294            # check for a meaningful error message
295            error = e.message
296            if not error:
297                # strip out the exception type - NB, this is usually native library code
298                # and is of no real interest - and will just confuse viewers
299                error = e.faultString.split(':')[-1] 
300               
301            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
302                %(self._nl, error)
303
304            logging.error(errorMessage)
305            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
306        logging.info("Completed validating schema compliance")
307           
308   
309    def __addError(self, errorLabel, errorMessage):
310        '''
311        Add an error with the specified label and message to the error dict for the
312        specified atom ID
313        @param errorLabel: type of error to add
314        @param errorMessage: error message to add
315        '''
316        logging.debug("Adding error to error list")
317        logging.debug(errorMessage)
318       
319        if not self.errors.has_key(errorLabel):
320            self.errors[errorLabel] = []
321           
322        self.errors[errorLabel].append(errorMessage)
323        logging.debug("Error added")
324
325       
326    def logErrors(self):
327        '''
328        Outputs any errors caught during validation to log
329        '''
330        logging.info("The following errors were encountered when validating the atoms:")
331        logging.info('')
332        logging.info("- atom ID '%s'" %self._atom.atomID)
333        logging.info("--------------------------------------")
334        for errors in self.errors.values():
335            for error in errors:
336                logging.info(error)
337        logging.info("--------------------------------------")
Note: See TracBrowser for help on using the repository browser.