#!/usr/bin/env python
'''
Helper class to use with the Atom data model - for data validation
Validates:
i) External links
ii) Vocab data
iii) Schema compliance
iv) unicode compliance - with utf-8 encoding
v) data consistency within the atom data model

@author: C Byrom, Tessella Nov 2008
'''
import datetime
import logging
import socket
import sys
import traceback
import xmlrpclib

import ndgUtils.lib.existdbclient as edc
from ndgUtils.vocabtermdata import isValidTermURI
from ndgUtils.models.Atom import Atom
from ndgUtils.ndgXqueries import ndgXqueries
from ndgUtils.lib.utilities import isValidUnicode, simpleURLCheck, strftime


class ValidationError(Exception):
    """
    Raised when validation of an atom has failed.

    The exception text is always the generic "Data validation error"; the
    individual problems are carried in the error dict handed to the
    constructor and can be read back with unpack_errors().
    """
    def __init__(self, errorDict):
        '''
        Log the failure and keep hold of the detailed errors
        @param errorDict: dict of the errors found during validation; every
        value of the dict is written to the error log
        '''
        summary = "Data validation error"
        logging.error(summary)
        Exception.__init__(self, summary)
        for detail in errorDict.values():
            logging.error(detail)
        self._errorDict = errorDict

    def unpack_errors(self):
        '''
        @return: the error dict the exception was created with
        '''
        return self._errorDict


class AtomValidator(object):
    '''
    Helper class for validating atom data

    Checks the links, vocab terms, schema compliance, data consistency and
    unicode compliance of a single Atom object.  Problems are collected in the
    self.errors dict, which maps an error key (one of the constants below, or
    the name of the offending atom attribute) to a list of error messages.
    '''
    # eXist DB client
    _eXist = None

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    # link 'rel' values that are accepted without a vocab lookup
    VALID_RELS = ["self", "related"]

    NEW_LINE = "\n"


    def __init__(self, atom, dbConfigFile, raiseException = False, \
                 newLineChar= NEW_LINE, loadAllCollections = False, \
                 isDebug = False):
        '''
        Set up validator object - with atom to validate
        @param atom: Atom object to validate
        @param dbConfigFile: config file to use with eXist DB connection
        @keyword raiseException: if True, raise a ValidationError following a
        failed validation - NB, if not used, errors can be retrieved from the
        self.errors field
        @keyword newLineChar: separator placed between the parts of a multi-part
        error message
        @keyword loadAllCollections: loads all collections info when initialising
        eXist connection, if True
        @keyword isDebug: if True, provide more detailed output
        '''
        logging.info("Setting up atomValidation object")
        self._atom = atom
        self._nl = newLineChar
        self._isDebug = isDebug

        # collections to effectively cache positive results - to avoid multiple
        # (time consuming) lookups of the same data
        self._validLinks = []
        self._validVocabTerms = []

        # set up connection to eXist
        self.__setUpEXistDBConnection(dbConfigFile, loadAllCollections = loadAllCollections)

        # setup the dictionary to store errors
        self.raiseException = raiseException
        self.errors = {}
        logging.info("atomValidator initialised")


    def __setUpEXistDBConnection(self, dbConfFile, loadAllCollections = False):
        '''
        Create the eXist DB client, using the data from the db config file
        @param dbConfFile: config file to use with eXist DB connection
        @keyword loadAllCollections: loads all collections info when initialising
        eXist connection, if True
        '''
        logging.info("Setting up connection to eXist DB")
        self._eXist = edc.eXistDBClient(configFile = dbConfFile,
                                        loadCollectionData=loadAllCollections)
        logging.info("eXist DB connection now set up")


    def setAtom(self, atom):
        '''
        Set the atom to use the validator with
        @param atom: an Atom object to validate
        @raise ValueError: if the input is not an Atom object
        '''
        if not isinstance(atom, Atom):
            raise ValueError("Input object is not an Atom object")
        logging.info("Setting new atom with validator (id=%s)" %atom.atomID)
        self.errors = {} # clear out any existing errors
        self._atom = atom


    def validateAtom(self):
        '''
        Run all the validation checks on the current atom; any problems found
        are stored in self.errors
        @raise ValidationError: if errors were found and the validator was set
        up with raiseException = True
        '''
        logging.info("Validating atom, '%s'" %self._atom.atomID)
        # start from a clean slate - otherwise validating the same atom again
        # would report the errors of the earlier run a second time
        self.errors = {}

        # firstly, check the links point to valid uris
        self.__validateLinks()

        # now check the vocab terms
        self.__validateVocabData()

        # check the atom conforms to the schema
        self.__validateSchemaCompliance()

        # validate the actual atom content - for more specific checks on data
        self.__validateAtomContent()

        # lastly check for non-unicode compliant characters
        self.__validateUnicode()

        logging.info("Atom validation completed")

        if not self.errors:
            logging.info("- atom is valid")
            return

        logging.info("- atom is invalid")
        if self.raiseException:
            logging.warning("Errors found in atom data: %s" %self.errors)
            raise ValidationError(self.errors)


    def __validateAtomContent(self):
        '''
        Check the data content of the atom is consistent; any problems found
        are added to the error dict under the name of the affected field
        '''
        logging.info("Validating the atom data model consistency")
        if not self._atom.title:
            self.__addError('title', "Title attribute cannot be empty")

        if not self._atom.author.hasValue():
            self.__addError('Author.0.name', "Author name cannot be empty")

        self.__checkSpatialCoverage()
        self.__checkTemporalRange()
        logging.info("Atom model consistency validation completed")


    def __checkSpatialCoverage(self):
        '''
        Check that, if any bounding box value is set, all four are present and
        numerical
        '''
        atom = self._atom
        if not (atom.minX or atom.maxX or atom.minY or atom.maxY):
            return

        missingVals = False
        incorrectFormat = False
        for val in [atom.minX, atom.maxX, atom.minY, atom.maxY]:
            # an unset value is a gap in the data, not a formatting problem
            if val is None or val == '':
                missingVals = True
                continue
            try:
                float(val)
            except (TypeError, ValueError):
                incorrectFormat = True

        spatialError = ""
        if missingVals:
            spatialError += "Incomplete spatial coverage data.%s" %self._nl
        if incorrectFormat:
            spatialError += "Spatial coverage data not in numerical format."

        if spatialError:
            self.__addError('spatialcoverage', spatialError)


    def __checkTemporalRange(self):
        '''
        Check that the start/end dates, where set, match the atom's date format
        and that the start date is not after the end date
        '''
        atom = self._atom
        if not (atom.t1 or atom.t2):
            return

        timeErrors = ''
        parsed = {}
        for label, value in (('start', atom.t1), ('end', atom.t2)):
            if not value:
                continue
            try:
                parsed[label] = datetime.datetime.strptime(value, atom.YEAR_FORMAT)
            except (TypeError, ValueError):
                timeErrors += "Incorrect %s date format - '%s' - c.f. '2008-04-12'. %s" \
                    %(label, value, self._nl)

        # the range can only be checked when both dates were understood
        if 'start' in parsed and 'end' in parsed:
            d1 = parsed['start']
            d2 = parsed['end']
            if d1 > d2:
                timeErrors += "Inconsistent date range - '%s' is not before '%s'" \
                    %(strftime(d1, atom.YEAR_FORMAT), strftime(d2, atom.YEAR_FORMAT))

        if timeErrors:
            self.__addError('temporalrange', timeErrors)


    def __validateUnicode(self):
        '''
        Look over the string attributes of the atom for non utf-8 compliant
        characters; offending values are reported under the attribute name
        '''
        logging.info("Validating unicode UTF-8 compliance")
        for key, val in self._atom.__dict__.items():
            if val and isinstance(val, basestring) and not isValidUnicode(val):
                # recorded via __addError so that every entry of the error dict
                # is a list of messages, as logErrors expects
                self.__addError(key, "Illegal unicode found in string: '%s'." %val)
        logging.info("Completed validating unicode UTF-8 compliance")


    def __validateLinks(self):
        '''
        Check the external links contained in the atom and ensure they are valid
        '''
        logging.info("Validating atom links")
        for link in self._atom.relatedLinks:
            if not link.hasValue():
                continue

            # don't lookup link, if it has already been validated before
            if link.href in self._validLinks:
                continue

            try:
                isValid = simpleURLCheck(link.href)
            except Exception:
                # picked up via sys.exc_info() so the handler needs neither the
                # 'except X, e' nor the 'except X as e' spelling
                e = sys.exc_info()[1]
                errorMessage = self.__exceptionText(e)
                if errorMessage.startswith('unknown url type'):
                    errorMessage += " - NB, url must be of format, 'http://blah.co.uk'"
                self.__addError(self.BROKEN_LINKS, errorMessage)
                continue

            if isValid:
                self._validLinks.append(link.href)
            else:
                self.__addError(self.BROKEN_LINKS, "Broken link: '%s'" %link.href)

        logging.info("Completed link validation")


    def __validateVocabData(self):
        '''
        Check the vocab data contained in the atom and ensure they are valid
        '''
        logging.info("Validating atom vocab data")
        for category in self._atom.parameters:
            if category.hasValue():
                self.__validateTermURL(category.scheme)

        # also check the terms used in the links
        for link in self._atom.relatedLinks:
            if link.hasValue():
                self.__validateTermURL(link.rel)
        logging.info("Completed vocab data validation")


    def __validateTermURL(self, url):
        '''
        Check the specified vocab url - and add any encountered errors
        to the global error collection.  Also add any validated urls
        to the global valid term collection.
        @param url: url string representing a vocab term
        '''
        # don't lookup term, if it has already been validated before
        if url in self._validVocabTerms or url in self.VALID_RELS:
            logging.info("- term is valid")
            return

        if isValidTermURI(url):
            logging.info("- term is valid")
            self._validVocabTerms.append(url)
        else:
            logging.info("- term is invalid")
            self.__addError(self.INVALID_VOCAB_TERM, \
                            "Invalid vocab term: '%s'" %url)


    def __validateSchemaCompliance(self):
        '''
        Validate the atom, against the atom xsd, using eXist validation
        facilities.  A failure to run the check at all is itself recorded as a
        schema validation error rather than being raised.
        '''
        logging.info("Validating schema compliance")
        atomPath = self._atom.getDefaultCollectionPath() + self._atom.atomName
        try:
            errors = self._eXist.checkAtomSchemaCompliance(atomPath, atom = self._atom, \
                                                           isDebug = self._isDebug)
            for error in errors:
                self.__addError(self.SCHEMA_VALIDATION_FAILURE, error)

        except Exception:
            # picked up via sys.exc_info() so the handler needs neither the
            # 'except X, e' nor the 'except X as e' spelling
            e = sys.exc_info()[1]

            # check for a meaningful error message
            error = self.__exceptionText(e)
            if isinstance(e, socket.error):
                # socket errors are normally (errno, message) pairs, but some -
                # e.g. timeouts - only carry a single argument
                if len(e.args) > 1:
                    error = e.args[1]
            elif isinstance(e, xmlrpclib.Fault):
                # strip out the exception type - NB, this is usually native library code
                # and is of no real interest - and will just confuse viewers
                error = e.faultString.split(':')[-1]

            errorMessage = "Problem experienced when validating against schema:%s'%s'" \
                %(self._nl, error)

            logging.error(errorMessage)
            self.__addError(self.SCHEMA_VALIDATION_FAILURE, errorMessage)
        logging.info("Completed validating schema compliance")


    def __exceptionText(self, exc):
        '''
        Get a printable description of an exception
        @param exc: the exception to describe
        @return: the exception's 'message' attribute when it has a non-empty
        one, otherwise the str() of the exception
        '''
        # 'message' is blank for exceptions built from several arguments and is
        # missing altogether on newer interpreters
        text = getattr(exc, 'message', '')
        if not text:
            text = str(exc)
        return text


    def __addError(self, errorLabel, errorMessage):
        '''
        Add an error with the specified label and message to the error dict
        @param errorLabel: type of error to add - used as the error dict key
        @param errorMessage: error message to append to the list for this key
        '''
        logging.debug("Adding error to error list")
        logging.debug(errorMessage)

        self.errors.setdefault(errorLabel, []).append(errorMessage)
        logging.debug("Error added")


    def logErrors(self):
        '''
        Outputs any errors caught during validation to log
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        logging.info('')
        logging.info("- atom ID '%s'" %self._atom.atomID)
        logging.info("--------------------------------------")
        for errors in self.errors.values():
            for error in errors:
                logging.info(error)
        logging.info("--------------------------------------")