source: ndgCommon/trunk/ndg/common/src/tools/linkvalidator.py @ 4936

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/ndgCommon/trunk/ndg/common/src/tools/linkvalidator.py@4936
Revision 4936, 4.7 KB checked in by cbyrom, 11 years ago (diff)

Add command line tools - to validate links + atoms in eXist.

Line 
1#!/usr/bin/env python
2'''
3 Command line tool to validate the internal and external links featured in Published
4 atoms in a specified eXist DB
5 
6 @author: C Byrom, Tessella Nov 2008
7'''
8import os, sys, string, getopt, logging, re
9import ndgUtils.models.existdbclient as edc
10from ndgUtils.vocabtermdata import VocabTermData as VTD
11from ndgUtils.models.Atom import Atom
12from ndgUtils.lib.atomvalidator import AtomValidator
13import httplib, urlparse
14from minixsv import pyxsval
15import traceback
16
17   
class linkValidator:
    '''
    Command line tool for checking links in an eXist atom collection.

    Instantiating the class runs the whole tool: the command line options
    are read, an AtomValidator is created from DBCONFIG_FILE, every atom
    whose collection path contains 'published' is validated and any errors
    reported by the validator are logged.
    '''
    # config file with eXist DB details
    DBCONFIG_FILE = "passwords.txt"

    # standard output delimiter
    LINE_SEPARATOR = "-----------------------------"

    # constants to use as error dict keys
    BROKEN_LINKS = 1
    INVALID_VOCAB_TERM = 2
    SCHEMA_VALIDATION_FAILURE = 4

    VALID_RELS = ["self", "related"]

    def __validateAtom(self, atomPath):
        '''
        Retrieve an atom from the specified path and validate the contents.

        Errors reported by the validator are stored in self._errors, keyed
        on the atom ID.  Atoms without errors are not recorded, so the
        duplicate ID check only compares against atoms that had errors.
        @param atomPath: path to the atom in the eXist DB
        @raise ValueError: if an atom with the same ID is already present
        in self._errors
        '''
        logging.info("Validating atom, '%s'" % atomPath)
        logging.info("- retrive atom from DB...")
        atomString = self.validator._eXist.getEXistFile(atomPath)
        atom = Atom(xmlString=str(atomString))

        # refuse to overwrite the errors already recorded under this ID
        if atom.atomID in self._errors:
            raise ValueError("Atom with duplicate ID (%s) encountered - this needs to be fixed in the DB" % atom.atomID)

        self.validator.setAtom(atom)
        self.validator.validateAtom()
        errors = self.validator.errors

        logging.info("Atom validation completed")

        # test the validator result directly: self._errors has no entry for
        # a valid atom, so it must not be indexed here
        if errors:
            self._errors[atom.atomID] = errors
            logging.info("- atom is invalid")
        else:
            logging.info("- atom is valid")

    def __displayErrors(self):
        '''
        Display any errors caught during validation.

        Each entry of self._errors is expected to be a dict whose values
        are sequences of error messages; empty entries are skipped.
        '''
        logging.info("The following errors were encountered when validating the atoms:")
        for atomID, errorDict in self._errors.items():
            if not errorDict:
                continue
            logging.info('')
            logging.info("- atom ID '%s'" % atomID)
            logging.info("--------------------------------------")
            for errors in errorDict.values():
                for error in errors:
                    logging.info(error)
            logging.info("--------------------------------------")

    def _setUpOptions(self):
        '''
        Determine the logging level to use and configure this appropriately.

        Recognised options are -v (INFO level) and -d (DEBUG level); the
        default level is WARNING.  If both are given, the last one wins.
        An unrecognised option prints the getopt message followed by the
        usage text, and the script exits with status 2.
        '''
        try:
            opts = getopt.getopt(sys.argv[1:], "vd")[0]
        except getopt.GetoptError:
            # sys.exc_info() is used instead of "except ..., err" or
            # "except ... as err" so the syntax is valid on every Python
            # version; message is like "option -a not recognized"
            print(str(sys.exc_info()[1]))
            # usage() calls sys.exit(2), so the code below never runs with
            # opts unbound
            self.usage()

        loggingLevel = logging.WARNING
        for o, a in opts:
            if o == "-v":
                print(" - Verbose mode ON")
                loggingLevel = logging.INFO
            elif o == "-d":
                print(" - Debug mode ON")
                loggingLevel = logging.DEBUG

        print(self.LINE_SEPARATOR)
        logging.basicConfig(level=loggingLevel,
                            format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')

    def usage(self):
        '''
        Display input params for the script, then exit with status 2
        '''
        print("Usage: python linkValidator.py [OPTION]")
        print(" - where options are:")
        print(" -v - verbose mode for output logging")
        print(" -d - debug mode for output logging")
        print("\neXist DB details should be stored in a config file called, '%s'" % self.DBCONFIG_FILE)
        sys.exit(2)

    def __init__(self):
        '''
        Main entry point for script: set up logging, validate every atom
        held in a 'published' collection and report the errors found.
        '''
        print(self.LINE_SEPARATOR)
        print("RUNNING: linkValidator.py")

        self._setUpOptions()

        self.validator = AtomValidator(None, dbConfigFile=self.DBCONFIG_FILE)

        # errors found so far: atom ID -> errors reported by the validator
        self._errors = {}

        # NB, the validator's eXist client is expected to have loaded the
        # atom collection data already - step through it here
        for atom, collection in self.validator._eXist.collections.items():
            if 'published' in collection:
                self.__validateAtom(collection + '/' + atom + '.atom')

        if self._errors:
            self.__displayErrors()
        logging.info("linkValidator processing complete")
140       
141   
if __name__ == "__main__":
    # The command line options are parsed inside linkValidator
    # (see _setUpOptions), so nothing is parsed here; the constructor
    # runs the whole tool.
    linkValidator()
145   
Note: See TracBrowser for help on using the repository browser.