source: TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py @ 5241

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py@5241
Revision 5241, 8.6 KB checked in by cbyrom, 12 years ago (diff)

Extend harvest functionality to allow it to run over all provider info

  • and to expose an entry point for scripts to use.
Line 
1'''
2 Class wrapping the jOAI Harvester API - to harvest specified provider repository data
3 
4 @author: C Byrom, Tessella Feb 2009
5'''
6import logging, commands, os, sys
7from oai_info_editor.model.repositoryinfo import RepositoryInfo
8from oai_info_editor.dal.providerinfodao import *
9from ndg.common.src.models.myconfig import myConfig
10
11
class Harvester(object):
    '''
    Wrapper around the jOAI Harvester java command line client - used to
    harvest the records of a provider repository into a local directory.

    The java client is run as a child process, without going through a shell,
    so characters such as '&' or spaces in urls and set specs are passed to
    the client untouched.
    '''

    # Original shell command template for the java client - retained only for
    # backward compatibility with code referencing this attribute; the command
    # is now built as an argument list (see __constructHarvestArgs)
    # NB, the inputs required are, outdir, baseURL and format
    JAVA_COMMAND = 'java %s -cp lib/DLESETools.jar:lib/jdom-b7.jar:lib/xercesImpl.jar:lib/xml-apis.jar org.dlese.dpc.oai.harvester.Harvester %s %s %s'

    # classpath and main class of the java client; NB, the classpath is
    # relative, so the harvest must be run from the directory holding 'lib'
    JAVA_CLASSPATH = 'lib/DLESETools.jar:lib/jdom-b7.jar:lib/xercesImpl.jar:lib/xml-apis.jar'
    JAVA_MAIN_CLASS = 'org.dlese.dpc.oai.harvester.Harvester'

    # marker in the client output indicating the requested format was rejected
    FORMAT_ERROR = 'cannotDisseminateFormat'

    def __init__(self, outDir = None, proxyHost = '130.246.135.176',
                 proxyPort = '8080', configFile = None):
        '''
        Constructor - initialise the Harvester class
        @keyword outDir: directory to harvest files to - NB,
        this typically doesn't change for different harvests - which is why it
        is set in the constructor
        @keyword proxyHost: proxy host to use in comms - defaults to wwwcache.rl.ac.uk IP address;
        if this evaluates to False, no proxy settings are passed to the java client
        @keyword proxyPort: port for the proxy host to use - defaults to '8080'
        @keyword configFile: ini file to define the outDir and the data for accessing the
        OAI info editor data.  NB, the settings in configFile override the outDir keyword
        '''
        logging.debug("Initialising Harvester object")
        self.outDir = outDir
        self.proxyHost = proxyHost
        self.proxyPort = proxyPort
        self.cf = None
        if configFile:
            self.cf = myConfig(configFile)
            configuredDir = self.cf.get('DATA_STORE', 'harvestDir')
            # only override the keyword value if the config actually sets a dir
            if configuredDir:
                self.outDir = configuredDir

        # data access object for retrieving all providers info - NB, this is
        # only created when first needed, by harvestAll
        self.dao = None
        logging.info("Harvester initialised")


    def harvestRepository(self, repositoryInfo, outDir = None):
        '''
        Harvest docs from the specified repository

        NB, the harvest format is case dependent and this can vary across
        services; if the harvest fails with a 'cannotDisseminateFormat' error,
        it is retried once with the opposite casing of the format.  If the
        retry succeeds, repositoryInfo.dataFormat is left set to the casing
        that worked; if the retry also fails, the original value is restored.
        @param repositoryInfo: RepositoryInfo object with data on the repository to
        be harvested
        @keyword outDir: directory to harvest files to - if not set, the outDir
        of this Harvester is used
        @raise ValueError: if repositoryInfo is not a RepositoryInfo object or
        if no output directory is available
        @return status, outMessage: Status = True, if successful, False otherwise
        outMessage = the full output of the java client if successful, or a one
        line digest of the error (with apostrophes removed) if harvesting fails
        '''
        logging.info("Running data harvest")
        if not isinstance(repositoryInfo, RepositoryInfo):
            # NB, wrap in a tuple so that tuple inputs do not break the formatting
            raise ValueError("Input object, '%s' is not of type 'RepositoryInfo'" %(repositoryInfo,))

        localDir = outDir or self.outDir
        if not localDir:
            raise ValueError("No output directory specified - cannot harvest data.")
        logging.info("- for data at, '%s'", repositoryInfo.url)

        # NB, the harvester done via the web interface automatically creates a
        # local dir for the repository info - using the url and format; to keep
        # things consistent, do this here
        # NB, this dir is worked out once, using the configured format, so a
        # retry with different format casing still harvests to the same dir
        localHarvestDir = self.__getLocalRepositoryDir(repositoryInfo, localDir)

        status, message = self.__runHarvest(localHarvestDir, repositoryInfo)

        if self.__harvestFailed(status, message):
            logging.error("Problem occurred whilst running harvest: %s", message)
            if self.FORMAT_ERROR in message:
                oldFormat = repositoryInfo.dataFormat
                if oldFormat.islower():
                    newFormat = oldFormat.upper()
                else:
                    newFormat = oldFormat.lower()
                repositoryInfo.dataFormat = newFormat

                logging.info("- retrying harvest using format with new casing ('%s' vs '%s')",
                             oldFormat, newFormat)

                status, message = self.__runHarvest(localHarvestDir, repositoryInfo)
                if self.__harvestFailed(status, message):
                    logging.error("Problem occurred whilst running harvest: %s", message)
                    # neither casing worked - do not leave the caller's object
                    # holding a format they never configured
                    repositoryInfo.dataFormat = oldFormat

        if self.__harvestFailed(status, message):
            return False, self.__extractErrorMessage(message)

        logging.info("- harvest completed successfully")
        return True, message


    def __harvestFailed(self, status, message):
        '''
        Determine whether a run of the java client failed
        @param status: exit status of the java client
        @param message: output of the java client
        @return True if the status is non-zero or the output reports that the
        format could not be disseminated; False otherwise
        '''
        return bool(status) or self.FORMAT_ERROR in message


    def __extractErrorMessage(self, message):
        '''
        Retrieve the pertinent part of the java client output to return to the user
        @param message: full output of the failed java client run
        @return single line of the output, with any apostrophes removed
        '''
        data = message.split('\n')
        # NB, usually the penultimate line has the clearest digest of the error on it
        # - but for format errors (and single line output) use the last line
        outMessage = data[-1]
        if len(data) > 1 and self.FORMAT_ERROR not in message:
            outMessage = data[-2]

        # NB, the message will be displayed in a javascript pop up - this doesn't like
        # apostrophes so remove any in the message string
        return outMessage.replace('\'', '')


    def __runHarvest(self, localHarvestDir, repositoryInfo):
        '''
        Run the java client once for the specified repository
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return status, message: exit status of the java client (non-zero on
        failure) and its combined stdout/stderr output, minus one trailing newline
        '''
        # imported here to keep this class self contained
        import subprocess

        harvestArgs = self.__constructHarvestArgs(localHarvestDir, repositoryInfo)
        logging.debug(" - using command, '%s'", ' '.join(harvestArgs))

        try:
            process = subprocess.Popen(harvestArgs, stdout = subprocess.PIPE,
                                       stderr = subprocess.STDOUT)
            output = process.communicate()[0]
            status = process.returncode
        except OSError:
            # e.g. java is not installed - report this as a failed harvest
            # rather than letting the exception escape; NB, sys.exc_info is
            # used to stay compatible with old and new exception syntax
            return 1, "Could not run java harvester: %s" %(sys.exc_info()[1],)

        # the output is bytes on newer python versions - convert to a string
        if not isinstance(output, str):
            output = output.decode('utf-8', 'replace')

        # drop a single trailing newline, so the last line of output is the
        # last element when the message is split into lines
        if output.endswith('\n'):
            output = output[:-1]
        return status, output


    def __getLocalRepositoryDir(self, repositoryInfo, baseDir):
        '''
        Use the input repository info to create a local dir name for the harvested
        data to be stored in - NB, this is intended to match what is done by the
        jOAI web interface - which oddly isn't replicated by the jOAI Harvester API

        The name is made up of the host (and port) of the repository url, with
        '.' and ':' replaced by '-', followed by '-' and the data format.
        NB, any leading url scheme is removed - not just 'http://' - so that
        repositories using, e.g. 'https://' are not all mapped to the same dir
        (TODO: confirm the jOAI web interface names https repositories the same way)
        @param repositoryInfo: RepositoryInfo for the repository being harvested
        @param baseDir: base directory for harvesting from
        @return localHarvestDir: string dir representing the directory to harvest
        the repository records to
        '''
        logging.debug("Determining local directory for harvest")
        hostPart = repositoryInfo.url
        # only treat '://' as a scheme separator if it precedes the first '/'
        # - otherwise it belongs to the path or query of a scheme-less url
        schemeEnd = hostPart.find('://')
        if schemeEnd > -1 and '/' not in hostPart[:schemeEnd]:
            hostPart = hostPart[schemeEnd + 3:]
        hostPart = hostPart.split('/')[0]

        dirName = hostPart.replace('.', '-').replace(':', '-')
        dirName += '-%s' %repositoryInfo.dataFormat
        localHarvestDir = os.path.join(baseDir, dirName)
        logging.debug(" - harvesting to '%s'", localHarvestDir)
        return localHarvestDir


    def __constructHarvestArgs(self, localHarvestDir, repositoryInfo):
        '''
        Construct the java command required to do the appropriate harvest, as a
        list of arguments suitable for running without a shell
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return list of strings: the java executable followed by its arguments
        '''
        harvestArgs = ['java']
        if self.proxyHost:
            harvestArgs += ['-DproxySet=true',
                            '-DproxyHost=%s' %self.proxyHost,
                            '-DproxyPort=%s' %self.proxyPort]

        harvestArgs += ['-cp', self.JAVA_CLASSPATH, self.JAVA_MAIN_CLASS,
                        '%s' %localHarvestDir,
                        '%s' %repositoryInfo.url,
                        '%s' %repositoryInfo.dataFormat]

        if repositoryInfo.setSpec:
            harvestArgs.append('-set:%s' %repositoryInfo.setSpec)

        if repositoryInfo.splitBySet:
            harvestArgs.append('-splitBySet:%s' %repositoryInfo.splitBySet)
        return harvestArgs


    def harvestAll(self):
        '''
        Harvest all data specified in the oai info editor files

        Each repository of each provider returned by the data access object is
        harvested in turn; a failed harvest is logged and does not stop the
        remaining repositories from being harvested.
        @raise ValueError: if this Harvester was created without a config file
        '''
        logging.info("Harvesting all available provider info")
        if not self.cf:
            raise ValueError("No config file available - cannot get info to harvest all provider info.")

        if not self.dao:
            # NB, createDAOWithClient and FILE_CLIENT_TYPE are not defined in
            # this class - presumably provided by the providerinfodao import
            self.dao = createDAOWithClient(client = FILE_CLIENT_TYPE,
                                           configFile = self.cf)

        failures = 0
        for pi in self.dao.getAllProviderInfo():
            logging.info("Harvesting info for provider, '%s'", pi.name)
            for ri in pi.repositoryInfos:
                logging.info(" - harvesting repository info, '%s'", ri.name)
                status, message = self.harvestRepository(ri)
                if not status:
                    failures += 1
                    logging.warning(" - harvest of repository info, '%s' failed: %s",
                                    ri.name, message)

        if failures:
            logging.warning("- %s repository harvest(s) failed", failures)
        logging.info("- harvesting complete")
180   
181       
# entry point for running as a script - e.g. via crontab
def main(argv = None):
    '''
    Harvest all provider info defined by the config file named on the command line
    @keyword argv: argument list, with the script name first and the config
    file second; defaults to sys.argv
    NB, if no config file is specified, the usage message is written to stderr
    and the script exits with a non-zero status - rather than raising an
    exception and so producing a traceback for a simple usage error
    '''
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        sys.exit("Usage: harvester <configFile>\n\n- NB, config file should be the ini file used by the oai info editor.")

    loggingLevel = logging.DEBUG
    logging.basicConfig(level = loggingLevel,
                        format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')

    h = Harvester(configFile = argv[1])
    h.harvestAll()


if __name__=="__main__":
    main()
193   
Note: See TracBrowser for help on using the repository browser.