source: TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py @ 5239

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py@5239
Revision 5239, 6.7 KB checked in by cbyrom, 11 years ago (diff)

Add Harvester class to run the jOAI Harvester API and run repository
harvests. Add code throughout the app to allow harvests to be
run from the UI. Add new test data and tests to exercise the new
functionality and adjust the config file to allow input of required
harvest data. Add confirmation pop-up when running deletes.

Line 
1'''
2 Class wrapping the jOAI Harvester API - to harvest specified provider repository data
3 
4 @author: C Byrom, Tessella Feb 2009
5'''
6import logging, commands, os
7from oai_info_editor.model.repositoryinfo import RepositoryInfo
8
9
class Harvester(object):
    '''
    Wrapper around the jOAI Harvester command line client; builds the java
    command for a given repository, runs it and reports the outcome
    '''

    # this is the command used to run the java client
    # NB, the inputs required are, outdir, baseURL and format
    JAVA_COMMAND = 'java %s -cp lib/DLESETools.jar:lib/jdom-b7.jar:lib/xercesImpl.jar:lib/xml-apis.jar org.dlese.dpc.oai.harvester.Harvester %s %s %s'

    # text looked for in the harvester output to detect a rejected data format
    FORMAT_ERROR = 'cannotDisseminateFormat'

    def __init__(self, outDir = None, proxyHost = '130.246.135.176',
                 proxyPort = '8080'):
        '''
        Constructor - initialise the Harvester class
        @keyword outDir: directory to harvest files to - NB,
        this typically doesn't change for different harvests - which is why it
        is set in the constructor
        @keyword proxyHost: proxy host to use in comms - defaults to wwwcache.rl.ac.uk IP address;
        if this evaluates to False, no proxy options are passed to java
        @keyword proxyPort: port for the proxy host to use - defaults to '8080'
        '''
        logging.debug("Initialising Harvester object")
        self.outDir = outDir
        self.proxyHost = proxyHost
        self.proxyPort = proxyPort
        logging.info("Harvester initialised")


    def harvestRepository(self, repositoryInfo, outDir = None):
        '''
        Harvest docs from the specified repository
        @param repositoryInfo: RepositoryInfo object with data on the repository to
        be harvested.  NB, if the harvester output reports 'cannotDisseminateFormat', the
        harvest is retried once with the case of repositoryInfo.dataFormat
        flipped - and the dataFormat attribute is left set to the new casing
        @keyword outDir: directory to harvest files to - overrides the directory
        set in the constructor
        @raise ValueError: if repositoryInfo is not a RepositoryInfo object or
        if no output directory has been specified
        @return status, outMessage: Status = True, if successful, False otherwise
        outMessage = any error message received if harvesting fails; the full
        harvester output if it succeeds
        '''
        logging.info("Running data harvest")
        localDir = outDir or self.outDir

        if not isinstance(repositoryInfo, RepositoryInfo):
            raise ValueError("Input object, '%s' is not of type 'RepositoryInfo'"
                             % (repositoryInfo,))
        if not localDir:
            # fail with a clear message rather than a TypeError further down
            raise ValueError("No output directory specified for the harvest")
        logging.info("- for data at, '%s'", repositoryInfo.url)

        # NB, the harvester done via the web interface automatically creates a
        # local dir for the repository info - using the url and format; to keep
        # things consistent, do this here
        localHarvestDir = self.__getLocalRepositoryDir(repositoryInfo, localDir)

        status, message = self.__runHarvest(localHarvestDir, repositoryInfo)

        # NB, the harvest format is case dependent and this can vary across services!
        # - if there is an error suggesting this is the problem, retry with opposite
        # case for format (NB, the harvest dir keeps the name based on the original format)
        if self.FORMAT_ERROR in message:
            oldFormat = repositoryInfo.dataFormat
            if oldFormat.islower():
                repositoryInfo.dataFormat = oldFormat.upper()
            else:
                repositoryInfo.dataFormat = oldFormat.lower()

            logging.info("- retrying harvest using format with new casing ('%s' vs '%s')",
                         oldFormat, repositoryInfo.dataFormat)
            status, message = self.__runHarvest(localHarvestDir, repositoryInfo)

        if self.__harvestFailed(status, message):
            return False, self.__extractErrorMessage(message)

        logging.info("- harvest completed successfully")
        return True, message


    def __harvestFailed(self, status, message):
        '''
        Determine whether a harvest run failed
        @param status: exit status of the harvest command
        @param message: output produced by the harvest command
        @return: True if the status is non-zero or the output reports a
        format error, False otherwise
        '''
        return bool(status) or self.FORMAT_ERROR in message


    def __runHarvest(self, localHarvestDir, repositoryInfo):
        '''
        Construct and run the harvest command, logging any failure
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return status, message: exit status and output of the harvest command
        '''
        harvestCMD = self.__constructHarvestCommand(localHarvestDir, repositoryInfo)
        logging.debug(" - using command, '%s'", harvestCMD)

        status, message = self.__runCommand(harvestCMD)
        if self.__harvestFailed(status, message):
            logging.error("Problem occurred whilst running harvest: %s", message)
        return status, message


    def __runCommand(self, harvestCMD):
        '''
        Run the input command line, without involving a shell, and collect
        its combined stdout/stderr
        @param harvestCMD: command string, as produced by __constructHarvestCommand
        @return status, message: exit status of the command (non-zero on failure)
        and its output, with one trailing newline removed
        '''
        import shlex
        import subprocess

        try:
            process = subprocess.Popen(shlex.split(harvestCMD),
                                       stdout = subprocess.PIPE,
                                       stderr = subprocess.STDOUT,
                                       universal_newlines = True)
            message = process.communicate()[0]
        except OSError as error:
            # e.g. java executable not found - report as a failed harvest
            # rather than letting the exception escape to the caller
            return 1, str(error)

        if message.endswith('\n'):
            message = message[:-1]
        return process.returncode, message


    def __extractErrorMessage(self, message):
        '''
        Retrieve the pertinent part of the harvester output to return to the user
        @param message: full output from a failed harvest
        @return outMessage: single line digest of the error, with apostrophes removed
        '''
        data = message.split('\n')
        # NB, usually the penultimate line has the clearest digest of the error on it
        # - for format errors, or if there is only one line, return just the last line
        outMessage = data[-1]
        if len(data) > 1 and self.FORMAT_ERROR not in message:
            outMessage = data[-2]

        # NB, the message will be displayed in a javascript pop up - this doesn't like
        # apostrophes so remove any in the message string
        return outMessage.replace('\'', '')


    def __getLocalRepositoryDir(self, repositoryInfo, baseDir):
        '''
        Use the input repository info to create a local dir for the harvested data
        to be stored in - NB, this matches what is done by the jOAI web interface
        - which oddly isn't replicated by the jOAI Harvester API
        @param repositoryInfo: RepositoryInfo for the repository being harvested
        @param baseDir: base directory for harvesting from
        @return localHarvestDir: string dir representing the directory to harvest
        the repository records to - i.e. baseDir/<host>-<format>, with any '.'
        and ':' in the host (and port) replaced by '-'
        '''
        logging.debug("Determining local directory for harvest")
        # strip the scheme - NB, https must be handled too, otherwise every
        # https repository ends up sharing the same 'https--<format>' directory
        hostName = repositoryInfo.url.replace('https://', '').replace('http://', '')
        hostName = hostName.split('/')[0]
        hostName = hostName.replace('.', '-').replace(':', '-')

        localHarvestDir = baseDir + os.sep + '%s-%s' % (hostName, repositoryInfo.dataFormat)
        logging.debug(" - harvesting to '%s'", localHarvestDir)
        return localHarvestDir


    def __constructHarvestCommand(self, localHarvestDir, repositoryInfo):
        '''
        Construct the java command required to do the appropriate harvest
        NB, every value substituted into the command is shell quoted, so that
        characters such as '&', ';' or spaces in a url or set spec are passed
        to the harvester verbatim
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return harvestCMD: the command string to run
        '''
        try:
            from shlex import quote
        except ImportError:
            # python 2 keeps the same function in the pipes module
            from pipes import quote

        def quoted(value):
            # format first so that non-string values (ports, booleans) are accepted
            return quote('%s' % (value,))

        proxyInfo = ''
        if self.proxyHost:
            proxyInfo = '-DproxySet=true -DproxyHost=%s -DproxyPort=%s' \
                % (quoted(self.proxyHost), quoted(self.proxyPort))

        harvestCMD = self.JAVA_COMMAND % (proxyInfo, quoted(localHarvestDir),
                                          quoted(repositoryInfo.url),
                                          quoted(repositoryInfo.dataFormat))
        if repositoryInfo.setSpec:
            harvestCMD += ' -set:%s' % quoted(repositoryInfo.setSpec)

        if repositoryInfo.splitBySet:
            harvestCMD += ' -splitBySet:%s' % quoted(repositoryInfo.splitBySet)
        return harvestCMD
Note: See TracBrowser for help on using the repository browser.