source: TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py @ 5245

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/OAIInfoEditor/oai_info_editor/lib/harvester.py@5245
Revision 5245, 11.5 KB checked in by cbyrom, 11 years ago (diff)

Extend harvest functionality to also include ingest of harvested docs
+ improve error handling and catch special cases when creating
providers with names that already exist + improve UI look and feel.

Line 
'''
Class wrapping the jOAI Harvester API - to harvest specified provider repository data

@author: C Byrom, Tessella Feb 2009
'''
import logging
import commands
import os
import sys
from oai_info_editor.model.repositoryinfo import RepositoryInfo
from oai_info_editor.dal.providerinfodao import *
from ndg.common.src.models.myconfig import myConfig
from OAIBatch.oai_document_ingester import oai_document_ingester
11
12
class Harvester(object):
    '''
    Wrapper around the jOAI Harvester java client - harvests the records of
    a provider repository to a local directory and, optionally, ingests the
    harvested documents via the oai_document_ingester.
    '''

    # this is the command used to run the java client
    # NB, the inputs required are, in order: JVM options (the proxy settings),
    # outdir, baseURL and format
    JAVA_COMMAND = 'java %s -cp lib/DLESETools.jar:lib/jdom-b7.jar:lib/xercesImpl.jar:lib/xml-apis.jar org.dlese.dpc.oai.harvester.Harvester %s %s %s'

    # text looked for in the harvester output to detect that the repository
    # rejected the requested metadata format
    _FORMAT_ERROR = 'cannotDisseminateFormat'

    def __init__(self, outDir = None, proxyHost = '130.246.135.176',
                 proxyPort = '8080', configFile = None):
        '''
        Constructor - initialise the Harvester class
        @keyword outDir: directory to harvest files to - NB,
        this typically doesn't change for different harvests - which is why it
        is set in the constructor
        @keyword proxyHost: proxy host to use in comms - defaults to wwwcache.rl.ac.uk IP address
        @keyword proxyPort: port for the proxy host to use - defaults to '8080'
        @keyword configFile: ini file to define the outDir and the data for accessing the
        OAI info editor data.  NB, the settings in configFile override the outDir keyword
        '''
        logging.debug("Initialising Harvester object")
        self.outDir = outDir
        self.proxyHost = proxyHost
        self.proxyPort = proxyPort
        self.cf = None
        if configFile:
            self.cf = myConfig(configFile)
            # a harvestDir set in the config file takes precedence over the keyword
            configuredDir = self.cf.get('DATA_STORE', 'harvestDir')
            if configuredDir:
                self.outDir = configuredDir

        self.dao = None # data access object for retrieving all providers info - created on demand
        self.ingester = None # ingester script for adding the harvested data to the Discovery service - created on demand

        logging.info("Harvester initialised")


    def harvestRepository(self, repositoryInfo, outDir = None):
        '''
        Harvest docs from the specified repository
        NB, if the repository rejects the metadata format, the harvest is
        retried once with the case of the format swapped; in this case
        repositoryInfo.dataFormat is left set to the swapped value
        @param repositoryInfo: RepositoryInfo object with data on the repository to
        be harvested
        @keyword outDir: directory to harvest files to - if not set, the
        directory configured on the Harvester object is used
        @raise SystemError: if run on a Windows machine
        @raise ValueError: if repositoryInfo is not a RepositoryInfo object or
        if no output directory is available
        @return status, outMessage: Status = True, if successful, False otherwise
        outMessage = summary of harvest outcome
        '''
        logging.info("Running data harvest")

        # check we're not running on windows - this doesn't work since the commands
        # library for running system commands is only unix systems compatible
        # NB, this would be better placed in the constructor - since this would stop
        # the app from running in the first place; put it here temporarily to allow
        # testing of the app on windows
        if sys.platform.lower().startswith('win'):
            raise SystemError("Harvest functionality does not work on Windows machines " + \
                              "- application should be running on a Unix system.")

        localDir = outDir or self.outDir

        if not isinstance(repositoryInfo, RepositoryInfo):
            raise ValueError("Input object, '%s' is not of type 'RepositoryInfo'" %repositoryInfo)
        if not localDir:
            # fail with a clear message rather than a TypeError when building the path
            raise ValueError("No output directory specified - cannot run harvest")
        logging.info("- for data at, '%s'", repositoryInfo.url)

        # NB, the harvester done via the web interface automatically creates a
        # local dir for the repository info - using the url and format; to keep
        # things consistent, do this here
        localHarvestDir = self.__getLocalRepositoryDir(repositoryInfo, localDir)

        status, message, formatRejected = self.__runHarvest(localHarvestDir, repositoryInfo)

        # NB, the harvest format is case dependent and this can vary across services!
        # - if there is an error suggesting this is the problem, retry with opposite
        # case for format; the harvest directory is deliberately not recalculated
        if formatRejected:
            oldFormat = repositoryInfo.dataFormat
            if oldFormat.islower():
                repositoryInfo.dataFormat = oldFormat.upper()
            else:
                repositoryInfo.dataFormat = oldFormat.lower()

            logging.info("- retrying harvest using format with new casing ('%s' vs '%s')",
                         oldFormat, repositoryInfo.dataFormat)
            status, message, formatRejected = self.__runHarvest(localHarvestDir, repositoryInfo)

        data = message.split('\n')
        if status or formatRejected:
            # retrieve pertinent part of error message to return to user
            # NB, usually the penultimate line has the clearest digest of the error on it
            # - for format errors, or single line output, return just the last line
            outMessage = data[-1]
            if len(data) > 1 and not formatRejected:
                outMessage = data[-2]

            # NB, the message will be displayed in a javascript pop up - this doesn't like
            # apostrophes so remove any in the message string
            return False, outMessage.replace('\'', '')

        logging.info("- harvest completed successfully")
        # NB, the last line has the summary of the harvest - so just return this
        return True, data[-1]


    def ingestDocuments(self, providerName):
        '''
        Ingest harvested documents into the discovery service
        @param providerName: Name of provider whose documents should be ingested
        @return status, outMessage: Status = True, if successful, False otherwise
        outMessage = summary of ingest outcome; on failure this is a string
        describing the exception, with any apostrophes removed
        '''
        logging.debug("Running ingest of documents")
        if not self.ingester:
            self.ingester = oai_document_ingester()

        try:
            # NOTE(review): previously this referenced the undefined names
            # 'ingester' and 'datacentre', so the ingest could never run;
            # providerName is assumed to be the datacentre identifier expected
            # by processDataCentre - confirm against the ingester
            result = self.ingester.processDataCentre(providerName)
        except Exception:
            logging.exception("Exception thrown - detail: ")
            logging.info("Continue processing next datacentre config file...")
            # return a string, not the exc_info tuple, so that callers can
            # safely append it to other messages; apostrophes are removed as
            # the message may end up in a javascript pop up
            failure = "Document ingest failed: %s" %(sys.exc_info()[1],)
            return False, failure.replace('\'', '')

        logging.debug("Document ingest complete")
        return True, result


    def runHarvestAndIngest(self, providerName, repositoryInfo, outDir = None):
        '''
        Harvest and ingest documents into the discovery service
        NB, the ingest is only attempted if the harvest was successful
        @param providerName: Name of provider whose documents should be ingested
        @param repositoryInfo: RepositoryInfo object with data on the repository to
        be harvested
        @keyword outDir: directory to harvest files to
        @return status, outMessage: Status = True, if both harvest and ingest
        were successful, False otherwise
        outMessage = summary of harvest outcome followed, if the ingest was
        run, by the summary of ingest outcome
        '''
        logging.debug("Running harvest then ingest")
        isSuccess, outMessage = self.harvestRepository(repositoryInfo, outDir = outDir)
        if not isSuccess:
            logging.debug(" - harvest failed so avoid document ingest")
            return isSuccess, outMessage

        logging.debug("- harvest successful, so running ingest")
        isSuccess, ingestMessage = self.ingestDocuments(providerName)
        if ingestMessage is not None:
            # the ingest result is not guaranteed to be a string - so format it
            # rather than concatenating directly
            outMessage += ' %s' %(ingestMessage,)
        return isSuccess, outMessage


    def __runHarvest(self, localHarvestDir, repositoryInfo):
        '''
        Run the java harvest client once for the specified repository
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return status, message, formatRejected: exit status and output of the
        command, plus True if the output reports an unsupported metadata format
        '''
        harvestCMD = self.__constructHarvestCommand(localHarvestDir, repositoryInfo)
        logging.debug(" - using command, '%s'", harvestCMD)

        status, message = commands.getstatusoutput(harvestCMD)
        logging.debug("Harvest output: '%s'", message)

        formatRejected = self._FORMAT_ERROR in message
        if status or formatRejected:
            logging.error("Problem occurred whilst running harvest: %s", message)
        return status, message, formatRejected


    def __getLocalRepositoryDir(self, repositoryInfo, baseDir):
        '''
        Use the input repository info to create a local dir for the harvested data
        to be stored in - NB, this matches what is done by the jOAI web interface
        - which oddly isn't replicated by the jOAI Harvester API
        The directory name is the host (and port) of the repository url, with
        '.' and ':' replaced by '-', followed by '-' and the data format
        @param repositoryInfo: RepositoryInfo for the repository being harvested
        @param baseDir: base directory for harvesting from
        @return localHarvestDir: string dir representing the directory to harvest
        the repository records to
        '''
        logging.debug("Determining local directory for harvest")
        # drop any scheme (http://, https://, ...) and then any path from the url
        # - previously only 'http://' was removed, so all https repositories
        # ended up sharing a single 'https--<format>' directory
        host = repositoryInfo.url.split('://', 1)[-1].split('/')[0]
        dirName = host.replace('.', '-').replace(':', '-')
        dirName += '-%s' %repositoryInfo.dataFormat
        localHarvestDir = os.path.join(baseDir, dirName)
        logging.debug(" - harvesting to '%s'", localHarvestDir)
        return localHarvestDir


    @staticmethod
    def __shellQuote(value):
        '''
        Quote a value so that it is passed as a single, literal argument when
        included in a shell command
        @param value: value to quote - converted to a string first
        @return: shell escaped string
        '''
        try:
            from shlex import quote
        except ImportError:
            # python 2 - the quote function lives in the pipes module
            from pipes import quote
        return quote('%s' %(value,))


    def __constructHarvestCommand(self, localHarvestDir, repositoryInfo):
        '''
        Construct the java command required to do the appropriate harvest
        NB, all variable parts of the command are shell quoted - since the
        command is run via the shell and values such as urls may include
        characters which the shell would otherwise interpret (e.g. '&')
        @param localHarvestDir: the directory to harvest to
        @param repositoryInfo: the RepositoryInfo object representing the repository
        to be harvested
        @return harvestCMD: string with the command to run
        '''
        quote = self.__shellQuote

        proxyInfo = ''
        if self.proxyHost:
            proxyInfo = '-DproxySet=true -DproxyHost=%s -DproxyPort=%s' \
                %(quote(self.proxyHost), quote(self.proxyPort))

        harvestCMD = self.JAVA_COMMAND %(proxyInfo, quote(localHarvestDir),
                                         quote(repositoryInfo.url),
                                         quote(repositoryInfo.dataFormat))
        if repositoryInfo.setSpec:
            harvestCMD += ' -set:%s' %quote(repositoryInfo.setSpec)

        if repositoryInfo.splitBySet:
            harvestCMD += ' -splitBySet:%s' %quote(repositoryInfo.splitBySet)
        return harvestCMD


    def harvestAll(self):
        '''
        Harvest, and ingest, all data specified in the oai info editor files
        @raise ValueError: if the Harvester was created without a config file
        '''
        logging.info("Harvesting all available provider info")
        if not self.cf:
            raise ValueError("No config file available - cannot get info to harvest all provider info.")

        if not self.dao:
            self.dao = createDAOWithClient(client = FILE_CLIENT_TYPE,
                                           configFile = self.cf)

        for pi in self.dao.getAllProviderInfo():
            logging.info("Harvesting info for provider, '%s'", pi.name)
            for ri in pi.repositoryInfos:
                logging.info(" - harvesting repository info, '%s'", ri.name)
                isSuccess, outMessage = self.runHarvestAndIngest(pi.name, ri)
                if not isSuccess:
                    # record the failure and carry on with the remaining repositories
                    logging.warning(" - harvest/ingest failed for repository, '%s': %s",
                                    ri.name, outMessage)

        logging.info("- harvesting complete")
245   
246       
# entry point for running as a script - e.g. via crontab
if __name__ == "__main__":

    # log at debug level, prefixing each message with time, file, line and level
    logging.basicConfig(format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s',
                        level = logging.DEBUG)

    # the config file must be supplied as the first command line argument
    if not sys.argv[1:]:
        raise ValueError("Usage: harvester <configFile>\n\n- NB, config file should be the ini file used by the oai info editor.")

    # harvest everything defined by the config file
    Harvester(configFile = sys.argv[1]).harvestAll()
258   
Note: See TracBrowser for help on using the repository browser.