source: TI01-discovery-Ingest/trunk/v3n_NDG3/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py @ 6849

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery-Ingest/trunk/v3n_NDG3/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py@6849
Revision 6849, 7.5 KB checked in by sdonegan, 9 years ago (diff)

Put in workaround to deal with datacentres that insist on using 'gcmd' as format rather than 'dif' - problem when using oai_info_editor - debugged the debug!

  • Property svn:executable set to *
Line 
#!/usr/bin/env python
'''
 Main script to do the document ingest from the OAI harvested files to the
 discovery postgres DB.  NB, can be run for all datacentres using the run_all_ingest.py script
 or can specify an individual datacentre to run the ingester on.
 As well as doing the ingest, a backup directory is created to store the created moles files.
'''
import logging
import os
import shutil
import sys
# annoyingly, an import (CSML file, I think) sets the logging config during imports - so set this
# here to get there first - since you can only set the config once
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
from time import strftime
import ndg.common.src.lib.fileutilities as FileUtilities
from abstractdocumentingester import AbstractDocumentIngester
16
class oai_document_ingester(AbstractDocumentIngester):
        '''
        Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB
        - including running the various transforms and parsings to get all doc types and spatiotemporal
        data in the correct form in the DB
        '''

        # Path of a single file to ingest; while this is empty processDataCentre
        # ingests everything found below the harvest directory instead.
        indFileToIngest = ""

        def processDataCentre(self, dataCentre, harvestDir = None, dataFormat = None, configFileName = None):
                '''
                Ingest documents from the specified data centre
                @param dataCentre: data centre to ingest docs from
                @keyword harvestDir: directory to get docs from - NB, this will override that
                specified in the associated config file.  Typically this is used when a manual
                harvest has retrieved docs to a local dir (see OAIInfoEditor.lib.harvester).
                @keyword dataFormat: format of data to ingest.  Overrides config file settings.
                The non standard name 'gcmd' is treated as 'DIF'.
                @keyword configFileName: processing config file to read; when None the file
                registered through setOaiConfigFile is used.
                @return isSuccess, processingReport: isSuccess = True if ingest completes ok
                (an empty harvest directory counts as ok), processingReport = summary of the
                ingest process, or the reason nothing was ingested.
                NB, the script exits (SystemExit) if the pristine copy of the files cannot be made.
                '''
                self._no_files_ingested = 0
                self._no_files_changed = 0
                self._no_files_deleted = 0
                self._no_problem_files = 0

                self._error_messages = ''
                self.dataCentre = dataCentre

                # extract relevant directories etc from processing config file
                if configFileName is None:
                        self.processingDict = self.getProcessingConfig(self.oaiEditorConfig)
                else:
                        self.processingDict = self.getProcessingConfig(configFileName)

                self._code_dir = self.processingDict['code_directory']
                self._base_dir = self.processingDict['base_directory']

                self._databaseConfigurationFile = self.processingDict['ingestConfig']
                self._ndgRedirectURL = self.processingDict['NDG_redirect_URL']

                self.processThread = 'OAI'

                self._setupDataCentreDirs()

                # Change os directory to that with the harvested documents in it.
                os.chdir(self._base_dir)

                self.getConfigDetails(dataCentre)
                # override default settings with input keyword values, if set
                if harvestDir:
                        self._harvest_home = harvestDir

                if dataFormat:
                        # fudge to deal with "gcmd" non standard formats used by some dc's - easier
                        # to deal with here than faffing with editor code
                        if dataFormat == 'gcmd':
                                self._datacentre_format = 'DIF'
                        else:
                                self._datacentre_format = dataFormat

                # work out which files to take a pristine copy of; every early exit hands back the
                # same (isSuccess, message) shape as the normal return so callers can always unpack
                if not self.indFileToIngest:
                        if not os.path.exists(self._harvest_home):
                                message = "Harvest directory for dataCentre %s (%s) could not be found - exiting" \
                                                 %(dataCentre, self._harvest_home)
                                logging.error(message)
                                return False, message
                        if not os.listdir(self._harvest_home):
                                message = "Nothing to harvest this time from %s" %dataCentre
                                logging.info(message)
                                return True, message
                        sourceFiles = self._listHarvestFiles()
                else:
                        # must be looking for an individual file to upload
                        if not os.path.exists(self.indFileToIngest):
                                message = "Specified file does not exist"
                                logging.warning(message)
                                return False, message
                        sourceFiles = [self.indFileToIngest]

                self._stagePristineCopies(sourceFiles)

                self._setupXQueries()

                # Process the resulting files and put the data into the postgres DB
                # - firstly set up a db connection to use
                self._getPostgresDBConnection()

                numfilesproc, processingReport = self._convertAndIngestFiles(self.originals_dir, self.discovery_dir, dataCentre, True)

                logging.info("oai_document_ingest processing complete:")
                isSuccess = self._no_problem_files == 0
                if isSuccess:
                        logging.info("All files successfully processed - cleaning harvest directory")
                        #FileUtilities.cleanDir(self._harvest_home) # TODO: uncomment this!
                else:
                        logging.error("Problems experienced with %s files" %self._no_problem_files)
                        logging.error("- harvest directory will not be cleared until these have been fixed and the script has been rerun")

                logging.info(self.lineSeparator)
                logging.info('Number of files processed = %s\n' %numfilesproc)
                logging.info("Number of files created = %s\n" %self._no_files_ingested)
                logging.info("Number of files updated = %s\n" %self._no_files_changed)

                # the report from the conversion step carries the detail; add any errors to it
                if self._error_messages:
                        processingReport += 'Errors: %s' %self._error_messages

                print("Script finished running.")
                return isSuccess, processingReport

        def _listHarvestFiles(self):
                '''
                Collect the regular files found at any depth below the harvest directory.
                Symbolic links are left out, i.e. the same selection as "find -type f".
                @return: list of file paths
                '''
                harvested = []
                for dirPath, dirNames, fileNames in os.walk(self._harvest_home):
                        for fileName in fileNames:
                                candidate = os.path.join(dirPath, fileName)
                                if os.path.isfile(candidate) and not os.path.islink(candidate):
                                        harvested.append(candidate)
                return harvested

        def _stagePristineCopies(self, sourceFiles):
                '''
                Copy each given file into self.originals_dir (all files land directly in that
                directory, whatever sub directory they came from).  Copying is done in process
                rather than through a shell so that spaces or quotes in a path cannot break, or
                be abused in, a command line.
                @param sourceFiles: paths of the files to copy
                NB, exits the script if any file cannot be copied.
                '''
                logging.info("Copying %s file(s) to %s", len(sourceFiles), self.originals_dir)
                for sourceFile in sourceFiles:
                        try:
                                shutil.copy(sourceFile, self.originals_dir)
                        except EnvironmentError:
                                # covers IOError, OSError and shutil.Error
                                logging.exception("Could not copy %s to %s", sourceFile, self.originals_dir)
                                sys.exit("Failed at making pristine copy stage")

        def setIndFileToIngest(self, indFileToIngest):
                '''
                Method to set individual file to ingest if "individualFile" is invoked
                @param indFileToIngest: path of the file to ingest
                '''
                self.indFileToIngest = indFileToIngest

        def setOaiConfigFile(self, configFilePath):
                '''
                Set the path to the OAI configuration file - directories etc used for data & reporting etc etc
                @param configFilePath: path of the config file; an empty value leaves any
                existing setting untouched
                '''
                print("**********************************************************************")
                if configFilePath:
                        self.oaiEditorConfig = configFilePath
                        logging.info("Using configuration file at: " + configFilePath)

        def usage(self):
                '''
                Display input params for the script, then exit with status 2
                '''
                print("Usage: python oai_document_ingester.py [OPTION] <datacentre>")
                print(" - where:\n   <datacentre> is the data centre to ingest data from; and options are:")
                print(" -v - verbose mode for output logging")
                print(" -d - debug mode for output logging")
                print(" individualFile= - specify individual file to upload rather than batch processing as defined in properties file.  \n      (NOTE: script still uses properties file for other parameters)\n")

                sys.exit(2)
191               
192       
if __name__ == "__main__":

        # Banner marking the start of a run in the console output
        print("=================================")
        print("RUNNING: oai_document_ingester.py")

        # The first entry of whatever _setupCmdLineOptions() returns is used as the
        # data centre to ingest
        docIngester = oai_document_ingester()
        cmdLineArgs = docIngester._setupCmdLineOptions()
        docIngester.processDataCentre(cmdLineArgs[0])
Note: See TracBrowser for help on using the repository browser.