source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py @ 5252

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py@5252
Revision 5252, 4.8 KB checked in by cbyrom, 10 years ago (diff)

Simplify error handling, improve output logging + standardise use of
upper case doc formats + switch off MDIP again since this mostly
breaks things.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2'''
3 Main script to do the document ingest from the OAI harvested files to the
4 discovery postgres DB.  NB, can be run for all datacentres using the run_all_ingest.py script
5 or can specify an individual datacentre to run the ingester on.
6 As well as doing the ingest, a backup directory is created to store the created moles files.
7'''
8import os, sys, logging
9# annoyingly, an import (CSML file, I think) sets the logging config during imports - so set this
10# here to get there first - since you can only set the config once
11logging.basicConfig(level=logging.DEBUG,
12                    format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
13from time import strftime
14import ndg.common.src.lib.fileutilities as FileUtilities
15from abstractdocumentingester import AbstractDocumentIngester
16
class oai_document_ingester(AbstractDocumentIngester):
    '''
    Class to handle the ingest of files from the OAI harvester to the discovery
    service postgres DB - including running the various transforms and parsings
    to get all doc types and spatiotemporal data in the correct form in the DB.

    The heavy lifting (config parsing, directory setup, XQuery setup, DB
    connection, conversion + ingest) is delegated to methods that are not
    defined in this class - presumably inherited from AbstractDocumentIngester.
    '''

    def processDataCentre(self, dataCentre, harvestDir = None, dataFormat = None):
        '''
        Ingest documents from the specified data centre
        @param dataCentre: data centre to ingest docs from
        @keyword harvestDir: directory to get docs from - NB, this will override that
        specified in the associated config file.  Typically this is used when a manual
        harvest has retrieved docs to a local dir (see OAIInfoEditor.lib.harvester).
        @keyword dataFormat: format of data to ingest.  Overrides config file settings.
        @return isSuccess, outMessage: isSuccess = True if ingest completes ok (or if
        there was nothing to harvest), False if the harvest directory is missing or
        any file was problematic; outMessage = summary of ingest process
        @raise SystemExit: if the pristine copy of the harvested files cannot be made
        '''
        # reset the per-run counters and messages
        self._no_files_ingested = 0
        self._no_problem_files = 0
        self._error_messages = ''
        self.dataCentre = dataCentre
        # this is the base dir that the script is run from
        self._base_dir = os.getcwd() + "/"
        self._setupDataCentreDirs()

        # make sure we are (still) working from the base dir once the data
        # centre dirs have been set up
        os.chdir(self._base_dir)

        self.getConfigDetails(dataCentre)
        # override default settings with input keyword values, if set
        if harvestDir:
            self._harvest_home = harvestDir
        if dataFormat:
            self._datacentre_format = dataFormat

        # check harvest dir exists and that there are any records to harvest.
        # NB, both early exits return the documented (isSuccess, outMessage)
        # pair so that callers can always unpack the result
        if not os.path.exists(self._harvest_home):
            message = "Harvest directory for dataCentre %s (%s) could not be found - exiting" \
                %(dataCentre, self._harvest_home)
            logging.error(message)
            return False, message
        elif len(os.listdir(self._harvest_home)) == 0:
            message = "Nothing to harvest this time from %s" %dataCentre
            logging.info(message)
            return True, message

        # take a pristine copy of the harvested files before processing them
        logging.info("Copying harvested files from %s to %s",
                     self._harvest_home, self.originals_dir)
        if not self._copyHarvestedFiles(self._harvest_home, self.originals_dir):
            sys.exit("Failed at making pristine copy stage")

        self._setupXQueries()

        # Process the resulting files and put the data into the postgres DB
        # - firstly set up a db connection to use
        self._getPostgresDBConnection()

        numfilesproc = self._convertAndIngestFiles(self.originals_dir, self.discovery_dir)

        outMessage = "OAI Document ingest processing complete:\n"
        logging.info("oai_document_ingest processing complete:")
        isSuccess = False
        if self._no_problem_files == 0:
            # only clear out the harvest dir when everything went through, so
            # that problem files can be fixed and the script rerun
            logging.info("All files successfully processed - cleaning harvest directory")
            FileUtilities.cleanDir(self._harvest_home)
            isSuccess = True
        else:
            logging.error("Problems experienced with %s files" %self._no_problem_files)
            logging.error("- harvest directory will not be cleared until these have been fixed and the script has been reran")

        # log the summary and accumulate it into the returned message
        logging.info(self.lineSeparator)
        message = 'Number of files processed = %s\n' %numfilesproc
        logging.info(message)
        outMessage += message
        message = "Number of files ingested = %s\n" %self._no_files_ingested
        logging.info(message)
        outMessage += message
        if self._error_messages:
            outMessage += 'Errors: %s' %self._error_messages
        logging.info(self.lineSeparator)
        print("Script finished running.")
        return isSuccess, outMessage


    def _copyHarvestedFiles(self, sourceDir, targetDir):
        '''
        Copy every regular file found under sourceDir (searched recursively)
        directly into targetDir.  Done in python rather than by shelling out to
        find/xargs/cp so that file names containing spaces or quotes are handled
        and no external tools are required.
        @param sourceDir: directory tree to copy files from
        @param targetDir: existing directory to copy the files into
        @return: True if all files were copied, False otherwise
        '''
        # function-scope import so this class does not rely on a file level import
        import shutil

        if not os.path.isdir(targetDir):
            logging.error("Target directory, %s, does not exist", targetDir)
            return False

        allCopied = True
        for root, dirNames, fileNames in os.walk(sourceDir):
            for fileName in fileNames:
                sourceFile = os.path.join(root, fileName)
                # only copy regular files - i.e. skip symlinks and the like
                if os.path.islink(sourceFile) or not os.path.isfile(sourceFile):
                    continue
                try:
                    shutil.copy(sourceFile, targetDir)
                except (IOError, OSError, shutil.Error):
                    # keep going so that all failures get logged in one run
                    logging.exception("Could not copy %s to %s", sourceFile, targetDir)
                    allCopied = False
        return allCopied


    def usage(self):
        '''
        Display input params for the script, then exit with status 2
        '''
        print("Usage: python oai_document_ingester.py [OPTION] <datacentre>")
        print(" - where:\n   <datacentre> is the data centre to ingest data from; and options are:")
        print(" -v - verbose mode for output logging")
        print(" -d - debug mode for output logging")
        sys.exit(2)
114               
115       
if __name__ == "__main__":
    # announce the script before doing any work
    for bannerLine in ("=================================",
                       "RUNNING: oai_document_ingester.py"):
        print(bannerLine)

    docIngester = oai_document_ingester()
    # the first value handed back by the command line set up is passed on as
    # the data centre to ingest (presumably the <datacentre> positional arg)
    dataCentreName = docIngester._setupCmdLineOptions()[0]
    docIngester.processDataCentre(dataCentreName)
Note: See TracBrowser for help on using the repository browser.