source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py @ 5243

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py@5243
Revision 5243, 4.1 KB checked in by cbyrom, 11 years ago (diff)

Adjust logging and output of error in ingest scripts.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2'''
3 Main script to do the document ingest from the OAI harvested files to the
4 discovery postgres DB.  NB, can be run for all datacentres using the run_all_ingest.py script
5 or can specify an individual datacentre to run the ingester on.
6 As well as doing the ingest, a backup directory is created to store the created moles files.
7'''
8import os, sys, logging
9# annoyingly, an import (CSML file, I think) sets the logging config during imports - so set this
10# here to get there first - since you can only set the config once
11logging.basicConfig(level=logging.DEBUG,
12                    format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
13from time import strftime
14import ndg.common.src.lib.fileutilities as FileUtilities
15from abstractdocumentingester import AbstractDocumentIngester
16
class oai_document_ingester(AbstractDocumentIngester):
        '''
        Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB
        - including running the various transforms and parsings to get all doc types and spatiotemporal
        data in the correct form in the DB.

        NB, the helper methods and attributes used below that are not defined in this class
        (_setupDataCentreDirs, getConfigDetails, _setupXQueries, _getPostgresDBConnection,
        _convertAndIngestFiles, _harvest_home, originals_dir, discovery_dir, lineSeparator)
        are presumably supplied by AbstractDocumentIngester - confirm against that class.
        '''

        def processDataCentre(self, dataCentre):
                '''
                Run the full ingest for a single data centre: take a pristine copy of the
                harvested files, convert and ingest them, then report on the outcome.
                The harvest directory is only cleaned when no problem files were recorded.
                @param dataCentre: data centre to ingest; passed on to getConfigDetails
                @return outMessage: string summary of ingest outcome (one item per line), or
                None if the harvest directory is missing or empty
                Exits the script (sys.exit) if the pristine copy of the harvested files fails.
                '''
                # imported locally so this class does not rely on a module level import of shutil
                import shutil

                # reset the per-run counters and error log
                self._no_files_ingested = 0
                self._no_problem_files = 0
                self._error_messages = ''
                self.dataCentre = dataCentre

                self._base_dir = os.getcwd() + "/" # this is the base dir that the script is ran from
                self._setupDataCentreDirs()

                #Change os directory to that with the harvested documents in it.
                os.chdir(self._base_dir)

                self.getConfigDetails(dataCentre)

                # check harvest dir exists and that there are any records to harvest?
                if not os.path.exists(self._harvest_home):
                        logging.info("Harvest directory for dataCentre %s (%s) could not be found - exiting",
                                     dataCentre, self._harvest_home)
                        return None
                if not os.listdir(self._harvest_home):
                        logging.info("Nothing to harvest this time from %s", dataCentre)
                        return None

                # Make a pristine copy of every harvested file, flattened into the originals dir.
                # Done in python rather than via a shell 'find | xargs cp' pipeline so that paths
                # containing spaces or shell metacharacters are copied correctly.
                logging.info("Copying harvested files from %s to %s",
                             self._harvest_home, self.originals_dir)
                try:
                        for dirPath, _subDirs, fileNames in os.walk(self._harvest_home):
                                for fileName in fileNames:
                                        harvestedFile = os.path.join(dirPath, fileName)
                                        # only regular files are copied - i.e. as 'find -type f' did
                                        if os.path.islink(harvestedFile):
                                                continue
                                        shutil.copy(harvestedFile, self.originals_dir)
                except (IOError, OSError):
                        logging.exception("Could not copy harvested files to %s", self.originals_dir)
                        sys.exit("Failed at making pristine copy stage")

                self._setupXQueries()

                # Process the resulting files and put the data into the postgres DB
                # - firstly set up a db connection to use
                self._getPostgresDBConnection()

                numfilesproc = self._convertAndIngestFiles(self.originals_dir, self.discovery_dir)

                logging.info("oai_document_ingest processing complete:")
                if self._no_problem_files == 0:
                        logging.info("All files successfully processed - cleaning harvest directory")
                        FileUtilities.cleanDir(self._harvest_home)
                else:
                        logging.error("Problems experienced with %s files", self._no_problem_files)
                        logging.error("- harvest directory will not be cleared until these have been fixed and the script has been reran")

                logging.info(self.lineSeparator)
                logging.info("INFO: Number of files processed = %s", numfilesproc)
                logging.info("INFO: Number of files ingested = %s", self._no_files_ingested)
                logging.info(self.lineSeparator)

                # build the summary one item per line, so the separate parts do not run together
                summary = ["OAI Document ingest processing complete:",
                           "Number of files processed = %s" % numfilesproc,
                           "Number of files ingested = %s" % self._no_files_ingested]
                if self._error_messages:
                        summary.append("Errors: %s" % self._error_messages)

                # single-argument print(...) gives the same output under python 2 and python 3
                print("Script finished running.")
                return "\n".join(summary)


        def usage(self):
                '''
                Display input params for the script, then exit with status 2
                '''
                print("Usage: python oai_document_ingester.py [OPTION] <datacentre>")
                print(" - where:\n   <datacentre> is the data centre to ingest data from; and options are:")
                print(" -v - verbose mode for output logging")
                print(" -d - debug mode for output logging")
                sys.exit(2)
97               
98       
if __name__ == "__main__":
        # Script entry point: announce the run, then ingest the data centre that is
        # returned first by the ingester's command line option handling.
        for bannerLine in ("=================================",
                           "RUNNING: oai_document_ingester.py"):
                print(bannerLine)
        ingester = oai_document_ingester()
        args = ingester._setupCmdLineOptions()
        dataCentreName = args[0]
        ingester.processDataCentre(dataCentreName)
Note: See TracBrowser for help on using the repository browser.