source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py @ 5153

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py@5153
Revision 5153, 3.4 KB checked in by sdonegan, 11 years ago (diff)

debugged numfilesproc error

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2'''
3 Main script to do the document ingest from the OAI harvested files to the
4 discovery postgres DB.  NB, can be ran for all datacentres using the run_all_ingest.py script
5 or can specify an individual datacentre to run the ingester on.
6 As well as doing the ingest, a backup directory is created to store the created moles files.
7'''
8import os, sys, logging
9from time import strftime
10import ndg.common.src.lib.fileutilities as FileUtilities
11from abstractdocumentingester import AbstractDocumentIngester
12
class oai_document_ingester(AbstractDocumentIngester):
        '''
        Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB
        - including running the various transforms and parsings to get all doc types and spatiotemporal
        data in the correct form in the DB
        '''

        def processDataCentre(self, dataCentre):
                '''
                Run the full ingest for a single data centre.

                Steps, in order:
                 1. reset the per-run counters and record the data centre name;
                 2. set up working directories and load the data centre config
                    (both done by inherited helpers not defined in this file);
                 3. return early if the harvest directory is missing or empty;
                 4. take a pristine copy of every harvested file into self.originals_dir;
                 5. set up the xqueries and the postgres connection, then convert and
                    ingest the files (again via inherited helpers);
                 6. clean the harvest directory only if no problem files were recorded;
                 7. log a summary of the run.

                @param dataCentre: name of the data centre to ingest; passed on to
                        self.getConfigDetails() and used in log messages
                @return: None.  NB, the script exits (sys.exit) if the pristine copy fails.
                '''
                self._no_files_ingested = 0
                self._no_problem_files = 0
                self.dataCentre = dataCentre

                # this is the base dir that the script is run from
                self._base_dir = os.getcwd() + "/"
                self._setupDataCentreDirs()

                # Make sure we are (still) working from the base directory
                # NOTE(review): presumably _setupDataCentreDirs() may change the
                # working directory - confirm against the parent class
                os.chdir(self._base_dir)

                # - to run on Windows under cygwin, use the following
                #os.putenv('PATH', 'C:\\opt\\cygwin\\bin')

                self.getConfigDetails(dataCentre)

                # check harvest dir exists and that there are any records to harvest?
                if not os.path.exists(self._harvest_home):
                        logging.info("Harvest directory for dataCentre %s (%s) could not be found - exiting",
                                     dataCentre, self._harvest_home)
                        return
                elif not os.listdir(self._harvest_home):
                        logging.info("Nothing to harvest this time from %s", dataCentre)
                        return

                # keep an untouched copy of everything that was harvested
                self._copyHarvestToOriginals()

                self._setupXQueries()

                # Process the resulting files and put the data into the postgres DB
                # - firstly set up a db connection to use
                self._getPostgresDBConnection()

                numfilesproc = self._convertAndIngestFiles(self.originals_dir, self.discovery_dir)

                logging.info("oai_document_ingest processing complete:")
                if self._no_problem_files == 0:
                        logging.info("All files successfully processed - cleaning harvest directory")
                        FileUtilities.cleanDir(self._harvest_home)
                else:
                        # leave the harvested files in place so the run can be repeated
                        logging.error("Problems experienced with %s files", self._no_problem_files)
                        logging.error("- harvest directory will not be cleared until these have been fixed and the script has been reran")

                logging.info(self.lineSeparator)
                logging.info("INFO: Number of files processed = %s", numfilesproc)
                logging.info("INFO: Number of files ingested = %s", self._no_files_ingested)
                logging.info(self.lineSeparator)
                print("Script finished running.")


        def _copyHarvestToOriginals(self):
                '''
                Copy every regular file found under self._harvest_home (recursing into
                any sub-directories) flat into self.originals_dir.

                Done in pure python rather than by shelling out to find/xargs/cp so
                that paths containing spaces, quotes or shell metacharacters are
                handled correctly and no external tools are required.

                Exits the script (sys.exit) if any file cannot be copied.
                '''
                # local import so this class does not depend on a change to the
                # module level imports
                import shutil

                logging.info("Copying harvested files from %s to %s",
                             self._harvest_home, self.originals_dir)
                try:
                        for dirPath, _dirNames, fileNames in os.walk(self._harvest_home):
                                for fileName in fileNames:
                                        sourceFile = os.path.join(dirPath, fileName)
                                        # regular files only - symlinks are skipped, as
                                        # they were by the previous 'find -type f'
                                        if os.path.islink(sourceFile) or not os.path.isfile(sourceFile):
                                                continue
                                        shutil.copy(sourceFile, self.originals_dir)
                except (IOError, OSError, shutil.Error):
                        logging.exception("Could not copy harvested files to %s", self.originals_dir)
                        sys.exit("Failed at making pristine copy stage")


        def usage(self):
                '''
                Display input params for the script, then exit with status 2
                '''
                print("Usage: python oai_document_ingester.py [OPTION] <datacentre>")
                print(" - where:\n   <datacentre> is the data centre to ingest data from; and options are:")
                print(" -v - verbose mode for output logging")
                print(" -d - debug mode for output logging")
                sys.exit(2)
86               
87       
if __name__=="__main__":
        # Script entry point: ingest the data centre named as the first
        # positional command line argument.
        print("=================================")
        print("RUNNING: oai_document_ingester.py")
        ingester = oai_document_ingester()
        args = ingester._setupCmdLineOptions()
        # No data centre given: show the usage text (which exits with status 2)
        # rather than failing with an IndexError on args[0]
        if not args:
                ingester.usage()
        ingester.processDataCentre(args[0])
Note: See TracBrowser for help on using the repository browser.