source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py @ 4854

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py@4854
Revision 4854, 3.4 KB checked in by cbyrom, 11 years ago (diff)

Add new ingest script - to allow ingest of DIF docs from eXist hosted
atom feed. NB, this required restructure of original OAI harvester
to allow re-use of shared code - by abstracting this out into new class,
abstractdocumentingester.

Add new documentation and tidy up codebase removing dependencies where possible to simplify things.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2'''
3 Main script to do the document ingest from the OAI harvested files to the
4 discovery postgres DB.  NB, can be run for all datacentres using the run_all_ingest.py script
5 or can specify an individual datacentre to run the ingester on.
6 As well as doing the ingest, a backup directory is created to store the created moles files.
7'''
8import os, sys, logging
9from time import strftime
10import ndg.common.src.lib.fileutilities as FileUtilities
11from abstractdocumentingester import AbstractDocumentIngester
12
class oai_document_ingester(AbstractDocumentIngester):
    '''
    Class to handle the ingest of files from the OAI harvester to the
    discovery service postgres DB - including running the various
    transforms and parsings to get all doc types and spatiotemporal
    data in the correct form in the DB.

    All the heavy lifting (directory set-up, config parsing, xquery set-up,
    DB connection and per-file conversion/ingest) is delegated to helpers
    that are not defined in this class - presumably they are inherited from
    AbstractDocumentIngester.
    '''

    def processDataCentre(self, dataCentre):
        '''
        Run the complete ingest for a single data centre.

        Steps, in order:
         1. reset the per-run counters and set up the working directories;
         2. read the data centre config;
         3. return early if the harvest directory is missing or empty;
         4. take a pristine copy of every harvested file into
            self.originals_dir (via a shell `find | xargs cp` pipeline);
         5. set up the xqueries and the DB connection, then convert and
            ingest the files;
         6. clean the harvest directory, but only if no problem files were
            recorded, and log a summary.

        @param dataCentre: name of the data centre to ingest data from
        @raise SystemExit: if the pristine copy shell command returns a
            non-zero status
        '''
        self._no_files_ingested = 0
        self._no_problem_files = 0
        self.dataCentre = dataCentre

        # this is the base dir that the script is ran from
        self._base_dir = os.getcwd() + "/"
        self._setupDataCentreDirs()

        # Make sure we are (back) in the base dir before doing any work
        # NOTE(review): only has an effect if _setupDataCentreDirs changes
        # the working directory - confirm against the base class
        os.chdir(self._base_dir)

        # - to run on Windows under cygwin, use the following
        #os.putenv('PATH', 'C:\\opt\\cygwin\\bin')

        self.getConfigDetails(dataCentre)

        # check harvest dir exists and that there are any records to harvest?
        if not os.path.exists(self._harvest_home):
            logging.info(
                "Harvest directory for dataCentre %s (%s) could not be found - exiting",
                dataCentre, self._harvest_home)
            return

        if not os.listdir(self._harvest_home):
            logging.info("Nothing to harvest this time from %s", dataCentre)
            return

        # Take a flat, pristine copy of every harvested file.
        # The braces are written with explicit double backslashes: the
        # resulting string is identical to the old "\{\}" form, but no
        # longer relies on an invalid (deprecated) escape sequence.
        # NOTE: the command is run through the shell, so paths containing
        # whitespace or shell metacharacters will not be handled correctly.
        commandline = ("find " + self._harvest_home +
                       " -type f -print | xargs -i cp \\{\\} " +
                       self.originals_dir)
        logging.info("Executing : " + commandline)
        status = os.system(commandline)

        if status != 0:
            sys.exit("Failed at making pristine copy stage")

        self._setupXQueries()

        # Process the resulting files and put the data into the postgres DB
        # - firstly set up a db connection to use
        self._getPostgresDBConnection()

        # Previously 'numfilesproc' was logged below without ever being
        # assigned, so every run died with a NameError after the ingest.
        # Use the value returned by the conversion step if there is one;
        # otherwise fall back to the counters maintained on this instance.
        # NOTE(review): assumes _convertAndIngestFiles returns the processed
        # file count (or None) - confirm against the base class
        numfilesproc = self._convertAndIngestFiles(self.originals_dir,
                                                   self.discovery_dir)
        if numfilesproc is None:
            numfilesproc = self._no_files_ingested + self._no_problem_files

        logging.info("oai_document_ingest processing complete:")
        if self._no_problem_files == 0:
            logging.info("All files successfully processed - cleaning harvest directory")
            FileUtilities.cleanDir(self._harvest_home)
        else:
            # leave the harvested files in place so they can be retried
            logging.error("Problems experienced with %s files", self._no_problem_files)
            logging.error("- harvest directory will not be cleared until these have been fixed and the script has been reran")

        logging.info(self.lineSeparator)
        logging.info("INFO: Number of files processed = %s", numfilesproc)
        logging.info("INFO: Number of files ingested = %s", self._no_files_ingested)
        logging.info(self.lineSeparator)
        # single-argument print(...) behaves the same on python 2 and 3
        print("Script finished running.")

    def usage(self):
        '''
        Display input params for the script, then exit with status 2

        @raise SystemExit: always
        '''
        print("Usage: python oai_document_ingester.py [OPTION] <datacentre>")
        print(" - where:\n   <datacentre> is the data centre to ingest data from; and options are:")
        print(" -v - verbose mode for output logging")
        print(" -d - debug mode for output logging")
        sys.exit(2)
86       
if __name__ == "__main__":
    # Script entry point: ingest the data centre named on the command line
    print("=================================")
    print("RUNNING: oai_document_ingester.py")
    ingester = oai_document_ingester()
    args = ingester._setupCmdLineOptions()
    # Without a <datacentre> argument args[0] would fail with a bare
    # IndexError - show the usage text (which exits) instead
    if not args:
        ingester.usage()
    ingester.processDataCentre(args[0])
Note: See TracBrowser for help on using the repository browser.