source: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/abstractdocumentingester.py @ 5297

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/abstractdocumentingester.py@5326
Revision 5297, 11.8 KB checked in by cbyrom, 11 years ago (diff)

Add additional instructions for setting up ingester + improve the
setup script to include more dependencies.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2'''
3 Main script to do the document ingest from the OAI harvested files to the
4 discovery postgres DB.  NB, can be run for all datacentres using the run_all_ingest.py script
5 or can specify an individual datacentre to run the ingester on.
6 As well as doing the ingest, a backup directory is created to store the created moles files.
7'''
8import os, sys, string, getopt, logging, pkg_resources
9from time import strftime
10from SchemaNameSpace import SchemaNameSpace
11from ndg.common.src.lib.ndgresources import ndgResources
12import ndg.common.src.lib.fileutilities as FileUtilities
13from ndg.common.src.clients.reldb.postgres.postgresclient import PostgresClient as pgc
14from PostgresRecord import PostgresRecord
15from PostgresDAO import PostgresDAO
16from Utilities import idget
17
class AbstractDocumentIngester(object):
        '''
        Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB
        - including running the various transforms and parsings to get all doc types and spatiotemporal
        data in the correct form in the DB
        '''
        # Separator string used to delimit sections of console/log output
        lineSeparator = "-----------------------------"

        # The directory to put things for a tape backup (should already exist)
        BACKUP_DIR = '/disks/glue1/oaiBackup/'
28
29        def _setupCmdLineOptions(self):
30                '''
31                Determine the logging level to use and configure this appropriately
32                @return args: any input arguments - excluding options
33                '''
34                # check for verbose option
35                try:
36                        opts, args = getopt.getopt(sys.argv[1:], "vd")
37                except getopt.GetoptError, err:
38                    # print help information and exit:
39                    print str(err) # will print something like "option -a not recognized"
40                    sys.exit(2)
41
42                if len(args) < 1:
43                        self.usage()
44                   
45                loggingLevel = logging.WARNING
46                for o, a in opts:
47                    if o == "-v":
48                        print " - Verbose mode ON"
49                        loggingLevel = logging.INFO
50                    elif o == "-d":
51                        print " - Debug mode ON"
52                        loggingLevel = logging.DEBUG
53               
54                # set up any keywords on the object
55                # NB, be careful to keep the instance variables the same name as the keywords!
56                for arg in args:
57                        bits = arg.split('=')
58                        if len(bits) == 2:
59                                if bits[0] == 'ingestFromDate':
60                                        self.setIngestFromDate(bits[1])
61                                elif bits[0] == 'interval':
62                                        self.setPollInterval(bits[1])
63                                else:
64                                        setattr(self, bits[0], bits[1])
65               
66                print self.lineSeparator
67                # NB, this is a slight fudge as cannot get the detailed logging to work
68                # without setting up a new logger here - which means we get two loggers
69                # outputing data. The initial call to logging needs to be tracked down
70                # and configured correctly, so this can be avoided...
71#               self.logger = logging.getLogger()
72#               self.logger.setLevel(loggingLevel)
73
74                # create console handler and set level to debug
75#               ch = logging.StreamHandler()
76#               ch.setLevel(loggingLevel)
77                # create formatter
78#               formatter = logging.Formatter('%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
79                # add formatter to ch
80#               ch.setFormatter(formatter)
81                # add ch to logger
82#               self.logger.addHandler(ch)
83                return args
84
85
86        def getID(self, filename):
87                '''
88                Gets the identifier out of an input metadata xml record.
89                @param filename - name of document file being processed
90                @return: ID - id to use to refer to the document
91                '''
92                logging.info("Retrieving identifier for metadata record " + filename)
93                xml=file(filename).read()
94                ID = idget(xml)
95                return ID
96       
97       
98        def addFileToPostgresDB(self, filename):
99                '''
100                Add a file to the postgres DB - extracting and storing all the required
101                data in the process
102                @param filename: full path of file to add to postgres DB
103                '''
104                logging.info("Adding file, " + filename + ", to postgres DB")
105               
106                # first of all create a PostgresRecord - this object represents all the data required
107                # for a DB entry
108                dao = None
109                try:
110                        discoveryID = self.getID(filename)
111                       
112                        record = PostgresRecord(filename, self._NDG_dataProvider, \
113                                                            self._datacentre_groups, self._datacentre_namespace, \
114                                                            discoveryID, self._xq, self._datacentre_format)
115                       
116                        print self._xq
117                        # Now create the data access object to interface to the DB
118                        dao = PostgresDAO(record, pgClient = self.pgc)
119               
120                        # Finally, write the new record
121                        if dao.createOrUpdateRecord():
122                                self._no_files_ingested += 1
123                except:
124                        logging.error("Exception thrown - detail: ")
125                        errors = sys.exc_info()
126                        logging.error(errors)
127                        self._error_messages += "%s\n" %str(errors[1])
128                       
129                        if dao:
130                                logging.info("Removing record and its associated info from DB")
131                                logging.info("- to allow clean ingestion on rerun")
132                                try:
133                                        dao.deleteOriginalRecord()
134                                except:
135                                        logging.error("Problem encountered when removing record: ")
136                                        logging.error(sys.exc_info())
137                                        logging.error("NB, this record will need to be cleared manually from DB to ensure all relevant data is ingested")
138
139                        self._no_problem_files += 1
140                        logging.info("Continue processing other files")
141       
142       
143        def getConfigDetails(self, datacentre):
144                '''
145                Get the harvested records directory and groups for this datacentre from the
146                datacentre specific config file.  The harvested records directory depends on the
147                datacentres OAI base url, the set and format. These have to be know up-front.
148                The groups denote which 'portal groups' they belong to - for limiting searches to
149                say NERC-only datacentres records.
150                Groups are added to the intermediate MOLES when it is created.
151                @param datacentre: datacentre to use when looking up config file
152                '''
153                # initialise the variables to retrieve from the config file
154                self._harvest_home = ""
155                self._datacentre_groups = ""
156                self._datacentre_format = ""
157                self._datacentre_namespace = ""
158                self._NDG_dataProvider = False
159
160                datacentre_config_filename = 'datacentre_config/' + datacentre + "_config.properties"
161                logging.info("Retrieving data from datacentre config file, " + datacentre_config_filename)
162               
163                file = pkg_resources.resource_string('OAIBatch', datacentre_config_filename)
164
165                for line in file.split('\n'):