Changeset 3861


Ignore:
Timestamp:
07/05/08 15:54:12 (11 years ago)
Author:
cbyrom
Message:

Clear out redundant code and rename the main script to oai_document_ingester

  • Change references to the renamed script accordingly.
Location:
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
Files:
6 deleted
1 edited
1 moved

Legend:

Unmodified
Added
Removed
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py

    r3857 r3861  
    11#!/usr/bin/env python 
    2 """ Script oai_ingest.py takes parameters <datacentre> <dbinfofile>. 
    3 The /usr/local/WSClients/OAIBatch directory contains:- 
    4  - this python script, plus some other modules eg ndgUtils for parts of the process. 
    5  - a DataProvider specific config file, 
    6  - the python module for extracting spatiotemporal information and adding to postgres db. 
    7 Under this directory the following structure should be maintained: 
    8  ./data 
    9  - /DATACENTRE/ 
    10                 - discovery/:         Re-named documents. 
    11         - discovery_corrected Documents with schema namespaces corrected, ready to ingest in the discovery service. 
    12                 - oai/difYYYYMMDD/    Documents as harvested from OAI 
    13  Where  /DATACENTRE  varies for the different data providers 
    14 """ 
    15 #History: 
    16 # 12/05/06 SEL spelling correction 
    17 # 30/05/06 SEL cope with many files for processing."Argument list too long" problem. 
    18 # 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version). 
    19 # 16/10/06 SEL Changed to using python oaiClean.py module instead of java code. 
    20 # 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade. 
    21 # 17/10/06 SEL cope with different discovery formats - not just DIF. 
    22 # 23/10/06 SEL keywords not mandatory in config file. 
    23 # 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running. 
    24 #  December 2007 SEL rewrite to use Bryans' python XQuery stuff to create mini-moles instead of java. 
    25 #                    Also extracted hard coded pwds into a file. 
    26 # 11/04/08 CByrom Tidy up script by organising code into reusable functions + variables  
    27 # + remove dependency on eXist DB 
    28  
     2''' 
     3 Main script to do the document ingest from the OAI harvested files to the  
     4 discovery postgres DB.  NB, can be ran for all datacentres using the run_all_ingest.py script 
     5 or can specify an individual datacentre to run the ingester on. 
     6 As well as doing the ingest, a backup directory is created to store the created moles files. 
     7''' 
    298import os, sys, string, getopt, logging 
    309from time import strftime 
     
    3918import db_funcs 
    4019 
    41 class oai_ingest: 
     20class oai_document_ingester: 
    4221        ''' 
    4322        Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB 
    44         - including running the various transforms and parsings to get all doc types and spatiotemporal data 
    45         in the correct form in the DB 
     23        - including running the various transforms and parsings to get all doc types and spatiotemporal  
     24        data in the correct form in the DB 
    4625        ''' 
    4726 
     
    8362                # first of all create a PostgresRecord - this object represents all the data required 
    8463                # for a DB entry 
    85                 record = PostgresRecord(filename, self._NDG_dataProvider, \ 
     64                try: 
     65                        record = PostgresRecord(filename, self._NDG_dataProvider, \ 
    8666                                                            self._datacentre_groups, self._datacentre_namespace, \ 
    8767                                                            discoveryID, self._xq, self._datacentre_format) 
    8868         
    89                 # Now create the data access object to interface to the DB 
    90                 dao = PostgresDAO(record, self._dbConnection) 
    91                  
    92                 # Finally, write the new record 
    93                 if dao.createOrUpdateRecord(): 
    94                         self._no_files_ingested += 1 
     69                        # Now create the data access object to interface to the DB 
     70                        dao = PostgresDAO(record, self._dbConnection) 
     71                 
     72                        # Finally, write the new record 
     73                        if dao.createOrUpdateRecord(): 
     74                                self._no_files_ingested += 1 
     75                except: 
     76                        logging.error("Exception thrown - detail: ") 
     77                        logging.error(sys.exc_info()) 
    9578                         
    9679         
     
    141124                    logging.info("No groups/keywords set for datacentre " + datacentre) 
    142125                else: 
    143                     logging.info("datacentre groups/keywords: " + self._datacentre_groups) 
     126                    logging.info("datacentre groups/keywords: " + str(self._datacentre_groups)) 
    144127                 
    145128                if self._datacentre_format == "": 
     
    182165                Display input params for the script 
    183166                ''' 
    184                 print "Usage: python -v oai_ingest.py <datacentre>" 
     167                print "Usage: python oai_document_ingester.py -v|d<datacentre>" 
    185168                print " - where:\n   <datacentre> is the data centre to ingest data from; and" 
    186169                print " -v - verbose mode for output logging" 
     
    195178                self.lineSeparator = "-----------------------------" 
    196179                print self.lineSeparator 
    197                 print "RUNNING: oai_ingest.py"           
     180                print "RUNNING: oai_document_ingester.py"                
    198181                 
    199182                # check for verbose option 
     
    246229                self.getConfigDetails(datacentre) 
    247230                 
    248                 #any records to harvest? 
    249                 if len( os.listdir(self._harvest_home)) == 0: 
    250                     logging.info("Nothing to harvest this time from " + datacentre) 
    251                     sys.exit() 
     231                # check harvest dir exists and that there are any records to harvest? 
     232                dpath = os.path.normpath(os.path.dirname(self._harvest_home)) 
     233                if not os.path.exists(dpath): 
     234                        logging.info("Harvest directory for datacentre %s (%s) could not be found - exiting" \ 
     235                                                 %(datacentre, self._harvest_home)) 
     236                        return 
     237                elif len(os.listdir(self._harvest_home)) == 0: 
     238                        logging.info("Nothing to harvest this time from %s" %datacentre) 
     239                        return 
     240                 
    252241                 
    253242                # The directory to put things for a tape backup (should already exist) 
     
    262251                # Create/clear the 'in' directory pristine copy of the discovery records 
    263252                fileUtils.setUpDir(originals_dir) 
    264                 commandline = "ls -1 " + self._harvest_home + "/ | xargs -i cp " + self._harvest_home + "/{\} " + originals_dir 
    265                 #commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 
     253                commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 
    266254                logging.info("Executing : " + commandline) 
    267255                status = os.system(commandline) 
    268                  
     256 
    269257                if status !=0: 
    270258                    sys.exit("Failed at making pristine copy stage") 
     
    334322                print "INFO: Number of files ingested = %s" %self._no_files_ingested 
    335323                if status == 0: 
    336                     print "INFO: Procedure oai_ingest.py completed" 
     324                    print "INFO: Procedure oai_document_ingester.py completed" 
    337325                else: 
    338                     print "ERROR: Procedure oai_ingest.py FAILED with status %s" %status 
     326                    print "ERROR: Procedure oai_document_ingester.py FAILED with status %s" %status 
    339327                print self.lineSeparator 
    340328                 
    341329         
    342330if __name__=="__main__": 
    343         opts, args = getopt.getopt(sys.argv[1:], "v") 
     331        opts, args = getopt.getopt(sys.argv[1:], '-vd') 
    344332        if len(args) < 1: 
    345                 oai_ingest() 
    346          
    347         oai_ingest(args[0]) 
     333                oai_document_ingester() 
     334         
     335        oai_document_ingester(args[0]) 
  • TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/run_all_ingest.py

    r3839 r3861  
    33""" 
    44import os, sys, logging 
    5 from oai_ingest_new2 import oai_ingest 
     5from oai_document_ingester import oai_document_ingester 
    66 
    77run_counter = 0 
     8error_counter = 0 
    89 
    910lineSeparator = "-----------------------------" 
     
    1314 
    1415# configure logging 
    15 logging.basicConfig(level=logging.DEBUG, 
     16logging.basicConfig(level=logging.INFO, 
    1617                    format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s') 
    1718 
     
    3536            # now invoke the ingest script 
    3637            logging.info("Running the ingest script for datacentre: %s" %datacentre) 
    37             oai_ingest(datacentre) 
    38             run_counter += 1 
     38            try: 
     39                run_counter += 1 
     40                oai_document_ingester(datacentre) 
     41            except: 
     42                error_counter +=1 
    3943                 
    40 print "run_all_ingest.py complete - processed %s config files" %run_counter              
     44print "run_all_ingest.py complete - processed %s config files" %run_counter 
     45if error_counter > 0:            
     46    print "WARNING: %s errors were encountered during the run - check logfiles for more details" %error_counter 
    4147 
    42  
Note: See TracChangeset for help on using the changeset viewer.