Changeset 4956 for TI01-discovery


Ignore:
Timestamp:
11/02/09 15:13:10 (11 years ago)
Author:
sdonegan
Message:

Updated ingestion scripts: oai_document_ingester.py now has an option to ingest a single file (the -i option). Also added classes for complete deletion of single records — based on a supplied "oai" file name, a discovery-dir filename, or a discoveryID in the database. These require the updated SQL script ingest_procedures.sql

Location:
TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch
Files:
2 added
4 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresDAO.py

    r4908 r4956  
    1616        ''' 
    1717        if record == "": 
     18            logging.error("Invalid call to PostgresDAO!") 
    1819            sys.exit("USAGE: argument 1 = PostgresRecord object to process") 
    19         else: 
    20             logging.info("INFO: Creating/updating DB entry for record, %s" %record.discovery_id) 
     20        elif record == "deleting": 
     21            logging.info("Record object not supplied as DELETING records FROM database..") 
     22        else: 
     23            logging.info("Creating/updating DB entry for record, %s" %record.discovery_id) 
    2124 
    2225        # setup a connection to the db - if none specified 
     
    4346        if dbId: 
    4447            self._record.db_id = dbId[0][0] 
    45          
     48             
     49 
     50    def getRecordID_using_OriginalDocumentFilename(self): 
     51        ''' 
     52        Looks up a record in the DB and returns its DB ID, if it exists, otherwise  
     53        returns '-1' 
     54        @return: id of record, if it exists, '-1' if it doesn't 
     55        ''' 
     56        logging.info("Looking up original_document_id for filename: " + self._record.filename + " in DB") 
     57         
     58        '''if self._record.db_id is not None and self._record.db_id > 0: 
     59            logging.info("Already looked up record - ID is " + str(self._record.db_id)) 
     60            return self._record.db_id''' 
     61         
     62        sql = "SELECT original_document_id FROM ORIGINAL_DOCUMENT where original_document_filename = '" + self._record.filename + "';" 
     63         
     64        dbId = db_funcs.runSQLCommand(self._connection, sql) 
     65         
     66        if dbId: 
     67            self._record.db_id = dbId[0][0] 
     68             
     69             
     70    def getDiscoveryID_using_OriginalDocumentFilename(self): 
     71        ''' 
     72        Looks up a record in the DB and returns its discovery ID, if it exists, otherwise  
     73        returns '-1' 
     74        @return: id of record, if it exists, '-1' if it doesn't 
     75        ''' 
     76        logging.info("Looking up discovery_of for filename: " + self._record.filename + " in DB") 
     77         
     78        '''if self._record.db_id is not None and self._record.db_id > 0: 
     79            logging.info("Already looked up record - ID is " + str(self._record.db_id)) 
     80            return self._record.db_id''' 
     81         
     82        sql = "SELECT discovery_id FROM ORIGINAL_DOCUMENT where original_document_filename = '" + self._record.filename + "';" 
     83         
     84        dbId = db_funcs.runSQLCommand(self._connection, sql) 
     85         
     86        if dbId: 
     87            self._record.discovery_id = dbId[0][0] 
     88             
     89         
     90    def getTemporalDataId(self): 
     91         
     92        ''' 
     93        Looks up the temporal data id using the original document id 
     94        ''' 
     95         
     96        logging.info("Looking up temporal_data_id for filename: " + self._record.filename + " in DB") 
     97         
     98        sql = "SELECT discovery_id FROM ORIGINAL_DOCUMENT where original_document_filename = '" + self._record.filename + "';" 
    4699 
    47100    def createOrUpdateRecord(self): 
     
    260313        ''' 
    261314        logging.info("Deleting original document from Postgres DB") 
     315         
    262316        sqlCmd = "SELECT delete_document('" + str(self._record.db_id) + "');"  
    263317 
  • TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/PostgresRecord.py

    r4908 r4956  
    531531        ''' 
    532532        logging.info("Extracting bounding box info") 
    533         print "Extracting bounding box info"  
    534533         
    535534        try: 
  • TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/oai_document_ingester.py

    r4888 r4956  
    1616from PostgresRecord import PostgresRecord 
    1717from PostgresDAO import PostgresDAO 
    18 from datetime import date 
     18import datetime,time 
    1919import db_funcs 
    2020 
     
    196196                Display input params for the script 
    197197                ''' 
    198                 print "Usage: python oai_document_ingester.py [OPTION] <datacentre>" 
     198                print "Usage: python oai_document_ingester.py [OPTION] <datacentre> <individual update file>" 
    199199                print " - where:\n   <datacentre> is the data centre to ingest data from; and options are:" 
    200200                print " -v - verbose mode for output logging" 
    201201                print " -d - debug mode for output logging" 
     202                print " -i - specify individual file to upload rather than batch processing as defined in properties file.  \n      (NOTE: script still uses properties file for other parameters)\n" 
    202203                sys.exit(2) 
    203204 
    204205                 
    205         def __init__(self, datacentre=None): 
     206        def __init__(self, datacentre=None, indFileToIngest=None): 
    206207                ''' 
    207208                Main entry point for script 
     
    213214                # check for verbose option 
    214215                try: 
    215                     opts, args = getopt.getopt(sys.argv[1:], "vd") 
     216                    opts, args = getopt.getopt(sys.argv[1:], "vdi") 
    216217                except getopt.GetoptError, err: 
    217218                    # print help information and exit: 
     
    219220                     
    220221                loggingLevel = logging.WARNING 
     222                indFile = False 
    221223                for o, a in opts: 
    222224                    if o == "-v": 
     
    226228                        print " - Debug mode ON" 
    227229                        loggingLevel = logging.DEBUG 
    228                  
     230                    elif o == "-i": 
     231                        indFile = True 
     232                         
     233                        #check second arguement is present 
     234                        if len(sys.argv) < 4: 
     235                                print " - could not find individual path to file/ specified data centre!\n\n" 
     236                                self.usage() 
     237                         
     238                        print " - Use INDIVIDUAL file: " + indFileToIngest + " to load" 
     239                         
     240                   
    229241                print self.lineSeparator 
    230242                logging.basicConfig(level=loggingLevel, 
     
    259271                self._datacentre_namespace = "" 
    260272                self._NDG_dataProvider = False 
     273                 
    261274                self.getConfigDetails(datacentre) 
    262                  
    263                 # check harvest dir exists and that there are any records to harvest? 
    264                 if not os.path.exists(self._harvest_home): 
    265                         logging.info("Harvest directory for datacentre %s (%s) could not be found - exiting" \ 
    266                                                  %(datacentre, self._harvest_home)) 
    267                         return 
    268                 elif len(os.listdir(self._harvest_home)) == 0: 
    269                         logging.info("Nothing to harvest this time from %s" %datacentre) 
    270                         return 
    271275                 
    272276                # The directory to put things for a tape backup (should already exist) 
     
    277281                discovery_dir = data_dir + "/discovery/" 
    278282                 
    279                 # Create/clear the 'in' directory pristine copy of the discovery records 
    280                 fileUtils.setUpDir(originals_dir) 
    281                 commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 
    282                 logging.info("Executing : " + commandline) 
    283                 status = os.system(commandline) 
    284  
     283                 
     284                # check harvest dir exists and that there are any records to harvest? 
     285                if not indFile: 
     286                        if not os.path.exists(self._harvest_home): 
     287                                logging.warn("Harvest directory for datacentre %s (%s) could not be found - exiting" \ 
     288                                                 %(datacentre, self._harvest_home)) 
     289                                return 
     290                        elif len(os.listdir(self._harvest_home)) == 0: 
     291                                logging.warn("Nothing to harvest this time from %s" %datacentre) 
     292                                return 
     293                         
     294                        # Create/clear the 'in' directory pristine copy of the discovery records 
     295                        fileUtils.setUpDir(originals_dir) 
     296                        commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 
     297                        logging.info("Executing : " + commandline) 
     298                        status = os.system(commandline) 
     299                                                 
     300                else: 
     301                        #must be looking for an individual file to upload 
     302                        if not os.path.exists(indFileToIngest): 
     303                                logging .warn("Specified file does not exist") 
     304                                return 
     305                         
     306                        # Create/clear the 'in' directory pristine copy of the discovery records 
     307                        fileUtils.setUpDir(originals_dir) 
     308                        commandline = "cp " + indFileToIngest + " " + originals_dir 
     309                        logging.info("Executing : " + commandline) 
     310                        status = os.system(commandline) 
     311 
     312                #did transfer command work? 
    285313                if status !=0: 
    286                     sys.exit("Failed at making pristine copy stage") 
     314                        sys.exit("Failed at making pristine copy stage") 
    287315                 
    288316                # Create/clear the directory for the 'out' processed copy of the discovery records. 
     
    377405                 
    378406                recOpFile.write("Ingest report for data centre: " + datacentre + "\n") 
    379                 recOpFile.write("Ingest date: " + str(date.today()) + "\n") 
     407                recOpFile.write("Ingest date: " + str(datetime.datetime.now()) + "\n") 
    380408                recOpFile.write("Original metadata directory: " + self._harvest_home + "\n\n") 
    381409                recOpFile.write("PROCESSED " + str(numfilesproc) + "\n") 
     
    403431                 
    404432                recOpFile.close() 
     433                 
     434                #if run on single file report info to screen 
     435                if indFile: 
     436                        print self.lineSeparator 
     437                        if self._no_problem_files == 0: 
     438                                print "File successfully ingested at " + str(datetime.datetime.now()) 
    405439                                 
    406                  
    407                 print "\nScript finished running." 
     440                        else:    
     441                                for badFile in self.updateFailList: 
     442                                        print "Could not ingest: " + badFile 
     443                        print self.lineSeparator 
     444                 
     445                else:            
     446                        print "\nScript finished running." 
     447                 
    408448         
    409449if __name__=="__main__": 
    410         opts, args = getopt.getopt(sys.argv[1:], '-vd') 
     450         
     451        opts, args = getopt.getopt(sys.argv[1:], '-vdi') 
     452         
    411453        if len(args) < 1: 
    412454                oai_document_ingester() 
    413455         
    414         oai_document_ingester(args[0]) 
     456        oai_document_ingester(args[0], args[1]) 
  • TI01-discovery/tags/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/run_all_ingest.py

    r4888 r4956  
    44import os, sys, logging,time,string 
    55from oai_document_ingester import oai_document_ingester 
     6import datetime,time 
    67 
    78 
     
    2223logging.info("Running ingest script for all config files in the current run directory (%s)" %current_dir) 
    2324 
    24 #create file for summary report on all ingests 
    25 summaryFileName = "data/runAllIngestSummary.txt"         
     25#create file for summary report on all ingests - hardwired for production use on PROGLUE!!! 
     26summaryFileName = "/usr/local/WSClientsIngestUpdate/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/data/runAllIngestSummary.txt" 
     27logging.warn("NOTE: Using " + summaryFileName + "if not running as user BADC in production change THIS!!")         
    2628summaryFile = open(summaryFileName,'w')             
    2729 
     
    3739            datacentre = filename.replace(config_suffix, '') 
    3840            summaryFile.write("\n=====================================================================\n") 
    39             summaryFile.write("Ingest report for data centre: " + datacentre + " at " + str(time.asctime()) + "\n") 
     41            summaryFile.write("Ingest report for data centre: " + datacentre + " at " + str(datetime.datetime.now()) + "\n") 
    4042                     
    4143            if datacentre.find('backup') > -1: 
     
    5658                 
    5759            #summarise harvest info from report doc now produced by ingester program 
    58             recOpFileName = "data/" + datacentre + "_ingestSummary.txt" 
     60            recOpFileName = "/usr/local/WSClientsIngestUpdate/stable-TI01-ingestAutomation_Proglue_upgradesAndReporting/temp/OAIBatch/data/" + datacentre + "_ingestSummary.txt" 
    5961             
    6062            try: 
     
    8486 
    8587if os.path.exists(summaryFileName): 
    86     commandline = "cat " + summaryFileName + " | mail -s 'ignis discovery ingest report' s.j.donegan@rl.ac.uk" 
     88    commandline = "cat " + summaryFileName + " | mail -s 'PROGLUE Discovery Ingest report' steve.donegan@stfc.ac.uk" 
    8789    os.system(commandline) 
    8890 
     
    9294if error_counter > 0:            
    9395    logging.error("WARNING: %s errors were encountered during the run - check logfiles for more details" %error_counter) 
    94  
     96    commandline = "echo 'WARNING: run_all_ingest.py failed at " + str(datetime.datetime.now()) + " !!' | mail -s 'PROGLUE Ingest Report: FAILURE!' steve.donegan@stfc.ac.uk" 
Note: See TracChangeset for help on using the changeset viewer.