Changeset 3861 for TI01-discovery
- Timestamp: 07/05/08 15:54:12 (13 years ago)
- Location: TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch
- Files:
  - 6 deleted
  - 1 edited
  - 1 moved
Legend:
- Unmodified
- Added
- Removed
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/oai_document_ingester.py
r3857 r3861 1 1 #!/usr/bin/env python 2 """ Script oai_ingest.py takes parameters <datacentre> <dbinfofile>. 3 The /usr/local/WSClients/OAIBatch directory contains:- 4 - this python script, plus some other modules eg ndgUtils for parts of the process. 5 - a DataProvider specific config file, 6 - the python module for extracting spatiotemporal information and adding to postgres db. 7 Under this directory the following structure should be maintained: 8 ./data 9 - /DATACENTRE/ 10 - discovery/: Re-named documents. 11 - discovery_corrected Documents with schema namespaces corrected, ready to ingest in the discovery service. 12 - oai/difYYYYMMDD/ Documents as harvested from OAI 13 Where /DATACENTRE varies for the different data providers 14 """ 15 #History: 16 # 12/05/06 SEL spelling correction 17 # 30/05/06 SEL cope with many files for processing."Argument list too long" problem. 18 # 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version). 19 # 16/10/06 SEL Changed to using python oaiClean.py module instead of java code. 20 # 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade. 21 # 17/10/06 SEL cope with different discovery formats - not just DIF. 22 # 23/10/06 SEL keywords not mandatory in config file. 23 # 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running. 24 # December 2007 SEL rewrite to use Bryans' python XQuery stuff to create mini-moles instead of java. 25 # Also extracted hard coded pwds into a file. 26 # 11/04/08 CByrom Tidy up script by organising code into reusable functions + variables 27 # + remove dependency on eXist DB 28 2 ''' 3 Main script to do the document ingest from the OAI harvested files to the 4 discovery postgres DB. NB, can be ran for all datacentres using the run_all_ingest.py script 5 or can specify an individual datacentre to run the ingester on. 
6 As well as doing the ingest, a backup directory is created to store the created moles files. 7 ''' 29 8 import os, sys, string, getopt, logging 30 9 from time import strftime … … 39 18 import db_funcs 40 19 41 class oai_ ingest:20 class oai_document_ingester: 42 21 ''' 43 22 Class to handle the ingest of files from the OAI harvester to the discovery service postgres DB 44 - including running the various transforms and parsings to get all doc types and spatiotemporal data45 in the correct form in the DB23 - including running the various transforms and parsings to get all doc types and spatiotemporal 24 data in the correct form in the DB 46 25 ''' 47 26 … … 83 62 # first of all create a PostgresRecord - this object represents all the data required 84 63 # for a DB entry 85 record = PostgresRecord(filename, self._NDG_dataProvider, \ 64 try: 65 record = PostgresRecord(filename, self._NDG_dataProvider, \ 86 66 self._datacentre_groups, self._datacentre_namespace, \ 87 67 discoveryID, self._xq, self._datacentre_format) 88 68 89 # Now create the data access object to interface to the DB 90 dao = PostgresDAO(record, self._dbConnection) 91 92 # Finally, write the new record 93 if dao.createOrUpdateRecord(): 94 self._no_files_ingested += 1 69 # Now create the data access object to interface to the DB 70 dao = PostgresDAO(record, self._dbConnection) 71 72 # Finally, write the new record 73 if dao.createOrUpdateRecord(): 74 self._no_files_ingested += 1 75 except: 76 logging.error("Exception thrown - detail: ") 77 logging.error(sys.exc_info()) 95 78 96 79 … … 141 124 logging.info("No groups/keywords set for datacentre " + datacentre) 142 125 else: 143 logging.info("datacentre groups/keywords: " + s elf._datacentre_groups)126 logging.info("datacentre groups/keywords: " + str(self._datacentre_groups)) 144 127 145 128 if self._datacentre_format == "": … … 182 165 Display input params for the script 183 166 ''' 184 print "Usage: python -v oai_ingest.py<datacentre>"167 print 
"Usage: python oai_document_ingester.py -v|d<datacentre>" 185 168 print " - where:\n <datacentre> is the data centre to ingest data from; and" 186 169 print " -v - verbose mode for output logging" … … 195 178 self.lineSeparator = "-----------------------------" 196 179 print self.lineSeparator 197 print "RUNNING: oai_ ingest.py"180 print "RUNNING: oai_document_ingester.py" 198 181 199 182 # check for verbose option … … 246 229 self.getConfigDetails(datacentre) 247 230 248 #any records to harvest? 249 if len( os.listdir(self._harvest_home)) == 0: 250 logging.info("Nothing to harvest this time from " + datacentre) 251 sys.exit() 231 # check harvest dir exists and that there are any records to harvest? 232 dpath = os.path.normpath(os.path.dirname(self._harvest_home)) 233 if not os.path.exists(dpath): 234 logging.info("Harvest directory for datacentre %s (%s) could not be found - exiting" \ 235 %(datacentre, self._harvest_home)) 236 return 237 elif len(os.listdir(self._harvest_home)) == 0: 238 logging.info("Nothing to harvest this time from %s" %datacentre) 239 return 240 252 241 253 242 # The directory to put things for a tape backup (should already exist) … … 262 251 # Create/clear the 'in' directory pristine copy of the discovery records 263 252 fileUtils.setUpDir(originals_dir) 264 commandline = "ls -1 " + self._harvest_home + "/ | xargs -i cp " + self._harvest_home + "/{\} " + originals_dir 265 #commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 253 commandline = "find " + self._harvest_home + " -type f -print | xargs -i cp \{\} " + originals_dir 266 254 logging.info("Executing : " + commandline) 267 255 status = os.system(commandline) 268 256 269 257 if status !=0: 270 258 sys.exit("Failed at making pristine copy stage") … … 334 322 print "INFO: Number of files ingested = %s" %self._no_files_ingested 335 323 if status == 0: 336 print "INFO: Procedure oai_ ingest.py completed"324 print "INFO: Procedure 
oai_document_ingester.py completed" 337 325 else: 338 print "ERROR: Procedure oai_ ingest.py FAILED with status %s" %status326 print "ERROR: Procedure oai_document_ingester.py FAILED with status %s" %status 339 327 print self.lineSeparator 340 328 341 329 342 330 if __name__=="__main__": 343 opts, args = getopt.getopt(sys.argv[1:], "v")331 opts, args = getopt.getopt(sys.argv[1:], '-vd') 344 332 if len(args) < 1: 345 oai_ ingest()346 347 oai_ ingest(args[0])333 oai_document_ingester() 334 335 oai_document_ingester(args[0]) -
TI01-discovery/branches/ingestAutomation-upgrade/OAIBatch/run_all_ingest.py
r3839  r3861
    3      3     """
    4      4     import os, sys, logging
    5          - from oai_ingest_new2 import oai_ingest
           5   + from oai_document_ingester import oai_document_ingester
    6      6
    7      7     run_counter = 0
           8   + error_counter = 0
    8      9
    9     10     lineSeparator = "-----------------------------"
    …      …
   13     14
   14     15     # configure logging
   15          - logging.basicConfig(level=logging.DEBUG,
          16   + logging.basicConfig(level=logging.INFO,
   16     17         format='%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
   17     18
    …      …
   35     36     # now invoke the ingest script
   36     37     logging.info("Running the ingest script for datacentre: %s" %datacentre)
   37          - oai_ingest(datacentre)
   38          - run_counter += 1
          38   + try:
          39   +     run_counter += 1
          40   +     oai_document_ingester(datacentre)
          41   + except:
          42   +     error_counter +=1
   39     43
   40          - print "run_all_ingest.py complete - processed %s config files" %run_counter
          44   + print "run_all_ingest.py complete - processed %s config files" %run_counter
          45   + if error_counter > 0:
          46   +     print "WARNING: %s errors were encountered during the run - check logfiles for more details" %error_counter
   41     47
   42          -
Note: See TracChangeset for help on using the changeset viewer.