Changeset 1591 for TI01-discovery


Ignore:
Timestamp:
16/10/06 18:19:44 (13 years ago)
Author:
selatham
Message:

Uses oaiClean.py instead of the Java clean-up code. Also handles the upgrade and re-deployment of eXist, Java and Tomcat.

Location:
TI01-discovery/trunk/ingestAutomation/OAIBatch
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/bodc_config.properties

    r1481 r1591  
    11#Note - don't have any blank lines in this config file 
    22# Define host_OAI as the string that OAI adds to the filenames after harvesting 
    3  
    43# 
    54host_OAI = oai%3Agrid.bodc.nerc.ac.uk%3A 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py

    r1172 r1591  
    22""" Script oai_ingest.py takes parameter <datacentre>. 
    33The /usr/local/WSClients/OAIBatch directory contains this python script, a config file 
    4 and some java which handle difs after harvesting. The pre-processed files are then ingested 
    5 to the eXist XML db. 
     4and the oaiClean.py class which cleans up discovery records after harvesting. 
     5The pre-processed files are then ingested to the eXist XML db. 
    66 
    77 Under this directory the following structure should be maintained: 
     
    99 ./data 
    1010 - /DATACENTRE/ 
    11                 - discovery/:           Records with namespace, schema declaration deleted. After having run the script. 
    12                                       Ready to ingest in the discovery service. 
    13                 - oai/difYYYYMMDD/      Records as harvested from OAI 
     11                - discovery/:         Records with namespace, schema declaration deleted - after having run 
     12                                      the oaiClean script. Ready to ingest in the discovery service. 
     13                - oai/difYYYYMMDD/    Records as harvested from OAI 
    1414 
    1515 Where  /DATACENTRE  varies for the different data providers 
     
    2020# 30/05/06 SEL cope with many files for processing."Argument list too long" problem. 
    2121# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version). 
     22# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code. 
     23# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade. 
    2224 
    2325import os 
     
    2527import commands 
    2628import string 
     29import oaiClean 
    2730 
    2831status = 0 
     
    3841# Other settings and constants 
    3942date_string = commands.getoutput ("date +'%y%m%d_%H%M'") 
    40 os.putenv ('EXIST_HOME', '/usr/local/eXist') 
    41 os.putenv ('PATH', ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.') 
     43os.putenv ('EXIST_HOME', '/usr/local/exist-client') 
     44os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/java/jdk1.5.0_03/bin:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
    4245os.putenv ('CLASSPATH','.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch') 
     46os.putenv ('JAVA_HOME','=/usr/java/jdk1.5.0_03') 
    4347 
    4448# Get the harvested records directory for this datacentre from the config file for that data centre 
     
    6771backupdir = '/disks/glue1/oaiBackup/' 
    6872 
    69 # Create/clear the directory for a pristine copy of the difs in case the script rewrites something wrong 
     73# Create/clear the 'in' directory pristine copy of the discovery records 
    7074if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy"): 
    7175    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/{\}" 
    7276    print "Executing : " + commandline 
    7377    status = os.system(commandline) 
    74 else:    
     78else: 
    7579    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy" 
    7680    print "Executing : " + commandline 
     
    8084    sys.exit("Failed at creating copy dir stage") 
    8185 
    82 # make the pristine copy. Cope with there being lots of files in the directory. 
     86# make the 'in' pristine copy. Cope with there being lots of files in the directory. 
    8387 
    8488commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy" 
     
    8892    sys.exit("Failed at making pristine copy stage") 
    8993 
    90 # Create/clear the directory for the processing copy of the difs. 
     94# Create/clear the directory for the 'out' processed copy of the discovery records. 
    9195if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery"): 
    9296    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/ | xargs -i rm -f {\}" 
     
    98102    status= os.system(commandline) 
    99103 
     104# Removed 16/10/06 - will hold 'out' processed records only 
    100105# make the processing copy 
    101 commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery" 
    102 print "Executing : " + commandline 
    103 status = os.system(commandline) 
    104 if status !=0: 
    105     sys.exit("Failed at making processing copy stage") 
    106  
    107 # The file config.properties contains the name=value pair to parse the filename in java oaiProc.jar. 
     106#commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery" 
     107#print "Executing : " + commandline 
     108#status = os.system(commandline) 
     109#if status !=0: 
     110#    sys.exit("Failed at making processing copy stage") 
     111 
     112# The file config.properties contains the location of the particular datacentres harvested records. 
    108113# Copy the datacentre specific version of config to config.properties file.  
    109114 
     
    114119    sys.exit("Failed at copying config file stage") 
    115120 
    116 #Change os directory to that with the java.jar in it.     
     121#Change os directory to that with the oaiClean.py in it. (need this?) 
    117122os.chdir('/usr/local/WSClients/OAIBatch') 
    118123 
    119124 
    120125#Execute the script which processes the files 
    121 filenames = os.listdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery") 
     126indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy" 
     127outdir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 
     128filenames = os.listdir(indir) 
    122129for filename in filenames: 
    123130        if filename.find('.xml') != -1: 
    124                 full_filename = "/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" + filename 
     131                full_filename = indir + "/" + filename 
    125132                print "Processing : " + full_filename 
    126                 commandline = "java -jar /usr/local/WSClients/OAIBatch/oai_Proc.jar %s " %(full_filename) 
    127                 print commandline 
    128                 status= os.system(commandline) 
    129                 if status!=0: 
    130                     break 
     133                #try: 
     134                oaiClean.oaiClean(indir,outdir,filename,['NERC-DDC']) 
     135                #except: 
     136                #    sys.exit("Failed at processing file %s with oaiClean.py stage with status %s" %(full_filename, sys.exc_info())) 
     137                #    break 
    131138                numfilesproc += 1 
    132139        else: 
    133140                print 'File %s is not xml format. Not processed'  %(full_filename) 
    134 if status!=0: 
    135     sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" %(full_filename, status)) 
    136  
    137 #The script reads the files from OAIBatch/data/datacentre/discovery and outputs within the same directory the files. 
    138 #The result will get rid of the "oai%3Aucar.ncar.scd.cdp%3A" type of thing that oai adds to 
    139 #the filenames and it will leave <DIF> as the root element. 
    140 # 
     141 
     142# Removed 16/10/06 Don't need this anymore 
    141143#Once the pre-processing has finished remove the originals from the discovery directory: 
    142 commandline = "find /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/oai* -print | xargs -i rm /{\}" 
    143 print "Executing : " + commandline 
    144 status = os.system(commandline) 
    145 if status !=0: 
    146     sys.exit("Failed at removing original oai style records from discovery directory") 
     144#commandline = "find /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/oai* -print | xargs -i rm /{\}" 
     145#print "Executing : " + commandline 
     146#status = os.system(commandline) 
     147#if status !=0: 
     148#    sys.exit("Failed at removing original oai style records from discovery directory") 
     149 
    147150 
    148151# ingest the datacentres records into eXist db (backups of exist happen nightly). 
    149 commandline = "client.sh -c /db/dif/" + datacentre + " -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" 
    150 #print "Executing : " + commandline 
    151 status = os.system(commandline) 
    152 if status !=0: 
    153     sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status)) 
     152commandline = "client.sh -c /db/discovery/dif/" + datacentre + " -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" 
     153#print "Executing : " + commandline 
     154#status = os.system(commandline) 
     155#if status !=0: 
     156#    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status)) 
    154157 
    155158#Make copies of discovery and oai/difcopy areas to backup area for tape backups 
     
    182185#Clear out the original harvest records area 
    183186commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}" 
    184 print "Executing : " + commandline 
    185 status = os.system(commandline) 
    186 if status !=0: 
    187     sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 
     187#print "Executing : " + commandline 
     188#status = os.system(commandline) 
     189#if status !=0: 
     190#    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 
    188191 
    189192 
Note: See TracChangeset for help on using the changeset viewer.