Changeset 1593


Ignore:
Timestamp:
17/10/06 18:41:53 (13 years ago)
Author:
selatham
Message:

Cope with different incoming formats. Also add keywords from config file.

Location:
TI01-discovery/trunk/ingestAutomation/OAIBatch
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/bodc_config.properties

    r1591 r1593  
    66#Define host_path as the full directory name where this data centre's records will be harvested into. 
    77host_path /usr/local/jakarta-tomcat/webapps/oai/WEB-INF/harvested_records/grid-bodc-nerc-ac-uk-dif-ndg_bodc 
     8# 
     9#Define groups - portal groups for limiting searches by 'group of datacentres'. 
     10groups NERC-DDC NERC MDIP 
     11# 
     12#Define which format is harvested from the data centre (one only) 
     13format dif 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/nocs_config.properties

    r1481 r1593  
    11#Note - don't have any blank lines in this config file 
    22# Define host_OAI as the string that OAI adds to the filenames after harvesting 
    3  
    43# 
    54host_OAI = oai%3Aoai.noc.soton.ac.uk%3A 
     
    76#Define host_path as the full directory name where this data centre's records will be harvested into. 
    87host_path /usr/local/jakarta-tomcat/webapps/oai/WEB-INF/harvested_records/oai-noc-soton-ac-uk-dif 
     8# 
     9#Define groups - portal groups for limiting searches by 'group of datacentres'. 
     10groups NERC 
     11# 
     12#Define which format is harvested from the data centre (one only) 
     13format dif 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py

    r1591 r1593  
    2222# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code. 
    2323# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade. 
     24# 17/10/06 SEL cope with different discovery formats - not just DIF. 
    2425 
    2526import os 
     
    3233numfilesproc = 0 
    3334harvest_home = "" 
     35datacentre_groups = "" 
     36datacentre_format = "" 
    3437 
    3538if (len(sys.argv) < 2): 
     
    4245date_string = commands.getoutput ("date +'%y%m%d_%H%M'") 
    4346os.putenv ('EXIST_HOME', '/usr/local/exist-client') 
    44 os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/java/jdk1.5.0_03/bin:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
     47os.putenv ('PATH', ':/usr/java/jdk1.5.0_03/jre:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.') 
    4548os.putenv ('CLASSPATH','.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch') 
    46 os.putenv ('JAVA_HOME','=/usr/java/jdk1.5.0_03') 
    47  
    48 # Get the harvested records directory for this datacentre from the config file for that data centre 
     49 
     50# Get the harvested records directory and groups for this datacentre from the datacentre specific config file 
    4951# The harvested records directory depends on the datacentre's OAI base url, the set and format. These have to be known up-front. 
     52# The groups denote which 'portal groups' the datacentre belongs to - for limiting searches to, say, NERC-only datacentres' records. 
     53# Groups are added to the xml record by oaiClean.py. 
    5054datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties" 
    5155print "Datacentre config file = %s" %datacentre_config_filename 
     
    5862    if words[0] == 'host_path': 
    5963        harvest_home = string.rstrip(words[1]) 
    60         break 
    61  
     64    if words[0] == 'groups': 
     65        datacentre_groups = words[1:] 
     66    if words[0] == 'format': 
     67        datacentre_format = words[1] 
     68datacentre_config_file.close() 
    6269if harvest_home == "": 
    6370    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" %datacentre_config_filename) 
    64 datacentre_config_file.close() 
    65  
     71if datacentre_groups == "": 
     72    sys.exit("Failed at stage: getting datacentre groups. datacentre config file tried = %s" %datacentre_config_filename) 
     73if datacentre_format == "": 
     74    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" %datacentre_config_filename) 
     75 
     76#any records to harvest? 
    6677if len( os.listdir(harvest_home)) == 0: 
    6778    print "Nothing to harvest this time from %s" %datacentre 
     
    7283 
    7384# Create/clear the 'in' directory, which holds the pristine copy of the discovery records 
    74 if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy"): 
    75     commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/{\}" 
     85if os.path.isdir("/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals"): 
     86    commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i rm /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\}" 
    7687    print "Executing : " + commandline 
    7788    status = os.system(commandline) 
    7889else: 
    79     commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy" 
     90    commandline = "mkdir /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 
    8091    print "Executing : " + commandline 
    8192    status= os.system(commandline) 
     
    8697# make the 'in' pristine copy. Cope with there being lots of files in the directory. 
    8798 
    88 commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy" 
     99commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals" 
    89100print "Executing : " + commandline 
    90101status = os.system(commandline) 
     
    102113    status= os.system(commandline) 
    103114 
    104 # Removed 16/10/06 - will hold 'out' processed records only 
     115# Removed 16/10/06 - directory will hold 'out' processed records only 
    105116# make the processing copy 
    106117#commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery" 
     
    111122 
    112123# The file config.properties contains the location of the particular datacentres harvested records. 
    113 # Copy the datacentre specific version of config to config.properties file.  
     124# Copy the datacentre specific version of config to config.properties file. 
    114125 
    115126commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre +"_config.properties /usr/local/WSClients/OAIBatch/config.properties" 
     
    124135 
    125136#Execute the script which processes the files 
    126 indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy" 
     137indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 
    127138outdir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 
    128139filenames = os.listdir(indir) 
     
    132143                print "Processing : " + full_filename 
    133144                #try: 
    134                 oaiClean.oaiClean(indir,outdir,filename,['NERC-DDC']) 
     145                oaiClean.oaiClean(indir,outdir,filename,datacentre_groups) 
    135146                #except: 
    136147                #    sys.exit("Failed at processing file %s with oaiClean.py stage with status %s" %(full_filename, sys.exc_info())) 
     
    150161 
    151162# ingest the datacentres records into eXist db (backups of exist happen nightly). 
    152 commandline = "client.sh -c /db/discovery/dif/" + datacentre + " -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" 
    153 #print "Executing : " + commandline 
    154 #status = os.system(commandline) 
    155 #if status !=0: 
    156 #    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status)) 
    157  
    158 #Make copies of discovery and oai/difcopy areas to backup area for tape backups 
    159 this_backupdir = backupdir + datacentre + "_" + date_string + "_difcopy" 
    160 commandline = "mkdir " + this_backupdir  
     163commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/" +datacentre_format+ "/" + datacentre + " -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" 
     164print "Executing : actual command to ingest into exist db" 
     165status = os.system(commandline) 
     166if status !=0: 
     167    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status)) 
     168 
     169#Make copies of discovery and oai/originals areas to backup area for tape backups 
     170this_backupdir = backupdir + datacentre + "_" + date_string + "_originals" 
     171commandline = "mkdir " + this_backupdir 
    161172print "Executing : " + commandline 
    162173status = os.system(commandline) 
     
    164175    sys.exit("Failed at creating backup directory %s" %this_backupdir) 
    165176 
    166 commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/difcopy/{\} " + this_backupdir 
     177commandline = "ls -1 /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/ | xargs -i cp /usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals/{\} " + this_backupdir 
    167178print "Executing : " + commandline 
    168179status = os.system(commandline) 
     
    182193if status !=0: 
    183194    sys.exit("Failed at copying to backup directory %s" %this_backupdir) 
    184          
     195 
    185196#Clear out the original harvest records area 
    186197commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}" 
    187 #print "Executing : " + commandline 
    188 #status = os.system(commandline) 
    189 #if status !=0: 
    190 #    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 
     198print "Executing : " + commandline 
     199status = os.system(commandline) 
     200if status !=0: 
     201    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home) 
    191202 
    192203 
     
    196207    print " Procedure oai_ingest.py ran to end" 
    197208else: 
    198     print "Procedure oai_ingest.py FAILED with status %s" %status  
    199      
     209    print "Procedure oai_ingest.py FAILED with status %s" %status 
     210 
    200211print "======================================================" 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/pml_config.properties

    r1481 r1593  
    11#Note - don't have any blank lines in this config file 
    22# Define host_OAI as the string that OAI adds to the filenames after harvesting 
    3  
    43# 
    54host_OAI = oai%3Anpm.ac.uk%3A 
     
    76#Define host_path as the full directory name where this data centre's records will be harvested into. 
    87host_path         usr/local/jakarta-tomcat/webapps/oai/WEB-INF/harvested_records/www-npm-ac-uk-8080-dif 
     8# 
     9#Define groups - portal groups for limiting searches by 'group of datacentres'. 
     10groups NERC 
     11# 
     12#Define which format is harvested from the data centre (one only) 
     13format dif 
Note: See TracChangeset for help on using the changeset viewer.