Changeset 1768


Ignore:
Timestamp:
24/11/06 18:32:12 (13 years ago)
Author:
selatham
Message:

Removed use of oaiClean. Just do renaming. Use the data centre's namespace property.

Location:
TI01-discovery/trunk/ingestAutomation/OAIBatch
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/bodc_config.properties

    r1593 r1768  
    1212#Define which format is harvested from the data centre (one only) 
    1313format dif 
     14# 
     15#Define the data provider's namespace 
     16namespace bodc.nerc.ac.uk 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/nocs_config.properties

    r1593 r1768  
    1212#Define which format is harvested from the data centre (one only) 
    1313format dif 
     14# 
     15#Define the data provider's namespace 
     16namespace noc.soton.ac.uk 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py

    r1755 r1768  
    11#!/usr/bin/env python 
    22""" Script oai_ingest.py takes parameter <datacentre>. 
    3 The /usr/local/WSClients/OAIBatch directory contains this python script, a config file 
     3The /usr/local/WSClients/OAIBatch directory contains this python script, a DataProvider specific config file 
    44and the oaiClean.py class which cleans up discovery records after harvesting. 
    55The pre-processed files are then ingested to the eXist XML db. 
     
    3030import commands 
    3131import string 
    32 import oaiClean 
     32#import oaiClean 
    3333 
    3434status = 0 
     
    3737datacentre_groups = "" 
    3838datacentre_format = "" 
     39datacentre_namespace = "" 
    3940 
    4041if (len(sys.argv) < 2): 
     
    5354# The harvested records directory depends on the datacentre's OAI base url, the set and format. These have to be known up-front. 
    5455# The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records. 
    55 # Groups are added to the xml record by oaiClean.py. 
     56# Groups are added to the intermediate MOLES when it is created. 
    5657datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties" 
    5758print "Datacentre config file = %s" %datacentre_config_filename 
     
    6869    if words[0] == 'format': 
    6970        datacentre_format = words[1] 
     71    if words[0] == 'namespace': 
     72        datacentre_namespace = words[1] 
    7073datacentre_config_file.close() 
     74 
    7175if harvest_home == "": 
    7276    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" %datacentre_config_filename) 
     
    8387else: 
    8488    print "INFO: format being harvested = %s" %datacentre_format 
     89 
     90if datacentre_namespace == "": 
     91    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" %datacentre_config_filename) 
     92else: 
     93    print "INFO: datacentre namespace = %s" %datacentre_namespace 
    8594 
    8695#any records to harvest? 
     
    123132    status= os.system(commandline) 
    124133 
    125 # Removed 16/10/06 - directory will hold 'out' processed records only 
    126 # make the processing copy 
    127 #commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery" 
    128 #print "Executing : " + commandline 
    129 #status = os.system(commandline) 
    130 #if status !=0: 
    131 #    sys.exit("Failed at making processing copy stage") 
    132  
    133134# The file config.properties contains the location of the particular datacentres harvested records. 
    134135# Copy the datacentre specific version of config to config.properties file. 
     
    143144 
    144145 
    145 #Execute the script which processes the files 
     146#Execute the script which processes/renames the files (changed 24/11/06 to simply create a re-named file in the outdir) 
    146147indir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/oai/originals" 
    147148outdir="/usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery" 
    148 wrapFlag=False 
     149#wrapFlag=False 
    149150filenames = os.listdir(indir) 
    150151for filename in filenames: 
    151152        if filename.find('.xml') != -1: 
    152                 full_filename = indir + "/" + filename 
    153                 print "Processing : " + full_filename 
    154                 #try: 
    155                 oaiClean.oaiClean(indir,outdir,filename,wrapFlag) 
    156                 #except: 
    157                 #    sys.exit("Failed at processing file %s with oaiClean.py stage with status %s" %(full_filename, sys.exc_info())) 
    158                 #    break 
     153                original_filename = indir + "/" + filename 
     154                print "Creating renamed file : " 
     155                new_filename = outdir + "/" +datacentre_namespace+ "__" +filename.split('%3A')[-1] 
     156                print "original file = %s, newfile = %s" %(original_filename, new_filename) 
     157                commandline = "cp "+original_filename+ " " +new_filename 
     158                print "Executing : " + commandline 
     159                status = os.system(commandline) 
     160                if status !=0: 
     161                        sys.exit("Failed at re-naming file stage") 
     162                #oaiClean.oaiClean(indir,outdir,filename,wrapFlag) 
    159163                numfilesproc += 1 
    160164        else: 
    161165                print 'File %s is not xml format. Not processed'  %(full_filename) 
    162166 
    163 # Removed 16/10/06 Don't need this anymore 
    164 #Once the pre-processing has finished remove the originals from the discovery directory: 
    165 #commandline = "find /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/oai* -print | xargs -i rm /{\}" 
    166 #print "Executing : " + commandline 
    167 #status = os.system(commandline) 
    168 #if status !=0: 
    169 #    sys.exit("Failed at removing original oai style records from discovery directory") 
    170  
    171  
    172167# ingest the datacentres records into eXist db (backups of exist happen nightly). 
    173 commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/" +datacentre_format+ "/" + datacentre + " -u admin -P xxxxxx -p /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/" 
     168commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/"+datacentre_format+ "/" +datacentre_namespace+ " -u admin -P xxxxxx -p "+outdir 
    174169print "Executing : actual command to ingest into exist db" 
    175170status = os.system(commandline) 
    176171if status !=0: 
    177172    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status)) 
     173 
     174# Then run the minimum moles creator, which will run over all records in the supplied collection 
     175# Will it pass back records, or add them straight to /db/discovery/moles? 
     176 
     177 
    178178 
    179179#Make copies of discovery and oai/originals areas to backup area for tape backups 
  • TI01-discovery/trunk/ingestAutomation/OAIBatch/pml_config.properties

    r1619 r1768  
    1111#Define which format is harvested from the data centre (one only) 
    1212format dif 
     13# 
     14#Define the data provider's namespace 
     15namespace npm.ac.uk 
Note: See TracChangeset for help on using the changeset viewer.