source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 863

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@863
Revision 863, 7.9 KB checked in by selatham, 14 years ago (diff)

bug fix to oai_ingest and updated pml and bodc config files

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
4and some java which handle difs after harvesting. The pre-processed files are then ingested
5to the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:           Records with namespace, schema declaration deleted. After having run the script.
12                                       Ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/      Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16"""
17import os
18import sys
19import commands
20import string
21
# Module-level state shared by the stages below.
status = 0          # exit status of the most recent shell command
numfilesproc = 0    # number of files passed successfully through oai_Proc.jar
harvest_home = ""   # harvested-records directory, read from the datacentre config file

# The datacentre name is the single required command-line argument.
# A missing argument is a usage error, so report it the same way every other
# failure in this script is reported: sys.exit(message) writes the message to
# stderr and exits with a non-zero status (a bare sys.exit() reports success).
if len(sys.argv) < 2:
    sys.exit("<datacentre>  parameter not supplied.")
datacentre = sys.argv[1]
31
# Other settings and constants.
# Timestamp (yymmdd_HHMM) obtained from the shell's `date`; used further down
# to name the backup directories.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment settings applied with os.putenv so that the commands launched
# later through os.system() inherit them.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/eXist'),
    ('PATH', ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'),
):
    os.putenv(env_name, env_value)
37
# Get the harvested records directory for this datacentre from the config file for that data centre.
# The harvested records directory depends on the datacentre's OAI base url, the set and format.
# These have to be known up-front.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        # Require both the key and a value: a bare "host_path" line is treated
        # as "not configured" instead of raising IndexError on words[1].
        if len(words) > 1 and words[0] == 'host_path':
            harvest_home = words[1]
            break
finally:
    # Close the file on every path, including the failure exit below.
    datacentre_config_file.close()

if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
55
# An empty harvest directory means there is nothing to do on this run;
# leave quietly (bare sys.exit(), i.e. a successful exit status).
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()

# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'
62
# Create/clear the directory for a pristine copy of the difs in case the script rewrites something wrong.
difcopy_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"
if os.path.isdir(difcopy_dir):
    # Directory already present: empty it.
    commandline = "rm -f " + difcopy_dir + "/*"
else:
    # First run for this datacentre: create it.
    commandline = "mkdir " + difcopy_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Make the pristine copy of the harvested xml records.
commandline = "cp " + harvest_home + "/*.xml " + difcopy_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
82
# Create/clear the directory for the processing copy of the difs.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    commandline = "rm -f " + discovery_dir + "/*"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Check the result here, as the pristine-copy stage does for its directory;
# otherwise the cp below overwrites status and a failed rm/mkdir goes unnoticed.
if status != 0:
    sys.exit("Failed at creating processing dir stage")

# Make the processing copy of the harvested xml records.
commandline = "cp " + harvest_home + "/*.xml " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making processing copy stage")
99
# The file config.properties contains the name=value pair to parse the filename in java oaiProc.jar.
# Copy the datacentre specific version of config to config.properties file.
oaibatch_dir = '/usr/local/WSClients/OAIBatch'
commandline = "cp %s/%s_config.properties %s/config.properties" % (oaibatch_dir, datacentre, oaibatch_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Change the working directory to the one with the java jar in it.
os.chdir(oaibatch_dir)
111
112
# Run the java pre-processor (oai_Proc.jar) over every xml file in the
# datacentre's discovery directory, stopping at the first failure.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(discovery_dir)
for filename in filenames:
    # Build the path before the extension test so that the "not xml" message
    # can name the file being skipped. Previously that message used
    # full_filename from an earlier iteration, or raised NameError when the
    # first directory entry was not xml.
    full_filename = discovery_dir + "/" + filename
    # Match on the extension, not on ".xml" appearing anywhere in the name.
    if not filename.endswith('.xml'):
        print('File %s is not xml format. Not processed' % full_filename)
        continue
    print("Processing : " + full_filename)
    commandline = "java -jar /usr/local/WSClients/OAIBatch/oai_Proc.jar %s " % full_filename
    print(commandline)
    status = os.system(commandline)
    if status != 0:
        # Abort the whole run on the first file the jar fails on.
        sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" % (full_filename, status))
    numfilesproc += 1
129
# The java step reads the files from OAIBatch/data/<datacentre>/discovery and
# writes its output into the same directory. The output no longer has the
# "oai%3Aucar.ncar.scd.cdp%3A" type of prefix that oai adds to the filenames,
# and it has <DIF> as the root element.
#
# Once the pre-processing has finished, remove the originals (oai*) from the
# discovery directory:
commandline = "rm /usr/local/WSClients/OAIBatch/data/%s/discovery/oai*" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at removing original oai style records from discovery directory")
140
# Ingest the datacentre's records into the eXist db (backups of exist happen nightly).
# client.sh is given the target collection (-c) and the directory to load (-p).
commandline = "client.sh -c /db/dif/%s -p /usr/local/WSClients/OAIBatch/data/%s/discovery/" % (datacentre, datacentre)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
147
# Make copies of the oai/difcopy and discovery areas in the backup area for tape backups.
# Each area gets its own directory named <backupdir><datacentre>_<date>_<area>;
# the pristine difcopy area is handled first, then the discovery area.
for backup_suffix, backup_source in (
    ("_difcopy", "/oai/difcopy/* "),
    ("_discovery", "/discovery/* "),
):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    commandline = "cp /usr/local/WSClients/OAIBatch/data/" + datacentre + backup_source + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
172       
# Clear out the original harvest records area so the next harvest starts empty.
commandline = "rm -f " + harvest_home + "/*"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # The message previously referenced the misspelt name "harvet_home",
    # which raised NameError instead of reporting the failure.
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)
179
180
# Final summary banner.
# NOTE(review): every stage above appears to sys.exit() on a non-zero status,
# so the FAILED branch looks unreachable here; it is kept as in the original.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.