source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 916

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@916
Revision 916, 8.0 KB checked in by root, 14 years ago (diff)

correct spelling

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this Python script, a config file per
4datacentre, and some Java code which handles the DIFs after harvesting. The pre-processed files
5are then ingested into the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:           Records with the namespace and schema declarations deleted
12                                       (after the script has run), ready to ingest into the discovery service.
13                - oai/difYYYYMMDD/      Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16
17"""
18#History:
19# 12/05/06 SEL spelling correction
20#
21#
22
23import os
24import sys
25import commands
26import string
27
# Exit status of the most recent shell command, the count of files pre-processed
# by the java step, and the harvested-records directory read from the config file.
status = 0
numfilesproc = 0
harvest_home = ""

# The datacentre name is the single required command line argument.
# NOTE(review): it is spliced unquoted into file paths and shell command lines
# further down; no validation or quoting is applied here.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero so a calling shell or cron job can detect the usage error
    # (a bare sys.exit() reports success).
    sys.exit(1)
datacentre = sys.argv[1]
37
# Other settings and constants

# Timestamp in yymmdd_HHMM form, obtained from the system `date` command.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment variables handed to the child processes this script launches.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/eXist'),
    ('PATH', ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'),
):
    os.putenv(env_name, env_value)
43
# Get the harvested records directory for this datacentre from the config file for that data centre.
# The harvested records directory depends on the datacentre's OAI base url, the set and the format.
# These have to be known up-front.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)

# Scan the config file for the first whitespace-separated "host_path <directory>" entry.
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        # Skip blank lines and keys without a value; a bare 'host_path' line
        # would otherwise raise IndexError on words[1].
        if len(words) < 2:
            continue
        if words[0] == 'host_path':
            harvest_home = words[1]
            break
finally:
    # Close the file on every path, including the failure exit below.
    datacentre_config_file.close()

if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)

# An empty harvest directory is not an error: report it and stop with a success status.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
65
# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'

# Directory holding a pristine copy of the difs, kept in case the script rewrites something wrong.
difcopy_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"

# Create the directory if it is missing, otherwise empty it.
if not os.path.isdir(difcopy_dir):
    commandline = "mkdir " + difcopy_dir
else:
    commandline = "rm -f " + difcopy_dir + "/*"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at creating copy dir stage")

# make the pristine copy
commandline = "cp %s/*.xml %s" % (harvest_home, difcopy_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
88
# Create/clear the directory for the processing copy of the difs.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    commandline = "rm -f " + discovery_dir + "/*"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Stop here if the directory could not be created/cleared, rather than letting
# the cp below overwrite the status and hide the failure.
if status != 0:
    sys.exit("Failed at creating processing dir stage")

# make the processing copy
commandline = "cp " + harvest_home + "/*.xml " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making processing copy stage")
105
# The java oaiProc.jar reads name=value pairs from config.properties in order to parse the filenames.
# Install the datacentre specific config under that fixed name.
config_source = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
commandline = "cp %s /usr/local/WSClients/OAIBatch/config.properties" % config_source
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory that holds the java jar.
os.chdir('/usr/local/WSClients/OAIBatch')
117
118
#Execute the script which processes the files
# Each .xml file in the discovery directory is passed, one at a time, to the java
# oai_Proc.jar; the run stops at the first file for which java returns a non-zero status.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
for filename in os.listdir(discovery_dir):
    # Build the path before branching so the "not xml" message names the current
    # entry instead of an unbound or stale variable.
    full_filename = discovery_dir + "/" + filename
    # Suffix test, matching the "*.xml" glob used to populate this directory.
    if not filename.endswith('.xml'):
        print('File %s is not xml format. Not processed' % full_filename)
        continue
    print("Processing : " + full_filename)
    commandline = "java -jar /usr/local/WSClients/OAIBatch/oai_Proc.jar %s " % full_filename
    print(commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" % (full_filename, status))
    numfilesproc += 1
135
#The java step reads the files in OAIBatch/data/datacentre/discovery and writes its output into the same directory.
#The output drops the "oai%3Aucar.ncar.scd.cdp%3A" style prefix that oai adds to
#the filenames and leaves <DIF> as the root element.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"

#With pre-processing finished, delete the oai-named originals from the discovery directory:
commandline = "rm " + discovery_dir + "/oai*"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at removing original oai style records from discovery directory")

# ingest the datacentres records into eXist db (backups of exist happen nightly).
commandline = "client.sh -c /db/dif/%s -p %s/" % (datacentre, discovery_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
153
#Make copies of the oai/difcopy and discovery areas in the backup area for tape backups.
# Each area gets its own timestamped directory: first difcopy, then discovery.
data_root = "/usr/local/WSClients/OAIBatch/data/" + datacentre
for backup_suffix, source_subdir in (("_difcopy", "/oai/difcopy"), ("_discovery", "/discovery")):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    commandline = "cp " + data_root + source_subdir + "/* " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
178       
#Clear out the original harvest records area
commandline = "rm -f " + harvest_home + "/*"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

# Final summary. The status was checked immediately above, so it is necessarily 0
# here and only the success line can be reported.
print("======================================================")
print("No. of files pre-processed = %s" % numfilesproc)
print(" Procedure oai_ingest.py ran to end")
print("======================================================")
Note: See TracBrowser for help on using the repository browser.