source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 721

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@721
Revision 721, 8.0 KB checked in by selatham, 14 years ago (diff)

adding ingest scripts, config files etc

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
4and some java which handle difs after harvesting. The pre-processed files are then ingested
5to the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:           Records with namespace, schema declaration deleted. After having run the script.
12                                       Ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/      Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16"""
17import os
18import sys
19import commands
20import string
21
# Exit status of the most recent shell command; 0 means success.
status = 0
# Number of harvested files successfully pre-processed by the java step.
numfilesproc = 0
# Harvested records directory; filled in from the datacentre config file below.
harvest_home = ""

# The datacentre name is the single required command-line parameter.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero so that callers (cron, wrapper scripts) can detect the
    # usage error; a bare sys.exit() would report success.
    sys.exit(1)
datacentre = sys.argv[1]
31
# Other settings and constants.
# Timestamp (yymmdd_HHMM) taken from the system `date` command; it is used
# further down to name the backup directories.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment handed to the shell commands launched by this script.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/eXist'),
    ('PATH', ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'),
):
    os.putenv(env_name, env_value)
37
# Get the harvested records directory for this datacentre from the config file for that data centre.
# The harvested records directory depends on the datacentre's OAI base url, the set and the format.
# These have to be known up-front.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    # Look for a whitespace-separated "host_path <directory>" entry.
    for line in datacentre_config_file:
        words = line.split()
        # Blank lines and keys without a value cannot be the entry we want;
        # skipping them avoids an IndexError on words[0] / words[1].
        if len(words) < 2:
            continue
        if words[0] == 'host_path':
            harvest_home = words[1]
            break
finally:
    # Release the file handle on every path, including the failure exit below.
    datacentre_config_file.close()

if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
53
# The harvested_records directory, e.g.:
#harvest_home = "/home/tomcat/oai/WEB-INF/harvested_records/www.npm.ac.uk/8080/oai/provider/rsdas/dif"

# Destination for the tape backup copies (the directory should already exist).
backupdir = '/disks/glue1/oaiBackup/'

# Keep a pristine copy of the difs in case the script rewrites something wrong.
# An existing copy directory is emptied; a missing one is created.
difcopy_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"
if not os.path.isdir(difcopy_dir):
    commandline = "mkdir " + difcopy_dir
else:
    commandline = "rm -f " + difcopy_dir + "/*"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Copy every harvested xml record into the pristine copy directory.
commandline = "cp %s/*.xml %s" % (harvest_home, difcopy_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
79
# Create/clear the directory for the processing copy of the difs.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    commandline = "rm -f " + discovery_dir + "/*"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Stop here if the directory could not be prepared, as the difcopy stage
# does; otherwise the failure status is overwritten by the cp below and
# stale files may be processed and ingested.
if status != 0:
    sys.exit("Failed at creating processing dir stage")

# make the processing copy
commandline = "cp " + harvest_home + "/*.xml " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making processing copy stage")
96
# The java oaiProc.jar parses filenames using name=value pairs read from the
# file config.properties, so the datacentre specific version of the config is
# copied over config.properties before the java step runs.
# e.g. for bodc:-
#   cat bodc_config.properties
#       #### config.properties #######
#   Define host_OAI as the string that OAI adds to the filenames after harvesting
#   String added by OAI for BODC, SOC, NCAR
#     BODC = oai%3Agrid.bodc.nerc.ac.uk%3A
#     SOC = oai%3Aoai.noc.soton.ac.uk%3A
#     NCAR = oai%3Aucar.ncar.scd.cdp%3A
#
#   host_OAI=oai%3Agrid.bodc.nerc.ac.uk%3A
#
#               ###########

commandline = "cp /usr/local/WSClients/OAIBatch/%s_config.properties /usr/local/WSClients/OAIBatch/config.properties" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")
117
# Change the working directory to the one holding the java jar.
os.chdir('/usr/local/WSClients/OAIBatch')

# Run the java pre-processor on every xml file in the discovery directory.
discovery_path = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
for filename in os.listdir(discovery_path):
    # Build the path before the format check so that the "not xml" message
    # names the current file; previously that branch used a variable that
    # was either unassigned (NameError) or left over from an earlier file.
    full_filename = discovery_path + "/" + filename
    if '.xml' not in filename:
        print('File %s is not xml format. Not processed' % full_filename)
        continue
    print("Processing : " + full_filename)
    commandline = "java -jar /usr/local/WSClients/OAIBatch/oai_Proc.jar %s " % full_filename
    print(commandline)
    status = os.system(commandline)
    if status != 0:
        # Abort on the first file the java step fails on.
        sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" % (full_filename, status))
    numfilesproc += 1
138
# The java step reads the files from OAIBatch/data/datacentre/discovery and
# writes its output files into that same directory. The output drops the
# "oai%3Aucar.ncar.scd.cdp%3A" type of prefix that oai adds to the filenames
# and leaves <DIF> as the root element.
#
# With pre-processing finished, delete the originals (still named oai*) from
# the discovery directory.
commandline = "rm /usr/local/WSClients/OAIBatch/data/%s/discovery/oai*" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at removing original oai style records from discovery directory")
149
# Load this datacentre's records into the eXist db collection /db/dif/<datacentre>
# (backups of exist happen nightly).
commandline = "client.sh -c /db/dif/%s -p /usr/local/WSClients/OAIBatch/data/%s/discovery/" % (datacentre, datacentre)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
156
# Copy the oai/difcopy and discovery areas into freshly created, timestamped
# directories under the backup area so they are picked up by tape backups.
# Each pair is (suffix of the backup directory name, source directory below data/<datacentre>).
for backup_suffix, source_subdir in (("_difcopy", "/oai/difcopy"), ("_discovery", "/discovery")):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    commandline = "cp /usr/local/WSClients/OAIBatch/data/" + datacentre + source_subdir + "/* " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
181       
182   
183
# Final report: number of files pre-processed and the overall outcome.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.