source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 1591

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@1591
Revision 1591, 8.7 KB checked in by selatham, 13 years ago (diff)

Uses oaiClean.py instead of java clean-up code. Also deals with upgrade and re-deployment of exist and java and tomcat.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
4and the oaiClean.py class which cleans up discovery records after harvesting.
5The pre-processed files are then ingested to the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Records with namespace, schema declaration deleted - after having run
12                                      the oaiClean script. Ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/    Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16
17"""
18#History:
19# 12/05/06 SEL spelling correction
20# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
21# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
22# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
23# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
24
25import os
26import sys
27import commands
28import string
29import oaiClean
30
# Module-level state shared by the stages of this script.
status = 0          # exit status of the most recent shell command
numfilesproc = 0    # number of records passed through oaiClean
harvest_home = ""   # harvested-records directory; stays "" until read from the config file

# The datacentre name is the single required command-line argument.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero so a calling shell or cron job can detect the usage error
    # (a bare sys.exit() would report success).
    sys.exit(1)

datacentre = sys.argv[1]
40
# Other settings and constants

# Timestamp (yymmdd_HHMM) taken once at start-up; used to name the backup directories.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment for the shell commands launched later through os.system().
# Assigning to os.environ also exports the value to child processes and keeps
# Python's own view of the environment in step, unlike a bare os.putenv().
os.environ['EXIST_HOME'] = '/usr/local/exist-client'
os.environ['PATH'] = ':/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/java/jdk1.5.0_03/bin:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'
# NOTE(review): CLASSPATH still points at the j2sdk1.4.2_04 install while PATH and
# JAVA_HOME use jdk1.5.0_03 -- confirm whether this is intentional.
os.environ['CLASSPATH'] = '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'
# The value previously began with a stray '=' ('=/usr/java/jdk1.5.0_03'),
# which is not a usable directory path.
os.environ['JAVA_HOME'] = '/usr/java/jdk1.5.0_03'
47
# Get the harvested records directory for this datacentre from that datacentre's config file.
# The harvested records directory depends on the datacentre's OAI base url, the set and
# the format. These have to be known up-front.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)

datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        # Only a "host_path <directory>" line matters. Requiring a second word
        # means a bare "host_path" line is reported through the normal failure
        # message below instead of raising IndexError.
        if len(words) > 1 and words[0] == 'host_path':
            harvest_home = words[1]
            break
finally:
    # Close the file on every path, including the failure exit below.
    datacentre_config_file.close()

if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)

# An empty harvest directory is not an error: there is simply nothing to do.
if len(os.listdir(harvest_home)) == 0:
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
69
# Directory that receives the copies destined for tape backup (should already exist).
backupdir = '/disks/glue1/oaiBackup/'

# The 'in' directory holds a pristine copy of the harvested discovery records.
difcopy_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"

# Empty the directory if it is already there, otherwise create it.
# Files are removed one at a time through xargs; "{\}" reaches xargs as "{}".
if not os.path.isdir(difcopy_dir):
    commandline = "mkdir " + difcopy_dir
else:
    commandline = "ls -1 %s/ | xargs -i rm %s/{\\}" % (difcopy_dir, difcopy_dir)
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Make the 'in' pristine copy. Piping the listing through xargs copes with there
# being lots of files in the harvest directory.
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (harvest_home, harvest_home, difcopy_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
93
# Create/clear the directory for the 'out' processed copy of the discovery records.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # Each listed name must be prefixed with the directory: a bare "rm -f {}"
    # would act on the current working directory and leave the previous run's
    # records in place. "{\}" reaches xargs as "{}".
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm -f " + discovery_dir + "/{\\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)

# Stop here on failure, as the other stages of this script do.
if status != 0:
    sys.exit("Failed at creating discovery dir stage")
103
104# Removed 16/10/06 - will hold 'out' processed records only
105# make the processing copy
106#commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
107#print "Executing : " + commandline
108#status = os.system(commandline)
109#if status !=0:
110#    sys.exit("Failed at making processing copy stage")
111
# config.properties holds the location of one particular datacentre's harvested
# records, so install this datacentre's version of the file under that name.
datacentre_config_source = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
commandline = "cp %s /usr/local/WSClients/OAIBatch/config.properties" % datacentre_config_source
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory that contains oaiClean.py (may not be strictly needed).
os.chdir('/usr/local/WSClients/OAIBatch')
123
124
# Run oaiClean over every harvested xml record: read from the pristine 'in'
# copy and write the cleaned record to the 'out' discovery directory.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    # Build the full path first so that both messages below refer to the
    # current file. Previously the "not xml" message used a name that was
    # only assigned in the xml branch, so it was undefined or stale.
    full_filename = indir + "/" + filename
    if filename.find('.xml') == -1:
        print('File %s is not xml format. Not processed' % full_filename)
        continue

    print("Processing : " + full_filename)
    # Any exception raised by oaiClean propagates and aborts the run.
    oaiClean.oaiClean(indir, outdir, filename, ['NERC-DDC'])
    numfilesproc += 1
141
142# Removed 16/10/06 Don't need this anymore
143#Once the pre-processing has finished remove the originals from the discovery directory:
144#commandline = "find /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/oai* -print | xargs -i rm /{\}"
145#print "Executing : " + commandline
146#status = os.system(commandline)
147#if status !=0:
148#    sys.exit("Failed at removing original oai style records from discovery directory")
149
150
# Command that ingests the datacentre's records into the eXist db (backups of
# exist happen nightly). Only the command string is built here: the lines that
# would run it are commented out below.
commandline = ("client.sh -c /db/discovery/dif/%s -u admin -P xxxxxx"
               " -p /usr/local/WSClients/OAIBatch/data/%s/discovery/" % (datacentre, datacentre))
#print "Executing : " + commandline
#status = os.system(commandline)
#if status !=0:
#    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" %(datacentre,status))
157
# Copy the oai/difcopy and discovery areas into the backup area for tape backups.
# Each area gets its own directory named <backupdir><datacentre>_<date_string>_<area>;
# the difcopy area is handled first, then the discovery area.
for backup_suffix, data_subdir in (("_difcopy", "/oai/difcopy/"),
                                   ("_discovery", "/discovery/")):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # Copy file by file through xargs; "{\}" reaches xargs as "{}".
    source_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + data_subdir
    commandline = "ls -1 %s | xargs -i cp %s{\\} %s" % (source_dir, source_dir, this_backupdir)
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
184       
# Command that clears out the original harvest records area. Only the command
# string is built here: the lines that would run it are commented out below.
commandline = "ls -1 %s | xargs -i rm %s/{\\}" % (harvest_home, harvest_home)
#print "Executing : " + commandline
#status = os.system(commandline)
#if status !=0:
#    sys.exit("Failed at clearing out original harvest records area %s" %harvest_home)
191
192
# Final report: number of records pre-processed and the status of the last
# shell command that was run.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.