source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 1755

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@1755
Revision 1755, 9.8 KB checked in by selatham, 13 years ago (diff)

corrected boolean.

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
4and the oaiClean.py class which cleans up discovery records after harvesting.
5The pre-processed files are then ingested to the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Records with namespace, schema declaration deleted - after having run
12                                      the oaiClean script. Ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/    Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16
17"""
18#History:
19# 12/05/06 SEL spelling correction
20# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
21# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
22# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
23# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
24# 17/10/06 SEL cope with different discovery formats - not just DIF.
25# 23/10/06 SEL keywords not mandatory in config file.
26# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
27
28import os
29import sys
30import commands
31import string
32import oaiClean
33
# Script-wide state, filled in as the run progresses.
status = 0              # result of the most recent os.system() call
numfilesproc = 0        # number of files handed to oaiClean
harvest_home = ""       # set from the 'host_path' entry of the config file
datacentre_groups = ""  # set from the 'groups' entry of the config file
datacentre_format = ""  # set from the 'format' entry of the config file

# The datacentre name is the one required command-line argument.
if len(sys.argv) >= 2:
    datacentre = sys.argv[1]
else:
    print("<datacentre>  parameter not supplied.")
    sys.exit()
45
# Timestamp (yymmdd_HHMM) used later to name this run's backup directories.
date_string = commands.getoutput ("date +'%y%m%d_%H%M'")

# Environment for the shell commands launched further down via os.system().
# NOTE(review): CLASSPATH points at a j2sdk1.4.2_04 directory while PATH uses
# jdk1.5.0_03 - confirm this mix is intentional.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('PATH', ':/usr/java/jdk1.5.0_03/jre:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'),
):
    os.putenv(env_name, env_value)
51
# Read the datacentre specific config file. Three whitespace-separated
# entries are recognised:
#   host_path <dir>      - directory holding the harvested records. This depends
#                          on the datacentre's OAI base url, the set and the
#                          format, which have to be known up-front. (mandatory)
#   groups <g1> <g2> ... - the 'portal groups' the datacentre belongs to, for
#                          limiting searches to say NERC-only datacentres
#                          records. Groups are added to the xml record by
#                          oaiClean.py. (optional)
#   format <name>        - the discovery format being harvested. (mandatory)
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        # Skip blank lines and keys given without a value. A bare key used to
        # raise IndexError on words[1]; now it is simply treated as "not set"
        # and reported by the checks below.
        if len(words) < 2:
            continue
        key = words[0]
        if key == 'host_path':
            harvest_home = words[1]
        elif key == 'groups':
            datacentre_groups = words[1:]
        elif key == 'format':
            datacentre_format = words[1]
finally:
    # Close the file even if reading it fails part-way through.
    datacentre_config_file.close()

# host_path is mandatory: without it there is nothing to ingest.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

# groups/keywords are optional.
if not datacentre_groups:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)
else:
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

# format is mandatory: it forms part of the eXist collection path used at ingest.
if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)
85
# Stop early (normal exit) when the harvest directory holds no records.
harvested_entries = os.listdir(harvest_home)
if not harvested_entries:
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
90
# Root directory for the copies made for tape backup (should already exist).
backupdir = '/disks/glue1/oaiBackup/'

# 'in' directory: pristine copy of the discovery records.
# If it already exists, empty it; otherwise create it.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if not os.path.isdir(originals_dir):
    commandline = "mkdir " + originals_dir
else:
    # Files are removed one at a time through xargs so that a very large
    # directory does not hit the "Argument list too long" limit.
    # The shell sees {\} which it hands to xargs as its {} placeholder.
    commandline = "ls -1 " + originals_dir + "/ | xargs -i rm " + originals_dir + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")
106
# Fill the 'in' directory with a pristine copy of every harvested file.
# Copying goes file-by-file through xargs to cope with directories holding
# lots of files.
pristine_target = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (harvest_home, harvest_home, pristine_target)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
114
# 'out' directory: holds the processed copy of the discovery records.
# If it already exists, empty it; otherwise create it.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # Remove files one at a time through xargs to cope with large directories.
    # The shell sees {\} which it hands to xargs as its {} placeholder.
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm " + discovery_dir + "/{\\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)

# Check the result here, as is done for the 'in' directory: the status was
# previously overwritten by the next command, so a failed mkdir/rm went
# unnoticed and processing carried on with a missing or stale 'out' directory.
if status != 0:
    sys.exit("Failed at creating/clearing discovery dir stage")
124
125# Removed 16/10/06 - directory will hold 'out' processed records only
126# make the processing copy
127#commandline = "ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/{\} /usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
128#print "Executing : " + commandline
129#status = os.system(commandline)
130#if status !=0:
131#    sys.exit("Failed at making processing copy stage")
132
# config.properties contains the location of the particular datacentre's
# harvested records: overwrite it with this datacentre's specific version.
batch_home = "/usr/local/WSClients/OAIBatch"
commandline = "cp %s/%s_config.properties %s/config.properties" % (batch_home, datacentre, batch_home)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory that holds oaiClean.py. (need this?)
os.chdir(batch_home)
143
144
# Run oaiClean over every xml file in the 'in' directory, writing the
# processed records to the 'out' directory.
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
wrapFlag = False    # passed straight through to oaiClean.oaiClean
filenames = os.listdir(indir)
for filename in filenames:
    # Build the full path up-front so that both branches can report it.
    # Previously it was only set in the xml branch, so the "not xml" message
    # raised NameError for a leading non-xml file or named the previous file.
    full_filename = indir + "/" + filename

    # A file counts as xml when '.xml' appears anywhere in its name.
    if '.xml' not in filename:
        print('File %s is not xml format. Not processed' % full_filename)
        continue

    print("Processing : " + full_filename)
    # Any exception raised by oaiClean is deliberately not caught here, so it
    # aborts the whole run before anything is ingested.
    oaiClean.oaiClean(indir, outdir, filename, wrapFlag)
    numfilesproc += 1
162
163# Removed 16/10/06 Don't need this anymore
164#Once the pre-processing has finished remove the originals from the discovery directory:
165#commandline = "find /usr/local/WSClients/OAIBatch/data/" + datacentre +"/discovery/oai* -print | xargs -i rm /{\}"
166#print "Executing : " + commandline
167#status = os.system(commandline)
168#if status !=0:
169#    sys.exit("Failed at removing original oai style records from discovery directory")
170
171
# Load the processed records into the eXist db collection
# /db/discovery/<format>/<datacentre> (backups of exist happen nightly).
processed_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/%s/%s -u admin -P xxxxxx -p %s/" % (datacentre_format, datacentre, processed_dir)
# The command line carries the admin password, so it is not echoed.
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
178
# Copy the oai/originals and discovery areas into freshly created,
# timestamped directories under the backup area, ready for tape backup.
# Each pass: create <backupdir><datacentre>_<date>_<suffix>, then copy the
# files across one at a time through xargs.
for backup_suffix, data_subdir in (("_originals", "/oai/originals"),
                                   ("_discovery", "/discovery")):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    source_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + data_subdir
    commandline = "ls -1 " + source_dir + "/ | xargs -i cp " + source_dir + "/{\\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
205
# Everything has been ingested and backed up: empty the original harvest
# records area, removing files one at a time through xargs.
commandline = "ls -1 %s | xargs -i rm %s/{\\}" % (harvest_home, harvest_home)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)
212
213
# Closing summary: number of files run through oaiClean and the final status.
summary_rule = "======================================================"
print(summary_rule)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(summary_rule)
Note: See TracBrowser for help on using the repository browser.