source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 1172

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@1172
Revision 1172, 8.6 KB checked in by selatham, 14 years ago (diff)

fixing bug in directory tidying

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
""" Script oai_ingest.py takes parameter <datacentre>.
The /usr/local/WSClients/OAIBatch directory contains this python script, a config file
and some java which handles difs after harvesting. The pre-processed files are then ingested
into the eXist XML db.

 Under this directory the following structure should be maintained:

 ./data
 - /DATACENTRE/
                - discovery/:           Records with namespace and schema declarations deleted, as left
                                       after running the script. Ready to ingest in the discovery service.
                - oai/difYYYYMMDD/      Records as harvested from OAI

 Where  /DATACENTRE  varies for the different data providers

"""
18#History:
19# 12/05/06 SEL spelling correction
20# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
21# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
22
import commands
import os
import string
import sys
import time
27
# Exit status of the most recent shell command run through os.system.
status = 0
# Number of harvested files successfully pre-processed by the java step.
numfilesproc = 0
# Harvested-records directory; stays empty until it is read from the
# datacentre's config file.
harvest_home = ""

# The datacentre name is the single required command-line argument.
if len(sys.argv) < 2:
    # Leave through sys.exit(message), as the other failure paths in this
    # script do: the text goes to stderr and the exit status is non-zero, so
    # a calling script or cron job can tell that nothing was ingested.
    sys.exit("<datacentre>  parameter not supplied.")

datacentre = sys.argv[1]
37
# Other settings and constants

# Timestamp (yymmdd_HHMM, local time) used to name this run's backup
# directories. It is formatted in-process instead of spawning the external
# `date` command; the format string is the same one `date` was given.
date_string = time.strftime('%y%m%d_%H%M')

# Environment for the commands launched later through os.system.
# Assigning through os.environ still exports the values to child processes
# and also keeps os.environ itself up to date, which a bare os.putenv() call
# does not.
os.environ['EXIST_HOME'] = '/usr/local/eXist'
os.environ['PATH'] = ':/usr/java/j2sdk1.4.2_04/lib/tools.jar:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch:/usr/local/eXist/bin:/bin:/usr/bin:.'
os.environ['CLASSPATH'] = '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'
43
# Look up the harvested-records directory for this datacentre in its config
# file. That directory depends on the datacentre's OAI base url, the set and
# the format, so it has to be known up-front and recorded in the config as a
# line of the form "host_path <directory>".
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)

harvest_home = ""
try:
    datacentre_config_file = open(datacentre_config_filename, "r")
except IOError:
    # A missing or unreadable config file is reported like any other failure
    # of this stage rather than as a raw traceback.
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)

try:
    for line in datacentre_config_file:
        words = line.split()
        # Blank lines and a "host_path" key with no value are skipped; the
        # latter then falls through to the failure exit below.
        if len(words) >= 2 and words[0] == 'host_path':
            harvest_home = words[1]
            break
finally:
    # Close the file on every path, including the failure exit below.
    datacentre_config_file.close()

if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)

if not os.path.isdir(harvest_home):
    sys.exit("Failed at getting harvested records directory stage. Harvest directory %s does not exist" % harvest_home)

# An empty harvest directory is not an error: there is simply nothing to do.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
65
# Staging area for the tape backup; the script expects it to exist already.
backupdir = '/disks/glue1/oaiBackup/'

# Directory holding a pristine copy of the difs, kept in case the script
# rewrites something wrongly. Empty it when it exists, otherwise create it.
difcopy_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/difcopy"
if not os.path.isdir(difcopy_dir):
    commandline = "mkdir " + difcopy_dir
else:
    # File names are piped to rm one at a time through xargs.
    commandline = "ls -1 " + difcopy_dir + "/ | xargs -i rm " + difcopy_dir + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")
81
# Take the pristine copy of the harvested records. Files are copied one at a
# time through xargs so that a directory with lots of files is handled.
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} /usr/local/WSClients/OAIBatch/data/%s/oai/difcopy" % (
    harvest_home, harvest_home, datacentre)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
89
# Create/clear the directory for the processing copy of the difs.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # `ls -1` prints bare file names, so each one must be prefixed with the
    # directory before it is handed to rm. Without the prefix rm -f looks for
    # the names in the current working directory and the discovery directory
    # is never emptied.
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm -f " + discovery_dir + "/{\\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)

# Stop here, as the other stages do, rather than copying into a directory
# that could not be prepared.
if status != 0:
    sys.exit("Failed at creating processing dir stage")
99
# Copy the harvested records into the discovery directory, where they will be
# pre-processed. Files go through xargs one at a time.
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} /usr/local/WSClients/OAIBatch/data/%s/discovery" % (
    harvest_home, harvest_home, datacentre)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making processing copy stage")
106
# The java oaiProc.jar reads config.properties for the name=value pair it
# uses to parse file names, so install this datacentre's version of the
# config under that fixed name.
commandline = "cp /usr/local/WSClients/OAIBatch/%s_config.properties /usr/local/WSClients/OAIBatch/config.properties" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")

# Work from the directory that holds the java jar.
os.chdir('/usr/local/WSClients/OAIBatch')
118
119
# Run the java pre-processor over every xml file in the discovery directory.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
for filename in os.listdir(discovery_dir):
    # Build the full path before the xml check so the "not processed"
    # message names the file actually being skipped.
    full_filename = discovery_dir + "/" + filename
    if filename.find('.xml') == -1:
        print('File %s is not xml format. Not processed' % full_filename)
        continue

    print("Processing : " + full_filename)
    commandline = "java -jar /usr/local/WSClients/OAIBatch/oai_Proc.jar %s " % full_filename
    print(commandline)
    status = os.system(commandline)
    if status != 0:
        # Abort on the first file the java step fails on.
        sys.exit("Failed at processing file %s with java oai_Proc.jar stage with status %s" % (full_filename, status))
    numfilesproc += 1
136
# The pre-processing step reads the files in OAIBatch/data/<datacentre>/discovery
# and writes its output into that same directory. The output file names lose
# the "oai%3Aucar.ncar.scd.cdp%3A" style prefix that oai adds, and <DIF> is
# left as the root element.
#
# With pre-processing finished, delete the original oai* files from the
# discovery directory. find is given an absolute start point, so the '/'
# placed in front of {} makes rm receive a path starting with '//'.
# NOTE(review): this relies on a leading '//' being treated like '/', as on
# Linux - confirm before running elsewhere.
commandline = "find /usr/local/WSClients/OAIBatch/data/%s/discovery/oai* -print | xargs -i rm /{\\}" % datacentre
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at removing original oai style records from discovery directory")
147
# Load the datacentre's records into the eXist db (eXist itself is backed up
# nightly). The command is not echoed; presumably this is because it carries
# the admin password (xxxxxx is a placeholder for the real one).
commandline = " ".join([
    "client.sh",
    "-c", "/db/dif/" + datacentre,
    "-u", "admin",
    "-P", "xxxxxx",
    "-p", "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery/",
])
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
154
# Copy the oai/difcopy and discovery areas into fresh, timestamped
# directories under the backup area, ready for the tape backup. Each area is
# handled the same way: create its backup directory, then copy the files
# across one at a time through xargs.
for backup_suffix, area in (("_difcopy", "/oai/difcopy"), ("_discovery", "/discovery")):
    area_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + area
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix

    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    commandline = "ls -1 " + area_dir + "/ | xargs -i cp " + area_dir + "/{\\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
181       
# Everything has been ingested and backed up, so empty the original harvest
# area. Files are removed one at a time through xargs.
commandline = "ls -1 %s | xargs -i rm %s/{\\}" % (harvest_home, harvest_home)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)
188
189
# Closing report: how many files were pre-processed and whether the last
# command's status was clean.
print("======================================================")
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print("======================================================")
Note: See TracBrowser for help on using the repository browser.