source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 1898

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@1898
Revision 1898, 11.8 KB checked in by selatham, 13 years ago (diff)

further ingest work

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains:-
4 - this python script,
5 - a DataProvider specific config file,
6 - the d2b.jar moles creator class which creates moles discovery records,
7 - the python module for extracting spatiotemporal information and adding to postgres db.
8Under this directory the following structure should be maintained:
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Re-named documents ready to ingest in the discovery service.
12                - oai/difYYYYMMDD/    Documents as harvested from OAI
13 Where  /DATACENTRE  varies for the different data providers
14"""
15#History:
16# 12/05/06 SEL spelling correction
17# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
18# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
19# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
20# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
21# 17/10/06 SEL cope with different discovery formats - not just DIF.
22# 23/10/06 SEL keywords not mandatory in config file.
23# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
24
25import os
26import sys
27import commands
28import string
29import SpaceTimeIngestFromMOLES
30#import oaiClean
31
# Module-level state shared by the later stages of this script.
status = 0                 # exit status of the most recent os.system() call
numfilesproc = 0           # number of harvested files renamed ready for ingest
harvest_home = ""          # set from the 'host_path' entry of the config file
datacentre_groups = ""     # set from the 'groups' entry of the config file
datacentre_format = ""     # set from the 'format' entry of the config file
datacentre_namespace = ""  # set from the 'namespace' entry of the config file

# The datacentre name is the one mandatory command-line argument.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # Exit non-zero: a bare sys.exit() would report success to the caller
    # even though nothing was ingested.
    sys.exit(1)

datacentre = sys.argv[1]
44
# Other settings and constants

# Timestamp in yymmdd_HHMM form, taken from the system 'date' command.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment handed to the child processes started later via os.system().
# Kept as an ordered sequence so the variables are set in a fixed order.
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('JAVA_HOME', '/usr/java/jdk1.5.0_03'),
    ('PATH', ':'.join([
        '',  # leading empty entry reproduces the leading ':' of the search path
        '/usr/java/jdk1.5.0_03/bin',
        '/usr/java/jdk1.5.0_03',
        '/usr/java/jdk1.5.0_03/lib/tools.jar',
        '/usr/local/WSClients/OAIBatch',
        '/usr/local/exist-client/bin',
        '/bin',
        '/usr/bin',
        '.',
    ])),
    ('CLASSPATH', '.:/usr/java/jdk1.5.0_03/lib/tools.jar'),
):
    os.putenv(env_name, env_value)
51
# Read the datacentre-specific config file. Each line is whitespace separated:
# a key followed by one or more values. Keys used here:
#   host_path - directory holding the harvested records. It depends on the
#               datacentre's OAI base url, the set and the format, which have
#               to be known up-front.
#   groups    - the 'portal groups' the datacentre belongs to, for limiting
#               searches to, say, NERC-only datacentres' records. Groups are
#               added to the intermediate MOLES when it is created.
#   format    - the discovery format being harvested.
#   namespace - the datacentre namespace.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)
try:
    datacentre_config_file = open(datacentre_config_filename, "r")
except IOError:
    # Report a readable message instead of a traceback when the file is absent.
    sys.exit("Failed to open datacentre config file %s" % datacentre_config_filename)

try:
    for line in datacentre_config_file:
        words = line.split()
        # Skip blank lines and keys that have no value; a key without a value
        # leaves the setting empty so the checks that follow can report it.
        if len(words) < 2:
            continue
        key = words[0]
        if key == 'host_path':
            harvest_home = words[1]
        elif key == 'groups':
            datacentre_groups = words[1:]
        elif key == 'format':
            datacentre_format = words[1]
        elif key == 'namespace':
            datacentre_namespace = words[1]
finally:
    # Always release the file handle, even if parsing raises.
    datacentre_config_file.close()
73
# Check the settings read from the datacentre config file. The harvest
# directory, format and namespace are mandatory; groups/keywords are optional.
# sys.exit() raises, so no 'else' is needed after each failure branch.
if not harvest_home:
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups != "":
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)
else:
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)

if not datacentre_format:
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if not datacentre_namespace:
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)

# Any records to harvest? An empty harvest directory is a normal, clean exit.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()
98
# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'

# Create/clear the 'in' directory that holds a pristine copy of the
# discovery records.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if not os.path.isdir(originals_dir):
    commandline = "mkdir " + originals_dir
else:
    # Files are removed one at a time through xargs rather than 'rm dir/*'.
    commandline = "ls -1 " + originals_dir + "/ | xargs -i rm " + originals_dir + "/{\}"
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Make the 'in' pristine copy. Cope with there being lots of files in the
# directory by copying them individually through xargs.
commandline = "ls -1 %s/ | xargs -i cp %s/{\} %s" % (harvest_home, harvest_home, originals_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
122
# Create/clear the directory for the 'out' processed copy of the discovery records.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # Files are removed one at a time through xargs rather than 'rm dir/*'.
    commandline = "ls -1 " + discovery_dir + "/ | xargs -i rm " + discovery_dir + "/{\}"
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Check this status before the next command overwrites it, as is done for
# the oai/originals directory.
if status != 0:
    sys.exit("Failed at creating discovery dir stage")

# The file config.properties contains the location of the particular
# datacentre's harvested records. Copy the datacentre-specific version of the
# config over the generic config.properties file.
commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties /usr/local/WSClients/OAIBatch/config.properties"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")
140
# Change the working directory to the one with the code in it; later
# commands use paths relative to it (D2B/d2b.jar, ./DIF2MOLES).
os.chdir('/usr/local/WSClients/OAIBatch')

# Copy each harvested record from the 'in' directory to the 'out' directory
# under a new name: <namespace>__<last '%3A'-separated part of the filename>.
# (Changed 24/11/06 to simply create a re-named file in the outdir; the
# oaiClean module is no longer invoked here.)
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    if filename.find('.xml') == -1:
        # Report the skipped file by its own name; the original referenced an
        # undefined variable here and raised NameError.
        print('File %s is not xml format. Not processed' % filename)
        continue

    original_filename = indir + "/" + filename
    new_filename = outdir + "/" + datacentre_namespace + "__" + filename.split('%3A')[-1]
    commandline = "cp " + original_filename + " " + new_filename
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at re-naming file stage")
    numfilesproc += 1
165
# Ingest the datacentre's records into eXist db (backups of exist happen nightly).
# 'xxxxxx' is the password placeholder; it is replaced in the deployed version.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/original/" + datacentre_format + "/" + datacentre_namespace + " -u admin -P xxxxxx -p " + outdir
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Then run the minimum moles creator, which will run over all records in the
# supplied collection. It creates a directory ./DIF2MOLES to pass back records
# with the original filename.
commandline = "java -jar D2B/d2b.jar repositoryID " + datacentre_namespace + " repositoryLocalID " + datacentre + " format " + datacentre_format + " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection /db/discovery/original/" + datacentre_format + "/" + datacentre_namespace
print("Executing command to run d2b.jar")
status = os.system(commandline)
if status != 0:
    print("ERROR: couldn't create the minimum moles records")
    # The original wrote 'sys.exit' without calling it, so the script carried
    # on after a failed run. Actually stop, with a non-zero status.
    sys.exit(1)

# Are there any records? The creator is expected to have made ./DIF2MOLES.
outdir = "./DIF2MOLES"
if not os.path.exists(outdir):
    print("ERROR: couldn't create the minimum moles records for %s" % datacentre)
    sys.exit(1)

# Ingest the created discovery minimum moles records into eXist db.
commandline = "$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p ./DIF2MOLES"
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Extract the spatiotemporal info from created moles and put in Postgres db
SpaceTimeIngestFromMOLES.main(outdir)
198
# Make copies of the oai/originals, discovery and DIF2MOLES areas in the
# backup area for tape backups. Each entry is:
#   (backup directory suffix, argument given to 'ls', directory copied from)
data_root = "/usr/local/WSClients/OAIBatch/data/" + datacentre
backup_jobs = (
    ("_originals", data_root + "/oai/originals/", data_root + "/oai/originals"),
    ("_discovery", data_root + "/discovery/", data_root + "/discovery"),
    ("_DIF2MOLES", "./DIF2MOLES", "./DIF2MOLES"),
)

for suffix, listing_target, source_dir in backup_jobs:
    # One backup directory per area, named <datacentre>_<timestamp><suffix>.
    this_backupdir = backupdir + datacentre + "_" + date_string + suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # Copy file by file through xargs.
    commandline = "ls -1 " + listing_target + " | xargs -i cp " + source_dir + "/{\} " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
238
# Clear out the original harvest records area and DIF2MOLES
commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

# Directory of generated MOLES records, relative to the working directory.
moles_dir = "./DIF2MOLES"

commandline = "ls -1 ./DIF2MOLES | xargs -i rm ./DIF2MOLES/{\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Report the directory that actually failed, not the harvest area.
    sys.exit("Failed at clearing out DIF2MOLES area %s" % moles_dir)

# Remove the (now empty) DIF2MOLES directory
commandline = "rmdir ./DIF2MOLES"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at removing DIF2MOLES directory %s" % moles_dir)
258
# Final summary: number of files pre-processed and the status of the last
# shell command run.
banner = "======================================================"
print(banner)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(banner)
Note: See TracBrowser for help on using the repository browser.