source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 1880

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@1880
Revision 1880, 11.7 KB checked in by selatham, 13 years ago (diff)

putting all new parts in

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2""" Script oai_ingest.py takes parameter <datacentre>.
3The /usr/local/WSClients/OAIBatch directory contains this python script, a DataProvider specific config file
4and the oaiClean.py class which cleans up discovery records after harvesting.
5The pre-processed files are then ingested to the eXist XML db.
6
7 Under this directory the following structure should be maintained:
8
9 ./data
10 - /DATACENTRE/
11                - discovery/:         Records with namespace, schema declaration deleted - after having run
12                                      the oaiClean script. Ready to ingest in the discovery service.
13                - oai/difYYYYMMDD/    Records as harvested from OAI
14
15 Where  /DATACENTRE  varies for the different data providers
16
17"""
18#History:
19# 12/05/06 SEL spelling correction
20# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
21# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
22# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
23# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
24# 17/10/06 SEL cope with different discovery formats - not just DIF.
25# 23/10/06 SEL keywords not mandatory in config file.
26# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.
27
28import os
29import sys
30import commands
31import string
32#import oaiClean
33
# Module-level state shared by the processing stages further down.
status = 0          # result of the most recent os.system() call
numfilesproc = 0    # number of harvested files copied/renamed for ingest

# Settings read from the datacentre config file; "" means "not set".
harvest_home = ""
datacentre_groups = ""
datacentre_format = ""
datacentre_namespace = ""

# The datacentre name is the single, mandatory command-line parameter.
if len(sys.argv) < 2:
    print("<datacentre>  parameter not supplied.")
    # A missing parameter is a usage error, so exit non-zero: a bare
    # sys.exit() would report success to a calling shell or cron wrapper.
    sys.exit(1)

datacentre = sys.argv[1]
46
# Other settings and constants

# Timestamp (yymmdd_HHMM, produced by the `date` command) for this run.
date_string = commands.getoutput("date +'%y%m%d_%H%M'")

# Environment variables for the commands launched later with os.system().
for env_name, env_value in (
    ('EXIST_HOME', '/usr/local/exist-client'),
    ('PATH', ':/usr/java/jdk1.5.0_03/jre:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'),
    ('CLASSPATH', '.:/usr/java/j2sdk1.4.2_04/bin:/usr/local/WSClients/OAIBatch'),
):
    os.putenv(env_name, env_value)
52
# Get the harvested records directory and groups for this datacentre from the
# datacentre-specific config file.
# The harvested records directory depends on the datacentre's OAI base url, the
# set and the format. These have to be known up-front.
# The groups denote which 'portal groups' the datacentre belongs to - for
# limiting searches to, say, NERC-only datacentres' records.
# Groups are added to the intermediate MOLES when it is created.
datacentre_config_filename = "/usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties"
print("Datacentre config file = %s" % datacentre_config_filename)

# Each config line is "<key> <value> [<value> ...]", separated by whitespace.
datacentre_config_file = open(datacentre_config_filename, "r")
try:
    for line in datacentre_config_file:
        words = line.split()
        # Skip blank lines and keys given without a value. A value-less key is
        # then reported by the "not set" checks below instead of raising
        # IndexError here.
        if len(words) < 2:
            continue
        key = words[0]
        if key == 'host_path':
            harvest_home = words[1]
        elif key == 'groups':
            datacentre_groups = words[1:]
        elif key == 'format':
            datacentre_format = words[1]
        elif key == 'namespace':
            datacentre_namespace = words[1]
finally:
    # Always release the file handle, even if reading fails part-way.
    datacentre_config_file.close()

# host_path, format and namespace are mandatory; groups/keywords are optional.
if harvest_home == "":
    sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: harvested records are in %s" % harvest_home)

if datacentre_groups == "":
    print("INFO: No groups/keywords set for datacentre %s" % datacentre)
else:
    print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

if datacentre_format == "":
    sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: format being harvested = %s" % datacentre_format)

if datacentre_namespace == "":
    sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
print("INFO: datacentre namespace = %s" % datacentre_namespace)
94
# Any records to harvest? An empty harvest directory simply ends the run.
if not os.listdir(harvest_home):
    print("Nothing to harvest this time from %s" % datacentre)
    sys.exit()

# The directory to put things for a tape backup (should already exist)
backupdir = '/disks/glue1/oaiBackup/'
102
# Create/clear the 'in' directory that holds the pristine copy of the
# discovery records.
originals_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
if os.path.isdir(originals_dir):
    # Already there: empty it. Files are removed one at a time through xargs
    # (see the "Argument list too long" note in the history above). The shell
    # strips the backslash, so xargs receives its default "{}" placeholder.
    commandline = "ls -1 %s/ | xargs -i rm %s/{\\}" % (originals_dir, originals_dir)
else:
    commandline = "mkdir " + originals_dir
print("Executing : " + commandline)
status = os.system(commandline)

if status != 0:
    sys.exit("Failed at creating copy dir stage")

# Make the 'in' pristine copy. Cope with there being lots of files in the
# directory by copying them one at a time through xargs.
commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (harvest_home, harvest_home, originals_dir)
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at making pristine copy stage")
123
# Create/clear the directory for the 'out' processed copy of the discovery records.
discovery_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
if os.path.isdir(discovery_dir):
    # Already there: empty it, one file at a time through xargs. The shell
    # strips the backslash, so xargs receives its default "{}" placeholder.
    commandline = "ls -1 %s/ | xargs -i rm %s/{\\}" % (discovery_dir, discovery_dir)
else:
    commandline = "mkdir " + discovery_dir
print("Executing : " + commandline)
status = os.system(commandline)
# Check this step before the next command overwrites `status`; otherwise a
# failed mkdir/clear would go unnoticed until the later copy into this
# directory fails.
if status != 0:
    sys.exit("Failed at creating discovery dir stage")

# The file config.properties contains the location of the particular datacentre's
# harvested records.
# Copy the datacentre-specific version of config to the config.properties file.
commandline = "cp /usr/local/WSClients/OAIBatch/" + datacentre + "_config.properties /usr/local/WSClients/OAIBatch/config.properties"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at copying config file stage")
141
#Change os directory to that with the other code in it. (need this?)
# NOTE: later commands use relative paths (d2b.jar, ./DIF2MOLES), which are
# resolved against this directory.
os.chdir('/usr/local/WSClients/OAIBatch')


#Process/rename the files (changed 24/11/06 to simply create a re-named file in the outdir)
indir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/oai/originals"
outdir = "/usr/local/WSClients/OAIBatch/data/" + datacentre + "/discovery"
filenames = os.listdir(indir)
for filename in filenames:
    original_filename = indir + "/" + filename
    if '.xml' not in filename:
        # The original message referenced an undefined name (full_filename),
        # so any non-xml file aborted the whole run with a NameError.
        print('File %s is not xml format. Not processed' % original_filename)
        continue

    # New name = <namespace>__<text after the last '%3A' in the harvested
    # file name> (the whole name when it contains no '%3A').
    new_filename = outdir + "/" + datacentre_namespace + "__" + filename.split('%3A')[-1]
    commandline = "cp " + original_filename + " " + new_filename
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at re-naming file stage")
    numfilesproc += 1
166
# eXist collection that holds this datacentre's original discovery records.
target_collection = "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace

# Ingest the datacentre's records into eXist db (backups of exist happen nightly).
# The command itself is not echoed because it contains the admin password.
commandline = "$EXIST_HOME/bin/client.sh -c " + target_collection + " -u admin -P xxxxxx -p " + outdir
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))

# Then run the minimum moles creator, which will run over all records in the
# supplied collection.
# It creates a directory ./DIF2MOLES to pass back records with the original filename.
commandline = "java -jar d2b.jar repositoryID " + datacentre_namespace + " repositoryLocalID " + datacentre + " format " + datacentre_format + " repository xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc userpw xxxxxx targetCollection " + target_collection
# NOTE(review): unlike the two ingest commands, this echo includes the
# 'userpw' value - consider masking it once the real password is filled in.
print(commandline)
status = os.system(commandline)
if status != 0:
    print("ERROR: couldn't create the minimum moles records")
    # The original had a bare `sys.exit` (never called), so the run carried on
    # after this failure. Exit with a fixed code rather than `status`, which
    # is the raw os.system() result and not a plain exit code.
    sys.exit(1)

# Ingest the created discovery minimum moles records into eXist db.
commandline = "$EXIST_HOME/bin/client.sh -c ./DIF2MOLES -u admin -P xxxxxx -p ./DIF2MOLES"
print("Executing : actual command to ingest into exist db")
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))
189
#Extract the spatiotemporal info from created moles and put in Postgres db
outdir = "./DIF2MOLES"
try:
    # The file's import section never imports this module, so the call below
    # always raised NameError and the step could never run. Importing inside
    # the try keeps the step best-effort if the module is unavailable.
    # NOTE(review): assumes SpaceTimeIngestFromMOLES is an importable module
    # alongside this script - confirm.
    import SpaceTimeIngestFromMOLES
    SpaceTimeIngestFromMOLES.main(outdir)
except (Exception, SystemExit):
    # Deliberately best-effort: report and continue so the backups still run.
    # SystemExit is included so a sys.exit() inside the ingest module cannot
    # skip them; KeyboardInterrupt is allowed to propagate.
    print("ERROR: SpaceTimeIngestFromMOLES failed. Carrying on to do backups")
    print("ERROR: reason was: %s" % (sys.exc_info()[1],))
196
#Make copies of discovery and oai/originals and DIF2MOLES areas to backup area for tape backups
# Each source directory gets its own backup directory named
# <backupdir><datacentre>_<date_string><suffix>.
datacentre_data_dir = "/usr/local/WSClients/OAIBatch/data/" + datacentre
for backup_suffix, backup_source in (
    ("_originals", datacentre_data_dir + "/oai/originals"),
    ("_discovery", datacentre_data_dir + "/discovery"),
    ("_DIF2MOLES", "./DIF2MOLES"),
):
    this_backupdir = backupdir + datacentre + "_" + date_string + backup_suffix
    commandline = "mkdir " + this_backupdir
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at creating backup directory %s" % this_backupdir)

    # Copy file-by-file through xargs to cope with very large directories.
    # The "/" before the placeholder matters: the original DIF2MOLES command
    # lacked it ("./DIF2MOLES{\}"), so every copy from that directory failed.
    commandline = "ls -1 %s/ | xargs -i cp %s/{\\} %s" % (backup_source, backup_source, this_backupdir)
    print("Executing : " + commandline)
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at copying to backup directory %s" % this_backupdir)
236
#Clear out the original harvest records area and DIF2MOLES
moles_dir = "./DIF2MOLES"

commandline = "ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    sys.exit("Failed at clearing out original harvest records area %s" % harvest_home)

commandline = "ls -1 " + moles_dir + " | xargs -i rm " + moles_dir + "/{\\}"
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # Report the directory that actually failed; the original message
    # interpolated harvest_home here.
    sys.exit("Failed at clearing out DIF2MOLES area %s" % moles_dir)

#remove the DIF2MOLES directory
commandline = "rmdir " + moles_dir
print("Executing : " + commandline)
status = os.system(commandline)
if status != 0:
    # As above, name the DIF2MOLES directory rather than harvest_home.
    sys.exit("Failed at removing DIF2MOLES directory %s" % moles_dir)
256
# Final run summary, framed by two separator lines.
summary_rule = "======================================================"
print(summary_rule)
print("No. of files pre-processed = %s" % numfilesproc)
if status != 0:
    print("Procedure oai_ingest.py FAILED with status %s" % status)
else:
    print(" Procedure oai_ingest.py ran to end")

print(summary_rule)
Note: See TracBrowser for help on using the repository browser.