source: TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py @ 2067

Subversion URL: http://proj.badc.rl.ac.uk/svn/ndg/TI01-discovery/trunk/ingestAutomation/OAIBatch/oai_ingest.py@2067
Revision 2067, 14.0 KB checked in by selatham, 13 years ago (diff)

Plugging in new d2boneoff.

  • Property svn:executable set to *
Line 
#!/usr/bin/env python
""" Script oai_ingest.py takes parameter <datacentre>.
The /usr/local/WSClients/OAIBatch directory contains:-
 - this python script,
 - a DataProvider specific config file,
 - the d2b.jar moles creator class which creates moles discovery records,
 - the python module for extracting spatiotemporal information and adding to postgres db.
Under this directory the following structure should be maintained:
 ./data
 - /DATACENTRE/
                - discovery/:         Re-named documents ready to ingest in the discovery service.
                - oai/difYYYYMMDD/    Documents as harvested from OAI
 Where  /DATACENTRE  varies for the different data providers
"""
#History:
# 12/05/06 SEL spelling correction
# 30/05/06 SEL cope with many files for processing."Argument list too long" problem.
# 31/05/06 SEL need to use passwords now. (replace xxxxxx in real version).
# 16/10/06 SEL Changed to using python oaiClean.py module instead of java code.
# 16/10/06 SEL exist db upgraded and deployed at different location, java upgrade.
# 17/10/06 SEL cope with different discovery formats - not just DIF.
# 23/10/06 SEL keywords not mandatory in config file.
# 24/10/06 SEL fix bug where 'in' directory not being cleared initially. display more information when running.

# NOTE: this is a Python 2 script.  Every print below uses the parenthesised
# single-argument form, which behaves exactly like the print statement.

import os
import sys
import commands
import shutil
import string
import subprocess
import SpaceTimeIngestFromMOLES
#import oaiClean

# Directory holding this script, the per-datacentre config files and ./data
BATCH_HOME = '/usr/local/WSClients/OAIBatch'

# The directory to put things for a tape backup (should already exist)
BACKUP_DIR = '/disks/glue1/oaiBackup/'

# Working area (relative to BATCH_HOME) for the generated minimum MOLES records
MOLES_DIR = './DIF2MOLES'

# Placeholder handed to "xargs -i".  The shell removes the backslash, so xargs
# sees "{}" and substitutes each file name read from the "ls -1" listing.
XARGS_SLOT = "{\\}"

# Exit code of d2boneoff.jar that the script reports as "record not found"
D2B_RECORD_NOT_FOUND = 10


def configure_environment():
    """Set the environment needed by the eXist client and the java tools.

    Assigning to os.environ (rather than calling os.putenv) exports the value
    to child processes AND keeps os.environ itself in step, which matters
    because subprocess looks programs up on os.environ['PATH'].
    """
    os.environ['EXIST_HOME'] = '/usr/local/exist-client'
    os.environ['JAVA_HOME'] = '/usr/java/jdk1.5.0_03'
    os.environ['PATH'] = ':/usr/java/jdk1.5.0_03/bin:/usr/java/jdk1.5.0_03:/usr/java/jdk1.5.0_03/lib/tools.jar:/usr/local/WSClients/OAIBatch:/usr/local/exist-client/bin:/bin:/usr/bin:.'
    os.environ['CLASSPATH'] = '.:/usr/java/jdk1.5.0_03/lib/tools.jar'


def read_datacentre_config(config_filename):
    """Read the datacentre specific config file.

    Each line is split on whitespace and the first word is the key.  The keys
    understood are host_path, groups, format, namespace and NDG_dataProvider
    (a flag: its presence alone switches it on).  Other lines are ignored.

    Returns a dictionary keyed by those names.  Keys that are missing from the
    file, or present without a value, keep their empty defaults so that the
    caller can report them.
    """
    settings = {
        'host_path': "",
        'groups': "",
        'format': "",
        'namespace': "",
        'NDG_dataProvider': False,
    }
    config_file = open(config_filename, "r")
    try:
        for line in config_file:
            words = line.split()
            if not words:
                continue
            key = words[0]
            if key == 'NDG_dataProvider':
                settings[key] = True
            elif key == 'groups':
                settings[key] = words[1:]
            elif key in ('host_path', 'format', 'namespace') and len(words) > 1:
                settings[key] = words[1]
    finally:
        config_file.close()
    return settings


def run_stage(commandline, failure_message):
    """Echo a shell command, run it, and abort the script if it fails."""
    print("Executing : " + commandline)
    if os.system(commandline) != 0:
        sys.exit(failure_message)


def clear_or_create_dir(directory, failure_message):
    """Empty directory if it exists, otherwise create it.

    Files are removed one at a time through xargs so that a very large
    directory does not hit the "Argument list too long" limit.
    """
    if os.path.isdir(directory):
        commandline = "ls -1 " + directory + "/ | xargs -i rm " + directory + "/" + XARGS_SLOT
    else:
        commandline = "mkdir " + directory
    run_stage(commandline, failure_message)


def read_dif_entry_id(dif_filename):
    """Return the entry ID held in the DIF record stored in dif_filename."""
    # Imported here, as in the original loops, so that the DIF modules are
    # only required once a DIF record is actually processed.
    from DIF import DIF
    from ETxmlView import loadET
    dif_file = open(dif_filename)
    try:
        xml = dif_file.read()
    finally:
        dif_file.close()
    # NOTE(review): the result was never used by this script; the call is kept
    # in case it is relied upon to reject unparsable records - confirm.
    loadET(xml)
    return DIF(xml).entryID


def rename_records(indir, outdir, datacentre_format, datacentre_namespace, NDG_dataProvider):
    """Copy every harvested xml record from indir to outdir under its new name.

    The new name is built from the ID found inside the record.  Returns the
    number of files copied.  Exits the script if the format is not DIF or if a
    copy fails.
    """
    numfilesproc = 0
    for filename in os.listdir(indir):
        if '.xml' not in filename:
            print('File %s is not xml format. Not processed' % filename)
            continue
        if datacentre_format != "DIF":
            sys.exit("Doesn't handle anything else but DIF here.")
        original_filename = indir + "/" + filename
        entry_id = read_dif_entry_id(original_filename)
        if NDG_dataProvider:
            new_filename = outdir + "/" + entry_id.replace(":", "__") + ".xml"
        else:
            new_filename = outdir + "/" + datacentre_namespace + "__" + datacentre_format + "__" + entry_id + ".xml"
        # Copied in-process: the ID comes from the harvested record, so it
        # must never be pasted into a shell command line.
        try:
            shutil.copy(original_filename, new_filename)
        except (IOError, OSError):
            sys.exit("Failed at re-naming file stage")
        numfilesproc += 1
    return numfilesproc


def create_moles_records(outdir, datacentre, datacentre_format, datacentre_namespace, NDG_dataProvider):
    """Run the minimum moles creator for each discovery record in outdir.

    The output of d2boneoff.jar is written to MOLES_DIR under the record's
    file name.  A "record not found" exit code only produces a warning; any
    other failure aborts the script.
    """
    for filename in os.listdir(outdir):
        if '.xml' not in filename or datacentre_format != "DIF":
            continue
        entry_id = read_dif_entry_id(outdir + "/" + filename)
        print("ID extracted from the DIF = %s" % entry_id)
        if NDG_dataProvider:
            molesLocalID = entry_id.split(":", 2)[2]
        else:
            molesLocalID = entry_id
        print("molesLocalID is %s" % molesLocalID)
        # An argument list (no shell) keeps IDs containing spaces or shell
        # metacharacters intact, and subprocess.call returns the real exit
        # code rather than the encoded wait status that os.system returns.
        d2b_command = [
            "java", "-jar", "D2B/d2boneoff.jar",
            "repositoryID", datacentre_namespace,
            "repositoryLocalID", datacentre,
            "format", datacentre_format,
            "repository", "xmldb:exist://glue.badc.rl.ac.uk:8080/exist/xmlrpc",
            "userpw", "xxxxxx",
            "targetCollection", "/db/discovery/original/" + datacentre_format + "/" + datacentre_namespace,
            "inputRecordID", entry_id,
            "outputLocalID", molesLocalID,
        ]
        print("Executing command to run d2boneoff.jar")
        try:
            moles_file = open(MOLES_DIR + "/" + filename, "w")
            try:
                status = subprocess.call(d2b_command, stdout=moles_file)
            finally:
                moles_file.close()
        except (IOError, OSError):
            # Output file could not be created, or java could not be started.
            sys.exit("ERROR: couldn't create the minimum moles records")
        if status == D2B_RECORD_NOT_FOUND:
            print("WARNING: couldn't find the record")
        elif status != 0:
            sys.exit("ERROR: couldn't create the minimum moles records")


def ingest_into_exist(commandline, datacentre):
    """Load a directory of records into the eXist db, aborting on failure.

    The command itself is not echoed because it carries the db password.
    """
    print("Executing : actual command to ingest into exist db")
    status = os.system(commandline)
    if status != 0:
        sys.exit("Failed at ingesting into exist db. Datacentre =  %s. Status = %s" % (datacentre, status))


def main(argv):
    """Run the whole ingest for the datacentre named in argv[1]."""
    if len(argv) < 2:
        print("<datacentre>  parameter not supplied.")
        sys.exit()
    datacentre = argv[1]

    # Other settings and constants
    date_string = commands.getoutput("date +'%y%m%d_%H%M'")
    configure_environment()

    # Get the harvested records directory and groups for this datacentre from the datacentre specific config file
    # The harvested records directory depends on the datacentres OAI base url, the set and format. These have to be known up-front.
    # The groups denote which 'portal groups' they belong to - for limiting searches to say NERC-only datacentres records.
    # Groups are added to the intermediate MOLES when it is created.
    datacentre_config_filename = BATCH_HOME + "/" + datacentre + "_config.properties"
    print("Datacentre config file = %s" % datacentre_config_filename)
    settings = read_datacentre_config(datacentre_config_filename)
    harvest_home = settings['host_path']
    datacentre_groups = settings['groups']
    datacentre_format = settings['format']
    datacentre_namespace = settings['namespace']
    NDG_dataProvider = settings['NDG_dataProvider']

    if harvest_home == "":
        sys.exit("Failed at getting harvested records directory stage. datacentre config file tried = %s" % datacentre_config_filename)
    print("INFO: harvested records are in %s" % harvest_home)

    # Groups/keywords are optional in the config file.
    if not datacentre_groups:
        print("INFO: No groups/keywords set for datacentre %s" % datacentre)
    else:
        print("INFO: datacentre groups/keywords = %s" % datacentre_groups)

    if datacentre_format == "":
        sys.exit("Failed at stage: getting datacentre format. datacentre config file tried = %s" % datacentre_config_filename)
    print("INFO: format being harvested = %s" % datacentre_format)

    if datacentre_namespace == "":
        sys.exit("Failed at stage: getting datacentre namespace. datacentre config file tried = %s" % datacentre_config_filename)
    print("INFO: datacentre namespace = %s" % datacentre_namespace)

    #any records to harvest?
    if not os.listdir(harvest_home):
        print("Nothing to harvest this time from %s" % datacentre)
        sys.exit()

    indir = BATCH_HOME + "/data/" + datacentre + "/oai/originals"
    outdir = BATCH_HOME + "/data/" + datacentre + "/discovery"

    # Create/clear the 'in' directory pristine copy of the discovery records
    clear_or_create_dir(indir, "Failed at creating copy dir stage")

    # make the 'in' pristine copy. Cope with there being lots of files in the directory.
    run_stage("ls -1 " + harvest_home + "/ | xargs -i cp " + harvest_home + "/" + XARGS_SLOT + " " + indir,
              "Failed at making pristine copy stage")

    # Create/clear the directory for the 'out' processed copy of the discovery records.
    clear_or_create_dir(outdir, "Failed at creating discovery dir stage")

    # The file config.properties contains the location of the particular datacentres harvested records.
    # Copy the datacentre specific version of config to config.properties file.
    run_stage("cp " + datacentre_config_filename + " " + BATCH_HOME + "/config.properties",
              "Failed at copying config file stage")

    #Change os directory to that with the code in it.
    os.chdir(BATCH_HOME)

    #Process/rename the files (changed 08/01/07 to get id from inside file)
    numfilesproc = rename_records(indir, outdir, datacentre_format, datacentre_namespace, NDG_dataProvider)

    # ingest the datacentres records into eXist db (backups of exist happen nightly).
    ingest_into_exist("$EXIST_HOME/bin/client.sh -c /db/discovery/original/" + datacentre_format + "/" + datacentre_namespace + " -u admin -P xxxxxx -p " + outdir,
                      datacentre)

    #are there any old records hanging around. If so, clear them out.
    #(The directory itself is deliberately left in place.)
    clear_moles_command = "ls -1 " + MOLES_DIR + " | xargs -i rm " + MOLES_DIR + "/" + XARGS_SLOT
    if os.path.exists(MOLES_DIR):
        run_stage(clear_moles_command, "Failed at clearing out DIF2MOLES area.")
    else:
        print("No old moles records hanging around")

    # Then run the minimum moles creator for each discovery record
    # Put records in ./DIF2MOLES with original filename
    create_moles_records(outdir, datacentre, datacentre_format, datacentre_namespace, NDG_dataProvider)

    #There should be some records now
    if not os.path.exists(MOLES_DIR):
        print("ERROR: couldn't create any minimum moles records for %s" % datacentre)
        sys.exit()

    # ingest the created discovery minimum moles records into eXist db.
    ingest_into_exist("$EXIST_HOME/bin/client.sh -c /db/discovery/moles -u admin -P xxxxxx -p " + MOLES_DIR,
                      datacentre)

    #Extract the spatiotemporal info from created moles and put in Postgres db
    SpaceTimeIngestFromMOLES.main(MOLES_DIR)

    #Make copies of discovery and oai/originals and DIF2MOLES areas to backup area for tape backups
    #Each entry is (backup suffix, directory to copy from, argument given to "ls -1").
    backup_sources = (
        ("originals", indir, indir + "/"),
        ("discovery", outdir, outdir + "/"),
        ("DIF2MOLES", MOLES_DIR, MOLES_DIR),
    )
    for suffix, source_dir, listing in backup_sources:
        this_backupdir = BACKUP_DIR + datacentre + "_" + date_string + "_" + suffix
        run_stage("mkdir " + this_backupdir,
                  "Failed at creating backup directory %s" % this_backupdir)
        run_stage("ls -1 " + listing + " | xargs -i cp " + source_dir + "/" + XARGS_SLOT + " " + this_backupdir,
                  "Failed at copying to backup directory %s" % this_backupdir)

    #Clear out the original harvest records area and DIF2MOLES
    run_stage("ls -1 " + harvest_home + " | xargs -i rm " + harvest_home + "/" + XARGS_SLOT,
              "Failed at clearing out original harvest records area %s" % harvest_home)
    run_stage(clear_moles_command,
              "Failed at clearing out DIF2MOLES area %s" % MOLES_DIR)

    # Every failing stage exits above, so reaching this point means success.
    print("======================================================")
    print("No. of files pre-processed = %s" % numfilesproc)
    print(" Procedure oai_ingest.py ran to end")
    print("======================================================")


if __name__ == "__main__":
    main(sys.argv)
Note: See TracBrowser for help on using the repository browser.